Context Navigation

htmlfilter.inc @ 2

Revision 2, 35.9 KB checked in by niltonneto, 17 years ago (diff)
Removida todas as tags usadas pelo CVS ($Id, $Source). Primeira versão no CVS externo.
Property svn:eol-style set to `native`. Property svn:executable set to `*`.

Line
1	<?php
2	/**
3	* htmlfilter.inc
4	* ---------------
5	* This set of functions allows you to filter html in order to remove
6	* any malicious tags from it. Useful in cases when you need to filter
7	* user input for any cross-site-scripting attempts.
8	*
9	* Copyright (C) 2002-2004 by Duke University
10	*
11	* This library is free software; you can redistribute it and/or
12	* modify it under the terms of the GNU Lesser General Public
13	* License as published by the Free Software Foundation; either
14	* version 2.1 of the License, or (at your option) any later version.
15	*
16	* This library is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19	* Lesser General Public License for more details.
20	*
21	* You should have received a copy of the GNU Lesser General Public
22	* License along with this library; if not, write to the Free Software
23	* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
24	* 02110-1301 USA
25	*
26	* @Author Konstantin Riabitsev <icon@linux.duke.edu>
27	* @Version 1.1 ($Date$)
28	*/
29
/**
 * Debug tracer called from every function in this file.
 *
 * Nothing is printed unless the global variable $debug compares loosely
 * equal to true, so set $debug = true before calling sanitize() to get a
 * trace.
 *
 * Note: the calls cost a little even when $debug is false. Every call
 * site has the literal form spew("...") so they can all be stripped
 * mechanically:
 *
 *   fgrep -v 'spew("' htmlfilter.inc > htmlfilter.inc.new
 *
 * htmlfilter.inc.new will then contain no debugging calls.
 *
 * @param $message A string with the message to output.
 * @return void.
 */
function spew($message){
    global $debug;
    if ($debug != true){
        // Debugging is off: stay silent.
        return;
    }
    echo "{$message}";
}
52
/**
 * Builds the textual form of a tag from its name, its attributes and its
 * type. Called internally by sanitize().
 *
 * Attribute values are emitted verbatim after "name=", so they must
 * already carry their own quotes (getnxtag() stores them that way).
 *
 * @param $tagname the name of the tag.
 * @param $attary  array of attribute name => already-quoted value; a
 *                 non-array or empty array means "no attributes".
 * @param $tagtype 1 = opening tag, 2 = closing tag, 3 = XHTML-style
 *                 self-closed tag.
 * @return a string with the final tag representation.
 */
function tagprint($tagname, $attary, $tagtype){
    $me = 'tagprint';
    if ($tagtype == 2){
        // Closing tags never carry attributes.
        $fulltag = '</' . $tagname . '>';
    } else {
        $fulltag = '<' . $tagname;
        if (is_array($attary) && count($attary) > 0){
            $atts = array();
            // foreach instead of each(): each() no longer exists in PHP 8.
            foreach ($attary as $attname => $attvalue){
                $atts[] = "$attname=$attvalue";
            }
            $fulltag .= ' ' . implode(' ', $atts);
        }
        if ($tagtype == 3){
            $fulltag .= ' /';
        }
        $fulltag .= '>';
    }
    spew("$me: $fulltag\n");
    return $fulltag;
}
84
/**
 * Callback for array_walk(): lower-cases one array member in place.
 *
 * @param $val a value passed by-ref; it is overwritten with its
 *             lower-case form.
 * @return void since it modifies a by-ref value.
 */
function casenormalize(&$val){
    $lowered = strtolower($val);
    $val = $lowered;
}
95
/**
 * Skips any whitespace starting at the given position and returns the
 * position of the next non-whitespace character.
 *
 * The match is anchored at $offset with \G, so the rest of the body is
 * not copied on every call.
 *
 * @param $body   the string
 * @param $offset the offset within the string where we should start
 *                looking for the next non-whitespace character.
 * @return the location within the $body where the next
 *         non-whitespace char is located; $offset itself when there is
 *         no whitespace at $offset.
 */
function skipspace($body, $offset){
    $me = 'skipspace';
    $matches = array();
    // \s+ (not \s*) so that "nothing to skip" is simply "no match".
    if (preg_match('/\G\s+/', $body, $matches, 0, $offset) === 1){
        $count = strlen($matches[0]);
        spew("$me: skipped $count chars\n");
        $offset += $count;
    }
    return $offset;
}
116
/**
 * Looks for the next occurrence of a string within a string. It's
 * really just a glorified "strpos", except that it never reports
 * failure: "not found" is returned as the length of the body.
 *
 * @param $body   The string to look for needle in.
 * @param $offset Start looking from this position.
 * @param $needle The character/string to look for.
 * @return location of the next occurrence of the needle, or
 *         strlen($body) if needle wasn't found (including when $offset
 *         is already past the end of $body).
 */
function findnxstr($body, $offset, $needle){
    $me = 'findnxstr';
    $end = strlen($body);
    // strpos() raises an error for an offset beyond the end of the string
    // (a ValueError on PHP 8); treat that case as a plain "not found".
    $pos = ($offset > $end) ? false : strpos($body, $needle, $offset);
    if ($pos === false){
        spew("$me: end of body reached\n");
        return $end;
    }
    spew("$me: '$needle' found at pos $pos\n");
    return $pos;
}
138
/**
 * Takes a PCRE-style regexp fragment and looks for its first match at or
 * after $offset.
 *
 * The fragment is embedded as '%^(.*?)(' . $reg . ')%s', so it is written
 * without delimiters and must not contain an unescaped '%'.
 *
 * @param $body   The string to look for needle in.
 * @param $offset Start looking from here.
 * @param $reg    A PCRE-style regex to match.
 * @return Returns a false if no matches found, or an array
 *         with the following members:
 *         - integer with the location of the match within $body
 *         - string with whatever content between offset and the match
 *         - string with whatever it is we matched
 */
function findnxreg($body, $offset, $reg){
    $me = 'findnxreg';
    $matches = array();
    $preg_rule = '%^(.*?)(' . $reg . ')%s';
    if (preg_match($preg_rule, substr($body, $offset), $matches) !== 1){
        spew("$me: No matches found.\n");
        return false;
    }
    // $matches[1] is the text skipped before the match, so its length is
    // the distance from $offset to the match.
    $found = $offset + strlen($matches[1]);
    $retarr = array($found, $matches[1], $matches[2]);
    spew("$me: '$reg' found at pos $found matching '".$matches[2]."'\n");
    return $retarr;
}
169
/**
 * Looks for the next tag at or after $offset and parses it.
 *
 * There are 3 kinds of tags:
 *   1. Opening tag, e.g.:                     <a href="blah">
 *   2. Closing tag, e.g.:                     </a>
 *   3. XHTML-style content-less tag, e.g.:    <img src="blah"/>
 *
 * There are 4 kinds of attributes, with optional whitespace between the
 * members:
 *   Type 1:  attrname = 'CDATA'
 *   Type 2:  attrname = "CDATA"
 *   Type 3:  attrname = CDATA
 *   Type 4:  attrname
 * Types 1 and 2 are stored as found, quotes included. Type 3 gets any
 * '"' turned into &quot; and is wrapped in double quotes. Type 4 is
 * stored as attrname="yes".
 *
 * Comments (<!-- ... -->) and SGML declarations (<! ... >) are reported
 * as invalid tags so that the caller strips them.
 *
 * @param $body   String where to look for the next tag.
 * @param $offset Start looking from here.
 * @return false if no more tags exist in the body, or
 *         an array with the following members:
 *         - string with the name of the tag (lower-cased)
 *         - array with attributes and their values (false when the tag
 *           ended right after its name)
 *         - integer with tag type (1, 2, or 3)
 *         - integer where the tag starts (starting "<")
 *         - integer where the tag ends (ending ">")
 *         first three members will be false, if the tag is invalid.
 */
function getnxtag($body, $offset){
    $me = 'getnxtag';
    $bodylen = strlen($body);
    if ($offset > $bodylen){
        spew("$me: Past the end of body\n");
        return false;
    }
    $lt = findnxstr($body, $offset, '<');
    if ($lt == $bodylen){
        spew("$me: No more tags found!\n");
        return false;
    }
    /**
     * We are here:
     * blah blah <tag attribute="value">
     * \---------^
     */
    spew("$me: Found '<' at pos $lt\n");
    $pos = skipspace($body, $lt + 1);
    if ($pos >= $bodylen){
        spew("$me: End of body reached.\n");
        return array(false, false, false, $lt, $bodylen);
    }

    $tagtype = false;
    switch (substr($body, $pos, 1)){
    case '/':
        spew("$me: This is a closing tag (type 2)\n");
        $tagtype = 2;
        $pos++;
        break;
    case '!':
        /**
         * A comment or an SGML declaration.
         */
        if (substr($body, $pos+1, 2) == '--'){
            spew("$me: A comment found. Stripping.\n");
            $gt = strpos($body, '-->', $pos);
            if ($gt === false){
                $gt = $bodylen;
            } else {
                // Point at the '>' of '-->'.
                $gt += 2;
            }
            return array(false, false, false, $lt, $gt);
        }
        spew("$me: An SGML declaration found. Stripping.\n");
        $gt = findnxstr($body, $pos, '>');
        return array(false, false, false, $lt, $gt);
    default:
        /**
         * Assume tagtype 1 for now. If it's type 3, we'll switch values
         * later.
         */
        $tagtype = 1;
        break;
    }

    /**
     * Look for next [\W-_], which will indicate the end of the tag name.
     */
    $regary = findnxreg($body, $pos, '[^\w\-_]');
    if ($regary == false){
        spew("$me: End of body reached while analyzing tag name\n");
        return array(false, false, false, $lt, $bodylen);
    }
    list($pos, $tagname, $match) = $regary;
    $tagname = strtolower($tagname);

    /**
     * $match can be either of these:
     * '>' indicating the end of the tag entirely.
     * '\s' indicating the end of the tag name.
     * '/' indicating that this is type-3 xhtml tag.
     *
     * Whatever else we find there indicates an invalid tag.
     */
    if ($match === '/'){
        /**
         * This is an xhtml-style tag with a closing / at the
         * end, like so: <img src="blah"/>. Check if it's followed
         * by the closing bracket. If not, then this tag is invalid
         */
        if (substr($body, $pos, 2) != '/>'){
            spew("$me: Found invalid character '/'.\n");
            $gt = findnxstr($body, $pos, '>');
            spew("$me: Tag is invalid. Returning.\n");
            return array(false, false, false, $lt, $gt);
        }
        spew("$me: XHTML-style tag found.\n");
        $pos++;
        spew("$me: Setting tagtype to 3\n");
        $tagtype = 3;
        // $pos now sits on the '>', so finish exactly like the '>' case.
        $match = '>';
    }
    if ($match === '>'){
        spew("$me: End of tag found at $pos\n");
        spew("$me: Tagname is '$tagname'\n");
        spew("$me: This tag has no attributes\n");
        return array($tagname, false, $tagtype, $lt, $pos);
    }
    if (!preg_match('/\s/', $match)){
        /**
         * This is an invalid tag! Look for the next closing ">".
         */
        spew("$me: Invalid characters found in tag name: $match\n");
        $gt = findnxstr($body, $lt, '>');
        return array(false, false, false, $lt, $gt);
    }
    spew("$me: Tagname is '$tagname'\n");

    /**
     * At this point we're here:
     * <tagname attribute='blah'>
     * \-------^
     *
     * At this point we loop in order to find all attributes.
     */
    $attary = array();

    while ($pos <= $bodylen){
        $pos = skipspace($body, $pos);
        if ($pos == $bodylen){
            /**
             * Non-closed tag.
             */
            spew("$me: End of body reached before end of tag. Discarding.\n");
            return array(false, false, false, $lt, $pos);
        }
        /**
         * See if we arrived at a ">" or "/>", which means that we reached
         * the end of the tag. \G anchors the match at $pos.
         */
        $matches = array();
        if (preg_match('%\G(\s*)(>|/>)%s', $body, $matches, 0, $pos) === 1){
            /**
             * Yep. So we did.
             */
            spew("$me: Arrived at the end of the tag.\n");
            $pos += strlen($matches[1]);
            if ($matches[2] == '/>'){
                $tagtype = 3;
                // Step over the '/' so that $pos is on the '>'.
                $pos++;
            }
            return array($tagname, $attary, $tagtype, $lt, $pos);
        }

        /**
         * Not the end of the tag, so an attribute name must start here.
         */
        $regary = findnxreg($body, $pos, '[^\w\-_]');
        if ($regary == false){
            /**
             * Looks like body ended before the end of tag.
             */
            spew("$me: End of body found before end of tag.\n");
            spew("$me: Invalid, returning\n");
            return array(false, false, false, $lt, $bodylen);
        }
        list($pos, $attname, $match) = $regary;
        $attname = strtolower($attname);
        spew("$me: Attribute '$attname' found\n");
        /**
         * We arrived at the end of attribute name. Several things possible
         * here:
         * '>' means the end of the tag and this is attribute type 4
         * '/' if followed by '>' means the same thing as above
         * '\s' means a lot of things -- look what it's followed by.
         * anything else means the attribute is invalid.
         */
        if ($match === '/'){
            if (substr($body, $pos, 2) != '/>'){
                spew("$me: Found invalid character '/'.\n");
                $gt = findnxstr($body, $pos, '>');
                spew("$me: Tag is invalid. Returning.\n");
                return array(false, false, false, $lt, $gt);
            }
            spew("$me: This is an xhtml-style tag.\n");
            $pos++;
            spew("$me: Setting tagtype to 3\n");
            $tagtype = 3;
            // Same ending as a plain '>' from here on.
            $match = '>';
        }
        if ($match === '>'){
            spew("$me: found type 4 attribute.\n");
            spew("$me: Additionally, end of tag found at $pos\n");
            spew("$me: Attname is '$attname'\n");
            spew("$me: Setting attvalue to 'yes'\n");
            $attary[$attname] = '"yes"';
            return array($tagname, $attary, $tagtype, $lt, $pos);
        }

        /**
         * Skip whitespace and see what we arrive at.
         */
        $pos = skipspace($body, $pos);
        $char = substr($body, $pos, 1);
        /**
         * Two things are valid here:
         * '=' means this is attribute type 1 2 or 3.
         * \w means this was attribute type 4.
         * anything else we ignore and re-loop. End of tag and
         * invalid stuff will be caught by our checks at the beginning
         * of the loop.
         */
        if ($char === '='){
            spew("$me: Attribute type 1, 2, or 3 found.\n");
            $pos++;
            $pos = skipspace($body, $pos);
            /**
             * Here are 3 possibilities:
             * "'" attribute type 1
             * '"' attribute type 2
             * everything else is the content of tag type 3
             */
            $quot = substr($body, $pos, 1);
            if ($quot === '\'' || $quot === '"'){
                $typeno = ($quot === '\'') ? 1 : 2;
                spew("$me: In fact, this is attribute type $typeno\n");
                spew("$me: looking for closing quote\n");
                // Neither quote character is special in PCRE, so it can
                // be used as the pattern directly.
                $regary = findnxreg($body, $pos+1, $quot);
                if ($regary == false){
                    spew("$me: end of body reached before end of val\n");
                    spew("$me: Returning\n");
                    return array(false, false, false, $lt, $bodylen);
                }
                list($pos, $attval, $match) = $regary;
                spew("$me: Attvalue is $quot$attval$quot\n");
                // Step past the closing quote.
                $pos++;
                $attary[$attname] = $quot . $attval . $quot;
            } else {
                spew("$me: This looks like attribute type 3\n");
                /**
                 * These are hateful. Look for \s, or >.
                 */
                spew("$me: Looking for end of attval\n");
                $regary = findnxreg($body, $pos, '[\s>]');
                if ($regary == false){
                    spew("$me: end of body reached before end of val\n");
                    spew("$me: Returning\n");
                    return array(false, false, false, $lt, $bodylen);
                }
                list($pos, $attval, $match) = $regary;
                /**
                 * If it's ">" it will be caught at the top.
                 */
                spew("$me: translating '\"' into &quot;\n");
                // The value is about to be wrapped in double quotes, so a
                // literal '"' inside it would end the attribute early.
                $attval = str_replace('"', '&quot;', $attval);
                spew("$me: wrapping in quotes\n");
                $attary[$attname] = '"' . $attval . '"';
            }
        } else if (preg_match('|[\w/>]|', $char)) {
            /**
             * That was attribute type 4.
             */
            spew("$me: attribute type 4 found.\n");
            spew("$me: Setting value to 'yes'\n");
            $attary[$attname] = '"yes"';
        } else {
            /**
             * An illegal character. Find next '>' and return.
             */
            spew("$me: illegal character '$char' found.\n");
            spew("$me: returning\n");
            $gt = findnxstr($body, $pos, '>');
            return array(false, false, false, $lt, $gt);
        }
    }
    /**
     * The fact that we got here indicates that the tag end was never
     * found. Return invalid tag indication so it gets stripped.
     */
    spew("$me: No tag end found\n");
    return array(false, false, false, $lt, $bodylen);
}
512
/**
 * Translates numeric entities into literal characters so they can be
 * checked.
 *
 * Every match of $regex in $attvalue is replaced by the single byte whose
 * code is the number captured in group 1. Only the low 8 bits of that
 * number are used, which is what chr() does with larger values.
 *
 * @param $attvalue the by-ref value to check; rewritten in place when
 *                  anything matched.
 * @param $regex    the regular expression to check against. Capture
 *                  group 1 must hold the digits of the character code.
 * @param $hex      whether the entities are hexadecimal.
 * @return True or False depending on whether there were matches.
 */
function deent(&$attvalue, $regex, $hex=false){
    $me = 'deent';
    spew("$me: matching '$regex' against: $attvalue\n");
    $matches = array();
    $found = preg_match_all($regex, $attvalue, $matches);
    if (!$found){
        // Zero matches, or the pattern itself failed.
        spew("$me: no matches! Returning false.\n");
        return false;
    }
    spew("$me: found " . count($matches[0]) . " matches\n");
    $repl = array();
    foreach ($matches[0] as $i => $entity){
        $numval = $matches[1][$i];
        spew("$me: numval is $numval\n");
        if ($hex){
            // The low byte of a hex number is its last two digits; taking
            // only those keeps very long digit runs inside integer range.
            $numval = hexdec(substr($numval, -2));
            spew("$me: hex! Numval is now $numval\n");
        } else {
            // The cast saturates on over-long input instead of failing;
            // the mask keeps the byte chr() would have used anyway.
            $numval = ((int) $numval) & 0xFF;
        }
        $repl[$entity] = chr($numval);
    }
    $attvalue = strtr($attvalue, $repl);
    spew("$me: attvalue after translation: $attvalue\n");
    return true;
}
546
/**
 * Checks an attribute value for entity-encoded and backslash-encoded
 * characters and translates them into 8-bit strings so we can run
 * checks on them.
 *
 * Three encodings are decoded, repeatedly, until a full pass finds
 * nothing more to decode:
 *   - decimal entities:   &#106;  (any number of leading zeros, the
 *                         trailing semicolon is optional)
 *   - hex entities:       &#x6a;  (same leniency)
 *   - backslash + digits: \6      (digits read as hexadecimal)
 * Finally stripslashes() removes any remaining backslashes.
 *
 * @param $attvalue A string to run entity check against.
 * @return Nothing, modifies a reference value.
 */
function defang(&$attvalue){
    $me = 'defang';
    /**
     * Skip this if there aren't ampersands or backslashes.
     */
    spew("$me: Checking '$attvalue' for suspicious content\n");
    if (strpos($attvalue, '&') === false
        && strpos($attvalue, '\\') === false){
        spew("$me: no suspicious content found, returning.\n");
        return;
    }
    do {
        // Short-circuit on purpose: once one decoder has changed the
        // value, start the pass over before trying the others.
        $m = deent($attvalue, '/\&#0*(\d+);*/s')
            || deent($attvalue, '/\&#x0*((\d|[a-f])+);*/si', true)
            || deent($attvalue, '/\\\\(\d+)/s', true);
        spew("$me: m=$m\n");
    } while ($m == true);
    $attvalue = stripslashes($attvalue);
    spew("$me: translated into: $attvalue\n");
}
577
/**
 * Strips tabs, carriage returns, newlines, NUL bytes and spaces from an
 * attribute value. Our friends the makers of the browser with 95%
 * market value decided that it'd be funny to make "java[tab]script" be
 * just as good as "javascript", so the checks have to run on the
 * squeezed value.
 *
 * @param attvalue The attribute value before extraneous spaces removed.
 * @return attvalue Nothing, modifies a reference value.
 */
function unspace(&$attvalue){
    $me = 'unspace';
    $first_junk = strcspn($attvalue, "\t\r\n\0 ");
    // strcspn() stops at the first unwanted character; stopping short of
    // the end means there is something to remove.
    if ($first_junk != strlen($attvalue)){
        spew("$me: Killing whitespace.\n");
        $unwanted = array("\t", "\r", "\n", "\0", " ");
        $attvalue = str_replace($unwanted, '', $attvalue);
    }
    spew("$me: after unspace: $attvalue\n");
}
595
/**
 * Runs various checks against the attributes of one tag.
 *
 * For each attribute, in order:
 *   1. If its name matches a pattern in $rm_attnames for this tag, it is
 *      dropped and no further check is run on it.
 *   2. A working copy of its value is decoded (defang) and squeezed
 *      (unspace).
 *   3. Every matching rule in $bad_attvals is applied to that working
 *      copy; when a rule changes it, the changed value is stored.
 * Finally the attributes from $add_attr_to_tag that match the tag are
 * merged in.
 *
 * @param $tagname String with the name of the tag.
 * @param $attary Array with all tag attributes.
 * @param $rm_attnames See description for sanitize
 * @param $bad_attvals See description for sanitize
 * @param $add_attr_to_tag See description for sanitize
 * @return Array with modified attributes.
 */
function fixatts($tagname,
                 $attary,
                 $rm_attnames,
                 $bad_attvals,
                 $add_attr_to_tag
                 ){
    $me = 'fixatts';
    spew("$me: Fixing attributes\n");
    // Iterate over the input and edit a separate copy, so that removing
    // or rewriting entries can never disturb the iteration.
    $fixed = $attary;
    foreach ($attary as $attname => $attvalue){
        /**
         * See if this attribute should be removed.
         */
        foreach ($rm_attnames as $matchtag=>$matchattrs){
            if (preg_match($matchtag, $tagname)){
                foreach ($matchattrs as $matchattr){
                    if (preg_match($matchattr, $attname)){
                        spew("$me: Attribute '$attname' defined as bad.\n");
                        spew("$me: Removing.\n");
                        unset($fixed[$attname]);
                        // Leave all three loops: a removed attribute must
                        // not reach the value checks below, which could
                        // otherwise store it again.
                        continue 3;
                    }
                }
            }
        }
        /**
         * Remove any backslashes, entities, or extraneous whitespace.
         */
        defang($attvalue);
        unspace($attvalue);

        /**
         * Now let's run checks on the attvalues.
         */
        foreach ($bad_attvals as $matchtag=>$matchattrs){
            if (preg_match($matchtag, $tagname)){
                foreach ($matchattrs as $matchattr=>$valary){
                    if (preg_match($matchattr, $attname)){
                        /**
                         * There are two arrays in valary.
                         * First is matches.
                         * Second one is replacements
                         */
                        list($valmatch, $valrepl) = $valary;
                        $newvalue = preg_replace($valmatch,$valrepl,$attvalue);
                        if ($newvalue != $attvalue){
                            spew("$me: attvalue is now $newvalue\n");
                            $fixed[$attname] = $newvalue;
                        }
                    }
                }
            }
        }
    }
    /**
     * See if we need to append any attributes to this tag.
     */
    foreach ($add_attr_to_tag as $matchtag=>$addattary){
        if (preg_match($matchtag, $tagname)){
            $fixed = array_merge($fixed, $addattary);
            spew("$me: Added attributes to this tag\n");
        }
    }
    return $fixed;
}
673
/**
 * This is the main function and the one you should actually be calling.
 * There are several variables you should be aware of and which need
 * special description.
 *
 * $tag_list
 * ----------
 * This is a simple one-dimensional array of strings, except for the
 * very first one. The first member should be either false or true.
 * In case it's FALSE, the following list will be considered a list of
 * tags that should be explicitly REMOVED from the body, and all
 * others that did not match the list will be allowed. If the first
 * member is TRUE, then the list is the list of tags that should be
 * explicitly ALLOWED -- any tag not matching this list will be
 * discarded.
 *
 * Examples:
 * $tag_list = Array(
 *     false,
 *     "blink",
 *     "link",
 *     "object",
 *     "meta",
 *     "marquee",
 *     "html"
 * );
 *
 * This will allow all tags except for blink, link, object, meta, marquee,
 * and html.
 *
 * $tag_list = Array(
 *     true,
 *     "b",
 *     "a",
 *     "i",
 *     "img",
 *     "strong",
 *     "em",
 *     "p"
 * );
 *
 * This will remove all tags from the body except b, a, i, img, strong, em and
 * p.
 *
 * $rm_tags_with_content
 * ---------------------
 * This is a simple one-dimensional array of strings, which specifies the
 * tags to be removed with any and all content between the beginning and
 * the end of the tag.
 * Example:
 * $rm_tags_with_content = Array(
 *     "script",
 *     "style",
 *     "applet",
 *     "embed"
 * );
 *
 * This will remove the following structure:
 * <script>
 * window.alert("Isn't cross-site-scripting fun?!");
 * </script>
 *
 * A self-closed occurrence of such a tag (<script src="x" />) has no
 * content to skip; the tag itself is removed. If such a tag is never
 * closed, everything up to the end of the body is removed.
 *
 * $self_closing_tags
 * ------------------
 * This is a simple one-dimensional array of strings, which specifies which
 * tags contain no content and should not be forcefully closed if this option
 * is turned on (see further).
 * Example:
 * $self_closing_tags = Array(
 *     "img",
 *     "br",
 *     "hr",
 *     "input"
 * );
 *
 * $force_tag_closing
 * ------------------
 * Set it to true to forcefully close any tags opened within the document.
 * This is good if you want to take care of people who like to screw up
 * the pages by leaving unclosed tags like <a>, <b>, <i>, etc.
 *
 * $rm_attnames
 * -------------
 * Now we come to parameters that are more obscure. This parameter is
 * a nested array which is used to specify which attributes should be
 * removed. It goes like so:
 *
 * $rm_attnames = Array(
 *     "PCRE regex to match tag name" =>
 *         Array(
 *             "PCRE regex to match attribute name"
 *         )
 * );
 *
 * Example:
 * $rm_attnames = Array(
 *     "|.*|" =>
 *         Array(
 *             "|target|i",
 *             "|^on.*|i"
 *         )
 * );
 *
 * This will match all tags (.*), and specify that all attributes
 * named "target" and starting with "on" should be removed. This will take
 * care of the following problem:
 * <em onmouseover="window.alert('muahahahaha')">
 * The "onmouseover" will be removed.
 *
 * $bad_attvals
 * ------------
 * This is where it gets ugly. This is a nested array with many levels.
 * It goes like so:
 *
 * $bad_attvals = Array(
 *     "pcre regex to match tag name" =>
 *         Array(
 *             "pcre regex to match attribute name" =>
 *                 Array(
 *                     Array(
 *                         "pcre regex to match attribute value"
 *                     ),
 *                     Array(
 *                         "pcre regex replace a match from above with"
 *                     )
 *                 )
 *         )
 * );
 *
 * An extensive example:
 *
 * $bad_attvals = Array(
 *     "|.*|" =>
 *         Array(
 *             "/^src|background|href|action/i" =>
 *                 Array(
 *                     Array(
 *                         "/^([\'\"])\s*\S+script\s*:.*([\'\"])/si"
 *                     ),
 *                     Array(
 *                         "\\1http://veryfunny.com/\\2"
 *                     )
 *                 ),
 *             "/^style/i" =>
 *                 Array(
 *                     Array(
 *                         "/expression/si",
 *                         "/url\(([\'\"])\s*https*:.*([\'\"])\)/si",
 *                         "/url\(([\'\"])\s*\S+script:.*([\'\"])\)/si"
 *                     ),
 *                     Array(
 *                         "idiocy",
 *                         "url(\\1http://veryfunny.com/\\2)",
 *                         "url(\\1http://veryfynny.com/\\2)"
 *                     )
 *                 )
 *         )
 * );
 *
 * This will take care of nearly all known cross-site scripting exploits,
 * plus some (see my filter sample at
 * http://www.mricon.com/html/phpfilter.html for a working version).
 *
 * $add_attr_to_tag
 * ----------------
 * This is a useful little feature which lets you add attributes to
 * certain tags. It is a nested array as well, but not at all like
 * the previous one. It goes like so:
 *
 * $add_attr_to_tag = Array(
 *     "PCRE regex to match tag name" =>
 *         Array(
 *             "attribute name"=>'"attribute value"'
 *         )
 * );
 *
 * Note: don't forget quotes around attribute value.
 *
 * Example:
 *
 * $add_attr_to_tag = Array(
 *     "/^a$/si" =>
 *         Array(
 *             'target'=>'"_new"'
 *         )
 * );
 *
 * This will change all <a> tags and add target="_new" to them so all links
 * open in a new window. Attributes are only added to tags that already
 * carry at least one attribute.
 *
 * @param $body the string with HTML you wish to filter
 * @param $tag_list see description above
 * @param $rm_tags_with_content see description above
 * @param $self_closing_tags see description above
 * @param $force_tag_closing see description above
 * @param $rm_attnames see description above
 * @param $bad_attvals see description above
 * @param $add_attr_to_tag see description above
 * @return sanitized html safe to show on your pages.
 */
function sanitize($body,
                  $tag_list,
                  $rm_tags_with_content,
                  $self_closing_tags,
                  $force_tag_closing,
                  $rm_attnames,
                  $bad_attvals,
                  $add_attr_to_tag
                  ){
    $me = 'sanitize';
    /**
     * A list that was not supplied as an array is treated as empty.
     */
    if (!is_array($tag_list)){
        $tag_list = array();
    }
    if (!is_array($rm_tags_with_content)){
        $rm_tags_with_content = array();
    }
    if (!is_array($self_closing_tags)){
        $self_closing_tags = array();
    }
    /**
     * See if tag_list is of tags to remove or tags to allow.
     * false means remove these tags
     * true means allow these tags
     * The flag is taken off before normalizing so that only tag names
     * get lower-cased.
     */
    $rm_tags = array_shift($tag_list);
    array_walk($tag_list, 'casenormalize');
    array_walk($rm_tags_with_content, 'casenormalize');
    array_walk($self_closing_tags, 'casenormalize');

    $curpos = 0;
    $open_tags = array();
    $trusted = "";
    // Either false, or the name of the tag whose content is being skipped.
    $skip_content = false;
    /**
     * Take care of netscape's stupid javascript entities like
     * &{alert('boo')};
     * Encoding the ampersand stops them from being read as entities.
     */
    $body = preg_replace('/&(\{.*?\};)/si', '&amp;\\1', $body);
    spew("$me: invoking the loop\n");
    while (($curtag = getnxtag($body, $curpos)) != false){
        list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
        spew("$me: grabbing free-standing content\n");
        $free_content = substr($body, $curpos, $lt - $curpos);
        spew("$me: " . strlen($free_content) . " chars grabbed\n");
        if ($skip_content == false){
            spew("$me: appending free content to trusted.\n");
            $trusted .= $free_content;
        } else {
            spew("$me: Skipping free content.\n");
        }
        if ($tagname != false){
            spew("$me: tagname is '$tagname'\n");
            if ($tagtype == 2){
                spew("$me: This is a closing tag\n");
                if ($skip_content == $tagname){
                    /**
                     * Got to the end of tag we needed to remove.
                     */
                    spew("$me: Finished removing tag with content\n");
                    $tagname = false;
                    $skip_content = false;
                } else if ($skip_content == false){
                    if (isset($open_tags[$tagname]) &&
                        $open_tags[$tagname] > 0){
                        spew("$me: popping '$tagname' from open_tags\n");
                        $open_tags[$tagname]--;
                    } else {
                        spew("$me: '$tagname' was never opened\n");
                        spew("$me: removing\n");
                        $tagname = false;
                    }
                } else {
                    spew("$me: Skipping this tag\n");
                }
            } else if ($skip_content == false){
                /**
                 * See if this is a self-closing type and change
                 * tagtype appropriately.
                 */
                if ($tagtype == 1
                    && in_array($tagname, $self_closing_tags, true)){
                    spew("$me: Self-closing tag. Changing tagtype.\n");
                    $tagtype = 3;
                }
                /**
                 * See if we should skip this tag and any content
                 * inside it.
                 */
                if (in_array($tagname, $rm_tags_with_content, true)){
                    if ($tagtype == 1){
                        spew("$me: removing this tag with content\n");
                        $skip_content = $tagname;
                    } else {
                        // Self-closed form: there is no content to skip,
                        // but the tag itself must still be dropped.
                        spew("$me: Removing this tag.\n");
                        $tagname = false;
                    }
                } else if (($rm_tags == false
                            && in_array($tagname, $tag_list, true)) ||
                           ($rm_tags == true
                            && !in_array($tagname, $tag_list, true))){
                    spew("$me: Removing this tag.\n");
                    $tagname = false;
                } else {
                    if ($tagtype == 1){
                        spew("$me: adding '$tagname' to open_tags\n");
                        if (isset($open_tags[$tagname])){
                            $open_tags[$tagname]++;
                        } else {
                            $open_tags[$tagname] = 1;
                        }
                    }
                    /**
                     * This is where we run other checks.
                     */
                    if (is_array($attary) && count($attary) > 0){
                        $attary = fixatts($tagname,
                                          $attary,
                                          $rm_attnames,
                                          $bad_attvals,
                                          $add_attr_to_tag);
                    }
                }
            } else {
                spew("$me: Skipping this tag\n");
            }
            if ($tagname != false && $skip_content == false){
                spew("$me: Appending tag to trusted.\n");
                $trusted .= tagprint($tagname, $attary, $tagtype);
            }
        } else {
            spew("$me: Removing invalid tag\n");
        }
        $curpos = $gt + 1;
    }
    if ($skip_content == false){
        spew("$me: Appending any leftover content\n");
        $trusted .= (string) substr($body, $curpos);
    } else {
        // Still inside a tag that is removed with its content: the tail
        // belongs to that tag and is dropped like the rest of it.
        spew("$me: Skipping free content.\n");
    }
    if ($force_tag_closing == true){
        foreach ($open_tags as $tagname=>$opentimes){
            while ($opentimes > 0){
                spew("$me: '$tagname' left open. Closing by force.\n");
                $trusted .= '</' . $tagname . '>';
                $opentimes--;
            }
        }
        $trusted .= "\n";
    }
    return $trusted;
}
1022	?>

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format