Context Navigation

htmlfilter.inc @ 3

Revision 3, 35.9 KB checked in by niltonneto, 18 years ago (diff)
* empty log message *
Property svn:eol-style set to `native` Property svn:executable set to ``*

Line
1	<?php
2	/**
3	* htmlfilter.inc
4	* ---------------
5	* This set of functions allows you to filter html in order to remove
6	* any malicious tags from it. Useful in cases when you need to filter
7	* user input for any cross-site-scripting attempts.
8	*
9	* Copyright (C) 2002-2004 by Duke University
10	*
11	* This library is free software; you can redistribute it and/or
12	* modify it under the terms of the GNU Lesser General Public
13	* License as published by the Free Software Foundation; either
14	* version 2.1 of the License, or (at your option) any later version.
15	*
16	* This library is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19	* Lesser General Public License for more details.
20	*
21	* You should have received a copy of the GNU Lesser General Public
22	* License along with this library; if not, write to the Free Software
23	* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
24	* 02110-1301 USA
25	*
26	* @Author Konstantin Riabitsev <icon@linux.duke.edu>
27	*/
28
/**
 * Debug-output helper used by every function in this file.
 *
 * Output is produced only when the global variable $debug compares
 * equal to true; set it before calling sanitize() to trace the parser.
 *
 * Note: the calls cost a little even when debugging is off. To strip
 * all of them from the file, run:
 *
 *   fgrep -v 'spew("' htmlfilter.inc > htmlfilter.inc.new
 *
 * @param $message A string with the message to output.
 * @return void.
 */
function spew($message){
    global $debug;
    // Guard clause: stay silent unless debugging was requested.
    if ($debug != true){
        return;
    }
    echo "$message";
}
51
/**
 * Builds the textual form of a tag from its name, attributes and type.
 * Called internally by sanitize().
 *
 * Attribute values are emitted verbatim as name=value, so they must
 * already carry their own quotes (getnxtag() stores them that way).
 *
 * @param $tagname the name of the tag.
 * @param $attary  array of attribute name => (already quoted) value,
 *                 or false/empty when the tag has no attributes.
 * @param $tagtype 1 = opening tag, 2 = closing tag,
 *                 3 = XHTML-style self-closing tag.
 * @return a string with the final tag representation.
 */
function tagprint($tagname, $attary, $tagtype){
    $me = 'tagprint';
    if ($tagtype == 2){
        // Closing tags never carry attributes.
        $fulltag = '</' . $tagname . '>';
    } else {
        $fulltag = '<' . $tagname;
        if (is_array($attary) && sizeof($attary)){
            $atts = array();
            // foreach replaces the each()/list() idiom, which was removed
            // in PHP 8 and depended on the array's internal pointer.
            foreach ($attary as $attname => $attvalue){
                $atts[] = "$attname=$attvalue";
            }
            $fulltag .= ' ' . implode(' ', $atts);
        }
        if ($tagtype == 3){
            $fulltag .= ' /';
        }
        $fulltag .= '>';
    }
    spew("$me: $fulltag\n");
    return $fulltag;
}
83
/**
 * Callback for array_walk(): lower-cases the element it is handed.
 *
 * @param $val a value passed by-ref; replaced with its lowercase form.
 * @return void since it modifies a by-ref value.
 */
function casenormalize(&$val){
    $lowered = strtolower($val);
    $val = $lowered;
}
94
/**
 * Advances past any whitespace starting at $offset and returns the
 * position of the next non-whitespace character.
 *
 * @param $body   the string
 * @param $offset the offset within the string where we should start
 *                looking for the next non-whitespace character.
 * @return the location within $body of the next non-whitespace char
 *         ($offset itself when there is no whitespace to skip).
 */
function skipspace($body, $offset){
    $me = 'skipspace';
    $matches = array();
    // The cast covers substr() returning false past the end on old PHP.
    preg_match('/^(\s*)/s', (string) substr($body, $offset), $matches);
    // The original tested sizeof() of the matched *string*, which is
    // always 1 on PHP < 7.2 and a TypeError on PHP 8; measure its length.
    $count = isset($matches[1]) ? strlen($matches[1]) : 0;
    if ($count > 0){
        spew("$me: skipped $count chars\n");
        $offset += $count;
    }
    return $offset;
}
115
/**
 * Finds the next occurrence of a string inside another one. A thin
 * wrapper around strpos() that reports "not found" as the length of
 * the body instead of false.
 *
 * @param $body   The string to look for needle in.
 * @param $offset Start looking from this position.
 * @param $needle The character/string to look for.
 * @return location of the next occurrence of the needle, or
 *         strlen($body) if needle wasn't found.
 */
function findnxstr($body, $offset, $needle){
    $me = 'findnxstr';
    $pos = strpos($body, $needle, $offset);
    if (false === $pos){
        // Not found: callers treat the end of the body as the sentinel.
        $pos = strlen($body);
        spew("$me: end of body reached\n");
    }
    spew("$me: '$needle' found at pos $pos\n");
    return $pos;
}
137
/**
 * Finds the first place at or after $offset where a PCRE fragment
 * matches, and reports what preceded it.
 *
 * The fragment is embedded into '%^(.*?)(' . $reg . ')%s', so it must
 * not contain an unescaped '%'.
 *
 * @param $body   The string to look for needle in.
 * @param $offset Start looking from here.
 * @param $reg    A PCRE-style regex fragment to match (no delimiters).
 * @return false if no match was found, or an array with the
 *         following members:
 *         - integer with the location of the match within $body
 *         - string with whatever content lies between offset and the match
 *         - string with whatever it is we matched
 */
function findnxreg($body, $offset, $reg){
    $me = 'findnxreg';
    $matches = array();
    $preg_rule = '%^(.*?)(' . $reg . ')%s';
    // The cast covers substr() returning false past the end on old PHP.
    preg_match($preg_rule, (string) substr($body, $offset), $matches);
    if (!isset($matches[0])){
        spew("$me: No matches found.\n");
        return false;
    }
    // Square-bracket offsets: the {} form is a parse error on PHP 8.
    spew("$me: '$reg' found at pos $offset matching '" . $matches[2] . "'\n");
    return array(
        $offset + strlen($matches[1]),
        $matches[1],
        $matches[2]
    );
}
168
/**
 * Locates and parses the next tag in $body at or after $offset.
 *
 * There are 3 kinds of tags:
 *   1. Opening tag, e.g. <a href="blah">
 *   2. Closing tag, e.g. </a>
 *   3. XHTML-style content-less tag, e.g. <img src="blah"/>
 *
 * Attribute values are returned with their quotes:
 *   attr='CDATA'  and  attr="CDATA"  are kept as they are,
 *   attr=CDATA    has '"' turned into &quot; and is wrapped in double quotes,
 *   attr          (no value) becomes attr="yes".
 *
 * Comments, SGML declarations and malformed tags are reported as
 * "invalid" so that the caller strips them.
 *
 * @param $body   String where to look for the next tag.
 * @param $offset Start looking from here.
 * @return false if no more tags exist in the body, or
 *         an array with the following members:
 *         - string with the name of the tag (lower-cased)
 *         - array with attributes and their values (false if none)
 *         - integer with tag type (1, 2, or 3)
 *         - integer where the tag starts (starting "<")
 *         - integer where the tag ends (ending ">")
 *         The first three members will be false if the tag is invalid.
 */
function getnxtag($body, $offset){
    $me = 'getnxtag';
    $bodylen = strlen($body);
    if ($offset > $bodylen){
        spew("$me: Past the end of body\n");
        return false;
    }
    $lt = findnxstr($body, $offset, '<');
    if ($lt == $bodylen){
        spew("$me: No more tags found!\n");
        return false;
    }
    /*
     * We are here:
     * blah blah <tag attribute="value">
     * \---------^
     */
    spew("$me: Found '<' at pos $lt\n");
    $pos = skipspace($body, $lt + 1);
    if ($pos >= $bodylen){
        spew("$me: End of body reached.\n");
        return array(false, false, false, $lt, $bodylen);
    }

    /*
     * Work out the tag type from the first character after "<".
     */
    $tagtype = false;
    switch (substr($body, $pos, 1)){
        case '/':
            spew("$me: This is a closing tag (type 2)\n");
            $tagtype = 2;
            $pos++;
            break;
        case '!':
            /*
             * A comment or an SGML declaration. Both are stripped.
             */
            if (substr($body, $pos + 1, 2) === '--'){
                spew("$me: A comment found. Stripping.\n");
                $gt = strpos($body, '-->', $pos);
                // Point at the ">" of "-->", or at the end of the body
                // when the comment is never closed.
                $gt = ($gt === false) ? $bodylen : $gt + 2;
            } else {
                spew("$me: An SGML declaration found. Stripping.\n");
                $gt = findnxstr($body, $pos, '>');
            }
            return array(false, false, false, $lt, $gt);
        default:
            /*
             * Assume tagtype 1 for now. If it's type 3, we'll switch
             * values later.
             */
            $tagtype = 1;
            break;
    }

    /*
     * Look for the next [\W-_], which indicates the end of the tag name.
     */
    $regary = findnxreg($body, $pos, '[^\w\-_]');
    if ($regary === false){
        spew("$me: End of body reached while analyzing tag name\n");
        return array(false, false, false, $lt, $bodylen);
    }
    list($pos, $tagname, $match) = $regary;
    $tagname = strtolower($tagname);

    /*
     * $match can be either of these:
     * '>'  indicating the end of the tag entirely.
     * '\s' indicating the end of the tag name.
     * '/'  indicating that this is a type-3 xhtml tag.
     *
     * Whatever else we find there indicates an invalid tag.
     */
    if ($match === '/' || $match === '>'){
        if ($match === '/'){
            /*
             * An xhtml-style tag such as <br/>. The slash must be
             * followed directly by ">", otherwise the tag is invalid.
             */
            if (substr($body, $pos, 2) !== '/>'){
                spew("$me: Found invalid character '/'.\n");
                $gt = findnxstr($body, $pos, '>');
                spew("$me: Tag is invalid. Returning.\n");
                return array(false, false, false, $lt, $gt);
            }
            spew("$me: XHTML-style tag found.\n");
            $pos++;
            spew("$me: Setting tagtype to 3\n");
            $tagtype = 3;
        }
        spew("$me: End of tag found at $pos\n");
        spew("$me: Tagname is '$tagname'\n");
        spew("$me: This tag has no attributes\n");
        return array($tagname, false, $tagtype, $lt, $pos);
    }
    if (!preg_match('/\s/', $match)){
        /*
         * This is an invalid tag! Look for the next closing ">".
         */
        spew("$me: Invalid characters found in tag name: $match\n");
        $gt = findnxstr($body, $lt, '>');
        return array(false, false, false, $lt, $gt);
    }
    spew("$me: Tagname is '$tagname'\n");

    /*
     * At this point we're here:
     * <tagname attribute='blah'>
     * \-------^
     *
     * Loop in order to find all attributes.
     */
    $attary = array();

    while ($pos <= $bodylen){
        $pos = skipspace($body, $pos);
        if ($pos == $bodylen){
            /*
             * Non-closed tag.
             */
            spew("$me: End of body reached before end of tag. Discarding.\n");
            return array(false, false, false, $lt, $pos);
        }
        /*
         * See if we arrived at a ">" or "/>", which means that we
         * reached the end of the tag. The alternation must be a real
         * "|"; an escaped "\|" would only match a literal pipe.
         */
        $matches = array();
        preg_match('%^(\s*)(>|/>)%s', (string) substr($body, $pos), $matches);
        if (isset($matches[0]) && $matches[0]){
            spew("$me: Arrived at the end of the tag.\n");
            $pos += strlen($matches[1]);
            if ($matches[2] === '/>'){
                $tagtype = 3;
                $pos++;
            }
            return array($tagname, $attary, $tagtype, $lt, $pos);
        }

        /*
         * Read the attribute name: everything up to the first
         * character that is not [\w\-_].
         */
        $regary = findnxreg($body, $pos, '[^\w\-_]');
        if ($regary === false){
            /*
             * Looks like body ended before the end of tag.
             */
            spew("$me: End of body found before end of tag.\n");
            spew("$me: Invalid, returning\n");
            return array(false, false, false, $lt, $bodylen);
        }
        list($pos, $attname, $match) = $regary;
        $attname = strtolower($attname);
        spew("$me: Attribute '$attname' found\n");

        /*
         * We arrived at the end of the attribute name. Several things
         * are possible here:
         * '>'  means the end of the tag and a value-less attribute
         * '/'  if followed by '>' means the same thing as above
         * '\s' means a lot of things -- look what it's followed by.
         * anything else is examined below.
         */
        if ($match === '/' || $match === '>'){
            if ($match === '/'){
                if (substr($body, $pos, 2) !== '/>'){
                    spew("$me: Found invalid character '/'.\n");
                    $gt = findnxstr($body, $pos, '>');
                    spew("$me: Tag is invalid. Returning.\n");
                    return array(false, false, false, $lt, $gt);
                }
                spew("$me: This is an xhtml-style tag.\n");
                $pos++;
                spew("$me: Setting tagtype to 3\n");
                $tagtype = 3;
            }
            spew("$me: found type 4 attribute.\n");
            spew("$me: Additionally, end of tag found at $pos\n");
            spew("$me: Attname is '$attname'\n");
            spew("$me: Setting attvalue to 'yes'\n");
            $attary[$attname] = '"yes"';
            return array($tagname, $attary, $tagtype, $lt, $pos);
        }

        /*
         * Skip whitespace and see what we arrive at. Two things are
         * valid here:
         * '='          means the attribute has a value.
         * \w, '/', '>' mean this was a value-less attribute; the next
         *              pass of the loop deals with that character.
         * Anything else is illegal and invalidates the tag.
         */
        $pos = skipspace($body, $pos);
        $char = substr($body, $pos, 1);
        if ($char === '='){
            spew("$me: Attribute type 1, 2, or 3 found.\n");
            $pos++;
            $pos = skipspace($body, $pos);
            /*
             * Here are 3 possibilities:
             * "'" single-quoted value (type 1)
             * '"' double-quoted value (type 2)
             * everything else is an unquoted value (type 3)
             */
            $quot = substr($body, $pos, 1);
            if ($quot === '\''){
                spew("$me: In fact, this is attribute type 1\n");
                spew("$me: looking for closing quote\n");
                $regary = findnxreg($body, $pos + 1, '\'');
                if ($regary === false){
                    spew("$me: end of body reached before end of val\n");
                    spew("$me: Returning\n");
                    return array(false, false, false, $lt, $bodylen);
                }
                list($pos, $attval, $match) = $regary;
                spew("$me: Attvalue is '$attval'\n");
                $pos++;
                $attary[$attname] = '\'' . $attval . '\'';
            } else if ($quot === '"'){
                spew("$me: In fact, this is attribute type 2\n");
                spew("$me: looking for closing quote\n");
                $regary = findnxreg($body, $pos + 1, '"');
                if ($regary === false){
                    spew("$me: end of body reached before end of val\n");
                    spew("$me: Returning\n");
                    return array(false, false, false, $lt, $bodylen);
                }
                list($pos, $attval, $match) = $regary;
                spew("$me: Attvalue is \"$attval\"\n");
                $pos++;
                $attary[$attname] = '"' . $attval . '"';
            } else {
                spew("$me: This looks like attribute type 3\n");
                /*
                 * These are hateful. Look for \s, or >.
                 */
                spew("$me: Looking for end of attval\n");
                $regary = findnxreg($body, $pos, '[\s>]');
                if ($regary === false){
                    spew("$me: end of body reached before end of val\n");
                    spew("$me: Returning\n");
                    return array(false, false, false, $lt, $bodylen);
                }
                list($pos, $attval, $match) = $regary;
                /*
                 * If it's ">" it will be caught at the top of the loop.
                 * Encode embedded double quotes so that wrapping the
                 * value in double quotes cannot be broken out of.
                 */
                spew("$me: translating '\"' into &quot;\n");
                $attval = str_replace('"', '&quot;', $attval);
                spew("$me: wrapping in quotes\n");
                $attary[$attname] = '"' . $attval . '"';
            }
        } else if (preg_match('%[\w/>]%', $char)){
            /*
             * That was a value-less attribute.
             */
            spew("$me: attribute type 4 found.\n");
            spew("$me: Setting value to 'yes'\n");
            $attary[$attname] = '"yes"';
        } else {
            /*
             * An illegal character. Find next '>' and return.
             */
            spew("$me: illegal character '$char' found.\n");
            spew("$me: returning\n");
            $gt = findnxstr($body, $pos, '>');
            return array(false, false, false, $lt, $gt);
        }
    }
    /*
     * The fact that we got here indicates that the tag end was never
     * found. Return invalid tag indication so it gets stripped.
     */
    spew("$me: No tag end found\n");
    return array(false, false, false, $lt, $bodylen);
}
511
/**
 * Translates numeric entities/escapes into literal characters so the
 * value can be checked.
 *
 * Every full match of $regex is replaced by the single byte whose code
 * is the number captured in group 1 (passed through chr(), so codes
 * above 255 wrap around).
 *
 * @param $attvalue the by-ref value to translate in place.
 * @param $regex    the regular expression to check against; group 1
 *                  must capture the number.
 * @param $hex      whether the captured number is hexadecimal.
 * @return True or False depending on whether there were matches.
 */
function deent(&$attvalue, $regex, $hex=false){
    $me = 'deent';
    spew("$me: matching '$regex' against: $attvalue\n");
    $matches = array();
    preg_match_all($regex, $attvalue, $matches);
    if (!is_array($matches) || !isset($matches[0]) || sizeof($matches[0]) == 0){
        spew("$me: no matches! Returning false.\n");
        return false;
    }
    spew("$me: found " . sizeof($matches[0]) . " matches\n");
    // Map each matched entity text to its decoded character.
    $repl = array();
    foreach ($matches[0] as $i => $entity){
        $numval = $matches[1][$i];
        spew("$me: numval is $numval\n");
        if ($hex){
            $numval = hexdec($numval);
            spew("$me: hex! Numval is now $numval\n");
        }
        // Square-bracket offset: the {} form is a parse error on PHP 8.
        $repl[$entity] = chr((int) $numval);
    }
    $attvalue = strtr($attvalue, $repl);
    spew("$me: attvalue after translation: $attvalue\n");
    return true;
}
545
/**
 * Decodes entity- and backslash-encoded characters in an attribute
 * value so that the later checks see what a browser would see.
 *
 * Handles decimal entities (&#106;), hexadecimal entities (&#x6a;) and
 * backslash-number escapes (\65), repeating until nothing more can be
 * decoded, then strips remaining backslashes.
 *
 * @param $attvalue A string to run entity check against.
 * @return Nothing, modifies a reference value.
 */
function defang(&$attvalue){
    $me = 'defang';
    /*
     * Skip this if there aren't ampersands or backslashes.
     */
    spew("$me: Checking '$attvalue' for suspicious content\n");
    if (strpos($attvalue, '&') === false
        && strpos($attvalue, '\\') === false){
        spew("$me: no suspicious content found, returning.\n");
        return;
    }
    do {
        /*
         * At most one decoder succeeds per pass (short-circuit); the
         * loop repeats until all three report no matches, which also
         * unwraps nested encodings such as &#38;#106;.
         *
         * Leading zeros ("0*") and the trailing semicolon (";*") are
         * optional so that plain forms like "&#106;" and padded or
         * unterminated forms like "&#0000106" are all decoded.
         */
        $m = deent($attvalue, '/\&#0*(\d+);*/s')
            || deent($attvalue, '/\&#x0*((\d|[a-f])+);*/si', true)
            || deent($attvalue, '/\\\\(\d+)/s', true);
        spew("$me: m=$m\n");
    } while ($m == true);
    $attvalue = stripslashes($attvalue);
    spew("$me: translated into: $attvalue\n");
}
576
/**
 * Removes tabs, carriage returns, newlines, NUL bytes and spaces from
 * an attribute value. Some browsers treat "java[tab]script" exactly
 * like "javascript", so the checks must run on the squeezed value.
 *
 * @param attvalue The attribute value before extraneous spaces removed.
 * @return Nothing, modifies a reference value.
 */
function unspace(&$attvalue){
    $me = 'unspace';
    $junk = array("\t", "\r", "\n", "\0", " ");
    // Length of the leading run that contains none of the junk chars;
    // if it covers the whole string there is nothing to remove.
    $clean_prefix = strcspn($attvalue, "\t\r\n\0 ");
    if ($clean_prefix != strlen($attvalue)){
        spew("$me: Killing whitespace.\n");
        $attvalue = str_replace($junk, '', $attvalue);
    }
    spew("$me: after unspace: $attvalue\n");
}
594
/**
 * Runs the attribute-level checks for one tag: drops forbidden
 * attributes, rewrites dangerous values and appends forced attributes.
 *
 * Values are decoded (defang) and squeezed (unspace) only for the
 * purpose of matching; an attribute's stored value changes only when a
 * $bad_attvals rule actually rewrites it.
 *
 * @param $tagname         String with the name of the tag.
 * @param $attary          Array with all tag attributes.
 * @param $rm_attnames     See description for sanitize
 * @param $bad_attvals     See description for sanitize
 * @param $add_attr_to_tag See description for sanitize
 * @return Array with modified attributes.
 */
function fixatts($tagname,
                 $attary,
                 $rm_attnames,
                 $bad_attvals,
                 $add_attr_to_tag
                 ){
    $me = 'fixatts';
    spew("$me: Fixing attributes\n");
    // foreach iterates over a copy, so unsetting entries of $attary
    // inside the loop is safe. (each() no longer exists in PHP 8.)
    foreach ($attary as $attname => $attvalue){
        // Numeric-looking names become integer keys; match on a string.
        $attname_str = (string) $attname;

        /*
         * See if this attribute should be removed.
         */
        $remove = false;
        foreach ($rm_attnames as $matchtag => $matchattrs){
            if (!preg_match($matchtag, $tagname)){
                continue;
            }
            foreach ($matchattrs as $matchattr){
                if (preg_match($matchattr, $attname_str)){
                    $remove = true;
                    break 2;
                }
            }
        }
        if ($remove){
            spew("$me: Attribute '$attname' defined as bad.\n");
            spew("$me: Removing.\n");
            unset($attary[$attname]);
            // Move on to the next attribute. The original "continue"
            // only left the innermost loop, so a removed attribute
            // could be re-added by the value checks below.
            continue;
        }

        /*
         * Remove any backslashes, entities, or extraneous whitespace.
         */
        defang($attvalue);
        unspace($attvalue);

        /*
         * Now run the value checks. Each rule is a pair of arrays:
         * the patterns to look for and their replacements.
         */
        foreach ($bad_attvals as $matchtag => $matchattrs){
            if (!preg_match($matchtag, $tagname)){
                continue;
            }
            foreach ($matchattrs as $matchattr => $valary){
                if (!preg_match($matchattr, $attname_str)){
                    continue;
                }
                list($valmatch, $valrepl) = $valary;
                $newvalue = preg_replace($valmatch, $valrepl, $attvalue);
                if ($newvalue !== $attvalue){
                    spew("$me: attvalue is now $newvalue\n");
                    $attary[$attname] = $newvalue;
                }
            }
        }
    }
    /*
     * See if we need to append any attributes to this tag.
     */
    foreach ($add_attr_to_tag as $matchtag => $addattary){
        if (preg_match($matchtag, $tagname)){
            $attary = array_merge($attary, $addattary);
            spew("$me: Added attributes to this tag\n");
        }
    }
    return $attary;
}
672
673	/**
674	* This is the main function and the one you should actually be calling.
675	* There are several variables you should be aware of and which need
676	* special description.
677	*
678	* $tag_list
679	* ----------
680	* This is a simple one-dimensional array of strings, except for the
681	* very first one. The first member should be either false or true.
682	* In case it's FALSE, the following list will be considered a list of
683	* tags that should be explicitly REMOVED from the body, and all
684	* others that did not match the list will be allowed. If the first
685	* member is TRUE, then the list is the list of tags that should be
686	* explicitly ALLOWED -- any tag not matching this list will be
687	* discarded.
688	*
689	* Examples:
690	* $tag_list = Array(
691	* false,
692	* "blink",
693	* "link",
694	* "object",
695	* "meta",
696	* "marquee",
697	* "html"
698	* );
699	*
700	* This will allow all tags except for blink, link, object, meta, marquee,
701	* and html.
702	*
703	* $tag_list = Array(
704	* true,
705	* "b",
706	* "a",
707	* "i",
708	* "img",
709	* "strong",
710	* "em",
711	* "p"
712	* );
713	*
714	* This will remove all tags from the body except b, a, i, img, strong, em and
715	* p.
716	*
717	* $rm_tags_with_content
718	* ---------------------
719	* This is a simple one-dimensional array of strings, which specifies the
720	* tags to be removed with any and all content between the beginning and
721	* the end of the tag.
722	* Example:
723	* $rm_tags_with_content = Array(
724	* "script",
725	* "style",
726	* "applet",
727	* "embed"
728	* );
729	*
730	* This will remove the following structure:
731	* <script>
732	* window.alert("Isn't cross-site-scripting fun?!");
733	* </script>
734	*
735	* $self_closing_tags
736	* ------------------
737	* This is a simple one-dimensional array of strings, which specifies which
738	* tags contain no content and should not be forcefully closed if this option
739	* is turned on (see further).
740	* Example:
741	* $self_closing_tags = Array(
742	* "img",
743	* "br",
744	* "hr",
745	* "input"
746	* );
747	*
748	* $force_tag_closing
749	* ------------------
750	* Set it to true to forcefully close any tags opened within the document.
751	* This is good if you want to take care of people who like to screw up
752	* the pages by leaving unclosed tags like <a>, <b>, <i>, etc.
753	*
754	* $rm_attnames
755	* -------------
756	* Now we come to parameters that are more obscure. This parameter is
757	* a nested array which is used to specify which attributes should be
758	* removed. It goes like so:
759	*
760	* $rm_attnames = Array(
761	* "PCRE regex to match tag name" =>
762	* Array(
763	* "PCRE regex to match attribute name"
764	* )
765	* );
766	*
767	* Example:
768	* $rm_attnames = Array(
769	* "|.*|" =>
770	* Array(
771	* "|target|i",
772	* "|^on.*|i"
773	* )
774	* );
775	*
776	* This will match all tags (.*), and specify that all attributes
777	* named "target" and starting with "on" should be removed. This will take
778	* care of the following problem:
779	* <em onmouseover="window.alert('muahahahaha')">
780	* The "onmouseover" will be removed.
781	*
782	* $bad_attvals
783	* ------------
784	* This is where it gets ugly. This is a nested array with many levels.
785	* It goes like so:
786	*
787	* $bad_attvals = Array(
788	* "pcre regex to match tag name" =>
789	* Array(
790	* "pcre regex to match attribute name" =>
791	* Array(
792	* "pcre regex to match attribute value"
793	* )
794	* Array(
795	* "pcre regex replace a match from above with"
796	* )
797	* )
798	* );
799	*
800	* An extensive example:
801	*
802	* $bad_attvals = Array(
803	* "|.*|" =>
804	* Array(
805	* "/^src|background|href|action/i" =>
806	* Array(
807	* Array(
808	* "/^([\'\"])\s*\S+script\s*:.*([\'\"])/si"
809	* ),
810	* Array(
811	* "\\1http://veryfunny.com/\\2"
812	* )
813	* ),
814	* "/^style/i" =>
815	* Array(
816	* Array(
817	* "/expression/si",
818	* "/url\(([\'\"])\s*https*:.*([\'\"])\)/si",
819	* "/url\(([\'\"])\s*\S+script:.*([\'\"])\)/si"
820	* ),
821	* Array(
822	* "idiocy",
823	* "url(\\1http://veryfunny.com/\\2)",
824	* "url(\\1http://veryfynny.com/\\2)"
825	* )
826	* )
827	* )
828	* );
829	*
830	* This will take care of nearly all known cross-site scripting exploits,
831	* plus some (see my filter sample at
832	* http://www.mricon.com/html/phpfilter.html for a working version).
833	*
834	* $add_attr_to_tag
835	* ----------------
836	* This is a useful little feature which lets you add attributes to
837	* certain tags. It is a nested array as well, but not at all like
838	* the previous one. It goes like so:
839	*
840	* $add_attr_to_tag = Array(
841	* "PCRE regex to match tag name" =>
842	* Array(
843	* "attribute name"=>'"attribute value"'
844	* )
845	* );
846	*
847	* Note: don't forget quotes around attribute value.
848	*
849	* Example:
850	*
851	* $add_attr_to_tag = Array(
852	* "/^a$/si" =>
853	* Array(
854	* 'target'=>'"_new"'
855	* )
856	* );
857	*
858	* This will change all <a> tags and add target="_new" to them so all links
859	* open in a new window.
860	*
861	*
862	*
863	* @param $body the string with HTML you wish to filter
864	* @param $tag_list see description above
865	* @param $rm_tags_with_content see description above
866	* @param $self_closing_tags see description above
867	* @param $force_tag_closing see description above
868	* @param $rm_attnames see description above
869	* @param $bad_attvals see description above
870	* @param $add_attr_to_tag see description above
871	* @return sanitized html safe to show on your pages.
872	*/
function sanitize($body,
                  $tag_list,
                  $rm_tags_with_content,
                  $self_closing_tags,
                  $force_tag_closing,
                  $rm_attnames,
                  $bad_attvals,
                  $add_attr_to_tag
                  ){
    $me = 'sanitize';
    /*
     * Normalize the tag-name lists to lowercase, since tag names coming
     * out of getnxtag() are lowercase. A non-array argument is treated
     * as an empty list instead of being silenced with "@".
     */
    if (is_array($tag_list)){
        array_walk($tag_list, 'casenormalize');
    } else {
        $tag_list = array();
    }
    if (is_array($rm_tags_with_content)){
        array_walk($rm_tags_with_content, 'casenormalize');
    } else {
        $rm_tags_with_content = array();
    }
    if (is_array($self_closing_tags)){
        array_walk($self_closing_tags, 'casenormalize');
    } else {
        $self_closing_tags = array();
    }
    /*
     * The first member of tag_list is the mode flag:
     * false means remove the listed tags
     * true means allow only the listed tags
     * (casenormalize() turned it into '' or '1', hence the loose
     * comparisons against true/false further down.)
     */
    $rm_tags = array_shift($tag_list);
    $curpos = 0;
    // Per-tag-name count of tags opened and not yet closed.
    $open_tags = array();
    $trusted = "";
    // false, or the name of the tag whose content is being dropped.
    $skip_content = false;
    /*
     * Take care of netscape's javascript entities like
     * &{alert('boo')};
     * by encoding the leading ampersand.
     */
    $body = preg_replace('/&(\{.*?\};)/si', '&amp;\\1', $body);
    spew("$me: invoking the loop\n");
    while (($curtag = getnxtag($body, $curpos)) !== false){
        list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
        spew("$me: grabbing free-standing content\n");
        $free_content = (string) substr($body, $curpos, $lt - $curpos);
        spew("$me: " . strlen($free_content) . " chars grabbed\n");
        if ($skip_content === false){
            spew("$me: appending free content to trusted.\n");
            $trusted .= $free_content;
        } else {
            spew("$me: Skipping free content.\n");
        }
        // Loose test on purpose: an invalid tag has $tagname === false
        // and an empty name ("<>") is treated the same way.
        if ($tagname != false){
            spew("$me: tagname is '$tagname'\n");
            if ($tagtype == 2){
                spew("$me: This is a closing tag\n");
                if ($skip_content === $tagname){
                    /*
                     * Got to the end of tag we needed to remove.
                     */
                    spew("$me: Finished removing tag with content\n");
                    $tagname = false;
                    $skip_content = false;
                } else if ($skip_content === false){
                    if (isset($open_tags[$tagname])
                        && $open_tags[$tagname] > 0){
                        spew("$me: popping '$tagname' from open_tags\n");
                        $open_tags[$tagname]--;
                    } else {
                        spew("$me: '$tagname' was never opened\n");
                        spew("$me: removing\n");
                        $tagname = false;
                    }
                } else {
                    spew("$me: Skipping this tag\n");
                }
            } else if ($skip_content === false){
                /*
                 * See if this is a self-closing type and change
                 * tagtype appropriately.
                 */
                if ($tagtype == 1
                    && in_array($tagname, $self_closing_tags)){
                    spew("$me: Self-closing tag. Changing tagtype.\n");
                    $tagtype = 3;
                }
                /*
                 * See if we should skip this tag and any content
                 * inside it.
                 */
                if ($tagtype == 1
                    && in_array($tagname, $rm_tags_with_content)){
                    spew("$me: removing this tag with content\n");
                    $skip_content = $tagname;
                } else if (($rm_tags == false
                            && in_array($tagname, $tag_list))
                           || ($rm_tags == true
                               && !in_array($tagname, $tag_list))){
                    spew("$me: Removing this tag.\n");
                    $tagname = false;
                } else {
                    if ($tagtype == 1){
                        spew("$me: adding '$tagname' to open_tags\n");
                        if (isset($open_tags[$tagname])){
                            $open_tags[$tagname]++;
                        } else {
                            $open_tags[$tagname] = 1;
                        }
                    }
                    /*
                     * Attribute checks run only for tags that already
                     * have at least one attribute.
                     */
                    if (is_array($attary) && sizeof($attary) > 0){
                        $attary = fixatts($tagname,
                                          $attary,
                                          $rm_attnames,
                                          $bad_attvals,
                                          $add_attr_to_tag);
                    }
                }
            } else {
                spew("$me: Skipping this tag\n");
            }
            if ($tagname != false && $skip_content === false){
                spew("$me: Appending tag to trusted.\n");
                $trusted .= tagprint($tagname, $attary, $tagtype);
            }
        } else {
            spew("$me: Removing invalid tag\n");
        }
        // Continue right after the ">" that ended this tag.
        $curpos = $gt + 1;
    }
    spew("$me: Appending any leftover content\n");
    $trusted .= (string) substr($body, $curpos, strlen($body) - $curpos);
    if ($force_tag_closing == true){
        foreach ($open_tags as $tagname => $opentimes){
            while ($opentimes > 0){
                spew("$me: '$tagname' left open. Closing by force.\n");
                $trusted .= '</' . $tagname . '>';
                $opentimes--;
            }
        }
        $trusted .= "\n";
    }
    return $trusted;
}
1021	?>

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format