Context Navigation

htmlfilter.inc @ 3

Revision 3, 35.9 KB checked in by niltonneto, 18 years ago (diff)
* empty log message *
Property svn:eol-style set to `native` Property svn:executable set to ``*

Rev	Line
[2]	1	<?php
	2	/**
	3	* htmlfilter.inc
	4	* ---------------
	5	* This set of functions allows you to filter html in order to remove
	6	* any malicious tags from it. Useful in cases when you need to filter
	7	* user input for any cross-site-scripting attempts.
	8	*
	9	* Copyright (C) 2002-2004 by Duke University
	10	*
	11	* This library is free software; you can redistribute it and/or
	12	* modify it under the terms of the GNU Lesser General Public
	13	* License as published by the Free Software Foundation; either
	14	* version 2.1 of the License, or (at your option) any later version.
	15	*
	16	* This library is distributed in the hope that it will be useful,
	17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	19	* Lesser General Public License for more details.
	20	*
	21	* You should have received a copy of the GNU Lesser General Public
	22	* License along with this library; if not, write to the Free Software
	23	* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
	24	* 02110-1301 USA
	25	*
	26	* @Author Konstantin Riabitsev <icon@linux.duke.edu>
	27	*/
	28
	/**
	 * Debug tracer used by every function in this file.
	 *
	 * Output is produced only when the caller has defined a global variable
	 * named "debug" and set it to true before calling sanitize().
	 *
	 * Note: even with $debug switched off each call still costs a function
	 * invocation. To strip every debugging call from the library run:
	 *
	 *   fgrep -v 'spew("' htmlfilter.inc > htmlfilter.inc.new
	 *
	 * htmlfilter.inc.new will then contain no debugging calls. (For that to
	 * keep working, every spew() call must stay on a single line of its own.)
	 *
	 * @param $message A string with the message to output.
	 * @return void.
	 */
	function spew($message){
	    global $debug;
	    if ($debug != true){
	        // Debugging is switched off: stay silent.
	        return;
	    }
	    print "$message";
	}
	51
	/**
	 * Builds the textual form of a tag from its name, its attributes and its
	 * type. Called internally by sanitize().
	 *
	 * Attribute values are emitted verbatim after "name=", so they must
	 * already carry their own quotes (getnxtag() stores them that way).
	 *
	 * @param $tagname the name of the tag.
	 * @param $attary  array of attribute name => (already quoted) value;
	 *                 anything that is not a non-empty array means
	 *                 "no attributes".
	 * @param $tagtype The type of the tag: 1 = opening, 2 = closing,
	 *                 3 = XHTML-style self-closing.
	 * @return a string with the final tag representation.
	 */
	function tagprint($tagname, $attary, $tagtype){
	    $me = 'tagprint';
	    if ($tagtype == 2){
	        // Closing tags never carry attributes.
	        $fulltag = '</' . $tagname . '>';
	    } else {
	        $fulltag = '<' . $tagname;
	        if (is_array($attary) && sizeof($attary)){
	            $atts = array();
	            // foreach instead of each(): each() no longer exists in
	            // PHP 8 and also depended on the array's internal pointer.
	            foreach ($attary as $attname => $attvalue){
	                $atts[] = "$attname=$attvalue";
	            }
	            $fulltag .= ' ' . join(' ', $atts);
	        }
	        if ($tagtype == 3){
	            $fulltag .= ' /';
	        }
	        $fulltag .= '>';
	    }
	    spew("$me: $fulltag\n");
	    return $fulltag;
	}
	83
	/**
	 * Callback for array_walk(): lowercases the element it is handed, in
	 * place.
	 *
	 * @param $val a value passed by-ref.
	 * @return void since it modifies a by-ref value.
	 */
	function casenormalize(&$val){
	    $lowered = strtolower($val);
	    $val = $lowered;
	}
	94
	/**
	 * Skips any whitespace starting at the given position and reports where
	 * the next non-whitespace character is.
	 *
	 * @param $body   the string
	 * @param $offset the offset within the string where we should start
	 *                looking for the next non-whitespace character.
	 * @return the location within $body where the next non-whitespace char
	 *         is located ($offset itself when it does not point at
	 *         whitespace).
	 */
	function skipspace($body, $offset){
	    $me = 'skipspace';
	    $matches = array();
	    preg_match('/^(\s*)/s', substr($body, $offset), $matches);
	    // Measure the run with strlen(): the previous sizeof() call on a
	    // string was not a length test and is an error on current PHP.
	    $count = isset($matches[1]) ? strlen($matches[1]) : 0;
	    if ($count > 0){
	        spew("$me: skipped $count chars\n");
	        $offset += $count;
	    }
	    return $offset;
	}
	115
	/**
	 * Looks for the next occurrence of a string within another string. It is
	 * a thin wrapper around strpos() that turns "not found" into "end of
	 * body" so callers can always treat the result as a position.
	 *
	 * @param $body   The string to look for needle in.
	 * @param $offset Start looking from this position.
	 * @param $needle The character/string to look for.
	 * @return location of the next occurrence of the needle, or
	 *         strlen($body) if needle wasn't found (this includes the case
	 *         where $offset already lies past the end of $body).
	 */
	function findnxstr($body, $offset, $needle){
	    $me = 'findnxstr';
	    $end = strlen($body);
	    // An offset beyond the end cannot match anything; answer directly
	    // instead of handing strpos() an out-of-range offset.
	    $pos = ($offset > $end) ? false : strpos($body, $needle, $offset);
	    if ($pos === false){
	        spew("$me: end of body reached\n");
	        return $end;
	    }
	    spew("$me: '$needle' found at pos $pos\n");
	    return $pos;
	}
	137
	/**
	 * Takes a PCRE-style regexp (without delimiters) and finds its first
	 * match at or after a given position in the string.
	 *
	 * The expression is embedded into a pattern delimited by "%", so $reg
	 * must not contain an unescaped "%".
	 *
	 * @param $body   The string to look for needle in.
	 * @param $offset Start looking from here.
	 * @param $reg    A PCRE-style regex to match.
	 * @return Returns false if no matches found, or an array
	 *         with the following members:
	 *         - integer with the location of the match within $body
	 *         - string with whatever content between offset and the match
	 *         - string with whatever it is we matched
	 */
	function findnxreg($body, $offset, $reg){
	    $me = 'findnxreg';
	    $matches = array();
	    // Lazy prefix + wanted expression: group 1 is everything skipped,
	    // group 2 is the match itself.
	    $preg_rule = '%^(.*?)(' . $reg . ')%s';
	    preg_match($preg_rule, substr($body, $offset), $matches);
	    if (!isset($matches[0])){
	        spew("$me: No matches found.\n");
	        return false;
	    }
	    spew("$me: '$reg' found at pos $offset matching '" . $matches[2] . "'\n");
	    return array(
	        $offset + strlen($matches[1]),
	        $matches[1],
	        $matches[2]
	    );
	}
	168
	/**
	 * Looks for the next tag in the body and parses it.
	 *
	 * @param $body   String where to look for the next tag.
	 * @param $offset Start looking from here.
	 * @return false if no more tags exist in the body, or
	 *         an array with the following members:
	 *         - string with the name of the tag (lowercased)
	 *         - array with attributes and their values (values keep or are
	 *           given surrounding quotes), or false when the tag has none
	 *         - integer with tag type (1, 2, or 3)
	 *         - integer where the tag starts (starting "<")
	 *         - integer where the tag ends (ending ">")
	 *         first three members will be false, if the tag is invalid.
	 */
	function getnxtag($body, $offset){
	    $me = 'getnxtag';
	    $bodylen = strlen($body);
	    if ($offset > $bodylen){
	        spew("$me: Past the end of body\n");
	        return false;
	    }
	    $lt = findnxstr($body, $offset, '<');
	    if ($lt == $bodylen){
	        spew("$me: No more tags found!\n");
	        return false;
	    }
	    /**
	     * We are here:
	     * blah blah <tag attribute="value">
	     * \---------^
	     */
	    spew("$me: Found '<' at pos $lt\n");
	    $pos = skipspace($body, $lt + 1);
	    if ($pos >= $bodylen){
	        spew("$me: End of body reached.\n");
	        return array(false, false, false, $lt, $bodylen);
	    }
	    /**
	     * There are 3 kinds of tags:
	     * 1. Opening tag, e.g.:
	     *    <a href="blah">
	     * 2. Closing tag, e.g.:
	     *    </a>
	     * 3. XHTML-style content-less tag, e.g.:
	     *    <img src="blah"/>
	     */
	    $tagtype = false;
	    switch (substr($body, $pos, 1)){
	        case '/':
	            spew("$me: This is a closing tag (type 2)\n");
	            $tagtype = 2;
	            $pos++;
	            break;
	        case '!':
	            /**
	             * A comment or an SGML declaration. Both are reported as
	             * invalid tags so that the caller strips them.
	             */
	            if (substr($body, $pos + 1, 2) == '--'){
	                spew("$me: A comment found. Stripping.\n");
	                $gt = strpos($body, '-->', $pos);
	                if ($gt === false){
	                    $gt = $bodylen;
	                } else {
	                    // Point at the ">" of the closing "-->".
	                    $gt += 2;
	                }
	                return array(false, false, false, $lt, $gt);
	            }
	            spew("$me: An SGML declaration found. Stripping.\n");
	            $gt = findnxstr($body, $pos, '>');
	            return array(false, false, false, $lt, $gt);
	        default:
	            /**
	             * Assume tagtype 1 for now. If it's type 3, we'll switch
	             * values later.
	             */
	            $tagtype = 1;
	            break;
	    }

	    /**
	     * Look for next [\W-_], which will indicate the end of the tag name.
	     */
	    $regary = findnxreg($body, $pos, '[^\w\-_]');
	    if ($regary === false){
	        spew("$me: End of body reached while analyzing tag name\n");
	        return array(false, false, false, $lt, $bodylen);
	    }
	    list($pos, $tagname, $match) = $regary;
	    $tagname = strtolower($tagname);

	    /**
	     * $match can be either of these:
	     * '>'  indicating the end of the tag entirely.
	     * '\s' indicating the end of the tag name.
	     * '/'  indicating that this is type-3 xhtml tag.
	     *
	     * Whatever else we find there indicates an invalid tag.
	     */
	    switch ($match){
	        case '/':
	            /**
	             * This is an xhtml-style tag with a closing / at the
	             * end, like so: <img src="blah"/>. Check if it's followed
	             * by the closing bracket. If not, then this tag is invalid
	             */
	            if (substr($body, $pos, 2) == '/>'){
	                spew("$me: XHTML-style tag found.\n");
	                $pos++;
	                spew("$me: Setting tagtype to 3\n");
	                $tagtype = 3;
	            } else {
	                spew("$me: Found invalid character '/'.\n");
	                $gt = findnxstr($body, $pos, '>');
	                spew("$me: Tag is invalid. Returning.\n");
	                return array(false, false, false, $lt, $gt);
	            }
	            // Deliberate fall-through: $pos now points at the ">".
	        case '>':
	            spew("$me: End of tag found at $pos\n");
	            spew("$me: Tagname is '$tagname'\n");
	            spew("$me: This tag has no attributes\n");
	            return array($tagname, false, $tagtype, $lt, $pos);
	        default:
	            /**
	             * Check if it's whitespace
	             */
	            if (preg_match('/\s/', $match)){
	                spew("$me: Tagname is '$tagname'\n");
	            } else {
	                /**
	                 * This is an invalid tag! Look for the next closing ">".
	                 */
	                spew("$me: Invalid characters found in tag name: $match\n");
	                $gt = findnxstr($body, $lt, '>');
	                return array(false, false, false, $lt, $gt);
	            }
	    }

	    /**
	     * At this point we're here:
	     * <tagname attribute='blah'>
	     * \-------^
	     *
	     * At this point we loop in order to find all attributes.
	     */
	    $attname = '';
	    $attary = array();

	    while ($pos <= $bodylen){
	        $pos = skipspace($body, $pos);
	        if ($pos == $bodylen){
	            /**
	             * Non-closed tag.
	             */
	            spew("$me: End of body reached before end of tag. Discarding.\n");
	            return array(false, false, false, $lt, $pos);
	        }
	        /**
	         * See if we arrived at a ">" or "/>", which means that we reached
	         * the end of the tag.
	         */
	        $matches = array();
	        preg_match('%^(\s*)(>|/>)%s', substr($body, $pos), $matches);
	        if (isset($matches[0]) && $matches[0]){
	            /**
	             * Yep. So we did.
	             */
	            spew("$me: Arrived at the end of the tag.\n");
	            $pos += strlen($matches[1]);
	            if ($matches[2] == '/>'){
	                $tagtype = 3;
	                $pos++;
	            }
	            return array($tagname, $attary, $tagtype, $lt, $pos);
	        }

	        /**
	         * There are several types of attributes, with optional
	         * [:space:] between members.
	         * Type 1:
	         *   attrname[:space:]=[:space:]'CDATA'
	         * Type 2:
	         *   attrname[:space:]=[:space:]"CDATA"
	         * Type 3:
	         *   attr[:space:]=[:space:]CDATA
	         * Type 4:
	         *   attrname
	         *
	         * We leave types 1 and 2 the same, type 3 we check for
	         * '"' and convert to "&quot;" if needed, then wrap in
	         * double quotes. Type 4 we convert into:
	         * attrname="yes".
	         */
	        $regary = findnxreg($body, $pos, '[^\w\-_]');
	        if ($regary === false){
	            /**
	             * Looks like body ended before the end of tag.
	             */
	            spew("$me: End of body found before end of tag.\n");
	            spew("$me: Invalid, returning\n");
	            return array(false, false, false, $lt, $bodylen);
	        }
	        list($pos, $attname, $match) = $regary;
	        $attname = strtolower($attname);
	        spew("$me: Attribute '$attname' found\n");
	        /**
	         * We arrived at the end of attribute name. Several things possible
	         * here:
	         * '>'  means the end of the tag and this is attribute type 4
	         * '/'  if followed by '>' means the same thing as above
	         * '\s' means a lot of things -- look what it's followed by.
	         *      anything else means the attribute is invalid.
	         */
	        switch ($match){
	            case '/':
	                /**
	                 * This is an xhtml-style tag with a closing / at the
	                 * end, like so: <img src="blah"/>. Check if it's followed
	                 * by the closing bracket. If not, then this tag is invalid
	                 */
	                if (substr($body, $pos, 2) == '/>'){
	                    spew("$me: This is an xhtml-style tag.\n");
	                    $pos++;
	                    spew("$me: Setting tagtype to 3\n");
	                    $tagtype = 3;
	                } else {
	                    spew("$me: Found invalid character '/'.\n");
	                    $gt = findnxstr($body, $pos, '>');
	                    spew("$me: Tag is invalid. Returning.\n");
	                    return array(false, false, false, $lt, $gt);
	                }
	                // Deliberate fall-through: $pos now points at the ">".
	            case '>':
	                spew("$me: found type 4 attribute.\n");
	                spew("$me: Additionally, end of tag found at $pos\n");
	                spew("$me: Attname is '$attname'\n");
	                spew("$me: Setting attvalue to 'yes'\n");
	                $attary[$attname] = '"yes"';
	                return array($tagname, $attary, $tagtype, $lt, $pos);
	            default:
	                /**
	                 * Skip whitespace and see what we arrive at.
	                 */
	                $pos = skipspace($body, $pos);
	                $char = substr($body, $pos, 1);
	                /**
	                 * Two things are valid here:
	                 * '=' means this is attribute type 1 2 or 3.
	                 * \w means this was attribute type 4.
	                 * anything else we ignore and re-loop. End of tag and
	                 * invalid stuff will be caught by our checks at the
	                 * beginning of the loop.
	                 */
	                if ($char == '='){
	                    spew("$me: Attribute type 1, 2, or 3 found.\n");
	                    $pos++;
	                    $pos = skipspace($body, $pos);
	                    /**
	                     * Here are 3 possibilities:
	                     * "'" attribute type 1
	                     * '"' attribute type 2
	                     * everything else is the content of tag type 3
	                     */
	                    $quot = substr($body, $pos, 1);
	                    if ($quot == '\''){
	                        spew("$me: In fact, this is attribute type 1\n");
	                        spew("$me: looking for closing quote\n");
	                        $regary = findnxreg($body, $pos + 1, '\'');
	                        if ($regary === false){
	                            spew("$me: end of body reached before end of val\n");
	                            spew("$me: Returning\n");
	                            return array(false, false, false, $lt, $bodylen);
	                        }
	                        list($pos, $attval, $match) = $regary;
	                        spew("$me: Attvalue is '$attval'\n");
	                        // Step over the closing quote.
	                        $pos++;
	                        $attary[$attname] = '\'' . $attval . '\'';
	                    } else if ($quot == '"'){
	                        spew("$me: In fact, this is attribute type 2\n");
	                        spew("$me: looking for closing quote\n");
	                        $regary = findnxreg($body, $pos + 1, '"');
	                        if ($regary === false){
	                            spew("$me: end of body reached before end of val\n");
	                            spew("$me: Returning\n");
	                            return array(false, false, false, $lt, $bodylen);
	                        }
	                        list($pos, $attval, $match) = $regary;
	                        spew("$me: Attvalue is \"$attval\"\n");
	                        // Step over the closing quote.
	                        $pos++;
	                        $attary[$attname] = '"' . $attval . '"';
	                    } else {
	                        spew("$me: This looks like attribute type 3\n");
	                        /**
	                         * These are hateful. Look for \s, or >.
	                         */
	                        spew("$me: Looking for end of attval\n");
	                        $regary = findnxreg($body, $pos, '[\s>]');
	                        if ($regary === false){
	                            spew("$me: end of body reached before end of val\n");
	                            spew("$me: Returning\n");
	                            return array(false, false, false, $lt, $bodylen);
	                        }
	                        list($pos, $attval, $match) = $regary;
	                        /**
	                         * If it's ">" it will be caught at the top.
	                         *
	                         * The value is about to be wrapped in double
	                         * quotes, so any double quote inside it has to
	                         * become an entity or it would end the value
	                         * early.
	                         */
	                        spew("$me: translating '\"' into &quot;\n");
	                        $attval = str_replace('"', '&quot;', $attval);
	                        spew("$me: wrapping in quotes\n");
	                        $attary[$attname] = '"' . $attval . '"';
	                    }
	                } else if (preg_match('|[\w/>]|', $char)){
	                    /**
	                     * That was attribute type 4.
	                     */
	                    spew("$me: attribute type 4 found.\n");
	                    spew("$me: Setting value to 'yes'\n");
	                    $attary[$attname] = '"yes"';
	                } else {
	                    /**
	                     * An illegal character. Find next '>' and return.
	                     */
	                    spew("$me: illegal character '$char' found.\n");
	                    spew("$me: returning\n");
	                    $gt = findnxstr($body, $pos, '>');
	                    return array(false, false, false, $lt, $gt);
	                }
	        }
	    }
	    /**
	     * The fact that we got here indicates that the tag end was never
	     * found. Return invalid tag indication so it gets stripped.
	     */
	    spew("$me: No tag end found\n");
	    return array(false, false, false, $lt, $bodylen);
	}
	511
	/**
	 * Translates numeric entities/escapes into literal characters so the
	 * resulting value can be checked.
	 *
	 * Every full match of $regex is replaced by the single byte whose code
	 * is the number captured in the first sub-pattern.
	 *
	 * @param $attvalue the by-ref value to check (modified in place).
	 * @param $regex    the regular expression to check against; its first
	 *                  capturing group must hold the numeric code.
	 * @param $hex      whether the captured codes are hexadecimal.
	 * @return True or False depending on whether there were matches.
	 */
	function deent(&$attvalue, $regex, $hex=false){
	    $me = 'deent';
	    spew("$me: matching '$regex' against: $attvalue\n");
	    $matches = array();
	    preg_match_all($regex, $attvalue, $matches);
	    if (!is_array($matches) || !isset($matches[0]) || sizeof($matches[0]) == 0){
	        spew("$me: no matches! Returning false.\n");
	        return false;
	    }
	    spew("$me: found " . sizeof($matches[0]) . " matches\n");
	    $repl = array();
	    foreach ($matches[0] as $i => $entity){
	        $numval = $matches[1][$i];
	        spew("$me: numval is $numval\n");
	        if ($hex){
	            $numval = hexdec($numval);
	            spew("$me: hex! Numval is now $numval\n");
	        }
	        // The digits come straight from untrusted markup and may be
	        // arbitrarily long; fold the code into 0-255 before handing it
	        // to chr() so an oversized number cannot upset the conversion.
	        $repl[$entity] = chr((int) fmod((float) $numval, 256));
	    }
	    $attvalue = strtr($attvalue, $repl);
	    spew("$me: attvalue after translation: $attvalue\n");
	    return true;
	}
	545
	/**
	 * Checks an attribute value for entity-encoded or backslash-escaped
	 * characters and translates them into 8-bit strings so we can run
	 * checks on the real content.
	 *
	 * Decimal entities, hexadecimal entities and backslash + digits escapes
	 * are decoded repeatedly until a pass changes nothing, so nested
	 * encodings are unwrapped too. Leading zeros and the trailing semicolon
	 * of an entity are both optional. Finally remaining backslashes are
	 * removed with stripslashes().
	 *
	 * @param $attvalue A string to run entity check against.
	 * @return Nothing, modifies a reference value.
	 */
	function defang(&$attvalue){
	    $me = 'defang';
	    /**
	     * Skip this if there aren't ampersands or backslashes.
	     */
	    spew("$me: Checking '$attvalue' for suspicious content\n");
	    if (strpos($attvalue, '&') === false
	        && strpos($attvalue, '\\') === false){
	        spew("$me: no suspicious content found, returning.\n");
	        return;
	    }
	    do {
	        // Only one decoder runs per pass (|| short-circuits); the loop
	        // repeats until none of them finds anything left to decode.
	        $m = false;
	        $m = $m || deent($attvalue, '/\&#0*(\d+);*/s');
	        $m = $m || deent($attvalue, '/\&#x0*((\d|[a-f])+);*/si', true);
	        $m = $m || deent($attvalue, '/\\\\(\d+)/s', true);
	        spew("$me: m=$m\n");
	    } while ($m == true);
	    $attvalue = stripslashes($attvalue);
	    spew("$me: translated into: $attvalue\n");
	}
	576
	/**
	 * Removes every tab, carriage return, newline, NUL byte and space from
	 * an attribute value. Needed because a once-dominant browser treated
	 * "java[tab]script" exactly like "javascript".
	 *
	 * @param attvalue The attribute value before extraneous spaces removed.
	 * @return attvalue Nothing, modifies a reference value.
	 */
	function unspace(&$attvalue){
	    $me = 'unspace';
	    $junk = array("\t", "\r", "\n", "\0", " ");
	    // Length of the leading run that is free of junk characters; if it
	    // is shorter than the whole value there is something to strip.
	    $clean_len = strcspn($attvalue, implode('', $junk));
	    if ($clean_len < strlen($attvalue)){
	        spew("$me: Killing whitespace.\n");
	        $attvalue = str_replace($junk, '', $attvalue);
	    }
	    spew("$me: after unspace: $attvalue\n");
	}
	594
	/**
	 * Runs the attribute checks for one tag: drops forbidden attributes,
	 * rewrites dangerous attribute values and appends mandatory attributes.
	 *
	 * Values are decoded (defang) and stripped of whitespace (unspace)
	 * before the $bad_attvals expressions are applied, but the decoded form
	 * is only stored back when one of those expressions actually changed
	 * it; otherwise the attribute keeps its original text.
	 *
	 * @param $tagname         String with the name of the tag.
	 * @param $attary          Array with all tag attributes.
	 * @param $rm_attnames     See description for sanitize
	 * @param $bad_attvals     See description for sanitize
	 * @param $add_attr_to_tag See description for sanitize
	 * @return Array with modified attributes.
	 */
	function fixatts($tagname,
	                 $attary,
	                 $rm_attnames,
	                 $bad_attvals,
	                 $add_attr_to_tag
	                 ){
	    $me = 'fixatts';
	    spew("$me: Fixing attributes\n");
	    // foreach walks a snapshot, so removing or replacing entries of
	    // $attary inside the loop is safe.
	    foreach ($attary as $attname => $attvalue){
	        /**
	         * See if this attribute should be removed.
	         */
	        $remove = false;
	        foreach ($rm_attnames as $matchtag => $matchattrs){
	            if (!preg_match($matchtag, $tagname)){
	                continue;
	            }
	            foreach ($matchattrs as $matchattr){
	                if (preg_match($matchattr, $attname)){
	                    $remove = true;
	                    break 2;
	                }
	            }
	        }
	        if ($remove){
	            spew("$me: Attribute '$attname' defined as bad.\n");
	            spew("$me: Removing.\n");
	            unset($attary[$attname]);
	            // Go straight to the next attribute: running the value
	            // checks on a removed attribute could put it back.
	            continue;
	        }
	        /**
	         * Remove any backslashes, entities, or extraneous whitespace.
	         */
	        defang($attvalue);
	        unspace($attvalue);

	        /**
	         * Now run the checks on the attribute values: for every tag
	         * expression that matches this tag, and every attribute
	         * expression that matches this attribute, apply the paired
	         * match/replacement lists to the normalised value.
	         */
	        foreach ($bad_attvals as $matchtag => $matchattrs){
	            if (!preg_match($matchtag, $tagname)){
	                continue;
	            }
	            foreach ($matchattrs as $matchattr => $valary){
	                if (!preg_match($matchattr, $attname)){
	                    continue;
	                }
	                /**
	                 * There are two arrays in valary.
	                 * First is matches.
	                 * Second one is replacements
	                 */
	                list($valmatch, $valrepl) = $valary;
	                $newvalue = preg_replace($valmatch, $valrepl, $attvalue);
	                if ($newvalue !== $attvalue){
	                    spew("$me: attvalue is now $newvalue\n");
	                    $attary[$attname] = $newvalue;
	                }
	            }
	        }
	    }
	    /**
	     * See if we need to append any attributes to this tag.
	     */
	    foreach ($add_attr_to_tag as $matchtag => $addattary){
	        if (preg_match($matchtag, $tagname)){
	            $attary = array_merge($attary, $addattary);
	            spew("$me: Added attributes to this tag\n");
	        }
	    }
	    return $attary;
	}
	672
	/**
	 * This is the main function and the one you should actually be calling.
	 * There are several variables you should be aware of and which need
	 * special description.
	 *
	 * $tag_list
	 * ----------
	 * This is a simple one-dimensional array of strings, except for the
	 * very first one. The first member should be either false or true.
	 * In case it's FALSE, the following list will be considered a list of
	 * tags that should be explicitly REMOVED from the body, and all
	 * others that did not match the list will be allowed. If the first
	 * member is TRUE, then the list is the list of tags that should be
	 * explicitly ALLOWED -- any tag not matching this list will be
	 * discarded.
	 *
	 * Examples:
	 * $tag_list = Array(
	 *                   false,
	 *                   "blink",
	 *                   "link",
	 *                   "object",
	 *                   "meta",
	 *                   "marquee",
	 *                   "html"
	 *                   );
	 *
	 * This will allow all tags except for blink, link, object, meta, marquee,
	 * and html.
	 *
	 * $tag_list = Array(
	 *                   true,
	 *                   "b",
	 *                   "a",
	 *                   "i",
	 *                   "img",
	 *                   "strong",
	 *                   "em",
	 *                   "p"
	 *                   );
	 *
	 * This will remove all tags from the body except b, a, i, img, strong, em
	 * and p.
	 *
	 * $rm_tags_with_content
	 * ---------------------
	 * This is a simple one-dimensional array of strings, which specifies the
	 * tags to be removed with any and all content between the beginning and
	 * the end of the tag.
	 * Example:
	 * $rm_tags_with_content = Array(
	 *                               "script",
	 *                               "style",
	 *                               "applet",
	 *                               "embed"
	 *                               );
	 *
	 * This will remove the following structure:
	 * <script>
	 *  window.alert("Isn't cross-site-scripting fun?!");
	 * </script>
	 *
	 * If such a tag is never closed, everything after it is removed.
	 *
	 * $self_closing_tags
	 * ------------------
	 * This is a simple one-dimensional array of strings, which specifies which
	 * tags contain no content and should not be forcefully closed if this
	 * option is turned on (see further).
	 * Example:
	 * $self_closing_tags = Array(
	 *                            "img",
	 *                            "br",
	 *                            "hr",
	 *                            "input"
	 *                            );
	 *
	 * $force_tag_closing
	 * ------------------
	 * Set it to true to forcefully close any tags opened within the document.
	 * This is good if you want to take care of people who like to screw up
	 * the pages by leaving unclosed tags like <a>, <b>, <i>, etc.
	 *
	 * $rm_attnames
	 * -------------
	 * Now we come to parameters that are more obscure. This parameter is
	 * a nested array which is used to specify which attributes should be
	 * removed. It goes like so:
	 *
	 * $rm_attnames = Array(
	 *   "PCRE regex to match tag name" =>
	 *     Array(
	 *           "PCRE regex to match attribute name"
	 *           )
	 *   );
	 *
	 * Example:
	 * $rm_attnames = Array(
	 *   "|.*|" =>
	 *     Array(
	 *           "|target|i",
	 *           "|^on.*|i"
	 *           )
	 *   );
	 *
	 * This will match all tags (.*), and specify that all attributes
	 * named "target" and starting with "on" should be removed. This will take
	 * care of the following problem:
	 * <em onmouseover="window.alert('muahahahaha')">
	 * The "onmouseover" will be removed.
	 *
	 * $bad_attvals
	 * ------------
	 * This is where it gets ugly. This is a nested array with many levels.
	 * It goes like so:
	 *
	 * $bad_attvals = Array(
	 *   "pcre regex to match tag name" =>
	 *     Array(
	 *           "pcre regex to match attribute name" =>
	 *             Array(
	 *                   Array(
	 *                         "pcre regex to match attribute value"
	 *                         ),
	 *                   Array(
	 *                         "pcre regex replace a match from above with"
	 *                         )
	 *                   )
	 *           )
	 *   );
	 *
	 * An extensive example:
	 *
	 * $bad_attvals = Array(
	 *   "|.*|" =>
	 *      Array(
	 *            "/^src|background|href|action/i" =>
	 *                Array(
	 *                      Array(
	 *                            "/^([\'\"])\s*\S+script\s*:.*([\'\"])/si"
	 *                            ),
	 *                      Array(
	 *                            "\\1http://veryfunny.com/\\2"
	 *                            )
	 *                      ),
	 *            "/^style/i" =>
	 *                Array(
	 *                      Array(
	 *                            "/expression/si",
	 *                            "/url\(([\'\"])\s*https*:.*([\'\"])\)/si",
	 *                            "/url\(([\'\"])\s*\S+script:.*([\'\"])\)/si"
	 *                           ),
	 *                      Array(
	 *                            "idiocy",
	 *                            "url(\\1http://veryfunny.com/\\2)",
	 *                            "url(\\1http://veryfunny.com/\\2)"
	 *                           )
	 *                      )
	 *            )
	 *    );
	 *
	 * This will take care of nearly all known cross-site scripting exploits,
	 * plus some (see my filter sample at
	 * http://www.mricon.com/html/phpfilter.html for a working version).
	 *
	 * $add_attr_to_tag
	 * ----------------
	 * This is a useful little feature which lets you add attributes to
	 * certain tags. It is a nested array as well, but not at all like
	 * the previous one. It goes like so:
	 *
	 * $add_attr_to_tag = Array(
	 *   "PCRE regex to match tag name" =>
	 *     Array(
	 *           "attribute name"=>'"attribute value"'
	 *           )
	 *   );
	 *
	 * Note: don't forget quotes around attribute value.
	 *
	 * Example:
	 *
	 * $add_attr_to_tag = Array(
	 *   "/^a$/si" =>
	 *     Array(
	 *           'target'=>'"_new"'
	 *           )
	 *   );
	 *
	 * This will change <a> tags and add target="_new" to them so links
	 * open in a new window. Note that attribute processing only runs for
	 * tags that already carry at least one attribute, so a bare <a> is left
	 * as it is.
	 *
	 * @param $body                 the string with HTML you wish to filter
	 * @param $tag_list             see description above
	 * @param $rm_tags_with_content see description above
	 * @param $self_closing_tags    see description above
	 * @param $force_tag_closing    see description above
	 * @param $rm_attnames          see description above
	 * @param $bad_attvals          see description above
	 * @param $add_attr_to_tag      see description above
	 * @return sanitized html safe to show on your pages.
	 */
	function sanitize($body,
	                  $tag_list,
	                  $rm_tags_with_content,
	                  $self_closing_tags,
	                  $force_tag_closing,
	                  $rm_attnames,
	                  $bad_attvals,
	                  $add_attr_to_tag
	                  ){
	    $me = 'sanitize';
	    /**
	     * Anything that is not an array is treated as an empty list.
	     */
	    if (!is_array($tag_list)){
	        $tag_list = array();
	    }
	    if (!is_array($rm_tags_with_content)){
	        $rm_tags_with_content = array();
	    }
	    if (!is_array($self_closing_tags)){
	        $self_closing_tags = array();
	    }
	    /**
	     * See if tag_list is of tags to remove or tags to allow.
	     * false means remove these tags
	     * true means allow these tags
	     * The flag is taken off the list before the names are lowercased so
	     * it is never run through strtolower().
	     */
	    $allow_listed = (bool) array_shift($tag_list);
	    /**
	     * Normalize tag_list, rm_tags_with_content and self_closing_tags.
	     */
	    array_walk($tag_list, 'casenormalize');
	    array_walk($rm_tags_with_content, 'casenormalize');
	    array_walk($self_closing_tags, 'casenormalize');

	    $curpos = 0;
	    $open_tags = array();
	    $trusted = "";
	    // false, or the name of the tag whose content is being skipped.
	    $skip_content = false;
	    /**
	     * Take care of netscape's stupid javascript entities like
	     * &{alert('boo')}; by escaping their ampersand.
	     */
	    $body = preg_replace('/&(\{.*?\};)/si', '&amp;\\1', $body);
	    spew("$me: invoking the loop\n");
	    while (($curtag = getnxtag($body, $curpos)) !== false){
	        list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
	        spew("$me: grabbing free-standing content\n");
	        $free_content = substr($body, $curpos, $lt - $curpos);
	        spew("$me: " . strlen($free_content) . " chars grabbed\n");
	        if ($skip_content === false){
	            spew("$me: appending free content to trusted.\n");
	            $trusted .= $free_content;
	        } else {
	            spew("$me: Skipping free content.\n");
	        }
	        if ($tagname != false){
	            spew("$me: tagname is '$tagname'\n");
	            if ($tagtype == 2){
	                spew("$me: This is a closing tag\n");
	                if ($skip_content === $tagname){
	                    /**
	                     * Got to the end of tag we needed to remove.
	                     */
	                    spew("$me: Finished removing tag with content\n");
	                    $tagname = false;
	                    $skip_content = false;
	                } else if ($skip_content === false){
	                    if (isset($open_tags[$tagname])
	                        && $open_tags[$tagname] > 0){
	                        spew("$me: popping '$tagname' from open_tags\n");
	                        $open_tags[$tagname]--;
	                    } else {
	                        spew("$me: '$tagname' was never opened\n");
	                        spew("$me: removing\n");
	                        $tagname = false;
	                    }
	                } else {
	                    spew("$me: Skipping this tag\n");
	                }
	            } else if ($skip_content === false){
	                /**
	                 * See if this is a self-closing type and change
	                 * tagtype appropriately.
	                 */
	                if ($tagtype == 1
	                    && in_array($tagname, $self_closing_tags, true)){
	                    spew("$me: Self-closing tag. Changing tagtype.\n");
	                    $tagtype = 3;
	                }
	                /**
	                 * See if we should skip this tag and any content
	                 * inside it.
	                 */
	                if ($tagtype == 1
	                    && in_array($tagname, $rm_tags_with_content, true)){
	                    spew("$me: removing this tag with content\n");
	                    $skip_content = $tagname;
	                } else {
	                    $listed = in_array($tagname, $tag_list, true);
	                    if ((!$allow_listed && $listed)
	                        || ($allow_listed && !$listed)){
	                        spew("$me: Removing this tag.\n");
	                        $tagname = false;
	                    } else {
	                        if ($tagtype == 1){
	                            spew("$me: adding '$tagname' to open_tags\n");
	                            if (isset($open_tags[$tagname])){
	                                $open_tags[$tagname]++;
	                            } else {
	                                $open_tags[$tagname] = 1;
	                            }
	                        }
	                        /**
	                         * This is where we run other checks.
	                         */
	                        if (is_array($attary) && sizeof($attary) > 0){
	                            $attary = fixatts($tagname,
	                                              $attary,
	                                              $rm_attnames,
	                                              $bad_attvals,
	                                              $add_attr_to_tag);
	                        }
	                    }
	                }
	            } else {
	                spew("$me: Skipping this tag\n");
	            }
	            if ($tagname != false && $skip_content === false){
	                spew("$me: Appending tag to trusted.\n");
	                $trusted .= tagprint($tagname, $attary, $tagtype);
	            }
	        } else {
	            spew("$me: Removing invalid tag\n");
	        }
	        $curpos = $gt + 1;
	    }
	    /**
	     * Whatever follows the last tag is plain text. It is kept unless we
	     * are still inside a tag that is being removed together with its
	     * content (i.e. that tag was never closed).
	     */
	    if ($skip_content === false && $curpos < strlen($body)){
	        spew("$me: Appending any leftover content\n");
	        $trusted .= substr($body, $curpos);
	    }
	    if ($force_tag_closing == true){
	        foreach ($open_tags as $tagname => $opentimes){
	            while ($opentimes > 0){
	                spew("$me: '$tagname' left open. Closing by force.\n");
	                $trusted .= '</' . $tagname . '>';
	                $opentimes--;
	            }
	        }
	        $trusted .= "\n";
	    }
	    return $trusted;
	}
	1021	?>

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format