Context Navigation

source: branches/2.1/phpgwapi/inc/class.kses.inc.php @ 2

Revision 2, 20.8 KB checked in by niltonneto, 18 years ago (diff)
Removida todas as tags usadas pelo CVS ($Id, $Source). Primeira versão no CVS externo.
Property svn:eol-style set to `native` Property svn:executable set to ``*

Line
1	<?php
2	/*
3	* This is a fork of a slick piece of procedural code called 'kses' written by Ulf Harnhammar
4	* The entire set of functions was wrapped in a PHP object with some internal modifications
5	* by Richard Vasquez (http://www.chaos.org/) 7/25/2003
6	*
7	* The original (procedural) version of the code can be found at:
8	* http://sourceforge.net/projects/kses/
9	*
10	* [kses strips evil scripts!]
11	*
12	* ==========================================================================================
13	*
14	* class.kses.php 0.0.2 - PHP class that filters HTML/XHTML only allowing some elements and
15	* attributes to be passed through.
16	*
17	* Copyright (C) 2003 Richard R. Vasquez, Jr.
18	*
19	* Derived from kses 0.2.1 - HTML/XHTML filter that only allows some elements and attributes
20	* Copyright (C) 2002, 2003 Ulf Harnhammar
21	*
22	* ==========================================================================================
23	*
24	* This program is free software and open source software; you can redistribute
25	* it and/or modify it under the terms of the GNU General Public License as
26	* published by the Free Software Foundation; either version 2 of the License,
27	* or (at your option) any later version.
28	*
29	* This program is distributed in the hope that it will be useful, but WITHOUT
30	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
31	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
32	* more details.
33	*
34	* You should have received a copy of the GNU General Public License along
35	* with this program; if not, write to the Free Software Foundation, Inc.,
36	* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA or visit
37	* http://www.gnu.org/licenses/gpl.html
38	*
39	* ==========================================================================================
40	* CONTACT INFORMATION:
41	*
42	* Email: View current valid email address at http://www.chaos.org/contact/
43	*/
44
###############################################################################
# kses - whitelist based HTML/XHTML filter.
#
# Usage: register the permitted elements with AddHTML(), optionally extend the
# protocol whitelist with AddProtocol()/Protocols(), then pass untrusted markup
# through Parse(). Elements that were not registered are removed, attributes
# that were not registered (or that fail their checks) are dropped, and URL
# protocols that are not whitelisted are stripped from attribute values.
###############################################################################
class kses
{
    # URL protocols allowed at the start of an attribute value
    # (lower case, without the trailing ':').
    var $allowed_protocols = array('http', 'https', 'ftp', 'news', 'nntp', 'telnet', 'gopher', 'mailto');

    # element name => array(attribute name => checks), all keys lower case.
    # "checks" is either a non-empty scalar (attribute allowed, no checks) or an
    # array of check name => check value (see _check_attr_val()).
    var $allowed_html = array();

    ###############################################################################
    # Constructor. Nothing to set up; the whitelists start from the defaults above.
    ###############################################################################
    function __construct()
    {
    }

    ###############################################################################
    # PHP 4 style constructor, kept as a no-op so that code calling it explicitly
    # (e.g. parent::kses()) keeps working.
    ###############################################################################
    function kses()
    {
    }

    ###############################################################################
    # Filters $string and returns the sanitised markup.
    ###############################################################################
    function Parse($string = "")
    {
        # Magic quotes no longer exist as of PHP 5.4 and get_magic_quotes_gpc()
        # itself was removed in PHP 8.0, so only ask on versions that have them.
        if (version_compare(PHP_VERSION, '5.4.0', '<') && get_magic_quotes_gpc())
        {
            $string = stripslashes($string);
        }

        $string = $this->_no_null($string);
        $string = $this->_js_entities($string);
        $string = $this->_normalize_entities($string);
        $string = $this->_hook($string);
        return $this->_split($string);
    }

    ###############################################################################
    # Adds one protocol (string) or several protocols (array of strings) to the
    # whitelist. Takes exactly one argument. Returns true when every protocol was
    # accepted, false otherwise.
    ###############################################################################
    function Protocols()
    {
        if (func_num_args() != 1)
        {
            return false;
        }

        $protocol_data = func_get_arg(0);

        if (is_array($protocol_data))
        {
            $all_added = true;
            foreach ($protocol_data as $protocol)
            {
                if (!$this->AddProtocol($protocol))
                {
                    $all_added = false;
                }
            }
            return $all_added;
        }

        if (is_string($protocol_data))
        {
            return $this->AddProtocol($protocol_data);
        }

        trigger_error("kses::Protocols() did not receive a string or an array.", E_USER_WARNING);
        return false;
    }

    ###############################################################################
    # Adds a single protocol to the whitelist. The name is trimmed, lower-cased
    # and one trailing ':' is removed. Returns false (with a warning) when the
    # argument is not a string or nothing is left after normalisation.
    ###############################################################################
    function AddProtocol($protocol = "")
    {
        if (!is_string($protocol))
        {
            trigger_error("kses::AddProtocol() requires a string.", E_USER_WARNING);
            return false;
        }

        $protocol = strtolower(trim($protocol));

        # Remove any inadvertent ':' at the end of the protocol. This happens
        # before the emptiness check so that ":" cannot register an empty protocol.
        if (substr($protocol, -1) == ":")
        {
            $protocol = (string) substr($protocol, 0, strlen($protocol) - 1);
        }

        if ($protocol == "")
        {
            trigger_error("kses::AddProtocol() tried to add an empty/NULL protocol.", E_USER_WARNING);
            return false;
        }

        if (!in_array($protocol, $this->allowed_protocols, true))
        {
            $this->allowed_protocols[] = $protocol;
            sort($this->allowed_protocols);
        }
        return true;
    }

    ###############################################################################
    # Registers an allowed element. $attribs maps attribute names to either a
    # non-empty scalar (no checks) or an array of check name => check value.
    # Element, attribute and check names are stored lower-cased. Registering the
    # same element again replaces its previous attribute list.
    ###############################################################################
    function AddHTML($tag = "", $attribs = array())
    {
        if (!is_string($tag))
        {
            trigger_error("kses::AddHTML() requires the tag to be a string", E_USER_WARNING);
            return false;
        }

        $tag = strtolower(trim($tag));
        if ($tag == "")
        {
            trigger_error("kses::AddHTML() tried to add an empty/NULL tag", E_USER_WARNING);
            return false;
        }

        if (!is_array($attribs))
        {
            trigger_error("kses::AddHTML() requires an array (even an empty one) of attributes for '$tag'", E_USER_WARNING);
            return false;
        }

        $new_attribs = array();
        foreach ($attribs as $attr_name => $checks)
        {
            if (is_array($checks))
            {
                $lowered = array();
                foreach ($checks as $check_name => $check_value)
                {
                    $lowered[strtolower($check_name)] = $check_value;
                }
                $checks = $lowered;
            }

            $new_attribs[strtolower($attr_name)] = $checks;
        }

        $this->allowed_html[$tag] = $new_attribs;
        return true;
    }

    ###############################################################################
    # This function removes any NULL characters in $string, both real NUL bytes
    # and the two character sequence backslash-zero.
    ###############################################################################
    function _no_null($string)
    {
        $string = preg_replace('/\0+/', '', $string);
        $string = preg_replace('/(\\\\0)+/', '', $string);
        # The chr(173) removal of the procedural kses is intentionally left out,
        # because it breaks chinese chars.
        return $string;
    } # function _no_null

    ###############################################################################
    # This function removes the HTML JavaScript entities ("&{...};") found in
    # early versions of Netscape 4.
    ###############################################################################
    function _js_entities($string)
    {
        return preg_replace('%&\s*\{[^}]*(\}\s*;?|$)%', '', $string);
    } # function _js_entities

    ###############################################################################
    # This function normalizes HTML entities. It will convert "AT&T" to the correct
    # "AT&amp;T", "&#00058;" to "&#58;", "&#XYZZY;" to "&amp;#XYZZY;" and so on.
    ###############################################################################
    function _normalize_entities($string)
    {
        # Disarm all entities by converting & to &amp;
        $string = str_replace('&', '&amp;', $string);

        # Change back the allowed entities in our entity white list
        $string = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]{0,19});/', '&\\1;', $string);
        $string = preg_replace_callback('/&amp;#0*([0-9]{1,5});/', array($this, '_normalize_entities_callback'), $string);
        $string = preg_replace('/&amp;#([Xx])0*(([0-9A-Fa-f]{2}){1,2});/', '&#\\1\\2;', $string);

        return $string;
    } # function _normalize_entities

    ###############################################################################
    # preg_replace_callback() adapter for _normalize_entities(); replaces the
    # /e modifier, which was removed in PHP 7.
    ###############################################################################
    function _normalize_entities_callback($matches)
    {
        return $this->_normalize_entities2($matches[1]);
    } # function _normalize_entities_callback

    ###############################################################################
    # This function helps _normalize_entities() to only accept 16 bit values
    # and nothing more for &#number; entities. Larger values stay disarmed.
    ###############################################################################
    function _normalize_entities2($i)
    {
        return (((int) $i > 65535) ? "&amp;#$i;" : "&#$i;");
    } # function _normalize_entities2

    ###############################################################################
    # You add any kses hooks here.
    ###############################################################################
    function _hook($string)
    {
        return $string;
    } # function _hook

    ###############################################################################
    # This function goes through a two level array, and changes the keys of both
    # levels to all lower case. Entries whose value is not an array become an
    # empty array.
    ###############################################################################
    function _array_lc($inarray)
    {
        $outarray = array();

        foreach ($inarray as $inkey => $inval)
        {
            $outkey = strtolower($inkey);
            $outarray[$outkey] = array();

            if (is_array($inval))
            {
                foreach ($inval as $inkey2 => $inval2)
                {
                    $outarray[$outkey][strtolower($inkey2)] = $inval2;
                }
            }
        }

        return $outarray;
    } # function _array_lc

    ###############################################################################
    # This function searches for HTML tags, no matter how malformed. It also
    # matches stray ">" characters. Every match is rewritten by _split2().
    ###############################################################################
    function _split($string)
    {
        $result = preg_replace_callback(
            '%(<'.     # EITHER: <
            '[^>]*'.   # things that aren't >
            '(>|$)'.   # > or end of string
            '|>)%',    # OR: just a >
            array($this, '_split_callback'),
            $string
        );

        # A PCRE failure yields NULL; fail closed instead of leaking that upwards.
        return ($result === null) ? '' : $result;
    } # function _split

    ###############################################################################
    # preg_replace_callback() adapter for _split(); replaces the /e modifier,
    # which was removed in PHP 7.
    ###############################################################################
    function _split_callback($matches)
    {
        return $this->_split2($matches[1]);
    } # function _split_callback

    ###############################################################################
    # This function does a lot of work. It rejects some very malformed things
    # like <:::>. It returns an empty string, if the element isn't allowed (look
    # ma, no strip_tags()!). Otherwise it splits the tag into an element and an
    # attribute list and hands them to _attr().
    #
    # The match now arrives verbatim from preg_replace_callback(), so the
    # _stripslashes() call that undid the quoting of preg_replace(//e) is gone.
    ###############################################################################
    function _split2($string)
    {
        if (substr($string, 0, 1) != '<')
        {
            # It matched a ">" character
            return '&gt;';
        }

        if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9]+)([^>]*)>?$%', $string, $matches))
        {
            # It's seriously malformed
            return '';
        }

        $slash    = trim($matches[1]);
        $elem     = $matches[2];
        $attrlist = $matches[3];

        $key = strtolower($elem);
        if (!isset($this->allowed_html[$key]) || !is_array($this->allowed_html[$key]))
        {
            # They are using a not allowed HTML element
            return '';
        }

        return $this->_attr("$slash$elem", $attrlist);
    } # function _split2

    ###############################################################################
    # This function removes all attributes, if none are allowed for this element.
    # If some are allowed it calls _hair() to split them further, and then it
    # builds up new HTML code from the data that _hair() returns. It also
    # removes "<" and ">" characters, if there are any left. One more thing it
    # does is to check if the tag has a closing XHTML slash, and if it does,
    # it puts one in the returned code as well.
    #
    # $element still carries the leading "/" of a closing tag. Such a name is
    # never a key of $allowed_html, so closing tags are emitted without attributes.
    ###############################################################################
    function _attr($element, $attr)
    {
        # Is there a closing XHTML slash at the end of the attributes?
        $xhtml_slash = '';
        if (preg_match('%\s/\s*$%', $attr))
        {
            $xhtml_slash = ' /';
        }

        $key     = strtolower($element);
        $allowed = (isset($this->allowed_html[$key]) && is_array($this->allowed_html[$key]))
            ? $this->allowed_html[$key]
            : array();

        # Are any attributes allowed at all for this element?
        if (count($allowed) == 0)
        {
            return "<$element$xhtml_slash>";
        }

        # Go through the parsed attributes, and save the allowed ones in $attr2
        $attr2 = '';
        foreach ($this->_hair($attr) as $arreach)
        {
            $name = strtolower($arreach['name']);
            if (!isset($allowed[$name]))
            {
                # the attribute is not allowed
                continue;
            }

            $current = $allowed[$name];
            if ($current === '' || $current === false || $current === 0)
            {
                # an empty entry also means "not allowed"
                continue;
            }

            $ok = true;
            if (is_array($current))
            {
                # there are some checks; all of them have to pass
                foreach ($current as $currkey => $currval)
                {
                    if (!$this->_check_attr_val($arreach['value'], $arreach['vless'], $currkey, $currval))
                    {
                        $ok = false;
                        break;
                    }
                }
            }

            if ($ok)
            {
                $attr2 .= ' '.$arreach['whole'];
            }
        }

        # Remove any "<" or ">" characters
        $attr2 = preg_replace('/[<>]/', '', $attr2);
        return "<$element$attr2$xhtml_slash>";
    } # function _attr

    ###############################################################################
    # This function does a lot of work. It parses an attribute list into an array
    # with attribute data, and tries to do the right thing even if it gets weird
    # input. It will add quotes around attribute values that don't have any quotes
    # or apostrophes around them, to make it easier to produce HTML code that will
    # conform to W3C's HTML specification. It will also remove bad URL protocols
    # from attribute values.
    #
    # Each returned entry has the keys 'name', 'value', 'whole' (the attribute as
    # it should be written out) and 'vless' ('y' for valueless, 'n' otherwise).
    ###############################################################################
    function _hair($attr)
    {
        # pattern => quote character used when the attribute is written out again.
        # Unquoted values get double quotes to conform to W3C's HTML spec.
        $value_forms = array(
            array('/^"([^"]*)"(\s+|$)/', '"'),       # "value"
            array("/^'([^']*)'(\s+|$)/", "'"),       # 'value'
            array("%^([^\s\"']+)(\s+|$)%", '"')      # value
        );

        $attrarr  = array();
        $mode     = 0; # 0 = expect name, 1 = expect "=" or whitespace, 2 = expect value
        $attrname = '';

        # Loop through the whole attribute list
        while (strlen($attr) != 0)
        {
            # Was the last operation successful?
            $working = 0;

            switch ($mode)
            {
                case 0: # attribute name, href for instance
                    if (preg_match('/^([-a-zA-Z]+)/', $attr, $match))
                    {
                        $attrname = $match[1];
                        $working = $mode = 1;
                        $attr = (string) substr($attr, strlen($match[0]));
                    }
                    break;

                case 1: # equals sign or valueless ("selected")
                    if (preg_match('/^\s*=\s*/', $attr, $match)) # equals sign
                    {
                        $working = 1;
                        $mode = 2;
                        $attr = (string) substr($attr, strlen($match[0]));
                        break;
                    }
                    if (preg_match('/^\s+/', $attr, $match)) # valueless
                    {
                        $working = 1;
                        $mode = 0;
                        $attrarr[] = array(
                            'name'  => $attrname,
                            'value' => '',
                            'whole' => $attrname,
                            'vless' => 'y'
                        );
                        $attr = (string) substr($attr, strlen($match[0]));
                    }
                    break;

                case 2: # attribute value, a URL after href= for instance
                    foreach ($value_forms as $form)
                    {
                        if (preg_match($form[0], $attr, $match))
                        {
                            $thisval = $this->_bad_protocol($match[1]);
                            $attrarr[] = array(
                                'name'  => $attrname,
                                'value' => $thisval,
                                'whole' => $attrname.'='.$form[1].$thisval.$form[1],
                                'vless' => 'n'
                            );
                            $working = 1;
                            $mode = 0;
                            $attr = (string) substr($attr, strlen($match[0]));
                            break; # leaves the foreach only
                        }
                    }
                    break;
            } # switch

            if ($working == 0) # not well formed, remove and try again
            {
                $attr = $this->_html_error($attr);
                $mode = 0;
            }
        } # while

        # special case, for when the attribute list ends with a valueless
        # attribute like "selected"
        if ($mode == 1)
        {
            $attrarr[] = array(
                'name'  => $attrname,
                'value' => '',
                'whole' => $attrname,
                'vless' => 'y'
            );
        }

        return $attrarr;
    } # function _hair

    ###############################################################################
    # This function removes all non-allowed protocols from the beginning of
    # $string. It ignores whitespace and the case of the letters, and it does
    # understand HTML entities. It repeats until the string stops changing, so it
    # won't be fooled by a string like "javascript:javascript:alert(57)".
    ###############################################################################
    function _bad_protocol($string)
    {
        $string = $this->_no_null($string);

        do
        {
            $previous = $string;
            $string = $this->_bad_protocol_once($string);
        }
        while ($string != $previous);

        return $string;
    } # function _bad_protocol

    ###############################################################################
    # This function searches for a URL protocol at the beginning of $string, while
    # handling whitespace and HTML entities, and lets _bad_protocol_once2() decide
    # what to put in its place.
    ###############################################################################
    function _bad_protocol_once($string)
    {
        return preg_replace_callback(
            '/^((&[^;]*;|[\sA-Za-z0-9])*)'.
            '(:|&#58;|&#[Xx]3[Aa];)\s*/',
            array($this, '_bad_protocol_once_callback'),
            $string
        );
    } # function _bad_protocol_once

    ###############################################################################
    # preg_replace_callback() adapter for _bad_protocol_once(); replaces the
    # /e modifier, which was removed in PHP 7.
    ###############################################################################
    function _bad_protocol_once_callback($matches)
    {
        return $this->_bad_protocol_once2($matches[1]);
    } # function _bad_protocol_once_callback

    ###############################################################################
    # This function processes URL protocols, checks to see if they're in the white-
    # list or not, and returns "protocol:" (normalised) if so, or '' if not.
    ###############################################################################
    function _bad_protocol_once2($string)
    {
        # Decode first, so that e.g. "&#104;ttp" is recognised as "http".
        $string2 = $this->_decode_entities($string);
        $string2 = preg_replace('/\s/', '', $string2);
        $string2 = $this->_no_null($string2);
        $string2 = strtolower($string2);

        foreach ($this->allowed_protocols as $one_protocol)
        {
            if (strtolower($one_protocol) == $string2)
            {
                return "$string2:";
            }
        }

        return '';
    } # function _bad_protocol_once2

    ###############################################################################
    # This function performs different checks for attribute values. The currently
    # implemented checks are "maxlen", "minlen", "maxval", "minval" and "valueless".
    # Unknown check names pass. Returns true when the check passes.
    ###############################################################################
    function _check_attr_val($value, $vless, $checkname, $checkvalue)
    {
        # Shape required by maxval/minval: a non-negative integer without an
        # excessive amount of digits or whitespace.
        $int_pattern = '/^\s{0,6}[0-9]{1,6}\s{0,6}$/';

        switch (strtolower($checkname))
        {
            # The attribute value must not be longer than the given value.
            case 'maxlen':
                return !(strlen($value) > $checkvalue);

            # The attribute value must not be shorter than the given value.
            case 'minlen':
                return !(strlen($value) < $checkvalue);

            # The attribute value must be an integer from 0 and up, and must not
            # be greater than the given value.
            case 'maxval':
                if (!preg_match($int_pattern, $value))
                {
                    return false;
                }
                return !((int) trim($value) > $checkvalue);

            # The attribute value must be an integer from 0 and up, and must not
            # be smaller than the given value.
            case 'minval':
                if (!preg_match($int_pattern, $value))
                {
                    return false;
                }
                return !((int) trim($value) < $checkvalue);

            # If the given value is a "y" or a "Y", the attribute must not have a
            # value (<option selected>). If it is an "n" or an "N", the attribute
            # must have one (<a href="blah">).
            case 'valueless':
                return !(strtolower($checkvalue) != $vless);
        }

        return true;
    } # function _check_attr_val

    ###############################################################################
    # This function changes the character sequence \" to just "
    # It leaves all other slashes alone. It was needed for the quoting done by
    # preg_replace(//e); the class no longer calls it, but it is kept for code
    # that might.
    ###############################################################################
    function _stripslashes($string)
    {
        return preg_replace('%\\\\"%', '"', $string);
    } # function _stripslashes

    ###############################################################################
    # This function deals with parsing errors in _hair(). The general plan is
    # to remove everything to and including some whitespace, but it deals with
    # quotes and apostrophes as well.
    ###############################################################################
    function _html_error($string)
    {
        return preg_replace('/^("[^"]*("|$)|\'[^\']*(\'|$)|\S)*\s*/', '', $string);
    } # function _html_error

    ###############################################################################
    # This function decodes numeric HTML entities (&#65; and &#x41;). It doesn't
    # do anything with other entities like &auml;, but we don't need them in the
    # URL protocol white listing system anyway.
    ###############################################################################
    function _decode_entities($string)
    {
        $string = preg_replace_callback('/&#([0-9]+);/', array($this, '_decode_dec_callback'), $string);
        $string = preg_replace_callback('/&#[Xx]([0-9A-Fa-f]+);/', array($this, '_decode_hex_callback'), $string);
        return $string;
    } # function _decode_entities

    ###############################################################################
    # preg_replace_callback() adapter: decimal entity => single byte.
    ###############################################################################
    function _decode_dec_callback($matches)
    {
        return $this->_byte_chr((float) $matches[1]);
    } # function _decode_dec_callback

    ###############################################################################
    # preg_replace_callback() adapter: hexadecimal entity => single byte.
    ###############################################################################
    function _decode_hex_callback($matches)
    {
        return $this->_byte_chr((float) hexdec($matches[1]));
    } # function _decode_hex_callback

    ###############################################################################
    # Returns the byte for $number modulo 256. The reduction is done on floats,
    # because attacker supplied digit strings can exceed the integer range; a
    # value too large even for a float decodes to nothing.
    ###############################################################################
    function _byte_chr($number)
    {
        if (!is_finite($number))
        {
            return '';
        }
        return chr((int) fmod($number, 256));
    } # function _byte_chr

    ###############################################################################
    # This function returns kses' version number.
    ###############################################################################
    function _version()
    {
        return '0.0.2 (OOP fork of kses 0.2.1)';
    } # function _version
}
671	?>

Note: See TracBrowser for help on using the repository browser.

Download in other formats: