source: contrib/davical/inc/check_UTF8.php @ 3733

Revision 3733, 7.7 KB checked in by gabriel.malheiros, 13 years ago (diff)

Ticket #1541 - <Davical customizado para o Expresso.Utiliza Caldav e CardDav?>

Line 
1<?php
2/* ***** BEGIN LICENSE BLOCK *****
3 * Version: NPL 1.1/GPL 2.0/LGPL 2.1
4 *
5 * The contents of this file are subject to the Netscape Public License
6 * Version 1.1 (the "License"); you may not use this file except in
7 * compliance with the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/NPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
13 * License.
14 *
15 * The Original Code is Mozilla Communicator client code.
16 *
17 * The Initial Developer of the Original Code is
18 * Netscape Communications Corporation.
19 * Portions created by the Initial Developer are Copyright (C) 1998
20 * the Initial Developer. All Rights Reserved.
21 *
22 * Contributor(s):
23 * Henri Sivonen, hsivonen@iki.fi
24 *
25 *
26 * Alternatively, the contents of this file may be used under the terms of
27 * either the GNU General Public License Version 2 or later (the "GPL"), or
28 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 * in which case the provisions of the GPL or the LGPL are applicable instead
30 * of those above. If you wish to allow use of your version of this file only
31 * under the terms of either the GPL or the LGPL, and not to allow others to
32 * use your version of this file under the terms of the NPL, indicate your
33 * decision by deleting the provisions above and replace them with the notice
34 * and other provisions required by the GPL or the LGPL. If you do not delete
35 * the provisions above, a recipient may use your version of this file under
36 * the terms of any one of the NPL, the GPL or the LGPL.
37 *
38 * ***** END LICENSE BLOCK ***** */
39
40/*
41 * For the original C++ code, see
42 * http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
43 * http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
44 *
45 * The latest version of this file can be obtained from
46 * http://iki.fi/hsivonen/php-utf8/
47 *
48 * Version 1.0, 2003-05-30
49 */
50
/**
 * Decodes a UTF-8 octet string into an array of ints, one per Unicode
 * code point. Astral planes are supported, i.e. the ints in the output
 * can be > 0xFFFF. Occurrences of the BOM (U+FEFF) are dropped from the
 * output. Surrogates are not allowed.
 *
 * The input is rejected (false is returned) when it contains:
 *  - an octet that cannot start a sequence (e.g. a stray continuation octet),
 *  - a non-shortest-form ("overlong") encoding,
 *  - a 5 or 6 octet sequence,
 *  - a surrogate (U+D800..U+DFFF) or a code point above U+10FFFF,
 *  - a multi-octet sequence that is interrupted or cut off by the end of
 *    the string.
 *
 * @param string $str UTF-8 octets. Taken by reference, but never modified.
 * @return array|false List of code points, or false if $str is not valid UTF-8.
 */
function utf8ToUnicode(&$str)
{
  $mState = 0;     // number of continuation octets still expected before the
                   // current UTF-8 character sequence is complete
  $mUcs4  = 0;     // code point accumulated so far
  $mBytes = 1;     // total number of octets in the current sequence

  $out = array();

  $len = strlen($str);
  for ($i = 0; $i < $len; $i++) {
    // Square-bracket offset: the curly-brace form ($str{$i}) was removed in PHP 8.
    $in = ord($str[$i]);
    if (0 == $mState) {
      // When mState is zero we expect either a US-ASCII character or the
      // first octet of a multi-octet sequence.
      if (0 == (0x80 & $in)) {
        // US-ASCII, pass straight through.
        $out[] = $in;
        $mBytes = 1;
      } else if (0xC0 == (0xE0 & $in)) {
        // First octet of 2 octet sequence
        $mUcs4 = ($in & 0x1F) << 6;
        $mState = 1;
        $mBytes = 2;
      } else if (0xE0 == (0xF0 & $in)) {
        // First octet of 3 octet sequence
        $mUcs4 = ($in & 0x0F) << 12;
        $mState = 2;
        $mBytes = 3;
      } else if (0xF0 == (0xF8 & $in)) {
        // First octet of 4 octet sequence
        $mUcs4 = ($in & 0x07) << 18;
        $mState = 3;
        $mBytes = 4;
      } else if (0xF8 == (0xFC & $in)) {
        /* First octet of 5 octet sequence.
         *
         * This is illegal because the encoded codepoint must be either
         * (a) not the shortest form or
         * (b) outside the Unicode range of 0-0x10FFFF.
         * Rather than trying to resynchronize, we will carry on until the end
         * of the sequence and let the later error handling code catch it.
         */
        $mUcs4 = ($in & 0x03) << 24;
        $mState = 4;
        $mBytes = 5;
      } else if (0xFC == (0xFE & $in)) {
        // First octet of 6 octet sequence, see comments for 5 octet sequence.
        $mUcs4 = ($in & 1) << 30;
        $mState = 5;
        $mBytes = 6;
      } else {
        /* Current octet is neither in the US-ASCII range nor a legal first
         * octet of a multi-octet sequence.
         */
        return false;
      }
    } else {
      // When mState is non-zero, we expect a continuation of the multi-octet
      // sequence
      if (0x80 == (0xC0 & $in)) {
        // Legal continuation: merge its 6 payload bits at the right position.
        $shift = ($mState - 1) * 6;
        $mUcs4 |= ($in & 0x3F) << $shift;

        if (0 == --$mState) {
          /* End of the multi-octet sequence. mUcs4 now contains the final
           * Unicode codepoint to be output
           *
           * Check for illegal sequences and codepoints.
           */

          // From Unicode 3.1, non-shortest form is illegal
          if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
              ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
              ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
              (4 < $mBytes) ||
              // From Unicode 3.2, surrogate characters are illegal
              (($mUcs4 >= 0xD800) && ($mUcs4 <= 0xDFFF)) ||
              // Codepoints outside the Unicode range are illegal
              ($mUcs4 > 0x10FFFF)) {
            return false;
          }
          if (0xFEFF != $mUcs4) {
            // BOM is legal but we don't want to output it
            $out[] = $mUcs4;
          }
          // Reset the decoder state for the next character
          $mState = 0;
          $mUcs4  = 0;
          $mBytes = 1;
        }
      } else {
        /* A non-continuation octet arrived while continuation octets were
         * still expected: incomplete multi-octet sequence.
         */
        return false;
      }
    }
  }

  if (0 != $mState) {
    // The string ended while continuation octets were still expected:
    // the final sequence is truncated, so the input is not valid UTF-8.
    return false;
  }
  return $out;
}
169
/**
 * Encodes an array of ints, each one a Unicode code point, as a UTF-8
 * string. Astral planes are supported, i.e. the ints in the input can be
 * > 0xFFFF. Occurrences of the BOM (U+FEFF) are skipped and produce no
 * output. Surrogates are not allowed.
 *
 * Returns false as soon as an element is negative, is a surrogate
 * (0xD800..0xDFFF) or lies above 0x10FFFF.
 */
function unicodeToUtf8(&$arr)
{
  $utf8 = '';
  foreach ($arr as $cp) {
    if ($cp < 0) {
      // negative values are not code points
      return false;
    }

    if ($cp <= 0x007f) {
      // one octet: US-ASCII
      $utf8 .= chr($cp);
      continue;
    }

    if ($cp <= 0x07ff) {
      // two octets: 110xxxxx 10xxxxxx
      $utf8 .= chr(0xc0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x003f));
      continue;
    }

    if ($cp == 0xFEFF) {
      // zap the BOM: emit nothing
      continue;
    }

    if ($cp >= 0xD800 && $cp <= 0xDFFF) {
      // found a surrogate
      return false;
    }

    if ($cp <= 0xffff) {
      // three octets: 1110xxxx 10xxxxxx 10xxxxxx
      $utf8 .= chr(0xe0 | ($cp >> 12))
             . chr(0x80 | (($cp >> 6) & 0x003f))
             . chr(0x80 | ($cp & 0x003f));
      continue;
    }

    if ($cp <= 0x10ffff) {
      // four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      $utf8 .= chr(0xf0 | ($cp >> 18))
             . chr(0x80 | (($cp >> 12) & 0x3f))
             . chr(0x80 | (($cp >> 6) & 0x3f))
             . chr(0x80 | ($cp & 0x3f));
      continue;
    }

    // out of range
    return false;
  }
  return $utf8;
}
/**
 * Checks, line by line, that a text is valid UTF-8 and logs the outcome.
 *
 * The text is split on "\n" and every line is run through utf8ToUnicode().
 * Each line that fails is reported through dbg_error_log() together with
 * its 1-based line number; if no line fails a single success message is
 * logged instead.
 *
 * NOTE(review): dbg_error_log() is defined outside this file; the messages
 * below assume it formats its arguments printf-style -- confirm.
 *
 * @param string $ics Text to validate (presumably iCalendar data, going by
 *                    the parameter name).
 * @return bool true if every line is valid UTF-8, false otherwise.
 */
function check_string($ics){
    $ics_file = explode("\n", $ics);

    // Indexes (0-based) of the lines that are not valid UTF-8.
    $error = array();
    foreach ($ics_file as $line => $str) {
        if (false === utf8ToUnicode($str)) {
            $error[] = $line;
        }
    }

    if (count($error) > 0) {
        // Report every offending line before failing, not just the first one.
        foreach ($error as $line) {
            dbg_error_log( "LOG check_string", "error on line %d: invalid character in string %s", ($line + 1), $ics_file[$line] );
        }
        return false;
    }

    dbg_error_log( "LOG check_string","the string is UTF8 compliant");
    return true;
}
228?>
Note: See TracBrowser for help on using the repository browser.