1 | /**************************************************************** |
---|
2 | * Licensed to the Apache Software Foundation (ASF) under one * |
---|
3 | * or more contributor license agreements. See the NOTICE file * |
---|
4 | * distributed with this work for additional information * |
---|
5 | * regarding copyright ownership. The ASF licenses this file * |
---|
6 | * to you under the Apache License, Version 2.0 (the * |
---|
7 | * "License"); you may not use this file except in compliance * |
---|
8 | * with the License. You may obtain a copy of the License at * |
---|
9 | * * |
---|
10 | * http://www.apache.org/licenses/LICENSE-2.0 * |
---|
11 | * * |
---|
12 | * Unless required by applicable law or agreed to in writing, * |
---|
13 | * software distributed under the License is distributed on an * |
---|
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * |
---|
15 | * KIND, either express or implied. See the License for the * |
---|
16 | * specific language governing permissions and limitations * |
---|
17 | * under the License. * |
---|
18 | ****************************************************************/ |
---|
19 | |
---|
20 | package org.apache.james.mime4j.util; |
---|
21 | |
---|
22 | import java.io.UnsupportedEncodingException; |
---|
23 | import java.util.HashMap; |
---|
24 | import java.util.Map; |
---|
25 | import java.util.SortedSet; |
---|
26 | import java.util.TreeSet; |
---|
27 | |
---|
28 | /** |
---|
29 | * Utility class for working with character sets. It is somewhat similar to |
---|
30 | * the Java 1.4 <code>java.nio.charset.Charset</code> class but knows many |
---|
31 | * more aliases and is compatible with Java 1.3. It will use a simple detection |
---|
32 | * mechanism to detect what character sets the current VM supports. This will |
---|
33 | * be a subset of the character sets listed in the |
---|
34 | * <a href="http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html"> |
---|
35 | * Java 1.5 (J2SE5.0) Supported Encodings</a> document. |
---|
36 | * <p> |
---|
37 | * The <a href="http://www.iana.org/assignments/character-sets"> |
---|
38 | * IANA Character Sets</a> document has been used to determine the preferred |
---|
39 | * MIME character set names and to get a list of known aliases. |
---|
40 | * <p> |
---|
41 | * This is a complete list of the character sets known to this class: |
---|
42 | * <table> |
---|
43 | * <tr> |
---|
44 | * <td>Canonical (Java) name</td> |
---|
45 | * <td>MIME preferred</td> |
---|
46 | * <td>Aliases</td> |
---|
47 | * </tr> |
---|
48 | * <tr> |
---|
49 | * <td>ASCII</td> |
---|
50 | * <td>US-ASCII</td> |
---|
51 | * <td>ANSI_X3.4-1968 iso-ir-6 ANSI_X3.4-1986 ISO_646.irv:1991 ISO646-US us IBM367 cp367 csASCII ascii7 646 iso_646.irv:1983 </td> |
---|
52 | * </tr> |
---|
53 | * <tr> |
---|
54 | * <td>Big5</td> |
---|
55 | * <td>Big5</td> |
---|
56 | * <td>csBig5 CN-Big5 BIG-FIVE BIGFIVE </td> |
---|
57 | * </tr> |
---|
58 | * <tr> |
---|
59 | * <td>Big5_HKSCS</td> |
---|
60 | * <td>Big5-HKSCS</td> |
---|
61 | * <td>big5hkscs </td> |
---|
62 | * </tr> |
---|
63 | * <tr> |
---|
64 | * <td>Big5_Solaris</td> |
---|
65 | * <td>?</td> |
---|
66 | * <td></td> |
---|
67 | * </tr> |
---|
68 | * <tr> |
---|
69 | * <td>Cp037</td> |
---|
70 | * <td>IBM037</td> |
---|
71 | * <td>ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037 </td> |
---|
72 | * </tr> |
---|
73 | * <tr> |
---|
74 | * <td>Cp1006</td> |
---|
75 | * <td>?</td> |
---|
76 | * <td></td> |
---|
77 | * </tr> |
---|
78 | * <tr> |
---|
79 | * <td>Cp1025</td> |
---|
80 | * <td>?</td> |
---|
81 | * <td></td> |
---|
82 | * </tr> |
---|
83 | * <tr> |
---|
84 | * <td>Cp1026</td> |
---|
85 | * <td>IBM1026</td> |
---|
86 | * <td>csIBM1026 </td> |
---|
87 | * </tr> |
---|
88 | * <tr> |
---|
89 | * <td>Cp1046</td> |
---|
90 | * <td>?</td> |
---|
91 | * <td></td> |
---|
92 | * </tr> |
---|
93 | * <tr> |
---|
94 | * <td>Cp1047</td> |
---|
95 | * <td>IBM1047</td> |
---|
96 | * <td>IBM-1047 </td> |
---|
97 | * </tr> |
---|
98 | * <tr> |
---|
99 | * <td>Cp1097</td> |
---|
100 | * <td>?</td> |
---|
101 | * <td></td> |
---|
102 | * </tr> |
---|
103 | * <tr> |
---|
104 | * <td>Cp1098</td> |
---|
105 | * <td>?</td> |
---|
106 | * <td></td> |
---|
107 | * </tr> |
---|
108 | * <tr> |
---|
109 | * <td>Cp1112</td> |
---|
110 | * <td>?</td> |
---|
111 | * <td></td> |
---|
112 | * </tr> |
---|
113 | * <tr> |
---|
114 | * <td>Cp1122</td> |
---|
115 | * <td>?</td> |
---|
116 | * <td></td> |
---|
117 | * </tr> |
---|
118 | * <tr> |
---|
119 | * <td>Cp1123</td> |
---|
120 | * <td>?</td> |
---|
121 | * <td></td> |
---|
122 | * </tr> |
---|
123 | * <tr> |
---|
124 | * <td>Cp1124</td> |
---|
125 | * <td>?</td> |
---|
126 | * <td></td> |
---|
127 | * </tr> |
---|
128 | * <tr> |
---|
129 | * <td>Cp1140</td> |
---|
130 | * <td>IBM01140</td> |
---|
131 | * <td>CCSID01140 CP01140 ebcdic-us-37+euro </td> |
---|
132 | * </tr> |
---|
133 | * <tr> |
---|
134 | * <td>Cp1141</td> |
---|
135 | * <td>IBM01141</td> |
---|
136 | * <td>CCSID01141 CP01141 ebcdic-de-273+euro </td> |
---|
137 | * </tr> |
---|
138 | * <tr> |
---|
139 | * <td>Cp1142</td> |
---|
140 | * <td>IBM01142</td> |
---|
141 | * <td>CCSID01142 CP01142 ebcdic-dk-277+euro ebcdic-no-277+euro </td> |
---|
142 | * </tr> |
---|
143 | * <tr> |
---|
144 | * <td>Cp1143</td> |
---|
145 | * <td>IBM01143</td> |
---|
146 | * <td>CCSID01143 CP01143 ebcdic-fi-278+euro ebcdic-se-278+euro </td> |
---|
147 | * </tr> |
---|
148 | * <tr> |
---|
149 | * <td>Cp1144</td> |
---|
150 | * <td>IBM01144</td> |
---|
151 | * <td>CCSID01144 CP01144 ebcdic-it-280+euro </td> |
---|
152 | * </tr> |
---|
153 | * <tr> |
---|
154 | * <td>Cp1145</td> |
---|
155 | * <td>IBM01145</td> |
---|
156 | * <td>CCSID01145 CP01145 ebcdic-es-284+euro </td> |
---|
157 | * </tr> |
---|
158 | * <tr> |
---|
159 | * <td>Cp1146</td> |
---|
160 | * <td>IBM01146</td> |
---|
161 | * <td>CCSID01146 CP01146 ebcdic-gb-285+euro </td> |
---|
162 | * </tr> |
---|
163 | * <tr> |
---|
164 | * <td>Cp1147</td> |
---|
165 | * <td>IBM01147</td> |
---|
166 | * <td>CCSID01147 CP01147 ebcdic-fr-297+euro </td> |
---|
167 | * </tr> |
---|
168 | * <tr> |
---|
169 | * <td>Cp1148</td> |
---|
170 | * <td>IBM01148</td> |
---|
171 | * <td>CCSID01148 CP01148 ebcdic-international-500+euro </td> |
---|
172 | * </tr> |
---|
173 | * <tr> |
---|
174 | * <td>Cp1149</td> |
---|
175 | * <td>IBM01149</td> |
---|
176 | * <td>CCSID01149 CP01149 ebcdic-is-871+euro </td> |
---|
177 | * </tr> |
---|
178 | * <tr> |
---|
179 | * <td>Cp1250</td> |
---|
180 | * <td>windows-1250</td> |
---|
181 | * <td></td> |
---|
182 | * </tr> |
---|
183 | * <tr> |
---|
184 | * <td>Cp1251</td> |
---|
185 | * <td>windows-1251</td> |
---|
186 | * <td></td> |
---|
187 | * </tr> |
---|
188 | * <tr> |
---|
189 | * <td>Cp1252</td> |
---|
190 | * <td>windows-1252</td> |
---|
191 | * <td></td> |
---|
192 | * </tr> |
---|
193 | * <tr> |
---|
194 | * <td>Cp1253</td> |
---|
195 | * <td>windows-1253</td> |
---|
196 | * <td></td> |
---|
197 | * </tr> |
---|
198 | * <tr> |
---|
199 | * <td>Cp1254</td> |
---|
200 | * <td>windows-1254</td> |
---|
201 | * <td></td> |
---|
202 | * </tr> |
---|
203 | * <tr> |
---|
204 | * <td>Cp1255</td> |
---|
205 | * <td>windows-1255</td> |
---|
206 | * <td></td> |
---|
207 | * </tr> |
---|
208 | * <tr> |
---|
209 | * <td>Cp1256</td> |
---|
210 | * <td>windows-1256</td> |
---|
211 | * <td></td> |
---|
212 | * </tr> |
---|
213 | * <tr> |
---|
214 | * <td>Cp1257</td> |
---|
215 | * <td>windows-1257</td> |
---|
216 | * <td></td> |
---|
217 | * </tr> |
---|
218 | * <tr> |
---|
219 | * <td>Cp1258</td> |
---|
220 | * <td>windows-1258</td> |
---|
221 | * <td></td> |
---|
222 | * </tr> |
---|
223 | * <tr> |
---|
224 | * <td>Cp1381</td> |
---|
225 | * <td>?</td> |
---|
226 | * <td></td> |
---|
227 | * </tr> |
---|
228 | * <tr> |
---|
229 | * <td>Cp1383</td> |
---|
230 | * <td>?</td> |
---|
231 | * <td></td> |
---|
232 | * </tr> |
---|
233 | * <tr> |
---|
234 | * <td>Cp273</td> |
---|
235 | * <td>IBM273</td> |
---|
236 | * <td>csIBM273 </td> |
---|
237 | * </tr> |
---|
238 | * <tr> |
---|
239 | * <td>Cp277</td> |
---|
240 | * <td>IBM277</td> |
---|
241 | * <td>EBCDIC-CP-DK EBCDIC-CP-NO csIBM277 </td> |
---|
242 | * </tr> |
---|
243 | * <tr> |
---|
244 | * <td>Cp278</td> |
---|
245 | * <td>IBM278</td> |
---|
246 | * <td>CP278 ebcdic-cp-fi ebcdic-cp-se csIBM278 </td> |
---|
247 | * </tr> |
---|
248 | * <tr> |
---|
249 | * <td>Cp280</td> |
---|
250 | * <td>IBM280</td> |
---|
251 | * <td>ebcdic-cp-it csIBM280 </td> |
---|
252 | * </tr> |
---|
253 | * <tr> |
---|
254 | * <td>Cp284</td> |
---|
255 | * <td>IBM284</td> |
---|
256 | * <td>ebcdic-cp-es csIBM284 </td> |
---|
257 | * </tr> |
---|
258 | * <tr> |
---|
259 | * <td>Cp285</td> |
---|
260 | * <td>IBM285</td> |
---|
261 | * <td>ebcdic-cp-gb csIBM285 </td> |
---|
262 | * </tr> |
---|
263 | * <tr> |
---|
264 | * <td>Cp297</td> |
---|
265 | * <td>IBM297</td> |
---|
266 | * <td>ebcdic-cp-fr csIBM297 </td> |
---|
267 | * </tr> |
---|
268 | * <tr> |
---|
269 | * <td>Cp33722</td> |
---|
270 | * <td>?</td> |
---|
271 | * <td></td> |
---|
272 | * </tr> |
---|
273 | * <tr> |
---|
274 | * <td>Cp420</td> |
---|
275 | * <td>IBM420</td> |
---|
276 | * <td>ebcdic-cp-ar1 csIBM420 </td> |
---|
277 | * </tr> |
---|
278 | * <tr> |
---|
279 | * <td>Cp424</td> |
---|
280 | * <td>IBM424</td> |
---|
281 | * <td>ebcdic-cp-he csIBM424 </td> |
---|
282 | * </tr> |
---|
283 | * <tr> |
---|
284 | * <td>Cp437</td> |
---|
285 | * <td>IBM437</td> |
---|
286 | * <td>437 csPC8CodePage437 </td> |
---|
287 | * </tr> |
---|
288 | * <tr> |
---|
289 | * <td>Cp500</td> |
---|
290 | * <td>IBM500</td> |
---|
291 | * <td>ebcdic-cp-be ebcdic-cp-ch csIBM500 </td> |
---|
292 | * </tr> |
---|
293 | * <tr> |
---|
294 | * <td>Cp737</td> |
---|
295 | * <td>?</td> |
---|
296 | * <td></td> |
---|
297 | * </tr> |
---|
298 | * <tr> |
---|
299 | * <td>Cp775</td> |
---|
300 | * <td>IBM775</td> |
---|
301 | * <td>csPC775Baltic </td> |
---|
302 | * </tr> |
---|
303 | * <tr> |
---|
304 | * <td>Cp838</td> |
---|
305 | * <td>IBM-Thai</td> |
---|
306 | * <td></td> |
---|
307 | * </tr> |
---|
308 | * <tr> |
---|
309 | * <td>Cp850</td> |
---|
310 | * <td>IBM850</td> |
---|
311 | * <td>850 csPC850Multilingual </td> |
---|
312 | * </tr> |
---|
313 | * <tr> |
---|
314 | * <td>Cp852</td> |
---|
315 | * <td>IBM852</td> |
---|
316 | * <td>852 csPCp852 </td> |
---|
317 | * </tr> |
---|
318 | * <tr> |
---|
319 | * <td>Cp855</td> |
---|
320 | * <td>IBM855</td> |
---|
321 | * <td>855 csIBM855 </td> |
---|
322 | * </tr> |
---|
323 | * <tr> |
---|
324 | * <td>Cp856</td> |
---|
325 | * <td>?</td> |
---|
326 | * <td></td> |
---|
327 | * </tr> |
---|
328 | * <tr> |
---|
329 | * <td>Cp857</td> |
---|
330 | * <td>IBM857</td> |
---|
331 | * <td>857 csIBM857 </td> |
---|
332 | * </tr> |
---|
333 | * <tr> |
---|
334 | * <td>Cp858</td> |
---|
335 | * <td>IBM00858</td> |
---|
336 | * <td>CCSID00858 CP00858 PC-Multilingual-850+euro </td> |
---|
337 | * </tr> |
---|
338 | * <tr> |
---|
339 | * <td>Cp860</td> |
---|
340 | * <td>IBM860</td> |
---|
341 | * <td>860 csIBM860 </td> |
---|
342 | * </tr> |
---|
343 | * <tr> |
---|
344 | * <td>Cp861</td> |
---|
345 | * <td>IBM861</td> |
---|
346 | * <td>861 cp-is csIBM861 </td> |
---|
347 | * </tr> |
---|
348 | * <tr> |
---|
349 | * <td>Cp862</td> |
---|
350 | * <td>IBM862</td> |
---|
351 | * <td>862 csPC862LatinHebrew </td> |
---|
352 | * </tr> |
---|
353 | * <tr> |
---|
354 | * <td>Cp863</td> |
---|
355 | * <td>IBM863</td> |
---|
356 | * <td>863 csIBM863 </td> |
---|
357 | * </tr> |
---|
358 | * <tr> |
---|
359 | * <td>Cp864</td> |
---|
360 | * <td>IBM864</td> |
---|
361 | * <td>cp864 csIBM864 </td> |
---|
362 | * </tr> |
---|
363 | * <tr> |
---|
364 | * <td>Cp865</td> |
---|
365 | * <td>IBM865</td> |
---|
366 | * <td>865 csIBM865 </td> |
---|
367 | * </tr> |
---|
368 | * <tr> |
---|
369 | * <td>Cp866</td> |
---|
370 | * <td>IBM866</td> |
---|
371 | * <td>866 csIBM866 </td> |
---|
372 | * </tr> |
---|
373 | * <tr> |
---|
374 | * <td>Cp868</td> |
---|
375 | * <td>IBM868</td> |
---|
376 | * <td>cp-ar csIBM868 </td> |
---|
377 | * </tr> |
---|
378 | * <tr> |
---|
379 | * <td>Cp869</td> |
---|
380 | * <td>IBM869</td> |
---|
381 | * <td>cp-gr csIBM869 </td> |
---|
382 | * </tr> |
---|
383 | * <tr> |
---|
384 | * <td>Cp870</td> |
---|
385 | * <td>IBM870</td> |
---|
386 | * <td>ebcdic-cp-roece ebcdic-cp-yu csIBM870 </td> |
---|
387 | * </tr> |
---|
388 | * <tr> |
---|
389 | * <td>Cp871</td> |
---|
390 | * <td>IBM871</td> |
---|
391 | * <td>ebcdic-cp-is csIBM871 </td> |
---|
392 | * </tr> |
---|
393 | * <tr> |
---|
394 | * <td>Cp875</td> |
---|
395 | * <td>?</td> |
---|
396 | * <td></td> |
---|
397 | * </tr> |
---|
398 | * <tr> |
---|
399 | * <td>Cp918</td> |
---|
400 | * <td>IBM918</td> |
---|
401 | * <td>ebcdic-cp-ar2 csIBM918 </td> |
---|
402 | * </tr> |
---|
403 | * <tr> |
---|
404 | * <td>Cp921</td> |
---|
405 | * <td>?</td> |
---|
406 | * <td></td> |
---|
407 | * </tr> |
---|
408 | * <tr> |
---|
409 | * <td>Cp922</td> |
---|
410 | * <td>?</td> |
---|
411 | * <td></td> |
---|
412 | * </tr> |
---|
413 | * <tr> |
---|
414 | * <td>Cp930</td> |
---|
415 | * <td>?</td> |
---|
416 | * <td></td> |
---|
417 | * </tr> |
---|
418 | * <tr> |
---|
419 | * <td>Cp933</td> |
---|
420 | * <td>?</td> |
---|
421 | * <td></td> |
---|
422 | * </tr> |
---|
423 | * <tr> |
---|
424 | * <td>Cp935</td> |
---|
425 | * <td>?</td> |
---|
426 | * <td></td> |
---|
427 | * </tr> |
---|
428 | * <tr> |
---|
429 | * <td>Cp937</td> |
---|
430 | * <td>?</td> |
---|
431 | * <td></td> |
---|
432 | * </tr> |
---|
433 | * <tr> |
---|
434 | * <td>Cp939</td> |
---|
435 | * <td>?</td> |
---|
436 | * <td></td> |
---|
437 | * </tr> |
---|
438 | * <tr> |
---|
439 | * <td>Cp942</td> |
---|
440 | * <td>?</td> |
---|
441 | * <td></td> |
---|
442 | * </tr> |
---|
443 | * <tr> |
---|
444 | * <td>Cp942C</td> |
---|
445 | * <td>?</td> |
---|
446 | * <td></td> |
---|
447 | * </tr> |
---|
448 | * <tr> |
---|
449 | * <td>Cp943</td> |
---|
450 | * <td>?</td> |
---|
451 | * <td></td> |
---|
452 | * </tr> |
---|
453 | * <tr> |
---|
454 | * <td>Cp943C</td> |
---|
455 | * <td>?</td> |
---|
456 | * <td></td> |
---|
457 | * </tr> |
---|
458 | * <tr> |
---|
459 | * <td>Cp948</td> |
---|
460 | * <td>?</td> |
---|
461 | * <td></td> |
---|
462 | * </tr> |
---|
463 | * <tr> |
---|
464 | * <td>Cp949</td> |
---|
465 | * <td>?</td> |
---|
466 | * <td></td> |
---|
467 | * </tr> |
---|
468 | * <tr> |
---|
469 | * <td>Cp949C</td> |
---|
470 | * <td>?</td> |
---|
471 | * <td></td> |
---|
472 | * </tr> |
---|
473 | * <tr> |
---|
474 | * <td>Cp950</td> |
---|
475 | * <td>?</td> |
---|
476 | * <td></td> |
---|
477 | * </tr> |
---|
478 | * <tr> |
---|
479 | * <td>Cp964</td> |
---|
480 | * <td>?</td> |
---|
481 | * <td></td> |
---|
482 | * </tr> |
---|
483 | * <tr> |
---|
484 | * <td>Cp970</td> |
---|
485 | * <td>?</td> |
---|
486 | * <td></td> |
---|
487 | * </tr> |
---|
488 | * <tr> |
---|
489 | * <td>EUC_CN</td> |
---|
490 | * <td>GB2312</td> |
---|
491 | * <td>x-EUC-CN csGB2312 euccn euc-cn gb2312-80 gb2312-1980 CN-GB CN-GB-ISOIR165 </td> |
---|
492 | * </tr> |
---|
493 | * <tr> |
---|
494 | * <td>EUC_JP</td> |
---|
495 | * <td>EUC-JP</td> |
---|
496 | * <td>csEUCPkdFmtJapanese Extended_UNIX_Code_Packed_Format_for_Japanese eucjis x-eucjp eucjp x-euc-jp </td> |
---|
497 | * </tr> |
---|
498 | * <tr> |
---|
499 | * <td>EUC_JP_LINUX</td> |
---|
500 | * <td>?</td> |
---|
501 | * <td></td> |
---|
502 | * </tr> |
---|
503 | * <tr> |
---|
504 | * <td>EUC_JP_Solaris</td> |
---|
505 | * <td>?</td> |
---|
506 | * <td></td> |
---|
507 | * </tr> |
---|
508 | * <tr> |
---|
509 | * <td>EUC_KR</td> |
---|
510 | * <td>EUC-KR</td> |
---|
511 | * <td>csEUCKR ksc5601 5601 ksc5601_1987 ksc_5601 ksc5601-1987 ks_c_5601-1987 euckr </td> |
---|
512 | * </tr> |
---|
513 | * <tr> |
---|
514 | * <td>EUC_TW</td> |
---|
515 | * <td>EUC-TW</td> |
---|
516 | * <td>x-EUC-TW cns11643 euctw </td> |
---|
517 | * </tr> |
---|
518 | * <tr> |
---|
519 | * <td>GB18030</td> |
---|
520 | * <td>GB18030</td> |
---|
521 | * <td>gb18030-2000 </td> |
---|
522 | * </tr> |
---|
523 | * <tr> |
---|
524 | * <td>GBK</td> |
---|
525 | * <td>windows-936</td> |
---|
526 | * <td>CP936 MS936 ms_936 x-mswin-936 </td> |
---|
527 | * </tr> |
---|
528 | * <tr> |
---|
529 | * <td>ISCII91</td> |
---|
530 | * <td>?</td> |
---|
531 | * <td>x-ISCII91 iscii </td> |
---|
532 | * </tr> |
---|
533 | * <tr> |
---|
534 | * <td>ISO2022CN</td> |
---|
535 | * <td>ISO-2022-CN</td> |
---|
536 | * <td></td> |
---|
537 | * </tr> |
---|
538 | * <tr> |
---|
539 | * <td>ISO2022JP</td> |
---|
540 | * <td>ISO-2022-JP</td> |
---|
541 | * <td>csISO2022JP JIS jis_encoding csjisencoding </td> |
---|
542 | * </tr> |
---|
543 | * <tr> |
---|
544 | * <td>ISO2022KR</td> |
---|
545 | * <td>ISO-2022-KR</td> |
---|
546 | * <td>csISO2022KR </td> |
---|
547 | * </tr> |
---|
548 | * <tr> |
---|
549 | * <td>ISO2022_CN_CNS</td> |
---|
550 | * <td>?</td> |
---|
551 | * <td></td> |
---|
552 | * </tr> |
---|
553 | * <tr> |
---|
554 | * <td>ISO2022_CN_GB</td> |
---|
555 | * <td>?</td> |
---|
556 | * <td></td> |
---|
557 | * </tr> |
---|
558 | * <tr> |
---|
559 | * <td>ISO8859_1</td> |
---|
560 | * <td>ISO-8859-1</td> |
---|
561 | * <td>ISO_8859-1:1987 iso-ir-100 ISO_8859-1 latin1 l1 IBM819 CP819 csISOLatin1 8859_1 819 IBM-819 ISO8859-1 ISO_8859_1 </td> |
---|
562 | * </tr> |
---|
563 | * <tr> |
---|
564 | * <td>ISO8859_13</td> |
---|
565 | * <td>ISO-8859-13</td> |
---|
566 | * <td></td> |
---|
567 | * </tr> |
---|
568 | * <tr> |
---|
569 | * <td>ISO8859_15</td> |
---|
570 | * <td>ISO-8859-15</td> |
---|
571 | * <td>ISO_8859-15 Latin-9 8859_15 csISOlatin9 IBM923 cp923 923 L9 IBM-923 ISO8859-15 LATIN9 LATIN0 csISOlatin0 ISO8859_15_FDIS </td> |
---|
572 | * </tr> |
---|
573 | * <tr> |
---|
574 | * <td>ISO8859_2</td> |
---|
575 | * <td>ISO-8859-2</td> |
---|
576 | * <td>ISO_8859-2:1987 iso-ir-101 ISO_8859-2 latin2 l2 csISOLatin2 8859_2 iso8859_2 </td> |
---|
577 | * </tr> |
---|
578 | * <tr> |
---|
579 | * <td>ISO8859_3</td> |
---|
580 | * <td>ISO-8859-3</td> |
---|
581 | * <td>ISO_8859-3:1988 iso-ir-109 ISO_8859-3 latin3 l3 csISOLatin3 8859_3 </td> |
---|
582 | * </tr> |
---|
583 | * <tr> |
---|
584 | * <td>ISO8859_4</td> |
---|
585 | * <td>ISO-8859-4</td> |
---|
586 | * <td>ISO_8859-4:1988 iso-ir-110 ISO_8859-4 latin4 l4 csISOLatin4 8859_4 </td> |
---|
587 | * </tr> |
---|
588 | * <tr> |
---|
589 | * <td>ISO8859_5</td> |
---|
590 | * <td>ISO-8859-5</td> |
---|
591 | * <td>ISO_8859-5:1988 iso-ir-144 ISO_8859-5 cyrillic csISOLatinCyrillic 8859_5 </td> |
---|
592 | * </tr> |
---|
593 | * <tr> |
---|
594 | * <td>ISO8859_6</td> |
---|
595 | * <td>ISO-8859-6</td> |
---|
596 | * <td>ISO_8859-6:1987 iso-ir-127 ISO_8859-6 ECMA-114 ASMO-708 arabic csISOLatinArabic 8859_6 </td> |
---|
597 | * </tr> |
---|
598 | * <tr> |
---|
599 | * <td>ISO8859_7</td> |
---|
600 | * <td>ISO-8859-7</td> |
---|
601 | * <td>ISO_8859-7:1987 iso-ir-126 ISO_8859-7 ELOT_928 ECMA-118 greek greek8 csISOLatinGreek 8859_7 sun_eu_greek </td> |
---|
602 | * </tr> |
---|
603 | * <tr> |
---|
604 | * <td>ISO8859_8</td> |
---|
605 | * <td>ISO-8859-8</td> |
---|
606 | * <td>ISO_8859-8:1988 iso-ir-138 ISO_8859-8 hebrew csISOLatinHebrew 8859_8 </td> |
---|
607 | * </tr> |
---|
608 | * <tr> |
---|
609 | * <td>ISO8859_9</td> |
---|
610 | * <td>ISO-8859-9</td> |
---|
611 | * <td>ISO_8859-9:1989 iso-ir-148 ISO_8859-9 latin5 l5 csISOLatin5 8859_9 </td> |
---|
612 | * </tr> |
---|
613 | * <tr> |
---|
614 | * <td>JISAutoDetect</td> |
---|
615 | * <td>?</td> |
---|
616 | * <td></td> |
---|
617 | * </tr> |
---|
618 | * <tr> |
---|
619 | * <td>JIS_C6626-1983</td> |
---|
620 | * <td>JIS_C6626-1983</td> |
---|
621 | * <td>x-JIS0208 JIS0208 csISO87JISX0208 x0208 JIS_X0208-1983 iso-ir-87 </td> |
---|
622 | * </tr> |
---|
623 | * <tr> |
---|
624 | * <td>JIS_X0201</td> |
---|
625 | * <td>JIS_X0201</td> |
---|
626 | * <td>X0201 JIS0201 csHalfWidthKatakana </td> |
---|
627 | * </tr> |
---|
628 | * <tr> |
---|
629 | * <td>JIS_X0212-1990</td> |
---|
630 | * <td>JIS_X0212-1990</td> |
---|
631 | * <td>iso-ir-159 x0212 JIS0212 csISO159JISX02121990 </td> |
---|
632 | * </tr> |
---|
633 | * <tr> |
---|
634 | * <td>KOI8_R</td> |
---|
635 | * <td>KOI8-R</td> |
---|
636 | * <td>csKOI8R koi8 </td> |
---|
637 | * </tr> |
---|
638 | * <tr> |
---|
639 | * <td>MS874</td> |
---|
640 | * <td>windows-874</td> |
---|
641 | * <td>cp874 </td> |
---|
642 | * </tr> |
---|
643 | * <tr> |
---|
644 | * <td>MS932</td> |
---|
645 | * <td>Windows-31J</td> |
---|
646 | * <td>windows-932 csWindows31J x-ms-cp932 </td> |
---|
647 | * </tr> |
---|
648 | * <tr> |
---|
649 | * <td>MS949</td> |
---|
650 | * <td>windows-949</td> |
---|
651 | * <td>windows949 ms_949 x-windows-949 </td> |
---|
652 | * </tr> |
---|
653 | * <tr> |
---|
654 | * <td>MS950</td> |
---|
655 | * <td>windows-950</td> |
---|
656 | * <td>x-windows-950 </td> |
---|
657 | * </tr> |
---|
658 | * <tr> |
---|
659 | * <td>MS950_HKSCS</td> |
---|
660 | * <td></td> |
---|
661 | * <td></td> |
---|
662 | * </tr> |
---|
663 | * <tr> |
---|
664 | * <td>MacArabic</td> |
---|
665 | * <td>?</td> |
---|
666 | * <td></td> |
---|
667 | * </tr> |
---|
668 | * <tr> |
---|
669 | * <td>MacCentralEurope</td> |
---|
670 | * <td>?</td> |
---|
671 | * <td></td> |
---|
672 | * </tr> |
---|
673 | * <tr> |
---|
674 | * <td>MacCroatian</td> |
---|
675 | * <td>?</td> |
---|
676 | * <td></td> |
---|
677 | * </tr> |
---|
678 | * <tr> |
---|
679 | * <td>MacCyrillic</td> |
---|
680 | * <td>?</td> |
---|
681 | * <td></td> |
---|
682 | * </tr> |
---|
683 | * <tr> |
---|
684 | * <td>MacDingbat</td> |
---|
685 | * <td>?</td> |
---|
686 | * <td></td> |
---|
687 | * </tr> |
---|
688 | * <tr> |
---|
689 | * <td>MacGreek</td> |
---|
690 | * <td>MacGreek</td> |
---|
691 | * <td></td> |
---|
692 | * </tr> |
---|
693 | * <tr> |
---|
694 | * <td>MacHebrew</td> |
---|
695 | * <td>?</td> |
---|
696 | * <td></td> |
---|
697 | * </tr> |
---|
698 | * <tr> |
---|
699 | * <td>MacIceland</td> |
---|
700 | * <td>?</td> |
---|
701 | * <td></td> |
---|
702 | * </tr> |
---|
703 | * <tr> |
---|
704 | * <td>MacRoman</td> |
---|
705 | * <td>MacRoman</td> |
---|
706 | * <td>Macintosh MAC csMacintosh </td> |
---|
707 | * </tr> |
---|
708 | * <tr> |
---|
709 | * <td>MacRomania</td> |
---|
710 | * <td>?</td> |
---|
711 | * <td></td> |
---|
712 | * </tr> |
---|
713 | * <tr> |
---|
714 | * <td>MacSymbol</td> |
---|
715 | * <td>?</td> |
---|
716 | * <td></td> |
---|
717 | * </tr> |
---|
718 | * <tr> |
---|
719 | * <td>MacThai</td> |
---|
720 | * <td>?</td> |
---|
721 | * <td></td> |
---|
722 | * </tr> |
---|
723 | * <tr> |
---|
724 | * <td>MacTurkish</td> |
---|
725 | * <td>?</td> |
---|
726 | * <td></td> |
---|
727 | * </tr> |
---|
728 | * <tr> |
---|
729 | * <td>MacUkraine</td> |
---|
730 | * <td>?</td> |
---|
731 | * <td></td> |
---|
732 | * </tr> |
---|
733 | * <tr> |
---|
734 | * <td>SJIS</td> |
---|
735 | * <td>Shift_JIS</td> |
---|
736 | * <td>MS_Kanji csShiftJIS shift-jis x-sjis pck </td> |
---|
737 | * </tr> |
---|
738 | * <tr> |
---|
739 | * <td>TIS620</td> |
---|
740 | * <td>TIS-620</td> |
---|
741 | * <td></td> |
---|
742 | * </tr> |
---|
743 | * <tr> |
---|
744 | * <td>UTF-16</td> |
---|
745 | * <td>UTF-16</td> |
---|
746 | * <td>UTF_16 </td> |
---|
747 | * </tr> |
---|
748 | * <tr> |
---|
749 | * <td>UTF8</td> |
---|
750 | * <td>UTF-8</td> |
---|
751 | * <td></td> |
---|
752 | * </tr> |
---|
753 | * <tr> |
---|
754 | * <td>UnicodeBig</td> |
---|
755 | * <td>?</td> |
---|
756 | * <td></td> |
---|
757 | * </tr> |
---|
758 | * <tr> |
---|
759 | * <td>UnicodeBigUnmarked</td> |
---|
760 | * <td>UTF-16BE</td> |
---|
761 | * <td>X-UTF-16BE UTF_16BE ISO-10646-UCS-2 </td> |
---|
762 | * </tr> |
---|
763 | * <tr> |
---|
764 | * <td>UnicodeLittle</td> |
---|
765 | * <td>?</td> |
---|
766 | * <td></td> |
---|
767 | * </tr> |
---|
768 | * <tr> |
---|
769 | * <td>UnicodeLittleUnmarked</td> |
---|
770 | * <td>UTF-16LE</td> |
---|
771 | * <td>UTF_16LE X-UTF-16LE </td> |
---|
772 | * </tr> |
---|
773 | * <tr> |
---|
774 | * <td>x-Johab</td> |
---|
775 | * <td>johab</td> |
---|
776 | * <td>johab cp1361 ms1361 ksc5601-1992 ksc5601_1992 </td> |
---|
777 | * </tr> |
---|
778 | * <tr> |
---|
779 | * <td>x-iso-8859-11</td> |
---|
780 | * <td>?</td> |
---|
781 | * <td></td> |
---|
782 | * </tr> |
---|
783 | * </table> |
---|
784 | */ |
---|
785 | public class CharsetUtil { |
---|
786 | |
---|
787 | private static class Charset implements Comparable<Charset> { |
---|
788 | private String canonical = null; |
---|
789 | private String mime = null; |
---|
790 | private String[] aliases = null; |
---|
791 | |
---|
792 | private Charset(String canonical, String mime, String[] aliases) { |
---|
793 | this.canonical = canonical; |
---|
794 | this.mime = mime; |
---|
795 | this.aliases = aliases; |
---|
796 | } |
---|
797 | |
---|
798 | public int compareTo(Charset c) { |
---|
799 | return this.canonical.compareTo(c.canonical); |
---|
800 | } |
---|
801 | } |
---|
802 | |
---|
803 | private static Charset[] JAVA_CHARSETS = { |
---|
804 | new Charset("ISO8859_1", "ISO-8859-1", |
---|
805 | new String[] {"ISO_8859-1:1987", "iso-ir-100", "ISO_8859-1", |
---|
806 | "latin1", "l1", "IBM819", "CP819", |
---|
807 | "csISOLatin1", "8859_1", "819", "IBM-819", |
---|
808 | "ISO8859-1", "ISO_8859_1"}), |
---|
809 | new Charset("ISO8859_2", "ISO-8859-2", |
---|
810 | new String[] {"ISO_8859-2:1987", "iso-ir-101", "ISO_8859-2", |
---|
811 | "latin2", "l2", "csISOLatin2", "8859_2", |
---|
812 | "iso8859_2"}), |
---|
813 | new Charset("ISO8859_3", "ISO-8859-3", new String[] {"ISO_8859-3:1988", "iso-ir-109", "ISO_8859-3", "latin3", "l3", "csISOLatin3", "8859_3"}), |
---|
814 | new Charset("ISO8859_4", "ISO-8859-4", |
---|
815 | new String[] {"ISO_8859-4:1988", "iso-ir-110", "ISO_8859-4", |
---|
816 | "latin4", "l4", "csISOLatin4", "8859_4"}), |
---|
817 | new Charset("ISO8859_5", "ISO-8859-5", |
---|
818 | new String[] {"ISO_8859-5:1988", "iso-ir-144", "ISO_8859-5", |
---|
819 | "cyrillic", "csISOLatinCyrillic", "8859_5"}), |
---|
820 | new Charset("ISO8859_6", "ISO-8859-6", new String[] {"ISO_8859-6:1987", "iso-ir-127", "ISO_8859-6", "ECMA-114", "ASMO-708", "arabic", "csISOLatinArabic", "8859_6"}), |
---|
821 | new Charset("ISO8859_7", "ISO-8859-7", |
---|
822 | new String[] {"ISO_8859-7:1987", "iso-ir-126", "ISO_8859-7", |
---|
823 | "ELOT_928", "ECMA-118", "greek", "greek8", |
---|
824 | "csISOLatinGreek", "8859_7", "sun_eu_greek"}), |
---|
825 | new Charset("ISO8859_8", "ISO-8859-8", new String[] {"ISO_8859-8:1988", "iso-ir-138", "ISO_8859-8", "hebrew", "csISOLatinHebrew", "8859_8"}), |
---|
826 | new Charset("ISO8859_9", "ISO-8859-9", |
---|
827 | new String[] {"ISO_8859-9:1989", "iso-ir-148", "ISO_8859-9", |
---|
828 | "latin5", "l5", "csISOLatin5", "8859_9"}), |
---|
829 | |
---|
830 | new Charset("ISO8859_13", "ISO-8859-13", new String[] {}), |
---|
831 | new Charset("ISO8859_15", "ISO-8859-15", |
---|
832 | new String[] {"ISO_8859-15", "Latin-9", "8859_15", |
---|
833 | "csISOlatin9", "IBM923", "cp923", "923", "L9", |
---|
834 | "IBM-923", "ISO8859-15", "LATIN9", "LATIN0", |
---|
835 | "csISOlatin0", "ISO8859_15_FDIS"}), |
---|
836 | new Charset("KOI8_R", "KOI8-R", new String[] {"csKOI8R", "koi8"}), |
---|
837 | new Charset("ASCII", "US-ASCII", |
---|
838 | new String[] {"ANSI_X3.4-1968", "iso-ir-6", |
---|
839 | "ANSI_X3.4-1986", "ISO_646.irv:1991", |
---|
840 | "ISO646-US", "us", "IBM367", "cp367", |
---|
841 | "csASCII", "ascii7", "646", "iso_646.irv:1983"}), |
---|
842 | new Charset("UTF8", "UTF-8", new String[] {}), |
---|
843 | new Charset("UTF-16", "UTF-16", new String[] {"UTF_16"}), |
---|
844 | new Charset("UnicodeBigUnmarked", "UTF-16BE", new String[] {"X-UTF-16BE", "UTF_16BE", "ISO-10646-UCS-2"}), |
---|
845 | new Charset("UnicodeLittleUnmarked", "UTF-16LE", new String[] {"UTF_16LE", "X-UTF-16LE"}), |
---|
846 | new Charset("Big5", "Big5", new String[] {"csBig5", "CN-Big5", "BIG-FIVE", "BIGFIVE"}), |
---|
847 | new Charset("Big5_HKSCS", "Big5-HKSCS", new String[] {"big5hkscs"}), |
---|
848 | new Charset("EUC_JP", "EUC-JP", |
---|
849 | new String[] {"csEUCPkdFmtJapanese", |
---|
850 | "Extended_UNIX_Code_Packed_Format_for_Japanese", |
---|
851 | "eucjis", "x-eucjp", "eucjp", "x-euc-jp"}), |
---|
852 | new Charset("EUC_KR", "EUC-KR", |
---|
853 | new String[] {"csEUCKR", "ksc5601", "5601", "ksc5601_1987", |
---|
854 | "ksc_5601", "ksc5601-1987", "ks_c_5601-1987", |
---|
855 | "euckr"}), |
---|
856 | new Charset("GB18030", "GB18030", new String[] {"gb18030-2000"}), |
---|
857 | new Charset("EUC_CN", "GB2312", new String[] {"x-EUC-CN", "csGB2312", "euccn", "euc-cn", "gb2312-80", "gb2312-1980", "CN-GB", "CN-GB-ISOIR165"}), |
---|
858 | new Charset("GBK", "windows-936", new String[] {"CP936", "MS936", "ms_936", "x-mswin-936"}), |
---|
859 | |
---|
860 | new Charset("Cp037", "IBM037", new String[] {"ebcdic-cp-us", "ebcdic-cp-ca", "ebcdic-cp-wt", "ebcdic-cp-nl", "csIBM037"}), |
---|
861 | new Charset("Cp273", "IBM273", new String[] {"csIBM273"}), |
---|
862 | new Charset("Cp277", "IBM277", new String[] {"EBCDIC-CP-DK", "EBCDIC-CP-NO", "csIBM277"}), |
---|
863 | new Charset("Cp278", "IBM278", new String[] {"CP278", "ebcdic-cp-fi", "ebcdic-cp-se", "csIBM278"}), |
---|
864 | new Charset("Cp280", "IBM280", new String[] {"ebcdic-cp-it", "csIBM280"}), |
---|
865 | new Charset("Cp284", "IBM284", new String[] {"ebcdic-cp-es", "csIBM284"}), |
---|
866 | new Charset("Cp285", "IBM285", new String[] {"ebcdic-cp-gb", "csIBM285"}), |
---|
867 | new Charset("Cp297", "IBM297", new String[] {"ebcdic-cp-fr", "csIBM297"}), |
---|
868 | new Charset("Cp420", "IBM420", new String[] {"ebcdic-cp-ar1", "csIBM420"}), |
---|
869 | new Charset("Cp424", "IBM424", new String[] {"ebcdic-cp-he", "csIBM424"}), |
---|
870 | new Charset("Cp437", "IBM437", new String[] {"437", "csPC8CodePage437"}), |
---|
871 | new Charset("Cp500", "IBM500", new String[] {"ebcdic-cp-be", "ebcdic-cp-ch", "csIBM500"}), |
---|
872 | new Charset("Cp775", "IBM775", new String[] {"csPC775Baltic"}), |
---|
873 | new Charset("Cp838", "IBM-Thai", new String[] {}), |
---|
874 | new Charset("Cp850", "IBM850", new String[] {"850", "csPC850Multilingual"}), |
---|
875 | new Charset("Cp852", "IBM852", new String[] {"852", "csPCp852"}), |
---|
876 | new Charset("Cp855", "IBM855", new String[] {"855", "csIBM855"}), |
---|
877 | new Charset("Cp857", "IBM857", new String[] {"857", "csIBM857"}), |
---|
878 | new Charset("Cp858", "IBM00858", |
---|
879 | new String[] {"CCSID00858", "CP00858", |
---|
880 | "PC-Multilingual-850+euro"}), |
---|
881 | new Charset("Cp860", "IBM860", new String[] {"860", "csIBM860"}), |
---|
882 | new Charset("Cp861", "IBM861", new String[] {"861", "cp-is", "csIBM861"}), |
---|
883 | new Charset("Cp862", "IBM862", new String[] {"862", "csPC862LatinHebrew"}), |
---|
884 | new Charset("Cp863", "IBM863", new String[] {"863", "csIBM863"}), |
---|
885 | new Charset("Cp864", "IBM864", new String[] {"cp864", "csIBM864"}), |
---|
886 | new Charset("Cp865", "IBM865", new String[] {"865", "csIBM865"}), |
---|
887 | new Charset("Cp866", "IBM866", new String[] {"866", "csIBM866"}), |
---|
888 | new Charset("Cp868", "IBM868", new String[] {"cp-ar", "csIBM868"}), |
---|
889 | new Charset("Cp869", "IBM869", new String[] {"cp-gr", "csIBM869"}), |
---|
890 | new Charset("Cp870", "IBM870", new String[] {"ebcdic-cp-roece", "ebcdic-cp-yu", "csIBM870"}), |
---|
891 | new Charset("Cp871", "IBM871", new String[] {"ebcdic-cp-is", "csIBM871"}), |
---|
892 | new Charset("Cp918", "IBM918", new String[] {"ebcdic-cp-ar2", "csIBM918"}), |
---|
893 | new Charset("Cp1026", "IBM1026", new String[] {"csIBM1026"}), |
---|
894 | new Charset("Cp1047", "IBM1047", new String[] {"IBM-1047"}), |
---|
895 | new Charset("Cp1140", "IBM01140", |
---|
896 | new String[] {"CCSID01140", "CP01140", |
---|
897 | "ebcdic-us-37+euro"}), |
---|
898 | new Charset("Cp1141", "IBM01141", |
---|
899 | new String[] {"CCSID01141", "CP01141", |
---|
900 | "ebcdic-de-273+euro"}), |
---|
901 | new Charset("Cp1142", "IBM01142", new String[] {"CCSID01142", "CP01142", "ebcdic-dk-277+euro", "ebcdic-no-277+euro"}), |
---|
902 | new Charset("Cp1143", "IBM01143", new String[] {"CCSID01143", "CP01143", "ebcdic-fi-278+euro", "ebcdic-se-278+euro"}), |
---|
903 | new Charset("Cp1144", "IBM01144", new String[] {"CCSID01144", "CP01144", "ebcdic-it-280+euro"}), |
---|
904 | new Charset("Cp1145", "IBM01145", new String[] {"CCSID01145", "CP01145", "ebcdic-es-284+euro"}), |
---|
905 | new Charset("Cp1146", "IBM01146", new String[] {"CCSID01146", "CP01146", "ebcdic-gb-285+euro"}), |
---|
906 | new Charset("Cp1147", "IBM01147", new String[] {"CCSID01147", "CP01147", "ebcdic-fr-297+euro"}), |
---|
907 | new Charset("Cp1148", "IBM01148", new String[] {"CCSID01148", "CP01148", "ebcdic-international-500+euro"}), |
---|
908 | new Charset("Cp1149", "IBM01149", new String[] {"CCSID01149", "CP01149", "ebcdic-is-871+euro"}), |
---|
909 | new Charset("Cp1250", "windows-1250", new String[] {}), |
---|
910 | new Charset("Cp1251", "windows-1251", new String[] {}), |
---|
911 | new Charset("Cp1252", "windows-1252", new String[] {}), |
---|
912 | new Charset("Cp1253", "windows-1253", new String[] {}), |
---|
913 | new Charset("Cp1254", "windows-1254", new String[] {}), |
---|
914 | new Charset("Cp1255", "windows-1255", new String[] {}), |
---|
915 | new Charset("Cp1256", "windows-1256", new String[] {}), |
---|
916 | new Charset("Cp1257", "windows-1257", new String[] {}), |
---|
917 | new Charset("Cp1258", "windows-1258", new String[] {}), |
---|
918 | new Charset("ISO2022CN", "ISO-2022-CN", new String[] {}), |
---|
919 | new Charset("ISO2022JP", "ISO-2022-JP", new String[] {"csISO2022JP", "JIS", "jis_encoding", "csjisencoding"}), |
---|
920 | new Charset("ISO2022KR", "ISO-2022-KR", new String[] {"csISO2022KR"}), |
---|
921 | new Charset("JIS_X0201", "JIS_X0201", new String[] {"X0201", "JIS0201", "csHalfWidthKatakana"}), |
---|
922 | new Charset("JIS_X0212-1990", "JIS_X0212-1990", new String[] {"iso-ir-159", "x0212", "JIS0212", "csISO159JISX02121990"}), |
---|
923 | new Charset("JIS_C6626-1983", "JIS_C6626-1983", new String[] {"x-JIS0208", "JIS0208", "csISO87JISX0208", "x0208", "JIS_X0208-1983", "iso-ir-87"}), |
---|
924 | new Charset("SJIS", "Shift_JIS", new String[] {"MS_Kanji", "csShiftJIS", "shift-jis", "x-sjis", "pck"}), |
---|
925 | new Charset("TIS620", "TIS-620", new String[] {}), |
---|
926 | new Charset("MS932", "Windows-31J", new String[] {"windows-932", "csWindows31J", "x-ms-cp932"}), |
---|
927 | new Charset("EUC_TW", "EUC-TW", new String[] {"x-EUC-TW", "cns11643", "euctw"}), |
---|
928 | new Charset("x-Johab", "johab", new String[] {"johab", "cp1361", "ms1361", "ksc5601-1992", "ksc5601_1992"}), |
---|
929 | new Charset("MS950_HKSCS", "", new String[] {}), |
---|
930 | new Charset("MS874", "windows-874", new String[] {"cp874"}), |
---|
931 | new Charset("MS949", "windows-949", new String[] {"windows949", "ms_949", "x-windows-949"}), |
---|
932 | new Charset("MS950", "windows-950", new String[] {"x-windows-950"}), |
---|
933 | |
---|
934 | new Charset("Cp737", null, new String[] {}), |
---|
935 | new Charset("Cp856", null, new String[] {}), |
---|
936 | new Charset("Cp875", null, new String[] {}), |
---|
937 | new Charset("Cp921", null, new String[] {}), |
---|
938 | new Charset("Cp922", null, new String[] {}), |
---|
939 | new Charset("Cp930", null, new String[] {}), |
---|
940 | new Charset("Cp933", null, new String[] {}), |
---|
941 | new Charset("Cp935", null, new String[] {}), |
---|
942 | new Charset("Cp937", null, new String[] {}), |
---|
943 | new Charset("Cp939", null, new String[] {}), |
---|
944 | new Charset("Cp942", null, new String[] {}), |
---|
945 | new Charset("Cp942C", null, new String[] {}), |
---|
946 | new Charset("Cp943", null, new String[] {}), |
---|
947 | new Charset("Cp943C", null, new String[] {}), |
---|
948 | new Charset("Cp948", null, new String[] {}), |
---|
949 | new Charset("Cp949", null, new String[] {}), |
---|
950 | new Charset("Cp949C", null, new String[] {}), |
---|
951 | new Charset("Cp950", null, new String[] {}), |
---|
952 | new Charset("Cp964", null, new String[] {}), |
---|
953 | new Charset("Cp970", null, new String[] {}), |
---|
954 | new Charset("Cp1006", null, new String[] {}), |
---|
955 | new Charset("Cp1025", null, new String[] {}), |
---|
956 | new Charset("Cp1046", null, new String[] {}), |
---|
957 | new Charset("Cp1097", null, new String[] {}), |
---|
958 | new Charset("Cp1098", null, new String[] {}), |
---|
959 | new Charset("Cp1112", null, new String[] {}), |
---|
960 | new Charset("Cp1122", null, new String[] {}), |
---|
961 | new Charset("Cp1123", null, new String[] {}), |
---|
962 | new Charset("Cp1124", null, new String[] {}), |
---|
963 | new Charset("Cp1381", null, new String[] {}), |
---|
964 | new Charset("Cp1383", null, new String[] {}), |
---|
965 | new Charset("Cp33722", null, new String[] {}), |
---|
966 | new Charset("Big5_Solaris", null, new String[] {}), |
---|
967 | new Charset("EUC_JP_LINUX", null, new String[] {}), |
---|
968 | new Charset("EUC_JP_Solaris", null, new String[] {}), |
---|
969 | new Charset("ISCII91", null, new String[] {"x-ISCII91", "iscii"}), |
---|
970 | new Charset("ISO2022_CN_CNS", null, new String[] {}), |
---|
971 | new Charset("ISO2022_CN_GB", null, new String[] {}), |
---|
972 | new Charset("x-iso-8859-11", null, new String[] {}), |
---|
973 | new Charset("JISAutoDetect", null, new String[] {}), |
---|
974 | new Charset("MacArabic", null, new String[] {}), |
---|
975 | new Charset("MacCentralEurope", null, new String[] {}), |
---|
976 | new Charset("MacCroatian", null, new String[] {}), |
---|
977 | new Charset("MacCyrillic", null, new String[] {}), |
---|
978 | new Charset("MacDingbat", null, new String[] {}), |
---|
979 | new Charset("MacGreek", "MacGreek", new String[] {}), |
---|
980 | new Charset("MacHebrew", null, new String[] {}), |
---|
981 | new Charset("MacIceland", null, new String[] {}), |
---|
982 | new Charset("MacRoman", "MacRoman", new String[] {"Macintosh", "MAC", "csMacintosh"}), |
---|
983 | new Charset("MacRomania", null, new String[] {}), |
---|
984 | new Charset("MacSymbol", null, new String[] {}), |
---|
985 | new Charset("MacThai", null, new String[] {}), |
---|
986 | new Charset("MacTurkish", null, new String[] {}), |
---|
987 | new Charset("MacUkraine", null, new String[] {}), |
---|
988 | new Charset("UnicodeBig", null, new String[] {}), |
---|
989 | new Charset("UnicodeLittle", null, new String[] {}) |
---|
990 | }; |
---|
991 | |
---|
992 | /** |
---|
993 | * Contains the canonical names of character sets which can be used to |
---|
994 | * decode bytes into Java chars. |
---|
995 | */ |
---|
996 | private static SortedSet<String> decodingSupported = new TreeSet<String>(); |
---|
997 | |
---|
998 | /** |
---|
999 | * Contains the canonical names of character sets which can be used to |
---|
1000 | * encode Java chars into bytes. |
---|
1001 | */ |
---|
1002 | private static SortedSet<String> encodingSupported = new TreeSet<String>(); |
---|
1003 | |
---|
1004 | /** |
---|
1005 | * Maps character set names to Charset objects. All possible names of |
---|
1006 | * a charset will be mapped to the Charset. |
---|
1007 | */ |
---|
1008 | private static Map<String, Charset> charsetMap = null; |
---|
1009 | |
---|
1010 | /** |
---|
1011 | * Map tracking which charset encoding/decodings have been |
---|
1012 | * tested during runtime. |
---|
1013 | */ |
---|
1014 | private static Map<String,Boolean> charsetsTested = new HashMap<String,Boolean>(); |
---|
1015 | |
---|
1016 | static { |
---|
1017 | |
---|
1018 | charsetMap = new HashMap<String, Charset>(); |
---|
1019 | for (Charset c : JAVA_CHARSETS) { |
---|
1020 | charsetMap.put(c.canonical.toLowerCase(), c); |
---|
1021 | if (c.mime != null) { |
---|
1022 | charsetMap.put(c.mime.toLowerCase(), c); |
---|
1023 | } |
---|
1024 | if (c.aliases != null) { |
---|
1025 | for (String str : c.aliases) { |
---|
1026 | charsetMap.put(str.toLowerCase(), c); |
---|
1027 | } |
---|
1028 | } |
---|
1029 | } |
---|
1030 | } |
---|
1031 | |
---|
1032 | /** carriage return - line feed sequence */ |
---|
1033 | public static final String CRLF = "\r\n"; |
---|
1034 | |
---|
1035 | /** US-ASCII CR, carriage return (13) */ |
---|
1036 | public static final int CR = '\r'; |
---|
1037 | |
---|
1038 | /** US-ASCII LF, line feed (10) */ |
---|
1039 | public static final int LF = '\n'; |
---|
1040 | |
---|
1041 | /** US-ASCII SP, space (32) */ |
---|
1042 | public static final int SP = ' '; |
---|
1043 | |
---|
1044 | /** US-ASCII HT, horizontal-tab (9) */ |
---|
1045 | public static final int HT = '\t'; |
---|
1046 | |
---|
1047 | public static final java.nio.charset.Charset US_ASCII = java.nio.charset.Charset |
---|
1048 | .forName("US-ASCII"); |
---|
1049 | |
---|
1050 | public static final java.nio.charset.Charset ISO_8859_1 = java.nio.charset.Charset |
---|
1051 | .forName("ISO-8859-1"); |
---|
1052 | |
---|
1053 | public static final java.nio.charset.Charset UTF_8 = java.nio.charset.Charset |
---|
1054 | .forName("UTF-8"); |
---|
1055 | |
---|
1056 | public static final java.nio.charset.Charset DEFAULT_CHARSET = US_ASCII; |
---|
1057 | |
---|
1058 | /** |
---|
1059 | * Returns <code>true</code> if the specified character falls into the US |
---|
1060 | * ASCII character set (Unicode range 0000 to 007f). |
---|
1061 | * |
---|
1062 | * @param ch |
---|
1063 | * character to test. |
---|
1064 | * @return <code>true</code> if the specified character falls into the US |
---|
1065 | * ASCII character set, <code>false</code> otherwise. |
---|
1066 | */ |
---|
1067 | public static boolean isASCII(char ch) { |
---|
1068 | return (0xFF80 & ch) == 0; |
---|
1069 | } |
---|
1070 | |
---|
1071 | /** |
---|
1072 | * Returns <code>true</code> if the specified string consists entirely of |
---|
1073 | * US ASCII characters. |
---|
1074 | * |
---|
1075 | * @param s |
---|
1076 | * string to test. |
---|
1077 | * @return <code>true</code> if the specified string consists entirely of |
---|
1078 | * US ASCII characters, <code>false</code> otherwise. |
---|
1079 | */ |
---|
1080 | public static boolean isASCII(final String s) { |
---|
1081 | if (s == null) { |
---|
1082 | throw new IllegalArgumentException("String may not be null"); |
---|
1083 | } |
---|
1084 | final int len = s.length(); |
---|
1085 | for (int i = 0; i < len; i++) { |
---|
1086 | if (!isASCII(s.charAt(i))) { |
---|
1087 | return false; |
---|
1088 | } |
---|
1089 | } |
---|
1090 | return true; |
---|
1091 | } |
---|
1092 | |
---|
1093 | /** |
---|
1094 | * Returns <code>true</code> if the specified character is a whitespace |
---|
1095 | * character (CR, LF, SP or HT). |
---|
1096 | * |
---|
1097 | * @param ch |
---|
1098 | * character to test. |
---|
1099 | * @return <code>true</code> if the specified character is a whitespace |
---|
1100 | * character, <code>false</code> otherwise. |
---|
1101 | */ |
---|
1102 | public static boolean isWhitespace(char ch) { |
---|
1103 | return ch == SP || ch == HT || ch == CR || ch == LF; |
---|
1104 | } |
---|
1105 | |
---|
1106 | /** |
---|
1107 | * Returns <code>true</code> if the specified string consists entirely of |
---|
1108 | * whitespace characters. |
---|
1109 | * |
---|
1110 | * @param s |
---|
1111 | * string to test. |
---|
1112 | * @return <code>true</code> if the specified string consists entirely of |
---|
1113 | * whitespace characters, <code>false</code> otherwise. |
---|
1114 | */ |
---|
1115 | public static boolean isWhitespace(final String s) { |
---|
1116 | if (s == null) { |
---|
1117 | throw new IllegalArgumentException("String may not be null"); |
---|
1118 | } |
---|
1119 | final int len = s.length(); |
---|
1120 | for (int i = 0; i < len; i++) { |
---|
1121 | if (!isWhitespace(s.charAt(i))) { |
---|
1122 | return false; |
---|
1123 | } |
---|
1124 | } |
---|
1125 | return true; |
---|
1126 | } |
---|
1127 | |
---|
1128 | /** |
---|
1129 | * Determines if the VM supports encoding (chars to bytes) the |
---|
1130 | * specified character set. NOTE: the given character set name may |
---|
1131 | * not be known to the VM even if this method returns <code>true</code>. |
---|
1132 | * Use {@link #toJavaCharset(String)} to get the canonical Java character |
---|
1133 | * set name. |
---|
1134 | * |
---|
1135 | * @param charsetName the characters set name. |
---|
1136 | * @return <code>true</code> if encoding is supported, <code>false</code> |
---|
1137 | * otherwise. |
---|
1138 | */ |
---|
1139 | public static boolean isEncodingSupported(String charsetName) { |
---|
1140 | if (!charsetsTested.containsKey(charsetName.toLowerCase())) { |
---|
1141 | testCharset(charsetName.toLowerCase()); |
---|
1142 | } |
---|
1143 | return encodingSupported.contains(charsetName.toLowerCase()); |
---|
1144 | } |
---|
1145 | |
---|
1146 | /** |
---|
1147 | * Determines if the VM supports decoding (bytes to chars) the |
---|
1148 | * specified character set. NOTE: the given character set name may |
---|
1149 | * not be known to the VM even if this method returns <code>true</code>. |
---|
1150 | * Use {@link #toJavaCharset(String)} to get the canonical Java character |
---|
1151 | * set name. |
---|
1152 | * |
---|
1153 | * @param charsetName the characters set name. |
---|
1154 | * @return <code>true</code> if decoding is supported, <code>false</code> |
---|
1155 | * otherwise. |
---|
1156 | */ |
---|
1157 | public static boolean isDecodingSupported(String charsetName) { |
---|
1158 | if (!charsetsTested.containsKey(charsetName.toLowerCase())) { |
---|
1159 | testCharset(charsetName.toLowerCase()); |
---|
1160 | } |
---|
1161 | return decodingSupported.contains(charsetName.toLowerCase()); |
---|
1162 | } |
---|
1163 | |
---|
1164 | |
---|
1165 | /** |
---|
1166 | * Runs underlying encoding/decodings tests to determine appropriate |
---|
1167 | * responses for {@link #isDecodingSupported(String)} and {@link #isEncodingSupported(String)} |
---|
1168 | * |
---|
1169 | * @param charsetName the characters set name. |
---|
1170 | */ |
---|
1171 | private static void testCharset(String charsetName) { |
---|
1172 | byte[] dummy = new byte[] {'d', 'u', 'm', 'm', 'y'}; |
---|
1173 | Charset c = charsetMap.get(charsetName.toLowerCase()); |
---|
1174 | if (null == c) { |
---|
1175 | charsetsTested.put(charsetName.toLowerCase(), null); |
---|
1176 | return; |
---|
1177 | } |
---|
1178 | |
---|
1179 | try { |
---|
1180 | new String(dummy, c.canonical); |
---|
1181 | decodingSupported.add(c.canonical.toLowerCase()); |
---|
1182 | } catch (UnsupportedOperationException e) { |
---|
1183 | } catch (UnsupportedEncodingException e) { |
---|
1184 | } |
---|
1185 | |
---|
1186 | try { |
---|
1187 | "dummy".getBytes(c.canonical); |
---|
1188 | encodingSupported.add(c.canonical.toLowerCase()); |
---|
1189 | } catch (UnsupportedOperationException e) { |
---|
1190 | } catch (UnsupportedEncodingException e) { |
---|
1191 | } |
---|
1192 | |
---|
1193 | charsetsTested.put(charsetName.toLowerCase(), null); |
---|
1194 | } |
---|
1195 | |
---|
1196 | |
---|
1197 | |
---|
1198 | /** |
---|
1199 | * Gets the preferred MIME character set name for the specified |
---|
1200 | * character set or <code>null</code> if not known. |
---|
1201 | * |
---|
1202 | * @param charsetName the character set name to look for. |
---|
1203 | * @return the MIME preferred name or <code>null</code> if not known. |
---|
1204 | */ |
---|
1205 | public static String toMimeCharset(String charsetName) { |
---|
1206 | Charset c = charsetMap.get(charsetName.toLowerCase()); |
---|
1207 | if (c != null) { |
---|
1208 | return c.mime; |
---|
1209 | } |
---|
1210 | return null; |
---|
1211 | } |
---|
1212 | |
---|
1213 | /** |
---|
1214 | * Gets the canonical Java character set name for the specified |
---|
1215 | * character set or <code>null</code> if not known. This should be |
---|
1216 | * called before doing any conversions using the Java API. NOTE: |
---|
1217 | * you must use {@link #isEncodingSupported(String)} or |
---|
1218 | * {@link #isDecodingSupported(String)} to make sure the returned |
---|
1219 | * Java character set is supported by the current VM. |
---|
1220 | * |
---|
1221 | * @param charsetName the character set name to look for. |
---|
1222 | * @return the canonical Java name or <code>null</code> if not known. |
---|
1223 | */ |
---|
1224 | public static String toJavaCharset(String charsetName) { |
---|
1225 | Charset c = charsetMap.get(charsetName.toLowerCase()); |
---|
1226 | if (c != null) { |
---|
1227 | return c.canonical; |
---|
1228 | } |
---|
1229 | return null; |
---|
1230 | } |
---|
1231 | |
---|
1232 | /* |
---|
1233 | * Uncomment the code below and run the main method to regenerate the |
---|
1234 | * Javadoc table above when the known charsets change. |
---|
1235 | */ |
---|
1236 | |
---|
1237 | /* |
---|
1238 | private static String dumpHtmlTable() { |
---|
1239 | List<Charset> l = new LinkedList<Charset>(Arrays.asList(JAVA_CHARSETS)); |
---|
1240 | Collections.sort(l); |
---|
1241 | StringBuilder sb = new StringBuilder(); |
---|
1242 | sb.append(" * <table>\n"); |
---|
1243 | sb.append(" * <tr>\n"); |
---|
1244 | sb.append(" * <td>Canonical (Java) name</td>\n"); |
---|
1245 | sb.append(" * <td>MIME preferred</td>\n"); |
---|
1246 | sb.append(" * <td>Aliases</td>\n"); |
---|
1247 | sb.append(" * </tr>\n"); |
---|
1248 | |
---|
1249 | for (Charset c : l) { |
---|
1250 | sb.append(" * <tr>\n"); |
---|
1251 | sb.append(" * <td>" + c.canonical + "</td>\n"); |
---|
1252 | sb.append(" * <td>" + (c.mime == null ? "?" : c.mime)+ "</td>\n"); |
---|
1253 | sb.append(" * <td>"); |
---|
1254 | for (int i = 0; c.aliases != null && i < c.aliases.length; i++) { |
---|
1255 | sb.append(c.aliases[i] + " "); |
---|
1256 | } |
---|
1257 | sb.append("</td>\n"); |
---|
1258 | sb.append(" * </tr>\n"); |
---|
1259 | } |
---|
1260 | sb.append(" * </table>\n"); |
---|
1261 | return sb.toString(); |
---|
1262 | } |
---|
1263 | |
---|
1264 | public static void main(String[] args) { |
---|
1265 | System.out.println(dumpHtmlTable()); |
---|
1266 | } |
---|
1267 | */ |
---|
1268 | } |
---|