1 | /* |
2 | * Copyright 2004 Sun Microsystems, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | * |
16 | */ |
17 | package com.sun.syndication.io; |
18 | |
19 | import java.io.*; |
20 | import java.net.URL; |
21 | import java.net.URLConnection; |
22 | import java.net.HttpURLConnection; |
23 | import java.util.regex.Pattern; |
24 | import java.util.regex.Matcher; |
25 | import java.text.MessageFormat; |
26 | |
27 | /** |
28 | * Character stream that handles (or at least attemtps to) all the necessary Voodo to figure out |
29 | * the charset encoding of the XML document within the stream. |
30 | * <p> |
31 | * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a |
32 | * character stream. |
33 | * <p> |
34 | * All this has to be done without consuming characters from the stream, if not the XML parser |
35 | * will not recognized the document as a valid XML. This is not 100% true, but it's close enough |
36 | * (UTF-8 BOM is not handled by all parsers right now, XmlReader handles it and things work in all |
37 | * parsers). |
38 | * <p> |
39 | * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and |
40 | * HTTP streams by offering a wide set of constructors. |
41 | * <P> |
42 | * By default the charset encoding detection is lenient, the constructor with the lenient flag |
43 | * can be used for an script (following HTTP MIME and XML specifications). |
44 | * All this is nicely explained by Mark Pilgrim in his blog, |
45 | * <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> |
46 | * Determining the character encoding of a feed</a>. |
47 | * <p> |
48 | * @author Alejandro Abdelnur |
49 | * @version revision 1.18 taken on 2008-03-06 from Rome (see |
50 | * https://rome.dev.java.net/source/browse/rome/src/java/com/sun/syndication/io/XmlReader.java) |
51 | */ |
52 | public class XmlReader extends Reader { |
53 | private static final int BUFFER_SIZE = 4096; |
54 | |
55 | private static final String UTF_8 = "UTF-8"; |
56 | private static final String US_ASCII = "US-ASCII"; |
57 | private static final String UTF_16BE = "UTF-16BE"; |
58 | private static final String UTF_16LE = "UTF-16LE"; |
59 | private static final String UTF_16 = "UTF-16"; |
60 | |
61 | private static String _staticDefaultEncoding = null; |
62 | |
63 | private Reader _reader; |
64 | private String _encoding; |
65 | private String _defaultEncoding; |
66 | |
67 | /** |
68 | * Sets the default encoding to use if none is set in HTTP content-type, |
69 | * XML prolog and the rules based on content-type are not adequate. |
70 | * <p/> |
71 | * If it is set to NULL the content-type based rules are used. |
72 | * <p/> |
73 | * By default it is NULL. |
74 | * <p/> |
75 | * |
76 | * @param encoding charset encoding to default to. |
77 | */ |
78 | public static void setDefaultEncoding(String encoding) { |
79 | _staticDefaultEncoding = encoding; |
80 | } |
81 | |
82 | /** |
83 | * Returns the default encoding to use if none is set in HTTP content-type, |
84 | * XML prolog and the rules based on content-type are not adequate. |
85 | * <p/> |
86 | * If it is NULL the content-type based rules are used. |
87 | * <p/> |
88 | * |
89 | * @return the default encoding to use. |
90 | */ |
91 | public static String getDefaultEncoding() { |
92 | return _staticDefaultEncoding; |
93 | } |
94 | |
95 | /** |
96 | * Creates a Reader for a File. |
97 | * <p> |
98 | * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also |
99 | * missing defaults to UTF-8. |
100 | * <p> |
101 | * It does a lenient charset encoding detection, check the constructor with the lenient parameter |
102 | * for details. |
103 | * <p> |
104 | * @param file File to create a Reader from. |
105 | * @throws IOException thrown if there is a problem reading the file. |
106 | * |
107 | */ |
108 | public XmlReader(File file) throws IOException { |
109 | this(new FileInputStream(file)); |
110 | } |
111 | |
112 | /** |
113 | * Creates a Reader for a raw InputStream. |
114 | * <p> |
115 | * It follows the same logic used for files. |
116 | * <p> |
117 | * It does a lenient charset encoding detection, check the constructor with the lenient parameter |
118 | * for details. |
119 | * <p> |
120 | * @param is InputStream to create a Reader from. |
121 | * @throws IOException thrown if there is a problem reading the stream. |
122 | * |
123 | */ |
124 | public XmlReader(InputStream is) throws IOException { |
125 | this(is,true); |
126 | } |
127 | |
128 | /** |
129 | * Creates a Reader for a raw InputStream. |
130 | * <p> |
131 | * It follows the same logic used for files. |
132 | * <p> |
133 | * If lenient detection is indicated and the detection above fails as per specifications it then attempts |
134 | * the following: |
135 | * <p> |
136 | * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. |
137 | * <p> |
138 | * Else if the XML prolog had a charset encoding that encoding is used. |
139 | * <p> |
140 | * Else if the content type had a charset encoding that encoding is used. |
141 | * <p> |
142 | * Else 'UTF-8' is used. |
143 | * <p> |
144 | * If lenient detection is indicated an XmlReaderException is never thrown. |
145 | * <p> |
146 | * @param is InputStream to create a Reader from. |
147 | * @param lenient indicates if the charset encoding detection should be relaxed. |
148 | * @throws IOException thrown if there is a problem reading the stream. |
149 | * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs. |
150 | * |
151 | */ |
152 | public XmlReader(InputStream is,boolean lenient) throws IOException, XmlReaderException { |
153 | _defaultEncoding = _staticDefaultEncoding; |
154 | try { |
155 | doRawStream(is,lenient); |
156 | } |
157 | catch (XmlReaderException ex) { |
158 | if (!lenient) { |
159 | throw ex; |
160 | } |
161 | else { |
162 | doLenientDetection(null,ex); |
163 | } |
164 | } |
165 | } |
166 | |
167 | /** |
168 | * Creates a Reader using the InputStream of a URL. |
169 | * <p> |
170 | * If the URL is not of type HTTP and there is not 'content-type' header in the fetched |
171 | * data it uses the same logic used for Files. |
172 | * <p> |
173 | * If the URL is a HTTP Url or there is a 'content-type' header in the fetched |
174 | * data it uses the same logic used for an InputStream with content-type. |
175 | * <p> |
176 | * It does a lenient charset encoding detection, check the constructor with the lenient parameter |
177 | * for details. |
178 | * <p> |
179 | * @param url URL to create a Reader from. |
180 | * @throws IOException thrown if there is a problem reading the stream of the URL. |
181 | * |
182 | */ |
183 | public XmlReader(URL url) throws IOException { |
184 | this(url.openConnection()); |
185 | } |
186 | |
187 | /** |
188 | * Creates a Reader using the InputStream of a URLConnection. |
189 | * <p> |
190 | * If the URLConnection is not of type HttpURLConnection and there is not |
191 | * 'content-type' header in the fetched data it uses the same logic used for files. |
192 | * <p> |
193 | * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched |
194 | * data it uses the same logic used for an InputStream with content-type. |
195 | * <p> |
196 | * It does a lenient charset encoding detection, check the constructor with the lenient parameter |
197 | * for details. |
198 | * <p> |
199 | * @param conn URLConnection to create a Reader from. |
200 | * @throws IOException thrown if there is a problem reading the stream of the URLConnection. |
201 | * |
202 | */ |
203 | public XmlReader(URLConnection conn) throws IOException { |
204 | _defaultEncoding = _staticDefaultEncoding; |
205 | boolean lenient = true; |
206 | if (conn instanceof HttpURLConnection) { |
207 | try { |
208 | doHttpStream(conn.getInputStream(),conn.getContentType(),lenient); |
209 | } |
210 | catch (XmlReaderException ex) { |
211 | doLenientDetection(conn.getContentType(),ex); |
212 | } |
213 | } |
214 | else |
215 | if (conn.getContentType()!=null) { |
216 | try { |
217 | doHttpStream(conn.getInputStream(),conn.getContentType(),lenient); |
218 | } |
219 | catch (XmlReaderException ex) { |
220 | doLenientDetection(conn.getContentType(),ex); |
221 | } |
222 | } |
223 | else { |
224 | try { |
225 | doRawStream(conn.getInputStream(),lenient); |
226 | } |
227 | catch (XmlReaderException ex) { |
228 | doLenientDetection(null,ex); |
229 | } |
230 | } |
231 | } |
232 | |
233 | /** |
234 | * Creates a Reader using an InputStream an the associated content-type header. |
235 | * <p> |
236 | * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. |
237 | * If there is not content-type encoding checks the XML prolog encoding. If there is not XML |
238 | * prolog encoding uses the default encoding mandated by the content-type MIME type. |
239 | * <p> |
240 | * It does a lenient charset encoding detection, check the constructor with the lenient parameter |
241 | * for details. |
242 | * <p> |
243 | * @param is InputStream to create the reader from. |
244 | * @param httpContentType content-type header to use for the resolution of the charset encoding. |
245 | * @throws IOException thrown if there is a problem reading the file. |
246 | * |
247 | */ |
248 | public XmlReader(InputStream is,String httpContentType) throws IOException { |
249 | this(is,httpContentType,true); |
250 | } |
251 | |
252 | /** |
253 | * Creates a Reader using an InputStream an the associated content-type header. This constructor is |
254 | * lenient regarding the encoding detection. |
255 | * <p> |
256 | * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. |
257 | * If there is not content-type encoding checks the XML prolog encoding. If there is not XML |
258 | * prolog encoding uses the default encoding mandated by the content-type MIME type. |
259 | * <p> |
260 | * If lenient detection is indicated and the detection above fails as per specifications it then attempts |
261 | * the following: |
262 | * <p> |
263 | * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. |
264 | * <p> |
265 | * Else if the XML prolog had a charset encoding that encoding is used. |
266 | * <p> |
267 | * Else if the content type had a charset encoding that encoding is used. |
268 | * <p> |
269 | * Else 'UTF-8' is used. |
270 | * <p> |
271 | * If lenient detection is indicated an XmlReaderException is never thrown. |
272 | * <p> |
273 | * @param is InputStream to create the reader from. |
274 | * @param httpContentType content-type header to use for the resolution of the charset encoding. |
275 | * @param lenient indicates if the charset encoding detection should be relaxed. |
276 | * @throws IOException thrown if there is a problem reading the file. |
277 | * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs. |
278 | * |
279 | */ |
280 | public XmlReader(InputStream is,String httpContentType,boolean lenient, String defaultEncoding) |
281 | throws IOException, XmlReaderException { |
282 | _defaultEncoding = (defaultEncoding == null) ? _staticDefaultEncoding : defaultEncoding; |
283 | try { |
284 | doHttpStream(is,httpContentType,lenient); |
285 | } |
286 | catch (XmlReaderException ex) { |
287 | if (!lenient) { |
288 | throw ex; |
289 | } |
290 | else { |
291 | doLenientDetection(httpContentType,ex); |
292 | } |
293 | } |
294 | } |
295 | |
296 | /** |
297 | * Creates a Reader using an InputStream an the associated content-type header. This constructor is |
298 | * lenient regarding the encoding detection. |
299 | * <p> |
300 | * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. |
301 | * If there is not content-type encoding checks the XML prolog encoding. If there is not XML |
302 | * prolog encoding uses the default encoding mandated by the content-type MIME type. |
303 | * <p> |
304 | * If lenient detection is indicated and the detection above fails as per specifications it then attempts |
305 | * the following: |
306 | * <p> |
307 | * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. |
308 | * <p> |
309 | * Else if the XML prolog had a charset encoding that encoding is used. |
310 | * <p> |
311 | * Else if the content type had a charset encoding that encoding is used. |
312 | * <p> |
313 | * Else 'UTF-8' is used. |
314 | * <p> |
315 | * If lenient detection is indicated an XmlReaderException is never thrown. |
316 | * <p> |
317 | * @param is InputStream to create the reader from. |
318 | * @param httpContentType content-type header to use for the resolution of the charset encoding. |
319 | * @param lenient indicates if the charset encoding detection should be relaxed. |
320 | * @throws IOException thrown if there is a problem reading the file. |
321 | * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs. |
322 | * |
323 | */ |
324 | public XmlReader(InputStream is, String httpContentType, boolean lenient) |
325 | throws IOException, XmlReaderException { |
326 | this(is, httpContentType, lenient, null); |
327 | } |
328 | |
329 | private void doLenientDetection(String httpContentType,XmlReaderException ex) throws IOException { |
330 | if (httpContentType!=null) { |
331 | if (httpContentType.startsWith("text/html")) { |
332 | httpContentType = httpContentType.substring("text/html".length()); |
333 | httpContentType = "text/xml" + httpContentType; |
334 | try { |
335 | doHttpStream(ex.getInputStream(),httpContentType,true); |
336 | ex = null; |
337 | } |
338 | catch (XmlReaderException ex2) { |
339 | ex = ex2; |
340 | } |
341 | } |
342 | } |
343 | if (ex!=null) { |
344 | String encoding = ex.getXmlEncoding(); |
345 | if (encoding==null) { |
346 | encoding = ex.getContentTypeEncoding(); |
347 | } |
348 | if (encoding==null) { |
349 | encoding = (_defaultEncoding == null) ? UTF_8 : _defaultEncoding; |
350 | } |
351 | prepareReader(ex.getInputStream(),encoding); |
352 | } |
353 | } |
354 | |
355 | /** |
356 | * Returns the charset encoding of the XmlReader. |
357 | * <p> |
358 | * @return charset encoding. |
359 | * |
360 | */ |
361 | public String getEncoding() { |
362 | return _encoding; |
363 | } |
364 | |
365 | public int read(char[] buf,int offset,int len) throws IOException { |
366 | return _reader.read(buf,offset,len); |
367 | } |
368 | |
369 | /** |
370 | * Closes the XmlReader stream. |
371 | * <p> |
372 | * @throws IOException thrown if there was a problem closing the stream. |
373 | * |
374 | */ |
375 | public void close() throws IOException { |
376 | _reader.close(); |
377 | } |
378 | |
379 | private void doRawStream(InputStream is,boolean lenient) throws IOException { |
380 | BufferedInputStream pis = new BufferedInputStream(is, BUFFER_SIZE); |
381 | String bomEnc = getBOMEncoding(pis); |
382 | String xmlGuessEnc = getXMLGuessEncoding(pis); |
383 | String xmlEnc = getXmlProlog(pis,xmlGuessEnc); |
384 | String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis); |
385 | prepareReader(pis,encoding); |
386 | } |
387 | |
388 | private void doHttpStream(InputStream is,String httpContentType,boolean lenient) throws IOException { |
389 | BufferedInputStream pis = new BufferedInputStream(is, BUFFER_SIZE); |
390 | String cTMime = getContentTypeMime(httpContentType); |
391 | String cTEnc = getContentTypeEncoding(httpContentType); |
392 | String bomEnc = getBOMEncoding(pis); |
393 | String xmlGuessEnc = getXMLGuessEncoding(pis); |
394 | String xmlEnc = getXmlProlog(pis,xmlGuessEnc); |
395 | String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis,lenient); |
396 | prepareReader(pis,encoding); |
397 | } |
398 | |
399 | private void prepareReader(InputStream is,String encoding) throws IOException { |
400 | _reader = new InputStreamReader(is,encoding); |
401 | _encoding = encoding; |
402 | } |
403 | |
404 | // InputStream is passed for XmlReaderException creation only |
405 | private String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is) throws IOException { |
406 | String encoding; |
407 | if (bomEnc==null) { |
408 | if (xmlGuessEnc==null || xmlEnc==null) { |
409 | encoding = (_defaultEncoding == null) ? UTF_8 : _defaultEncoding; |
410 | } |
411 | else |
412 | if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) { |
413 | encoding = xmlGuessEnc; |
414 | } |
415 | else { |
416 | encoding = xmlEnc; |
417 | } |
418 | } |
419 | else |
420 | if (bomEnc.equals(UTF_8)) { |
421 | if (xmlGuessEnc!=null && !xmlGuessEnc.equals(UTF_8)) { |
422 | throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}), |
423 | bomEnc,xmlGuessEnc,xmlEnc,is); |
424 | } |
425 | if (xmlEnc!=null && !xmlEnc.equals(UTF_8)) { |
426 | throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}), |
427 | bomEnc,xmlGuessEnc,xmlEnc,is); |
428 | } |
429 | encoding = UTF_8; |
430 | } |
431 | else |
432 | if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) { |
433 | if (xmlGuessEnc!=null && !xmlGuessEnc.equals(bomEnc)) { |
434 | throw new IOException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc})); |
435 | } |
436 | if (xmlEnc!=null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) { |
437 | throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}), |
438 | bomEnc,xmlGuessEnc,xmlEnc,is); |
439 | } |
440 | encoding =bomEnc; |
441 | } |
442 | else { |
443 | throw new XmlReaderException(RAW_EX_2.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}), |
444 | bomEnc,xmlGuessEnc,xmlEnc,is); |
445 | } |
446 | return encoding; |
447 | } |
448 | |
449 | // InputStream is passed for XmlReaderException creation only |
450 | private String calculateHttpEncoding(String cTMime, String cTEnc, String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is,boolean lenient) throws IOException { |
451 | String encoding; |
452 | if (lenient & xmlEnc!=null) { |
453 | encoding = xmlEnc; |
454 | } |
455 | else { |
456 | boolean appXml = isAppXml(cTMime); |
457 | boolean textXml = isTextXml(cTMime); |
458 | if (appXml || textXml) { |
459 | if (cTEnc==null) { |
460 | if (appXml) { |
461 | encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is); |
462 | } |
463 | else { |
464 | encoding = (_defaultEncoding == null) ? US_ASCII : _defaultEncoding; |
465 | } |
466 | } |
467 | else |
468 | if (bomEnc!=null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) { |
469 | throw new XmlReaderException(HTTP_EX_1.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}), |
470 | cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is); |
471 | } |
472 | else |
473 | if (cTEnc.equals(UTF_16)) { |
474 | if (bomEnc!=null && bomEnc.startsWith(UTF_16)) { |
475 | encoding = bomEnc; |
476 | } |
477 | else { |
478 | throw new XmlReaderException(HTTP_EX_2.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}), |
479 | cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is); |
480 | } |
481 | } |
482 | else { |
483 | encoding = cTEnc; |
484 | } |
485 | } |
486 | else { |
487 | throw new XmlReaderException(HTTP_EX_3.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}), |
488 | cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is); |
489 | } |
490 | } |
491 | return encoding; |
492 | } |
493 | |
494 | // returns MIME type or NULL if httpContentType is NULL |
495 | private static String getContentTypeMime(String httpContentType) { |
496 | String mime = null; |
497 | if (httpContentType!=null) { |
498 | int i = httpContentType.indexOf(";"); |
499 | mime = ((i==-1) ? httpContentType : httpContentType.substring(0,i)).trim(); |
500 | } |
501 | return mime; |
502 | } |
503 | |
504 | private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)"); |
505 | |
506 | // returns charset parameter value, NULL if not present, NULL if httpContentType is NULL |
507 | private static String getContentTypeEncoding(String httpContentType) { |
508 | String encoding = null; |
509 | if (httpContentType!=null) { |
510 | int i = httpContentType.indexOf(";"); |
511 | if (i>-1) { |
512 | String postMime = httpContentType.substring(i+1); |
513 | Matcher m = CHARSET_PATTERN.matcher(postMime); |
514 | encoding = (m.find()) ? m.group(1) : null; |
515 | encoding = (encoding!=null) ? encoding.toUpperCase() : null; |
516 | } |
517 | if (encoding != null && |
518 | ((encoding.startsWith("\"") && encoding.endsWith("\"")) || |
519 | (encoding.startsWith("'") && encoding.endsWith("'")) |
520 | )) { |
521 | encoding = encoding.substring(1, encoding.length() - 1); |
522 | } |
523 | } |
524 | return encoding; |
525 | } |
526 | |
527 | // returns the BOM in the stream, NULL if not present, |
528 | // if there was BOM the in the stream it is consumed |
529 | private static String getBOMEncoding(BufferedInputStream is) throws IOException { |
530 | String encoding = null; |
531 | int[] bytes = new int[3]; |
532 | is.mark(3); |
533 | bytes[0] = is.read(); |
534 | bytes[1] = is.read(); |
535 | bytes[2] = is.read(); |
536 | |
537 | if (bytes[0] == 0xFE && bytes[1] == 0xFF) { |
538 | encoding = UTF_16BE; |
539 | is.reset(); |
540 | is.read(); |
541 | is.read(); |
542 | } |
543 | else |
544 | if (bytes[0] == 0xFF && bytes[1] == 0xFE) { |
545 | encoding = UTF_16LE; |
546 | is.reset(); |
547 | is.read(); |
548 | is.read(); |
549 | } |
550 | else |
551 | if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) { |
552 | encoding = UTF_8; |
553 | } |
554 | else { |
555 | is.reset(); |
556 | } |
557 | return encoding; |
558 | } |
559 | |
560 | // returns the best guess for the encoding by looking the first bytes of the stream, '<?' |
561 | private static String getXMLGuessEncoding(BufferedInputStream is) throws IOException { |
562 | String encoding = null; |
563 | int[] bytes = new int[4]; |
564 | is.mark(4); |
565 | bytes[0] = is.read(); |
566 | bytes[1] = is.read(); |
567 | bytes[2] = is.read(); |
568 | bytes[3] = is.read(); |
569 | is.reset(); |
570 | |
571 | if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F) { |
572 | encoding = UTF_16BE; |
573 | } |
574 | else |
575 | if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00) { |
576 | encoding = UTF_16LE; |
577 | } |
578 | else |
579 | if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) { |
580 | encoding = UTF_8; |
581 | } |
582 | return encoding; |
583 | } |
584 | |
585 | |
586 | private static final Pattern ENCODING_PATTERN = |
587 | Pattern.compile("<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", Pattern.MULTILINE); |
588 | |
589 | // returns the encoding declared in the <?xml encoding=...?>, NULL if none |
590 | private static String getXmlProlog(BufferedInputStream is,String guessedEnc) throws IOException { |
591 | String encoding = null; |
592 | if (guessedEnc!=null) { |
593 | byte[] bytes = new byte[BUFFER_SIZE]; |
594 | is.mark(BUFFER_SIZE); |
595 | int offset = 0; |
596 | int max = BUFFER_SIZE; |
597 | int c = is.read(bytes,offset,max); |
598 | int firstGT = -1; |
599 | while (c!=-1 && firstGT==-1 && offset< BUFFER_SIZE) { |
600 | offset += c; |
601 | max -= c; |
602 | c = is.read(bytes,offset,max); |
603 | firstGT = new String(bytes, 0, offset).indexOf(">"); |
604 | } |
605 | if (firstGT == -1) { |
606 | if (c == -1) { |
607 | throw new IOException("Unexpected end of XML stream"); |
608 | } |
609 | else { |
610 | throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes"); |
611 | } |
612 | } |
613 | int bytesRead = offset; |
614 | if (bytesRead>0) { |
615 | is.reset(); |
616 | Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes,0,firstGT + 1), guessedEnc); |
617 | BufferedReader bReader = new BufferedReader(reader); |
618 | StringBuffer prolog = new StringBuffer(); |
619 | String line = bReader.readLine(); |
620 | while (line != null) { |
621 | prolog.append(line); |
622 | line = bReader.readLine(); |
623 | } |
624 | Matcher m = ENCODING_PATTERN.matcher(prolog); |
625 | if (m.find()) { |
626 | encoding = m.group(1).toUpperCase(); |
627 | encoding = encoding.substring(1,encoding.length()-1); |
628 | } |
629 | } |
630 | } |
631 | return encoding; |
632 | } |
633 | |
634 | // indicates if the MIME type belongs to the APPLICATION XML family |
635 | private static boolean isAppXml(String mime) { |
636 | return mime!=null && |
637 | (mime.equals("application/xml") || |
638 | mime.equals("application/xml-dtd") || |
639 | mime.equals("application/xml-external-parsed-entity") || |
640 | (mime.startsWith("application/") && mime.endsWith("+xml"))); |
641 | } |
642 | |
643 | // indicates if the MIME type belongs to the TEXT XML family |
644 | private static boolean isTextXml(String mime) { |
645 | return mime!=null && |
646 | (mime.equals("text/xml") || |
647 | mime.equals("text/xml-external-parsed-entity") || |
648 | (mime.startsWith("text/") && mime.endsWith("+xml"))); |
649 | } |
650 | |
651 | private static final MessageFormat RAW_EX_1 = new MessageFormat( |
652 | "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch"); |
653 | |
654 | private static final MessageFormat RAW_EX_2 = new MessageFormat( |
655 | "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM"); |
656 | |
657 | private static final MessageFormat HTTP_EX_1 = new MessageFormat( |
658 | "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL"); |
659 | |
660 | private static final MessageFormat HTTP_EX_2 = new MessageFormat( |
661 | "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch"); |
662 | |
663 | private static final MessageFormat HTTP_EX_3 = new MessageFormat( |
664 | "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME"); |
665 | |
666 | } |