1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package com.sun.syndication.io;
18
19 import java.io.*;
20 import java.net.URL;
21 import java.net.URLConnection;
22 import java.net.HttpURLConnection;
23 import java.util.regex.Pattern;
24 import java.util.regex.Matcher;
25 import java.text.MessageFormat;
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52 public class XmlReader extends Reader {
53 private static final int BUFFER_SIZE = 4096;
54
55 private static final String UTF_8 = "UTF-8";
56 private static final String US_ASCII = "US-ASCII";
57 private static final String UTF_16BE = "UTF-16BE";
58 private static final String UTF_16LE = "UTF-16LE";
59 private static final String UTF_16 = "UTF-16";
60
61 private static String _staticDefaultEncoding = null;
62
63 private Reader _reader;
64 private String _encoding;
65 private String _defaultEncoding;
66
67
68
69
70
71
72
73
74
75
76
77
78 public static void setDefaultEncoding(String encoding) {
79 _staticDefaultEncoding = encoding;
80 }
81
82
83
84
85
86
87
88
89
90
91 public static String getDefaultEncoding() {
92 return _staticDefaultEncoding;
93 }
94
95
96
97
98
99
100
101
102
103
104
105
106
107
/**
 * Creates a reader over the content of a file, using lenient raw-stream
 * encoding detection.
 *
 * @param file file to open and read.
 * @throws IOException if the file cannot be opened or the detection fails.
 */
public XmlReader(File file) throws IOException {
    // Same effect as delegating to the single-argument stream constructor,
    // which itself forwards with lenient == true.
    this(new FileInputStream(file), true);
}
111
112
113
114
115
116
117
118
119
120
121
122
123
/**
 * Creates a reader over a raw stream with lenient encoding detection.
 *
 * @param is stream to read the XML document from.
 * @throws IOException if the stream cannot be read or the detection fails.
 */
public XmlReader(InputStream is) throws IOException {
    this(is, true);
}
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
/**
 * Creates a reader over a raw stream, detecting the encoding from the
 * stream content only.
 *
 * @param is stream to read the XML document from.
 * @param lenient when {@code true} a detection failure is handed to the
 *        lenient fallback; when {@code false} the failure is rethrown.
 * @throws IOException if the stream cannot be read.
 * @throws XmlReaderException if detection fails and {@code lenient} is {@code false}.
 */
public XmlReader(InputStream is, boolean lenient) throws IOException, XmlReaderException {
    _defaultEncoding = _staticDefaultEncoding;
    try {
        doRawStream(is, lenient);
    } catch (XmlReaderException detectionFailure) {
        if (lenient) {
            doLenientDetection(null, detectionFailure);
        } else {
            throw detectionFailure;
        }
    }
}
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
/**
 * Creates a reader over the resource a URL points to by opening a
 * connection and delegating to the connection-based constructor.
 *
 * @param url location of the XML document.
 * @throws IOException if the connection cannot be opened or read.
 */
public XmlReader(URL url) throws IOException {
    this(url.openConnection());
}
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203 public XmlReader(URLConnection conn) throws IOException {
204 _defaultEncoding = _staticDefaultEncoding;
205 boolean lenient = true;
206 if (conn instanceof HttpURLConnection) {
207 try {
208 doHttpStream(conn.getInputStream(),conn.getContentType(),lenient);
209 }
210 catch (XmlReaderException ex) {
211 doLenientDetection(conn.getContentType(),ex);
212 }
213 }
214 else
215 if (conn.getContentType()!=null) {
216 try {
217 doHttpStream(conn.getInputStream(),conn.getContentType(),lenient);
218 }
219 catch (XmlReaderException ex) {
220 doLenientDetection(conn.getContentType(),ex);
221 }
222 }
223 else {
224 try {
225 doRawStream(conn.getInputStream(),lenient);
226 }
227 catch (XmlReaderException ex) {
228 doLenientDetection(null,ex);
229 }
230 }
231 }
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
/**
 * Creates a reader over a stream that came with a content-type header,
 * using lenient detection and the class-wide default encoding.
 *
 * @param is stream to read the XML document from.
 * @param httpContentType value of the content-type header, may be {@code null}.
 * @throws IOException if the stream cannot be read or the detection fails.
 */
public XmlReader(InputStream is, String httpContentType) throws IOException {
    // Equivalent to the three-argument form, which forwards a null default encoding.
    this(is, httpContentType, true, null);
}
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
/**
 * Creates a reader over a stream that came with a content-type header.
 *
 * @param is stream to read the XML document from.
 * @param httpContentType value of the content-type header, may be {@code null}.
 * @param lenient when {@code true} a detection failure is handed to the
 *        lenient fallback; when {@code false} the failure is rethrown.
 * @param defaultEncoding fallback encoding for this instance; when
 *        {@code null} the class-wide default is used instead.
 * @throws IOException if the stream cannot be read.
 * @throws XmlReaderException if detection fails and {@code lenient} is {@code false}.
 */
public XmlReader(InputStream is, String httpContentType, boolean lenient, String defaultEncoding)
        throws IOException, XmlReaderException {
    if (defaultEncoding != null) {
        _defaultEncoding = defaultEncoding;
    } else {
        _defaultEncoding = _staticDefaultEncoding;
    }
    try {
        doHttpStream(is, httpContentType, lenient);
    } catch (XmlReaderException detectionFailure) {
        if (lenient) {
            doLenientDetection(httpContentType, detectionFailure);
        } else {
            throw detectionFailure;
        }
    }
}
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
/**
 * Creates a reader over a stream that came with a content-type header,
 * falling back to the class-wide default encoding.
 *
 * @param is stream to read the XML document from.
 * @param httpContentType value of the content-type header, may be {@code null}.
 * @param lenient whether a detection failure should be recovered from
 *        instead of being rethrown.
 * @throws IOException if the stream cannot be read.
 * @throws XmlReaderException if detection fails and {@code lenient} is {@code false}.
 */
public XmlReader(InputStream is, String httpContentType, boolean lenient)
        throws IOException, XmlReaderException {
    this(is, httpContentType, lenient, null);
}
328
329 private void doLenientDetection(String httpContentType,XmlReaderException ex) throws IOException {
330 if (httpContentType!=null) {
331 if (httpContentType.startsWith("text/html")) {
332 httpContentType = httpContentType.substring("text/html".length());
333 httpContentType = "text/xml" + httpContentType;
334 try {
335 doHttpStream(ex.getInputStream(),httpContentType,true);
336 ex = null;
337 }
338 catch (XmlReaderException ex2) {
339 ex = ex2;
340 }
341 }
342 }
343 if (ex!=null) {
344 String encoding = ex.getXmlEncoding();
345 if (encoding==null) {
346 encoding = ex.getContentTypeEncoding();
347 }
348 if (encoding==null) {
349 encoding = (_defaultEncoding == null) ? UTF_8 : _defaultEncoding;
350 }
351 prepareReader(ex.getInputStream(),encoding);
352 }
353 }
354
355
356
357
358
359
360
361 public String getEncoding() {
362 return _encoding;
363 }
364
365 public int read(char[] buf,int offset,int len) throws IOException {
366 return _reader.read(buf,offset,len);
367 }
368
369
370
371
372
373
374
375 public void close() throws IOException {
376 _reader.close();
377 }
378
379 private void doRawStream(InputStream is,boolean lenient) throws IOException {
380 BufferedInputStream pis = new BufferedInputStream(is, BUFFER_SIZE);
381 String bomEnc = getBOMEncoding(pis);
382 String xmlGuessEnc = getXMLGuessEncoding(pis);
383 String xmlEnc = getXmlProlog(pis,xmlGuessEnc);
384 String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
385 prepareReader(pis,encoding);
386 }
387
388 private void doHttpStream(InputStream is,String httpContentType,boolean lenient) throws IOException {
389 BufferedInputStream pis = new BufferedInputStream(is, BUFFER_SIZE);
390 String cTMime = getContentTypeMime(httpContentType);
391 String cTEnc = getContentTypeEncoding(httpContentType);
392 String bomEnc = getBOMEncoding(pis);
393 String xmlGuessEnc = getXMLGuessEncoding(pis);
394 String xmlEnc = getXmlProlog(pis,xmlGuessEnc);
395 String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis,lenient);
396 prepareReader(pis,encoding);
397 }
398
399 private void prepareReader(InputStream is,String encoding) throws IOException {
400 _reader = new InputStreamReader(is,encoding);
401 _encoding = encoding;
402 }
403
404
405 private String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is) throws IOException {
406 String encoding;
407 if (bomEnc==null) {
408 if (xmlGuessEnc==null || xmlEnc==null) {
409 encoding = (_defaultEncoding == null) ? UTF_8 : _defaultEncoding;
410 }
411 else
412 if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
413 encoding = xmlGuessEnc;
414 }
415 else {
416 encoding = xmlEnc;
417 }
418 }
419 else
420 if (bomEnc.equals(UTF_8)) {
421 if (xmlGuessEnc!=null && !xmlGuessEnc.equals(UTF_8)) {
422 throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
423 bomEnc,xmlGuessEnc,xmlEnc,is);
424 }
425 if (xmlEnc!=null && !xmlEnc.equals(UTF_8)) {
426 throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
427 bomEnc,xmlGuessEnc,xmlEnc,is);
428 }
429 encoding = UTF_8;
430 }
431 else
432 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
433 if (xmlGuessEnc!=null && !xmlGuessEnc.equals(bomEnc)) {
434 throw new IOException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}));
435 }
436 if (xmlEnc!=null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
437 throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
438 bomEnc,xmlGuessEnc,xmlEnc,is);
439 }
440 encoding =bomEnc;
441 }
442 else {
443 throw new XmlReaderException(RAW_EX_2.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
444 bomEnc,xmlGuessEnc,xmlEnc,is);
445 }
446 return encoding;
447 }
448
449
450 private String calculateHttpEncoding(String cTMime, String cTEnc, String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is,boolean lenient) throws IOException {
451 String encoding;
452 if (lenient & xmlEnc!=null) {
453 encoding = xmlEnc;
454 }
455 else {
456 boolean appXml = isAppXml(cTMime);
457 boolean textXml = isTextXml(cTMime);
458 if (appXml || textXml) {
459 if (cTEnc==null) {
460 if (appXml) {
461 encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is);
462 }
463 else {
464 encoding = (_defaultEncoding == null) ? US_ASCII : _defaultEncoding;
465 }
466 }
467 else
468 if (bomEnc!=null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
469 throw new XmlReaderException(HTTP_EX_1.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
470 cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
471 }
472 else
473 if (cTEnc.equals(UTF_16)) {
474 if (bomEnc!=null && bomEnc.startsWith(UTF_16)) {
475 encoding = bomEnc;
476 }
477 else {
478 throw new XmlReaderException(HTTP_EX_2.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
479 cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
480 }
481 }
482 else {
483 encoding = cTEnc;
484 }
485 }
486 else {
487 throw new XmlReaderException(HTTP_EX_3.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
488 cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
489 }
490 }
491 return encoding;
492 }
493
494
495 private static String getContentTypeMime(String httpContentType) {
496 String mime = null;
497 if (httpContentType!=null) {
498 int i = httpContentType.indexOf(";");
499 mime = ((i==-1) ? httpContentType : httpContentType.substring(0,i)).trim();
500 }
501 return mime;
502 }
503
504 private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");
505
506
507 private static String getContentTypeEncoding(String httpContentType) {
508 String encoding = null;
509 if (httpContentType!=null) {
510 int i = httpContentType.indexOf(";");
511 if (i>-1) {
512 String postMime = httpContentType.substring(i+1);
513 Matcher m = CHARSET_PATTERN.matcher(postMime);
514 encoding = (m.find()) ? m.group(1) : null;
515 encoding = (encoding!=null) ? encoding.toUpperCase() : null;
516 }
517 if (encoding != null &&
518 ((encoding.startsWith("\"") && encoding.endsWith("\"")) ||
519 (encoding.startsWith("'") && encoding.endsWith("'"))
520 )) {
521 encoding = encoding.substring(1, encoding.length() - 1);
522 }
523 }
524 return encoding;
525 }
526
527
528
529 private static String getBOMEncoding(BufferedInputStream is) throws IOException {
530 String encoding = null;
531 int[] bytes = new int[3];
532 is.mark(3);
533 bytes[0] = is.read();
534 bytes[1] = is.read();
535 bytes[2] = is.read();
536
537 if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
538 encoding = UTF_16BE;
539 is.reset();
540 is.read();
541 is.read();
542 }
543 else
544 if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
545 encoding = UTF_16LE;
546 is.reset();
547 is.read();
548 is.read();
549 }
550 else
551 if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
552 encoding = UTF_8;
553 }
554 else {
555 is.reset();
556 }
557 return encoding;
558 }
559
560
561 private static String getXMLGuessEncoding(BufferedInputStream is) throws IOException {
562 String encoding = null;
563 int[] bytes = new int[4];
564 is.mark(4);
565 bytes[0] = is.read();
566 bytes[1] = is.read();
567 bytes[2] = is.read();
568 bytes[3] = is.read();
569 is.reset();
570
571 if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F) {
572 encoding = UTF_16BE;
573 }
574 else
575 if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00) {
576 encoding = UTF_16LE;
577 }
578 else
579 if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
580 encoding = UTF_8;
581 }
582 return encoding;
583 }
584
585
586 private static final Pattern ENCODING_PATTERN =
587 Pattern.compile("<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", Pattern.MULTILINE);
588
589
590 private static String getXmlProlog(BufferedInputStream is,String guessedEnc) throws IOException {
591 String encoding = null;
592 if (guessedEnc!=null) {
593 byte[] bytes = new byte[BUFFER_SIZE];
594 is.mark(BUFFER_SIZE);
595 int offset = 0;
596 int max = BUFFER_SIZE;
597 int c = is.read(bytes,offset,max);
598 int firstGT = -1;
599 while (c!=-1 && firstGT==-1 && offset< BUFFER_SIZE) {
600 offset += c;
601 max -= c;
602 c = is.read(bytes,offset,max);
603 firstGT = new String(bytes, 0, offset).indexOf(">");
604 }
605 if (firstGT == -1) {
606 if (c == -1) {
607 throw new IOException("Unexpected end of XML stream");
608 }
609 else {
610 throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
611 }
612 }
613 int bytesRead = offset;
614 if (bytesRead>0) {
615 is.reset();
616 Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes,0,firstGT + 1), guessedEnc);
617 BufferedReader bReader = new BufferedReader(reader);
618 StringBuffer prolog = new StringBuffer();
619 String line = bReader.readLine();
620 while (line != null) {
621 prolog.append(line);
622 line = bReader.readLine();
623 }
624 Matcher m = ENCODING_PATTERN.matcher(prolog);
625 if (m.find()) {
626 encoding = m.group(1).toUpperCase();
627 encoding = encoding.substring(1,encoding.length()-1);
628 }
629 }
630 }
631 return encoding;
632 }
633
634
635 private static boolean isAppXml(String mime) {
636 return mime!=null &&
637 (mime.equals("application/xml") ||
638 mime.equals("application/xml-dtd") ||
639 mime.equals("application/xml-external-parsed-entity") ||
640 (mime.startsWith("application/") && mime.endsWith("+xml")));
641 }
642
643
644 private static boolean isTextXml(String mime) {
645 return mime!=null &&
646 (mime.equals("text/xml") ||
647 mime.equals("text/xml-external-parsed-entity") ||
648 (mime.startsWith("text/") && mime.endsWith("+xml")));
649 }
650
651 private static final MessageFormat RAW_EX_1 = new MessageFormat(
652 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
653
654 private static final MessageFormat RAW_EX_2 = new MessageFormat(
655 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
656
657 private static final MessageFormat HTTP_EX_1 = new MessageFormat(
658 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");
659
660 private static final MessageFormat HTTP_EX_2 = new MessageFormat(
661 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
662
663 private static final MessageFormat HTTP_EX_3 = new MessageFormat(
664 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");
665
666 }