EMMA Coverage Report

EMMA Coverage Report (generated Fri Feb 14 08:28:31 UTC 2014)
[all classes][com.sun.syndication.io]

COVERAGE SUMMARY FOR SOURCE FILE [XmlReader.java]

name	class, %	method, %	block, %	line, %
XmlReader.java	100% (1/1)	44% (12/27)	36% (409/1147)	43% (94.8/220)

COVERAGE BREAKDOWN BY CLASS AND METHOD

name	class, %	method, %	block, %	line, %

class XmlReader	100% (1/1)	44% (12/27)	36% (409/1147)	43% (94.8/220)
XmlReader (InputStream, String): void		0% (0/1)	0% (0/6)	0% (0/2)
XmlReader (InputStream, String, boolean): void		0% (0/1)	0% (0/7)	0% (0/2)
XmlReader (InputStream, String, boolean, String): void		0% (0/1)	0% (0/25)	0% (0/9)
XmlReader (URL): void		0% (0/1)	0% (0/5)	0% (0/2)
XmlReader (URLConnection): void		0% (0/1)	0% (0/55)	0% (0/18)
calculateHttpEncoding (String, String, String, String, String, InputStream, b...		0% (0/1)	0% (0/172)	0% (0/18)
doHttpStream (InputStream, String, boolean): void		0% (0/1)	0% (0/37)	0% (0/9)
doLenientDetection (String, XmlReaderException): void		0% (0/1)	0% (0/58)	0% (0/17)
getContentTypeEncoding (String): String		0% (0/1)	0% (0/65)	0% (0/11)
getContentTypeMime (String): String		0% (0/1)	0% (0/21)	0% (0/5)
getDefaultEncoding (): String		0% (0/1)	0% (0/2)	0% (0/1)
getEncoding (): String		0% (0/1)	0% (0/3)	0% (0/1)
isAppXml (String): boolean		0% (0/1)	0% (0/26)	0% (0/1)
isTextXml (String): boolean		0% (0/1)	0% (0/22)	0% (0/1)
setDefaultEncoding (String): void		0% (0/1)	0% (0/3)	0% (0/2)
calculateRawEncoding (String, String, String, InputStream): String		100% (1/1)	19% (38/197)	42% (8.4/20)
XmlReader (InputStream, boolean): void		100% (1/1)	55% (11/20)	56% (5/9)
getBOMEncoding (BufferedInputStream): String		100% (1/1)	67% (58/87)	69% (13.8/20)
getXmlProlog (BufferedInputStream, String): String		100% (1/1)	85% (121/143)	91% (29/32)
getXMLGuessEncoding (BufferedInputStream): String		100% (1/1)	88% (84/96)	90% (13.5/15)
<static initializer>		100% (1/1)	100% (35/35)	100% (8/8)
XmlReader (File): void		100% (1/1)	100% (7/7)	100% (2/2)
XmlReader (InputStream): void		100% (1/1)	100% (5/5)	100% (2/2)
close (): void		100% (1/1)	100% (4/4)	100% (2/2)
doRawStream (InputStream, boolean): void		100% (1/1)	100% (28/28)	100% (7/7)
prepareReader (InputStream, String): void		100% (1/1)	100% (11/11)	100% (3/3)
read (char [], int, int): int		100% (1/1)	100% (7/7)	100% (1/1)

1	/*
2	* Copyright 2004 Sun Microsystems, Inc.
3	*
4	* Licensed under the Apache License, Version 2.0 (the "License");
5	* you may not use this file except in compliance with the License.
6	* You may obtain a copy of the License at
7	*
8	* http://www.apache.org/licenses/LICENSE-2.0
9	*
10	* Unless required by applicable law or agreed to in writing, software
11	* distributed under the License is distributed on an "AS IS" BASIS,
12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	* See the License for the specific language governing permissions and
14	* limitations under the License.
15	*
16	*/
17	package com.sun.syndication.io;
18
19	import java.io.*;
20	import java.net.URL;
21	import java.net.URLConnection;
22	import java.net.HttpURLConnection;
23	import java.util.regex.Pattern;
24	import java.util.regex.Matcher;
25	import java.text.MessageFormat;
26
27	/**
28	* Character stream that handles (or at least attemtps to) all the necessary Voodo to figure out
29	* the charset encoding of the XML document within the stream.
30	* <p>
31	* IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a
32	* character stream.
33	* <p>
34	* All this has to be done without consuming characters from the stream, if not the XML parser
35	* will not recognized the document as a valid XML. This is not 100% true, but it's close enough
36	* (UTF-8 BOM is not handled by all parsers right now, XmlReader handles it and things work in all
37	* parsers).
38	* <p>
39	* The XmlReader class handles the charset encoding of XML documents in Files, raw streams and
40	* HTTP streams by offering a wide set of constructors.
41	* <P>
42	* By default the charset encoding detection is lenient, the constructor with the lenient flag
43	* can be used for an script (following HTTP MIME and XML specifications).
44	* All this is nicely explained by Mark Pilgrim in his blog,
45	* <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
46	* Determining the character encoding of a feed</a>.
47	* <p>
48	* @author Alejandro Abdelnur
49	* @version revision 1.18 taken on 2008-03-06 from Rome (see
50	* https://rome.dev.java.net/source/browse/rome/src/java/com/sun/syndication/io/XmlReader.java)
51	*/
52	public class XmlReader extends Reader {
53	private static final int BUFFER_SIZE = 4096;
54
55	private static final String UTF_8 = "UTF-8";
56	private static final String US_ASCII = "US-ASCII";
57	private static final String UTF_16BE = "UTF-16BE";
58	private static final String UTF_16LE = "UTF-16LE";
59	private static final String UTF_16 = "UTF-16";
60
61	private static String _staticDefaultEncoding = null;
62
63	private Reader _reader;
64	private String _encoding;
65	private String _defaultEncoding;
66
67	/**
68	* Sets the default encoding to use if none is set in HTTP content-type,
69	* XML prolog and the rules based on content-type are not adequate.
70	* <p/>
71	* If it is set to NULL the content-type based rules are used.
72	* <p/>
73	* By default it is NULL.
74	* <p/>
75	*
76	* @param encoding charset encoding to default to.
77	*/
78	public static void setDefaultEncoding(String encoding) {
79	_staticDefaultEncoding = encoding;
80	}
81
82	/**
83	* Returns the default encoding to use if none is set in HTTP content-type,
84	* XML prolog and the rules based on content-type are not adequate.
85	* <p/>
86	* If it is NULL the content-type based rules are used.
87	* <p/>
88	*
89	* @return the default encoding to use.
90	*/
91	public static String getDefaultEncoding() {
92	return _staticDefaultEncoding;
93	}
94
95	/**
96	* Creates a Reader for a File.
97	* <p>
98	* It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also
99	* missing defaults to UTF-8.
100	* <p>
101	* It does a lenient charset encoding detection, check the constructor with the lenient parameter
102	* for details.
103	* <p>
104	* @param file File to create a Reader from.
105	* @throws IOException thrown if there is a problem reading the file.
106	*
107	*/
108	public XmlReader(File file) throws IOException {
109	this(new FileInputStream(file));
110	}
111
112	/**
113	* Creates a Reader for a raw InputStream.
114	* <p>
115	* It follows the same logic used for files.
116	* <p>
117	* It does a lenient charset encoding detection, check the constructor with the lenient parameter
118	* for details.
119	* <p>
120	* @param is InputStream to create a Reader from.
121	* @throws IOException thrown if there is a problem reading the stream.
122	*
123	*/
124	public XmlReader(InputStream is) throws IOException {
125	this(is,true);
126	}
127
128	/**
129	* Creates a Reader for a raw InputStream.
130	* <p>
131	* It follows the same logic used for files.
132	* <p>
133	* If lenient detection is indicated and the detection above fails as per specifications it then attempts
134	* the following:
135	* <p>
136	* If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
137	* <p>
138	* Else if the XML prolog had a charset encoding that encoding is used.
139	* <p>
140	* Else if the content type had a charset encoding that encoding is used.
141	* <p>
142	* Else 'UTF-8' is used.
143	* <p>
144	* If lenient detection is indicated an XmlReaderException is never thrown.
145	* <p>
146	* @param is InputStream to create a Reader from.
147	* @param lenient indicates if the charset encoding detection should be relaxed.
148	* @throws IOException thrown if there is a problem reading the stream.
149	* @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
150	*
151	*/
152	public XmlReader(InputStream is,boolean lenient) throws IOException, XmlReaderException {
153	_defaultEncoding = _staticDefaultEncoding;
154	try {
155	doRawStream(is,lenient);
156	}
157	catch (XmlReaderException ex) {
158	if (!lenient) {
159	throw ex;
160	}
161	else {
162	doLenientDetection(null,ex);
163	}
164	}
165	}
166
167	/**
168	* Creates a Reader using the InputStream of a URL.
169	* <p>
170	* If the URL is not of type HTTP and there is not 'content-type' header in the fetched
171	* data it uses the same logic used for Files.
172	* <p>
173	* If the URL is a HTTP Url or there is a 'content-type' header in the fetched
174	* data it uses the same logic used for an InputStream with content-type.
175	* <p>
176	* It does a lenient charset encoding detection, check the constructor with the lenient parameter
177	* for details.
178	* <p>
179	* @param url URL to create a Reader from.
180	* @throws IOException thrown if there is a problem reading the stream of the URL.
181	*
182	*/
183	public XmlReader(URL url) throws IOException {
184	this(url.openConnection());
185	}
186
187	/**
188	* Creates a Reader using the InputStream of a URLConnection.
189	* <p>
190	* If the URLConnection is not of type HttpURLConnection and there is not
191	* 'content-type' header in the fetched data it uses the same logic used for files.
192	* <p>
193	* If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched
194	* data it uses the same logic used for an InputStream with content-type.
195	* <p>
196	* It does a lenient charset encoding detection, check the constructor with the lenient parameter
197	* for details.
198	* <p>
199	* @param conn URLConnection to create a Reader from.
200	* @throws IOException thrown if there is a problem reading the stream of the URLConnection.
201	*
202	*/
203	public XmlReader(URLConnection conn) throws IOException {
204	_defaultEncoding = _staticDefaultEncoding;
205	boolean lenient = true;
206	if (conn instanceof HttpURLConnection) {
207	try {
208	doHttpStream(conn.getInputStream(),conn.getContentType(),lenient);
209	}
210	catch (XmlReaderException ex) {
211	doLenientDetection(conn.getContentType(),ex);
212	}
213	}
214	else
215	if (conn.getContentType()!=null) {
216	try {
217	doHttpStream(conn.getInputStream(),conn.getContentType(),lenient);
218	}
219	catch (XmlReaderException ex) {
220	doLenientDetection(conn.getContentType(),ex);
221	}
222	}
223	else {
224	try {
225	doRawStream(conn.getInputStream(),lenient);
226	}
227	catch (XmlReaderException ex) {
228	doLenientDetection(null,ex);
229	}
230	}
231	}
232
233	/**
234	* Creates a Reader using an InputStream an the associated content-type header.
235	* <p>
236	* First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
237	* If there is not content-type encoding checks the XML prolog encoding. If there is not XML
238	* prolog encoding uses the default encoding mandated by the content-type MIME type.
239	* <p>
240	* It does a lenient charset encoding detection, check the constructor with the lenient parameter
241	* for details.
242	* <p>
243	* @param is InputStream to create the reader from.
244	* @param httpContentType content-type header to use for the resolution of the charset encoding.
245	* @throws IOException thrown if there is a problem reading the file.
246	*
247	*/
248	public XmlReader(InputStream is,String httpContentType) throws IOException {
249	this(is,httpContentType,true);
250	}
251
252	/**
253	* Creates a Reader using an InputStream an the associated content-type header. This constructor is
254	* lenient regarding the encoding detection.
255	* <p>
256	* First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
257	* If there is not content-type encoding checks the XML prolog encoding. If there is not XML
258	* prolog encoding uses the default encoding mandated by the content-type MIME type.
259	* <p>
260	* If lenient detection is indicated and the detection above fails as per specifications it then attempts
261	* the following:
262	* <p>
263	* If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
264	* <p>
265	* Else if the XML prolog had a charset encoding that encoding is used.
266	* <p>
267	* Else if the content type had a charset encoding that encoding is used.
268	* <p>
269	* Else 'UTF-8' is used.
270	* <p>
271	* If lenient detection is indicated an XmlReaderException is never thrown.
272	* <p>
273	* @param is InputStream to create the reader from.
274	* @param httpContentType content-type header to use for the resolution of the charset encoding.
275	* @param lenient indicates if the charset encoding detection should be relaxed.
276	* @throws IOException thrown if there is a problem reading the file.
277	* @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
278	*
279	*/
280	public XmlReader(InputStream is,String httpContentType,boolean lenient, String defaultEncoding)
281	throws IOException, XmlReaderException {
282	_defaultEncoding = (defaultEncoding == null) ? _staticDefaultEncoding : defaultEncoding;
283	try {
284	doHttpStream(is,httpContentType,lenient);
285	}
286	catch (XmlReaderException ex) {
287	if (!lenient) {
288	throw ex;
289	}
290	else {
291	doLenientDetection(httpContentType,ex);
292	}
293	}
294	}
295
296	/**
297	* Creates a Reader using an InputStream an the associated content-type header. This constructor is
298	* lenient regarding the encoding detection.
299	* <p>
300	* First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
301	* If there is not content-type encoding checks the XML prolog encoding. If there is not XML
302	* prolog encoding uses the default encoding mandated by the content-type MIME type.
303	* <p>
304	* If lenient detection is indicated and the detection above fails as per specifications it then attempts
305	* the following:
306	* <p>
307	* If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
308	* <p>
309	* Else if the XML prolog had a charset encoding that encoding is used.
310	* <p>
311	* Else if the content type had a charset encoding that encoding is used.
312	* <p>
313	* Else 'UTF-8' is used.
314	* <p>
315	* If lenient detection is indicated an XmlReaderException is never thrown.
316	* <p>
317	* @param is InputStream to create the reader from.
318	* @param httpContentType content-type header to use for the resolution of the charset encoding.
319	* @param lenient indicates if the charset encoding detection should be relaxed.
320	* @throws IOException thrown if there is a problem reading the file.
321	* @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
322	*
323	*/
324	public XmlReader(InputStream is, String httpContentType, boolean lenient)
325	throws IOException, XmlReaderException {
326	this(is, httpContentType, lenient, null);
327	}
328
329	private void doLenientDetection(String httpContentType,XmlReaderException ex) throws IOException {
330	if (httpContentType!=null) {
331	if (httpContentType.startsWith("text/html")) {
332	httpContentType = httpContentType.substring("text/html".length());
333	httpContentType = "text/xml" + httpContentType;
334	try {
335	doHttpStream(ex.getInputStream(),httpContentType,true);
336	ex = null;
337	}
338	catch (XmlReaderException ex2) {
339	ex = ex2;
340	}
341	}
342	}
343	if (ex!=null) {
344	String encoding = ex.getXmlEncoding();
345	if (encoding==null) {
346	encoding = ex.getContentTypeEncoding();
347	}
348	if (encoding==null) {
349	encoding = (_defaultEncoding == null) ? UTF_8 : _defaultEncoding;
350	}
351	prepareReader(ex.getInputStream(),encoding);
352	}
353	}
354
355	/**
356	* Returns the charset encoding of the XmlReader.
357	* <p>
358	* @return charset encoding.
359	*
360	*/
361	public String getEncoding() {
362	return _encoding;
363	}
364
365	public int read(char[] buf,int offset,int len) throws IOException {
366	return _reader.read(buf,offset,len);
367	}
368
369	/**
370	* Closes the XmlReader stream.
371	* <p>
372	* @throws IOException thrown if there was a problem closing the stream.
373	*
374	*/
375	public void close() throws IOException {
376	_reader.close();
377	}
378
379	private void doRawStream(InputStream is,boolean lenient) throws IOException {
380	BufferedInputStream pis = new BufferedInputStream(is, BUFFER_SIZE);
381	String bomEnc = getBOMEncoding(pis);
382	String xmlGuessEnc = getXMLGuessEncoding(pis);
383	String xmlEnc = getXmlProlog(pis,xmlGuessEnc);
384	String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
385	prepareReader(pis,encoding);
386	}
387
388	private void doHttpStream(InputStream is,String httpContentType,boolean lenient) throws IOException {
389	BufferedInputStream pis = new BufferedInputStream(is, BUFFER_SIZE);
390	String cTMime = getContentTypeMime(httpContentType);
391	String cTEnc = getContentTypeEncoding(httpContentType);
392	String bomEnc = getBOMEncoding(pis);
393	String xmlGuessEnc = getXMLGuessEncoding(pis);
394	String xmlEnc = getXmlProlog(pis,xmlGuessEnc);
395	String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis,lenient);
396	prepareReader(pis,encoding);
397	}
398
399	private void prepareReader(InputStream is,String encoding) throws IOException {
400	_reader = new InputStreamReader(is,encoding);
401	_encoding = encoding;
402	}
403
404	// InputStream is passed for XmlReaderException creation only
405	private String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is) throws IOException {
406	String encoding;
407	if (bomEnc==null) {
408	if (xmlGuessEnc==null \|\| xmlEnc==null) {
409	encoding = (_defaultEncoding == null) ? UTF_8 : _defaultEncoding;
410	}
411	else
412	if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) \|\| xmlGuessEnc.equals(UTF_16LE))) {
413	encoding = xmlGuessEnc;
414	}
415	else {
416	encoding = xmlEnc;
417	}
418	}
419	else
420	if (bomEnc.equals(UTF_8)) {
421	if (xmlGuessEnc!=null && !xmlGuessEnc.equals(UTF_8)) {
422	throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
423	bomEnc,xmlGuessEnc,xmlEnc,is);
424	}
425	if (xmlEnc!=null && !xmlEnc.equals(UTF_8)) {
426	throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
427	bomEnc,xmlGuessEnc,xmlEnc,is);
428	}
429	encoding = UTF_8;
430	}
431	else
432	if (bomEnc.equals(UTF_16BE) \|\| bomEnc.equals(UTF_16LE)) {
433	if (xmlGuessEnc!=null && !xmlGuessEnc.equals(bomEnc)) {
434	throw new IOException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}));
435	}
436	if (xmlEnc!=null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
437	throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
438	bomEnc,xmlGuessEnc,xmlEnc,is);
439	}
440	encoding =bomEnc;
441	}
442	else {
443	throw new XmlReaderException(RAW_EX_2.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
444	bomEnc,xmlGuessEnc,xmlEnc,is);
445	}
446	return encoding;
447	}
448
449	// InputStream is passed for XmlReaderException creation only
450	private String calculateHttpEncoding(String cTMime, String cTEnc, String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is,boolean lenient) throws IOException {
451	String encoding;
452	if (lenient & xmlEnc!=null) {
453	encoding = xmlEnc;
454	}
455	else {
456	boolean appXml = isAppXml(cTMime);
457	boolean textXml = isTextXml(cTMime);
458	if (appXml \|\| textXml) {
459	if (cTEnc==null) {
460	if (appXml) {
461	encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is);
462	}
463	else {
464	encoding = (_defaultEncoding == null) ? US_ASCII : _defaultEncoding;
465	}
466	}
467	else
468	if (bomEnc!=null && (cTEnc.equals(UTF_16BE) \|\| cTEnc.equals(UTF_16LE))) {
469	throw new XmlReaderException(HTTP_EX_1.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
470	cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
471	}
472	else
473	if (cTEnc.equals(UTF_16)) {
474	if (bomEnc!=null && bomEnc.startsWith(UTF_16)) {
475	encoding = bomEnc;
476	}
477	else {
478	throw new XmlReaderException(HTTP_EX_2.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
479	cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
480	}
481	}
482	else {
483	encoding = cTEnc;
484	}
485	}
486	else {
487	throw new XmlReaderException(HTTP_EX_3.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
488	cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
489	}
490	}
491	return encoding;
492	}
493
494	// returns MIME type or NULL if httpContentType is NULL
495	private static String getContentTypeMime(String httpContentType) {
496	String mime = null;
497	if (httpContentType!=null) {
498	int i = httpContentType.indexOf(";");
499	mime = ((i==-1) ? httpContentType : httpContentType.substring(0,i)).trim();
500	}
501	return mime;
502	}
503
504	private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");
505
506	// returns charset parameter value, NULL if not present, NULL if httpContentType is NULL
507	private static String getContentTypeEncoding(String httpContentType) {
508	String encoding = null;
509	if (httpContentType!=null) {
510	int i = httpContentType.indexOf(";");
511	if (i>-1) {
512	String postMime = httpContentType.substring(i+1);
513	Matcher m = CHARSET_PATTERN.matcher(postMime);
514	encoding = (m.find()) ? m.group(1) : null;
515	encoding = (encoding!=null) ? encoding.toUpperCase() : null;
516	}
517	if (encoding != null &&
518	((encoding.startsWith("\"") && encoding.endsWith("\"")) \|\|
519	(encoding.startsWith("'") && encoding.endsWith("'"))
520	)) {
521	encoding = encoding.substring(1, encoding.length() - 1);
522	}
523	}
524	return encoding;
525	}
526
527	// returns the BOM in the stream, NULL if not present,
528	// if there was BOM the in the stream it is consumed
529	private static String getBOMEncoding(BufferedInputStream is) throws IOException {
530	String encoding = null;
531	int[] bytes = new int[3];
532	is.mark(3);
533	bytes[0] = is.read();
534	bytes[1] = is.read();
535	bytes[2] = is.read();
536
537	if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
538	encoding = UTF_16BE;
539	is.reset();
540	is.read();
541	is.read();
542	}
543	else
544	if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
545	encoding = UTF_16LE;
546	is.reset();
547	is.read();
548	is.read();
549	}
550	else
551	if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
552	encoding = UTF_8;
553	}
554	else {
555	is.reset();
556	}
557	return encoding;
558	}
559
560	// returns the best guess for the encoding by looking the first bytes of the stream, '<?'
561	private static String getXMLGuessEncoding(BufferedInputStream is) throws IOException {
562	String encoding = null;
563	int[] bytes = new int[4];
564	is.mark(4);
565	bytes[0] = is.read();
566	bytes[1] = is.read();
567	bytes[2] = is.read();
568	bytes[3] = is.read();
569	is.reset();
570
571	if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F) {
572	encoding = UTF_16BE;
573	}
574	else
575	if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00) {
576	encoding = UTF_16LE;
577	}
578	else
579	if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
580	encoding = UTF_8;
581	}
582	return encoding;
583	}
584
585
586	private static final Pattern ENCODING_PATTERN =
587	Pattern.compile("<\\?xml.encoding[\\s]=[\\s]((?:\".[^\"]\")\|(?:'.[^']*'))", Pattern.MULTILINE);
588
589	// returns the encoding declared in the <?xml encoding=...?>, NULL if none
590	private static String getXmlProlog(BufferedInputStream is,String guessedEnc) throws IOException {
591	String encoding = null;
592	if (guessedEnc!=null) {
593	byte[] bytes = new byte[BUFFER_SIZE];
594	is.mark(BUFFER_SIZE);
595	int offset = 0;
596	int max = BUFFER_SIZE;
597	int c = is.read(bytes,offset,max);
598	int firstGT = -1;
599	while (c!=-1 && firstGT==-1 && offset< BUFFER_SIZE) {
600	offset += c;
601	max -= c;
602	c = is.read(bytes,offset,max);
603	firstGT = new String(bytes, 0, offset).indexOf(">");
604	}
605	if (firstGT == -1) {
606	if (c == -1) {
607	throw new IOException("Unexpected end of XML stream");
608	}
609	else {
610	throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
611	}
612	}
613	int bytesRead = offset;
614	if (bytesRead>0) {
615	is.reset();
616	Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes,0,firstGT + 1), guessedEnc);
617	BufferedReader bReader = new BufferedReader(reader);
618	StringBuffer prolog = new StringBuffer();
619	String line = bReader.readLine();
620	while (line != null) {
621	prolog.append(line);
622	line = bReader.readLine();
623	}
624	Matcher m = ENCODING_PATTERN.matcher(prolog);
625	if (m.find()) {
626	encoding = m.group(1).toUpperCase();
627	encoding = encoding.substring(1,encoding.length()-1);
628	}
629	}
630	}
631	return encoding;
632	}
633
634	// indicates if the MIME type belongs to the APPLICATION XML family
635	private static boolean isAppXml(String mime) {
636	return mime!=null &&
637	(mime.equals("application/xml") \|\|
638	mime.equals("application/xml-dtd") \|\|
639	mime.equals("application/xml-external-parsed-entity") \|\|
640	(mime.startsWith("application/") && mime.endsWith("+xml")));
641	}
642
643	// indicates if the MIME type belongs to the TEXT XML family
644	private static boolean isTextXml(String mime) {
645	return mime!=null &&
646	(mime.equals("text/xml") \|\|
647	mime.equals("text/xml-external-parsed-entity") \|\|
648	(mime.startsWith("text/") && mime.endsWith("+xml")));
649	}
650
651	private static final MessageFormat RAW_EX_1 = new MessageFormat(
652	"Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
653
654	private static final MessageFormat RAW_EX_2 = new MessageFormat(
655	"Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
656
657	private static final MessageFormat HTTP_EX_1 = new MessageFormat(
658	"Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");
659
660	private static final MessageFormat HTTP_EX_2 = new MessageFormat(
661	"Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
662
663	private static final MessageFormat HTTP_EX_3 = new MessageFormat(
664	"Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");
665
666	}

[all classes][com.sun.syndication.io]

EMMA 2.1.5320 (stable) (C) Vladimir Roubtsov