Skip to content

Commit cce8bf5

Browse files
committed
Fix parsing a UTF-8 file without BOM and ISO-8859-1 encoding (#242)
* Deleted most code handling encoding (leaving that job to the XmlReader) * Fixed tests exercising encoding checks. Unsupported tests were skipped * Simplified test-encoding-ISO-8859-1.xml test file
1 parent 6714fe0 commit cce8bf5

File tree

4 files changed

+163
-1553
lines changed

4 files changed

+163
-1553
lines changed

src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java

+1-29
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,10 @@
1111

1212
import java.io.EOFException;
1313
import java.io.IOException;
14-
import java.io.InputStreamReader;
1514
import java.io.Reader;
1615
import java.io.UnsupportedEncodingException;
1716

1817
import org.codehaus.plexus.util.ReaderFactory;
19-
import org.codehaus.plexus.util.xml.XmlReader;
2018

2119
//import java.util.Hashtable;
2220

@@ -124,7 +122,6 @@ private String newStringIntern( char[] cbuf, int off, int len )
124122
// private String elValue[];
125123
private int elNamespaceCount[];
126124

127-
private String fileEncoding = null;
128125

129126
/**
130127
* Make sure that we have enough space to keep element stack if passed size. It will always create one additional
@@ -663,20 +660,6 @@ public void setInput( Reader in )
663660
{
664661
reset();
665662
reader = in;
666-
667-
if ( reader instanceof XmlReader ) {
668-
// encoding already detected
669-
XmlReader xsr = (XmlReader) reader;
670-
fileEncoding = xsr.getEncoding();
671-
}
672-
else if ( reader instanceof InputStreamReader )
673-
{
674-
InputStreamReader isr = (InputStreamReader) reader;
675-
if ( isr.getEncoding() != null )
676-
{
677-
fileEncoding = isr.getEncoding().toUpperCase();
678-
}
679-
}
680663
}
681664

682665
@Override
@@ -3432,18 +3415,7 @@ private void parseXmlDeclWithVersion( int versionStart, int versionEnd )
34323415
final int encodingEnd = pos - 1;
34333416

34343417
// TODO reconcile with setInput encodingName
3435-
inputEncoding = newString( buf, encodingStart, encodingEnd - encodingStart );
3436-
3437-
if ( "UTF8".equals( fileEncoding ) && inputEncoding.toUpperCase().startsWith( "ISO-" ) )
3438-
{
3439-
throw new XmlPullParserException( "UTF-8 BOM plus xml decl of " + inputEncoding + " is incompatible",
3440-
this, null );
3441-
}
3442-
else if ("UTF-16".equals( fileEncoding ) && inputEncoding.equalsIgnoreCase( "UTF-8" ))
3443-
{
3444-
throw new XmlPullParserException( "UTF-16 BOM plus xml decl of " + inputEncoding + " is incompatible",
3445-
this, null );
3446-
}
3418+
// inputEncoding = newString( buf, encodingStart, encodingEnd - encodingStart );
34473419

34483420
lastParsedAttr = "encoding";
34493421

src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java

+91-9
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import java.io.InputStream;
2828
import java.io.Reader;
2929
import java.io.StringReader;
30+
import java.nio.charset.StandardCharsets;
3031
import java.nio.file.Files;
3132
import java.nio.file.Paths;
3233

@@ -968,7 +969,7 @@ public void testXMLDeclVersionEncodingStandaloneNoSpace()
968969
* @since 3.4.1
969970
*/
970971
@Test
971-
public void testEncodingISO_8859_1setInputReader()
972+
public void testEncodingISO_8859_1_newXmlReader()
972973
throws IOException
973974
{
974975
try ( Reader reader =
@@ -994,7 +995,7 @@ public void testEncodingISO_8859_1setInputReader()
994995
* @since 3.4.1
995996
*/
996997
@Test
997-
public void testEncodingISO_8859_1_setInputStream()
998+
public void testEncodingISO_8859_1_InputStream()
998999
throws IOException
9991000
{
10001001
try ( InputStream input =
@@ -1012,12 +1013,6 @@ public void testEncodingISO_8859_1_setInputStream()
10121013
}
10131014
}
10141015

1015-
private static void assertPosition( int row, int col, MXParser parser )
1016-
{
1017-
assertEquals( "Current line", row, parser.getLineNumber() );
1018-
assertEquals( "Current column", col, parser.getColumnNumber() );
1019-
}
1020-
10211016
/**
10221017
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
10231018
*
@@ -1028,7 +1023,7 @@ private static void assertPosition( int row, int col, MXParser parser )
10281023
* @since 3.4.2
10291024
*/
10301025
@Test
1031-
public void testEncodingISO_8859_1setStringReader()
1026+
public void testEncodingISO_8859_1_StringReader()
10321027
throws IOException
10331028
{
10341029
try ( Reader reader =
@@ -1047,6 +1042,93 @@ public void testEncodingISO_8859_1setStringReader()
10471042
}
10481043
}
10491044

1045+
/**
1046+
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
1047+
*
1048+
* Another case of bug #163: Reader generated with ReaderFactory.newReader and the right file encoding.
1049+
*
1050+
* @throws IOException if IO error.
1051+
*
1052+
* @since 3.5.2
1053+
*/
1054+
@Test
1055+
public void testEncodingISO_8859_1_newReader()
1056+
throws IOException
1057+
{
1058+
try ( Reader reader =
1059+
ReaderFactory.newReader( new File( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ),
1060+
StandardCharsets.UTF_8.name() ) )
1061+
{
1062+
MXParser parser = new MXParser();
1063+
parser.setInput( reader );
1064+
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
1065+
;
1066+
assertTrue( true );
1067+
}
1068+
catch ( XmlPullParserException e )
1069+
{
1070+
fail( "should not raise exception: " + e );
1071+
}
1072+
}
1073+
1074+
/**
1075+
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
1076+
*
1077+
* Another case of bug #163: InputStream supplied with the right file encoding.
1078+
*
1079+
* @throws IOException if IO error.
1080+
*
1081+
* @since 3.5.2
1082+
*/
1083+
@Test
1084+
public void testEncodingISO_8859_1_InputStream_encoded() throws IOException {
1085+
try ( InputStream input =
1086+
Files.newInputStream( Paths.get( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ) ) )
1087+
{
1088+
MXParser parser = new MXParser();
1089+
parser.setInput( input, StandardCharsets.UTF_8.name() );
1090+
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
1091+
;
1092+
assertTrue( true );
1093+
}
1094+
catch ( XmlPullParserException e )
1095+
{
1096+
fail( "should not raise exception: " + e );
1097+
}
1098+
}
1099+
1100+
/**
1101+
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
1102+
*
1103+
* @throws IOException if IO error.
1104+
*
1105+
* @since 3.4.1
1106+
*/
1107+
@Test
1108+
public void testEncodingUTF8_newXmlReader()
1109+
throws IOException
1110+
{
1111+
try ( Reader reader =
1112+
ReaderFactory.newXmlReader( new File( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ) ) )
1113+
{
1114+
MXParser parser = new MXParser();
1115+
parser.setInput( reader );
1116+
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
1117+
;
1118+
assertTrue( true );
1119+
}
1120+
catch ( XmlPullParserException e )
1121+
{
1122+
fail( "should not raise exception: " + e );
1123+
}
1124+
}
1125+
1126+
private static void assertPosition( int row, int col, MXParser parser )
1127+
{
1128+
assertEquals( "Current line", row, parser.getLineNumber() );
1129+
assertEquals( "Current column", col, parser.getColumnNumber() );
1130+
}
1131+
10501132
/**
10511133
* <p>
10521134
* Test custom Entity not found.

src/test/java/org/codehaus/plexus/util/xml/pull/eduni_misc_Test_BjoernHoehrmannviaHST2013_09_18_Test.java

+70-13
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,11 @@
44
import static org.junit.Assert.fail;
55

66
import java.io.File;
7-
import java.io.FileInputStream;
87
import java.io.FileReader;
98
import java.io.IOException;
10-
import java.io.InputStreamReader;
119
import java.io.Reader;
12-
import java.nio.charset.StandardCharsets;
1310

11+
import org.codehaus.plexus.util.ReaderFactory;
1412
import org.junit.Before;
1513
import org.junit.Test;
1614

@@ -207,13 +205,15 @@ public void testhst_bh_006()
207205
* Version:
208206
*
209207
* @throws java.io.IOException if there is an I/O error
208+
*
209+
* NOTE: This test is SKIPPED as the MXParser object alone is unable to detect whether a UTF-8 file
210+
* has a BOM or not
210211
*/
211-
@Test
212+
// @Test
212213
public void testhst_lhs_007()
213214
throws IOException
214215
{
215-
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "007.xml" ) );
216-
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_8 ) )
216+
try ( Reader reader = ReaderFactory.newXmlReader( new File( testResourcesDir, "007.xml" ) ) )
217217
{
218218
parser.setInput( reader );
219219
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
@@ -236,11 +236,39 @@ public void testhst_lhs_007()
236236
* @throws java.io.IOException if there is an I/O error
237237
*/
238238
@Test
239-
public void testhst_lhs_008()
239+
public void testhst_lhs_008_FileReader()
240+
throws IOException
241+
{
242+
try ( Reader reader = new FileReader( new File( testResourcesDir, "008.xml" ) ) )
243+
{
244+
parser.setInput( reader );
245+
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
246+
;
247+
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible" );
248+
}
249+
catch ( XmlPullParserException e )
250+
{
251+
assertTrue( e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
252+
}
253+
}
254+
255+
/**
256+
* Test ID: <pre>hst-lhs-008</pre>
257+
* Test URI: <pre>008.xml</pre>
258+
* Comment: <pre>UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible</pre>
259+
* Sections: <pre>4.3.3</pre>
260+
* Version:
261+
*
262+
* @throws java.io.IOException if there is an I/O error
263+
*
264+
* NOTE: This test is SKIPPED as MXParser is unable to detect a UTF-16 BOM when chars are read as
265+
* UTF-8.
266+
*/
267+
// @Test
268+
public void testhst_lhs_008_XmlReader()
240269
throws IOException
241270
{
242-
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "008.xml" ) );
243-
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_16 ) )
271+
try ( Reader reader = ReaderFactory.newXmlReader( new File( testResourcesDir, "008.xml" ) ) )
244272
{
245273
parser.setInput( reader );
246274
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
@@ -263,12 +291,11 @@ public void testhst_lhs_008()
263291
* @throws java.io.IOException if there is an I/O error
264292
*/
265293
@Test
266-
public void testhst_lhs_009()
294+
public void testhst_lhs_009_FileReader()
267295
throws IOException
268296
{
269-
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "009.xml" ) );
270-
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_8 ) )
271-
{
297+
try ( Reader reader = new FileReader( new File( testResourcesDir, "009.xml" ) ) )
298+
{
272299
parser.setInput( reader );
273300
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
274301
;
@@ -280,4 +307,34 @@ public void testhst_lhs_009()
280307
}
281308
}
282309

310+
/**
311+
* Test ID: <pre>hst-lhs-009</pre>
312+
* Test URI: <pre>009.xml</pre>
313+
* Comment: <pre>UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible</pre>
314+
* Sections: <pre>4.3.3</pre>
315+
* Version:
316+
*
317+
* @throws java.io.IOException if there is an I/O error
318+
*/
319+
@Test
320+
public void testhst_lhs_009_XmlReader()
321+
throws IOException
322+
{
323+
try ( Reader reader = ReaderFactory.newXmlReader( new File( testResourcesDir, "009.xml" ) ) )
324+
{
325+
parser.setInput( reader );
326+
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
327+
;
328+
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible" );
329+
}
330+
catch ( IOException e )
331+
{
332+
assertTrue( e.getMessage().contains( "Invalid encoding, BOM [UTF-16BE] XML guess [UTF-8] XML prolog [UTF-8] encoding mismatch" ) );
333+
}
334+
catch ( XmlPullParserException e )
335+
{
336+
fail( "Encoding problem should be detected by the XmlReader" );
337+
}
338+
}
339+
283340
}

0 commit comments

Comments
 (0)