Skip to content

Fix parsing an UTF-8 file without BOM and ISO-8859-1 encoding (#242) #243

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions src/main/java/org/codehaus/plexus/util/xml/XmlReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -504,11 +504,8 @@ else if ( bomEnc.equals( UTF_8 ) )
}
else if ( bomEnc.equals( UTF_16BE ) || bomEnc.equals( UTF_16LE ) )
{
if ( xmlGuessEnc != null && !xmlGuessEnc.equals( bomEnc ) )
{
throw new IOException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ) );
}
if ( xmlEnc != null && !xmlEnc.equals( UTF_16 ) && !xmlEnc.equals( bomEnc ) )
if ( xmlGuessEnc != null && !xmlGuessEnc.equals( bomEnc )
|| xmlEnc != null && !xmlEnc.equals( UTF_16 ) && !xmlEnc.equals( bomEnc ) )
{
throw new XmlStreamReaderException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ),
bomEnc, xmlGuessEnc, xmlEnc, is );
Expand Down
44 changes: 16 additions & 28 deletions src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@
import java.io.Reader;
import java.io.UnsupportedEncodingException;

import org.codehaus.plexus.util.ReaderFactory;
import org.codehaus.plexus.util.xml.XmlReader;
import org.codehaus.plexus.util.xml.XmlStreamReader;
import org.codehaus.plexus.util.xml.XmlStreamReaderException;

//import java.util.Hashtable;

Expand Down Expand Up @@ -663,20 +664,6 @@ public void setInput( Reader in )
{
reset();
reader = in;

if ( reader instanceof XmlReader ) {
// encoding already detected
XmlReader xsr = (XmlReader) reader;
fileEncoding = xsr.getEncoding();
}
else if ( reader instanceof InputStreamReader )
{
InputStreamReader isr = (InputStreamReader) reader;
if ( isr.getEncoding() != null )
{
fileEncoding = isr.getEncoding().toUpperCase();
}
}
}

@Override
Expand All @@ -692,18 +679,30 @@ public void setInput( java.io.InputStream inputStream, String inputEncoding )
{
if ( inputEncoding != null )
{
reader = ReaderFactory.newReader( inputStream, inputEncoding );
reader = new InputStreamReader( inputStream, inputEncoding );
}
else
{
reader = ReaderFactory.newXmlReader( inputStream );
reader = new XmlStreamReader( inputStream, false );
}
}
catch ( UnsupportedEncodingException une )
{
throw new XmlPullParserException( "could not create reader for encoding " + inputEncoding + " : " + une,
this, une );
}
catch ( XmlStreamReaderException e )
{
if ( "UTF-8".equals( e.getBomEncoding() ) )
{
throw new XmlPullParserException( "UTF-8 BOM plus xml decl of " + e.getXmlEncoding() + " is incompatible", this, e );
}
if ( e.getBomEncoding() != null && e.getBomEncoding().startsWith( "UTF-16" ) )
{
throw new XmlPullParserException( "UTF-16 BOM in a " + e.getXmlEncoding() + " encoded file is incompatible", this, e );
}
throw new XmlPullParserException( "could not create reader : " + e, this, e );
}
catch ( IOException e )
{
throw new XmlPullParserException( "could not create reader : " + e, this, e );
Expand Down Expand Up @@ -3434,17 +3433,6 @@ private void parseXmlDeclWithVersion( int versionStart, int versionEnd )
// TODO reconcile with setInput encodingName
inputEncoding = newString( buf, encodingStart, encodingEnd - encodingStart );

if ( "UTF8".equals( fileEncoding ) && inputEncoding.toUpperCase().startsWith( "ISO-" ) )
{
throw new XmlPullParserException( "UTF-8 BOM plus xml decl of " + inputEncoding + " is incompatible",
this, null );
}
else if ("UTF-16".equals( fileEncoding ) && inputEncoding.equalsIgnoreCase( "UTF-8" ))
{
throw new XmlPullParserException( "UTF-16 BOM plus xml decl of " + inputEncoding + " is incompatible",
this, null );
}

lastParsedAttr = "encoding";

ch = more();
Expand Down
100 changes: 91 additions & 9 deletions src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

Expand Down Expand Up @@ -968,7 +969,7 @@ public void testXMLDeclVersionEncodingStandaloneNoSpace()
* @since 3.4.1
*/
@Test
public void testEncodingISO_8859_1setInputReader()
public void testEncodingISO_8859_1_newXmlReader()
throws IOException
{
try ( Reader reader =
Expand All @@ -994,7 +995,7 @@ public void testEncodingISO_8859_1setInputReader()
* @since 3.4.1
*/
@Test
public void testEncodingISO_8859_1_setInputStream()
public void testEncodingISO_8859_1_InputStream()
throws IOException
{
try ( InputStream input =
Expand All @@ -1012,12 +1013,6 @@ public void testEncodingISO_8859_1_setInputStream()
}
}

private static void assertPosition( int row, int col, MXParser parser )
{
assertEquals( "Current line", row, parser.getLineNumber() );
assertEquals( "Current column", col, parser.getColumnNumber() );
}

/**
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
*
Expand All @@ -1028,7 +1023,7 @@ private static void assertPosition( int row, int col, MXParser parser )
* @since 3.4.2
*/
@Test
public void testEncodingISO_8859_1setStringReader()
public void testEncodingISO_8859_1_StringReader()
throws IOException
{
try ( Reader reader =
Expand All @@ -1047,6 +1042,93 @@ public void testEncodingISO_8859_1setStringReader()
}
}

/**
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
*
* Another case of bug #163: Reader generated with ReaderFactory.newReader and the right file encoding.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The BOM is ISO-8859-1 but the file is decoded with UTF-8

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The file is encoded with UTF-8 without BOM, and XML entities are encoded in ISO-8859-1 (this is a valid combination). This is the test that reproduces the bug, which throws an exception as if the file encoding were UTF-8 with BOM (the invalid combination).
So as you said before, even if Maven is not affected by this, the bug in the parser is real.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand this is considered a valid combination by the xml spec. But this will produce incorrect results, as the input stream is most probably decoded using UTF-8 instead of ISO-8859-1.
I'm fine with that if that's what is desired. However, I think we can achieve both and I'll work on a fix.

*
* @throws IOException if IO error.
*
* @since 3.5.2
*/
@Test
public void testEncodingISO_8859_1_newReader()
throws IOException
{
try ( Reader reader =
ReaderFactory.newReader( new File( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test is using a simple Reader, so the BOM is completely ignored by the reader, meaning the file will be read with UTF-8 while the BOM indicates it's encoded in ISO-8859-1.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reproduces the same problem, showing that the Reader created by ReaderFactory.newReader() is affected too.

StandardCharsets.UTF_8.name() ) )
{
MXParser parser = new MXParser();
parser.setInput( reader );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
assertTrue( true );
}
catch ( XmlPullParserException e )
{
fail( "should not raise exception: " + e );
}
}

/**
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
*
* Another case of bug #163: InputStream supplied with the right file encoding.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case, the encoding is not the correct one.

*
* @throws IOException if IO error.
*
* @since 3.5.2
*/
@Test
public void testEncodingISO_8859_1_InputStream_encoded() throws IOException {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So in this test, the BOM says ISO-8859-1 but the parser is forced to use UTF-8. Isn't that wrong ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same but using an InputStream encoded in UTF-8, instead of a Reader.

try ( InputStream input =
Files.newInputStream( Paths.get( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ) ) )
{
MXParser parser = new MXParser();
parser.setInput( input, StandardCharsets.UTF_8.name() );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
assertTrue( true );
}
catch ( XmlPullParserException e )
{
fail( "should not raise exception: " + e );
}
}

/**
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
*
* @throws IOException if IO error.
*
* @since 3.4.1
*/
@Test
public void testEncodingUTF8_newXmlReader()
throws IOException
{
try ( Reader reader =
ReaderFactory.newXmlReader( new File( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ) ) )
{
MXParser parser = new MXParser();
parser.setInput( reader );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
assertTrue( true );
}
catch ( XmlPullParserException e )
{
fail( "should not raise exception: " + e );
}
}

private static void assertPosition( int row, int col, MXParser parser )
{
assertEquals( "Current line", row, parser.getLineNumber() );
assertEquals( "Current column", col, parser.getColumnNumber() );
}

/**
* <p>
* Test custom Entity not found.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.StandardCharsets;

import org.codehaus.plexus.util.ReaderFactory;
import org.junit.Before;
import org.junit.Test;

Expand Down Expand Up @@ -212,17 +213,16 @@ public void testhst_bh_006()
public void testhst_lhs_007()
throws IOException
{
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "007.xml" ) );
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_8 ) )
try ( InputStream is = new FileInputStream( new File( testResourcesDir, "007.xml" ) ) )
{
parser.setInput( reader );
parser.setInput( is, null );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
fail( "UTF-8 BOM plus xml decl of iso-8859-1 incompatible" );
fail( "UTF-8 BOM plus xml decl of ISO-8859-1 incompatible" );
}
catch ( XmlPullParserException e )
{
assertTrue( e.getMessage().contains( "UTF-8 BOM plus xml decl of iso-8859-1 is incompatible" ) );
assertTrue( e.getMessage().contains( "UTF-8 BOM plus xml decl of ISO-8859-1 is incompatible" ) );
}
}

Expand All @@ -239,17 +239,16 @@ public void testhst_lhs_007()
public void testhst_lhs_008()
throws IOException
{
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "008.xml" ) );
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_16 ) )
try ( InputStream is = new FileInputStream( new File( testResourcesDir, "008.xml" ) ) )
{
parser.setInput( reader );
parser.setInput( is, null );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible" );
fail( "UTF-16 BOM plus xml decl of UTF-8 (using UTF-16 coding) incompatible" );
}
catch ( XmlPullParserException e )
{
assertTrue( e.getMessage().contains( "UTF-16 BOM plus xml decl of utf-8 is incompatible" ) );
assertTrue( e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
}
}

Expand All @@ -261,22 +260,24 @@ public void testhst_lhs_008()
* Version:
*
* @throws java.io.IOException if there is an I/O error
*
* NOTE: This test is SKIPPED as MXParser is unable to detect UTF-16 BOM detection when chars are read as
* UTF-8.
*/
@Test
public void testhst_lhs_009()
throws IOException
{
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "009.xml" ) );
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_8 ) )
{
parser.setInput( reader );
try ( InputStream is = new FileInputStream( new File( testResourcesDir, "009.xml" ) ) )
{
parser.setInput( is, null );
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
;
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible" );
fail( "UTF-16 BOM plus xml decl of UTF-8 (using UTF-8 coding) incompatible" );
}
catch ( XmlPullParserException e )
{
assertTrue( e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
assertTrue( e.getMessage(), e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
}
}

Expand Down
Loading