-
Notifications
You must be signed in to change notification settings - Fork 41
Fix parsing an UTF-8 file without BOM and ISO-8859-1 encoding (#242) #243
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,7 @@ | |
import java.io.InputStream; | ||
import java.io.Reader; | ||
import java.io.StringReader; | ||
import java.nio.charset.StandardCharsets; | ||
import java.nio.file.Files; | ||
import java.nio.file.Paths; | ||
|
||
|
@@ -968,7 +969,7 @@ public void testXMLDeclVersionEncodingStandaloneNoSpace() | |
* @since 3.4.1 | ||
*/ | ||
@Test | ||
public void testEncodingISO_8859_1setInputReader() | ||
public void testEncodingISO_8859_1_newXmlReader() | ||
throws IOException | ||
{ | ||
try ( Reader reader = | ||
|
@@ -994,7 +995,7 @@ public void testEncodingISO_8859_1setInputReader() | |
* @since 3.4.1 | ||
*/ | ||
@Test | ||
public void testEncodingISO_8859_1_setInputStream() | ||
public void testEncodingISO_8859_1_InputStream() | ||
throws IOException | ||
{ | ||
try ( InputStream input = | ||
|
@@ -1012,12 +1013,6 @@ public void testEncodingISO_8859_1_setInputStream() | |
} | ||
} | ||
|
||
private static void assertPosition( int row, int col, MXParser parser ) | ||
{ | ||
assertEquals( "Current line", row, parser.getLineNumber() ); | ||
assertEquals( "Current column", col, parser.getColumnNumber() ); | ||
} | ||
|
||
/** | ||
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163 | ||
* | ||
|
@@ -1028,7 +1023,7 @@ private static void assertPosition( int row, int col, MXParser parser ) | |
* @since 3.4.2 | ||
*/ | ||
@Test | ||
public void testEncodingISO_8859_1setStringReader() | ||
public void testEncodingISO_8859_1_StringReader() | ||
throws IOException | ||
{ | ||
try ( Reader reader = | ||
|
@@ -1047,6 +1042,93 @@ public void testEncodingISO_8859_1setStringReader() | |
} | ||
} | ||
|
||
/** | ||
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163 | ||
* | ||
* Another case of bug #163: Reader generated with ReaderFactory.newReader and the right file encoding. | ||
* | ||
* @throws IOException if IO error. | ||
* | ||
* @since 3.5.2 | ||
*/ | ||
@Test | ||
public void testEncodingISO_8859_1_newReader() | ||
throws IOException | ||
{ | ||
try ( Reader reader = | ||
ReaderFactory.newReader( new File( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The test is using a simple There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Reproduces the same problem, showing that the Reader created by ReaderFactory.newReader() is affected too. |
||
StandardCharsets.UTF_8.name() ) ) | ||
{ | ||
MXParser parser = new MXParser(); | ||
parser.setInput( reader ); | ||
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT ) | ||
; | ||
assertTrue( true ); | ||
} | ||
catch ( XmlPullParserException e ) | ||
{ | ||
fail( "should not raise exception: " + e ); | ||
} | ||
} | ||
|
||
/** | ||
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163 | ||
* | ||
* Another case of bug #163: InputStream supplied with the right file encoding. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this case, the encoding is not the correct one. |
||
* | ||
* @throws IOException if IO error. | ||
* | ||
* @since 3.5.2 | ||
*/ | ||
@Test | ||
public void testEncodingISO_8859_1_InputStream_encoded() throws IOException { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So in this test, the BOM says There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same but using an InputStream encoded in UTF-8, instead of a Reader. |
||
try ( InputStream input = | ||
Files.newInputStream( Paths.get( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ) ) ) | ||
{ | ||
MXParser parser = new MXParser(); | ||
parser.setInput( input, StandardCharsets.UTF_8.name() ); | ||
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT ) | ||
; | ||
assertTrue( true ); | ||
} | ||
catch ( XmlPullParserException e ) | ||
{ | ||
fail( "should not raise exception: " + e ); | ||
} | ||
} | ||
|
||
/** | ||
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163 | ||
* | ||
* @throws IOException if IO error. | ||
* | ||
* @since 3.4.1 | ||
*/ | ||
@Test | ||
public void testEncodingUTF8_newXmlReader() | ||
throws IOException | ||
{ | ||
try ( Reader reader = | ||
ReaderFactory.newXmlReader( new File( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ) ) ) | ||
{ | ||
MXParser parser = new MXParser(); | ||
parser.setInput( reader ); | ||
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT ) | ||
; | ||
assertTrue( true ); | ||
} | ||
catch ( XmlPullParserException e ) | ||
{ | ||
fail( "should not raise exception: " + e ); | ||
} | ||
} | ||
|
||
private static void assertPosition( int row, int col, MXParser parser ) | ||
{ | ||
assertEquals( "Current line", row, parser.getLineNumber() ); | ||
assertEquals( "Current column", col, parser.getColumnNumber() ); | ||
} | ||
|
||
/** | ||
* <p> | ||
* Test custom Entity not found. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The BOM is
ISO-8859-1
but the file is decoded withUTF-8
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The file is encoded with UTF-8 without BOM, and XML entities are encoded in ISO-8859-1 (this is a valid combination). This is the test that reproduces the bug, which throws an exception as if the file encoding were UTF-8 with BOM (the invalid combination).
So as you said before, even if Maven is not affected by this, the bug in the parser is real.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I understand this is considered a valid combination by the xml spec. But this will produce incorrect results, as the input stream is most probably decoded using UTF-8 instead of ISO-8859-1.
I'm fine with that if that's what is desired. However, I think we can achieve both and I'll work on a fix.