Skip to content

Commit b99f7c0

Browse files
belingueres authored and michael-o committed
Fixed regressions:
* #163 - new case: Don't assume UTF8 as default, to allow parsing from String.
* #194 - Incorrect getText() after parsing the DOCDECL section.
* Added tests exercising other regressions exposed while fixing these issues.
1 parent 3896620 commit b99f7c0

File tree

6 files changed

+637
-45
lines changed

6 files changed

+637
-45
lines changed

src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java

+112-45
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ private String newStringIntern( char[] cbuf, int off, int len )
124124
// private String elValue[];
125125
private int elNamespaceCount[];
126126

127-
private String fileEncoding = "UTF8";
127+
private String fileEncoding = null;
128128

129129
/**
130130
* Make sure that we have enough space to keep element stack if passed size. It will always create one additional
@@ -587,8 +587,8 @@ else if ( FEATURE_XML_ROUNDTRIP.equals( name ) )
587587
}
588588
}
589589

590-
/**
591-
* Unknown properties are <strong>always</strong> returned as false
590+
/**
591+
* Unknown properties are <strong>always</strong> returned as false
592592
*/
593593
@Override
594594
public boolean getFeature( String name )
@@ -1596,11 +1596,11 @@ else if ( ch == '&' )
15961596
}
15971597
final int oldStart = posStart + bufAbsoluteStart;
15981598
final int oldEnd = posEnd + bufAbsoluteStart;
1599-
final char[] resolvedEntity = parseEntityRef();
1599+
parseEntityRef();
16001600
if ( tokenize )
16011601
return eventType = ENTITY_REF;
16021602
// check if replacement text can be resolved !!!
1603-
if ( resolvedEntity == null )
1603+
if ( resolvedEntityRefCharBuf == BUF_NOT_RESOLVED )
16041604
{
16051605
if ( entityRefName == null )
16061606
{
@@ -1628,7 +1628,7 @@ else if ( ch == '&' )
16281628
}
16291629
// assert usePC == true;
16301630
// write into PC replacement text - do merge for replacement text!!!!
1631-
for ( char aResolvedEntity : resolvedEntity )
1631+
for ( char aResolvedEntity : resolvedEntityRefCharBuf )
16321632
{
16331633
if ( pcEnd >= pc.length )
16341634
{
@@ -2675,9 +2675,28 @@ else if ( ch == '\t' || ch == '\n' || ch == '\r' )
26752675
return ch;
26762676
}
26772677

2678-
private char[] charRefOneCharBuf = new char[1];
2678+
// state representing that no entity ref has been resolved
2679+
private static final char[] BUF_NOT_RESOLVED = new char[0];
2680+
2681+
// predefined entity refs
2682+
private static final char[] BUF_LT = new char[] { '<' };
2683+
private static final char[] BUF_AMP = new char[] { '&' };
2684+
private static final char[] BUF_GT = new char[] { '>' };
2685+
private static final char[] BUF_APO = new char[] { '\'' };
2686+
private static final char[] BUF_QUOT = new char[] { '"' };
26792687

2680-
private char[] parseEntityRef()
2688+
private char[] resolvedEntityRefCharBuf = BUF_NOT_RESOLVED;
2689+
2690+
/**
2691+
* Parse an entity reference: either a character reference or one of the predefined named entities.
2692+
*
2693+
* @return for a character reference, the number of chars in its replacement text (the length of
2694+
* resolvedEntityRefCharBuf); for a named reference, the length of the entity name, whether or not it
2695+
* is one of the predefined entities.
2696+
* @throws XmlPullParserException if invalid XML is detected.
2697+
* @throws IOException if an I/O error is found.
2698+
*/
2699+
private int parseCharOrPredefinedEntityRef()
26812700
throws XmlPullParserException, IOException
26822701
{
26832702
// entity reference http://www.w3.org/TR/2000/REC-xml-20001006#NT-Reference
@@ -2686,6 +2705,8 @@ private char[] parseEntityRef()
26862705
// ASSUMPTION just after &
26872706
entityRefName = null;
26882707
posStart = pos;
2708+
int len = 0;
2709+
resolvedEntityRefCharBuf = BUF_NOT_RESOLVED;
26892710
char ch = more();
26902711
if ( ch == '#' )
26912712
{
@@ -2750,7 +2771,6 @@ else if ( ch >= 'A' && ch <= 'F' )
27502771
ch = more();
27512772
}
27522773
}
2753-
posEnd = pos - 1;
27542774

27552775
boolean isValidCodePoint = true;
27562776
try
@@ -2759,7 +2779,7 @@ else if ( ch >= 'A' && ch <= 'F' )
27592779
isValidCodePoint = isValidCodePoint( codePoint );
27602780
if ( isValidCodePoint )
27612781
{
2762-
charRefOneCharBuf = Character.toChars( codePoint );
2782+
resolvedEntityRefCharBuf = Character.toChars( codePoint );
27632783
}
27642784
}
27652785
catch ( IllegalArgumentException e )
@@ -2775,14 +2795,14 @@ else if ( ch >= 'A' && ch <= 'F' )
27752795

27762796
if ( tokenize )
27772797
{
2778-
text = newString( charRefOneCharBuf, 0, charRefOneCharBuf.length );
2798+
text = newString( resolvedEntityRefCharBuf, 0, resolvedEntityRefCharBuf.length );
27792799
}
2780-
return charRefOneCharBuf;
2800+
len = resolvedEntityRefCharBuf.length;
27812801
}
27822802
else
27832803
{
27842804
// [68] EntityRef ::= '&' Name ';'
2785-
// scan anem until ;
2805+
// scan name until ;
27862806
if ( !isNameStartChar( ch ) )
27872807
{
27882808
throw new XmlPullParserException( "entity reference names can not start with character '"
@@ -2801,17 +2821,15 @@ else if ( ch >= 'A' && ch <= 'F' )
28012821
+ printable( ch ) + "'", this, null );
28022822
}
28032823
}
2804-
posEnd = pos - 1;
28052824
// determine what name maps to
2806-
final int len = posEnd - posStart;
2825+
len = ( pos - 1 ) - posStart;
28072826
if ( len == 2 && buf[posStart] == 'l' && buf[posStart + 1] == 't' )
28082827
{
28092828
if ( tokenize )
28102829
{
28112830
text = "<";
28122831
}
2813-
charRefOneCharBuf[0] = '<';
2814-
return charRefOneCharBuf;
2832+
resolvedEntityRefCharBuf = BUF_LT;
28152833
// if(paramPC || isParserTokenizing) {
28162834
// if(pcEnd >= pc.length) ensurePC();
28172835
// pc[pcEnd++] = '<';
@@ -2823,17 +2841,15 @@ else if ( len == 3 && buf[posStart] == 'a' && buf[posStart + 1] == 'm' && buf[po
28232841
{
28242842
text = "&";
28252843
}
2826-
charRefOneCharBuf[0] = '&';
2827-
return charRefOneCharBuf;
2844+
resolvedEntityRefCharBuf = BUF_AMP;
28282845
}
28292846
else if ( len == 2 && buf[posStart] == 'g' && buf[posStart + 1] == 't' )
28302847
{
28312848
if ( tokenize )
28322849
{
28332850
text = ">";
28342851
}
2835-
charRefOneCharBuf[0] = '>';
2836-
return charRefOneCharBuf;
2852+
resolvedEntityRefCharBuf = BUF_GT;
28372853
}
28382854
else if ( len == 4 && buf[posStart] == 'a' && buf[posStart + 1] == 'p' && buf[posStart + 2] == 'o'
28392855
&& buf[posStart + 3] == 's' )
@@ -2842,8 +2858,7 @@ else if ( len == 4 && buf[posStart] == 'a' && buf[posStart + 1] == 'p' && buf[po
28422858
{
28432859
text = "'";
28442860
}
2845-
charRefOneCharBuf[0] = '\'';
2846-
return charRefOneCharBuf;
2861+
resolvedEntityRefCharBuf = BUF_APO;
28472862
}
28482863
else if ( len == 4 && buf[posStart] == 'q' && buf[posStart + 1] == 'u' && buf[posStart + 2] == 'o'
28492864
&& buf[posStart + 3] == 't' )
@@ -2852,25 +2867,65 @@ else if ( len == 4 && buf[posStart] == 'q' && buf[posStart + 1] == 'u' && buf[po
28522867
{
28532868
text = "\"";
28542869
}
2855-
charRefOneCharBuf[0] = '"';
2856-
return charRefOneCharBuf;
2857-
}
2858-
else
2859-
{
2860-
final char[] result = lookuEntityReplacement( len );
2861-
if ( result != null )
2862-
{
2863-
return result;
2864-
}
2870+
resolvedEntityRefCharBuf = BUF_QUOT;
28652871
}
2866-
if ( tokenize )
2867-
text = null;
2868-
return null;
28692872
}
2873+
2874+
posEnd = pos;
2875+
2876+
return len;
2877+
}
2878+
2879+
/**
2880+
* Parse an entity reference inside the DOCDECL section.
2881+
*
2882+
* @throws XmlPullParserException if invalid XML is detected.
2883+
* @throws IOException if an I/O error is found.
2884+
*/
2885+
private void parseEntityRefInDocDecl()
2886+
throws XmlPullParserException, IOException
2887+
{
2888+
parseCharOrPredefinedEntityRef();
2889+
if (usePC) {
2890+
posStart--; // include in PC the starting '&' of the entity
2891+
joinPC();
2892+
}
2893+
2894+
if ( resolvedEntityRefCharBuf != BUF_NOT_RESOLVED )
2895+
return;
2896+
if ( tokenize )
2897+
text = null;
2898+
}
2899+
2900+
/**
2901+
* Parse an entity reference inside a tag or attribute.
2902+
*
2903+
* @throws XmlPullParserException if invalid XML is detected.
2904+
* @throws IOException if an I/O error is found.
2905+
*/
2906+
private void parseEntityRef()
2907+
throws XmlPullParserException, IOException
2908+
{
2909+
final int len = parseCharOrPredefinedEntityRef();
2910+
2911+
posEnd--; // don't involve the final ';' from the entity in the search
2912+
2913+
if ( resolvedEntityRefCharBuf != BUF_NOT_RESOLVED ) {
2914+
return;
2915+
}
2916+
2917+
resolvedEntityRefCharBuf = lookuEntityReplacement( len );
2918+
if ( resolvedEntityRefCharBuf != BUF_NOT_RESOLVED )
2919+
{
2920+
return;
2921+
}
2922+
if ( tokenize )
2923+
text = null;
28702924
}
28712925

28722926
/**
2873-
* Check if the provided parameter is a valid Char, according to: {@link https://www.w3.org/TR/REC-xml/#NT-Char}
2927+
* Check if the provided parameter is a valid Char. According to
2928+
* <a href="https://www.w3.org/TR/REC-xml/#NT-Char">https://www.w3.org/TR/REC-xml/#NT-Char</a>
28742929
*
28752930
* @param codePoint the numeric value to check
28762931
* @return true if it is a valid numeric character reference. False otherwise.
@@ -2883,8 +2938,6 @@ private static boolean isValidCodePoint( int codePoint )
28832938
}
28842939

28852940
private char[] lookuEntityReplacement( int entityNameLen )
2886-
throws XmlPullParserException, IOException
2887-
28882941
{
28892942
if ( !allStringsInterned )
28902943
{
@@ -2919,7 +2972,7 @@ private char[] lookuEntityReplacement( int entityNameLen )
29192972
}
29202973
}
29212974
}
2922-
return null;
2975+
return BUF_NOT_RESOLVED;
29232976
}
29242977

29252978
private void parseComment()
@@ -2977,7 +3030,7 @@ else if (isValidCodePoint( ch ))
29773030
}
29783031
else
29793032
{
2980-
throw new XmlPullParserException( "Illegal character 0x" + Integer.toHexString(((int) ch)) + " found in comment", this, null );
3033+
throw new XmlPullParserException( "Illegal character 0x" + Integer.toHexString(ch) + " found in comment", this, null );
29813034
}
29823035
if ( normalizeIgnorableWS )
29833036
{
@@ -3484,7 +3537,8 @@ else if ( ch == '>' && bracketLevel == 0 )
34843537
break;
34853538
else if ( ch == '&' )
34863539
{
3487-
extractEntityRef();
3540+
extractEntityRefInDocDecl();
3541+
continue;
34883542
}
34893543
if ( normalizeIgnorableWS )
34903544
{
@@ -3536,6 +3590,19 @@ else if ( ch == '\n' )
35363590

35373591
}
35383592
posEnd = pos - 1;
3593+
text = null;
3594+
}
3595+
3596+
private void extractEntityRefInDocDecl()
3597+
throws XmlPullParserException, IOException
3598+
{
3599+
// extractEntityRef
3600+
posEnd = pos - 1;
3601+
3602+
int prevPosStart = posStart;
3603+
parseEntityRefInDocDecl();
3604+
3605+
posStart = prevPosStart;
35393606
}
35403607

35413608
private void extractEntityRef()
@@ -3559,9 +3626,9 @@ private void extractEntityRef()
35593626
}
35603627
// assert usePC == true;
35613628

3562-
final char[] resolvedEntity = parseEntityRef();
3629+
parseEntityRef();
35633630
// check if replacement text can be resolved !!!
3564-
if ( resolvedEntity == null )
3631+
if ( resolvedEntityRefCharBuf == BUF_NOT_RESOLVED )
35653632
{
35663633
if ( entityRefName == null )
35673634
{
@@ -3571,7 +3638,7 @@ private void extractEntityRef()
35713638
+ "'", this, null );
35723639
}
35733640
// write into PC replacement text - do merge for replacement text!!!!
3574-
for ( char aResolvedEntity : resolvedEntity )
3641+
for ( char aResolvedEntity : resolvedEntityRefCharBuf )
35753642
{
35763643
if ( pcEnd >= pc.length )
35773644
{

0 commit comments

Comments
 (0)