Skip to content

Commit 156f2ea

Browse files
committed
XML allows for comments and processing instructions to be present before the start and after the end of the root element. Currently, FactoryAdapter does not capture those nodes, and XMLLoader.loadXML does not provide access to anything other than the root element anyway.
This pull request addresses the issue. Note: at least with the JDK's Xerces, whitespace in the prolog and epilogue gets lost in parsing: the parser does not fire any white-space related events.
1 parent df9759b commit 156f2ea

File tree

4 files changed

+75
-24
lines changed

4 files changed

+75
-24
lines changed

jvm/src/test/scala/scala/xml/XMLTest.scala

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -586,7 +586,7 @@ class XMLTestJVM {
586586
def issue508commentParsing: Unit = {
587587
// confirm that comments are processed correctly now
588588
roundtrip("<a><!-- comment --> suffix</a>")
589-
roundtrip("<a>prefix <!-- comment --> suffix</a>")
589+
roundtrip("<a>prefix <!-- comment --> <!-- comment2 --> suffix</a>")
590590
roundtrip("<a>prefix <b><!-- comment --></b> suffix</a>")
591591
roundtrip("<a>prefix <b><!-- multi-\nline\n comment --></b> suffix</a>")
592592
roundtrip("""<a>prefix <b><!-- multi-
@@ -596,13 +596,7 @@ class XMLTestJVM {
596596
// confirm that processing instructions were always processed correctly
597597
roundtrip("<a><?target content ?> suffix</a>")
598598
roundtrip("<a>prefix <?target content ?> suffix</a>")
599-
roundtrip("<a>prefix <b><?target content?></b> suffix</a>")
600-
601-
// TODO since XMLLoader retrieves FactoryAdapter.rootNode,
602-
// capturing comments before and after the root element is not currently possible
603-
// (by the way, the same applies to processing instructions).
604-
//check("<!-- prologue --><a>text</a>")
605-
//check("<a>text</a><!-- epilogue -->")
599+
roundtrip("<a>prefix <b><?target content?> </b> suffix</a>")
606600
}
607601

608602
@UnitTest
@@ -613,7 +607,26 @@ class XMLTestJVM {
613607
roundtrip("""<a>prefix <b><![CDATA[
614608
| multi-
615609
| line cdata
616-
| section]]></b> suffix</a>""".stripMargin)
610+
| section]]> </b> suffix</a>""".stripMargin)
611+
}
612+
613+
def roundtripNodes(xml: String): Unit = assertEquals(xml, XML.loadStringNodes(xml).map(_.toString).mkString(""))
614+
615+
@UnitTest
616+
def xmlLoaderLoadNodes: Unit = {
617+
roundtripNodes("<!-- prolog --><a>text</a>")
618+
roundtripNodes("<!-- prolog --><?target content ?><!-- comment2 --><a>text</a>")
619+
roundtripNodes("""<!-- prolog
620+
| --><?target content ?><!--
621+
| comment2 --><a>text</a>""".stripMargin)
622+
623+
roundtripNodes("<a>text</a><!-- epilogue -->")
624+
roundtripNodes("<a>text</a><!-- epilogue --><?target content ?><!-- comment2 -->")
625+
626+
// Note: at least with the JDK's Xerces, whitespace in the prolog and epilogue gets lost in parsing:
627+
// the parser does not fire any white-space related events, so:
628+
// does not work: roundtripNodes("<!-- c --> <a/>")
629+
// does not work: roundtripNodes("<a/> <!-- epilogue -->")
617630
}
618631

619632
@UnitTest

shared/src/main/scala/scala/xml/factory/XMLLoader.scala

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,19 +51,29 @@ trait XMLLoader[T <: Node] {
5151
* The methods available in scala.xml.XML use the XML parser in the JDK.
5252
*/
5353
def loadXML(source: InputSource, parser: SAXParser): T = {
54-
val newAdapter = adapter
54+
val result: FactoryAdapter = parse(source, parser)
55+
result.rootElem.asInstanceOf[T]
56+
}
57+
58+
def loadXMLNodes(source: InputSource, parser: SAXParser): Seq[Node] = {
59+
val result: FactoryAdapter = parse(source, parser)
60+
result.prolog ++ (result.rootElem :: result.epilogue)
61+
}
62+
63+
private def parse(source: InputSource, parser: SAXParser): FactoryAdapter = {
64+
val result: FactoryAdapter = adapter
5565

5666
try {
57-
parser.setProperty("http://xml.org/sax/properties/lexical-handler", newAdapter)
67+
parser.setProperty("http://xml.org/sax/properties/lexical-handler", result)
5868
} catch {
5969
case _: SAXNotRecognizedException =>
6070
}
6171

62-
newAdapter.scopeStack = TopScope :: newAdapter.scopeStack
63-
parser.parse(source, newAdapter)
64-
newAdapter.scopeStack = newAdapter.scopeStack.tail
72+
result.scopeStack = TopScope :: result.scopeStack
73+
parser.parse(source, result)
74+
result.scopeStack = result.scopeStack.tail
6575

66-
newAdapter.rootElem.asInstanceOf[T]
76+
result
6777
}
6878

6979
/** Loads XML from the given file, file descriptor, or filename. */
@@ -80,4 +90,15 @@ trait XMLLoader[T <: Node] {
8090

8191
/** Loads XML from the given String. */
8292
def loadString(string: String): T = loadXML(fromString(string), parser)
83-
}
93+
94+
/** Load XML nodes, including comments and processing instructions that precede and follow the root element. */
95+
def loadFileNodes(file: File): Seq[Node] = loadXMLNodes(fromFile(file), parser)
96+
def loadFileNodes(fd: FileDescriptor): Seq[Node] = loadXMLNodes(fromFile(fd), parser)
97+
def loadFileNodes(name: String): Seq[Node] = loadXMLNodes(fromFile(name), parser)
98+
def loadNodes(is: InputStream): Seq[Node] = loadXMLNodes(fromInputStream(is), parser)
99+
def loadNodes(reader: Reader): Seq[Node] = loadXMLNodes(fromReader(reader), parser)
100+
def loadNodes(sysID: String): Seq[Node] = loadXMLNodes(fromSysId(sysID), parser)
101+
def loadNodes(source: InputSource): Seq[Node] = loadXMLNodes(source, parser)
102+
def loadNodes(url: URL): Seq[Node] = loadXMLNodes(fromInputStream(url.openStream()), parser)
103+
def loadStringNodes(string: String): Seq[Node] = loadXMLNodes(fromString(string), parser)
104+
}

shared/src/main/scala/scala/xml/parsing/FactoryAdapter.scala

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,11 @@ trait ConsoleErrorHandler extends DefaultHandler2 {
4040
* underlying SAX parser.
4141
*/
4242
abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Node] {
43+
var prolog: List[Node] = List.empty
4344
var rootElem: Node = _
45+
var epilogue: List[Node] = List.empty
4446

45-
val buffer = new StringBuilder()
47+
val buffer: StringBuilder = new StringBuilder()
4648
private var inCDATA: Boolean = false
4749

4850
/** List of attributes
@@ -51,28 +53,28 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod
5153
*
5254
* @since 2.0.0
5355
*/
54-
var attribStack = List.empty[MetaData]
56+
var attribStack: List[MetaData] = List.empty
5557
/** List of elements
5658
*
5759
* Previously was a mutable [[scala.collection.mutable.Stack Stack]], but is now a mutable reference to an immutable [[scala.collection.immutable.List List]].
5860
*
5961
* @since 2.0.0
6062
*/
61-
var hStack = List.empty[Node] // [ element ] contains siblings
63+
var hStack: List[Node] = List.empty // [ element ] contains siblings
6264
/** List of element names
6365
*
6466
* Previously was a mutable [[scala.collection.mutable.Stack Stack]], but is now a mutable reference to an immutable [[scala.collection.immutable.List List]].
6567
*
6668
* @since 2.0.0
6769
*/
68-
var tagStack = List.empty[String]
70+
var tagStack: List[String] = List.empty
6971
/** List of namespaces
7072
*
7173
* Previously was a mutable [[scala.collection.mutable.Stack Stack]], but is now a mutable reference to an immutable [[scala.collection.immutable.List List]].
7274
*
7375
* @since 2.0.0
7476
*/
75-
var scopeStack = List.empty[NamespaceBinding]
77+
var scopeStack: List[NamespaceBinding] = List.empty
7678

7779
var curTag: String = _
7880
var capture: Boolean = false
@@ -123,7 +125,7 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod
123125
// ContentHandler methods
124126
//
125127

126-
val normalizeWhitespace = false
128+
val normalizeWhitespace: Boolean = false
127129

128130
/**
129131
* Capture characters, possibly normalizing whitespace.
@@ -177,13 +179,20 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod
177179
attributes: Attributes): Unit =
178180
{
179181
captureText()
182+
183+
// capture the prolog at the start of the root element
184+
if (tagStack.isEmpty) {
185+
prolog = hStack.reverse
186+
hStack = List.empty
187+
}
188+
180189
tagStack = curTag :: tagStack
181190
curTag = qname
182191

183192
val localName = splitName(qname)._2
184193
capture = nodeContainsText(localName)
185194

186-
hStack = null :: hStack
195+
hStack = null :: hStack
187196
var m: MetaData = Null
188197
var scpe: NamespaceBinding =
189198
if (scopeStack.isEmpty) TopScope
@@ -193,7 +202,7 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod
193202
val qname = attributes getQName i
194203
val value = attributes getValue i
195204
val (pre, key) = splitName(qname)
196-
def nullIfEmpty(s: String) = if (s == "") null else s
205+
def nullIfEmpty(s: String): String = if (s == "") null else s
197206

198207
if (pre == "xmlns" || (pre == null && qname == "xmlns")) {
199208
val arg = if (pre == null) null else key
@@ -250,6 +259,12 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod
250259
capture = curTag != null && nodeContainsText(curTag) // root level
251260
}
252261

262+
override def endDocument(): Unit = {
263+
// capture the epilogue at the end of the document
264+
epilogue = hStack.init.reverse
265+
hStack = hStack.last :: Nil
266+
}
267+
253268
/**
254269
* Processing instruction.
255270
*/

shared/src/main/scala/scala/xml/parsing/MarkupParser.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ trait MarkupParser extends MarkupParserCommon with TokenTests {
9898
var extIndex = -1
9999

100100
/** holds temporary values of pos */
101+
// Note: this is clearly an override, but if marked as such it causes a "...cannot override a mutable variable"
102+
// error with Scala 3; does it work with Scala 3 if not explicitly marked as an override remains to be seen...
101103
var tmppos: Int = _
102104

103105
/** holds the next character */

0 commit comments

Comments
 (0)