Skip to content

Commit c2c14e9

Browse files
committed
OPENNLP-1124: Optimize XML parser configuration
1 parent 52573ea commit c2c14e9

File tree

5 files changed

+69
-32
lines changed

5 files changed

+69
-32
lines changed

opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStream.java

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,14 @@
2222
import java.util.ArrayList;
2323
import java.util.List;
2424

25-
import javax.xml.parsers.ParserConfigurationException;
2625
import javax.xml.parsers.SAXParser;
27-
import javax.xml.parsers.SAXParserFactory;
2826

2927
import org.xml.sax.SAXException;
3028

3129
import opennlp.tools.parser.Parse;
3230
import opennlp.tools.util.FilterObjectStream;
3331
import opennlp.tools.util.ObjectStream;
32+
import opennlp.tools.util.XmlUtil;
3433

3534
public class ConstitParseSampleStream extends FilterObjectStream<byte[], Parse> {
3635

@@ -40,13 +39,7 @@ public class ConstitParseSampleStream extends FilterObjectStream<byte[], Parse>
4039

4140
protected ConstitParseSampleStream(ObjectStream<byte[]> samples) {
4241
super(samples);
43-
44-
SAXParserFactory factory = SAXParserFactory.newInstance();
45-
try {
46-
saxParser = factory.newSAXParser();
47-
} catch (ParserConfigurationException | SAXException e) {
48-
throw new IllegalStateException(e);
49-
}
42+
saxParser = XmlUtil.createSaxParser();
5043
}
5144

5245
public Parse read() throws IOException {

opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,6 @@
3030
import java.util.Map.Entry;
3131

3232
import javax.xml.parsers.DocumentBuilder;
33-
import javax.xml.parsers.DocumentBuilderFactory;
34-
import javax.xml.parsers.ParserConfigurationException;
3533

3634
import org.w3c.dom.Document;
3735
import org.w3c.dom.Node;
@@ -40,6 +38,7 @@
4038

4139
import opennlp.tools.tokenize.TokenSample;
4240
import opennlp.tools.util.Span;
41+
import opennlp.tools.util.XmlUtil;
4342

4443
/**
4544
* A structure to hold an Irish Sentence Bank document, which is a collection
@@ -154,8 +153,7 @@ public static IrishSentenceBankDocument parse(InputStream is) throws IOException
154153
IrishSentenceBankDocument document = new IrishSentenceBankDocument();
155154

156155
try {
157-
DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
158-
DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
156+
DocumentBuilder docBuilder = XmlUtil.createDocumentBuilder();
159157
Document doc = docBuilder.parse(is);
160158

161159
String root = doc.getDocumentElement().getNodeName();
@@ -262,8 +260,6 @@ public static IrishSentenceBankDocument parse(InputStream is) throws IOException
262260
}
263261
}
264262
return document;
265-
} catch (ParserConfigurationException e) {
266-
throw new IllegalStateException(e);
267263
} catch (SAXException e) {
268264
throw new IOException("Failed to parse IrishSentenceBank document", e);
269265
}

opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/LetsmtDocument.java

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,15 @@
2626
import java.util.Collections;
2727
import java.util.List;
2828

29-
import javax.xml.parsers.ParserConfigurationException;
3029
import javax.xml.parsers.SAXParser;
31-
import javax.xml.parsers.SAXParserFactory;
3230

3331
import org.xml.sax.InputSource;
3432
import org.xml.sax.SAXException;
3533
import org.xml.sax.XMLReader;
3634
import org.xml.sax.helpers.DefaultHandler;
3735

36+
import opennlp.tools.util.XmlUtil;
37+
3838
/**
3939
* A structure to hold the letsmt document. The documents contains sentences and depending on the
4040
* source it either contains tokenized text (words) or an un-tokenized sentence string.
@@ -118,18 +118,14 @@ public List<LetsmtSentence> getSentences() {
118118
}
119119

120120
static LetsmtDocument parse(InputStream letsmtXmlIn) throws IOException {
121-
SAXParserFactory spf = SAXParserFactory.newInstance();
121+
SAXParser saxParser = XmlUtil.createSaxParser();
122122

123123
try {
124-
SAXParser saxParser = spf.newSAXParser();
125-
126124
XMLReader xmlReader = saxParser.getXMLReader();
127125
LetsmtDocumentHandler docHandler = new LetsmtDocumentHandler();
128126
xmlReader.setContentHandler(docHandler);
129127
xmlReader.parse(new InputSource(letsmtXmlIn));
130128
return new LetsmtDocument(docHandler.sentences);
131-
} catch (ParserConfigurationException e) {
132-
throw new IllegalStateException(e);
133129
} catch (SAXException e) {
134130
throw new IOException("Failed to parse letsmt xml!", e);
135131
}

opennlp-tools/src/main/java/opennlp/tools/util/XmlUtil.java

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package opennlp.tools.util;
19+
20+
import javax.xml.XMLConstants;
21+
import javax.xml.parsers.DocumentBuilder;
22+
import javax.xml.parsers.DocumentBuilderFactory;
23+
import javax.xml.parsers.ParserConfigurationException;
24+
import javax.xml.parsers.SAXParser;
25+
import javax.xml.parsers.SAXParserFactory;
26+
27+
import org.xml.sax.SAXException;
28+
29+
/**
* Static factory helpers that create JAXP parsers with
* {@link XMLConstants#FEATURE_SECURE_PROCESSING} switched on, so that callers
* do not have to configure a parser factory themselves.
* <p>
* Both helpers build a fresh factory on every call and convert the checked
* configuration exceptions into an unchecked {@link IllegalStateException}.
* <p>
* NOTE(review): only the secure-processing feature is set here; whether that
* also blocks DOCTYPE declarations / external entities depends on the JAXP
* implementation in use - confirm if stricter hardening is required.
*/
public class XmlUtil {
30+
31+
/**
32+
* Creates a new {@link DocumentBuilder} from a newly instantiated
* {@link DocumentBuilderFactory} on which
* {@link XMLConstants#FEATURE_SECURE_PROCESSING} has been enabled.
33+
*
34+
* @return a new DocumentBuilder produced by the configured factory
* @throws IllegalStateException if the factory rejects the feature or cannot
* create a builder; the {@link ParserConfigurationException} is kept as cause
35+
*/
36+
public static DocumentBuilder createDocumentBuilder() {
37+
try {
38+
DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
39+
// The feature must be set on the factory before the builder is created.
documentBuilderFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
40+
return documentBuilderFactory.newDocumentBuilder();
41+
} catch (ParserConfigurationException e) {
42+
// Rethrown unchecked, with the original exception preserved as the cause.
throw new IllegalStateException(e);
43+
}
44+
}
45+
46+
/**
47+
* Creates a new {@link SAXParser} from a newly instantiated
* {@link SAXParserFactory} on which
* {@link XMLConstants#FEATURE_SECURE_PROCESSING} has been enabled.
48+
*
49+
* @return a new SAXParser produced by the configured factory
* @throws IllegalStateException if setting the feature or creating the parser
* fails; the {@link ParserConfigurationException} or {@link SAXException} is
* kept as cause
50+
*/
51+
public static SAXParser createSaxParser() {
52+
SAXParserFactory spf = SAXParserFactory.newInstance();
53+
try {
54+
// The feature must be set on the factory before the parser is created.
spf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
55+
return spf.newSAXParser();
56+
} catch (ParserConfigurationException | SAXException e) {
57+
// Rethrown unchecked, with the original exception preserved as the cause.
throw new IllegalStateException(e);
58+
}
59+
}
60+
}

opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,6 @@
2828
import java.util.Objects;
2929

3030
import javax.xml.parsers.DocumentBuilder;
31-
import javax.xml.parsers.DocumentBuilderFactory;
32-
import javax.xml.parsers.ParserConfigurationException;
3331
import javax.xml.xpath.XPath;
3432
import javax.xml.xpath.XPathConstants;
3533
import javax.xml.xpath.XPathExpression;
@@ -45,6 +43,7 @@
4543
import opennlp.tools.dictionary.Dictionary;
4644
import opennlp.tools.postag.POSModel;
4745
import opennlp.tools.util.InvalidFormatException;
46+
import opennlp.tools.util.XmlUtil;
4847
import opennlp.tools.util.ext.ExtensionLoader;
4948
import opennlp.tools.util.model.ArtifactSerializer;
5049
import opennlp.tools.util.model.DictionarySerializer;
@@ -735,15 +734,8 @@ static AdaptiveFeatureGenerator createGenerator(Element generatorElement,
735734

736735
private static org.w3c.dom.Document createDOM(InputStream xmlDescriptorIn)
737736
throws IOException {
738-
DocumentBuilderFactory documentBuilderFacoty = DocumentBuilderFactory.newInstance();
739737

740-
DocumentBuilder documentBuilder;
741-
742-
try {
743-
documentBuilder = documentBuilderFacoty.newDocumentBuilder();
744-
} catch (ParserConfigurationException e) {
745-
throw new IllegalStateException(e);
746-
}
738+
DocumentBuilder documentBuilder = XmlUtil.createDocumentBuilder();
747739

748740
org.w3c.dom.Document xmlDescriptorDOM;
749741

0 commit comments

Comments
 (0)