Skip to content

Commit 01c823e

Browse files
authored
Make Parser instances threadsafe (#2314)
* Make Parser instances threadsafe I've seen examples of people trying to use a Parser in a concurrent manner, and this may succeed in low-concurrency modes, but will fail when under pressure. Adding an explicit lock to ensure safety. * Retain same Parser in Document & Element clone (#2315) When cloning an Element or a Document, the same underlying Parser should be retained. We were creating new Parsers with HTML treebuilders on each Element clone. Which is both heavyweight and incorrect. The Parser is now threadsafe so if the cloned element was being used (appending HTML) across threads, this will still be safe. (Albeit not concurrent; users can set a new Parser via ownerDocument().parser().newInstance() for a copy if required.)
1 parent dcb4f79 commit 01c823e

File tree

6 files changed

+137
-21
lines changed

6 files changed

+137
-21
lines changed

src/main/java/org/jsoup/nodes/Document.java

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,13 @@ public class Document extends Element {
3737
@see #createShell
3838
*/
3939
public Document(String namespace, String baseUri) {
40+
this(namespace, baseUri, Parser.htmlParser()); // default HTML parser, but overridable
41+
}
42+
43+
private Document(String namespace, String baseUri, Parser parser) {
4044
super(new Tag("#root", namespace), baseUri);
4145
this.location = baseUri;
42-
this.parser = Parser.htmlParser(); // default, but overridable
46+
this.parser = parser;
4347
}
4448

4549
/**
@@ -293,16 +297,16 @@ public boolean updateMetaCharsetElement() {
293297
@Override
294298
public Document clone() {
295299
Document clone = (Document) super.clone();
300+
if (attributes != null) clone.attributes = attributes.clone();
296301
clone.outputSettings = this.outputSettings.clone();
297-
clone.parser = this.parser.clone();
302+
// parser is pointer copy
298303
return clone;
299304
}
300305

301306
@Override
302307
public Document shallowClone() {
303-
Document clone = new Document(this.tag().namespace(), baseUri());
304-
if (attributes != null)
305-
clone.attributes = attributes.clone();
308+
Document clone = new Document(this.tag().namespace(), baseUri(), parser); // preserves parser pointer
309+
if (attributes != null) clone.attributes = attributes.clone();
306310
clone.outputSettings = this.outputSettings.clone();
307311
return clone;
308312
}

src/main/java/org/jsoup/nodes/Node.java

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -867,15 +867,20 @@ public boolean hasSameValue(@Nullable Object o) {
867867
}
868868

869869
/**
870-
* Create a stand-alone, deep copy of this node, and all of its children. The cloned node will have no siblings or
871-
* parent node. As a stand-alone object, any changes made to the clone or any of its children will not impact the
872-
* original node.
873-
* <p>
874-
* The cloned node may be adopted into another Document or node structure using {@link Element#appendChild(Node)}.
875-
* @return a stand-alone cloned node, including clones of any children
876-
* @see #shallowClone()
877-
*/
878-
@SuppressWarnings("MethodDoesntCallSuperMethod") // because it does call super.clone in doClone - analysis just isn't following
870+
Create a stand-alone, deep copy of this node, and all of its children. The cloned node will have no siblings.
871+
<p><ul>
872+
<li>If this node is a {@link LeafNode}, the clone will have no parent.</li>
873+
<li>If this node is an {@link Element}, the clone will have a simple owning {@link Document} to retain the
874+
configured output settings and parser.</li>
875+
</ul></p>
876+
<p>The cloned node may be adopted into another Document or node structure using
877+
{@link Element#appendChild(Node)}.</p>
878+
879+
@return a stand-alone cloned node, including clones of any children
880+
@see #shallowClone()
881+
*/
882+
@SuppressWarnings("MethodDoesntCallSuperMethod")
883+
// because it does call super.clone in doClone - analysis just isn't following
879884
@Override
880885
public Node clone() {
881886
Node thisClone = doClone(null); // splits for orphan

src/main/java/org/jsoup/parser/Parser.java

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,15 @@
99
import java.io.Reader;
1010
import java.io.StringReader;
1111
import java.util.List;
12+
import java.util.concurrent.locks.ReentrantLock;
1213

1314
/**
1415
Parses HTML or XML into a {@link org.jsoup.nodes.Document}. Generally, it is simpler to use one of the parse methods in
1516
{@link org.jsoup.Jsoup}.
16-
<p>Note that a Parser instance object is not threadsafe. To reuse a Parser configuration in a multi-threaded
17-
environment, use {@link #newInstance()} to make copies. */
17+
<p>Note that a given Parser instance object is threadsafe, but not concurrent. (Concurrent parse calls will
18+
synchronize.) To reuse a Parser configuration in a multithreaded environment, use {@link #newInstance()} to make
19+
copies.</p>
20+
*/
1821
public class Parser implements Cloneable {
1922
public static final String NamespaceHtml = "http://www.w3.org/1999/xhtml";
2023
public static final String NamespaceXml = "http://www.w3.org/XML/1998/namespace";
@@ -25,7 +28,8 @@ public class Parser implements Cloneable {
2528
private ParseErrorList errors;
2629
private ParseSettings settings;
2730
private boolean trackPosition = false;
28-
@Nullable TagSet tagSet;
31+
private @Nullable TagSet tagSet;
32+
private final ReentrantLock lock = new ReentrantLock();
2933

3034
/**
3135
* Create a new Parser, using the specified TreeBuilder
@@ -63,15 +67,25 @@ public Document parseInput(String html, String baseUri) {
6367
}
6468

6569
public Document parseInput(Reader inputHtml, String baseUri) {
66-
return treeBuilder.parse(inputHtml, baseUri, this);
70+
try {
71+
lock.lock(); // using a lock vs synchronized to support loom threads
72+
return treeBuilder.parse(inputHtml, baseUri, this);
73+
} finally {
74+
lock.unlock();
75+
}
6776
}
6877

6978
public List<Node> parseFragmentInput(String fragment, @Nullable Element context, String baseUri) {
7079
return parseFragmentInput(new StringReader(fragment), context, baseUri);
7180
}
7281

7382
public List<Node> parseFragmentInput(Reader fragment, @Nullable Element context, String baseUri) {
74-
return treeBuilder.parseFragment(fragment, context, baseUri, this);
83+
try {
84+
lock.lock();
85+
return treeBuilder.parseFragment(fragment, context, baseUri, this);
86+
} finally {
87+
lock.unlock();
88+
}
7589
}
7690

7791
// gets & sets
@@ -87,8 +101,9 @@ public TreeBuilder getTreeBuilder() {
87101
* Update the TreeBuilder used when parsing content.
88102
* @param treeBuilder new TreeBuilder
89103
* @return this, for chaining
104+
* @deprecated unused method, will be removed in 1.21.1
90105
*/
91-
public Parser setTreeBuilder(TreeBuilder treeBuilder) {
106+
@Deprecated public Parser setTreeBuilder(TreeBuilder treeBuilder) {
92107
this.treeBuilder = treeBuilder;
93108
treeBuilder.parser = this;
94109
return this;

src/test/java/org/jsoup/nodes/DocumentTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ public class DocumentTest {
112112
Document clone = doc.clone();
113113
assertNotSame(doc, clone);
114114
assertTrue(doc.hasSameValue(clone));
115-
assertNotSame(doc.parser(), clone.parser());
115+
assertSame(doc.parser(), clone.parser());
116116
assertNotSame(doc.outputSettings(), clone.outputSettings());
117117

118118
assertEquals("<html><head><title>Hello</title></head><body><p>One</p><p>Two</p></body></html>", TextUtil.stripNewlines(clone.html()));

src/test/java/org/jsoup/nodes/ElementTest.java

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1035,6 +1035,29 @@ public void testShallowClone() {
10351035
assertEquals(base, d2.baseUri());
10361036
}
10371037

1038+
@Test void cloneRetainsParser() {
1039+
Document htmlDoc = Jsoup.parse("<div><script></script></div>", Parser.htmlParser());
1040+
Document xmlDoc = Jsoup.parse("<div><script></script></div>", Parser.xmlParser());
1041+
1042+
Element hEl = htmlDoc.expectFirst("script");
1043+
Element hEl2 = hEl.clone();
1044+
assertNotSame(hEl, hEl2);
1045+
assertNotSame(hEl.ownerDocument(), hEl2.ownerDocument());
1046+
assertSame(hEl.ownerDocument().parser(), hEl2.ownerDocument().parser());
1047+
1048+
Document doc2 = htmlDoc.clone();
1049+
assertNotSame(htmlDoc, doc2);
1050+
assertSame(htmlDoc.parser(), doc2.parser());
1051+
1052+
hEl2.append("<foo></foo>"); // we are inside a script, should be parsed as data
1053+
assertEquals("<foo></foo>", hEl2.data());
1054+
1055+
Element xEl = xmlDoc.expectFirst("script");
1056+
Element xEl2 = xEl.clone();
1057+
xEl2.append("<foo></foo>"); // in XML, script doesn't mean anything, and so will be parsed as xml
1058+
assertEquals("<script><foo></foo></script>", xEl2.outerHtml());
1059+
}
1060+
10381061
@Test
10391062
public void testTagNameSet() {
10401063
Document doc = Jsoup.parse("<div><i>Hello</i>");

src/test/java/org/jsoup/parser/ParserIT.java

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
package org.jsoup.parser;
22

33
import org.jsoup.nodes.Document;
4+
import org.jsoup.nodes.Element;
45
import org.junit.jupiter.api.Disabled;
56
import org.junit.jupiter.api.Test;
67

8+
import java.util.ArrayList;
9+
import java.util.List;
10+
711
import static org.junit.jupiter.api.Assertions.assertEquals;
812
import static org.junit.jupiter.api.Assertions.assertTrue;
913

@@ -51,4 +55,69 @@ public void handlesDeepStack() {
5155
assertTrue(System.currentTimeMillis() - start < 20000); // I get ~ 1.5 seconds, but others have reported slower
5256
// was originally much longer, or stack overflow.
5357
}
58+
59+
@Test void parserIsThreadSafe() throws InterruptedException {
60+
// tests that a single parser can be called by multiple threads and won't blow up
61+
// without the lock, will see many exceptions in parse, and non-equal docs
62+
String html = "<div id=1><div id=2><div id=3>Text.</div></div></div>";
63+
Parser parser = Parser.htmlParser();
64+
Document expectDoc = parser.parseInput(html, "");
65+
66+
int numThreads = 10;
67+
int numLoops = 20;
68+
List<Thread> threads = new ArrayList<>(numThreads);
69+
List<Document> toCheck = new ArrayList<>(numThreads * numLoops);
70+
for (int i = 0; i < numThreads; i++) {
71+
Thread thread = new Thread(() -> {
72+
for (int j = 0; j < numLoops; j++) {
73+
Document doc = parser.parseInput(html, "");
74+
toCheck.add(doc);
75+
}
76+
});
77+
threads.add(thread);
78+
thread.start();
79+
}
80+
81+
for (Thread thread : threads) {
82+
thread.join();
83+
}
84+
85+
for (Document doc : toCheck) {
86+
assertTrue(doc.hasSameValue(expectDoc));
87+
}
88+
}
89+
90+
@Test void parserIsThreadSafeWithCloneAndAppend() throws InterruptedException {
91+
// tests that a single parser can be called by multiple threads via Element.clone().append()
92+
String html = "<div id=1><div id=2><div id=3></div></div></div>";
93+
String append = "<div id=4>Text.</div>";
94+
Parser parser = Parser.htmlParser();
95+
Document baseDoc = parser.parseInput(html, "");
96+
Element baseElement = baseDoc.expectFirst("#3");
97+
98+
int numThreads = 10;
99+
int numLoops = 20;
100+
List<Thread> threads = new ArrayList<>(numThreads);
101+
List<Element> toCheck = new ArrayList<>(numThreads * numLoops);
102+
for (int i = 0; i < numThreads; i++) {
103+
Thread thread = new Thread(() -> {
104+
for (int j = 0; j < numLoops; j++) {
105+
Element cloned = baseElement.clone();
106+
cloned.append(append); // invokes the parser internally - parseFragment
107+
toCheck.add(cloned);
108+
}
109+
});
110+
threads.add(thread);
111+
thread.start();
112+
}
113+
114+
for (Thread thread : threads) {
115+
thread.join();
116+
}
117+
118+
baseElement.append(append);
119+
for (Element element : toCheck) {
120+
assertTrue(element.hasSameValue(baseElement));
121+
}
122+
}
54123
}

0 commit comments

Comments
 (0)