Make Parser instances threadsafe (#2314)

jhy · web-flow · commit 01c823e35033 · 2025-04-28T13:24:06.000+10:00
* Make Parser instances threadsafe: I've seen examples of people trying to use a Parser in a concurrent manner; this may succeed under low concurrency, but will fail under pressure. Adding an explicit lock to ensure safety. * Retain same Parser in Document & Element clone (#2315): When cloning an Element or a Document, the same underlying Parser should be retained. We were creating new Parsers with HTML treebuilders on each Element clone, which is both heavyweight and incorrect. The Parser is now threadsafe, so if the cloned element is being used (appending HTML) across threads, this will still be safe (albeit not concurrent; users can set a new Parser via ownerDocument().parser().newInstance() for a copy if required).
diff --git a/src/main/java/org/jsoup/nodes/Document.java b/src/main/java/org/jsoup/nodes/Document.java
@@ -37,9 +37,13 @@ public class Document extends Element {
      @see #createShell
      */
     public Document(String namespace, String baseUri) {
+        this(namespace, baseUri, Parser.htmlParser()); // default HTML parser, but overridable
+    }
+
+    private Document(String namespace, String baseUri, Parser parser) {
         super(new Tag("#root", namespace), baseUri);
         this.location = baseUri;
-        this.parser = Parser.htmlParser(); // default, but overridable
+        this.parser = parser;
     }
 
     /**
@@ -293,16 +297,16 @@ public boolean updateMetaCharsetElement() {
     @Override
     public Document clone() {
         Document clone = (Document) super.clone();
+        if (attributes != null) clone.attributes = attributes.clone();
         clone.outputSettings = this.outputSettings.clone();
-        clone.parser = this.parser.clone();
+        // parser is pointer copy
         return clone;
     }
 
     @Override
     public Document shallowClone() {
-        Document clone = new Document(this.tag().namespace(), baseUri());
-        if (attributes != null)
-            clone.attributes = attributes.clone();
+        Document clone = new Document(this.tag().namespace(), baseUri(), parser); // preserves parser pointer
+        if (attributes != null) clone.attributes = attributes.clone();
         clone.outputSettings = this.outputSettings.clone();
         return clone;
     }
diff --git a/src/main/java/org/jsoup/nodes/Node.java b/src/main/java/org/jsoup/nodes/Node.java
@@ -867,15 +867,20 @@ public boolean hasSameValue(@Nullable Object o) {
     }
 
     /**
-     * Create a stand-alone, deep copy of this node, and all of its children. The cloned node will have no siblings or
-     * parent node. As a stand-alone object, any changes made to the clone or any of its children will not impact the
-     * original node.
-     * <p>
-     * The cloned node may be adopted into another Document or node structure using {@link Element#appendChild(Node)}.
-     * @return a stand-alone cloned node, including clones of any children
-     * @see #shallowClone()
-     */
-    @SuppressWarnings("MethodDoesntCallSuperMethod") // because it does call super.clone in doClone - analysis just isn't following
+     Create a stand-alone, deep copy of this node, and all of its children. The cloned node will have no siblings.
+     <ul>
+     <li>If this node is a {@link LeafNode}, the clone will have no parent.</li>
+     <li>If this node is an {@link Element}, the clone will have a simple owning {@link Document} to retain the
+     configured output settings and parser.</li>
+     </ul>
+     <p>The cloned node may be adopted into another Document or node structure using
+     {@link Element#appendChild(Node)}.</p>
+
+     @return a stand-alone cloned node, including clones of any children
+     @see #shallowClone()
+     */
+    @SuppressWarnings("MethodDoesntCallSuperMethod")
+    // because it does call super.clone in doClone - analysis just isn't following
     @Override
     public Node clone() {
         Node thisClone = doClone(null); // splits for orphan
diff --git a/src/main/java/org/jsoup/parser/Parser.java b/src/main/java/org/jsoup/parser/Parser.java
@@ -9,12 +9,15 @@
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.List;
+import java.util.concurrent.locks.ReentrantLock;
 
 /**
  Parses HTML or XML into a {@link org.jsoup.nodes.Document}. Generally, it is simpler to use one of the parse methods in
  {@link org.jsoup.Jsoup}.
- <p>Note that a Parser instance object is not threadsafe. To reuse a Parser configuration in a multi-threaded
- environment, use {@link #newInstance()} to make copies. */
+ <p>Note that a given Parser instance object is threadsafe, but not concurrent. (Concurrent parse calls will
+ synchronize.) To reuse a Parser configuration in a multithreaded environment, use {@link #newInstance()} to make
+ copies.</p>
+ */
 public class Parser implements Cloneable {
     public static final String NamespaceHtml = "http://www.w3.org/1999/xhtml";
     public static final String NamespaceXml = "http://www.w3.org/XML/1998/namespace";
@@ -25,7 +28,8 @@ public class Parser implements Cloneable {
     private ParseErrorList errors;
     private ParseSettings settings;
     private boolean trackPosition = false;
-    @Nullable TagSet tagSet;
+    private @Nullable TagSet tagSet;
+    private final ReentrantLock lock = new ReentrantLock();
 
     /**
      * Create a new Parser, using the specified TreeBuilder
@@ -63,15 +67,25 @@ public Document parseInput(String html, String baseUri) {
     }
 
     public Document parseInput(Reader inputHtml, String baseUri) {
-        return treeBuilder.parse(inputHtml, baseUri, this);
+        lock.lock(); // a lock vs synchronized to support loom threads; acquired before try so unlock only runs when held
+        try {
+            return treeBuilder.parse(inputHtml, baseUri, this);
+        } finally {
+            lock.unlock();
+        }
     }
 
     public List<Node> parseFragmentInput(String fragment, @Nullable Element context, String baseUri) {
         return parseFragmentInput(new StringReader(fragment), context, baseUri);
     }
 
     public List<Node> parseFragmentInput(Reader fragment, @Nullable Element context, String baseUri) {
-        return treeBuilder.parseFragment(fragment, context, baseUri, this);
+        lock.lock(); // acquired before try, so the finally block never unlocks a lock this thread does not hold
+        try {
+            return treeBuilder.parseFragment(fragment, context, baseUri, this);
+        } finally {
+            lock.unlock();
+        }
     }
 
     // gets & sets
@@ -87,8 +101,9 @@ public TreeBuilder getTreeBuilder() {
      * Update the TreeBuilder used when parsing content.
      * @param treeBuilder new TreeBuilder
      * @return this, for chaining
+     * @deprecated unused method, will be removed in 1.21.1
      */
-    public Parser setTreeBuilder(TreeBuilder treeBuilder) {
+    @Deprecated public Parser setTreeBuilder(TreeBuilder treeBuilder) {
         this.treeBuilder = treeBuilder;
         treeBuilder.parser = this;
         return this;
diff --git a/src/test/java/org/jsoup/nodes/DocumentTest.java b/src/test/java/org/jsoup/nodes/DocumentTest.java
@@ -112,7 +112,7 @@ public class DocumentTest {
         Document clone = doc.clone();
         assertNotSame(doc, clone);
         assertTrue(doc.hasSameValue(clone));
-        assertNotSame(doc.parser(), clone.parser());
+        assertSame(doc.parser(), clone.parser());
         assertNotSame(doc.outputSettings(), clone.outputSettings());
 
         assertEquals("<html><head><title>Hello</title></head><body><p>One</p><p>Two</p></body></html>", TextUtil.stripNewlines(clone.html()));
diff --git a/src/test/java/org/jsoup/nodes/ElementTest.java b/src/test/java/org/jsoup/nodes/ElementTest.java
@@ -1035,6 +1035,29 @@ public void testShallowClone() {
         assertEquals(base, d2.baseUri());
     }
 
+    @Test void cloneRetainsParser() {
+        Document htmlDoc = Jsoup.parse("<div><script></script></div>", Parser.htmlParser());
+        Document xmlDoc = Jsoup.parse("<div><script></script></div>", Parser.xmlParser());
+
+        Element hEl = htmlDoc.expectFirst("script");
+        Element hEl2 = hEl.clone();
+        assertNotSame(hEl, hEl2);
+        assertNotSame(hEl.ownerDocument(), hEl2.ownerDocument());
+        assertSame(hEl.ownerDocument().parser(), hEl2.ownerDocument().parser());
+
+        Document doc2 = htmlDoc.clone();
+        assertNotSame(htmlDoc, doc2);
+        assertSame(htmlDoc.parser(), doc2.parser());
+
+        hEl2.append("<foo></foo>"); // we are inside a script, should be parsed as data
+        assertEquals("<foo></foo>", hEl2.data());
+
+        Element xEl = xmlDoc.expectFirst("script");
+        Element xEl2 = xEl.clone();
+        xEl2.append("<foo></foo>"); // in XML, script doesn't mean anything, and so will be parsed as xml
+        assertEquals("<script><foo></foo></script>", xEl2.outerHtml());
+    }
+
     @Test
     public void testTagNameSet() {
         Document doc = Jsoup.parse("<div><i>Hello</i>");
diff --git a/src/test/java/org/jsoup/parser/ParserIT.java b/src/test/java/org/jsoup/parser/ParserIT.java
@@ -1,9 +1,13 @@
 package org.jsoup.parser;
 
 import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
+import java.util.ArrayList;
+import java.util.List;
+
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
@@ -51,4 +55,69 @@ public void handlesDeepStack() {
         assertTrue(System.currentTimeMillis() - start < 20000); // I get ~ 1.5 seconds, but others have reported slower
         // was originally much longer, or stack overflow.
     }
+
+    @Test void parserIsThreadSafe() throws InterruptedException {
+        // tests that a single parser can be called by multiple threads and won't blow up
+        // without the lock, will see many exceptions in parse, and non-equal docs
+        String html = "<div id=1><div id=2><div id=3>Text.</div></div></div>";
+        Parser parser = Parser.htmlParser();
+        Document expectDoc = parser.parseInput(html, "");
+
+        int numThreads = 10;
+        int numLoops = 20;
+        List<Thread> threads = new ArrayList<>(numThreads);
+        List<Document> toCheck = java.util.Collections.synchronizedList(new ArrayList<>(numThreads * numLoops)); // shared by all workers, so adds must be synchronized
+        for (int i = 0; i < numThreads; i++) {
+            Thread thread = new Thread(() -> {
+                for (int j = 0; j < numLoops; j++) {
+                    Document doc = parser.parseInput(html, "");
+                    toCheck.add(doc);
+                }
+            });
+            threads.add(thread);
+            thread.start();
+        }
+
+        for (Thread thread : threads) {
+            thread.join();
+        }
+        assertEquals(numThreads * numLoops, toCheck.size()); // a worker killed by an exception would otherwise go unnoticed
+        for (Document doc : toCheck) {
+            assertTrue(doc.hasSameValue(expectDoc));
+        }
+    }
+
+    @Test void parserIsThreadSafeWithCloneAndAppend() throws InterruptedException {
+        // tests that a single parser can be called by multiple threads via Element.clone().append()
+        String html = "<div id=1><div id=2><div id=3></div></div></div>";
+        String append = "<div id=4>Text.</div>";
+        Parser parser = Parser.htmlParser();
+        Document baseDoc = parser.parseInput(html, "");
+        Element baseElement = baseDoc.expectFirst("#3");
+
+        int numThreads = 10;
+        int numLoops = 20;
+        List<Thread> threads = new ArrayList<>(numThreads);
+        List<Element> toCheck = java.util.Collections.synchronizedList(new ArrayList<>(numThreads * numLoops)); // shared by all workers, so adds must be synchronized
+        for (int i = 0; i < numThreads; i++) {
+            Thread thread = new Thread(() -> {
+                for (int j = 0; j < numLoops; j++) {
+                    Element cloned = baseElement.clone();
+                    cloned.append(append); // invokes the parser internally - parseFragment
+                    toCheck.add(cloned);
+                }
+            });
+            threads.add(thread);
+            thread.start();
+        }
+
+        for (Thread thread : threads) {
+            thread.join();
+        }
+        assertEquals(numThreads * numLoops, toCheck.size()); // a worker killed by an exception would otherwise go unnoticed
+        baseElement.append(append);
+        for (Element element : toCheck) {
+            assertTrue(element.hasSameValue(baseElement));
+        }
+    }
 }