Skip to content

Commit ed8e017

Browse files
willkggsnedders
authored and committed
First pass at treebuilder docs (#378)
1 parent 6b13f55 commit ed8e017

File tree

3 files changed

+102
-57
lines changed

3 files changed

+102
-57
lines changed

html5lib/treebuilders/__init__.py

+51-39
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,32 @@
1-
"""A collection of modules for building different kinds of tree from
2-
HTML documents.
1+
"""A collection of modules for building different kinds of trees from HTML
2+
documents.
33
44
To create a treebuilder for a new type of tree, you need to
55
implement several things:
66
7-
1) A set of classes for various types of elements: Document, Doctype,
8-
Comment, Element. These must implement the interface of
9-
_base.treebuilders.Node (although comment nodes have a different
10-
signature for their constructor, see treebuilders.etree.Comment)
11-
Textual content may also be implemented as another node type, or not, as
12-
your tree implementation requires.
13-
14-
2) A treebuilder object (called TreeBuilder by convention) that
15-
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
16-
documentClass - the class to use for the bottommost node of a document
17-
elementClass - the class to use for HTML Elements
18-
commentClass - the class to use for comments
19-
doctypeClass - the class to use for doctypes
20-
It also has one required method:
21-
getDocument - Returns the root node of the complete document tree
22-
23-
3) If you wish to run the unit tests, you must also create a
24-
testSerializer method on your treebuilder which accepts a node and
25-
returns a string containing Node and its children serialized according
26-
to the format used in the unittests
7+
1. A set of classes for various types of elements: Document, Doctype, Comment,
8+
Element. These must implement the interface of ``treebuilders.base.Node``
9+
(although comment nodes have a different signature for their constructor,
10+
see ``treebuilders.etree.Comment``). Textual content may also be implemented
11+
as another node type, or not, as your tree implementation requires.
12+
13+
2. A treebuilder object (called ``TreeBuilder`` by convention) that inherits
14+
from ``treebuilders.base.TreeBuilder``. This has 4 required attributes:
15+
16+
* ``documentClass`` - the class to use for the bottommost node of a document
17+
* ``elementClass`` - the class to use for HTML Elements
18+
* ``commentClass`` - the class to use for comments
19+
* ``doctypeClass`` - the class to use for doctypes
20+
21+
It also has one required method:
22+
23+
* ``getDocument`` - Returns the root node of the complete document tree
24+
25+
3. If you wish to run the unit tests, you must also create a ``testSerializer``
26+
method on your treebuilder which accepts a node and returns a string
27+
containing Node and its children serialized according to the format used in
28+
the unittests
29+
2730
"""
2831

2932
from __future__ import absolute_import, division, unicode_literals
@@ -34,23 +37,32 @@
3437

3538

3639
def getTreeBuilder(treeType, implementation=None, **kwargs):
37-
"""Get a TreeBuilder class for various types of tree with built-in support
38-
39-
treeType - the name of the tree type required (case-insensitive). Supported
40-
values are:
41-
42-
"dom" - A generic builder for DOM implementations, defaulting to
43-
a xml.dom.minidom based implementation.
44-
"etree" - A generic builder for tree implementations exposing an
45-
ElementTree-like interface, defaulting to
46-
xml.etree.cElementTree if available and
47-
xml.etree.ElementTree if not.
48-
"lxml" - A etree-based builder for lxml.etree, handling
49-
limitations of lxml's implementation.
50-
51-
implementation - (Currently applies to the "etree" and "dom" tree types). A
52-
module implementing the tree type e.g.
53-
xml.etree.ElementTree or xml.etree.cElementTree."""
40+
"""Get a TreeBuilder class for various types of trees with built-in support
41+
42+
:arg treeType: the name of the tree type required (case-insensitive). Supported
43+
values are:
44+
45+
* "dom" - A generic builder for DOM implementations, defaulting to a
46+
xml.dom.minidom based implementation.
47+
* "etree" - A generic builder for tree implementations exposing an
48+
ElementTree-like interface, defaulting to xml.etree.cElementTree if
49+
available and xml.etree.ElementTree if not.
50+
* "lxml" - An etree-based builder for lxml.etree, handling limitations
51+
of lxml's implementation.
52+
53+
:arg implementation: (Currently applies to the "etree" and "dom" tree
54+
types). A module implementing the tree type e.g. xml.etree.ElementTree
55+
or xml.etree.cElementTree.
56+
57+
:arg kwargs: Any additional options to pass to the TreeBuilder when
58+
creating it.
59+
60+
Example:
61+
62+
>>> from html5lib.treebuilders import getTreeBuilder
63+
>>> builder = getTreeBuilder('etree')
64+
65+
"""
5466

5567
treeType = treeType.lower()
5668
if treeType not in treeBuilderCache:

html5lib/treebuilders/base.py

+51-17
Original file line numberDiff line numberDiff line change
@@ -21,22 +21,25 @@
2121

2222

2323
class Node(object):
24+
"""Represents an item in the tree"""
2425
def __init__(self, name):
25-
"""Node representing an item in the tree.
26-
name - The tag name associated with the node
27-
parent - The parent of the current node (or None for the document node)
28-
value - The value of the current node (applies to text nodes and
29-
comments
30-
attributes - a dict holding name, value pairs for attributes of the node
31-
childNodes - a list of child nodes of the current node. This must
32-
include all elements but not necessarily other node types
33-
_flags - A list of miscellaneous flags that can be set on the node
26+
"""Creates a Node
27+
28+
:arg name: The tag name associated with the node
29+
3430
"""
31+
# The tag name associated with the node
3532
self.name = name
33+
# The parent of the current node (or None for the document node)
3634
self.parent = None
35+
# The value of the current node (applies to text nodes and comments)
3736
self.value = None
37+
# A dict holding name -> value pairs for attributes of the node
3838
self.attributes = {}
39+
# A list of child nodes of the current node. This must include all
40+
# elements but not necessarily other node types.
3941
self.childNodes = []
42+
# A list of miscellaneous flags that can be set on the node.
4043
self._flags = []
4144

4245
def __str__(self):
@@ -53,30 +56,51 @@ def __repr__(self):
5356

5457
def appendChild(self, node):
5558
"""Insert node as a child of the current node
59+
60+
:arg node: the node to insert
61+
5662
"""
5763
raise NotImplementedError
5864

5965
def insertText(self, data, insertBefore=None):
6066
"""Insert data as text in the current node, positioned before the
6167
start of node insertBefore or to the end of the node's text.
68+
69+
:arg data: the data to insert
70+
71+
:arg insertBefore: the node to insert the text before, or None (the
72+
default) to append the text to the end of the node's text
73+
6274
"""
6375
raise NotImplementedError
6476

6577
def insertBefore(self, node, refNode):
6678
"""Insert node as a child of the current node, before refNode in the
6779
list of child nodes. Raises ValueError if refNode is not a child of
68-
the current node"""
80+
the current node
81+
82+
:arg node: the node to insert
83+
84+
:arg refNode: the child node to insert the node before
85+
86+
"""
6987
raise NotImplementedError
7088

7189
def removeChild(self, node):
7290
"""Remove node from the children of the current node
91+
92+
:arg node: the child node to remove
93+
7394
"""
7495
raise NotImplementedError
7596

7697
def reparentChildren(self, newParent):
7798
"""Move all the children of the current node to newParent.
7899
This is needed so that trees that don't store text as nodes move the
79100
text in the correct way
101+
102+
:arg newParent: the node to move all this node's children to
103+
80104
"""
81105
# XXX - should this method be made more general?
82106
for child in self.childNodes:
@@ -121,10 +145,12 @@ def nodesEqual(self, node1, node2):
121145

122146
class TreeBuilder(object):
123147
"""Base treebuilder implementation
124-
documentClass - the class to use for the bottommost node of a document
125-
elementClass - the class to use for HTML Elements
126-
commentClass - the class to use for comments
127-
doctypeClass - the class to use for doctypes
148+
149+
* documentClass - the class to use for the bottommost node of a document
150+
* elementClass - the class to use for HTML Elements
151+
* commentClass - the class to use for comments
152+
* doctypeClass - the class to use for doctypes
153+
128154
"""
129155
# pylint:disable=not-callable
130156

@@ -144,6 +170,11 @@ class TreeBuilder(object):
144170
fragmentClass = None
145171

146172
def __init__(self, namespaceHTMLElements):
173+
"""Create a TreeBuilder
174+
175+
:arg namespaceHTMLElements: whether or not to namespace HTML elements
176+
177+
"""
147178
if namespaceHTMLElements:
148179
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
149180
else:
@@ -367,17 +398,20 @@ def generateImpliedEndTags(self, exclude=None):
367398
self.generateImpliedEndTags(exclude)
368399

369400
def getDocument(self):
370-
"Return the final tree"
401+
"""Return the final tree"""
371402
return self.document
372403

373404
def getFragment(self):
374-
"Return the final fragment"
405+
"""Return the final fragment"""
375406
# assert self.innerHTML
376407
fragment = self.fragmentClass()
377408
self.openElements[0].reparentChildren(fragment)
378409
return fragment
379410

380411
def testSerializer(self, node):
381412
"""Serialize the subtree of node in the format required by unit tests
382-
node - the node from which to start serializing"""
413+
414+
:arg node: the node from which to start serializing
415+
416+
"""
383417
raise NotImplementedError

html5lib/treebuilders/etree_lxml.py

-1
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,6 @@ def insertCommentMain(self, data, parent=None):
309309
super(TreeBuilder, self).insertComment(data, parent)
310310

311311
def insertRoot(self, token):
312-
"""Create the document root"""
313312
# Because of the way libxml2 works, it doesn't seem to be possible to
314313
# alter information like the doctype after the tree has been parsed.
315314
# Therefore we need to use the built-in parser to create our initial

0 commit comments

Comments
 (0)