Skip to content

Commit 805f272

Browse files
committed
Document html5parser module
1 parent dc9443d commit 805f272

File tree

1 file changed

+94
-29
lines changed

1 file changed

+94
-29
lines changed

html5lib/html5parser.py

+94-29
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,48 @@
2525

2626

2727
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
28-
"""Parse a string or file-like object into a tree"""
28+
"""Parse an HTML document as a string or file-like object into a tree
29+
30+
:arg doc: the document to parse as a string or file-like object
31+
32+
:arg treebuilder: the treebuilder to use when parsing
33+
34+
:arg namespaceHTMLElements: whether or not to namespace HTML elements
35+
36+
:returns: parsed tree
37+
38+
Example:
39+
40+
>>> from html5lib.html5parser import parse
41+
>>> parse('<html><body><p>This is a doc</p></body></html>')
42+
<Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
43+
44+
"""
2945
tb = treebuilders.getTreeBuilder(treebuilder)
3046
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
3147
return p.parse(doc, **kwargs)
3248

3349

3450
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
51+
"""Parse an HTML fragment as a string or file-like object into a tree
52+
53+
:arg doc: the fragment to parse as a string or file-like object
54+
55+
:arg container: the container context to parse the fragment in
56+
57+
:arg treebuilder: the treebuilder to use when parsing
58+
59+
:arg namespaceHTMLElements: whether or not to namespace HTML elements
60+
61+
:returns: parsed tree
62+
63+
Example:
64+
65+
>>> from html5lib.html5parser import parseFragment
66+
>>> parseFragment('<b>this is a fragment</b>')
67+
<Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
68+
69+
"""
3570
tb = treebuilders.getTreeBuilder(treebuilder)
3671
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
3772
return p.parseFragment(doc, container=container, **kwargs)
@@ -50,16 +85,30 @@ def __new__(meta, classname, bases, classDict):
5085

5186

5287
class HTMLParser(object):
53-
"""HTML parser. Generates a tree structure from a stream of (possibly
54-
malformed) HTML"""
88+
"""HTML parser
89+
90+
Generates a tree structure from a stream of (possibly malformed) HTML.
91+
92+
"""
5593

5694
def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
5795
"""
58-
strict - raise an exception when a parse error is encountered
96+
:arg tree: a treebuilder class controlling the type of tree that will be
97+
returned. Built in treebuilders can be accessed through
98+
html5lib.treebuilders.getTreeBuilder(treeType)
99+
100+
:arg strict: raise an exception when a parse error is encountered
101+
102+
:arg namespaceHTMLElements: whether or not to namespace HTML elements
103+
104+
:arg debug: whether or not to enable debug mode which logs things
105+
106+
Example:
107+
108+
>>> from html5lib.html5parser import HTMLParser
109+
>>> parser = HTMLParser() # generates parser with etree builder
110+
>>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict
59111
60-
tree - a treebuilder class controlling the type of tree that will be
61-
returned. Built in treebuilders can be accessed through
62-
html5lib.treebuilders.getTreeBuilder(treeType)
63112
"""
64113

65114
# Raise an exception on the first error encountered
@@ -123,9 +172,8 @@ def reset(self):
123172

124173
@property
125174
def documentEncoding(self):
126-
"""The name of the character encoding
127-
that was used to decode the input stream,
128-
or :obj:`None` if that is not determined yet.
175+
"""Name of the character encoding that was used to decode the input stream, or
176+
:obj:`None` if that is not determined yet
129177
130178
"""
131179
if not hasattr(self, 'tokenizer'):
@@ -219,32 +267,52 @@ def normalizedTokens(self):
219267
def parse(self, stream, *args, **kwargs):
220268
"""Parse an HTML document into a well-formed tree
221269
222-
stream - a filelike object or string containing the HTML to be parsed
270+
:arg stream: a file-like object or string containing the HTML to be parsed
271+
272+
The optional encoding parameter must be a string that indicates
273+
the encoding. If specified, that encoding will be used,
274+
regardless of any BOM or later declaration (such as in a meta
275+
element).
276+
277+
:arg scripting: treat noscript elements as if JavaScript was turned on
223278
224-
The optional encoding parameter must be a string that indicates
225-
the encoding. If specified, that encoding will be used,
226-
regardless of any BOM or later declaration (such as in a meta
227-
element)
279+
:returns: parsed tree
280+
281+
Example:
282+
283+
>>> from html5lib.html5parser import HTMLParser
284+
>>> parser = HTMLParser()
285+
>>> parser.parse('<html><body><p>This is a doc</p></body></html>')
286+
<Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
228287
229-
scripting - treat noscript elements as if javascript was turned on
230288
"""
231289
self._parse(stream, False, None, *args, **kwargs)
232290
return self.tree.getDocument()
233291

234292
def parseFragment(self, stream, *args, **kwargs):
235293
"""Parse an HTML fragment into a well-formed tree fragment
236294
237-
container - name of the element we're setting the innerHTML property
238-
if set to None, default to 'div'
295+
:arg container: name of the element we're setting the innerHTML
296+
property on; if set to None, defaults to 'div'
297+
298+
:arg stream: a file-like object or string containing the HTML to be parsed
299+
300+
The optional encoding parameter must be a string that indicates
301+
the encoding. If specified, that encoding will be used,
302+
regardless of any BOM or later declaration (such as in a meta
303+
element).
239304
240-
stream - a filelike object or string containing the HTML to be parsed
305+
:arg scripting: treat noscript elements as if JavaScript was turned on
241306
242-
The optional encoding parameter must be a string that indicates
243-
the encoding. If specified, that encoding will be used,
244-
regardless of any BOM or later declaration (such as in a meta
245-
element)
307+
:returns: parsed tree
308+
309+
Example:
310+
311+
>>> from html5lib.html5parser import HTMLParser
312+
>>> parser = HTMLParser()
313+
>>> parser.parseFragment('<b>this is a fragment</b>')
314+
<Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
246315
247-
scripting - treat noscript elements as if javascript was turned on
248316
"""
249317
self._parse(stream, True, *args, **kwargs)
250318
return self.tree.getFragment()
@@ -258,8 +326,7 @@ def parseError(self, errorcode="XXX-undefined-error", datavars=None):
258326
raise ParseError(E[errorcode] % datavars)
259327

260328
def normalizeToken(self, token):
261-
""" HTML5 specific normalizations to the token stream """
262-
329+
# HTML5 specific normalizations to the token stream
263330
if token["type"] == tokenTypes["StartTag"]:
264331
raw = token["data"]
265332
token["data"] = OrderedDict(raw)
@@ -327,9 +394,7 @@ def resetInsertionMode(self):
327394
self.phase = new_phase
328395

329396
def parseRCDataRawtext(self, token, contentType):
330-
"""Generic RCDATA/RAWTEXT Parsing algorithm
331-
contentType - RCDATA or RAWTEXT
332-
"""
397+
# Generic RCDATA/RAWTEXT Parsing algorithm
333398
assert contentType in ("RAWTEXT", "RCDATA")
334399

335400
self.tree.insertElement(token)

0 commit comments

Comments
 (0)