25
25
26
26
27
27
def parse (doc , treebuilder = "etree" , namespaceHTMLElements = True , ** kwargs ):
28
- """Parse a string or file-like object into a tree"""
28
+ """Parse an HTML document as a string or file-like object into a tree
29
+
30
+ :arg doc: the document to parse as a string or file-like object
31
+
32
+ :arg treebuilder: the treebuilder to use when parsing
33
+
34
+ :arg namespaceHTMLElements: whether or not to namespace HTML elements
35
+
36
+ :returns: parsed tree
37
+
38
+ Example:
39
+
40
+ >>> from html5lib.html5parser import parse
41
+ >>> parse('<html><body><p>This is a doc</p></body></html>')
42
+ <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
43
+
44
+ """
29
45
tb = treebuilders .getTreeBuilder (treebuilder )
30
46
p = HTMLParser (tb , namespaceHTMLElements = namespaceHTMLElements )
31
47
return p .parse (doc , ** kwargs )
32
48
33
49
34
50
def parseFragment (doc , container = "div" , treebuilder = "etree" , namespaceHTMLElements = True , ** kwargs ):
51
+ """Parse an HTML fragment as a string or file-like object into a tree
52
+
53
+ :arg doc: the fragment to parse as a string or file-like object
54
+
55
+ :arg container: the container context to parse the fragment in
56
+
57
+ :arg treebuilder: the treebuilder to use when parsing
58
+
59
+ :arg namespaceHTMLElements: whether or not to namespace HTML elements
60
+
61
+ :returns: parsed tree
62
+
63
+ Example:
64
+
65
+ >>> from html5lib.html5parser import parseFragment
66
+ >>> parseFragment('<b>this is a fragment</b>')
67
+ <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
68
+
69
+ """
35
70
tb = treebuilders .getTreeBuilder (treebuilder )
36
71
p = HTMLParser (tb , namespaceHTMLElements = namespaceHTMLElements )
37
72
return p .parseFragment (doc , container = container , ** kwargs )
@@ -50,16 +85,30 @@ def __new__(meta, classname, bases, classDict):
50
85
51
86
52
87
class HTMLParser (object ):
53
- """HTML parser. Generates a tree structure from a stream of (possibly
54
- malformed) HTML"""
88
+ """HTML parser
89
+
90
+ Generates a tree structure from a stream of (possibly malformed) HTML.
91
+
92
+ """
55
93
56
94
def __init__ (self , tree = None , strict = False , namespaceHTMLElements = True , debug = False ):
57
95
"""
58
- strict - raise an exception when a parse error is encountered
96
+ :arg tree: a treebuilder class controlling the type of tree that will be
97
+ returned. Built in treebuilders can be accessed through
98
+ html5lib.treebuilders.getTreeBuilder(treeType)
99
+
100
+ :arg strict: raise an exception when a parse error is encountered
101
+
102
+ :arg namespaceHTMLElements: whether or not to namespace HTML elements
103
+
104
+ :arg debug: whether or not to enable debug mode which logs things
105
+
106
+ Example:
107
+
108
+ >>> from html5lib.html5parser import HTMLParser
109
+ >>> parser = HTMLParser() # generates parser with etree builder
110
+ >>> from html5lib.treebuilders import getTreeBuilder
+ >>> parser = HTMLParser(getTreeBuilder('lxml'), strict=True) # generates parser with lxml builder which is strict
59
111
60
- tree - a treebuilder class controlling the type of tree that will be
61
- returned. Built in treebuilders can be accessed through
62
- html5lib.treebuilders.getTreeBuilder(treeType)
63
112
"""
64
113
65
114
# Raise an exception on the first error encountered
@@ -123,9 +172,8 @@ def reset(self):
123
172
124
173
@property
125
174
def documentEncoding (self ):
126
- """The name of the character encoding
127
- that was used to decode the input stream,
128
- or :obj:`None` if that is not determined yet.
175
+ """Name of the character encoding that was used to decode the input stream, or
176
+ :obj:`None` if that is not determined yet
129
177
130
178
"""
131
179
if not hasattr (self , 'tokenizer' ):
@@ -219,32 +267,52 @@ def normalizedTokens(self):
219
267
def parse (self , stream , * args , ** kwargs ):
220
268
"""Parse an HTML document into a well-formed tree
221
269
222
- stream - a filelike object or string containing the HTML to be parsed
270
+ :arg stream: a file-like object or string containing the HTML to be parsed
271
+
272
+ The optional encoding parameter must be a string that indicates
273
+ the encoding. If specified, that encoding will be used,
274
+ regardless of any BOM or later declaration (such as in a meta
275
+ element).
276
+
277
+ :arg scripting: treat noscript elements as if JavaScript was turned on
223
278
224
- The optional encoding parameter must be a string that indicates
225
- the encoding. If specified, that encoding will be used,
226
- regardless of any BOM or later declaration (such as in a meta
227
- element)
279
+ :returns: parsed tree
280
+
281
+ Example:
282
+
283
+ >>> from html5lib.html5parser import HTMLParser
284
+ >>> parser = HTMLParser()
285
+ >>> parser.parse('<html><body><p>This is a doc</p></body></html>')
286
+ <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
228
287
229
- scripting - treat noscript elements as if javascript was turned on
230
288
"""
231
289
self ._parse (stream , False , None , * args , ** kwargs )
232
290
return self .tree .getDocument ()
233
291
234
292
def parseFragment (self , stream , * args , ** kwargs ):
235
293
"""Parse an HTML fragment into a well-formed tree fragment
236
294
237
- container - name of the element we're setting the innerHTML property
238
- if set to None, default to 'div'
295
+ :arg container: name of the element we're setting the innerHTML
296
+ property on; if set to None, defaults to 'div'
297
+
298
+ :arg stream: a file-like object or string containing the HTML to be parsed
299
+
300
+ The optional encoding parameter must be a string that indicates
301
+ the encoding. If specified, that encoding will be used,
302
+ regardless of any BOM or later declaration (such as in a meta
303
+ element).
239
304
240
- stream - a filelike object or string containing the HTML to be parsed
305
+ :arg scripting: treat noscript elements as if JavaScript was turned on
241
306
242
- The optional encoding parameter must be a string that indicates
243
- the encoding. If specified, that encoding will be used,
244
- regardless of any BOM or later declaration (such as in a meta
245
- element)
307
+ :returns: parsed tree
308
+
309
+ Example:
310
+
311
+ >>> from html5lib.html5parser import HTMLParser
312
+ >>> parser = HTMLParser()
313
+ >>> parser.parseFragment('<b>this is a fragment</b>')
314
+ <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
246
315
247
- scripting - treat noscript elements as if javascript was turned on
248
316
"""
249
317
self ._parse (stream , True , * args , ** kwargs )
250
318
return self .tree .getFragment ()
@@ -258,8 +326,7 @@ def parseError(self, errorcode="XXX-undefined-error", datavars=None):
258
326
raise ParseError (E [errorcode ] % datavars )
259
327
260
328
def normalizeToken (self , token ):
261
- """ HTML5 specific normalizations to the token stream """
262
-
329
+ # HTML5 specific normalizations to the token stream
263
330
if token ["type" ] == tokenTypes ["StartTag" ]:
264
331
raw = token ["data" ]
265
332
token ["data" ] = OrderedDict (raw )
@@ -327,9 +394,7 @@ def resetInsertionMode(self):
327
394
self .phase = new_phase
328
395
329
396
def parseRCDataRawtext (self , token , contentType ):
330
- """Generic RCDATA/RAWTEXT Parsing algorithm
331
- contentType - RCDATA or RAWTEXT
332
- """
397
+ # Generic RCDATA/RAWTEXT Parsing algorithm
333
398
assert contentType in ("RAWTEXT" , "RCDATA" )
334
399
335
400
self .tree .insertElement (token )
0 commit comments