@@ -68,10 +68,33 @@ def htmlentityreplace_errors(exc):
68
68
else :
69
69
return xmlcharrefreplace_errors (exc )
70
70
71
+
71
72
register_error ("htmlentityreplace" , htmlentityreplace_errors )
72
73
73
74
74
75
def serialize (input , tree = "etree" , encoding = None , ** serializer_opts ):
76
+ """Serializes the input token stream using the specified treewalker
77
+
78
+ :arg input: the token stream to serialize
79
+
80
+ :arg tree: the treewalker to use
81
+
82
+ :arg encoding: the encoding to use
83
+
84
+ :arg serializer_opts: any options to pass to the
85
+ :py:class:`html5lib.serializer.HTMLSerializer` that gets created
86
+
87
+ :returns: the tree serialized as a string
88
+
89
+ Example:
90
+
91
+ >>> from html5lib.html5parser import parse
92
+ >>> from html5lib.serializer import serialize
93
+ >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
94
+ >>> serialize(token_stream, omit_optional_tags=False)
95
+ '<html><head></head><body><p>Hi!</p></body></html>'
96
+
97
+ """
75
98
# XXX: Should we cache this?
76
99
walker = treewalkers .getTreeWalker (tree )
77
100
s = HTMLSerializer (** serializer_opts )
@@ -110,50 +133,83 @@ class HTMLSerializer(object):
110
133
"strip_whitespace" , "sanitize" )
111
134
112
135
def __init__ (self , ** kwargs ):
113
- """Initialize HTMLSerializer.
114
-
115
- Keyword options (default given first unless specified) include:
116
-
117
- inject_meta_charset=True|False
118
- Whether it insert a meta element to define the character set of the
119
- document.
120
- quote_attr_values="legacy"|"spec"|"always"
121
- Whether to quote attribute values that don't require quoting
122
- per legacy browser behaviour, when required by the standard, or always.
123
- quote_char=u'"'|u"'"
124
- Use given quote character for attribute quoting. Default is to
125
- use double quote unless attribute value contains a double quote,
126
- in which case single quotes are used instead.
127
- escape_lt_in_attrs=False|True
128
- Whether to escape < in attribute values.
129
- escape_rcdata=False|True
130
- Whether to escape characters that need to be escaped within normal
131
- elements within rcdata elements such as style.
132
- resolve_entities=True|False
133
- Whether to resolve named character entities that appear in the
134
- source tree. The XML predefined entities < > & " '
135
- are unaffected by this setting.
136
- strip_whitespace=False|True
137
- Whether to remove semantically meaningless whitespace. (This
138
- compresses all whitespace to a single space except within pre.)
139
- minimize_boolean_attributes=True|False
140
- Shortens boolean attributes to give just the attribute value,
141
- for example <input disabled="disabled"> becomes <input disabled>.
142
- use_trailing_solidus=False|True
143
- Includes a close-tag slash at the end of the start tag of void
144
- elements (empty elements whose end tag is forbidden). E.g. <hr/>.
145
- space_before_trailing_solidus=True|False
146
- Places a space immediately before the closing slash in a tag
147
- using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
148
- sanitize=False|True
149
- Strip all unsafe or unknown constructs from output.
150
- See `html5lib user documentation`_
151
- omit_optional_tags=True|False
152
- Omit start/end tags that are optional.
153
- alphabetical_attributes=False|True
154
- Reorder attributes to be in alphabetical order.
155
-
156
- .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
136
+ """Initialize HTMLSerializer
137
+
138
+ :arg inject_meta_charset: Whether or not to inject the meta charset.
139
+
140
+ Defaults to ``True``.
141
+
142
+ :arg quote_attr_values: Whether to quote attribute values that don't
143
+ require quoting per legacy browser behavior (``"legacy"``), when
144
+ required by the standard (``"spec"``), or always (``"always"``).
145
+
146
+ Defaults to ``"legacy"``.
147
+
148
+ :arg quote_char: Use given quote character for attribute quoting.
149
+
150
+ Defaults to ``"`` which will use double quotes unless attribute
151
+ value contains a double quote, in which case single quotes are
152
+ used.
153
+
154
+ :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
155
+ values.
156
+
157
+ Defaults to ``False``.
158
+
159
+ :arg escape_rcdata: Whether to escape characters that need to be
160
+ escaped within normal elements within rcdata elements such as
161
+ style.
162
+
163
+ Defaults to ``False``.
164
+
165
+ :arg resolve_entities: Whether to resolve named character entities that
166
+ appear in the source tree. The XML predefined entities &lt; &gt;
167
+ &amp; &quot; &apos; are unaffected by this setting.
168
+
169
+ Defaults to ``True``.
170
+
171
+ :arg strip_whitespace: Whether to remove semantically meaningless
172
+ whitespace. (This compresses all whitespace to a single space
173
+ except within ``pre``.)
174
+
175
+ Defaults to ``False``.
176
+
177
+ :arg minimize_boolean_attributes: Shortens boolean attributes to give
178
+ just the attribute value, for example::
179
+
180
+ <input disabled="disabled">
181
+
182
+ becomes::
183
+
184
+ <input disabled>
185
+
186
+ Defaults to ``True``.
187
+
188
+ :arg use_trailing_solidus: Includes a close-tag slash at the end of the
189
+ start tag of void elements (empty elements whose end tag is
190
+ forbidden). E.g. ``<hr/>``.
191
+
192
+ Defaults to ``False``.
193
+
194
+ :arg space_before_trailing_solidus: Places a space immediately before
195
+ the closing slash in a tag using a trailing solidus. E.g.
196
+ ``<hr />``. Requires ``use_trailing_solidus=True``.
197
+
198
+ Defaults to ``True``.
199
+
200
+ :arg sanitize: Strip all unsafe or unknown constructs from output.
201
+ See :py:class:`html5lib.filters.sanitizer.Filter`.
202
+
203
+ Defaults to ``False``.
204
+
205
+ :arg omit_optional_tags: Omit start/end tags that are optional.
206
+
207
+ Defaults to ``True``.
208
+
209
+ :arg alphabetical_attributes: Reorder attributes to be in alphabetical order.
210
+
211
+ Defaults to ``False``.
212
+
157
213
"""
158
214
unexpected_args = frozenset (kwargs ) - frozenset (self .options )
159
215
if len (unexpected_args ) > 0 :
@@ -317,6 +373,25 @@ def serialize(self, treewalker, encoding=None):
317
373
self .serializeError (token ["data" ])
318
374
319
375
def render (self , treewalker , encoding = None ):
376
+ """Serializes the stream from the treewalker into a string
377
+
378
+ :arg treewalker: the treewalker to serialize
379
+
380
+ :arg encoding: the string encoding to use
381
+
382
+ :returns: the serialized tree
383
+
384
+ Example:
385
+
386
+ >>> from html5lib import parse, getTreeWalker
387
+ >>> from html5lib.serializer import HTMLSerializer
388
+ >>> token_stream = parse('<html><body>Hi!</body></html>')
389
+ >>> walker = getTreeWalker('etree')
390
+ >>> serializer = HTMLSerializer(omit_optional_tags=False)
391
+ >>> serializer.render(walker(token_stream))
392
+ '<html><head></head><body>Hi!</body></html>'
393
+
394
+ """
320
395
if encoding :
321
396
return b"" .join (list (self .serialize (treewalker , encoding )))
322
397
else :
0 commit comments