Skip to content

Get rid of getPhases #272

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 0 additions & 12 deletions html5lib/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,15 +145,3 @@ def moduleFactory(baseModule, *args, **kwargs):
return mod

return moduleFactory


def memoize(func):
cache = {}

def wrapped(*args, **kwargs):
key = (tuple(args), tuple(kwargs.items()))
if key not in cache:
cache[key] = func(*args, **kwargs)
return cache[key]

return wrapped
93 changes: 44 additions & 49 deletions html5lib/html5parser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from __future__ import absolute_import, division, unicode_literals
from six import with_metaclass, viewkeys

import types
from six import viewkeys

from . import _inputstream
from . import _tokenizer
Expand All @@ -13,7 +11,7 @@
from .constants import (
spaceCharacters, asciiUpper2Lower,
specialElements, headingElements, cdataElements, rcdataElements,
tokenTypes, tagTokenTypes,
tokenTypes,
namespaces,
htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
adjustForeignAttributes as adjustForeignAttributesMap,
Expand Down Expand Up @@ -71,18 +69,6 @@ def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElemen
return p.parseFragment(doc, container=container, **kwargs)


def method_decorator_metaclass(function):
class Decorated(type):
def __new__(meta, classname, bases, classDict):
for attributeName, attribute in classDict.items():
if isinstance(attribute, types.FunctionType):
attribute = function(attribute)

classDict[attributeName] = attribute
return type.__new__(meta, classname, bases, classDict)
return Decorated


class HTMLParser(object):
"""HTML parser

Expand Down Expand Up @@ -112,14 +98,15 @@ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=Fa

# Raise an exception on the first error encountered
self.strict = strict
self.debug = debug

if tree is None:
tree = treebuilders.getTreeBuilder("etree")
self.tree = tree(namespaceHTMLElements)
self.errors = []

self.phases = {name: cls(self, self.tree) for name, cls in
getPhases(debug).items()}
_phases.items()}

def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):

Expand Down Expand Up @@ -201,6 +188,9 @@ def mainLoop(self):
DoctypeToken = tokenTypes["Doctype"]
ParseErrorToken = tokenTypes["ParseError"]

type_names = {value: key for key, value in tokenTypes.items()}
debug = self.debug
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gsnedders Is there a reason to bring tokenTypes and self.debug into local method variables here?

At L225 we could compare using if self.debug and similarly at L226 we could use info = {"type": tokenTypes[type]}.

(am guessing it's possibly an artifact of some IDE-based refactoring? Hopefully a straightforward cleanup either way)


for token in self.tokenizer:
prev_token = None
new_token = token
Expand Down Expand Up @@ -232,6 +222,17 @@ def mainLoop(self):
else:
phase = self.phases["inForeignContent"]

if debug:
info = {"type": type_names[type]}
if type in (StartTagToken, EndTagToken):
info["name"] = new_token['name']

self.log.append((self.tokenizer.state.__name__,
self.phase.__class__.__name__,
phase.__class__.__name__,
"process" + info["type"],
info))

if type == CharactersToken:
new_token = phase.processCharacters(new_token)
elif type == SpaceCharactersToken:
Expand Down Expand Up @@ -393,37 +394,7 @@ def parseRCDataRawtext(self, token, contentType):
self.phase = self.phases["text"]


@_utils.memoize
def getPhases(debug):
def log(function):
"""Logger that records which phase processes each token"""
type_names = {value: key for key, value in tokenTypes.items()}

def wrapped(self, *args, **kwargs):
if function.__name__.startswith("process") and len(args) > 0:
token = args[0]
info = {"type": type_names[token['type']]}
if token['type'] in tagTokenTypes:
info["name"] = token['name']

self.parser.log.append((self.parser.tokenizer.state.__name__,
self.parser.phase.__class__.__name__,
self.__class__.__name__,
function.__name__,
info))
return function(self, *args, **kwargs)
else:
return function(self, *args, **kwargs)
return wrapped

def getMetaclass(use_metaclass, metaclass_func):
if use_metaclass:
return method_decorator_metaclass(metaclass_func)
else:
return type
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice :) I get some minor worries about difficult-to-debug future issues when built-in Python keywords like type get used as variables / returned as values. Removing this is a nice improvement 👍


# pylint:disable=unused-argument
class Phase(with_metaclass(getMetaclass(debug, log))):
class Phase(object):
"""Base class for helper object that implements each phase of processing
"""
__slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")
Expand Down Expand Up @@ -495,6 +466,7 @@ def processEndTag(self, token):
self.__endTagCache.pop(next(iter(self.__endTagCache)))
return func(token)


class InitialPhase(Phase):
__slots__ = tuple()

Expand Down Expand Up @@ -625,6 +597,7 @@ def processEOF(self):
self.anythingElse()
return True


class BeforeHtmlPhase(Phase):
__slots__ = tuple()

Expand Down Expand Up @@ -662,6 +635,7 @@ def processEndTag(self, token):
self.insertHtmlElement()
return token


class BeforeHeadPhase(Phase):
__slots__ = tuple()

Expand Down Expand Up @@ -707,6 +681,7 @@ def endTagOther(self, token):
])
endTagHandler.default = endTagOther


class InHeadPhase(Phase):
__slots__ = tuple()

Expand Down Expand Up @@ -809,6 +784,7 @@ def anythingElse(self):
])
endTagHandler.default = endTagOther


class InHeadNoscriptPhase(Phase):
__slots__ = tuple()

Expand Down Expand Up @@ -872,6 +848,7 @@ def anythingElse(self):
])
endTagHandler.default = endTagOther


class AfterHeadPhase(Phase):
__slots__ = tuple()

Expand Down Expand Up @@ -938,6 +915,7 @@ def anythingElse(self):
endTagHtmlBodyBr)])
endTagHandler.default = endTagOther


class InBodyPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
# the really-really-really-very crazy mode
Expand Down Expand Up @@ -1662,6 +1640,7 @@ def endTagOther(self, token):
])
endTagHandler.default = endTagOther


class TextPhase(Phase):
__slots__ = tuple()

Expand Down Expand Up @@ -1695,6 +1674,7 @@ def endTagOther(self, token):
("script", endTagScript)])
endTagHandler.default = endTagOther


class InTablePhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
__slots__ = tuple()
Expand Down Expand Up @@ -1840,6 +1820,7 @@ def endTagOther(self, token):
])
endTagHandler.default = endTagOther


class InTableTextPhase(Phase):
__slots__ = ("originalPhase", "characterTokens")

Expand Down Expand Up @@ -1887,6 +1868,7 @@ def processEndTag(self, token):
self.parser.phase = self.originalPhase
return token


class InCaptionPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption
__slots__ = tuple()
Expand Down Expand Up @@ -1957,6 +1939,7 @@ def endTagOther(self, token):
])
endTagHandler.default = endTagOther


class InColumnGroupPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-column
__slots__ = tuple()
Expand Down Expand Up @@ -2021,6 +2004,7 @@ def endTagOther(self, token):
])
endTagHandler.default = endTagOther


class InTableBodyPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0
__slots__ = tuple()
Expand Down Expand Up @@ -2119,6 +2103,7 @@ def endTagOther(self, token):
])
endTagHandler.default = endTagOther


class InRowPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-row
__slots__ = tuple()
Expand Down Expand Up @@ -2208,6 +2193,7 @@ def endTagOther(self, token):
])
endTagHandler.default = endTagOther


class InCellPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell
__slots__ = tuple()
Expand Down Expand Up @@ -2284,6 +2270,7 @@ def endTagOther(self, token):
])
endTagHandler.default = endTagOther


class InSelectPhase(Phase):
__slots__ = tuple()

Expand Down Expand Up @@ -2383,6 +2370,7 @@ def endTagOther(self, token):
])
endTagHandler.default = endTagOther


class InSelectInTablePhase(Phase):
__slots__ = tuple()

Expand Down Expand Up @@ -2421,6 +2409,7 @@ def endTagOther(self, token):
])
endTagHandler.default = endTagOther


class InForeignContentPhase(Phase):
__slots__ = tuple()

Expand Down Expand Up @@ -2535,6 +2524,7 @@ def processEndTag(self, token):
break
return new_token


class AfterBodyPhase(Phase):
__slots__ = tuple()

Expand Down Expand Up @@ -2581,6 +2571,7 @@ def endTagOther(self, token):
endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)])
endTagHandler.default = endTagOther


class InFramesetPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
__slots__ = tuple()
Expand Down Expand Up @@ -2637,6 +2628,7 @@ def endTagOther(self, token):
])
endTagHandler.default = endTagOther


class AfterFramesetPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#after3
__slots__ = tuple()
Expand Down Expand Up @@ -2673,6 +2665,7 @@ def endTagOther(self, token):
])
endTagHandler.default = endTagOther


class AfterAfterBodyPhase(Phase):
__slots__ = tuple()

Expand Down Expand Up @@ -2710,6 +2703,7 @@ def processEndTag(self, token):
])
startTagHandler.default = startTagOther


class AfterAfterFramesetPhase(Phase):
__slots__ = tuple()

Expand Down Expand Up @@ -2747,7 +2741,8 @@ def processEndTag(self, token):

# pylint:enable=unused-argument

return {

_phases = {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The core of the improvement - looks good generally. What do you think about going one step further and moving this to the initialization of the HTMLParser's self.phases at L121?

"initial": InitialPhase,
"beforeHtml": BeforeHtmlPhase,
"beforeHead": BeforeHeadPhase,
Expand Down
1 change: 0 additions & 1 deletion html5lib/tests/test_parser2.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ def test_debug_log():
('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
('dataState', 'InBodyPhase', 'InHeadPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch! How did you discover this duplicate entry, out of interest?

('scriptDataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'script', 'type': 'EndTag'}),
('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
Expand Down