html5lib · gsnedders · Jun 23, 2020 · Jun 23, 2020 · jayaddison · Jan 9, 2021
diff --git a/html5lib/_utils.py b/html5lib/_utils.py
@@ -145,15 +145,3 @@ def moduleFactory(baseModule, *args, **kwargs):
             return mod
 
     return moduleFactory
-
-
-def memoize(func):
-    cache = {}
-
-    def wrapped(*args, **kwargs):
-        key = (tuple(args), tuple(kwargs.items()))
-        if key not in cache:
-            cache[key] = func(*args, **kwargs)
-        return cache[key]
-
-    return wrapped
diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
@@ -1,7 +1,5 @@
 from __future__ import absolute_import, division, unicode_literals
-from six import with_metaclass, viewkeys
-
-import types
+from six import viewkeys
 
 from . import _inputstream
 from . import _tokenizer
@@ -13,7 +11,7 @@
 from .constants import (
     spaceCharacters, asciiUpper2Lower,
     specialElements, headingElements, cdataElements, rcdataElements,
-    tokenTypes, tagTokenTypes,
+    tokenTypes,
     namespaces,
     htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
     adjustForeignAttributes as adjustForeignAttributesMap,
@@ -71,18 +69,6 @@ def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElemen
     return p.parseFragment(doc, container=container, **kwargs)
 
 
-def method_decorator_metaclass(function):
-    class Decorated(type):
-        def __new__(meta, classname, bases, classDict):
-            for attributeName, attribute in classDict.items():
-                if isinstance(attribute, types.FunctionType):
-                    attribute = function(attribute)
-
-                classDict[attributeName] = attribute
-            return type.__new__(meta, classname, bases, classDict)
-    return Decorated
-
-
 class HTMLParser(object):
     """HTML parser
 
@@ -112,14 +98,15 @@ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=Fa
 
         # Raise an exception on the first error encountered
         self.strict = strict
+        self.debug = debug
 
         if tree is None:
             tree = treebuilders.getTreeBuilder("etree")
         self.tree = tree(namespaceHTMLElements)
         self.errors = []
 
         self.phases = {name: cls(self, self.tree) for name, cls in
-                       getPhases(debug).items()}
+                       _phases.items()}
 
     def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
 
@@ -201,6 +188,9 @@ def mainLoop(self):
         DoctypeToken = tokenTypes["Doctype"]
         ParseErrorToken = tokenTypes["ParseError"]
 
+        type_names = {value: key for key, value in tokenTypes.items()}
+        debug = self.debug
+
         for token in self.tokenizer:
             prev_token = None
             new_token = token
@@ -232,6 +222,17 @@ def mainLoop(self):
                     else:
                         phase = self.phases["inForeignContent"]
 
+                    if debug:
+                        info = {"type": type_names[type]}
+                        if type in (StartTagToken, EndTagToken):
+                            info["name"] = new_token['name']
+
+                        self.log.append((self.tokenizer.state.__name__,
+                                         self.phase.__class__.__name__,
+                                         phase.__class__.__name__,
+                                         "process" + info["type"],
+                                         info))
+
                     if type == CharactersToken:
                         new_token = phase.processCharacters(new_token)
                     elif type == SpaceCharactersToken:
@@ -393,37 +394,7 @@ def parseRCDataRawtext(self, token, contentType):
         self.phase = self.phases["text"]
 
 
-@_utils.memoize
-def getPhases(debug):
-    def log(function):
-        """Logger that records which phase processes each token"""
-        type_names = {value: key for key, value in tokenTypes.items()}
-
-        def wrapped(self, *args, **kwargs):
-            if function.__name__.startswith("process") and len(args) > 0:
-                token = args[0]
-                info = {"type": type_names[token['type']]}
-                if token['type'] in tagTokenTypes:
-                    info["name"] = token['name']
-
-                self.parser.log.append((self.parser.tokenizer.state.__name__,
-                                        self.parser.phase.__class__.__name__,
-                                        self.__class__.__name__,
-                                        function.__name__,
-                                        info))
-                return function(self, *args, **kwargs)
-            else:
-                return function(self, *args, **kwargs)
-        return wrapped
-
-    def getMetaclass(use_metaclass, metaclass_func):
-        if use_metaclass:
-            return method_decorator_metaclass(metaclass_func)
-        else:
-            return type
-
-    # pylint:disable=unused-argument
-    class Phase(with_metaclass(getMetaclass(debug, log))):
+class Phase(object):
     """Base class for helper object that implements each phase of processing
     """
     __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")
@@ -495,6 +466,7 @@ def processEndTag(self, token):
                 self.__endTagCache.pop(next(iter(self.__endTagCache)))
         return func(token)
 
+
 class InitialPhase(Phase):
     __slots__ = tuple()
 
@@ -625,6 +597,7 @@ def processEOF(self):
         self.anythingElse()
         return True
 
+
 class BeforeHtmlPhase(Phase):
     __slots__ = tuple()
 
@@ -662,6 +635,7 @@ def processEndTag(self, token):
             self.insertHtmlElement()
             return token
 
+
 class BeforeHeadPhase(Phase):
     __slots__ = tuple()
 
@@ -707,6 +681,7 @@ def endTagOther(self, token):
     ])
     endTagHandler.default = endTagOther
 
+
 class InHeadPhase(Phase):
     __slots__ = tuple()
 
@@ -809,6 +784,7 @@ def anythingElse(self):
     ])
     endTagHandler.default = endTagOther
 
+
 class InHeadNoscriptPhase(Phase):
     __slots__ = tuple()
 
@@ -872,6 +848,7 @@ def anythingElse(self):
     ])
     endTagHandler.default = endTagOther
 
+
 class AfterHeadPhase(Phase):
     __slots__ = tuple()
 
@@ -938,6 +915,7 @@ def anythingElse(self):
                                               endTagHtmlBodyBr)])
     endTagHandler.default = endTagOther
 
+
 class InBodyPhase(Phase):
     # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
     # the really-really-really-very crazy mode
@@ -1662,6 +1640,7 @@ def endTagOther(self, token):
     ])
     endTagHandler.default = endTagOther
 
+
 class TextPhase(Phase):
     __slots__ = tuple()
 
@@ -1695,6 +1674,7 @@ def endTagOther(self, token):
         ("script", endTagScript)])
     endTagHandler.default = endTagOther
 
+
 class InTablePhase(Phase):
     # http://www.whatwg.org/specs/web-apps/current-work/#in-table
     __slots__ = tuple()
@@ -1840,6 +1820,7 @@ def endTagOther(self, token):
     ])
     endTagHandler.default = endTagOther
 
+
 class InTableTextPhase(Phase):
     __slots__ = ("originalPhase", "characterTokens")
 
@@ -1887,6 +1868,7 @@ def processEndTag(self, token):
         self.parser.phase = self.originalPhase
         return token
 
+
 class InCaptionPhase(Phase):
     # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
     __slots__ = tuple()
@@ -1957,6 +1939,7 @@ def endTagOther(self, token):
     ])
     endTagHandler.default = endTagOther
 
+
 class InColumnGroupPhase(Phase):
     # http://www.whatwg.org/specs/web-apps/current-work/#in-column
     __slots__ = tuple()
@@ -2021,6 +2004,7 @@ def endTagOther(self, token):
     ])
     endTagHandler.default = endTagOther
 
+
 class InTableBodyPhase(Phase):
     # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
     __slots__ = tuple()
@@ -2119,6 +2103,7 @@ def endTagOther(self, token):
     ])
     endTagHandler.default = endTagOther
 
+
 class InRowPhase(Phase):
     # http://www.whatwg.org/specs/web-apps/current-work/#in-row
     __slots__ = tuple()
@@ -2208,6 +2193,7 @@ def endTagOther(self, token):
     ])
     endTagHandler.default = endTagOther
 
+
 class InCellPhase(Phase):
     # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
     __slots__ = tuple()
@@ -2284,6 +2270,7 @@ def endTagOther(self, token):
     ])
     endTagHandler.default = endTagOther
 
+
 class InSelectPhase(Phase):
     __slots__ = tuple()
 
@@ -2383,6 +2370,7 @@ def endTagOther(self, token):
     ])
     endTagHandler.default = endTagOther
 
+
 class InSelectInTablePhase(Phase):
     __slots__ = tuple()
 
@@ -2421,6 +2409,7 @@ def endTagOther(self, token):
     ])
     endTagHandler.default = endTagOther
 
+
 class InForeignContentPhase(Phase):
     __slots__ = tuple()
 
@@ -2535,6 +2524,7 @@ def processEndTag(self, token):
                 break
         return new_token
 
+
 class AfterBodyPhase(Phase):
     __slots__ = tuple()
 
@@ -2581,6 +2571,7 @@ def endTagOther(self, token):
     endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)])
     endTagHandler.default = endTagOther
 
+
 class InFramesetPhase(Phase):
     # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
     __slots__ = tuple()
@@ -2637,6 +2628,7 @@ def endTagOther(self, token):
     ])
     endTagHandler.default = endTagOther
 
+
 class AfterFramesetPhase(Phase):
     # http://www.whatwg.org/specs/web-apps/current-work/#after3
     __slots__ = tuple()
@@ -2673,6 +2665,7 @@ def endTagOther(self, token):
     ])
     endTagHandler.default = endTagOther
 
+
 class AfterAfterBodyPhase(Phase):
     __slots__ = tuple()
 
@@ -2710,6 +2703,7 @@ def processEndTag(self, token):
     ])
     startTagHandler.default = startTagOther
 
+
 class AfterAfterFramesetPhase(Phase):
     __slots__ = tuple()
 
@@ -2747,7 +2741,8 @@ def processEndTag(self, token):
 
 # pylint:enable=unused-argument
 
-    return {
+
+_phases = {
     "initial": InitialPhase,
     "beforeHtml": BeforeHtmlPhase,
     "beforeHead": BeforeHeadPhase,

diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py
@@ -68,7 +68,6 @@ def test_debug_log():
                 ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                 ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
                 ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
-                ('dataState', 'InBodyPhase', 'InHeadPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
                 ('scriptDataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
                 ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'script', 'type': 'EndTag'}),
                 ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),