Skip to content

Use Python built-in str.lower in preference to asciiUpper2Lower character table translation #526

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Wants to merge 2 commits
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions html5lib/_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from .constants import spaceCharacters
from .constants import entities
from .constants import asciiLetters, asciiUpper2Lower
from .constants import asciiLetters
from .constants import digits, hexDigits, EOF
from .constants import tokenTypes, tagTokenTypes
from .constants import replacementCharacters
Expand Down Expand Up @@ -233,7 +233,7 @@ def emitCurrentToken(self):
token = self.currentToken
# Add token to the queue to be yielded
if (token["type"] in tagTokenTypes):
token["name"] = token["name"].translate(asciiUpper2Lower)
token["name"] = token["name"].lower()
if token["type"] == tokenTypes["StartTag"]:
raw = token["data"]
data = attributeMap(raw)
Expand Down Expand Up @@ -927,7 +927,7 @@ def attributeNameState(self):
# start tag token is emitted so values can still be safely appended
# to attributes, but we do want to report the parse error in time.
self.currentToken["data"][-1][0] = (
self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
self.currentToken["data"][-1][0].lower())
for name, _ in self.currentToken["data"][:-1]:
if self.currentToken["data"][-1][0] == name:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
Expand Down Expand Up @@ -1348,10 +1348,10 @@ def beforeDoctypeNameState(self):
def doctypeNameState(self):
data = self.stream.char()
if data in spaceCharacters:
self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
self.currentToken["name"] = self.currentToken["name"].lower()
self.state = self.afterDoctypeNameState
elif data == ">":
self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
self.currentToken["name"] = self.currentToken["name"].lower()
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
elif data == "\u0000":
Expand All @@ -1363,7 +1363,7 @@ def doctypeNameState(self):
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-doctype-name"})
self.currentToken["correct"] = False
self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
self.currentToken["name"] = self.currentToken["name"].lower()
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
else:
Expand Down
3 changes: 0 additions & 3 deletions html5lib/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,14 +538,11 @@
"tr"
])

asciiLowercase = frozenset(string.ascii_lowercase)
asciiUppercase = frozenset(string.ascii_uppercase)
asciiLetters = frozenset(string.ascii_letters)
digits = frozenset(string.digits)
hexDigits = frozenset(string.hexdigits)

asciiUpper2Lower = {ord(c): ord(c.lower()) for c in string.ascii_uppercase}

# Heading elements need to be ordered
headingElements = (
"h1",
Expand Down
15 changes: 7 additions & 8 deletions html5lib/html5parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from . import _utils
from .constants import (
spaceCharacters, asciiUpper2Lower,
spaceCharacters,
specialElements, headingElements, cdataElements, rcdataElements,
tokenTypes, tagTokenTypes,
namespaces,
Expand Down Expand Up @@ -183,8 +183,7 @@ def isHTMLIntegrationPoint(self, element):
if (element.name == "annotation-xml" and
element.namespace == namespaces["mathml"]):
return ("encoding" in element.attributes and
element.attributes["encoding"].translate(
asciiUpper2Lower) in
element.attributes["encoding"].lower() in
("text/html", "application/xhtml+xml"))
else:
return (element.namespace, element.name) in htmlIntegrationPointElements
Expand Down Expand Up @@ -520,7 +519,7 @@ def processDoctype(self, token):
self.tree.insertDoctype(token)

if publicId != "":
publicId = publicId.translate(asciiUpper2Lower)
publicId = publicId.lower()

if (not correct or token["name"] != "html" or
publicId.startswith(
Expand Down Expand Up @@ -1165,7 +1164,7 @@ def startTagInput(self, token):
framesetOK = self.parser.framesetOK
self.startTagVoidFormatting(token)
if ("type" in token["data"] and
token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
token["data"]["type"].lower() == "hidden"):
# input type=hidden doesn't change framesetOK
self.parser.framesetOK = framesetOK

Expand Down Expand Up @@ -1771,7 +1770,7 @@ def startTagStyleScript(self, token):

def startTagInput(self, token):
if ("type" in token["data"] and
token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
token["data"]["type"].lower() == "hidden"):
self.parser.parseError("unexpected-hidden-input-in-table")
self.tree.insertElement(token)
# XXX associate with form
Expand Down Expand Up @@ -2512,11 +2511,11 @@ def processStartTag(self, token):
def processEndTag(self, token):
nodeIndex = len(self.tree.openElements) - 1
node = self.tree.openElements[-1]
if node.name.translate(asciiUpper2Lower) != token["name"]:
if node.name.lower() != token["name"]:
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

while True:
if node.name.translate(asciiUpper2Lower) == token["name"]:
if node.name.lower() == token["name"]:
# XXX this isn't in the spec but it seems necessary
if self.parser.phase == self.parser.phases["inTableText"]:
self.parser.phase.flushCharacters()
Expand Down