-
Notifications
You must be signed in to change notification settings - Fork 294
Tokenizer: pretranslate lowercase element and attribute names #520
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
183d8a0
2e86373
8f96b17
a912842
f9f370e
fa62671
df94e2d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -233,7 +233,6 @@ def emitCurrentToken(self): | |
token = self.currentToken | ||
# Add token to the queue to be yielded | ||
if (token["type"] in tagTokenTypes): | ||
token["name"] = token["name"].translate(asciiUpper2Lower) | ||
if token["type"] == tokenTypes["StartTag"]: | ||
raw = token["data"] | ||
data = attributeMap(raw) | ||
|
@@ -380,7 +379,8 @@ def tagOpenState(self): | |
self.state = self.closeTagOpenState | ||
elif data in asciiLetters: | ||
self.currentToken = {"type": tokenTypes["StartTag"], | ||
"name": data, "data": [], | ||
"name": data.translate(asciiUpper2Lower), | ||
"data": [], | ||
"selfClosing": False, | ||
"selfClosingAcknowledged": False} | ||
self.state = self.tagNameState | ||
|
@@ -410,7 +410,8 @@ def tagOpenState(self): | |
def closeTagOpenState(self): | ||
data = self.stream.char() | ||
if data in asciiLetters: | ||
self.currentToken = {"type": tokenTypes["EndTag"], "name": data, | ||
self.currentToken = {"type": tokenTypes["EndTag"], | ||
"name": data.translate(asciiUpper2Lower), | ||
"data": [], "selfClosing": False} | ||
self.state = self.tagNameState | ||
elif data == ">": | ||
|
@@ -448,7 +449,7 @@ def tagNameState(self): | |
"data": "invalid-codepoint"}) | ||
self.currentToken["name"] += "\uFFFD" | ||
else: | ||
self.currentToken["name"] += data | ||
self.currentToken["name"] += data.translate(asciiUpper2Lower) | ||
# (Don't use charsUntil here, because tag names are | ||
# very short and it's faster to not do anything fancy) | ||
return True | ||
|
@@ -476,21 +477,22 @@ def rcdataEndTagOpenState(self): | |
return True | ||
|
||
def rcdataEndTagNameState(self): | ||
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() | ||
name = self.temporaryBuffer.translate(asciiUpper2Lower) | ||
appropriate = self.currentToken and self.currentToken["name"] == name | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. FWIW: my thinking from my previous comment was to instead change this state (and similar ones) to something like the following:
etc. At the absolute least, I think we should avoid computing `name` unnecessarily. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ok, yep - I didn't really register that after your first comment; I'll take a look at re-ordering these conditionals soon. Thanks for detailing that a bit further. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That's applied now; I didn't find a noticeable performance difference as a result (CPython 3.9.1), but it may be logically clearer. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I feel like the modifications in this PR are getting blended together slightly confusingly, so I'll do a more thorough analysis soon to pick apart the individual suggestions, analyze performance for them individually, and then keep the ones that still seem useful and show performance benefits (or seem worthwhile enough to include anyway).
||
data = self.stream.char() | ||
if data in spaceCharacters and appropriate: | ||
self.currentToken = {"type": tokenTypes["EndTag"], | ||
"name": self.temporaryBuffer, | ||
"name": name, | ||
"data": [], "selfClosing": False} | ||
self.state = self.beforeAttributeNameState | ||
elif data == "/" and appropriate: | ||
self.currentToken = {"type": tokenTypes["EndTag"], | ||
"name": self.temporaryBuffer, | ||
"name": name, | ||
"data": [], "selfClosing": False} | ||
self.state = self.selfClosingStartTagState | ||
elif data == ">" and appropriate: | ||
self.currentToken = {"type": tokenTypes["EndTag"], | ||
"name": self.temporaryBuffer, | ||
"name": name, | ||
"data": [], "selfClosing": False} | ||
self.emitCurrentToken() | ||
self.state = self.dataState | ||
|
@@ -526,21 +528,22 @@ def rawtextEndTagOpenState(self): | |
return True | ||
|
||
def rawtextEndTagNameState(self): | ||
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() | ||
name = self.temporaryBuffer.translate(asciiUpper2Lower) | ||
appropriate = self.currentToken and self.currentToken["name"] == name | ||
data = self.stream.char() | ||
if data in spaceCharacters and appropriate: | ||
self.currentToken = {"type": tokenTypes["EndTag"], | ||
"name": self.temporaryBuffer, | ||
"name": name, | ||
"data": [], "selfClosing": False} | ||
self.state = self.beforeAttributeNameState | ||
elif data == "/" and appropriate: | ||
self.currentToken = {"type": tokenTypes["EndTag"], | ||
"name": self.temporaryBuffer, | ||
"name": name, | ||
"data": [], "selfClosing": False} | ||
self.state = self.selfClosingStartTagState | ||
elif data == ">" and appropriate: | ||
self.currentToken = {"type": tokenTypes["EndTag"], | ||
"name": self.temporaryBuffer, | ||
"name": name, | ||
"data": [], "selfClosing": False} | ||
self.emitCurrentToken() | ||
self.state = self.dataState | ||
|
@@ -579,21 +582,22 @@ def scriptDataEndTagOpenState(self): | |
return True | ||
|
||
def scriptDataEndTagNameState(self): | ||
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() | ||
name = self.temporaryBuffer.translate(asciiUpper2Lower) | ||
appropriate = self.currentToken and self.currentToken["name"] == name | ||
data = self.stream.char() | ||
if data in spaceCharacters and appropriate: | ||
self.currentToken = {"type": tokenTypes["EndTag"], | ||
"name": self.temporaryBuffer, | ||
"name": name, | ||
"data": [], "selfClosing": False} | ||
self.state = self.beforeAttributeNameState | ||
elif data == "/" and appropriate: | ||
self.currentToken = {"type": tokenTypes["EndTag"], | ||
"name": self.temporaryBuffer, | ||
"name": name, | ||
"data": [], "selfClosing": False} | ||
self.state = self.selfClosingStartTagState | ||
elif data == ">" and appropriate: | ||
self.currentToken = {"type": tokenTypes["EndTag"], | ||
"name": self.temporaryBuffer, | ||
"name": name, | ||
"data": [], "selfClosing": False} | ||
self.emitCurrentToken() | ||
self.state = self.dataState | ||
|
@@ -715,21 +719,22 @@ def scriptDataEscapedEndTagOpenState(self): | |
return True | ||
|
||
def scriptDataEscapedEndTagNameState(self): | ||
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower() | ||
name = self.temporaryBuffer.translate(asciiUpper2Lower) | ||
appropriate = self.currentToken and self.currentToken["name"] == name | ||
data = self.stream.char() | ||
if data in spaceCharacters and appropriate: | ||
self.currentToken = {"type": tokenTypes["EndTag"], | ||
"name": self.temporaryBuffer, | ||
"name": name, | ||
"data": [], "selfClosing": False} | ||
self.state = self.beforeAttributeNameState | ||
elif data == "/" and appropriate: | ||
self.currentToken = {"type": tokenTypes["EndTag"], | ||
"name": self.temporaryBuffer, | ||
"name": name, | ||
"data": [], "selfClosing": False} | ||
self.state = self.selfClosingStartTagState | ||
elif data == ">" and appropriate: | ||
self.currentToken = {"type": tokenTypes["EndTag"], | ||
"name": self.temporaryBuffer, | ||
"name": name, | ||
"data": [], "selfClosing": False} | ||
self.emitCurrentToken() | ||
self.state = self.dataState | ||
|
@@ -776,7 +781,9 @@ def scriptDataDoubleEscapedState(self): | |
"eof-in-script-in-script"}) | ||
self.state = self.dataState | ||
else: | ||
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) | ||
chars = self.stream.charsUntil(("<", "-", "\u0000")) | ||
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": | ||
data + chars}) | ||
return True | ||
|
||
def scriptDataDoubleEscapedDashState(self): | ||
|
@@ -859,7 +866,8 @@ def beforeAttributeNameState(self): | |
if data in spaceCharacters: | ||
self.stream.charsUntil(spaceCharacters, True) | ||
elif data in asciiLetters: | ||
self.currentToken["data"].append([data, ""]) | ||
attr_name = data.translate(asciiUpper2Lower) | ||
self.currentToken["data"].append([attr_name, ""]) | ||
self.state = self.attributeNameState | ||
elif data == ">": | ||
self.emitCurrentToken() | ||
|
@@ -891,8 +899,7 @@ def attributeNameState(self): | |
if data == "=": | ||
self.state = self.beforeAttributeValueState | ||
elif data in asciiLetters: | ||
self.currentToken["data"][-1][0] += data +\ | ||
self.stream.charsUntil(asciiLetters, True) | ||
self.currentToken["data"][-1][0] += data.translate(asciiUpper2Lower) | ||
leavingThisState = False | ||
elif data == ">": | ||
# XXX If we emit here the attributes are converted to a dict | ||
|
@@ -919,15 +926,13 @@ def attributeNameState(self): | |
"data": "eof-in-attribute-name"}) | ||
self.state = self.dataState | ||
else: | ||
self.currentToken["data"][-1][0] += data | ||
self.currentToken["data"][-1][0] += data.translate(asciiUpper2Lower) | ||
leavingThisState = False | ||
|
||
if leavingThisState: | ||
# Attributes are not dropped at this stage. That happens when the | ||
# start tag token is emitted so values can still be safely appended | ||
# to attributes, but we do want to report the parse error in time. | ||
self.currentToken["data"][-1][0] = ( | ||
self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) | ||
for name, _ in self.currentToken["data"][:-1]: | ||
if self.currentToken["data"][-1][0] == name: | ||
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
|
@@ -947,7 +952,8 @@ def afterAttributeNameState(self): | |
elif data == ">": | ||
self.emitCurrentToken() | ||
elif data in asciiLetters: | ||
self.currentToken["data"].append([data, ""]) | ||
attr_name = data.translate(asciiUpper2Lower) | ||
self.currentToken["data"].append([attr_name, ""]) | ||
self.state = self.attributeNameState | ||
elif data == "/": | ||
self.state = self.selfClosingStartTagState | ||
|
@@ -1341,17 +1347,15 @@ def beforeDoctypeNameState(self): | |
self.tokenQueue.append(self.currentToken) | ||
self.state = self.dataState | ||
else: | ||
self.currentToken["name"] = data | ||
self.currentToken["name"] = data.translate(asciiUpper2Lower) | ||
self.state = self.doctypeNameState | ||
return True | ||
|
||
def doctypeNameState(self): | ||
data = self.stream.char() | ||
if data in spaceCharacters: | ||
self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) | ||
self.state = self.afterDoctypeNameState | ||
elif data == ">": | ||
self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) | ||
self.tokenQueue.append(self.currentToken) | ||
self.state = self.dataState | ||
elif data == "\u0000": | ||
|
@@ -1363,11 +1367,10 @@ def doctypeNameState(self): | |
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": | ||
"eof-in-doctype-name"}) | ||
self.currentToken["correct"] = False | ||
self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) | ||
self.tokenQueue.append(self.currentToken) | ||
self.state = self.dataState | ||
else: | ||
self.currentToken["name"] += data | ||
self.currentToken["name"] += data.translate(asciiUpper2Lower) | ||
return True | ||
|
||
def afterDoctypeNameState(self): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm very skeptical about this being a perf win, versus it being in `emitCurrentToken`. What do the benchmarks say? Yes, `emitCurrentToken`'s lowercasing becomes redundant in the RCDATA/RAWTEXT/script cases, but I expect the cost of this will negate any gains.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's fair, yep - especially for short element names it seems likely that the `translate` method call overhead (especially if called repeatedly) could negate any benefits provided by simpler comparisons. I hadn't assessed the performance of this code path separately; it felt worth maintaining consistency, but I don't believe there's a noticeable performance change.