Skip to content

Fix GH-17802: \Dom\HTMLDocument querySelector attribute name is case sensitive in HTML #17815

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 83 additions & 4 deletions ext/dom/lexbor/lexbor/selectors-adapted/selectors.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,21 @@ static zend_always_inline bool lxb_selectors_adapted_cmp_local_name_id(const xml

static zend_always_inline const xmlAttr *lxb_selectors_adapted_attr(const xmlNode *node, const lxb_char_t *name)
{
const xmlAttr *attr = xmlHasProp(node, (const xmlChar *) name);
const xmlAttr *attr = NULL;
ZEND_ASSERT(node->doc != NULL);
if (php_dom_ns_is_html_and_document_is_html(node)) {
/* No need to handle DTD entities as we're in HTML. */
size_t name_bound = strlen((const char *) name) + 1;
for (const xmlAttr *cur = node->properties; cur != NULL; cur = cur->next) {
if (lexbor_str_data_nlocmp_right(cur->name, name, name_bound)) {
attr = cur;
break;
}
}
} else {
attr = xmlHasProp(node, (const xmlChar *) name);
}

if (attr != NULL && attr->ns != NULL) {
return NULL;
}
Expand All @@ -85,8 +99,67 @@ static zend_always_inline dom_lxb_str_wrapper lxb_selectors_adapted_attr_value(c
return ret;
}

/*
 * Checks whether the name carried by `selector` matches `name`.
 *
 * selector: selector whose `name` string (data + length) is inspected.
 * name:     candidate attribute name; callers in this file pass lowercase literals.
 * len:      number of bytes in `name`, excluding any terminator.
 *
 * Returns false straight away when the lengths differ; otherwise the result of
 * lexbor_str_data_nlocmp_right() over `len` bytes, with `name` as the first
 * operand and the selector's name data as the second.
 */
static bool lxb_selectors_attrib_name_cmp(const lxb_css_selector_t *selector, const char *name, size_t len)
{
	if (selector->name.length != len) {
		return false;
	}

	const lxb_char_t *expected = (const lxb_char_t *) name;
	return lexbor_str_data_nlocmp_right(expected, selector->name.data, len);
}

/* From https://html.spec.whatwg.org/#case-sensitivity-of-selectors
 * "Attribute selectors on an HTML element in an HTML document must treat the values of attributes with the following names as ASCII case-insensitive:"
 *
 * Returns true when the selector's name matches one of the attribute names in
 * the table below (as decided by lxb_selectors_attrib_name_cmp()), and false
 * otherwise. The table is scanned in order and the scan stops at the first hit. */
static bool lxb_selectors_is_lowercased_html_attrib_name(const lxb_css_selector_t *selector)
{
	/* Pairs each literal with its length, computed at compile time. */
#define LXB_CI_VALUE_ATTR(literal) { (literal), sizeof(literal) - 1 }
	static const struct {
		const char *name;
		size_t len;
	} ci_value_attrs[] = {
		LXB_CI_VALUE_ATTR("accept"),
		LXB_CI_VALUE_ATTR("accept-charset"),
		LXB_CI_VALUE_ATTR("align"),
		LXB_CI_VALUE_ATTR("alink"),
		LXB_CI_VALUE_ATTR("axis"),
		LXB_CI_VALUE_ATTR("bgcolor"),
		LXB_CI_VALUE_ATTR("charset"),
		LXB_CI_VALUE_ATTR("checked"),
		LXB_CI_VALUE_ATTR("clear"),
		LXB_CI_VALUE_ATTR("codetype"),
		LXB_CI_VALUE_ATTR("color"),
		LXB_CI_VALUE_ATTR("compact"),
		LXB_CI_VALUE_ATTR("declare"),
		LXB_CI_VALUE_ATTR("defer"),
		LXB_CI_VALUE_ATTR("dir"),
		LXB_CI_VALUE_ATTR("direction"),
		LXB_CI_VALUE_ATTR("disabled"),
		LXB_CI_VALUE_ATTR("enctype"),
		LXB_CI_VALUE_ATTR("face"),
		LXB_CI_VALUE_ATTR("frame"),
		LXB_CI_VALUE_ATTR("hreflang"),
		LXB_CI_VALUE_ATTR("http-equiv"),
		LXB_CI_VALUE_ATTR("lang"),
		LXB_CI_VALUE_ATTR("language"),
		LXB_CI_VALUE_ATTR("link"),
		LXB_CI_VALUE_ATTR("media"),
		LXB_CI_VALUE_ATTR("method"),
		LXB_CI_VALUE_ATTR("multiple"),
		LXB_CI_VALUE_ATTR("nohref"),
		LXB_CI_VALUE_ATTR("noresize"),
		LXB_CI_VALUE_ATTR("noshade"),
		LXB_CI_VALUE_ATTR("nowrap"),
		LXB_CI_VALUE_ATTR("readonly"),
		LXB_CI_VALUE_ATTR("rel"),
		LXB_CI_VALUE_ATTR("rev"),
		LXB_CI_VALUE_ATTR("rules"),
		LXB_CI_VALUE_ATTR("scope"),
		LXB_CI_VALUE_ATTR("scrolling"),
		LXB_CI_VALUE_ATTR("selected"),
		LXB_CI_VALUE_ATTR("shape"),
		LXB_CI_VALUE_ATTR("target"),
		LXB_CI_VALUE_ATTR("text"),
		LXB_CI_VALUE_ATTR("type"),
		LXB_CI_VALUE_ATTR("valign"),
		LXB_CI_VALUE_ATTR("valuetype"),
		LXB_CI_VALUE_ATTR("vlink"),
	};
#undef LXB_CI_VALUE_ATTR

	const size_t count = sizeof(ci_value_attrs) / sizeof(ci_value_attrs[0]);
	for (size_t i = 0; i < count; i++) {
		if (lxb_selectors_attrib_name_cmp(selector, ci_value_attrs[i].name, ci_value_attrs[i].len)) {
			return true;
		}
	}

	return false;
}

static void lxb_selectors_adapted_set_entry_id_ex(lxb_selectors_entry_t *entry, const lxb_css_selector_t *selector, const xmlNode *node)
{
entry->id.attr_case_insensitive = lxb_selectors_is_lowercased_html_attrib_name(selector);

if (node->doc != NULL && node->doc->dict != NULL) {
const xmlChar *interned = xmlDictExists(node->doc->dict, selector->name.data, selector->name.length);
if (interned != NULL) {
Expand Down Expand Up @@ -1290,10 +1363,10 @@ lxb_selectors_match_class(const lexbor_str_t *target, const lexbor_str_t *src,
}

static bool
lxb_selectors_match_attribute_value(const lxb_css_selector_attribute_t *attr, const lexbor_str_t *trg, const lexbor_str_t *src)
lxb_selectors_match_attribute_value(const lxb_css_selector_attribute_t *attr, bool force_modifier_i, const lexbor_str_t *trg, const lexbor_str_t *src)
{
bool res;
bool ins = attr->modifier == LXB_CSS_SELECTOR_MODIFIER_I;
bool ins = attr->modifier == LXB_CSS_SELECTOR_MODIFIER_I || force_modifier_i;

switch (attr->match) {
case LXB_CSS_SELECTOR_MATCH_EQUAL: /* = */
Expand Down Expand Up @@ -1405,7 +1478,13 @@ lxb_selectors_match_attribute(const lxb_css_selector_t *selector,
}

dom_lxb_str_wrapper trg = lxb_selectors_adapted_attr_value(dom_attr);
bool res = lxb_selectors_match_attribute_value(attr, &trg.str, src);
ZEND_ASSERT(node->doc != NULL);
bool res = lxb_selectors_match_attribute_value(
attr,
entry->id.attr_case_insensitive && php_dom_ns_is_html_and_document_is_html(node),
&trg.str,
src
);
dom_lxb_str_wrapper_release(&trg);
return res;
}
Expand Down
1 change: 1 addition & 0 deletions ext/dom/lexbor/lexbor/selectors-adapted/selectors.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ typedef lxb_selectors_entry_t *
/* Per-entry data about the selector name, filled in by
 * lxb_selectors_adapted_set_entry_id_ex(). */
typedef struct {
/* Name used for matching.
 * NOTE(review): presumably points at the document dictionary's copy of the
 * selector name when one exists; the assignment is not visible here, confirm. */
const xmlChar *name;
/* NOTE(review): presumably true when `name` came from the document
 * dictionary lookup (xmlDictExists); confirm against the setter. */
bool interned;
/* Set from lxb_selectors_is_lowercased_html_attrib_name(selector). When true
 * and the node is an HTML element in an HTML document, attribute value
 * matching is forced to be case-insensitive. */
bool attr_case_insensitive;
} lxb_selectors_adapted_id;

struct lxb_selectors_entry {
Expand Down
63 changes: 63 additions & 0 deletions ext/dom/tests/modern/css_selectors/gh17802.phpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
--TEST--
GH-17802 (\Dom\HTMLDocument querySelector attribute name is case sensitive in HTML)
--EXTENSIONS--
dom
--FILE--
<?php

// Regression test for GH-17802: attribute selectors evaluated against an
// HTML document.

$text = <<<TEXT
<html>
<head>
<meta charset="Windows-1252">
</head>
<body>
</body>
</html>
TEXT;

$dom = \Dom\HTMLDocument::createFromString($text, options: LIBXML_NOERROR);
// Add a second <meta> that is NOT in the HTML namespace. The expected output
// shows it is matched only when the attribute name is spelled in exact case.
$meta2 = $dom->head->appendChild($dom->createElementNS('urn:x', 'meta'));
$meta2->setAttribute('charset', 'x');
echo $dom->saveHtml(), "\n";

echo "--- charseT ---\n";

// Mixed-case attribute name: expected to match only the HTML-namespace <meta>.
foreach ($dom->querySelectorAll('meta[charseT]') as $entry) {
var_dump($dom->saveHtml($entry));
}

echo "--- charset ---\n";

// Exact-case attribute name: expected to match both <meta> elements.
foreach ($dom->querySelectorAll('meta[charset]') as $entry) {
var_dump($dom->saveHtml($entry));
}

echo "--- charseT and lowercase value ---\n";

// The document value is "Windows-1252" but the selector uses a lowercase
// value; the HTML <meta> is still expected to match.
foreach ($dom->querySelectorAll('meta[charseT="windows-1252"]') as $entry) {
var_dump($dom->saveHtml($entry));
}

echo "--- charset and lowercase value ---\n";

// Same value check with the attribute name in its exact case.
foreach ($dom->querySelectorAll('meta[charset="windows-1252"]') as $entry) {
var_dump($dom->saveHtml($entry));
}

?>
--EXPECT--
<html><head>
<meta charset="Windows-1252">
<meta charset="x"></meta></head>
<body>

</body></html>
--- charseT ---
string(29) "<meta charset="Windows-1252">"
--- charset ---
string(29) "<meta charset="Windows-1252">"
string(25) "<meta charset="x"></meta>"
--- charseT and lowercase value ---
string(29) "<meta charset="Windows-1252">"
--- charset and lowercase value ---
string(29) "<meta charset="Windows-1252">"
Loading