Skip to content

Commit f78538c

Browse files
committed
Fix GH-17481: UTF-8 corruption in \Dom\HTMLDocument
We need to properly handle the case where the previous call returned because there were too few bytes to complete a code point. This must be handled separately, because otherwise the while loop just performs a partial byte copy.
1 parent 27fbdc1 commit f78538c

File tree

2 files changed

+56
-2
lines changed

2 files changed

+56
-2
lines changed

ext/dom/html_document.c

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -528,9 +528,30 @@ static bool dom_decode_encode_fast_path(
528528
size_t *tree_error_offset
529529
)
530530
{
531-
decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
532-
533531
const lxb_char_t *buf_ref = *buf_ref_ref;
532+
533+
/* If we returned for needing more bytes, we need to finish up the buffer for the old codepoint. */
534+
if (decoding_encoding_ctx->decode.status == LXB_STATUS_CONTINUE) {
535+
lxb_char_t buf[4];
536+
lxb_char_t *buf_ptr = buf;
537+
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
538+
lxb_encoding_encode_utf_8_single(&decoding_encoding_ctx->encode, &buf_ptr, buf + sizeof(buf), codepoint);
539+
decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
540+
541+
if (!dom_process_parse_chunk(
542+
ctx,
543+
document,
544+
parser,
545+
buf_ptr - buf,
546+
buf,
547+
buf_ptr - buf,
548+
tokenizer_error_offset,
549+
tree_error_offset
550+
)) {
551+
goto fail_oom;
552+
}
553+
}
554+
534555
const lxb_char_t *last_output = buf_ref;
535556
while (buf_ref != buf_end) {
536557
/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
--TEST--
2+
GH-17481 (UTF-8 corruption in \Dom\HTMLDocument)
3+
--EXTENSIONS--
4+
dom
5+
--FILE--
6+
<?php
7+
8+
$inputs = [
9+
[str_repeat('–', 4096), false],
10+
[str_repeat('😏', 4096), false],
11+
[str_repeat('–', 4096), true],
12+
[str_repeat('😏', 4096), true],
13+
[str_repeat('–', 1358), false],
14+
[str_repeat('–', 1359), false],
15+
];
16+
17+
foreach ($inputs as [$input, $endTag]) {
18+
$Data = "<!DOCTYPE HTML><html>$input";
19+
if ($endTag) {
20+
$Data .= '</html>';
21+
}
22+
$Document = \Dom\HTMLDocument::createFromString($Data, 0, 'UTF-8');
23+
var_dump($Document->body->textContent === $input);
24+
}
25+
26+
?>
27+
--EXPECT--
28+
bool(true)
29+
bool(true)
30+
bool(true)
31+
bool(true)
32+
bool(true)
33+
bool(true)

0 commit comments

Comments
 (0)