Skip to content

Commit 603b954

Browse files
committed
Implement override_encoding
1 parent af71943 commit 603b954

13 files changed

+251
-64
lines changed

ext/dom/document.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1194,7 +1194,7 @@ const char *_dom_get_valid_file_path(const char *source, char *resolved_path, in
11941194
}
11951195
/* }}} */
11961196

1197-
xmlDocPtr dom_document_parser(zval *id, int mode, const char *source, size_t source_len, size_t options) /* {{{ */
1197+
xmlDocPtr dom_document_parser(zval *id, int mode, const char *source, size_t source_len, size_t options, xmlCharEncodingHandlerPtr encoding) /* {{{ */
11981198
{
11991199
xmlDocPtr ret;
12001200
xmlParserCtxtPtr ctxt = NULL;
@@ -1237,6 +1237,8 @@ xmlDocPtr dom_document_parser(zval *id, int mode, const char *source, size_t sou
12371237
return(NULL);
12381238
}
12391239

1240+
(void) xmlSwitchToEncoding(ctxt, encoding);
1241+
12401242
/* If loading from memory, we need to set the base directory for the document */
12411243
if (mode != DOM_LOAD_FILE) {
12421244
#ifdef HAVE_GETCWD
@@ -1374,7 +1376,7 @@ static void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode)
13741376
RETURN_FALSE;
13751377
}
13761378

1377-
xmlDocPtr newdoc = dom_document_parser(ZEND_THIS, mode, source, source_len, options);
1379+
xmlDocPtr newdoc = dom_document_parser(ZEND_THIS, mode, source, source_len, options, NULL);
13781380
php_dom_finish_loading_document(ZEND_THIS, return_value, newdoc);
13791381
}
13801382

ext/dom/html_document.c

Lines changed: 70 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,7 @@ static dom_character_encoding_data dom_determine_encoding(const char *source, si
353353
lxb_html_encoding_t encoding;
354354
lxb_status_t status = lxb_html_encoding_init(&encoding);
355355
if (status != LXB_STATUS_OK) {
356-
goto fallback;
356+
goto fallback_uninit;
357357
}
358358
/* This is the "wait either for 1024 bytes or 500ms" part */
359359
if (source_len > 1024) {
@@ -368,32 +368,47 @@ static dom_character_encoding_data dom_determine_encoding(const char *source, si
368368
goto fallback;
369369
}
370370
result.encoding_data = lxb_encoding_data_by_pre_name(entry->name, entry->end - entry->name);
371+
if (!result.encoding_data) {
372+
goto fallback;
373+
}
371374
result.bom_shift = 0;
372375
lxb_html_encoding_destroy(&encoding, false);
373376
return result;
374377

375378
fallback:
379+
lxb_html_encoding_destroy(&encoding, false);
380+
fallback_uninit:
376381
result.encoding_data = lxb_encoding_data(DOM_FALLBACK_ENCODING_ID);
377382
result.bom_shift = 0;
378-
lxb_html_encoding_destroy(&encoding, false);
379383
return result;
380384
}
381385

382-
static void dom_setup_parser_encoding(const lxb_char_t **buf_ref, size_t *read, dom_decoding_encoding_ctx *decoding_encoding_ctx)
386+
static void dom_setup_parser_encoding_manually(const lxb_char_t *buf_start, const lxb_encoding_data_t *encoding_data, dom_decoding_encoding_ctx *decoding_encoding_ctx, dom_lexbor_libxml2_bridge_application_data *application_data)
383387
{
384388
static const lxb_codepoint_t replacement_codepoint = LXB_ENCODING_REPLACEMENT_CODEPOINT;
385-
dom_character_encoding_data dom_encoding_data = dom_determine_encoding((const char *) *buf_ref, *read);
386-
*buf_ref += dom_encoding_data.bom_shift;
387-
*read -= dom_encoding_data.bom_shift;
388389

389-
decoding_encoding_ctx->decode_data = dom_encoding_data.encoding_data;
390-
if (decoding_encoding_ctx->decode_data == NULL) {
391-
decoding_encoding_ctx->decode_data = lxb_encoding_data(DOM_FALLBACK_ENCODING_ID);
392-
ZEND_ASSERT(decoding_encoding_ctx->decode_data != NULL);
393-
}
390+
decoding_encoding_ctx->decode_data = encoding_data;
391+
394392
(void) lxb_encoding_decode_init(&decoding_encoding_ctx->decode, decoding_encoding_ctx->decode_data, decoding_encoding_ctx->codepoints, sizeof(decoding_encoding_ctx->codepoints) / sizeof(lxb_codepoint_t));
395393
(void) lxb_encoding_decode_replace_set(&decoding_encoding_ctx->decode, &replacement_codepoint, LXB_ENCODING_REPLACEMENT_BUFFER_LEN);
396394
decoding_encoding_ctx->fast_path = decoding_encoding_ctx->decode_data == decoding_encoding_ctx->encode_data; /* Note: encode_data is for UTF-8 */
395+
396+
if (decoding_encoding_ctx->fast_path) {
397+
application_data->current_input_codepoints = NULL;
398+
application_data->current_input_characters = (const char *) buf_start;
399+
} else {
400+
application_data->current_input_codepoints = decoding_encoding_ctx->codepoints;
401+
application_data->current_input_characters = NULL;
402+
}
403+
}
404+
405+
static void dom_setup_parser_encoding_implicitly(const lxb_char_t **buf_ref, size_t *read, dom_decoding_encoding_ctx *decoding_encoding_ctx, dom_lexbor_libxml2_bridge_application_data *application_data)
406+
{
407+
const char *buf_start = (const char *) *buf_ref;
408+
dom_character_encoding_data dom_encoding_data = dom_determine_encoding(buf_start, *read);
409+
*buf_ref += dom_encoding_data.bom_shift;
410+
*read -= dom_encoding_data.bom_shift;
411+
dom_setup_parser_encoding_manually((const lxb_char_t *) buf_start, dom_encoding_data.encoding_data, decoding_encoding_ctx, application_data);
397412
}
398413

399414
static bool dom_process_parse_chunk(lexbor_libxml2_bridge_parse_context *ctx, lxb_html_document_t *document, lxb_html_parser_t *parser, size_t encoded_length, const lxb_char_t *encoding_output, size_t input_buffer_length, size_t *tokenizer_error_offset, size_t *tree_error_offset)
@@ -548,10 +563,10 @@ PHP_METHOD(DOM_HTMLDocument, createEmpty)
548563

549564
PHP_METHOD(DOM_HTMLDocument, createFromString)
550565
{
551-
const char *source;
552-
size_t source_len;
566+
const char *source, *override_encoding = NULL;
567+
size_t source_len, override_encoding_len;
553568
zend_long options = 0;
554-
if (zend_parse_parameters(ZEND_NUM_ARGS(), "s|l", &source, &source_len, &options) == FAILURE) {
569+
if (zend_parse_parameters(ZEND_NUM_ARGS(), "s|lp!", &source, &source_len, &options, &override_encoding, &override_encoding_len) == FAILURE) {
555570
RETURN_THROWS();
556571
}
557572

@@ -571,6 +586,24 @@ PHP_METHOD(DOM_HTMLDocument, createFromString)
571586
}
572587
ctx.application_data = &application_data;
573588

589+
size_t tokenizer_error_offset = 0;
590+
size_t tree_error_offset = 0;
591+
592+
/* Setup everything encoding & decoding related */
593+
const lxb_char_t *buf_ref = (const lxb_char_t *) source;
594+
dom_decoding_encoding_ctx decoding_encoding_ctx;
595+
dom_decoding_encoding_ctx_init(&decoding_encoding_ctx);
596+
if (override_encoding != NULL) {
597+
const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name((const lxb_char_t *) override_encoding, override_encoding_len);
598+
if (!encoding_data) {
599+
zend_argument_value_error(3, "must be a valid document encoding");
600+
RETURN_THROWS();
601+
}
602+
dom_setup_parser_encoding_manually(buf_ref, encoding_data, &decoding_encoding_ctx, &application_data);
603+
} else {
604+
dom_setup_parser_encoding_implicitly(&buf_ref, &source_len, &decoding_encoding_ctx, &application_data);
605+
}
606+
574607
lxb_html_document_t *document = lxb_html_document_create();
575608
if (UNEXPECTED(document == NULL)) {
576609
goto fail_oom;
@@ -581,24 +614,7 @@ PHP_METHOD(DOM_HTMLDocument, createFromString)
581614
goto fail_oom;
582615
}
583616

584-
/* Setup everything encoding & decoding related */
585-
dom_decoding_encoding_ctx decoding_encoding_ctx;
586-
dom_decoding_encoding_ctx_init(&decoding_encoding_ctx);
587-
588617
lxb_html_parser_t *parser = document->dom_document.parser;
589-
size_t tokenizer_error_offset = 0;
590-
size_t tree_error_offset = 0;
591-
592-
const lxb_char_t *buf_ref = (const lxb_char_t *) source;
593-
dom_setup_parser_encoding(&buf_ref, &source_len, &decoding_encoding_ctx);
594-
595-
if (decoding_encoding_ctx.fast_path) {
596-
application_data.current_input_codepoints = NULL;
597-
application_data.current_input_characters = source;
598-
} else {
599-
application_data.current_input_codepoints = decoding_encoding_ctx.codepoints;
600-
application_data.current_input_characters = NULL;
601-
}
602618

603619
while (source_len > 0) {
604620
size_t chunk_size = source_len;
@@ -653,11 +669,11 @@ PHP_METHOD(DOM_HTMLDocument, createFromString)
653669

654670
PHP_METHOD(DOM_HTMLDocument, createFromFile)
655671
{
656-
const char *filename;
657-
size_t filename_len;
672+
const char *filename, *override_encoding = NULL;
673+
size_t filename_len, override_encoding_len;
658674
zend_long options = 0;
659675
php_stream *stream = NULL;
660-
if (zend_parse_parameters(ZEND_NUM_ARGS(), "p|l", &filename, &filename_len, &options) == FAILURE) {
676+
if (zend_parse_parameters(ZEND_NUM_ARGS(), "p|ls!", &filename, &filename_len, &options, &override_encoding, &override_encoding_len) == FAILURE) {
661677
RETURN_THROWS();
662678
}
663679

@@ -683,6 +699,22 @@ PHP_METHOD(DOM_HTMLDocument, createFromFile)
683699
}
684700
ctx.application_data = &application_data;
685701

702+
char buf[4096];
703+
704+
/* Setup everything encoding & decoding related */
705+
dom_decoding_encoding_ctx decoding_encoding_ctx;
706+
dom_decoding_encoding_ctx_init(&decoding_encoding_ctx);
707+
bool should_determine_encoding_implicitly = true; /* First read => determine encoding implicitly */
708+
if (override_encoding != NULL) {
709+
const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name((const lxb_char_t *) override_encoding, override_encoding_len);
710+
if (!encoding_data) {
711+
zend_argument_value_error(3, "must be a valid document encoding");
712+
RETURN_THROWS();
713+
}
714+
should_determine_encoding_implicitly = false;
715+
dom_setup_parser_encoding_manually((const lxb_char_t *) buf, encoding_data, &decoding_encoding_ctx, &application_data);
716+
}
717+
686718
// TODO: context from LIBXML(stream_context) ???
687719
// TODO: https://mimesniff.spec.whatwg.org/#parsing-a-mime-type
688720
stream = php_stream_open_wrapper_ex(filename, "rb", REPORT_ERRORS, /* opened_path */ NULL, /* context */ NULL);
@@ -703,31 +735,17 @@ PHP_METHOD(DOM_HTMLDocument, createFromFile)
703735
goto fail_oom;
704736
}
705737

706-
/* Setup everything encoding & decoding related */
707-
bool first_read = true;
708-
dom_decoding_encoding_ctx decoding_encoding_ctx;
709-
dom_decoding_encoding_ctx_init(&decoding_encoding_ctx);
710-
711738
size_t tokenizer_error_offset = 0;
712739
size_t tree_error_offset = 0;
713740
ssize_t read;
714-
char buf[4096];
715741
lxb_html_parser_t *parser = document->dom_document.parser;
716742

717743
while ((read = php_stream_read(stream, buf, sizeof(buf))) > 0) {
718744
const lxb_char_t *buf_ref = (const lxb_char_t *) buf;
719745

720-
/* First read => determine encoding */
721-
if (first_read) {
722-
first_read = false;
723-
dom_setup_parser_encoding(&buf_ref, (size_t *) &read, &decoding_encoding_ctx);
724-
if (decoding_encoding_ctx.fast_path) {
725-
application_data.current_input_codepoints = NULL;
726-
application_data.current_input_characters = buf;
727-
} else {
728-
application_data.current_input_codepoints = decoding_encoding_ctx.codepoints;
729-
application_data.current_input_characters = NULL;
730-
}
746+
if (should_determine_encoding_implicitly) {
747+
should_determine_encoding_implicitly = false;
748+
dom_setup_parser_encoding_implicitly(&buf_ref, (size_t *) &read, &decoding_encoding_ctx, &application_data);
731749
}
732750

733751
const lxb_char_t *buf_end = buf_ref + read;
@@ -771,7 +789,7 @@ PHP_METHOD(DOM_HTMLDocument, createFromFile)
771789
if (UNEXPECTED(!converted)) {
772790
goto fail_oom;
773791
}
774-
/* Check for "file:/"" instead of "file://" because of libxml2 quirk */
792+
/* Check for "file:/" instead of "file://" because of libxml2 quirk */
775793
if (strncmp((const char *) converted, "file:/", sizeof("file:/") - 1) != 0) {
776794
xmlChar *buffer = xmlStrdup((const xmlChar *) "file://");
777795
if (UNEXPECTED(!buffer)) {

ext/dom/php_dom.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ dom_object *php_dom_instantiate_object_helper(zval *return_value, zend_class_ent
160160
#define DOM_LOAD_STRING 0
161161
#define DOM_LOAD_FILE 1
162162

163-
xmlDocPtr dom_document_parser(zval *id, int mode, const char *source, size_t source_len, size_t options);
163+
xmlDocPtr dom_document_parser(zval *id, int mode, const char *source, size_t source_len, size_t options, xmlCharEncodingHandlerPtr encoding);
164164

165165
/* parentnode */
166166
void dom_parent_node_prepend(dom_object *context, zval *nodes, uint32_t nodesc);

ext/dom/php_dom.stub.php

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1135,9 +1135,9 @@ private function __construct() {}
11351135

11361136
public static function createEmpty(string $encoding = "UTF-8"): HTMLDocument {}
11371137

1138-
public static function createFromFile(string $path, int $options = 0): HTMLDocument {}
1138+
public static function createFromFile(string $path, int $options = 0, ?string $override_encoding = null): HTMLDocument {}
11391139

1140-
public static function createFromString(string $source, int $options = 0): HTMLDocument {}
1140+
public static function createFromString(string $source, int $options = 0, ?string $override_encoding = null): HTMLDocument {}
11411141

11421142
/** @implementation-alias DOMDocument::saveXML */
11431143
public function saveXML(?\DOMNode $node = null, int $options = 0): string|false {}
@@ -1157,9 +1157,9 @@ private function __construct() {}
11571157

11581158
public static function createEmpty(string $version = "1.0", string $encoding = "UTF-8"): XMLDocument {}
11591159

1160-
public static function createFromFile(string $path, int $options = 0): XMLDocument {}
1160+
public static function createFromFile(string $path, int $options = 0, ?string $override_encoding = null): XMLDocument {}
11611161

1162-
public static function createFromString(string $source, int $options = 0): XMLDocument {}
1162+
public static function createFromString(string $source, int $options = 0, ?string $override_encoding = null): XMLDocument {}
11631163

11641164
/** @readonly */
11651165
public ?string $xmlEncoding;

ext/dom/php_dom_arginfo.h

Lines changed: 5 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
--TEST--
2+
DOM\HTMLDocument::createFromFile() with override_encoding
3+
--EXTENSIONS--
4+
dom
5+
--FILE--
6+
<?php
7+
8+
try {
9+
DOM\HTMLDocument::createFromFile(__DIR__ . '/gb18030_without_charset.html', override_encoding: 'nonexistent');
10+
} catch (ValueError $e) {
11+
echo $e->getMessage(), "\n";
12+
}
13+
14+
$dom = DOM\HTMLDocument::createFromFile(__DIR__ . '/gb18030_without_charset.html', override_encoding: 'GB18030');
15+
var_dump($dom->documentElement->lastChild->textContent);
16+
var_dump($dom->encoding);
17+
18+
$dom = DOM\HTMLDocument::createFromFile(__DIR__ . '/fallback_encoding.html', override_encoding: 'Windows-1252');
19+
var_dump($dom->documentElement->lastChild->textContent);
20+
var_dump($dom->encoding);
21+
22+
?>
23+
--EXPECT--
24+
DOM\HTMLDocument::createFromFile(): Argument #3 ($override_encoding) must be a valid document encoding
25+
string(20) "
26+
Héllo, world!
27+
"
28+
string(7) "gb18030"
29+
string(1) "
30+
"
31+
string(12) "windows-1252"
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
--TEST--
2+
DOM\HTMLDocument::createFromString() with override_encoding
3+
--EXTENSIONS--
4+
dom
5+
--FILE--
6+
<?php
7+
8+
try {
9+
DOM\HTMLDocument::createFromString(file_get_contents(__DIR__ . '/gb18030_without_charset.html'), override_encoding: 'nonexistent');
10+
} catch (ValueError $e) {
11+
echo $e->getMessage(), "\n";
12+
}
13+
14+
$dom = DOM\HTMLDocument::createFromString(file_get_contents(__DIR__ . '/gb18030_without_charset.html'), override_encoding: 'GB18030');
15+
var_dump($dom->documentElement->lastChild->textContent);
16+
var_dump($dom->encoding);
17+
18+
$dom = DOM\HTMLDocument::createFromString(file_get_contents(__DIR__ . '/fallback_encoding.html'), override_encoding: 'Windows-1252');
19+
var_dump($dom->documentElement->lastChild->textContent);
20+
var_dump($dom->encoding);
21+
22+
?>
23+
--EXPECT--
24+
DOM\HTMLDocument::createFromString(): Argument #3 ($override_encoding) must be a valid document encoding
25+
string(20) "
26+
Héllo, world!
27+
"
28+
string(7) "gb18030"
29+
string(1) "
30+
"
31+
string(12) "windows-1252"
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
<!doctype html>
2+
<head>
3+
<title>No charset!</title>
4+
</head>
5+
<body>
6+
Héllo, world!
7+
</body>

0 commit comments

Comments
 (0)