@@ -353,7 +353,7 @@ static dom_character_encoding_data dom_determine_encoding(const char *source, si
353
353
lxb_html_encoding_t encoding ;
354
354
lxb_status_t status = lxb_html_encoding_init (& encoding );
355
355
if (status != LXB_STATUS_OK ) {
356
- goto fallback ;
356
+ goto fallback_uninit ;
357
357
}
358
358
/* This is the "wait either for 1024 bytes or 500ms" part */
359
359
if (source_len > 1024 ) {
@@ -368,32 +368,47 @@ static dom_character_encoding_data dom_determine_encoding(const char *source, si
368
368
goto fallback ;
369
369
}
370
370
result .encoding_data = lxb_encoding_data_by_pre_name (entry -> name , entry -> end - entry -> name );
371
+ if (!result .encoding_data ) {
372
+ goto fallback ;
373
+ }
371
374
result .bom_shift = 0 ;
372
375
lxb_html_encoding_destroy (& encoding , false);
373
376
return result ;
374
377
375
378
fallback :
379
+ lxb_html_encoding_destroy (& encoding , false);
380
+ fallback_uninit :
376
381
result .encoding_data = lxb_encoding_data (DOM_FALLBACK_ENCODING_ID );
377
382
result .bom_shift = 0 ;
378
- lxb_html_encoding_destroy (& encoding , false);
379
383
return result ;
380
384
}
381
385
382
- static void dom_setup_parser_encoding (const lxb_char_t * * buf_ref , size_t * read , dom_decoding_encoding_ctx * decoding_encoding_ctx )
386
/*
 * Configure the decoding side of decoding_encoding_ctx for an already-chosen
 * input encoding and tell the libxml2 bridge where the current input lives.
 *
 * buf_start             - start of the raw input buffer; only stored (never
 *                         dereferenced here) and only on the fast path.
 * encoding_data         - lexbor encoding descriptor to decode the input with.
 * decoding_encoding_ctx - context whose decode_data, decode state and
 *                         fast_path flag are written.
 * application_data      - bridge data whose current_input_codepoints /
 *                         current_input_characters pointers are written.
 *
 * NOTE(review): this span is a diff rendering. Lines prefixed with "-" are the
 * removed body of the former dom_setup_parser_encoding(), lines prefixed with
 * "+" are additions, and the bare numbers are gutter line numbers.
 */
+ static void dom_setup_parser_encoding_manually (const lxb_char_t * buf_start , const lxb_encoding_data_t * encoding_data , dom_decoding_encoding_ctx * decoding_encoding_ctx , dom_lexbor_libxml2_bridge_application_data * application_data )
383
387
{
384
388
static const lxb_codepoint_t replacement_codepoint = LXB_ENCODING_REPLACEMENT_CODEPOINT ;
385
- dom_character_encoding_data dom_encoding_data = dom_determine_encoding ((const char * ) * buf_ref , * read );
386
- * buf_ref += dom_encoding_data .bom_shift ;
387
- * read -= dom_encoding_data .bom_shift ;
388
389
389
- decoding_encoding_ctx -> decode_data = dom_encoding_data .encoding_data ;
390
- if (decoding_encoding_ctx -> decode_data == NULL ) {
391
- decoding_encoding_ctx -> decode_data = lxb_encoding_data (DOM_FALLBACK_ENCODING_ID );
392
- ZEND_ASSERT (decoding_encoding_ctx -> decode_data != NULL );
393
- }
390
/* The removed NULL fallback above is not replicated: encoding_data is stored as-is.
 * NOTE(review): callers are therefore expected to pass a non-NULL descriptor - confirm for every caller. */
+ decoding_encoding_ctx -> decode_data = encoding_data ;
391
+
394
392
/* Initialise the decoder over the context's codepoints array (capacity given in elements, not bytes).
 * The returned status is deliberately discarded via the (void) cast. */
(void ) lxb_encoding_decode_init (& decoding_encoding_ctx -> decode , decoding_encoding_ctx -> decode_data , decoding_encoding_ctx -> codepoints , sizeof (decoding_encoding_ctx -> codepoints ) / sizeof (lxb_codepoint_t ));
395
393
/* Register the replacement codepoint with the decoder - presumably substituted for undecodable input; verify against lexbor docs.
 * The status is discarded here as well. */
(void ) lxb_encoding_decode_replace_set (& decoding_encoding_ctx -> decode , & replacement_codepoint , LXB_ENCODING_REPLACEMENT_BUFFER_LEN );
396
394
/* fast_path is set when the input encoding descriptor is the very same object as the output (encode) descriptor. */
decoding_encoding_ctx -> fast_path = decoding_encoding_ctx -> decode_data == decoding_encoding_ctx -> encode_data ; /* Note: encode_data is for UTF-8 */
395
+
396
/* Exactly one of the two bridge pointers is non-NULL: raw bytes on the fast path, the codepoints array otherwise. */
+ if (decoding_encoding_ctx -> fast_path ) {
397
+ application_data -> current_input_codepoints = NULL ;
398
+ application_data -> current_input_characters = (const char * ) buf_start ;
399
+ } else {
400
+ application_data -> current_input_codepoints = decoding_encoding_ctx -> codepoints ;
401
+ application_data -> current_input_characters = NULL ;
402
+ }
403
+ }
404
+
405
/*
 * Determine the input encoding from the buffer itself via
 * dom_determine_encoding(), then set up decoding with the result.
 *
 * buf_ref               - in/out: pointer to the current input position;
 *                         advanced by the reported bom_shift.
 * read                  - in/out: number of available bytes; reduced by the
 *                         same bom_shift.
 * decoding_encoding_ctx - forwarded to dom_setup_parser_encoding_manually().
 * application_data      - forwarded to dom_setup_parser_encoding_manually().
 */
+ static void dom_setup_parser_encoding_implicitly (const lxb_char_t * * buf_ref , size_t * read , dom_decoding_encoding_ctx * decoding_encoding_ctx , dom_lexbor_libxml2_bridge_application_data * application_data )
406
+ {
407
/* Remember the buffer start before *buf_ref is advanced below. */
+ const char * buf_start = (const char * ) * buf_ref ;
408
+ dom_character_encoding_data dom_encoding_data = dom_determine_encoding (buf_start , * read );
409
/* Skip bom_shift bytes at the front of the input and shrink the remaining length to match. */
+ * buf_ref += dom_encoding_data .bom_shift ;
410
+ * read -= dom_encoding_data .bom_shift ;
411
/* The un-shifted buf_start is what gets passed on, so on the fast path current_input_characters
 * points at the original buffer start rather than at the shifted *buf_ref. */
+ dom_setup_parser_encoding_manually ((const lxb_char_t * ) buf_start , dom_encoding_data .encoding_data , decoding_encoding_ctx , application_data );
397
412
}
398
413
399
414
static bool dom_process_parse_chunk (lexbor_libxml2_bridge_parse_context * ctx , lxb_html_document_t * document , lxb_html_parser_t * parser , size_t encoded_length , const lxb_char_t * encoding_output , size_t input_buffer_length , size_t * tokenizer_error_offset , size_t * tree_error_offset )
@@ -548,10 +563,10 @@ PHP_METHOD(DOM_HTMLDocument, createEmpty)
548
563
549
564
PHP_METHOD (DOM_HTMLDocument , createFromString )
550
565
{
551
- const char * source ;
552
- size_t source_len ;
566
+ const char * source , * override_encoding = NULL ;
567
+ size_t source_len , override_encoding_len ;
553
568
zend_long options = 0 ;
554
- if (zend_parse_parameters (ZEND_NUM_ARGS (), "s|l " , & source , & source_len , & options ) == FAILURE ) {
569
+ if (zend_parse_parameters (ZEND_NUM_ARGS (), "s|lp! " , & source , & source_len , & options , & override_encoding , & override_encoding_len ) == FAILURE ) {
555
570
RETURN_THROWS ();
556
571
}
557
572
@@ -571,6 +586,24 @@ PHP_METHOD(DOM_HTMLDocument, createFromString)
571
586
}
572
587
ctx .application_data = & application_data ;
573
588
589
+ size_t tokenizer_error_offset = 0 ;
590
+ size_t tree_error_offset = 0 ;
591
+
592
+ /* Setup everything encoding & decoding related */
593
+ const lxb_char_t * buf_ref = (const lxb_char_t * ) source ;
594
+ dom_decoding_encoding_ctx decoding_encoding_ctx ;
595
+ dom_decoding_encoding_ctx_init (& decoding_encoding_ctx );
596
+ if (override_encoding != NULL ) {
597
+ const lxb_encoding_data_t * encoding_data = lxb_encoding_data_by_name ((const lxb_char_t * ) override_encoding , override_encoding_len );
598
+ if (!encoding_data ) {
599
+ zend_argument_value_error (3 , "must be a valid document encoding" );
600
+ RETURN_THROWS ();
601
+ }
602
+ dom_setup_parser_encoding_manually (buf_ref , encoding_data , & decoding_encoding_ctx , & application_data );
603
+ } else {
604
+ dom_setup_parser_encoding_implicitly (& buf_ref , & source_len , & decoding_encoding_ctx , & application_data );
605
+ }
606
+
574
607
lxb_html_document_t * document = lxb_html_document_create ();
575
608
if (UNEXPECTED (document == NULL )) {
576
609
goto fail_oom ;
@@ -581,24 +614,7 @@ PHP_METHOD(DOM_HTMLDocument, createFromString)
581
614
goto fail_oom ;
582
615
}
583
616
584
- /* Setup everything encoding & decoding related */
585
- dom_decoding_encoding_ctx decoding_encoding_ctx ;
586
- dom_decoding_encoding_ctx_init (& decoding_encoding_ctx );
587
-
588
617
lxb_html_parser_t * parser = document -> dom_document .parser ;
589
- size_t tokenizer_error_offset = 0 ;
590
- size_t tree_error_offset = 0 ;
591
-
592
- const lxb_char_t * buf_ref = (const lxb_char_t * ) source ;
593
- dom_setup_parser_encoding (& buf_ref , & source_len , & decoding_encoding_ctx );
594
-
595
- if (decoding_encoding_ctx .fast_path ) {
596
- application_data .current_input_codepoints = NULL ;
597
- application_data .current_input_characters = source ;
598
- } else {
599
- application_data .current_input_codepoints = decoding_encoding_ctx .codepoints ;
600
- application_data .current_input_characters = NULL ;
601
- }
602
618
603
619
while (source_len > 0 ) {
604
620
size_t chunk_size = source_len ;
@@ -653,11 +669,11 @@ PHP_METHOD(DOM_HTMLDocument, createFromString)
653
669
654
670
PHP_METHOD (DOM_HTMLDocument , createFromFile )
655
671
{
656
- const char * filename ;
657
- size_t filename_len ;
672
+ const char * filename , * override_encoding = NULL ;
673
+ size_t filename_len , override_encoding_len ;
658
674
zend_long options = 0 ;
659
675
php_stream * stream = NULL ;
660
- if (zend_parse_parameters (ZEND_NUM_ARGS (), "p|l " , & filename , & filename_len , & options ) == FAILURE ) {
676
+ if (zend_parse_parameters (ZEND_NUM_ARGS (), "p|ls! " , & filename , & filename_len , & options , & override_encoding , & override_encoding_len ) == FAILURE ) {
661
677
RETURN_THROWS ();
662
678
}
663
679
@@ -683,6 +699,22 @@ PHP_METHOD(DOM_HTMLDocument, createFromFile)
683
699
}
684
700
ctx .application_data = & application_data ;
685
701
702
+ char buf [4096 ];
703
+
704
+ /* Setup everything encoding & decoding related */
705
+ dom_decoding_encoding_ctx decoding_encoding_ctx ;
706
+ dom_decoding_encoding_ctx_init (& decoding_encoding_ctx );
707
+ bool should_determine_encoding_implicitly = true; /* First read => determine encoding implicitly */
708
+ if (override_encoding != NULL ) {
709
+ const lxb_encoding_data_t * encoding_data = lxb_encoding_data_by_name ((const lxb_char_t * ) override_encoding , override_encoding_len );
710
+ if (!encoding_data ) {
711
+ zend_argument_value_error (3 , "must be a valid document encoding" );
712
+ RETURN_THROWS ();
713
+ }
714
+ should_determine_encoding_implicitly = false;
715
+ dom_setup_parser_encoding_manually ((const lxb_char_t * ) buf , encoding_data , & decoding_encoding_ctx , & application_data );
716
+ }
717
+
686
718
// TODO: context from LIBXML(stream_context) ???
687
719
// TODO: https://mimesniff.spec.whatwg.org/#parsing-a-mime-type
688
720
stream = php_stream_open_wrapper_ex (filename , "rb" , REPORT_ERRORS , /* opened_path */ NULL , /* context */ NULL );
@@ -703,31 +735,17 @@ PHP_METHOD(DOM_HTMLDocument, createFromFile)
703
735
goto fail_oom ;
704
736
}
705
737
706
- /* Setup everything encoding & decoding related */
707
- bool first_read = true;
708
- dom_decoding_encoding_ctx decoding_encoding_ctx ;
709
- dom_decoding_encoding_ctx_init (& decoding_encoding_ctx );
710
-
711
738
size_t tokenizer_error_offset = 0 ;
712
739
size_t tree_error_offset = 0 ;
713
740
ssize_t read ;
714
- char buf [4096 ];
715
741
lxb_html_parser_t * parser = document -> dom_document .parser ;
716
742
717
743
while ((read = php_stream_read (stream , buf , sizeof (buf ))) > 0 ) {
718
744
const lxb_char_t * buf_ref = (const lxb_char_t * ) buf ;
719
745
720
- /* First read => determine encoding */
721
- if (first_read ) {
722
- first_read = false;
723
- dom_setup_parser_encoding (& buf_ref , (size_t * ) & read , & decoding_encoding_ctx );
724
- if (decoding_encoding_ctx .fast_path ) {
725
- application_data .current_input_codepoints = NULL ;
726
- application_data .current_input_characters = buf ;
727
- } else {
728
- application_data .current_input_codepoints = decoding_encoding_ctx .codepoints ;
729
- application_data .current_input_characters = NULL ;
730
- }
746
+ if (should_determine_encoding_implicitly ) {
747
+ should_determine_encoding_implicitly = false;
748
+ dom_setup_parser_encoding_implicitly (& buf_ref , (size_t * ) & read , & decoding_encoding_ctx , & application_data );
731
749
}
732
750
733
751
const lxb_char_t * buf_end = buf_ref + read ;
@@ -771,7 +789,7 @@ PHP_METHOD(DOM_HTMLDocument, createFromFile)
771
789
if (UNEXPECTED (!converted )) {
772
790
goto fail_oom ;
773
791
}
774
- /* Check for "file:/"" instead of "file://" because of libxml2 quirk */
792
+ /* Check for "file:/" instead of "file://" because of libxml2 quirk */
775
793
if (strncmp ((const char * ) converted , "file:/" , sizeof ("file:/" ) - 1 ) != 0 ) {
776
794
xmlChar * buffer = xmlStrdup ((const xmlChar * ) "file://" );
777
795
if (UNEXPECTED (!buffer )) {
0 commit comments