@@ -104,9 +104,7 @@ zend_result dom_modern_document_implementation_read(dom_object *obj, zval *retva
104
104
105
105
static void dom_decoding_encoding_ctx_init (dom_decoding_encoding_ctx * ctx )
106
106
{
107
- ctx -> encode_data = lxb_encoding_data (LXB_ENCODING_UTF_8 );
108
- ctx -> decode_data = NULL ;
109
- /* Set fast path on by default so that the decoder finishing is skipped if this was never initialised properly. */
107
+ ctx -> decode_data = ctx -> encode_data = lxb_encoding_data (LXB_ENCODING_UTF_8 );
110
108
ctx -> fast_path = true;
111
109
(void ) lxb_encoding_encode_init (
112
110
& ctx -> encode ,
@@ -115,6 +113,13 @@ static void dom_decoding_encoding_ctx_init(dom_decoding_encoding_ctx *ctx)
115
113
sizeof (ctx -> encoding_output ) / sizeof (* ctx -> encoding_output )
116
114
);
117
115
(void ) lxb_encoding_encode_replace_set (& ctx -> encode , LXB_ENCODING_REPLACEMENT_BYTES , LXB_ENCODING_REPLACEMENT_SIZE );
116
+ (void ) lxb_encoding_decode_init (
117
+ & ctx -> decode ,
118
+ ctx -> decode_data ,
119
+ ctx -> codepoints ,
120
+ sizeof (ctx -> codepoints ) / sizeof (* ctx -> codepoints )
121
+ );
122
+ (void ) lxb_encoding_decode_replace_set (& ctx -> decode , LXB_ENCODING_REPLACEMENT_BUFFER , LXB_ENCODING_REPLACEMENT_BUFFER_LEN );
118
123
}
119
124
120
125
static const char * dom_lexbor_tokenizer_error_code_to_string (lxb_html_tokenizer_error_id_t id )
@@ -523,6 +528,8 @@ static bool dom_decode_encode_fast_path(
523
528
size_t * tree_error_offset
524
529
)
525
530
{
531
+ decoding_encoding_ctx -> decode .status = LXB_STATUS_OK ;
532
+
526
533
const lxb_char_t * buf_ref = * buf_ref_ref ;
527
534
const lxb_char_t * last_output = buf_ref ;
528
535
while (buf_ref != buf_end ) {
@@ -551,6 +558,17 @@ static bool dom_decode_encode_fast_path(
551
558
)) {
552
559
goto fail_oom ;
553
560
}
561
+
562
+ if (codepoint == LXB_ENCODING_DECODE_CONTINUE ) {
563
+ ZEND_ASSERT (buf_ref == buf_end );
564
+ /* The decoder needs more data but the entire buffer is consumed.
565
+ * All valid data is outputted, and if the remaining data for the code point
566
+ * is invalid, the next call will output the replacement bytes. */
567
+ * buf_ref_ref = buf_ref ;
568
+ decoding_encoding_ctx -> decode .status = LXB_STATUS_CONTINUE ;
569
+ return true;
570
+ }
571
+
554
572
if (!dom_process_parse_chunk (
555
573
ctx ,
556
574
document ,
@@ -563,6 +581,7 @@ static bool dom_decode_encode_fast_path(
563
581
)) {
564
582
goto fail_oom ;
565
583
}
584
+
566
585
last_output = buf_ref ;
567
586
}
568
587
}
@@ -676,29 +695,22 @@ static bool dom_parse_decode_encode_finish(
676
695
size_t * tree_error_offset
677
696
)
678
697
{
679
- if (!decoding_encoding_ctx -> fast_path ) {
680
- /* Fast path handles codepoints one by one, so this part is not applicable in that case */
681
- (void ) lxb_encoding_decode_finish (& decoding_encoding_ctx -> decode );
682
- size_t decoding_buffer_size = lxb_encoding_decode_buf_used (& decoding_encoding_ctx -> decode );
683
- if (decoding_buffer_size > 0 ) {
684
- const lxb_codepoint_t * codepoints_ref = (const lxb_codepoint_t * ) decoding_encoding_ctx -> codepoints ;
685
- const lxb_codepoint_t * codepoints_end = codepoints_ref + decoding_buffer_size ;
686
- (void ) decoding_encoding_ctx -> encode_data -> encode (& decoding_encoding_ctx -> encode , & codepoints_ref , codepoints_end );
687
- if (!dom_process_parse_chunk (
688
- ctx ,
689
- document ,
690
- parser ,
691
- lxb_encoding_encode_buf_used (& decoding_encoding_ctx -> encode ),
692
- decoding_encoding_ctx -> encoding_output ,
693
- decoding_buffer_size ,
694
- tokenizer_error_offset ,
695
- tree_error_offset
696
- )) {
697
- return false;
698
- }
699
- }
698
+ lxb_status_t status ;
699
+
700
+ status = lxb_encoding_decode_finish (& decoding_encoding_ctx -> decode );
701
+ ZEND_ASSERT (status == LXB_STATUS_OK );
702
+
703
+ size_t decoding_buffer_size = lxb_encoding_decode_buf_used (& decoding_encoding_ctx -> decode );
704
+ if (decoding_buffer_size > 0 ) {
705
+ const lxb_codepoint_t * codepoints_ref = (const lxb_codepoint_t * ) decoding_encoding_ctx -> codepoints ;
706
+ const lxb_codepoint_t * codepoints_end = codepoints_ref + decoding_buffer_size ;
707
+ status = decoding_encoding_ctx -> encode_data -> encode (& decoding_encoding_ctx -> encode , & codepoints_ref , codepoints_end );
708
+ ZEND_ASSERT (status == LXB_STATUS_OK );
709
+ /* No need to produce output here, as we finish the encoder below and pass the chunk. */
700
710
}
701
- (void ) lxb_encoding_encode_finish (& decoding_encoding_ctx -> encode );
711
+
712
+ status = lxb_encoding_encode_finish (& decoding_encoding_ctx -> encode );
713
+ ZEND_ASSERT (status == LXB_STATUS_OK );
702
714
if (lxb_encoding_encode_buf_used (& decoding_encoding_ctx -> encode )
703
715
&& !dom_process_parse_chunk (
704
716
ctx ,
0 commit comments