@@ -227,6 +227,22 @@ fn analyze_source_file_dispatch(
227
227
}
228
228
}
229
229
230
/// AArch64 entry point for source analysis.
///
/// Picks the NEON-accelerated scanner when the running CPU reports NEON
/// support, and otherwise scans the entire input with the generic
/// implementation.
#[cfg(target_arch = "aarch64")]
fn analyze_source_file_dispatch(
    src: &str,
    lines: &mut Vec<TextSize>,
    multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
) {
    let has_neon = std::arch::is_aarch64_feature_detected!("neon");

    if !has_neon {
        // No NEON at runtime: hand the whole string to the generic scanner,
        // starting at offset zero.
        let scan_len = src.len();
        let start_offset = TextSize::from(0);
        analyze_source_file_generic(src, scan_len, start_offset, lines, multi_byte_chars);
        return;
    }

    // SAFETY: NEON support was verified by the runtime feature check above.
    unsafe {
        analyze_source_file_neon(src, lines, multi_byte_chars);
    }
}
245
+
230
246
/// Checks 16 byte chunks of text at a time. If the chunk contains
231
247
/// something other than printable ASCII characters and newlines, the
232
248
/// function falls back to the generic implementation. Otherwise it uses
@@ -322,7 +338,102 @@ unsafe fn analyze_source_file_sse2(
322
338
}
323
339
}
324
340
325
- #[ cfg( not( any( target_arch = "x86" , target_arch = "x86_64" ) ) ) ]
341
/// Collapses a 16-lane byte vector into a 64-bit mask, 4 bits per lane.
///
/// See https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
///
/// The mask is a 64-bit integer, where each group of 4 bits corresponds to a
/// `u8` in the input vector. The least significant 4 bits correspond to the
/// first byte in the vector.
///
/// This is meant for comparison results, where every lane is either `0x00`
/// or `0xFF`; such a lane yields a nibble of `0x0` or `0xF` respectively.
///
/// # Safety
///
/// The caller must ensure that the running CPU supports NEON.
#[target_feature(enable = "neon")]
#[cfg(target_arch = "aarch64")]
#[inline]
unsafe fn move_mask(v: std::arch::aarch64::uint8x16_t) -> u64 {
    use std::arch::aarch64::*;

    // View the vector as eight u16 lanes, shift each right by 4 and narrow it
    // to u8. This keeps the high nibble of the even byte and the low nibble
    // of the odd byte, packing 16 nibbles into a 64-bit vector.
    let nibble_mask = vshrn_n_u16(vreinterpretq_u16_u8(v), 4);
    vget_lane_u64(vreinterpret_u64_u8(nibble_mask), 0)
}
355
+
356
/// Checks 16 byte chunks of text at a time using NEON.
///
/// If a chunk contains only ASCII bytes, the start offset of every line that
/// follows a `\n` in the chunk is pushed onto `lines` directly from the
/// vector comparison. If a chunk contains any byte with the high bit set
/// (part of a multi-byte UTF-8 character), that chunk is handed to
/// `analyze_source_file_generic` instead. Bytes left over after the last
/// full chunk are also processed by the generic implementation.
///
/// # Safety
///
/// The caller must ensure that the running CPU supports NEON.
#[target_feature(enable = "neon")]
#[cfg(target_arch = "aarch64")]
unsafe fn analyze_source_file_neon(
    src: &str,
    lines: &mut Vec<TextSize>,
    multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
) {
    use std::arch::aarch64::*;

    const CHUNK_SIZE: usize = 16;

    let src_bytes = src.as_bytes();
    let chunk_count = src.len() / CHUNK_SIZE;

    let newline = vdupq_n_s8(b'\n' as i8);

    // This variable keeps track of where we should start decoding a
    // chunk. If a multi-byte character spans across chunk boundaries,
    // we need to skip that part in the next chunk because we already
    // handled it.
    let mut intra_chunk_offset: usize = 0;

    for (chunk_index, chunk_bytes) in src_bytes.chunks_exact(CHUNK_SIZE).enumerate() {
        let chunk_start = chunk_index * CHUNK_SIZE;

        // SAFETY (load): `chunks_exact` yields slices of exactly CHUNK_SIZE
        // (16) bytes, so the 16-byte vector load stays inside `src`.
        let chunk = vld1q_s8(chunk_bytes.as_ptr().cast::<i8>());

        // For each byte in the chunk, see if its value is < 0 when read as
        // an i8, which indicates that it's part of a multi-byte UTF-8 char.
        let multibyte_test = vcltzq_s8(chunk);
        // Create a bit mask (4 bits per byte) from the comparison results.
        let multibyte_mask = move_mask(multibyte_test);

        // If the bit mask is all zero, we only have ASCII chars here:
        if multibyte_mask == 0 {
            assert_eq!(intra_chunk_offset, 0);

            // Check for newlines in the chunk.
            let newlines_test = vceqq_s8(chunk, newline);
            let mut newlines_mask = move_mask(newlines_test);

            // A line starts one byte after its preceding newline, hence `+ 1`.
            let output_offset = TextSize::from((chunk_start + 1) as u32);

            // Each iteration consumes the lowest set nibble, i.e. the next
            // newline in the chunk.
            while newlines_mask != 0 {
                let trailing_zeros = newlines_mask.trailing_zeros();
                let index = trailing_zeros / 4;

                lines.push(TextSize::from(index) + output_offset);

                // Clear the current 4-bit group, so we can find the next one.
                newlines_mask &= (!0xF) << trailing_zeros;
            }
            continue;
        }

        // The chunk contains non-ASCII bytes: fall back to the generic
        // implementation for the part of this chunk not yet handled. Its
        // return value becomes the offset at which to start the next chunk.
        let scan_start = chunk_start + intra_chunk_offset;
        intra_chunk_offset = analyze_source_file_generic(
            &src[scan_start..],
            CHUNK_SIZE - intra_chunk_offset,
            TextSize::from(scan_start as u32),
            lines,
            multi_byte_chars,
        );
    }

    // Process whatever is left after the last full chunk (fewer than
    // CHUNK_SIZE bytes) with the generic implementation.
    let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
    if tail_start < src.len() {
        analyze_source_file_generic(
            &src[tail_start..],
            src.len() - tail_start,
            TextSize::from(tail_start as u32),
            lines,
            multi_byte_chars,
        );
    }
}
435
+
436
+ #[ cfg( not( any( target_arch = "x86" , target_arch = "x86_64" , target_arch = "aarch64" ) ) ) ]
326
437
// The target (or compiler version) does not support SSE2 ...
327
438
fn analyze_source_file_dispatch (
328
439
src : & str ,
0 commit comments