@@ -500,7 +500,7 @@ impl str {
500
500
///
501
501
/// # Unsafety
502
502
///
503
- /// Caller must check both UTF-8 character boundaries and the boundaries
503
+ /// Caller must check both UTF-8 sequence boundaries and the boundaries
504
504
/// of the entire slice as
505
505
/// well.
506
506
///
@@ -526,15 +526,16 @@ impl str {
526
526
core_str:: StrExt :: slice_mut_unchecked ( self , begin, end)
527
527
}
528
528
529
- /// Returns a slice of the string from the character range [`begin`..`end`).
529
+ /// Returns a slice of the string from the range [`begin`..`end`) where indices
530
+ /// are counted in code points.
530
531
///
531
532
/// That is, start at the `begin`-th code point of the string and continue
532
533
/// to the `end`-th code point. This does not detect or handle edge cases
533
- /// such as leaving a combining character as the first code point of the
534
+ /// such as leaving a combining character as the first `char` of the
534
535
/// string.
535
536
///
536
537
/// Due to the design of UTF-8, this operation is `O(end)`. Use slicing
537
- /// syntax if you want to use byte indices rather than codepoint indices .
538
+ /// syntax if you want to use `O(1)` byte indices instead.
538
539
///
539
540
/// # Panics
540
541
///
@@ -556,26 +557,26 @@ impl str {
556
557
core_str:: StrExt :: slice_chars ( self , begin, end)
557
558
}
558
559
559
- /// Given a byte position, return the next char and its index.
560
+ /// Given a byte position, return the next code point and its index.
560
561
///
561
- /// This can be used to iterate over the Unicode characters of a string.
562
+ /// This can be used to iterate over the Unicode code points of a string.
562
563
///
563
564
/// # Panics
564
565
///
565
566
/// If `i` is greater than or equal to the length of the string.
566
- /// If `i` is not the index of the beginning of a valid UTF-8 character .
567
+ /// If `i` is not the index of the beginning of a valid UTF-8 sequence.
567
568
///
568
569
/// # Examples
569
570
///
570
- /// This example manually iterates through the characters of a string;
571
+ /// This example manually iterates through the code points of a string;
571
572
/// this should normally be
572
573
/// done by `.chars()` or `.char_indices()`.
573
574
///
574
575
/// ```
575
576
/// # #![feature(str_char, core)]
576
577
/// use std::str::CharRange;
577
578
///
578
- /// let s = "中华Việt Nam";
579
+ /// let s = "中华Việt Nam";
579
580
/// let mut i = 0;
580
581
/// while i < s.len() {
581
582
/// let CharRange {ch, next} = s.char_range_at(i);
@@ -591,12 +592,14 @@ impl str {
591
592
/// 3: 华
592
593
/// 6: V
593
594
/// 7: i
594
- /// 8: ệ
595
- /// 11: t
596
- /// 12:
597
- /// 13: N
598
- /// 14: a
599
- /// 15: m
595
+ /// 8: e
596
+ /// 9: ̣
597
+ /// 11: ̂
598
+ /// 13: t
599
+ /// 14:
600
+ /// 15: N
601
+ /// 16: a
602
+ /// 17: m
600
603
/// ```
601
604
#[ unstable( feature = "str_char" ,
602
605
reason = "often replaced by char_indices, this method may \
@@ -608,26 +611,29 @@ impl str {
608
611
609
612
/// Given a byte position, return the previous `char` and its position.
610
613
///
611
- /// This function can be used to iterate over a Unicode string in reverse.
614
+ /// This function can be used to iterate over Unicode code points in reverse.
615
+ ///
616
+ /// Note that Unicode has many features, such as combining marks, ligatures,
617
+ /// and direction marks, that need to be taken into account to correctly reverse a string.
612
618
///
613
619
/// Returns 0 for next index if called on start index 0.
614
620
///
615
621
/// # Panics
616
622
///
617
623
/// If `i` is greater than the length of the string.
618
- /// If `i` is not an index following a valid UTF-8 character .
624
+ /// If `i` is not an index following a valid UTF-8 sequence.
619
625
///
620
626
/// # Examples
621
627
///
622
- /// This example manually iterates through the characters of a string;
628
+ /// This example manually iterates through the code points of a string;
623
629
/// this should normally be
624
630
/// done by `.chars().rev()` or `.char_indices()`.
625
631
///
626
632
/// ```
627
633
/// # #![feature(str_char, core)]
628
634
/// use std::str::CharRange;
629
635
///
630
- /// let s = "中华Việt Nam";
636
+ /// let s = "中华Việt Nam";
631
637
/// let mut i = s.len();
632
638
/// while i > 0 {
633
639
/// let CharRange {ch, next} = s.char_range_at_reverse(i);
@@ -639,12 +645,14 @@ impl str {
639
645
/// This outputs:
640
646
///
641
647
/// ```text
642
- /// 16: m
643
- /// 15: a
644
- /// 14: N
645
- /// 13:
646
- /// 12: t
647
- /// 11: ệ
648
+ /// 18: m
649
+ /// 17: a
650
+ /// 16: N
651
+ /// 15:
652
+ /// 14: t
653
+ /// 13: ̂
654
+ /// 11: ̣
655
+ /// 9: e
648
656
/// 8: i
649
657
/// 7: V
650
658
/// 6: 华
@@ -663,7 +671,7 @@ impl str {
663
671
/// # Panics
664
672
///
665
673
/// If `i` is greater than or equal to the length of the string.
666
- /// If `i` is not the index of the beginning of a valid UTF-8 character .
674
+ /// If `i` is not the index of the beginning of a valid UTF-8 sequence.
667
675
///
668
676
/// # Examples
669
677
///
@@ -672,6 +680,7 @@ impl str {
672
680
/// let s = "abπc";
673
681
/// assert_eq!(s.char_at(1), 'b');
674
682
/// assert_eq!(s.char_at(2), 'π');
683
+ /// assert_eq!(s.char_at(4), 'c');
675
684
/// ```
676
685
#[ unstable( feature = "str_char" ,
677
686
reason = "frequently replaced by the chars() iterator, this \
@@ -689,7 +698,7 @@ impl str {
689
698
/// # Panics
690
699
///
691
700
/// If `i` is greater than the length of the string.
692
- /// If `i` is not an index following a valid UTF-8 character .
701
+ /// If `i` is not an index following a valid UTF-8 sequence.
693
702
///
694
703
/// # Examples
695
704
///
@@ -698,6 +707,7 @@ impl str {
698
707
/// let s = "abπc";
699
708
/// assert_eq!(s.char_at_reverse(1), 'a');
700
709
/// assert_eq!(s.char_at_reverse(2), 'b');
710
+ /// assert_eq!(s.char_at_reverse(3), 'π');
701
711
/// ```
702
712
#[ unstable( feature = "str_char" ,
703
713
reason = "see char_at for more details, but reverse semantics \
@@ -707,28 +717,30 @@ impl str {
707
717
core_str:: StrExt :: char_at_reverse ( self , i)
708
718
}
709
719
710
- /// Retrieves the first character from a `&str` and returns it.
720
+ /// Retrieves the first code point from a `&str` and returns it.
721
+ ///
722
+ /// Note that a single Unicode character (grapheme cluster)
723
+ /// can be composed of multiple `char`s.
711
724
///
712
725
/// This does not allocate a new string; instead, it returns a slice that
713
- /// points one character
714
- /// beyond the character that was shifted.
726
+ /// points one code point beyond the code point that was shifted.
715
727
///
716
- /// If the slice does not contain any characters, None is returned instead .
728
+ /// `None` is returned if the slice is empty.
717
729
///
718
730
/// # Examples
719
731
///
720
732
/// ```
721
733
/// # #![feature(str_char)]
722
- /// let s = "Löwe 老虎 Léopard";
734
+ /// let s = "Łódź"; // \u{141}o\u{301}dz\u{301}
723
735
/// let (c, s1) = s.slice_shift_char().unwrap();
724
736
///
725
- /// assert_eq!(c, 'L ');
726
- /// assert_eq!(s1, "öwe 老虎 Léopard ");
737
+ /// assert_eq!(c, 'Ł');
738
+ /// assert_eq!(s1, "o\u{301}dz\u{301}");
727
739
///
728
740
/// let (c, s2) = s1.slice_shift_char().unwrap();
729
741
///
730
- /// assert_eq!(c, 'ö ');
731
- /// assert_eq!(s2, "we 老虎 Léopard ");
742
+ /// assert_eq!(c, 'o');
743
+ /// assert_eq!(s2, "\u{301}dz\u{301}");
732
744
/// ```
733
745
#[ unstable( feature = "str_char" ,
734
746
reason = "awaiting conventions about shifting and slices and \
@@ -741,14 +753,14 @@ impl str {
741
753
/// Divide one string slice into two at an index.
742
754
///
743
755
/// The index `mid` is a byte offset from the start of the string
744
- /// that must be on a character boundary.
756
+ /// that must be on a `char` boundary.
745
757
///
746
758
/// Return slices `&self[..mid]` and `&self[mid..]`.
747
759
///
748
760
/// # Panics
749
761
///
750
- /// Panics if `mid` is beyond the last character of the string,
751
- /// or if it is not on a character boundary.
762
+ /// Panics if `mid` is beyond the last code point of the string,
763
+ /// or if it is not on a `char` boundary.
752
764
///
753
765
/// # Examples
754
766
/// ```
@@ -773,27 +785,39 @@ impl str {
773
785
core_str:: StrExt :: split_at_mut ( self , mid)
774
786
}
775
787
776
- /// An iterator over the codepoints of `self`.
788
+ /// An iterator over the code points of `self`.
789
+ ///
790
+ /// In Unicode, the relationship between code points and characters is complex.
791
+ /// A single character may be composed of multiple code points
792
+ /// (e.g. diacritical marks added to a letter), and a single code point
793
+ /// (e.g. Hangul syllable) may contain multiple characters.
794
+ ///
795
+ /// For iteration over human-readable characters a grapheme cluster iterator
796
+ /// may be more appropriate. See the [unicode-segmentation crate][1].
797
+ ///
798
+ /// [1]: https://crates.io/crates/unicode-segmentation
777
799
///
778
800
/// # Examples
779
801
///
780
802
/// ```
781
- /// let v: Vec<char> = "abc åäö ".chars().collect();
803
+ /// let v: Vec<char> = "ASCII żółć 🇨🇭 한 ".chars().collect();
782
804
///
783
- /// assert_eq!(v, ['a', 'b', 'c', ' ', 'å', 'ä', 'ö']);
805
+ /// assert_eq!(v, ['A', 'S', 'C', 'I', 'I', ' ',
806
+ /// 'z', '\u{307}', 'o', '\u{301}', 'ł', 'c', '\u{301}', ' ',
807
+ /// '\u{1f1e8}', '\u{1f1ed}', ' ', '한']);
784
808
/// ```
785
809
#[ stable( feature = "rust1" , since = "1.0.0" ) ]
786
810
pub fn chars ( & self ) -> Chars {
787
811
core_str:: StrExt :: chars ( self )
788
812
}
789
813
790
- /// An iterator over the characters of `self` and their byte offsets.
814
+ /// An iterator over the `char`s of `self` and their byte offsets.
791
815
///
792
816
/// # Examples
793
817
///
794
818
/// ```
795
- /// let v: Vec<(usize, char)> = "abc ".char_indices().collect();
796
- /// let b = vec![(0, 'a '), (1, 'b '), (2 , 'c ')];
819
+ /// let v: Vec<(usize, char)> = "A🇨🇭".char_indices().collect();
820
+ /// let b = vec![(0, 'A'), (1, '\u{1f1e8}'), (5, '\u{1f1ed}')];
797
821
///
798
822
/// assert_eq!(v, b);
799
823
/// ```
@@ -822,7 +846,7 @@ impl str {
822
846
/// # Examples
823
847
///
824
848
/// ```
825
- /// let some_words = " Mary had\ta little \n\t lamb";
849
+ /// let some_words = " Mary had\ta\u{2009} little \n\t lamb";
826
850
/// let v: Vec<&str> = some_words.split_whitespace().collect();
827
851
///
828
852
/// assert_eq!(v, ["Mary", "had", "a", "little", "lamb"]);
@@ -840,7 +864,7 @@ impl str {
840
864
/// ```
841
865
/// # #![feature(str_words)]
842
866
/// # #![allow(deprecated)]
843
- /// let some_words = " Mary had\ta little \n\t lamb";
867
+ /// let some_words = " Mary had\ta\u{2009} little \n\t lamb";
844
868
/// let v: Vec<&str> = some_words.words().collect();
845
869
///
846
870
/// assert_eq!(v, ["Mary", "had", "a", "little", "lamb"]);
0 commit comments