Skip to content

Commit c20e3fc

Browse files
committed
Document Unicode complications in chars iterator
1 parent 04badd6 commit c20e3fc

File tree

1 file changed

+71
-47
lines changed

1 file changed

+71
-47
lines changed

src/libcollections/str.rs

+71-47
Original file line numberDiff line numberDiff line change
@@ -500,7 +500,7 @@ impl str {
500500
///
501501
/// # Unsafety
502502
///
503-
/// Caller must check both UTF-8 character boundaries and the boundaries
503+
/// Caller must check both UTF-8 sequence boundaries and the boundaries
504504
/// of the entire slice as
505505
/// well.
506506
///
@@ -526,15 +526,16 @@ impl str {
526526
core_str::StrExt::slice_mut_unchecked(self, begin, end)
527527
}
528528

529-
/// Returns a slice of the string from the character range [`begin`..`end`).
529+
/// Returns a slice of the string from the range [`begin`..`end`) where indices
530+
/// are counted in code points.
530531
///
531532
/// That is, start at the `begin`-th code point of the string and continue
532533
/// to the `end`-th code point. This does not detect or handle edge cases
533-
/// such as leaving a combining character as the first code point of the
534+
/// such as leaving a combining character as the first `char` of the
534535
/// string.
535536
///
536537
/// Due to the design of UTF-8, this operation is `O(end)`. Use slicing
537-
/// syntax if you want to use byte indices rather than codepoint indices.
538+
/// syntax if you want to use `O(1)` byte indices instead.
538539
///
539540
/// # Panics
540541
///
@@ -556,26 +557,26 @@ impl str {
556557
core_str::StrExt::slice_chars(self, begin, end)
557558
}
558559

559-
/// Given a byte position, return the next char and its index.
560+
/// Given a byte position, return the next code point and its index.
560561
///
561-
/// This can be used to iterate over the Unicode characters of a string.
562+
/// This can be used to iterate over the Unicode code points of a string.
562563
///
563564
/// # Panics
564565
///
565566
/// If `i` is greater than or equal to the length of the string.
566-
/// If `i` is not the index of the beginning of a valid UTF-8 character.
567+
/// If `i` is not the index of the beginning of a valid UTF-8 sequence.
567568
///
568569
/// # Examples
569570
///
570-
/// This example manually iterates through the characters of a string;
571+
/// This example manually iterates through the code points of a string;
571572
/// this should normally be
572573
/// done by `.chars()` or `.char_indices()`.
573574
///
574575
/// ```
575576
/// # #![feature(str_char, core)]
576577
/// use std::str::CharRange;
577578
///
578-
/// let s = "中华Việt Nam";
579+
/// let s = "中华Việt Nam";
579580
/// let mut i = 0;
580581
/// while i < s.len() {
581582
/// let CharRange {ch, next} = s.char_range_at(i);
@@ -591,12 +592,14 @@ impl str {
591592
/// 3: 华
592593
/// 6: V
593594
/// 7: i
594-
/// 8: ệ
595-
/// 11: t
596-
/// 12:
597-
/// 13: N
598-
/// 14: a
599-
/// 15: m
595+
/// 8: e
596+
/// 9: ̣
597+
/// 11: ̂
598+
/// 13: t
599+
/// 14:
600+
/// 15: N
601+
/// 16: a
602+
/// 17: m
600603
/// ```
601604
#[unstable(feature = "str_char",
602605
reason = "often replaced by char_indices, this method may \
@@ -608,26 +611,29 @@ impl str {
608611

609612
/// Given a byte position, return the previous `char` and its position.
610613
///
611-
/// This function can be used to iterate over a Unicode string in reverse.
614+
/// This function can be used to iterate over the Unicode code points of a string in reverse.
615+
///
616+
/// Note that Unicode has many features, such as combining marks, ligatures,
617+
/// and direction marks, that need to be taken into account to correctly reverse a string.
612618
///
613619
/// Returns 0 for next index if called on start index 0.
614620
///
615621
/// # Panics
616622
///
617623
/// If `i` is greater than the length of the string.
618-
/// If `i` is not an index following a valid UTF-8 character.
624+
/// If `i` is not an index following a valid UTF-8 sequence.
619625
///
620626
/// # Examples
621627
///
622-
/// This example manually iterates through the characters of a string;
628+
/// This example manually iterates through the code points of a string;
623629
/// this should normally be
624630
/// done by `.chars().rev()` or `.char_indices()`.
625631
///
626632
/// ```
627633
/// # #![feature(str_char, core)]
628634
/// use std::str::CharRange;
629635
///
630-
/// let s = "中华Việt Nam";
636+
/// let s = "中华Việt Nam";
631637
/// let mut i = s.len();
632638
/// while i > 0 {
633639
/// let CharRange {ch, next} = s.char_range_at_reverse(i);
@@ -639,12 +645,14 @@ impl str {
639645
/// This outputs:
640646
///
641647
/// ```text
642-
/// 16: m
643-
/// 15: a
644-
/// 14: N
645-
/// 13:
646-
/// 12: t
647-
/// 11: ệ
648+
/// 18: m
649+
/// 17: a
650+
/// 16: N
651+
/// 15:
652+
/// 14: t
653+
/// 13: ̂
654+
/// 11: ̣
655+
/// 9: e
648656
/// 8: i
649657
/// 7: V
650658
/// 6: 华
@@ -663,7 +671,7 @@ impl str {
663671
/// # Panics
664672
///
665673
/// If `i` is greater than or equal to the length of the string.
666-
/// If `i` is not the index of the beginning of a valid UTF-8 character.
674+
/// If `i` is not the index of the beginning of a valid UTF-8 sequence.
667675
///
668676
/// # Examples
669677
///
@@ -672,6 +680,7 @@ impl str {
672680
/// let s = "abπc";
673681
/// assert_eq!(s.char_at(1), 'b');
674682
/// assert_eq!(s.char_at(2), 'π');
683+
/// assert_eq!(s.char_at(4), 'c');
675684
/// ```
676685
#[unstable(feature = "str_char",
677686
reason = "frequently replaced by the chars() iterator, this \
@@ -689,7 +698,7 @@ impl str {
689698
/// # Panics
690699
///
691700
/// If `i` is greater than the length of the string.
692-
/// If `i` is not an index following a valid UTF-8 character.
701+
/// If `i` is not an index following a valid UTF-8 sequence.
693702
///
694703
/// # Examples
695704
///
@@ -698,6 +707,7 @@ impl str {
698707
/// let s = "abπc";
699708
/// assert_eq!(s.char_at_reverse(1), 'a');
700709
/// assert_eq!(s.char_at_reverse(2), 'b');
710+
/// assert_eq!(s.char_at_reverse(3), 'π');
701711
/// ```
702712
#[unstable(feature = "str_char",
703713
reason = "see char_at for more details, but reverse semantics \
@@ -707,28 +717,30 @@ impl str {
707717
core_str::StrExt::char_at_reverse(self, i)
708718
}
709719

710-
/// Retrieves the first character from a `&str` and returns it.
720+
/// Retrieves the first code point from a `&str` and returns it.
721+
///
722+
/// Note that a single Unicode character (grapheme cluster)
723+
/// can be composed of multiple `char`s.
711724
///
712725
/// This does not allocate a new string; instead, it returns a slice that
713-
/// points one character
714-
/// beyond the character that was shifted.
726+
/// points one code point beyond the code point that was shifted.
715727
///
716-
/// If the slice does not contain any characters, None is returned instead.
728+
/// `None` is returned if the slice is empty.
717729
///
718730
/// # Examples
719731
///
720732
/// ```
721733
/// # #![feature(str_char)]
722-
/// let s = "Löwe 老虎 Léopard";
734+
/// let s = "Łódź"; // \u{141}o\u{301}dz\u{301}
723735
/// let (c, s1) = s.slice_shift_char().unwrap();
724736
///
725-
/// assert_eq!(c, 'L');
726-
/// assert_eq!(s1, "öwe 老虎 Léopard");
737+
/// assert_eq!(c, 'Ł');
738+
/// assert_eq!(s1, "ódź");
727739
///
728740
/// let (c, s2) = s1.slice_shift_char().unwrap();
729741
///
730-
/// assert_eq!(c, 'ö');
731-
/// assert_eq!(s2, "we 老虎 Léopard");
742+
/// assert_eq!(c, 'o');
743+
/// assert_eq!(s2, "\u{301}dz\u{301}");
732744
/// ```
733745
#[unstable(feature = "str_char",
734746
reason = "awaiting conventions about shifting and slices and \
@@ -741,14 +753,14 @@ impl str {
741753
/// Divide one string slice into two at an index.
742754
///
743755
/// The index `mid` is a byte offset from the start of the string
744-
/// that must be on a character boundary.
756+
/// that must be on a `char` boundary.
745757
///
746758
/// Return slices `&self[..mid]` and `&self[mid..]`.
747759
///
748760
/// # Panics
749761
///
750-
/// Panics if `mid` is beyond the last character of the string,
751-
/// or if it is not on a character boundary.
762+
/// Panics if `mid` is beyond the last code point of the string,
763+
/// or if it is not on a `char` boundary.
752764
///
753765
/// # Examples
754766
/// ```
@@ -773,27 +785,39 @@ impl str {
773785
core_str::StrExt::split_at_mut(self, mid)
774786
}
775787

776-
/// An iterator over the codepoints of `self`.
788+
/// An iterator over the code points of `self`.
789+
///
790+
/// In Unicode, the relationship between code points and characters is complex.
791+
/// A single character may be composed of multiple code points
792+
/// (e.g. diacritical marks added to a letter), and a single code point
793+
/// (e.g. a Hangul syllable) may contain multiple characters.
794+
///
795+
/// For iteration over human-readable characters, a grapheme cluster iterator
796+
/// may be more appropriate. See the [unicode-segmentation crate][1].
797+
///
798+
/// [1]: https://crates.io/crates/unicode-segmentation
777799
///
778800
/// # Examples
779801
///
780802
/// ```
781-
/// let v: Vec<char> = "abc åäö".chars().collect();
803+
/// let v: Vec<char> = "ASCII żółć 🇨🇭 한".chars().collect();
782804
///
783-
/// assert_eq!(v, ['a', 'b', 'c', ' ', 'å', 'ä', 'ö']);
805+
/// assert_eq!(v, ['A', 'S', 'C', 'I', 'I', ' ',
806+
/// 'z', '\u{307}', 'o', '\u{301}', 'ł', 'c', '\u{301}', ' ',
807+
/// '\u{1f1e8}', '\u{1f1ed}', ' ', '한']);
784808
/// ```
785809
#[stable(feature = "rust1", since = "1.0.0")]
786810
pub fn chars(&self) -> Chars {
787811
core_str::StrExt::chars(self)
788812
}
789813

790-
/// An iterator over the characters of `self` and their byte offsets.
814+
/// An iterator over the `char`s of `self` and their byte offsets.
791815
///
792816
/// # Examples
793817
///
794818
/// ```
795-
/// let v: Vec<(usize, char)> = "abc".char_indices().collect();
796-
/// let b = vec![(0, 'a'), (1, 'b'), (2, 'c')];
819+
/// let v: Vec<(usize, char)> = "A🇨🇭".char_indices().collect();
820+
/// let b = vec![(0, 'A'), (1, '\u{1f1e8}'), (5, '\u{1f1ed}')];
797821
///
798822
/// assert_eq!(v, b);
799823
/// ```
@@ -822,7 +846,7 @@ impl str {
822846
/// # Examples
823847
///
824848
/// ```
825-
/// let some_words = " Mary had\ta little \n\t lamb";
849+
/// let some_words = " Mary had\ta\u{2009}little \n\t lamb";
826850
/// let v: Vec<&str> = some_words.split_whitespace().collect();
827851
///
828852
/// assert_eq!(v, ["Mary", "had", "a", "little", "lamb"]);
@@ -840,7 +864,7 @@ impl str {
840864
/// ```
841865
/// # #![feature(str_words)]
842866
/// # #![allow(deprecated)]
843-
/// let some_words = " Mary had\ta little \n\t lamb";
867+
/// let some_words = " Mary had\ta\u{2009}little \n\t lamb";
844868
/// let v: Vec<&str> = some_words.words().collect();
845869
///
846870
/// assert_eq!(v, ["Mary", "had", "a", "little", "lamb"]);

0 commit comments

Comments
 (0)