Skip to content

Commit 6232f95

Browse files
committed
Auto merge of #27012 - pornel:master, r=Gankro
Fixes #26689. This PR tries to clarify uses of "character" where it means "code point" or "UTF-8 sequence", which are almost, but not quite, the same. Edge cases added to some examples to demonstrate this. However, I've kept use of the term "code point" instead of "Unicode scalar value", because in UTF-8 they're the same, and "code point" is more widely known.
2 parents cf7e825 + c20e3fc commit 6232f95

File tree

1 file changed

+71
-47
lines changed

1 file changed

+71
-47
lines changed

src/libcollections/str.rs

+71-47
Original file line numberDiff line numberDiff line change
@@ -500,7 +500,7 @@ impl str {
500500
///
501501
/// # Unsafety
502502
///
503-
/// Caller must check both UTF-8 character boundaries and the boundaries
503+
/// Caller must check both UTF-8 sequence boundaries and the boundaries
504504
/// of the entire slice as
505505
/// well.
506506
///
@@ -526,15 +526,16 @@ impl str {
526526
core_str::StrExt::slice_mut_unchecked(self, begin, end)
527527
}
528528

529-
/// Returns a slice of the string from the character range [`begin`..`end`).
529+
/// Returns a slice of the string from the range [`begin`..`end`) where indices
530+
/// are counted in code points.
530531
///
531532
/// That is, start at the `begin`-th code point of the string and continue
532533
/// to the `end`-th code point. This does not detect or handle edge cases
533-
/// such as leaving a combining character as the first code point of the
534+
/// such as leaving a combining character as the first `char` of the
534535
/// string.
535536
///
536537
/// Due to the design of UTF-8, this operation is `O(end)`. Use slicing
537-
/// syntax if you want to use byte indices rather than codepoint indices.
538+
/// syntax if you want to use `O(1)` byte indices instead.
538539
///
539540
/// # Panics
540541
///
@@ -556,26 +557,26 @@ impl str {
556557
core_str::StrExt::slice_chars(self, begin, end)
557558
}
558559

559-
/// Given a byte position, return the next char and its index.
560+
/// Given a byte position, return the next code point and its index.
560561
///
561-
/// This can be used to iterate over the Unicode characters of a string.
562+
/// This can be used to iterate over the Unicode code points of a string.
562563
///
563564
/// # Panics
564565
///
565566
/// If `i` is greater than or equal to the length of the string.
566-
/// If `i` is not the index of the beginning of a valid UTF-8 character.
567+
/// If `i` is not the index of the beginning of a valid UTF-8 sequence.
567568
///
568569
/// # Examples
569570
///
570-
/// This example manually iterates through the characters of a string;
571+
/// This example manually iterates through the code points of a string;
571572
/// this should normally be
572573
/// done by `.chars()` or `.char_indices()`.
573574
///
574575
/// ```
575576
/// # #![feature(str_char, core)]
576577
/// use std::str::CharRange;
577578
///
578-
/// let s = "中华Việt Nam";
579+
/// let s = "中华Việt Nam";
579580
/// let mut i = 0;
580581
/// while i < s.len() {
581582
/// let CharRange {ch, next} = s.char_range_at(i);
@@ -591,12 +592,14 @@ impl str {
591592
/// 3: 华
592593
/// 6: V
593594
/// 7: i
594-
/// 8: ệ
595-
/// 11: t
596-
/// 12:
597-
/// 13: N
598-
/// 14: a
599-
/// 15: m
595+
/// 8: e
596+
/// 9: ̣
597+
/// 11: ̂
598+
/// 13: t
599+
/// 14:
600+
/// 15: N
601+
/// 16: a
602+
/// 17: m
600603
/// ```
601604
#[unstable(feature = "str_char",
602605
reason = "often replaced by char_indices, this method may \
@@ -608,26 +611,29 @@ impl str {
608611

609612
/// Given a byte position, return the previous `char` and its position.
610613
///
611-
/// This function can be used to iterate over a Unicode string in reverse.
614+
/// This function can be used to iterate over Unicode code points in reverse.
615+
///
616+
/// Note that Unicode has many features, such as combining marks, ligatures,
617+
/// and direction marks, that need to be taken into account to correctly reverse a string.
612618
///
613619
/// Returns 0 for next index if called on start index 0.
614620
///
615621
/// # Panics
616622
///
617623
/// If `i` is greater than the length of the string.
618-
/// If `i` is not an index following a valid UTF-8 character.
624+
/// If `i` is not an index following a valid UTF-8 sequence.
619625
///
620626
/// # Examples
621627
///
622-
/// This example manually iterates through the characters of a string;
628+
/// This example manually iterates through the code points of a string;
623629
/// this should normally be
624630
/// done by `.chars().rev()` or `.char_indices()`.
625631
///
626632
/// ```
627633
/// # #![feature(str_char, core)]
628634
/// use std::str::CharRange;
629635
///
630-
/// let s = "中华Việt Nam";
636+
/// let s = "中华Việt Nam";
631637
/// let mut i = s.len();
632638
/// while i > 0 {
633639
/// let CharRange {ch, next} = s.char_range_at_reverse(i);
@@ -639,12 +645,14 @@ impl str {
639645
/// This outputs:
640646
///
641647
/// ```text
642-
/// 16: m
643-
/// 15: a
644-
/// 14: N
645-
/// 13:
646-
/// 12: t
647-
/// 11: ệ
648+
/// 18: m
649+
/// 17: a
650+
/// 16: N
651+
/// 15:
652+
/// 14: t
653+
/// 13: ̂
654+
/// 11: ̣
655+
/// 9: e
648656
/// 8: i
649657
/// 7: V
650658
/// 6: 华
@@ -663,7 +671,7 @@ impl str {
663671
/// # Panics
664672
///
665673
/// If `i` is greater than or equal to the length of the string.
666-
/// If `i` is not the index of the beginning of a valid UTF-8 character.
674+
/// If `i` is not the index of the beginning of a valid UTF-8 sequence.
667675
///
668676
/// # Examples
669677
///
@@ -672,6 +680,7 @@ impl str {
672680
/// let s = "abπc";
673681
/// assert_eq!(s.char_at(1), 'b');
674682
/// assert_eq!(s.char_at(2), 'π');
683+
/// assert_eq!(s.char_at(4), 'c');
675684
/// ```
676685
#[unstable(feature = "str_char",
677686
reason = "frequently replaced by the chars() iterator, this \
@@ -689,7 +698,7 @@ impl str {
689698
/// # Panics
690699
///
691700
/// If `i` is greater than the length of the string.
692-
/// If `i` is not an index following a valid UTF-8 character.
701+
/// If `i` is not an index following a valid UTF-8 sequence.
693702
///
694703
/// # Examples
695704
///
@@ -698,6 +707,7 @@ impl str {
698707
/// let s = "abπc";
699708
/// assert_eq!(s.char_at_reverse(1), 'a');
700709
/// assert_eq!(s.char_at_reverse(2), 'b');
710+
/// assert_eq!(s.char_at_reverse(3), 'π');
701711
/// ```
702712
#[unstable(feature = "str_char",
703713
reason = "see char_at for more details, but reverse semantics \
@@ -707,28 +717,30 @@ impl str {
707717
core_str::StrExt::char_at_reverse(self, i)
708718
}
709719

710-
/// Retrieves the first character from a `&str` and returns it.
720+
/// Retrieves the first code point from a `&str` and returns it.
721+
///
722+
/// Note that a single Unicode character (grapheme cluster)
723+
/// can be composed of multiple `char`s.
711724
///
712725
/// This does not allocate a new string; instead, it returns a slice that
713-
/// points one character
714-
/// beyond the character that was shifted.
726+
/// points one code point beyond the code point that was shifted.
715727
///
716-
/// If the slice does not contain any characters, None is returned instead.
728+
/// `None` is returned if the slice is empty.
717729
///
718730
/// # Examples
719731
///
720732
/// ```
721733
/// # #![feature(str_char)]
722-
/// let s = "Löwe 老虎 Léopard";
734+
/// let s = "Łódź"; // \u{141}o\u{301}dz\u{301}
723735
/// let (c, s1) = s.slice_shift_char().unwrap();
724736
///
725-
/// assert_eq!(c, 'L');
726-
/// assert_eq!(s1, "öwe 老虎 Léopard");
737+
/// assert_eq!(c, 'Ł');
738+
/// assert_eq!(s1, "ódź");
727739
///
728740
/// let (c, s2) = s1.slice_shift_char().unwrap();
729741
///
730-
/// assert_eq!(c, 'ö');
731-
/// assert_eq!(s2, "we 老虎 Léopard");
742+
/// assert_eq!(c, 'o');
743+
/// assert_eq!(s2, "\u{301}dz\u{301}");
732744
/// ```
733745
#[unstable(feature = "str_char",
734746
reason = "awaiting conventions about shifting and slices and \
@@ -741,14 +753,14 @@ impl str {
741753
/// Divide one string slice into two at an index.
742754
///
743755
/// The index `mid` is a byte offset from the start of the string
744-
/// that must be on a character boundary.
756+
/// that must be on a `char` boundary.
745757
///
746758
/// Return slices `&self[..mid]` and `&self[mid..]`.
747759
///
748760
/// # Panics
749761
///
750-
/// Panics if `mid` is beyond the last character of the string,
751-
/// or if it is not on a character boundary.
762+
/// Panics if `mid` is beyond the last code point of the string,
763+
/// or if it is not on a `char` boundary.
752764
///
753765
/// # Examples
754766
/// ```
@@ -773,27 +785,39 @@ impl str {
773785
core_str::StrExt::split_at_mut(self, mid)
774786
}
775787

776-
/// An iterator over the codepoints of `self`.
788+
/// An iterator over the code points of `self`.
789+
///
790+
/// In Unicode, the relationship between code points and characters is complex.
791+
/// A single character may be composed of multiple code points
792+
/// (e.g. diacritical marks added to a letter), and a single code point
793+
/// (e.g. a Hangul syllable) may contain multiple characters.
794+
///
795+
/// For iteration over human-readable characters, a grapheme cluster iterator
796+
/// may be more appropriate. See the [unicode-segmentation crate][1].
797+
///
798+
/// [1]: https://crates.io/crates/unicode-segmentation
777799
///
778800
/// # Examples
779801
///
780802
/// ```
781-
/// let v: Vec<char> = "abc åäö".chars().collect();
803+
/// let v: Vec<char> = "ASCII żółć 🇨🇭 한".chars().collect();
782804
///
783-
/// assert_eq!(v, ['a', 'b', 'c', ' ', 'å', 'ä', 'ö']);
805+
/// assert_eq!(v, ['A', 'S', 'C', 'I', 'I', ' ',
806+
/// 'z', '\u{307}', 'o', '\u{301}', 'ł', 'c', '\u{301}', ' ',
807+
/// '\u{1f1e8}', '\u{1f1ed}', ' ', '한']);
784808
/// ```
785809
#[stable(feature = "rust1", since = "1.0.0")]
786810
pub fn chars(&self) -> Chars {
787811
core_str::StrExt::chars(self)
788812
}
789813

790-
/// An iterator over the characters of `self` and their byte offsets.
814+
/// An iterator over the `char`s of `self` and their byte offsets.
791815
///
792816
/// # Examples
793817
///
794818
/// ```
795-
/// let v: Vec<(usize, char)> = "abc".char_indices().collect();
796-
/// let b = vec![(0, 'a'), (1, 'b'), (2, 'c')];
819+
/// let v: Vec<(usize, char)> = "A🇨🇭".char_indices().collect();
820+
/// let b = vec![(0, 'A'), (1, '\u{1f1e8}'), (5, '\u{1f1ed}')];
797821
///
798822
/// assert_eq!(v, b);
799823
/// ```
@@ -822,7 +846,7 @@ impl str {
822846
/// # Examples
823847
///
824848
/// ```
825-
/// let some_words = " Mary had\ta little \n\t lamb";
849+
/// let some_words = " Mary had\ta\u{2009}little \n\t lamb";
826850
/// let v: Vec<&str> = some_words.split_whitespace().collect();
827851
///
828852
/// assert_eq!(v, ["Mary", "had", "a", "little", "lamb"]);
@@ -840,7 +864,7 @@ impl str {
840864
/// ```
841865
/// # #![feature(str_words)]
842866
/// # #![allow(deprecated)]
843-
/// let some_words = " Mary had\ta little \n\t lamb";
867+
/// let some_words = " Mary had\ta\u{2009}little \n\t lamb";
844868
/// let v: Vec<&str> = some_words.words().collect();
845869
///
846870
/// assert_eq!(v, ["Mary", "had", "a", "little", "lamb"]);

0 commit comments

Comments
 (0)