Skip to content

Added Ascii encoding, some cleanups #5980

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Apr 23, 2013
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions src/libcore/char.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2012 The Rust Project Developers. See the COPYRIGHT
// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
Expand Down Expand Up @@ -234,6 +234,21 @@ pub fn escape_default(c: char) -> ~str {
}
}

/// Returns the number of bytes this character would need if encoded in UTF-8.
///
/// The result depends only on the numeric value of `c`: values below 128
/// need 1 byte, below 2048 need 2 bytes, below 65536 need 3 bytes, and
/// below 2097152 need 4 bytes.
///
/// # Failure
///
/// Fails with "invalid character!" if the numeric value of `c` is
/// 2097152 or greater.
pub fn len_utf8_bytes(c: char) -> uint {
// Exclusive upper bounds on the code value for each encoded length.
static max_one_b: uint = 128u;
static max_two_b: uint = 2048u;
static max_three_b: uint = 65536u;
static max_four_b: uint = 2097152u;

let code = c as uint;
// Checked in ascending order, so the first bound that exceeds `code` wins.
if code < max_one_b { 1u }
else if code < max_two_b { 2u }
else if code < max_three_b { 3u }
else if code < max_four_b { 4u }
else { fail!(~"invalid character!") }
}

/**
* Compare two chars
*
Expand Down Expand Up @@ -334,7 +349,6 @@ fn test_escape_default() {
assert_eq!(escape_default('\U0001d4b6'), ~"\\U0001d4b6");
}


#[test]
fn test_escape_unicode() {
assert_eq!(escape_unicode('\x00'), ~"\\x00");
Expand Down
7 changes: 6 additions & 1 deletion src/libcore/prelude.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,14 @@ pub use path::Path;
pub use path::PosixPath;
pub use path::WindowsPath;
pub use ptr::Ptr;
// NOTE: Remove markers after snapshot
#[cfg(stage1)]
#[cfg(stage2)]
#[cfg(stage3)]
pub use str::{Ascii, AsciiCast, OwnedAsciiCast, ToStrAscii};
pub use str::{StrSlice, OwnedStr};
pub use to_bytes::IterBytes;
pub use to_str::ToStr;
pub use to_str::{ToStr, ToStrConsume};
pub use tuple::{CopyableTuple, ImmutableTuple, ExtendedTupleOps};
pub use vec::{CopyableVector, ImmutableVector};
pub use vec::{ImmutableEqVector, ImmutableCopyableVector};
Expand Down
66 changes: 38 additions & 28 deletions src/libcore/str.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2012 The Rust Project Developers. See the COPYRIGHT
// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
Expand All @@ -17,6 +17,12 @@
* some heavy-duty uses, try std::rope.
*/

// NOTE: Remove markers after snapshot
#[cfg(stage1)]
#[cfg(stage2)]
#[cfg(stage3)]
pub use self::ascii::{Ascii, AsciiCast, OwnedAsciiCast, ToStrAscii};

use at_vec;
use cast;
use char;
Expand All @@ -34,6 +40,13 @@ use to_str::ToStr;

#[cfg(notest)] use cmp::{Eq, Ord, Equiv, TotalEq};

// NOTE: Remove markers after snapshot
#[cfg(stage1)]
#[cfg(stage2)]
#[cfg(stage3)]
#[path = "str/ascii.rs"]
mod ascii;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this should be a sub-module of str, since it's only related to it in the same way it is to [u8] (conversion). Perhaps a top-level module in libstd?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, could put it in a separate top-level module; it's globally imported in the prelude anyway. I'd be in favor of having at least ascii in core, though (other byte encodings, codepages, etc. I'd put in std).


/*
Section: Creating a string
*/
Expand Down Expand Up @@ -789,16 +802,18 @@ pub fn each_split_within<'a>(ss: &'a str,

/// Convert a string to lowercase. ASCII only
pub fn to_lower(s: &str) -> ~str {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These should just be removed since they're possible already by converting to [Ascii] - libc isn't actually ASCII-only, it's platform-dependent (the ones from glibc are definitely locale-aware).

Functions named to_lower and to_upper in str definitely should have full Unicode support.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, wanted to make sure the PR gets in first before starting to remove those functions, because that will surely touch some more code.

map(s,
|c| unsafe{(libc::tolower(c as libc::c_char)) as char}
)
do map(s) |c| {
assert!(char::is_ascii(c));
(unsafe{libc::tolower(c as libc::c_char)}) as char
}
}

/// Convert a string to uppercase. ASCII only
pub fn to_upper(s: &str) -> ~str {
map(s,
|c| unsafe{(libc::toupper(c as libc::c_char)) as char}
)
do map(s) |c| {
assert!(char::is_ascii(c));
(unsafe{libc::toupper(c as libc::c_char)}) as char
}
}

/**
Expand Down Expand Up @@ -2317,20 +2332,20 @@ pub mod raw {
}

/// Removes the last byte from a string and returns it. (Not UTF-8 safe).
pub fn pop_byte(s: &mut ~str) -> u8 {
pub unsafe fn pop_byte(s: &mut ~str) -> u8 {
let len = len(*s);
assert!((len > 0u));
let b = s[len - 1u];
unsafe { set_len(s, len - 1u) };
set_len(s, len - 1u);
return b;
}

/// Removes the first byte from a string and returns it. (Not UTF-8 safe).
pub fn shift_byte(s: &mut ~str) -> u8 {
pub unsafe fn shift_byte(s: &mut ~str) -> u8 {
let len = len(*s);
assert!((len > 0u));
let b = s[0];
*s = unsafe { raw::slice_bytes_owned(*s, 1u, len) };
*s = raw::slice_bytes_owned(*s, 1u, len);
return b;
}

Expand Down Expand Up @@ -3096,12 +3111,11 @@ mod tests {

#[test]
fn test_to_lower() {
unsafe {
assert!(~"" == map(~"",
|c| libc::tolower(c as c_char) as char));
assert!(~"ymca" == map(~"YMCA",
|c| libc::tolower(c as c_char) as char));
}
// libc::tolower, and hence str::to_lower
// are culturally insensitive: they only work for ASCII
// (see Issue #1347)
assert!(~"" == to_lower(""));
assert!(~"ymca" == to_lower("YMCA"));
}

#[test]
Expand Down Expand Up @@ -3346,15 +3360,15 @@ mod tests {
#[test]
fn test_shift_byte() {
let mut s = ~"ABC";
let b = raw::shift_byte(&mut s);
let b = unsafe{raw::shift_byte(&mut s)};
assert!((s == ~"BC"));
assert!((b == 65u8));
}

#[test]
fn test_pop_byte() {
let mut s = ~"ABC";
let b = raw::pop_byte(&mut s);
let b = unsafe{raw::pop_byte(&mut s)};
assert!((s == ~"AB"));
assert!((b == 67u8));
}
Expand Down Expand Up @@ -3666,12 +3680,8 @@ mod tests {

#[test]
fn test_map() {
unsafe {
assert!(~"" == map(~"", |c|
libc::toupper(c as c_char) as char));
assert!(~"YMCA" == map(~"ymca",
|c| libc::toupper(c as c_char) as char));
}
assert!(~"" == map(~"", |c| unsafe {libc::toupper(c as c_char)} as char));
assert!(~"YMCA" == map(~"ymca", |c| unsafe {libc::toupper(c as c_char)} as char));
}

#[test]
Expand All @@ -3685,11 +3695,11 @@ mod tests {

#[test]
fn test_any() {
assert!(false == any(~"", char::is_uppercase));
assert!(false == any(~"", char::is_uppercase));
assert!(false == any(~"ymca", char::is_uppercase));
assert!(true == any(~"YMCA", char::is_uppercase));
assert!(true == any(~"yMCA", char::is_uppercase));
assert!(true == any(~"Ymcy", char::is_uppercase));
assert!(true == any(~"yMCA", char::is_uppercase));
assert!(true == any(~"Ymcy", char::is_uppercase));
}

#[test]
Expand Down
Loading