Skip to content

Commit 9442d6b

Browse files
committed
regex-debug: add utf8-ranges sub-command
This sub-command prints out the UTF-8 alternation machine for an arbitrary character class.
1 parent 453198d commit 9442d6b

File tree

2 files changed

+38
-0
lines changed

2 files changed

+38
-0
lines changed

regex-debug/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,4 @@ regex-syntax = { version = "0.4.0", path = "../regex-syntax" }
1717
regex-syntax2 = { version = "0.5.0", path = "../regex-syntax-2" }
1818
serde = "1"
1919
serde_derive = "1"
20+
utf8-ranges = "1"

regex-debug/src/main.rs

+37
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ extern crate regex_syntax2 as syntax;
44
extern crate serde;
55
#[macro_use]
66
extern crate serde_derive;
7+
extern crate utf8_ranges;
78

89
use std::error;
910
use std::io::{self, Write};
@@ -24,6 +25,7 @@ Usage:
2425
regex-debug [options] anchors <pattern>
2526
regex-debug [options] captures <pattern>
2627
regex-debug [options] compile <patterns> ...
28+
regex-debug [options] utf8-ranges <class>
2729
regex-debug --help
2830
2931
Options:
@@ -59,9 +61,11 @@ struct Args {
5961
cmd_anchors: bool,
6062
cmd_captures: bool,
6163
cmd_compile: bool,
64+
cmd_utf8_ranges: bool,
6265

6366
arg_pattern: String,
6467
arg_patterns: Vec<String>,
68+
arg_class: String,
6569

6670
flag_size_limit: usize,
6771
flag_bytes: bool,
@@ -108,6 +112,8 @@ fn run(args: &Args) -> Result<()> {
108112
cmd_captures(args)
109113
} else if args.cmd_compile {
110114
cmd_compile(args)
115+
} else if args.cmd_utf8_ranges {
116+
cmd_utf8_ranges(args)
111117
} else {
112118
unreachable!()
113119
}
@@ -202,6 +208,37 @@ fn cmd_compile(args: &Args) -> Result<()> {
202208
Ok(())
203209
}
204210

211+
fn cmd_utf8_ranges(args: &Args) -> Result<()> {
212+
use syntax::ParserBuilder;
213+
use syntax::hir::{self, HirKind};
214+
use utf8_ranges::Utf8Sequences;
215+
216+
let hir = try!(ParserBuilder::new()
217+
.build()
218+
.parse(&format!("[{}]", args.arg_class)));
219+
let cls = match hir.into_kind() {
220+
HirKind::Class(hir::Class::Unicode(cls)) => cls,
221+
_ => return Err(
222+
format!("unexpected HIR, expected Unicode class").into(),
223+
),
224+
};
225+
for (i, range) in cls.iter().enumerate() {
226+
if i > 0 {
227+
println!("----------------------------");
228+
}
229+
for seq in Utf8Sequences::new(range.start(), range.end()) {
230+
for (i, utf8_range) in seq.into_iter().enumerate() {
231+
if i > 0 {
232+
print!("|");
233+
}
234+
print!("[{:02X}-{:02X}]", utf8_range.start, utf8_range.end);
235+
}
236+
println!();
237+
}
238+
}
239+
Ok(())
240+
}
241+
205242
impl Args {
206243
fn parse_one(&self) -> Result<Hir> {
207244
parse(&self.arg_pattern)

0 commit comments

Comments
 (0)