|
| 1 | +#!/usr/bin/env python2 |
| 2 | + |
| 3 | +# Copyright 2014 The Rust Project Developers. See the COPYRIGHT |
| 4 | +# file at the top-level directory of this distribution and at |
| 5 | +# http://rust-lang.org/COPYRIGHT. |
| 6 | +# |
| 7 | +# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| 8 | +# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| 9 | +# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| 10 | +# option. This file may not be copied, modified, or distributed |
| 11 | +# except according to those terms. |
| 12 | + |
| 13 | +from __future__ import absolute_import, division, print_function |
| 14 | +import argparse |
| 15 | +from collections import defaultdict |
| 16 | +import csv |
| 17 | +import datetime |
| 18 | +import urllib2 |
| 19 | + |
# Default download location and file names of the Unicode data files read by
# this script.
BASE_URL = 'http://www.unicode.org/Public/6.3.0/ucd/'
DATA = 'UnicodeData.txt'
SCRIPTS = 'Scripts.txt'

# Maps each two-letter general category to the grouped categories that
# contain it. Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    # 'No' belongs to the grouped category 'N', like 'Nd' and 'Nl'.
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
| 37 | + |
| 38 | + |
def as_4byte_uni(n):
    """Format the integer ``n`` as an eight-digit Unicode escape.

    The result is a backslash, the letter ``U`` and ``n`` in lowercase
    hexadecimal, zero-padded to at least eight digits.
    """
    # '%08x' pads in one step and, unlike slicing hex(n), never carries the
    # 'L' suffix that Python 2 appends to long values.
    return '\\U%08x' % n
| 42 | + |
| 43 | + |
def expand_cat(c):
    """Return the grouped categories listed for ``c``, followed by ``c``.

    A category with no entry in ``expanded_categories`` yields ``[c]``.
    """
    # Copy so the shared table entry is never mutated.
    expanded = list(expanded_categories.get(c, ()))
    expanded.append(c)
    return expanded
| 46 | + |
| 47 | + |
def is_valid_unicode(n):
    """Report whether ``n`` lies in 0..0xD7FF or 0xE000..0x10FFFF."""
    if 0 <= n <= 0xD7FF:
        return True
    return 0xE000 <= n <= 0x10FFFF
| 50 | + |
| 51 | + |
def read_cats(f):
    """Collect code points per general category from UnicodeData-style rows.

    ``f`` is an iterable of ';'-separated lines whose first field is a
    hexadecimal code point, second field a name and third field a general
    category. Returns a ``defaultdict(list)`` mapping every category produced
    by ``expand_cat`` to the code points assigned to it; code points rejected
    by ``is_valid_unicode`` are skipped.

    A row whose name ends in ', First>' followed by a row whose name ends in
    ', Last>' denotes a whole range, so every code point between the two is
    recorded under the category of the closing row. A ', First>' row that is
    not followed by a ', Last>' row contributes nothing.
    """
    assigned = defaultdict(list)
    pending_first = None  # start of a range opened by a "<..., First>" row
    for row in csv.reader(f, delimiter=';'):
        if not row:
            # csv.reader yields an empty list for blank lines.
            continue
        code_point, name = int(row[0], 16), row[1]
        if name.endswith(', First>'):
            pending_first = code_point
            continue
        if pending_first is not None and name.endswith(', Last>'):
            first = pending_first
        else:
            first = code_point
        pending_first = None
        cats = expand_cat(row[2])
        for cp in range(first, code_point + 1):
            if not is_valid_unicode(cp):
                continue
            for cat in cats:
                assigned[cat].append(cp)
    return assigned
| 61 | + |
| 62 | + |
def read_scripts(f):
    """Collect code points per script name from Scripts.txt-style lines.

    ``f`` is an iterable of text lines. Blank lines and lines holding only a
    '#' comment are ignored. Every other line starts with two ';'-separated
    fields: a hexadecimal code point or an inclusive 'XXXX..YYYY' range, and
    a script name; anything after '#' is discarded. Returns a
    ``defaultdict(list)`` mapping each script name to its code points, leaving
    out those rejected by ``is_valid_unicode``.
    """
    assigned = defaultdict(list)
    for line in f:
        # Strip the comment first so data lines without one are accepted too.
        line = line.split('#', 1)[0].strip()
        if not line:
            continue
        hexes, name = [field.strip() for field in line.split(';')][:2]
        # A single code point is treated as a range of length one.
        first, _, last = hexes.partition('..')
        start = int(first, 16)
        end = int(last, 16) if last else start
        for code_point in range(start, end + 1):
            if is_valid_unicode(code_point):
                assigned[name].append(code_point)
    return assigned
| 81 | + |
| 82 | + |
def group(letters):
    """Collapse integers into sorted, inclusive ``(start, end)`` ranges.

    Duplicates are dropped and the input order does not matter. Consecutive
    values are merged into a single range. An empty input yields an empty
    list.
    """
    ordered = sorted(set(letters))
    if not ordered:
        return []
    grouped = []
    cur_start = cur_end = ordered[0]
    # The values are strictly increasing here, so each one either extends the
    # current range by exactly one or opens a new range.
    for letter in ordered[1:]:
        if letter == cur_end + 1:
            cur_end = letter
        else:
            grouped.append((cur_start, cur_end))
            cur_start = cur_end = letter
    grouped.append((cur_start, cur_end))
    return grouped
| 99 | + |
| 100 | + |
def ranges_to_rust(rs):
    """Render ``(start, end)`` pairs as a comma-separated list of tuples.

    Each bound is formatted with ``as_4byte_uni`` and wrapped in single
    quotes; the tuples are joined with a comma and a newline.
    """
    pairs = []
    for start, end in rs:
        pairs.append("('%s', '%s')" % (as_4byte_uni(start), as_4byte_uni(end)))
    return ',\n '.join(pairs)
| 104 | + |
| 105 | + |
def groups_to_rust(groups):
    """Render a name -> ranges mapping as one table entry per name.

    Entries are emitted in sorted name order, one per line, with the ranges
    of each name formatted by ``ranges_to_rust``.
    """
    entry = '("%s", &[\n %s\n ]),'
    return '\n'.join(entry % (name, ranges_to_rust(groups[name]))
                     for name in sorted(groups))
| 112 | + |
| 113 | + |
if __name__ == '__main__':
    # Script entry point: read the Unicode data files (from the CWD or over
    # HTTP), build the range tables and print the generated Rust source to
    # stdout.

    # Imported here because only the script entry point needs it.
    from contextlib import closing

    parser = argparse.ArgumentParser(
        description='Generate Unicode character class tables.')
    aa = parser.add_argument
    aa('--local', action='store_true',
       help='When set, Scripts.txt and UnicodeData.txt will be read from '
            'the CWD.')
    aa('--base-url', type=str, default=BASE_URL,
       help='The base URL to use for downloading Unicode data files.')
    args = parser.parse_args()

    # Every input is closed as soon as it has been parsed.
    if args.local:
        with open(DATA) as f:
            cats = read_cats(f)
        with open(SCRIPTS) as f:
            scripts = read_scripts(f)
    else:
        # Normalize the trailing slash so a base URL that already ends in '/'
        # (as the default does) does not produce '//' in the request path.
        base_url = args.base_url.rstrip('/') + '/'
        with closing(urllib2.urlopen(base_url + DATA)) as f:
            cats = read_cats(f)
        with closing(urllib2.urlopen(base_url + SCRIPTS)) as f:
            scripts = read_scripts(f)

    # Get Rust code for all Unicode general categories and scripts.
    combined = dict(cats, **scripts)
    unigroups = groups_to_rust({k: group(letters)
                                for k, letters in combined.items()})

    # Now get Perl character classes that are Unicode friendly.
    # list(...) keeps the concatenations below valid whether range() returns
    # a list or a lazy sequence.
    perld = list(range(ord('0'), ord('9') + 1))
    dgroups = ranges_to_rust(group(perld + cats['Nd']))

    perls = [ord(ch) for ch in ['\t', '\n', '\x0C', '\r', ' ']]
    sgroups = ranges_to_rust(group(perls + cats['Z']))

    low = list(range(ord('a'), ord('z') + 1))
    up = list(range(ord('A'), ord('Z') + 1))
    perlw = [ord('_')] + perld + low + up
    wgroups = ranges_to_rust(group(perlw + cats['L']))

    tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// DO NOT EDIT. Automatically generated by 'src/etc/regexp-unicode-tables'
// on {date}.

use parse::{{Class, NamedClasses}};

pub static UNICODE_CLASSES: NamedClasses = &[

{groups}

];

pub static PERLD: Class = &[
 {dgroups}
];

pub static PERLS: Class = &[
 {sgroups}
];

pub static PERLW: Class = &[
 {wgroups}
];
'''
    now = datetime.datetime.now()
    print(tpl.format(date=str(now), groups=unigroups,
                     dgroups=dgroups, sgroups=sgroups, wgroups=wgroups))
0 commit comments