-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathbuild.rs
35 lines (32 loc) · 1.2 KB
/
build.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
use std::env;
use std::fs::File;
use std::io::Read;
use std::path::PathBuf;
use bpe::byte_pair_encoding::{read_tiktoken, BytePairEncoding};
use serde::Serialize;
/// Build-script entry point: serializes every bundled tiktoken dictionary via
/// `serialize_tiktoken_bpe`, then emits the Cargo rerun directive.
fn main() {
    // The same hash factor is passed for both dictionaries.
    const HASH_FACTOR: u64 = 17846336922010275747;

    // (encoding name, dictionary bytes embedded at compile time), processed in order.
    let dictionaries: [(&str, &[u8]); 2] = [
        ("cl100k_base", include_bytes!("data/cl100k_base.tiktoken.gz")),
        ("o200k_base", include_bytes!("data/o200k_base.tiktoken.gz")),
    ];
    for (name, data) in dictionaries {
        serialize_tiktoken_bpe(name, data, HASH_FACTOR);
    }

    // Ask Cargo to rerun this script when build.rs itself changes.
    println!("cargo::rerun-if-changed=build.rs");
}
/// Decompresses a gzip-compressed tiktoken dictionary, builds a [`BytePairEncoding`] from it and
/// writes the `rmp_serde`-serialized result to `$OUT_DIR/bpe_{name}.dict`.
///
/// * `name` - encoding name; only used to derive the output file name.
/// * `data` - gzip-compressed tiktoken dictionary bytes.
/// * `hash_factor` - forwarded to [`BytePairEncoding::from_dictionary`] as `Some(hash_factor)`.
///
/// # Panics
///
/// Panics (and thereby fails the build) if the data cannot be decompressed or parsed, if
/// `OUT_DIR` is not set, or if the output file cannot be created, written or flushed.
fn serialize_tiktoken_bpe(name: &str, data: &[u8], hash_factor: u64) {
    // Inflate the embedded gzip payload into the textual tiktoken format.
    let mut dec = flate2::read::GzDecoder::new(data);
    let mut tiktoken = String::new();
    dec.read_to_string(&mut tiktoken)
        .unwrap_or_else(|err| panic!("can decode data for {name}: {err}"));
    let tokens = read_tiktoken(&tiktoken).expect("can read data");

    let mut path = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is set during build"));
    path.push(format!("bpe_{name}.dict"));
    let file = File::create(&path)
        .unwrap_or_else(|err| panic!("can create output file {}: {err}", path.display()));

    // The serializer issues many small writes; buffer them so each one is not its own syscall.
    let mut writer = std::io::BufWriter::new(file);
    let bpe = BytePairEncoding::from_dictionary(tokens, Some(hash_factor));
    {
        // Scoped so the mutable borrow of `writer` ends before the explicit flush below.
        let mut serializer = rmp_serde::Serializer::new(&mut writer);
        bpe.serialize(&mut serializer)
            .expect("serialization succeeds");
    }
    // Dropping a `BufWriter` ignores flush errors, so flush explicitly to surface them.
    std::io::Write::flush(&mut writer)
        .unwrap_or_else(|err| panic!("can flush output file {}: {err}", path.display()));
}