Commit be07807

Use weak symbol create_fallback_regex to separate the implementation using PCRE2 and std::regex
Differential Revision: D75173102
Pull Request resolved: #77
1 parent d261f58 commit be07807
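
The net effect of the change is that lookahead support is selected at link time rather than at compile time: src/regex.cpp always carries a weak definition of create_fallback_regex that just reports Error::RegexFailure, while the new regex_lookahead library carries a strong definition backed by PCRE2 and std::regex, so force-linking that library silently overrides the stub. A condensed sketch of the two definitions, assuming TK_WEAK expands to a weak-symbol attribute such as __attribute__((weak)) on GCC/Clang (the full versions appear in the diffs below):

// --- src/regex.cpp (always in the core tokenizers library): weak stub ---
#include <pytorch/tokenizers/regex.h>

namespace tokenizers {

// Declared TK_WEAK in regex.h; used only when nothing stronger is linked in.
Result<std::unique_ptr<IRegex>> create_fallback_regex(
    const std::string& pattern) {
  (void)pattern;
  return tokenizers::Error::RegexFailure;
}

} // namespace tokenizers

// --- src/regex_lookahead.cpp (only in the optional regex_lookahead
// --- library): strong definition with the same signature ---
#include <pytorch/tokenizers/pcre2_regex.h>
#include <pytorch/tokenizers/regex.h>
#include <pytorch/tokenizers/std_regex.h>

#include <memory>

namespace tokenizers {

// When this object file is force-loaded, the linker resolves
// create_fallback_regex here, and create_regex() gains PCRE2/std::regex
// fallback without recompiling the core library. (The full version below
// also logs and catches std::regex_error.)
Result<std::unique_ptr<IRegex>> create_fallback_regex(
    const std::string& pattern) {
  auto pcre2 = std::make_unique<Pcre2Regex>("(" + pattern + ")");
  if (pcre2->regex_ != nullptr && pcre2->match_data_ != nullptr) {
    return static_cast<std::unique_ptr<IRegex>>(std::move(pcre2));
  }
  return static_cast<std::unique_ptr<IRegex>>(
      std::make_unique<StdRegex>("(" + pattern + ")"));
}

} // namespace tokenizers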

File tree: 8 files changed (+176, −105 lines)

CMakeLists.txt

Lines changed: 27 additions & 31 deletions
@@ -19,8 +19,9 @@ project(Tokenizers)
 option(TOKENIZERS_BUILD_TEST "Build tests" OFF)
 option(TOKENIZERS_BUILD_TOOLS "Build tools" OFF)
 option(SUPPORT_REGEX_LOOKAHEAD
-       "Support regex lookahead patterns (requires PCRE2)" OFF)
+       "Support regex lookahead patterns (requires PCRE2)" OFF)
 
+include(Utils.cmake)
 # Ignore weak attribute warning
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes")
 
@@ -34,20 +35,6 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/abseil-cpp)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece)
 
-# Configure PCRE2
-if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
-  set(PCRE2_BUILD_PCRE2_8 ON)
-  set(PCRE2_BUILD_PCRE2_16 OFF)
-  set(PCRE2_BUILD_PCRE2_32 OFF)
-  set(PCRE2_BUILD_TESTS OFF)
-  set(PCRE2_BUILD_PCRE2GREP OFF)
-  set(PCRE2_BUILD_PCRE2TEST OFF)
-  set(PCRE2_BUILD_PCRE2GPERF OFF)
-  set(PCRE2_BUILD_DOCS OFF)
-  set(PCRE2_BUILD_LIBPCRE2_PDB OFF)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)
-endif()
-
 set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
 
 file(GLOB tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp)
@@ -60,14 +47,8 @@ set(tokenizers_source_files
     ${CMAKE_CURRENT_SOURCE_DIR}/src/regex.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/sentencepiece.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/tiktoken.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/src/token_decoder.cpp
-)
-if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
-  list(APPEND
-       tokenizers_source_files
-       ${CMAKE_CURRENT_SOURCE_DIR}/src/pcre2_regex.cpp
-       ${CMAKE_CURRENT_SOURCE_DIR}/src/std_regex.cpp)
-endif()
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/token_decoder.cpp)
+
 file(GLOB unicode_source_files
      ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp)
 add_library(tokenizers STATIC ${tokenizers_source_files}
@@ -85,11 +66,26 @@ target_include_directories(
 target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2)
 
 if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
-  target_include_directories(tokenizers
-                             PUBLIC
-                             ${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
-  target_link_libraries(tokenizers PUBLIC pcre2-8)
-  target_compile_definitions(tokenizers PUBLIC SUPPORT_REGEX_LOOKAHEAD)
+  set(PCRE2_BUILD_PCRE2_8 ON)
+  set(PCRE2_BUILD_PCRE2_16 OFF)
+  set(PCRE2_BUILD_PCRE2_32 OFF)
+  set(PCRE2_BUILD_TESTS OFF)
+  set(PCRE2_BUILD_PCRE2GREP OFF)
+  set(PCRE2_BUILD_PCRE2TEST OFF)
+  set(PCRE2_BUILD_PCRE2GPERF OFF)
+  set(PCRE2_BUILD_DOCS OFF)
+  set(PCRE2_BUILD_LIBPCRE2_PDB OFF)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)
+  add_library(
+    regex_lookahead STATIC
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/pcre2_regex.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/regex_lookahead.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/std_regex.cpp)
+  target_link_libraries(regex_lookahead PUBLIC pcre2-8)
+  target_include_directories(
+    regex_lookahead PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
+                           ${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
+  target_link_options_shared_lib(regex_lookahead)
 endif()
 
 # Build test
@@ -120,9 +116,9 @@ if(TOKENIZERS_BUILD_TEST)
     ${CMAKE_CURRENT_SOURCE_DIR}/include
     ${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
-    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
-    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
-  target_link_libraries(${test_name} gtest_main GTest::gmock tokenizers)
+    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include)
+  target_link_libraries(${test_name} gtest_main GTest::gmock tokenizers
+                        regex_lookahead)
   add_test(${test_name} "${test_name}")
   set_tests_properties(${test_name} PROPERTIES ENVIRONMENT ${test_env})
 endforeach()

Utils.cmake

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#
+# Build tokenizers.
+#
+# ### Editing this file ###
+#
+# This file should be formatted with
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+# It should also be cmake-lint clean.
+#
+
+# This is the funtion to use -Wl, --whole-archive to link static library NB:
+# target_link_options is broken for this case, it only append the interface link
+# options of the first library.
+function(kernel_link_options target_name)
+  # target_link_options(${target_name} INTERFACE
+  # "$<LINK_LIBRARY:WHOLE_ARCHIVE,target_name>")
+  target_link_options(
+    ${target_name} INTERFACE "SHELL:LINKER:--whole-archive \
+    $<TARGET_FILE:${target_name}> \
+    LINKER:--no-whole-archive")
+endfunction()
+
+# Same as kernel_link_options but it's for MacOS linker
+function(macos_kernel_link_options target_name)
+  target_link_options(${target_name} INTERFACE
+                      "SHELL:LINKER:-force_load,$<TARGET_FILE:${target_name}>")
+endfunction()
+
+# Same as kernel_link_options but it's for MSVC linker
+function(msvc_kernel_link_options target_name)
+  target_link_options(
+    ${target_name} INTERFACE
+    "SHELL:LINKER:/WHOLEARCHIVE:$<TARGET_FILE:${target_name}>")
+endfunction()
+
+# Ensure that the load-time constructor functions run. By default, the linker
+# would remove them since there are no other references to them.
+function(target_link_options_shared_lib target_name)
+  if(APPLE)
+    macos_kernel_link_options(${target_name})
+  elseif(MSVC)
+    msvc_kernel_link_options(${target_name})
+  else()
+    kernel_link_options(${target_name})
+  endif()
+endfunction()
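
These helpers are what make the weak/strong override actually take effect. Because create_fallback_regex is already weakly defined inside the core library, the linker never sees an unresolved reference that would pull the strong definition out of libregex_lookahead.a; without --whole-archive (or -force_load on macOS, /WHOLEARCHIVE on MSVC) the override object is silently skipped and the weak stub wins. A toy, self-contained illustration with hypothetical file and symbol names (GNU toolchain commands in the trailing comments):

// --- weak_default.cpp: analogue of src/regex.cpp, linked directly ---
#include <cstdio>

// Weak default: reports "no lookahead engine present" (cf. Error::RegexFailure).
extern "C" __attribute__((weak)) int fallback_compile(const char* pattern) {
  (void)pattern;
  return -1;
}

int main() {
  std::printf("fallback_compile -> %d\n", fallback_compile("(?!x)y"));
}

// --- strong_override.cpp: analogue of src/regex_lookahead.cpp,
// --- archived into libdemo_lookahead.a ---
extern "C" int fallback_compile(const char* pattern) {
  (void)pattern;
  return 0;  // "success": a lookahead-capable engine handled the pattern
}

// Build sketch:
//   g++ -c strong_override.cpp && ar rcs libdemo_lookahead.a strong_override.o
//   g++ weak_default.cpp libdemo_lookahead.a
//       -> prints -1: the archive member is never pulled in
//   g++ weak_default.cpp -Wl,--whole-archive libdemo_lookahead.a -Wl,--no-whole-archive
//       -> prints 0: the strong definition overrides the weak one
// target_link_options_shared_lib(regex_lookahead) applies the second form
// (or the macOS/MSVC equivalents) automatically.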

include/pytorch/tokenizers/error.h

Lines changed: 3 additions & 0 deletions
@@ -55,6 +55,9 @@ enum class Error : error_code_t {
 
   /// Decode failure.
   DecodeFailure = 0x08,
+
+  /// No suitable regex implementation found.
+  RegexFailure = 0x09,
 };
 
 } // namespace tokenizers

include/pytorch/tokenizers/pcre2_regex.h

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ class Pcre2Regex : public IRegex {
   pcre2_code* regex_;
   pcre2_match_data* match_data_;
 
-  friend Result<std::unique_ptr<IRegex>> create_regex(
+  friend Result<std::unique_ptr<IRegex>> create_fallback_regex(
       const std::string& pattern);
 };

include/pytorch/tokenizers/regex.h

Lines changed: 14 additions & 1 deletion
@@ -38,11 +38,24 @@ class IRegex {
 };
 
 /**
- * @brief Creates a regex instance. Tries RE2 first, falls back to std::regex.
+ * @brief Creates a regex instance. If no strong symbol defined, only
+ * uses RE2. This is a weak symbol to allow other regex libraries to be
+ * used.
  *
  * @param pattern The regex pattern to compile.
 * @return A unique pointer to an IRegex-compatible object.
  */
 Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern);
 
+/**
+ * @brief Creates a fallback regex instance. If no strong symbol defined,
+ * returns Error, otherwise uses PCRE2 and std::regex.
+ * This is a weak symbol to allow other regex libraries to be used.
+ *
+ * @param pattern The regex pattern to compile.
+ * @return A unique pointer to an IRegex-compatible object.
+ */
+Result<std::unique_ptr<IRegex>> create_fallback_regex(
+    const std::string& pattern) TK_WEAK;
+
 } // namespace tokenizers
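
Because the fallback hook is an ordinary weak symbol, a consumer is not limited to the PCRE2/std::regex implementation shipped in regex_lookahead: any translation unit linked into the final binary may supply its own strong create_fallback_regex. A hedged sketch, where MyRegex stands in for a hypothetical adapter implementing IRegex:

#include <pytorch/tokenizers/regex.h>

#include <memory>

namespace tokenizers {

// Strong definition: overrides the TK_WEAK stub from src/regex.cpp, so
// create_regex() routes patterns that RE2 rejects here instead.
Result<std::unique_ptr<IRegex>> create_fallback_regex(
    const std::string& pattern) {
  // MyRegex is an assumed adapter wrapping some other engine
  // (Oniguruma, Hyperscan, ...) behind the IRegex interface.
  return static_cast<std::unique_ptr<IRegex>>(
      std::make_unique<MyRegex>(pattern));
}

} // namespace tokenizers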

src/regex.cpp

Lines changed: 24 additions & 43 deletions
@@ -5,25 +5,17 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// A weak symbol for create_regex, only using RE2 regex library.
+// regex_lookahead.cpp has the implementation of create_regex with lookahead
+// support, backed by PCRE2 and std::regex.
 
-#ifdef SUPPORT_REGEX_LOOKAHEAD
-#include <pytorch/tokenizers/pcre2_regex.h>
-#endif
 #include <pytorch/tokenizers/re2_regex.h>
 #include <pytorch/tokenizers/regex.h>
-#include <pytorch/tokenizers/std_regex.h>
 
-#include <re2/re2.h>
 #include <iostream>
-#include <memory>
 
 namespace tokenizers {
 
-/**
- * @brief Factory function that creates a regex object using RE2 if possible.
- *        Falls back to PCRE2 if RE2 rejects the pattern and
- *        SUPPORT_REGEX_LOOKAHEAD is enabled. Otherwise, returns an error.
- */
 Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
   // Try RE2 first
   auto re2 = std::make_unique<Re2Regex>("(" + pattern + ")");
@@ -32,42 +24,31 @@ Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
     return static_cast<std::unique_ptr<IRegex>>(std::move(re2));
   }
 
-#ifndef SUPPORT_REGEX_LOOKAHEAD
-  std::cerr << "RE2 failed to compile pattern with lookahead: " << pattern
-            << "\n";
+  std::cerr << "RE2 failed to compile pattern: " << pattern << "\n";
   std::cerr << "Error: " << (re2->regex_->error()) << std::endl;
-  std::cerr
-      << "Compile with SUPPORT_REGEX_LOOKAHEAD=ON to enable support for lookahead patterns."
-      << std::endl;
-  return tokenizers::Error::LoadFailure;
-#else
-  if (re2->regex_->error_code() == re2::RE2::ErrorBadPerlOp) {
-    // RE2 doesn't support some Perl features, try PCRE2
-    auto pcre2 = std::make_unique<Pcre2Regex>("(" + pattern + ")");
-
-    if (pcre2->regex_ != nullptr && pcre2->match_data_ != nullptr) {
-      std::cout
-          << "RE2 is unable to support things such as negative lookaheads in "
-          << pattern << ", using PCRE2 instead." << std::endl;
-      return static_cast<std::unique_ptr<IRegex>>(std::move(pcre2));
-    }
 
-    // If PCRE2 also fails, fall back to std::regex
-    try {
-      std::cout
-          << "PCRE2 failed to compile pattern, falling back to std::regex.";
-      auto std_regex = std::make_unique<StdRegex>("(" + pattern + ")");
-      return static_cast<std::unique_ptr<IRegex>>(std::move(std_regex));
-    } catch (const std::regex_error& e) {
-      std::cerr << "std::regex failed: " << e.what() << std::endl;
-      return tokenizers::Error::LoadFailure;
+  if (re2->regex_->error_code() == re2::RE2::ErrorBadPerlOp) {
+    auto res = create_fallback_regex(pattern);
+    if (!res.ok()) {
+      std::cerr
+          << "RE2 doesn't support lookahead patterns. "
+          << "Link with the lookahead-enabled version of this library to enable support."
+          << std::endl;
+    } else {
+      return res;
     }
-  } else {
-    std::cerr << "RE2 failed to compile pattern: " << pattern << "\n";
-    std::cerr << "Error: " << (re2->regex_->error()) << std::endl;
-    return tokenizers::Error::LoadFailure;
   }
-#endif
+
+  return tokenizers::Error::RegexFailure;
+}
+
+#ifdef _MSC_VER
+#pragma weak create_fallback_regex
+#endif // _MSC_VER
+Result<std::unique_ptr<IRegex>> create_fallback_regex(
+    const std::string& pattern) {
+  (void)pattern;
+  return tokenizers::Error::RegexFailure;
 }
 
 } // namespace tokenizers
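
For callers the dispatch stays invisible; whether a lookahead pattern compiles now depends only on what was linked. A small usage sketch (assuming the public header and the Result::ok() accessor used above):

#include <pytorch/tokenizers/regex.h>

#include <iostream>

int main() {
  // Handled by RE2 directly; the fallback hook is never consulted.
  auto plain = tokenizers::create_regex("\\d+");
  std::cout << "plain ok: " << plain.ok() << "\n";

  // Negative lookahead: RE2 reports ErrorBadPerlOp, so create_regex() calls
  // create_fallback_regex(). With only the core library linked this yields
  // Error::RegexFailure; with regex_lookahead force-linked it succeeds via
  // PCRE2 (or std::regex as a last resort).
  auto lookahead = tokenizers::create_regex("(?!foo)\\w+");
  std::cout << "lookahead ok: " << lookahead.ok() << "\n";
  return 0;
}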

src/regex_lookahead.cpp

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// This file contains the implementation of create_regex with lookahead support
+
+#include <pytorch/tokenizers/pcre2_regex.h>
+#include <pytorch/tokenizers/regex.h>
+#include <pytorch/tokenizers/std_regex.h>
+
+#include <iostream>
+#include <memory>
+
+namespace tokenizers {
+
+/**
+ * @brief Factory function that creates a regex object using RE2 if possible.
+ *        Falls back to PCRE2 if RE2 rejects the pattern due to lookahead.
+ *        Falls back to std::regex if PCRE2 also fails.
+ */
+
+#ifdef _MSC_VER
+#pragma weak create_fallback_regex
+#endif // _MSC_VER
+Result<std::unique_ptr<IRegex>> create_fallback_regex(
+    const std::string& pattern) {
+  auto pcre2 = std::make_unique<Pcre2Regex>("(" + pattern + ")");
+
+  if (pcre2->regex_ != nullptr && pcre2->match_data_ != nullptr) {
+    std::cout
+        << "RE2 is unable to support things such as negative lookaheads in "
+        << pattern << ", using PCRE2 instead." << std::endl;
+    return static_cast<std::unique_ptr<IRegex>>(std::move(pcre2));
+  }
+
+  // If PCRE2 also fails, fall back to std::regex
+  try {
+    std::cout << "PCRE2 failed to compile pattern, falling back to std::regex.";
+    auto std_regex = std::make_unique<StdRegex>("(" + pattern + ")");
+    return static_cast<std::unique_ptr<IRegex>>(std::move(std_regex));
+  } catch (const std::regex_error& e) {
+    std::cerr << "std::regex failed: " << e.what() << std::endl;
+    return tokenizers::Error::LoadFailure;
+  }
+}
+
+} // namespace tokenizers

targets.bzl

Lines changed: 6 additions & 29 deletions
@@ -26,8 +26,8 @@ def define_common_targets():
     runtime.cxx_library(
         name = "regex",
         srcs = [
-            "src/regex.cpp",
             "src/re2_regex.cpp",
+            "src/regex.cpp",
        ],
        exported_deps = [
            ":headers",
@@ -44,19 +44,19 @@ def define_common_targets():
        name = "regex_lookahead",
        srcs = [
            "src/pcre2_regex.cpp",
-            "src/regex.cpp",
-            "src/re2_regex.cpp",
+            "src/regex_lookahead.cpp",
            "src/std_regex.cpp",
        ],
        exported_deps = [
            ":headers",
        ],
        exported_external_deps = [
            "pcre2",
-            "re2",
        ],
-        preprocessor_flags = ["-DSUPPORT_REGEX_LOOKAHEAD=ON"],
-        visibility = ["//pytorch/tokenizers/..."],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+            "//pytorch/tokenizers/...",
+        ],
        header_namespace = "",
        platforms = PLATFORMS,
    )
@@ -119,29 +119,6 @@ def define_common_targets():
        platforms = PLATFORMS,
    )
 
-    runtime.cxx_library(
-        name = "tiktoken_lookahead",
-        srcs = [
-            "src/tiktoken.cpp",
-        ],
-        deps = [
-            ":regex_lookahead",
-        ],
-        exported_deps = [
-            ":bpe_tokenizer_base",
-            ":headers",
-        ],
-        exported_external_deps = [
-            "pcre2",
-            "re2",
-        ],
-        visibility = [
-            "@EXECUTORCH_CLIENTS",
-            "//pytorch/tokenizers/...",
-        ],
-        platforms = PLATFORMS,
-    )
-
    runtime.cxx_library(
        name = "hf_tokenizer",
        srcs = [
