Skip to content

Use weak symbol create_fallback_regex to separate the implementation using PCRE2 and std::regex #77

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 27 additions & 31 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@ project(Tokenizers)
option(TOKENIZERS_BUILD_TEST "Build tests" OFF)
option(TOKENIZERS_BUILD_TOOLS "Build tools" OFF)
option(SUPPORT_REGEX_LOOKAHEAD
"Support regex lookahead patterns (requires PCRE2)" OFF)
"Support regex lookahead patterns (requires PCRE2)" OFF)

include(Utils.cmake)
# Ignore weak attribute warning
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes")

Expand All @@ -34,20 +35,6 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/abseil-cpp)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece)

# Configure PCRE2
if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
set(PCRE2_BUILD_PCRE2_8 ON)
set(PCRE2_BUILD_PCRE2_16 OFF)
set(PCRE2_BUILD_PCRE2_32 OFF)
set(PCRE2_BUILD_TESTS OFF)
set(PCRE2_BUILD_PCRE2GREP OFF)
set(PCRE2_BUILD_PCRE2TEST OFF)
set(PCRE2_BUILD_PCRE2GPERF OFF)
set(PCRE2_BUILD_DOCS OFF)
set(PCRE2_BUILD_LIBPCRE2_PDB OFF)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)
endif()

set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

file(GLOB tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp)
Expand All @@ -60,14 +47,8 @@ set(tokenizers_source_files
${CMAKE_CURRENT_SOURCE_DIR}/src/regex.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/sentencepiece.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/token_decoder.cpp
)
if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
list(APPEND
tokenizers_source_files
${CMAKE_CURRENT_SOURCE_DIR}/src/pcre2_regex.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/std_regex.cpp)
endif()
${CMAKE_CURRENT_SOURCE_DIR}/src/token_decoder.cpp)

file(GLOB unicode_source_files
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp)
add_library(tokenizers STATIC ${tokenizers_source_files}
Expand All @@ -85,11 +66,26 @@ target_include_directories(
target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2)

if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
target_include_directories(tokenizers
PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
target_link_libraries(tokenizers PUBLIC pcre2-8)
target_compile_definitions(tokenizers PUBLIC SUPPORT_REGEX_LOOKAHEAD)
set(PCRE2_BUILD_PCRE2_8 ON)
set(PCRE2_BUILD_PCRE2_16 OFF)
set(PCRE2_BUILD_PCRE2_32 OFF)
set(PCRE2_BUILD_TESTS OFF)
set(PCRE2_BUILD_PCRE2GREP OFF)
set(PCRE2_BUILD_PCRE2TEST OFF)
set(PCRE2_BUILD_PCRE2GPERF OFF)
set(PCRE2_BUILD_DOCS OFF)
set(PCRE2_BUILD_LIBPCRE2_PDB OFF)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)
add_library(
regex_lookahead STATIC
${CMAKE_CURRENT_SOURCE_DIR}/src/pcre2_regex.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/regex_lookahead.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/std_regex.cpp)
target_link_libraries(regex_lookahead PUBLIC pcre2-8)
target_include_directories(
regex_lookahead PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
target_link_options_shared_lib(regex_lookahead)
endif()

# Build test
Expand Down Expand Up @@ -120,9 +116,9 @@ if(TOKENIZERS_BUILD_TEST)
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
target_link_libraries(${test_name} gtest_main GTest::gmock tokenizers)
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include)
target_link_libraries(${test_name} gtest_main GTest::gmock tokenizers
regex_lookahead)
add_test(${test_name} "${test_name}")
set_tests_properties(${test_name} PROPERTIES ENVIRONMENT ${test_env})
endforeach()
Expand Down
50 changes: 50 additions & 0 deletions Utils.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#
# Build tokenizers.
#
# ### Editing this file ###
#
# This file should be formatted with
# ~~~
# cmake-format -i Utils.cmake
# ~~~
# It should also be cmake-lint clean.
#

# This is the function that uses -Wl,--whole-archive to link a static
# library. NB: target_link_options is broken for this case; it only appends
# the interface link options of the first library.
function(kernel_link_options target_name)
# target_link_options(${target_name} INTERFACE
# "$<LINK_LIBRARY:WHOLE_ARCHIVE,target_name>")
# SHELL: keeps the whole option group as a single shell-quoted unit, and each
# LINKER: prefix expands to the proper driver pass-through (e.g. -Wl,) so the
# archive is bracketed by --whole-archive / --no-whole-archive.
target_link_options(
${target_name} INTERFACE "SHELL:LINKER:--whole-archive \
$<TARGET_FILE:${target_name}> \
LINKER:--no-whole-archive")
endfunction()

# Same as kernel_link_options, but for the macOS linker
function(macos_kernel_link_options target_name)
# Apple's ld64 has no --whole-archive; -force_load,<archive> pulls in every
# member of the given static archive instead.
target_link_options(${target_name} INTERFACE
"SHELL:LINKER:-force_load,$<TARGET_FILE:${target_name}>")
endfunction()

# Same as kernel_link_options, but for the MSVC linker
function(msvc_kernel_link_options target_name)
# MSVC's equivalent of --whole-archive is the /WHOLEARCHIVE:<lib> linker flag.
target_link_options(
${target_name} INTERFACE
"SHELL:LINKER:/WHOLEARCHIVE:$<TARGET_FILE:${target_name}>")
endfunction()

# Ensure that the load-time constructor functions run. By default, the linker
# would remove them since there are no other references to them.
function(target_link_options_shared_lib target_name)
if(APPLE)
# Apple ld64 linker.
macos_kernel_link_options(${target_name})
elseif(MSVC)
# Microsoft link.exe.
msvc_kernel_link_options(${target_name})
else()
# GNU ld / LLD style linkers.
kernel_link_options(${target_name})
endif()
endfunction()
3 changes: 3 additions & 0 deletions include/pytorch/tokenizers/error.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ enum class Error : error_code_t {

/// Decode failure.
DecodeFailure = 0x08,

/// No suitable regex implementation found.
RegexFailure = 0x09,
};

} // namespace tokenizers
Expand Down
2 changes: 1 addition & 1 deletion include/pytorch/tokenizers/pcre2_regex.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class Pcre2Regex : public IRegex {
pcre2_code* regex_;
pcre2_match_data* match_data_;

friend Result<std::unique_ptr<IRegex>> create_regex(
friend Result<std::unique_ptr<IRegex>> create_fallback_regex(
const std::string& pattern);
};

Expand Down
15 changes: 14 additions & 1 deletion include/pytorch/tokenizers/regex.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,24 @@ class IRegex {
};

/**
* @brief Creates a regex instance. Tries RE2 first, falls back to std::regex.
* @brief Creates a regex instance. If no strong symbol defined, only
* uses RE2. This is a weak symbol to allow other regex libraries to be
* used.
*
* @param pattern The regex pattern to compile.
* @return A unique pointer to an IRegex-compatible object.
*/
Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just make this the weak symbol?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because we can't have RE2 in both weak implementation and strong implementation

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean just have create_regex weak implementation do re2, then strong implementation does re2 + fallback. Basically what you are doing now, seems a bit clearer to me


/**
* @brief Creates a fallback regex instance. If no strong symbol defined,
* returns Error, otherwise uses PCRE2 and std::regex.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this is meant to be extensible to other regex libs, would make sense to me to remove PCRE2 from the comment and rename regex_lookahead.cpp to regex_pcre2.cpp or something like that.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lookahead = pcre2 + std::regex, right?

* This is a weak symbol to allow other regex libraries to be used.
*
* @param pattern The regex pattern to compile.
* @return A unique pointer to an IRegex-compatible object.
*/
Result<std::unique_ptr<IRegex>> create_fallback_regex(
const std::string& pattern) TK_WEAK;

} // namespace tokenizers
67 changes: 24 additions & 43 deletions src/regex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,17 @@
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
// Provides a weak default for create_fallback_regex; create_regex itself only
// uses the RE2 regex library. regex_lookahead.cpp holds the strong
// implementation of create_fallback_regex with lookahead support, backed by
// PCRE2 and std::regex.

#ifdef SUPPORT_REGEX_LOOKAHEAD
#include <pytorch/tokenizers/pcre2_regex.h>
#endif
#include <pytorch/tokenizers/re2_regex.h>
#include <pytorch/tokenizers/regex.h>
#include <pytorch/tokenizers/std_regex.h>

#include <re2/re2.h>
#include <iostream>
#include <memory>

namespace tokenizers {

/**
* @brief Factory function that creates a regex object using RE2 if possible.
* Falls back to PCRE2 if RE2 rejects the pattern and
* SUPPORT_REGEX_LOOKAHEAD is enabled. Otherwise, returns an error.
*/
Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
// Try RE2 first
auto re2 = std::make_unique<Re2Regex>("(" + pattern + ")");
Expand All @@ -32,42 +24,31 @@ Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
return static_cast<std::unique_ptr<IRegex>>(std::move(re2));
}

#ifndef SUPPORT_REGEX_LOOKAHEAD
std::cerr << "RE2 failed to compile pattern with lookahead: " << pattern
<< "\n";
std::cerr << "RE2 failed to compile pattern: " << pattern << "\n";
std::cerr << "Error: " << (re2->regex_->error()) << std::endl;
std::cerr
<< "Compile with SUPPORT_REGEX_LOOKAHEAD=ON to enable support for lookahead patterns."
<< std::endl;
return tokenizers::Error::LoadFailure;
#else
if (re2->regex_->error_code() == re2::RE2::ErrorBadPerlOp) {
// RE2 doesn't support some Perl features, try PCRE2
auto pcre2 = std::make_unique<Pcre2Regex>("(" + pattern + ")");

if (pcre2->regex_ != nullptr && pcre2->match_data_ != nullptr) {
std::cout
<< "RE2 is unable to support things such as negative lookaheads in "
<< pattern << ", using PCRE2 instead." << std::endl;
return static_cast<std::unique_ptr<IRegex>>(std::move(pcre2));
}

// If PCRE2 also fails, fall back to std::regex
try {
std::cout
<< "PCRE2 failed to compile pattern, falling back to std::regex.";
auto std_regex = std::make_unique<StdRegex>("(" + pattern + ")");
return static_cast<std::unique_ptr<IRegex>>(std::move(std_regex));
} catch (const std::regex_error& e) {
std::cerr << "std::regex failed: " << e.what() << std::endl;
return tokenizers::Error::LoadFailure;
if (re2->regex_->error_code() == re2::RE2::ErrorBadPerlOp) {
auto res = create_fallback_regex(pattern);
if (!res.ok()) {
std::cerr
<< "RE2 doesn't support lookahead patterns. "
<< "Link with the lookahead-enabled version of this library to enable support."
<< std::endl;
} else {
return res;
}
} else {
std::cerr << "RE2 failed to compile pattern: " << pattern << "\n";
std::cerr << "Error: " << (re2->regex_->error()) << std::endl;
return tokenizers::Error::LoadFailure;
}
#endif

return tokenizers::Error::RegexFailure;
}

#ifdef _MSC_VER
// NOTE(review): #pragma weak is a GCC/ELF extension; MSVC does not support
// weak symbols via this pragma -- confirm the intended override mechanism on
// MSVC (e.g. /ALTERNATENAME) actually replaces this default at link time.
#pragma weak create_fallback_regex
#endif // _MSC_VER
// Weak default for create_fallback_regex. When the lookahead-enabled library
// (regex_lookahead.cpp) is linked, its strong definition replaces this one;
// otherwise any pattern RE2 cannot compile ends up here and fails with
// RegexFailure.
Result<std::unique_ptr<IRegex>> create_fallback_regex(
    const std::string& pattern) {
  (void)pattern; // unused: the RE2-only build has no fallback engine
  return tokenizers::Error::RegexFailure;
}

} // namespace tokenizers
51 changes: 51 additions & 0 deletions src/regex_lookahead.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

// This file contains the implementation of create_regex with lookahead support

#include <pytorch/tokenizers/pcre2_regex.h>
#include <pytorch/tokenizers/regex.h>
#include <pytorch/tokenizers/std_regex.h>

#include <iostream>
#include <memory>

namespace tokenizers {

/**
 * @brief Strong implementation of create_fallback_regex, for patterns
 *        (e.g. with lookahead) that RE2 rejected. Tries PCRE2 first and
 *        falls back to std::regex if PCRE2 also fails to compile.
 *
 * @param pattern The regex pattern to compile (wrapped in a capture group).
 * @return A PCRE2- or std::regex-backed IRegex on success, or
 *         Error::LoadFailure if neither engine can compile the pattern.
 */

#ifdef _MSC_VER
// NOTE(review): #pragma weak is a GCC/ELF extension, not an MSVC feature, so
// this guarded pragma likely has no effect on MSVC. It also marks what is
// meant to be the *strong* definition -- confirm this is intentional.
#pragma weak create_fallback_regex
#endif // _MSC_VER
Result<std::unique_ptr<IRegex>> create_fallback_regex(
    const std::string& pattern) {
  auto pcre2 = std::make_unique<Pcre2Regex>("(" + pattern + ")");

  // Pcre2Regex signals a failed compile through null regex_ / match_data_.
  if (pcre2->regex_ != nullptr && pcre2->match_data_ != nullptr) {
    std::cout
        << "RE2 is unable to support things such as negative lookaheads in "
        << pattern << ", using PCRE2 instead." << std::endl;
    return static_cast<std::unique_ptr<IRegex>>(std::move(pcre2));
  }

  // If PCRE2 also fails, fall back to std::regex
  try {
    std::cout << "PCRE2 failed to compile pattern, falling back to std::regex.";
    auto std_regex = std::make_unique<StdRegex>("(" + pattern + ")");
    return static_cast<std::unique_ptr<IRegex>>(std::move(std_regex));
  } catch (const std::regex_error& e) {
    // StdRegex's constructor throws std::regex_error on an invalid pattern.
    std::cerr << "std::regex failed: " << e.what() << std::endl;
    return tokenizers::Error::LoadFailure;
  }
}

} // namespace tokenizers
35 changes: 6 additions & 29 deletions targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ def define_common_targets():
runtime.cxx_library(
name = "regex",
srcs = [
"src/regex.cpp",
"src/re2_regex.cpp",
"src/regex.cpp",
],
exported_deps = [
":headers",
Expand All @@ -44,19 +44,19 @@ def define_common_targets():
name = "regex_lookahead",
srcs = [
"src/pcre2_regex.cpp",
"src/regex.cpp",
"src/re2_regex.cpp",
"src/regex_lookahead.cpp",
"src/std_regex.cpp",
],
exported_deps = [
":headers",
],
exported_external_deps = [
"pcre2",
"re2",
],
preprocessor_flags = ["-DSUPPORT_REGEX_LOOKAHEAD=ON"],
visibility = ["//pytorch/tokenizers/..."],
visibility = [
"@EXECUTORCH_CLIENTS",
"//pytorch/tokenizers/...",
],
header_namespace = "",
platforms = PLATFORMS,
)
Expand Down Expand Up @@ -119,29 +119,6 @@ def define_common_targets():
platforms = PLATFORMS,
)

runtime.cxx_library(
name = "tiktoken_lookahead",
srcs = [
"src/tiktoken.cpp",
],
deps = [
":regex_lookahead",
],
exported_deps = [
":bpe_tokenizer_base",
":headers",
],
exported_external_deps = [
"pcre2",
"re2",
],
visibility = [
"@EXECUTORCH_CLIENTS",
"//pytorch/tokenizers/...",
],
platforms = PLATFORMS,
)

runtime.cxx_library(
name = "hf_tokenizer",
srcs = [
Expand Down