-
Notifications
You must be signed in to change notification settings - Fork 7
Use weak symbol create_fallback_regex to separate the implementation using PCRE2 and std::regex #77
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. | ||
|
||
# | ||
# Build tokenizers. | ||
# | ||
# ### Editing this file ### | ||
# | ||
# This file should be formatted with | ||
# ~~~ | ||
# cmake-format -i CMakeLists.txt | ||
# ~~~ | ||
# It should also be cmake-lint clean. | ||
# | ||
|
||
# This is the funtion to use -Wl, --whole-archive to link static library NB: | ||
# target_link_options is broken for this case, it only append the interface link | ||
# options of the first library. | ||
function(kernel_link_options target_name) | ||
# target_link_options(${target_name} INTERFACE | ||
# "$<LINK_LIBRARY:WHOLE_ARCHIVE,target_name>") | ||
target_link_options( | ||
${target_name} INTERFACE "SHELL:LINKER:--whole-archive \ | ||
$<TARGET_FILE:${target_name}> \ | ||
LINKER:--no-whole-archive") | ||
endfunction() | ||
|
||
# Same as kernel_link_options but it's for MacOS linker | ||
function(macos_kernel_link_options target_name) | ||
target_link_options(${target_name} INTERFACE | ||
"SHELL:LINKER:-force_load,$<TARGET_FILE:${target_name}>") | ||
endfunction() | ||
|
||
# Same as kernel_link_options but it's for MSVC linker | ||
function(msvc_kernel_link_options target_name) | ||
target_link_options( | ||
${target_name} INTERFACE | ||
"SHELL:LINKER:/WHOLEARCHIVE:$<TARGET_FILE:${target_name}>") | ||
endfunction() | ||
|
||
# Ensure that the load-time constructor functions run. By default, the linker | ||
# would remove them since there are no other references to them. | ||
function(target_link_options_shared_lib target_name) | ||
if(APPLE) | ||
macos_kernel_link_options(${target_name}) | ||
elseif(MSVC) | ||
msvc_kernel_link_options(${target_name}) | ||
else() | ||
kernel_link_options(${target_name}) | ||
endif() | ||
endfunction() |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,11 +38,24 @@ class IRegex { | |
}; | ||
|
||
/** | ||
* @brief Creates a regex instance. Tries RE2 first, falls back to std::regex. | ||
* @brief Creates a regex instance. If no strong symbol defined, only | ||
* uses RE2. This is a weak symbol to allow other regex libraries to be | ||
* used. | ||
* | ||
* @param pattern The regex pattern to compile. | ||
* @return A unique pointer to an IRegex-compatible object. | ||
*/ | ||
Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern); | ||
|
||
/** | ||
* @brief Creates a fallback regex instance. If no strong symbol defined, | ||
* returns Error, otherwise uses PCRE2 and std::regex. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If this is meant to be extensible to other regex libs, would make sense to me to remove PCRE2 from the comment and rename regex_lookahead.cpp to regex_pcre2.cpp or something like that. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. lookahead = pcre2 + std::regex, right? |
||
* This is a weak symbol to allow other regex libraries to be used. | ||
* | ||
* @param pattern The regex pattern to compile. | ||
* @return A unique pointer to an IRegex-compatible object. | ||
*/ | ||
Result<std::unique_ptr<IRegex>> create_fallback_regex( | ||
const std::string& pattern) TK_WEAK; | ||
|
||
} // namespace tokenizers |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
/* | ||
* Copyright (c) Meta Platforms, Inc. and affiliates. | ||
* All rights reserved. | ||
* | ||
* This source code is licensed under the BSD-style license found in the | ||
* LICENSE file in the root directory of this source tree. | ||
*/ | ||
|
||
// This file contains the implementation of create_regex with lookahead support | ||
|
||
#include <pytorch/tokenizers/pcre2_regex.h> | ||
#include <pytorch/tokenizers/regex.h> | ||
#include <pytorch/tokenizers/std_regex.h> | ||
|
||
#include <iostream> | ||
#include <memory> | ||
|
||
namespace tokenizers { | ||
|
||
/** | ||
* @brief Factory function that creates a regex object using RE2 if possible. | ||
* Falls back to PCRE2 if RE2 rejects the pattern due to lookahead. | ||
* Falls back to std::regex if PCRE2 also fails. | ||
*/ | ||
|
||
#ifdef _MSC_VER | ||
#pragma weak create_fallback_regex | ||
#endif // _MSC_VER | ||
Result<std::unique_ptr<IRegex>> create_fallback_regex( | ||
const std::string& pattern) { | ||
auto pcre2 = std::make_unique<Pcre2Regex>("(" + pattern + ")"); | ||
|
||
if (pcre2->regex_ != nullptr && pcre2->match_data_ != nullptr) { | ||
std::cout | ||
<< "RE2 is unable to support things such as negative lookaheads in " | ||
<< pattern << ", using PCRE2 instead." << std::endl; | ||
return static_cast<std::unique_ptr<IRegex>>(std::move(pcre2)); | ||
} | ||
|
||
// If PCRE2 also fails, fall back to std::regex | ||
try { | ||
std::cout << "PCRE2 failed to compile pattern, falling back to std::regex."; | ||
auto std_regex = std::make_unique<StdRegex>("(" + pattern + ")"); | ||
return static_cast<std::unique_ptr<IRegex>>(std::move(std_regex)); | ||
} catch (const std::regex_error& e) { | ||
std::cerr << "std::regex failed: " << e.what() << std::endl; | ||
return tokenizers::Error::LoadFailure; | ||
} | ||
} | ||
|
||
} // namespace tokenizers |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not just make this the weak symbol?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Because we can't have RE2 in both weak implementation and strong implementation
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I mean just have create_regex weak implementation do re2, then strong implementation does re2 + fallback. Basically what you are doing now, seems a bit clearer to me