Skip to content

Commit 9378e21

Browse files
authored
Add pcre2 as re2 fallback (#50) (#50)
1 parent f52b18b commit 9378e21

File tree

8 files changed

+308
-7
lines changed

8 files changed

+308
-7
lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,6 @@
1010
[submodule "third-party/json"]
1111
path = third-party/json
1212
url = https://github.com/nlohmann/json.git
13+
[submodule "third-party/pcre2"]
14+
path = third-party/pcre2
15+
url = https://github.com/PCRE2Project/pcre2.git

CMakeLists.txt

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,19 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
2929
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/abseil-cpp)
3030
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2)
3131
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece)
32+
33+
# Configure PCRE2
34+
set(PCRE2_BUILD_PCRE2_8 ON)
35+
set(PCRE2_BUILD_PCRE2_16 OFF)
36+
set(PCRE2_BUILD_PCRE2_32 OFF)
37+
set(PCRE2_BUILD_TESTS OFF)
38+
set(PCRE2_BUILD_PCRE2GREP OFF)
39+
set(PCRE2_BUILD_PCRE2TEST OFF)
40+
set(PCRE2_BUILD_PCRE2GPERF OFF)
41+
set(PCRE2_BUILD_DOCS OFF)
42+
set(PCRE2_BUILD_LIBPCRE2_PDB OFF)
43+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)
44+
3245
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
3346

3447
file(GLOB tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp)
@@ -45,9 +58,10 @@ target_include_directories(
4558
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece/src
4659
${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
4760
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
48-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include)
61+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include
62+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
4963

50-
target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2)
64+
target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2 pcre2-8)
5165

5266
# Build test
5367
if(TOKENIZERS_BUILD_TEST)
@@ -77,7 +91,8 @@ if(TOKENIZERS_BUILD_TEST)
7791
${CMAKE_CURRENT_SOURCE_DIR}/include
7892
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
7993
${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
80-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include)
94+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
95+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
8196
target_link_libraries(${test_name} gtest_main GTest::gmock tokenizers)
8297
add_test(${test_name} "${test_name}")
8398
set_tests_properties(${test_name} PROPERTIES ENVIRONMENT ${test_env})
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#pragma once
10+
11+
#include <memory>
12+
#include <string>
13+
14+
// Define PCRE2 code unit width before including pcre2.h
15+
#define PCRE2_CODE_UNIT_WIDTH 8
16+
#include <pcre2.h>
17+
18+
#include <pytorch/tokenizers/regex.h>
19+
20+
namespace tokenizers {
21+
22+
/**
23+
* @brief PCRE2-based implementation of IRegex.
24+
*/
25+
class Pcre2Regex : public IRegex {
26+
public:
27+
/**
28+
* @brief Construct a PCRE2 regex with the given pattern.
29+
*
30+
* @param pattern The regex pattern to compile.
31+
*/
32+
explicit Pcre2Regex(const std::string& pattern);
33+
34+
/**
35+
* @brief Destructor to clean up PCRE2 resources.
36+
*/
37+
~Pcre2Regex();
38+
39+
/**
40+
* @brief Return all non-overlapping matches found in the input string.
41+
*/
42+
virtual std::vector<Match> find_all(const std::string& text) const override;
43+
44+
private:
45+
pcre2_code* regex_;
46+
pcre2_match_data* match_data_;
47+
48+
friend Result<std::unique_ptr<IRegex>> create_regex(
49+
const std::string& pattern);
50+
};
51+
52+
} // namespace tokenizers

src/pcre2_regex.cpp

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <iostream>
10+
#include <vector>
11+
12+
#include <pytorch/tokenizers/pcre2_regex.h>
13+
14+
namespace tokenizers {
15+
16+
/**
 * Compiles `pattern` with PCRE2_UCP and PCRE2_UTF set, then creates a
 * match-data block from the compiled pattern. If either step fails, a
 * diagnostic is written to std::cerr and both handles end up null.
 */
Pcre2Regex::Pcre2Regex(const std::string& pattern)
    : regex_(nullptr), match_data_(nullptr) {
  int compile_status;
  PCRE2_SIZE failure_position;

  PCRE2_SPTR pattern_units = reinterpret_cast<PCRE2_SPTR>(pattern.c_str());

  // Unicode character properties plus UTF-8 interpretation of the pattern.
  regex_ = pcre2_compile(
      pattern_units,
      pattern.length(),
      PCRE2_UCP | PCRE2_UTF,
      &compile_status,
      &failure_position,
      nullptr);

  if (regex_ != nullptr) {
    // Compilation succeeded; allocate the block that receives match offsets.
    match_data_ = pcre2_match_data_create_from_pattern(regex_, nullptr);
    if (match_data_ != nullptr) {
      return;
    }

    // Without match data the compiled pattern is unusable, so release it.
    pcre2_code_free(regex_);
    regex_ = nullptr;
    std::cerr << "Failed to create PCRE2 match data" << std::endl;
    return;
  }

  // Compilation failed: translate the PCRE2 error code into text.
  PCRE2_UCHAR message[256];
  pcre2_get_error_message(compile_status, message, sizeof(message));
  std::cerr << "PCRE2 compilation failed at offset " << failure_position << ": "
            << message << std::endl;
}
47+
48+
/**
 * Frees the match-data block first and the compiled pattern second, skipping
 * any handle that is null.
 */
Pcre2Regex::~Pcre2Regex() {
  if (match_data_ != nullptr) {
    pcre2_match_data_free(match_data_);
  }

  if (regex_ != nullptr) {
    pcre2_code_free(regex_);
  }
}
56+
57+
std::vector<Match> Pcre2Regex::find_all(const std::string& text) const {
58+
std::vector<Match> result;
59+
60+
if (!regex_ || !match_data_) {
61+
return result;
62+
}
63+
64+
PCRE2_SIZE* ovector;
65+
PCRE2_SPTR subject = reinterpret_cast<PCRE2_SPTR>(text.c_str());
66+
PCRE2_SIZE subject_length = text.length();
67+
PCRE2_SIZE offset = 0;
68+
69+
while (offset < subject_length) {
70+
int rc = pcre2_match(
71+
regex_,
72+
subject,
73+
subject_length,
74+
offset,
75+
0, // Default options
76+
match_data_,
77+
nullptr);
78+
79+
if (rc < 0) {
80+
if (rc == PCRE2_ERROR_NOMATCH) {
81+
break; // No more matches
82+
} else {
83+
// Error occurred
84+
PCRE2_UCHAR error_buffer[256];
85+
pcre2_get_error_message(rc, error_buffer, sizeof(error_buffer));
86+
std::cerr << "PCRE2 matching error: " << error_buffer << std::endl;
87+
break;
88+
}
89+
}
90+
91+
ovector = pcre2_get_ovector_pointer(match_data_);
92+
93+
// Add the match to the result
94+
result.push_back({ovector[0], ovector[1]});
95+
96+
// Move to the next position after the match
97+
offset = ovector[1];
98+
99+
// If the match was empty, move forward by one character to avoid infinite
100+
// loop
101+
if (ovector[0] == ovector[1]) {
102+
offset++;
103+
}
104+
}
105+
106+
return result;
107+
}
108+
109+
} // namespace tokenizers

src/regex.cpp

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#include <pytorch/tokenizers/pcre2_regex.h>
910
#include <pytorch/tokenizers/re2_regex.h>
1011
#include <pytorch/tokenizers/regex.h>
1112
#include <pytorch/tokenizers/std_regex.h>
@@ -18,8 +19,8 @@ namespace tokenizers {
1819

1920
/**
2021
* @brief Factory function that creates a regex object using RE2 if possible.
21-
* Falls back to std::regex if RE2 rejects the pattern with
22-
* ErrorBadPerlOp.
22+
* Falls back to PCRE2 if RE2 rejects the pattern, then to std::regex if
23+
* PCRE2 fails.
2324
*/
2425
Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
2526
// Try RE2 first
@@ -30,10 +31,20 @@ Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
3031
}
3132

3233
if (re2->regex_->error_code() == re2::RE2::ErrorBadPerlOp) {
33-
try {
34+
// RE2 doesn't support some Perl features, try PCRE2
35+
auto pcre2 = std::make_unique<Pcre2Regex>("(" + pattern + ")");
36+
37+
if (pcre2->regex_ != nullptr && pcre2->match_data_ != nullptr) {
3438
std::cout
3539
<< "RE2 is unable to support things such as negative lookaheads in "
36-
<< pattern << ", defaulting to std::regex.";
40+
<< pattern << ", using PCRE2 instead." << std::endl;
41+
return static_cast<std::unique_ptr<IRegex>>(std::move(pcre2));
42+
}
43+
44+
// If PCRE2 also fails, fall back to std::regex
45+
try {
46+
std::cout
47+
<< "PCRE2 failed to compile pattern, falling back to std::regex.";
3748
auto std_regex = std::make_unique<StdRegex>("(" + pattern + ")");
3849
return static_cast<std::unique_ptr<IRegex>>(std::move(std_regex));
3950
} catch (const std::regex_error& e) {

targets.bzl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ def define_common_targets():
2828
srcs = ["src/regex.cpp"] + glob([
2929
"src/*_regex.cpp",
3030
]),
31+
deps = [
32+
"fbsource//third-party/pcre2:pcre2-8",
33+
],
3134
exported_headers = subdir_glob([
3235
("include", "pytorch/tokenizers/regex.h"),
3336
("include", "pytorch/tokenizers/*_regex.h"),

test/test_regex.cpp

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <gtest/gtest.h>
10+
11+
#include "pytorch/tokenizers/pcre2_regex.h"
12+
#include "pytorch/tokenizers/re2_regex.h"
13+
#include "pytorch/tokenizers/regex.h"
14+
15+
using namespace tokenizers;
16+
17+
class RegexTest : public ::testing::Test {};
18+
19+
// Test basic functionality
20+
TEST_F(RegexTest, BasicMatching) {
  // A plain word pattern: each run of word characters should be one match.
  const std::string input = "Hello world";
  auto word_regex = TK_UNWRAP_THROW(create_regex("\\w+"));

  const auto found = word_regex->find_all(input);
  ASSERT_EQ(found.size(), 2);

  // Extracts the matched substring for a [start, end) span.
  const auto matched_text = [&input](const auto& span) {
    return input.substr(span.start, span.end - span.start);
  };

  const auto& first = found[0];
  EXPECT_EQ(first.start, 0);
  EXPECT_EQ(first.end, 5);
  EXPECT_EQ(matched_text(first), "Hello");

  const auto& second = found[1];
  EXPECT_EQ(second.start, 6);
  EXPECT_EQ(second.end, 11);
  EXPECT_EQ(matched_text(second), "world");
}
37+
38+
// Test pattern that only PCRE2 supports (lookbehind)
39+
TEST_F(RegexTest, Pcre2Specific) {
  // Lookbehind: match the word that immediately follows an '@'.
  const std::string pattern = "(?<=@)\\w+";

  // Verify that the factory function falls back on a PCRE2 regex
  auto regex = TK_UNWRAP_THROW(create_regex(pattern));
  EXPECT_NE(dynamic_cast<Pcre2Regex*>(regex.get()), nullptr);

  // The subject needs an '@' at index 4 followed by "example" for the
  // offsets asserted below ([5, 12)) to hold.
  std::string text = "user@example.com";
  auto matches = regex->find_all(text);
  ASSERT_EQ(matches.size(), 1);
  EXPECT_EQ(matches[0].start, 5);
  EXPECT_EQ(matches[0].end, 12);
  EXPECT_EQ(
      text.substr(matches[0].start, matches[0].end - matches[0].start),
      "example");
}
55+
56+
// Test complex pattern with negative lookahead that should fall back to PCRE2.
57+
// This specific pattern is from the Qwen2.5 1.5B pretokenizer.
58+
// https://huggingface.co/Qwen/Qwen2.5-1.5B/raw/main/tokenizer.json
59+
TEST_F(RegexTest, ComplexPatternWithNegativeLookahead) {
  const std::string complex_pattern =
      "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";

  // Now verify that the factory function falls back on a PCRE2 regex
  auto regex = TK_UNWRAP_THROW(create_regex(complex_pattern));
  EXPECT_NE(dynamic_cast<Pcre2Regex*>(regex.get()), nullptr);

  // Two spaces follow the newline: the first is matched on its own by
  // `\s+(?!\S)`, the second becomes the leading space of " test". With only
  // one space the six spans asserted below cannot be produced.
  std::string text = "Hello's world\n  test";
  auto matches = regex->find_all(text);

  // We expect to match:
  // 1. "Hello" (word)
  // 2. "'s" (contraction)
  // 3. " world" (word with leading space)
  // 4. "\n" (newline)
  // 5. " " (whitespace)
  // 6. " test" (word with leading space)
  ASSERT_EQ(matches.size(), 6);

  EXPECT_EQ(matches[0].start, 0);
  EXPECT_EQ(matches[0].end, 5);
  EXPECT_EQ(
      text.substr(matches[0].start, matches[0].end - matches[0].start),
      "Hello");
  EXPECT_EQ(matches[1].start, 5);
  EXPECT_EQ(matches[1].end, 7);
  EXPECT_EQ(
      text.substr(matches[1].start, matches[1].end - matches[1].start), "'s");
  EXPECT_EQ(matches[2].start, 7);
  EXPECT_EQ(matches[2].end, 13);
  EXPECT_EQ(
      text.substr(matches[2].start, matches[2].end - matches[2].start),
      " world");
  EXPECT_EQ(matches[3].start, 13);
  EXPECT_EQ(matches[3].end, 14);
  EXPECT_EQ(
      text.substr(matches[3].start, matches[3].end - matches[3].start), "\n");
  EXPECT_EQ(matches[4].start, 14);
  EXPECT_EQ(matches[4].end, 15);
  EXPECT_EQ(
      text.substr(matches[4].start, matches[4].end - matches[4].start), " ");
  EXPECT_EQ(matches[5].start, 15);
  EXPECT_EQ(matches[5].end, 20);
  EXPECT_EQ(
      text.substr(matches[5].start, matches[5].end - matches[5].start),
      " test");
}

third-party/pcre2

Submodule pcre2 added at 2e03e32

0 commit comments

Comments
 (0)