Skip to content

Commit e9adcc4

Browse files
authored
[libc++][regex] Correctly adjust match prefix for zero-length matches. (#94550)
For regex patterns that produce zero-length matches, there is one (imaginary) match in-between every character in the sequence being searched (as well as before the first character and after the last character). It's easiest to demonstrate using replacement: `std::regex_replace("abc"s, "!", "")` should produce `!a!b!c!`, where each exclamation mark makes a zero-length match visible. Currently our implementation doesn't correctly set the prefix of each zero-length match, "swallowing" the characters separating the imaginary matches -- e.g. when going through zero-length matches within `abc`, the corresponding prefixes should be `{'', 'a', 'b', 'c'}`, but before this patch they will all be empty (`{'', '', '', ''}`). This happens in the implementation of `regex_iterator::operator++`. Note that the Standard spells out quite explicitly that the prefix might need to be adjusted when dealing with zero-length matches in [`re.regiter.incr`](http://eel.is/c++draft/re.regiter.incr): > In all cases in which the call to `regex_search` returns `true`, `match.prefix().first` shall be equal to the previous value of `match[0].second`... It is unspecified how the implementation makes these adjustments. [Reproduction example](https://godbolt.org/z/8ve6G3dav) ```cpp #include <iostream> #include <regex> #include <string> int main() { std::string str = "abc"; std::regex empty_matching_pattern(""); { // The underlying problem is that `regex_iterator::operator++` doesn't update // the prefix correctly. std::sregex_iterator i(str.begin(), str.end(), empty_matching_pattern), e; std::cout << "\""; for (; i != e; ++i) { const std::ssub_match& prefix = i->prefix(); std::cout << prefix.str(); } std::cout << "\"\n"; // Before the patch: "" // After the patch: "abc" } { // `regex_replace` makes the problem very visible. std::string replaced = std::regex_replace(str, empty_matching_pattern, "!"); std::cout << "\"" << replaced << "\"\n"; // Before the patch: "!!!!" // After the patch: "!a!b!c!" } } ``` Fixes #64451 rdar://119912002
1 parent 4f9c0fa commit e9adcc4

File tree

3 files changed

+200
-97
lines changed

3 files changed

+200
-97
lines changed

libcxx/include/regex

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4700,6 +4700,9 @@ private:
47004700

47014701
template <class, class>
47024702
friend class __lookahead;
4703+
4704+
template <class, class, class>
4705+
friend class regex_iterator;
47034706
};
47044707

47054708
template <class _BidirectionalIterator, class _Allocator>
@@ -5410,7 +5413,9 @@ template <class _BidirectionalIterator, class _CharT, class _Traits>
54105413
regex_iterator<_BidirectionalIterator, _CharT, _Traits>&
54115414
regex_iterator<_BidirectionalIterator, _CharT, _Traits>::operator++() {
54125415
__flags_ |= regex_constants::__no_update_pos;
5413-
_BidirectionalIterator __start = __match_[0].second;
5416+
_BidirectionalIterator __start = __match_[0].second;
5417+
_BidirectionalIterator __prefix_start = __start;
5418+
54145419
if (__match_[0].first == __match_[0].second) {
54155420
if (__start == __end_) {
54165421
__match_ = value_type();
@@ -5424,9 +5429,21 @@ regex_iterator<_BidirectionalIterator, _CharT, _Traits>::operator++() {
54245429
else
54255430
++__start;
54265431
}
5432+
54275433
__flags_ |= regex_constants::match_prev_avail;
5428-
if (!std::regex_search(__start, __end_, __match_, *__pregex_, __flags_))
5434+
if (!std::regex_search(__start, __end_, __match_, *__pregex_, __flags_)) {
54295435
__match_ = value_type();
5436+
5437+
} else {
5438+
// The Standard mandates that if `regex_search` returns true ([re.regiter.incr]), "`match.prefix().first` shall be
5439+
// equal to the previous value of `match[0].second`... It is unspecified how the implementation makes these
5440+
// adjustments." The adjustment is necessary if we incremented `__start` above (the branch that deals with
5441+
// zero-length matches).
5442+
auto& __prefix = __match_.__prefix_;
5443+
__prefix.first = __prefix_start;
5444+
__prefix.matched = __prefix.first != __prefix.second;
5445+
}
5446+
54305447
return *this;
54315448
}
54325449

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
// <regex>
10+
11+
// Test that replacing zero-length matches works correctly.
12+
13+
#include <cassert>
14+
#include <regex>
15+
#include <string>
16+
#include "test_macros.h"
17+
18+
int main(int, char**) {
19+
// Various patterns that produce zero-length matches.
20+
assert(std::regex_replace("abc", std::regex(""), "!") == "!a!b!c!");
21+
assert(std::regex_replace("abc", std::regex("X*"), "!") == "!a!b!c!");
22+
assert(std::regex_replace("abc", std::regex("X{0,3}"), "!") == "!a!b!c!");
23+
24+
// Replacement string has several characters.
25+
assert(std::regex_replace("abc", std::regex(""), "[!]") == "[!]a[!]b[!]c[!]");
26+
27+
// Empty replacement string.
28+
assert(std::regex_replace("abc", std::regex(""), "") == "abc");
29+
30+
// Empty input.
31+
assert(std::regex_replace("", std::regex(""), "!") == "!");
32+
33+
// Not all matches are zero-length.
34+
assert(std::regex_replace("abCabCa", std::regex("C*"), "!") == "!a!b!!a!b!!a!");
35+
36+
return 0;
37+
}

libcxx/test/std/re/re.iter/re.regiter/re.regiter.incr/post.pass.cpp

Lines changed: 144 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -17,102 +17,151 @@
1717
#include <iterator>
1818
#include "test_macros.h"
1919

20-
int main(int, char**)
21-
{
22-
{
23-
std::regex phone_numbers("\\d{3}-\\d{4}");
24-
const char phone_book[] = "555-1234, 555-2345, 555-3456";
25-
std::cregex_iterator i(std::begin(phone_book), std::end(phone_book), phone_numbers);
26-
std::cregex_iterator i2 = i;
27-
assert(i != std::cregex_iterator());
28-
assert(i2!= std::cregex_iterator());
29-
assert((*i).size() == 1);
30-
assert((*i).position() == 0);
31-
assert((*i).str() == "555-1234");
32-
assert((*i2).size() == 1);
33-
assert((*i2).position() == 0);
34-
assert((*i2).str() == "555-1234");
35-
i++;
36-
assert(i != std::cregex_iterator());
37-
assert(i2!= std::cregex_iterator());
38-
assert((*i).size() == 1);
39-
assert((*i).position() == 10);
40-
assert((*i).str() == "555-2345");
41-
assert((*i2).size() == 1);
42-
assert((*i2).position() == 0);
43-
assert((*i2).str() == "555-1234");
44-
i++;
45-
assert(i != std::cregex_iterator());
46-
assert(i2!= std::cregex_iterator());
47-
assert((*i).size() == 1);
48-
assert((*i).position() == 20);
49-
assert((*i).str() == "555-3456");
50-
assert((*i2).size() == 1);
51-
assert((*i2).position() == 0);
52-
assert((*i2).str() == "555-1234");
53-
i++;
54-
assert(i == std::cregex_iterator());
55-
assert(i2!= std::cregex_iterator());
56-
assert((*i2).size() == 1);
57-
assert((*i2).position() == 0);
58-
assert((*i2).str() == "555-1234");
59-
}
60-
{
61-
std::regex phone_numbers("\\d{3}-\\d{4}");
62-
const char phone_book[] = "555-1234, 555-2345, 555-3456";
63-
std::cregex_iterator i(std::begin(phone_book), std::end(phone_book), phone_numbers);
64-
std::cregex_iterator i2 = i;
65-
assert(i != std::cregex_iterator());
66-
assert(i2!= std::cregex_iterator());
67-
assert((*i).size() == 1);
68-
assert((*i).position() == 0);
69-
assert((*i).str() == "555-1234");
70-
assert((*i2).size() == 1);
71-
assert((*i2).position() == 0);
72-
assert((*i2).str() == "555-1234");
73-
++i;
74-
assert(i != std::cregex_iterator());
75-
assert(i2!= std::cregex_iterator());
76-
assert((*i).size() == 1);
77-
assert((*i).position() == 10);
78-
assert((*i).str() == "555-2345");
79-
assert((*i2).size() == 1);
80-
assert((*i2).position() == 0);
81-
assert((*i2).str() == "555-1234");
82-
++i;
83-
assert(i != std::cregex_iterator());
84-
assert(i2!= std::cregex_iterator());
85-
assert((*i).size() == 1);
86-
assert((*i).position() == 20);
87-
assert((*i).str() == "555-3456");
88-
assert((*i2).size() == 1);
89-
assert((*i2).position() == 0);
90-
assert((*i2).str() == "555-1234");
91-
++i;
92-
assert(i == std::cregex_iterator());
93-
assert(i2!= std::cregex_iterator());
94-
assert((*i2).size() == 1);
95-
assert((*i2).position() == 0);
96-
assert((*i2).str() == "555-1234");
97-
}
98-
{ // https://llvm.org/PR33681
99-
std::regex rex(".*");
100-
const char foo[] = "foo";
20+
void validate_prefixes(const std::regex& empty_matching_pattern) {
21+
const char source[] = "abc";
22+
23+
std::cregex_iterator i(source, source + 3, empty_matching_pattern);
24+
assert(!i->prefix().matched);
25+
assert(i->prefix().length() == 0);
26+
assert(i->prefix().first == source);
27+
assert(i->prefix().second == source);
28+
29+
++i;
30+
assert(i->prefix().matched);
31+
assert(i->prefix().length() == 1);
32+
assert(i->prefix().first == source);
33+
assert(i->prefix().second == source + 1);
34+
assert(i->prefix().str() == "a");
35+
36+
++i;
37+
assert(i->prefix().matched);
38+
assert(i->prefix().length() == 1);
39+
assert(i->prefix().first == source + 1);
40+
assert(i->prefix().second == source + 2);
41+
assert(i->prefix().str() == "b");
42+
43+
++i;
44+
assert(i->prefix().matched);
45+
assert(i->prefix().length() == 1);
46+
assert(i->prefix().first == source + 2);
47+
assert(i->prefix().second == source + 3);
48+
assert(i->prefix().str() == "c");
49+
50+
++i;
51+
assert(i == std::cregex_iterator());
52+
}
53+
54+
void test_prefix_adjustment() {
55+
// Check that we correctly adjust the match prefix when dealing with zero-length matches -- this is explicitly
56+
// required by the Standard ([re.regiter.incr]: "In all cases in which the call to `regex_search` returns true,
57+
// `match.prefix().first` shall be equal to the previous value of `match[0].second`"). For a pattern that matches
58+
// empty sequences, there is an implicit zero-length match between every character in a string -- make sure the
59+
// prefix of each of these matches (except the first one) is the preceding character.
60+
61+
// An empty pattern produces zero-length matches.
62+
validate_prefixes(std::regex(""));
63+
// Any character repeated zero or more times can produce zero-length matches.
64+
validate_prefixes(std::regex("X*"));
65+
validate_prefixes(std::regex("X{0,3}"));
66+
}
67+
68+
int main(int, char**) {
69+
{
70+
std::regex phone_numbers("\\d{3}-\\d{4}");
71+
const char phone_book[] = "555-1234, 555-2345, 555-3456";
72+
std::cregex_iterator i(std::begin(phone_book), std::end(phone_book), phone_numbers);
73+
std::cregex_iterator i2 = i;
74+
assert(i != std::cregex_iterator());
75+
assert(i2 != std::cregex_iterator());
76+
assert((*i).size() == 1);
77+
assert((*i).position() == 0);
78+
assert((*i).str() == "555-1234");
79+
assert((*i2).size() == 1);
80+
assert((*i2).position() == 0);
81+
assert((*i2).str() == "555-1234");
82+
i++;
83+
assert(i != std::cregex_iterator());
84+
assert(i2 != std::cregex_iterator());
85+
assert((*i).size() == 1);
86+
assert((*i).position() == 10);
87+
assert((*i).str() == "555-2345");
88+
assert((*i2).size() == 1);
89+
assert((*i2).position() == 0);
90+
assert((*i2).str() == "555-1234");
91+
i++;
92+
assert(i != std::cregex_iterator());
93+
assert(i2 != std::cregex_iterator());
94+
assert((*i).size() == 1);
95+
assert((*i).position() == 20);
96+
assert((*i).str() == "555-3456");
97+
assert((*i2).size() == 1);
98+
assert((*i2).position() == 0);
99+
assert((*i2).str() == "555-1234");
100+
i++;
101+
assert(i == std::cregex_iterator());
102+
assert(i2 != std::cregex_iterator());
103+
assert((*i2).size() == 1);
104+
assert((*i2).position() == 0);
105+
assert((*i2).str() == "555-1234");
106+
}
107+
{
108+
std::regex phone_numbers("\\d{3}-\\d{4}");
109+
const char phone_book[] = "555-1234, 555-2345, 555-3456";
110+
std::cregex_iterator i(std::begin(phone_book), std::end(phone_book), phone_numbers);
111+
std::cregex_iterator i2 = i;
112+
assert(i != std::cregex_iterator());
113+
assert(i2 != std::cregex_iterator());
114+
assert((*i).size() == 1);
115+
assert((*i).position() == 0);
116+
assert((*i).str() == "555-1234");
117+
assert((*i2).size() == 1);
118+
assert((*i2).position() == 0);
119+
assert((*i2).str() == "555-1234");
120+
++i;
121+
assert(i != std::cregex_iterator());
122+
assert(i2 != std::cregex_iterator());
123+
assert((*i).size() == 1);
124+
assert((*i).position() == 10);
125+
assert((*i).str() == "555-2345");
126+
assert((*i2).size() == 1);
127+
assert((*i2).position() == 0);
128+
assert((*i2).str() == "555-1234");
129+
++i;
130+
assert(i != std::cregex_iterator());
131+
assert(i2 != std::cregex_iterator());
132+
assert((*i).size() == 1);
133+
assert((*i).position() == 20);
134+
assert((*i).str() == "555-3456");
135+
assert((*i2).size() == 1);
136+
assert((*i2).position() == 0);
137+
assert((*i2).str() == "555-1234");
138+
++i;
139+
assert(i == std::cregex_iterator());
140+
assert(i2 != std::cregex_iterator());
141+
assert((*i2).size() == 1);
142+
assert((*i2).position() == 0);
143+
assert((*i2).str() == "555-1234");
144+
}
145+
{ // https://llvm.org/PR33681
146+
std::regex rex(".*");
147+
const char foo[] = "foo";
101148
// The -1 is because we don't want the implicit null from the array.
102-
std::cregex_iterator i(std::begin(foo), std::end(foo) - 1, rex);
103-
std::cregex_iterator e;
104-
assert(i != e);
105-
assert((*i).size() == 1);
106-
assert((*i).str() == "foo");
107-
108-
++i;
109-
assert(i != e);
110-
assert((*i).size() == 1);
111-
assert((*i).str() == "");
112-
113-
++i;
114-
assert(i == e);
115-
}
149+
std::cregex_iterator i(std::begin(foo), std::end(foo) - 1, rex);
150+
std::cregex_iterator e;
151+
assert(i != e);
152+
assert((*i).size() == 1);
153+
assert((*i).str() == "foo");
154+
155+
++i;
156+
assert(i != e);
157+
assert((*i).size() == 1);
158+
assert((*i).str() == "");
159+
160+
++i;
161+
assert(i == e);
162+
}
163+
164+
test_prefix_adjustment();
116165

117166
return 0;
118167
}

0 commit comments

Comments
 (0)