Skip to content

[libc++][regex] Correctly adjust match prefix for zero-length matches. #94550

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions libcxx/include/regex
Original file line number Diff line number Diff line change
Expand Up @@ -4700,6 +4700,9 @@ private:

template <class, class>
friend class __lookahead;

template <class, class, class>
friend class regex_iterator;
};

template <class _BidirectionalIterator, class _Allocator>
Expand Down Expand Up @@ -5410,7 +5413,9 @@ template <class _BidirectionalIterator, class _CharT, class _Traits>
regex_iterator<_BidirectionalIterator, _CharT, _Traits>&
regex_iterator<_BidirectionalIterator, _CharT, _Traits>::operator++() {
__flags_ |= regex_constants::__no_update_pos;
_BidirectionalIterator __start = __match_[0].second;
_BidirectionalIterator __start = __match_[0].second;
_BidirectionalIterator __prefix_start = __start;

if (__match_[0].first == __match_[0].second) {
if (__start == __end_) {
__match_ = value_type();
Expand All @@ -5424,9 +5429,21 @@ regex_iterator<_BidirectionalIterator, _CharT, _Traits>::operator++() {
else
++__start;
}

__flags_ |= regex_constants::match_prev_avail;
if (!std::regex_search(__start, __end_, __match_, *__pregex_, __flags_))
if (!std::regex_search(__start, __end_, __match_, *__pregex_, __flags_)) {
__match_ = value_type();

} else {
// The Standard mandates that if `regex_search` returns true ([re.regiter.incr]), "`match.prefix().first` shall be
// equal to the previous value of `match[0].second`... It is unspecified how the implementation makes these
// adjustments." The adjustment is necessary if we incremented `__start` above (the branch that deals with
// zero-length matches).
auto& __prefix = __match_.__prefix_;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's another call to regex_search above, but I don't think prefix adjustment should ever be necessary there since it happens before __start is incremented.

__prefix.first = __prefix_start;
__prefix.matched = __prefix.first != __prefix.second;
}

return *this;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// <regex>

// Test that replacing zero-length matches works correctly.

#include <cassert>
#include <regex>
#include <string>
#include "test_macros.h"

int main(int, char**) {
// Various patterns that produce zero-length matches.
assert(std::regex_replace("abc", std::regex(""), "!") == "!a!b!c!");
assert(std::regex_replace("abc", std::regex("X*"), "!") == "!a!b!c!");
assert(std::regex_replace("abc", std::regex("X{0,3}"), "!") == "!a!b!c!");

// Replacement string has several characters.
assert(std::regex_replace("abc", std::regex(""), "[!]") == "[!]a[!]b[!]c[!]");

// Empty replacement string.
assert(std::regex_replace("abc", std::regex(""), "") == "abc");

// Empty input.
assert(std::regex_replace("", std::regex(""), "!") == "!");

// Not all matches are zero-length.
assert(std::regex_replace("abCabCa", std::regex("C*"), "!") == "!a!b!!a!b!!a!");

return 0;
}
239 changes: 144 additions & 95 deletions libcxx/test/std/re/re.iter/re.regiter/re.regiter.incr/post.pass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,102 +17,151 @@
#include <iterator>
#include "test_macros.h"

int main(int, char**)
{
{
std::regex phone_numbers("\\d{3}-\\d{4}");
const char phone_book[] = "555-1234, 555-2345, 555-3456";
std::cregex_iterator i(std::begin(phone_book), std::end(phone_book), phone_numbers);
std::cregex_iterator i2 = i;
assert(i != std::cregex_iterator());
assert(i2!= std::cregex_iterator());
assert((*i).size() == 1);
assert((*i).position() == 0);
assert((*i).str() == "555-1234");
assert((*i2).size() == 1);
assert((*i2).position() == 0);
assert((*i2).str() == "555-1234");
i++;
assert(i != std::cregex_iterator());
assert(i2!= std::cregex_iterator());
assert((*i).size() == 1);
assert((*i).position() == 10);
assert((*i).str() == "555-2345");
assert((*i2).size() == 1);
assert((*i2).position() == 0);
assert((*i2).str() == "555-1234");
i++;
assert(i != std::cregex_iterator());
assert(i2!= std::cregex_iterator());
assert((*i).size() == 1);
assert((*i).position() == 20);
assert((*i).str() == "555-3456");
assert((*i2).size() == 1);
assert((*i2).position() == 0);
assert((*i2).str() == "555-1234");
i++;
assert(i == std::cregex_iterator());
assert(i2!= std::cregex_iterator());
assert((*i2).size() == 1);
assert((*i2).position() == 0);
assert((*i2).str() == "555-1234");
}
{
std::regex phone_numbers("\\d{3}-\\d{4}");
const char phone_book[] = "555-1234, 555-2345, 555-3456";
std::cregex_iterator i(std::begin(phone_book), std::end(phone_book), phone_numbers);
std::cregex_iterator i2 = i;
assert(i != std::cregex_iterator());
assert(i2!= std::cregex_iterator());
assert((*i).size() == 1);
assert((*i).position() == 0);
assert((*i).str() == "555-1234");
assert((*i2).size() == 1);
assert((*i2).position() == 0);
assert((*i2).str() == "555-1234");
++i;
assert(i != std::cregex_iterator());
assert(i2!= std::cregex_iterator());
assert((*i).size() == 1);
assert((*i).position() == 10);
assert((*i).str() == "555-2345");
assert((*i2).size() == 1);
assert((*i2).position() == 0);
assert((*i2).str() == "555-1234");
++i;
assert(i != std::cregex_iterator());
assert(i2!= std::cregex_iterator());
assert((*i).size() == 1);
assert((*i).position() == 20);
assert((*i).str() == "555-3456");
assert((*i2).size() == 1);
assert((*i2).position() == 0);
assert((*i2).str() == "555-1234");
++i;
assert(i == std::cregex_iterator());
assert(i2!= std::cregex_iterator());
assert((*i2).size() == 1);
assert((*i2).position() == 0);
assert((*i2).str() == "555-1234");
}
{ // https://llvm.org/PR33681
std::regex rex(".*");
const char foo[] = "foo";
void validate_prefixes(const std::regex& empty_matching_pattern) {
const char source[] = "abc";

std::cregex_iterator i(source, source + 3, empty_matching_pattern);
assert(!i->prefix().matched);
assert(i->prefix().length() == 0);
assert(i->prefix().first == source);
assert(i->prefix().second == source);

++i;
assert(i->prefix().matched);
assert(i->prefix().length() == 1);
assert(i->prefix().first == source);
assert(i->prefix().second == source + 1);
assert(i->prefix().str() == "a");

++i;
assert(i->prefix().matched);
assert(i->prefix().length() == 1);
assert(i->prefix().first == source + 1);
assert(i->prefix().second == source + 2);
assert(i->prefix().str() == "b");

++i;
assert(i->prefix().matched);
assert(i->prefix().length() == 1);
assert(i->prefix().first == source + 2);
assert(i->prefix().second == source + 3);
assert(i->prefix().str() == "c");

++i;
assert(i == std::cregex_iterator());
}

void test_prefix_adjustment() {
// Check that we correctly adjust the match prefix when dealing with zero-length matches -- this is explicitly
// required by the Standard ([re.regiter.incr]: "In all cases in which the call to `regex_search` returns true,
// `match.prefix().first` shall be equal to the previous value of `match[0].second`"). For a pattern that matches
// empty sequences, there is an implicit zero-length match between every character in a string -- make sure the
// prefix of each of these matches (except the first one) is the preceding character.

// An empty pattern produces zero-length matches.
validate_prefixes(std::regex(""));
// Any character repeated zero or more times can produce zero-length matches.
validate_prefixes(std::regex("X*"));
validate_prefixes(std::regex("X{0,3}"));
}

int main(int, char**) {
{
std::regex phone_numbers("\\d{3}-\\d{4}");
const char phone_book[] = "555-1234, 555-2345, 555-3456";
std::cregex_iterator i(std::begin(phone_book), std::end(phone_book), phone_numbers);
std::cregex_iterator i2 = i;
assert(i != std::cregex_iterator());
assert(i2 != std::cregex_iterator());
assert((*i).size() == 1);
assert((*i).position() == 0);
assert((*i).str() == "555-1234");
assert((*i2).size() == 1);
assert((*i2).position() == 0);
assert((*i2).str() == "555-1234");
i++;
assert(i != std::cregex_iterator());
assert(i2 != std::cregex_iterator());
assert((*i).size() == 1);
assert((*i).position() == 10);
assert((*i).str() == "555-2345");
assert((*i2).size() == 1);
assert((*i2).position() == 0);
assert((*i2).str() == "555-1234");
i++;
assert(i != std::cregex_iterator());
assert(i2 != std::cregex_iterator());
assert((*i).size() == 1);
assert((*i).position() == 20);
assert((*i).str() == "555-3456");
assert((*i2).size() == 1);
assert((*i2).position() == 0);
assert((*i2).str() == "555-1234");
i++;
assert(i == std::cregex_iterator());
assert(i2 != std::cregex_iterator());
assert((*i2).size() == 1);
assert((*i2).position() == 0);
assert((*i2).str() == "555-1234");
}
{
std::regex phone_numbers("\\d{3}-\\d{4}");
const char phone_book[] = "555-1234, 555-2345, 555-3456";
std::cregex_iterator i(std::begin(phone_book), std::end(phone_book), phone_numbers);
std::cregex_iterator i2 = i;
assert(i != std::cregex_iterator());
assert(i2 != std::cregex_iterator());
assert((*i).size() == 1);
assert((*i).position() == 0);
assert((*i).str() == "555-1234");
assert((*i2).size() == 1);
assert((*i2).position() == 0);
assert((*i2).str() == "555-1234");
++i;
assert(i != std::cregex_iterator());
assert(i2 != std::cregex_iterator());
assert((*i).size() == 1);
assert((*i).position() == 10);
assert((*i).str() == "555-2345");
assert((*i2).size() == 1);
assert((*i2).position() == 0);
assert((*i2).str() == "555-1234");
++i;
assert(i != std::cregex_iterator());
assert(i2 != std::cregex_iterator());
assert((*i).size() == 1);
assert((*i).position() == 20);
assert((*i).str() == "555-3456");
assert((*i2).size() == 1);
assert((*i2).position() == 0);
assert((*i2).str() == "555-1234");
++i;
assert(i == std::cregex_iterator());
assert(i2 != std::cregex_iterator());
assert((*i2).size() == 1);
assert((*i2).position() == 0);
assert((*i2).str() == "555-1234");
}
{ // https://llvm.org/PR33681
std::regex rex(".*");
const char foo[] = "foo";
// The -1 is because we don't want the implicit null from the array.
std::cregex_iterator i(std::begin(foo), std::end(foo) - 1, rex);
std::cregex_iterator e;
assert(i != e);
assert((*i).size() == 1);
assert((*i).str() == "foo");

++i;
assert(i != e);
assert((*i).size() == 1);
assert((*i).str() == "");

++i;
assert(i == e);
}
std::cregex_iterator i(std::begin(foo), std::end(foo) - 1, rex);
std::cregex_iterator e;
assert(i != e);
assert((*i).size() == 1);
assert((*i).str() == "foo");

++i;
assert(i != e);
assert((*i).size() == 1);
assert((*i).str() == "");

++i;
assert(i == e);
}

test_prefix_adjustment();

return 0;
}
Loading