Skip to content

RFC: Locale-independent case conversion #7506

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions UPGRADING
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@ PHP 8.2 UPGRADE NOTES
1. Backward Incompatible Changes
========================================

- Standard:
. strtolower() and strtoupper() are no longer locale-sensitive. They now
perform ASCII case conversion, as if the locale were "C". Use
mb_strtolower() or mb_strtoupper() if you want localized case conversion.
Similarly, stristr(), stripos(), strripos(), lcfirst(), ucfirst(),
ucwords(), str_ireplace(), array_change_key_case() and sorting with
SORT_FLAG_CASE now use ASCII case conversion.

========================================
2. New Features
========================================
Expand Down
25 changes: 19 additions & 6 deletions Zend/tests/lc_ctype_inheritance.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,30 @@ LC_CTYPE=de_DE
--FILE--
<?php

echo "inherited\n";
echo 'ctype_lower(\xe4): ' . (ctype_lower("\xe4") ? 'y' : 'n') . "\n";
echo 'preg_match(\w, \xe4): ';
var_dump(preg_match('/\w/', "\xe4"));

var_dump(setlocale(LC_CTYPE, "0"));
var_dump(bin2hex(strtoupper("\xe4")));
echo 'ctype_lower(\xe4): ' . (ctype_lower("\xe4") ? 'y' : 'n') . "\n";
echo 'preg_match(\w, \xe4): ';
var_dump(preg_match('/\w/', "\xe4"));

echo "de_DE\n";
var_dump(setlocale(LC_CTYPE, "de_DE", "de-DE") !== false);
var_dump(bin2hex(strtoupper("\xe4")));
echo 'ctype_lower(\xe4): ' . (ctype_lower("\xe4") ? 'y' : 'n') . "\n";
echo 'preg_match(\w, \xe4): ';
var_dump(preg_match('/\w/', "\xe4"));
?>
--EXPECTF--
inherited
ctype_lower(\xe4): n
preg_match(\w, \xe4): int(0)
string(%d) "C%r(\.UTF-8)?%r"
string(2) "e4"
int(0)
ctype_lower(\xe4): n
preg_match(\w, \xe4): int(0)
de_DE
bool(true)
string(2) "c4"
int(1)
ctype_lower(\xe4): y
preg_match(\w, \xe4): int(1)
8 changes: 4 additions & 4 deletions Zend/zend_operators.c
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,9 @@ ZEND_API const unsigned char zend_toupper_map[256] = {
* Functions using locale lowercase:
zend_binary_strncasecmp_l
zend_binary_strcasecmp_l
* Functions using ascii lowercase:
string_compare_function_ex
string_case_compare_function
* Functions using ascii lowercase:
zend_str_tolower_copy
zend_str_tolower_dup
zend_str_tolower
Expand Down Expand Up @@ -1997,7 +1997,7 @@ ZEND_API int ZEND_FASTCALL string_compare_function_ex(zval *op1, zval *op2, bool
int ret;

if (case_insensitive) {
ret = zend_binary_strcasecmp_l(ZSTR_VAL(str1), ZSTR_LEN(str1), ZSTR_VAL(str2), ZSTR_LEN(str1));
ret = zend_binary_strcasecmp(ZSTR_VAL(str1), ZSTR_LEN(str1), ZSTR_VAL(str2), ZSTR_LEN(str1));
} else {
ret = zend_binary_strcmp(ZSTR_VAL(str1), ZSTR_LEN(str1), ZSTR_VAL(str2), ZSTR_LEN(str2));
}
Expand Down Expand Up @@ -2037,13 +2037,13 @@ ZEND_API int ZEND_FASTCALL string_case_compare_function(zval *op1, zval *op2) /*
if (Z_STR_P(op1) == Z_STR_P(op2)) {
return 0;
} else {
return zend_binary_strcasecmp_l(Z_STRVAL_P(op1), Z_STRLEN_P(op1), Z_STRVAL_P(op2), Z_STRLEN_P(op2));
return zend_binary_strcasecmp(Z_STRVAL_P(op1), Z_STRLEN_P(op1), Z_STRVAL_P(op2), Z_STRLEN_P(op2));
}
} else {
zend_string *tmp_str1, *tmp_str2;
zend_string *str1 = zval_get_tmp_string(op1, &tmp_str1);
zend_string *str2 = zval_get_tmp_string(op2, &tmp_str2);
int ret = zend_binary_strcasecmp_l(ZSTR_VAL(str1), ZSTR_LEN(str1), ZSTR_VAL(str2), ZSTR_LEN(str1));
int ret = zend_binary_strcasecmp(ZSTR_VAL(str1), ZSTR_LEN(str1), ZSTR_VAL(str2), ZSTR_LEN(str1));

zend_tmp_string_release(tmp_str1);
zend_tmp_string_release(tmp_str2);
Expand Down
1 change: 0 additions & 1 deletion ext/pdo_dblib/dblib_stmt.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

#include "php.h"
#include "php_ini.h"
#include "ext/standard/php_string.h"
#include "ext/standard/info.h"
#include "pdo/php_pdo.h"
#include "pdo/php_pdo_driver.h"
Expand Down
122 changes: 25 additions & 97 deletions ext/standard/string.c
Original file line number Diff line number Diff line change
Expand Up @@ -1346,52 +1346,15 @@ PHP_FUNCTION(strtok)
/* {{{ php_strtoupper */
PHPAPI char *php_strtoupper(char *s, size_t len)
{
unsigned char *c;
const unsigned char *e;

c = (unsigned char *)s;
e = (unsigned char *)c+len;

while (c < e) {
*c = toupper(*c);
c++;
}
zend_str_toupper(s, len);
return s;
}
/* }}} */

/* {{{ php_string_toupper */
PHPAPI zend_string *php_string_toupper(zend_string *s)
{
unsigned char *c;
const unsigned char *e;

if (EXPECTED(!BG(ctype_string))) {
return zend_string_toupper(s);
}
c = (unsigned char *)ZSTR_VAL(s);
e = c + ZSTR_LEN(s);

while (c < e) {
if (islower(*c)) {
unsigned char *r;
zend_string *res = zend_string_alloc(ZSTR_LEN(s), 0);

if (c != (unsigned char*)ZSTR_VAL(s)) {
memcpy(ZSTR_VAL(res), ZSTR_VAL(s), c - (unsigned char*)ZSTR_VAL(s));
}
r = c + (ZSTR_VAL(res) - ZSTR_VAL(s));
while (c < e) {
*r = toupper(*c);
r++;
c++;
}
*r = '\0';
return res;
}
c++;
}
return zend_string_copy(s);
return zend_string_toupper(s);
}
/* }}} */

Expand All @@ -1404,56 +1367,22 @@ PHP_FUNCTION(strtoupper)
Z_PARAM_STR(arg)
ZEND_PARSE_PARAMETERS_END();

RETURN_STR(php_string_toupper(arg));
RETURN_STR(zend_string_toupper(arg));
}
/* }}} */

/* {{{ php_strtolower */
PHPAPI char *php_strtolower(char *s, size_t len)
{
unsigned char *c;
const unsigned char *e;

c = (unsigned char *)s;
e = c+len;

while (c < e) {
*c = tolower(*c);
c++;
}
zend_str_tolower(s, len);
return s;
}
/* }}} */

/* {{{ php_string_tolower */
PHPAPI zend_string *php_string_tolower(zend_string *s)
{
if (EXPECTED(!BG(ctype_string))) {
return zend_string_tolower(s);
}

unsigned char *c = (unsigned char *)ZSTR_VAL(s);
const unsigned char *e = c + ZSTR_LEN(s);
while (c < e) {
if (isupper(*c)) {
unsigned char *r;
zend_string *res = zend_string_alloc(ZSTR_LEN(s), 0);

if (c != (unsigned char*)ZSTR_VAL(s)) {
memcpy(ZSTR_VAL(res), ZSTR_VAL(s), c - (unsigned char*)ZSTR_VAL(s));
}
r = c + (ZSTR_VAL(res) - ZSTR_VAL(s));
while (c < e) {
*r = tolower(*c);
r++;
c++;
}
*r = '\0';
return res;
}
c++;
}
return zend_string_copy(s);
return zend_string_tolower(s);
}
/* }}} */

Expand All @@ -1466,7 +1395,7 @@ PHP_FUNCTION(strtolower)
Z_PARAM_STR(str)
ZEND_PARSE_PARAMETERS_END();

RETURN_STR(php_string_tolower(str));
RETURN_STR(zend_string_tolower(str));
}
/* }}} */

Expand Down Expand Up @@ -1758,8 +1687,8 @@ PHP_FUNCTION(pathinfo)
case insensitive strstr */
PHPAPI char *php_stristr(char *s, char *t, size_t s_len, size_t t_len)
{
php_strtolower(s, s_len);
php_strtolower(t, t_len);
zend_str_tolower(s, s_len);
zend_str_tolower(t, t_len);
return (char*)php_memnstr(s, t, t_len, s + s_len);
}
/* }}} */
Expand Down Expand Up @@ -1982,8 +1911,8 @@ PHP_FUNCTION(stripos)
RETURN_FALSE;
}

haystack_dup = php_string_tolower(haystack);
needle_dup = php_string_tolower(needle);
haystack_dup = zend_string_tolower(haystack);
needle_dup = zend_string_tolower(needle);
found = (char*)php_memnstr(ZSTR_VAL(haystack_dup) + offset,
ZSTR_VAL(needle_dup), ZSTR_LEN(needle_dup), ZSTR_VAL(haystack_dup) + ZSTR_LEN(haystack));

Expand Down Expand Up @@ -2077,18 +2006,17 @@ PHP_FUNCTION(strripos)
}
e = ZSTR_VAL(haystack) + (ZSTR_LEN(haystack) + (size_t)offset);
}
/* Borrow that ord_needle buffer to avoid repeatedly tolower()ing needle */
lowered = tolower(*ZSTR_VAL(needle));
lowered = zend_tolower_ascii(*ZSTR_VAL(needle));
while (e >= p) {
if (tolower(*e) == lowered) {
if (zend_tolower_ascii(*e) == lowered) {
RETURN_LONG(e - p + (offset > 0 ? offset : 0));
}
e--;
}
RETURN_FALSE;
}

haystack_dup = php_string_tolower(haystack);
haystack_dup = zend_string_tolower(haystack);
if (offset >= 0) {
if ((size_t)offset > ZSTR_LEN(haystack)) {
zend_string_release_ex(haystack_dup, 0);
Expand All @@ -2112,7 +2040,7 @@ PHP_FUNCTION(strripos)
}
}

needle_dup = php_string_tolower(needle);
needle_dup = zend_string_tolower(needle);
if ((found = (char *)zend_memnrstr(p, ZSTR_VAL(needle_dup), ZSTR_LEN(needle_dup), e))) {
RETVAL_LONG(found - ZSTR_VAL(haystack_dup));
zend_string_release_ex(needle_dup, 0);
Expand Down Expand Up @@ -2647,7 +2575,7 @@ PHP_FUNCTION(chr)
static zend_string* php_ucfirst(zend_string *str)
{
const unsigned char ch = ZSTR_VAL(str)[0];
unsigned char r = toupper(ch);
unsigned char r = zend_toupper_ascii(ch);
if (r == ch) {
return zend_string_copy(str);
} else {
Expand Down Expand Up @@ -2679,7 +2607,7 @@ PHP_FUNCTION(ucfirst)
Lowercase the first character of the word in a native string */
static zend_string* php_lcfirst(zend_string *str)
{
unsigned char r = tolower(ZSTR_VAL(str)[0]);
unsigned char r = zend_tolower_ascii(ZSTR_VAL(str)[0]);
if (r == ZSTR_VAL(str)[0]) {
return zend_string_copy(str);
} else {
Expand Down Expand Up @@ -2732,10 +2660,10 @@ PHP_FUNCTION(ucwords)
ZVAL_STRINGL(return_value, ZSTR_VAL(str), ZSTR_LEN(str));
r = Z_STRVAL_P(return_value);

*r = toupper((unsigned char) *r);
*r = zend_toupper_ascii((unsigned char) *r);
for (r_end = r + Z_STRLEN_P(return_value) - 1; r < r_end; ) {
if (mask[(unsigned char)*r++]) {
*r = toupper((unsigned char) *r);
*r = zend_toupper_ascii((unsigned char) *r);
}
}
}
Expand Down Expand Up @@ -3067,11 +2995,11 @@ static zend_string* php_char_to_str_ex(zend_string *str, char from, char *to, si
if (case_sensitivity) {
char_count = count_chars(ZSTR_VAL(str), ZSTR_LEN(str), from);
} else {
lc_from = tolower(from);
char_count = 0;
lc_from = zend_tolower_ascii(from);
source_end = ZSTR_VAL(str) + ZSTR_LEN(str);
for (source = ZSTR_VAL(str); source < source_end; source++) {
if (tolower(*source) == lc_from) {
if (zend_tolower_ascii(*source) == lc_from) {
char_count++;
}
}
Expand Down Expand Up @@ -3111,7 +3039,7 @@ static zend_string* php_char_to_str_ex(zend_string *str, char from, char *to, si
} else {
source_end = ZSTR_VAL(str) + ZSTR_LEN(str);
for (source = ZSTR_VAL(str); source < source_end; source++) {
if (tolower(*source) == lc_from) {
if (zend_tolower_ascii(*source) == lc_from) {
memcpy(target, to, to_len);
target += to_len;
} else {
Expand Down Expand Up @@ -4345,7 +4273,7 @@ static zend_long php_str_replace_in_subject(
zend_long old_replace_count = replace_count;

if (!lc_subject_str) {
lc_subject_str = php_string_tolower(subject_str);
lc_subject_str = zend_string_tolower(subject_str);
}
tmp_result = php_str_to_str_i_ex(subject_str, ZSTR_VAL(lc_subject_str),
search_str, replace_value, replace_len, &replace_count);
Expand Down Expand Up @@ -4398,7 +4326,7 @@ static zend_long php_str_replace_in_subject(
ZSTR_VAL(search_str), ZSTR_LEN(search_str),
ZSTR_VAL(replace_str), ZSTR_LEN(replace_str), &replace_count));
} else {
lc_subject_str = php_string_tolower(subject_str);
lc_subject_str = zend_string_tolower(subject_str);
ZVAL_STR(result, php_str_to_str_i_ex(subject_str, ZSTR_VAL(lc_subject_str),
search_str, ZSTR_VAL(replace_str), ZSTR_LEN(replace_str), &replace_count));
zend_string_release_ex(lc_subject_str, 0);
Expand Down Expand Up @@ -4941,7 +4869,7 @@ int php_tag_find(char *tag, size_t len, const char *set) {

n = norm;
t = tag;
c = tolower(*t);
c = zend_tolower_ascii(*t);
/*
normalize the tag removing leading and trailing whitespace
and turn any <a whatever...> into just <a> and any </tag>
Expand Down Expand Up @@ -4969,7 +4897,7 @@ int php_tag_find(char *tag, size_t len, const char *set) {
}
break;
}
c = tolower(*(++t));
c = zend_tolower_ascii(*(++t));
}
*(n++) = '>';
*n = '\0';
Expand Down
13 changes: 0 additions & 13 deletions ext/standard/tests/strings/bug79986.phpt

This file was deleted.

Loading