Skip to content

Commit 585d34c

Browse files
committed
Make strtolower() and strtoupper() do ASCII case conversion
Implement RFC https://wiki.php.net/rfc/strtolower-ascii
1 parent ad9c10b commit 585d34c

12 files changed

+317
-865
lines changed

UPGRADING

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,13 @@ PHP 8.2 UPGRADE NOTES
1919
1. Backward Incompatible Changes
2020
========================================
2121

22+
- Standard:
23+
. strtolower() and strtoupper() are no longer locale-sensitive. They now
24+
perform ASCII case conversion. Use mb_strtolower() if you want localized
25+
case conversion. Similarly, stristr(), stripos(), strripos(), lcfirst(), ucfirst(),
26+
ucwords(), str_ireplace(), array_change_key_case() and sorting with SORT_FLAG_CASE
27+
now use ASCII case conversion.
28+
2229
========================================
2330
2. New Features
2431
========================================

Zend/tests/lc_ctype_inheritance.phpt

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,30 @@ LC_CTYPE=de_DE
99
--FILE--
1010
<?php
1111

12+
echo "inherited\n";
13+
echo 'ctype_lower(\xe4): ' . (ctype_lower("\xe4") ? 'y' : 'n') . "\n";
14+
echo 'preg_match(\w, \xe4): ';
15+
var_dump(preg_match('/\w/', "\xe4"));
16+
1217
var_dump(setlocale(LC_CTYPE, "0"));
13-
var_dump(bin2hex(strtoupper("\xe4")));
18+
echo 'ctype_lower(\xe4): ' . (ctype_lower("\xe4") ? 'y' : 'n') . "\n";
19+
echo 'preg_match(\w, \xe4): ';
1420
var_dump(preg_match('/\w/', "\xe4"));
21+
22+
echo "de_DE\n";
1523
var_dump(setlocale(LC_CTYPE, "de_DE", "de-DE") !== false);
16-
var_dump(bin2hex(strtoupper("\xe4")));
24+
echo 'ctype_lower(\xe4): ' . (ctype_lower("\xe4") ? 'y' : 'n') . "\n";
25+
echo 'preg_match(\w, \xe4): ';
1726
var_dump(preg_match('/\w/', "\xe4"));
1827
?>
1928
--EXPECT--
29+
inherited
30+
ctype_lower(\xe4): n
31+
preg_match(\w, \xe4): int(0)
2032
string(1) "C"
21-
string(2) "e4"
22-
int(0)
33+
ctype_lower(\xe4): n
34+
preg_match(\w, \xe4): int(0)
35+
de_DE
2336
bool(true)
24-
string(2) "c4"
25-
int(1)
37+
ctype_lower(\xe4): y
38+
preg_match(\w, \xe4): int(1)

Zend/zend_operators.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,9 @@ ZEND_API const unsigned char zend_toupper_map[256] = {
126126
* Functions using locale lowercase:
127127
zend_binary_strncasecmp_l
128128
zend_binary_strcasecmp_l
129+
* Functions using ASCII lowercase:
129130
string_compare_function_ex
130131
string_case_compare_function
131-
* Functions using ascii lowercase:
132132
zend_str_tolower_copy
133133
zend_str_tolower_dup
134134
zend_str_tolower
@@ -1997,7 +1997,7 @@ ZEND_API int ZEND_FASTCALL string_compare_function_ex(zval *op1, zval *op2, bool
19971997
int ret;
19981998

19991999
if (case_insensitive) {
2000-
ret = zend_binary_strcasecmp_l(ZSTR_VAL(str1), ZSTR_LEN(str1), ZSTR_VAL(str2), ZSTR_LEN(str1));
2000+
ret = zend_binary_strcasecmp(ZSTR_VAL(str1), ZSTR_LEN(str1), ZSTR_VAL(str2), ZSTR_LEN(str1));
20012001
} else {
20022002
ret = zend_binary_strcmp(ZSTR_VAL(str1), ZSTR_LEN(str1), ZSTR_VAL(str2), ZSTR_LEN(str2));
20032003
}
@@ -2037,13 +2037,13 @@ ZEND_API int ZEND_FASTCALL string_case_compare_function(zval *op1, zval *op2) /*
20372037
if (Z_STR_P(op1) == Z_STR_P(op2)) {
20382038
return 0;
20392039
} else {
2040-
return zend_binary_strcasecmp_l(Z_STRVAL_P(op1), Z_STRLEN_P(op1), Z_STRVAL_P(op2), Z_STRLEN_P(op2));
2040+
return zend_binary_strcasecmp(Z_STRVAL_P(op1), Z_STRLEN_P(op1), Z_STRVAL_P(op2), Z_STRLEN_P(op2));
20412041
}
20422042
} else {
20432043
zend_string *tmp_str1, *tmp_str2;
20442044
zend_string *str1 = zval_get_tmp_string(op1, &tmp_str1);
20452045
zend_string *str2 = zval_get_tmp_string(op2, &tmp_str2);
2046-
int ret = zend_binary_strcasecmp_l(ZSTR_VAL(str1), ZSTR_LEN(str1), ZSTR_VAL(str2), ZSTR_LEN(str1));
2046+
int ret = zend_binary_strcasecmp(ZSTR_VAL(str1), ZSTR_LEN(str1), ZSTR_VAL(str2), ZSTR_LEN(str1));
20472047

20482048
zend_tmp_string_release(tmp_str1);
20492049
zend_tmp_string_release(tmp_str2);

ext/pdo_dblib/dblib_stmt.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
#include "php.h"
2323
#include "php_ini.h"
24-
#include "ext/standard/php_string.h"
2524
#include "ext/standard/info.h"
2625
#include "pdo/php_pdo.h"
2726
#include "pdo/php_pdo_driver.h"

ext/standard/string.c

Lines changed: 25 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -1342,52 +1342,15 @@ PHP_FUNCTION(strtok)
13421342
/* {{{ php_strtoupper */
13431343
PHPAPI char *php_strtoupper(char *s, size_t len)
13441344
{
1345-
unsigned char *c;
1346-
const unsigned char *e;
1347-
1348-
c = (unsigned char *)s;
1349-
e = (unsigned char *)c+len;
1350-
1351-
while (c < e) {
1352-
*c = toupper(*c);
1353-
c++;
1354-
}
1345+
zend_str_toupper(s, len);
13551346
return s;
13561347
}
13571348
/* }}} */
13581349

13591350
/* {{{ php_string_toupper */
13601351
PHPAPI zend_string *php_string_toupper(zend_string *s)
13611352
{
1362-
unsigned char *c;
1363-
const unsigned char *e;
1364-
1365-
if (EXPECTED(!BG(ctype_string))) {
1366-
return zend_string_toupper(s);
1367-
}
1368-
c = (unsigned char *)ZSTR_VAL(s);
1369-
e = c + ZSTR_LEN(s);
1370-
1371-
while (c < e) {
1372-
if (islower(*c)) {
1373-
unsigned char *r;
1374-
zend_string *res = zend_string_alloc(ZSTR_LEN(s), 0);
1375-
1376-
if (c != (unsigned char*)ZSTR_VAL(s)) {
1377-
memcpy(ZSTR_VAL(res), ZSTR_VAL(s), c - (unsigned char*)ZSTR_VAL(s));
1378-
}
1379-
r = c + (ZSTR_VAL(res) - ZSTR_VAL(s));
1380-
while (c < e) {
1381-
*r = toupper(*c);
1382-
r++;
1383-
c++;
1384-
}
1385-
*r = '\0';
1386-
return res;
1387-
}
1388-
c++;
1389-
}
1390-
return zend_string_copy(s);
1353+
return zend_string_toupper(s);
13911354
}
13921355
/* }}} */
13931356

@@ -1400,56 +1363,22 @@ PHP_FUNCTION(strtoupper)
14001363
Z_PARAM_STR(arg)
14011364
ZEND_PARSE_PARAMETERS_END();
14021365

1403-
RETURN_STR(php_string_toupper(arg));
1366+
RETURN_STR(zend_string_toupper(arg));
14041367
}
14051368
/* }}} */
14061369

14071370
/* {{{ php_strtolower */
14081371
PHPAPI char *php_strtolower(char *s, size_t len)
14091372
{
1410-
unsigned char *c;
1411-
const unsigned char *e;
1412-
1413-
c = (unsigned char *)s;
1414-
e = c+len;
1415-
1416-
while (c < e) {
1417-
*c = tolower(*c);
1418-
c++;
1419-
}
1373+
zend_str_tolower(s, len);
14201374
return s;
14211375
}
14221376
/* }}} */
14231377

14241378
/* {{{ php_string_tolower */
14251379
PHPAPI zend_string *php_string_tolower(zend_string *s)
14261380
{
1427-
if (EXPECTED(!BG(ctype_string))) {
1428-
return zend_string_tolower(s);
1429-
}
1430-
1431-
unsigned char *c = (unsigned char *)ZSTR_VAL(s);
1432-
const unsigned char *e = c + ZSTR_LEN(s);
1433-
while (c < e) {
1434-
if (isupper(*c)) {
1435-
unsigned char *r;
1436-
zend_string *res = zend_string_alloc(ZSTR_LEN(s), 0);
1437-
1438-
if (c != (unsigned char*)ZSTR_VAL(s)) {
1439-
memcpy(ZSTR_VAL(res), ZSTR_VAL(s), c - (unsigned char*)ZSTR_VAL(s));
1440-
}
1441-
r = c + (ZSTR_VAL(res) - ZSTR_VAL(s));
1442-
while (c < e) {
1443-
*r = tolower(*c);
1444-
r++;
1445-
c++;
1446-
}
1447-
*r = '\0';
1448-
return res;
1449-
}
1450-
c++;
1451-
}
1452-
return zend_string_copy(s);
1381+
return zend_string_tolower(s);
14531382
}
14541383
/* }}} */
14551384

@@ -1462,7 +1391,7 @@ PHP_FUNCTION(strtolower)
14621391
Z_PARAM_STR(str)
14631392
ZEND_PARSE_PARAMETERS_END();
14641393

1465-
RETURN_STR(php_string_tolower(str));
1394+
RETURN_STR(zend_string_tolower(str));
14661395
}
14671396
/* }}} */
14681397

@@ -1754,8 +1683,8 @@ PHP_FUNCTION(pathinfo)
17541683
case insensitive strstr */
17551684
PHPAPI char *php_stristr(char *s, char *t, size_t s_len, size_t t_len)
17561685
{
1757-
php_strtolower(s, s_len);
1758-
php_strtolower(t, t_len);
1686+
zend_str_tolower(s, s_len);
1687+
zend_str_tolower(t, t_len);
17591688
return (char*)php_memnstr(s, t, t_len, s + s_len);
17601689
}
17611690
/* }}} */
@@ -1978,8 +1907,8 @@ PHP_FUNCTION(stripos)
19781907
RETURN_FALSE;
19791908
}
19801909

1981-
haystack_dup = php_string_tolower(haystack);
1982-
needle_dup = php_string_tolower(needle);
1910+
haystack_dup = zend_string_tolower(haystack);
1911+
needle_dup = zend_string_tolower(needle);
19831912
found = (char*)php_memnstr(ZSTR_VAL(haystack_dup) + offset,
19841913
ZSTR_VAL(needle_dup), ZSTR_LEN(needle_dup), ZSTR_VAL(haystack_dup) + ZSTR_LEN(haystack));
19851914

@@ -2073,18 +2002,17 @@ PHP_FUNCTION(strripos)
20732002
}
20742003
e = ZSTR_VAL(haystack) + (ZSTR_LEN(haystack) + (size_t)offset);
20752004
}
2076-
/* Borrow that ord_needle buffer to avoid repeatedly tolower()ing needle */
2077-
lowered = tolower(*ZSTR_VAL(needle));
2005+
lowered = zend_tolower_ascii(*ZSTR_VAL(needle));
20782006
while (e >= p) {
2079-
if (tolower(*e) == lowered) {
2007+
if (zend_tolower_ascii(*e) == lowered) {
20802008
RETURN_LONG(e - p + (offset > 0 ? offset : 0));
20812009
}
20822010
e--;
20832011
}
20842012
RETURN_FALSE;
20852013
}
20862014

2087-
haystack_dup = php_string_tolower(haystack);
2015+
haystack_dup = zend_string_tolower(haystack);
20882016
if (offset >= 0) {
20892017
if ((size_t)offset > ZSTR_LEN(haystack)) {
20902018
zend_string_release_ex(haystack_dup, 0);
@@ -2108,7 +2036,7 @@ PHP_FUNCTION(strripos)
21082036
}
21092037
}
21102038

2111-
needle_dup = php_string_tolower(needle);
2039+
needle_dup = zend_string_tolower(needle);
21122040
if ((found = (char *)zend_memnrstr(p, ZSTR_VAL(needle_dup), ZSTR_LEN(needle_dup), e))) {
21132041
RETVAL_LONG(found - ZSTR_VAL(haystack_dup));
21142042
zend_string_release_ex(needle_dup, 0);
@@ -2603,7 +2531,7 @@ PHP_FUNCTION(chr)
26032531
static zend_string* php_ucfirst(zend_string *str)
26042532
{
26052533
const unsigned char ch = ZSTR_VAL(str)[0];
2606-
unsigned char r = toupper(ch);
2534+
unsigned char r = zend_toupper_ascii(ch);
26072535
if (r == ch) {
26082536
return zend_string_copy(str);
26092537
} else {
@@ -2635,7 +2563,7 @@ PHP_FUNCTION(ucfirst)
26352563
Lowercase the first character of the word in a native string */
26362564
static zend_string* php_lcfirst(zend_string *str)
26372565
{
2638-
unsigned char r = tolower(ZSTR_VAL(str)[0]);
2566+
unsigned char r = zend_tolower_ascii(ZSTR_VAL(str)[0]);
26392567
if (r == ZSTR_VAL(str)[0]) {
26402568
return zend_string_copy(str);
26412569
} else {
@@ -2688,10 +2616,10 @@ PHP_FUNCTION(ucwords)
26882616
ZVAL_STRINGL(return_value, ZSTR_VAL(str), ZSTR_LEN(str));
26892617
r = Z_STRVAL_P(return_value);
26902618

2691-
*r = toupper((unsigned char) *r);
2619+
*r = zend_toupper_ascii((unsigned char) *r);
26922620
for (r_end = r + Z_STRLEN_P(return_value) - 1; r < r_end; ) {
26932621
if (mask[(unsigned char)*r++]) {
2694-
*r = toupper((unsigned char) *r);
2622+
*r = zend_toupper_ascii((unsigned char) *r);
26952623
}
26962624
}
26972625
}
@@ -2937,9 +2865,9 @@ static zend_string* php_char_to_str_ex(zend_string *str, char from, char *to, si
29372865
p++;
29382866
}
29392867
} else {
2940-
lc_from = tolower(from);
2868+
lc_from = zend_tolower_ascii(from);
29412869
for (source = ZSTR_VAL(str); source < source_end; source++) {
2942-
if (tolower(*source) == lc_from) {
2870+
if (zend_tolower_ascii(*source) == lc_from) {
29432871
char_count++;
29442872
}
29452873
}
@@ -2975,7 +2903,7 @@ static zend_string* php_char_to_str_ex(zend_string *str, char from, char *to, si
29752903
}
29762904
} else {
29772905
for (source = ZSTR_VAL(str); source < source_end; source++) {
2978-
if (tolower(*source) == lc_from) {
2906+
if (zend_tolower_ascii(*source) == lc_from) {
29792907
if (replace_count) {
29802908
*replace_count += 1;
29812909
}
@@ -4202,7 +4130,7 @@ static zend_long php_str_replace_in_subject(
42024130
zend_long old_replace_count = replace_count;
42034131

42044132
if (!lc_subject_str) {
4205-
lc_subject_str = php_string_tolower(subject_str);
4133+
lc_subject_str = zend_string_tolower(subject_str);
42064134
}
42074135
tmp_result = php_str_to_str_i_ex(subject_str, ZSTR_VAL(lc_subject_str),
42084136
search_str, replace_value, replace_len, &replace_count);
@@ -4255,7 +4183,7 @@ static zend_long php_str_replace_in_subject(
42554183
ZSTR_VAL(search_str), ZSTR_LEN(search_str),
42564184
ZSTR_VAL(replace_str), ZSTR_LEN(replace_str), &replace_count));
42574185
} else {
4258-
lc_subject_str = php_string_tolower(subject_str);
4186+
lc_subject_str = zend_string_tolower(subject_str);
42594187
ZVAL_STR(result, php_str_to_str_i_ex(subject_str, ZSTR_VAL(lc_subject_str),
42604188
search_str, ZSTR_VAL(replace_str), ZSTR_LEN(replace_str), &replace_count));
42614189
zend_string_release_ex(lc_subject_str, 0);
@@ -4798,7 +4726,7 @@ int php_tag_find(char *tag, size_t len, const char *set) {
47984726

47994727
n = norm;
48004728
t = tag;
4801-
c = tolower(*t);
4729+
c = zend_tolower_ascii(*t);
48024730
/*
48034731
normalize the tag removing leading and trailing whitespace
48044732
and turn any <a whatever...> into just <a> and any </tag>
@@ -4826,7 +4754,7 @@ int php_tag_find(char *tag, size_t len, const char *set) {
48264754
}
48274755
break;
48284756
}
4829-
c = tolower(*(++t));
4757+
c = zend_tolower_ascii(*(++t));
48304758
}
48314759
*(n++) = '>';
48324760
*n = '\0';

ext/standard/tests/strings/bug79986.phpt

Lines changed: 0 additions & 13 deletions
This file was deleted.

0 commit comments

Comments
 (0)