Skip to content

Commit b1883a1

Browse files
committed
Factor out SSE2 accelerated case conversion
Factor out the ugly bits of SSE2 case conversion so that the four functions that use it look neat and easy to read. Abstract the SSE2 dependency and block size so that people can implement this for other instruction sets if desired.
1 parent fdc8ff3 commit b1883a1

File tree

1 file changed

+70
-60
lines changed

1 file changed

+70
-60
lines changed

Zend/zend_operators.c

Lines changed: 70 additions & 60 deletions
Original file line number | Diff line number | Diff line change
@@ -54,6 +54,38 @@ static _locale_t current_locale = NULL;
5454

5555
#define TYPE_PAIR(t1,t2) (((t1) << 4) | (t2))
5656

57+
#if __SSE2__
58+
/* Feature flag: defined when the BLOCKCONV_* block-conversion macros below are available. */
#define HAVE_BLOCKCONV
59+
60+
/*
 * Common code for SSE2 accelerated character case conversion.
 *
 * The macros below share state through local variables named blconv_*,
 * which BLOCKCONV_INIT_RANGE and BLOCKCONV_INIT_DELTA declare in the
 * caller's scope. Typical sequence, as used by the case conversion
 * functions in this file:
 *   BLOCKCONV_INIT_RANGE(first, last);   declare range bounds + scratch
 *   BLOCKCONV_INIT_DELTA(delta);         declare delta + scratch
 *   BLOCKCONV_LOAD(src);                 load one block, build match mask
 *   BLOCKCONV_FOUND()                    optional: any byte in range?
 *   BLOCKCONV_STORE(dst);                write the converted block
 *
 * NOTE(review): the INIT macros expand to declarations and LOAD/STORE expand
 * to several statements without a do { } while (0) wrapper, so each must be
 * used as a standalone statement inside a braced block (never as the sole
 * body of an unbraced if/else/loop).
 */
61+
62+
/*
 * Declares the constants for the inclusive byte range [start, end]:
 * (start) - 1 and (end) + 1 are each broadcast to every byte lane so that
 * strict greater-than / less-than comparisons implement the inclusive test.
 * Also declares the scratch variables that BLOCKCONV_LOAD writes.
 */
#define BLOCKCONV_INIT_RANGE(start, end) \
63+
const __m128i blconv_start_minus_1 = _mm_set1_epi8((start) - 1); \
64+
const __m128i blconv_end_plus_1 = _mm_set1_epi8((end) + 1); \
65+
__m128i blconv_operand, blconv_gt, blconv_lt, blconv_mingle;
66+
67+
/* Number of bytes handled per BLOCKCONV_LOAD / BLOCKCONV_STORE (one __m128i). */
#define BLOCKCONV_STRIDE sizeof(__m128i)
68+
69+
/*
 * Declares the per-byte value (broadcast to every lane) that BLOCKCONV_STORE
 * adds to in-range bytes, plus the scratch variables BLOCKCONV_STORE writes.
 */
#define BLOCKCONV_INIT_DELTA(delta) \
70+
const __m128i blconv_delta = _mm_set1_epi8(delta); \
71+
__m128i blconv_add, blconv_result;
72+
73+
/*
 * Loads BLOCKCONV_STRIDE bytes from input (unaligned load) into
 * blconv_operand and computes blconv_mingle: a per-byte mask that is set
 * where the byte is greater than start - 1 AND less than end + 1.
 * The epi8 comparisons treat lanes as signed 8-bit values, so bytes
 * 0x80-0xFF compare as negative and do not match the ASCII letter ranges
 * passed by the callers in this file.
 * Requires BLOCKCONV_INIT_RANGE earlier in the same scope.
 */
#define BLOCKCONV_LOAD(input) \
74+
blconv_operand = _mm_loadu_si128((__m128i*)(input)); \
75+
blconv_gt = _mm_cmpgt_epi8(blconv_operand, blconv_start_minus_1); \
76+
blconv_lt = _mm_cmplt_epi8(blconv_operand, blconv_end_plus_1); \
77+
blconv_mingle = _mm_and_si128(blconv_gt, blconv_lt);
78+
79+
/*
 * Evaluates to non-zero when at least one byte of the block most recently
 * loaded by BLOCKCONV_LOAD fell inside the configured range.
 */
#define BLOCKCONV_FOUND() _mm_movemask_epi8(blconv_mingle)
80+
81+
/*
 * Adds the delta to the in-range bytes only (the mask zeroes the delta for
 * every other lane) and writes the resulting block to dest with an
 * unaligned store. Operates on the block from the preceding BLOCKCONV_LOAD;
 * requires BLOCKCONV_INIT_DELTA earlier in the same scope.
 */
#define BLOCKCONV_STORE(dest) \
82+
blconv_add = _mm_and_si128(blconv_mingle, blconv_delta); \
83+
blconv_result = _mm_add_epi8(blconv_operand, blconv_add); \
84+
_mm_storeu_si128((__m128i *)(dest), blconv_result);
85+
86+
#endif /* __SSE2__ */
87+
88+
5789
const unsigned char zend_tolower_map[256] = {
5890
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
5991
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,
@@ -2685,22 +2717,16 @@ static zend_always_inline void zend_str_tolower_impl(char *dest, const char *str
26852717
unsigned char *p = (unsigned char*)str;
26862718
unsigned char *q = (unsigned char*)dest;
26872719
unsigned char *end = p + length;
2688-
#ifdef __SSE2__
2689-
if (length >= 16) {
2690-
const __m128i _A = _mm_set1_epi8('A' - 1);
2691-
const __m128i Z_ = _mm_set1_epi8('Z' + 1);
2692-
const __m128i delta = _mm_set1_epi8('a' - 'A');
2720+
#ifdef HAVE_BLOCKCONV
2721+
if (length >= BLOCKCONV_STRIDE) {
2722+
BLOCKCONV_INIT_RANGE('A', 'Z');
2723+
BLOCKCONV_INIT_DELTA('a' - 'A');
26932724
do {
2694-
__m128i op = _mm_loadu_si128((__m128i*)p);
2695-
__m128i gt = _mm_cmpgt_epi8(op, _A);
2696-
__m128i lt = _mm_cmplt_epi8(op, Z_);
2697-
__m128i mingle = _mm_and_si128(gt, lt);
2698-
__m128i add = _mm_and_si128(mingle, delta);
2699-
__m128i lower = _mm_add_epi8(op, add);
2700-
_mm_storeu_si128((__m128i *)q, lower);
2701-
p += 16;
2702-
q += 16;
2703-
} while (p + 16 <= end);
2725+
BLOCKCONV_LOAD(p);
2726+
BLOCKCONV_STORE(q);
2727+
p += BLOCKCONV_STRIDE;
2728+
q += BLOCKCONV_STRIDE;
2729+
} while (p + BLOCKCONV_STRIDE <= end);
27042730
}
27052731
#endif
27062732
while (p < end) {
@@ -2713,22 +2739,16 @@ static zend_always_inline void zend_str_toupper_impl(char *dest, const char *str
27132739
unsigned char *p = (unsigned char*)str;
27142740
unsigned char *q = (unsigned char*)dest;
27152741
unsigned char *end = p + length;
2716-
#ifdef __SSE2__
2717-
if (length >= 16) {
2718-
const __m128i _a = _mm_set1_epi8('a' - 1);
2719-
const __m128i z_ = _mm_set1_epi8('z' + 1);
2720-
const __m128i delta = _mm_set1_epi8('a' - 'A');
2742+
#ifdef HAVE_BLOCKCONV
2743+
if (length >= BLOCKCONV_STRIDE) {
2744+
BLOCKCONV_INIT_RANGE('a', 'z');
2745+
BLOCKCONV_INIT_DELTA('A' - 'a');
27212746
do {
2722-
__m128i op = _mm_loadu_si128((__m128i*)p);
2723-
__m128i gt = _mm_cmpgt_epi8(op, _a);
2724-
__m128i lt = _mm_cmplt_epi8(op, z_);
2725-
__m128i mingle = _mm_and_si128(gt, lt);
2726-
__m128i sub = _mm_and_si128(mingle, delta);
2727-
__m128i upper = _mm_sub_epi8(op, sub);
2728-
_mm_storeu_si128((__m128i *)q, upper);
2729-
p += 16;
2730-
q += 16;
2731-
} while (p + 16 <= end);
2747+
BLOCKCONV_LOAD(p);
2748+
BLOCKCONV_STORE(q);
2749+
p += BLOCKCONV_STRIDE;
2750+
q += BLOCKCONV_STRIDE;
2751+
} while (p + BLOCKCONV_STRIDE <= end);
27322752
}
27332753
#endif
27342754
while (p < end) {
@@ -2832,32 +2852,27 @@ ZEND_API zend_string* ZEND_FASTCALL zend_string_tolower_ex(zend_string *str, boo
28322852
unsigned char *p = (unsigned char *) ZSTR_VAL(str);
28332853
unsigned char *end = p + length;
28342854

2835-
#ifdef __SSE2__
2836-
while (p + 16 <= end) {
2837-
const __m128i _A = _mm_set1_epi8('A' - 1);
2838-
const __m128i Z_ = _mm_set1_epi8('Z' + 1);
2839-
__m128i op = _mm_loadu_si128((__m128i*)p);
2840-
__m128i gt = _mm_cmpgt_epi8(op, _A);
2841-
__m128i lt = _mm_cmplt_epi8(op, Z_);
2842-
__m128i mingle = _mm_and_si128(gt, lt);
2843-
if (_mm_movemask_epi8(mingle)) {
2855+
#ifdef HAVE_BLOCKCONV
2856+
BLOCKCONV_INIT_RANGE('A', 'Z');
2857+
while (p + BLOCKCONV_STRIDE <= end) {
2858+
BLOCKCONV_LOAD(p);
2859+
if (BLOCKCONV_FOUND()) {
28442860
zend_string *res = zend_string_alloc(length, persistent);
28452861
memcpy(ZSTR_VAL(res), ZSTR_VAL(str), p - (unsigned char *) ZSTR_VAL(str));
28462862
unsigned char *q = p + (ZSTR_VAL(res) - ZSTR_VAL(str));
28472863

28482864
/* Lowercase the chunk we already compared. */
2849-
const __m128i delta = _mm_set1_epi8('a' - 'A');
2850-
__m128i add = _mm_and_si128(mingle, delta);
2851-
__m128i lower = _mm_add_epi8(op, add);
2852-
_mm_storeu_si128((__m128i *) q, lower);
2865+
BLOCKCONV_INIT_DELTA('a' - 'A');
2866+
BLOCKCONV_STORE(q);
28532867

28542868
/* Lowercase the rest of the string. */
2855-
p += 16; q += 16;
2869+
p += BLOCKCONV_STRIDE;
2870+
q += BLOCKCONV_STRIDE;
28562871
zend_str_tolower_impl((char *) q, (const char *) p, end - p);
28572872
ZSTR_VAL(res)[length] = '\0';
28582873
return res;
28592874
}
2860-
p += 16;
2875+
p += BLOCKCONV_STRIDE;
28612876
}
28622877
#endif
28632878

@@ -2886,32 +2901,27 @@ ZEND_API zend_string* ZEND_FASTCALL zend_string_toupper_ex(zend_string *str, boo
28862901
unsigned char *p = (unsigned char *) ZSTR_VAL(str);
28872902
unsigned char *end = p + length;
28882903

2889-
#ifdef __SSE2__
2890-
while (p + 16 <= end) {
2891-
const __m128i _a = _mm_set1_epi8('a' - 1);
2892-
const __m128i z_ = _mm_set1_epi8('z' + 1);
2893-
__m128i op = _mm_loadu_si128((__m128i*)p);
2894-
__m128i gt = _mm_cmpgt_epi8(op, _a);
2895-
__m128i lt = _mm_cmplt_epi8(op, z_);
2896-
__m128i mingle = _mm_and_si128(gt, lt);
2897-
if (_mm_movemask_epi8(mingle)) {
2904+
#ifdef HAVE_BLOCKCONV
2905+
BLOCKCONV_INIT_RANGE('a', 'z');
2906+
while (p + BLOCKCONV_STRIDE <= end) {
2907+
BLOCKCONV_LOAD(p);
2908+
if (BLOCKCONV_FOUND()) {
28982909
zend_string *res = zend_string_alloc(length, persistent);
28992910
memcpy(ZSTR_VAL(res), ZSTR_VAL(str), p - (unsigned char *) ZSTR_VAL(str));
29002911
unsigned char *q = p + (ZSTR_VAL(res) - ZSTR_VAL(str));
29012912

29022913
/* Uppercase the chunk we already compared. */
2903-
const __m128i delta = _mm_set1_epi8('a' - 'A');
2904-
__m128i add = _mm_and_si128(mingle, delta);
2905-
__m128i upper = _mm_sub_epi8(op, add);
2906-
_mm_storeu_si128((__m128i *) q, upper);
2914+
BLOCKCONV_INIT_DELTA('A' - 'a');
2915+
BLOCKCONV_STORE(q);
29072916

29082917
/* Uppercase the rest of the string. */
2909-
p += 16; q += 16;
2918+
p += BLOCKCONV_STRIDE;
2919+
q += BLOCKCONV_STRIDE;
29102920
zend_str_toupper_impl((char *) q, (const char *) p, end - p);
29112921
ZSTR_VAL(res)[length] = '\0';
29122922
return res;
29132923
}
2914-
p += 16;
2924+
p += BLOCKCONV_STRIDE;
29152925
}
29162926
#endif
29172927

0 commit comments

Comments
 (0)