@@ -54,6 +54,38 @@ static _locale_t current_locale = NULL;
54
54
55
55
/* Combine two type codes into one value: t1 shifted left by 4 bits, OR'ed with t2. */
#define TYPE_PAIR (t1 ,t2 ) (((t1) << 4) | (t2))
56
56
57
#ifdef __SSE2__
#define HAVE_BLOCKCONV

/*
 * Common code for SSE2 accelerated character case conversion.
 *
 * The macros below communicate through a set of blconv_* local variables.
 * BLOCKCONV_INIT_RANGE() and BLOCKCONV_INIT_DELTA() expand to declarations
 * (terminate them with ';' at the call site) and must be placed in a scope
 * that encloses every BLOCKCONV_LOAD() / BLOCKCONV_FOUND() /
 * BLOCKCONV_STORE() that relies on them.
 */

/*
 * Declare the constants and scratch registers used to test bytes against the
 * inclusive range [start, end].
 *
 * The comparisons performed by BLOCKCONV_LOAD() are signed 8-bit compares, so
 * (start - 1) and (end + 1) must both be representable as a signed char;
 * bytes with the high bit set are seen as negative values.
 */
#define BLOCKCONV_INIT_RANGE(start, end) \
	const __m128i blconv_start_minus_1 = _mm_set1_epi8((start) - 1); \
	const __m128i blconv_end_plus_1 = _mm_set1_epi8((end) + 1); \
	__m128i blconv_operand, blconv_gt, blconv_lt, blconv_mingle

/* Number of bytes processed by one LOAD/STORE pair. */
#define BLOCKCONV_STRIDE (sizeof(__m128i))

/*
 * Declare the per-byte offset that BLOCKCONV_STORE() adds to every byte found
 * inside the range, plus the scratch registers it needs.
 */
#define BLOCKCONV_INIT_DELTA(delta) \
	const __m128i blconv_delta = _mm_set1_epi8(delta); \
	__m128i blconv_add, blconv_result

/*
 * Read BLOCKCONV_STRIDE bytes from input (no alignment required) and compute
 * blconv_mingle: 0xFF in every byte lane whose value lies inside the range,
 * 0x00 elsewhere. Wrapped in do/while(0) so it behaves as a single statement.
 */
#define BLOCKCONV_LOAD(input) do { \
		blconv_operand = _mm_loadu_si128((const __m128i *)(input)); \
		blconv_gt = _mm_cmpgt_epi8(blconv_operand, blconv_start_minus_1); \
		blconv_lt = _mm_cmplt_epi8(blconv_operand, blconv_end_plus_1); \
		blconv_mingle = _mm_and_si128(blconv_gt, blconv_lt); \
	} while (0)

/* Non-zero when at least one byte of the last loaded block is inside the range. */
#define BLOCKCONV_FOUND() _mm_movemask_epi8(blconv_mingle)

/*
 * Add the delta to the in-range bytes of the last loaded block, leave all
 * other bytes untouched, and write the BLOCKCONV_STRIDE result bytes to dest
 * (no alignment required). Wrapped in do/while(0) so it behaves as a single
 * statement.
 */
#define BLOCKCONV_STORE(dest) do { \
		blconv_add = _mm_and_si128(blconv_mingle, blconv_delta); \
		blconv_result = _mm_add_epi8(blconv_operand, blconv_add); \
		_mm_storeu_si128((__m128i *)(dest), blconv_result); \
	} while (0)

#endif /* __SSE2__ */
87
+
88
+
57
89
const unsigned char zend_tolower_map [256 ] = {
58
90
0x00 ,0x01 ,0x02 ,0x03 ,0x04 ,0x05 ,0x06 ,0x07 ,0x08 ,0x09 ,0x0a ,0x0b ,0x0c ,0x0d ,0x0e ,0x0f ,
59
91
0x10 ,0x11 ,0x12 ,0x13 ,0x14 ,0x15 ,0x16 ,0x17 ,0x18 ,0x19 ,0x1a ,0x1b ,0x1c ,0x1d ,0x1e ,0x1f ,
@@ -2685,22 +2717,16 @@ static zend_always_inline void zend_str_tolower_impl(char *dest, const char *str
2685
2717
unsigned char * p = (unsigned char * )str ;
2686
2718
unsigned char * q = (unsigned char * )dest ;
2687
2719
unsigned char * end = p + length ;
2688
- #ifdef __SSE2__
2689
- if (length >= 16 ) {
2690
- const __m128i _A = _mm_set1_epi8 ('A' - 1 );
2691
- const __m128i Z_ = _mm_set1_epi8 ('Z' + 1 );
2692
- const __m128i delta = _mm_set1_epi8 ('a' - 'A' );
2720
+ #ifdef HAVE_BLOCKCONV
2721
+ if (length >= BLOCKCONV_STRIDE ) {
2722
+ BLOCKCONV_INIT_RANGE ('A' , 'Z' );
2723
+ BLOCKCONV_INIT_DELTA ('a' - 'A' );
2693
2724
do {
2694
- __m128i op = _mm_loadu_si128 ((__m128i * )p );
2695
- __m128i gt = _mm_cmpgt_epi8 (op , _A );
2696
- __m128i lt = _mm_cmplt_epi8 (op , Z_ );
2697
- __m128i mingle = _mm_and_si128 (gt , lt );
2698
- __m128i add = _mm_and_si128 (mingle , delta );
2699
- __m128i lower = _mm_add_epi8 (op , add );
2700
- _mm_storeu_si128 ((__m128i * )q , lower );
2701
- p += 16 ;
2702
- q += 16 ;
2703
- } while (p + 16 <= end );
2725
+ BLOCKCONV_LOAD (p );
2726
+ BLOCKCONV_STORE (q );
2727
+ p += BLOCKCONV_STRIDE ;
2728
+ q += BLOCKCONV_STRIDE ;
2729
+ } while (p + BLOCKCONV_STRIDE <= end );
2704
2730
}
2705
2731
#endif
2706
2732
while (p < end ) {
@@ -2713,22 +2739,16 @@ static zend_always_inline void zend_str_toupper_impl(char *dest, const char *str
2713
2739
unsigned char * p = (unsigned char * )str ;
2714
2740
unsigned char * q = (unsigned char * )dest ;
2715
2741
unsigned char * end = p + length ;
2716
- #ifdef __SSE2__
2717
- if (length >= 16 ) {
2718
- const __m128i _a = _mm_set1_epi8 ('a' - 1 );
2719
- const __m128i z_ = _mm_set1_epi8 ('z' + 1 );
2720
- const __m128i delta = _mm_set1_epi8 ('a' - 'A' );
2742
+ #ifdef HAVE_BLOCKCONV
2743
+ if (length >= BLOCKCONV_STRIDE ) {
2744
+ BLOCKCONV_INIT_RANGE ('a' , 'z' );
2745
+ BLOCKCONV_INIT_DELTA ('A' - 'a' );
2721
2746
do {
2722
- __m128i op = _mm_loadu_si128 ((__m128i * )p );
2723
- __m128i gt = _mm_cmpgt_epi8 (op , _a );
2724
- __m128i lt = _mm_cmplt_epi8 (op , z_ );
2725
- __m128i mingle = _mm_and_si128 (gt , lt );
2726
- __m128i sub = _mm_and_si128 (mingle , delta );
2727
- __m128i upper = _mm_sub_epi8 (op , sub );
2728
- _mm_storeu_si128 ((__m128i * )q , upper );
2729
- p += 16 ;
2730
- q += 16 ;
2731
- } while (p + 16 <= end );
2747
+ BLOCKCONV_LOAD (p );
2748
+ BLOCKCONV_STORE (q );
2749
+ p += BLOCKCONV_STRIDE ;
2750
+ q += BLOCKCONV_STRIDE ;
2751
+ } while (p + BLOCKCONV_STRIDE <= end );
2732
2752
}
2733
2753
#endif
2734
2754
while (p < end ) {
@@ -2832,32 +2852,27 @@ ZEND_API zend_string* ZEND_FASTCALL zend_string_tolower_ex(zend_string *str, boo
2832
2852
unsigned char * p = (unsigned char * ) ZSTR_VAL (str );
2833
2853
unsigned char * end = p + length ;
2834
2854
2835
- #ifdef __SSE2__
2836
- while (p + 16 <= end ) {
2837
- const __m128i _A = _mm_set1_epi8 ('A' - 1 );
2838
- const __m128i Z_ = _mm_set1_epi8 ('Z' + 1 );
2839
- __m128i op = _mm_loadu_si128 ((__m128i * )p );
2840
- __m128i gt = _mm_cmpgt_epi8 (op , _A );
2841
- __m128i lt = _mm_cmplt_epi8 (op , Z_ );
2842
- __m128i mingle = _mm_and_si128 (gt , lt );
2843
- if (_mm_movemask_epi8 (mingle )) {
2855
+ #ifdef HAVE_BLOCKCONV
2856
+ BLOCKCONV_INIT_RANGE ('A' , 'Z' );
2857
+ while (p + BLOCKCONV_STRIDE <= end ) {
2858
+ BLOCKCONV_LOAD (p );
2859
+ if (BLOCKCONV_FOUND ()) {
2844
2860
zend_string * res = zend_string_alloc (length , persistent );
2845
2861
memcpy (ZSTR_VAL (res ), ZSTR_VAL (str ), p - (unsigned char * ) ZSTR_VAL (str ));
2846
2862
unsigned char * q = p + (ZSTR_VAL (res ) - ZSTR_VAL (str ));
2847
2863
2848
2864
/* Lowercase the chunk we already compared. */
2849
- const __m128i delta = _mm_set1_epi8 ('a' - 'A' );
2850
- __m128i add = _mm_and_si128 (mingle , delta );
2851
- __m128i lower = _mm_add_epi8 (op , add );
2852
- _mm_storeu_si128 ((__m128i * ) q , lower );
2865
+ BLOCKCONV_INIT_DELTA ('a' - 'A' );
2866
+ BLOCKCONV_STORE (q );
2853
2867
2854
2868
/* Lowercase the rest of the string. */
2855
- p += 16 ; q += 16 ;
2869
+ p += BLOCKCONV_STRIDE ;
2870
+ q += BLOCKCONV_STRIDE ;
2856
2871
zend_str_tolower_impl ((char * ) q , (const char * ) p , end - p );
2857
2872
ZSTR_VAL (res )[length ] = '\0' ;
2858
2873
return res ;
2859
2874
}
2860
- p += 16 ;
2875
+ p += BLOCKCONV_STRIDE ;
2861
2876
}
2862
2877
#endif
2863
2878
@@ -2886,32 +2901,27 @@ ZEND_API zend_string* ZEND_FASTCALL zend_string_toupper_ex(zend_string *str, boo
2886
2901
unsigned char * p = (unsigned char * ) ZSTR_VAL (str );
2887
2902
unsigned char * end = p + length ;
2888
2903
2889
- #ifdef __SSE2__
2890
- while (p + 16 <= end ) {
2891
- const __m128i _a = _mm_set1_epi8 ('a' - 1 );
2892
- const __m128i z_ = _mm_set1_epi8 ('z' + 1 );
2893
- __m128i op = _mm_loadu_si128 ((__m128i * )p );
2894
- __m128i gt = _mm_cmpgt_epi8 (op , _a );
2895
- __m128i lt = _mm_cmplt_epi8 (op , z_ );
2896
- __m128i mingle = _mm_and_si128 (gt , lt );
2897
- if (_mm_movemask_epi8 (mingle )) {
2904
+ #ifdef HAVE_BLOCKCONV
2905
+ BLOCKCONV_INIT_RANGE ('a' , 'z' );
2906
+ while (p + BLOCKCONV_STRIDE <= end ) {
2907
+ BLOCKCONV_LOAD (p );
2908
+ if (BLOCKCONV_FOUND ()) {
2898
2909
zend_string * res = zend_string_alloc (length , persistent );
2899
2910
memcpy (ZSTR_VAL (res ), ZSTR_VAL (str ), p - (unsigned char * ) ZSTR_VAL (str ));
2900
2911
unsigned char * q = p + (ZSTR_VAL (res ) - ZSTR_VAL (str ));
2901
2912
2902
2913
/* Uppercase the chunk we already compared. */
2903
- const __m128i delta = _mm_set1_epi8 ('a' - 'A' );
2904
- __m128i add = _mm_and_si128 (mingle , delta );
2905
- __m128i upper = _mm_sub_epi8 (op , add );
2906
- _mm_storeu_si128 ((__m128i * ) q , upper );
2914
+ BLOCKCONV_INIT_DELTA ('A' - 'a' );
2915
+ BLOCKCONV_STORE (q );
2907
2916
2908
2917
/* Uppercase the rest of the string. */
2909
- p += 16 ; q += 16 ;
2918
+ p += BLOCKCONV_STRIDE ;
2919
+ q += BLOCKCONV_STRIDE ;
2910
2920
zend_str_toupper_impl ((char * ) q , (const char * ) p , end - p );
2911
2921
ZSTR_VAL (res )[length ] = '\0' ;
2912
2922
return res ;
2913
2923
}
2914
- p += 16 ;
2924
+ p += BLOCKCONV_STRIDE ;
2915
2925
}
2916
2926
#endif
2917
2927
0 commit comments