-
Notifications
You must be signed in to change notification settings - Fork 7.9k
ext/bcmath: Use SIMD for trailing zero counts during conversion #14166
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
364a698
570268d
1189f4f
fc7f7cb
87e9d63
323e144
275abd0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -76,6 +76,35 @@ static const char *bc_count_digits(const char *str, const char *end) | |
return str; | ||
} | ||
|
||
/*
 * Walks backwards from `str` for as long as the preceding character is '0' and
 * returns the resulting pointer: either `end`, or a position whose preceding
 * character is not '0'.
 *
 * Mind the parameter naming: `str` is the position the scan starts from (the
 * high address, one past the last character examined) and `end` is the low
 * address at which the backward scan stops.
 *
 * When __SSE2__ is defined, characters are first tested 16 at a time. As soon
 * as a 16-character group is not entirely '0', the pointer is restored to the
 * upper edge of that group and the rest is handled one character at a time by
 * the scalar loop at the bottom.
 *
 * NOTE(review): both loop conditions compute `str - N` before comparing it
 * with `end`. If `end` lies close to the start of its buffer, that intermediate
 * pointer may fall before the buffer, which the C standard leaves undefined;
 * comparing the distance `str - end` against N would avoid this. Confirm
 * against the callers.
 */
static inline const char *bc_skip_zero_reverse(const char *str, const char *end) | ||
{ | ||
/* Bulk path: test 16 characters per iteration when SSE2 is available. */ | ||
#ifdef __SSE2__ | ||
const __m128i c_zero_repeat = _mm_set1_epi8((signed char) '0'); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Casting this to signed char shouldn't be necessary. |
||
while (str - sizeof(__m128i) >= end) { | ||
str -= sizeof(__m128i); | ||
__m128i bytes = _mm_loadu_si128((const __m128i *) str); | ||
/* Checks if all numeric strings are equal to '0'. */ | ||
bytes = _mm_cmpeq_epi8(bytes, c_zero_repeat); | ||
|
||
int mask = _mm_movemask_epi8(bytes); | ||
/* The probability of having 16 trailing 0s in a row is very low, so we use EXPECTED. */ | ||
if (EXPECTED(mask != 0xffff)) { | ||
/* Move the pointer back and check each character in loop. */ | ||
str += sizeof(__m128i); | ||
break; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can also use code like the following, but a while loop has always been faster. This may be because the number of calculations increases by one.
|
||
} | ||
#endif | ||
|
||
/* Exclude trailing zeros. */ | ||
while (str - 1 >= end && str[-1] == '0') { | ||
str--; | ||
} | ||
|
||
return str; | ||
} | ||
|
||
/* Assumes `num` points to NULL, i.e. does not yet hold a number. */ | ||
bool bc_str2num(bc_num *num, const char *str, const char *end, size_t scale, bool auto_scale) | ||
{ | ||
|
@@ -104,32 +133,28 @@ bool bc_str2num(bc_num *num, const char *str, const char *end, size_t scale, boo | |
const char *decimal_point = (*ptr == '.') ? ptr : NULL; | ||
|
||
/* If a non-digit and non-decimal-point indicator is in the string, i.e. an invalid character */ | ||
if (!decimal_point && *ptr != '\0') { | ||
if (UNEXPECTED(!decimal_point && *ptr != '\0')) { | ||
goto fail; | ||
} | ||
|
||
/* search and validate fractional end if exists */ | ||
if (decimal_point) { | ||
/* search */ | ||
fractional_ptr = fractional_end = decimal_point + 1; | ||
if (*fractional_ptr == '\0') { | ||
/* For strings that end with a decimal point, such as "012." */ | ||
if (UNEXPECTED(*fractional_ptr == '\0')) { | ||
goto after_fractional; | ||
} | ||
|
||
/* validate */ | ||
fractional_end = bc_count_digits(fractional_ptr, end); | ||
if (*fractional_end != '\0') { | ||
if (UNEXPECTED(*fractional_end != '\0')) { | ||
/* invalid num */ | ||
goto fail; | ||
} | ||
|
||
/* Exclude trailing zeros. */ | ||
while (fractional_end - 1 > decimal_point && fractional_end[-1] == '0') { | ||
fractional_end--; | ||
} | ||
|
||
/* Move the pointer to the beginning of the fraction. */ | ||
fractional_ptr = decimal_point + 1; | ||
fractional_end = bc_skip_zero_reverse(fractional_end, fractional_ptr); | ||
|
||
/* Calculate the length of the fraction excluding trailing zero. */ | ||
str_scale = fractional_end - fractional_ptr; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The argument names are swapped, which makes it very confusing.