-
Notifications
You must be signed in to change notification settings - Fork 7.9k
ext/bcmath: Changed the bcmul calculation method #14213
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
91b9343
ec9f8df
ac9342d
899bb39
ea57a9f
b23e0a3
ec51d76
7e82dc1
25b33bd
888ded6
aa1a677
dd401bd
c7c977a
dab7e15
48b96df
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,217 +36,149 @@ | |
#include "private.h" /* For _bc_rm_leading_zeros() */ | ||
#include "zend_alloc.h" | ||
|
||
/* Recursive vs non-recursive multiply crossover ranges. */ | ||
#if defined(MULDIGITS) | ||
#include "muldigits.h" | ||
|
||
#if SIZEOF_SIZE_T >= 8 | ||
# define BC_MUL_UINT_DIGITS 8 | ||
# define BC_MUL_UINT_OVERFLOW 100000000 | ||
#else | ||
#define MUL_BASE_DIGITS 80 | ||
# define BC_MUL_UINT_DIGITS 4 | ||
# define BC_MUL_UINT_OVERFLOW 10000 | ||
#endif | ||
|
||
int mul_base_digits = MUL_BASE_DIGITS; | ||
#define MUL_SMALL_DIGITS mul_base_digits/4 | ||
|
||
/* Multiply utility routines */ | ||
|
||
static bc_num new_sub_num(size_t length, size_t scale, char *value) | ||
{ | ||
bc_num temp = (bc_num) emalloc(sizeof(bc_struct)); | ||
|
||
temp->n_sign = PLUS; | ||
temp->n_len = length; | ||
temp->n_scale = scale; | ||
temp->n_refs = 1; | ||
temp->n_value = value; | ||
return temp; | ||
} | ||
|
||
static void _bc_simp_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_num *prod) | ||
/* | ||
* Converts BCD to uint, going backwards from pointer n by the number of | ||
* characters specified by len. | ||
*/ | ||
static inline BC_UINT_T bc_partial_convert_to_uint(const char *n, size_t len) | ||
{ | ||
char *n1ptr, *n2ptr, *pvptr; | ||
char *n1end, *n2end; /* To the end of n1 and n2. */ | ||
int sum = 0; | ||
BC_UINT_T num = 0; | ||
BC_UINT_T base = 1; | ||
|
||
int prodlen = n1len + n2len + 1; | ||
for (size_t i = 0; i < len; i++) { | ||
num += *n * base; | ||
base *= BASE; | ||
n--; | ||
} | ||
|
||
*prod = bc_new_num_nonzeroed(prodlen, 0); | ||
return num; | ||
} | ||
|
||
n1end = (char *) (n1->n_value + n1len - 1); | ||
n2end = (char *) (n2->n_value + n2len - 1); | ||
pvptr = (char *) ((*prod)->n_value + prodlen - 1); | ||
|
||
/* Here is the loop... */ | ||
for (int index = 0; index < prodlen - 1; index++) { | ||
n1ptr = (char *) (n1end - MAX(0, index - n2len + 1)); | ||
n2ptr = (char *) (n2end - MIN(index, n2len - 1)); | ||
while ((n1ptr >= n1->n_value) && (n2ptr <= n2end)) { | ||
sum += *n1ptr * *n2ptr; | ||
n1ptr--; | ||
n2ptr++; | ||
} | ||
*pvptr-- = sum % BASE; | ||
sum = sum / BASE; | ||
static inline void bc_convert_to_uint(BC_UINT_T *n_uint, const char *nend, size_t nlen) | ||
{ | ||
size_t i = 0; | ||
while (nlen > 0) { | ||
size_t len = MIN(BC_MUL_UINT_DIGITS, nlen); | ||
n_uint[i] = bc_partial_convert_to_uint(nend, len); | ||
nend -= len; | ||
nlen -= len; | ||
i++; | ||
} | ||
*pvptr = sum; | ||
} | ||
|
||
|
||
/* A special adder/subtractor for the recursive divide and conquer | ||
multiply algorithm. Note: if sub is called, accum must | ||
be larger that what is being subtracted. Also, accum and val | ||
must have n_scale = 0. (e.g. they must look like integers. *) */ | ||
static void _bc_shift_addsub(bc_num accum, bc_num val, int shift, bool sub) | ||
/* | ||
* If the n_values of n1 and n2 are both 4 (32-bit) or 8 (64-bit) digits or less, | ||
* the calculation will be performed at high speed without using an array. | ||
*/ | ||
static inline void bc_fast_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_num *prod) | ||
{ | ||
signed char *accp, *valp; | ||
unsigned int carry = 0; | ||
size_t count = val->n_len; | ||
const char *n1end = n1->n_value + n1len - 1; | ||
const char *n2end = n2->n_value + n2len - 1; | ||
|
||
if (val->n_value[0] == 0) { | ||
count--; | ||
} | ||
assert(accum->n_len + accum->n_scale >= shift + count); | ||
|
||
/* Set up pointers and others */ | ||
accp = (signed char *) (accum->n_value + accum->n_len + accum->n_scale - shift - 1); | ||
valp = (signed char *) (val->n_value + val->n_len - 1); | ||
|
||
if (sub) { | ||
/* Subtraction, carry is really borrow. */ | ||
while (count--) { | ||
*accp -= *valp-- + carry; | ||
if (*accp < 0) { | ||
carry = 1; | ||
*accp-- += BASE; | ||
} else { | ||
carry = 0; | ||
accp--; | ||
} | ||
} | ||
while (carry) { | ||
*accp -= carry; | ||
if (*accp < 0) { | ||
*accp-- += BASE; | ||
} else { | ||
carry = 0; | ||
} | ||
} | ||
} else { | ||
/* Addition */ | ||
while (count--) { | ||
*accp += *valp-- + carry; | ||
if (*accp > (BASE - 1)) { | ||
carry = 1; | ||
*accp-- -= BASE; | ||
} else { | ||
carry = 0; | ||
accp--; | ||
} | ||
} | ||
while (carry) { | ||
*accp += carry; | ||
if (*accp > (BASE - 1)) { | ||
*accp-- -= BASE; | ||
} else { | ||
carry = 0; | ||
} | ||
} | ||
BC_UINT_T n1_uint = bc_partial_convert_to_uint(n1end, n1len); | ||
BC_UINT_T n2_uint = bc_partial_convert_to_uint(n2end, n2len); | ||
BC_UINT_T prod_uint = n1_uint * n2_uint; | ||
|
||
size_t prodlen = n1len + n2len; | ||
*prod = bc_new_num_nonzeroed(prodlen, 0); | ||
char *pptr = (*prod)->n_value; | ||
char *pend = pptr + prodlen - 1; | ||
|
||
while (pend >= pptr) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I also had a faster way than this of writing the BCD representation back. See nielsdos@20783c3#diff-5550a7de511402b266a0fc932bd11298d2e02ef720879507b89651add38f380dR177 |
||
*pend-- = prod_uint % BASE; | ||
prod_uint /= BASE; | ||
} | ||
} | ||
|
||
/* Recursive divide and conquer multiply algorithm. | ||
Based on | ||
Let u = u0 + u1*(b^n) | ||
Let v = v0 + v1*(b^n) | ||
Then uv = (B^2n+B^n)*u1*v1 + B^n*(u1-u0)*(v0-v1) + (B^n+1)*u0*v0 | ||
|
||
B is the base of storage, number of digits in u1,u0 close to equal. | ||
*/ | ||
static void _bc_rec_mul(bc_num u, size_t ulen, bc_num v, size_t vlen, bc_num *prod) | ||
/* | ||
* Converts the BCD of bc_num by 4 (32 bits) or 8 (64 bits) digits to an array of BC_UINT_Ts. | ||
* The array is generated starting with the smaller digits. | ||
* e.g. 12345678901234567890 => {34567890, 56789012, 1234} | ||
* | ||
* Multiply and add these groups of numbers to perform multiplication fast. | ||
* How much to shift the digits when adding values can be calculated from the index of the array. | ||
*/ | ||
static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, size_t n2len, bc_num *prod) | ||
{ | ||
bc_num u0, u1, v0, v1; | ||
bc_num m1, m2, m3; | ||
size_t n; | ||
bool m1zero; | ||
|
||
/* Base case? */ | ||
if ((ulen + vlen) < mul_base_digits | ||
|| ulen < MUL_SMALL_DIGITS | ||
|| vlen < MUL_SMALL_DIGITS | ||
) { | ||
_bc_simp_mul(u, ulen, v, vlen, prod); | ||
return; | ||
size_t i; | ||
const char *n1end = n1->n_value + n1len - 1; | ||
const char *n2end = n2->n_value + n2len - 1; | ||
size_t prodlen = n1len + n2len; | ||
|
||
size_t n1_arr_size = (n1len + BC_MUL_UINT_DIGITS - 1) / BC_MUL_UINT_DIGITS; | ||
size_t n2_arr_size = (n2len + BC_MUL_UINT_DIGITS - 1) / BC_MUL_UINT_DIGITS; | ||
size_t prod_arr_size = n1_arr_size + n2_arr_size - 1; | ||
|
||
/* | ||
* let's say that N is the max of n1len and n2len (and a multiple of BC_MUL_UINT_DIGITS for simplicity), | ||
* then this sum is <= N/BC_MUL_UINT_DIGITS + N/BC_MUL_UINT_DIGITS + N/BC_MUL_UINT_DIGITS + N/BC_MUL_UINT_DIGITS - 1 | ||
* which is equal to N - 1 if BC_MUL_UINT_DIGITS is 4, and N/2 - 1 if BC_MUL_UINT_DIGITS is 8. | ||
*/ | ||
BC_UINT_T *buf = safe_emalloc(n1_arr_size + n2_arr_size + prod_arr_size, sizeof(BC_UINT_T), 0); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this deserves a comment to explain why the addition can't overflow in practice. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added, thx! |
||
|
||
BC_UINT_T *n1_uint = buf; | ||
BC_UINT_T *n2_uint = buf + n1_arr_size; | ||
BC_UINT_T *prod_uint = n2_uint + n2_arr_size; | ||
|
||
for (i = 0; i < prod_arr_size; i++) { | ||
prod_uint[i] = 0; | ||
} | ||
|
||
/* Calculate n -- the u and v split point in digits. */ | ||
n = (MAX(ulen, vlen) + 1) / 2; | ||
/* Convert to uint[] */ | ||
bc_convert_to_uint(n1_uint, n1end, n1len); | ||
bc_convert_to_uint(n2_uint, n2end, n2len); | ||
|
||
/* Split u and v. */ | ||
if (ulen < n) { | ||
u1 = bc_copy_num(BCG(_zero_)); | ||
u0 = new_sub_num(ulen, 0, u->n_value); | ||
} else { | ||
u1 = new_sub_num(ulen - n, 0, u->n_value); | ||
u0 = new_sub_num(n, 0, u->n_value + ulen - n); | ||
} | ||
if (vlen < n) { | ||
v1 = bc_copy_num(BCG(_zero_)); | ||
v0 = new_sub_num(vlen, 0, v->n_value); | ||
} else { | ||
v1 = new_sub_num(vlen - n, 0, v->n_value); | ||
v0 = new_sub_num(n, 0, v->n_value + vlen - n); | ||
/* Multiplication and addition */ | ||
for (i = 0; i < n1_arr_size; i++) { | ||
for (size_t j = 0; j < n2_arr_size; j++) { | ||
prod_uint[i + j] += n1_uint[i] * n2_uint[j]; | ||
} | ||
} | ||
_bc_rm_leading_zeros(u1); | ||
_bc_rm_leading_zeros(u0); | ||
_bc_rm_leading_zeros(v1); | ||
_bc_rm_leading_zeros(v0); | ||
|
||
m1zero = bc_is_zero(u1) || bc_is_zero(v1); | ||
|
||
/* Calculate sub results ... */ | ||
|
||
bc_num d1 = bc_sub(u1, u0, 0); | ||
bc_num d2 = bc_sub(v0, v1, 0); | ||
|
||
|
||
/* Do recursive multiplies and shifted adds. */ | ||
if (m1zero) { | ||
m1 = bc_copy_num(BCG(_zero_)); | ||
} else { | ||
_bc_rec_mul(u1, u1->n_len, v1, v1->n_len, &m1); | ||
/* | ||
* Move a value exceeding 4/8 digits by carrying to the next digit. | ||
* However, the last digit does nothing. | ||
*/ | ||
for (i = 0; i < prod_arr_size - 1; i++) { | ||
prod_uint[i + 1] += prod_uint[i] / BC_MUL_UINT_OVERFLOW; | ||
prod_uint[i] %= BC_MUL_UINT_OVERFLOW; | ||
} | ||
|
||
if (bc_is_zero(d1) || bc_is_zero(d2)) { | ||
m2 = bc_copy_num(BCG(_zero_)); | ||
} else { | ||
_bc_rec_mul(d1, d1->n_len, d2, d2->n_len, &m2); | ||
/* Convert to bc_num */ | ||
*prod = bc_new_num_nonzeroed(prodlen, 0); | ||
char *pptr = (*prod)->n_value; | ||
char *pend = pptr + prodlen - 1; | ||
i = 0; | ||
while (i < prod_arr_size - 1) { | ||
for (size_t j = 0; j < BC_MUL_UINT_DIGITS; j++) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also an opportunity for the faster BCD write method for follow-up. |
||
*pend-- = prod_uint[i] % BASE; | ||
prod_uint[i] /= BASE; | ||
} | ||
i++; | ||
} | ||
|
||
if (bc_is_zero(u0) || bc_is_zero(v0)) { | ||
m3 = bc_copy_num(BCG(_zero_)); | ||
} else { | ||
_bc_rec_mul(u0, u0->n_len, v0, v0->n_len, &m3); | ||
/* | ||
* The last digit may carry over. | ||
* Also need to fill it to the end with zeros, so loop until the end of the string. | ||
*/ | ||
while (pend >= pptr) { | ||
*pend-- = prod_uint[i] % BASE; | ||
prod_uint[i] /= BASE; | ||
} | ||
|
||
/* Initialize product */ | ||
*prod = bc_new_num(ulen + vlen + 1, 0); | ||
|
||
if (!m1zero) { | ||
_bc_shift_addsub(*prod, m1, 2 * n, false); | ||
_bc_shift_addsub(*prod, m1, n, false); | ||
} | ||
_bc_shift_addsub(*prod, m3, n, false); | ||
_bc_shift_addsub(*prod, m3, 0, false); | ||
_bc_shift_addsub(*prod, m2, n, d1->n_sign != d2->n_sign); | ||
|
||
/* Now clean up! */ | ||
bc_free_num (&u1); | ||
bc_free_num (&u0); | ||
bc_free_num (&v1); | ||
bc_free_num (&m1); | ||
bc_free_num (&v0); | ||
bc_free_num (&m2); | ||
bc_free_num (&m3); | ||
bc_free_num (&d1); | ||
bc_free_num (&d2); | ||
efree(buf); | ||
} | ||
|
||
/* The multiply routine. N2 times N1 is put int PROD with the scale of | ||
|
@@ -255,26 +187,28 @@ static void _bc_rec_mul(bc_num u, size_t ulen, bc_num v, size_t vlen, bc_num *pr | |
|
||
bc_num bc_multiply(bc_num n1, bc_num n2, size_t scale) | ||
{ | ||
bc_num pval; | ||
size_t len1, len2; | ||
size_t full_scale, prod_scale; | ||
bc_num prod; | ||
|
||
/* Initialize things. */ | ||
len1 = n1->n_len + n1->n_scale; | ||
len2 = n2->n_len + n2->n_scale; | ||
full_scale = n1->n_scale + n2->n_scale; | ||
prod_scale = MIN(full_scale, MAX(scale, MAX(n1->n_scale, n2->n_scale))); | ||
size_t len1 = n1->n_len + n1->n_scale; | ||
size_t len2 = n2->n_len + n2->n_scale; | ||
size_t full_scale = n1->n_scale + n2->n_scale; | ||
size_t prod_scale = MIN(full_scale, MAX(scale, MAX(n1->n_scale, n2->n_scale))); | ||
|
||
/* Do the multiply */ | ||
_bc_rec_mul(n1, len1, n2, len2, &pval); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One thing we're going to lose here now is the divide-and-conquer approach for very large numbers, and I think we should probably keep that. WDYT? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point. Sorry, I should have explained this in the PR description. The existing implementation uses divide-and-conquer when the number of digits is 80 or more. This is compared in test case 2, and clearly this PR is faster, so it's not worth keeping the original implementation as is. However, combining this PR approach with a divide and conquer approach has shown, in my local testing, to probably be nearly twice as effective. Therefore, we should use the divide and conquer method (commonly called the Karatsuba algorithm in Japanese), but probably separate PRs. |
||
if (len1 <= BC_MUL_UINT_DIGITS && len2 <= BC_MUL_UINT_DIGITS) { | ||
bc_fast_mul(n1, len1, n2, len2, &prod); | ||
} else { | ||
bc_standard_mul(n1, len1, n2, len2, &prod); | ||
} | ||
|
||
/* Assign to prod and clean up the number. */ | ||
pval->n_sign = (n1->n_sign == n2->n_sign ? PLUS : MINUS); | ||
pval->n_len = len2 + len1 + 1 - full_scale; | ||
pval->n_scale = prod_scale; | ||
_bc_rm_leading_zeros(pval); | ||
if (bc_is_zero(pval)) { | ||
pval->n_sign = PLUS; | ||
prod->n_sign = (n1->n_sign == n2->n_sign ? PLUS : MINUS); | ||
prod->n_len -= full_scale; | ||
prod->n_scale = prod_scale; | ||
_bc_rm_leading_zeros(prod); | ||
if (bc_is_zero(prod)) { | ||
prod->n_sign = PLUS; | ||
} | ||
return pval; | ||
return prod; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I once shared a fast way of doing this that I had implemented in a WIP commit: nielsdos@20783c3
Original source: https://kholdstare.github.io/technical/2020/05/26/faster-integer-parsing.html
The article claims it is faster than a regular for loop that multiplies by 10.
(note: I wouldn't use the SSE SIMD approach from the article, just the 4/8 byte fast non-SIMD approach)
However, let's not do that in this PR because this PR is large enough as-is.