ext/bcmath: Changed the bcmul calculation method #14213

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 15 commits, May 20, 2024
326 changes: 130 additions & 196 deletions ext/bcmath/libbcmath/src/recmul.c
@@ -36,217 +36,149 @@
#include "private.h" /* For _bc_rm_leading_zeros() */
#include "zend_alloc.h"

/* Recursive vs non-recursive multiply crossover ranges. */
#if defined(MULDIGITS)
#include "muldigits.h"

#if SIZEOF_SIZE_T >= 8
# define BC_MUL_UINT_DIGITS 8
# define BC_MUL_UINT_OVERFLOW 100000000
#else
#define MUL_BASE_DIGITS 80
# define BC_MUL_UINT_DIGITS 4
# define BC_MUL_UINT_OVERFLOW 10000
#endif

int mul_base_digits = MUL_BASE_DIGITS;
#define MUL_SMALL_DIGITS mul_base_digits/4

-/* Multiply utility routines */
-
-static bc_num new_sub_num(size_t length, size_t scale, char *value)
-{
-	bc_num temp = (bc_num) emalloc(sizeof(bc_struct));
-
-	temp->n_sign = PLUS;
-	temp->n_len = length;
-	temp->n_scale = scale;
-	temp->n_refs = 1;
-	temp->n_value = value;
-	return temp;
-}
-
-static void _bc_simp_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_num *prod)
+/*
+ * Converts BCD to uint, going backwards from pointer n by the number of
+ * characters specified by len.
+ */
+static inline BC_UINT_T bc_partial_convert_to_uint(const char *n, size_t len)
 {
-	char *n1ptr, *n2ptr, *pvptr;
-	char *n1end, *n2end;	/* To the end of n1 and n2. */
-	int sum = 0;
+	BC_UINT_T num = 0;
+	BC_UINT_T base = 1;

-	int prodlen = n1len + n2len + 1;
+	for (size_t i = 0; i < len; i++) {
	[Review comment (Member)]
	I once shared a fast way of doing this that I had implemented in a WIP commit: nielsdos@20783c3
	Original source: https://kholdstare.github.io/technical/2020/05/26/faster-integer-parsing.html
	The article claims it is faster than a regular for loop that multiplies by 10.
	(Note: I wouldn't use the SSE SIMD approach from the article, just the 4/8-byte fast non-SIMD approach.)
	However, let's not do that in this PR, because this PR is large enough as-is.

+		num += *n * base;
+		base *= BASE;
+		n--;
+	}

-	*prod = bc_new_num_nonzeroed(prodlen, 0);
+	return num;
 }

-	n1end = (char *) (n1->n_value + n1len - 1);
-	n2end = (char *) (n2->n_value + n2len - 1);
-	pvptr = (char *) ((*prod)->n_value + prodlen - 1);

-	/* Here is the loop... */
-	for (int index = 0; index < prodlen - 1; index++) {
-		n1ptr = (char *) (n1end - MAX(0, index - n2len + 1));
-		n2ptr = (char *) (n2end - MIN(index, n2len - 1));
-		while ((n1ptr >= n1->n_value) && (n2ptr <= n2end)) {
-			sum += *n1ptr * *n2ptr;
-			n1ptr--;
-			n2ptr++;
-		}
-		*pvptr-- = sum % BASE;
-		sum = sum / BASE;
+static inline void bc_convert_to_uint(BC_UINT_T *n_uint, const char *nend, size_t nlen)
+{
+	size_t i = 0;
+	while (nlen > 0) {
+		size_t len = MIN(BC_MUL_UINT_DIGITS, nlen);
+		n_uint[i] = bc_partial_convert_to_uint(nend, len);
+		nend -= len;
+		nlen -= len;
+		i++;
 	}
-	*pvptr = sum;
 }
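	[Note] For reference, the non-SIMD trick from the article linked in the review comment above, adapted to BCD bytes (digit values 0-9 rather than ASCII). This is a minimal sketch, not part of the patch: the name bcd_parse_8 is hypothetical, and it assumes a little-endian target with the most significant digit stored first, as bc_num does.

	#include <stdint.h>
	#include <string.h>

	static inline uint64_t bcd_parse_8(const char *digits)
	{
		uint64_t chunk;
		memcpy(&chunk, digits, sizeof(chunk)); /* byte 0 = most significant digit */

		/* fold pairs of single digits into 2-digit values (16-bit lanes) */
		uint64_t lower = (chunk & 0x0f000f000f000f00ULL) >> 8;
		uint64_t upper = (chunk & 0x000f000f000f000fULL) * 10;
		chunk = lower + upper;

		/* fold pairs of 2-digit values into 4-digit values (32-bit lanes) */
		lower = (chunk & 0x00ff000000ff0000ULL) >> 16;
		upper = (chunk & 0x000000ff000000ffULL) * 100;
		chunk = lower + upper;

		/* fold the two 4-digit halves into the final 8-digit value */
		lower = (chunk & 0x0000ffff00000000ULL) >> 32;
		upper = (chunk & 0x000000000000ffffULL) * 10000;
		return lower + upper;
	}

	bc_partial_convert_to_uint reads its digits backwards from the pointer it is given, so a real integration would have to select the right 8-byte window first; the sketch only shows the three multiply/mask folding rounds.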


-/* A special adder/subtractor for the recursive divide and conquer
-   multiply algorithm.  Note: if sub is called, accum must
-   be larger than what is being subtracted.  Also, accum and val
-   must have n_scale = 0 (i.e. they must look like integers). */
-static void _bc_shift_addsub(bc_num accum, bc_num val, int shift, bool sub)
+/*
+ * If the n_values of n1 and n2 are both 4 (32-bit) or 8 (64-bit) digits or fewer,
+ * the calculation is performed at high speed without using an array.
+ */
+static inline void bc_fast_mul(bc_num n1, size_t n1len, bc_num n2, size_t n2len, bc_num *prod)
 {
-	signed char *accp, *valp;
-	unsigned int carry = 0;
-	size_t count = val->n_len;
+	const char *n1end = n1->n_value + n1len - 1;
+	const char *n2end = n2->n_value + n2len - 1;

-	if (val->n_value[0] == 0) {
-		count--;
-	}
-	assert(accum->n_len + accum->n_scale >= shift + count);

-	/* Set up pointers and others */
-	accp = (signed char *) (accum->n_value + accum->n_len + accum->n_scale - shift - 1);
-	valp = (signed char *) (val->n_value + val->n_len - 1);

-	if (sub) {
-		/* Subtraction, carry is really borrow. */
-		while (count--) {
-			*accp -= *valp-- + carry;
-			if (*accp < 0) {
-				carry = 1;
-				*accp-- += BASE;
-			} else {
-				carry = 0;
-				accp--;
-			}
-		}
-		while (carry) {
-			*accp -= carry;
-			if (*accp < 0) {
-				*accp-- += BASE;
-			} else {
-				carry = 0;
-			}
-		}
-	} else {
-		/* Addition */
-		while (count--) {
-			*accp += *valp-- + carry;
-			if (*accp > (BASE - 1)) {
-				carry = 1;
-				*accp-- -= BASE;
-			} else {
-				carry = 0;
-				accp--;
-			}
-		}
-		while (carry) {
-			*accp += carry;
-			if (*accp > (BASE - 1)) {
-				*accp-- -= BASE;
-			} else {
-				carry = 0;
-			}
-		}
-	}
+	BC_UINT_T n1_uint = bc_partial_convert_to_uint(n1end, n1len);
+	BC_UINT_T n2_uint = bc_partial_convert_to_uint(n2end, n2len);
+	BC_UINT_T prod_uint = n1_uint * n2_uint;

+	size_t prodlen = n1len + n2len;
+	*prod = bc_new_num_nonzeroed(prodlen, 0);
+	char *pptr = (*prod)->n_value;
+	char *pend = pptr + prodlen - 1;

+	while (pend >= pptr) {
	[Review comment (Member)]
	I also had a faster way than this of writing the BCD representation back; see nielsdos@20783c3#diff-5550a7de511402b266a0fc932bd11298d2e02ef720879507b89651add38f380dR177
	But we should keep that as a follow-up, imo.

+		*pend-- = prod_uint % BASE;
+		prod_uint /= BASE;
+	}
 }
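	[Note] Why the fast path cannot overflow, as a compile-time sanity check (illustrative, not in the patch): with BC_MUL_UINT_DIGITS = 8 both operands are at most 99999999, and their product still fits in 64 bits.

	#include <stdint.h>

	_Static_assert(99999999ULL * 99999999ULL <= UINT64_MAX,
		"8-digit fast-path product (9999999800000001) fits in 64 bits");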

-/* Recursive divide and conquer multiply algorithm.
-   Based on
-   Let u = u0 + u1*(B^n)
-   Let v = v0 + v1*(B^n)
-   Then uv = (B^2n + B^n)*u1*v1 + B^n*(u1-u0)*(v0-v1) + (B^n + 1)*u0*v0
-
-   B is the base of storage, number of digits in u1, u0 close to equal.
-*/
-static void _bc_rec_mul(bc_num u, size_t ulen, bc_num v, size_t vlen, bc_num *prod)
+/*
+ * Converts the BCD of bc_num by 4 (32 bits) or 8 (64 bits) digits to an array of BC_UINT_Ts.
+ * The array is generated starting with the smaller digits.
+ * e.g. 12345678901234567890 => {34567890, 56789012, 1234}
+ *
+ * Multiply and add these groups of numbers to perform multiplication fast.
+ * How much to shift the digits when adding values can be calculated from the index of the array.
+ */
+static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, size_t n2len, bc_num *prod)
 {
-	bc_num u0, u1, v0, v1;
-	bc_num m1, m2, m3;
-	size_t n;
-	bool m1zero;
-
-	/* Base case? */
-	if ((ulen + vlen) < mul_base_digits
-		|| ulen < MUL_SMALL_DIGITS
-		|| vlen < MUL_SMALL_DIGITS
-	) {
-		_bc_simp_mul(u, ulen, v, vlen, prod);
-		return;
-	}
+	size_t i;
+	const char *n1end = n1->n_value + n1len - 1;
+	const char *n2end = n2->n_value + n2len - 1;
+	size_t prodlen = n1len + n2len;

+	size_t n1_arr_size = (n1len + BC_MUL_UINT_DIGITS - 1) / BC_MUL_UINT_DIGITS;
+	size_t n2_arr_size = (n2len + BC_MUL_UINT_DIGITS - 1) / BC_MUL_UINT_DIGITS;
+	size_t prod_arr_size = n1_arr_size + n2_arr_size - 1;

+	/*
+	 * Let's say that N is the max of n1len and n2len (and a multiple of BC_MUL_UINT_DIGITS for simplicity);
+	 * then this sum is <= N/BC_MUL_UINT_DIGITS + N/BC_MUL_UINT_DIGITS + N/BC_MUL_UINT_DIGITS + N/BC_MUL_UINT_DIGITS - 1,
+	 * which equals N - 1 if BC_MUL_UINT_DIGITS is 4, and N/2 - 1 if BC_MUL_UINT_DIGITS is 8,
+	 * so the addition below cannot overflow.
+	 */
+	BC_UINT_T *buf = safe_emalloc(n1_arr_size + n2_arr_size + prod_arr_size, sizeof(BC_UINT_T), 0);
	[Review comment (Member)]
	I think this deserves a comment to explain why the addition can't overflow in practice.
	I.e. let's say that N is the max of n1len and n2len (and a multiple of BC_MUL_UINT_DIGITS for simplicity); then this sum is <= N/BC_MUL_UINT_DIGITS + N/BC_MUL_UINT_DIGITS + N/BC_MUL_UINT_DIGITS + N/BC_MUL_UINT_DIGITS - 1, which is equal to N - 1 if BC_MUL_UINT_DIGITS is 4, and N/2 - 1 if BC_MUL_UINT_DIGITS is 8.

	[Reply (Member Author)]
	Added, thx!
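	[Example] To make that bound concrete, a worked instance with illustrative values (assuming the 64-bit constants above; not part of the patch):

	/* n1len = n2len = N = 16 digits, BC_MUL_UINT_DIGITS = 8:
	 *   n1_arr_size   = (16 + 8 - 1) / 8 = 2
	 *   n2_arr_size   = (16 + 8 - 1) / 8 = 2
	 *   prod_arr_size = 2 + 2 - 1        = 3
	 * The safe_emalloc count is 2 + 2 + 3 = 7 = N/2 - 1 elements, which
	 * stays far below SIZE_MAX for any representable operand length. */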


+	BC_UINT_T *n1_uint = buf;
+	BC_UINT_T *n2_uint = buf + n1_arr_size;
+	BC_UINT_T *prod_uint = n2_uint + n2_arr_size;

+	for (i = 0; i < prod_arr_size; i++) {
+		prod_uint[i] = 0;
+	}

-	/* Calculate n -- the u and v split point in digits. */
-	n = (MAX(ulen, vlen) + 1) / 2;
+	/* Convert to uint[] */
+	bc_convert_to_uint(n1_uint, n1end, n1len);
+	bc_convert_to_uint(n2_uint, n2end, n2len);

-	/* Split u and v. */
-	if (ulen < n) {
-		u1 = bc_copy_num(BCG(_zero_));
-		u0 = new_sub_num(ulen, 0, u->n_value);
-	} else {
-		u1 = new_sub_num(ulen - n, 0, u->n_value);
-		u0 = new_sub_num(n, 0, u->n_value + ulen - n);
-	}
-	if (vlen < n) {
-		v1 = bc_copy_num(BCG(_zero_));
-		v0 = new_sub_num(vlen, 0, v->n_value);
-	} else {
-		v1 = new_sub_num(vlen - n, 0, v->n_value);
-		v0 = new_sub_num(n, 0, v->n_value + vlen - n);
+	/* Multiplication and addition */
+	for (i = 0; i < n1_arr_size; i++) {
+		for (size_t j = 0; j < n2_arr_size; j++) {
+			prod_uint[i + j] += n1_uint[i] * n2_uint[j];
 		}
+	}
-	_bc_rm_leading_zeros(u1);
-	_bc_rm_leading_zeros(u0);
-	_bc_rm_leading_zeros(v1);
-	_bc_rm_leading_zeros(v0);
-
-	m1zero = bc_is_zero(u1) || bc_is_zero(v1);
-
-	/* Calculate sub results ... */
-
-	bc_num d1 = bc_sub(u1, u0, 0);
-	bc_num d2 = bc_sub(v0, v1, 0);

-	/* Do recursive multiplies and shifted adds. */
-	if (m1zero) {
-		m1 = bc_copy_num(BCG(_zero_));
-	} else {
-		_bc_rec_mul(u1, u1->n_len, v1, v1->n_len, &m1);
+	/*
+	 * Move a value exceeding 4/8 digits by carrying to the next digit.
+	 * However, the last digit does nothing.
+	 */
+	for (i = 0; i < prod_arr_size - 1; i++) {
+		prod_uint[i + 1] += prod_uint[i] / BC_MUL_UINT_OVERFLOW;
+		prod_uint[i] %= BC_MUL_UINT_OVERFLOW;
 	}

-	if (bc_is_zero(d1) || bc_is_zero(d2)) {
-		m2 = bc_copy_num(BCG(_zero_));
-	} else {
-		_bc_rec_mul(d1, d1->n_len, d2, d2->n_len, &m2);
+	/* Convert to bc_num */
+	*prod = bc_new_num_nonzeroed(prodlen, 0);
+	char *pptr = (*prod)->n_value;
+	char *pend = pptr + prodlen - 1;
+	i = 0;
+	while (i < prod_arr_size - 1) {
+		for (size_t j = 0; j < BC_MUL_UINT_DIGITS; j++) {
	[Review comment (Member)]
	Also an opportunity for the faster BCD write method as a follow-up.

+			*pend-- = prod_uint[i] % BASE;
+			prod_uint[i] /= BASE;
 		}
+		i++;
 	}

-	if (bc_is_zero(u0) || bc_is_zero(v0)) {
-		m3 = bc_copy_num(BCG(_zero_));
-	} else {
-		_bc_rec_mul(u0, u0->n_len, v0, v0->n_len, &m3);
+	/*
+	 * The last digit may carry over.
+	 * Also need to fill it to the end with zeros, so loop until the end of the string.
+	 */
+	while (pend >= pptr) {
+		*pend-- = prod_uint[i] % BASE;
+		prod_uint[i] /= BASE;
 	}

-	/* Initialize product */
-	*prod = bc_new_num(ulen + vlen + 1, 0);

-	if (!m1zero) {
-		_bc_shift_addsub(*prod, m1, 2 * n, false);
-		_bc_shift_addsub(*prod, m1, n, false);
-	}
-	_bc_shift_addsub(*prod, m3, n, false);
-	_bc_shift_addsub(*prod, m3, 0, false);
-	_bc_shift_addsub(*prod, m2, n, d1->n_sign != d2->n_sign);

-	/* Now clean up! */
-	bc_free_num (&u1);
-	bc_free_num (&u0);
-	bc_free_num (&v1);
-	bc_free_num (&m1);
-	bc_free_num (&v0);
-	bc_free_num (&m2);
-	bc_free_num (&m3);
-	bc_free_num (&d1);
-	bc_free_num (&d2);
+	efree(buf);
 }
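	[Example] A standalone illustration of the limb scheme bc_standard_mul uses (not part of the patch; it uses base 10^4, i.e. the 32-bit configuration, to keep the numbers readable):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* 123456 and 654321 split into 4-digit limbs, least significant first */
		uint64_t a[2] = {3456, 12};
		uint64_t b[2] = {4321, 65};
		uint64_t p[3] = {0};

		/* multiply-and-add every pair of limbs; the limb shift is i + j */
		for (int i = 0; i < 2; i++) {
			for (int j = 0; j < 2; j++) {
				p[i + j] += a[i] * b[j];
			}
		}

		/* carry pass: reduce every limb below the 10^4 overflow bound */
		for (int i = 0; i < 2; i++) {
			p[i + 1] += p[i] / 10000;
			p[i] %= 10000;
		}

		/* prints 80779853376, i.e. 123456 * 654321 */
		printf("%llu%04llu%04llu\n", (unsigned long long) p[2],
			(unsigned long long) p[1], (unsigned long long) p[0]);
		return 0;
	}

	The write-back loop in bc_standard_mul is the same idea in reverse: it peels BASE digits off each limb, least significant limb first.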

@@ -255,26 +187,28 @@ static void _bc_rec_mul(bc_num u, size_t ulen, bc_num v, size_t vlen, bc_num *prod)
 /* The multiply routine.  N2 times N1 is put into PROD with the scale of
    the result being MIN(N2 scale+N1 scale, MAX (SCALE, N2 scale, N1 scale)). */

 bc_num bc_multiply(bc_num n1, bc_num n2, size_t scale)
 {
-	bc_num pval;
-	size_t len1, len2;
-	size_t full_scale, prod_scale;
+	bc_num prod;

-	/* Initialize things. */
-	len1 = n1->n_len + n1->n_scale;
-	len2 = n2->n_len + n2->n_scale;
-	full_scale = n1->n_scale + n2->n_scale;
-	prod_scale = MIN(full_scale, MAX(scale, MAX(n1->n_scale, n2->n_scale)));
+	size_t len1 = n1->n_len + n1->n_scale;
+	size_t len2 = n2->n_len + n2->n_scale;
+	size_t full_scale = n1->n_scale + n2->n_scale;
+	size_t prod_scale = MIN(full_scale, MAX(scale, MAX(n1->n_scale, n2->n_scale)));

 	/* Do the multiply */
-	_bc_rec_mul(n1, len1, n2, len2, &pval);
	[Review comment (Member)]
	One thing we're going to lose here now is the divide-and-conquer approach for very large numbers, and I think we should probably keep that. WDYT?

	[Reply (Member Author)]
	Good point. Sorry, I should have explained this in the PR description.
	The existing implementation uses divide-and-conquer when the number of digits is 80 or more.
	This is compared in test case 2, and this PR is clearly faster, so it's not worth keeping the original implementation as-is.
	However, combining this PR's approach with a divide-and-conquer approach has proven, in my local testing, to be nearly twice as effective.
	Therefore, we should use the divide-and-conquer method (commonly known as the Karatsuba algorithm), but probably in a separate PR.
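	[Note] For reference, the divide-and-conquer identity under discussion is the one from the removed comment above, with the classic Karatsuba cost recurrence (three half-size products instead of four):

	uv = (B^{2n} + B^n) u_1 v_1 + B^n (u_1 - u_0)(v_0 - v_1) + (B^n + 1) u_0 v_0
	T(N) = 3T(N/2) + O(N) \implies T(N) = O(N^{\log_2 3}) \approx O(N^{1.585})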

+	if (len1 <= BC_MUL_UINT_DIGITS && len2 <= BC_MUL_UINT_DIGITS) {
+		bc_fast_mul(n1, len1, n2, len2, &prod);
+	} else {
+		bc_standard_mul(n1, len1, n2, len2, &prod);
+	}

 	/* Assign to prod and clean up the number. */
-	pval->n_sign = (n1->n_sign == n2->n_sign ? PLUS : MINUS);
-	pval->n_len = len2 + len1 + 1 - full_scale;
-	pval->n_scale = prod_scale;
-	_bc_rm_leading_zeros(pval);
-	if (bc_is_zero(pval)) {
-		pval->n_sign = PLUS;
+	prod->n_sign = (n1->n_sign == n2->n_sign ? PLUS : MINUS);
+	prod->n_len -= full_scale;
+	prod->n_scale = prod_scale;
+	_bc_rm_leading_zeros(prod);
+	if (bc_is_zero(prod)) {
+		prod->n_sign = PLUS;
 	}
-	return pval;
+	return prod;
 }
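	[Example] A worked instance of the scale rule in bc_multiply (illustrative values):

	/* n1->n_scale = 3, n2->n_scale = 2, caller's scale = 0:
	 *   full_scale = 3 + 2 = 5
	 *   prod_scale = MIN(5, MAX(0, MAX(3, 2))) = 3
	 * so the raw product's 5 fractional digits are cut down to 3 here;
	 * full_scale only wins when it is below every scale involved. */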