Skip to content

Commit 20783c3

Browse files
committed
wip
1 parent 4465e50 commit 20783c3

File tree

1 file changed

+188
-5
lines changed

1 file changed

+188
-5
lines changed

ext/bcmath/libbcmath/src/doaddsub.c

Lines changed: 188 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,156 @@
3333
#include "private.h"
3434
#include <stddef.h>
3535

36+
#define IS_LITTLE_ENDIAN (*(unsigned char *)&(uint16_t){1})
37+
38+
/* This is based on the technique described in https://kholdstare.github.io/technical/2020/05/26/faster-integer-parsing.html.
39+
* This function transforms AABBCCDD into 1000 * AA + 100 * BB + 10 * CC + DD,
40+
* with the caveat that all components must be in the interval [0, 25] to prevent overflow
41+
* due to the multiplication by power of 10 (10 * 25 = 250 is the largest number that fits in a byte).
42+
* The advantage of this method instead of using shifts + 3 multiplications is that this is cheaper
43+
* due to its divide-and-conquer nature.
44+
*
45+
* The difference here however is that the first mask uses 0xff instead of 0x0f.
46+
* This doesn't make a difference when a byte is in the range [0, 9], but it has an extra advantage.
47+
* By setting the mask to 0xff we can already sum the two numbers in non-parsed form,
48+
* and the parsing will fix up the carry.
49+
* Here's how it works with an example:
50+
* Let's say A = [5 9 0 1] and B = [2 2 0 0], and we pass in A + B = [7 11 0 1]
51+
* We know that the highest number may only be 25 but that is no problem as the vectors only have
52+
* numbers in the range [0, 9] which means the sum vector has numbers in the range [0, 18].
53+
* Then [7 11 0 1] is transformed into 7 * 1000 + 11 * 100 + 0 * 10 + 1 = 8101,
54+
* which would have been the same as if the carry for the 11 would've been on the 7,
55+
* i.e. [8 1 0 1] would've been the right result, but that gives 8 * 1000 + 1 * 100 + 0 * 10 + 1 = *also* 8101.
56+
*/
57+
static uint32_t bc_parse_4_chars(uint32_t tmp)
58+
{
59+
uint32_t lower_digits = (tmp & 0xff00ff00) >> 8;
60+
uint32_t upper_digits = (tmp & 0x00ff00ff) * 10;
61+
62+
tmp = lower_digits + upper_digits;
63+
64+
lower_digits = (tmp & 0x00ff0000) >> 16;
65+
upper_digits = (tmp & 0x000000ff) * 100;
66+
67+
return lower_digits + upper_digits;
68+
}
69+
70+
/* This LUT encodes the decimal representation of numbers 0-100
71+
* such that we can avoid taking modulos and divisions which would be slow. */
72+
static const unsigned short LUT[100] = {
73+
0 | 0 << 8,
74+
0 | 1 << 8,
75+
0 | 2 << 8,
76+
0 | 3 << 8,
77+
0 | 4 << 8,
78+
0 | 5 << 8,
79+
0 | 6 << 8,
80+
0 | 7 << 8,
81+
0 | 8 << 8,
82+
0 | 9 << 8,
83+
1 | 0 << 8,
84+
1 | 1 << 8,
85+
1 | 2 << 8,
86+
1 | 3 << 8,
87+
1 | 4 << 8,
88+
1 | 5 << 8,
89+
1 | 6 << 8,
90+
1 | 7 << 8,
91+
1 | 8 << 8,
92+
1 | 9 << 8,
93+
2 | 0 << 8,
94+
2 | 1 << 8,
95+
2 | 2 << 8,
96+
2 | 3 << 8,
97+
2 | 4 << 8,
98+
2 | 5 << 8,
99+
2 | 6 << 8,
100+
2 | 7 << 8,
101+
2 | 8 << 8,
102+
2 | 9 << 8,
103+
3 | 0 << 8,
104+
3 | 1 << 8,
105+
3 | 2 << 8,
106+
3 | 3 << 8,
107+
3 | 4 << 8,
108+
3 | 5 << 8,
109+
3 | 6 << 8,
110+
3 | 7 << 8,
111+
3 | 8 << 8,
112+
3 | 9 << 8,
113+
4 | 0 << 8,
114+
4 | 1 << 8,
115+
4 | 2 << 8,
116+
4 | 3 << 8,
117+
4 | 4 << 8,
118+
4 | 5 << 8,
119+
4 | 6 << 8,
120+
4 | 7 << 8,
121+
4 | 8 << 8,
122+
4 | 9 << 8,
123+
5 | 0 << 8,
124+
5 | 1 << 8,
125+
5 | 2 << 8,
126+
5 | 3 << 8,
127+
5 | 4 << 8,
128+
5 | 5 << 8,
129+
5 | 6 << 8,
130+
5 | 7 << 8,
131+
5 | 8 << 8,
132+
5 | 9 << 8,
133+
6 | 0 << 8,
134+
6 | 1 << 8,
135+
6 | 2 << 8,
136+
6 | 3 << 8,
137+
6 | 4 << 8,
138+
6 | 5 << 8,
139+
6 | 6 << 8,
140+
6 | 7 << 8,
141+
6 | 8 << 8,
142+
6 | 9 << 8,
143+
7 | 0 << 8,
144+
7 | 1 << 8,
145+
7 | 2 << 8,
146+
7 | 3 << 8,
147+
7 | 4 << 8,
148+
7 | 5 << 8,
149+
7 | 6 << 8,
150+
7 | 7 << 8,
151+
7 | 8 << 8,
152+
7 | 9 << 8,
153+
8 | 0 << 8,
154+
8 | 1 << 8,
155+
8 | 2 << 8,
156+
8 | 3 << 8,
157+
8 | 4 << 8,
158+
8 | 5 << 8,
159+
8 | 6 << 8,
160+
8 | 7 << 8,
161+
8 | 8 << 8,
162+
8 | 9 << 8,
163+
9 | 0 << 8,
164+
9 | 1 << 8,
165+
9 | 2 << 8,
166+
9 | 3 << 8,
167+
9 | 4 << 8,
168+
9 | 5 << 8,
169+
9 | 6 << 8,
170+
9 | 7 << 8,
171+
9 | 8 << 8,
172+
9 | 9 << 8,
173+
};
174+
175+
/* Writes the character representation of the number encoded in value.
176+
* E.g. if value = 1234, then the string "1234" will be written to str. */
177+
static void bc_write_char_representation(uint32_t value, char *str)
178+
{
179+
uint32_t upper = value / 100; /* e.g. 12 */
180+
uint32_t lower = value % 100; /* e.g. 34 */
181+
182+
/* Note: little endian, so `lower` comes before `upper`! */
183+
uint32_t digits = LUT[lower] << 16 | LUT[upper];
184+
memcpy(str, &digits, sizeof(digits));
185+
}
36186

37187
/* Perform addition: N1 is added to N2 and the value is
38188
returned. The signs of N1 and N2 are ignored.
@@ -42,7 +192,8 @@ bc_num _bc_do_add(bc_num n1, bc_num n2, size_t scale_min)
42192
{
43193
bc_num sum;
44194
size_t sum_scale, sum_digits;
45-
char *n1ptr, *n2ptr, *sumptr;
195+
const char *n1ptr, *n2ptr;
196+
char *sumptr;
46197
size_t n1bytes, n2bytes;
47198
bool carry;
48199

@@ -53,7 +204,7 @@ bc_num _bc_do_add(bc_num n1, bc_num n2, size_t scale_min)
53204

54205
/* Zero extra digits made by scale_min. */
55206
if (scale_min > sum_scale) {
56-
sumptr = (char *) (sum->n_value + sum_scale + sum_digits);
207+
sumptr = (char * restrict) (sum->n_value + sum_scale + sum_digits);
57208
for (int count = scale_min - sum_scale; count > 0; count--) {
58209
*sumptr++ = 0;
59210
}
@@ -62,9 +213,9 @@ bc_num _bc_do_add(bc_num n1, bc_num n2, size_t scale_min)
62213
/* Start with the fraction part. Initialize the pointers. */
63214
n1bytes = n1->n_scale;
64215
n2bytes = n2->n_scale;
65-
n1ptr = (char *) (n1->n_value + n1->n_len + n1bytes - 1);
66-
n2ptr = (char *) (n2->n_value + n2->n_len + n2bytes - 1);
67-
sumptr = (char *) (sum->n_value + sum_scale + sum_digits - 1);
216+
n1ptr = n1->n_value + n1->n_len + n1bytes - 1;
217+
n2ptr = n2->n_value + n2->n_len + n2bytes - 1;
218+
sumptr = (char * restrict) (sum->n_value + sum_scale + sum_digits - 1);
68219

69220
/* Add the fraction part. First copy the longer fraction.*/
70221
if (n1bytes != n2bytes) {
@@ -85,6 +236,38 @@ bc_num _bc_do_add(bc_num n1, bc_num n2, size_t scale_min)
85236
n1bytes += n1->n_len;
86237
n2bytes += n2->n_len;
87238
carry = 0;
239+
240+
if (IS_LITTLE_ENDIAN) {
241+
/* The idea here is to work in a higher base, allowing summing digits in parallel. */
242+
while (n1bytes >= 4 && n2bytes >= 4) {
243+
n1ptr -= 4;
244+
n2ptr -= 4;
245+
sumptr -= 4;
246+
247+
/* Fetches the two number chunks into tmp1 and tmp2. */
248+
uint32_t tmp1, tmp2;
249+
memcpy(&tmp1, n1ptr + 1, sizeof(tmp1));
250+
memcpy(&tmp2, n2ptr + 1, sizeof(tmp2));
251+
/* We don't have to parse tmp1 and tmp2 separately, we can perform
252+
* the addition already and the parsing code will fix up the carry.
253+
* See comment above bc_parse_4_chars(). We can only add the carry
254+
* after parsing however because of endianness. */
255+
uint32_t tmp_sum = bc_parse_4_chars(tmp1 + tmp2) + carry;
256+
257+
/* We're handling 4 digits at once, i.e. the base is 10**4 here. */
258+
if (tmp_sum >= 10000) {
259+
tmp_sum -= 10000;
260+
carry = 1;
261+
} else {
262+
carry = 0;
263+
}
264+
265+
bc_write_char_representation(tmp_sum, sumptr + 1);
266+
n1bytes -= 4;
267+
n2bytes -= 4;
268+
}
269+
}
270+
88271
while ((n1bytes > 0) && (n2bytes > 0)) {
89272
*sumptr = *n1ptr-- + *n2ptr-- + carry;
90273
if (*sumptr > (BASE - 1)) {

0 commit comments

Comments
 (0)