33
33
#include "private.h"
34
34
#include <stddef.h>
35
35
36
+ #define IS_LITTLE_ENDIAN (*(unsigned char *)&(uint16_t){1})
37
+
38
+ /* This is based on the technique described in https://kholdstare.github.io/technical/2020/05/26/faster-integer-parsing.html.
39
+ * This function transforms AABBCCDD into 1000 * AA + 100 * BB + 10 * CC + DD,
40
+ * with the caveat that all components must be in the interval [0, 25] to prevent overflow
41
+ * due to the multiplication by power of 10 (10 * 25 = 250 is the largest number that fits in a byte).
42
+ * The advantage of this method instead of using shifts + 3 multiplications is that this is cheaper
43
+ * due to its divide-and-conquer nature.
44
+ *
45
+ * The difference here however is that the first mask uses 0xff instead of 0x0f.
46
+ * This doesn't make a difference when a byte is in the range [0, 9], but it has an extra advantage.
47
+ * By setting the mask to 0xff we can already sum the two numbers in non-parsed form,
48
+ * and the parsing will fix up the carry.
49
+ * Here's how it works with an example:
50
+ * Let's say A = [5 9 0 1] and B = [2 2 0 0], and we pass in A + B = [7 11 0 1]
51
+ * We know that the highest number may only be 25 but that is no problem as the vectors only have
52
+ * numbers in the range [0, 9] which means the sum vector has numbers in the range [0, 18].
53
+ * Then [7 11 0 1] is transformed into 7 * 1000 + 11 * 100 + 0 * 10 + 1 = 8101,
54
+ * which would have been the same as if the carry for the 11 would've been on the 7,
55
+ * i.e. [8 1 0 1] would've been the right result, but that gives 8 * 1000 + 1 * 100 + 0 * 10 + 1 = *also* 8101.
56
+ */
57
+ static uint32_t bc_parse_4_chars (uint32_t tmp )
58
+ {
59
+ uint32_t lower_digits = (tmp & 0xff00ff00 ) >> 8 ;
60
+ uint32_t upper_digits = (tmp & 0x00ff00ff ) * 10 ;
61
+
62
+ tmp = lower_digits + upper_digits ;
63
+
64
+ lower_digits = (tmp & 0x00ff0000 ) >> 16 ;
65
+ upper_digits = (tmp & 0x000000ff ) * 100 ;
66
+
67
+ return lower_digits + upper_digits ;
68
+ }
69
+
70
+ /* This LUT encodes the decimal representation of numbers 0-100
71
+ * such that we can avoid taking modulos and divisions which would be slow. */
72
+ static const unsigned short LUT [100 ] = {
73
+ 0 | 0 << 8 ,
74
+ 0 | 1 << 8 ,
75
+ 0 | 2 << 8 ,
76
+ 0 | 3 << 8 ,
77
+ 0 | 4 << 8 ,
78
+ 0 | 5 << 8 ,
79
+ 0 | 6 << 8 ,
80
+ 0 | 7 << 8 ,
81
+ 0 | 8 << 8 ,
82
+ 0 | 9 << 8 ,
83
+ 1 | 0 << 8 ,
84
+ 1 | 1 << 8 ,
85
+ 1 | 2 << 8 ,
86
+ 1 | 3 << 8 ,
87
+ 1 | 4 << 8 ,
88
+ 1 | 5 << 8 ,
89
+ 1 | 6 << 8 ,
90
+ 1 | 7 << 8 ,
91
+ 1 | 8 << 8 ,
92
+ 1 | 9 << 8 ,
93
+ 2 | 0 << 8 ,
94
+ 2 | 1 << 8 ,
95
+ 2 | 2 << 8 ,
96
+ 2 | 3 << 8 ,
97
+ 2 | 4 << 8 ,
98
+ 2 | 5 << 8 ,
99
+ 2 | 6 << 8 ,
100
+ 2 | 7 << 8 ,
101
+ 2 | 8 << 8 ,
102
+ 2 | 9 << 8 ,
103
+ 3 | 0 << 8 ,
104
+ 3 | 1 << 8 ,
105
+ 3 | 2 << 8 ,
106
+ 3 | 3 << 8 ,
107
+ 3 | 4 << 8 ,
108
+ 3 | 5 << 8 ,
109
+ 3 | 6 << 8 ,
110
+ 3 | 7 << 8 ,
111
+ 3 | 8 << 8 ,
112
+ 3 | 9 << 8 ,
113
+ 4 | 0 << 8 ,
114
+ 4 | 1 << 8 ,
115
+ 4 | 2 << 8 ,
116
+ 4 | 3 << 8 ,
117
+ 4 | 4 << 8 ,
118
+ 4 | 5 << 8 ,
119
+ 4 | 6 << 8 ,
120
+ 4 | 7 << 8 ,
121
+ 4 | 8 << 8 ,
122
+ 4 | 9 << 8 ,
123
+ 5 | 0 << 8 ,
124
+ 5 | 1 << 8 ,
125
+ 5 | 2 << 8 ,
126
+ 5 | 3 << 8 ,
127
+ 5 | 4 << 8 ,
128
+ 5 | 5 << 8 ,
129
+ 5 | 6 << 8 ,
130
+ 5 | 7 << 8 ,
131
+ 5 | 8 << 8 ,
132
+ 5 | 9 << 8 ,
133
+ 6 | 0 << 8 ,
134
+ 6 | 1 << 8 ,
135
+ 6 | 2 << 8 ,
136
+ 6 | 3 << 8 ,
137
+ 6 | 4 << 8 ,
138
+ 6 | 5 << 8 ,
139
+ 6 | 6 << 8 ,
140
+ 6 | 7 << 8 ,
141
+ 6 | 8 << 8 ,
142
+ 6 | 9 << 8 ,
143
+ 7 | 0 << 8 ,
144
+ 7 | 1 << 8 ,
145
+ 7 | 2 << 8 ,
146
+ 7 | 3 << 8 ,
147
+ 7 | 4 << 8 ,
148
+ 7 | 5 << 8 ,
149
+ 7 | 6 << 8 ,
150
+ 7 | 7 << 8 ,
151
+ 7 | 8 << 8 ,
152
+ 7 | 9 << 8 ,
153
+ 8 | 0 << 8 ,
154
+ 8 | 1 << 8 ,
155
+ 8 | 2 << 8 ,
156
+ 8 | 3 << 8 ,
157
+ 8 | 4 << 8 ,
158
+ 8 | 5 << 8 ,
159
+ 8 | 6 << 8 ,
160
+ 8 | 7 << 8 ,
161
+ 8 | 8 << 8 ,
162
+ 8 | 9 << 8 ,
163
+ 9 | 0 << 8 ,
164
+ 9 | 1 << 8 ,
165
+ 9 | 2 << 8 ,
166
+ 9 | 3 << 8 ,
167
+ 9 | 4 << 8 ,
168
+ 9 | 5 << 8 ,
169
+ 9 | 6 << 8 ,
170
+ 9 | 7 << 8 ,
171
+ 9 | 8 << 8 ,
172
+ 9 | 9 << 8 ,
173
+ };
174
+
175
+ /* Writes the character representation of the number encoded in value.
176
+ * E.g. if value = 1234, then the string "1234" will be written to str. */
177
+ static void bc_write_char_representation (uint32_t value , char * str )
178
+ {
179
+ uint32_t upper = value / 100 ; /* e.g. 12 */
180
+ uint32_t lower = value % 100 ; /* e.g. 34 */
181
+
182
+ /* Note: little endian, so `lower` comes before `upper`! */
183
+ uint32_t digits = LUT [lower ] << 16 | LUT [upper ];
184
+ memcpy (str , & digits , sizeof (digits ));
185
+ }
36
186
37
187
/* Perform addition: N1 is added to N2 and the value is
38
188
returned. The signs of N1 and N2 are ignored.
@@ -42,7 +192,8 @@ bc_num _bc_do_add(bc_num n1, bc_num n2, size_t scale_min)
42
192
{
43
193
bc_num sum ;
44
194
size_t sum_scale , sum_digits ;
45
- char * n1ptr , * n2ptr , * sumptr ;
195
+ const char * n1ptr , * n2ptr ;
196
+ char * sumptr ;
46
197
size_t n1bytes , n2bytes ;
47
198
bool carry ;
48
199
@@ -53,7 +204,7 @@ bc_num _bc_do_add(bc_num n1, bc_num n2, size_t scale_min)
53
204
54
205
/* Zero extra digits made by scale_min. */
55
206
if (scale_min > sum_scale ) {
56
- sumptr = (char * ) (sum -> n_value + sum_scale + sum_digits );
207
+ sumptr = (char * restrict ) (sum -> n_value + sum_scale + sum_digits );
57
208
for (int count = scale_min - sum_scale ; count > 0 ; count -- ) {
58
209
* sumptr ++ = 0 ;
59
210
}
@@ -62,9 +213,9 @@ bc_num _bc_do_add(bc_num n1, bc_num n2, size_t scale_min)
62
213
/* Start with the fraction part. Initialize the pointers. */
63
214
n1bytes = n1 -> n_scale ;
64
215
n2bytes = n2 -> n_scale ;
65
- n1ptr = ( char * ) ( n1 -> n_value + n1 -> n_len + n1bytes - 1 ) ;
66
- n2ptr = ( char * ) ( n2 -> n_value + n2 -> n_len + n2bytes - 1 ) ;
67
- sumptr = (char * ) (sum -> n_value + sum_scale + sum_digits - 1 );
216
+ n1ptr = n1 -> n_value + n1 -> n_len + n1bytes - 1 ;
217
+ n2ptr = n2 -> n_value + n2 -> n_len + n2bytes - 1 ;
218
+ sumptr = (char * restrict ) (sum -> n_value + sum_scale + sum_digits - 1 );
68
219
69
220
/* Add the fraction part. First copy the longer fraction.*/
70
221
if (n1bytes != n2bytes ) {
@@ -85,6 +236,38 @@ bc_num _bc_do_add(bc_num n1, bc_num n2, size_t scale_min)
85
236
n1bytes += n1 -> n_len ;
86
237
n2bytes += n2 -> n_len ;
87
238
carry = 0 ;
239
+
240
+ if (IS_LITTLE_ENDIAN ) {
241
+ /* The idea here is to work in a higher base, allowing summing digits in parallel. */
242
+ while (n1bytes >= 4 && n2bytes >= 4 ) {
243
+ n1ptr -= 4 ;
244
+ n2ptr -= 4 ;
245
+ sumptr -= 4 ;
246
+
247
+ /* Fetches the two number chunks into tmp1 and tmp2. */
248
+ uint32_t tmp1 , tmp2 ;
249
+ memcpy (& tmp1 , n1ptr + 1 , sizeof (tmp1 ));
250
+ memcpy (& tmp2 , n2ptr + 1 , sizeof (tmp2 ));
251
+ /* We don't have to parse tmp1 and tmp2 separately, we can perform
252
+ * the addition already and the parsing code will fix up the carry.
253
+ * See comment above bc_parse_4_chars(). We can only add the carry
254
+ * after parsing however because of endianness. */
255
+ uint32_t tmp_sum = bc_parse_4_chars (tmp1 + tmp2 ) + carry ;
256
+
257
+ /* We're handling 4 digits at once, i.e. the base is 10**4 here. */
258
+ if (tmp_sum >= 10000 ) {
259
+ tmp_sum -= 10000 ;
260
+ carry = 1 ;
261
+ } else {
262
+ carry = 0 ;
263
+ }
264
+
265
+ bc_write_char_representation (tmp_sum , sumptr + 1 );
266
+ n1bytes -= 4 ;
267
+ n2bytes -= 4 ;
268
+ }
269
+ }
270
+
88
271
while ((n1bytes > 0 ) && (n2bytes > 0 )) {
89
272
* sumptr = * n1ptr -- + * n2ptr -- + carry ;
90
273
if (* sumptr > (BASE - 1 )) {
0 commit comments