php · nielsdos · May 23, 2024 · May 20, 2024 · May 20, 2024 · May 20, 2024
@@ -58,12 +58,68 @@ static inline void bc_digits_adjustment(BC_UINT_T *prod_uint, size_t prod_arr_si
 	}
 }
 
+/* This is based on the technique described in https://kholdstare.github.io/technical/2020/05/26/faster-integer-parsing.html.
+ * This function transforms AABBCCDD into 1000 * AA + 100 * BB + 10 * CC + DD,
+ * with the caveat that all components must be in the interval [0, 25] to prevent overflow
+ * due to the multiplication by power of 10 (10 * 25 = 250 is the largest number that fits in a byte).
+ * The advantage of this method instead of using shifts + 3 multiplications is that this is cheaper
+ * due to its divide-and-conquer nature.
+ */
+#if SIZEOF_SIZE_T == 4
+static uint32_t bc_parse_chunk_chars(const char *str)
+{
+	uint32_t tmp;
+	memcpy(&tmp, str, sizeof(tmp));
+#if !BC_LITTLE_ENDIAN
+	tmp = BC_BSWAP(tmp);
+#endif
+
+	uint32_t lower_digits = (tmp & 0x0f000f00) >> 8;
+	uint32_t upper_digits = (tmp & 0x000f000f) * 10;
+
+	tmp = lower_digits + upper_digits;
+
+	lower_digits = (tmp & 0x00ff0000) >> 16;
+	upper_digits = (tmp & 0x000000ff) * 100;
+
+	return lower_digits + upper_digits;
+}
+#elif SIZEOF_SIZE_T == 8
+static uint64_t bc_parse_chunk_chars(const char *str)
+{
+	uint64_t tmp;
+	memcpy(&tmp, str, sizeof(tmp));
+#if !BC_LITTLE_ENDIAN
+	tmp = BC_BSWAP(tmp);
+#endif
+
+	uint64_t lower_digits = (tmp & 0x0f000f000f000f00) >> 8;
+	uint64_t upper_digits = (tmp & 0x000f000f000f000f) * 10;
+
+	tmp = lower_digits + upper_digits;
+
+	lower_digits = (tmp & 0x00ff000000ff0000) >> 16;
+	upper_digits = (tmp & 0x000000ff000000ff) * 100;
+
+	tmp = lower_digits + upper_digits;
+
+	lower_digits = (tmp & 0x0000ffff00000000) >> 32;
+	upper_digits = (tmp & 0x000000000000ffff) * 10000;
+
+	return lower_digits + upper_digits;
+}
+#endif
+
 /*
  * Converts BCD to uint, going backwards from pointer n by the number of
  * characters specified by len.
  */
 static inline BC_UINT_T bc_partial_convert_to_uint(const char *n, size_t len)
 {
+	if (len == BC_MUL_UINT_DIGITS) {
+		return bc_parse_chunk_chars(n - BC_MUL_UINT_DIGITS + 1);
+	}
+
 	BC_UINT_T num = 0;
 	BC_UINT_T base = 1;
 
@@ -92,7 +148,7 @@ static inline void bc_convert_to_uint(BC_UINT_T *n_uint, const char *nend, size_
  * If the n_values of n1 and n2 are both 4 (32-bit) or 8 (64-bit) digits or less,
  * the calculation will be performed at high speed without using an array.
  */
-static inline void bc_fast_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_num *prod)
+static inline void bc_fast_mul(bc_num n1, size_t n1len, bc_num n2, size_t n2len, bc_num *prod)
 {
 	const char *n1end = n1->n_value + n1len - 1;
 	const char *n2end = n2->n_value + n2len - 1;
@@ -112,6 +168,52 @@ static inline void bc_fast_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc
 	}
 }
 
+#if BC_LITTLE_ENDIAN
+# define BC_ENCODE_LUT(A, B) ((A) | (B) << 4)
+#else
+# define BC_ENCODE_LUT(A, B) ((B) | (A) << 4)
+#endif
+
+#define LUT_ITERATE(_, A) \
+	_(A, 0), _(A, 1), _(A, 2), _(A, 3), _(A, 4), _(A, 5), _(A, 6), _(A, 7), _(A, 8), _(A, 9)
+
+/* This LUT encodes the decimal representation of numbers 0-100
+ * such that we can avoid taking modulos and divisions which would be slow. */
+static const unsigned char LUT[100] = {
+	LUT_ITERATE(BC_ENCODE_LUT, 0),
+	LUT_ITERATE(BC_ENCODE_LUT, 1),
+	LUT_ITERATE(BC_ENCODE_LUT, 2),
+	LUT_ITERATE(BC_ENCODE_LUT, 3),
+	LUT_ITERATE(BC_ENCODE_LUT, 4),
+	LUT_ITERATE(BC_ENCODE_LUT, 5),
+	LUT_ITERATE(BC_ENCODE_LUT, 6),
+	LUT_ITERATE(BC_ENCODE_LUT, 7),
+	LUT_ITERATE(BC_ENCODE_LUT, 8),
+	LUT_ITERATE(BC_ENCODE_LUT, 9),
+};
+
+static inline unsigned short bc_expand_lut(unsigned char c)
+{
+	return (c & 0x0f) | (c & 0xf0) << 4;
+}
+
+/* Writes the character representation of the number encoded in value.
+ * E.g. if value = 1234, then the string "1234" will be written to str. */
+static void bc_write_bcd_representation(uint32_t value, char *str)
+{
+	uint32_t upper = value / 100; /* e.g. 12 */
+	uint32_t lower = value % 100; /* e.g. 34 */
+
+#if BC_LITTLE_ENDIAN
+	/* Note: little endian, so `lower` comes before `upper`! */
+	uint32_t digits = bc_expand_lut(LUT[lower]) << 16 | bc_expand_lut(LUT[upper]);
+#else
+	/* Note: big endian, so `upper` comes before `lower`! */
+	uint32_t digits = bc_expand_lut(LUT[upper]) << 16 | bc_expand_lut(LUT[lower]);
+#endif
+	memcpy(str, &digits, sizeof(digits));
+}
+
 /*
  * Converts the BCD of bc_num by 4 (32 bits) or 8 (64 bits) digits to an array of BC_UINT_Ts.
  * The array is generated starting with the smaller digits.
@@ -180,10 +282,14 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, size_t n2len, bc
 	char *pend = pptr + prodlen - 1;
 	i = 0;
 	while (i < prod_arr_size - 1) {
-		for (size_t j = 0; j < BC_MUL_UINT_DIGITS; j++) {
-			*pend-- = prod_uint[i] % BASE;
-			prod_uint[i] /= BASE;
-		}
+#if BC_MUL_UINT_DIGITS == 4
+		bc_write_bcd_representation(prod_uint[i], pend - 3);
+		pend -= 4;
+#else
+		bc_write_bcd_representation(prod_uint[i] / 10000, pend - 7);
+		bc_write_bcd_representation(prod_uint[i] % 10000, pend - 3);
+		pend -= 8;
+#endif
 		i++;
 	}