php · SakiTakamachi · May 7, 2024 · May 5, 2024 · May 5, 2024 · May 5, 2024
diff --git a/ext/bcmath/libbcmath/src/bcmath.h b/ext/bcmath/libbcmath/src/bcmath.h
@@ -78,6 +78,66 @@ typedef struct bc_struct {
 #define LONG_MAX 0x7ffffff
 #endif
 
+/* This will be 0x01010101 for 32-bit and 0x0101010101010101 for 64-bit */
+#define SWAR_ONES (~((size_t) 0) / 0xFF)
+/* This repeats a byte `x` into an entire 32/64-bit word.
+ * Example: SWAR_REPEAT(0xAB) will be 0xABABABAB for 32-bit and 0xABABABABABABABAB for 64-bit. */
+#define SWAR_REPEAT(x) (SWAR_ONES * (x))
+
+/* Bytes swap */
+#if defined(_MSC_VER)
+#  include <stdlib.h>
+#  define BSWAP32(u) _byteswap_ulong(u)
+#  define BSWAP64(u) _byteswap_uint64(u)
+#else
+#  ifdef __has_builtin
+#    if __has_builtin(__builtin_bswap32)
+#      define BSWAP32(u) __builtin_bswap32(u)
+#    endif // __has_builtin(__builtin_bswap32)
+#    if __has_builtin(__builtin_bswap64)
+#      define BSWAP64(u) __builtin_bswap64(u)
+#    endif // __has_builtin(__builtin_bswap64)
+#  elif defined(__GNUC__)
+#    define BSWAP32(u) __builtin_bswap32(u)
+#    define BSWAP64(u) __builtin_bswap64(u)
+#  endif // __has_builtin
+#endif // defined(_MSC_VER)
+#ifndef BSWAP32
+inline uint32_t BSWAP32(uint32_t u)
+{
+  return (((u & 0xff000000) >> 24)
+          | ((u & 0x00ff0000) >>  8)
+          | ((u & 0x0000ff00) <<  8)
+          | ((u & 0x000000ff) << 24));
+}
+#endif
+#ifndef BSWAP64
+inline uint64_t BSWAP64(uint64_t u)
+{
+   return (((u & 0xff00000000000000ULL) >> 56)
+          | ((u & 0x00ff000000000000ULL) >> 40)
+          | ((u & 0x0000ff0000000000ULL) >> 24)
+          | ((u & 0x000000ff00000000ULL) >>  8)
+          | ((u & 0x00000000ff000000ULL) <<  8)
+          | ((u & 0x0000000000ff0000ULL) << 24)
+          | ((u & 0x000000000000ff00ULL) << 40)
+          | ((u & 0x00000000000000ffULL) << 56));
+}
+#endif
+
+#if SIZEOF_SIZE_T >= 8
+#define BC_BSWAP(u) BSWAP64(u)
+#define BC_UINT_T uint64_t
+#else
+#define BC_BSWAP(u) BSWAP32(u)
+#define BC_UINT_T uint32_t
+#endif
+
+#ifdef WORDS_BIGENDIAN
+#define BC_LITTLE_ENDIAN 0
+#else
+#define BC_LITTLE_ENDIAN 1
+#endif
 
 /* Function Prototypes */
 

@@ -20,12 +20,6 @@
 # include <emmintrin.h>
 #endif
 
-/* This will be 0x01010101 for 32-bit and 0x0101010101010101 */
-#define SWAR_ONES (~((size_t) 0) / 0xFF)
-/* This repeats a byte `x` into an entire 32/64-bit word.
- * Example: SWAR_REPEAT(0xAB) will be 0xABABABAB for 32-bit and 0xABABABABABABABAB for 64-bit. */
-#define SWAR_REPEAT(x) (SWAR_ONES * (x))
-
 static char *bc_copy_and_shift_numbers(char *restrict dest, const char *source, const char *source_end, unsigned char shift, bool add)
 {
 	size_t bulk_shift = SWAR_REPEAT(shift);

@@ -124,36 +124,33 @@ bc_num _bc_do_add(bc_num n1, bc_num n2, size_t scale_min)
 bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 {
 	bc_num diff;
-	size_t diff_scale, diff_len;
-	size_t min_scale, min_len;
-	size_t borrow, count;
+
+	size_t diff_scale = MAX(n1->n_scale, n2->n_scale);
+	size_t diff_len = EXPECTED(n1->n_len >= n2->n_len) ? n1->n_len : n2->n_len;
+	size_t min_scale = MIN(n1->n_scale, n2->n_scale);
+	size_t min_len = n1->n_len >= n2->n_len ? n2->n_len : n1->n_len;
+	size_t min_bytes = min_len + min_scale;
+	size_t borrow = 0;
 	int val;
 	char *n1ptr, *n2ptr, *diffptr;
 
 	/* Allocate temporary storage. */
-	diff_len = MAX(n1->n_len, n2->n_len);
-	diff_scale = MAX(n1->n_scale, n2->n_scale);
-	min_len = MIN(n1->n_len, n2->n_len);
-	min_scale = MIN(n1->n_scale, n2->n_scale);
-	diff = bc_new_num (diff_len, MAX(diff_scale, scale_min));
+	diff = bc_new_num (n1->n_len, MAX(diff_scale, scale_min));
 
 	/* Initialize the subtract. */
 	n1ptr = (char *) (n1->n_value + n1->n_len + n1->n_scale - 1);
 	n2ptr = (char *) (n2->n_value + n2->n_len + n2->n_scale - 1);
 	diffptr = (char *) (diff->n_value + diff_len + diff_scale - 1);
 
-	/* Subtract the numbers. */
-	borrow = 0;
-
 	/* Take care of the longer scaled number. */
 	if (n1->n_scale != min_scale) {
 		/* n1 has the longer scale */
-		for (count = n1->n_scale - min_scale; count > 0; count--) {
+		for (size_t count = n1->n_scale - min_scale; count > 0; count--) {
 			*diffptr-- = *n1ptr--;
 		}
 	} else {
 		/* n2 has the longer scale */
-		for (count = n2->n_scale - min_scale; count > 0; count--) {
+		for (size_t count = n2->n_scale - min_scale; count > 0; count--) {
 			val = -*n2ptr-- - borrow;
 			if (val < 0) {
 				val += BASE;
@@ -166,7 +163,57 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 	}
 
 	/* Now do the equal length scale and integer parts. */
-	for (count = 0; count < min_len + min_scale; count++) {
+	size_t sub_count = 0;
+	if (min_bytes >= sizeof(BC_UINT_T)) {
+		diffptr++;
+		n1ptr++;
+		n2ptr++;
+		while (sub_count + sizeof(BC_UINT_T) <= min_bytes) {
+			diffptr -= sizeof(BC_UINT_T);
+			n1ptr -= sizeof(BC_UINT_T);
+			n2ptr -= sizeof(BC_UINT_T);
+
+			BC_UINT_T n1bytes;
+			BC_UINT_T n2bytes;
+			memcpy(&n1bytes, n1ptr, sizeof(n1bytes));
+			memcpy(&n2bytes, n2ptr, sizeof(n2bytes));
+
+#if BC_LITTLE_ENDIAN
+			/* Bytes swap */
+			n1bytes = BC_BSWAP(n1bytes);
+			n2bytes = BC_BSWAP(n2bytes);
+#endif
+
+			n1bytes -= (n2bytes + borrow);
-			n1bytes -= (n2bytes + borrow);
+			n1bytes -= n2bytes + borrow;
-			n1bytes -= (n2bytes + borrow);
+			n1bytes -= n2bytes + borrow;
+			/* If the most significant 4 bits of the 8 bytes are not 0, a carry-down has occurred. */
+			bool tmp_borrow = n1bytes >= ((BC_UINT_T) 0x10 << (8 * (sizeof(BC_UINT_T) - 1)));
-			bool tmp_borrow = n1bytes >= ((BC_UINT_T) 0x10 << (8 * (sizeof(BC_UINT_T) - 1)));
+			bool tmp_borrow = n1bytes & ((BC_UINT_T) 1 << (8 * sizeof(BC_UINT_T) - 1));
-			bool tmp_borrow = n1bytes >= ((BC_UINT_T) 0x10 << (8 * (sizeof(BC_UINT_T) - 1)));
+			bool tmp_borrow = n1bytes & ((BC_UINT_T) 1 << (8 * sizeof(BC_UINT_T) - 1));
+
+			/*
+			 * If any one of the upper 4 bits of each of the 8 bytes is 1, subtract 6 from that byte.
+			 * The fact that the upper 4 bits are not 0 means that a carry-down has occurred, and when
+			 * the hexadecimal number is carried down, there is a difference of 6 from the decimal
+			 * calculation, so 6 is subtracted.
+			 * Also, set all upper 4 bits to 0.
+			 */
+			BC_UINT_T borrow_mask = (((n1bytes | (n1bytes >> 1) | (n1bytes >> 2) | (n1bytes >> 3)) & SWAR_REPEAT(0x10)) * 0x06) >> 4;
-			BC_UINT_T borrow_mask = (((n1bytes | (n1bytes >> 1) | (n1bytes >> 2) | (n1bytes >> 3)) & SWAR_REPEAT(0x10)) * 0x06) >> 4;
+			BC_UINT_T borrow_mask = n1bytes | (n1bytes >> 1);
+			borrow_mask |= borrow_mask >> 2;
+			borrow_mask = ((borrow_mask & SWAR_REPEAT(0x10)) * 0x06) >> 4;
-			BC_UINT_T borrow_mask = (((n1bytes | (n1bytes >> 1) | (n1bytes >> 2) | (n1bytes >> 3)) & SWAR_REPEAT(0x10)) * 0x06) >> 4;
+			BC_UINT_T borrow_mask = n1bytes | (n1bytes >> 1);
+			borrow_mask |= borrow_mask >> 2;
+			borrow_mask = ((borrow_mask & SWAR_REPEAT(0x10)) * 0x06) >> 4;
+			n1bytes = (n1bytes & SWAR_REPEAT(0x0F)) - borrow_mask;
+
+#if BC_LITTLE_ENDIAN
+			/* Bytes swap */
+			n1bytes = BC_BSWAP(n1bytes);
+#endif
+
+			memcpy(diffptr, &n1bytes, sizeof(n1bytes));
+
+			borrow = tmp_borrow;
+			sub_count += sizeof(BC_UINT_T);
+		}
+		diffptr--;
+		n1ptr--;
+		n2ptr--;
+	}
+
+	for (; sub_count < min_bytes; sub_count++) {
 		val = *n1ptr-- - *n2ptr-- - borrow;
 		if (val < 0) {
 			val += BASE;
@@ -179,7 +226,7 @@ bc_num _bc_do_sub(bc_num n1, bc_num n2, size_t scale_min)
 
 	/* If n1 has more digits than n2, we now do that subtract. */
 	if (diff_len != min_len) {
-		for (count = diff_len - min_len; count > 0; count--) {
+		for (size_t count = diff_len - min_len; count > 0; count--) {
 			val = *n1ptr-- - borrow;
 			if (val < 0) {
 				val += BASE;