author     Nicolas Pitre <nicolas.pitre@linaro.org>    2015-10-30 15:36:39 -0400
committer  Nicolas Pitre <nicolas.pitre@linaro.org>    2015-11-16 14:42:08 -0500
commit     461a5e51060c93f5844113f4be9dba513cc92830 (patch)
tree       f02ce68a22937ce0b8d2c609e77bccc02c86455a /include/asm-generic
parent     911918aa7ef6f868c135505b0015e42072c54682 (diff)
do_div(): generic optimization for constant divisor on 32-bit machines
64-by-32-bit divisions are prominent in the kernel, even on 32-bit machines. Luckily, many of them use a constant divisor that allows for a much faster multiplication by the divisor's reciprocal.

The compiler already performs this optimization when compiling a 32-by-32 division with a constant divisor. Unfortunately, on 32-bit machines, gcc does not optimize 64-by-32 divisions in that case, except for constant divisors that happen to be a power of 2.

Let's avoid the slow path whenever the divisor is constant by manually computing the reciprocal ourselves and performing the multiplication inline. In most cases, this improves performance of 64-by-32 divisions by about two orders of magnitude compared to the __div64_32() fallback, especially on architectures lacking a native div instruction.

The algorithm used here comes from the existing ARM code.

The __div64_const32_is_OK macro can be predefined by architectures to disable this optimization in some cases. For example, some ancient gcc version on ARM would crash with an ICE when fed this code.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
Acked-by: Alexey Brodkin <abrodkin@synopsys.com>
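To illustrate the reciprocal trick with a minimal, self-contained sketch (not code from this patch): for a 32-bit dividend and the constant divisor 10, m = ceil(2^35 / 10) = 0xCCCCCCCD, and n / 10 == (n * m) >> 35 for every 32-bit n. This is the kind of sequence gcc already emits for 32-by-32 divisions; the patch below applies the same idea by hand when the dividend is 64 bits wide, where gcc on a 32-bit target would otherwise fall back to __div64_32().

#include <stdint.h>
#include <stdio.h>

/* Division by the constant 10 via multiplication by its reciprocal:
 * 0xCCCCCCCD == ceil(2^35 / 10), so (n * 0xCCCCCCCD) >> 35 == n / 10
 * for every 32-bit n -- one multiply and one shift, no divide. */
static uint32_t div10_by_reciprocal(uint32_t n)
{
        return (uint32_t)(((uint64_t)n * 0xCCCCCCCDULL) >> 35);
}

int main(void)
{
        uint32_t n = 123456789;
        printf("%u %u\n", div10_by_reciprocal(n), n / 10); /* both print 12345678 */
        return 0;
}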
Diffstat (limited to 'include/asm-generic')
-rw-r--r--    include/asm-generic/div64.h    147
1 file changed, 147 insertions(+), 0 deletions(-)
diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h
index 5d974683193a..5a1bf1aff502 100644
--- a/include/asm-generic/div64.h
+++ b/include/asm-generic/div64.h
@@ -4,6 +4,9 @@
  * Copyright (C) 2003 Bernardo Innocenti <bernie@develer.com>
  * Based on former asm-ppc/div64.h and asm-m68knommu/div64.h
  *
+ * Optimization for constant divisors on 32-bit machines:
+ * Copyright (C) 2006-2015 Nicolas Pitre
+ *
  * The semantics of do_div() are:
  *
  * uint32_t do_div(uint64_t *n, uint32_t base)
@@ -34,6 +37,142 @@
 
 #include <linux/log2.h>
 
+/*
+ * If the divisor happens to be constant, we determine the appropriate
+ * inverse at compile time to turn the division into a few inline
+ * multiplications which ought to be much faster. And yet only if compiling
+ * with a sufficiently recent gcc version to perform proper 64-bit constant
+ * propagation.
+ *
+ * (It is unfortunate that gcc doesn't perform all this internally.)
+ */
+
+#ifndef __div64_const32_is_OK
+#define __div64_const32_is_OK (__GNUC__ >= 4)
+#endif
+
+#define __div64_const32(n, ___b) \
+({ \
+        /* \
+         * Multiplication by reciprocal of b: n / b = n * (p / b) / p \
+         * \
+         * We rely on the fact that most of this code gets optimized \
+         * away at compile time due to constant propagation and only \
+         * a few multiplication instructions should remain. \
+         * Hence this monstrous macro (static inline doesn't always \
+         * do the trick here). \
+         */ \
+        uint64_t ___res, ___x, ___t, ___m, ___n = (n); \
+        uint32_t ___p, ___bias, ___m_lo, ___m_hi, ___n_lo, ___n_hi; \
+ \
+        /* determine MSB of b */ \
+        ___p = 1 << ilog2(___b); \
+ \
+        /* compute m = ((p << 64) + b - 1) / b */ \
+        ___m = (~0ULL / ___b) * ___p; \
+        ___m += (((~0ULL % ___b + 1) * ___p) + ___b - 1) / ___b; \
+ \
+        /* one less than the dividend with highest result */ \
+        ___x = ~0ULL / ___b * ___b - 1; \
+ \
+        /* test our ___m with res = m * x / (p << 64) */ \
+        ___res = ((___m & 0xffffffff) * (___x & 0xffffffff)) >> 32; \
+        ___t = ___res += (___m & 0xffffffff) * (___x >> 32); \
+        ___res += (___x & 0xffffffff) * (___m >> 32); \
+        ___t = (___res < ___t) ? (1ULL << 32) : 0; \
+        ___res = (___res >> 32) + ___t; \
+        ___res += (___m >> 32) * (___x >> 32); \
+        ___res /= ___p; \
+ \
+        /* Now sanitize and optimize what we've got. */ \
+        if (~0ULL % (___b / (___b & -___b)) == 0) { \
+                /* special case, can be simplified to ... */ \
+                ___n /= (___b & -___b); \
+                ___m = ~0ULL / (___b / (___b & -___b)); \
+                ___p = 1; \
+                ___bias = 1; \
+        } else if (___res != ___x / ___b) { \
+                /* \
+                 * We can't get away without a bias to compensate \
+                 * for bit truncation errors. To avoid it we'd need an \
+                 * additional bit to represent m which would overflow \
+                 * a 64-bit variable. \
+                 * \
+                 * Instead we do m = p / b and n / b = (n * m + m) / p. \
+                 */ \
+                ___bias = 1; \
+                /* Compute m = (p << 64) / b */ \
+                ___m = (~0ULL / ___b) * ___p; \
+                ___m += ((~0ULL % ___b + 1) * ___p) / ___b; \
+        } else { \
+                /* \
+                 * Reduce m / p, and try to clear bit 31 of m when \
+                 * possible, otherwise that'll need extra overflow \
+                 * handling later. \
+                 */ \
+                uint32_t ___bits = -(___m & -___m); \
+                ___bits |= ___m >> 32; \
+                ___bits = (~___bits) << 1; \
+                /* \
+                 * If ___bits == 0 then setting bit 31 is unavoidable. \
+                 * Simply apply the maximum possible reduction in that \
+                 * case. Otherwise the MSB of ___bits indicates the \
+                 * best reduction we should apply. \
+                 */ \
+                if (!___bits) { \
+                        ___p /= (___m & -___m); \
+                        ___m /= (___m & -___m); \
+                } else { \
+                        ___p >>= ilog2(___bits); \
+                        ___m >>= ilog2(___bits); \
+                } \
+                /* No bias needed. */ \
+                ___bias = 0; \
+        } \
+ \
+        /* \
+         * Now we have a combination of 2 conditions: \
+         * \
+         * 1) whether or not we need to apply a bias, and \
+         * \
+         * 2) whether or not there might be an overflow in the cross \
+         *    product determined by (___m & ((1 << 63) | (1 << 31))). \
+         * \
+         * Select the best way to do (m_bias + m * n) / (p << 64). \
+         * From now on there will be actual runtime code generated. \
+         */ \
+ \
+        ___m_lo = ___m; \
+        ___m_hi = ___m >> 32; \
+        ___n_lo = ___n; \
+        ___n_hi = ___n >> 32; \
+ \
+        if (!___bias) { \
+                ___res = ((uint64_t)___m_lo * ___n_lo) >> 32; \
+        } else if (!(___m & ((1ULL << 63) | (1ULL << 31)))) { \
+                ___res = (___m + (uint64_t)___m_lo * ___n_lo) >> 32; \
+        } else { \
+                ___res = ___m + (uint64_t)___m_lo * ___n_lo; \
+                ___t = (___res < ___m) ? (1ULL << 32) : 0; \
+                ___res = (___res >> 32) + ___t; \
+        } \
+ \
+        if (!(___m & ((1ULL << 63) | (1ULL << 31)))) { \
+                ___res += (uint64_t)___m_lo * ___n_hi; \
+                ___res += (uint64_t)___m_hi * ___n_lo; \
+                ___res >>= 32; \
+        } else { \
+                ___t = ___res += (uint64_t)___m_lo * ___n_hi; \
+                ___res += (uint64_t)___m_hi * ___n_lo; \
+                ___t = (___res < ___t) ? (1ULL << 32) : 0; \
+                ___res = (___res >> 32) + ___t; \
+        } \
+ \
+        ___res += (uint64_t)___m_hi * ___n_hi; \
+ \
+        ___res /= ___p; \
+})
+
 extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
 
 /* The unnecessary pointer compare is there
@@ -47,6 +186,14 @@ extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
             is_power_of_2(__base)) { \
                 __rem = (n) & (__base - 1); \
                 (n) >>= ilog2(__base); \
+        } else if (__div64_const32_is_OK && \
+                   __builtin_constant_p(__base) && \
+                   __base != 0) { \
+                uint32_t __res_lo, __n_lo = (n); \
+                (n) = __div64_const32(n, __base); \
+                /* the remainder can be computed with 32-bit regs */ \
+                __res_lo = (n); \
+                __rem = __n_lo - __res_lo * __base; \
         } else if (likely(((n) >> 32) == 0)) { \
                 __rem = (uint32_t)(n) % __base; \
                 (n) = (uint32_t)(n) / __base; \
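For context, a hypothetical caller (function name and divisor chosen for illustration, not taken from the patch): per the do_div() semantics documented in this header, the macro leaves the quotient in its first argument and returns the remainder, and with this change a constant non-power-of-2 divisor now takes the inline __div64_const32() path instead of calling __div64_32().

#include <linux/types.h>
#include <asm/div64.h>

/* Hypothetical example: nanoseconds to milliseconds.  The divisor
 * 1000000 is a compile-time constant, so on a 32-bit machine this
 * do_div() expands to the reciprocal-multiplication sequence rather
 * than the out-of-line __div64_32() call. */
static u64 example_ns_to_ms(u64 ns)
{
        u32 rem = do_div(ns, 1000000); /* ns now holds the quotient */

        (void)rem; /* remainder is available here if needed */
        return ns;
}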