[ARM] 3611/4: optimize do_div() when divisor is constant

On ARM all divisions have to be performed "manually". For 64-bit divisions that may take more than a hundred cycles in many cases.

With 32-bit divisions gcc already uses the reciprocal of constant divisors to perform a multiplication, but not with 64-bit divisions.

Since the kernel is increasingly relying upon 64-bit divisions it is worth optimizing at least those cases where the divisor is a constant. This is what this patch does using plain C code that gets optimized away at compile time.

For example, despite the amount of added C code, do_div(x, 10000) now produces the following assembly code (where x is assigned to r0-r1):

	adr	r4, .L0
	ldmia	r4, {r4-r5}
	umull	r2, r3, r4, r0
	mov	r2, #0
	umlal	r3, r2, r5, r0
	umlal	r3, r2, r4, r1
	mov	r3, #0
	umlal	r2, r3, r5, r1
	mov	r0, r2, lsr #11
	orr	r0, r0, r3, lsl #21
	mov	r1, r3, lsr #11
	...
.L0:
	.word	948328779
	.word	879609302

which is the fastest that can be done for any value of x in that case, many times faster than the __do_div64 code (except for the small x value space for which the result ends up being zero or a single bit).

The fact that this code is generated inline produces a tiny increase in .text size, but not significant compared to the needed code around each __do_div64 call site this code is replacing.

The algorithm used has been validated on a 16-bit scale for all possible values, and then recodified for 64-bit values. Furthermore I've been running it with the final BUG_ON() uncommented for over two months now with no problem.

Note that this new code is compiled with gcc versions 4.0 or later. Earlier gcc versions proved themselves too problematic and only the original code is used with them.

Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
author: Nicolas Pitre <nico@cam.org> 2006-12-05 22:13:18 -0500
committer: Russell King <rmk+kernel@arm.linux.org.uk> 2006-12-07 11:06:09 -0500
commit: fa4adc614922c24601320e55bc5a1f837abad6e9 (patch)
tree: f93979fc2d9e2e2dad6edd3fa92b084a2645976a /include
parent: 0215ffb08ce99e2bb59eca114a99499a4d06e704 (diff)
1 files changed, 179 insertions, 1 deletions
diff --git a/include/asm-arm/div64.h b/include/asm-arm/div64.h
index 3682616804ca..37e0a96e8789 100644
--- a/include/asm-arm/div64.h
+++ b/include/asm-arm/div64.h
@@ -27,7 +27,7 @@
 #define __xh "r1"
 #endif
-#define do_div(n,base)                                          \
+#define __do_div_asm(n, base)                                   \
 ({                                                              \
        register unsigned int __base      asm("r4") = base;     \
        register unsigned long long __n   asm("r0") = n;        \
@@ -45,4 +45,182 @@
        __rem;                                                  \
 })
+#if __GNUC__ < 4
+/*
+ * gcc versions earlier than 4.0 are simply too problematic for the
+ * optimized implementation below. First, gcc PR 15089 tends to be
+ * triggered by more complex constructs; second, spurious .global
+ * __udivsi3 directives are inserted even if none of those symbols are
+ * referenced in the generated code; and finally those gcc versions are
+ * not able to do constant propagation on long long values anyway.
+ * With such compilers do_div() is therefore a plain alias for
+ * __do_div_asm().
+ */
+#define do_div(n, base) __do_div_asm(n, base)
+#elif __GNUC__ >= 4
+#include <asm/bug.h>
+/*
+ * do_div(n, base): divide n by base (base is converted to unsigned int).
+ * The macro evaluates to the remainder (__r). In the two inline paths
+ * below n is overwritten with the quotient; n is expanded more than
+ * once, so it must be an lvalue free of side effects.
+ *
+ * If the divisor happens to be constant, we determine the appropriate
+ * inverse at compile time to turn the division into a few inline
+ * multiplications instead which is much faster. And yet only if compiling
+ * for ARMv4 or higher (we need umull/umlal) and if the gcc version is
+ * sufficiently recent to perform proper long long constant propagation.
+ * (It is unfortunate that gcc doesn't perform all this internally.)
+ *
+ * Three cases are distinguished:
+ *  - non-constant or zero divisor, or a non-power-of-2 constant when
+ *    __LINUX_ARM_ARCH__ < 4: defer to __do_div_asm();
+ *  - constant power of 2: plain mask and divide, left to gcc;
+ *  - any other constant: multiplication by a reciprocal (__m) computed
+ *    at compile time, followed by a division by the power of 2 __p.
+ *
+ * NOTE(review): the first asm of the !__c case names the 64-bit operand
+ * __m as %1 whereas every other 64-bit operand uses %Q/%R; confirm that
+ * %1 selects the low word on big-endian builds too.
+ */
+#define do_div(n, base)                                                 \
+({                                                                      \
+        unsigned int __r, __b = (base);                                 \
+        if (!__builtin_constant_p(__b) || __b == 0 ||                   \
+            (__LINUX_ARM_ARCH__ < 4 && (__b & (__b - 1)) != 0)) {       \
+                /* non-constant divisor (or zero): slow path */         \
+                __r = __do_div_asm(n, __b);                             \
+        } else if ((__b & (__b - 1)) == 0) {                            \
+                /* Trivial: __b is constant and a power of 2 */         \
+                /* gcc does the right thing with this code.  */         \
+                __r = n;                                                \
+                __r &= (__b - 1);                                       \
+                n /= __b;                                               \
+        } else {                                                        \
+                /* Multiply by inverse of __b: n/b = n*(p/b)/p       */ \
+                /* We rely on the fact that most of this code gets   */ \
+                /* optimized away at compile time due to constant    */ \
+                /* propagation and only a couple inline assembly     */ \
+                /* instructions should remain. Better avoid any      */ \
+                /* code construct that might prevent that.           */ \
+                unsigned long long __res, __x, __t, __m, __n = n;       \
+                unsigned int __c, __p, __z = 0;                         \
+                /* preserve low part of n for remainder computation */  \
+                __r = __n;                                              \
+                /* determine number of bits to represent __b */         \
+                __p = 1 << __div64_fls(__b);                            \
+                /* compute __m = ((__p << 64) + __b - 1) / __b */       \
+                __m = (~0ULL / __b) * __p;                              \
+                __m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b;     \
+                /* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */ \
+                __x = ~0ULL / __b * __b - 1;                            \
+                __res = (__m & 0xffffffff) * (__x & 0xffffffff);        \
+                __res >>= 32;                                           \
+                __res += (__m & 0xffffffff) * (__x >> 32);              \
+                __t = __res;                                            \
+                __res += (__x & 0xffffffff) * (__m >> 32);              \
+                __t = (__res < __t) ? (1ULL << 32) : 0;                 \
+                __res = (__res >> 32) + __t;                            \
+                __res += (__m >> 32) * (__x >> 32);                     \
+                __res /= __p;                                           \
+                /* Now sanitize and optimize what we've got. */         \
+                if (~0ULL % (__b / (__b & -__b)) == 0) {                \
+                        /* those cases can be simplified with: */       \
+                        __n /= (__b & -__b);                            \
+                        __m = ~0ULL / (__b / (__b & -__b));             \
+                        __p = 1;                                        \
+                        __c = 1;                                        \
+                } else if (__res != __x / __b) {                        \
+                        /* We can't get away without a correction    */ \
+                        /* to compensate for bit truncation errors.  */ \
+                        /* To avoid it we'd need an additional bit   */ \
+                        /* to represent __m which would overflow it. */ \
+                        /* Instead we do m=p/b and n/b=(n*m+m)/p.    */ \
+                        __c = 1;                                        \
+                        /* Compute __m = (__p << 64) / __b */           \
+                        __m = (~0ULL / __b) * __p;                      \
+                        __m += ((~0ULL % __b + 1) * __p) / __b;         \
+                } else {                                                \
+                        /* Reduce __m/__p, and try to clear bit 31   */ \
+                        /* of __m when possible otherwise that'll    */ \
+                        /* need extra overflow handling later.       */ \
+                        unsigned int __bits = -(__m & -__m);            \
+                        __bits |= __m >> 32;                            \
+                        __bits = (~__bits) << 1;                        \
+                        /* If __bits == 0 then setting bit 31 is     */ \
+                        /* unavoidable.  Simply apply the maximum    */ \
+                        /* possible reduction in that case.          */ \
+                        /* Otherwise the MSB of __bits indicates the */ \
+                        /* best reduction we should apply.           */ \
+                        if (!__bits) {                                  \
+                                __p /= (__m & -__m);                    \
+                                __m /= (__m & -__m);                    \
+                        } else {                                        \
+                                __p >>= __div64_fls(__bits);            \
+                                __m >>= __div64_fls(__bits);            \
+                        }                                               \
+                        /* No correction needed. */                     \
+                        __c = 0;                                        \
+                }                                                       \
+                /* Now we have a combination of 2 conditions:        */ \
+                /* 1) whether or not we need a correction (__c), and */ \
+                /* 2) whether or not there might be an overflow in   */ \
+                /*    the cross product (__m & ((1<<63) | (1<<31)))  */ \
+                /* Select the best insn combination to perform the   */ \
+                /* actual __m * __n / (__p << 64) operation.         */ \
+                if (!__c) {                                             \
+                        asm (   "umull  %Q0, %R0, %1, %Q2\n\t"          \
+                                "mov    %Q0, #0"                        \
+                                : "=&r" (__res)                         \
+                                : "r" (__m), "r" (__n)                  \
+                                : "cc" );                               \
+                } else if (!(__m & ((1ULL << 63) | (1ULL << 31)))) {    \
+                        __res = __m;                                    \
+                        asm (   "umlal  %Q0, %R0, %Q1, %Q2\n\t"         \
+                                "mov    %Q0, #0"                        \
+                                : "+r" (__res)                          \
+                                : "r" (__m), "r" (__n)                  \
+                                : "cc" );                               \
+                } else {                                                \
+                        asm (   "umull  %Q0, %R0, %Q1, %Q2\n\t"         \
+                                "cmn    %Q0, %Q1\n\t"                   \
+                                "adcs   %R0, %R0, %R1\n\t"              \
+                                "adc    %Q0, %3, #0"                    \
+                                : "=&r" (__res)                         \
+                                : "r" (__m), "r" (__n), "r" (__z)       \
+                                : "cc" );                               \
+                }                                                       \
+                if (!(__m & ((1ULL << 63) | (1ULL << 31)))) {           \
+                        asm (   "umlal  %R0, %Q0, %R1, %Q2\n\t"         \
+                                "umlal  %R0, %Q0, %Q1, %R2\n\t"         \
+                                "mov    %R0, #0\n\t"                    \
+                                "umlal  %Q0, %R0, %R1, %R2"             \
+                                : "+r" (__res)                          \
+                                : "r" (__m), "r" (__n)                  \
+                                : "cc" );                               \
+                } else {                                                \
+                        asm (   "umlal  %R0, %Q0, %R2, %Q3\n\t"         \
+                                "umlal  %R0, %1, %Q2, %R3\n\t"          \
+                                "mov    %R0, #0\n\t"                    \
+                                "adds   %Q0, %1, %Q0\n\t"               \
+                                "adc    %R0, %R0, #0\n\t"               \
+                                "umlal  %Q0, %R0, %R2, %R3"             \
+                                : "+r" (__res), "+r" (__z)              \
+                                : "r" (__m), "r" (__n)                  \
+                                : "cc" );                               \
+                }                                                       \
+                __res /= __p;                                           \
+                /* The remainder can be computed with 32-bit regs    */ \
+                /* only, and gcc is good at that.                    */ \
+                {                                                       \
+                        unsigned int __res0 = __res;                    \
+                        unsigned int __b0 = __b;                        \
+                        __r -= __res0 * __b0;                           \
+                }                                                       \
+                /* BUG_ON(__r >= __b || __res * __b + __r != n); */     \
+                n = __res;                                              \
+        }                                                               \
+        __r;                                                            \
+})
+/*
+ * Our own fls implementation, to make sure constant propagation is fine.
+ *
+ * Evaluates to the zero-based index of the most significant set bit of
+ * 'bits' (taken as an unsigned int), found by testing successively
+ * smaller groups of bits: 16, 8, 4, 2 and finally 1. Note that both 0
+ * and 1 yield 0, so the result is not that of a one-based fls().
+ */
+#define __div64_fls(bits)                                               \
+({                                                                      \
+        unsigned int __left = (bits), __nr = 0;                         \
+        if (__left & 0xffff0000) __nr += 16, __left >>= 16;             \
+        if (__left & 0x0000ff00) __nr +=  8, __left >>=  8;             \
+        if (__left & 0x000000f0) __nr +=  4, __left >>=  4;             \
+        if (__left & 0x0000000c) __nr +=  2, __left >>=  2;             \
+        if (__left & 0x00000002) __nr +=  1;                            \
+        __nr;                                                           \
+})
+#endif
 #endif
author	Nicolas Pitre <nico@cam.org>	2006-12-05 22:13:18 -0500
committer	Russell King <rmk+kernel@arm.linux.org.uk>	2006-12-07 11:06:09 -0500
commit	fa4adc614922c24601320e55bc5a1f837abad6e9 (patch)
tree	f93979fc2d9e2e2dad6edd3fa92b084a2645976a /include
parent	0215ffb08ce99e2bb59eca114a99499a4d06e704 (diff)

diff --git a/include/asm-arm/div64.h b/include/asm-arm/div64.h index 3682616804ca..37e0a96e8789 100644 --- a/include/asm-arm/div64.h +++ b/include/asm-arm/div64.h
@@ -27,7 +27,7 @@
27	#define __xh "r1"	27	#define __xh "r1"
28	#endif	28	#endif
29		29
30	#define do_div(n,base) \	30	#define __do_div_asm(n, base) \
31	({ \	31	({ \
32	register unsigned int __base asm("r4") = base; \	32	register unsigned int __base asm("r4") = base; \
33	register unsigned long long __n asm("r0") = n; \	33	register unsigned long long __n asm("r0") = n; \
@@ -45,4 +45,182 @@
45	__rem; \	45	__rem; \
46	})	46	})
47		47
		48	#if __GNUC__ < 4
		49
		50	/*
		51	* gcc versions earlier than 4.0 are simply too problematic for the
		52	* optimized implementation below. First there is gcc PR 15089 that
		53	* tend to trig on more complex constructs, spurious .global __udivsi3
		54	* are inserted even if none of those symbols are referenced in the
		55	* generated code, and those gcc versions are not able to do constant
		56	* propagation on long long values anyway.
		57	*/
		58	#define do_div(n, base) __do_div_asm(n, base)
		59
		60	#elif __GNUC__ >= 4
		61
		62	#include <asm/bug.h>
		63
		64	/*
		65	* If the divisor happens to be constant, we determine the appropriate
		66	* inverse at compile time to turn the division into a few inline
		67	* multiplications instead which is much faster. And yet only if compiling
		68	* for ARMv4 or higher (we need umull/umlal) and if the gcc version is
		69	* sufficiently recent to perform proper long long constant propagation.
		70	* (It is unfortunate that gcc doesn't perform all this internally.)
		71	*/
		72	#define do_div(n, base) \
		73	({ \
		74	unsigned int __r, __b = (base); \
		75	if (!__builtin_constant_p(__b) || __b == 0 || \
		76	(__LINUX_ARM_ARCH__ < 4 && (__b & (__b - 1)) != 0)) { \
		77	/* non-constant divisor (or zero): slow path */ \
		78	__r = __do_div_asm(n, __b); \
		79	} else if ((__b & (__b - 1)) == 0) { \
		80	/* Trivial: __b is constant and a power of 2 */ \
		81	/* gcc does the right thing with this code. */ \
		82	__r = n; \
		83	__r &= (__b - 1); \
		84	n /= __b; \
		85	} else { \
		86	/* Multiply by inverse of __b: n/b = n*(p/b)/p */ \
		87	/* We rely on the fact that most of this code gets */ \
		88	/* optimized away at compile time due to constant */ \
		89	/* propagation and only a couple inline assembly */ \
		90	/* instructions should remain. Better avoid any */ \
		91	/* code construct that might prevent that. */ \
		92	unsigned long long __res, __x, __t, __m, __n = n; \
		93	unsigned int __c, __p, __z = 0; \
		94	/* preserve low part of n for reminder computation */ \
		95	__r = __n; \
		96	/* determine number of bits to represent __b */ \
		97	__p = 1 << __div64_fls(__b); \
		98	/* compute __m = ((__p << 64) + __b - 1) / __b */ \
		99	__m = (~0ULL / __b) * __p; \
		100	__m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b; \
		101	/* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */ \
		102	__x = ~0ULL / __b * __b - 1; \
		103	__res = (__m & 0xffffffff) * (__x & 0xffffffff); \
		104	__res >>= 32; \
		105	__res += (__m & 0xffffffff) * (__x >> 32); \
		106	__t = __res; \
		107	__res += (__x & 0xffffffff) * (__m >> 32); \
		108	__t = (__res < __t) ? (1ULL << 32) : 0; \
		109	__res = (__res >> 32) + __t; \
		110	__res += (__m >> 32) * (__x >> 32); \
		111	__res /= __p; \
		112	/* Now sanitize and optimize what we've got. */ \
		113	if (~0ULL % (__b / (__b & -__b)) == 0) { \
		114	/* those cases can be simplified with: */ \
		115	__n /= (__b & -__b); \
		116	__m = ~0ULL / (__b / (__b & -__b)); \
		117	__p = 1; \
		118	__c = 1; \
		119	} else if (__res != __x / __b) { \
		120	/* We can't get away without a correction */ \
		121	/* to compensate for bit truncation errors. */ \
		122	/* To avoid it we'd need an additional bit */ \
		123	/* to represent __m which would overflow it. */ \
		124	/* Instead we do m=p/b and n/b=(n*m+m)/p. */ \
		125	__c = 1; \
		126	/* Compute __m = (__p << 64) / __b */ \
		127	__m = (~0ULL / __b) * __p; \
		128	__m += ((~0ULL % __b + 1) * __p) / __b; \
		129	} else { \
		130	/* Reduce __m/__p, and try to clear bit 31 */ \
		131	/* of __m when possible otherwise that'll */ \
		132	/* need extra overflow handling later. */ \
		133	unsigned int __bits = -(__m & -__m); \
		134	__bits |= __m >> 32; \
		135	__bits = (~__bits) << 1; \
		136	/* If __bits == 0 then setting bit 31 is */ \
		137	/* unavoidable. Simply apply the maximum */ \
		138	/* possible reduction in that case. */ \
		139	/* Otherwise the MSB of __bits indicates the */ \
		140	/* best reduction we should apply. */ \
		141	if (!__bits) { \
		142	__p /= (__m & -__m); \
		143	__m /= (__m & -__m); \
		144	} else { \
		145	__p >>= __div64_fls(__bits); \
		146	__m >>= __div64_fls(__bits); \
		147	} \
		148	/* No correction needed. */ \
		149	__c = 0; \
		150	} \
		151	/* Now we have a combination of 2 conditions: */ \
		152	/* 1) whether or not we need a correction (__c), and */ \
		153	/* 2) whether or not there might be an overflow in */ \
		154	/* the cross product (__m & ((1<<63) | (1<<31))) */ \
		155	/* Select the best insn combination to perform the */ \
		156	/* actual __m * __n / (__p << 64) operation. */ \
		157	if (!__c) { \
		158	asm ( "umull %Q0, %R0, %1, %Q2\n\t" \
		159	"mov %Q0, #0" \
		160	: "=&r" (__res) \
		161	: "r" (__m), "r" (__n) \
		162	: "cc" ); \
		163	} else if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \
		164	__res = __m; \
		165	asm ( "umlal %Q0, %R0, %Q1, %Q2\n\t" \
		166	"mov %Q0, #0" \
		167	: "+r" (__res) \
		168	: "r" (__m), "r" (__n) \
		169	: "cc" ); \
		170	} else { \
		171	asm ( "umull %Q0, %R0, %Q1, %Q2\n\t" \
		172	"cmn %Q0, %Q1\n\t" \
		173	"adcs %R0, %R0, %R1\n\t" \
		174	"adc %Q0, %3, #0" \
		175	: "=&r" (__res) \
		176	: "r" (__m), "r" (__n), "r" (__z) \
		177	: "cc" ); \
		178	} \
		179	if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \
		180	asm ( "umlal %R0, %Q0, %R1, %Q2\n\t" \
		181	"umlal %R0, %Q0, %Q1, %R2\n\t" \
		182	"mov %R0, #0\n\t" \
		183	"umlal %Q0, %R0, %R1, %R2" \
		184	: "+r" (__res) \
		185	: "r" (__m), "r" (__n) \
		186	: "cc" ); \
		187	} else { \
		188	asm ( "umlal %R0, %Q0, %R2, %Q3\n\t" \
		189	"umlal %R0, %1, %Q2, %R3\n\t" \
		190	"mov %R0, #0\n\t" \
		191	"adds %Q0, %1, %Q0\n\t" \
		192	"adc %R0, %R0, #0\n\t" \
		193	"umlal %Q0, %R0, %R2, %R3" \
		194	: "+r" (__res), "+r" (__z) \
		195	: "r" (__m), "r" (__n) \
		196	: "cc" ); \
		197	} \
		198	__res /= __p; \
		199	/* The reminder can be computed with 32-bit regs */ \
		200	/* only, and gcc is good at that. */ \
		201	{ \
		202	unsigned int __res0 = __res; \
		203	unsigned int __b0 = __b; \
		204	__r -= __res0 * __b0; \
		205	} \
		206	/* BUG_ON(__r >= __b || __res * __b + __r != n); */ \
		207	n = __res; \
		208	} \
		209	__r; \
		210	})
		211
		212	/* our own fls implementation to make sure constant propagation is fine */
		213	#define __div64_fls(bits) \
		214	({ \
		215	unsigned int __left = (bits), __nr = 0; \
		216	if (__left & 0xffff0000) __nr += 16, __left >>= 16; \
		217	if (__left & 0x0000ff00) __nr += 8, __left >>= 8; \
		218	if (__left & 0x000000f0) __nr += 4, __left >>= 4; \
		219	if (__left & 0x0000000c) __nr += 2, __left >>= 2; \
		220	if (__left & 0x00000002) __nr += 1; \
		221	__nr; \
		222	})
		223
		224	#endif
		225
48	#endif	226	#endif