aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNicolas Pitre <nico@cam.org>2006-12-05 22:13:18 -0500
committerRussell King <rmk+kernel@arm.linux.org.uk>2006-12-07 11:06:09 -0500
commitfa4adc614922c24601320e55bc5a1f837abad6e9 (patch)
treef93979fc2d9e2e2dad6edd3fa92b084a2645976a
parent0215ffb08ce99e2bb59eca114a99499a4d06e704 (diff)
[ARM] 3611/4: optimize do_div() when divisor is constant
On ARM all divisions have to be performed "manually". For 64-bit divisions that may take more than a hundred cycles in many cases. With 32-bit divisions gcc already use the recyprocal of constant divisors to perform a multiplication, but not with 64-bit divisions. Since the kernel is increasingly relying upon 64-bit divisions it is worth optimizing at least those cases where the divisor is a constant. This is what this patch does using plain C code that gets optimized away at compile time. For example, despite the amount of added C code, do_div(x, 10000) now produces the following assembly code (where x is assigned to r0-r1): adr r4, .L0 ldmia r4, {r4-r5} umull r2, r3, r4, r0 mov r2, #0 umlal r3, r2, r5, r0 umlal r3, r2, r4, r1 mov r3, #0 umlal r2, r3, r5, r1 mov r0, r2, lsr #11 orr r0, r0, r3, lsl #21 mov r1, r3, lsr #11 ... .L0: .word 948328779 .word 879609302 which is the fastest that can be done for any value of x in that case, many times faster than the __do_div64 code (except for the small x value space for which the result ends up being zero or a single bit). The fact that this code is generated inline produces a tiny increase in .text size, but not significant compared to the needed code around each __do_div64 call site this code is replacing. The algorithm used has been validated on a 16-bit scale for all possible values, and then recodified for 64-bit values. Furthermore I've been running it with the final BUG_ON() uncommented for over two months now with no problem. Note that this new code is compiled with gcc versions 4.0 or later. Earlier gcc versions proved themselves too problematic and only the original code is used with them. Signed-off-by: Nicolas Pitre <nico@cam.org> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
-rw-r--r--include/asm-arm/div64.h180
1 files changed, 179 insertions, 1 deletions
diff --git a/include/asm-arm/div64.h b/include/asm-arm/div64.h
index 3682616804ca..37e0a96e8789 100644
--- a/include/asm-arm/div64.h
+++ b/include/asm-arm/div64.h
@@ -27,7 +27,7 @@
27#define __xh "r1" 27#define __xh "r1"
28#endif 28#endif
29 29
30#define do_div(n,base) \ 30#define __do_div_asm(n, base) \
31({ \ 31({ \
32 register unsigned int __base asm("r4") = base; \ 32 register unsigned int __base asm("r4") = base; \
33 register unsigned long long __n asm("r0") = n; \ 33 register unsigned long long __n asm("r0") = n; \
@@ -45,4 +45,182 @@
45 __rem; \ 45 __rem; \
46}) 46})
47 47
48#if __GNUC__ < 4
49
50/*
51 * gcc versions earlier than 4.0 are simply too problematic for the
52 * optimized implementation below. First there is gcc PR 15089 that
53 * tend to trig on more complex constructs, spurious .global __udivsi3
54 * are inserted even if none of those symbols are referenced in the
55 * generated code, and those gcc versions are not able to do constant
56 * propagation on long long values anyway.
57 */
58#define do_div(n, base) __do_div_asm(n, base)
59
60#elif __GNUC__ >= 4
61
62#include <asm/bug.h>
63
64/*
65 * If the divisor happens to be constant, we determine the appropriate
66 * inverse at compile time to turn the division into a few inline
67 * multiplications instead which is much faster. And yet only if compiling
68 * for ARMv4 or higher (we need umull/umlal) and if the gcc version is
69 * sufficiently recent to perform proper long long constant propagation.
70 * (It is unfortunate that gcc doesn't perform all this internally.)
71 */
72#define do_div(n, base) \
73({ \
74 unsigned int __r, __b = (base); \
75 if (!__builtin_constant_p(__b) || __b == 0 || \
76 (__LINUX_ARM_ARCH__ < 4 && (__b & (__b - 1)) != 0)) { \
77 /* non-constant divisor (or zero): slow path */ \
78 __r = __do_div_asm(n, __b); \
79 } else if ((__b & (__b - 1)) == 0) { \
80 /* Trivial: __b is constant and a power of 2 */ \
81 /* gcc does the right thing with this code. */ \
82 __r = n; \
83 __r &= (__b - 1); \
84 n /= __b; \
85 } else { \
86 /* Multiply by inverse of __b: n/b = n*(p/b)/p */ \
87 /* We rely on the fact that most of this code gets */ \
88 /* optimized away at compile time due to constant */ \
89 /* propagation and only a couple inline assembly */ \
90 /* instructions should remain. Better avoid any */ \
91 /* code construct that might prevent that. */ \
92 unsigned long long __res, __x, __t, __m, __n = n; \
93 unsigned int __c, __p, __z = 0; \
94 /* preserve low part of n for reminder computation */ \
95 __r = __n; \
96 /* determine number of bits to represent __b */ \
97 __p = 1 << __div64_fls(__b); \
98 /* compute __m = ((__p << 64) + __b - 1) / __b */ \
99 __m = (~0ULL / __b) * __p; \
100 __m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b; \
101 /* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */ \
102 __x = ~0ULL / __b * __b - 1; \
103 __res = (__m & 0xffffffff) * (__x & 0xffffffff); \
104 __res >>= 32; \
105 __res += (__m & 0xffffffff) * (__x >> 32); \
106 __t = __res; \
107 __res += (__x & 0xffffffff) * (__m >> 32); \
108 __t = (__res < __t) ? (1ULL << 32) : 0; \
109 __res = (__res >> 32) + __t; \
110 __res += (__m >> 32) * (__x >> 32); \
111 __res /= __p; \
112 /* Now sanitize and optimize what we've got. */ \
113 if (~0ULL % (__b / (__b & -__b)) == 0) { \
114 /* those cases can be simplified with: */ \
115 __n /= (__b & -__b); \
116 __m = ~0ULL / (__b / (__b & -__b)); \
117 __p = 1; \
118 __c = 1; \
119 } else if (__res != __x / __b) { \
120 /* We can't get away without a correction */ \
121 /* to compensate for bit truncation errors. */ \
122 /* To avoid it we'd need an additional bit */ \
123 /* to represent __m which would overflow it. */ \
124 /* Instead we do m=p/b and n/b=(n*m+m)/p. */ \
125 __c = 1; \
126 /* Compute __m = (__p << 64) / __b */ \
127 __m = (~0ULL / __b) * __p; \
128 __m += ((~0ULL % __b + 1) * __p) / __b; \
129 } else { \
130 /* Reduce __m/__p, and try to clear bit 31 */ \
131 /* of __m when possible otherwise that'll */ \
132 /* need extra overflow handling later. */ \
133 unsigned int __bits = -(__m & -__m); \
134 __bits |= __m >> 32; \
135 __bits = (~__bits) << 1; \
136 /* If __bits == 0 then setting bit 31 is */ \
137 /* unavoidable. Simply apply the maximum */ \
138 /* possible reduction in that case. */ \
139 /* Otherwise the MSB of __bits indicates the */ \
140 /* best reduction we should apply. */ \
141 if (!__bits) { \
142 __p /= (__m & -__m); \
143 __m /= (__m & -__m); \
144 } else { \
145 __p >>= __div64_fls(__bits); \
146 __m >>= __div64_fls(__bits); \
147 } \
148 /* No correction needed. */ \
149 __c = 0; \
150 } \
151 /* Now we have a combination of 2 conditions: */ \
152 /* 1) whether or not we need a correction (__c), and */ \
153 /* 2) whether or not there might be an overflow in */ \
154 /* the cross product (__m & ((1<<63) | (1<<31))) */ \
155 /* Select the best insn combination to perform the */ \
156 /* actual __m * __n / (__p << 64) operation. */ \
157 if (!__c) { \
158 asm ( "umull %Q0, %R0, %1, %Q2\n\t" \
159 "mov %Q0, #0" \
160 : "=&r" (__res) \
161 : "r" (__m), "r" (__n) \
162 : "cc" ); \
163 } else if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \
164 __res = __m; \
165 asm ( "umlal %Q0, %R0, %Q1, %Q2\n\t" \
166 "mov %Q0, #0" \
167 : "+r" (__res) \
168 : "r" (__m), "r" (__n) \
169 : "cc" ); \
170 } else { \
171 asm ( "umull %Q0, %R0, %Q1, %Q2\n\t" \
172 "cmn %Q0, %Q1\n\t" \
173 "adcs %R0, %R0, %R1\n\t" \
174 "adc %Q0, %3, #0" \
175 : "=&r" (__res) \
176 : "r" (__m), "r" (__n), "r" (__z) \
177 : "cc" ); \
178 } \
179 if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \
180 asm ( "umlal %R0, %Q0, %R1, %Q2\n\t" \
181 "umlal %R0, %Q0, %Q1, %R2\n\t" \
182 "mov %R0, #0\n\t" \
183 "umlal %Q0, %R0, %R1, %R2" \
184 : "+r" (__res) \
185 : "r" (__m), "r" (__n) \
186 : "cc" ); \
187 } else { \
188 asm ( "umlal %R0, %Q0, %R2, %Q3\n\t" \
189 "umlal %R0, %1, %Q2, %R3\n\t" \
190 "mov %R0, #0\n\t" \
191 "adds %Q0, %1, %Q0\n\t" \
192 "adc %R0, %R0, #0\n\t" \
193 "umlal %Q0, %R0, %R2, %R3" \
194 : "+r" (__res), "+r" (__z) \
195 : "r" (__m), "r" (__n) \
196 : "cc" ); \
197 } \
198 __res /= __p; \
199 /* The reminder can be computed with 32-bit regs */ \
200 /* only, and gcc is good at that. */ \
201 { \
202 unsigned int __res0 = __res; \
203 unsigned int __b0 = __b; \
204 __r -= __res0 * __b0; \
205 } \
206 /* BUG_ON(__r >= __b || __res * __b + __r != n); */ \
207 n = __res; \
208 } \
209 __r; \
210})
211
212/* our own fls implementation to make sure constant propagation is fine */
213#define __div64_fls(bits) \
214({ \
215 unsigned int __left = (bits), __nr = 0; \
216 if (__left & 0xffff0000) __nr += 16, __left >>= 16; \
217 if (__left & 0x0000ff00) __nr += 8, __left >>= 8; \
218 if (__left & 0x000000f0) __nr += 4, __left >>= 4; \
219 if (__left & 0x0000000c) __nr += 2, __left >>= 2; \
220 if (__left & 0x00000002) __nr += 1; \
221 __nr; \
222})
223
224#endif
225
48#endif 226#endif