author	Linus Torvalds <torvalds@linux-foundation.org>	2016-01-20 20:30:20 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-01-20 20:30:20 -0500
commit	e3de671dd6784e30821e64f67f854b90b4496a68 (patch)
tree	82cc89b65419ae5f04586075616d2de58312a770 /arch
parent	71e4634e00119b2fb8dd0da99b3f5ebbb49cc872 (diff)
parent	040b323b5012b5503561ec7fe15cccd6a4bcaec2 (diff)
Merge tag 'asm-generic-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic
Pull asm-generic updates from Arnd Bergmann:
 "The asm-generic tree this time contains one series from Nicolas Pitre
  that makes the optimized do_div() implementation from the ARM
  architecture available to all architectures.

  This also adds stricter type checking for callers of do_div, which has
  uncovered a number of bugs in existing code, and fixes up the ones we
  have found"

* tag 'asm-generic-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic:
  ARM: asm/div64.h: adjust to generic codde
  __div64_32(): make it overridable at compile time
  __div64_const32(): abstract out the actual 128-bit cross product code
  do_div(): generic optimization for constant divisor on 32-bit machines
  div64.h: optimize do_div() for power-of-two constant divisors
  mtd/sm_ftl.c: fix wrong do_div() usage
  drm/mgag200/mgag200_mode.c: fix wrong do_div() usage
  hid-sensor-hub.c: fix wrong do_div() usage
  ti/fapll: fix wrong do_div() usage
  ti/clkt_dpll: fix wrong do_div() usage
  tegra/clk-divider: fix wrong do_div() usage
  imx/clk-pllv2: fix wrong do_div() usage
  imx/clk-pllv1: fix wrong do_div() usage
  nouveau/nvkm/subdev/clk/gk20a.c: fix wrong do_div() usage
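The "fix wrong do_div() usage" patches above all stem from the same contract: do_div(n, base) takes a 64-bit unsigned lvalue as the dividend and a 32-bit divisor, replaces n with the quotient in place, and evaluates to the 32-bit remainder. A minimal usage sketch (illustrative only, not taken from this merge; the helper name is made up):

	#include <linux/types.h>
	#include <asm/div64.h>

	/* Hypothetical helper: convert nanoseconds to milliseconds. */
	static inline u32 example_ns_to_ms(u64 ns)
	{
		u64 n = ns;			/* dividend must be a 64-bit lvalue */
		u32 rem;

		rem = do_div(n, 1000000);	/* 32-bit divisor; n becomes the quotient */
		(void)rem;			/* remainder (ns % 1000000), unused here */
		return (u32)n;
	}

Passing a 32-bit dividend or a 64-bit divisor is exactly the kind of misuse that the stricter type checking mentioned above now flags at build time.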
Diffstat (limited to 'arch')
-rw-r--r--	arch/arm/include/asm/div64.h	283
1 file changed, 93 insertions(+), 190 deletions(-)
diff --git a/arch/arm/include/asm/div64.h b/arch/arm/include/asm/div64.h
index 662c7bd06108..e1f07764b0d6 100644
--- a/arch/arm/include/asm/div64.h
+++ b/arch/arm/include/asm/div64.h
@@ -5,9 +5,9 @@
 #include <asm/compiler.h>
 
 /*
- * The semantics of do_div() are:
+ * The semantics of __div64_32() are:
  *
- * uint32_t do_div(uint64_t *n, uint32_t base)
+ * uint32_t __div64_32(uint64_t *n, uint32_t base)
  * {
  *	uint32_t remainder = *n % base;
  *	*n = *n / base;
@@ -16,8 +16,9 @@
  *
  * In other words, a 64-bit dividend with a 32-bit divisor producing
  * a 64-bit result and a 32-bit remainder. To accomplish this optimally
- * we call a special __do_div64 helper with completely non standard
- * calling convention for arguments and results (beware).
+ * we override the generic version in lib/div64.c to call our __do_div64
+ * assembly implementation with completely non standard calling convention
+ * for arguments and results (beware).
  */
 
 #ifdef __ARMEB__
@@ -28,199 +29,101 @@
 #define __xh "r1"
 #endif
 
-#define __do_div_asm(n, base) \
-({ \
-	register unsigned int __base asm("r4") = base; \
-	register unsigned long long __n asm("r0") = n; \
-	register unsigned long long __res asm("r2"); \
-	register unsigned int __rem asm(__xh); \
-	asm( __asmeq("%0", __xh) \
-		__asmeq("%1", "r2") \
-		__asmeq("%2", "r0") \
-		__asmeq("%3", "r4") \
-		"bl __do_div64" \
-		: "=r" (__rem), "=r" (__res) \
-		: "r" (__n), "r" (__base) \
-		: "ip", "lr", "cc"); \
-	n = __res; \
-	__rem; \
-})
-
-#if __GNUC__ < 4 || !defined(CONFIG_AEABI)
+static inline uint32_t __div64_32(uint64_t *n, uint32_t base)
+{
+	register unsigned int __base asm("r4") = base;
+	register unsigned long long __n asm("r0") = *n;
+	register unsigned long long __res asm("r2");
+	register unsigned int __rem asm(__xh);
+	asm( __asmeq("%0", __xh)
+		__asmeq("%1", "r2")
+		__asmeq("%2", "r0")
+		__asmeq("%3", "r4")
+		"bl __do_div64"
+		: "=r" (__rem), "=r" (__res)
+		: "r" (__n), "r" (__base)
+		: "ip", "lr", "cc");
+	*n = __res;
+	return __rem;
+}
+#define __div64_32 __div64_32
+
+#if !defined(CONFIG_AEABI)
 
 /*
- * gcc versions earlier than 4.0 are simply too problematic for the
- * optimized implementation below. First there is gcc PR 15089 that
- * tend to trig on more complex constructs, spurious .global __udivsi3
- * are inserted even if none of those symbols are referenced in the
- * generated code, and those gcc versions are not able to do constant
- * propagation on long long values anyway.
+ * In OABI configurations, some uses of the do_div function
+ * cause gcc to run out of registers. To work around that,
+ * we can force the use of the out-of-line version for
+ * configurations that build a OABI kernel.
  */
-#define do_div(n, base) __do_div_asm(n, base)
-
-#elif __GNUC__ >= 4
+#define do_div(n, base) __div64_32(&(n), base)
 
-#include <asm/bug.h>
+#else
 
 /*
- * If the divisor happens to be constant, we determine the appropriate
- * inverse at compile time to turn the division into a few inline
- * multiplications instead which is much faster. And yet only if compiling
- * for ARMv4 or higher (we need umull/umlal) and if the gcc version is
- * sufficiently recent to perform proper long long constant propagation.
- * (It is unfortunate that gcc doesn't perform all this internally.)
+ * gcc versions earlier than 4.0 are simply too problematic for the
+ * __div64_const32() code in asm-generic/div64.h. First there is
+ * gcc PR 15089 that tend to trig on more complex constructs, spurious
+ * .global __udivsi3 are inserted even if none of those symbols are
+ * referenced in the generated code, and those gcc versions are not able
+ * to do constant propagation on long long values anyway.
  */
-#define do_div(n, base) \
-({ \
-	unsigned int __r, __b = (base); \
-	if (!__builtin_constant_p(__b) || __b == 0 || \
-	    (__LINUX_ARM_ARCH__ < 4 && (__b & (__b - 1)) != 0)) { \
-		/* non-constant divisor (or zero): slow path */ \
-		__r = __do_div_asm(n, __b); \
-	} else if ((__b & (__b - 1)) == 0) { \
-		/* Trivial: __b is constant and a power of 2 */ \
-		/* gcc does the right thing with this code. */ \
-		__r = n; \
-		__r &= (__b - 1); \
-		n /= __b; \
-	} else { \
-		/* Multiply by inverse of __b: n/b = n*(p/b)/p */ \
-		/* We rely on the fact that most of this code gets */ \
-		/* optimized away at compile time due to constant */ \
-		/* propagation and only a couple inline assembly */ \
-		/* instructions should remain. Better avoid any */ \
-		/* code construct that might prevent that. */ \
-		unsigned long long __res, __x, __t, __m, __n = n; \
-		unsigned int __c, __p, __z = 0; \
-		/* preserve low part of n for reminder computation */ \
-		__r = __n; \
-		/* determine number of bits to represent __b */ \
-		__p = 1 << __div64_fls(__b); \
-		/* compute __m = ((__p << 64) + __b - 1) / __b */ \
-		__m = (~0ULL / __b) * __p; \
-		__m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b; \
-		/* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */ \
-		__x = ~0ULL / __b * __b - 1; \
-		__res = (__m & 0xffffffff) * (__x & 0xffffffff); \
-		__res >>= 32; \
-		__res += (__m & 0xffffffff) * (__x >> 32); \
-		__t = __res; \
-		__res += (__x & 0xffffffff) * (__m >> 32); \
-		__t = (__res < __t) ? (1ULL << 32) : 0; \
-		__res = (__res >> 32) + __t; \
-		__res += (__m >> 32) * (__x >> 32); \
-		__res /= __p; \
-		/* Now sanitize and optimize what we've got. */ \
-		if (~0ULL % (__b / (__b & -__b)) == 0) { \
-			/* those cases can be simplified with: */ \
-			__n /= (__b & -__b); \
-			__m = ~0ULL / (__b / (__b & -__b)); \
-			__p = 1; \
-			__c = 1; \
-		} else if (__res != __x / __b) { \
-			/* We can't get away without a correction */ \
-			/* to compensate for bit truncation errors. */ \
-			/* To avoid it we'd need an additional bit */ \
-			/* to represent __m which would overflow it. */ \
-			/* Instead we do m=p/b and n/b=(n*m+m)/p. */ \
-			__c = 1; \
-			/* Compute __m = (__p << 64) / __b */ \
-			__m = (~0ULL / __b) * __p; \
-			__m += ((~0ULL % __b + 1) * __p) / __b; \
-		} else { \
-			/* Reduce __m/__p, and try to clear bit 31 */ \
-			/* of __m when possible otherwise that'll */ \
-			/* need extra overflow handling later. */ \
-			unsigned int __bits = -(__m & -__m); \
-			__bits |= __m >> 32; \
-			__bits = (~__bits) << 1; \
-			/* If __bits == 0 then setting bit 31 is */ \
-			/* unavoidable. Simply apply the maximum */ \
-			/* possible reduction in that case. */ \
-			/* Otherwise the MSB of __bits indicates the */ \
-			/* best reduction we should apply. */ \
-			if (!__bits) { \
-				__p /= (__m & -__m); \
-				__m /= (__m & -__m); \
-			} else { \
-				__p >>= __div64_fls(__bits); \
-				__m >>= __div64_fls(__bits); \
-			} \
-			/* No correction needed. */ \
-			__c = 0; \
-		} \
-		/* Now we have a combination of 2 conditions: */ \
-		/* 1) whether or not we need a correction (__c), and */ \
-		/* 2) whether or not there might be an overflow in */ \
-		/*    the cross product (__m & ((1<<63) | (1<<31))) */ \
-		/* Select the best insn combination to perform the */ \
-		/* actual __m * __n / (__p << 64) operation. */ \
-		if (!__c) { \
-			asm ( "umull %Q0, %R0, %Q1, %Q2\n\t" \
-				"mov %Q0, #0" \
-				: "=&r" (__res) \
-				: "r" (__m), "r" (__n) \
-				: "cc" ); \
-		} else if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \
-			__res = __m; \
-			asm ( "umlal %Q0, %R0, %Q1, %Q2\n\t" \
-				"mov %Q0, #0" \
-				: "+&r" (__res) \
-				: "r" (__m), "r" (__n) \
-				: "cc" ); \
-		} else { \
-			asm ( "umull %Q0, %R0, %Q1, %Q2\n\t" \
-				"cmn %Q0, %Q1\n\t" \
-				"adcs %R0, %R0, %R1\n\t" \
-				"adc %Q0, %3, #0" \
-				: "=&r" (__res) \
-				: "r" (__m), "r" (__n), "r" (__z) \
-				: "cc" ); \
-		} \
-		if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \
-			asm ( "umlal %R0, %Q0, %R1, %Q2\n\t" \
-				"umlal %R0, %Q0, %Q1, %R2\n\t" \
-				"mov %R0, #0\n\t" \
-				"umlal %Q0, %R0, %R1, %R2" \
-				: "+&r" (__res) \
-				: "r" (__m), "r" (__n) \
-				: "cc" ); \
-		} else { \
-			asm ( "umlal %R0, %Q0, %R2, %Q3\n\t" \
-				"umlal %R0, %1, %Q2, %R3\n\t" \
-				"mov %R0, #0\n\t" \
-				"adds %Q0, %1, %Q0\n\t" \
-				"adc %R0, %R0, #0\n\t" \
-				"umlal %Q0, %R0, %R2, %R3" \
-				: "+&r" (__res), "+&r" (__z) \
-				: "r" (__m), "r" (__n) \
-				: "cc" ); \
-		} \
-		__res /= __p; \
-		/* The reminder can be computed with 32-bit regs */ \
-		/* only, and gcc is good at that. */ \
-		{ \
-			unsigned int __res0 = __res; \
-			unsigned int __b0 = __b; \
-			__r -= __res0 * __b0; \
-		} \
-		/* BUG_ON(__r >= __b || __res * __b + __r != n); */ \
-		n = __res; \
-	} \
-	__r; \
-})
-
-/* our own fls implementation to make sure constant propagation is fine */
-#define __div64_fls(bits) \
-({ \
-	unsigned int __left = (bits), __nr = 0; \
-	if (__left & 0xffff0000) __nr += 16, __left >>= 16; \
-	if (__left & 0x0000ff00) __nr += 8, __left >>= 8; \
-	if (__left & 0x000000f0) __nr += 4, __left >>= 4; \
-	if (__left & 0x0000000c) __nr += 2, __left >>= 2; \
-	if (__left & 0x00000002) __nr += 1; \
-	__nr; \
-})
+
+#define __div64_const32_is_OK (__GNUC__ >= 4)
+
+static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
+{
+	unsigned long long res;
+	unsigned int tmp = 0;
+
+	if (!bias) {
+		asm ( "umull %Q0, %R0, %Q1, %Q2\n\t"
+			"mov %Q0, #0"
+			: "=&r" (res)
+			: "r" (m), "r" (n)
+			: "cc");
+	} else if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
+		res = m;
+		asm ( "umlal %Q0, %R0, %Q1, %Q2\n\t"
+			"mov %Q0, #0"
+			: "+&r" (res)
+			: "r" (m), "r" (n)
+			: "cc");
+	} else {
+		asm ( "umull %Q0, %R0, %Q1, %Q2\n\t"
+			"cmn %Q0, %Q1\n\t"
+			"adcs %R0, %R0, %R1\n\t"
+			"adc %Q0, %3, #0"
+			: "=&r" (res)
+			: "r" (m), "r" (n), "r" (tmp)
+			: "cc");
+	}
+
+	if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
+		asm ( "umlal %R0, %Q0, %R1, %Q2\n\t"
+			"umlal %R0, %Q0, %Q1, %R2\n\t"
+			"mov %R0, #0\n\t"
+			"umlal %Q0, %R0, %R1, %R2"
+			: "+&r" (res)
+			: "r" (m), "r" (n)
+			: "cc");
+	} else {
+		asm ( "umlal %R0, %Q0, %R2, %Q3\n\t"
+			"umlal %R0, %1, %Q2, %R3\n\t"
+			"mov %R0, #0\n\t"
+			"adds %Q0, %1, %Q0\n\t"
+			"adc %R0, %R0, #0\n\t"
+			"umlal %Q0, %R0, %R2, %R3"
+			: "+&r" (res), "+&r" (tmp)
+			: "r" (m), "r" (n)
+			: "cc");
+	}
+
+	return res;
+}
+#define __arch_xprod_64 __arch_xprod_64
+
+#include <asm-generic/div64.h>
 
 #endif
 
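The new __arch_xprod_64() helper added above returns the high 64 bits of the 128-bit product m * n, optionally adding one extra m when bias is set; this is the central step of the multiply-by-reciprocal division that asm-generic/div64.h performs for constant divisors. A portable sketch of the same semantics (illustrative only, not part of the patch; it assumes a compiler with the unsigned __int128 extension):

	#include <stdint.h>
	#include <stdbool.h>

	/* High 64 bits of m * n, plus m when bias is set (cf. __arch_xprod_64()). */
	static inline uint64_t xprod_64_sketch(uint64_t m, uint64_t n, bool bias)
	{
		unsigned __int128 prod = (unsigned __int128)m * n;

		if (bias)
			prod += m;
		return (uint64_t)(prod >> 64);
	}

The ARM version performs the same computation with umull/umlal pairs, picking the instruction sequence according to whether the bias is needed and whether the high bits of m risk a carry overflow in the cross product.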