author     Linus Torvalds <torvalds@linux-foundation.org>  2016-01-20 20:30:20 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2016-01-20 20:30:20 -0500
commit     e3de671dd6784e30821e64f67f854b90b4496a68 (patch)
tree       82cc89b65419ae5f04586075616d2de58312a770 /arch
parent     71e4634e00119b2fb8dd0da99b3f5ebbb49cc872 (diff)
parent     040b323b5012b5503561ec7fe15cccd6a4bcaec2 (diff)
Merge tag 'asm-generic-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic
Pull asm-generic updates from Arnd Bergmann:
"The asm-generic tree this time contains one series from Nicolas Pitre
that makes the optimized do_div() implementation from the ARM
architecture available to all architectures.
This also adds stricter type checking for callers of do_div, which has
uncovered a number of bugs in existing code, and fixes up the ones we
have found"
* tag 'asm-generic-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic:
ARM: asm/div64.h: adjust to generic codde
__div64_32(): make it overridable at compile time
__div64_const32(): abstract out the actual 128-bit cross product code
do_div(): generic optimization for constant divisor on 32-bit machines
div64.h: optimize do_div() for power-of-two constant divisors
mtd/sm_ftl.c: fix wrong do_div() usage
drm/mgag200/mgag200_mode.c: fix wrong do_div() usage
hid-sensor-hub.c: fix wrong do_div() usage
ti/fapll: fix wrong do_div() usage
ti/clkt_dpll: fix wrong do_div() usage
tegra/clk-divider: fix wrong do_div() usage
imx/clk-pllv2: fix wrong do_div() usage
imx/clk-pllv1: fix wrong do_div() usage
nouveau/nvkm/subdev/clk/gk20a.c: fix wrong do_div() usage
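All of the "fix wrong do_div() usage" patches listed above follow from the same contract, which the header change below also documents: do_div(n, base) takes a 64-bit lvalue as the dividend and a 32-bit divisor, leaves the quotient in the dividend, and returns the 32-bit remainder. The following is a minimal sketch of correct usage (bytes_to_blocks() is a hypothetical helper added here for illustration, not code from this series), with common misuse patterns shown as comments:

```c
#include <linux/types.h>	/* u32, u64 */
#include <asm/div64.h>		/* do_div() */

/*
 * Hypothetical helper illustrating do_div() semantics: the dividend must be
 * a 64-bit lvalue and the divisor a 32-bit value; do_div() stores the
 * quotient back into the dividend and returns the remainder.
 */
static u64 bytes_to_blocks(u64 bytes, u32 block_size, u32 *residue)
{
	u32 rem;

	rem = do_div(bytes, block_size);	/* 'bytes' now holds the quotient */
	if (residue)
		*residue = rem;
	return bytes;
}

/*
 * Illustrative misuse patterns of the kind addressed by the fixes above
 * (not copied from the actual patches):
 *
 *	u32 count = total;
 *	do_div(count, period);			// dividend is not a 64-bit variable
 *
 *	u64 rate = parent_rate;
 *	do_div(rate, big_divisor);		// 64-bit divisor silently truncated to 32 bits
 *
 *	quotient = do_div(total, period);	// return value is the remainder, not the quotient
 */
```

The stricter type checking mentioned in the pull message is what surfaced misuses of this kind in the drivers listed above.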
Diffstat (limited to 'arch')
-rw-r--r-- | arch/arm/include/asm/div64.h | 283
1 file changed, 93 insertions, 190 deletions
diff --git a/arch/arm/include/asm/div64.h b/arch/arm/include/asm/div64.h
index 662c7bd06108..e1f07764b0d6 100644
--- a/arch/arm/include/asm/div64.h
+++ b/arch/arm/include/asm/div64.h
@@ -5,9 +5,9 @@
 #include <asm/compiler.h>
 
 /*
- * The semantics of do_div() are:
+ * The semantics of __div64_32() are:
  *
- * uint32_t do_div(uint64_t *n, uint32_t base)
+ * uint32_t __div64_32(uint64_t *n, uint32_t base)
  * {
  *	uint32_t remainder = *n % base;
  *	*n = *n / base;
@@ -16,8 +16,9 @@
  *
  * In other words, a 64-bit dividend with a 32-bit divisor producing
  * a 64-bit result and a 32-bit remainder. To accomplish this optimally
- * we call a special __do_div64 helper with completely non standard
- * calling convention for arguments and results (beware).
+ * we override the generic version in lib/div64.c to call our __do_div64
+ * assembly implementation with completely non standard calling convention
+ * for arguments and results (beware).
  */
 
 #ifdef __ARMEB__
@@ -28,199 +29,101 @@
 #define __xh "r1"
 #endif
 
-#define __do_div_asm(n, base) \
-({ \
-	register unsigned int __base asm("r4") = base; \
-	register unsigned long long __n asm("r0") = n; \
-	register unsigned long long __res asm("r2"); \
-	register unsigned int __rem asm(__xh); \
-	asm( __asmeq("%0", __xh) \
-		__asmeq("%1", "r2") \
-		__asmeq("%2", "r0") \
-		__asmeq("%3", "r4") \
-		"bl __do_div64" \
-		: "=r" (__rem), "=r" (__res) \
-		: "r" (__n), "r" (__base) \
-		: "ip", "lr", "cc"); \
-	n = __res; \
-	__rem; \
-})
-
-#if __GNUC__ < 4 || !defined(CONFIG_AEABI)
+static inline uint32_t __div64_32(uint64_t *n, uint32_t base)
+{
+	register unsigned int __base asm("r4") = base;
+	register unsigned long long __n asm("r0") = *n;
+	register unsigned long long __res asm("r2");
+	register unsigned int __rem asm(__xh);
+	asm( __asmeq("%0", __xh)
+		__asmeq("%1", "r2")
+		__asmeq("%2", "r0")
+		__asmeq("%3", "r4")
+		"bl __do_div64"
+		: "=r" (__rem), "=r" (__res)
+		: "r" (__n), "r" (__base)
+		: "ip", "lr", "cc");
+	*n = __res;
+	return __rem;
+}
+#define __div64_32 __div64_32
+
+#if !defined(CONFIG_AEABI)
 
 /*
- * gcc versions earlier than 4.0 are simply too problematic for the
- * optimized implementation below. First there is gcc PR 15089 that
- * tend to trig on more complex constructs, spurious .global __udivsi3
- * are inserted even if none of those symbols are referenced in the
- * generated code, and those gcc versions are not able to do constant
- * propagation on long long values anyway.
+ * In OABI configurations, some uses of the do_div function
+ * cause gcc to run out of registers. To work around that,
+ * we can force the use of the out-of-line version for
+ * configurations that build a OABI kernel.
  */
-#define do_div(n, base) __do_div_asm(n, base)
-
-#elif __GNUC__ >= 4
+#define do_div(n, base) __div64_32(&(n), base)
 
-#include <asm/bug.h>
+#else
 
 /*
- * If the divisor happens to be constant, we determine the appropriate
- * inverse at compile time to turn the division into a few inline
- * multiplications instead which is much faster. And yet only if compiling
- * for ARMv4 or higher (we need umull/umlal) and if the gcc version is
- * sufficiently recent to perform proper long long constant propagation.
- * (It is unfortunate that gcc doesn't perform all this internally.)
+ * gcc versions earlier than 4.0 are simply too problematic for the
+ * __div64_const32() code in asm-generic/div64.h. First there is
+ * gcc PR 15089 that tend to trig on more complex constructs, spurious
+ * .global __udivsi3 are inserted even if none of those symbols are
+ * referenced in the generated code, and those gcc versions are not able
+ * to do constant propagation on long long values anyway.
  */
-#define do_div(n, base) \
-({ \
-	unsigned int __r, __b = (base); \
-	if (!__builtin_constant_p(__b) || __b == 0 || \
-	    (__LINUX_ARM_ARCH__ < 4 && (__b & (__b - 1)) != 0)) { \
-		/* non-constant divisor (or zero): slow path */ \
-		__r = __do_div_asm(n, __b); \
-	} else if ((__b & (__b - 1)) == 0) { \
-		/* Trivial: __b is constant and a power of 2 */ \
-		/* gcc does the right thing with this code. */ \
-		__r = n; \
-		__r &= (__b - 1); \
-		n /= __b; \
-	} else { \
-		/* Multiply by inverse of __b: n/b = n*(p/b)/p */ \
-		/* We rely on the fact that most of this code gets */ \
-		/* optimized away at compile time due to constant */ \
-		/* propagation and only a couple inline assembly */ \
-		/* instructions should remain. Better avoid any */ \
-		/* code construct that might prevent that. */ \
-		unsigned long long __res, __x, __t, __m, __n = n; \
-		unsigned int __c, __p, __z = 0; \
-		/* preserve low part of n for reminder computation */ \
-		__r = __n; \
-		/* determine number of bits to represent __b */ \
-		__p = 1 << __div64_fls(__b); \
-		/* compute __m = ((__p << 64) + __b - 1) / __b */ \
-		__m = (~0ULL / __b) * __p; \
-		__m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b; \
-		/* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */ \
-		__x = ~0ULL / __b * __b - 1; \
-		__res = (__m & 0xffffffff) * (__x & 0xffffffff); \
-		__res >>= 32; \
-		__res += (__m & 0xffffffff) * (__x >> 32); \
-		__t = __res; \
-		__res += (__x & 0xffffffff) * (__m >> 32); \
-		__t = (__res < __t) ? (1ULL << 32) : 0; \
-		__res = (__res >> 32) + __t; \
-		__res += (__m >> 32) * (__x >> 32); \
-		__res /= __p; \
-		/* Now sanitize and optimize what we've got. */ \
-		if (~0ULL % (__b / (__b & -__b)) == 0) { \
-			/* those cases can be simplified with: */ \
-			__n /= (__b & -__b); \
-			__m = ~0ULL / (__b / (__b & -__b)); \
-			__p = 1; \
-			__c = 1; \
-		} else if (__res != __x / __b) { \
-			/* We can't get away without a correction */ \
-			/* to compensate for bit truncation errors. */ \
-			/* To avoid it we'd need an additional bit */ \
-			/* to represent __m which would overflow it. */ \
-			/* Instead we do m=p/b and n/b=(n*m+m)/p. */ \
-			__c = 1; \
-			/* Compute __m = (__p << 64) / __b */ \
-			__m = (~0ULL / __b) * __p; \
-			__m += ((~0ULL % __b + 1) * __p) / __b; \
-		} else { \
-			/* Reduce __m/__p, and try to clear bit 31 */ \
-			/* of __m when possible otherwise that'll */ \
-			/* need extra overflow handling later. */ \
-			unsigned int __bits = -(__m & -__m); \
-			__bits |= __m >> 32; \
-			__bits = (~__bits) << 1; \
-			/* If __bits == 0 then setting bit 31 is */ \
-			/* unavoidable. Simply apply the maximum */ \
-			/* possible reduction in that case. */ \
-			/* Otherwise the MSB of __bits indicates the */ \
-			/* best reduction we should apply. */ \
-			if (!__bits) { \
-				__p /= (__m & -__m); \
-				__m /= (__m & -__m); \
-			} else { \
-				__p >>= __div64_fls(__bits); \
-				__m >>= __div64_fls(__bits); \
-			} \
-			/* No correction needed. */ \
-			__c = 0; \
-		} \
-		/* Now we have a combination of 2 conditions: */ \
-		/* 1) whether or not we need a correction (__c), and */ \
-		/* 2) whether or not there might be an overflow in */ \
-		/* the cross product (__m & ((1<<63) | (1<<31))) */ \
-		/* Select the best insn combination to perform the */ \
-		/* actual __m * __n / (__p << 64) operation. */ \
-		if (!__c) { \
-			asm ( "umull %Q0, %R0, %Q1, %Q2\n\t" \
-				"mov %Q0, #0" \
-				: "=&r" (__res) \
-				: "r" (__m), "r" (__n) \
-				: "cc" ); \
-		} else if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \
-			__res = __m; \
-			asm ( "umlal %Q0, %R0, %Q1, %Q2\n\t" \
-				"mov %Q0, #0" \
-				: "+&r" (__res) \
-				: "r" (__m), "r" (__n) \
-				: "cc" ); \
-		} else { \
-			asm ( "umull %Q0, %R0, %Q1, %Q2\n\t" \
-				"cmn %Q0, %Q1\n\t" \
-				"adcs %R0, %R0, %R1\n\t" \
-				"adc %Q0, %3, #0" \
-				: "=&r" (__res) \
-				: "r" (__m), "r" (__n), "r" (__z) \
-				: "cc" ); \
-		} \
-		if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \
-			asm ( "umlal %R0, %Q0, %R1, %Q2\n\t" \
-				"umlal %R0, %Q0, %Q1, %R2\n\t" \
-				"mov %R0, #0\n\t" \
-				"umlal %Q0, %R0, %R1, %R2" \
-				: "+&r" (__res) \
-				: "r" (__m), "r" (__n) \
-				: "cc" ); \
-		} else { \
-			asm ( "umlal %R0, %Q0, %R2, %Q3\n\t" \
-				"umlal %R0, %1, %Q2, %R3\n\t" \
-				"mov %R0, #0\n\t" \
-				"adds %Q0, %1, %Q0\n\t" \
-				"adc %R0, %R0, #0\n\t" \
-				"umlal %Q0, %R0, %R2, %R3" \
-				: "+&r" (__res), "+&r" (__z) \
-				: "r" (__m), "r" (__n) \
-				: "cc" ); \
-		} \
-		__res /= __p; \
-		/* The reminder can be computed with 32-bit regs */ \
-		/* only, and gcc is good at that. */ \
-		{ \
-			unsigned int __res0 = __res; \
-			unsigned int __b0 = __b; \
-			__r -= __res0 * __b0; \
-		} \
-		/* BUG_ON(__r >= __b || __res * __b + __r != n); */ \
-		n = __res; \
-	} \
-	__r; \
-})
-
-/* our own fls implementation to make sure constant propagation is fine */
-#define __div64_fls(bits) \
-({ \
-	unsigned int __left = (bits), __nr = 0; \
-	if (__left & 0xffff0000) __nr += 16, __left >>= 16; \
-	if (__left & 0x0000ff00) __nr += 8, __left >>= 8; \
-	if (__left & 0x000000f0) __nr += 4, __left >>= 4; \
-	if (__left & 0x0000000c) __nr += 2, __left >>= 2; \
-	if (__left & 0x00000002) __nr += 1; \
-	__nr; \
-})
+
+#define __div64_const32_is_OK (__GNUC__ >= 4)
+
+static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
+{
+	unsigned long long res;
+	unsigned int tmp = 0;
+
+	if (!bias) {
+		asm ( "umull %Q0, %R0, %Q1, %Q2\n\t"
+			"mov %Q0, #0"
+			: "=&r" (res)
+			: "r" (m), "r" (n)
+			: "cc");
+	} else if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
+		res = m;
+		asm ( "umlal %Q0, %R0, %Q1, %Q2\n\t"
+			"mov %Q0, #0"
+			: "+&r" (res)
+			: "r" (m), "r" (n)
+			: "cc");
+	} else {
+		asm ( "umull %Q0, %R0, %Q1, %Q2\n\t"
+			"cmn %Q0, %Q1\n\t"
+			"adcs %R0, %R0, %R1\n\t"
+			"adc %Q0, %3, #0"
+			: "=&r" (res)
+			: "r" (m), "r" (n), "r" (tmp)
+			: "cc");
+	}
+
+	if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
+		asm ( "umlal %R0, %Q0, %R1, %Q2\n\t"
+			"umlal %R0, %Q0, %Q1, %R2\n\t"
+			"mov %R0, #0\n\t"
+			"umlal %Q0, %R0, %R1, %R2"
+			: "+&r" (res)
+			: "r" (m), "r" (n)
+			: "cc");
+	} else {
+		asm ( "umlal %R0, %Q0, %R2, %Q3\n\t"
+			"umlal %R0, %1, %Q2, %R3\n\t"
+			"mov %R0, #0\n\t"
+			"adds %Q0, %1, %Q0\n\t"
+			"adc %R0, %R0, #0\n\t"
+			"umlal %Q0, %R0, %R2, %R3"
+			: "+&r" (res), "+&r" (tmp)
+			: "r" (m), "r" (n)
+			: "cc");
+	}
+
+	return res;
+}
+#define __arch_xprod_64 __arch_xprod_64
+
+#include <asm-generic/div64.h>
 
 #endif
 
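As background for the "do_div(): generic optimization for constant divisor on 32-bit machines" patch merged above: when the divisor is a compile-time constant, the division is turned into a multiplication by a precomputed reciprocal of roughly 2^64 / divisor, and the __arch_xprod_64() routine in the new ARM header supplies the umull/umlal sequences for the 64x64->128-bit cross product this requires. The userspace sketch below is an illustration under assumed naming (xprod_hi64() is not a kernel function); it shows only the core of that cross product, the high 64 bits of a 64x64 multiplication built from 32x32->64 multiplies, and omits the bias handling and magic-constant derivation performed by __div64_const32():

```c
#include <stdint.h>
#include <stdio.h>

/*
 * High 64 bits of a 64x64-bit product, using only 32x32->64 multiplies.
 * This is the operation the umull/umlal sequences in __arch_xprod_64()
 * perform directly in registers on 32-bit ARM.
 */
static uint64_t xprod_hi64(uint64_t m, uint64_t n)
{
	uint32_t m_lo = (uint32_t)m, m_hi = (uint32_t)(m >> 32);
	uint32_t n_lo = (uint32_t)n, n_hi = (uint32_t)(n >> 32);

	uint64_t lo   = (uint64_t)m_lo * n_lo;	/* bits   0..63  */
	uint64_t mid1 = (uint64_t)m_lo * n_hi;	/* bits  32..95  */
	uint64_t mid2 = (uint64_t)m_hi * n_lo;	/* bits  32..95  */
	uint64_t hi   = (uint64_t)m_hi * n_hi;	/* bits  64..127 */

	/* fold the middle terms into the upper half, carrying as needed */
	uint64_t mid = (lo >> 32) + (uint32_t)mid1 + (uint32_t)mid2;

	return hi + (mid1 >> 32) + (mid2 >> 32) + (mid >> 32);
}

int main(void)
{
	uint64_t m = 0x8000000080000001ULL;	/* stand-in for a reciprocal "magic" value */
	uint64_t n = 123456789012345678ULL;

	printf("high 64 bits of m*n: %#llx\n",
	       (unsigned long long)xprod_hi64(m, n));
	return 0;
}
```

Each 32x32->64 partial product above corresponds to one umull or umlal instruction, which is why the constant-divisor path compiles down to a handful of multiply-accumulate instructions plus shifts instead of a call to the out-of-line __do_div64 helper.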