diff options
author | Mike Frysinger <michael.frysinger@analog.com> | 2007-10-21 10:57:36 -0400 |
---|---|---|
committer | Bryan Wu <bryan.wu@analog.com> | 2007-10-21 10:57:36 -0400 |
commit | b0a68dc07ec395d44849ce98eb417713ca333410 (patch) | |
tree | 5e469225188d63fd296fc04d0c04e9580d8e47db /arch | |
parent | 1c668d82465cd5c17030c0f69561841374380ac8 (diff) |
Blackfin arch: add assembly function for doing 64bit unsigned division
Signed-off-by: Mike Frysinger <michael.frysinger@analog.com>
Signed-off-by: Bryan Wu <bryan.wu@analog.com>
Diffstat (limited to 'arch')
-rw-r--r-- | arch/blackfin/lib/Makefile | 2 | ||||
-rw-r--r-- | arch/blackfin/lib/udivdi3.S | 375 |
2 files changed, 376 insertions, 1 deletions
diff --git a/arch/blackfin/lib/Makefile b/arch/blackfin/lib/Makefile index 635288fc5f54..bfdad52c570b 100644 --- a/arch/blackfin/lib/Makefile +++ b/arch/blackfin/lib/Makefile | |||
@@ -4,7 +4,7 @@ | |||
4 | 4 | ||
5 | lib-y := \ | 5 | lib-y := \ |
6 | ashldi3.o ashrdi3.o lshrdi3.o \ | 6 | ashldi3.o ashrdi3.o lshrdi3.o \ |
7 | muldi3.o divsi3.o udivsi3.o modsi3.o umodsi3.o \ | 7 | muldi3.o divsi3.o udivsi3.o udivdi3.o modsi3.o umodsi3.o \ |
8 | checksum.o memcpy.o memset.o memcmp.o memchr.o memmove.o \ | 8 | checksum.o memcpy.o memset.o memcmp.o memchr.o memmove.o \ |
9 | strcmp.o strcpy.o strncmp.o strncpy.o \ | 9 | strcmp.o strcpy.o strncmp.o strncpy.o \ |
10 | umulsi3_highpart.o smulsi3_highpart.o \ | 10 | umulsi3_highpart.o smulsi3_highpart.o \ |
diff --git a/arch/blackfin/lib/udivdi3.S b/arch/blackfin/lib/udivdi3.S new file mode 100644 index 000000000000..ad1ebee675e1 --- /dev/null +++ b/arch/blackfin/lib/udivdi3.S | |||
@@ -0,0 +1,375 @@ | |||
1 | /* | ||
2 | * udivdi3.S - unsigned long long division | ||
3 | * | ||
4 | * Copyright 2003-2007 Analog Devices Inc. | ||
5 | * Enter bugs at http://blackfin.uclinux.org/ | ||
6 | * | ||
7 | * Licensed under the GPLv2 or later. | ||
8 | */ | ||
9 | |||
10 | #include <linux/linkage.h> | ||
11 | |||
12 | #define CARRY AC0 | ||
13 | |||
14 | #ifdef CONFIG_ARITHMETIC_OPS_L1 | ||
15 | .section .l1.text | ||
16 | #else | ||
17 | .text | ||
18 | #endif | ||
19 | |||
20 | |||
21 | ENTRY(___udivdi3) | ||
22 | R3 = [SP + 12]; | ||
23 | [--SP] = (R7:4, P5:3); | ||
24 | |||
25 | /* Attempt to use divide primitive first; these will handle | ||
26 | ** most cases, and they're quick - avoids stalls incurred by | ||
27 | ** testing for identities. | ||
28 | */ | ||
29 | |||
30 | R4 = R2 | R3; | ||
31 | CC = R4 == 0; | ||
32 | IF CC JUMP .LDIV_BY_ZERO; | ||
33 | |||
34 | R4.H = 0x8000; | ||
35 | R4 >>>= 16; // R4 now 0xFFFF8000 | ||
36 | R5 = R0 | R2; // If either dividend or | ||
37 | R4 = R5 & R4; // divisor have bits in | ||
38 | CC = R4; // top half or low half's sign | ||
39 | IF CC JUMP .LIDENTS; // bit, skip builtins. | ||
40 | R4 = R1 | R3; // Also check top halves | ||
41 | CC = R4; | ||
42 | IF CC JUMP .LIDENTS; | ||
43 | |||
44 | /* Can use the builtins. */ | ||
45 | |||
46 | AQ = CC; // Clear AQ (CC==0) | ||
47 | DIVQ(R0, R2); | ||
48 | DIVQ(R0, R2); | ||
49 | DIVQ(R0, R2); | ||
50 | DIVQ(R0, R2); | ||
51 | DIVQ(R0, R2); | ||
52 | DIVQ(R0, R2); | ||
53 | DIVQ(R0, R2); | ||
54 | DIVQ(R0, R2); | ||
55 | DIVQ(R0, R2); | ||
56 | DIVQ(R0, R2); | ||
57 | DIVQ(R0, R2); | ||
58 | DIVQ(R0, R2); | ||
59 | DIVQ(R0, R2); | ||
60 | DIVQ(R0, R2); | ||
61 | DIVQ(R0, R2); | ||
62 | DIVQ(R0, R2); | ||
63 | DIVQ(R0, R2); | ||
64 | R0 = R0.L (Z); | ||
65 | R1 = 0; | ||
66 | (R7:4, P5:3) = [SP++]; | ||
67 | RTS; | ||
68 | |||
69 | .LIDENTS: | ||
70 | /* Test for common identities. Value to be returned is | ||
71 | ** placed in R6,R7. | ||
72 | */ | ||
73 | // Check for 0/y, return 0 | ||
74 | R4 = R0 | R1; | ||
75 | CC = R4 == 0; | ||
76 | IF CC JUMP .LRETURN_R0; | ||
77 | |||
78 | // Check for x/x, return 1 | ||
79 | R6 = R0 - R2; // If x == y, then both R6 and R7 will be zero | ||
80 | R7 = R1 - R3; | ||
81 | R4 = R6 | R7; // making R4 zero. | ||
82 | R6 += 1; // which would now make R6:R7==1. | ||
83 | CC = R4 == 0; | ||
84 | IF CC JUMP .LRETURN_IDENT; | ||
85 | |||
86 | // Check for x/1, return x | ||
87 | R6 = R0; | ||
88 | R7 = R1; | ||
89 | CC = R3 == 0; | ||
90 | IF !CC JUMP .Lnexttest; | ||
91 | CC = R2 == 1; | ||
92 | IF CC JUMP .LRETURN_IDENT; | ||
93 | |||
94 | .Lnexttest: | ||
95 | R4.L = ONES R2; // check for div by power of two which | ||
96 | R5.L = ONES R3; // can be done using a shift | ||
97 | R6 = PACK (R5.L, R4.L); | ||
98 | CC = R6 == 1; | ||
99 | IF CC JUMP .Lpower_of_two_upper_zero; | ||
100 | R6 = PACK (R4.L, R5.L); | ||
101 | CC = R6 == 1; | ||
102 | IF CC JUMP .Lpower_of_two_lower_zero; | ||
103 | |||
104 | // Check for x < y, return 0 | ||
105 | R6 = 0; | ||
106 | R7 = R6; | ||
107 | CC = R1 < R3 (IU); | ||
108 | IF CC JUMP .LRETURN_IDENT; | ||
109 | CC = R1 == R3; | ||
110 | IF !CC JUMP .Lno_idents; | ||
111 | CC = R0 < R2 (IU); | ||
112 | IF CC JUMP .LRETURN_IDENT; | ||
113 | |||
114 | .Lno_idents: // Idents don't match. Go for the full operation | ||
115 | |||
116 | |||
117 | // If X, or X and Y have high bit set, it'll affect the | ||
118 | // results, so shift right one to stop this. Note: we've already | ||
119 | // checked that X >= Y, so Y's msb won't be set unless X's | ||
120 | // is. | ||
121 | |||
122 | R4 = 0; | ||
123 | CC = R1 < 0; | ||
124 | IF !CC JUMP .Lx_msb_clear; | ||
125 | CC = !CC; // 1 -> 0; | ||
126 | R1 = ROT R1 BY -1; // Shift X >> 1 | ||
127 | R0 = ROT R0 BY -1; // lsb -> CC | ||
128 | BITSET(R4,31); // to record only x msb was set | ||
129 | CC = R3 < 0; | ||
130 | IF !CC JUMP .Ly_msb_clear; | ||
131 | CC = !CC; | ||
132 | R3 = ROT R3 BY -1; // Shift Y >> 1 | ||
133 | R2 = ROT R2 BY -1; | ||
134 | BITCLR(R4,31); // clear bit to record only x msb was set | ||
135 | |||
136 | .Ly_msb_clear: | ||
137 | .Lx_msb_clear: | ||
138 | // Bit 31 in R4 indicates X msb set, but Y msb wasn't, and no bits | ||
139 | // were lost, so we should shift result left by one. | ||
140 | |||
141 | [--SP] = R4; // save for later | ||
142 | |||
143 | // In the loop that follows, each iteration we add | ||
144 | // either Y' or -Y' to the Remainder. We compute the | ||
145 | // negated Y', and store, for convenience. Y' goes | ||
146 | // into P0:P1, while -Y' goes into P2:P3. | ||
147 | |||
148 | P0 = R2; | ||
149 | P1 = R3; | ||
150 | R2 = -R2; | ||
151 | CC = CARRY; | ||
152 | CC = !CC; | ||
153 | R4 = CC; | ||
154 | R3 = -R3; | ||
155 | R3 = R3 - R4; | ||
156 | |||
157 | R6 = 0; // remainder = 0 | ||
158 | R7 = R6; | ||
159 | |||
160 | [--SP] = R2; P2 = SP; | ||
161 | [--SP] = R3; P3 = SP; | ||
162 | [--SP] = R6; P5 = SP; // AQ = 0 | ||
163 | [--SP] = P1; | ||
164 | |||
165 | /* In the loop that follows, we use the following | ||
166 | ** register assignments: | ||
167 | ** R0,R1 X, workspace | ||
168 | ** R2,R3 Y, workspace | ||
169 | ** R4,R5 partial Div | ||
170 | ** R6,R7 partial remainder | ||
171 | ** P5 AQ | ||
172 | ** The remainder and div form a 128-bit number, with | ||
173 | ** the remainder in the high 64-bits. | ||
174 | */ | ||
175 | R4 = R0; // Div = X' | ||
176 | R5 = R1; | ||
177 | R3 = 0; | ||
178 | |||
179 | P4 = 64; // Iterate once per bit | ||
180 | LSETUP(.LULST,.LULEND) LC0 = P4; | ||
181 | .LULST: | ||
182 | /* Shift Div and remainder up by one. The bit shifted | ||
183 | ** out of the top of the quotient is shifted into the bottom | ||
184 | ** of the remainder. | ||
185 | */ | ||
186 | CC = R3; | ||
187 | R4 = ROT R4 BY 1; | ||
188 | R5 = ROT R5 BY 1 || // low q to high q | ||
189 | R2 = [P5]; // load saved AQ | ||
190 | R6 = ROT R6 BY 1 || // high q to low r | ||
191 | R0 = [P2]; // load -Y' | ||
192 | R7 = ROT R7 BY 1 || // low r to high r | ||
193 | R1 = [P3]; | ||
194 | |||
195 | // Assume add -Y' | ||
196 | CC = R2 < 0; // But if AQ is set... | ||
197 | IF CC R0 = P0; // then add Y' instead | ||
198 | IF CC R1 = P1; | ||
199 | |||
200 | R6 = R6 + R0; // Rem += (Y' or -Y') | ||
201 | CC = CARRY; | ||
202 | R0 = CC; | ||
203 | R7 = R7 + R1; | ||
204 | R7 = R7 + R0 (NS) || | ||
205 | R1 = [SP]; | ||
206 | // Set the next AQ bit | ||
207 | R1 = R7 ^ R1; // from Remainder and Y' | ||
208 | R1 = R1 >> 31 || // Negate AQ's value, and | ||
209 | [P5] = R1; // save next AQ | ||
210 | BITTGL(R1, 0); // add neg AQ to the Div | ||
211 | .LULEND: R4 = R4 + R1; | ||
212 | |||
213 | R6 = [SP + 16]; | ||
214 | |||
215 | R0 = R4; | ||
216 | R1 = R5; | ||
217 | CC = BITTST(R6,30); // Just set CC=0 | ||
218 | R4 = ROT R0 BY 1; // but if we had to shift X, | ||
219 | R5 = ROT R1 BY 1; // and didn't shift any bits out, | ||
220 | CC = BITTST(R6,31); // then the result will be half as | ||
221 | IF CC R0 = R4; // much as required, so shift left | ||
222 | IF CC R1 = R5; // one space. | ||
223 | |||
224 | SP += 20; | ||
225 | (R7:4, P5:3) = [SP++]; | ||
226 | RTS; | ||
227 | |||
228 | .Lpower_of_two: | ||
229 | /* Y has a single bit set, which means it's a power of two. | ||
230 | ** That means we can perform the division just by shifting | ||
231 | ** X to the right the appropriate number of bits | ||
232 | */ | ||
233 | |||
234 | /* signbits returns the number of sign bits, minus one. | ||
235 | ** 1=>30, 2=>29, ..., 0x40000000=>0. Which means we need | ||
236 | ** to shift right n-signbits spaces. It also means 0x80000000 | ||
237 | ** is a special case, because that *also* gives a signbits of 0 | ||
238 | */ | ||
239 | .Lpower_of_two_lower_zero: | ||
240 | R7 = 0; | ||
241 | R6 = R1 >> 31; | ||
242 | CC = R3 < 0; | ||
243 | IF CC JUMP .LRETURN_IDENT; | ||
244 | |||
245 | R2.L = SIGNBITS R3; | ||
246 | R2 = R2.L (Z); | ||
247 | R2 += -62; | ||
248 | (R7:4, P5:3) = [SP++]; | ||
249 | JUMP ___lshftli; | ||
250 | |||
251 | .Lpower_of_two_upper_zero: | ||
252 | CC = R2 < 0; | ||
253 | IF CC JUMP .Lmaxint_shift; | ||
254 | |||
255 | R2.L = SIGNBITS R2; | ||
256 | R2 = R2.L (Z); | ||
257 | R2 += -30; | ||
258 | (R7:4, P5:3) = [SP++]; | ||
259 | JUMP ___lshftli; | ||
260 | |||
261 | .Lmaxint_shift: | ||
262 | R2 = -31; | ||
263 | (R7:4, P5:3) = [SP++]; | ||
264 | JUMP ___lshftli; | ||
265 | |||
266 | .LRETURN_IDENT: | ||
267 | R0 = R6; | ||
268 | R1 = R7; | ||
269 | .LRETURN_R0: | ||
270 | (R7:4, P5:3) = [SP++]; | ||
271 | RTS; | ||
272 | .LDIV_BY_ZERO: | ||
273 | R0 = ~R2; | ||
274 | R1 = R0; | ||
275 | (R7:4, P5:3) = [SP++]; | ||
276 | RTS; | ||
277 | |||
278 | ENDPROC(___udivdi3) | ||
279 | |||
280 | |||
281 | ENTRY(___lshftli) | ||
282 | CC = R2 == 0; | ||
283 | IF CC JUMP .Lfinished; // nothing to do | ||
284 | CC = R2 < 0; | ||
285 | IF CC JUMP .Lrshift; | ||
286 | R3 = 64; | ||
287 | CC = R2 < R3; | ||
288 | IF !CC JUMP .Lretzero; | ||
289 | |||
290 | // We're shifting left, and it's less than 64 bits, so | ||
291 | // a valid result will be returned. | ||
292 | |||
293 | R3 >>= 1; // R3 now 32 | ||
294 | CC = R2 < R3; | ||
295 | |||
296 | IF !CC JUMP .Lzerohalf; | ||
297 | |||
298 | // We're shifting left, between 1 and 31 bits, which means | ||
299 | // some of the low half will be shifted into the high half. | ||
300 | // Work out how much. | ||
301 | |||
302 | R3 = R3 - R2; | ||
303 | |||
304 | // Save that much data from the bottom half. | ||
305 | |||
306 | P1 = R7; | ||
307 | R7 = R0; | ||
308 | R7 >>= R3; | ||
309 | |||
310 | // Adjust both parts of the parameter. | ||
311 | |||
312 | R0 <<= R2; | ||
313 | R1 <<= R2; | ||
314 | |||
315 | // And include the bits moved across. | ||
316 | |||
317 | R1 = R1 | R7; | ||
318 | R7 = P1; | ||
319 | RTS; | ||
320 | |||
321 | .Lzerohalf: | ||
322 | // We're shifting left, between 32 and 63 bits, so the | ||
323 | // bottom half will become zero, and the top half will | ||
324 | // lose some bits. How many? | ||
325 | |||
326 | R2 = R2 - R3; // N - 32 | ||
327 | R1 = LSHIFT R0 BY R2.L; | ||
328 | R0 = R0 - R0; | ||
329 | RTS; | ||
330 | |||
331 | .Lretzero: | ||
332 | R0 = R0 - R0; | ||
333 | R1 = R0; | ||
334 | .Lfinished: | ||
335 | RTS; | ||
336 | |||
337 | .Lrshift: | ||
338 | // We're shifting right, but by how much? | ||
339 | R2 = -R2; | ||
340 | R3 = 64; | ||
341 | CC = R2 < R3; | ||
342 | IF !CC JUMP .Lretzero; | ||
343 | |||
344 | // Shifting right less than 64 bits, so some result bits will | ||
345 | // be retained. | ||
346 | |||
347 | R3 >>= 1; // R3 now 32 | ||
348 | CC = R2 < R3; | ||
349 | IF !CC JUMP .Lsignhalf; | ||
350 | |||
351 | // Shifting right between 1 and 31 bits, so need to copy | ||
352 | // data across words. | ||
353 | |||
354 | P1 = R7; | ||
355 | R3 = R3 - R2; | ||
356 | R7 = R1; | ||
357 | R7 <<= R3; | ||
358 | R1 >>= R2; | ||
359 | R0 >>= R2; | ||
360 | R0 = R7 | R0; | ||
361 | R7 = P1; | ||
362 | RTS; | ||
363 | |||
364 | .Lsignhalf: | ||
365 | // Shifting right between 32 and 63 bits, so the top half | ||
366 | // will become all zero-bits, and the bottom half is some | ||
367 | // of the top half. But how much? | ||
368 | |||
369 | R2 = R2 - R3; | ||
370 | R0 = R1; | ||
371 | R0 >>= R2; | ||
372 | R1 = 0; | ||
373 | RTS; | ||
374 | |||
375 | ENDPROC(___lshftli) | ||