diff options
Diffstat (limited to 'arch/parisc/lib/milli/div_const.S')
-rw-r--r-- | arch/parisc/lib/milli/div_const.S | 682 |
1 files changed, 0 insertions, 682 deletions
diff --git a/arch/parisc/lib/milli/div_const.S b/arch/parisc/lib/milli/div_const.S deleted file mode 100644 index dd660076e944..000000000000 --- a/arch/parisc/lib/milli/div_const.S +++ /dev/null | |||
@@ -1,682 +0,0 @@ | |||
1 | /* 32 and 64-bit millicode, original author Hewlett-Packard | ||
2 | adapted for gcc by Paul Bame <bame@debian.org> | ||
3 | and Alan Modra <alan@linuxcare.com.au>. | ||
4 | |||
5 | Copyright 2001, 2002, 2003 Free Software Foundation, Inc. | ||
6 | |||
7 | This file is part of GCC and is released under the terms of | ||
8 | of the GNU General Public License as published by the Free Software | ||
9 | Foundation; either version 2, or (at your option) any later version. | ||
10 | See the file COPYING in the top-level GCC source directory for a copy | ||
11 | of the license. */ | ||
12 | |||
13 | #include "milli.h" | ||
14 | |||
15 | #ifdef L_div_const | ||
16 | /* ROUTINE: $$divI_2 | ||
17 | . $$divI_3 $$divU_3 | ||
18 | . $$divI_4 | ||
19 | . $$divI_5 $$divU_5 | ||
20 | . $$divI_6 $$divU_6 | ||
21 | . $$divI_7 $$divU_7 | ||
22 | . $$divI_8 | ||
23 | . $$divI_9 $$divU_9 | ||
24 | . $$divI_10 $$divU_10 | ||
25 | . | ||
26 | . $$divI_12 $$divU_12 | ||
27 | . | ||
28 | . $$divI_14 $$divU_14 | ||
29 | . $$divI_15 $$divU_15 | ||
30 | . $$divI_16 | ||
31 | . $$divI_17 $$divU_17 | ||
32 | . | ||
33 | . Divide by selected constants for single precision binary integers. | ||
34 | |||
35 | INPUT REGISTERS: | ||
36 | . arg0 == dividend | ||
37 | . mrp == return pc | ||
38 | . sr0 == return space when called externally | ||
39 | |||
40 | OUTPUT REGISTERS: | ||
41 | . arg0 = undefined | ||
42 | . arg1 = undefined | ||
43 | . ret1 = quotient | ||
44 | |||
45 | OTHER REGISTERS AFFECTED: | ||
46 | . r1 = undefined | ||
47 | |||
48 | SIDE EFFECTS: | ||
49 | . Causes a trap under the following conditions: NONE | ||
50 | . Changes memory at the following places: NONE | ||
51 | |||
52 | PERMISSIBLE CONTEXT: | ||
53 | . Unwindable. | ||
54 | . Does not create a stack frame. | ||
55 | . Suitable for internal or external millicode. | ||
56 | . Assumes the special millicode register conventions. | ||
57 | |||
58 | DISCUSSION: | ||
59 | . Calls other millicode routines using mrp: NONE | ||
60 | . Calls other millicode routines: NONE */ | ||
61 | |||
62 | |||
63 | /* TRUNCATED DIVISION BY SMALL INTEGERS | ||
64 | |||
65 | We are interested in q(x) = floor(x/y), where x >= 0 and y > 0 | ||
66 | (with y fixed). | ||
67 | |||
68 | Let a = floor(z/y), for some choice of z. Note that z will be | ||
69 | chosen so that division by z is cheap. | ||
70 | |||
71 | Let r be the remainder(z/y). In other words, r = z - ay. | ||
72 | |||
73 | Now, our method is to choose a value for b such that | ||
74 | |||
75 | q'(x) = floor((ax+b)/z) | ||
76 | |||
77 | is equal to q(x) over as large a range of x as possible. If the | ||
78 | two are equal over a sufficiently large range, and if it is easy to | ||
79 | form the product (ax), and it is easy to divide by z, then we can | ||
80 | perform the division much faster than the general division algorithm. | ||
81 | |||
82 | So, we want the following to be true: | ||
83 | |||
84 | . For x in the following range: | ||
85 | . | ||
86 | . ky <= x < (k+1)y | ||
87 | . | ||
88 | . implies that | ||
89 | . | ||
90 | . k <= (ax+b)/z < (k+1) | ||
91 | |||
92 | We want to determine b such that this is true for all k in the | ||
93 | range {0..K} for some maximum K. | ||
94 | |||
95 | Since (ax+b) is an increasing function of x, we can take each | ||
96 | bound separately to determine the "best" value for b. | ||
97 | |||
98 | (ax+b)/z < (k+1) implies | ||
99 | |||
100 | (a((k+1)y-1)+b < (k+1)z implies | ||
101 | |||
102 | b < a + (k+1)(z-ay) implies | ||
103 | |||
104 | b < a + (k+1)r | ||
105 | |||
106 | This needs to be true for all k in the range {0..K}. In | ||
107 | particular, it is true for k = 0 and this leads to a maximum | ||
108 | acceptable value for b. | ||
109 | |||
110 | b < a+r or b <= a+r-1 | ||
111 | |||
112 | Taking the other bound, we have | ||
113 | |||
114 | k <= (ax+b)/z implies | ||
115 | |||
116 | k <= (aky+b)/z implies | ||
117 | |||
118 | k(z-ay) <= b implies | ||
119 | |||
120 | kr <= b | ||
121 | |||
122 | Clearly, the largest range for k will be achieved by maximizing b, | ||
123 | when r is not zero. When r is zero, then the simplest choice for b | ||
124 | is 0. When r is not 0, set | ||
125 | |||
126 | . b = a+r-1 | ||
127 | |||
128 | Now, by construction, q'(x) = floor((ax+b)/z) = q(x) = floor(x/y) | ||
129 | for all x in the range: | ||
130 | |||
131 | . 0 <= x < (K+1)y | ||
132 | |||
133 | We need to determine what K is. Of our two bounds, | ||
134 | |||
135 | . b < a+(k+1)r is satisfied for all k >= 0, by construction. | ||
136 | |||
137 | The other bound is | ||
138 | |||
139 | . kr <= b | ||
140 | |||
141 | This is always true if r = 0. If r is not 0 (the usual case), then | ||
142 | K = floor((a+r-1)/r), is the maximum value for k. | ||
143 | |||
144 | Therefore, the formula q'(x) = floor((ax+b)/z) yields the correct | ||
145 | answer for q(x) = floor(x/y) when x is in the range | ||
146 | |||
147 | (0,(K+1)y-1) K = floor((a+r-1)/r) | ||
148 | |||
149 | To be most useful, we want (K+1)y-1 = (max x) >= 2**32-1 so that | ||
150 | the formula for q'(x) yields the correct value of q(x) for all x | ||
151 | representable by a single word in HPPA. | ||
152 | |||
153 | We are also constrained in that computing the product (ax), adding | ||
154 | b, and dividing by z must all be done quickly, otherwise we will be | ||
155 | better off going through the general algorithm using the DS | ||
156 | instruction, which uses approximately 70 cycles. | ||
157 | |||
158 | For each y, there is a choice of z which satisfies the constraints | ||
159 | for (K+1)y >= 2**32. We may not, however, be able to satisfy the | ||
160 | timing constraints for arbitrary y. It seems that z being equal to | ||
161 | a power of 2 or a power of 2 minus 1 is as good as we can do, since | ||
162 | it minimizes the time to do division by z. We want the choice of z | ||
163 | to also result in a value for (a) that minimizes the computation of | ||
164 | the product (ax). This is best achieved if (a) has a regular bit | ||
165 | pattern (so the multiplication can be done with shifts and adds). | ||
166 | The value of (a) also needs to be less than 2**32 so the product is | ||
167 | always guaranteed to fit in 2 words. | ||
168 | |||
169 | In actual practice, the following should be done: | ||
170 | |||
171 | 1) For negative x, you should take the absolute value and remember | ||
172 | . the fact so that the result can be negated. This obviously does | ||
173 | . not apply in the unsigned case. | ||
174 | 2) For even y, you should factor out the power of 2 that divides y | ||
175 | . and divide x by it. You can then proceed by dividing by the | ||
176 | . odd factor of y. | ||
177 | |||
178 | Here is a table of some odd values of y, and corresponding choices | ||
179 | for z which are "good". | ||
180 | |||
181 | y z r a (hex) max x (hex) | ||
182 | |||
183 | 3 2**32 1 55555555 100000001 | ||
184 | 5 2**32 1 33333333 100000003 | ||
185 | 7 2**24-1 0 249249 (infinite) | ||
186 | 9 2**24-1 0 1c71c7 (infinite) | ||
187 | 11 2**20-1 0 1745d (infinite) | ||
188 | 13 2**24-1 0 13b13b (infinite) | ||
189 | 15 2**32 1 11111111 10000000d | ||
190 | 17 2**32 1 f0f0f0f 10000000f | ||
191 | |||
192 | If r is 1, then b = a+r-1 = a. This simplifies the computation | ||
193 | of (ax+b), since you can compute (x+1)(a) instead. If r is 0, | ||
194 | then b = 0 is ok to use which simplifies (ax+b). | ||
195 | |||
196 | The bit patterns for 55555555, 33333333, and 11111111 are obviously | ||
197 | very regular. The bit patterns for the other values of a above are: | ||
198 | |||
199 | y (hex) (binary) | ||
200 | |||
201 | 7 249249 001001001001001001001001 << regular >> | ||
202 | 9 1c71c7 000111000111000111000111 << regular >> | ||
203 | 11 1745d 000000010111010001011101 << irregular >> | ||
204 | 13 13b13b 000100111011000100111011 << irregular >> | ||
205 | |||
206 | The bit patterns for (a) corresponding to (y) of 11 and 13 may be | ||
207 | too irregular to warrant using this method. | ||
208 | |||
209 | When z is a power of 2 minus 1, then the division by z is slightly | ||
210 | more complicated, involving an iterative solution. | ||
211 | |||
212 | The code presented here solves division by 1 through 17, except for | ||
213 | 11 and 13. There are algorithms for both signed and unsigned | ||
214 | quantities given. | ||
215 | |||
216 | TIMINGS (cycles) | ||
217 | |||
218 | divisor positive negative unsigned | ||
219 | |||
220 | . 1 2 2 2 | ||
221 | . 2 4 4 2 | ||
222 | . 3 19 21 19 | ||
223 | . 4 4 4 2 | ||
224 | . 5 18 22 19 | ||
225 | . 6 19 22 19 | ||
226 | . 8 4 4 2 | ||
227 | . 10 18 19 17 | ||
228 | . 12 18 20 18 | ||
229 | . 15 16 18 16 | ||
230 | . 16 4 4 2 | ||
231 | . 17 16 18 16 | ||
232 | |||
233 | Now, the algorithm for 7, 9, and 14 is an iterative one. That is, | ||
234 | a loop body is executed until the tentative quotient is 0. The | ||
235 | number of times the loop body is executed varies depending on the | ||
236 | dividend, but is never more than two times. If the dividend is | ||
237 | less than the divisor, then the loop body is not executed at all. | ||
238 | Each iteration adds 4 cycles to the timings. | ||
239 | |||
240 | divisor positive negative unsigned | ||
241 | |||
242 | . 7 19+4n 20+4n 20+4n n = number of iterations | ||
243 | . 9 21+4n 22+4n 21+4n | ||
244 | . 14 21+4n 22+4n 20+4n | ||
245 | |||
246 | To give an idea of how the number of iterations varies, here is a | ||
247 | table of dividend versus number of iterations when dividing by 7. | ||
248 | |||
249 | smallest largest required | ||
250 | dividend dividend iterations | ||
251 | |||
252 | . 0 6 0 | ||
253 | . 7 0x6ffffff 1 | ||
254 | 0x1000006 0xffffffff 2 | ||
255 | |||
256 | There is some overlap in the range of numbers requiring 1 and 2 | ||
257 | iterations. */ | ||
258 | |||
259 | RDEFINE(t2,r1) | ||
260 | RDEFINE(x2,arg0) /* r26 */ | ||
261 | RDEFINE(t1,arg1) /* r25 */ | ||
262 | RDEFINE(x1,ret1) /* r29 */ | ||
263 | |||
264 | SUBSPA_MILLI_DIV | ||
265 | ATTR_MILLI | ||
266 | |||
267 | .proc | ||
268 | .callinfo millicode | ||
269 | .entry | ||
270 | /* NONE of these routines require a stack frame | ||
271 | ALL of these routines are unwindable from millicode */ | ||
272 | |||
273 | GSYM($$divide_by_constant) | ||
274 | .export $$divide_by_constant,millicode | ||
275 | /* Provides a "nice" label for the code covered by the unwind descriptor | ||
276 | for things like gprof. */ | ||
277 | |||
278 | /* DIVISION BY 2 (shift by 1) */ | ||
279 | GSYM($$divI_2) | ||
280 | .export $$divI_2,millicode | ||
281 | comclr,>= arg0,0,0 | ||
282 | addi 1,arg0,arg0 | ||
283 | MILLIRET | ||
284 | extrs arg0,30,31,ret1 | ||
285 | |||
286 | |||
287 | /* DIVISION BY 4 (shift by 2) */ | ||
288 | GSYM($$divI_4) | ||
289 | .export $$divI_4,millicode | ||
290 | comclr,>= arg0,0,0 | ||
291 | addi 3,arg0,arg0 | ||
292 | MILLIRET | ||
293 | extrs arg0,29,30,ret1 | ||
294 | |||
295 | |||
296 | /* DIVISION BY 8 (shift by 3) */ | ||
297 | GSYM($$divI_8) | ||
298 | .export $$divI_8,millicode | ||
299 | comclr,>= arg0,0,0 | ||
300 | addi 7,arg0,arg0 | ||
301 | MILLIRET | ||
302 | extrs arg0,28,29,ret1 | ||
303 | |||
304 | /* DIVISION BY 16 (shift by 4) */ | ||
305 | GSYM($$divI_16) | ||
306 | .export $$divI_16,millicode | ||
307 | comclr,>= arg0,0,0 | ||
308 | addi 15,arg0,arg0 | ||
309 | MILLIRET | ||
310 | extrs arg0,27,28,ret1 | ||
311 | |||
312 | /**************************************************************************** | ||
313 | * | ||
314 | * DIVISION BY DIVISORS OF FFFFFFFF, and powers of 2 times these | ||
315 | * | ||
316 | * includes 3,5,15,17 and also 6,10,12 | ||
317 | * | ||
318 | ****************************************************************************/ | ||
319 | |||
320 | /* DIVISION BY 3 (use z = 2**32; a = 55555555) */ | ||
321 | |||
322 | GSYM($$divI_3) | ||
323 | .export $$divI_3,millicode | ||
324 | comb,<,N x2,0,LREF(neg3) | ||
325 | |||
326 | addi 1,x2,x2 /* this cannot overflow */ | ||
327 | extru x2,1,2,x1 /* multiply by 5 to get started */ | ||
328 | sh2add x2,x2,x2 | ||
329 | b LREF(pos) | ||
330 | addc x1,0,x1 | ||
331 | |||
332 | LSYM(neg3) | ||
333 | subi 1,x2,x2 /* this cannot overflow */ | ||
334 | extru x2,1,2,x1 /* multiply by 5 to get started */ | ||
335 | sh2add x2,x2,x2 | ||
336 | b LREF(neg) | ||
337 | addc x1,0,x1 | ||
338 | |||
339 | GSYM($$divU_3) | ||
340 | .export $$divU_3,millicode | ||
341 | addi 1,x2,x2 /* this CAN overflow */ | ||
342 | addc 0,0,x1 | ||
343 | shd x1,x2,30,t1 /* multiply by 5 to get started */ | ||
344 | sh2add x2,x2,x2 | ||
345 | b LREF(pos) | ||
346 | addc x1,t1,x1 | ||
347 | |||
348 | /* DIVISION BY 5 (use z = 2**32; a = 33333333) */ | ||
349 | |||
350 | GSYM($$divI_5) | ||
351 | .export $$divI_5,millicode | ||
352 | comb,<,N x2,0,LREF(neg5) | ||
353 | |||
354 | addi 3,x2,t1 /* this cannot overflow */ | ||
355 | sh1add x2,t1,x2 /* multiply by 3 to get started */ | ||
356 | b LREF(pos) | ||
357 | addc 0,0,x1 | ||
358 | |||
359 | LSYM(neg5) | ||
360 | sub 0,x2,x2 /* negate x2 */ | ||
361 | addi 1,x2,x2 /* this cannot overflow */ | ||
362 | shd 0,x2,31,x1 /* get top bit (can be 1) */ | ||
363 | sh1add x2,x2,x2 /* multiply by 3 to get started */ | ||
364 | b LREF(neg) | ||
365 | addc x1,0,x1 | ||
366 | |||
367 | GSYM($$divU_5) | ||
368 | .export $$divU_5,millicode | ||
369 | addi 1,x2,x2 /* this CAN overflow */ | ||
370 | addc 0,0,x1 | ||
371 | shd x1,x2,31,t1 /* multiply by 3 to get started */ | ||
372 | sh1add x2,x2,x2 | ||
373 | b LREF(pos) | ||
374 | addc t1,x1,x1 | ||
375 | |||
376 | /* DIVISION BY 6 (shift to divide by 2 then divide by 3) */ | ||
377 | GSYM($$divI_6) | ||
378 | .export $$divI_6,millicode | ||
379 | comb,<,N x2,0,LREF(neg6) | ||
380 | extru x2,30,31,x2 /* divide by 2 */ | ||
381 | addi 5,x2,t1 /* compute 5*(x2+1) = 5*x2+5 */ | ||
382 | sh2add x2,t1,x2 /* multiply by 5 to get started */ | ||
383 | b LREF(pos) | ||
384 | addc 0,0,x1 | ||
385 | |||
386 | LSYM(neg6) | ||
387 | subi 2,x2,x2 /* negate, divide by 2, and add 1 */ | ||
388 | /* negation and adding 1 are done */ | ||
389 | /* at the same time by the SUBI */ | ||
390 | extru x2,30,31,x2 | ||
391 | shd 0,x2,30,x1 | ||
392 | sh2add x2,x2,x2 /* multiply by 5 to get started */ | ||
393 | b LREF(neg) | ||
394 | addc x1,0,x1 | ||
395 | |||
396 | GSYM($$divU_6) | ||
397 | .export $$divU_6,millicode | ||
398 | extru x2,30,31,x2 /* divide by 2 */ | ||
399 | addi 1,x2,x2 /* cannot carry */ | ||
400 | shd 0,x2,30,x1 /* multiply by 5 to get started */ | ||
401 | sh2add x2,x2,x2 | ||
402 | b LREF(pos) | ||
403 | addc x1,0,x1 | ||
404 | |||
405 | /* DIVISION BY 10 (shift to divide by 2 then divide by 5) */ | ||
406 | GSYM($$divU_10) | ||
407 | .export $$divU_10,millicode | ||
408 | extru x2,30,31,x2 /* divide by 2 */ | ||
409 | addi 3,x2,t1 /* compute 3*(x2+1) = (3*x2)+3 */ | ||
410 | sh1add x2,t1,x2 /* multiply by 3 to get started */ | ||
411 | addc 0,0,x1 | ||
412 | LSYM(pos) | ||
413 | shd x1,x2,28,t1 /* multiply by 0x11 */ | ||
414 | shd x2,0,28,t2 | ||
415 | add x2,t2,x2 | ||
416 | addc x1,t1,x1 | ||
417 | LSYM(pos_for_17) | ||
418 | shd x1,x2,24,t1 /* multiply by 0x101 */ | ||
419 | shd x2,0,24,t2 | ||
420 | add x2,t2,x2 | ||
421 | addc x1,t1,x1 | ||
422 | |||
423 | shd x1,x2,16,t1 /* multiply by 0x10001 */ | ||
424 | shd x2,0,16,t2 | ||
425 | add x2,t2,x2 | ||
426 | MILLIRET | ||
427 | addc x1,t1,x1 | ||
428 | |||
429 | GSYM($$divI_10) | ||
430 | .export $$divI_10,millicode | ||
431 | comb,< x2,0,LREF(neg10) | ||
432 | copy 0,x1 | ||
433 | extru x2,30,31,x2 /* divide by 2 */ | ||
434 | addib,TR 1,x2,LREF(pos) /* add 1 (cannot overflow) */ | ||
435 | sh1add x2,x2,x2 /* multiply by 3 to get started */ | ||
436 | |||
437 | LSYM(neg10) | ||
438 | subi 2,x2,x2 /* negate, divide by 2, and add 1 */ | ||
439 | /* negation and adding 1 are done */ | ||
440 | /* at the same time by the SUBI */ | ||
441 | extru x2,30,31,x2 | ||
442 | sh1add x2,x2,x2 /* multiply by 3 to get started */ | ||
443 | LSYM(neg) | ||
444 | shd x1,x2,28,t1 /* multiply by 0x11 */ | ||
445 | shd x2,0,28,t2 | ||
446 | add x2,t2,x2 | ||
447 | addc x1,t1,x1 | ||
448 | LSYM(neg_for_17) | ||
449 | shd x1,x2,24,t1 /* multiply by 0x101 */ | ||
450 | shd x2,0,24,t2 | ||
451 | add x2,t2,x2 | ||
452 | addc x1,t1,x1 | ||
453 | |||
454 | shd x1,x2,16,t1 /* multiply by 0x10001 */ | ||
455 | shd x2,0,16,t2 | ||
456 | add x2,t2,x2 | ||
457 | addc x1,t1,x1 | ||
458 | MILLIRET | ||
459 | sub 0,x1,x1 | ||
460 | |||
461 | /* DIVISION BY 12 (shift to divide by 4 then divide by 3) */ | ||
462 | GSYM($$divI_12) | ||
463 | .export $$divI_12,millicode | ||
464 | comb,< x2,0,LREF(neg12) | ||
465 | copy 0,x1 | ||
466 | extru x2,29,30,x2 /* divide by 4 */ | ||
467 | addib,tr 1,x2,LREF(pos) /* compute 5*(x2+1) = 5*x2+5 */ | ||
468 | sh2add x2,x2,x2 /* multiply by 5 to get started */ | ||
469 | |||
470 | LSYM(neg12) | ||
471 | subi 4,x2,x2 /* negate, divide by 4, and add 1 */ | ||
472 | /* negation and adding 1 are done */ | ||
473 | /* at the same time by the SUBI */ | ||
474 | extru x2,29,30,x2 | ||
475 | b LREF(neg) | ||
476 | sh2add x2,x2,x2 /* multiply by 5 to get started */ | ||
477 | |||
478 | GSYM($$divU_12) | ||
479 | .export $$divU_12,millicode | ||
480 | extru x2,29,30,x2 /* divide by 4 */ | ||
481 | addi 5,x2,t1 /* cannot carry */ | ||
482 | sh2add x2,t1,x2 /* multiply by 5 to get started */ | ||
483 | b LREF(pos) | ||
484 | addc 0,0,x1 | ||
485 | |||
486 | /* DIVISION BY 15 (use z = 2**32; a = 11111111) */ | ||
487 | GSYM($$divI_15) | ||
488 | .export $$divI_15,millicode | ||
489 | comb,< x2,0,LREF(neg15) | ||
490 | copy 0,x1 | ||
491 | addib,tr 1,x2,LREF(pos)+4 | ||
492 | shd x1,x2,28,t1 | ||
493 | |||
494 | LSYM(neg15) | ||
495 | b LREF(neg) | ||
496 | subi 1,x2,x2 | ||
497 | |||
498 | GSYM($$divU_15) | ||
499 | .export $$divU_15,millicode | ||
500 | addi 1,x2,x2 /* this CAN overflow */ | ||
501 | b LREF(pos) | ||
502 | addc 0,0,x1 | ||
503 | |||
504 | /* DIVISION BY 17 (use z = 2**32; a = f0f0f0f) */ | ||
505 | GSYM($$divI_17) | ||
506 | .export $$divI_17,millicode | ||
507 | comb,<,n x2,0,LREF(neg17) | ||
508 | addi 1,x2,x2 /* this cannot overflow */ | ||
509 | shd 0,x2,28,t1 /* multiply by 0xf to get started */ | ||
510 | shd x2,0,28,t2 | ||
511 | sub t2,x2,x2 | ||
512 | b LREF(pos_for_17) | ||
513 | subb t1,0,x1 | ||
514 | |||
515 | LSYM(neg17) | ||
516 | subi 1,x2,x2 /* this cannot overflow */ | ||
517 | shd 0,x2,28,t1 /* multiply by 0xf to get started */ | ||
518 | shd x2,0,28,t2 | ||
519 | sub t2,x2,x2 | ||
520 | b LREF(neg_for_17) | ||
521 | subb t1,0,x1 | ||
522 | |||
523 | GSYM($$divU_17) | ||
524 | .export $$divU_17,millicode | ||
525 | addi 1,x2,x2 /* this CAN overflow */ | ||
526 | addc 0,0,x1 | ||
527 | shd x1,x2,28,t1 /* multiply by 0xf to get started */ | ||
528 | LSYM(u17) | ||
529 | shd x2,0,28,t2 | ||
530 | sub t2,x2,x2 | ||
531 | b LREF(pos_for_17) | ||
532 | subb t1,x1,x1 | ||
533 | |||
534 | |||
535 | /* DIVISION BY DIVISORS OF FFFFFF, and powers of 2 times these | ||
536 | includes 7,9 and also 14 | ||
537 | |||
538 | |||
539 | z = 2**24-1 | ||
540 | r = z mod x = 0 | ||
541 | |||
542 | so choose b = 0 | ||
543 | |||
544 | Also, in order to divide by z = 2**24-1, we approximate by dividing | ||
545 | by (z+1) = 2**24 (which is easy), and then correcting. | ||
546 | |||
547 | (ax) = (z+1)q' + r | ||
548 | . = zq' + (q'+r) | ||
549 | |||
550 | So to compute (ax)/z, compute q' = (ax)/(z+1) and r = (ax) mod (z+1) | ||
551 | Then the true remainder of (ax)/z is (q'+r). Repeat the process | ||
552 | with this new remainder, adding the tentative quotients together, | ||
553 | until a tentative quotient is 0 (and then we are done). There is | ||
554 | one last correction to be done. It is possible that (q'+r) = z. | ||
555 | If so, then (q'+r)/(z+1) = 0 and it looks like we are done. But, | ||
556 | in fact, we need to add 1 more to the quotient. Now, it turns | ||
557 | out that this happens if and only if the original value x is | ||
558 | an exact multiple of y. So, to avoid a three instruction test at | ||
559 | the end, instead use 1 instruction to add 1 to x at the beginning. */ | ||
560 | |||
561 | /* DIVISION BY 7 (use z = 2**24-1; a = 249249) */ | ||
562 | GSYM($$divI_7) | ||
563 | .export $$divI_7,millicode | ||
564 | comb,<,n x2,0,LREF(neg7) | ||
565 | LSYM(7) | ||
566 | addi 1,x2,x2 /* cannot overflow */ | ||
567 | shd 0,x2,29,x1 | ||
568 | sh3add x2,x2,x2 | ||
569 | addc x1,0,x1 | ||
570 | LSYM(pos7) | ||
571 | shd x1,x2,26,t1 | ||
572 | shd x2,0,26,t2 | ||
573 | add x2,t2,x2 | ||
574 | addc x1,t1,x1 | ||
575 | |||
576 | shd x1,x2,20,t1 | ||
577 | shd x2,0,20,t2 | ||
578 | add x2,t2,x2 | ||
579 | addc x1,t1,t1 | ||
580 | |||
581 | /* computed <t1,x2>. Now divide it by (2**24 - 1) */ | ||
582 | |||
583 | copy 0,x1 | ||
584 | shd,= t1,x2,24,t1 /* tentative quotient */ | ||
585 | LSYM(1) | ||
586 | addb,tr t1,x1,LREF(2) /* add to previous quotient */ | ||
587 | extru x2,31,24,x2 /* new remainder (unadjusted) */ | ||
588 | |||
589 | MILLIRETN | ||
590 | |||
591 | LSYM(2) | ||
592 | addb,tr t1,x2,LREF(1) /* adjust remainder */ | ||
593 | extru,= x2,7,8,t1 /* new quotient */ | ||
594 | |||
595 | LSYM(neg7) | ||
596 | subi 1,x2,x2 /* negate x2 and add 1 */ | ||
597 | LSYM(8) | ||
598 | shd 0,x2,29,x1 | ||
599 | sh3add x2,x2,x2 | ||
600 | addc x1,0,x1 | ||
601 | |||
602 | LSYM(neg7_shift) | ||
603 | shd x1,x2,26,t1 | ||
604 | shd x2,0,26,t2 | ||
605 | add x2,t2,x2 | ||
606 | addc x1,t1,x1 | ||
607 | |||
608 | shd x1,x2,20,t1 | ||
609 | shd x2,0,20,t2 | ||
610 | add x2,t2,x2 | ||
611 | addc x1,t1,t1 | ||
612 | |||
613 | /* computed <t1,x2>. Now divide it by (2**24 - 1) */ | ||
614 | |||
615 | copy 0,x1 | ||
616 | shd,= t1,x2,24,t1 /* tentative quotient */ | ||
617 | LSYM(3) | ||
618 | addb,tr t1,x1,LREF(4) /* add to previous quotient */ | ||
619 | extru x2,31,24,x2 /* new remainder (unadjusted) */ | ||
620 | |||
621 | MILLIRET | ||
622 | sub 0,x1,x1 /* negate result */ | ||
623 | |||
624 | LSYM(4) | ||
625 | addb,tr t1,x2,LREF(3) /* adjust remainder */ | ||
626 | extru,= x2,7,8,t1 /* new quotient */ | ||
627 | |||
628 | GSYM($$divU_7) | ||
629 | .export $$divU_7,millicode | ||
630 | addi 1,x2,x2 /* can carry */ | ||
631 | addc 0,0,x1 | ||
632 | shd x1,x2,29,t1 | ||
633 | sh3add x2,x2,x2 | ||
634 | b LREF(pos7) | ||
635 | addc t1,x1,x1 | ||
636 | |||
637 | /* DIVISION BY 9 (use z = 2**24-1; a = 1c71c7) */ | ||
638 | GSYM($$divI_9) | ||
639 | .export $$divI_9,millicode | ||
640 | comb,<,n x2,0,LREF(neg9) | ||
641 | addi 1,x2,x2 /* cannot overflow */ | ||
642 | shd 0,x2,29,t1 | ||
643 | shd x2,0,29,t2 | ||
644 | sub t2,x2,x2 | ||
645 | b LREF(pos7) | ||
646 | subb t1,0,x1 | ||
647 | |||
648 | LSYM(neg9) | ||
649 | subi 1,x2,x2 /* negate and add 1 */ | ||
650 | shd 0,x2,29,t1 | ||
651 | shd x2,0,29,t2 | ||
652 | sub t2,x2,x2 | ||
653 | b LREF(neg7_shift) | ||
654 | subb t1,0,x1 | ||
655 | |||
656 | GSYM($$divU_9) | ||
657 | .export $$divU_9,millicode | ||
658 | addi 1,x2,x2 /* can carry */ | ||
659 | addc 0,0,x1 | ||
660 | shd x1,x2,29,t1 | ||
661 | shd x2,0,29,t2 | ||
662 | sub t2,x2,x2 | ||
663 | b LREF(pos7) | ||
664 | subb t1,x1,x1 | ||
665 | |||
666 | /* DIVISION BY 14 (shift to divide by 2 then divide by 7) */ | ||
667 | GSYM($$divI_14) | ||
668 | .export $$divI_14,millicode | ||
669 | comb,<,n x2,0,LREF(neg14) | ||
670 | GSYM($$divU_14) | ||
671 | .export $$divU_14,millicode | ||
672 | b LREF(7) /* go to 7 case */ | ||
673 | extru x2,30,31,x2 /* divide by 2 */ | ||
674 | |||
675 | LSYM(neg14) | ||
676 | subi 2,x2,x2 /* negate (and add 2) */ | ||
677 | b LREF(8) | ||
678 | extru x2,30,31,x2 /* divide by 2 */ | ||
679 | .exit | ||
680 | .procend | ||
681 | .end | ||
682 | #endif | ||