aboutsummaryrefslogtreecommitdiffstats
path: root/arch/sparc/lib
diff options
context:
space:
mode:
author: David S. Miller <davem@davemloft.net> 2012-05-15 14:23:01 -0400
committer: David S. Miller <davem@davemloft.net> 2012-05-15 14:23:47 -0400
commit: 1b35a57b1c1781f0fc8fc554f732b3a5408c5244 (patch)
tree: 80e5616798e0dc5ec138f020e6aa9ae482378462 /arch/sparc/lib
parent: 2119ff6d2bc0dd6a97de1632e50cd7936049738c (diff)
sparc32: Kill off software 32-bit multiply/divide routines.
For the explicit calls to .udiv/.umul in assembler, I made a mechanical (read as: safe) transformation. I didn't attempt to make any simplifications.

In particular, __ndelay and __udelay can be simplified significantly. Some of the %y reads are unnecessary and these routines have no need any longer for allocating a register window, they can be leaf functions.

Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'arch/sparc/lib')
-rw-r--r-- arch/sparc/lib/Makefile | 2
-rw-r--r-- arch/sparc/lib/divdi3.S | 4
-rw-r--r-- arch/sparc/lib/ksyms.c | 17
-rw-r--r-- arch/sparc/lib/mul.S | 137
-rw-r--r-- arch/sparc/lib/muldi3.S | 4
-rw-r--r-- arch/sparc/lib/rem.S | 384
-rw-r--r-- arch/sparc/lib/sdiv.S | 381
-rw-r--r-- arch/sparc/lib/udiv.S | 357
-rw-r--r-- arch/sparc/lib/udivdi3.S | 3
-rw-r--r-- arch/sparc/lib/umul.S | 171
-rw-r--r-- arch/sparc/lib/urem.S | 357
11 files changed, 7 insertions, 1810 deletions
diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile
index 33d8d85ad594..ead6df25054c 100644
--- a/arch/sparc/lib/Makefile
+++ b/arch/sparc/lib/Makefile
@@ -4,7 +4,7 @@
4asflags-y := -ansi -DST_DIV0=0x02 4asflags-y := -ansi -DST_DIV0=0x02
5ccflags-y := -Werror 5ccflags-y := -Werror
6 6
7lib-$(CONFIG_SPARC32) += mul.o rem.o sdiv.o udiv.o umul.o urem.o ashrdi3.o 7lib-$(CONFIG_SPARC32) += ashrdi3.o
8lib-$(CONFIG_SPARC32) += memcpy.o memset.o 8lib-$(CONFIG_SPARC32) += memcpy.o memset.o
9lib-y += strlen.o 9lib-y += strlen.o
10lib-y += checksum_$(BITS).o 10lib-y += checksum_$(BITS).o
diff --git a/arch/sparc/lib/divdi3.S b/arch/sparc/lib/divdi3.S
index d74bc0925f2d..9614b48b6ef8 100644
--- a/arch/sparc/lib/divdi3.S
+++ b/arch/sparc/lib/divdi3.S
@@ -19,7 +19,6 @@ Boston, MA 02111-1307, USA. */
19 19
20 .text 20 .text
21 .align 4 21 .align 4
22 .global .udiv
23 .globl __divdi3 22 .globl __divdi3
24__divdi3: 23__divdi3:
25 save %sp,-104,%sp 24 save %sp,-104,%sp
@@ -83,8 +82,9 @@ __divdi3:
83 bne .LL85 82 bne .LL85
84 mov %i0,%o2 83 mov %i0,%o2
85 mov 1,%o0 84 mov 1,%o0
86 call .udiv,0
87 mov 0,%o1 85 mov 0,%o1
86 wr %g0, 0, %y
87 udiv %o0, %o1, %o0
88 mov %o0,%o4 88 mov %o0,%o4
89 mov %i0,%o2 89 mov %i0,%o2
90.LL85: 90.LL85:
diff --git a/arch/sparc/lib/ksyms.c b/arch/sparc/lib/ksyms.c
index 1bc8972f029f..2dc30875c8bc 100644
--- a/arch/sparc/lib/ksyms.c
+++ b/arch/sparc/lib/ksyms.c
@@ -61,16 +61,6 @@ extern void ___rw_read_try(void);
61extern void ___rw_read_exit(void); 61extern void ___rw_read_exit(void);
62extern void ___rw_write_enter(void); 62extern void ___rw_write_enter(void);
63 63
64/* Alias functions whose names begin with "." and export the aliases.
65 * The module references will be fixed up by module_frob_arch_sections.
66 */
67extern int _Div(int, int);
68extern int _Mul(int, int);
69extern int _Rem(int, int);
70extern unsigned _Udiv(unsigned, unsigned);
71extern unsigned _Umul(unsigned, unsigned);
72extern unsigned _Urem(unsigned, unsigned);
73
74/* Networking helper routines. */ 64/* Networking helper routines. */
75EXPORT_SYMBOL(__csum_partial_copy_sparc_generic); 65EXPORT_SYMBOL(__csum_partial_copy_sparc_generic);
76 66
@@ -95,13 +85,6 @@ EXPORT_SYMBOL(__ashldi3);
95EXPORT_SYMBOL(__lshrdi3); 85EXPORT_SYMBOL(__lshrdi3);
96EXPORT_SYMBOL(__muldi3); 86EXPORT_SYMBOL(__muldi3);
97EXPORT_SYMBOL(__divdi3); 87EXPORT_SYMBOL(__divdi3);
98
99EXPORT_SYMBOL(_Rem);
100EXPORT_SYMBOL(_Urem);
101EXPORT_SYMBOL(_Mul);
102EXPORT_SYMBOL(_Umul);
103EXPORT_SYMBOL(_Div);
104EXPORT_SYMBOL(_Udiv);
105#endif 88#endif
106 89
107/* 90/*
diff --git a/arch/sparc/lib/mul.S b/arch/sparc/lib/mul.S
deleted file mode 100644
index c45470d0b0ce..000000000000
--- a/arch/sparc/lib/mul.S
+++ /dev/null
@@ -1,137 +0,0 @@
1/*
2 * mul.S: This routine was taken from glibc-1.09 and is covered
3 * by the GNU Library General Public License Version 2.
4 */
5
6/*
7 * Signed multiply, from Appendix E of the Sparc Version 8
8 * Architecture Manual.
9 */
10
11/*
12 * Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the upper 32 bits of
13 * the 64-bit product).
14 *
15 * This code optimizes short (less than 13-bit) multiplies.
16 */
17
18 .globl .mul
19 .globl _Mul
20.mul:
21_Mul: /* needed for export */
22 mov %o0, %y ! multiplier -> Y
23 andncc %o0, 0xfff, %g0 ! test bits 12..31
24 be Lmul_shortway ! if zero, can do it the short way
25 andcc %g0, %g0, %o4 ! zero the partial product and clear N and V
26
27 /*
28 * Long multiply. 32 steps, followed by a final shift step.
29 */
30 mulscc %o4, %o1, %o4 ! 1
31 mulscc %o4, %o1, %o4 ! 2
32 mulscc %o4, %o1, %o4 ! 3
33 mulscc %o4, %o1, %o4 ! 4
34 mulscc %o4, %o1, %o4 ! 5
35 mulscc %o4, %o1, %o4 ! 6
36 mulscc %o4, %o1, %o4 ! 7
37 mulscc %o4, %o1, %o4 ! 8
38 mulscc %o4, %o1, %o4 ! 9
39 mulscc %o4, %o1, %o4 ! 10
40 mulscc %o4, %o1, %o4 ! 11
41 mulscc %o4, %o1, %o4 ! 12
42 mulscc %o4, %o1, %o4 ! 13
43 mulscc %o4, %o1, %o4 ! 14
44 mulscc %o4, %o1, %o4 ! 15
45 mulscc %o4, %o1, %o4 ! 16
46 mulscc %o4, %o1, %o4 ! 17
47 mulscc %o4, %o1, %o4 ! 18
48 mulscc %o4, %o1, %o4 ! 19
49 mulscc %o4, %o1, %o4 ! 20
50 mulscc %o4, %o1, %o4 ! 21
51 mulscc %o4, %o1, %o4 ! 22
52 mulscc %o4, %o1, %o4 ! 23
53 mulscc %o4, %o1, %o4 ! 24
54 mulscc %o4, %o1, %o4 ! 25
55 mulscc %o4, %o1, %o4 ! 26
56 mulscc %o4, %o1, %o4 ! 27
57 mulscc %o4, %o1, %o4 ! 28
58 mulscc %o4, %o1, %o4 ! 29
59 mulscc %o4, %o1, %o4 ! 30
60 mulscc %o4, %o1, %o4 ! 31
61 mulscc %o4, %o1, %o4 ! 32
62 mulscc %o4, %g0, %o4 ! final shift
63
64 ! If %o0 was negative, the result is
65 ! (%o0 * %o1) + (%o1 << 32))
66 ! We fix that here.
67
68#if 0
69 tst %o0
70 bge 1f
71 rd %y, %o0
72
73 ! %o0 was indeed negative; fix upper 32 bits of result by subtracting
74 ! %o1 (i.e., return %o4 - %o1 in %o1).
75 retl
76 sub %o4, %o1, %o1
77
781:
79 retl
80 mov %o4, %o1
81#else
82 /* Faster code adapted from tege@sics.se's code for umul.S. */
83 sra %o0, 31, %o2 ! make mask from sign bit
84 and %o1, %o2, %o2 ! %o2 = 0 or %o1, depending on sign of %o0
85 rd %y, %o0 ! get lower half of product
86 retl
87 sub %o4, %o2, %o1 ! subtract compensation
88 ! and put upper half in place
89#endif
90
91Lmul_shortway:
92 /*
93 * Short multiply. 12 steps, followed by a final shift step.
94 * The resulting bits are off by 12 and (32-12) = 20 bit positions,
95 * but there is no problem with %o0 being negative (unlike above).
96 */
97 mulscc %o4, %o1, %o4 ! 1
98 mulscc %o4, %o1, %o4 ! 2
99 mulscc %o4, %o1, %o4 ! 3
100 mulscc %o4, %o1, %o4 ! 4
101 mulscc %o4, %o1, %o4 ! 5
102 mulscc %o4, %o1, %o4 ! 6
103 mulscc %o4, %o1, %o4 ! 7
104 mulscc %o4, %o1, %o4 ! 8
105 mulscc %o4, %o1, %o4 ! 9
106 mulscc %o4, %o1, %o4 ! 10
107 mulscc %o4, %o1, %o4 ! 11
108 mulscc %o4, %o1, %o4 ! 12
109 mulscc %o4, %g0, %o4 ! final shift
110
111 /*
112 * %o4 has 20 of the bits that should be in the low part of the
113 * result; %y has the bottom 12 (as %y's top 12). That is:
114 *
115 * %o4 %y
116 * +----------------+----------------+
117 * | -12- | -20- | -12- | -20- |
118 * +------(---------+------)---------+
119 * --hi-- ----low-part----
120 *
121 * The upper 12 bits of %o4 should be sign-extended to form the
122 * high part of the product (i.e., highpart = %o4 >> 20).
123 */
124
125 rd %y, %o5
126 sll %o4, 12, %o0 ! shift middle bits left 12
127 srl %o5, 20, %o5 ! shift low bits right 20, zero fill at left
128 or %o5, %o0, %o0 ! construct low part of result
129 retl
130 sra %o4, 20, %o1 ! ... and extract high part of result
131
132 .globl .mul_patch
133.mul_patch:
134 smul %o0, %o1, %o0
135 retl
136 rd %y, %o1
137 nop
diff --git a/arch/sparc/lib/muldi3.S b/arch/sparc/lib/muldi3.S
index 7f17872d0603..9794939d1c12 100644
--- a/arch/sparc/lib/muldi3.S
+++ b/arch/sparc/lib/muldi3.S
@@ -63,12 +63,12 @@ __muldi3:
63 rd %y, %o1 63 rd %y, %o1
64 mov %o1, %l3 64 mov %o1, %l3
65 mov %i1, %o0 65 mov %i1, %o0
66 call .umul
67 mov %i2, %o1 66 mov %i2, %o1
67 umul %o0, %o1, %o0
68 mov %o0, %l0 68 mov %o0, %l0
69 mov %i0, %o0 69 mov %i0, %o0
70 call .umul
71 mov %i3, %o1 70 mov %i3, %o1
71 umul %o0, %o1, %o0
72 add %l0, %o0, %l0 72 add %l0, %o0, %l0
73 mov %l2, %i0 73 mov %l2, %i0
74 add %l2, %l0, %i0 74 add %l2, %l0, %i0
diff --git a/arch/sparc/lib/rem.S b/arch/sparc/lib/rem.S
deleted file mode 100644
index 42fb86252815..000000000000
--- a/arch/sparc/lib/rem.S
+++ /dev/null
@@ -1,384 +0,0 @@
1/*
2 * rem.S: This routine was taken from glibc-1.09 and is covered
3 * by the GNU Library General Public License Version 2.
4 */
5
6
7/* This file is generated from divrem.m4; DO NOT EDIT! */
8/*
9 * Division and remainder, from Appendix E of the Sparc Version 8
10 * Architecture Manual, with fixes from Gordon Irlam.
11 */
12
13/*
14 * Input: dividend and divisor in %o0 and %o1 respectively.
15 *
16 * m4 parameters:
17 * .rem name of function to generate
18 * rem rem=div => %o0 / %o1; rem=rem => %o0 % %o1
19 * true true=true => signed; true=false => unsigned
20 *
21 * Algorithm parameters:
22 * N how many bits per iteration we try to get (4)
23 * WORDSIZE total number of bits (32)
24 *
25 * Derived constants:
26 * TOPBITS number of bits in the top decade of a number
27 *
28 * Important variables:
29 * Q the partial quotient under development (initially 0)
30 * R the remainder so far, initially the dividend
31 * ITER number of main division loop iterations required;
32 * equal to ceil(log2(quotient) / N). Note that this
33 * is the log base (2^N) of the quotient.
34 * V the current comparand, initially divisor*2^(ITER*N-1)
35 *
36 * Cost:
37 * Current estimate for non-large dividend is
38 * ceil(log2(quotient) / N) * (10 + 7N/2) + C
39 * A large dividend is one greater than 2^(31-TOPBITS) and takes a
40 * different path, as the upper bits of the quotient must be developed
41 * one bit at a time.
42 */
43
44
45 .globl .rem
46 .globl _Rem
47.rem:
48_Rem: /* needed for export */
49 ! compute sign of result; if neither is negative, no problem
50 orcc %o1, %o0, %g0 ! either negative?
51 bge 2f ! no, go do the divide
52 mov %o0, %g2 ! compute sign in any case
53
54 tst %o1
55 bge 1f
56 tst %o0
57 ! %o1 is definitely negative; %o0 might also be negative
58 bge 2f ! if %o0 not negative...
59 sub %g0, %o1, %o1 ! in any case, make %o1 nonneg
601: ! %o0 is negative, %o1 is nonnegative
61 sub %g0, %o0, %o0 ! make %o0 nonnegative
622:
63
64 ! Ready to divide. Compute size of quotient; scale comparand.
65 orcc %o1, %g0, %o5
66 bne 1f
67 mov %o0, %o3
68
69 ! Divide by zero trap. If it returns, return 0 (about as
70 ! wrong as possible, but that is what SunOS does...).
71 ta ST_DIV0
72 retl
73 clr %o0
74
751:
76 cmp %o3, %o5 ! if %o1 exceeds %o0, done
77 blu Lgot_result ! (and algorithm fails otherwise)
78 clr %o2
79
80 sethi %hi(1 << (32 - 4 - 1)), %g1
81
82 cmp %o3, %g1
83 blu Lnot_really_big
84 clr %o4
85
86 ! Here the dividend is >= 2**(31-N) or so. We must be careful here,
87 ! as our usual N-at-a-shot divide step will cause overflow and havoc.
88 ! The number of bits in the result here is N*ITER+SC, where SC <= N.
89 ! Compute ITER in an unorthodox manner: know we need to shift V into
90 ! the top decade: so do not even bother to compare to R.
91 1:
92 cmp %o5, %g1
93 bgeu 3f
94 mov 1, %g7
95
96 sll %o5, 4, %o5
97
98 b 1b
99 add %o4, 1, %o4
100
101 ! Now compute %g7.
102 2:
103 addcc %o5, %o5, %o5
104
105 bcc Lnot_too_big
106 add %g7, 1, %g7
107
108 ! We get here if the %o1 overflowed while shifting.
109 ! This means that %o3 has the high-order bit set.
110 ! Restore %o5 and subtract from %o3.
111 sll %g1, 4, %g1 ! high order bit
112 srl %o5, 1, %o5 ! rest of %o5
113 add %o5, %g1, %o5
114
115 b Ldo_single_div
116 sub %g7, 1, %g7
117
118 Lnot_too_big:
119 3:
120 cmp %o5, %o3
121 blu 2b
122 nop
123
124 be Ldo_single_div
125 nop
126 /* NB: these are commented out in the V8-Sparc manual as well */
127 /* (I do not understand this) */
128 ! %o5 > %o3: went too far: back up 1 step
129 ! srl %o5, 1, %o5
130 ! dec %g7
131 ! do single-bit divide steps
132 !
133 ! We have to be careful here. We know that %o3 >= %o5, so we can do the
134 ! first divide step without thinking. BUT, the others are conditional,
135 ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
136 ! order bit set in the first step, just falling into the regular
137 ! division loop will mess up the first time around.
138 ! So we unroll slightly...
139 Ldo_single_div:
140 subcc %g7, 1, %g7
141 bl Lend_regular_divide
142 nop
143
144 sub %o3, %o5, %o3
145 mov 1, %o2
146
147 b Lend_single_divloop
148 nop
149 Lsingle_divloop:
150 sll %o2, 1, %o2
151
152 bl 1f
153 srl %o5, 1, %o5
154 ! %o3 >= 0
155 sub %o3, %o5, %o3
156
157 b 2f
158 add %o2, 1, %o2
159 1: ! %o3 < 0
160 add %o3, %o5, %o3
161 sub %o2, 1, %o2
162 2:
163 Lend_single_divloop:
164 subcc %g7, 1, %g7
165 bge Lsingle_divloop
166 tst %o3
167
168 b,a Lend_regular_divide
169
170Lnot_really_big:
1711:
172 sll %o5, 4, %o5
173 cmp %o5, %o3
174 bleu 1b
175 addcc %o4, 1, %o4
176 be Lgot_result
177 sub %o4, 1, %o4
178
179 tst %o3 ! set up for initial iteration
180Ldivloop:
181 sll %o2, 4, %o2
182 ! depth 1, accumulated bits 0
183 bl L.1.16
184 srl %o5,1,%o5
185 ! remainder is positive
186 subcc %o3,%o5,%o3
187 ! depth 2, accumulated bits 1
188 bl L.2.17
189 srl %o5,1,%o5
190 ! remainder is positive
191 subcc %o3,%o5,%o3
192 ! depth 3, accumulated bits 3
193 bl L.3.19
194 srl %o5,1,%o5
195 ! remainder is positive
196 subcc %o3,%o5,%o3
197 ! depth 4, accumulated bits 7
198 bl L.4.23
199 srl %o5,1,%o5
200 ! remainder is positive
201 subcc %o3,%o5,%o3
202
203 b 9f
204 add %o2, (7*2+1), %o2
205
206L.4.23:
207 ! remainder is negative
208 addcc %o3,%o5,%o3
209 b 9f
210 add %o2, (7*2-1), %o2
211
212L.3.19:
213 ! remainder is negative
214 addcc %o3,%o5,%o3
215 ! depth 4, accumulated bits 5
216 bl L.4.21
217 srl %o5,1,%o5
218 ! remainder is positive
219 subcc %o3,%o5,%o3
220 b 9f
221 add %o2, (5*2+1), %o2
222
223L.4.21:
224 ! remainder is negative
225 addcc %o3,%o5,%o3
226 b 9f
227 add %o2, (5*2-1), %o2
228
229L.2.17:
230 ! remainder is negative
231 addcc %o3,%o5,%o3
232 ! depth 3, accumulated bits 1
233 bl L.3.17
234 srl %o5,1,%o5
235 ! remainder is positive
236 subcc %o3,%o5,%o3
237 ! depth 4, accumulated bits 3
238 bl L.4.19
239 srl %o5,1,%o5
240 ! remainder is positive
241 subcc %o3,%o5,%o3
242 b 9f
243 add %o2, (3*2+1), %o2
244
245L.4.19:
246 ! remainder is negative
247 addcc %o3,%o5,%o3
248 b 9f
249 add %o2, (3*2-1), %o2
250
251L.3.17:
252 ! remainder is negative
253 addcc %o3,%o5,%o3
254 ! depth 4, accumulated bits 1
255 bl L.4.17
256 srl %o5,1,%o5
257 ! remainder is positive
258 subcc %o3,%o5,%o3
259 b 9f
260 add %o2, (1*2+1), %o2
261
262L.4.17:
263 ! remainder is negative
264 addcc %o3,%o5,%o3
265 b 9f
266 add %o2, (1*2-1), %o2
267
268L.1.16:
269 ! remainder is negative
270 addcc %o3,%o5,%o3
271 ! depth 2, accumulated bits -1
272 bl L.2.15
273 srl %o5,1,%o5
274 ! remainder is positive
275 subcc %o3,%o5,%o3
276 ! depth 3, accumulated bits -1
277 bl L.3.15
278 srl %o5,1,%o5
279 ! remainder is positive
280 subcc %o3,%o5,%o3
281 ! depth 4, accumulated bits -1
282 bl L.4.15
283 srl %o5,1,%o5
284 ! remainder is positive
285 subcc %o3,%o5,%o3
286 b 9f
287 add %o2, (-1*2+1), %o2
288
289L.4.15:
290 ! remainder is negative
291 addcc %o3,%o5,%o3
292 b 9f
293 add %o2, (-1*2-1), %o2
294
295L.3.15:
296 ! remainder is negative
297 addcc %o3,%o5,%o3
298 ! depth 4, accumulated bits -3
299 bl L.4.13
300 srl %o5,1,%o5
301 ! remainder is positive
302 subcc %o3,%o5,%o3
303 b 9f
304 add %o2, (-3*2+1), %o2
305
306L.4.13:
307 ! remainder is negative
308 addcc %o3,%o5,%o3
309 b 9f
310 add %o2, (-3*2-1), %o2
311
312L.2.15:
313 ! remainder is negative
314 addcc %o3,%o5,%o3
315 ! depth 3, accumulated bits -3
316 bl L.3.13
317 srl %o5,1,%o5
318 ! remainder is positive
319 subcc %o3,%o5,%o3
320 ! depth 4, accumulated bits -5
321 bl L.4.11
322 srl %o5,1,%o5
323 ! remainder is positive
324 subcc %o3,%o5,%o3
325 b 9f
326 add %o2, (-5*2+1), %o2
327
328L.4.11:
329 ! remainder is negative
330 addcc %o3,%o5,%o3
331 b 9f
332 add %o2, (-5*2-1), %o2
333
334
335L.3.13:
336 ! remainder is negative
337 addcc %o3,%o5,%o3
338 ! depth 4, accumulated bits -7
339 bl L.4.9
340 srl %o5,1,%o5
341 ! remainder is positive
342 subcc %o3,%o5,%o3
343 b 9f
344 add %o2, (-7*2+1), %o2
345
346L.4.9:
347 ! remainder is negative
348 addcc %o3,%o5,%o3
349 b 9f
350 add %o2, (-7*2-1), %o2
351
352 9:
353Lend_regular_divide:
354 subcc %o4, 1, %o4
355 bge Ldivloop
356 tst %o3
357
358 bl,a Lgot_result
359 ! non-restoring fixup here (one instruction only!)
360 add %o3, %o1, %o3
361
362Lgot_result:
363 ! check to see if answer should be < 0
364 tst %g2
365 bl,a 1f
366 sub %g0, %o3, %o3
3671:
368 retl
369 mov %o3, %o0
370
371 .globl .rem_patch
372.rem_patch:
373 sra %o0, 0x1f, %o4
374 wr %o4, 0x0, %y
375 nop
376 nop
377 nop
378 sdivcc %o0, %o1, %o2
379 bvs,a 1f
380 xnor %o2, %g0, %o2
3811: smul %o2, %o1, %o2
382 retl
383 sub %o0, %o2, %o0
384 nop
diff --git a/arch/sparc/lib/sdiv.S b/arch/sparc/lib/sdiv.S
deleted file mode 100644
index f0a0d4e4db78..000000000000
--- a/arch/sparc/lib/sdiv.S
+++ /dev/null
@@ -1,381 +0,0 @@
1/*
2 * sdiv.S: This routine was taken from glibc-1.09 and is covered
3 * by the GNU Library General Public License Version 2.
4 */
5
6
7/* This file is generated from divrem.m4; DO NOT EDIT! */
8/*
9 * Division and remainder, from Appendix E of the Sparc Version 8
10 * Architecture Manual, with fixes from Gordon Irlam.
11 */
12
13/*
14 * Input: dividend and divisor in %o0 and %o1 respectively.
15 *
16 * m4 parameters:
17 * .div name of function to generate
18 * div div=div => %o0 / %o1; div=rem => %o0 % %o1
19 * true true=true => signed; true=false => unsigned
20 *
21 * Algorithm parameters:
22 * N how many bits per iteration we try to get (4)
23 * WORDSIZE total number of bits (32)
24 *
25 * Derived constants:
26 * TOPBITS number of bits in the top decade of a number
27 *
28 * Important variables:
29 * Q the partial quotient under development (initially 0)
30 * R the remainder so far, initially the dividend
31 * ITER number of main division loop iterations required;
32 * equal to ceil(log2(quotient) / N). Note that this
33 * is the log base (2^N) of the quotient.
34 * V the current comparand, initially divisor*2^(ITER*N-1)
35 *
36 * Cost:
37 * Current estimate for non-large dividend is
38 * ceil(log2(quotient) / N) * (10 + 7N/2) + C
39 * A large dividend is one greater than 2^(31-TOPBITS) and takes a
40 * different path, as the upper bits of the quotient must be developed
41 * one bit at a time.
42 */
43
44
45 .globl .div
46 .globl _Div
47.div:
48_Div: /* needed for export */
49 ! compute sign of result; if neither is negative, no problem
50 orcc %o1, %o0, %g0 ! either negative?
51 bge 2f ! no, go do the divide
52 xor %o1, %o0, %g2 ! compute sign in any case
53
54 tst %o1
55 bge 1f
56 tst %o0
57 ! %o1 is definitely negative; %o0 might also be negative
58 bge 2f ! if %o0 not negative...
59 sub %g0, %o1, %o1 ! in any case, make %o1 nonneg
601: ! %o0 is negative, %o1 is nonnegative
61 sub %g0, %o0, %o0 ! make %o0 nonnegative
622:
63
64 ! Ready to divide. Compute size of quotient; scale comparand.
65 orcc %o1, %g0, %o5
66 bne 1f
67 mov %o0, %o3
68
69 ! Divide by zero trap. If it returns, return 0 (about as
70 ! wrong as possible, but that is what SunOS does...).
71 ta ST_DIV0
72 retl
73 clr %o0
74
751:
76 cmp %o3, %o5 ! if %o1 exceeds %o0, done
77 blu Lgot_result ! (and algorithm fails otherwise)
78 clr %o2
79
80 sethi %hi(1 << (32 - 4 - 1)), %g1
81
82 cmp %o3, %g1
83 blu Lnot_really_big
84 clr %o4
85
86 ! Here the dividend is >= 2**(31-N) or so. We must be careful here,
87 ! as our usual N-at-a-shot divide step will cause overflow and havoc.
88 ! The number of bits in the result here is N*ITER+SC, where SC <= N.
89 ! Compute ITER in an unorthodox manner: know we need to shift V into
90 ! the top decade: so do not even bother to compare to R.
91 1:
92 cmp %o5, %g1
93 bgeu 3f
94 mov 1, %g7
95
96 sll %o5, 4, %o5
97
98 b 1b
99 add %o4, 1, %o4
100
101 ! Now compute %g7.
102 2:
103 addcc %o5, %o5, %o5
104 bcc Lnot_too_big
105 add %g7, 1, %g7
106
107 ! We get here if the %o1 overflowed while shifting.
108 ! This means that %o3 has the high-order bit set.
109 ! Restore %o5 and subtract from %o3.
110 sll %g1, 4, %g1 ! high order bit
111 srl %o5, 1, %o5 ! rest of %o5
112 add %o5, %g1, %o5
113
114 b Ldo_single_div
115 sub %g7, 1, %g7
116
117 Lnot_too_big:
118 3:
119 cmp %o5, %o3
120 blu 2b
121 nop
122
123 be Ldo_single_div
124 nop
125 /* NB: these are commented out in the V8-Sparc manual as well */
126 /* (I do not understand this) */
127 ! %o5 > %o3: went too far: back up 1 step
128 ! srl %o5, 1, %o5
129 ! dec %g7
130 ! do single-bit divide steps
131 !
132 ! We have to be careful here. We know that %o3 >= %o5, so we can do the
133 ! first divide step without thinking. BUT, the others are conditional,
134 ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
135 ! order bit set in the first step, just falling into the regular
136 ! division loop will mess up the first time around.
137 ! So we unroll slightly...
138 Ldo_single_div:
139 subcc %g7, 1, %g7
140 bl Lend_regular_divide
141 nop
142
143 sub %o3, %o5, %o3
144 mov 1, %o2
145
146 b Lend_single_divloop
147 nop
148 Lsingle_divloop:
149 sll %o2, 1, %o2
150
151 bl 1f
152 srl %o5, 1, %o5
153 ! %o3 >= 0
154 sub %o3, %o5, %o3
155
156 b 2f
157 add %o2, 1, %o2
158 1: ! %o3 < 0
159 add %o3, %o5, %o3
160 sub %o2, 1, %o2
161 2:
162 Lend_single_divloop:
163 subcc %g7, 1, %g7
164 bge Lsingle_divloop
165 tst %o3
166
167 b,a Lend_regular_divide
168
169Lnot_really_big:
1701:
171 sll %o5, 4, %o5
172 cmp %o5, %o3
173 bleu 1b
174 addcc %o4, 1, %o4
175
176 be Lgot_result
177 sub %o4, 1, %o4
178
179 tst %o3 ! set up for initial iteration
180Ldivloop:
181 sll %o2, 4, %o2
182 ! depth 1, accumulated bits 0
183 bl L.1.16
184 srl %o5,1,%o5
185 ! remainder is positive
186 subcc %o3,%o5,%o3
187 ! depth 2, accumulated bits 1
188 bl L.2.17
189 srl %o5,1,%o5
190 ! remainder is positive
191 subcc %o3,%o5,%o3
192 ! depth 3, accumulated bits 3
193 bl L.3.19
194 srl %o5,1,%o5
195 ! remainder is positive
196 subcc %o3,%o5,%o3
197 ! depth 4, accumulated bits 7
198 bl L.4.23
199 srl %o5,1,%o5
200 ! remainder is positive
201 subcc %o3,%o5,%o3
202 b 9f
203 add %o2, (7*2+1), %o2
204
205L.4.23:
206 ! remainder is negative
207 addcc %o3,%o5,%o3
208 b 9f
209 add %o2, (7*2-1), %o2
210
211L.3.19:
212 ! remainder is negative
213 addcc %o3,%o5,%o3
214 ! depth 4, accumulated bits 5
215 bl L.4.21
216 srl %o5,1,%o5
217 ! remainder is positive
218 subcc %o3,%o5,%o3
219 b 9f
220 add %o2, (5*2+1), %o2
221
222L.4.21:
223 ! remainder is negative
224 addcc %o3,%o5,%o3
225 b 9f
226 add %o2, (5*2-1), %o2
227
228L.2.17:
229 ! remainder is negative
230 addcc %o3,%o5,%o3
231 ! depth 3, accumulated bits 1
232 bl L.3.17
233 srl %o5,1,%o5
234 ! remainder is positive
235 subcc %o3,%o5,%o3
236 ! depth 4, accumulated bits 3
237 bl L.4.19
238 srl %o5,1,%o5
239 ! remainder is positive
240 subcc %o3,%o5,%o3
241 b 9f
242 add %o2, (3*2+1), %o2
243
244L.4.19:
245 ! remainder is negative
246 addcc %o3,%o5,%o3
247 b 9f
248 add %o2, (3*2-1), %o2
249
250
251L.3.17:
252 ! remainder is negative
253 addcc %o3,%o5,%o3
254 ! depth 4, accumulated bits 1
255 bl L.4.17
256 srl %o5,1,%o5
257 ! remainder is positive
258 subcc %o3,%o5,%o3
259 b 9f
260 add %o2, (1*2+1), %o2
261
262L.4.17:
263 ! remainder is negative
264 addcc %o3,%o5,%o3
265 b 9f
266 add %o2, (1*2-1), %o2
267
268L.1.16:
269 ! remainder is negative
270 addcc %o3,%o5,%o3
271 ! depth 2, accumulated bits -1
272 bl L.2.15
273 srl %o5,1,%o5
274 ! remainder is positive
275 subcc %o3,%o5,%o3
276 ! depth 3, accumulated bits -1
277 bl L.3.15
278 srl %o5,1,%o5
279 ! remainder is positive
280 subcc %o3,%o5,%o3
281 ! depth 4, accumulated bits -1
282 bl L.4.15
283 srl %o5,1,%o5
284 ! remainder is positive
285 subcc %o3,%o5,%o3
286 b 9f
287 add %o2, (-1*2+1), %o2
288
289L.4.15:
290 ! remainder is negative
291 addcc %o3,%o5,%o3
292 b 9f
293 add %o2, (-1*2-1), %o2
294
295L.3.15:
296 ! remainder is negative
297 addcc %o3,%o5,%o3
298 ! depth 4, accumulated bits -3
299 bl L.4.13
300 srl %o5,1,%o5
301 ! remainder is positive
302 subcc %o3,%o5,%o3
303 b 9f
304 add %o2, (-3*2+1), %o2
305
306L.4.13:
307 ! remainder is negative
308 addcc %o3,%o5,%o3
309 b 9f
310 add %o2, (-3*2-1), %o2
311
312L.2.15:
313 ! remainder is negative
314 addcc %o3,%o5,%o3
315 ! depth 3, accumulated bits -3
316 bl L.3.13
317 srl %o5,1,%o5
318 ! remainder is positive
319 subcc %o3,%o5,%o3
320 ! depth 4, accumulated bits -5
321 bl L.4.11
322 srl %o5,1,%o5
323 ! remainder is positive
324 subcc %o3,%o5,%o3
325 b 9f
326 add %o2, (-5*2+1), %o2
327
328L.4.11:
329 ! remainder is negative
330 addcc %o3,%o5,%o3
331 b 9f
332 add %o2, (-5*2-1), %o2
333
334L.3.13:
335 ! remainder is negative
336 addcc %o3,%o5,%o3
337 ! depth 4, accumulated bits -7
338 bl L.4.9
339 srl %o5,1,%o5
340 ! remainder is positive
341 subcc %o3,%o5,%o3
342 b 9f
343 add %o2, (-7*2+1), %o2
344
345L.4.9:
346 ! remainder is negative
347 addcc %o3,%o5,%o3
348 b 9f
349 add %o2, (-7*2-1), %o2
350
351 9:
352Lend_regular_divide:
353 subcc %o4, 1, %o4
354 bge Ldivloop
355 tst %o3
356
357 bl,a Lgot_result
358 ! non-restoring fixup here (one instruction only!)
359 sub %o2, 1, %o2
360
361Lgot_result:
362 ! check to see if answer should be < 0
363 tst %g2
364 bl,a 1f
365 sub %g0, %o2, %o2
3661:
367 retl
368 mov %o2, %o0
369
370 .globl .div_patch
371.div_patch:
372 sra %o0, 0x1f, %o2
373 wr %o2, 0x0, %y
374 nop
375 nop
376 nop
377 sdivcc %o0, %o1, %o0
378 bvs,a 1f
379 xnor %o0, %g0, %o0
3801: retl
381 nop
diff --git a/arch/sparc/lib/udiv.S b/arch/sparc/lib/udiv.S
deleted file mode 100644
index 2101405bdfcb..000000000000
--- a/arch/sparc/lib/udiv.S
+++ /dev/null
@@ -1,357 +0,0 @@
1/*
2 * udiv.S: This routine was taken from glibc-1.09 and is covered
3 * by the GNU Library General Public License Version 2.
4 */
5
6
7/* This file is generated from divrem.m4; DO NOT EDIT! */
8/*
9 * Division and remainder, from Appendix E of the Sparc Version 8
10 * Architecture Manual, with fixes from Gordon Irlam.
11 */
12
13/*
14 * Input: dividend and divisor in %o0 and %o1 respectively.
15 *
16 * m4 parameters:
17 * .udiv name of function to generate
18 * div div=div => %o0 / %o1; div=rem => %o0 % %o1
19 * false false=true => signed; false=false => unsigned
20 *
21 * Algorithm parameters:
22 * N how many bits per iteration we try to get (4)
23 * WORDSIZE total number of bits (32)
24 *
25 * Derived constants:
26 * TOPBITS number of bits in the top decade of a number
27 *
28 * Important variables:
29 * Q the partial quotient under development (initially 0)
30 * R the remainder so far, initially the dividend
31 * ITER number of main division loop iterations required;
32 * equal to ceil(log2(quotient) / N). Note that this
33 * is the log base (2^N) of the quotient.
34 * V the current comparand, initially divisor*2^(ITER*N-1)
35 *
36 * Cost:
37 * Current estimate for non-large dividend is
38 * ceil(log2(quotient) / N) * (10 + 7N/2) + C
39 * A large dividend is one greater than 2^(31-TOPBITS) and takes a
40 * different path, as the upper bits of the quotient must be developed
41 * one bit at a time.
42 */
43
44
45 .globl .udiv
46 .globl _Udiv
47.udiv:
48_Udiv: /* needed for export */
49
50 ! Ready to divide. Compute size of quotient; scale comparand.
51 orcc %o1, %g0, %o5
52 bne 1f
53 mov %o0, %o3
54
55 ! Divide by zero trap. If it returns, return 0 (about as
56 ! wrong as possible, but that is what SunOS does...).
57 ta ST_DIV0
58 retl
59 clr %o0
60
611:
62 cmp %o3, %o5 ! if %o1 exceeds %o0, done
63 blu Lgot_result ! (and algorithm fails otherwise)
64 clr %o2
65
66 sethi %hi(1 << (32 - 4 - 1)), %g1
67
68 cmp %o3, %g1
69 blu Lnot_really_big
70 clr %o4
71
72 ! Here the dividend is >= 2**(31-N) or so. We must be careful here,
73 ! as our usual N-at-a-shot divide step will cause overflow and havoc.
74 ! The number of bits in the result here is N*ITER+SC, where SC <= N.
75 ! Compute ITER in an unorthodox manner: know we need to shift V into
76 ! the top decade: so do not even bother to compare to R.
77 1:
78 cmp %o5, %g1
79 bgeu 3f
80 mov 1, %g7
81
82 sll %o5, 4, %o5
83
84 b 1b
85 add %o4, 1, %o4
86
87 ! Now compute %g7.
88 2:
89 addcc %o5, %o5, %o5
90 bcc Lnot_too_big
91 add %g7, 1, %g7
92
93 ! We get here if the %o1 overflowed while shifting.
94 ! This means that %o3 has the high-order bit set.
95 ! Restore %o5 and subtract from %o3.
96 sll %g1, 4, %g1 ! high order bit
97 srl %o5, 1, %o5 ! rest of %o5
98 add %o5, %g1, %o5
99
100 b Ldo_single_div
101 sub %g7, 1, %g7
102
103 Lnot_too_big:
104 3:
105 cmp %o5, %o3
106 blu 2b
107 nop
108
109 be Ldo_single_div
110 nop
111 /* NB: these are commented out in the V8-Sparc manual as well */
112 /* (I do not understand this) */
113 ! %o5 > %o3: went too far: back up 1 step
114 ! srl %o5, 1, %o5
115 ! dec %g7
116 ! do single-bit divide steps
117 !
118 ! We have to be careful here. We know that %o3 >= %o5, so we can do the
119 ! first divide step without thinking. BUT, the others are conditional,
120 ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
121 ! order bit set in the first step, just falling into the regular
122 ! division loop will mess up the first time around.
123 ! So we unroll slightly...
124 Ldo_single_div:
125 subcc %g7, 1, %g7
126 bl Lend_regular_divide
127 nop
128
129 sub %o3, %o5, %o3
130 mov 1, %o2
131
132 b Lend_single_divloop
133 nop
134 Lsingle_divloop:
135 sll %o2, 1, %o2
136 bl 1f
137 srl %o5, 1, %o5
138 ! %o3 >= 0
139 sub %o3, %o5, %o3
140 b 2f
141 add %o2, 1, %o2
142 1: ! %o3 < 0
143 add %o3, %o5, %o3
144 sub %o2, 1, %o2
145 2:
146 Lend_single_divloop:
147 subcc %g7, 1, %g7
148 bge Lsingle_divloop
149 tst %o3
150
151 b,a Lend_regular_divide
152
153Lnot_really_big:
1541:
155 sll %o5, 4, %o5
156
157 cmp %o5, %o3
158 bleu 1b
159 addcc %o4, 1, %o4
160
161 be Lgot_result
162 sub %o4, 1, %o4
163
164 tst %o3 ! set up for initial iteration
165Ldivloop:
166 sll %o2, 4, %o2
167 ! depth 1, accumulated bits 0
168 bl L.1.16
169 srl %o5,1,%o5
170 ! remainder is positive
171 subcc %o3,%o5,%o3
172 ! depth 2, accumulated bits 1
173 bl L.2.17
174 srl %o5,1,%o5
175 ! remainder is positive
176 subcc %o3,%o5,%o3
177 ! depth 3, accumulated bits 3
178 bl L.3.19
179 srl %o5,1,%o5
180 ! remainder is positive
181 subcc %o3,%o5,%o3
182 ! depth 4, accumulated bits 7
183 bl L.4.23
184 srl %o5,1,%o5
185 ! remainder is positive
186 subcc %o3,%o5,%o3
187 b 9f
188 add %o2, (7*2+1), %o2
189
190L.4.23:
191 ! remainder is negative
192 addcc %o3,%o5,%o3
193 b 9f
194 add %o2, (7*2-1), %o2
195
196L.3.19:
197 ! remainder is negative
198 addcc %o3,%o5,%o3
199 ! depth 4, accumulated bits 5
200 bl L.4.21
201 srl %o5,1,%o5
202 ! remainder is positive
203 subcc %o3,%o5,%o3
204 b 9f
205 add %o2, (5*2+1), %o2
206
207L.4.21:
208 ! remainder is negative
209 addcc %o3,%o5,%o3
210 b 9f
211 add %o2, (5*2-1), %o2
212
213L.2.17:
214 ! remainder is negative
215 addcc %o3,%o5,%o3
216 ! depth 3, accumulated bits 1
217 bl L.3.17
218 srl %o5,1,%o5
219 ! remainder is positive
220 subcc %o3,%o5,%o3
221 ! depth 4, accumulated bits 3
222 bl L.4.19
223 srl %o5,1,%o5
224 ! remainder is positive
225 subcc %o3,%o5,%o3
226 b 9f
227 add %o2, (3*2+1), %o2
228
229L.4.19:
230 ! remainder is negative
231 addcc %o3,%o5,%o3
232 b 9f
233 add %o2, (3*2-1), %o2
234
235L.3.17:
236 ! remainder is negative
237 addcc %o3,%o5,%o3
238 ! depth 4, accumulated bits 1
239 bl L.4.17
240 srl %o5,1,%o5
241 ! remainder is positive
242 subcc %o3,%o5,%o3
243 b 9f
244 add %o2, (1*2+1), %o2
245
246L.4.17:
247 ! remainder is negative
248 addcc %o3,%o5,%o3
249 b 9f
250 add %o2, (1*2-1), %o2
251
252L.1.16:
253 ! remainder is negative
254 addcc %o3,%o5,%o3
255 ! depth 2, accumulated bits -1
256 bl L.2.15
257 srl %o5,1,%o5
258 ! remainder is positive
259 subcc %o3,%o5,%o3
260 ! depth 3, accumulated bits -1
261 bl L.3.15
262 srl %o5,1,%o5
263 ! remainder is positive
264 subcc %o3,%o5,%o3
265 ! depth 4, accumulated bits -1
266 bl L.4.15
267 srl %o5,1,%o5
268 ! remainder is positive
269 subcc %o3,%o5,%o3
270 b 9f
271 add %o2, (-1*2+1), %o2
272
273L.4.15:
274 ! remainder is negative
275 addcc %o3,%o5,%o3
276 b 9f
277 add %o2, (-1*2-1), %o2
278
279L.3.15:
280 ! remainder is negative
281 addcc %o3,%o5,%o3
282 ! depth 4, accumulated bits -3
283 bl L.4.13
284 srl %o5,1,%o5
285 ! remainder is positive
286 subcc %o3,%o5,%o3
287 b 9f
288 add %o2, (-3*2+1), %o2
289
290L.4.13:
291 ! remainder is negative
292 addcc %o3,%o5,%o3
293 b 9f
294 add %o2, (-3*2-1), %o2
295
296L.2.15:
297 ! remainder is negative
298 addcc %o3,%o5,%o3
299 ! depth 3, accumulated bits -3
300 bl L.3.13
301 srl %o5,1,%o5
302 ! remainder is positive
303 subcc %o3,%o5,%o3
304 ! depth 4, accumulated bits -5
305 bl L.4.11
306 srl %o5,1,%o5
307 ! remainder is positive
308 subcc %o3,%o5,%o3
309 b 9f
310 add %o2, (-5*2+1), %o2
311
312L.4.11:
313 ! remainder is negative
314 addcc %o3,%o5,%o3
315 b 9f
316 add %o2, (-5*2-1), %o2
317
318L.3.13:
319 ! remainder is negative
320 addcc %o3,%o5,%o3
321 ! depth 4, accumulated bits -7
322 bl L.4.9
323 srl %o5,1,%o5
324 ! remainder is positive
325 subcc %o3,%o5,%o3
326 b 9f
327 add %o2, (-7*2+1), %o2
328
329L.4.9:
330 ! remainder is negative
331 addcc %o3,%o5,%o3
332 b 9f
333 add %o2, (-7*2-1), %o2
334
335 9:
336Lend_regular_divide:
337 subcc %o4, 1, %o4
338 bge Ldivloop
339 tst %o3
340
341 bl,a Lgot_result
342 ! non-restoring fixup here (one instruction only!)
343 sub %o2, 1, %o2
344
345Lgot_result:
346
347 retl
348 mov %o2, %o0
349
350 .globl .udiv_patch
351.udiv_patch:
352 wr %g0, 0x0, %y
353 nop
354 nop
355 retl
356 udiv %o0, %o1, %o0
357 nop
diff --git a/arch/sparc/lib/udivdi3.S b/arch/sparc/lib/udivdi3.S
index b430f1f0ef62..24e0a355e2e8 100644
--- a/arch/sparc/lib/udivdi3.S
+++ b/arch/sparc/lib/udivdi3.S
@@ -60,8 +60,9 @@ __udivdi3:
60 bne .LL77 60 bne .LL77
61 mov %i0,%o2 61 mov %i0,%o2
62 mov 1,%o0 62 mov 1,%o0
63 call .udiv,0
64 mov 0,%o1 63 mov 0,%o1
64 wr %g0, 0, %y
65 udiv %o0, %o1, %o0
65 mov %o0,%o3 66 mov %o0,%o3
66 mov %i0,%o2 67 mov %i0,%o2
67.LL77: 68.LL77:
diff --git a/arch/sparc/lib/umul.S b/arch/sparc/lib/umul.S
deleted file mode 100644
index 1f36ae682529..000000000000
--- a/arch/sparc/lib/umul.S
+++ /dev/null
@@ -1,171 +0,0 @@
1/*
2 * umul.S: This routine was taken from glibc-1.09 and is covered
3 * by the GNU Library General Public License Version 2.
4 */
5
6
7/*
8 * Unsigned multiply. Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the
9 * upper 32 bits of the 64-bit product).
10 *
11 * This code optimizes short (less than 13-bit) multiplies. Short
12 * multiplies require 25 instruction cycles, and long ones require
13 * 45 instruction cycles.
14 *
15 * On return, overflow has occurred (%o1 is not zero) if and only if
16 * the Z condition code is clear, allowing, e.g., the following:
17 *
18 * call .umul
19 * nop
20 * bnz overflow (or tnz)
21 */
22
23 .globl .umul
24 .globl _Umul
25.umul: /* in: %o0, %o1 = unsigned operands; out: %o0 = low word, %o1 = high word of the product, Z set when %o1 == 0 */
26_Umul: /* needed for export */
27 or %o0, %o1, %o4 /* %o4 = union of the bits set in either operand, used only for the size test below */
28 mov %o0, %y ! multiplier -> Y
29
30 andncc %o4, 0xfff, %g0 ! test bits 12..31 of *both* args
31 be Lmul_shortway ! if zero, can do it the short way
32 andcc %g0, %g0, %o4 ! zero the partial product and clear N and V
33
34 /*
35 * Long multiply. 32 steps, followed by a final shift step.
36 */
37 mulscc %o4, %o1, %o4 ! 1
38 mulscc %o4, %o1, %o4 ! 2
39 mulscc %o4, %o1, %o4 ! 3
40 mulscc %o4, %o1, %o4 ! 4
41 mulscc %o4, %o1, %o4 ! 5
42 mulscc %o4, %o1, %o4 ! 6
43 mulscc %o4, %o1, %o4 ! 7
44 mulscc %o4, %o1, %o4 ! 8
45 mulscc %o4, %o1, %o4 ! 9
46 mulscc %o4, %o1, %o4 ! 10
47 mulscc %o4, %o1, %o4 ! 11
48 mulscc %o4, %o1, %o4 ! 12
49 mulscc %o4, %o1, %o4 ! 13
50 mulscc %o4, %o1, %o4 ! 14
51 mulscc %o4, %o1, %o4 ! 15
52 mulscc %o4, %o1, %o4 ! 16
53 mulscc %o4, %o1, %o4 ! 17
54 mulscc %o4, %o1, %o4 ! 18
55 mulscc %o4, %o1, %o4 ! 19
56 mulscc %o4, %o1, %o4 ! 20
57 mulscc %o4, %o1, %o4 ! 21
58 mulscc %o4, %o1, %o4 ! 22
59 mulscc %o4, %o1, %o4 ! 23
60 mulscc %o4, %o1, %o4 ! 24
61 mulscc %o4, %o1, %o4 ! 25
62 mulscc %o4, %o1, %o4 ! 26
63 mulscc %o4, %o1, %o4 ! 27
64 mulscc %o4, %o1, %o4 ! 28
65 mulscc %o4, %o1, %o4 ! 29
66 mulscc %o4, %o1, %o4 ! 30
67 mulscc %o4, %o1, %o4 ! 31
68 mulscc %o4, %o1, %o4 ! 32
69 mulscc %o4, %g0, %o4 ! final shift
70
71
72 /*
73 * Normally, with the shift-and-add approach, if both numbers are
74 * positive you get the correct result. With 32-bit two's-complement
75 * numbers, -x is represented as
76 *
77 *         x               32
78 * ( 2 - ------ ) mod 2 * 2
79 *          32
80 *         2
81 *
82 * (the `mod 2' subtracts 1 from 1.bbbb). To avoid lots of 2^32s,
83 * we can treat this as if the radix point were just to the left
84 * of the sign bit (multiply by 2^32), and get
85 *
86 * -x = (2 - x) mod 2
87 *
88 * Then, ignoring the `mod 2's for convenience:
89 *
90 * x * y = xy
91 * -x * y = 2y - xy
92 * x * -y = 2x - xy
93 * -x * -y = 4 - 2x - 2y + xy
94 *
95 * For signed multiplies, we subtract (x << 32) from the partial
96 * product to fix this problem for negative multipliers (see mul.s).
97 * Because of the way the shift into the partial product is calculated
98 * (N xor V), this term is automatically removed for the multiplicand,
99 * so we don't have to adjust.
100 *
101 * But for unsigned multiplies, the high order bit wasn't a sign bit,
102 * and the correction is wrong. So for unsigned multiplies where the
103 * high order bit is one, we end up with xy - (y << 32). To fix it
104 * we add y << 32.
105 */
106#if 0
107 tst %o1
108 bl,a 1f ! if %o1 < 0 (high order bit = 1),
109 add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half)
110
1111:
112 rd %y, %o0 ! get lower half of product
113 retl
114 addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0
115#else
116 /* Faster code from tege@sics.se. */
117 sra %o1, 31, %o2 ! make mask from sign bit
118 and %o0, %o2, %o2 ! %o2 = 0 or %o0, depending on sign of %o1
119 rd %y, %o0 ! get lower half of product
120 retl
121 addcc %o4, %o2, %o1 ! add compensation and put upper half in place
122#endif
123
124Lmul_shortway:
125 /*
126 * Short multiply. 12 steps, followed by a final shift step.
127 * The resulting bits are off by 12 and (32-12) = 20 bit positions,
128 * but there is no problem with %o0 being negative (unlike above),
129 * and overflow is impossible (the answer is at most 24 bits long).
130 */
131 mulscc %o4, %o1, %o4 ! 1
132 mulscc %o4, %o1, %o4 ! 2
133 mulscc %o4, %o1, %o4 ! 3
134 mulscc %o4, %o1, %o4 ! 4
135 mulscc %o4, %o1, %o4 ! 5
136 mulscc %o4, %o1, %o4 ! 6
137 mulscc %o4, %o1, %o4 ! 7
138 mulscc %o4, %o1, %o4 ! 8
139 mulscc %o4, %o1, %o4 ! 9
140 mulscc %o4, %o1, %o4 ! 10
141 mulscc %o4, %o1, %o4 ! 11
142 mulscc %o4, %o1, %o4 ! 12
143 mulscc %o4, %g0, %o4 ! final shift
144
145 /*
146 * %o4 has 20 of the bits that should be in the result; %y has
147 * the bottom 12 (as %y's top 12). That is:
148 *
149 * %o4 %y
150 * +----------------+----------------+
151 * | -12- | -20- | -12- | -20- |
152 * +------(---------+------)---------+
153 * -----result-----
154 *
155 * The 12 bits of %o4 left of the `result' area are all zero;
156 * in fact, all top 20 bits of %o4 are zero.
157 */
158
159 rd %y, %o5
160 sll %o4, 12, %o0 ! shift middle bits left 12
161 srl %o5, 20, %o5 ! shift low bits right 20
162 or %o5, %o0, %o0
163 retl
164 addcc %g0, %g0, %o1 ! %o1 = zero, and set Z
165
166 .globl .umul_patch
167.umul_patch: /* umul-instruction version of unsigned multiply; presumably copied over .umul on CPUs with hardware multiply -- confirm against the patching code */
168 umul %o0, %o1, %o0 /* %o0 = low 32 bits of %o0 * %o1; the high 32 bits are left in %y */
169 retl /* NOTE(review): umul (not umulcc) leaves the condition codes untouched, so the Z-flag overflow convention documented for .umul is not produced here -- confirm no caller depends on it */
170 rd %y, %o1 /* delay slot: %o1 = high 32 bits of the product */
171 nop /* NOTE(review): presumably padding to a fixed template length -- confirm against the code that installs this patch */
diff --git a/arch/sparc/lib/urem.S b/arch/sparc/lib/urem.S
deleted file mode 100644
index 77123eb83c44..000000000000
--- a/arch/sparc/lib/urem.S
+++ /dev/null
@@ -1,357 +0,0 @@
1/*
2 * urem.S: This routine was taken from glibc-1.09 and is covered
3 * by the GNU Library General Public License Version 2.
4 */
5
6/* This file is generated from divrem.m4; DO NOT EDIT! */
7/*
8 * Division and remainder, from Appendix E of the Sparc Version 8
9 * Architecture Manual, with fixes from Gordon Irlam.
10 */
11
12/*
13 * Input: dividend and divisor in %o0 and %o1 respectively.
14 *
15 * m4 parameters:
16 * NAME = .urem name of function to generate
17 * OP = rem OP=div => %o0 / %o1; OP=rem => %o0 % %o1
18 * S = false S=true => signed; S=false => unsigned
19 *
20 * Algorithm parameters:
21 * N how many bits per iteration we try to get (4)
22 * WORDSIZE total number of bits (32)
23 *
24 * Derived constants:
25 * TOPBITS number of bits in the top decade of a number
26 *
27 * Important variables:
28 * Q the partial quotient under development (initially 0)
29 * R the remainder so far, initially the dividend
30 * ITER number of main division loop iterations required;
31 * equal to ceil(log2(quotient) / N). Note that this
32 * is the log base (2^N) of the quotient.
33 * V the current comparand, initially divisor*2^(ITER*N-1)
34 *
35 * Cost:
36 * Current estimate for non-large dividend is
37 * ceil(log2(quotient) / N) * (10 + 7N/2) + C
38 * A large dividend is one greater than 2^(31-TOPBITS) and takes a
39 * different path, as the upper bits of the quotient must be developed
40 * one bit at a time.
41 */
42
43 .globl .urem
44 .globl _Urem
45.urem: /* in: %o0 = dividend, %o1 = divisor; out: %o0 = %o0 % %o1 (unsigned) */
46_Urem: /* needed for export */
47
48 ! Ready to divide. Compute size of quotient; scale comparand.
49 orcc %o1, %g0, %o5 /* %o5 = V (comparand) = divisor; Z is set when the divisor is zero */
50 bne 1f
51 mov %o0, %o3 /* delay slot: %o3 = R (remainder so far) = dividend */
52
53 ! Divide by zero trap. If it returns, return 0 (about as
54 ! wrong as possible, but that is what SunOS does...).
55 ta ST_DIV0
56 retl
57 clr %o0
58
591:
60 cmp %o3, %o5 ! if %o1 exceeds %o0, done
61 blu Lgot_result ! (and algorithm fails otherwise)
62 clr %o2 /* delay slot: %o2 = Q (partial quotient) = 0 */
63
64 sethi %hi(1 << (32 - 4 - 1)), %g1 /* %g1 = 1 << 27, the "large dividend" threshold */
65
66 cmp %o3, %g1
67 blu Lnot_really_big
68 clr %o4 /* delay slot: %o4 = ITER = 0 */
69
70 ! Here the dividend is >= 2**(31-N) or so. We must be careful here,
71 ! as our usual N-at-a-shot divide step will cause overflow and havoc.
72 ! The number of bits in the result here is N*ITER+SC, where SC <= N.
73 ! Compute ITER in an unorthodox manner: know we need to shift V into
74 ! the top decade: so do not even bother to compare to R.
75 1:
76 cmp %o5, %g1
77 bgeu 3f
78 mov 1, %g7 /* delay slot: %g7 = 1, the single-bit step counter */
79
80 sll %o5, 4, %o5 /* V <<= 4 */
81
82 b 1b
83 add %o4, 1, %o4 /* delay slot: ITER++ */
84
85 ! Now compute %g7.
86 2:
87 addcc %o5, %o5, %o5 /* V <<= 1; carry is set if a bit was shifted out */
88 bcc Lnot_too_big
89 add %g7, 1, %g7
90
91 ! We get here if the %o1 overflowed while shifting.
92 ! This means that %o3 has the high-order bit set.
93 ! Restore %o5 and subtract from %o3.
94 sll %g1, 4, %g1 ! high order bit
95 srl %o5, 1, %o5 ! rest of %o5
96 add %o5, %g1, %o5
97
98 b Ldo_single_div
99 sub %g7, 1, %g7
100
101 Lnot_too_big:
102 3:
103 cmp %o5, %o3
104 blu 2b
105 nop
106
107 be Ldo_single_div
108 nop
109 /* NB: these are commented out in the V8-Sparc manual as well */
110 /* (I do not understand this) */
111 ! %o5 > %o3: went too far: back up 1 step
112 ! srl %o5, 1, %o5
113 ! dec %g7
114 ! do single-bit divide steps
115 !
116 ! We have to be careful here. We know that %o3 >= %o5, so we can do the
117 ! first divide step without thinking. BUT, the others are conditional,
118 ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
119 ! order bit set in the first step, just falling into the regular
120 ! division loop will mess up the first time around.
121 ! So we unroll slightly...
122 Ldo_single_div:
123 subcc %g7, 1, %g7
124 bl Lend_regular_divide
125 nop
126
127 sub %o3, %o5, %o3 /* first step is unconditional: R -= V */
128 mov 1, %o2 /* Q = 1 */
129
130 b Lend_single_divloop
131 nop
132 Lsingle_divloop:
133 sll %o2, 1, %o2 /* Q <<= 1; sll leaves the condition codes from the tst %o3 below intact for the bl */
134 bl 1f
135 srl %o5, 1, %o5 /* delay slot: V >>= 1 */
136 ! %o3 >= 0
137 sub %o3, %o5, %o3
138 b 2f
139 add %o2, 1, %o2
140 1: ! %o3 < 0
141 add %o3, %o5, %o3
142 sub %o2, 1, %o2
143 2:
144 Lend_single_divloop:
145 subcc %g7, 1, %g7
146 bge Lsingle_divloop
147 tst %o3 /* delay slot: set the condition codes on R for the next pass */
148
149 b,a Lend_regular_divide
150
151Lnot_really_big:
1521:
153 sll %o5, 4, %o5 /* V <<= 4 */
154
155 cmp %o5, %o3
156 bleu 1b
157 addcc %o4, 1, %o4 /* delay slot: ITER++ */
158
159 be Lgot_result
160 sub %o4, 1, %o4
161
162 tst %o3 ! set up for initial iteration
163Ldivloop:
164 sll %o2, 4, %o2 /* make room in Q for the 4 bits developed this pass; condition codes still reflect tst %o3 */
165 ! depth 1, accumulated bits 0
166 bl L.1.16
167 srl %o5,1,%o5
168 ! remainder is positive
169 subcc %o3,%o5,%o3
170 ! depth 2, accumulated bits 1
171 bl L.2.17
172 srl %o5,1,%o5
173 ! remainder is positive
174 subcc %o3,%o5,%o3
175 ! depth 3, accumulated bits 3
176 bl L.3.19
177 srl %o5,1,%o5
178 ! remainder is positive
179 subcc %o3,%o5,%o3
180 ! depth 4, accumulated bits 7
181 bl L.4.23
182 srl %o5,1,%o5
183 ! remainder is positive
184 subcc %o3,%o5,%o3
185 b 9f
186 add %o2, (7*2+1), %o2
187
188L.4.23:
189 ! remainder is negative
190 addcc %o3,%o5,%o3
191 b 9f
192 add %o2, (7*2-1), %o2
193
194L.3.19:
195 ! remainder is negative
196 addcc %o3,%o5,%o3
197 ! depth 4, accumulated bits 5
198 bl L.4.21
199 srl %o5,1,%o5
200 ! remainder is positive
201 subcc %o3,%o5,%o3
202 b 9f
203 add %o2, (5*2+1), %o2
204
205L.4.21:
206 ! remainder is negative
207 addcc %o3,%o5,%o3
208 b 9f
209 add %o2, (5*2-1), %o2
210
211L.2.17:
212 ! remainder is negative
213 addcc %o3,%o5,%o3
214 ! depth 3, accumulated bits 1
215 bl L.3.17
216 srl %o5,1,%o5
217 ! remainder is positive
218 subcc %o3,%o5,%o3
219 ! depth 4, accumulated bits 3
220 bl L.4.19
221 srl %o5,1,%o5
222 ! remainder is positive
223 subcc %o3,%o5,%o3
224 b 9f
225 add %o2, (3*2+1), %o2
226
227L.4.19:
228 ! remainder is negative
229 addcc %o3,%o5,%o3
230 b 9f
231 add %o2, (3*2-1), %o2
232
233L.3.17:
234 ! remainder is negative
235 addcc %o3,%o5,%o3
236 ! depth 4, accumulated bits 1
237 bl L.4.17
238 srl %o5,1,%o5
239 ! remainder is positive
240 subcc %o3,%o5,%o3
241 b 9f
242 add %o2, (1*2+1), %o2
243
244L.4.17:
245 ! remainder is negative
246 addcc %o3,%o5,%o3
247 b 9f
248 add %o2, (1*2-1), %o2
249
250L.1.16:
251 ! remainder is negative
252 addcc %o3,%o5,%o3
253 ! depth 2, accumulated bits -1
254 bl L.2.15
255 srl %o5,1,%o5
256 ! remainder is positive
257 subcc %o3,%o5,%o3
258 ! depth 3, accumulated bits -1
259 bl L.3.15
260 srl %o5,1,%o5
261 ! remainder is positive
262 subcc %o3,%o5,%o3
263 ! depth 4, accumulated bits -1
264 bl L.4.15
265 srl %o5,1,%o5
266 ! remainder is positive
267 subcc %o3,%o5,%o3
268 b 9f
269 add %o2, (-1*2+1), %o2
270
271L.4.15:
272 ! remainder is negative
273 addcc %o3,%o5,%o3
274 b 9f
275 add %o2, (-1*2-1), %o2
276
277L.3.15:
278 ! remainder is negative
279 addcc %o3,%o5,%o3
280 ! depth 4, accumulated bits -3
281 bl L.4.13
282 srl %o5,1,%o5
283 ! remainder is positive
284 subcc %o3,%o5,%o3
285 b 9f
286 add %o2, (-3*2+1), %o2
287
288L.4.13:
289 ! remainder is negative
290 addcc %o3,%o5,%o3
291 b 9f
292 add %o2, (-3*2-1), %o2
293
294L.2.15:
295 ! remainder is negative
296 addcc %o3,%o5,%o3
297 ! depth 3, accumulated bits -3
298 bl L.3.13
299 srl %o5,1,%o5
300 ! remainder is positive
301 subcc %o3,%o5,%o3
302 ! depth 4, accumulated bits -5
303 bl L.4.11
304 srl %o5,1,%o5
305 ! remainder is positive
306 subcc %o3,%o5,%o3
307 b 9f
308 add %o2, (-5*2+1), %o2
309
310L.4.11:
311 ! remainder is negative
312 addcc %o3,%o5,%o3
313 b 9f
314 add %o2, (-5*2-1), %o2
315
316L.3.13:
317 ! remainder is negative
318 addcc %o3,%o5,%o3
319 ! depth 4, accumulated bits -7
320 bl L.4.9
321 srl %o5,1,%o5
322 ! remainder is positive
323 subcc %o3,%o5,%o3
324 b 9f
325 add %o2, (-7*2+1), %o2
326
327L.4.9:
328 ! remainder is negative
329 addcc %o3,%o5,%o3
330 b 9f
331 add %o2, (-7*2-1), %o2
332
333 9:
334Lend_regular_divide:
335 subcc %o4, 1, %o4 /* ITER--; loop again while the result is >= 0 */
336 bge Ldivloop
337 tst %o3 /* delay slot: set the condition codes on R for Ldivloop and for the fixup test below */
338
339 bl,a Lgot_result
340 ! non-restoring fixup here (one instruction only!)
341 add %o3, %o1, %o3 /* annulled unless R < 0: add the divisor back so the remainder is non-negative */
342
343Lgot_result:
344
345 retl
346 mov %o3, %o0 /* delay slot: return R, the remainder */
347
348 .globl .urem_patch
349.urem_patch: /* udiv/umul version of unsigned remainder: %o0 = %o0 % %o1; presumably copied over .urem on CPUs with hardware divide -- confirm against the patching code */
350 wr %g0, 0x0, %y /* zero %y, which udiv uses as the upper 32 bits of the dividend */
351 nop /* three nops separate the %y write from the udiv that reads it */
352 nop
353 nop
354 udiv %o0, %o1, %o2 /* %o2 = quotient */
355 umul %o2, %o1, %o2 /* %o2 = quotient * divisor (low 32 bits) */
356 retl
357 sub %o0, %o2, %o0 /* delay slot: remainder = dividend - quotient * divisor */