diff options
Diffstat (limited to 'arch/sh/lib64/udivdi3.S')
-rw-r--r-- | arch/sh/lib64/udivdi3.S | 120 |
1 files changed, 120 insertions, 0 deletions
diff --git a/arch/sh/lib64/udivdi3.S b/arch/sh/lib64/udivdi3.S new file mode 100644 index 000000000000..6895c0225b85 --- /dev/null +++ b/arch/sh/lib64/udivdi3.S | |||
@@ -0,0 +1,120 @@ | |||
/*
 * __udivdi3 -- global entry point, placed in section .text..SHmedia32.
 *
 * Going by the in-line comments, this is a division routine that builds a
 * "31 bit unsigned reciprocal" of the divisor in r1 and then performs
 * successive "64 : 32" divide steps, with r3 as the divisor (see the
 * divide-by-zero remark near the end) and the running rest kept in r2.
 * NOTE(review): presumably the dividend arrives in r2, the quotient is
 * returned in r2 and r18 holds the return address -- confirm against the
 * SH-5 calling convention.
 *
 * r3 is never tested for zero in this routine.
 * Two exits exist: the small-divisor path below (three divide steps) and
 * the large_divisor path, selected by the bgt/u on r9.
 */
1 | .section .text..SHmedia32,"ax" | ||
2 | .align 2 | ||
3 | .global __udivdi3 | ||
4 | __udivdi3: | ||
/* Derive a shift count r22 from r3 >> 1 (nsb) and form r6 = r3 shifted left
   by r22; r5 = r6 >> 49 seeds the reciprocal estimate built up in r1.
   NOTE(review): nsb presumably counts redundant sign bits -- confirm. */
5 | shlri r3,1,r4 | ||
6 | nsb r4,r22 | ||
7 | shlld r3,r22,r6 | ||
8 | shlri r6,49,r5 | ||
9 | movi 0xffffffffffffbaf1,r21 /* .l shift count 17. */ | ||
10 | sub r21,r5,r1 | ||
11 | mmulfx.w r1,r1,r4 | ||
12 | mshflo.w r1,r63,r1 | ||
13 | sub r63,r22,r20 // r63 == 64 % 64 | ||
14 | mmulfx.w r5,r4,r4 | ||
15 | pta large_divisor,tr0 | ||
16 | addi r20,32,r9 | ||
17 | msub.w r1,r4,r1 | ||
18 | madd.w r1,r1,r1 | ||
19 | mmulfx.w r1,r1,r4 | ||
20 | shlri r6,32,r7 | ||
/* r9 = r20 + 32 with r20 = r63 - r22.  Branch to large_divisor when r9 is
   greater than r63; otherwise fall through to the small-divisor path. */
21 | bgt/u r9,r63,tr0 // large_divisor | ||
22 | mmulfx.w r5,r4,r4 | ||
23 | shlri r2,32+14,r19 | ||
24 | addi r22,-31,r0 | ||
25 | msub.w r1,r4,r1 | ||
26 | |||
/* Small-divisor path: finish refining the reciprocal in r1 while the first
   partial result is computed in r5 (from r19 = r2 >> 46); that result times
   r3, shifted left by r0, is then subtracted from r2. */
27 | mulu.l r1,r7,r4 | ||
28 | addi r1,-3,r5 | ||
29 | mulu.l r5,r19,r5 | ||
30 | sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 | ||
31 | shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as | ||
32 | the case may be, %0000000000000000 000.11111111111, still */ | ||
33 | muls.l r1,r4,r4 /* leaving at least one sign bit. */ | ||
34 | mulu.l r5,r3,r8 | ||
35 | mshalds.l r1,r21,r1 | ||
36 | shari r4,26,r4 | ||
37 | shlld r8,r0,r8 | ||
38 | add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) | ||
39 | sub r2,r8,r2 | ||
40 | /* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */ | ||
41 | |||
/* Second step: r21 = ((r2 >> 22) * r1) >> r0 is the next partial result.
   It is added into the accumulator r8 (seeded from the first partial result
   shifted left by r0) and r21 * r3 is subtracted from the rest in r2. */
42 | shlri r2,22,r21 | ||
43 | mulu.l r21,r1,r21 | ||
44 | shlld r5,r0,r8 | ||
45 | addi r20,30-22,r0 | ||
46 | shlrd r21,r0,r21 | ||
47 | mulu.l r21,r3,r5 | ||
48 | add r8,r21,r8 | ||
49 | mcmpgt.l r21,r63,r21 // See Note 1 | ||
50 | addi r20,30,r0 | ||
51 | mshfhi.l r63,r21,r21 | ||
52 | sub r2,r5,r2 | ||
53 | andc r2,r21,r2 | ||
54 | |||
55 | /* small divisor: need a third divide step */ | ||
56 | mulu.l r2,r1,r7 | ||
/* tr0 is reloaded from r18 here; the blink at the end transfers control
   through it (presumably the return to the caller -- confirm). */
57 | ptabs r18,tr0 | ||
58 | addi r2,1,r2 | ||
59 | shlrd r7,r0,r7 | ||
60 | mulu.l r7,r3,r5 | ||
61 | add r8,r7,r8 | ||
62 | sub r2,r3,r2 | ||
/* Final correction: the cmpgt result in r5 is added to the accumulated
   partial results in r8 to give the value left in r2. */
63 | cmpgt r2,r5,r5 | ||
64 | add r8,r5,r2 | ||
65 | /* could test r3 here to check for divide by zero. */ | ||
66 | blink tr0,r63 | ||
67 | |||
/*
 * large_divisor -- second path of __udivdi3, reached through the bgt/u in
 * the entry code when r9 (= r20 + 32) compares greater than r63.
 *
 * Relies on register state set up before that branch: r1 (reciprocal
 * estimate in progress), r4/r5 (refinement operands), r6 (r3 shifted left
 * by r22), r7 (= r6 >> 32), r9, r21, r22 and the original r2.
 * The working rest lives in r25 (r2 shifted right by r9) rather than in r2.
 * Exits through tr0 loaded from r18, leaving the result in r2.
 */
68 | large_divisor: | ||
69 | mmulfx.w r5,r4,r4 | ||
70 | shlrd r2,r9,r25 | ||
71 | shlri r25,32,r8 | ||
72 | msub.w r1,r4,r1 | ||
73 | |||
/* Finish refining the reciprocal in r1 while the first partial result is
   formed in r8 (r5 >> 13); r8 * r7 is then subtracted from the rest in r25. */
74 | mulu.l r1,r7,r4 | ||
75 | addi r1,-3,r5 | ||
76 | mulu.l r5,r8,r5 | ||
77 | sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 | ||
78 | shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as | ||
79 | the case may be, %0000000000000000 000.11111111111, still */ | ||
80 | muls.l r1,r4,r4 /* leaving at least one sign bit. */ | ||
81 | shlri r5,14-1,r8 | ||
82 | mulu.l r8,r7,r5 | ||
83 | mshalds.l r1,r21,r1 | ||
84 | shari r4,26,r4 | ||
85 | add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) | ||
86 | sub r25,r5,r25 | ||
87 | /* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */ | ||
88 | |||
/* Second step: r21 = ((r25 >> 22) * r1) >> 40 is added into r8 and
   r21 * r7 is subtracted from r25.  Meanwhile r2 is shifted left by
   r0 = r22 + 32 for the mextr4 merge at no_lo_adj. */
89 | shlri r25,22,r21 | ||
90 | mulu.l r21,r1,r21 | ||
91 | pta no_lo_adj,tr0 | ||
92 | addi r22,32,r0 | ||
93 | shlri r21,40,r21 | ||
94 | mulu.l r21,r7,r5 | ||
95 | add r8,r21,r8 | ||
96 | shlld r2,r0,r2 | ||
97 | sub r25,r5,r25 | ||
/* When r7 is greater than r25 (unsigned compare) the adjustment below is
   skipped; otherwise r8 is bumped by one and r7 is taken off the rest. */
98 | bgtu/u r7,r25,tr0 // no_lo_adj | ||
99 | addi r8,1,r8 | ||
100 | sub r25,r7,r25 | ||
101 | no_lo_adj: | ||
102 | mextr4 r2,r25,r2 | ||
103 | |||
104 | /* large_divisor: only needs a few adjustments. */ | ||
/* r5 = r8 * r6 is compared (unsigned) against the merged value in r2 and
   the compare result is subtracted from r8 to give the final r2. */
105 | mulu.l r8,r6,r5 | ||
106 | ptabs r18,tr0 | ||
107 | /* bubble */ | ||
108 | cmpgtu r5,r2,r5 | ||
109 | sub r8,r5,r2 | ||
110 | blink tr0,r63 | ||
111 | |||
112 | /* Note 1: To shift the result of the second divide stage so that the result | ||
113 | always fits into 32 bits, yet we still reduce the rest sufficiently | ||
114 | would require a lot of instructions to do the shifts just right. Using | ||
115 | the full 64 bit shift result to multiply with the divisor would require | ||
116 | four extra instructions for the upper 32 bits (shift / mulu / shift / sub). | ||
117 | Fortunately, if the upper 32 bits of the shift result are nonzero, we | ||
118 | know that the rest after taking this partial result into account will | ||
119 | fit into 32 bits. So we just clear the upper 32 bits of the rest if the | ||
120 | upper 32 bits of the partial result are nonzero. */ | ||