diff options
author | Bernd Schmidt <bernds_cb1@t-online.de> | 2009-01-07 10:14:39 -0500 |
---|---|---|
committer | Bryan Wu <cooloney@kernel.org> | 2009-01-07 10:14:39 -0500 |
commit | 71ae92f51a5f2d824972cf60b25cc40def62ba29 (patch) | |
tree | 8c9434ecb5035c2868a5ff4ddd18236d16f52dd4 /arch/blackfin/lib | |
parent | 36478585d994f82654cf8435b34c1a8df3c6ae69 (diff) |
Blackfin arch: Replace C version of 64 bit multiply with hand optimized assembly
Signed-off-by: Bernd Schmidt <bernds_cb1@t-online.de>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Diffstat (limited to 'arch/blackfin/lib')
-rw-r--r-- | arch/blackfin/lib/muldi3.S | 68 | ||||
-rw-r--r-- | arch/blackfin/lib/muldi3.c | 99 |
2 files changed, 68 insertions, 99 deletions
diff --git a/arch/blackfin/lib/muldi3.S b/arch/blackfin/lib/muldi3.S new file mode 100644 index 000000000000..abde120ee230 --- /dev/null +++ b/arch/blackfin/lib/muldi3.S | |||
@@ -0,0 +1,68 @@ | |||
1 | .align 2 | ||
2 | .global ___muldi3; /* make the symbol visible to other objects */ | ||
3 | .type ___muldi3, STT_FUNC; | ||
4 | /* Place the routine in .l1.text when CONFIG_ARITHMETIC_OPS_L1 is defined, otherwise in .text. */ | ||
5 | #ifdef CONFIG_ARITHMETIC_OPS_L1 | ||
6 | .section .l1.text | ||
7 | #else | ||
8 | .text | ||
9 | #endif | ||
10 | |||
11 | /* | ||
12 | R1:R0 * R3:R2 | ||
13 | = R1.h:R1.l:R0.h:R0.l * R3.h:R3.l:R2.h:R2.l | ||
14 | [X] = (R1.h * R3.h) * 2^96 | ||
15 | [X] + (R1.h * R3.l + R1.l * R3.h) * 2^80 | ||
16 | [X] + (R1.h * R2.h + R1.l * R3.l + R3.h * R0.h) * 2^64 | ||
17 | [T1] + (R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h) * 2^48 | ||
18 | [T2] + (R1.l * R2.l + R3.l * R0.l + R0.h * R2.h) * 2^32 | ||
19 | [T3] + (R0.l * R2.h + R2.l * R0.h) * 2^16 | ||
20 | [T4] + (R0.l * R2.l) | ||
21 | |||
22 | We can discard the first three lines marked "X" since we produce | ||
23 | only a 64-bit result. So, we need ten 16-bit multiplies. | ||
24 | |||
25 | Individual mul-acc results: | ||
26 | [E1] = R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h | ||
27 | [E2] = R1.l * R2.l + R3.l * R0.l + R0.h * R2.h | ||
28 | [E3] = R0.l * R2.h + R2.l * R0.h | ||
29 | [E4] = R0.l * R2.l | ||
30 | |||
31 | We also need to add high parts from lower-level results to higher ones: | ||
32 | E[n]c = E[n] + (E[n+1]c >> 16), where E4c := E4 | ||
33 | |||
34 | One interesting property is that all parts of the result that depend | ||
35 | on the sign of the multiplication are discarded. Those would be the | ||
36 | multiplications involving R1.h and R3.h, but only the top 16 bits of | ||
37 | each 32-bit product depend on the sign, and since R1.h and R3.h only | ||
38 | occur in E1, the top half of these products is cut off. | ||
39 | So, we can just use FU mode for all of the 16-bit multiplies, and | ||
40 | ignore questions of when to use mixed mode. */ | ||
41 | /* ___muldi3: multiplies R1:R0 by the value whose low word is in R2 and whose high word is read from [SP + 12] into R3; the 64-bit result is left in R1:R0. */ | ||
42 | ___muldi3: | ||
43 | /* [SP] technically is part of the caller's frame, but we can use it as | ||
44 | scratch space: R4 is saved there below and reloaded just before RTS. */ | ||
45 | A0 = R2.H * R1.L, A1 = R2.L * R1.H (FU) || R3 = [SP + 12]; /* E1: R1.l*R2.h and R1.h*R2.l; R3 is loaded from [SP + 12] */ | ||
46 | A0 += R3.H * R0.L, A1 += R3.L * R0.H (FU) || [SP] = R4; /* E1: R3.h*R0.l and R3.l*R0.h; R4 is saved to [SP] */ | ||
47 | A0 += A1; /* E1: combine the two partial sums */ | ||
48 | R4 = A0.w; /* keep E1 in R4; only R4.l is consumed below */ | ||
49 | A0 = R0.l * R3.l (FU); /* E2: R3.l*R0.l */ | ||
50 | A0 += R2.l * R1.l (FU); /* E2: R1.l*R2.l */ | ||
51 | |||
52 | A1 = R2.L * R0.L (FU); /* E4 */ | ||
53 | R3 = A1.w; /* keep E4 in R3; R3.l is packed into the low result word below */ | ||
54 | A1 = A1 >> 16; /* E3c: start from the carry-in E4 >> 16 */ | ||
55 | A0 += R2.H * R0.H, A1 += R2.L * R0.H (FU); /* E2: R0.h*R2.h; E3c: R2.l*R0.h */ | ||
56 | A1 += R0.L * R2.H (FU); /* E3c: R0.l*R2.h */ | ||
57 | R0 = A1.w; /* keep E3c in R0; R0.l is packed into the low result word below */ | ||
58 | A1 = A1 >> 16; /* E2c: carry-in E3c >> 16 */ | ||
59 | A0 += A1; /* E2c = E2 + (E3c >> 16) */ | ||
60 | R1 = A0.w; /* keep E2c in R1 */ | ||
61 | |||
62 | /* low(result) = low(E3c):low(E4) */ | ||
63 | R0 = PACK (R0.l, R3.l); | ||
64 | /* high(result) = E2c + (E1 << 16) */ | ||
65 | R1.h = R1.h + R4.l (NS) || R4 = [SP]; /* add low(E1) into R1.h; reload the saved R4 */ | ||
66 | RTS; | ||
67 | |||
68 | .size ___muldi3, .-___muldi3 | ||
diff --git a/arch/blackfin/lib/muldi3.c b/arch/blackfin/lib/muldi3.c deleted file mode 100644 index 303d0c6a6dba..000000000000 --- a/arch/blackfin/lib/muldi3.c +++ /dev/null | |||
@@ -1,99 +0,0 @@ | |||
1 | /* | ||
2 | * File: arch/blackfin/lib/muldi3.c | ||
3 | * Based on: | ||
4 | * Author: | ||
5 | * | ||
6 | * Created: | ||
7 | * Description: | ||
8 | * | ||
9 | * Modified: | ||
10 | * Copyright 2004-2006 Analog Devices Inc. | ||
11 | * | ||
12 | * Bugs: Enter bugs at http://blackfin.uclinux.org/ | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or modify | ||
15 | * it under the terms of the GNU General Public License as published by | ||
16 | * the Free Software Foundation; either version 2 of the License, or | ||
17 | * (at your option) any later version. | ||
18 | * | ||
19 | * This program is distributed in the hope that it will be useful, | ||
20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
22 | * GNU General Public License for more details. | ||
23 | * | ||
24 | * You should have received a copy of the GNU General Public License | ||
25 | * along with this program; if not, see the file COPYING, or write | ||
26 | * to the Free Software Foundation, Inc., | ||
27 | * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
28 | */ | ||
29 | |||
30 | #ifndef SI_TYPE_SIZE | ||
31 | #define SI_TYPE_SIZE 32 /* default used when SI_TYPE_SIZE is not already defined */ | ||
32 | #endif | ||
33 | #define __ll_b (1L << (SI_TYPE_SIZE / 2)) /* 2^(SI_TYPE_SIZE/2), i.e. 65536 with the default size */ | ||
34 | #define __ll_lowpart(t) ((usitype) (t) % __ll_b) /* t modulo __ll_b: the low half of t */ | ||
35 | #define __ll_highpart(t) ((usitype) (t) / __ll_b) /* t divided by __ll_b: the high half of t */ | ||
36 | #define BITS_PER_UNIT 8 /* not referenced anywhere else in this file */ | ||
37 | /* umul_ppmm(w1, w0, u, v): multiplies u by v using four half-width partial products; w1 receives the high word of the product and w0 the low word. Only defined if no umul_ppmm exists already. */ | ||
38 | #if !defined(umul_ppmm) | ||
39 | #define umul_ppmm(w1, w0, u, v) \ | ||
40 | do { \ | ||
41 | usitype __x0, __x1, __x2, __x3; \ | ||
42 | usitype __ul, __vl, __uh, __vh; \ | ||
43 | \ | ||
44 | __ul = __ll_lowpart (u); \ | ||
45 | __uh = __ll_highpart (u); \ | ||
46 | __vl = __ll_lowpart (v); \ | ||
47 | __vh = __ll_highpart (v); \ | ||
48 | \ | ||
49 | __x0 = (usitype) __ul * __vl; /* low(u) * low(v) */ \ | ||
50 | __x1 = (usitype) __ul * __vh; /* low(u) * high(v) */ \ | ||
51 | __x2 = (usitype) __uh * __vl; /* high(u) * low(v) */ \ | ||
52 | __x3 = (usitype) __uh * __vh; /* high(u) * high(v) */ \ | ||
53 | \ | ||
54 | __x1 += __ll_highpart (__x0);/* this can't give carry */ \ | ||
55 | __x1 += __x2; /* but this indeed can */ \ | ||
56 | if (__x1 < __x2) /* did we get it? */ \ | ||
57 | __x3 += __ll_b; /* yes, add it in the proper pos. */ \ | ||
58 | \ | ||
59 | (w1) = __x3 + __ll_highpart (__x1); \ | ||
60 | (w0) = __ll_lowpart (__x1) * __ll_b + __ll_lowpart (__x0); \ | ||
61 | } while (0) | ||
62 | #endif | ||
63 | /* __umulsidi3(u, v): statement expression that has umul_ppmm store the product of u and v into the high/low members of a diunion and then yields that union's ll member. Only defined if no __umulsidi3 exists already. */ | ||
64 | #if !defined(__umulsidi3) | ||
65 | #define __umulsidi3(u, v) \ | ||
66 | ({diunion __w; \ | ||
67 | umul_ppmm (__w.s.high, __w.s.low, u, v); \ | ||
68 | __w.ll; }) | ||
69 | #endif | ||
70 | /* Integer types selected by GCC machine mode, plus a union giving access to the two halves of a ditype. */ | ||
71 | typedef unsigned int usitype __attribute__ ((mode(SI))); /* unsigned SI-mode integer */ | ||
72 | typedef int sitype __attribute__ ((mode(SI))); /* signed SI-mode integer */ | ||
73 | typedef int ditype __attribute__ ((mode(DI))); /* signed DI-mode integer */ | ||
74 | typedef int word_type __attribute__ ((mode(__word__))); /* not referenced anywhere else in this file */ | ||
75 | |||
76 | struct distruct { | ||
77 | sitype low, high; /* 'low' is declared first, so it sits at the lower address */ | ||
78 | }; | ||
79 | typedef union { | ||
80 | struct distruct s; /* the value viewed as two sitype halves */ | ||
81 | ditype ll; /* the value viewed as one ditype */ | ||
82 | } diunion; | ||
83 | /* __muldi3(u, v): returns the product of u and v truncated to the width of ditype; the high*high partial product is never computed. */ | ||
84 | #ifdef CONFIG_ARITHMETIC_OPS_L1 | ||
85 | ditype __muldi3(ditype u, ditype v)__attribute__((l1_text)); /* attach the l1_text attribute when the option is set */ | ||
86 | #endif | ||
87 | |||
88 | ditype __muldi3(ditype u, ditype v) | ||
89 | { | ||
90 | diunion w; | ||
91 | diunion uu, vv; | ||
92 | |||
93 | uu.ll = u, vv.ll = v; /* view both operands as low/high halves */ | ||
94 | w.ll = __umulsidi3(uu.s.low, vv.s.low); /* double-width product of the two low halves */ | ||
95 | w.s.high += ((usitype) uu.s.low * (usitype) vv.s.high | ||
96 | + (usitype) uu.s.high * (usitype) vv.s.low); /* cross products are added to the high half only */ | ||
97 | |||
98 | return w.ll; | ||
99 | } | ||