aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorBernd Schmidt <bernds_cb1@t-online.de>2009-01-07 10:14:39 -0500
committerBryan Wu <cooloney@kernel.org>2009-01-07 10:14:39 -0500
commit71ae92f51a5f2d824972cf60b25cc40def62ba29 (patch)
tree8c9434ecb5035c2868a5ff4ddd18236d16f52dd4 /arch
parent36478585d994f82654cf8435b34c1a8df3c6ae69 (diff)
Blackfin arch: Replace C version of 64 bit multiply with hand optimized assembly
Signed-off-by: Bernd Schmidt <bernds_cb1@t-online.de> Signed-off-by: Bryan Wu <cooloney@kernel.org>
Diffstat (limited to 'arch')
-rw-r--r--arch/blackfin/lib/muldi3.S68
-rw-r--r--arch/blackfin/lib/muldi3.c99
2 files changed, 68 insertions, 99 deletions
diff --git a/arch/blackfin/lib/muldi3.S b/arch/blackfin/lib/muldi3.S
new file mode 100644
index 000000000000..abde120ee230
--- /dev/null
+++ b/arch/blackfin/lib/muldi3.S
@@ -0,0 +1,68 @@
1.align 2
2.global ___muldi3;
3.type ___muldi3, STT_FUNC;
4
5#ifdef CONFIG_ARITHMETIC_OPS_L1
6.section .l1.text
7#else
8.text
9#endif
10
11/*
12 R1:R0 * R3:R2
13 = R1.h:R1.l:R0.h:R0.l * R3.h:R3.l:R2.h:R2.l
14[X] = (R1.h * R3.h) * 2^96
15[X] + (R1.h * R3.l + R1.l * R3.h) * 2^80
16[X] + (R1.h * R2.h + R1.l * R3.l + R3.h * R0.h) * 2^64
17[T1] + (R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h) * 2^48
18[T2] + (R1.l * R2.l + R3.l * R0.l + R0.h * R2.h) * 2^32
19[T3] + (R0.l * R2.h + R2.l * R0.h) * 2^16
20[T4] + (R0.l * R2.l)
21
22 We can discard the first three lines marked "X" since we produce
23 only a 64 bit result. So, we need ten 16-bit multiplies.
24
25 Individual mul-acc results:
26[E1] = R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h
27[E2] = R1.l * R2.l + R3.l * R0.l + R0.h * R2.h
28[E3] = R0.l * R2.h + R2.l * R0.h
29[E4] = R0.l * R2.l
30
31 We also need to add high parts from lower-level results to higher ones:
32 E[n]c = E[n] + (E[n+1]c >> 16), where E4c := E4
33
34 One interesting property is that all parts of the result that depend
35 on the sign of the multiplication are discarded. Those would be the
36 multiplications involving R1.h and R3.h, but only the top 16 bit of
37 the 32 bit result depend on the sign, and since R1.h and R3.h only
38 occur in E1, the top half of these results is cut off.
39 So, we can just use FU mode for all of the 16-bit multiplies, and
40 ignore questions of when to use mixed mode. */
41
42___muldi3:
43 /* [SP] technically is part of the caller's frame, but we can
44 use it as scratch space. */
45 A0 = R2.H * R1.L, A1 = R2.L * R1.H (FU) || R3 = [SP + 12]; /* E1 */
46 A0 += R3.H * R0.L, A1 += R3.L * R0.H (FU) || [SP] = R4; /* E1 */
47 A0 += A1; /* E1 */
48 R4 = A0.w;
49 A0 = R0.l * R3.l (FU); /* E2 */
50 A0 += R2.l * R1.l (FU); /* E2 */
51
52 A1 = R2.L * R0.L (FU); /* E4 */
53 R3 = A1.w;
54 A1 = A1 >> 16; /* E3c */
55 A0 += R2.H * R0.H, A1 += R2.L * R0.H (FU); /* E2, E3c */
56 A1 += R0.L * R2.H (FU); /* E3c */
57 R0 = A1.w;
58 A1 = A1 >> 16; /* E2c */
59 A0 += A1; /* E2c */
60 R1 = A0.w;
61
62 /* low(result) = low(E3c):low(E4) */
63 R0 = PACK (R0.l, R3.l);
64 /* high(result) = E2c + (E1 << 16) */
65 R1.h = R1.h + R4.l (NS) || R4 = [SP];
66 RTS;
67
68.size ___muldi3, .-___muldi3
diff --git a/arch/blackfin/lib/muldi3.c b/arch/blackfin/lib/muldi3.c
deleted file mode 100644
index 303d0c6a6dba..000000000000
--- a/arch/blackfin/lib/muldi3.c
+++ /dev/null
@@ -1,99 +0,0 @@
1/*
2 * File: arch/blackfin/lib/muldi3.c
3 * Based on:
4 * Author:
5 *
6 * Created:
7 * Description:
8 *
9 * Modified:
10 * Copyright 2004-2006 Analog Devices Inc.
11 *
12 * Bugs: Enter bugs at http://blackfin.uclinux.org/
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, see the file COPYING, or write
26 * to the Free Software Foundation, Inc.,
27 * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 */
29
30#ifndef SI_TYPE_SIZE
31#define SI_TYPE_SIZE 32
32#endif
33#define __ll_b (1L << (SI_TYPE_SIZE / 2))
34#define __ll_lowpart(t) ((usitype) (t) % __ll_b)
35#define __ll_highpart(t) ((usitype) (t) / __ll_b)
36#define BITS_PER_UNIT 8
37
38#if !defined(umul_ppmm)
39#define umul_ppmm(w1, w0, u, v) \
40 do { \
41 usitype __x0, __x1, __x2, __x3; \
42 usitype __ul, __vl, __uh, __vh; \
43 \
44 __ul = __ll_lowpart (u); \
45 __uh = __ll_highpart (u); \
46 __vl = __ll_lowpart (v); \
47 __vh = __ll_highpart (v); \
48 \
49 __x0 = (usitype) __ul * __vl; \
50 __x1 = (usitype) __ul * __vh; \
51 __x2 = (usitype) __uh * __vl; \
52 __x3 = (usitype) __uh * __vh; \
53 \
54 __x1 += __ll_highpart (__x0);/* this can't give carry */ \
55 __x1 += __x2; /* but this indeed can */ \
56 if (__x1 < __x2) /* did we get it? */ \
57 __x3 += __ll_b; /* yes, add it in the proper pos. */ \
58 \
59 (w1) = __x3 + __ll_highpart (__x1); \
60 (w0) = __ll_lowpart (__x1) * __ll_b + __ll_lowpart (__x0); \
61 } while (0)
62#endif
63
64#if !defined(__umulsidi3)
65#define __umulsidi3(u, v) \
66 ({diunion __w; \
67 umul_ppmm (__w.s.high, __w.s.low, u, v); \
68 __w.ll; })
69#endif
70
71typedef unsigned int usitype __attribute__ ((mode(SI)));
72typedef int sitype __attribute__ ((mode(SI)));
73typedef int ditype __attribute__ ((mode(DI)));
74typedef int word_type __attribute__ ((mode(__word__)));
75
76struct distruct {
77 sitype low, high;
78};
79typedef union {
80 struct distruct s;
81 ditype ll;
82} diunion;
83
84#ifdef CONFIG_ARITHMETIC_OPS_L1
85ditype __muldi3(ditype u, ditype v)__attribute__((l1_text));
86#endif
87
88ditype __muldi3(ditype u, ditype v)
89{
90 diunion w;
91 diunion uu, vv;
92
93 uu.ll = u, vv.ll = v;
94 w.ll = __umulsidi3(uu.s.low, vv.s.low);
95 w.s.high += ((usitype) uu.s.low * (usitype) vv.s.high
96 + (usitype) uu.s.high * (usitype) vv.s.low);
97
98 return w.ll;
99}