Blackfin arch: Replace C version of 64 bit multiply with hand optimized assembly

Signed-off-by: Bernd Schmidt <bernds_cb1@t-online.de>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
author: Bernd Schmidt <bernds_cb1@t-online.de> 2009-01-07 10:14:39 -0500
committer: Bryan Wu <cooloney@kernel.org> 2009-01-07 10:14:39 -0500
commit: 71ae92f51a5f2d824972cf60b25cc40def62ba29 (patch)
tree: 8c9434ecb5035c2868a5ff4ddd18236d16f52dd4 /arch/blackfin/lib
parent: 36478585d994f82654cf8435b34c1a8df3c6ae69 (diff)
2 files changed, 68 insertions, 99 deletions
diff --git a/arch/blackfin/lib/muldi3.S b/arch/blackfin/lib/muldi3.S
new file mode 100644
index 000000000000..abde120ee230
--- /dev/null
+++ b/arch/blackfin/lib/muldi3.S
@@ -0,0 +1,68 @@
+.align 2
+.global ___muldi3;
+.type ___muldi3, STT_FUNC;
+#ifdef CONFIG_ARITHMETIC_OPS_L1
+.section .l1.text
+#else
+.text
+#endif
+/*
+           R1:R0 * R3:R2
+         = R1.h:R1.l:R0.h:R0.l * R3.h:R3.l:R2.h:R2.l
+[X]      = (R1.h * R3.h) * 2^96
+[X]        + (R1.h * R3.l + R1.l * R3.h) * 2^80
+[X]        + (R1.h * R2.h + R1.l * R3.l + R3.h * R0.h) * 2^64
+[T1]       + (R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h) * 2^48
+[T2]       + (R1.l * R2.l + R3.l * R0.l + R0.h * R2.h) * 2^32
+[T3]       + (R0.l * R2.h + R2.l * R0.h) * 2^16
+[T4]       + (R0.l * R2.l)
+        We can discard the first three lines marked "X" since we produce
+        only a 64 bit result.  So, we need ten 16-bit multiplies.
+        Individual mul-acc results:
+[E1]     =  R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h
+[E2]     =  R1.l * R2.l + R3.l * R0.l + R0.h * R2.h
+[E3]     =  R0.l * R2.h + R2.l * R0.h
+[E4]     =  R0.l * R2.l
+        We also need to add high parts from lower-level results to higher ones:
+        E[n]c = E[n] + (E[n+1]c >> 16), where E4c := E4
+        One interesting property is that all parts of the result that depend
+        on the sign of the multiplication are discarded.  Those would be the
+        multiplications involving R1.h and R3.h, but only the top 16 bits of
+        each 32-bit product depend on the sign, and since R1.h and R3.h only
+        occur in E1, the top half of these products is cut off.
+        So, we can just use FU mode for all of the 16-bit multiplies, and
+        ignore questions of when to use mixed mode.  */
+___muldi3:
+        /* Computes the low 64 bits of R1:R0 * R3:R2 using the E1..E4
+           decomposition described above; the result is left in R1:R0.
+           [SP] technically is part of the caller's frame, but we can
+           use it as scratch space.  Here it holds the incoming R4, which
+           is used as a temporary for E1 and reloaded in the last
+           instruction before RTS.  */
+        A0 = R2.H * R1.L, A1 = R2.L * R1.H (FU) || R3 = [SP + 12];      /* E1: two of its four products; R3 is fetched from [SP + 12] */
+        A0 += R3.H * R0.L, A1 += R3.L * R0.H (FU) || [SP] = R4;         /* E1: remaining two products; stash R4 at [SP] */
+        A0 += A1;                                                       /* E1 = A0 + A1 */
+        R4 = A0.w;                                                      /* keep E1 in R4; only R4.l is used below */
+        A0 = R0.l * R3.l (FU);                                          /* E2: R3.l * R0.l */
+        A0 += R2.l * R1.l (FU);                                         /* E2: + R1.l * R2.l */
+        A1 = R2.L * R0.L (FU);                                          /* E4 */
+        R3 = A1.w;                                                      /* keep E4 in R3; R3 is not read as a multiplier operand after this */
+        A1 = A1 >> 16;                                                  /* E3c: start from E4 >> 16 */
+        A0 += R2.H * R0.H, A1 += R2.L * R0.H (FU);                      /* E2: + R0.h * R2.h; E3c: + R2.l * R0.h */
+        A1 += R0.L * R2.H (FU);                                         /* E3c: + R0.l * R2.h */
+        R0 = A1.w;                                                      /* keep E3c in R0; the original R0 is not needed any more */
+        A1 = A1 >> 16;                                                  /* E2c: carry-in is E3c >> 16 */
+        A0 += A1;                                                       /* E2c = E2 + (E3c >> 16) */
+        R1 = A0.w;                                                      /* keep E2c in R1 */
+        /* low(result) = low(E3c):low(E4) */
+        R0 = PACK (R0.l, R3.l);
+        /* high(result) = E2c + (E1 << 16); the same instruction also
+           reloads R4 from [SP] */
+        R1.h = R1.h + R4.l (NS) || R4 = [SP];
+        RTS;
+.size ___muldi3, .-___muldi3
diff --git a/arch/blackfin/lib/muldi3.c b/arch/blackfin/lib/muldi3.c
deleted file mode 100644
index 303d0c6a6dba..000000000000
--- a/arch/blackfin/lib/muldi3.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * File:         arch/blackfin/lib/muldi3.c
- * Based on:
- * Author:
- *
- * Created:
- * Description:
- *
- * Modified:
- *               Copyright 2004-2006 Analog Devices Inc.
- *
- * Bugs:         Enter bugs at http://blackfin.uclinux.org/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see the file COPYING, or write
- * to the Free Software Foundation, Inc.,
- * 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef SI_TYPE_SIZE
-#define SI_TYPE_SIZE 32
-#endif
-#define __ll_b (1L << (SI_TYPE_SIZE / 2))
-#define __ll_lowpart(t) ((usitype) (t) % __ll_b)
-#define __ll_highpart(t) ((usitype) (t) / __ll_b)
-#define BITS_PER_UNIT 8
-#if !defined(umul_ppmm)
-#define umul_ppmm(w1, w0, u, v) /* w1:w0 = u * v (unsigned) */          \
-  do {                                                                  \
-    usitype __x0, __x1, __x2, __x3; /* partial products */              \
-    usitype __ul, __vl, __uh, __vh; /* halves of u and v */             \
-                                                                        \
-    __ul = __ll_lowpart (u);                                            \
-    __uh = __ll_highpart (u);                                           \
-    __vl = __ll_lowpart (v);                                            \
-    __vh = __ll_highpart (v);                                           \
-                                                                        \
-    __x0 = (usitype) __ul * __vl;       /* weight 1 */                  \
-    __x1 = (usitype) __ul * __vh;       /* weight __ll_b */             \
-    __x2 = (usitype) __uh * __vl;       /* weight __ll_b */             \
-    __x3 = (usitype) __uh * __vh;       /* weight __ll_b^2 */           \
-                                                                        \
-    __x1 += __ll_highpart (__x0);/* this can't give carry */            \
-    __x1 += __x2;               /* but this indeed can */               \
-    if (__x1 < __x2)            /* did we get it? */                    \
-      __x3 += __ll_b;           /* yes, add it in the proper pos. */    \
-                                                                        \
-    (w1) = __x3 + __ll_highpart (__x1); /* upper word of product */     \
-    (w0) = __ll_lowpart (__x1) * __ll_b + __ll_lowpart (__x0);          \
-  } while (0)
-#endif
-#if !defined(__umulsidi3)
-#define __umulsidi3(u, v)                                               \
-  ({diunion __w;                                                        \
-       umul_ppmm (__w.s.high, __w.s.low, u, v);                         \
-           __w.ll; })
-#endif
-typedef unsigned int usitype __attribute__ ((mode(SI)));
-typedef int sitype __attribute__ ((mode(SI)));
-typedef int ditype __attribute__ ((mode(DI)));
-typedef int word_type __attribute__ ((mode(__word__)));
-struct distruct {
-        sitype low, high;
-};
-typedef union {
-        struct distruct s;
-        ditype ll;
-} diunion;
-#ifdef CONFIG_ARITHMETIC_OPS_L1
-ditype __muldi3(ditype u, ditype v)__attribute__((l1_text));
-#endif
-ditype __muldi3(ditype u, ditype v)     /* multiplies u by v, keeping only what fits in a ditype */
-{
-        diunion w;                      /* result, assembled from its low/high halves */
-        diunion uu, vv;                 /* u and v viewed as low/high halves */
-        uu.ll = u, vv.ll = v;
-        w.ll = __umulsidi3(uu.s.low, vv.s.low);        /* full double-width product of the two low halves */
-        w.s.high += ((usitype) uu.s.low * (usitype) vv.s.high          /* cross products are added to the high half only; */
-                     + (usitype) uu.s.high * (usitype) vv.s.low);      /* the high * high product is never computed */
-        return w.ll;
-}
author	Bernd Schmidt <bernds_cb1@t-online.de>	2009-01-07 10:14:39 -0500
committer	Bryan Wu <cooloney@kernel.org>	2009-01-07 10:14:39 -0500
commit	71ae92f51a5f2d824972cf60b25cc40def62ba29 (patch)
tree	8c9434ecb5035c2868a5ff4ddd18236d16f52dd4 /arch/blackfin/lib
parent	36478585d994f82654cf8435b34c1a8df3c6ae69 (diff)

diff --git a/arch/blackfin/lib/muldi3.S b/arch/blackfin/lib/muldi3.S
new file mode 100644
index 000000000000..abde120ee230
--- /dev/null
+++ b/arch/blackfin/lib/muldi3.S
@@ -0,0 +1,68 @@
		1	.align 2
		2	.global ___muldi3;
		3	.type ___muldi3, STT_FUNC;
		4
		5	#ifdef CONFIG_ARITHMETIC_OPS_L1
		6	.section .l1.text
		7	#else
		8	.text
		9	#endif
		10
		11	/*
		12	R1:R0 * R3:R2
		13	= R1.h:R1.l:R0.h:R0.l * R3.h:R3.l:R2.h:R2.l
		14	[X] = (R1.h * R3.h) * 2^96
		15	[X] + (R1.h * R3.l + R1.l * R3.h) * 2^80
		16	[X] + (R1.h * R2.h + R1.l * R3.l + R3.h * R0.h) * 2^64
		17	[T1] + (R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h) * 2^48
		18	[T2] + (R1.l * R2.l + R3.l * R0.l + R0.h * R2.h) * 2^32
		19	[T3] + (R0.l * R2.h + R2.l * R0.h) * 2^16
		20	[T4] + (R0.l * R2.l)
		21
		22	We can discard the first three lines marked "X" since we produce
		23	only a 64 bit result. So, we need ten 16-bit multiplies.
		24
		25	Individual mul-acc results:
		26	[E1] = R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h
		27	[E2] = R1.l * R2.l + R3.l * R0.l + R0.h * R2.h
		28	[E3] = R0.l * R2.h + R2.l * R0.h
		29	[E4] = R0.l * R2.l
		30
		31	We also need to add high parts from lower-level results to higher ones:
		32	E[n]c = E[n] + (E[n+1]c >> 16), where E4c := E4
		33
		34	One interesting property is that all parts of the result that depend
		35	on the sign of the multiplication are discarded. Those would be the
		36	multiplications involving R1.h and R3.h, but only the top 16 bits of
		37	each 32-bit product depend on the sign, and since R1.h and R3.h only
		38	occur in E1, the top half of these products is cut off.
		39	So, we can just use FU mode for all of the 16-bit multiplies, and
		40	ignore questions of when to use mixed mode. */
		41
		42	___muldi3:
		43	/* [SP] technically is part of the caller's frame, but we can
		44	use it as scratch space. */
		45	A0 = R2.H * R1.L, A1 = R2.L * R1.H (FU) || R3 = [SP + 12]; /* E1 */
		46	A0 += R3.H * R0.L, A1 += R3.L * R0.H (FU) || [SP] = R4; /* E1 */
		47	A0 += A1; /* E1 */
		48	R4 = A0.w;
		49	A0 = R0.l * R3.l (FU); /* E2 */
		50	A0 += R2.l * R1.l (FU); /* E2 */
		51
		52	A1 = R2.L * R0.L (FU); /* E4 */
		53	R3 = A1.w;
		54	A1 = A1 >> 16; /* E3c */
		55	A0 += R2.H * R0.H, A1 += R2.L * R0.H (FU); /* E2, E3c */
		56	A1 += R0.L * R2.H (FU); /* E3c */
		57	R0 = A1.w;
		58	A1 = A1 >> 16; /* E2c */
		59	A0 += A1; /* E2c */
		60	R1 = A0.w;
		61
		62	/* low(result) = low(E3c):low(E4) */
		63	R0 = PACK (R0.l, R3.l);
		64	/* high(result) = E2c + (E1 << 16) */
		65	R1.h = R1.h + R4.l (NS) || R4 = [SP];
		66	RTS;
		67
		68	.size ___muldi3, .-___muldi3


diff --git a/arch/blackfin/lib/muldi3.c b/arch/blackfin/lib/muldi3.c
deleted file mode 100644
index 303d0c6a6dba..000000000000
--- a/arch/blackfin/lib/muldi3.c
+++ /dev/null
@@ -1,99 +0,0 @@
1	/*
2	* File: arch/blackfin/lib/muldi3.c
3	* Based on:
4	* Author:
5	*
6	* Created:
7	* Description:
8	*
9	* Modified:
10	* Copyright 2004-2006 Analog Devices Inc.
11	*
12	* Bugs: Enter bugs at http://blackfin.uclinux.org/
13	*
14	* This program is free software; you can redistribute it and/or modify
15	* it under the terms of the GNU General Public License as published by
16	* the Free Software Foundation; either version 2 of the License, or
17	* (at your option) any later version.
18	*
19	* This program is distributed in the hope that it will be useful,
20	* but WITHOUT ANY WARRANTY; without even the implied warranty of
21	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	* GNU General Public License for more details.
23	*
24	* You should have received a copy of the GNU General Public License
25	* along with this program; if not, see the file COPYING, or write
26	* to the Free Software Foundation, Inc.,
27	* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28	*/
29
30	#ifndef SI_TYPE_SIZE
31	#define SI_TYPE_SIZE 32
32	#endif
33	#define __ll_b (1L << (SI_TYPE_SIZE / 2))
34	#define __ll_lowpart(t) ((usitype) (t) % __ll_b)
35	#define __ll_highpart(t) ((usitype) (t) / __ll_b)
36	#define BITS_PER_UNIT 8
37
38	#if !defined(umul_ppmm)
39	#define umul_ppmm(w1, w0, u, v) \
40	do { \
41	usitype __x0, __x1, __x2, __x3; \
42	usitype __ul, __vl, __uh, __vh; \
43	\
44	__ul = __ll_lowpart (u); \
45	__uh = __ll_highpart (u); \
46	__vl = __ll_lowpart (v); \
47	__vh = __ll_highpart (v); \
48	\
49	__x0 = (usitype) __ul * __vl; \
50	__x1 = (usitype) __ul * __vh; \
51	__x2 = (usitype) __uh * __vl; \
52	__x3 = (usitype) __uh * __vh; \
53	\
54	__x1 += __ll_highpart (__x0);/* this can't give carry */ \
55	__x1 += __x2; /* but this indeed can */ \
56	if (__x1 < __x2) /* did we get it? */ \
57	__x3 += __ll_b; /* yes, add it in the proper pos. */ \
58	\
59	(w1) = __x3 + __ll_highpart (__x1); \
60	(w0) = __ll_lowpart (__x1) * __ll_b + __ll_lowpart (__x0); \
61	} while (0)
62	#endif
63
64	#if !defined(__umulsidi3)
65	#define __umulsidi3(u, v) \
66	({diunion __w; \
67	umul_ppmm (__w.s.high, __w.s.low, u, v); \
68	__w.ll; })
69	#endif
70
71	typedef unsigned int usitype __attribute__ ((mode(SI)));
72	typedef int sitype __attribute__ ((mode(SI)));
73	typedef int ditype __attribute__ ((mode(DI)));
74	typedef int word_type __attribute__ ((mode(__word__)));
75
76	struct distruct {
77	sitype low, high;
78	};
79	typedef union {
80	struct distruct s;
81	ditype ll;
82	} diunion;
83
84	#ifdef CONFIG_ARITHMETIC_OPS_L1
85	ditype __muldi3(ditype u, ditype v)__attribute__((l1_text));
86	#endif
87
88	ditype __muldi3(ditype u, ditype v)
89	{
90	diunion w;
91	diunion uu, vv;
92
93	uu.ll = u, vv.ll = v;
94	w.ll = __umulsidi3(uu.s.low, vv.s.low);
95	w.s.high += ((usitype) uu.s.low * (usitype) vv.s.high
96	+ (usitype) uu.s.high * (usitype) vv.s.low);
97
98	return w.ll;
99	}