microblaze: Fix __muldi3 function for little-endian.

The asm __muldi3 was written for big-endian platforms. Its code contained half-word load/store instructions, which are not compatible with a little-endian CPU. The asm __muldi3 implementation is replaced by a C version. Signed-off-by: Michal Simek <monstr@monstr.eu>
author: Michal Simek <monstr@monstr.eu> 2010-12-07 05:55:06 -0500
committer: Michal Simek <monstr@monstr.eu> 2011-01-03 04:30:31 -0500
commit: 3370d82f3b3ff04d082a9c343a80019282e41261 (patch)
tree: 4466336416d76206c29cfb78b05c21648845b33c /arch/microblaze/lib
parent: 17b931468729df6921981700bf18c75609c2f6af (diff)
2 files changed, 60 insertions, 121 deletions
diff --git a/arch/microblaze/lib/muldi3.S b/arch/microblaze/lib/muldi3.S
deleted file mode 100644
index ceeaa8c407f2..000000000000
--- a/arch/microblaze/lib/muldi3.S
+++ /dev/null
@@ -1,121 +0,0 @@
-#include <linux/linkage.h>
-/*
- * __muldi3 - multiply operation for 64 bit integers, for devices with
- * hard multiply
- *
- *      Input : Operand1[H] in Reg r5
- *              Operand1[L] in Reg r6
- *              Operand2[H] in Reg r7
- *              Operand2[L] in Reg r8
- *      Output: Result[H] in Reg r3
- *              Result[L] in Reg r4
- *
- * Explanation:
- *
- *      Both the input numbers are split into four 16-bit pieces, most
- *      significant piece first:
- *              op1 = A B C D
- *              op2 = E F G H
- *      result = D * H
- *               + (C * H + D * G) << 16
- *               + (B * H + C * G + D * F) << 32
- *               + (A * H + B * G + C * F + D * E) << 48
- *
- *      Only the low 64 bits of the result are kept.
- *
- * Stack usage, relative to r1 after the 40-byte frame is allocated:
- *      r1 +  0 .. 31   saved r20-r27
- *      r1 + 32 .. 39   result scratch: Pos0 (32), Pos1 (34), Pos2 (36),
- *                      Pos3 (38); Pos0 is the most significant half-word
- *      r1 + 44 .. 59   copies of r5-r8, stored above this frame
- *
- * Every half-word access below (A..H and Pos0..Pos3) assumes that the
- * upper half of a stored word sits at the lower address, i.e. a
- * big-endian memory layout.
- */
-        .text
-        .globl  __muldi3
-        .type __muldi3, @function
-        .ent __muldi3
-__muldi3:
-        addi    r1, r1, -40 /* allocate the 40-byte frame */
-/* Save the input operands on the caller's stack: offsets 44..56 lie above
- * the 40 bytes just allocated, i.e. at the incoming r1 + 4..16 */
-        swi     r5, r1, 44
-        swi     r6, r1, 48
-        swi     r7, r1, 52
-        swi     r8, r1, 56
-/* Store all the callee saved registers (r20-r27 hold A..H below) */
-        sw      r20, r1, r0 /* register-offset form, r0 used as offset 0 */
-        swi     r21, r1, 4
-        swi     r22, r1, 8
-        swi     r23, r1, 12
-        swi     r24, r1, 16
-        swi     r25, r1, 20
-        swi     r26, r1, 24
-        swi     r27, r1, 28
-/* Load all the 16 bit values for A thru H: unsigned half-word loads from
- * the operand copies saved above */
-        lhui    r20, r1, 44 /* A */
-        lhui    r21, r1, 46 /* B */
-        lhui    r22, r1, 48 /* C */
-        lhui    r23, r1, 50 /* D */
-        lhui    r24, r1, 52 /* E */
-        lhui    r25, r1, 54 /* F */
-        lhui    r26, r1, 56 /* G */
-        lhui    r27, r1, 58 /* H */
-/* D * H ==> LSB of the result on stack ==> Store1 */
-        mul     r9, r23, r27
-        swi     r9, r1, 36 /* Pos2 and Pos3 */
-/* Hi (Store1) + C * H + D * G ==> Store2 ==> Pos1 and Pos2 */
-/* r12 collects the carries out of the two 32-bit additions below; it is
- * added into Pos0 just before the result is loaded into r3 */
-        lhui    r11, r1, 36 /* Pos2: upper half of D * H */
-        mul     r9, r22, r27 /* C * H */
-        mul     r10, r23, r26 /* D * G */
-        add     r9, r9, r10
-        addc    r12, r0, r0 /* r12 = carry out of C * H + D * G */
-        add     r9, r9, r11
-        addc    r12, r12, r0 /* Store the Carry */
-        shi     r9, r1, 36 /* Store Pos2 */
-        swi     r9, r1, 32 /* round trip through memory ... */
-        lhui    r11, r1, 32 /* ... to get the upper half of r9 */
-        shi     r11, r1, 34 /* Store Pos1 */
-/* Hi (Store2) + B * H + C * G + D * F ==> Store3 ==> Pos0 and Pos1 */
-        mul     r9, r21, r27 /* B * H */
-        mul     r10, r22, r26 /* C * G */
-        mul     r7, r23, r25 /* D * F; r7 is reused as scratch */
-        add     r9, r9, r11
-        add     r9, r9, r10
-        add     r9, r9, r7
-        swi     r9, r1, 32 /* Pos0 and Pos1 */
-/* Hi (Store3) + A * H + B * G + C * F + D * E ==> Store3 ==> Pos0 */
-        lhui    r11, r1, 32 /* Pos0 */
-        mul     r9, r20, r27 /* A * H */
-        mul     r10, r21, r26 /* B * G */
-        mul     r7, r22, r25 /* C * F */
-        mul     r8, r23, r24 /* D * E; r8 is reused as scratch */
-        add     r9, r9, r11
-        add     r9, r9, r10
-        add     r9, r9, r7
-        add     r9, r9, r8
-        sext16  r9, r9 /* Sign extend the MSB. NOTE(review): only a
-                        * half-word of r9 is stored by the shi below, so
-                        * this looks redundant - confirm */
-        shi     r9, r1, 32
-/* Move results to r3 and r4 */
-        lhui    r3, r1, 32
-        add     r3, r3, r12 /* add the saved carry into Pos0 */
-        shi     r3, r1, 32
-        lwi     r3, r1, 32 /* Hi Part */
-        lwi     r4, r1, 36 /* Lo Part */
-/* Restore Callee saved registers */
-        lw      r20, r1, r0
-        lwi     r21, r1, 4
-        lwi     r22, r1, 8
-        lwi     r23, r1, 12
-        lwi     r24, r1, 16
-        lwi     r25, r1, 20
-        lwi     r26, r1, 24
-        lwi     r27, r1, 28
-/* Restore Frame and return */
-        rtsd    r15, 8
-        addi    r1, r1, 40 /* frame release sits in the rtsd delay slot */
-.size __muldi3, . - __muldi3
-.end __muldi3
diff --git a/arch/microblaze/lib/muldi3.c b/arch/microblaze/lib/muldi3.c
new file mode 100644
index 000000000000..d4860e154d29
--- /dev/null
+++ b/arch/microblaze/lib/muldi3.c
@@ -0,0 +1,60 @@
+#include <linux/module.h>
+#include "libgcc.h"
+#define DWtype long long /* double word: operand and result type of __muldi3 */
+#define UWtype unsigned long /* unsigned single word used for partial products */
+#define UHWtype unsigned short /* unsigned half word */
+#define W_TYPE_SIZE 32 /* single-word width in bits assumed by the macros below */
+#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2)) /* half-word base, 1 << 16 */
+#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1)) /* low 16 bits of t */
+#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2)) /* t shifted right by 16 bits */
+/* If we still don't have umul_ppmm, define it using plain C.
+ *
+ * umul_ppmm(w1, w0, u, v) multiplies the single words u and v and stores
+ * the upper word of the double-word product in w1 and the lower word in
+ * w0.  Each operand is split into two 16-bit halves, the four partial
+ * products are formed in UWtype, and the two middle products are folded
+ * in with an explicit carry check.  w1 and w0 are assigned to, so they
+ * must be lvalues; u and v each appear twice in the expansion.  */
+#if !defined(umul_ppmm)
+#define umul_ppmm(w1, w0, u, v)                                         \
+        do {                                                            \
+                UWtype __x0, __x1, __x2, __x3;                          \
+                UHWtype __ul, __vl, __uh, __vh;                         \
+                                                                        \
+                __ul = __ll_lowpart(u);                                 \
+                __uh = __ll_highpart(u);                                \
+                __vl = __ll_lowpart(v);                                 \
+                __vh = __ll_highpart(v);                                \
+                                                                        \
+                __x0 = (UWtype) __ul * __vl; /* low  * low  */          \
+                __x1 = (UWtype) __ul * __vh; /* low  * high */          \
+                __x2 = (UWtype) __uh * __vl; /* high * low  */          \
+                __x3 = (UWtype) __uh * __vh; /* high * high */          \
+                                                                        \
+                __x1 += __ll_highpart(__x0); /* this can't give carry */\
+                __x1 += __x2; /* but this indeed can */                 \
+                if (__x1 < __x2) /* did we get it? */                   \
+                __x3 += __ll_B; /* yes, add it in the proper pos */     \
+                                                                        \
+                (w1) = __x3 + __ll_highpart(__x1);                      \
+                (w0) = __ll_lowpart(__x1) * __ll_B + __ll_lowpart(__x0);\
+        } while (0)
+#endif
+#if !defined(__umulsidi3) /* fallback built on umul_ppmm */
+#define __umulsidi3(u, v) ({ /* statement expression */ \
+        DWunion __w; /* not defined in this file; presumably from libgcc.h */ \
+        umul_ppmm(__w.s.high, __w.s.low, u, v); /* fills both words of __w */ \
+        __w.ll; /* value of the expression: the double-word product of u and v */ \
+        })
+#endif
+DWtype __muldi3(DWtype u, DWtype v)
+{       /* Multiply u by v and return the product truncated to DWtype.
+         *
+         * Both operands are viewed through DWunion as a low and a high
+         * single word.  The low * low product is computed as a full
+         * double word; the two cross products (low * high, high * low)
+         * are computed in UWtype and added to the high word only.  The
+         * high * high product is never formed.
+         *
+         * The words are reached through the union members .s.low and
+         * .s.high rather than through fixed memory offsets; the layout of
+         * DWunion itself is not visible in this file (presumably declared
+         * in libgcc.h - confirm).
+         */
+        const DWunion uu = {.ll = u};
+        const DWunion vv = {.ll = v};
+        DWunion w = {.ll = __umulsidi3(uu.s.low, vv.s.low)}; /* low * low, both words */
+        w.s.high += ((UWtype) uu.s.low * (UWtype) vv.s.high
+                + (UWtype) uu.s.high * (UWtype) vv.s.low); /* cross products, high word only */
+        return w.ll;
+}
author	Michal Simek <monstr@monstr.eu>	2010-12-07 05:55:06 -0500
committer	Michal Simek <monstr@monstr.eu>	2011-01-03 04:30:31 -0500
commit	3370d82f3b3ff04d082a9c343a80019282e41261 (patch)
tree	4466336416d76206c29cfb78b05c21648845b33c /arch/microblaze/lib
parent	17b931468729df6921981700bf18c75609c2f6af (diff)

diff --git a/arch/microblaze/lib/muldi3.S b/arch/microblaze/lib/muldi3.S deleted file mode 100644 index ceeaa8c407f2..000000000000 --- a/arch/microblaze/lib/muldi3.S +++ /dev/null
@@ -1,121 +0,0 @@
1	#include <linux/linkage.h>
2
3	/*
4	* Multiply operation for 64 bit integers, for devices with hard multiply
5	* Input : Operand1[H] in Reg r5
6	* Operand1[L] in Reg r6
7	* Operand2[H] in Reg r7
8	* Operand2[L] in Reg r8
9	* Output: Result[H] in Reg r3
10	* Result[L] in Reg r4
11	*
12	* Explanation:
13	*
14	* Both the input numbers are divided into 16-bit pieces as follows
15	* op1 = A B C D
16	* op2 = E F G H
17	* result = D * H
18	* + (C * H + D * G) << 16
19	* + (B * H + C * G + D * F) << 32
20	* + (A * H + B * G + C * F + D * E) << 48
21	*
22	* Only 64 bits of the output are considered
23	*/
24
25	.text
26	.globl __muldi3
27	.type __muldi3, @function
28	.ent __muldi3
29
30	__muldi3:
31	addi r1, r1, -40
32
33	/* Save the input operands on the caller's stack */
34	swi r5, r1, 44
35	swi r6, r1, 48
36	swi r7, r1, 52
37	swi r8, r1, 56
38
39	/* Store all the callee saved registers */
40	sw r20, r1, r0
41	swi r21, r1, 4
42	swi r22, r1, 8
43	swi r23, r1, 12
44	swi r24, r1, 16
45	swi r25, r1, 20
46	swi r26, r1, 24
47	swi r27, r1, 28
48
49	/* Load all the 16 bit values for A thru H */
50	lhui r20, r1, 44 /* A */
51	lhui r21, r1, 46 /* B */
52	lhui r22, r1, 48 /* C */
53	lhui r23, r1, 50 /* D */
54	lhui r24, r1, 52 /* E */
55	lhui r25, r1, 54 /* F */
56	lhui r26, r1, 56 /* G */
57	lhui r27, r1, 58 /* H */
58
59	/* D * H ==> LSB of the result on stack ==> Store1 */
60	mul r9, r23, r27
61	swi r9, r1, 36 /* Pos2 and Pos3 */
62
63	/* Hi (Store1) + C * H + D * G ==> Store2 ==> Pos1 and Pos2 */
64	/* Store the carry generated in position 2 for Pos 3 */
65	lhui r11, r1, 36 /* Pos2 */
66	mul r9, r22, r27 /* C * H */
67	mul r10, r23, r26 /* D * G */
68	add r9, r9, r10
69	addc r12, r0, r0
70	add r9, r9, r11
71	addc r12, r12, r0 /* Store the Carry */
72	shi r9, r1, 36 /* Store Pos2 */
73	swi r9, r1, 32
74	lhui r11, r1, 32
75	shi r11, r1, 34 /* Store Pos1 */
76
77	/* Hi (Store2) + B * H + C * G + D * F ==> Store3 ==> Pos0 and Pos1 */
78	mul r9, r21, r27 /* B * H */
79	mul r10, r22, r26 /* C * G */
80	mul r7, r23, r25 /* D * F */
81	add r9, r9, r11
82	add r9, r9, r10
83	add r9, r9, r7
84	swi r9, r1, 32 /* Pos0 and Pos1 */
85
86	/* Hi (Store3) + A * H + B * G + C * F + D * E ==> Store3 ==> Pos0 */
87	lhui r11, r1, 32 /* Pos0 */
88	mul r9, r20, r27 /* A * H */
89	mul r10, r21, r26 /* B * G */
90	mul r7, r22, r25 /* C * F */
91	mul r8, r23, r24 /* D * E */
92	add r9, r9, r11
93	add r9, r9, r10
94	add r9, r9, r7
95	add r9, r9, r8
96	sext16 r9, r9 /* Sign extend the MSB */
97	shi r9, r1, 32
98
99	/* Move results to r3 and r4 */
100	lhui r3, r1, 32
101	add r3, r3, r12
102	shi r3, r1, 32
103	lwi r3, r1, 32 /* Hi Part */
104	lwi r4, r1, 36 /* Lo Part */
105
106	/* Restore Callee saved registers */
107	lw r20, r1, r0
108	lwi r21, r1, 4
109	lwi r22, r1, 8
110	lwi r23, r1, 12
111	lwi r24, r1, 16
112	lwi r25, r1, 20
113	lwi r26, r1, 24
114	lwi r27, r1, 28
115
116	/* Restore Frame and return */
117	rtsd r15, 8
118	addi r1, r1, 40
119
120	.size __muldi3, . - __muldi3
121	.end __muldi3


diff --git a/arch/microblaze/lib/muldi3.c b/arch/microblaze/lib/muldi3.c new file mode 100644 index 000000000000..d4860e154d29 --- /dev/null +++ b/arch/microblaze/lib/muldi3.c
@@ -0,0 +1,60 @@
		1	#include <linux/module.h>
		2
		3	#include "libgcc.h"
		4
		5	#define DWtype long long
		6	#define UWtype unsigned long
		7	#define UHWtype unsigned short
		8
		9	#define W_TYPE_SIZE 32
		10
		11	#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
		12	#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
		13	#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
		14
		15	/* If we still don't have umul_ppmm, define it using plain C. */
		16	#if !defined(umul_ppmm)
		17	#define umul_ppmm(w1, w0, u, v) \
		18	do { \
		19	UWtype __x0, __x1, __x2, __x3; \
		20	UHWtype __ul, __vl, __uh, __vh; \
		21	\
		22	__ul = __ll_lowpart(u); \
		23	__uh = __ll_highpart(u); \
		24	__vl = __ll_lowpart(v); \
		25	__vh = __ll_highpart(v); \
		26	\
		27	__x0 = (UWtype) __ul * __vl; \
		28	__x1 = (UWtype) __ul * __vh; \
		29	__x2 = (UWtype) __uh * __vl; \
		30	__x3 = (UWtype) __uh * __vh; \
		31	\
		32	__x1 += __ll_highpart(__x0); /* this can't give carry */\
		33	__x1 += __x2; /* but this indeed can */ \
		34	if (__x1 < __x2) /* did we get it? */ \
		35	__x3 += __ll_B; /* yes, add it in the proper pos */ \
		36	\
		37	(w1) = __x3 + __ll_highpart(__x1); \
		38	(w0) = __ll_lowpart(__x1) * __ll_B + __ll_lowpart(__x0);\
		39	} while (0)
		40	#endif
		41
		42	#if !defined(__umulsidi3)
		43	#define __umulsidi3(u, v) ({ \
		44	DWunion __w; \
		45	umul_ppmm(__w.s.high, __w.s.low, u, v); \
		46	__w.ll; \
		47	})
		48	#endif
		49
		50	DWtype __muldi3(DWtype u, DWtype v)
		51	{
		52	const DWunion uu = {.ll = u};
		53	const DWunion vv = {.ll = v};
		54	DWunion w = {.ll = __umulsidi3(uu.s.low, vv.s.low)};
		55
		56	w.s.high += ((UWtype) uu.s.low * (UWtype) vv.s.high
		57	+ (UWtype) uu.s.high * (UWtype) vv.s.low);
		58
		59	return w.ll;
		60	}