commit    086e9dc0e2ca925b1b58caefd04ed2757d14790b (patch)
author    James Hogan <james.hogan@imgtec.com>    2012-10-05 12:02:09 -0400
committer James Hogan <james.hogan@imgtec.com>    2013-03-02 15:09:52 -0500
tree      cfe182f1d07d40d09d2cae09e337423462250cac
parent    f507758ccbed5c354cc1ce3b8f53ea072d7bc222 (diff)

metag: Optimised library functions

Add optimised library functions for metag.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
 arch/metag/include/asm/checksum.h |  92 +
 arch/metag/include/asm/div64.h    |  12 +
 arch/metag/include/asm/string.h   |  13 +
 arch/metag/lib/ashldi3.S          |  33 +
 arch/metag/lib/ashrdi3.S          |  33 +
 arch/metag/lib/checksum.c         | 168 +
 arch/metag/lib/clear_page.S       |  17 +
 arch/metag/lib/cmpdi2.S           |  32 +
 arch/metag/lib/copy_page.S        |  20 +
 arch/metag/lib/delay.c            |  56 +
 arch/metag/lib/div64.S            | 108 +
 arch/metag/lib/divsi3.S           | 100 +
 arch/metag/lib/ip_fast_csum.S     |  32 +
 arch/metag/lib/lshrdi3.S          |  33 +
 arch/metag/lib/memcpy.S           | 185 +
 arch/metag/lib/memmove.S          | 345 +
 arch/metag/lib/memset.S           |  86 +
 arch/metag/lib/modsi3.S           |  38 +
 arch/metag/lib/muldi3.S           |  44 +
 arch/metag/lib/ucmpdi2.S          |  27 +
 20 files changed, 1474 insertions(+), 0 deletions(-)
diff --git a/arch/metag/include/asm/checksum.h b/arch/metag/include/asm/checksum.h
new file mode 100644
index 000000000000..999bf761a732
--- /dev/null
+++ b/arch/metag/include/asm/checksum.h
@@ -0,0 +1,92 @@
#ifndef _METAG_CHECKSUM_H
#define _METAG_CHECKSUM_H

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * returns a 32-bit number suitable for feeding into itself
 * or csum_tcpudp_magic
 *
 * this function must be called with even lengths, except
 * for the last fragment, which may be odd
 *
 * it's best to have buff aligned on a 32-bit boundary
 */
extern __wsum csum_partial(const void *buff, int len, __wsum sum);

/*
 * the same as csum_partial, but copies from src while it
 * checksums
 *
 * here even more important to align src and dst on a 32-bit (or even
 * better 64-bit) boundary
 */
extern __wsum csum_partial_copy(const void *src, void *dst, int len,
                                __wsum sum);

/*
 * the same as csum_partial_copy, but copies from user space.
 *
 * here even more important to align src and dst on a 32-bit (or even
 * better 64-bit) boundary
 */
extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
                                          int len, __wsum sum, int *csum_err);

#define csum_partial_copy_nocheck(src, dst, len, sum)   \
        csum_partial_copy((src), (dst), (len), (sum))

/*
 * Fold a partial checksum
 */
static inline __sum16 csum_fold(__wsum csum)
{
        u32 sum = (__force u32)csum;
        sum = (sum & 0xffff) + (sum >> 16);
        sum = (sum & 0xffff) + (sum >> 16);
        return (__force __sum16)~sum;
}

/*
 * This is a version of ip_compute_csum() optimized for IP headers,
 * which always checksum on 4 octet boundaries.
 */
extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);

/*
 * computes the checksum of the TCP/UDP pseudo-header
 * returns a 16-bit checksum, already complemented
 */
static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
                                        unsigned short len,
                                        unsigned short proto,
                                        __wsum sum)
{
        unsigned long len_proto = (proto + len) << 8;
        asm ("ADD %0, %0, %1\n"
             "ADDS %0, %0, %2\n"
             "ADDCS %0, %0, #1\n"
             "ADDS %0, %0, %3\n"
             "ADDCS %0, %0, #1\n"
             : "=d" (sum)
             : "d" (daddr), "d" (saddr), "d" (len_proto),
               "0" (sum)
             : "cc");
        return sum;
}

static inline __sum16
csum_tcpudp_magic(__be32 saddr, __be32 daddr, unsigned short len,
                  unsigned short proto, __wsum sum)
{
        return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
}

/*
 * this routine is used for miscellaneous IP-like checksums, mainly
 * in icmp.c
 */
extern __sum16 ip_compute_csum(const void *buff, int len);

#endif /* _METAG_CHECKSUM_H */
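
A quick way to see what csum_fold() does with the carry out of bit 16 is a
host-side model; this sketch is illustrative only and not part of the patch
(the fold() helper name is made up):

#include <stdint.h>
#include <stdio.h>

/* Portable model of csum_fold() above: fold a 32-bit partial sum down to
 * 16 bits with end-around carry, then complement. */
static uint16_t fold(uint32_t sum)
{
        sum = (sum & 0xffff) + (sum >> 16); /* may carry back into bit 16 */
        sum = (sum & 0xffff) + (sum >> 16); /* second pass absorbs that carry */
        return (uint16_t)~sum;
}

int main(void)
{
        /* 0xffff0001: first pass gives 0x0001 + 0xffff = 0x10000, so a
         * second pass is needed; it gives 0x0001, complemented to 0xfffe. */
        printf("%#x\n", fold(0xffff0001u));
        return 0;
}
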
diff --git a/arch/metag/include/asm/div64.h b/arch/metag/include/asm/div64.h
new file mode 100644
index 000000000000..0fdd11676212
--- /dev/null
+++ b/arch/metag/include/asm/div64.h
@@ -0,0 +1,12 @@
#ifndef __ASM_DIV64_H__
#define __ASM_DIV64_H__

#include <asm-generic/div64.h>

extern u64 div_u64(u64 dividend, u64 divisor);
extern s64 div_s64(s64 dividend, s64 divisor);

#define div_u64 div_u64
#define div_s64 div_s64

#endif
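
The self-referencing defines are what let the generic math64 header skip its
own fallback; from memory, the generic side is guarded roughly like the
simplified sketch below (note also that the generic fallback takes a 32-bit
divisor, while the metag routines above accept a full 64-bit one):

/* simplified sketch of the generic guard this header relies on */
#ifndef div_u64
static inline u64 div_u64(u64 dividend, u32 divisor)
{
        u32 remainder;
        return div_u64_rem(dividend, divisor, &remainder);
}
#endif
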
diff --git a/arch/metag/include/asm/string.h b/arch/metag/include/asm/string.h
new file mode 100644
index 000000000000..53e3806eee04
--- /dev/null
+++ b/arch/metag/include/asm/string.h
@@ -0,0 +1,13 @@
#ifndef _METAG_STRING_H_
#define _METAG_STRING_H_

#define __HAVE_ARCH_MEMSET
extern void *memset(void *__s, int __c, size_t __count);

#define __HAVE_ARCH_MEMCPY
void *memcpy(void *__to, __const__ void *__from, size_t __n);

#define __HAVE_ARCH_MEMMOVE
extern void *memmove(void *__dest, __const__ void *__src, size_t __n);

#endif /* _METAG_STRING_H_ */
diff --git a/arch/metag/lib/ashldi3.S b/arch/metag/lib/ashldi3.S
new file mode 100644
index 000000000000..78d6974cffef
--- /dev/null
+++ b/arch/metag/lib/ashldi3.S
@@ -0,0 +1,33 @@
! Copyright (C) 2012 by Imagination Technologies Ltd.
!
! 64-bit arithmetic shift left routine.
!

        .text
        .global ___ashldi3
        .type ___ashldi3,function

___ashldi3:
        MOV D0Re0,D0Ar2
        MOV D1Re0,D1Ar1
        CMP D1Ar3,#0 ! COUNT == 0
        MOVEQ PC,D1RtP ! Yes, return

        SUBS D0Ar4,D1Ar3,#32 ! N = COUNT - 32
        BGE $L10

!! Shift < 32
        NEG D0Ar4,D0Ar4 ! N = - N
        LSL D1Re0,D1Re0,D1Ar3 ! HI = HI << COUNT
        LSR D0Ar6,D0Re0,D0Ar4 ! TMP= LO >> -(COUNT - 32)
        OR D1Re0,D1Re0,D0Ar6 ! HI = HI | TMP
        SWAP D0Ar4,D1Ar3
        LSL D0Re0,D0Re0,D0Ar4 ! LO = LO << COUNT
        MOV PC,D1RtP

$L10:
!! Shift >= 32
        LSL D1Re0,D0Re0,D0Ar4 ! HI = LO << N
        MOV D0Re0,#0 ! LO = 0
        MOV PC,D1RtP
        .size ___ashldi3,.-___ashldi3
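
The split into the two paths above is the standard double-word shift
decomposition; an illustrative C model follows (count 0 is handled by the
early return in the assembly, which also keeps the 32 - count shift below
well defined):

#include <stdint.h>

/* C model of ___ashldi3 for count in 1..63 (count == 0 returns early). */
static uint64_t shl64(uint32_t lo, uint32_t hi, unsigned count)
{
        if (count >= 32) {                      /* "Shift >= 32" path */
                hi = lo << (count - 32);
                lo = 0;
        } else {                                /* "Shift < 32" path */
                hi = (hi << count) | (lo >> (32 - count));
                lo <<= count;
        }
        return ((uint64_t)hi << 32) | lo;
}
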
diff --git a/arch/metag/lib/ashrdi3.S b/arch/metag/lib/ashrdi3.S
new file mode 100644
index 000000000000..7cb7ed3bb1ad
--- /dev/null
+++ b/arch/metag/lib/ashrdi3.S
@@ -0,0 +1,33 @@
! Copyright (C) 2012 by Imagination Technologies Ltd.
!
! 64-bit arithmetic shift right routine.
!

        .text
        .global ___ashrdi3
        .type ___ashrdi3,function

___ashrdi3:
        MOV D0Re0,D0Ar2
        MOV D1Re0,D1Ar1
        CMP D1Ar3,#0 ! COUNT == 0
        MOVEQ PC,D1RtP ! Yes, return

        MOV D0Ar4,D1Ar3
        SUBS D1Ar3,D1Ar3,#32 ! N = COUNT - 32
        BGE $L20

!! Shift < 32
        NEG D1Ar3,D1Ar3 ! N = - N
        LSR D0Re0,D0Re0,D0Ar4 ! LO = LO >> COUNT
        LSL D0Ar6,D1Re0,D1Ar3 ! TMP= HI << -(COUNT - 32)
        OR D0Re0,D0Re0,D0Ar6 ! LO = LO | TMP
        SWAP D1Ar3,D0Ar4
        ASR D1Re0,D1Re0,D1Ar3 ! HI = HI >> COUNT
        MOV PC,D1RtP
$L20:
!! Shift >= 32
        ASR D0Re0,D1Re0,D1Ar3 ! LO = HI >> N
        ASR D1Re0,D1Re0,#31 ! HI = HI >> 31
        MOV PC,D1RtP
        .size ___ashrdi3,.-___ashrdi3
diff --git a/arch/metag/lib/checksum.c b/arch/metag/lib/checksum.c
new file mode 100644
index 000000000000..44d2e1913560
--- /dev/null
+++ b/arch/metag/lib/checksum.c
@@ -0,0 +1,168 @@
/*
 *
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              IP/TCP/UDP checksumming routines
 *
 * Authors:     Jorge Cwik, <jorge@laser.satlink.net>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Tom May, <ftom@netcom.com>
 *              Andreas Schwab, <schwab@issan.informatik.uni-dortmund.de>
 *              Lots of code moved from tcp.c and ip.c; see those files
 *              for more names.
 *
 * 03/02/96     Jes Sorensen, Andreas Schwab, Roman Hodek:
 *              Fixed some nasty bugs, causing some horrible crashes.
 *              A: At some points, the sum (%0) was used as
 *              length-counter instead of the length counter
 *              (%1). Thanks to Roman Hodek for pointing this out.
 *              B: GCC seems to mess up if one uses too many
 *              data-registers to hold input values and one tries to
 *              specify d0 and d1 as scratch registers. Letting gcc
 *              choose these registers itself solves the problem.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

/* Revised by Kenneth Albanowski for m68knommu. Basic problem: unaligned access
   kills, so most of the assembly has to go. */

#include <linux/module.h>
#include <net/checksum.h>

#include <asm/byteorder.h>

static inline unsigned short from32to16(unsigned int x)
{
        /* add up 16-bit and 16-bit for 16+c bit */
        x = (x & 0xffff) + (x >> 16);
        /* add up carry.. */
        x = (x & 0xffff) + (x >> 16);
        return x;
}

static unsigned int do_csum(const unsigned char *buff, int len)
{
        int odd;
        unsigned int result = 0;

        if (len <= 0)
                goto out;
        odd = 1 & (unsigned long) buff;
        if (odd) {
#ifdef __LITTLE_ENDIAN
                result += (*buff << 8);
#else
                result = *buff;
#endif
                len--;
                buff++;
        }
        if (len >= 2) {
                if (2 & (unsigned long) buff) {
                        result += *(unsigned short *) buff;
                        len -= 2;
                        buff += 2;
                }
                if (len >= 4) {
                        const unsigned char *end = buff + ((unsigned)len & ~3);
                        unsigned int carry = 0;
                        do {
                                unsigned int w = *(unsigned int *) buff;
                                buff += 4;
                                result += carry;
                                result += w;
                                carry = (w > result);
                        } while (buff < end);
                        result += carry;
                        result = (result & 0xffff) + (result >> 16);
                }
                if (len & 2) {
                        result += *(unsigned short *) buff;
                        buff += 2;
                }
        }
        if (len & 1)
#ifdef __LITTLE_ENDIAN
                result += *buff;
#else
                result += (*buff << 8);
#endif
        result = from32to16(result);
        if (odd)
                result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
out:
        return result;
}
EXPORT_SYMBOL(ip_fast_csum);

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * returns a 32-bit number suitable for feeding into itself
 * or csum_tcpudp_magic
 *
 * this function must be called with even lengths, except
 * for the last fragment, which may be odd
 *
 * it's best to have buff aligned on a 32-bit boundary
 */
__wsum csum_partial(const void *buff, int len, __wsum wsum)
{
        unsigned int sum = (__force unsigned int)wsum;
        unsigned int result = do_csum(buff, len);

        /* add in old sum, and carry.. */
        result += sum;
        if (sum > result)
                result += 1;
        return (__force __wsum)result;
}
EXPORT_SYMBOL(csum_partial);

/*
 * this routine is used for miscellaneous IP-like checksums, mainly
 * in icmp.c
 */
__sum16 ip_compute_csum(const void *buff, int len)
{
        return (__force __sum16)~do_csum(buff, len);
}
EXPORT_SYMBOL(ip_compute_csum);

/*
 * copy from fs while checksumming, otherwise like csum_partial
 */
__wsum
csum_partial_copy_from_user(const void __user *src, void *dst, int len,
                            __wsum sum, int *csum_err)
{
        int missing;

        missing = __copy_from_user(dst, src, len);
        if (missing) {
                memset(dst + len - missing, 0, missing);
                *csum_err = -EFAULT;
        } else
                *csum_err = 0;

        return csum_partial(dst, len, sum);
}
EXPORT_SYMBOL(csum_partial_copy_from_user);

/*
 * copy from ds while checksumming, otherwise like csum_partial
 */
__wsum
csum_partial_copy(const void *src, void *dst, int len, __wsum sum)
{
        memcpy(dst, src, len);
        return csum_partial(dst, len, sum);
}
EXPORT_SYMBOL(csum_partial_copy);
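
For host-side testing, the value do_csum() produces on the little-endian
configuration matches a plain ones'-complement sum over little-endian
16-bit words. An illustrative reference, with a hypothetical csum16()
helper name:

#include <stdint.h>
#include <stddef.h>

/* Reference ones'-complement sum, ignoring the alignment tricks above. */
static uint16_t csum16(const uint8_t *p, size_t len)
{
        uint32_t sum = 0;

        while (len > 1) {
                sum += (uint32_t)p[0] | ((uint32_t)p[1] << 8); /* LE words */
                p += 2;
                len -= 2;
        }
        if (len)
                sum += p[0];                    /* trailing odd byte */
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16); /* end-around carry */
        return (uint16_t)sum;
}
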
diff --git a/arch/metag/lib/clear_page.S b/arch/metag/lib/clear_page.S
new file mode 100644
index 000000000000..43144eebec55
--- /dev/null
+++ b/arch/metag/lib/clear_page.S
@@ -0,0 +1,17 @@
        ! Copyright 2007,2008,2009 Imagination Technologies Ltd.

#include <asm/page.h>

        .text
        .global _clear_page
        .type _clear_page,function
        !! D1Ar1 - page
_clear_page:
        MOV TXRPT,#((PAGE_SIZE / 8) - 1)
        MOV D0Re0,#0
        MOV D1Re0,#0
$Lclear_page_loop:
        SETL [D1Ar1++],D0Re0,D1Re0
        BR $Lclear_page_loop
        MOV PC,D1RtP
        .size _clear_page,.-_clear_page
diff --git a/arch/metag/lib/cmpdi2.S b/arch/metag/lib/cmpdi2.S
new file mode 100644
index 000000000000..9c5c663c5aea
--- /dev/null
+++ b/arch/metag/lib/cmpdi2.S
@@ -0,0 +1,32 @@
! Copyright (C) 2012 by Imagination Technologies Ltd.
!
! 64-bit signed compare routine.
!

        .text
        .global ___cmpdi2
        .type ___cmpdi2,function

!         low    high
! s64 a  (D0Ar2, D1Ar1)
! s64 b  (D0Ar4, D1Ar3)
___cmpdi2:
        ! start at 1 (equal) and conditionally increment or decrement
        MOV D0Re0,#1

        ! high words differ?
        CMP D1Ar1,D1Ar3
        BNE $Lhigh_differ

        ! unsigned compare low words
        CMP D0Ar2,D0Ar4
        SUBLO D0Re0,D0Re0,#1
        ADDHI D0Re0,D0Re0,#1
        MOV PC,D1RtP

$Lhigh_differ:
        ! signed compare high words
        SUBLT D0Re0,D0Re0,#1
        ADDGT D0Re0,D0Re0,#1
        MOV PC,D1RtP
        .size ___cmpdi2,.-___cmpdi2
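
The 0/1/2 return convention is libgcc's: __cmpdi2 returns 0 for a < b, 1 for
equality and 2 for a > b, which is why the code starts from 1 and
conditionally adjusts (signed compare on the high words, unsigned on the
low words). In C:

/* C model of ___cmpdi2's return convention. */
static int cmpdi2(long long a, long long b)
{
        if (a < b)
                return 0;
        return (a > b) ? 2 : 1;
}
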
diff --git a/arch/metag/lib/copy_page.S b/arch/metag/lib/copy_page.S
new file mode 100644
index 000000000000..91f7d461239c
--- /dev/null
+++ b/arch/metag/lib/copy_page.S
@@ -0,0 +1,20 @@
        ! Copyright 2007,2008 Imagination Technologies Ltd.

#include <asm/page.h>

        .text
        .global _copy_page
        .type _copy_page,function
        !! D1Ar1 - to
        !! D0Ar2 - from
_copy_page:
        MOV D0FrT,#PAGE_SIZE
$Lcopy_page_loop:
        GETL D0Re0,D1Re0,[D0Ar2++]
        GETL D0Ar6,D1Ar5,[D0Ar2++]
        SETL [D1Ar1++],D0Re0,D1Re0
        SETL [D1Ar1++],D0Ar6,D1Ar5
        SUBS D0FrT,D0FrT,#16
        BNZ $Lcopy_page_loop
        MOV PC,D1RtP
        .size _copy_page,.-_copy_page
diff --git a/arch/metag/lib/delay.c b/arch/metag/lib/delay.c
new file mode 100644
index 000000000000..0b308f48b37a
--- /dev/null
+++ b/arch/metag/lib/delay.c
@@ -0,0 +1,56 @@
/*
 * Precise Delay Loops for Meta
 *
 * Copyright (C) 1993 Linus Torvalds
 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
 * Copyright (C) 2007,2009 Imagination Technologies Ltd.
 *
 */

#include <linux/export.h>
#include <linux/sched.h>
#include <linux/delay.h>

#include <asm/core_reg.h>
#include <asm/processor.h>

/*
 * TXTACTCYC is only 24 bits, so on chips with fast clocks it will wrap
 * many times per-second. If it does wrap __delay will return prematurely,
 * but this is only likely with large delay values.
 *
 * We also can't implement read_current_timer() with TXTACTCYC due to
 * this wrapping behaviour.
 */
#define rdtimer(t) t = __core_reg_get(TXTACTCYC)

void __delay(unsigned long loops)
{
        unsigned long bclock, now;

        rdtimer(bclock);
        do {
                asm("NOP");
                rdtimer(now);
        } while ((now-bclock) < loops);
}
EXPORT_SYMBOL(__delay);

inline void __const_udelay(unsigned long xloops)
{
        u64 loops = (u64)xloops * (u64)loops_per_jiffy * HZ;
        __delay(loops >> 32);
}
EXPORT_SYMBOL(__const_udelay);

void __udelay(unsigned long usecs)
{
        __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
}
EXPORT_SYMBOL(__udelay);

void __ndelay(unsigned long nsecs)
{
        __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
}
EXPORT_SYMBOL(__ndelay);
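
The 0x10c7 constant is ceil(2^32 / 10^6), so __udelay() effectively computes
loops = usecs * loops_per_jiffy * HZ / 10^6 entirely in fixed point. A
worked host-side sketch (the lpj and HZ values here are made up for
illustration):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t lpj = 500000, hz = 100, usecs = 100;   /* example values */

        uint64_t xloops = usecs * 0x10c7;       /* usecs * ceil(2^32/1e6) */
        uint64_t loops = (xloops * lpj * hz) >> 32;

        /* exact value would be usecs * lpj * hz / 1e6 = 5000 */
        printf("%llu\n", (unsigned long long)loops);
        return 0;
}
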
diff --git a/arch/metag/lib/div64.S b/arch/metag/lib/div64.S
new file mode 100644
index 000000000000..1cfc93498f70
--- /dev/null
+++ b/arch/metag/lib/div64.S
@@ -0,0 +1,108 @@
! Copyright (C) 2012 Imagination Technologies Ltd.
!
! Signed/unsigned 64-bit division routines.
!

        .text
        .global _div_u64
        .type _div_u64,function

_div_u64:
$L1:
        ORS A0.3,D1Ar3,D0Ar4
        BNE $L3
$L2:
        MOV D0Re0,D0Ar2
        MOV D1Re0,D1Ar1
        MOV PC,D1RtP
$L3:
        CMP D1Ar3,D1Ar1
        CMPEQ D0Ar4,D0Ar2
        MOV D0Re0,#1
        MOV D1Re0,#0
        BHS $L6
$L4:
        ADDS D0Ar6,D0Ar4,D0Ar4
        ADD D1Ar5,D1Ar3,D1Ar3
        ADDCS D1Ar5,D1Ar5,#1
        CMP D1Ar5,D1Ar3
        CMPEQ D0Ar6,D0Ar4
        BLO $L6
$L5:
        MOV D0Ar4,D0Ar6
        MOV D1Ar3,D1Ar5
        ADDS D0Re0,D0Re0,D0Re0
        ADD D1Re0,D1Re0,D1Re0
        ADDCS D1Re0,D1Re0,#1
        CMP D1Ar3,D1Ar1
        CMPEQ D0Ar4,D0Ar2
        BLO $L4
$L6:
        ORS A0.3,D1Re0,D0Re0
        MOV D0Ar6,#0
        MOV D1Ar5,D0Ar6
        BEQ $L10
$L7:
        CMP D1Ar1,D1Ar3
        CMPEQ D0Ar2,D0Ar4
        BLO $L9
$L8:
        ADDS D0Ar6,D0Ar6,D0Re0
        ADD D1Ar5,D1Ar5,D1Re0
        ADDCS D1Ar5,D1Ar5,#1

        SUBS D0Ar2,D0Ar2,D0Ar4
        SUB D1Ar1,D1Ar1,D1Ar3
        SUBCS D1Ar1,D1Ar1,#1
$L9:
        LSL A0.3,D1Re0,#31
        LSR D0Re0,D0Re0,#1
        LSR D1Re0,D1Re0,#1
        OR D0Re0,D0Re0,A0.3
        LSL A0.3,D1Ar3,#31
        LSR D0Ar4,D0Ar4,#1
        LSR D1Ar3,D1Ar3,#1
        OR D0Ar4,D0Ar4,A0.3
        ORS A0.3,D1Re0,D0Re0
        BNE $L7
$L10:
        MOV D0Re0,D0Ar6
        MOV D1Re0,D1Ar5
        MOV PC,D1RtP
        .size _div_u64,.-_div_u64

        .text
        .global _div_s64
        .type _div_s64,function
_div_s64:
        MSETL [A0StP],D0FrT,D0.5
        XOR D0.5,D0Ar2,D0Ar4
        XOR D1.5,D1Ar1,D1Ar3
        TSTT D1Ar1,#HI(0x80000000)
        BZ $L25

        NEGS D0Ar2,D0Ar2
        NEG D1Ar1,D1Ar1
        SUBCS D1Ar1,D1Ar1,#1
$L25:
        TSTT D1Ar3,#HI(0x80000000)
        BZ $L27

        NEGS D0Ar4,D0Ar4
        NEG D1Ar3,D1Ar3
        SUBCS D1Ar3,D1Ar3,#1
$L27:
        CALLR D1RtP,_div_u64
        TSTT D1.5,#HI(0x80000000)
        BZ $L29

        NEGS D0Re0,D0Re0
        NEG D1Re0,D1Re0
        SUBCS D1Re0,D1Re0,#1
$L29:

        GETL D0FrT,D1RtP,[A0StP+#(-16)]
        GETL D0.5,D1.5,[A0StP+#(-8)]
        SUB A0StP,A0StP,#16
        MOV PC,D1RtP
        .size _div_s64,.-_div_s64
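
_div_u64 is a classic shift-and-subtract divider: scale the divisor up
towards the dividend, then walk back down subtracting where possible. An
illustrative C rendition, including the quirk (visible at $L2) that a zero
divisor returns the dividend unchanged:

#include <stdint.h>

/* C model of the shift-and-subtract loops in _div_u64 above. */
static uint64_t udiv64(uint64_t a, uint64_t b)
{
        uint64_t bit = 1, q = 0;

        if (b == 0)
                return a;       /* matches the $L2 early-out */
        /* $L4/$L5: double divisor and quotient bit while b < a,
         * stopping if the shift would overflow */
        while (b < a) {
                uint64_t t = b << 1;
                if (t < b)      /* shifted past 2^64: stop scaling */
                        break;
                b = t;
                bit <<= 1;
        }
        /* $L7..$L9: subtract scaled divisors back out, high bit first */
        while (bit) {
                if (a >= b) {
                        q += bit;
                        a -= b;
                }
                b >>= 1;
                bit >>= 1;
        }
        return q;
}
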
diff --git a/arch/metag/lib/divsi3.S b/arch/metag/lib/divsi3.S
new file mode 100644
index 000000000000..7c8a8ae9a0a1
--- /dev/null
+++ b/arch/metag/lib/divsi3.S
@@ -0,0 +1,100 @@
! Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007
!               Imagination Technologies Ltd
!
! Integer divide routines.
!

        .text
        .global ___udivsi3
        .type ___udivsi3,function
        .align 2
___udivsi3:
!!
!! Since core is signed divide case, just set control variable
!!
        MOV D1Re0,D0Ar2 ! Au already in D1Ar1, Bu -> D1Re0
        MOV D0Re0,#0 ! Result is 0
        MOV D0Ar4,#0 ! Return positive result
        B $LIDMCUStart
        .size ___udivsi3,.-___udivsi3

!!
!! 32-bit division signed i/p - passed signed 32-bit numbers
!!
        .global ___divsi3
        .type ___divsi3,function
        .align 2
___divsi3:
!!
!! A already in D1Ar1, B already in D0Ar2 -> make B abs(B)
!!
        MOV D1Re0,D0Ar2 ! A already in D1Ar1, B -> D1Re0
        MOV D0Re0,#0 ! Result is 0
        XOR D0Ar4,D1Ar1,D1Re0 ! D0Ar4 -ive if result is -ive
        ABS D1Ar1,D1Ar1 ! abs(A) -> Au
        ABS D1Re0,D1Re0 ! abs(B) -> Bu
$LIDMCUStart:
        CMP D1Ar1,D1Re0 ! Is ( Au > Bu )?
        LSR D1Ar3,D1Ar1,#2 ! Calculate (Au & (~3)) >> 2
        CMPHI D1Re0,D1Ar3 ! OR ( (Au & (~3)) <= (Bu << 2) )?
        LSLSHI D1Ar3,D1Re0,#1 ! Buq = Bu << 1
        BLS $LIDMCUSetup ! Yes: Do normal divide
!!
!! Quick divide setup can assume that CurBit only needs to start at 2
!!
$LIDMCQuick:
        CMP D1Ar1,D1Ar3 ! ( A >= Buq )?
        ADDCC D0Re0,D0Re0,#2 ! If yes result += 2
        SUBCC D1Ar1,D1Ar1,D1Ar3 ! and A -= Buq
        CMP D1Ar1,D1Re0 ! ( A >= Bu )?
        ADDCC D0Re0,D0Re0,#1 ! If yes result += 1
        SUBCC D1Ar1,D1Ar1,D1Re0 ! and A -= Bu
        ORS D0Ar4,D0Ar4,D0Ar4 ! Return neg result?
        NEG D0Ar2,D0Re0 ! Calculate neg result
        MOVMI D0Re0,D0Ar2 ! Yes: Take neg result
$LIDMCRet:
        MOV PC,D1RtP
!!
!! Setup for general unsigned divide code
!!
!! D0Re0 is used to form the result, already set to Zero
!! D1Re0 is the input Bu value, this gets trashed
!! D0Ar6 is curbit which is set to 1 at the start and shifted up
!! D0Ar4 is negative if we should return a negative result
!! D1Ar1 is the input Au value, eventually this holds the remainder
!!
$LIDMCUSetup:
        CMP D1Ar1,D1Re0 ! Is ( Au < Bu )?
        MOV D0Ar6,#1 ! Set curbit to 1
        BCS $LIDMCRet ! Yes: Return 0 remainder Au
!!
!! Calculate alignment using FFB instruction
!!
        FFB D1Ar5,D1Ar1 ! Find first bit of Au
        ANDN D1Ar5,D1Ar5,#31 ! Handle exceptional case.
        ORN D1Ar5,D1Ar5,#31 ! if N bit set, set to 31
        FFB D1Ar3,D1Re0 ! Find first bit of Bu
        ANDN D1Ar3,D1Ar3,#31 ! Handle exceptional case.
        ORN D1Ar3,D1Ar3,#31 ! if N bit set, set to 31
        SUBS D1Ar3,D1Ar5,D1Ar3 ! calculate diff, ffbA - ffbB
        MOV D0Ar2,D1Ar3 ! copy into bank 0
        LSLGT D1Re0,D1Re0,D1Ar3 ! ( > 0) ? left shift B
        LSLGT D0Ar6,D0Ar6,D0Ar2 ! ( > 0) ? left shift curbit
!!
!! Now we start the divide proper, logic is
!!
!! if ( A >= B ) add curbit to result and subtract B from A
!! shift curbit and B down by 1 in either case
!!
$LIDMCLoop:
        CMP D1Ar1, D1Re0 ! ( A >= B )?
        ADDCC D0Re0, D0Re0, D0Ar6 ! If yes result += curbit
        SUBCC D1Ar1, D1Ar1, D1Re0 ! and A -= B
        LSRS D0Ar6, D0Ar6, #1 ! Shift down curbit, is it zero?
        LSR D1Re0, D1Re0, #1 ! Shift down B
        BNZ $LIDMCLoop ! Was single bit in curbit lost?
        ORS D0Ar4,D0Ar4,D0Ar4 ! Return neg result?
        NEG D0Ar2,D0Re0 ! Calculate neg result
        MOVMI D0Re0,D0Ar2 ! Yes: Take neg result
        MOV PC,D1RtP
        .size ___divsi3,.-___divsi3
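
The FFB instruction returns the index of the top set bit, so the setup
aligns the divisor's leading bit with the dividend's before the
one-bit-per-iteration loop. A C model of the unsigned path, using GCC's
__builtin_clz in place of FFB (illustrative; assumes a non-zero divisor):

#include <stdint.h>

/* C model of the general path of ___udivsi3 above. */
static uint32_t udivsi3(uint32_t a, uint32_t b)
{
        uint32_t q = 0, bit = 1;
        int shift;

        if (a < b)
                return 0;       /* the $LIDMCUSetup early-out */
        /* align top bits: FFB == 31 - clz for non-zero values */
        shift = (31 - __builtin_clz(a)) - (31 - __builtin_clz(b));
        if (shift > 0) {
                b <<= shift;
                bit <<= shift;
        }
        /* $LIDMCLoop: one compare/subtract per bit position */
        while (bit) {
                if (a >= b) {
                        q += bit;
                        a -= b;
                }
                b >>= 1;
                bit >>= 1;
        }
        return q;
}
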
diff --git a/arch/metag/lib/ip_fast_csum.S b/arch/metag/lib/ip_fast_csum.S
new file mode 100644
index 000000000000..533b1e73deac
--- /dev/null
+++ b/arch/metag/lib/ip_fast_csum.S
@@ -0,0 +1,32 @@

        .text
/*
 * This is a version of ip_compute_csum() optimized for IP headers,
 * which always checksum on 4 octet boundaries.
 *
 * extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);
 *
 */
        .global _ip_fast_csum
        .type _ip_fast_csum,function
_ip_fast_csum:
        !! TXRPT needs loops - 1
        SUBS TXRPT,D0Ar2,#1
        MOV D0Re0,#0
        BLO $Lfast_csum_exit
$Lfast_csum_loop:
        GETD D1Ar3,[D1Ar1++]
        ADDS D0Re0,D0Re0,D1Ar3
        ADDCS D0Re0,D0Re0,#1
        BR $Lfast_csum_loop
        LSR D0Ar4,D0Re0,#16
        AND D0Re0,D0Re0,#0xffff
        AND D0Ar4,D0Ar4,#0xffff
        ADD D0Re0,D0Re0,D0Ar4
        LSR D0Ar4,D0Re0,#16
        ADD D0Re0,D0Re0,D0Ar4
        XOR D0Re0,D0Re0,#-1
        AND D0Re0,D0Re0,#0xffff
$Lfast_csum_exit:
        MOV PC,D1RtP
        .size _ip_fast_csum,.-_ip_fast_csum
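
An illustrative C equivalent of the sum-with-carry loop and the
fold-and-complement sequence that follows the hardware loop (the memcpy
load stands in for GETD and keeps the access alignment-safe):

#include <stdint.h>
#include <string.h>

/* C model of _ip_fast_csum above: sum 'ihl' 32-bit words with
 * end-around carry, fold to 16 bits, complement. */
static uint16_t ip_fast_csum_c(const uint8_t *iph, unsigned int ihl)
{
        uint64_t sum = 0;
        unsigned int i;

        for (i = 0; i < ihl; i++) {
                uint32_t w;
                memcpy(&w, iph + 4 * i, 4);     /* one GETD load */
                sum += w;
        }
        while (sum >> 16)                       /* fold, absorbing carries */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}
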
diff --git a/arch/metag/lib/lshrdi3.S b/arch/metag/lib/lshrdi3.S
new file mode 100644
index 000000000000..47f720283077
--- /dev/null
+++ b/arch/metag/lib/lshrdi3.S
@@ -0,0 +1,33 @@
1! Copyright (C) 2012 by Imagination Technologies Ltd.
2!
3! 64-bit logical shift right routine.
4!
5
6 .text
7 .global ___lshrdi3
8 .type ___lshrdi3,function
9
10___lshrdi3:
11 MOV D0Re0,D0Ar2
12 MOV D1Re0,D1Ar1
13 CMP D1Ar3,#0 ! COUNT == 0
14 MOVEQ PC,D1RtP ! Yes, return
15
16 MOV D0Ar4,D1Ar3
17 SUBS D1Ar3,D1Ar3,#32 ! N = COUNT - 32
18 BGE $L30
19
20!! Shift < 32
21 NEG D1Ar3,D1Ar3 ! N = - N
22 LSR D0Re0,D0Re0,D0Ar4 ! LO = LO >> COUNT
23 LSL D0Ar6,D1Re0,D1Ar3 ! TMP= HI << -(COUNT - 32)
24 OR D0Re0,D0Re0,D0Ar6 ! LO = LO | TMP
25 SWAP D1Ar3,D0Ar4
26 LSR D1Re0,D1Re0,D1Ar3 ! HI = HI >> COUNT
27 MOV PC,D1RtP
28$L30:
29!! Shift >= 32
30 LSR D0Re0,D1Re0,D1Ar3 ! LO = HI >> N
31 MOV D1Re0,#0 ! HI = 0
32 MOV PC,D1RtP
33 .size ___lshrdi3,.-___lshrdi3
diff --git a/arch/metag/lib/memcpy.S b/arch/metag/lib/memcpy.S
new file mode 100644
index 000000000000..46b7a2b9479e
--- /dev/null
+++ b/arch/metag/lib/memcpy.S
@@ -0,0 +1,185 @@
1! Copyright (C) 2008-2012 Imagination Technologies Ltd.
2
3 .text
4 .global _memcpy
5 .type _memcpy,function
6! D1Ar1 dst
7! D0Ar2 src
8! D1Ar3 cnt
9! D0Re0 dst
10_memcpy:
11 CMP D1Ar3, #16
12 MOV A1.2, D0Ar2 ! source pointer
13 MOV A0.2, D1Ar1 ! destination pointer
14 MOV A0.3, D1Ar1 ! for return value
15! If there are less than 16 bytes to copy use the byte copy loop
16 BGE $Llong_copy
17
18$Lbyte_copy:
19! Simply copy a byte at a time
20 SUBS TXRPT, D1Ar3, #1
21 BLT $Lend
22$Lloop_byte:
23 GETB D1Re0, [A1.2++]
24 SETB [A0.2++], D1Re0
25 BR $Lloop_byte
26
27$Lend:
28! Finally set return value and return
29 MOV D0Re0, A0.3
30 MOV PC, D1RtP
31
32$Llong_copy:
33 ANDS D1Ar5, D1Ar1, #7 ! test destination alignment
34 BZ $Laligned_dst
35
36! The destination address is not 8 byte aligned. We will copy bytes from
37! the source to the destination until the remaining data has an 8 byte
38! destination address alignment (i.e we should never copy more than 7
39! bytes here).
40$Lalign_dst:
41 GETB D0Re0, [A1.2++]
42 ADD D1Ar5, D1Ar5, #1 ! dest is aligned when D1Ar5 reaches #8
43 SUB D1Ar3, D1Ar3, #1 ! decrement count of remaining bytes
44 SETB [A0.2++], D0Re0
45 CMP D1Ar5, #8
46 BNE $Lalign_dst
47
48! We have at least (16 - 7) = 9 bytes to copy - calculate the number of 8 byte
49! blocks, then jump to the unaligned copy loop or fall through to the aligned
50! copy loop as appropriate.
51$Laligned_dst:
52 MOV D0Ar4, A1.2
53 LSR D1Ar5, D1Ar3, #3 ! D1Ar5 = number of 8 byte blocks
54 ANDS D0Ar4, D0Ar4, #7 ! test source alignment
55 BNZ $Lunaligned_copy ! if unaligned, use unaligned copy loop
56
57! Both source and destination are 8 byte aligned - the easy case.
58$Laligned_copy:
59 LSRS D1Ar5, D1Ar3, #5 ! D1Ar5 = number of 32 byte blocks
60 BZ $Lbyte_copy
61 SUB TXRPT, D1Ar5, #1
62
63$Laligned_32:
64 GETL D0Re0, D1Re0, [A1.2++]
65 GETL D0Ar6, D1Ar5, [A1.2++]
66 SETL [A0.2++], D0Re0, D1Re0
67 SETL [A0.2++], D0Ar6, D1Ar5
68 GETL D0Re0, D1Re0, [A1.2++]
69 GETL D0Ar6, D1Ar5, [A1.2++]
70 SETL [A0.2++], D0Re0, D1Re0
71 SETL [A0.2++], D0Ar6, D1Ar5
72 BR $Laligned_32
73
74! If there are any remaining bytes use the byte copy loop, otherwise we are done
75 ANDS D1Ar3, D1Ar3, #0x1f
76 BNZ $Lbyte_copy
77 B $Lend
78
79! The destination is 8 byte aligned but the source is not, and there are 8
80! or more bytes to be copied.
81$Lunaligned_copy:
82! Adjust the source pointer (A1.2) to the 8 byte boundary before its
83! current value
84 MOV D0Ar4, A1.2
85 MOV D0Ar6, A1.2
86 ANDMB D0Ar4, D0Ar4, #0xfff8
87 MOV A1.2, D0Ar4
88! Save the number of bytes of mis-alignment in D0Ar4 for use later
89 SUBS D0Ar6, D0Ar6, D0Ar4
90 MOV D0Ar4, D0Ar6
91! if there is no mis-alignment after all, use the aligned copy loop
92 BZ $Laligned_copy
93
94! prefetch 8 bytes
95 GETL D0Re0, D1Re0, [A1.2]
96
97 SUB TXRPT, D1Ar5, #1
98
99! There are 3 mis-alignment cases to be considered. Less than 4 bytes, exactly
100! 4 bytes, and more than 4 bytes.
101 CMP D0Ar6, #4
102 BLT $Lunaligned_1_2_3 ! use 1-3 byte mis-alignment loop
103 BZ $Lunaligned_4 ! use 4 byte mis-alignment loop
104
105! The mis-alignment is more than 4 bytes
106$Lunaligned_5_6_7:
107 SUB D0Ar6, D0Ar6, #4
108! Calculate the bit offsets required for the shift operations necesssary
109! to align the data.
110! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
111 MULW D0Ar6, D0Ar6, #8
112 MOV D1Ar5, #32
113 SUB D1Ar5, D1Ar5, D0Ar6
114! Move data 4 bytes before we enter the main loop
115 MOV D0Re0, D1Re0
116
117$Lloop_5_6_7:
118 GETL D0Ar2, D1Ar1, [++A1.2]
119! form 64-bit data in D0Re0, D1Re0
120 LSR D0Re0, D0Re0, D0Ar6
121 MOV D1Re0, D0Ar2
122 LSL D1Re0, D1Re0, D1Ar5
123 ADD D0Re0, D0Re0, D1Re0
124
125 LSR D0Ar2, D0Ar2, D0Ar6
126 LSL D1Re0, D1Ar1, D1Ar5
127 ADD D1Re0, D1Re0, D0Ar2
128
129 SETL [A0.2++], D0Re0, D1Re0
130 MOV D0Re0, D1Ar1
131 BR $Lloop_5_6_7
132
133 B $Lunaligned_end
134
135$Lunaligned_1_2_3:
136! Calculate the bit offsets required for the shift operations necesssary
137! to align the data.
138! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
139 MULW D0Ar6, D0Ar6, #8
140 MOV D1Ar5, #32
141 SUB D1Ar5, D1Ar5, D0Ar6
142
143$Lloop_1_2_3:
144! form 64-bit data in D0Re0,D1Re0
145 LSR D0Re0, D0Re0, D0Ar6
146 LSL D1Ar1, D1Re0, D1Ar5
147 ADD D0Re0, D0Re0, D1Ar1
148 MOV D0Ar2, D1Re0
149 LSR D0FrT, D0Ar2, D0Ar6
150 GETL D0Ar2, D1Ar1, [++A1.2]
151
152 MOV D1Re0, D0Ar2
153 LSL D1Re0, D1Re0, D1Ar5
154 ADD D1Re0, D1Re0, D0FrT
155
156 SETL [A0.2++], D0Re0, D1Re0
157 MOV D0Re0, D0Ar2
158 MOV D1Re0, D1Ar1
159 BR $Lloop_1_2_3
160
161 B $Lunaligned_end
162
163! The 4 byte mis-alignment case - this does not require any shifting, just a
164! shuffling of registers.
165$Lunaligned_4:
166 MOV D0Re0, D1Re0
167$Lloop_4:
168 GETL D0Ar2, D1Ar1, [++A1.2]
169 MOV D1Re0, D0Ar2
170 SETL [A0.2++], D0Re0, D1Re0
171 MOV D0Re0, D1Ar1
172 BR $Lloop_4
173
174$Lunaligned_end:
175! If there are no remaining bytes to copy, we are done.
176 ANDS D1Ar3, D1Ar3, #7
177 BZ $Lend
178! Re-adjust the source pointer (A1.2) back to the actual (unaligned) byte
179! address of the remaining bytes, and fall through to the byte copy loop.
180 MOV D0Ar6, A1.2
181 ADD D1Ar5, D0Ar4, D0Ar6
182 MOV A1.2, D1Ar5
183 B $Lbyte_copy
184
185 .size _memcpy,.-_memcpy
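
The unaligned loops above keep a 64-bit window of source data and
funnel-shift it into place. In portable C the same technique, reduced to
32-bit words for brevity, looks like the sketch below (illustrative only;
copy_misaligned_words is a made-up helper name and little-endian byte order
is assumed, as on this port):

#include <stdint.h>
#include <stddef.h>

/* Copy n 32-bit words to an aligned dst from a source that sits 'off'
 * bytes (1..3) past the aligned address sp: models the shift-and-merge
 * idea of the $Lunaligned_* loops, one word at a time. */
static void copy_misaligned_words(uint32_t *dst, const uint32_t *sp,
                                  unsigned off, size_t n)
{
        unsigned rsh = 8 * off, lsh = 32 - rsh; /* bit offsets, as in the asm */
        uint32_t cur = *sp++;                   /* "prefetch" one aligned word */

        while (n--) {
                uint32_t next = *sp++;
                *dst++ = (cur >> rsh) | (next << lsh); /* little-endian merge */
                cur = next;
        }
}
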
diff --git a/arch/metag/lib/memmove.S b/arch/metag/lib/memmove.S
new file mode 100644
index 000000000000..228ea04d7b39
--- /dev/null
+++ b/arch/metag/lib/memmove.S
@@ -0,0 +1,345 @@
1! Copyright (C) 2008-2012 Imagination Technologies Ltd.
2
3 .text
4 .global _memmove
5 .type _memmove,function
6! D1Ar1 dst
7! D0Ar2 src
8! D1Ar3 cnt
9! D0Re0 dst
10_memmove:
11 CMP D1Ar3, #0
12 MOV D0Re0, D1Ar1
13 BZ $LEND2
14 MSETL [A0StP], D0.5, D0.6, D0.7
15 MOV D1Ar5, D0Ar2
16 CMP D1Ar1, D1Ar5
17 BLT $Lforwards_copy
18 SUB D0Ar4, D1Ar1, D1Ar3
19 ADD D0Ar4, D0Ar4, #1
20 CMP D0Ar2, D0Ar4
21 BLT $Lforwards_copy
22 ! should copy backwards
23 MOV D1Re0, D0Ar2
24 ! adjust pointer to the end of mem
25 ADD D0Ar2, D1Re0, D1Ar3
26 ADD D1Ar1, D1Ar1, D1Ar3
27
28 MOV A1.2, D0Ar2
29 MOV A0.2, D1Ar1
30 CMP D1Ar3, #8
31 BLT $Lbbyte_loop
32
33 MOV D0Ar4, D0Ar2
34 MOV D1Ar5, D1Ar1
35
36 ! test 8 byte alignment
37 ANDS D1Ar5, D1Ar5, #7
38 BNE $Lbdest_unaligned
39
40 ANDS D0Ar4, D0Ar4, #7
41 BNE $Lbsrc_unaligned
42
43 LSR D1Ar5, D1Ar3, #3
44
45$Lbaligned_loop:
46 GETL D0Re0, D1Re0, [--A1.2]
47 SETL [--A0.2], D0Re0, D1Re0
48 SUBS D1Ar5, D1Ar5, #1
49 BNE $Lbaligned_loop
50
51 ANDS D1Ar3, D1Ar3, #7
52 BZ $Lbbyte_loop_exit
53$Lbbyte_loop:
54 GETB D1Re0, [--A1.2]
55 SETB [--A0.2], D1Re0
56 SUBS D1Ar3, D1Ar3, #1
57 BNE $Lbbyte_loop
58$Lbbyte_loop_exit:
59 MOV D0Re0, A0.2
60$LEND:
61 SUB A0.2, A0StP, #24
62 MGETL D0.5, D0.6, D0.7, [A0.2]
63 SUB A0StP, A0StP, #24
64$LEND2:
65 MOV PC, D1RtP
66
67$Lbdest_unaligned:
68 GETB D0Re0, [--A1.2]
69 SETB [--A0.2], D0Re0
70 SUBS D1Ar5, D1Ar5, #1
71 SUB D1Ar3, D1Ar3, #1
72 BNE $Lbdest_unaligned
73 CMP D1Ar3, #8
74 BLT $Lbbyte_loop
75$Lbsrc_unaligned:
76 LSR D1Ar5, D1Ar3, #3
77 ! adjust A1.2
78 MOV D0Ar4, A1.2
79 ! save original address
80 MOV D0Ar6, A1.2
81
82 ADD D0Ar4, D0Ar4, #7
83 ANDMB D0Ar4, D0Ar4, #0xfff8
84 ! new address is the 8-byte aligned one above the original
85 MOV A1.2, D0Ar4
86
87 ! A0.2 dst 64-bit is aligned
88 ! measure the gap size
89 SUB D0Ar6, D0Ar4, D0Ar6
90 MOVS D0Ar4, D0Ar6
91 ! keep this information for the later adjustment
92 ! both aligned
93 BZ $Lbaligned_loop
94
95 ! prefetch
96 GETL D0Re0, D1Re0, [--A1.2]
97
98 CMP D0Ar6, #4
99 BLT $Lbunaligned_1_2_3
100 ! 32-bit aligned
101 BZ $Lbaligned_4
102
103 SUB D0Ar6, D0Ar6, #4
104 ! D1.6 stores the gap size in bits
105 MULW D1.6, D0Ar6, #8
106 MOV D0.6, #32
107 ! D0.6 stores the complement of the gap size
108 SUB D0.6, D0.6, D1.6
109
110$Lbunaligned_5_6_7:
111 GETL D0.7, D1.7, [--A1.2]
112 ! form 64-bit data in D0Re0, D1Re0
113 MOV D1Re0, D0Re0
114 ! D1Re0 << gap-size
115 LSL D1Re0, D1Re0, D1.6
116 MOV D0Re0, D1.7
117 ! D0Re0 >> complement
118 LSR D0Re0, D0Re0, D0.6
119 MOV D1.5, D0Re0
120 ! combine the both
121 ADD D1Re0, D1Re0, D1.5
122
123 MOV D1.5, D1.7
124 LSL D1.5, D1.5, D1.6
125 MOV D0Re0, D0.7
126 LSR D0Re0, D0Re0, D0.6
127 MOV D0.5, D1.5
128 ADD D0Re0, D0Re0, D0.5
129
130 SETL [--A0.2], D0Re0, D1Re0
131 MOV D0Re0, D0.7
132 MOV D1Re0, D1.7
133 SUBS D1Ar5, D1Ar5, #1
134 BNE $Lbunaligned_5_6_7
135
136 ANDS D1Ar3, D1Ar3, #7
137 BZ $Lbbyte_loop_exit
138 ! Adjust A1.2
139 ! A1.2 <- A1.2 +8 - gapsize
140 ADD A1.2, A1.2, #8
141 SUB A1.2, A1.2, D0Ar4
142 B $Lbbyte_loop
143
144$Lbunaligned_1_2_3:
145 MULW D1.6, D0Ar6, #8
146 MOV D0.6, #32
147 SUB D0.6, D0.6, D1.6
148
149$Lbunaligned_1_2_3_loop:
150 GETL D0.7, D1.7, [--A1.2]
151 ! form 64-bit data in D0Re0, D1Re0
152 LSL D1Re0, D1Re0, D1.6
153 ! save D0Re0 for later use
154 MOV D0.5, D0Re0
155 LSR D0Re0, D0Re0, D0.6
156 MOV D1.5, D0Re0
157 ADD D1Re0, D1Re0, D1.5
158
159 ! orignal data in D0Re0
160 MOV D1.5, D0.5
161 LSL D1.5, D1.5, D1.6
162 MOV D0Re0, D1.7
163 LSR D0Re0, D0Re0, D0.6
164 MOV D0.5, D1.5
165 ADD D0Re0, D0Re0, D0.5
166
167 SETL [--A0.2], D0Re0, D1Re0
168 MOV D0Re0, D0.7
169 MOV D1Re0, D1.7
170 SUBS D1Ar5, D1Ar5, #1
171 BNE $Lbunaligned_1_2_3_loop
172
173 ANDS D1Ar3, D1Ar3, #7
174 BZ $Lbbyte_loop_exit
175 ! Adjust A1.2
176 ADD A1.2, A1.2, #8
177 SUB A1.2, A1.2, D0Ar4
178 B $Lbbyte_loop
179
180$Lbaligned_4:
181 GETL D0.7, D1.7, [--A1.2]
182 MOV D1Re0, D0Re0
183 MOV D0Re0, D1.7
184 SETL [--A0.2], D0Re0, D1Re0
185 MOV D0Re0, D0.7
186 MOV D1Re0, D1.7
187 SUBS D1Ar5, D1Ar5, #1
188 BNE $Lbaligned_4
189 ANDS D1Ar3, D1Ar3, #7
190 BZ $Lbbyte_loop_exit
191 ! Adjust A1.2
192 ADD A1.2, A1.2, #8
193 SUB A1.2, A1.2, D0Ar4
194 B $Lbbyte_loop
195
196$Lforwards_copy:
197 MOV A1.2, D0Ar2
198 MOV A0.2, D1Ar1
199 CMP D1Ar3, #8
200 BLT $Lfbyte_loop
201
202 MOV D0Ar4, D0Ar2
203 MOV D1Ar5, D1Ar1
204
205 ANDS D1Ar5, D1Ar5, #7
206 BNE $Lfdest_unaligned
207
208 ANDS D0Ar4, D0Ar4, #7
209 BNE $Lfsrc_unaligned
210
211 LSR D1Ar5, D1Ar3, #3
212
213$Lfaligned_loop:
214 GETL D0Re0, D1Re0, [A1.2++]
215 SUBS D1Ar5, D1Ar5, #1
216 SETL [A0.2++], D0Re0, D1Re0
217 BNE $Lfaligned_loop
218
219 ANDS D1Ar3, D1Ar3, #7
220 BZ $Lfbyte_loop_exit
221$Lfbyte_loop:
222 GETB D1Re0, [A1.2++]
223 SETB [A0.2++], D1Re0
224 SUBS D1Ar3, D1Ar3, #1
225 BNE $Lfbyte_loop
226$Lfbyte_loop_exit:
227 MOV D0Re0, D1Ar1
228 B $LEND
229
230$Lfdest_unaligned:
231 GETB D0Re0, [A1.2++]
232 ADD D1Ar5, D1Ar5, #1
233 SUB D1Ar3, D1Ar3, #1
234 SETB [A0.2++], D0Re0
235 CMP D1Ar5, #8
236 BNE $Lfdest_unaligned
237 CMP D1Ar3, #8
238 BLT $Lfbyte_loop
239$Lfsrc_unaligned:
240 ! adjust A1.2
241 LSR D1Ar5, D1Ar3, #3
242
243 MOV D0Ar4, A1.2
244 MOV D0Ar6, A1.2
245 ANDMB D0Ar4, D0Ar4, #0xfff8
246 MOV A1.2, D0Ar4
247
248 ! A0.2 dst 64-bit is aligned
249 SUB D0Ar6, D0Ar6, D0Ar4
250 ! keep the information for the later adjustment
251 MOVS D0Ar4, D0Ar6
252
253 ! both aligned
254 BZ $Lfaligned_loop
255
256 ! prefetch
257 GETL D0Re0, D1Re0, [A1.2]
258
259 CMP D0Ar6, #4
260 BLT $Lfunaligned_1_2_3
261 BZ $Lfaligned_4
262
263 SUB D0Ar6, D0Ar6, #4
264 MULW D0.6, D0Ar6, #8
265 MOV D1.6, #32
266 SUB D1.6, D1.6, D0.6
267
268$Lfunaligned_5_6_7:
269 GETL D0.7, D1.7, [++A1.2]
270 ! form 64-bit data in D0Re0, D1Re0
271 MOV D0Re0, D1Re0
272 LSR D0Re0, D0Re0, D0.6
273 MOV D1Re0, D0.7
274 LSL D1Re0, D1Re0, D1.6
275 MOV D0.5, D1Re0
276 ADD D0Re0, D0Re0, D0.5
277
278 MOV D0.5, D0.7
279 LSR D0.5, D0.5, D0.6
280 MOV D1Re0, D1.7
281 LSL D1Re0, D1Re0, D1.6
282 MOV D1.5, D0.5
283 ADD D1Re0, D1Re0, D1.5
284
285 SETL [A0.2++], D0Re0, D1Re0
286 MOV D0Re0, D0.7
287 MOV D1Re0, D1.7
288 SUBS D1Ar5, D1Ar5, #1
289 BNE $Lfunaligned_5_6_7
290
291 ANDS D1Ar3, D1Ar3, #7
292 BZ $Lfbyte_loop_exit
293 ! Adjust A1.2
294 ADD A1.2, A1.2, D0Ar4
295 B $Lfbyte_loop
296
297$Lfunaligned_1_2_3:
298 MULW D0.6, D0Ar6, #8
299 MOV D1.6, #32
300 SUB D1.6, D1.6, D0.6
301
302$Lfunaligned_1_2_3_loop:
303 GETL D0.7, D1.7, [++A1.2]
304 ! form 64-bit data in D0Re0, D1Re0
305 LSR D0Re0, D0Re0, D0.6
306 MOV D1.5, D1Re0
307 LSL D1Re0, D1Re0, D1.6
308 MOV D0.5, D1Re0
309 ADD D0Re0, D0Re0, D0.5
310
311 MOV D0.5, D1.5
312 LSR D0.5, D0.5, D0.6
313 MOV D1Re0, D0.7
314 LSL D1Re0, D1Re0, D1.6
315 MOV D1.5, D0.5
316 ADD D1Re0, D1Re0, D1.5
317
318 SETL [A0.2++], D0Re0, D1Re0
319 MOV D0Re0, D0.7
320 MOV D1Re0, D1.7
321 SUBS D1Ar5, D1Ar5, #1
322 BNE $Lfunaligned_1_2_3_loop
323
324 ANDS D1Ar3, D1Ar3, #7
325 BZ $Lfbyte_loop_exit
326 ! Adjust A1.2
327 ADD A1.2, A1.2, D0Ar4
328 B $Lfbyte_loop
329
330$Lfaligned_4:
331 GETL D0.7, D1.7, [++A1.2]
332 MOV D0Re0, D1Re0
333 MOV D1Re0, D0.7
334 SETL [A0.2++], D0Re0, D1Re0
335 MOV D0Re0, D0.7
336 MOV D1Re0, D1.7
337 SUBS D1Ar5, D1Ar5, #1
338 BNE $Lfaligned_4
339 ANDS D1Ar3, D1Ar3, #7
340 BZ $Lfbyte_loop_exit
341 ! Adjust A1.2
342 ADD A1.2, A1.2, D0Ar4
343 B $Lfbyte_loop
344
345 .size _memmove,.-_memmove
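
The entry checks reduce to the usual overlap test: copy backwards only when
dst lands inside [src, src + cnt), otherwise fall through to the forwards
copy. A byte-wise C sketch of just that dispatch (illustrative; the real
routine above adds all the alignment handling):

#include <stddef.h>

static void *memmove_c(void *dst, const void *src, size_t n)
{
        unsigned char *d = dst;
        const unsigned char *s = src;

        if (d > s && d < s + n)         /* overlap: must copy backwards */
                while (n--)
                        d[n] = s[n];
        else                            /* safe to copy forwards */
                while (n--)
                        *d++ = *s++;
        return dst;
}
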
diff --git a/arch/metag/lib/memset.S b/arch/metag/lib/memset.S
new file mode 100644
index 000000000000..721085bad1d2
--- /dev/null
+++ b/arch/metag/lib/memset.S
@@ -0,0 +1,86 @@
1! Copyright (C) 2008-2012 Imagination Technologies Ltd.
2
3 .text
4 .global _memset
5 .type _memset,function
6! D1Ar1 dst
7! D0Ar2 c
8! D1Ar3 cnt
9! D0Re0 dst
10_memset:
11 AND D0Ar2,D0Ar2,#0xFF ! Ensure a byte input value
12 MULW D0Ar2,D0Ar2,#0x0101 ! Duplicate byte value into 0-15
13 ANDS D0Ar4,D1Ar1,#7 ! Extract bottom LSBs of dst
14 LSL D0Re0,D0Ar2,#16 ! Duplicate byte value into 16-31
15 ADD A0.2,D0Ar2,D0Re0 ! Duplicate byte value into 4 (A0.2)
16 MOV D0Re0,D1Ar1 ! Return dst
17 BZ $LLongStub ! if start address is aligned
18 ! start address is not aligned on an 8 byte boundary, so we
19 ! need the number of bytes up to the next 8 byte address
20 ! boundary, or the length of the string if less than 8, in D1Ar5
21 MOV D0Ar2,#8 ! Need 8 - N in D1Ar5 ...
22 SUB D1Ar5,D0Ar2,D0Ar4 ! ... subtract N
23 CMP D1Ar3,D1Ar5
24 MOVMI D1Ar5,D1Ar3
25 B $LByteStub ! dst is mis-aligned, do $LByteStub
26
27!
28! Preamble to LongLoop which generates 4*8 bytes per interation (5 cycles)
29!
30$LLongStub:
31 LSRS D0Ar2,D1Ar3,#5
32 AND D1Ar3,D1Ar3,#0x1F
33 MOV A1.2,A0.2
34 BEQ $LLongishStub
35 SUB TXRPT,D0Ar2,#1
36 CMP D1Ar3,#0
37$LLongLoop:
38 SETL [D1Ar1++],A0.2,A1.2
39 SETL [D1Ar1++],A0.2,A1.2
40 SETL [D1Ar1++],A0.2,A1.2
41 SETL [D1Ar1++],A0.2,A1.2
42 BR $LLongLoop
43 BZ $Lexit
44!
45! Preamble to LongishLoop which generates 1*8 bytes per interation (2 cycles)
46!
47$LLongishStub:
48 LSRS D0Ar2,D1Ar3,#3
49 AND D1Ar3,D1Ar3,#0x7
50 MOV D1Ar5,D1Ar3
51 BEQ $LByteStub
52 SUB TXRPT,D0Ar2,#1
53 CMP D1Ar3,#0
54$LLongishLoop:
55 SETL [D1Ar1++],A0.2,A1.2
56 BR $LLongishLoop
57 BZ $Lexit
58!
59! This does a byte structured burst of up to 7 bytes
60!
61! D1Ar1 should point to the location required
62! D1Ar3 should be the remaining total byte count
63! D1Ar5 should be burst size (<= D1Ar3)
64!
65$LByteStub:
66 SUBS D1Ar3,D1Ar3,D1Ar5 ! Reduce count
67 ADD D1Ar1,D1Ar1,D1Ar5 ! Advance pointer to end of area
68 MULW D1Ar5,D1Ar5,#4 ! Scale to (1*4), (2*4), (3*4)
69 SUB D1Ar5,D1Ar5,#(8*4) ! Rebase to -(7*4), -(6*4), -(5*4), ...
70 MOV A1.2,D1Ar5
71 SUB PC,CPC1,A1.2 ! Jump into table below
72 SETB [D1Ar1+#(-7)],A0.2
73 SETB [D1Ar1+#(-6)],A0.2
74 SETB [D1Ar1+#(-5)],A0.2
75 SETB [D1Ar1+#(-4)],A0.2
76 SETB [D1Ar1+#(-3)],A0.2
77 SETB [D1Ar1+#(-2)],A0.2
78 SETB [D1Ar1+#(-1)],A0.2
79!
80! Return if all data has been output, otherwise do $LLongStub
81!
82 BNZ $LLongStub
83$Lexit:
84 MOV PC,D1RtP
85 .size _memset,.-_memset
86
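The fill pattern is built by MULW with 0x0101, which copies the byte into
both halves of a 16-bit value, and then the LSL/ADD pair widens that into
the full 32-bit word (ADD works like OR here because the bit ranges do not
overlap). A C rendition of just that setup:

#include <stdint.h>

/* How _memset builds its 32-bit fill pattern from the byte c. */
static uint32_t fill_pattern(uint8_t c)
{
        uint32_t v = (uint32_t)c * 0x0101;      /* byte -> bits 0-15 */
        return v | (v << 16);                   /* bits 0-15 -> bits 16-31 */
}
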
diff --git a/arch/metag/lib/modsi3.S b/arch/metag/lib/modsi3.S
new file mode 100644
index 000000000000..210cfa856593
--- /dev/null
+++ b/arch/metag/lib/modsi3.S
@@ -0,0 +1,38 @@
1! Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007
2! Imagination Technologies Ltd
3!
4! Integer modulus routines.
5!
6!!
7!! 32-bit modulus unsigned i/p - passed unsigned 32-bit numbers
8!!
9 .text
10 .global ___umodsi3
11 .type ___umodsi3,function
12 .align 2
13___umodsi3:
14 MOV D0FrT,D1RtP ! Save original return address
15 CALLR D1RtP,___udivsi3
16 MOV D1RtP,D0FrT ! Recover return address
17 MOV D0Re0,D1Ar1 ! Return remainder
18 MOV PC,D1RtP
19 .size ___umodsi3,.-___umodsi3
20
21!!
22!! 32-bit modulus signed i/p - passed signed 32-bit numbers
23!!
24 .global ___modsi3
25 .type ___modsi3,function
26 .align 2
27___modsi3:
28 MOV D0FrT,D1RtP ! Save original return address
29 MOV A0.2,D1Ar1 ! Save A in A0.2
30 CALLR D1RtP,___divsi3
31 MOV D1RtP,D0FrT ! Recover return address
32 MOV D1Re0,A0.2 ! Recover A
33 MOV D0Re0,D1Ar1 ! Return remainder
34 ORS D1Re0,D1Re0,D1Re0 ! Was A negative?
35 NEG D1Ar1,D1Ar1 ! Negate remainder
36 MOVMI D0Re0,D1Ar1 ! Return neg remainder
37 MOV PC, D1RtP
38 .size ___modsi3,.-___modsi3
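
___modsi3 relies on ___divsi3 leaving the remainder of the absolute values
in D1Ar1, then flips the sign if the dividend was negative; that is exactly
C's truncated-division rule, where the remainder takes the sign of the
dividend and the divisor's sign is irrelevant. A sketch (ignoring the
INT_MIN corner case):

/* Sign rule implemented by ___modsi3 above. */
static int modsi3(int a, int b)
{
        unsigned int ua = a < 0 ? 0u - (unsigned int)a : (unsigned int)a;
        unsigned int ub = b < 0 ? 0u - (unsigned int)b : (unsigned int)b;
        unsigned int r = ua % ub;       /* remainder of the absolute values */

        return a < 0 ? -(int)r : (int)r;
}
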
diff --git a/arch/metag/lib/muldi3.S b/arch/metag/lib/muldi3.S
new file mode 100644
index 000000000000..ee66ca8644d0
--- /dev/null
+++ b/arch/metag/lib/muldi3.S
@@ -0,0 +1,44 @@
1! Copyright (C) 2012 by Imagination Technologies Ltd.
2!
3! 64-bit multiply routine.
4!
5
6!
7! 64-bit signed/unsigned multiply
8!
9! A = D1Ar1:D0Ar2 = a 2^48 + b 2^32 + c 2^16 + d 2^0
10!
11! B = D1Ar3:D0Ar4 = w 2^48 + x 2^32 + y 2^16 + z 2^0
12!
13 .text
14 .global ___muldi3
15 .type ___muldi3,function
16
17___muldi3:
18 MULD D1Re0,D1Ar1,D0Ar4 ! (a 2^48 + b 2^32)(y 2^16 + z 2^0)
19 MULD D0Re0,D0Ar2,D1Ar3 ! (w 2^48 + x 2^32)(c 2^16 + d 2^0)
20 ADD D1Re0,D1Re0,D0Re0
21
22 MULW D0Re0,D0Ar2,D0Ar4 ! (d 2^0) * (z 2^0)
23
24 RTDW D0Ar2,D0Ar2
25 MULW D0Ar6,D0Ar2,D0Ar4 ! (c 2^16)(z 2^0)
26 LSR D1Ar5,D0Ar6,#16
27 LSL D0Ar6,D0Ar6,#16
28 ADDS D0Re0,D0Re0,D0Ar6
29 ADDCS D1Re0,D1Re0,#1
30 RTDW D0Ar4,D0Ar4
31 ADD D1Re0,D1Re0,D1Ar5
32
33 MULW D0Ar6,D0Ar2,D0Ar4 ! (c 2^16)(y 2^16)
34 ADD D1Re0,D1Re0,D0Ar6
35
36 RTDW D0Ar2,D0Ar2
37 MULW D0Ar6,D0Ar2,D0Ar4 ! (d 2^0)(y 2^16)
38 LSR D1Ar5,D0Ar6,#16
39 LSL D0Ar6,D0Ar6,#16
40 ADDS D0Re0,D0Re0,D0Ar6
41 ADD D1Re0,D1Re0,D1Ar5
42 ADDCS D1Re0,D1Re0,#1
43 MOV PC, D1RtP
44 .size ___muldi3,.-___muldi3
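
Only the low 64 bits of the 128-bit product are wanted, so the cross terms
feed the high word directly while the low 32x32 product is assembled from
16x16 MULW partial products (Meta has no single 32x32->64 multiply). An
illustrative C model:

#include <stdint.h>

/* C model of ___muldi3's partial-product scheme. */
static uint64_t muldi3(uint64_t a, uint64_t b)
{
        uint32_t alo = (uint32_t)a, ahi = (uint32_t)(a >> 32);
        uint32_t blo = (uint32_t)b, bhi = (uint32_t)(b >> 32);
        uint32_t hi = ahi * blo + alo * bhi;    /* bits 64+ are discarded */

        /* 16x16 partial products of alo * blo (the MULW instructions) */
        uint32_t d = alo & 0xffff, c = alo >> 16;
        uint32_t z = blo & 0xffff, y = blo >> 16;
        uint64_t low = (uint64_t)d * z
                     + (((uint64_t)c * z) << 16)
                     + (((uint64_t)d * y) << 16);
        hi += c * y;                            /* c*y lands at bit 32 */

        return ((uint64_t)hi << 32) + low;      /* carries fold in here */
}
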
diff --git a/arch/metag/lib/ucmpdi2.S b/arch/metag/lib/ucmpdi2.S
new file mode 100644
index 000000000000..6f3347f7daeb
--- /dev/null
+++ b/arch/metag/lib/ucmpdi2.S
@@ -0,0 +1,27 @@
1! Copyright (C) 2012 by Imagination Technologies Ltd.
2!
3! 64-bit unsigned compare routine.
4!
5
6 .text
7 .global ___ucmpdi2
8 .type ___ucmpdi2,function
9
10! low high
11! u64 a (D0Ar2, D1Ar1)
12! u64 b (D0Ar4, D1Ar3)
13___ucmpdi2:
14 ! start at 1 (equal) and conditionally increment or decrement
15 MOV D0Re0,#1
16
17 ! high words
18 CMP D1Ar1,D1Ar3
19 ! or if equal, low words
20 CMPEQ D0Ar2,D0Ar4
21
22 ! unsigned compare
23 SUBLO D0Re0,D0Re0,#1
24 ADDHI D0Re0,D0Re0,#1
25
26 MOV PC,D1RtP
27 .size ___ucmpdi2,.-___ucmpdi2