aboutsummaryrefslogtreecommitdiffstats
path: root/arch/arc/lib
diff options
context:
space:
mode:
author Vineet Gupta <vgupta@synopsys.com> 2013-01-18 04:42:18 -0500
committer Vineet Gupta <vgupta@synopsys.com> 2013-02-11 09:30:35 -0500
commit 5210d1e6889c8183ecad269e86e2d9c524015b5f (patch)
tree 77fcc0cfb1853c553eaf58a271256f13b860a528 /arch/arc/lib
parent 6e35fa2d430538cd0609e499c6f789beea9e9798 (diff)
ARC: String library
Hand optimised asm code for ARC700 pipeline.
Originally written/optimized by Joern Rennecke

Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
Cc: Joern Rennecke <joern.rennecke@embecosm.com>
Diffstat (limited to 'arch/arc/lib')
-rw-r--r-- arch/arc/lib/memcmp.S 124
-rw-r--r-- arch/arc/lib/memcpy-700.S 66
-rw-r--r-- arch/arc/lib/memset.S 59
-rw-r--r-- arch/arc/lib/strchr-700.S 123
-rw-r--r-- arch/arc/lib/strcmp.S 96
-rw-r--r-- arch/arc/lib/strcpy-700.S 70
-rw-r--r-- arch/arc/lib/strlen.S 83
7 files changed, 621 insertions, 0 deletions
diff --git a/arch/arc/lib/memcmp.S b/arch/arc/lib/memcmp.S
new file mode 100644
index 000000000000..bc813d55b6c3
--- /dev/null
+++ b/arch/arc/lib/memcmp.S
@@ -0,0 +1,124 @@
1/*
2 * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <asm/linkage.h>
10
11#ifdef __LITTLE_ENDIAN__
12#define WORD2 r2
13#define SHIFT r3
14#else /* BIG ENDIAN */
15#define WORD2 r3
16#define SHIFT r2
17#endif
18
; memcmp: compare two memory areas.
;   In:  r0, r1 = addresses of the two areas; r2 = number of bytes to compare.
;   Out: r0 = 0 if r2 is 0 or all compared bytes match; otherwise non-zero, with the
;        sign giving the order of the first differing bytes (r0 area minus r1 area).
;   Scratch: r1-r5, r12, lp_count and the flags. Returns via blink.
;   Word path: used when both addresses are word aligned. Byte path: everything else.
;   WORD2/SHIFT are r2/r3 on little endian and r3/r2 on big endian (see the defines above).
19ARC_ENTRY memcmp
20 or r12,r0,r1 ; combine both addresses so a single test covers them
21 asl_s r12,r12,30 ; keep only the two low address bits (moved up to bits 31:30)
22 sub r3,r2,1 ; r3 = count - 1
23 brls r2,r12,.Lbytewise ; unsigned count <= r12: count is 0, or a pointer is unaligned and count does not exceed r12
24 ld r4,[r0,0] ; r4/r5 = first word of each area
25 ld r5,[r1,0]
26 lsr.f lp_count,r3,3 ; iterations = (count - 1) / 8; the flags feed lpne and the test after the loop
27 lpne .Loop_end ; hardware loop over pairs of words, skipped when the iteration count is zero
28 ld_s WORD2,[r0,4] ; second word of the current pair
29 ld_s r12,[r1,4]
30 brne r4,r5,.Leven ; first word of the pair differs
31 ld.a r4,[r0,8] ; advance both pointers by 8 and preload the next pair
32 ld.a r5,[r1,8]
33 brne WORD2,r12,.Lodd ; second word of the pair differs
34.Loop_end:
35 asl_s SHIFT,SHIFT,3 ; scale SHIFT from bytes to bits (count - 1 on little endian, count on big endian)
36 bhs_s .Last_cmp ; NOTE(review): presumably still tests the carry from lsr.f above (bit 2 of count - 1), i.e. assumes asl_s leaves the flags alone - confirm
37 brne r4,r5,.Leven ; more than one word left: this word is compared in full
38 ld r4,[r0,4] ; r4/r5 = final, possibly partial, word
39 ld r5,[r1,4]
40#ifdef __LITTLE_ENDIAN__
41 nop_s
42 ; one more load latency cycle
43.Last_cmp:
44 xor r0,r4,r5 ; bits that differ between the two final words
45 bset r0,r0,SHIFT ; marker bit in the last valid byte so bytes past the end cannot decide the result (assumes bset uses SHIFT modulo 32 - confirm)
46 sub_s r1,r0,1
47 bic_s r1,r1,r0 ; r1 = mask of the bits below the lowest set bit of r0
48 norm r1,r1
49 b.d .Leven_cmp
50 and r1,r1,24 ; (delay slot) round down to a whole-byte shift amount
51.Leven:
52 xor r0,r4,r5 ; bits that differ
53 sub_s r1,r0,1
54 bic_s r1,r1,r0 ; r1 = mask of the bits below the lowest differing bit
55 norm r1,r1
56 ; slow track insn
57 and r1,r1,24 ; whole-byte shift amount
58.Leven_cmp:
59 asl r2,r4,r1 ; shift both words alike so the first differing byte lands in the top byte
60 asl r12,r5,r1
61 lsr_s r2,r2,1 ; drop to 31 bits so the subtraction below cannot wrap
62 lsr_s r12,r12,1
63 j_s.d [blink]
64 sub r0,r2,r12 ; (delay slot) result
65 .balign 4
66.Lodd:
67 xor r0,WORD2,r12 ; same as .Leven, applied to the second word of the pair (WORD2 is r2 here)
68 sub_s r1,r0,1
69 bic_s r1,r1,r0
70 norm r1,r1
71 ; slow track insn
72 and r1,r1,24
73 asl_s r2,r2,r1
74 asl_s r12,r12,r1
75 lsr_s r2,r2,1
76 lsr_s r12,r12,1
77 j_s.d [blink]
78 sub r0,r2,r12 ; (delay slot) result
79#else /* BIG ENDIAN */
80.Last_cmp:
81 neg_s SHIFT,SHIFT ; negated bit count; the shifts below presumably use it modulo 32 to drop the bytes past the end - confirm
82 lsr r4,r4,SHIFT
83 lsr r5,r5,SHIFT
84 ; slow track insn
85.Leven:
86 sub.f r0,r4,r5 ; 0 when the words are equal
87 mov.ne r0,1 ; otherwise 1 ...
88 j_s.d [blink]
89 bset.cs r0,r0,31 ; (delay slot) ... made negative when the subtraction borrowed
90.Lodd:
91 cmp_s WORD2,r12 ; the words are already known to differ; only the order is needed
92
93 mov_s r0,1
94 j_s.d [blink]
95 bset.cs r0,r0,31 ; (delay slot) negative when WORD2 is the lower one
96#endif /* ENDIAN */
97 .balign 4
98.Lbytewise:
99 breq r2,0,.Lnil ; nothing to compare
100 ldb r4,[r0,0] ; r4/r5 = first byte of each area
101 ldb r5,[r1,0]
102 lsr.f lp_count,r3 ; iterations = (count - 1) / 2; carry = low bit of count - 1
103 lpne .Lbyte_end ; hardware loop over pairs of bytes
104 ldb_s r3,[r0,1] ; second byte of the current pair
105 ldb r12,[r1,1]
106 brne r4,r5,.Lbyte_even
107 ldb.a r4,[r0,2] ; advance both pointers by 2 and preload the next pair
108 ldb.a r5,[r1,2]
109 brne r3,r12,.Lbyte_odd
110.Lbyte_end:
111 bcc .Lbyte_even ; carry clear: exactly one byte is left, already in r4/r5
112 brne r4,r5,.Lbyte_even
113 ldb_s r3,[r0,1] ; last byte of each area
114 ldb_s r12,[r1,1]
115.Lbyte_odd:
116 j_s.d [blink]
117 sub r0,r3,r12 ; (delay slot) difference of the bytes in r3/r12
118.Lbyte_even:
119 j_s.d [blink]
120 sub r0,r4,r5 ; (delay slot) difference of the bytes in r4/r5
121.Lnil:
122 j_s.d [blink]
123 mov r0,0 ; (delay slot) zero length compares equal
124ARC_EXIT memcmp
diff --git a/arch/arc/lib/memcpy-700.S b/arch/arc/lib/memcpy-700.S
new file mode 100644
index 000000000000..b64cc10ac918
--- /dev/null
+++ b/arch/arc/lib/memcpy-700.S
@@ -0,0 +1,66 @@
1/*
2 * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <asm/linkage.h>
10
; memcpy: copy a memory area.
;   In:  r0 = destination address, r1 = source address, r2 = number of bytes.
;   Out: r0 is never written here, so the caller gets the destination address back.
;   Scratch: r1-r3, r5, r12, lp_count and the flags. Returns via blink.
;   Word path: used when both addresses are word aligned; it loads whole source words
;   (so it may read up to three bytes past the end of the source) and merges a partial
;   final word into the destination. Byte path: everything else.
11ARC_ENTRY memcpy
12 or r3,r0,r1 ; combine both addresses so a single test covers them
13 asl_s r3,r3,30 ; keep only the two low address bits (moved up to bits 31:30)
14 mov_s r5,r0 ; r5 = running destination pointer
15 brls.d r2,r3,.Lcopy_bytewise ; unsigned count <= r3: count is 0, or a pointer is unaligned and count does not exceed r3
16 sub.f r3,r2,1 ; (delay slot) r3 = count - 1; carry is set when count was 0
17 ld_s r12,[r1,0] ; r12 always holds the word that has been loaded but not yet stored
18 asr.f lp_count,r3,3 ; iterations = (count - 1) / 8
19 bbit0.d r3,2,.Lnox4 ; bit 2 of count - 1 clear: no single word to copy ahead of the loop
20 bmsk_s r2,r2,1 ; (delay slot) r2 = count & 3, the size of a partial final word
21 st.ab r12,[r5,4] ; copy one word up front
22 ld.a r12,[r1,4]
23.Lnox4:
24 lppnz .Lendloop ; hardware loop, 8 bytes per iteration (presumably keyed off the flags from asr.f - confirm)
25 ld_s r3,[r1,4]
26 st.ab r12,[r5,4]
27 ld.a r12,[r1,8]
28 st.ab r3,[r5,4]
29.Lendloop:
30 breq r2,0,.Last_store ; count is a multiple of 4: store the final word whole
31 ld r3,[r5,0] ; destination word whose bytes past the end must survive
32#ifdef __LITTLE_ENDIAN__
33 add3 r2,-1,r2 ; r2 = 8 * (count & 3) - 1, the top bit index to take from the source
34 ; uses long immediate
35 xor_s r12,r12,r3
36 bmsk r12,r12,r2 ; keep the source/destination difference only in the low bytes being copied
37 xor_s r12,r12,r3 ; r12 = source in those low bytes, destination elsewhere
38#else /* BIG ENDIAN */
39 sub3 r2,31,r2 ; r2 = 31 - 8 * (count & 3), the top bit index to keep from the destination
40 ; uses long immediate
41 xor_s r3,r3,r12
42 bmsk r3,r3,r2 ; keep the difference only in the low bytes that must stay as they are
43 xor_s r12,r12,r3 ; r12 = destination in those low bytes, source elsewhere
44#endif /* ENDIAN */
45.Last_store:
46 j_s.d [blink]
47 st r12,[r5,0] ; (delay slot) final word
48
49 .balign 4
50.Lcopy_bytewise:
51 jcs [blink] ; carry from sub.f above: count was 0, nothing to do
52 ldb_s r12,[r1,0] ; r12 always holds the byte that has been loaded but not yet stored
53 lsr.f lp_count,r3 ; iterations = (count - 1) / 2; carry = low bit of count - 1
54 bhs_s .Lnox1 ; carry clear: no single byte to copy ahead of the loop
55 stb.ab r12,[r5,1] ; copy one byte up front
56 ldb.a r12,[r1,1]
57.Lnox1:
58 lppnz .Lendbloop ; hardware loop, 2 bytes per iteration
59 ldb_s r3,[r1,1]
60 stb.ab r12,[r5,1]
61 ldb.a r12,[r1,2]
62 stb.ab r3,[r5,1]
63.Lendbloop:
64 j_s.d [blink]
65 stb r12,[r5,0] ; (delay slot) final byte
66ARC_EXIT memcpy
diff --git a/arch/arc/lib/memset.S b/arch/arc/lib/memset.S
new file mode 100644
index 000000000000..9b2d88d2e141
--- /dev/null
+++ b/arch/arc/lib/memset.S
@@ -0,0 +1,59 @@
1/*
2 * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <asm/linkage.h>
10
11#define SMALL 7 /* Must be at least 6 to deal with alignment/loop issues. */
12
; memset: fill a memory area with one byte value.
;   In:  r0 = start address, r1 = fill value (only its low byte is used), r2 = length in bytes.
;   Out: r0 is never written here, so the caller gets the start address back.
;   Scratch: r1-r4, r12, lp_count and the flags. Returns via blink.
;   Three cases: address and length both multiples of 4 (word loop only); otherwise a
;   length of at most SMALL (byte loop); otherwise the unaligned head and tail are filled
;   with byte/halfword stores and the middle with the word loop.
13ARC_ENTRY memset
14 mov_s r4,r0 ; r4 = running store pointer
15 or r12,r0,r2
16 bmsk.f r12,r12,1 ; Z set when address and length are both multiples of 4
17 extb_s r1,r1 ; keep only the low byte of the fill value
18 asl r3,r1,8
19 beq.d .Laligned
20 or_s r1,r1,r3 ; (delay slot, runs on both paths) fill byte replicated into 16 bits
21 brls r2,SMALL,.Ltiny ; short fills are done byte by byte
22 add r3,r2,r0 ; r3 = end address
23 stb r1,[r3,-1] ; tail: last byte ...
24 bclr_s r3,r3,0 ; ... then round the end down to an even address ...
25 stw r1,[r3,-2] ; ... and fill the halfword below it
26 bmsk.f r12,r0,1 ; r12 = start address & 3, with flags
27 add_s r2,r2,r12
28 sub.ne r2,r2,4 ; unaligned start: r2 = length minus the bytes up to the next word boundary (assumes add_s leaves the flags alone - confirm)
29 stb.ab r1,[r4,1] ; head: first byte ...
30 and r4,r4,-2 ; ... then an even address ...
31 stw.ab r1,[r4,2] ; ... a halfword ...
32 and r4,r4,-4 ; ... and round down to a word boundary
33.Laligned: ; This code address should be aligned for speed.
34 asl r3,r1,16
35 lsr.f lp_count,r2,2 ; iterations = remaining length / 4
36 or_s r1,r1,r3 ; fill byte replicated into all 32 bits
37 lpne .Loop_end ; hardware loop, one word per iteration
38 st.ab r1,[r4,4]
39.Loop_end:
40 j_s [blink]
41
42 .balign 4
43.Ltiny:
44 mov.f lp_count,r2 ; one iteration per byte; zero length skips the loop
45 lpne .Ltiny_end
46 stb.ab r1,[r4,1]
47.Ltiny_end:
48 j_s [blink]
49ARC_EXIT memset
50
51; memzero: @r0 = mem, @r1 = size_t
52; memset: @r0 = mem, @r1 = char, @r2 = size_t
53
; memzero: zero a memory area by re-arranging the arguments and branching to memset.
;   In: r0 = start address, r1 = length in bytes.
54ARC_ENTRY memzero
55 ; adjust bzero args to memset args
56 mov r2, r1
57 mov r1, 0
58 b memset ;tail call, so no need to tinker with blink: memset returns straight to our caller
59ARC_EXIT memzero
diff --git a/arch/arc/lib/strchr-700.S b/arch/arc/lib/strchr-700.S
new file mode 100644
index 000000000000..99c10475d477
--- /dev/null
+++ b/arch/arc/lib/strchr-700.S
@@ -0,0 +1,123 @@
1/*
2 * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9/* ARC700 has a relatively long pipeline and branch prediction, so we want
10 to avoid branches that are hard to predict. On the other hand, the
11 presence of the norm instruction makes it easier to operate on whole
12 words branch-free. */
13
14#include <asm/linkage.h>
15
; strchr: find the first occurrence of a byte value in a NUL-terminated string.
;   In:  r0 = string address, r1 = value to look for (only its low byte is used).
;   Out: r0 = address of the first match, or 0 if the terminator comes first.
;   Scratch: r1-r7, r12 and the flags. Returns via blink.
;   The string is read one aligned word at a time. Register roles after setup:
;     r3 = 0x01010101, r4 = 0x80808080, r5 = the searched byte replicated four times,
;     r2 = current word, r6 = current word XOR r5 (a zero byte marks a match),
;     r7 = on the unaligned entry, r3 with the bytes in front of the string masked off.
;   Zero bytes in a word w are detected with (w - 0x01010101) & ~w & 0x80808080.
;   Big endian fix: on the unaligned entry the match indicator used to overwrite r7.
;   A borrow out of a matching valid byte could then flag a byte in front of the string,
;   and .Lfound_char returned an address before the start of the buffer. The indicator
;   is now kept in r12, masked with r7, and handed to .Lfound_char_b.
;   NOTE(review): the extra big endian instructions change the address of .Laligned and
;   .Loop, which the comments below want unaligned/aligned for speed - re-check if it matters.
16ARC_ENTRY strchr
17 extb_s r1,r1 ; keep only the low byte of the searched value
18 asl r5,r1,8
19 bmsk r2,r0,1 ; r2 = string address & 3
20 or r5,r5,r1 ; searched byte replicated into 16 bits
21 mov_s r3,0x01010101
22 breq.d r2,r0,.Laligned ; NOTE(review): compares with r0, not 0, so this is taken only for addresses below 4; the path below also handles aligned strings - confirm intent
23 asl r4,r5,16 ; (delay slot)
24 sub_s r0,r0,r2 ; round the address down to a word boundary
25 asl r7,r2,3 ; bit offset of the first string byte within that word
26 ld_s r2,[r0]
27#ifdef __LITTLE_ENDIAN__
28 asl r7,r3,r7 ; r7 = 0x01 in every byte at or after the string start
29#else
30 lsr r7,r3,r7 ; r7 = 0x01 in every byte at or after the string start
31#endif
32 or r5,r5,r4 ; searched byte replicated into all four bytes
33 ror r4,r3 ; r4 = 0x80808080
34 sub r12,r2,r7
35 bic_s r12,r12,r2
36 and r12,r12,r4 ; NUL indicator for the first word
37 brne.d r12,0,.Lfound0_ua
38 xor r6,r2,r5 ; (delay slot) zero byte in r6 = matching byte
39 ld.a r2,[r0,4] ; preload the next word, r0 advances
40 sub r12,r6,r7
41 bic r12,r12,r6
#ifdef __LITTLE_ENDIAN__
42 and r7,r12,r4
43 breq r7,0,.Loop ; For speed, we want this branch to be unaligned.
44 b .Lfound_char ; Likewise this one.
#else
 and r12,r12,r4 ; match indicator; r7 must survive as the alignment mask
 breq r12,0,.Loop ; For speed, we want this branch to be unaligned.
 lsr_s r12,r12,7 ; indicator moved to bit 0 of each byte
 bic r2,r7,r6 ; valid bytes whose bit 0 is clear in r6
 b.d .Lfound_char_b
 and_s r2,r2,r12 ; (delay slot) matches restricted to bytes that belong to the string
#endif
45; /* We require this code address to be unaligned for speed... */
46.Laligned:
47 ld_s r2,[r0]
48 or r5,r5,r4
49 ror r4,r3
50; /* ... so that this code address is aligned, for itself and ... */
51.Loop:
52 sub r12,r2,r3
53 bic_s r12,r12,r2
54 and r12,r12,r4 ; NUL indicator for the current word
55 brne.d r12,0,.Lfound0
56 xor r6,r2,r5 ; (delay slot) zero byte in r6 = matching byte
57 ld.a r2,[r0,4] ; preload the next word, r0 advances
58 sub r12,r6,r3
59 bic r12,r12,r6
60 and r7,r12,r4 ; match indicator for the current word
61 breq r7,0,.Loop /* ... so that this branch is unaligned. */
62 ; Found searched-for character. r0 has already advanced to next word.
63#ifdef __LITTLE_ENDIAN__
64/* We only need the information about the first matching byte
65 (i.e. the least significant matching byte) to be exact,
66 hence there is no problem with carry effects. */
67.Lfound_char:
68 sub r3,r7,1
69 bic r3,r3,r7 ; mask of the bits below the lowest indicator bit
70 norm r2,r3
71 sub_s r0,r0,1
72 asr_s r2,r2,3
73 j.d [blink]
74 sub_s r0,r0,r2 ; (delay slot) address of the matching byte
75
76 .balign 4
77.Lfound0_ua:
78 mov r3,r7 ; use the start-masked constant for the first word
79.Lfound0:
80 sub r3,r6,r3 ; the word holds a NUL: look for a match as well
81 bic r3,r3,r6
82 and r2,r3,r4 ; match indicator
83 or_s r12,r12,r2 ; NUL and match indicators combined
84 sub_s r3,r12,1
85 bic_s r3,r3,r12
86 norm r3,r3
87 add_s r0,r0,3
88 asr_s r12,r3,3
89 asl.f 0,r2,r3 ; flags tell whether the first flagged byte is a match
90 sub_s r0,r0,r12
91 j_s.d [blink]
92 mov.pl r0,0 ; (delay slot) the NUL came first: not found
93#else /* BIG ENDIAN */
94.Lfound_char:
95 lsr r7,r7,7 ; indicator moved to bit 0 of each byte
96
97 bic r2,r7,r6 ; drop indicators on bytes whose bit 0 is set in r6
.Lfound_char_b: ; entry with the final indicator already in r2
98 norm r2,r2
99 sub_s r0,r0,4
100 asr_s r2,r2,3
101 j.d [blink]
102 add_s r0,r0,r2 ; (delay slot) address of the matching byte
103
104.Lfound0_ua:
105 mov_s r3,r7 ; use the start-masked constant for the first word
106.Lfound0:
107 asl_s r2,r2,7
108 or r7,r6,r4
109 bic_s r12,r12,r2
110 sub r2,r7,r3
111 or r2,r2,r6
112 bic r12,r2,r12
113 bic.f r3,r4,r12
114 norm r3,r3
115
116 add.pl r3,r3,1
117 asr_s r12,r3,3
118 asl.f 0,r2,r3
119 add_s r0,r0,r12
120 j_s.d [blink]
121 mov.mi r0,0 ; (delay slot) the NUL came first: not found
122#endif /* ENDIAN */
123ARC_EXIT strchr
diff --git a/arch/arc/lib/strcmp.S b/arch/arc/lib/strcmp.S
new file mode 100644
index 000000000000..5dc802b45cf3
--- /dev/null
+++ b/arch/arc/lib/strcmp.S
@@ -0,0 +1,96 @@
1/*
2 * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9/* This is optimized primarily for the ARC700.
10 It would be possible to speed up the loops by one cycle / word,
11 respectively one cycle / byte, by forcing double source 1 alignment, unrolling
12 by a factor of two, and speculatively loading the second word / byte of
13 source 1; however, that would increase the overhead for loop setup / finish,
14 and strcmp might often terminate early. */
15
16#include <asm/linkage.h>
17
; strcmp: compare two NUL-terminated strings.
;   In:  r0, r1 = addresses of the two strings.
;   Out: r0 = 0 if the strings are equal, otherwise negative or positive according to
;        the order of the first differing bytes (r0 string relative to r1 string).
;   Scratch: r1-r5, r12 and the flags. Returns via blink.
;   Word loop when both addresses are word aligned, byte loop otherwise.
;   Zero bytes in a word w are detected with (w - 0x01010101) & ~w & 0x80808080.
18ARC_ENTRY strcmp
19 or r2,r0,r1
20 bmsk_s r2,r2,1 ; low two bits of either address
21 brne r2,0,.Lcharloop ; not both word aligned: compare byte by byte
22 mov_s r12,0x01010101
23 ror r5,r12 ; r5 = 0x80808080
24.Lwordloop:
25 ld.ab r2,[r0,4] ; next word of each string, pointers advance
26 ld.ab r3,[r1,4]
27 nop_s
28 sub r4,r2,r12
29 bic r4,r4,r2
30 and r4,r4,r5 ; r4 = NUL indicator for the word from the r0 string
31 brne r4,0,.Lfound0
32 breq r2,r3,.Lwordloop
33#ifdef __LITTLE_ENDIAN__
34 xor r0,r2,r3 ; mask for difference
35 sub_s r1,r0,1
36 bic_s r0,r0,r1 ; mask for least significant difference bit
37 sub r1,r5,r0
38 xor r0,r5,r1 ; mask for least significant difference byte
39 and_s r2,r2,r0
40 and_s r3,r3,r0
41#endif /* LITTLE ENDIAN */
42 cmp_s r2,r3 ; the words differ; only the order is needed
43 mov_s r0,1
44 j_s.d [blink]
45 bset.lo r0,r0,31 ; (delay slot) negative when r2 is the lower one
46
47 .balign 4
48#ifdef __LITTLE_ENDIAN__
49.Lfound0:
50 xor r0,r2,r3 ; mask for difference
51 or r0,r0,r4 ; or in zero indicator
52 sub_s r1,r0,1
53 bic_s r0,r0,r1 ; mask for least significant difference bit
54 sub r1,r5,r0
55 xor r0,r5,r1 ; mask for least significant difference byte
56 and_s r2,r2,r0
57 and_s r3,r3,r0
58 sub.f r0,r2,r3 ; 0 when the strings end equal
59 mov.hi r0,1
60 j_s.d [blink]
61 bset.lo r0,r0,31 ; (delay slot) negative when r2 is the lower one
62#else /* BIG ENDIAN */
63 /* The zero-detection above can mis-detect 0x01 bytes as zeroes
64 because of carry-propagation from a lower significant zero byte.
65 We can compensate for this by checking that bit0 is zero.
66 This compensation is not necessary in the step where we
67 get a low estimate for r2, because in any affected bytes
68 we already have 0x00 or 0x01, which will remain unchanged
69 when bit 7 is cleared. */
70 .balign 4
71.Lfound0:
72 lsr r0,r4,8
73 lsr_s r1,r2
74 bic_s r2,r2,r0 ; get low estimate for r2 and get ...
75 bic_s r0,r0,r1 ; <this is the adjusted mask for zeros>
76 or_s r3,r3,r0 ; ... high estimate r3 so that r2 > r3 will ...
77 cmp_s r3,r2 ; ... be independent of trailing garbage
78 or_s r2,r2,r0 ; likewise for r3 > r2
79 bic_s r3,r3,r0
80 rlc r0,0 ; r0 := r2 > r3 ? 1 : 0
81 cmp_s r2,r3
82 j_s.d [blink]
83 bset.lo r0,r0,31
84#endif /* ENDIAN */
85
86 .balign 4
87.Lcharloop:
88 ldb.ab r2,[r0,1] ; next byte of each string, pointers advance
89 ldb.ab r3,[r1,1]
90 nop_s
91 breq r2,0,.Lcmpend ; end of the r0 string
92 breq r2,r3,.Lcharloop
93.Lcmpend:
94 j_s.d [blink]
95 sub r0,r2,r3 ; (delay slot) difference of the last bytes read
96ARC_EXIT strcmp
diff --git a/arch/arc/lib/strcpy-700.S b/arch/arc/lib/strcpy-700.S
new file mode 100644
index 000000000000..b7ca4ae81d88
--- /dev/null
+++ b/arch/arc/lib/strcpy-700.S
@@ -0,0 +1,70 @@
1/*
2 * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9/* If dst and src are 4 byte aligned, copy 8 bytes at a time.
10 If the src is 4, but not 8 byte aligned, we first read 4 bytes to get
11 it 8 byte aligned. Thus, we can do a little read-ahead, without
12 dereferencing a cache line that we should not touch.
13 Note that short and long instructions have been scheduled to avoid
14 branch stalls.
15 The bne_s to r3z could be made unaligned & long to avoid a stall
16 there, but it is not likely to be taken often, and it
17 would also be likely to cost an unaligned mispredict at the next call. */
18
19#include <asm/linkage.h>
20
; strcpy: copy a NUL-terminated string, terminator included.
;   In:  r0 = destination address, r1 = source address.
;   Out: r0 is never written here, so the caller gets the destination address back.
;   Scratch: r1-r4, r8, r10, r12 and the flags. Returns via blink.
;   Word loop when both addresses are word aligned, byte loop otherwise.
;   Zero bytes in a word w are detected with (w - 0x01010101) & ~w & 0x80808080.
21ARC_ENTRY strcpy
22 or r2,r0,r1
23 bmsk_s r2,r2,1 ; low two bits of either address
24 brne.d r2,0,charloop ; not both word aligned: copy byte by byte
25 mov_s r10,r0 ; (delay slot) r10 = running destination pointer
26 ld_s r3,[r1,0] ; first source word
27 mov r8,0x01010101
28 bbit0.d r1,2,loop_start ; source already 8-byte aligned: enter the loop at its second half
29 ror r12,r8 ; (delay slot) r12 = 0x80808080
30 sub r2,r3,r8
31 bic_s r2,r2,r3
32 tst_s r2,r12 ; NUL in the first word?
33 bne r3z
34 mov_s r4,r3 ; no: hand it to the loop as the word waiting to be stored
35 .balign 4
36loop:
37 ld.a r3,[r1,4] ; next source word
38 st.ab r4,[r10,4] ; store the previous, NUL-free word
39loop_start:
40 ld.a r4,[r1,4] ; read ahead one word before r3 has been checked
41 sub r2,r3,r8
42 bic_s r2,r2,r3
43 tst_s r2,r12 ; NUL in r3?
44 bne_s r3z
45 st.ab r3,[r10,4]
46 sub r2,r4,r8
47 bic r2,r2,r4
48 tst r2,r12 ; NUL in r4?
49 beq loop
50 mov_s r3,r4 ; r4 holds the terminator: finish it off through r3z
51#ifdef __LITTLE_ENDIAN__
52r3z: bmsk.f r1,r3,7 ; r1 = lowest byte of r3, flags say whether it is the NUL
53 lsr_s r3,r3,8 ; move the next byte into place
54#else
55r3z: lsr.f r1,r3,24 ; r1 = highest byte of r3, flags say whether it is the NUL
56 asl_s r3,r3,8 ; move the next byte into place
57#endif
58 bne.d r3z ; keep going until the NUL itself has been stored (assumes the _s shift above leaves the flags alone - confirm)
59 stb.ab r1,[r10,1] ; (delay slot) store the byte
60 j_s [blink]
61
62 .balign 4
63charloop:
64 ldb.ab r3,[r1,1] ; next source byte, pointer advances
65
66
67 brne.d r3,0,charloop
68 stb.ab r3,[r10,1] ; (delay slot) store the byte, the final NUL included
69 j [blink]
70ARC_EXIT strcpy
diff --git a/arch/arc/lib/strlen.S b/arch/arc/lib/strlen.S
new file mode 100644
index 000000000000..39759e099696
--- /dev/null
+++ b/arch/arc/lib/strlen.S
@@ -0,0 +1,83 @@
1/*
2 * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <asm/linkage.h>
10
; strlen: length of a NUL-terminated string.
;   In:  r0 = string address.
;   Out: r0 = number of bytes in front of the terminating NUL.
;   Scratch: r1-r7, r12 and the flags. Returns via blink.
;   The string is read as pairs of words starting at the 8-byte aligned address at or
;   below r0, so bytes in front of the string are read as well and then masked out of
;   the zero test. Zero bytes in a word w are detected with
;   (w - 0x01010101) & ~w & 0x80808080; r4 and r5 hold those two constants.
11ARC_ENTRY strlen
12 or r3,r0,7 ; r3 = last byte address of the 8-byte block holding the string start
13 ld r2,[r3,-7] ; r2 = first word of that block
14 ld.a r6,[r3,-3] ; r6 = second word; r3 now points at it
15 mov r4,0x01010101
16 ; uses long immediate
17#ifdef __LITTLE_ENDIAN__
18 asl_s r1,r0,3 ; bit offset of the string start (the shift below presumably uses it modulo 32 - confirm)
19 btst_s r0,2 ; Z set when the string starts in the first word of the block
20 asl r7,r4,r1 ; r7 = 0x01 only in the bytes at or after the string start
21 ror r5,r4 ; r5 = 0x80808080
22 sub r1,r2,r7
23 bic_s r1,r1,r2 ; first word, bytes in front of the start ignored
24 mov.eq r7,r4 ; start in the first word: the second word is tested in full
25 sub r12,r6,r7
26 bic r12,r12,r6
27 or.eq r12,r12,r1 ; start in the first word: include its result
28 and r12,r12,r5 ; r12 = NUL indicator for the block
29 brne r12,0,.Learly_end
30#else /* BIG ENDIAN */
31 ror r5,r4 ; r5 = 0x80808080
32 btst_s r0,2 ; Z set when the string starts in the first word of the block
33 mov_s r1,31
34 sub3 r7,r1,r0 ; r7 = 31 - 8 * r0, used as the top bit index to keep (presumably modulo 32 - confirm)
35 sub r1,r2,r4
36 bic_s r1,r1,r2
37 bmsk r1,r1,r7 ; first word, bytes in front of the start masked off
38 sub r12,r6,r4
39 bic r12,r12,r6
40 bmsk.ne r12,r12,r7 ; start in the second word: mask that one instead
41 or.eq r12,r12,r1 ; start in the first word: include its result
42 and r12,r12,r5 ; r12 = NUL indicator for the block
43 brne r12,0,.Learly_end
44#endif /* ENDIAN */
45
46.Loop:
47 ld_s r2,[r3,4] ; next pair of words; r3 ends up pointing at the second one
48 ld.a r6,[r3,8]
49 ; stall for load result
50 sub r1,r2,r4
51 bic_s r1,r1,r2 ; first word of the pair
52 sub r12,r6,r4
53 bic r12,r12,r6 ; second word of the pair
54 or r12,r12,r1
55 and r12,r12,r5 ; combined NUL indicator
56 breq r12,0,.Loop
57.Lend:
58 and.f r1,r1,r5 ; is the NUL in the first word of the pair?
59 sub.ne r3,r3,4 ; yes: step r3 back to that word
60 mov.eq r1,r12 ; no: use the indicator of the second word
61#ifdef __LITTLE_ENDIAN__
62 sub_s r2,r1,1
63 bic_s r2,r2,r1 ; mask of the bits below the lowest indicator bit
64 norm r1,r2
65 sub_s r0,r0,3
66 lsr_s r1,r1,3
67 sub r0,r3,r0
68 j_s.d [blink]
69 sub r0,r0,r1 ; (delay slot) r0 = address of the NUL minus the string address
70#else /* BIG ENDIAN */
71 lsr_s r1,r1,7 ; indicator moved to bit 0 of each byte
72 mov.eq r2,r6 ; NUL in the second word: check against that word
73 bic_s r1,r1,r2 ; drop indicators on bytes whose bit 0 is set (false hits on 0x01)
74 norm r1,r1
75 sub r0,r3,r0
76 lsr_s r1,r1,3
77 j_s.d [blink]
78 add r0,r0,r1 ; (delay slot) r0 = address of the NUL minus the string address
79#endif /* ENDIAN */
80.Learly_end:
81 b.d .Lend
82 sub_s.ne r1,r1,r1 ; (delay slot) start was in the second word: discard the first word result (relies on the flags from btst_s still being live - confirm)
83ARC_EXIT strlen