author:    zhichang.yuan <zhichang.yuan@linaro.org>    2014-04-28 01:11:30 -0400
committer: Catalin Marinas <catalin.marinas@arm.com>   2014-05-23 10:07:35 -0400
commit:    280adc1951c0c9fc8f2d85571ff563a1c412b1cd (patch)
tree:      89be4d00570a72fb8b4c30216f6b45d017181e78 /arch/arm64
parent:    808dbac6b51f3441eb5a07724c0b0d1257046d51 (diff)
arm64: lib: Implement optimized memmove routine
This patch, based on Linaro's Cortex Strings library, improves the
performance of the assembly optimized memmove() function.

Signed-off-by: Zhichang Yuan <zhichang.yuan@linaro.org>
Signed-off-by: Deepak Saxena <dsaxena@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
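For orientation, the checks at the top of the new routine (cmp dstin, src / b.lo memcpy, followed by the no-overlap test against src + count) amount to the C logic below. This is an illustrative sketch only, not the kernel's code: the function name is made up, and it assumes a forward-copying memcpy, which is what the assembly tail-calls in those two cases.

#include <stddef.h>
#include <string.h>

/*
 * Rough C model of the entry checks in the new memmove: if the
 * destination starts below the source, or the regions do not overlap
 * at all, a plain forward copy (memcpy) is safe; otherwise the buffer
 * is copied backwards, from the last byte toward the first.
 */
static void *memmove_sketch(void *dest, const void *src, size_t count)
{
        unsigned char *d = dest;
        const unsigned char *s = src;

        if ((const unsigned char *)d < s || (const unsigned char *)d >= s + count)
                return memcpy(dest, src, count);        /* no harmful overlap */

        while (count--)                 /* overlapping, dest above src */
                d[count] = s[count];
        return dest;
}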
Diffstat (limited to 'arch/arm64')
-rw-r--r--   arch/arm64/lib/memmove.S   190
1 file changed, 165 insertions(+), 25 deletions(-)
diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S
index b79fdfa42d39..57b19ea2dad4 100644
--- a/arch/arm64/lib/memmove.S
+++ b/arch/arm64/lib/memmove.S
@@ -1,5 +1,13 @@
 /*
  * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -16,6 +24,7 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 
 /*
  * Move a buffer from src to test (alignment handled by the hardware).
@@ -28,30 +37,161 @@
  * Returns:
  *      x0 - dest
  */
+dstin   .req    x0
+src     .req    x1
+count   .req    x2
+tmp1    .req    x3
+tmp1w   .req    w3
+tmp2    .req    x4
+tmp2w   .req    w4
+tmp3    .req    x5
+tmp3w   .req    w5
+dst     .req    x6
+
+A_l     .req    x7
+A_h     .req    x8
+B_l     .req    x9
+B_h     .req    x10
+C_l     .req    x11
+C_h     .req    x12
+D_l     .req    x13
+D_h     .req    x14
+
 ENTRY(memmove)
-        cmp     x0, x1
-        b.ls    memcpy
-        add     x4, x0, x2
-        add     x1, x1, x2
-        subs    x2, x2, #8
-        b.mi    2f
-1:      ldr     x3, [x1, #-8]!
-        subs    x2, x2, #8
-        str     x3, [x4, #-8]!
-        b.pl    1b
-2:      adds    x2, x2, #4
-        b.mi    3f
-        ldr     w3, [x1, #-4]!
-        sub     x2, x2, #4
-        str     w3, [x4, #-4]!
-3:      adds    x2, x2, #2
-        b.mi    4f
-        ldrh    w3, [x1, #-2]!
-        sub     x2, x2, #2
-        strh    w3, [x4, #-2]!
-4:      adds    x2, x2, #1
-        b.mi    5f
-        ldrb    w3, [x1, #-1]
-        strb    w3, [x4, #-1]
-5:      ret
+        cmp     dstin, src
+        b.lo    memcpy
+        add     tmp1, src, count
+        cmp     dstin, tmp1
+        b.hs    memcpy          /* No overlap. */
+
+        add     dst, dstin, count
+        add     src, src, count
+        cmp     count, #16
+        b.lo    .Ltail15  /*probably non-alignment accesses.*/
+
+        ands    tmp2, src, #15  /* Bytes to reach alignment. */
+        b.eq    .LSrcAligned
+        sub     count, count, tmp2
+        /*
+        * process the aligned offset length to make the src aligned firstly.
+        * those extra instructions' cost is acceptable. It also make the
+        * coming accesses are based on aligned address.
+        */
+        tbz     tmp2, #0, 1f
+        ldrb    tmp1w, [src, #-1]!
+        strb    tmp1w, [dst, #-1]!
+1:
+        tbz     tmp2, #1, 2f
+        ldrh    tmp1w, [src, #-2]!
86 strh tmp1w, [dst, #-2]!
872:
88 tbz tmp2, #2, 3f
89 ldr tmp1w, [src, #-4]!
90 str tmp1w, [dst, #-4]!
913:
92 tbz tmp2, #3, .LSrcAligned
93 ldr tmp1, [src, #-8]!
94 str tmp1, [dst, #-8]!
95
96.LSrcAligned:
97 cmp count, #64
98 b.ge .Lcpy_over64
99
100 /*
101 * Deal with small copies quickly by dropping straight into the
102 * exit block.
103 */
104.Ltail63:
105 /*
106 * Copy up to 48 bytes of data. At this point we only need the
107 * bottom 6 bits of count to be accurate.
108 */
109 ands tmp1, count, #0x30
110 b.eq .Ltail15
111 cmp tmp1w, #0x20
112 b.eq 1f
113 b.lt 2f
114 ldp A_l, A_h, [src, #-16]!
115 stp A_l, A_h, [dst, #-16]!
1161:
117 ldp A_l, A_h, [src, #-16]!
118 stp A_l, A_h, [dst, #-16]!
1192:
120 ldp A_l, A_h, [src, #-16]!
121 stp A_l, A_h, [dst, #-16]!
122
123.Ltail15:
124 tbz count, #3, 1f
125 ldr tmp1, [src, #-8]!
126 str tmp1, [dst, #-8]!
1271:
128 tbz count, #2, 2f
129 ldr tmp1w, [src, #-4]!
130 str tmp1w, [dst, #-4]!
1312:
132 tbz count, #1, 3f
133 ldrh tmp1w, [src, #-2]!
134 strh tmp1w, [dst, #-2]!
1353:
136 tbz count, #0, .Lexitfunc
137 ldrb tmp1w, [src, #-1]
138 strb tmp1w, [dst, #-1]
139
140.Lexitfunc:
141 ret
142
143.Lcpy_over64:
144 subs count, count, #128
145 b.ge .Lcpy_body_large
146 /*
147 * Less than 128 bytes to copy, so handle 64 bytes here and then jump
148 * to the tail.
149 */
150 ldp A_l, A_h, [src, #-16]
151 stp A_l, A_h, [dst, #-16]
152 ldp B_l, B_h, [src, #-32]
153 ldp C_l, C_h, [src, #-48]
154 stp B_l, B_h, [dst, #-32]
155 stp C_l, C_h, [dst, #-48]
156 ldp D_l, D_h, [src, #-64]!
157 stp D_l, D_h, [dst, #-64]!
158
159 tst count, #0x3f
160 b.ne .Ltail63
161 ret
162
163 /*
164 * Critical loop. Start at a new cache line boundary. Assuming
165 * 64 bytes per line this ensures the entire loop is in one line.
166 */
167 .p2align L1_CACHE_SHIFT
168.Lcpy_body_large:
169 /* pre-load 64 bytes data. */
170 ldp A_l, A_h, [src, #-16]
171 ldp B_l, B_h, [src, #-32]
172 ldp C_l, C_h, [src, #-48]
173 ldp D_l, D_h, [src, #-64]!
1741:
175 /*
176 * interlace the load of next 64 bytes data block with store of the last
177 * loaded 64 bytes data.
178 */
179 stp A_l, A_h, [dst, #-16]
180 ldp A_l, A_h, [src, #-16]
181 stp B_l, B_h, [dst, #-32]
182 ldp B_l, B_h, [src, #-32]
183 stp C_l, C_h, [dst, #-48]
184 ldp C_l, C_h, [src, #-48]
185 stp D_l, D_h, [dst, #-64]!
186 ldp D_l, D_h, [src, #-64]!
187 subs count, count, #64
188 b.ge 1b
189 stp A_l, A_h, [dst, #-16]
190 stp B_l, B_h, [dst, #-32]
191 stp C_l, C_h, [dst, #-48]
192 stp D_l, D_h, [dst, #-64]!
193
194 tst count, #0x3f
195 b.ne .Ltail63
196 ret
 ENDPROC(memmove)
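The new body falls into three stages, marked by the labels above: a short head copy that moves 1, 2, 4 and 8 bytes as needed so that src ends up 16-byte aligned, a main loop that moves 64 bytes per iteration with paired loads and stores (each block is fully loaded into registers before any of it is stored, which is what keeps overlapping copies correct), and a tail of at most 63 bytes peeled off by testing the low bits of count. A rough C model of that backward structure follows, for illustration only and with a hypothetical helper name; the real routine uses ldp/stp register pairs and pre-decrement addressing rather than byte loops.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Rough C model of the backward copy above: pointers start one past
 * the end, a small head copy aligns src to 16 bytes, 64-byte blocks
 * are then moved (the real code reads each block completely into
 * registers before writing it out), and the remaining 0-63 bytes are
 * handled as a tail.
 */
static void backward_copy_sketch(unsigned char *dst,
                                 const unsigned char *src, size_t count)
{
        dst += count;
        src += count;

        if (count >= 16) {
                size_t head = (uintptr_t)src & 15;      /* bytes to reach alignment */

                count -= head;
                while (head--)
                        *--dst = *--src;

                while (count >= 64) {
                        unsigned char block[64];

                        src -= 64;
                        dst -= 64;
                        memcpy(block, src, 64); /* stands in for ldp of A/B/C/D */
                        memcpy(dst, block, 64); /* stands in for stp of A/B/C/D */
                        count -= 64;
                }
        }

        while (count--)         /* tail of up to 63 bytes (the tbz tests above) */
                *--dst = *--src;
}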