path: root/arch/sparc/lib/NG4memcpy.S
author    Linus Torvalds <torvalds@linux-foundation.org>  2012-10-02 15:57:42 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2012-10-02 15:57:42 -0400
commit a20acf99f75e49271381d65db097c9763060a1e8 (patch)
tree   3cf661125e86b7625171b96b885bf5395f62e684 /arch/sparc/lib/NG4memcpy.S
parent 437589a74b6a590d175f86cf9f7b2efcee7765e7 (diff)
parent 42a4172b6ebb4a419085c6caee7c135e51cae5ea (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc-next
Pull sparc updates from David Miller:
 "Largely this is simply adding support for the Niagara 4 cpu.  Major
  areas are perf events (chip now supports 4 counters and can monitor
  any event on each counter), crypto (opcodes are available for sha1,
  sha256, sha512, md5, crc32c, AES, DES, CAMELLIA, and Kasumi although
  the last is unsupported since we lack a generic crypto layer Kasumi
  implementation), and an optimized memcpy.

  Finally some cleanups by Peter Senna Tschudin."

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc-next: (47 commits)
  sparc64: Fix trailing whitespace in NG4 memcpy.
  sparc64: Fix comment type in NG4 copy from user.
  sparc64: Add SPARC-T4 optimized memcpy.
  drivers/sbus/char: removes unnecessary semicolon
  arch/sparc/kernel/pci_sun4v.c: removes unnecessary semicolon
  sparc64: Fix function argument comment in camellia_sparc64_key_expand asm.
  sparc64: Fix IV handling bug in des_sparc64_cbc_decrypt
  sparc64: Add auto-loading mechanism to crypto-opcode drivers.
  sparc64: Add missing pr_fmt define to crypto opcode drivers.
  sparc64: Adjust crypto priorities.
  sparc64: Use cpu_pgsz_mask for linear kernel mapping config.
  sparc64: Probe cpu page size support more portably.
  sparc64: Support 2GB and 16GB page sizes for kernel linear mappings.
  sparc64: Fix bugs in unrolled 256-bit loops.
  sparc64: Avoid code duplication in crypto assembler.
  sparc64: Unroll CTR crypt loops in AES driver.
  sparc64: Unroll ECB decryption loops in AES driver.
  sparc64: Unroll ECB encryption loops in AES driver.
  sparc64: Add ctr mode support to AES driver.
  sparc64: Move AES driver over to a methods based implementation.
  ...
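For orientation before reading the assembly below: the new memcpy picks one of four copy paths based purely on length. A minimal C sketch of that dispatch follows; the function and helper names are hypothetical stand-ins (the byte loop substitutes for the optimized .Ltiny/.Lsmall/.Lmedium/.Llarge bodies, which use 64-bit loads, block-initializing stores, and VIS alignment instructions), not kernel APIs.

    #include <stddef.h>

    /* Stand-in for the four optimized copy bodies in NG4memcpy.S. */
    static void *copy_bytes(void *dst, const void *src, size_t len)
    {
            unsigned char *d = dst;
            const unsigned char *s = src;

            while (len--)
                    *d++ = *s++;
            return dst;
    }

    /* Hedged sketch of the length dispatch; thresholds match the asm. */
    void *ng4_memcpy_sketch(void *dst, const void *src, size_t len)
    {
            if (len == 0)
                    return dst;                       /* brz %o2, .Lexit   */
            if (len <= 3)
                    return copy_bytes(dst, src, len); /* .Ltiny            */
            if (len <= 19)
                    return copy_bytes(dst, src, len); /* .Lsmall           */
            if (len < 128)
                    return copy_bytes(dst, src, len); /* .Lmedium          */
            return copy_bytes(dst, src, len);         /* .Llarge, >= 0x80  */
    }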
Diffstat (limited to 'arch/sparc/lib/NG4memcpy.S')
-rw-r--r--   arch/sparc/lib/NG4memcpy.S   360
1 file changed, 360 insertions, 0 deletions
diff --git a/arch/sparc/lib/NG4memcpy.S b/arch/sparc/lib/NG4memcpy.S
new file mode 100644
index 000000000000..9cf2ee01cee3
--- /dev/null
+++ b/arch/sparc/lib/NG4memcpy.S
@@ -0,0 +1,360 @@
/* NG4memcpy.S: Niagara-4 optimized memcpy.
 *
 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define FPRS_FEF  0x04

/* On T4 it is very expensive to access ASRs like %fprs and
 * %asi; avoiding a read or a write can save ~50 cycles.
 */
#define FPU_ENTER			\
	rd	%fprs, %o5;		\
	andcc	%o5, FPRS_FEF, %g0;	\
	be,a,pn	%icc, 999f;		\
	 wr	%g0, FPRS_FEF, %fprs;	\
	999:

#ifdef MEMCPY_DEBUG
#define VISEntryHalf FPU_ENTER; \
	clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf FPU_ENTER
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif

#define GLOBAL_SPARE	%g5
#endif

#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#ifndef MEMCPY_DEBUG
#define STORE(type,src,addr)	type src, [addr]
#else
#define STORE(type,src,addr)	type##a src, [addr] %asi
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NG4memcpy
#endif
#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
#ifdef MEMCPY_DEBUG
	wr		%g0, 0x80, %asi
#endif
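	/* Trap if any bit above bit 30 of the length is set;
	 * lengths of 2GB or more are not supported.
	 */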
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%XCC, 5
	PREAMBLE
	mov		%o0, %o3
	brz,pn		%o2, .Lexit
	 cmp		%o2, 3
	ble,pn		%icc, .Ltiny
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall
	 or		%o0, %o1, %g2
	cmp		%o2, 128
	bl,pn		%icc, .Lmedium
	 nop

.Llarge:/* len >= 0x80 */
	/* First get dest 8 byte aligned.  */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1
	brz,pt		%g1, 51f
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
	add		%o1, 1, %o1
	subcc		%g1, 1, %g1
	add		%o0, 1, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01))

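	/* Prefetch the first 512 bytes of the source ahead of the copy. */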
51:	LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

	/* Check if we can use the straight fully aligned
	 * loop, or we require the alignaddr/faligndata variant.
	 */
	andcc		%o1, 0x7, %o5
	bne,pn		%icc, .Llarge_src_unaligned
	 sub		%g0, %o0, %g1

	/* Legitimize the use of initializing stores by getting dest
	 * to be 64-byte aligned.
	 */
	and		%g1, 0x3f, %g1
	brz,pt		%g1, .Llarge_aligned
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
	add		%o1, 8, %o1
	subcc		%g1, 8, %g1
	add		%o0, 8, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stx, %g2, %o0 - 0x08))

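	/* Main aligned loop: copy 64 bytes per iteration, interleaving
	 * the eight loads with eight cache-initializing stores.
	 */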
.Llarge_aligned:
	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
	andn		%o2, 0x3f, %o4
	sub		%o2, %o4, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	add		%o1, 0x40, %o1
	EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
	subcc		%o4, 0x40, %o4
	EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
	EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
	EX_ST(STORE_INIT(%g1, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0))
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
	EX_ST(STORE_INIT(%g3, %o0))
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
	EX_ST(STORE_INIT(%o5, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g3, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
	add		%o0, 0x08, %o0
	bne,pt		%icc, 1b
	 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

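	/* The initializing stores are weakly ordered; fence so the
	 * copied data is globally visible before we return.
	 */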
	membar		#StoreLoad | #StoreStore

	brz,pn		%o2, .Lexit
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall_unaligned
	 nop
	ba,a,pt		%icc, .Lmedium_noprefetch

.Lexit:	retl
	 mov		EX_RETVAL(%o3), %o0

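	/* Source not 8-byte aligned: use the VIS alignaddr/faligndata
	 * instructions to realign 64 bytes per iteration in the FPU.
	 */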
.Llarge_src_unaligned:
	andn		%o2, 0x3f, %o4
	sub		%o2, %o4, %o2
	VISEntryHalf
	alignaddr	%o1, %g0, %g1
	add		%o1, %o4, %o1
	EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
1:	EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
	subcc		%o4, 0x40, %o4
	EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
	EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
	EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
	EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
	EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
	EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
	faligndata	%f0, %f2, %f16
	EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
	faligndata	%f2, %f4, %f18
	add		%g1, 0x40, %g1
	faligndata	%f4, %f6, %f20
	faligndata	%f6, %f8, %f22
	faligndata	%f8, %f10, %f24
	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST(STORE(std, %f16, %o0 + 0x00))
	EX_ST(STORE(std, %f18, %o0 + 0x08))
	EX_ST(STORE(std, %f20, %o0 + 0x10))
	EX_ST(STORE(std, %f22, %o0 + 0x18))
	EX_ST(STORE(std, %f24, %o0 + 0x20))
	EX_ST(STORE(std, %f26, %o0 + 0x28))
	EX_ST(STORE(std, %f28, %o0 + 0x30))
	EX_ST(STORE(std, %f30, %o0 + 0x38))
	add		%o0, 0x40, %o0
	bne,pt		%icc, 1b
	 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
	VISExitHalf

	brz,pn		%o2, .Lexit
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall_unaligned
	 nop
	ba,a,pt		%icc, .Lmedium_unaligned

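	/* len < 0x80: copy 32 bytes per iteration with ldx/stx, then
	 * mop up the remaining doublewords, word, and bytes.
	 */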
.Lmedium:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc		%g2, 0x7, %g0
	bne,pn		%icc, .Lmedium_unaligned
	 nop
.Lmedium_noprefetch:
	andncc		%o2, 0x20 - 1, %o5
	be,pn		%icc, 2f
	 sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
	add		%o1, 0x20, %o1
	subcc		%o5, 0x20, %o5
	EX_ST(STORE(stx, %g1, %o0 + 0x00))
	EX_ST(STORE(stx, %g2, %o0 + 0x08))
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
	EX_ST(STORE(stx, %o4, %o0 + 0x18))
	bne,pt		%icc, 1b
	 add		%o0, 0x20, %o0
2:	andcc		%o2, 0x18, %o5
	be,pt		%icc, 3f
	 sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	add		%o1, 0x08, %o1
	add		%o0, 0x08, %o0
	subcc		%o5, 0x08, %o5
	bne,pt		%icc, 1b
	 EX_ST(STORE(stx, %g1, %o0 - 0x08))
3:	brz,pt		%o2, .Lexit
	 cmp		%o2, 0x04
	bl,pn		%icc, .Ltiny
	 nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
	add		%o1, 0x04, %o1
	add		%o0, 0x04, %o0
	subcc		%o2, 0x04, %o2
	bne,pn		%icc, .Ltiny
	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
	ba,a,pt		%icc, .Lexit
.Lmedium_unaligned:
	/* First get dest 8 byte aligned.  */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1
	brz,pt		%g1, 2f
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
	add		%o1, 1, %o1
	subcc		%g1, 1, %g1
	add		%o0, 1, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01))
2:
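	/* Now realign the source: read aligned doublewords and merge
	 * adjacent pairs with sllx/srlx to synthesize each store.
	 */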
	and		%o1, 0x7, %g1
	brz,pn		%g1, .Lmedium_noprefetch
	 sll		%g1, 3, %g1
	mov		64, %g2
	sub		%g2, %g1, %g2
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
	sllx		%o4, %g1, %o4
	andn		%o2, 0x08 - 1, %o5
	sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
	add		%o1, 0x08, %o1
	subcc		%o5, 0x08, %o5
	srlx		%g3, %g2, GLOBAL_SPARE
	or		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
	add		%o0, 0x08, %o0
	bne,pt		%icc, 1b
	 sllx		%g3, %g1, %o4
	srl		%g1, 3, %g1
	add		%o1, %g1, %o1
	brz,pn		%o2, .Lexit
	 nop
	ba,pt		%icc, .Lsmall_unaligned

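	/* 1 to 3 bytes: fully unrolled byte copy, exiting as soon as
	 * the length is exhausted.
	 */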
.Ltiny:
	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
	subcc		%o2, 1, %o2
	be,pn		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x00))
	EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
	subcc		%o2, 1, %o2
	be,pn		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x01))
	EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
	ba,pt		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x02))

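	/* 4 to 19 bytes: copy a word at a time when both src and dst
	 * are 4-byte aligned, otherwise fall to the byte loop.
	 */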
.Lsmall:
	andcc		%g2, 0x3, %g0
	bne,pn		%icc, .Lsmall_unaligned
	 andn		%o2, 0x4 - 1, %o5
	sub		%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
	add		%o1, 0x04, %o1
	subcc		%o5, 0x04, %o5
	add		%o0, 0x04, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
	brz,pt		%o2, .Lexit
	 nop
	ba,a,pt		%icc, .Ltiny

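	/* Simple byte-at-a-time loop for small unaligned copies. */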
.Lsmall_unaligned:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
	add		%o1, 1, %o1
	add		%o0, 1, %o0
	subcc		%o2, 1, %o2
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g1, %o0 - 0x01))
	ba,a,pt		%icc, .Lexit
	.size		FUNC_NAME, .-FUNC_NAME