author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-10-02 15:57:42 -0400
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-10-02 15:57:42 -0400
commit | a20acf99f75e49271381d65db097c9763060a1e8 (patch)
tree | 3cf661125e86b7625171b96b885bf5395f62e684 /arch/sparc/lib/NG4memcpy.S
parent | 437589a74b6a590d175f86cf9f7b2efcee7765e7 (diff)
parent | 42a4172b6ebb4a419085c6caee7c135e51cae5ea (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc-next
Pull sparc updates from David Miller:
"Largely this is simply adding support for the Niagara 4 cpu.
Major areas are perf events (chip now supports 4 counters and can
monitor any event on each counter), crypto (opcodes are available for
sha1, sha256, sha512, md5, crc32c, AES, DES, CAMELLIA, and Kasumi
although the last is unsupported since we lack a generic crypto layer
Kasumi implementation), and an optimized memcpy.
Finally some cleanups by Peter Senna Tschudin."
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc-next: (47 commits)
sparc64: Fix trailing whitespace in NG4 memcpy.
sparc64: Fix comment type in NG4 copy from user.
sparc64: Add SPARC-T4 optimized memcpy.
drivers/sbus/char: removes unnecessary semicolon
arch/sparc/kernel/pci_sun4v.c: removes unnecessary semicolon
sparc64: Fix function argument comment in camellia_sparc64_key_expand asm.
sparc64: Fix IV handling bug in des_sparc64_cbc_decrypt
sparc64: Add auto-loading mechanism to crypto-opcode drivers.
sparc64: Add missing pr_fmt define to crypto opcode drivers.
sparc64: Adjust crypto priorities.
sparc64: Use cpu_pgsz_mask for linear kernel mapping config.
sparc64: Probe cpu page size support more portably.
sparc64: Support 2GB and 16GB page sizes for kernel linear mappings.
sparc64: Fix bugs in unrolled 256-bit loops.
sparc64: Avoid code duplication in crypto assembler.
sparc64: Unroll CTR crypt loops in AES driver.
sparc64: Unroll ECB decryption loops in AES driver.
sparc64: Unroll ECB encryption loops in AES driver.
sparc64: Add ctr mode support to AES driver.
sparc64: Move AES driver over to a methods based implementation.
...
Diffstat (limited to 'arch/sparc/lib/NG4memcpy.S')
-rw-r--r-- | arch/sparc/lib/NG4memcpy.S | 360
1 file changed, 360 insertions, 0 deletions
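As a rough orientation before reading the assembly, here is a small C model of the length-based dispatch at the top of the new NG4memcpy. It is illustrative only: the thresholds are copied from the compares in the file below, but the loop bodies are plain C stand-ins (the function name is hypothetical), not the optimized SPARC paths.

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/* Hypothetical sketch, not the kernel code. */
	static void *ng4_dispatch_sketch(void *dst, const void *src, size_t len)
	{
		unsigned char *d = dst;
		const unsigned char *s = src;

		if (len == 0)
			return dst;

		if (len <= 19) {
			/* .Ltiny (len <= 3) and .Lsmall (len <= 19):
			 * byte or word copies. */
			while (len--)
				*d++ = *s++;
			return dst;
		}

		/* .Lmedium (len < 128) and .Llarge (len >= 128): the real
		 * code moves 32 or 64 bytes per iteration, with prefetch,
		 * cache-initializing stores and VIS faligndata on the large
		 * path; 8-byte chunks stand in here. */
		while (len >= 8) {
			uint64_t tmp;

			memcpy(&tmp, s, 8);	/* alignment-safe 8-byte move */
			memcpy(d, &tmp, 8);
			d += 8;
			s += 8;
			len -= 8;
		}
		while (len--)
			*d++ = *s++;
		return dst;
	}
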
diff --git a/arch/sparc/lib/NG4memcpy.S b/arch/sparc/lib/NG4memcpy.S
new file mode 100644
index 000000000000..9cf2ee01cee3
--- /dev/null
+++ b/arch/sparc/lib/NG4memcpy.S
@@ -0,0 +1,360 @@
/* NG4memcpy.S: Niagara-4 optimized memcpy.
 *
 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define FPRS_FEF  0x04

/* On T4 it is very expensive to access ASRs like %fprs and
 * %asi, avoiding a read or a write can save ~50 cycles.
 */
#define FPU_ENTER \
	rd %fprs, %o5; \
	andcc %o5, FPRS_FEF, %g0; \
	be,a,pn %icc, 999f; \
	 wr %g0, FPRS_FEF, %fprs; \
	999:
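
/* Note: FPU_ENTER writes %fprs only when FEF is not already set; the
 * annulled delay slot skips the wr when the FPU is already enabled, so
 * the common case pays only for the %fprs read.
 */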

#ifdef MEMCPY_DEBUG
#define VISEntryHalf FPU_ENTER; \
	clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf FPU_ENTER
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif

#define GLOBAL_SPARE	%g5
#endif

#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#ifndef MEMCPY_DEBUG
#define STORE(type,src,addr)	type src, [addr]
#else
#define STORE(type,src,addr)	type##a src, [addr] %asi
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NG4memcpy
#endif
#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register %g2,#scratch
	.register %g3,#scratch

	.text
	.align 64

	.globl FUNC_NAME
	.type FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
#ifdef MEMCPY_DEBUG
	wr %g0, 0x80, %asi
#endif
	srlx %o2, 31, %g2
	cmp %g2, 0
	tne %XCC, 5
	PREAMBLE
	mov %o0, %o3
	brz,pn %o2, .Lexit
	 cmp %o2, 3
	ble,pn %icc, .Ltiny
	 cmp %o2, 19
	ble,pn %icc, .Lsmall
	 or %o0, %o1, %g2
	cmp %o2, 128
	bl,pn %icc, .Lmedium
	 nop

.Llarge:/* len >= 0x80 */
	/* First get dest 8 byte aligned. */
	sub %g0, %o0, %g1
	and %g1, 0x7, %g1
	brz,pt %g1, 51f
	 sub %o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
	add %o1, 1, %o1
	subcc %g1, 1, %g1
	add %o0, 1, %o0
	bne,pt %icc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01))

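	/* Prefetch the next eight cache lines of source (src + 0x40
	 * through src + 0x200) before entering the bulk copy loops.
	 */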
51:	LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

	/* Check if we can use the straight fully aligned
	 * loop, or we require the alignaddr/faligndata variant.
	 */
	andcc %o1, 0x7, %o5
	bne,pn %icc, .Llarge_src_unaligned
	 sub %g0, %o0, %g1

	/* Legitimize the use of initializing stores by getting dest
	 * to be 64-byte aligned.
	 */
	and %g1, 0x3f, %g1
	brz,pt %g1, .Llarge_aligned
	 sub %o2, %g1, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
	add %o1, 8, %o1
	subcc %g1, 8, %g1
	add %o0, 8, %o0
	bne,pt %icc, 1b
	 EX_ST(STORE(stx, %g2, %o0 - 0x08))

.Llarge_aligned:
	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
	andn %o2, 0x3f, %o4
	sub %o2, %o4, %o2

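	/* 64 bytes per iteration: integer loads interleaved with
	 * cache-initializing stores (STORE_INIT), so full destination
	 * lines can be written without first being fetched.
	 */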
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	add %o1, 0x40, %o1
	EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
	subcc %o4, 0x40, %o4
	EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
	EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
	EX_ST(STORE_INIT(%g1, %o0))
	add %o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0))
	add %o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
	EX_ST(STORE_INIT(%g3, %o0))
	add %o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
	add %o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
	EX_ST(STORE_INIT(%o5, %o0))
	add %o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0))
	add %o0, 0x08, %o0
	EX_ST(STORE_INIT(%g3, %o0))
	add %o0, 0x08, %o0
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
	add %o0, 0x08, %o0
	bne,pt %icc, 1b
	 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

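	/* Order the initializing stores above against the loads and
	 * stores of the tail copy below.
	 */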
	membar #StoreLoad | #StoreStore

	brz,pn %o2, .Lexit
	 cmp %o2, 19
	ble,pn %icc, .Lsmall_unaligned
	 nop
	ba,a,pt %icc, .Lmedium_noprefetch

.Lexit:	retl
	 mov EX_RETVAL(%o3), %o0

.Llarge_src_unaligned:
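	/* Source is not 8-byte aligned: use VIS alignaddr/faligndata to
	 * build destination-aligned 8-byte stores out of misaligned
	 * source doublewords, 64 bytes per iteration.
	 */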
	andn %o2, 0x3f, %o4
	sub %o2, %o4, %o2
	VISEntryHalf
	alignaddr %o1, %g0, %g1
	add %o1, %o4, %o1
	EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
1:	EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
	subcc %o4, 0x40, %o4
	EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
	EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
	EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
	EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
	EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
	EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
	faligndata %f0, %f2, %f16
	EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
	faligndata %f2, %f4, %f18
	add %g1, 0x40, %g1
	faligndata %f4, %f6, %f20
	faligndata %f6, %f8, %f22
	faligndata %f8, %f10, %f24
	faligndata %f10, %f12, %f26
	faligndata %f12, %f14, %f28
	faligndata %f14, %f0, %f30
	EX_ST(STORE(std, %f16, %o0 + 0x00))
	EX_ST(STORE(std, %f18, %o0 + 0x08))
	EX_ST(STORE(std, %f20, %o0 + 0x10))
	EX_ST(STORE(std, %f22, %o0 + 0x18))
	EX_ST(STORE(std, %f24, %o0 + 0x20))
	EX_ST(STORE(std, %f26, %o0 + 0x28))
	EX_ST(STORE(std, %f28, %o0 + 0x30))
	EX_ST(STORE(std, %f30, %o0 + 0x38))
	add %o0, 0x40, %o0
	bne,pt %icc, 1b
	 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
	VISExitHalf

	brz,pn %o2, .Lexit
	 cmp %o2, 19
	ble,pn %icc, .Lsmall_unaligned
	 nop
	ba,a,pt %icc, .Lmedium_unaligned

.Lmedium:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc %g2, 0x7, %g0
	bne,pn %icc, .Lmedium_unaligned
	 nop
.Lmedium_noprefetch:
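	/* Medium copies: 32 bytes per iteration with plain integer
	 * loads/stores, then an 8-byte loop, then one optional word;
	 * any remaining 1-3 bytes are handled by .Ltiny.
	 */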
	andncc %o2, 0x20 - 1, %o5
	be,pn %icc, 2f
	 sub %o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
	add %o1, 0x20, %o1
	subcc %o5, 0x20, %o5
	EX_ST(STORE(stx, %g1, %o0 + 0x00))
	EX_ST(STORE(stx, %g2, %o0 + 0x08))
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
	EX_ST(STORE(stx, %o4, %o0 + 0x18))
	bne,pt %icc, 1b
	 add %o0, 0x20, %o0
2:	andcc %o2, 0x18, %o5
	be,pt %icc, 3f
	 sub %o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	add %o1, 0x08, %o1
	add %o0, 0x08, %o0
	subcc %o5, 0x08, %o5
	bne,pt %icc, 1b
	 EX_ST(STORE(stx, %g1, %o0 - 0x08))
3:	brz,pt %o2, .Lexit
	 cmp %o2, 0x04
	bl,pn %icc, .Ltiny
	 nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
	add %o1, 0x04, %o1
	add %o0, 0x04, %o0
	subcc %o2, 0x04, %o2
	bne,pn %icc, .Ltiny
	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
	ba,a,pt %icc, .Lexit
.Lmedium_unaligned:
	/* First get dest 8 byte aligned. */
	sub %g0, %o0, %g1
	and %g1, 0x7, %g1
	brz,pt %g1, 2f
	 sub %o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
	add %o1, 1, %o1
	subcc %g1, 1, %g1
	add %o0, 1, %o0
	bne,pt %icc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01))
2:
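	/* Source still misaligned: read aligned doublewords and merge
	 * each adjacent pair with sllx/srlx/or, producing one aligned
	 * 8-byte store per iteration.
	 */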
	and %o1, 0x7, %g1
	brz,pn %g1, .Lmedium_noprefetch
	 sll %g1, 3, %g1
	mov 64, %g2
	sub %g2, %g1, %g2
	andn %o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
	sllx %o4, %g1, %o4
	andn %o2, 0x08 - 1, %o5
	sub %o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
	add %o1, 0x08, %o1
	subcc %o5, 0x08, %o5
	srlx %g3, %g2, GLOBAL_SPARE
	or GLOBAL_SPARE, %o4, GLOBAL_SPARE
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
	add %o0, 0x08, %o0
	bne,pt %icc, 1b
	 sllx %g3, %g1, %o4
	srl %g1, 3, %g1
	add %o1, %g1, %o1
	brz,pn %o2, .Lexit
	 nop
	ba,pt %icc, .Lsmall_unaligned

.Ltiny:
	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
	subcc %o2, 1, %o2
	be,pn %icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x00))
	EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
	subcc %o2, 1, %o2
	be,pn %icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x01))
	EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
	ba,pt %icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x02))

.Lsmall:
	andcc %g2, 0x3, %g0
	bne,pn %icc, .Lsmall_unaligned
	 andn %o2, 0x4 - 1, %o5
	sub %o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
	add %o1, 0x04, %o1
	subcc %o5, 0x04, %o5
	add %o0, 0x04, %o0
	bne,pt %icc, 1b
	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
	brz,pt %o2, .Lexit
	 nop
	ba,a,pt %icc, .Ltiny

.Lsmall_unaligned:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
	add %o1, 1, %o1
	add %o0, 1, %o0
	subcc %o2, 1, %o2
	bne,pt %icc, 1b
	 EX_ST(STORE(stb, %g1, %o0 - 0x01))
	ba,a,pt %icc, .Lexit
	.size FUNC_NAME, .-FUNC_NAME