aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorJan Beulich <JBeulich@suse.com>2012-11-02 10:19:18 -0400
committerIngo Molnar <mingo@kernel.org>2013-01-25 03:23:50 -0500
commite8f6e3f8a14bae98197c6d9f280cd23d22eb1a33 (patch)
treeaefda4da8f27c865d858147864df8f0cd92b0efb /arch
parente3e81aca8d51a50e19d6c67fafc4c9c4f0404bf1 (diff)
x86/xor: Unify SSE-base xor-block routines
Besides folding duplicate code, this has the advantage of fixing x86-64's failure to use proper (para-virtualizable) accessors for dealing with CR0.TS. Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: H. Peter Anvin <hpa@zytor.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/5093E47602000078000A615B@nat28.tlf.novell.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/include/asm/xor.h319
-rw-r--r--arch/x86/include/asm/xor_32.h286
-rw-r--r--arch/x86/include/asm/xor_64.h295
3 files changed, 319 insertions, 581 deletions
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index f8fde90bc45e..c661571ca0b7 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,10 +1,327 @@
1#ifdef CONFIG_KMEMCHECK 1#ifdef CONFIG_KMEMCHECK
2/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ 2/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
3# include <asm-generic/xor.h> 3# include <asm-generic/xor.h>
4#elif !defined(_ASM_X86_XOR_H)
5#define _ASM_X86_XOR_H
6
7/*
8 * Optimized RAID-5 checksumming functions for SSE.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2, or (at your option)
13 * any later version.
14 *
15 * You should have received a copy of the GNU General Public License
16 * (for example /usr/src/linux/COPYING); if not, write to the Free
17 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20/*
21 * Cache avoiding checksumming functions utilizing KNI instructions
22 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
23 */
24
25/*
26 * Based on
27 * High-speed RAID5 checksumming functions utilizing SSE instructions.
28 * Copyright (C) 1998 Ingo Molnar.
29 */
30
31/*
32 * x86-64 changes / gcc fixes from Andi Kleen.
33 * Copyright 2002 Andi Kleen, SuSE Labs.
34 *
35 * This hasn't been optimized for the hammer yet, but there are likely
36 * no advantages to be gotten from x86-64 here anyways.
37 */
38
39#include <asm/i387.h>
40
41#ifdef CONFIG_X86_32
42/* reduce register pressure */
43# define XOR_CONSTANT_CONSTRAINT "i"
4#else 44#else
45# define XOR_CONSTANT_CONSTRAINT "re"
46#endif
47
48#define OFFS(x) "16*("#x")"
49#define PF_OFFS(x) "256+16*("#x")"
50#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
51#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
52#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
53#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
54#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
55#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
56#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
57#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
58#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
59#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
60#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
61
62static void
63xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
64{
65 unsigned long lines = bytes >> 8;
66
67 kernel_fpu_begin();
68
69 asm volatile(
70#undef BLOCK
71#define BLOCK(i) \
72 LD(i, 0) \
73 LD(i + 1, 1) \
74 PF1(i) \
75 PF1(i + 2) \
76 LD(i + 2, 2) \
77 LD(i + 3, 3) \
78 PF0(i + 4) \
79 PF0(i + 6) \
80 XO1(i, 0) \
81 XO1(i + 1, 1) \
82 XO1(i + 2, 2) \
83 XO1(i + 3, 3) \
84 ST(i, 0) \
85 ST(i + 1, 1) \
86 ST(i + 2, 2) \
87 ST(i + 3, 3) \
88
89
90 PF0(0)
91 PF0(2)
92
93 " .align 32 ;\n"
94 " 1: ;\n"
95
96 BLOCK(0)
97 BLOCK(4)
98 BLOCK(8)
99 BLOCK(12)
100
101 " add %[inc], %[p1] ;\n"
102 " add %[inc], %[p2] ;\n"
103 " dec %[cnt] ;\n"
104 " jnz 1b ;\n"
105 : [cnt] "+r" (lines),
106 [p1] "+r" (p1), [p2] "+r" (p2)
107 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
108 : "memory");
109
110 kernel_fpu_end();
111}
112
113static void
114xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
115 unsigned long *p3)
116{
117 unsigned long lines = bytes >> 8;
118
119 kernel_fpu_begin();
120
121 asm volatile(
122#undef BLOCK
123#define BLOCK(i) \
124 PF1(i) \
125 PF1(i + 2) \
126 LD(i, 0) \
127 LD(i + 1, 1) \
128 LD(i + 2, 2) \
129 LD(i + 3, 3) \
130 PF2(i) \
131 PF2(i + 2) \
132 PF0(i + 4) \
133 PF0(i + 6) \
134 XO1(i, 0) \
135 XO1(i + 1, 1) \
136 XO1(i + 2, 2) \
137 XO1(i + 3, 3) \
138 XO2(i, 0) \
139 XO2(i + 1, 1) \
140 XO2(i + 2, 2) \
141 XO2(i + 3, 3) \
142 ST(i, 0) \
143 ST(i + 1, 1) \
144 ST(i + 2, 2) \
145 ST(i + 3, 3) \
146
147
148 PF0(0)
149 PF0(2)
150
151 " .align 32 ;\n"
152 " 1: ;\n"
153
154 BLOCK(0)
155 BLOCK(4)
156 BLOCK(8)
157 BLOCK(12)
158
159 " add %[inc], %[p1] ;\n"
160 " add %[inc], %[p2] ;\n"
161 " add %[inc], %[p3] ;\n"
162 " dec %[cnt] ;\n"
163 " jnz 1b ;\n"
164 : [cnt] "+r" (lines),
165 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
166 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
167 : "memory");
168
169 kernel_fpu_end();
170}
171
172static void
173xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
174 unsigned long *p3, unsigned long *p4)
175{
176 unsigned long lines = bytes >> 8;
177
178 kernel_fpu_begin();
179
180 asm volatile(
181#undef BLOCK
182#define BLOCK(i) \
183 PF1(i) \
184 PF1(i + 2) \
185 LD(i, 0) \
186 LD(i + 1, 1) \
187 LD(i + 2, 2) \
188 LD(i + 3, 3) \
189 PF2(i) \
190 PF2(i + 2) \
191 XO1(i, 0) \
192 XO1(i + 1, 1) \
193 XO1(i + 2, 2) \
194 XO1(i + 3, 3) \
195 PF3(i) \
196 PF3(i + 2) \
197 PF0(i + 4) \
198 PF0(i + 6) \
199 XO2(i, 0) \
200 XO2(i + 1, 1) \
201 XO2(i + 2, 2) \
202 XO2(i + 3, 3) \
203 XO3(i, 0) \
204 XO3(i + 1, 1) \
205 XO3(i + 2, 2) \
206 XO3(i + 3, 3) \
207 ST(i, 0) \
208 ST(i + 1, 1) \
209 ST(i + 2, 2) \
210 ST(i + 3, 3) \
211
212
213 PF0(0)
214 PF0(2)
215
216 " .align 32 ;\n"
217 " 1: ;\n"
218
219 BLOCK(0)
220 BLOCK(4)
221 BLOCK(8)
222 BLOCK(12)
223
224 " add %[inc], %[p1] ;\n"
225 " add %[inc], %[p2] ;\n"
226 " add %[inc], %[p3] ;\n"
227 " add %[inc], %[p4] ;\n"
228 " dec %[cnt] ;\n"
229 " jnz 1b ;\n"
230 : [cnt] "+r" (lines), [p1] "+r" (p1),
231 [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
232 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
233 : "memory");
234
235 kernel_fpu_end();
236}
237
238static void
239xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
240 unsigned long *p3, unsigned long *p4, unsigned long *p5)
241{
242 unsigned long lines = bytes >> 8;
243
244 kernel_fpu_begin();
245
246 asm volatile(
247#undef BLOCK
248#define BLOCK(i) \
249 PF1(i) \
250 PF1(i + 2) \
251 LD(i, 0) \
252 LD(i + 1, 1) \
253 LD(i + 2, 2) \
254 LD(i + 3, 3) \
255 PF2(i) \
256 PF2(i + 2) \
257 XO1(i, 0) \
258 XO1(i + 1, 1) \
259 XO1(i + 2, 2) \
260 XO1(i + 3, 3) \
261 PF3(i) \
262 PF3(i + 2) \
263 XO2(i, 0) \
264 XO2(i + 1, 1) \
265 XO2(i + 2, 2) \
266 XO2(i + 3, 3) \
267 PF4(i) \
268 PF4(i + 2) \
269 PF0(i + 4) \
270 PF0(i + 6) \
271 XO3(i, 0) \
272 XO3(i + 1, 1) \
273 XO3(i + 2, 2) \
274 XO3(i + 3, 3) \
275 XO4(i, 0) \
276 XO4(i + 1, 1) \
277 XO4(i + 2, 2) \
278 XO4(i + 3, 3) \
279 ST(i, 0) \
280 ST(i + 1, 1) \
281 ST(i + 2, 2) \
282 ST(i + 3, 3) \
283
284
285 PF0(0)
286 PF0(2)
287
288 " .align 32 ;\n"
289 " 1: ;\n"
290
291 BLOCK(0)
292 BLOCK(4)
293 BLOCK(8)
294 BLOCK(12)
295
296 " add %[inc], %[p1] ;\n"
297 " add %[inc], %[p2] ;\n"
298 " add %[inc], %[p3] ;\n"
299 " add %[inc], %[p4] ;\n"
300 " add %[inc], %[p5] ;\n"
301 " dec %[cnt] ;\n"
302 " jnz 1b ;\n"
303 : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
304 [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
305 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
306 : "memory");
307
308 kernel_fpu_end();
309}
310
311#undef LD
312#undef XO1
313#undef XO2
314#undef XO3
315#undef XO4
316#undef ST
317#undef BLOCK
318
319#undef XOR_CONSTANT_CONSTRAINT
320
5#ifdef CONFIG_X86_32 321#ifdef CONFIG_X86_32
6# include <asm/xor_32.h> 322# include <asm/xor_32.h>
7#else 323#else
8# include <asm/xor_64.h> 324# include <asm/xor_64.h>
9#endif 325#endif
10#endif 326
327#endif /* _ASM_X86_XOR_H */
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index f79cb7ec0e06..b85dc87f3cc7 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -2,7 +2,7 @@
2#define _ASM_X86_XOR_32_H 2#define _ASM_X86_XOR_32_H
3 3
4/* 4/*
5 * Optimized RAID-5 checksumming functions for MMX and SSE. 5 * Optimized RAID-5 checksumming functions for MMX.
6 * 6 *
7 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by 8 * it under the terms of the GNU General Public License as published by
@@ -529,290 +529,6 @@ static struct xor_block_template xor_block_p5_mmx = {
529 .do_5 = xor_p5_mmx_5, 529 .do_5 = xor_p5_mmx_5,
530}; 530};
531 531
532/*
533 * Cache avoiding checksumming functions utilizing KNI instructions
534 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
535 */
536
537#define OFFS(x) "16*("#x")"
538#define PF_OFFS(x) "256+16*("#x")"
539#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
540#define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
541#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
542#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n"
543#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n"
544#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n"
545#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n"
546#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n"
547#define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
548#define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
549#define XO3(x, y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
550#define XO4(x, y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
551#define XO5(x, y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
552
553
554static void
555xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
556{
557 unsigned long lines = bytes >> 8;
558
559 kernel_fpu_begin();
560
561 asm volatile(
562#undef BLOCK
563#define BLOCK(i) \
564 LD(i, 0) \
565 LD(i + 1, 1) \
566 PF1(i) \
567 PF1(i + 2) \
568 LD(i + 2, 2) \
569 LD(i + 3, 3) \
570 PF0(i + 4) \
571 PF0(i + 6) \
572 XO1(i, 0) \
573 XO1(i + 1, 1) \
574 XO1(i + 2, 2) \
575 XO1(i + 3, 3) \
576 ST(i, 0) \
577 ST(i + 1, 1) \
578 ST(i + 2, 2) \
579 ST(i + 3, 3) \
580
581
582 PF0(0)
583 PF0(2)
584
585 " .align 32 ;\n"
586 " 1: ;\n"
587
588 BLOCK(0)
589 BLOCK(4)
590 BLOCK(8)
591 BLOCK(12)
592
593 " addl $256, %1 ;\n"
594 " addl $256, %2 ;\n"
595 " decl %0 ;\n"
596 " jnz 1b ;\n"
597 : "+r" (lines),
598 "+r" (p1), "+r" (p2)
599 :
600 : "memory");
601
602 kernel_fpu_end();
603}
604
605static void
606xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
607 unsigned long *p3)
608{
609 unsigned long lines = bytes >> 8;
610
611 kernel_fpu_begin();
612
613 asm volatile(
614#undef BLOCK
615#define BLOCK(i) \
616 PF1(i) \
617 PF1(i + 2) \
618 LD(i,0) \
619 LD(i + 1, 1) \
620 LD(i + 2, 2) \
621 LD(i + 3, 3) \
622 PF2(i) \
623 PF2(i + 2) \
624 PF0(i + 4) \
625 PF0(i + 6) \
626 XO1(i,0) \
627 XO1(i + 1, 1) \
628 XO1(i + 2, 2) \
629 XO1(i + 3, 3) \
630 XO2(i,0) \
631 XO2(i + 1, 1) \
632 XO2(i + 2, 2) \
633 XO2(i + 3, 3) \
634 ST(i,0) \
635 ST(i + 1, 1) \
636 ST(i + 2, 2) \
637 ST(i + 3, 3) \
638
639
640 PF0(0)
641 PF0(2)
642
643 " .align 32 ;\n"
644 " 1: ;\n"
645
646 BLOCK(0)
647 BLOCK(4)
648 BLOCK(8)
649 BLOCK(12)
650
651 " addl $256, %1 ;\n"
652 " addl $256, %2 ;\n"
653 " addl $256, %3 ;\n"
654 " decl %0 ;\n"
655 " jnz 1b ;\n"
656 : "+r" (lines),
657 "+r" (p1), "+r"(p2), "+r"(p3)
658 :
659 : "memory" );
660
661 kernel_fpu_end();
662}
663
664static void
665xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
666 unsigned long *p3, unsigned long *p4)
667{
668 unsigned long lines = bytes >> 8;
669
670 kernel_fpu_begin();
671
672 asm volatile(
673#undef BLOCK
674#define BLOCK(i) \
675 PF1(i) \
676 PF1(i + 2) \
677 LD(i,0) \
678 LD(i + 1, 1) \
679 LD(i + 2, 2) \
680 LD(i + 3, 3) \
681 PF2(i) \
682 PF2(i + 2) \
683 XO1(i,0) \
684 XO1(i + 1, 1) \
685 XO1(i + 2, 2) \
686 XO1(i + 3, 3) \
687 PF3(i) \
688 PF3(i + 2) \
689 PF0(i + 4) \
690 PF0(i + 6) \
691 XO2(i,0) \
692 XO2(i + 1, 1) \
693 XO2(i + 2, 2) \
694 XO2(i + 3, 3) \
695 XO3(i,0) \
696 XO3(i + 1, 1) \
697 XO3(i + 2, 2) \
698 XO3(i + 3, 3) \
699 ST(i,0) \
700 ST(i + 1, 1) \
701 ST(i + 2, 2) \
702 ST(i + 3, 3) \
703
704
705 PF0(0)
706 PF0(2)
707
708 " .align 32 ;\n"
709 " 1: ;\n"
710
711 BLOCK(0)
712 BLOCK(4)
713 BLOCK(8)
714 BLOCK(12)
715
716 " addl $256, %1 ;\n"
717 " addl $256, %2 ;\n"
718 " addl $256, %3 ;\n"
719 " addl $256, %4 ;\n"
720 " decl %0 ;\n"
721 " jnz 1b ;\n"
722 : "+r" (lines),
723 "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
724 :
725 : "memory" );
726
727 kernel_fpu_end();
728}
729
730static void
731xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
732 unsigned long *p3, unsigned long *p4, unsigned long *p5)
733{
734 unsigned long lines = bytes >> 8;
735
736 kernel_fpu_begin();
737
738 /* Make sure GCC forgets anything it knows about p4 or p5,
739 such that it won't pass to the asm volatile below a
740 register that is shared with any other variable. That's
741 because we modify p4 and p5 there, but we can't mark them
742 as read/write, otherwise we'd overflow the 10-asm-operands
743 limit of GCC < 3.1. */
744 asm("" : "+r" (p4), "+r" (p5));
745
746 asm volatile(
747#undef BLOCK
748#define BLOCK(i) \
749 PF1(i) \
750 PF1(i + 2) \
751 LD(i,0) \
752 LD(i + 1, 1) \
753 LD(i + 2, 2) \
754 LD(i + 3, 3) \
755 PF2(i) \
756 PF2(i + 2) \
757 XO1(i,0) \
758 XO1(i + 1, 1) \
759 XO1(i + 2, 2) \
760 XO1(i + 3, 3) \
761 PF3(i) \
762 PF3(i + 2) \
763 XO2(i,0) \
764 XO2(i + 1, 1) \
765 XO2(i + 2, 2) \
766 XO2(i + 3, 3) \
767 PF4(i) \
768 PF4(i + 2) \
769 PF0(i + 4) \
770 PF0(i + 6) \
771 XO3(i,0) \
772 XO3(i + 1, 1) \
773 XO3(i + 2, 2) \
774 XO3(i + 3, 3) \
775 XO4(i,0) \
776 XO4(i + 1, 1) \
777 XO4(i + 2, 2) \
778 XO4(i + 3, 3) \
779 ST(i,0) \
780 ST(i + 1, 1) \
781 ST(i + 2, 2) \
782 ST(i + 3, 3) \
783
784
785 PF0(0)
786 PF0(2)
787
788 " .align 32 ;\n"
789 " 1: ;\n"
790
791 BLOCK(0)
792 BLOCK(4)
793 BLOCK(8)
794 BLOCK(12)
795
796 " addl $256, %1 ;\n"
797 " addl $256, %2 ;\n"
798 " addl $256, %3 ;\n"
799 " addl $256, %4 ;\n"
800 " addl $256, %5 ;\n"
801 " decl %0 ;\n"
802 " jnz 1b ;\n"
803 : "+r" (lines),
804 "+r" (p1), "+r" (p2), "+r" (p3)
805 : "r" (p4), "r" (p5)
806 : "memory");
807
808 /* p4 and p5 were modified, and now the variables are dead.
809 Clobber them just to be sure nobody does something stupid
810 like assuming they have some legal value. */
811 asm("" : "=r" (p4), "=r" (p5));
812
813 kernel_fpu_end();
814}
815
816static struct xor_block_template xor_block_pIII_sse = { 532static struct xor_block_template xor_block_pIII_sse = {
817 .name = "pIII_sse", 533 .name = "pIII_sse",
818 .do_2 = xor_sse_2, 534 .do_2 = xor_sse_2,
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 87ac522c4af5..1baf89dcc423 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -1,301 +1,6 @@
1#ifndef _ASM_X86_XOR_64_H 1#ifndef _ASM_X86_XOR_64_H
2#define _ASM_X86_XOR_64_H 2#define _ASM_X86_XOR_64_H
3 3
4/*
5 * Optimized RAID-5 checksumming functions for MMX and SSE.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2, or (at your option)
10 * any later version.
11 *
12 * You should have received a copy of the GNU General Public License
13 * (for example /usr/src/linux/COPYING); if not, write to the Free
14 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17
18/*
19 * Cache avoiding checksumming functions utilizing KNI instructions
20 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
21 */
22
23/*
24 * Based on
25 * High-speed RAID5 checksumming functions utilizing SSE instructions.
26 * Copyright (C) 1998 Ingo Molnar.
27 */
28
29/*
30 * x86-64 changes / gcc fixes from Andi Kleen.
31 * Copyright 2002 Andi Kleen, SuSE Labs.
32 *
33 * This hasn't been optimized for the hammer yet, but there are likely
34 * no advantages to be gotten from x86-64 here anyways.
35 */
36
37#include <asm/i387.h>
38
39#define OFFS(x) "16*("#x")"
40#define PF_OFFS(x) "256+16*("#x")"
41#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
42#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
43#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
44#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
45#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
46#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
47#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
48#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
49#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
50#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
51#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
52#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
53#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
54
55
56static void
57xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
58{
59 unsigned int lines = bytes >> 8;
60
61 kernel_fpu_begin();
62
63 asm volatile(
64#undef BLOCK
65#define BLOCK(i) \
66 LD(i, 0) \
67 LD(i + 1, 1) \
68 PF1(i) \
69 PF1(i + 2) \
70 LD(i + 2, 2) \
71 LD(i + 3, 3) \
72 PF0(i + 4) \
73 PF0(i + 6) \
74 XO1(i, 0) \
75 XO1(i + 1, 1) \
76 XO1(i + 2, 2) \
77 XO1(i + 3, 3) \
78 ST(i, 0) \
79 ST(i + 1, 1) \
80 ST(i + 2, 2) \
81 ST(i + 3, 3) \
82
83
84 PF0(0)
85 PF0(2)
86
87 " .align 32 ;\n"
88 " 1: ;\n"
89
90 BLOCK(0)
91 BLOCK(4)
92 BLOCK(8)
93 BLOCK(12)
94
95 " addq %[inc], %[p1] ;\n"
96 " addq %[inc], %[p2] ;\n"
97 " decl %[cnt] ; jnz 1b"
98 : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
99 : [inc] "r" (256UL)
100 : "memory");
101
102 kernel_fpu_end();
103}
104
105static void
106xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
107 unsigned long *p3)
108{
109 unsigned int lines = bytes >> 8;
110
111 kernel_fpu_begin();
112 asm volatile(
113#undef BLOCK
114#define BLOCK(i) \
115 PF1(i) \
116 PF1(i + 2) \
117 LD(i, 0) \
118 LD(i + 1, 1) \
119 LD(i + 2, 2) \
120 LD(i + 3, 3) \
121 PF2(i) \
122 PF2(i + 2) \
123 PF0(i + 4) \
124 PF0(i + 6) \
125 XO1(i, 0) \
126 XO1(i + 1, 1) \
127 XO1(i + 2, 2) \
128 XO1(i + 3, 3) \
129 XO2(i, 0) \
130 XO2(i + 1, 1) \
131 XO2(i + 2, 2) \
132 XO2(i + 3, 3) \
133 ST(i, 0) \
134 ST(i + 1, 1) \
135 ST(i + 2, 2) \
136 ST(i + 3, 3) \
137
138
139 PF0(0)
140 PF0(2)
141
142 " .align 32 ;\n"
143 " 1: ;\n"
144
145 BLOCK(0)
146 BLOCK(4)
147 BLOCK(8)
148 BLOCK(12)
149
150 " addq %[inc], %[p1] ;\n"
151 " addq %[inc], %[p2] ;\n"
152 " addq %[inc], %[p3] ;\n"
153 " decl %[cnt] ; jnz 1b"
154 : [cnt] "+r" (lines),
155 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
156 : [inc] "r" (256UL)
157 : "memory");
158 kernel_fpu_end();
159}
160
161static void
162xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
163 unsigned long *p3, unsigned long *p4)
164{
165 unsigned int lines = bytes >> 8;
166
167 kernel_fpu_begin();
168
169 asm volatile(
170#undef BLOCK
171#define BLOCK(i) \
172 PF1(i) \
173 PF1(i + 2) \
174 LD(i, 0) \
175 LD(i + 1, 1) \
176 LD(i + 2, 2) \
177 LD(i + 3, 3) \
178 PF2(i) \
179 PF2(i + 2) \
180 XO1(i, 0) \
181 XO1(i + 1, 1) \
182 XO1(i + 2, 2) \
183 XO1(i + 3, 3) \
184 PF3(i) \
185 PF3(i + 2) \
186 PF0(i + 4) \
187 PF0(i + 6) \
188 XO2(i, 0) \
189 XO2(i + 1, 1) \
190 XO2(i + 2, 2) \
191 XO2(i + 3, 3) \
192 XO3(i, 0) \
193 XO3(i + 1, 1) \
194 XO3(i + 2, 2) \
195 XO3(i + 3, 3) \
196 ST(i, 0) \
197 ST(i + 1, 1) \
198 ST(i + 2, 2) \
199 ST(i + 3, 3) \
200
201
202 PF0(0)
203 PF0(2)
204
205 " .align 32 ;\n"
206 " 1: ;\n"
207
208 BLOCK(0)
209 BLOCK(4)
210 BLOCK(8)
211 BLOCK(12)
212
213 " addq %[inc], %[p1] ;\n"
214 " addq %[inc], %[p2] ;\n"
215 " addq %[inc], %[p3] ;\n"
216 " addq %[inc], %[p4] ;\n"
217 " decl %[cnt] ; jnz 1b"
218 : [cnt] "+c" (lines),
219 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
220 : [inc] "r" (256UL)
221 : "memory" );
222
223 kernel_fpu_end();
224}
225
226static void
227xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
228 unsigned long *p3, unsigned long *p4, unsigned long *p5)
229{
230 unsigned int lines = bytes >> 8;
231
232 kernel_fpu_begin();
233
234 asm volatile(
235#undef BLOCK
236#define BLOCK(i) \
237 PF1(i) \
238 PF1(i + 2) \
239 LD(i, 0) \
240 LD(i + 1, 1) \
241 LD(i + 2, 2) \
242 LD(i + 3, 3) \
243 PF2(i) \
244 PF2(i + 2) \
245 XO1(i, 0) \
246 XO1(i + 1, 1) \
247 XO1(i + 2, 2) \
248 XO1(i + 3, 3) \
249 PF3(i) \
250 PF3(i + 2) \
251 XO2(i, 0) \
252 XO2(i + 1, 1) \
253 XO2(i + 2, 2) \
254 XO2(i + 3, 3) \
255 PF4(i) \
256 PF4(i + 2) \
257 PF0(i + 4) \
258 PF0(i + 6) \
259 XO3(i, 0) \
260 XO3(i + 1, 1) \
261 XO3(i + 2, 2) \
262 XO3(i + 3, 3) \
263 XO4(i, 0) \
264 XO4(i + 1, 1) \
265 XO4(i + 2, 2) \
266 XO4(i + 3, 3) \
267 ST(i, 0) \
268 ST(i + 1, 1) \
269 ST(i + 2, 2) \
270 ST(i + 3, 3) \
271
272
273 PF0(0)
274 PF0(2)
275
276 " .align 32 ;\n"
277 " 1: ;\n"
278
279 BLOCK(0)
280 BLOCK(4)
281 BLOCK(8)
282 BLOCK(12)
283
284 " addq %[inc], %[p1] ;\n"
285 " addq %[inc], %[p2] ;\n"
286 " addq %[inc], %[p3] ;\n"
287 " addq %[inc], %[p4] ;\n"
288 " addq %[inc], %[p5] ;\n"
289 " decl %[cnt] ; jnz 1b"
290 : [cnt] "+c" (lines),
291 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
292 [p5] "+r" (p5)
293 : [inc] "r" (256UL)
294 : "memory");
295
296 kernel_fpu_end();
297}
298
299static struct xor_block_template xor_block_sse = { 4static struct xor_block_template xor_block_sse = {
300 .name = "generic_sse", 5 .name = "generic_sse",
301 .do_2 = xor_sse_2, 6 .do_2 = xor_sse_2,