Diffstat (limited to 'arch/sparc')
50 files changed, 6494 insertions, 2 deletions
diff --git a/arch/sparc/Makefile b/arch/sparc/Makefile
index 317aad600ed7..b1d691489ed4 100644
--- a/arch/sparc/Makefile
+++ b/arch/sparc/Makefile
@@ -77,8 +77,7 @@ core-$(CONFIG_SPARC64) += arch/sparc64/kernel/
 core-y += arch/sparc/mm/ arch/sparc/math-emu/
 
 libs-y += arch/sparc/prom/
-libs-$(CONFIG_SPARC32) += arch/sparc/lib/
-libs-$(CONFIG_SPARC64) += arch/sparc64/lib/
+libs-y += arch/sparc/lib/
 
 drivers-$(CONFIG_OPROFILE) += arch/sparc/oprofile/
 
diff --git a/arch/sparc/lib/GENbzero.S b/arch/sparc/lib/GENbzero.S
new file mode 100644
index 000000000000..6a4f956a2f7a
--- /dev/null
+++ b/arch/sparc/lib/GENbzero.S
@@ -0,0 +1,160 @@
+/* GENbzero.S: Generic sparc64 memset/clear_user.
+ *
+ * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
+ */
+#include <asm/asi.h>
+
+#define EX_ST(x,y) \
+98:	x,y; \
+	.section .fixup; \
+	.align 4; \
+99:	retl; \
+	mov %o1, %o0; \
+	.section __ex_table,"a";\
+	.align 4; \
+	.word 98b, 99b; \
+	.text; \
+	.align 4;
+
+	.align 32
+	.text
+
+	.globl GENmemset
+	.type GENmemset, #function
+GENmemset: /* %o0=buf, %o1=pat, %o2=len */
+	and %o1, 0xff, %o3
+	mov %o2, %o1
+	sllx %o3, 8, %g1
+	or %g1, %o3, %o2
+	sllx %o2, 16, %g1
+	or %g1, %o2, %o2
+	sllx %o2, 32, %g1
+	ba,pt %xcc, 1f
+	or %g1, %o2, %o2
+
+	.globl GENbzero
+	.type GENbzero, #function
+GENbzero:
+	clr %o2
+1:	brz,pn %o1, GENbzero_return
+	mov %o0, %o3
+
+	/* %o5: saved %asi, restored at GENbzero_done
+	 * %o4: store %asi to use
+	 */
+	rd %asi, %o5
+	mov ASI_P, %o4
+	wr %o4, 0x0, %asi
+
+GENbzero_from_clear_user:
+	cmp %o1, 15
+	bl,pn %icc, GENbzero_tiny
+	andcc %o0, 0x7, %g1
+	be,pt %xcc, 2f
+	mov 8, %g2
+	sub %g2, %g1, %g1
+	sub %o1, %g1, %o1
+1:	EX_ST(stba %o2, [%o0 + 0x00] %asi)
+	subcc %g1, 1, %g1
+	bne,pt %xcc, 1b
+	add %o0, 1, %o0
+2:	cmp %o1, 128
+	bl,pn %icc, GENbzero_medium
+	andcc %o0, (64 - 1), %g1
+	be,pt %xcc, GENbzero_pre_loop
+	mov 64, %g2
+	sub %g2, %g1, %g1
+	sub %o1, %g1, %o1
+1:	EX_ST(stxa %o2, [%o0 + 0x00] %asi)
+	subcc %g1, 8, %g1
+	bne,pt %xcc, 1b
+	add %o0, 8, %o0
+
+GENbzero_pre_loop:
+	andn %o1, (64 - 1), %g1
+	sub %o1, %g1, %o1
+GENbzero_loop:
+	EX_ST(stxa %o2, [%o0 + 0x00] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x08] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x10] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x18] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x20] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x28] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x30] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x38] %asi)
+	subcc %g1, 64, %g1
+	bne,pt %xcc, GENbzero_loop
+	add %o0, 64, %o0
+
+	membar #Sync
+	wr %o4, 0x0, %asi
+	brz,pn %o1, GENbzero_done
+GENbzero_medium:
+	andncc %o1, 0x7, %g1
+	be,pn %xcc, 2f
+	sub %o1, %g1, %o1
+1:	EX_ST(stxa %o2, [%o0 + 0x00] %asi)
+	subcc %g1, 8, %g1
+	bne,pt %xcc, 1b
+	add %o0, 8, %o0
+2:	brz,pt %o1, GENbzero_done
+	nop
+
+GENbzero_tiny:
+1:	EX_ST(stba %o2, [%o0 + 0x00] %asi)
+	subcc %o1, 1, %o1
+	bne,pt %icc, 1b
+	add %o0, 1, %o0
+
+	/* fallthrough */
+
+GENbzero_done:
+	wr %o5, 0x0, %asi
+
+GENbzero_return:
+	retl
+	mov %o3, %o0
+	.size GENbzero, .-GENbzero
+	.size GENmemset, .-GENmemset
+
+	.globl GENclear_user
+	.type GENclear_user, #function
+GENclear_user: /* %o0=buf, %o1=len */
+	rd %asi, %o5
+	brz,pn %o1, GENbzero_done
+	clr %o3
+	cmp %o5, ASI_AIUS
+	bne,pn %icc, GENbzero
+	clr %o2
+	ba,pt %xcc, GENbzero_from_clear_user
+	mov ASI_AIUS, %o4
+	.size GENclear_user, .-GENclear_user
+
+#define BRANCH_ALWAYS 0x10680000
+#define NOP 0x01000000
+#define GEN_DO_PATCH(OLD, NEW) \
+	sethi %hi(NEW), %g1; \
+	or %g1, %lo(NEW), %g1; \
+	sethi %hi(OLD), %g2; \
+	or %g2, %lo(OLD), %g2; \
+	sub %g1, %g2, %g1; \
+	sethi %hi(BRANCH_ALWAYS), %g3; \
+	sll %g1, 11, %g1; \
+	srl %g1, 11 + 2, %g1; \
+	or %g3, %lo(BRANCH_ALWAYS), %g3; \
+	or %g3, %g1, %g3; \
+	stw %g3, [%g2]; \
+	sethi %hi(NOP), %g3; \
+	or %g3, %lo(NOP), %g3; \
+	stw %g3, [%g2 + 0x4]; \
+	flush %g2;
+
+	.globl generic_patch_bzero
+	.type generic_patch_bzero,#function
+generic_patch_bzero:
+	GEN_DO_PATCH(memset, GENmemset)
+	GEN_DO_PATCH(__bzero, GENbzero)
+	GEN_DO_PATCH(__clear_user, GENclear_user)
+	retl
+	nop
+	.size generic_patch_bzero,.-generic_patch_bzero
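The shift-and-or sequence at the top of GENmemset replicates the low pattern byte across all eight bytes of a 64-bit register before any of the store loops run. For reference, the same expansion in C (an illustrative sketch, not kernel code):

	#include <stdint.h>

	/* Expand a fill byte to a 64-bit pattern the way GENmemset does:
	 * pat(8) -> pat(16) -> pat(32) -> pat(64) by shift-and-or. */
	static uint64_t expand_pattern(uint8_t pat)
	{
		uint64_t v = pat;

		v |= v << 8;	/* sllx %o3, 8, %g1;  or %g1, %o3, %o2 */
		v |= v << 16;	/* sllx %o2, 16, %g1; or %g1, %o2, %o2 */
		v |= v << 32;	/* sllx %o2, 32, %g1; or %g1, %o2, %o2 */
		return v;	/* equivalently: pat * 0x0101010101010101ULL */
	}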
diff --git a/arch/sparc/lib/GENcopy_from_user.S b/arch/sparc/lib/GENcopy_from_user.S
new file mode 100644
index 000000000000..2b9df99e87f9
--- /dev/null
+++ b/arch/sparc/lib/GENcopy_from_user.S
@@ -0,0 +1,34 @@
+/* GENcopy_from_user.S: Generic sparc64 copy from userspace.
+ *
+ * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
+ */
+
+#define EX_LD(x) \
+98:	x; \
+	.section .fixup; \
+	.align 4; \
+99:	retl; \
+	mov 1, %o0; \
+	.section __ex_table,"a";\
+	.align 4; \
+	.word 98b, 99b; \
+	.text; \
+	.align 4;
+
+#ifndef ASI_AIUS
+#define ASI_AIUS 0x11
+#endif
+
+#define FUNC_NAME GENcopy_from_user
+#define LOAD(type,addr,dest) type##a [addr] ASI_AIUS, dest
+#define EX_RETVAL(x) 0
+
+#ifdef __KERNEL__
+#define PREAMBLE \
+	rd %asi, %g1; \
+	cmp %g1, ASI_AIUS; \
+	bne,pn %icc, memcpy_user_stub; \
+	nop
+#endif
+
+#include "GENmemcpy.S"
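The EX_LD/EX_ST wrappers used throughout these files implement the kernel's exception-table convention: the local label 98: marks the user-space access, 99: is the recovery stub, and the (98b, 99b) pair is emitted into the __ex_table section so that a fault on the access resumes at the stub instead of oopsing. A minimal C view of one entry as these macros emit it (two 32-bit words; field names are illustrative):

	struct exception_table_entry {
		unsigned int insn;	/* address of the faulting insn (98b) */
		unsigned int fixup;	/* address of the recovery code (99b) */
	};
	/* On a user-access fault, the trap handler searches this table for
	 * the faulting PC and, if an entry matches, redirects execution to
	 * ->fixup, which here returns a nonzero "not copied" result. */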
diff --git a/arch/sparc/lib/GENcopy_to_user.S b/arch/sparc/lib/GENcopy_to_user.S
new file mode 100644
index 000000000000..bb3f7084daf9
--- /dev/null
+++ b/arch/sparc/lib/GENcopy_to_user.S
@@ -0,0 +1,38 @@
+/* GENcopy_to_user.S: Generic sparc64 copy to userspace.
+ *
+ * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
+ */
+
+#define EX_ST(x) \
+98:	x; \
+	.section .fixup; \
+	.align 4; \
+99:	retl; \
+	mov 1, %o0; \
+	.section __ex_table,"a";\
+	.align 4; \
+	.word 98b, 99b; \
+	.text; \
+	.align 4;
+
+#ifndef ASI_AIUS
+#define ASI_AIUS 0x11
+#endif
+
+#define FUNC_NAME GENcopy_to_user
+#define STORE(type,src,addr) type##a src, [addr] ASI_AIUS
+#define EX_RETVAL(x) 0
+
+#ifdef __KERNEL__
+/* Writing to %asi is _expensive_ so we hardcode it.
+ * Reading %asi to check for KERNEL_DS is comparatively
+ * cheap.
+ */
+#define PREAMBLE \
+	rd %asi, %g1; \
+	cmp %g1, ASI_AIUS; \
+	bne,pn %icc, memcpy_user_stub; \
+	nop
+#endif
+
+#include "GENmemcpy.S"
diff --git a/arch/sparc/lib/GENmemcpy.S b/arch/sparc/lib/GENmemcpy.S
new file mode 100644
index 000000000000..89358ee94851
--- /dev/null
+++ b/arch/sparc/lib/GENmemcpy.S
@@ -0,0 +1,121 @@
+/* GENmemcpy.S: Generic sparc64 memcpy.
+ *
+ * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
+ */
+
+#ifdef __KERNEL__
+#define GLOBAL_SPARE %g7
+#else
+#define GLOBAL_SPARE %g5
+#endif
+
+#ifndef EX_LD
+#define EX_LD(x) x
+#endif
+
+#ifndef EX_ST
+#define EX_ST(x) x
+#endif
+
+#ifndef EX_RETVAL
+#define EX_RETVAL(x) x
+#endif
+
+#ifndef LOAD
+#define LOAD(type,addr,dest) type [addr], dest
+#endif
+
+#ifndef STORE
+#define STORE(type,src,addr) type src, [addr]
+#endif
+
+#ifndef FUNC_NAME
+#define FUNC_NAME GENmemcpy
+#endif
+
+#ifndef PREAMBLE
+#define PREAMBLE
+#endif
+
+#ifndef XCC
+#define XCC xcc
+#endif
+
+	.register %g2,#scratch
+	.register %g3,#scratch
+
+	.text
+	.align 64
+
+	.globl FUNC_NAME
+	.type FUNC_NAME,#function
+FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
+	srlx %o2, 31, %g2
+	cmp %g2, 0
+	tne %XCC, 5
+	PREAMBLE
+	mov %o0, GLOBAL_SPARE
+
+	cmp %o2, 0
+	be,pn %XCC, 85f
+	or %o0, %o1, %o3
+	cmp %o2, 16
+	blu,a,pn %XCC, 80f
+	or %o3, %o2, %o3
+
+	xor %o0, %o1, %o4
+	andcc %o4, 0x7, %g0
+	bne,a,pn %XCC, 90f
+	sub %o0, %o1, %o3
+
+	and %o0, 0x7, %o4
+	sub %o4, 0x8, %o4
+	sub %g0, %o4, %o4
+	sub %o2, %o4, %o2
+1:	subcc %o4, 1, %o4
+	EX_LD(LOAD(ldub, %o1, %g1))
+	EX_ST(STORE(stb, %g1, %o0))
+	add %o1, 1, %o1
+	bne,pt %XCC, 1b
+	add %o0, 1, %o0
+
+	andn %o2, 0x7, %g1
+	sub %o2, %g1, %o2
+1:	subcc %g1, 0x8, %g1
+	EX_LD(LOAD(ldx, %o1, %g2))
+	EX_ST(STORE(stx, %g2, %o0))
+	add %o1, 0x8, %o1
+	bne,pt %XCC, 1b
+	add %o0, 0x8, %o0
+
+	brz,pt %o2, 85f
+	sub %o0, %o1, %o3
+	ba,a,pt %XCC, 90f
+
+	.align 64
+80: /* 0 < len <= 16 */
+	andcc %o3, 0x3, %g0
+	bne,pn %XCC, 90f
+	sub %o0, %o1, %o3
+
+1:
+	subcc %o2, 4, %o2
+	EX_LD(LOAD(lduw, %o1, %g1))
+	EX_ST(STORE(stw, %g1, %o1 + %o3))
+	bgu,pt %XCC, 1b
+	add %o1, 4, %o1
+
+85:	retl
+	mov EX_RETVAL(GLOBAL_SPARE), %o0
+
+	.align 32
+90:
+	subcc %o2, 1, %o2
+	EX_LD(LOAD(ldub, %o1, %g1))
+	EX_ST(STORE(stb, %g1, %o1 + %o3))
+	bgu,pt %XCC, 90b
+	add %o1, 1, %o1
+	retl
+	mov EX_RETVAL(GLOBAL_SPARE), %o0
+
+	.size FUNC_NAME, .-FUNC_NAME
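GENmemcpy's structure is easiest to see stripped of delay slots: if source and destination are mutually 8-byte alignable, it copies a byte head, then 8-byte words, then hands any tail to the byte loop at 90; short copies go through the word loop at 80 when mutually 4-byte aligned, and everything else runs byte at a time. A rough C model under those assumptions (names are illustrative, not the kernel's):

	#include <stdint.h>
	#include <stddef.h>

	static void *gen_memcpy_model(void *dst, const void *src, size_t len)
	{
		unsigned char *d = dst;
		const unsigned char *s = src;

		if (((uintptr_t)d ^ (uintptr_t)s) & 7) {
			while (len--)		/* relative misalignment: label 90 */
				*d++ = *s++;
			return dst;
		}
		while (((uintptr_t)d & 7) && len) {	/* byte head */
			*d++ = *s++;
			len--;
		}
		for (; len >= 8; len -= 8, d += 8, s += 8)	/* 8-byte body */
			*(uint64_t *)d = *(const uint64_t *)s;
		while (len--)				/* byte tail: label 90 */
			*d++ = *s++;
		return dst;
	}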
diff --git a/arch/sparc/lib/GENpage.S b/arch/sparc/lib/GENpage.S
new file mode 100644
index 000000000000..2ef9d05f21bc
--- /dev/null
+++ b/arch/sparc/lib/GENpage.S
@@ -0,0 +1,77 @@
+/* GENpage.S: Generic clear and copy page.
+ *
+ * Copyright (C) 2007 (davem@davemloft.net)
+ */
+#include <asm/page.h>
+
+	.text
+	.align 32
+
+GENcopy_user_page:
+	set PAGE_SIZE, %g7
+1:	ldx [%o1 + 0x00], %o2
+	ldx [%o1 + 0x08], %o3
+	ldx [%o1 + 0x10], %o4
+	ldx [%o1 + 0x18], %o5
+	stx %o2, [%o0 + 0x00]
+	stx %o3, [%o0 + 0x08]
+	stx %o4, [%o0 + 0x10]
+	stx %o5, [%o0 + 0x18]
+	ldx [%o1 + 0x20], %o2
+	ldx [%o1 + 0x28], %o3
+	ldx [%o1 + 0x30], %o4
+	ldx [%o1 + 0x38], %o5
+	stx %o2, [%o0 + 0x20]
+	stx %o3, [%o0 + 0x28]
+	stx %o4, [%o0 + 0x30]
+	stx %o5, [%o0 + 0x38]
+	subcc %g7, 64, %g7
+	add %o1, 64, %o1
+	bne,pt %xcc, 1b
+	add %o0, 64, %o0
+	retl
+	nop
+
+GENclear_page:
+GENclear_user_page:
+	set PAGE_SIZE, %g7
+1:	stx %g0, [%o0 + 0x00]
+	stx %g0, [%o0 + 0x08]
+	stx %g0, [%o0 + 0x10]
+	stx %g0, [%o0 + 0x18]
+	stx %g0, [%o0 + 0x20]
+	stx %g0, [%o0 + 0x28]
+	stx %g0, [%o0 + 0x30]
+	stx %g0, [%o0 + 0x38]
+	subcc %g7, 64, %g7
+	bne,pt %xcc, 1b
+	add %o0, 64, %o0
+
+#define BRANCH_ALWAYS 0x10680000
+#define NOP 0x01000000
+#define GEN_DO_PATCH(OLD, NEW) \
+	sethi %hi(NEW), %g1; \
+	or %g1, %lo(NEW), %g1; \
+	sethi %hi(OLD), %g2; \
+	or %g2, %lo(OLD), %g2; \
+	sub %g1, %g2, %g1; \
+	sethi %hi(BRANCH_ALWAYS), %g3; \
+	sll %g1, 11, %g1; \
+	srl %g1, 11 + 2, %g1; \
+	or %g3, %lo(BRANCH_ALWAYS), %g3; \
+	or %g3, %g1, %g3; \
+	stw %g3, [%g2]; \
+	sethi %hi(NOP), %g3; \
+	or %g3, %lo(NOP), %g3; \
+	stw %g3, [%g2 + 0x4]; \
+	flush %g2;
+
+	.globl generic_patch_pageops
+	.type generic_patch_pageops,#function
+generic_patch_pageops:
+	GEN_DO_PATCH(copy_user_page, GENcopy_user_page)
+	GEN_DO_PATCH(_clear_page, GENclear_page)
+	GEN_DO_PATCH(clear_user_page, GENclear_user_page)
+	retl
+	nop
+	.size generic_patch_pageops,.-generic_patch_pageops
diff --git a/arch/sparc/lib/GENpatch.S b/arch/sparc/lib/GENpatch.S
new file mode 100644
index 000000000000..fab9e89f16bd
--- /dev/null
+++ b/arch/sparc/lib/GENpatch.S
@@ -0,0 +1,33 @@
+/* GENpatch.S: Patch Ultra-I routines with generic variant.
+ *
+ * Copyright (C) 2007 David S. Miller <davem@davemloft.net>
+ */
+
+#define BRANCH_ALWAYS 0x10680000
+#define NOP 0x01000000
+#define GEN_DO_PATCH(OLD, NEW) \
+	sethi %hi(NEW), %g1; \
+	or %g1, %lo(NEW), %g1; \
+	sethi %hi(OLD), %g2; \
+	or %g2, %lo(OLD), %g2; \
+	sub %g1, %g2, %g1; \
+	sethi %hi(BRANCH_ALWAYS), %g3; \
+	sll %g1, 11, %g1; \
+	srl %g1, 11 + 2, %g1; \
+	or %g3, %lo(BRANCH_ALWAYS), %g3; \
+	or %g3, %g1, %g3; \
+	stw %g3, [%g2]; \
+	sethi %hi(NOP), %g3; \
+	or %g3, %lo(NOP), %g3; \
+	stw %g3, [%g2 + 0x4]; \
+	flush %g2;
+
+	.globl generic_patch_copyops
+	.type generic_patch_copyops,#function
+generic_patch_copyops:
+	GEN_DO_PATCH(memcpy, GENmemcpy)
+	GEN_DO_PATCH(___copy_from_user, GENcopy_from_user)
+	GEN_DO_PATCH(___copy_to_user, GENcopy_to_user)
+	retl
+	nop
+	.size generic_patch_copyops,.-generic_patch_copyops
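All of the *patch.S files in this commit share this idiom: overwrite the first two instructions of the old routine with a branch-always (plus a delay-slot nop) to the new one, then flush the I-cache line. The sll 11 / srl 13 pair truncates the byte displacement to the 19-bit signed word displacement of the v9 branch format. A hedged C sketch of the computation (names are illustrative):

	#include <stdint.h>

	#define BRANCH_ALWAYS	0x10680000u	/* branch-always template, zero disp */
	#define NOP		0x01000000u

	/* Redirect 'old' to 'new_fn' the way GEN_DO_PATCH does. */
	static void patch_branch(uint32_t *old, void *new_fn)
	{
		long disp = (long)new_fn - (long)old;	/* byte displacement */
		uint32_t disp19 = ((uint32_t)(disp << 11)) >> (11 + 2);
						/* keep 19-bit word displacement */

		old[0] = BRANCH_ALWAYS | disp19;	/* branch to new routine */
		old[1] = NOP;				/* fill the delay slot */
		/* the assembly then executes "flush" on the patched address
		 * to synchronize the instruction cache */
	}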
diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile
index f2650545c774..05ae5c945e35 100644
--- a/arch/sparc/lib/Makefile
+++ b/arch/sparc/lib/Makefile
@@ -18,5 +18,27 @@ lib-$(CONFIG_SPARC32) += lshrdi3.o ashldi3.o
 lib-y += rwsem_$(BITS).o
 lib-$(CONFIG_SPARC32) += muldi3.o bitext.o cmpdi2.o
 
+lib-$(CONFIG_SPARC64) += PeeCeeI.o copy_page.o clear_page.o bzero.o
+lib-$(CONFIG_SPARC64) += csum_copy.o csum_copy_from_user.o csum_copy_to_user.o
+lib-$(CONFIG_SPARC64) += VISsave.o
+lib-$(CONFIG_SPARC64) += bitops.o
+
+lib-$(CONFIG_SPARC64) += U1memcpy.o U1copy_from_user.o U1copy_to_user.o
+
+lib-$(CONFIG_SPARC64) += U3memcpy.o U3copy_from_user.o U3copy_to_user.o
+lib-$(CONFIG_SPARC64) += U3patch.o
+
+lib-$(CONFIG_SPARC64) += NGmemcpy.o NGcopy_from_user.o NGcopy_to_user.o
+lib-$(CONFIG_SPARC64) += NGpatch.o NGpage.o NGbzero.o
+
+lib-$(CONFIG_SPARC64) += NG2memcpy.o NG2copy_from_user.o NG2copy_to_user.o
+lib-$(CONFIG_SPARC64) += NG2patch.o NG2page.o
+
+lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o
+lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o
+
+lib-$(CONFIG_SPARC64) += copy_in_user.o user_fixup.o memmove.o
+lib-$(CONFIG_SPARC64) += mcount.o ipcsum.o xor.o
+
 obj-y += iomap.o
 obj-$(CONFIG_SPARC32) += atomic32.o
diff --git a/arch/sparc/lib/NG2copy_from_user.S b/arch/sparc/lib/NG2copy_from_user.S
new file mode 100644
index 000000000000..c77ef5f22102
--- /dev/null
+++ b/arch/sparc/lib/NG2copy_from_user.S
@@ -0,0 +1,40 @@
+/* NG2copy_from_user.S: Niagara-2 optimized copy from userspace.
+ *
+ * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
+ */
+
+#define EX_LD(x) \
+98:	x; \
+	.section .fixup; \
+	.align 4; \
+99:	wr %g0, ASI_AIUS, %asi;\
+	retl; \
+	mov 1, %o0; \
+	.section __ex_table,"a";\
+	.align 4; \
+	.word 98b, 99b; \
+	.text; \
+	.align 4;
+
+#ifndef ASI_AIUS
+#define ASI_AIUS 0x11
+#endif
+
+#ifndef ASI_BLK_AIUS_4V
+#define ASI_BLK_AIUS_4V 0x17
+#endif
+
+#define FUNC_NAME NG2copy_from_user
+#define LOAD(type,addr,dest) type##a [addr] %asi, dest
+#define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_AIUS_4V, dest
+#define EX_RETVAL(x) 0
+
+#ifdef __KERNEL__
+#define PREAMBLE \
+	rd %asi, %g1; \
+	cmp %g1, ASI_AIUS; \
+	bne,pn %icc, memcpy_user_stub; \
+	nop
+#endif
+
+#include "NG2memcpy.S"
diff --git a/arch/sparc/lib/NG2copy_to_user.S b/arch/sparc/lib/NG2copy_to_user.S
new file mode 100644
index 000000000000..4bd4093acbbd
--- /dev/null
+++ b/arch/sparc/lib/NG2copy_to_user.S
@@ -0,0 +1,49 @@
+/* NG2copy_to_user.S: Niagara-2 optimized copy to userspace.
+ *
+ * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
+ */
+
+#define EX_ST(x) \
+98:	x; \
+	.section .fixup; \
+	.align 4; \
+99:	wr %g0, ASI_AIUS, %asi;\
+	retl; \
+	mov 1, %o0; \
+	.section __ex_table,"a";\
+	.align 4; \
+	.word 98b, 99b; \
+	.text; \
+	.align 4;
+
+#ifndef ASI_AIUS
+#define ASI_AIUS 0x11
+#endif
+
+#ifndef ASI_BLK_AIUS_4V
+#define ASI_BLK_AIUS_4V 0x17
+#endif
+
+#ifndef ASI_BLK_INIT_QUAD_LDD_AIUS
+#define ASI_BLK_INIT_QUAD_LDD_AIUS 0x23
+#endif
+
+#define FUNC_NAME NG2copy_to_user
+#define STORE(type,src,addr) type##a src, [addr] ASI_AIUS
+#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_AIUS
+#define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_AIUS_4V
+#define EX_RETVAL(x) 0
+
+#ifdef __KERNEL__
+/* Writing to %asi is _expensive_ so we hardcode it.
+ * Reading %asi to check for KERNEL_DS is comparatively
+ * cheap.
+ */
+#define PREAMBLE \
+	rd %asi, %g1; \
+	cmp %g1, ASI_AIUS; \
+	bne,pn %icc, memcpy_user_stub; \
+	nop
+#endif
+
+#include "NG2memcpy.S"
diff --git a/arch/sparc/lib/NG2memcpy.S b/arch/sparc/lib/NG2memcpy.S
new file mode 100644
index 000000000000..0aed75653b50
--- /dev/null
+++ b/arch/sparc/lib/NG2memcpy.S
@@ -0,0 +1,520 @@
+/* NG2memcpy.S: Niagara-2 optimized memcpy.
+ *
+ * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
+ */
+
+#ifdef __KERNEL__
+#include <asm/visasm.h>
+#include <asm/asi.h>
+#define GLOBAL_SPARE %g7
+#else
+#define ASI_PNF 0x82
+#define ASI_BLK_P 0xf0
+#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
+#define FPRS_FEF 0x04
+#ifdef MEMCPY_DEBUG
+#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
+	clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
+#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#else
+#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
+#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#endif
+#define GLOBAL_SPARE %g5
+#endif
+
+#ifndef STORE_ASI
+#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
+#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
+#else
+#define STORE_ASI 0x80 /* ASI_P */
+#endif
+#endif
+
+#ifndef EX_LD
+#define EX_LD(x) x
+#endif
+
+#ifndef EX_ST
+#define EX_ST(x) x
+#endif
+
+#ifndef EX_RETVAL
+#define EX_RETVAL(x) x
+#endif
+
+#ifndef LOAD
+#define LOAD(type,addr,dest) type [addr], dest
+#endif
+
+#ifndef LOAD_BLK
+#define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_P, dest
+#endif
+
+#ifndef STORE
+#ifndef MEMCPY_DEBUG
+#define STORE(type,src,addr) type src, [addr]
+#else
+#define STORE(type,src,addr) type##a src, [addr] 0x80
+#endif
+#endif
+
+#ifndef STORE_BLK
+#define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P
+#endif
+
+#ifndef STORE_INIT
+#define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI
+#endif
+
+#ifndef FUNC_NAME
+#define FUNC_NAME NG2memcpy
+#endif
+
+#ifndef PREAMBLE
+#define PREAMBLE
+#endif
+
+#ifndef XCC
+#define XCC xcc
+#endif
+
+#define FREG_FROB(x0, x1, x2, x3, x4, x5, x6, x7, x8) \
+	faligndata %x0, %x1, %f0; \
+	faligndata %x1, %x2, %f2; \
+	faligndata %x2, %x3, %f4; \
+	faligndata %x3, %x4, %f6; \
+	faligndata %x4, %x5, %f8; \
+	faligndata %x5, %x6, %f10; \
+	faligndata %x6, %x7, %f12; \
+	faligndata %x7, %x8, %f14;
+
+#define FREG_MOVE_1(x0) \
+	fmovd %x0, %f0;
+#define FREG_MOVE_2(x0, x1) \
+	fmovd %x0, %f0; \
+	fmovd %x1, %f2;
+#define FREG_MOVE_3(x0, x1, x2) \
+	fmovd %x0, %f0; \
+	fmovd %x1, %f2; \
+	fmovd %x2, %f4;
+#define FREG_MOVE_4(x0, x1, x2, x3) \
+	fmovd %x0, %f0; \
+	fmovd %x1, %f2; \
+	fmovd %x2, %f4; \
+	fmovd %x3, %f6;
+#define FREG_MOVE_5(x0, x1, x2, x3, x4) \
+	fmovd %x0, %f0; \
+	fmovd %x1, %f2; \
+	fmovd %x2, %f4; \
+	fmovd %x3, %f6; \
+	fmovd %x4, %f8;
+#define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \
+	fmovd %x0, %f0; \
+	fmovd %x1, %f2; \
+	fmovd %x2, %f4; \
+	fmovd %x3, %f6; \
+	fmovd %x4, %f8; \
+	fmovd %x5, %f10;
+#define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \
+	fmovd %x0, %f0; \
+	fmovd %x1, %f2; \
+	fmovd %x2, %f4; \
+	fmovd %x3, %f6; \
+	fmovd %x4, %f8; \
+	fmovd %x5, %f10; \
+	fmovd %x6, %f12;
+#define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \
+	fmovd %x0, %f0; \
+	fmovd %x1, %f2; \
+	fmovd %x2, %f4; \
+	fmovd %x3, %f6; \
+	fmovd %x4, %f8; \
+	fmovd %x5, %f10; \
+	fmovd %x6, %f12; \
+	fmovd %x7, %f14;
+#define FREG_LOAD_1(base, x0) \
+	EX_LD(LOAD(ldd, base + 0x00, %x0))
+#define FREG_LOAD_2(base, x0, x1) \
+	EX_LD(LOAD(ldd, base + 0x00, %x0)); \
+	EX_LD(LOAD(ldd, base + 0x08, %x1));
+#define FREG_LOAD_3(base, x0, x1, x2) \
+	EX_LD(LOAD(ldd, base + 0x00, %x0)); \
+	EX_LD(LOAD(ldd, base + 0x08, %x1)); \
+	EX_LD(LOAD(ldd, base + 0x10, %x2));
+#define FREG_LOAD_4(base, x0, x1, x2, x3) \
+	EX_LD(LOAD(ldd, base + 0x00, %x0)); \
+	EX_LD(LOAD(ldd, base + 0x08, %x1)); \
+	EX_LD(LOAD(ldd, base + 0x10, %x2)); \
+	EX_LD(LOAD(ldd, base + 0x18, %x3));
+#define FREG_LOAD_5(base, x0, x1, x2, x3, x4) \
+	EX_LD(LOAD(ldd, base + 0x00, %x0)); \
+	EX_LD(LOAD(ldd, base + 0x08, %x1)); \
+	EX_LD(LOAD(ldd, base + 0x10, %x2)); \
+	EX_LD(LOAD(ldd, base + 0x18, %x3)); \
+	EX_LD(LOAD(ldd, base + 0x20, %x4));
+#define FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5) \
+	EX_LD(LOAD(ldd, base + 0x00, %x0)); \
+	EX_LD(LOAD(ldd, base + 0x08, %x1)); \
+	EX_LD(LOAD(ldd, base + 0x10, %x2)); \
+	EX_LD(LOAD(ldd, base + 0x18, %x3)); \
+	EX_LD(LOAD(ldd, base + 0x20, %x4)); \
+	EX_LD(LOAD(ldd, base + 0x28, %x5));
+#define FREG_LOAD_7(base, x0, x1, x2, x3, x4, x5, x6) \
+	EX_LD(LOAD(ldd, base + 0x00, %x0)); \
+	EX_LD(LOAD(ldd, base + 0x08, %x1)); \
+	EX_LD(LOAD(ldd, base + 0x10, %x2)); \
+	EX_LD(LOAD(ldd, base + 0x18, %x3)); \
+	EX_LD(LOAD(ldd, base + 0x20, %x4)); \
+	EX_LD(LOAD(ldd, base + 0x28, %x5)); \
+	EX_LD(LOAD(ldd, base + 0x30, %x6));
+
+	.register %g2,#scratch
+	.register %g3,#scratch
+
+	.text
+	.align 64
+
+	.globl FUNC_NAME
+	.type FUNC_NAME,#function
+FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
+	srlx %o2, 31, %g2
+	cmp %g2, 0
+	tne %xcc, 5
+	PREAMBLE
+	mov %o0, GLOBAL_SPARE
+	cmp %o2, 0
+	be,pn %XCC, 85f
+	or %o0, %o1, %o3
+	cmp %o2, 16
+	blu,a,pn %XCC, 80f
+	or %o3, %o2, %o3
+
+	/* 2 blocks (128 bytes) is the minimum we can do the block
+	 * copy with. We need to ensure that we'll iterate at least
+	 * once in the block copy loop. At worst we'll need to align
+	 * the destination to a 64-byte boundary which can chew up
+	 * to (64 - 1) bytes from the length before we perform the
+	 * block copy loop.
+	 *
+	 * However, the cut-off point, performance wise, is around
+	 * 4 64-byte blocks.
+	 */
+	cmp %o2, (4 * 64)
+	blu,pt %XCC, 75f
+	andcc %o3, 0x7, %g0
+
+	/* %o0: dst
+	 * %o1: src
+	 * %o2: len (known to be >= 128)
+	 *
+	 * The block copy loops can use %o4, %g2, %g3 as
+	 * temporaries while copying the data. %o5 must
+	 * be preserved between VISEntryHalf and VISExitHalf
+	 */
+
+	LOAD(prefetch, %o1 + 0x000, #one_read)
+	LOAD(prefetch, %o1 + 0x040, #one_read)
+	LOAD(prefetch, %o1 + 0x080, #one_read)
+
+	/* Align destination on 64-byte boundary. */
+	andcc %o0, (64 - 1), %o4
+	be,pt %XCC, 2f
+	sub %o4, 64, %o4
+	sub %g0, %o4, %o4 ! bytes to align dst
+	sub %o2, %o4, %o2
+1:	subcc %o4, 1, %o4
+	EX_LD(LOAD(ldub, %o1, %g1))
+	EX_ST(STORE(stb, %g1, %o0))
+	add %o1, 1, %o1
+	bne,pt %XCC, 1b
+	add %o0, 1, %o0
+
+2:
+	/* Clobbers o5/g1/g2/g3/g7/icc/xcc. We must preserve
+	 * o5 from here until we hit VISExitHalf.
+	 */
+	VISEntryHalf
+
+	alignaddr %o1, %g0, %g0
+
+	add %o1, (64 - 1), %o4
+	andn %o4, (64 - 1), %o4
+	andn %o2, (64 - 1), %g1
+	sub %o2, %g1, %o2
+
+	and %o1, (64 - 1), %g2
+	add %o1, %g1, %o1
+	sub %o0, %o4, %g3
+	brz,pt %g2, 190f
+	cmp %g2, 32
+	blu,a 5f
+	cmp %g2, 16
+	cmp %g2, 48
+	blu,a 4f
+	cmp %g2, 40
+	cmp %g2, 56
+	blu 170f
+	nop
+	ba,a,pt %xcc, 180f
+
+4:	/* 32 <= low bits < 48 */
+	blu 150f
+	nop
+	ba,a,pt %xcc, 160f
+5:	/* 0 < low bits < 32 */
+	blu,a 6f
+	cmp %g2, 8
+	cmp %g2, 24
+	blu 130f
+	nop
+	ba,a,pt %xcc, 140f
+6:	/* 0 < low bits < 16 */
+	bgeu 120f
+	nop
+	/* fall through for 0 < low bits < 8 */
+110:	sub %o4, 64, %g2
+	EX_LD(LOAD_BLK(%g2, %f0))
+1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
+	EX_LD(LOAD_BLK(%o4, %f16))
+	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16)
+	EX_ST(STORE_BLK(%f0, %o4 + %g3))
+	FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30)
+	subcc %g1, 64, %g1
+	add %o4, 64, %o4
+	bne,pt %xcc, 1b
+	LOAD(prefetch, %o4 + 64, #one_read)
+	ba,pt %xcc, 195f
+	nop
+
+120:	sub %o4, 56, %g2
+	FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12)
+1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
+	EX_LD(LOAD_BLK(%o4, %f16))
+	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18)
+	EX_ST(STORE_BLK(%f0, %o4 + %g3))
+	FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30)
+	subcc %g1, 64, %g1
+	add %o4, 64, %o4
+	bne,pt %xcc, 1b
+	LOAD(prefetch, %o4 + 64, #one_read)
+	ba,pt %xcc, 195f
+	nop
+
+130:	sub %o4, 48, %g2
+	FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10)
+1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
+	EX_LD(LOAD_BLK(%o4, %f16))
+	FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20)
+	EX_ST(STORE_BLK(%f0, %o4 + %g3))
+	FREG_MOVE_6(f20, f22, f24, f26, f28, f30)
+	subcc %g1, 64, %g1
+	add %o4, 64, %o4
+	bne,pt %xcc, 1b
+	LOAD(prefetch, %o4 + 64, #one_read)
+	ba,pt %xcc, 195f
+	nop
+
+140:	sub %o4, 40, %g2
+	FREG_LOAD_5(%g2, f0, f2, f4, f6, f8)
+1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
+	EX_LD(LOAD_BLK(%o4, %f16))
+	FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22)
+	EX_ST(STORE_BLK(%f0, %o4 + %g3))
+	FREG_MOVE_5(f22, f24, f26, f28, f30)
+	subcc %g1, 64, %g1
+	add %o4, 64, %o4
+	bne,pt %xcc, 1b
+	LOAD(prefetch, %o4 + 64, #one_read)
+	ba,pt %xcc, 195f
+	nop
+
+150:	sub %o4, 32, %g2
+	FREG_LOAD_4(%g2, f0, f2, f4, f6)
+1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
+	EX_LD(LOAD_BLK(%o4, %f16))
+	FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24)
+	EX_ST(STORE_BLK(%f0, %o4 + %g3))
+	FREG_MOVE_4(f24, f26, f28, f30)
+	subcc %g1, 64, %g1
+	add %o4, 64, %o4
+	bne,pt %xcc, 1b
+	LOAD(prefetch, %o4 + 64, #one_read)
+	ba,pt %xcc, 195f
+	nop
+
+160:	sub %o4, 24, %g2
+	FREG_LOAD_3(%g2, f0, f2, f4)
+1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
+	EX_LD(LOAD_BLK(%o4, %f16))
+	FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26)
+	EX_ST(STORE_BLK(%f0, %o4 + %g3))
+	FREG_MOVE_3(f26, f28, f30)
+	subcc %g1, 64, %g1
+	add %o4, 64, %o4
+	bne,pt %xcc, 1b
+	LOAD(prefetch, %o4 + 64, #one_read)
+	ba,pt %xcc, 195f
+	nop
+
+170:	sub %o4, 16, %g2
+	FREG_LOAD_2(%g2, f0, f2)
+1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
+	EX_LD(LOAD_BLK(%o4, %f16))
+	FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28)
+	EX_ST(STORE_BLK(%f0, %o4 + %g3))
+	FREG_MOVE_2(f28, f30)
+	subcc %g1, 64, %g1
+	add %o4, 64, %o4
+	bne,pt %xcc, 1b
+	LOAD(prefetch, %o4 + 64, #one_read)
+	ba,pt %xcc, 195f
+	nop
+
+180:	sub %o4, 8, %g2
+	FREG_LOAD_1(%g2, f0)
+1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
+	EX_LD(LOAD_BLK(%o4, %f16))
+	FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30)
+	EX_ST(STORE_BLK(%f0, %o4 + %g3))
+	FREG_MOVE_1(f30)
+	subcc %g1, 64, %g1
+	add %o4, 64, %o4
+	bne,pt %xcc, 1b
+	LOAD(prefetch, %o4 + 64, #one_read)
+	ba,pt %xcc, 195f
+	nop
+
+190:
+1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
+	subcc %g1, 64, %g1
+	EX_LD(LOAD_BLK(%o4, %f0))
+	EX_ST(STORE_BLK(%f0, %o4 + %g3))
+	add %o4, 64, %o4
+	bne,pt %xcc, 1b
+	LOAD(prefetch, %o4 + 64, #one_read)
+
+195:
+	add %o4, %g3, %o0
+	membar #Sync
+
+	VISExitHalf
+
+	/* %o2 contains any final bytes still needed to be copied
+	 * over. If anything is left, we copy it one byte at a time.
+	 */
+	brz,pt %o2, 85f
+	sub %o0, %o1, %o3
+	ba,a,pt %XCC, 90f
+
+	.align 64
+75: /* 16 < len <= 64 */
+	bne,pn %XCC, 75f
+	sub %o0, %o1, %o3
+
+72:
+	andn %o2, 0xf, %o4
+	and %o2, 0xf, %o2
+1:	subcc %o4, 0x10, %o4
+	EX_LD(LOAD(ldx, %o1, %o5))
+	add %o1, 0x08, %o1
+	EX_LD(LOAD(ldx, %o1, %g1))
+	sub %o1, 0x08, %o1
+	EX_ST(STORE(stx, %o5, %o1 + %o3))
+	add %o1, 0x8, %o1
+	EX_ST(STORE(stx, %g1, %o1 + %o3))
+	bgu,pt %XCC, 1b
+	add %o1, 0x8, %o1
+73:	andcc %o2, 0x8, %g0
+	be,pt %XCC, 1f
+	nop
+	sub %o2, 0x8, %o2
+	EX_LD(LOAD(ldx, %o1, %o5))
+	EX_ST(STORE(stx, %o5, %o1 + %o3))
+	add %o1, 0x8, %o1
+1:	andcc %o2, 0x4, %g0
+	be,pt %XCC, 1f
+	nop
+	sub %o2, 0x4, %o2
+	EX_LD(LOAD(lduw, %o1, %o5))
+	EX_ST(STORE(stw, %o5, %o1 + %o3))
+	add %o1, 0x4, %o1
+1:	cmp %o2, 0
+	be,pt %XCC, 85f
+	nop
+	ba,pt %xcc, 90f
+	nop
+
+75:
+	andcc %o0, 0x7, %g1
+	sub %g1, 0x8, %g1
+	be,pn %icc, 2f
+	sub %g0, %g1, %g1
+	sub %o2, %g1, %o2
+
+1:	subcc %g1, 1, %g1
+	EX_LD(LOAD(ldub, %o1, %o5))
+	EX_ST(STORE(stb, %o5, %o1 + %o3))
+	bgu,pt %icc, 1b
+	add %o1, 1, %o1
+
+2:	add %o1, %o3, %o0
+	andcc %o1, 0x7, %g1
+	bne,pt %icc, 8f
+	sll %g1, 3, %g1
+
+	cmp %o2, 16
+	bgeu,pt %icc, 72b
+	nop
+	ba,a,pt %xcc, 73b
+
+8:	mov 64, %o3
+	andn %o1, 0x7, %o1
+	EX_LD(LOAD(ldx, %o1, %g2))
+	sub %o3, %g1, %o3
+	andn %o2, 0x7, %o4
+	sllx %g2, %g1, %g2
+1:	add %o1, 0x8, %o1
+	EX_LD(LOAD(ldx, %o1, %g3))
+	subcc %o4, 0x8, %o4
+	srlx %g3, %o3, %o5
+	or %o5, %g2, %o5
+	EX_ST(STORE(stx, %o5, %o0))
+	add %o0, 0x8, %o0
+	bgu,pt %icc, 1b
+	sllx %g3, %g1, %g2
+
+	srl %g1, 3, %g1
+	andcc %o2, 0x7, %o2
+	be,pn %icc, 85f
+	add %o1, %g1, %o1
+	ba,pt %xcc, 90f
+	sub %o0, %o1, %o3
+
+	.align 64
+80: /* 0 < len <= 16 */
+	andcc %o3, 0x3, %g0
+	bne,pn %XCC, 90f
+	sub %o0, %o1, %o3
+
+1:
+	subcc %o2, 4, %o2
+	EX_LD(LOAD(lduw, %o1, %g1))
+	EX_ST(STORE(stw, %g1, %o1 + %o3))
+	bgu,pt %XCC, 1b
+	add %o1, 4, %o1
+
+85:	retl
+	mov EX_RETVAL(GLOBAL_SPARE), %o0
+
+	.align 32
+90:
+	subcc %o2, 1, %o2
+	EX_LD(LOAD(ldub, %o1, %g1))
+	EX_ST(STORE(stb, %g1, %o1 + %o3))
+	bgu,pt %XCC, 90b
+	add %o1, 1, %o1
+	retl
+	mov EX_RETVAL(GLOBAL_SPARE), %o0
+
+	.size FUNC_NAME, .-FUNC_NAME
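The heart of NG2memcpy's misaligned path is FREG_FROB, a chain of faligndata instructions that turns a 72-byte window of source doublewords into 64 aligned output bytes per iteration; the 110/120/.../180 entry points pre-load however many doublewords the source offset requires. In scalar terms each faligndata is a doubleword funnel shift. A hedged C model for one 8-byte output, valid for a big-endian machine with the byte offset strictly between 0 and 8 (the aligned case takes the 190 path instead):

	#include <stdint.h>

	/* Scalar model of faligndata lo,hi -> out, where 'off' is the
	 * source address's byte offset captured by alignaddr, 0 < off < 8.
	 * Illustrative only; the real code does this in the FPU, eight
	 * doublewords per loop iteration. */
	static uint64_t falign_model(uint64_t lo, uint64_t hi, unsigned int off)
	{
		unsigned int s = off * 8;

		return (lo << s) | (hi >> (64 - s));	/* funnel shift */
	}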
diff --git a/arch/sparc/lib/NG2page.S b/arch/sparc/lib/NG2page.S
new file mode 100644
index 000000000000..73b6b7c72cbf
--- /dev/null
+++ b/arch/sparc/lib/NG2page.S
@@ -0,0 +1,61 @@
+/* NG2page.S: Niagara-2 optimized clear and copy page.
+ *
+ * Copyright (C) 2007 (davem@davemloft.net)
+ */
+
+#include <asm/asi.h>
+#include <asm/page.h>
+#include <asm/visasm.h>
+
+	.text
+	.align 32
+
+	/* This is heavily simplified from the sun4u variants
+	 * because Niagara-2 does not have any D-cache aliasing issues.
+	 */
+NG2copy_user_page: /* %o0=dest, %o1=src, %o2=vaddr */
+	prefetch [%o1 + 0x00], #one_read
+	prefetch [%o1 + 0x40], #one_read
+	VISEntryHalf
+	set PAGE_SIZE, %g7
+	sub %o0, %o1, %g3
+1:	stxa %g0, [%o1 + %g3] ASI_BLK_INIT_QUAD_LDD_P
+	subcc %g7, 64, %g7
+	ldda [%o1] ASI_BLK_P, %f0
+	stda %f0, [%o1 + %g3] ASI_BLK_P
+	add %o1, 64, %o1
+	bne,pt %xcc, 1b
+	prefetch [%o1 + 0x40], #one_read
+	membar #Sync
+	VISExitHalf
+	retl
+	nop
+
+#define BRANCH_ALWAYS 0x10680000
+#define NOP 0x01000000
+#define NG_DO_PATCH(OLD, NEW) \
+	sethi %hi(NEW), %g1; \
+	or %g1, %lo(NEW), %g1; \
+	sethi %hi(OLD), %g2; \
+	or %g2, %lo(OLD), %g2; \
+	sub %g1, %g2, %g1; \
+	sethi %hi(BRANCH_ALWAYS), %g3; \
+	sll %g1, 11, %g1; \
+	srl %g1, 11 + 2, %g1; \
+	or %g3, %lo(BRANCH_ALWAYS), %g3; \
+	or %g3, %g1, %g3; \
+	stw %g3, [%g2]; \
+	sethi %hi(NOP), %g3; \
+	or %g3, %lo(NOP), %g3; \
+	stw %g3, [%g2 + 0x4]; \
+	flush %g2;
+
+	.globl niagara2_patch_pageops
+	.type niagara2_patch_pageops,#function
+niagara2_patch_pageops:
+	NG_DO_PATCH(copy_user_page, NG2copy_user_page)
+	NG_DO_PATCH(_clear_page, NGclear_page)
+	NG_DO_PATCH(clear_user_page, NGclear_user_page)
+	retl
+	nop
+	.size niagara2_patch_pageops,.-niagara2_patch_pageops
diff --git a/arch/sparc/lib/NG2patch.S b/arch/sparc/lib/NG2patch.S
new file mode 100644
index 000000000000..28c36f06a6d1
--- /dev/null
+++ b/arch/sparc/lib/NG2patch.S
@@ -0,0 +1,33 @@
+/* NG2patch.S: Patch Ultra-I routines with Niagara-2 variant.
+ *
+ * Copyright (C) 2007 David S. Miller <davem@davemloft.net>
+ */
+
+#define BRANCH_ALWAYS 0x10680000
+#define NOP 0x01000000
+#define NG_DO_PATCH(OLD, NEW) \
+	sethi %hi(NEW), %g1; \
+	or %g1, %lo(NEW), %g1; \
+	sethi %hi(OLD), %g2; \
+	or %g2, %lo(OLD), %g2; \
+	sub %g1, %g2, %g1; \
+	sethi %hi(BRANCH_ALWAYS), %g3; \
+	sll %g1, 11, %g1; \
+	srl %g1, 11 + 2, %g1; \
+	or %g3, %lo(BRANCH_ALWAYS), %g3; \
+	or %g3, %g1, %g3; \
+	stw %g3, [%g2]; \
+	sethi %hi(NOP), %g3; \
+	or %g3, %lo(NOP), %g3; \
+	stw %g3, [%g2 + 0x4]; \
+	flush %g2;
+
+	.globl niagara2_patch_copyops
+	.type niagara2_patch_copyops,#function
+niagara2_patch_copyops:
+	NG_DO_PATCH(memcpy, NG2memcpy)
+	NG_DO_PATCH(___copy_from_user, NG2copy_from_user)
+	NG_DO_PATCH(___copy_to_user, NG2copy_to_user)
+	retl
+	nop
+	.size niagara2_patch_copyops,.-niagara2_patch_copyops
diff --git a/arch/sparc/lib/NGbzero.S b/arch/sparc/lib/NGbzero.S
new file mode 100644
index 000000000000..814d5f7a45e1
--- /dev/null
+++ b/arch/sparc/lib/NGbzero.S
@@ -0,0 +1,164 @@
+/* NGbzero.S: Niagara optimized memset/clear_user.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+#include <asm/asi.h>
+
+#define EX_ST(x,y) \
+98:	x,y; \
+	.section .fixup; \
+	.align 4; \
+99:	retl; \
+	mov %o1, %o0; \
+	.section __ex_table,"a";\
+	.align 4; \
+	.word 98b, 99b; \
+	.text; \
+	.align 4;
+
+	.text
+
+	.globl NGmemset
+	.type NGmemset, #function
+NGmemset: /* %o0=buf, %o1=pat, %o2=len */
+	and %o1, 0xff, %o3
+	mov %o2, %o1
+	sllx %o3, 8, %g1
+	or %g1, %o3, %o2
+	sllx %o2, 16, %g1
+	or %g1, %o2, %o2
+	sllx %o2, 32, %g1
+	ba,pt %xcc, 1f
+	or %g1, %o2, %o2
+
+	.globl NGbzero
+	.type NGbzero, #function
+NGbzero:
+	clr %o2
+1:	brz,pn %o1, NGbzero_return
+	mov %o0, %o3
+
+	/* %o5: saved %asi, restored at NGbzero_done
+	 * %g7: store-init %asi to use
+	 * %o4: non-store-init %asi to use
+	 */
+	rd %asi, %o5
+	mov ASI_BLK_INIT_QUAD_LDD_P, %g7
+	mov ASI_P, %o4
+	wr %o4, 0x0, %asi
+
+NGbzero_from_clear_user:
+	cmp %o1, 15
+	bl,pn %icc, NGbzero_tiny
+	andcc %o0, 0x7, %g1
+	be,pt %xcc, 2f
+	mov 8, %g2
+	sub %g2, %g1, %g1
+	sub %o1, %g1, %o1
+1:	EX_ST(stba %o2, [%o0 + 0x00] %asi)
+	subcc %g1, 1, %g1
+	bne,pt %xcc, 1b
+	add %o0, 1, %o0
+2:	cmp %o1, 128
+	bl,pn %icc, NGbzero_medium
+	andcc %o0, (64 - 1), %g1
+	be,pt %xcc, NGbzero_pre_loop
+	mov 64, %g2
+	sub %g2, %g1, %g1
+	sub %o1, %g1, %o1
+1:	EX_ST(stxa %o2, [%o0 + 0x00] %asi)
+	subcc %g1, 8, %g1
+	bne,pt %xcc, 1b
+	add %o0, 8, %o0
+
+NGbzero_pre_loop:
+	wr %g7, 0x0, %asi
+	andn %o1, (64 - 1), %g1
+	sub %o1, %g1, %o1
+NGbzero_loop:
+	EX_ST(stxa %o2, [%o0 + 0x00] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x08] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x10] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x18] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x20] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x28] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x30] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x38] %asi)
+	subcc %g1, 64, %g1
+	bne,pt %xcc, NGbzero_loop
+	add %o0, 64, %o0
+
+	membar #Sync
+	wr %o4, 0x0, %asi
+	brz,pn %o1, NGbzero_done
+NGbzero_medium:
+	andncc %o1, 0x7, %g1
+	be,pn %xcc, 2f
+	sub %o1, %g1, %o1
+1:	EX_ST(stxa %o2, [%o0 + 0x00] %asi)
+	subcc %g1, 8, %g1
+	bne,pt %xcc, 1b
+	add %o0, 8, %o0
+2:	brz,pt %o1, NGbzero_done
+	nop
+
+NGbzero_tiny:
+1:	EX_ST(stba %o2, [%o0 + 0x00] %asi)
+	subcc %o1, 1, %o1
+	bne,pt %icc, 1b
+	add %o0, 1, %o0
+
+	/* fallthrough */
+
+NGbzero_done:
+	wr %o5, 0x0, %asi
+
+NGbzero_return:
+	retl
+	mov %o3, %o0
+	.size NGbzero, .-NGbzero
+	.size NGmemset, .-NGmemset
+
+	.globl NGclear_user
+	.type NGclear_user, #function
+NGclear_user: /* %o0=buf, %o1=len */
+	rd %asi, %o5
+	brz,pn %o1, NGbzero_done
+	clr %o3
+	cmp %o5, ASI_AIUS
+	bne,pn %icc, NGbzero
+	clr %o2
+	mov ASI_BLK_INIT_QUAD_LDD_AIUS, %g7
+	ba,pt %xcc, NGbzero_from_clear_user
+	mov ASI_AIUS, %o4
+	.size NGclear_user, .-NGclear_user
+
+#define BRANCH_ALWAYS 0x10680000
+#define NOP 0x01000000
+#define NG_DO_PATCH(OLD, NEW) \
+	sethi %hi(NEW), %g1; \
+	or %g1, %lo(NEW), %g1; \
+	sethi %hi(OLD), %g2; \
+	or %g2, %lo(OLD), %g2; \
+	sub %g1, %g2, %g1; \
+	sethi %hi(BRANCH_ALWAYS), %g3; \
+	sll %g1, 11, %g1; \
+	srl %g1, 11 + 2, %g1; \
+	or %g3, %lo(BRANCH_ALWAYS), %g3; \
+	or %g3, %g1, %g3; \
+	stw %g3, [%g2]; \
+	sethi %hi(NOP), %g3; \
+	or %g3, %lo(NOP), %g3; \
+	stw %g3, [%g2 + 0x4]; \
+	flush %g2;
+
+	.globl niagara_patch_bzero
+	.type niagara_patch_bzero,#function
+niagara_patch_bzero:
+	NG_DO_PATCH(memset, NGmemset)
+	NG_DO_PATCH(__bzero, NGbzero)
+	NG_DO_PATCH(__clear_user, NGclear_user)
+	NG_DO_PATCH(tsb_init, NGtsb_init)
+	retl
+	nop
+	.size niagara_patch_bzero,.-niagara_patch_bzero
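NGbzero differs from GENbzero mainly in switching %asi to the block-init ASI (kept in %g7) for the 64-byte loop, which lets Niagara allocate the L2 line without first reading it from memory, at the cost of the trailing membar #Sync. The length carve-up is the same in both; roughly, in C (an illustrative sketch, not kernel code):

	#include <stdint.h>
	#include <stddef.h>

	static void bzero_model(unsigned char *p, size_t len)
	{
		if (len < 15) {			/* tiny: pure byte stores */
			while (len--)
				*p++ = 0;
			return;
		}
		while ((uintptr_t)p & 7) {	/* byte head: align to 8 */
			*p++ = 0;
			len--;
		}
		if (len >= 128) {
			while ((uintptr_t)p & 63) {	/* align to 64 */
				*(uint64_t *)p = 0;
				p += 8;
				len -= 8;
			}
			while (len >= 64) {	/* block-init territory */
				for (int i = 0; i < 8; i++)
					((uint64_t *)p)[i] = 0;
				p += 64;
				len -= 64;
			}
		}
		while (len >= 8) {		/* 8-byte tail words */
			*(uint64_t *)p = 0;
			p += 8;
			len -= 8;
		}
		while (len--)			/* final bytes */
			*p++ = 0;
	}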
diff --git a/arch/sparc/lib/NGcopy_from_user.S b/arch/sparc/lib/NGcopy_from_user.S
new file mode 100644
index 000000000000..e7f433f71b42
--- /dev/null
+++ b/arch/sparc/lib/NGcopy_from_user.S
@@ -0,0 +1,37 @@
+/* NGcopy_from_user.S: Niagara optimized copy from userspace.
+ *
+ * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
+ */
+
+#define EX_LD(x) \
+98:	x; \
+	.section .fixup; \
+	.align 4; \
+99:	wr %g0, ASI_AIUS, %asi;\
+	ret; \
+	restore %g0, 1, %o0; \
+	.section __ex_table,"a";\
+	.align 4; \
+	.word 98b, 99b; \
+	.text; \
+	.align 4;
+
+#ifndef ASI_AIUS
+#define ASI_AIUS 0x11
+#endif
+
+#define FUNC_NAME NGcopy_from_user
+#define LOAD(type,addr,dest) type##a [addr] ASI_AIUS, dest
+#define LOAD_TWIN(addr_reg,dest0,dest1) \
+	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_AIUS, dest0
+#define EX_RETVAL(x) %g0
+
+#ifdef __KERNEL__
+#define PREAMBLE \
+	rd %asi, %g1; \
+	cmp %g1, ASI_AIUS; \
+	bne,pn %icc, memcpy_user_stub; \
+	nop
+#endif
+
+#include "NGmemcpy.S"
diff --git a/arch/sparc/lib/NGcopy_to_user.S b/arch/sparc/lib/NGcopy_to_user.S
new file mode 100644
index 000000000000..6ea01c5532a0
--- /dev/null
+++ b/arch/sparc/lib/NGcopy_to_user.S
@@ -0,0 +1,40 @@
+/* NGcopy_to_user.S: Niagara optimized copy to userspace.
+ *
+ * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
+ */
+
+#define EX_ST(x) \
+98:	x; \
+	.section .fixup; \
+	.align 4; \
+99:	wr %g0, ASI_AIUS, %asi;\
+	ret; \
+	restore %g0, 1, %o0; \
+	.section __ex_table,"a";\
+	.align 4; \
+	.word 98b, 99b; \
+	.text; \
+	.align 4;
+
+#ifndef ASI_AIUS
+#define ASI_AIUS 0x11
+#endif
+
+#define FUNC_NAME NGcopy_to_user
+#define STORE(type,src,addr) type##a src, [addr] ASI_AIUS
+#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_AIUS
+#define EX_RETVAL(x) %g0
+
+#ifdef __KERNEL__
+/* Writing to %asi is _expensive_ so we hardcode it.
+ * Reading %asi to check for KERNEL_DS is comparatively
+ * cheap.
+ */
+#define PREAMBLE \
+	rd %asi, %g1; \
+	cmp %g1, ASI_AIUS; \
+	bne,pn %icc, memcpy_user_stub; \
+	nop
+#endif
+
+#include "NGmemcpy.S"
diff --git a/arch/sparc/lib/NGmemcpy.S b/arch/sparc/lib/NGmemcpy.S
new file mode 100644
index 000000000000..96a14caf6966
--- /dev/null
+++ b/arch/sparc/lib/NGmemcpy.S
@@ -0,0 +1,425 @@
+/* NGmemcpy.S: Niagara optimized memcpy.
+ *
+ * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
+ */
+
+#ifdef __KERNEL__
+#include <asm/asi.h>
+#include <asm/thread_info.h>
+#define GLOBAL_SPARE %g7
+#define RESTORE_ASI(TMP) \
+	ldub [%g6 + TI_CURRENT_DS], TMP; \
+	wr TMP, 0x0, %asi;
+#else
+#define GLOBAL_SPARE %g5
+#define RESTORE_ASI(TMP) \
+	wr %g0, ASI_PNF, %asi
+#endif
+
+#ifdef __sparc_v9__
+#define SAVE_AMOUNT 128
+#else
+#define SAVE_AMOUNT 64
+#endif
+
+#ifndef STORE_ASI
+#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
+#endif
+
+#ifndef EX_LD
+#define EX_LD(x) x
+#endif
+
+#ifndef EX_ST
+#define EX_ST(x) x
+#endif
+
+#ifndef EX_RETVAL
+#define EX_RETVAL(x) x
+#endif
+
+#ifndef LOAD
+#ifndef MEMCPY_DEBUG
+#define LOAD(type,addr,dest) type [addr], dest
+#else
+#define LOAD(type,addr,dest) type##a [addr] 0x80, dest
+#endif
+#endif
+
+#ifndef LOAD_TWIN
+#define LOAD_TWIN(addr_reg,dest0,dest1) \
+	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
+#endif
+
+#ifndef STORE
+#define STORE(type,src,addr) type src, [addr]
+#endif
+
+#ifndef STORE_INIT
+#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
+#define STORE_INIT(src,addr) stxa src, [addr] %asi
+#else
+#define STORE_INIT(src,addr) stx src, [addr + 0x00]
+#endif
+#endif
+
+#ifndef FUNC_NAME
+#define FUNC_NAME NGmemcpy
+#endif
+
+#ifndef PREAMBLE
+#define PREAMBLE
+#endif
+
+#ifndef XCC
+#define XCC xcc
+#endif
+
+	.register %g2,#scratch
+	.register %g3,#scratch
+
+	.text
+	.align 64
+
+	.globl FUNC_NAME
+	.type FUNC_NAME,#function
+FUNC_NAME: /* %i0=dst, %i1=src, %i2=len */
+	PREAMBLE
+	save %sp, -SAVE_AMOUNT, %sp
+	srlx %i2, 31, %g2
+	cmp %g2, 0
+	tne %xcc, 5
+	mov %i0, %o0
+	cmp %i2, 0
+	be,pn %XCC, 85f
+	or %o0, %i1, %i3
+	cmp %i2, 16
+	blu,a,pn %XCC, 80f
+	or %i3, %i2, %i3
+
+	/* 2 blocks (128 bytes) is the minimum we can do the block
+	 * copy with. We need to ensure that we'll iterate at least
+	 * once in the block copy loop. At worst we'll need to align
+	 * the destination to a 64-byte boundary which can chew up
+	 * to (64 - 1) bytes from the length before we perform the
+	 * block copy loop.
+	 */
+	cmp %i2, (2 * 64)
+	blu,pt %XCC, 70f
+	andcc %i3, 0x7, %g0
+
+	/* %o0: dst
+	 * %i1: src
+	 * %i2: len (known to be >= 128)
+	 *
+	 * The block copy loops will use %i4/%i5,%g2/%g3 as
+	 * temporaries while copying the data.
+	 */
+
+	LOAD(prefetch, %i1, #one_read)
+	wr %g0, STORE_ASI, %asi
+
+	/* Align destination on 64-byte boundary. */
+	andcc %o0, (64 - 1), %i4
+	be,pt %XCC, 2f
+	sub %i4, 64, %i4
+	sub %g0, %i4, %i4 ! bytes to align dst
+	sub %i2, %i4, %i2
+1:	subcc %i4, 1, %i4
+	EX_LD(LOAD(ldub, %i1, %g1))
+	EX_ST(STORE(stb, %g1, %o0))
+	add %i1, 1, %i1
+	bne,pt %XCC, 1b
+	add %o0, 1, %o0
+
+	/* If the source is on a 16-byte boundary we can do
+	 * the direct block copy loop. If it is 8-byte aligned
+	 * we can do the 16-byte loads offset by -8 bytes and the
+	 * init stores offset by one register.
+	 *
+	 * If the source is not even 8-byte aligned, we need to do
+	 * shifting and masking (basically integer faligndata).
+	 *
+	 * The careful bit with init stores is that if we store
+	 * to any part of the cache line we have to store the whole
+	 * cacheline else we can end up with corrupt L2 cache line
+	 * contents. Since the loop works on 64-bytes of 64-byte
+	 * aligned store data at a time, this is easy to ensure.
+	 */
+2:
+	andcc %i1, (16 - 1), %i4
+	andn %i2, (64 - 1), %g1 ! block copy loop iterator
+	be,pt %XCC, 50f
+	sub %i2, %g1, %i2 ! final sub-block copy bytes
+
+	cmp %i4, 8
+	be,pt %XCC, 10f
+	sub %i1, %i4, %i1
+
+	/* Neither 8-byte nor 16-byte aligned, shift and mask. */
+	and %i4, 0x7, GLOBAL_SPARE
+	sll GLOBAL_SPARE, 3, GLOBAL_SPARE
+	mov 64, %i5
+	EX_LD(LOAD_TWIN(%i1, %g2, %g3))
+	sub %i5, GLOBAL_SPARE, %i5
+	mov 16, %o4
+	mov 32, %o5
+	mov 48, %o7
+	mov 64, %i3
+
+	bg,pn %XCC, 9f
+	nop
+
+#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
+	sllx WORD1, POST_SHIFT, WORD1; \
+	srlx WORD2, PRE_SHIFT, TMP; \
+	sllx WORD2, POST_SHIFT, WORD2; \
+	or WORD1, TMP, WORD1; \
+	srlx WORD3, PRE_SHIFT, TMP; \
+	or WORD2, TMP, WORD2;
+
+8:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3))
+	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
+	LOAD(prefetch, %i1 + %i3, #one_read)
+
+	EX_ST(STORE_INIT(%g2, %o0 + 0x00))
+	EX_ST(STORE_INIT(%g3, %o0 + 0x08))
+
+	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3))
+	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)
+
+	EX_ST(STORE_INIT(%o2, %o0 + 0x10))
+	EX_ST(STORE_INIT(%o3, %o0 + 0x18))
+
+	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
+	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
+
+	EX_ST(STORE_INIT(%g2, %o0 + 0x20))
+	EX_ST(STORE_INIT(%g3, %o0 + 0x28))
+
+	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3))
+	add %i1, 64, %i1
+	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)
+
+	EX_ST(STORE_INIT(%o2, %o0 + 0x30))
+	EX_ST(STORE_INIT(%o3, %o0 + 0x38))
+
+	subcc %g1, 64, %g1
+	bne,pt %XCC, 8b
+	add %o0, 64, %o0
+
+	ba,pt %XCC, 60f
+	add %i1, %i4, %i1
+
+9:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3))
+	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
+	LOAD(prefetch, %i1 + %i3, #one_read)
+
+	EX_ST(STORE_INIT(%g3, %o0 + 0x00))
+	EX_ST(STORE_INIT(%o2, %o0 + 0x08))
+
+	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3))
+	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)
+
+	EX_ST(STORE_INIT(%o3, %o0 + 0x10))
+	EX_ST(STORE_INIT(%g2, %o0 + 0x18))
+
+	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
+	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
+
+	EX_ST(STORE_INIT(%g3, %o0 + 0x20))
+	EX_ST(STORE_INIT(%o2, %o0 + 0x28))
+
+	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3))
+	add %i1, 64, %i1
+	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)
+
+	EX_ST(STORE_INIT(%o3, %o0 + 0x30))
+	EX_ST(STORE_INIT(%g2, %o0 + 0x38))
+
+	subcc %g1, 64, %g1
+	bne,pt %XCC, 9b
+	add %o0, 64, %o0
+
+	ba,pt %XCC, 60f
+	add %i1, %i4, %i1
+
+10:	/* Destination is 64-byte aligned, source was only 8-byte
+	 * aligned but it has been subtracted by 8 and we perform
+	 * one twin load ahead, then add 8 back into source when
+	 * we finish the loop.
+	 */
+	EX_LD(LOAD_TWIN(%i1, %o4, %o5))
+	mov 16, %o7
+	mov 32, %g2
+	mov 48, %g3
+	mov 64, %o1
+1:	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
+	LOAD(prefetch, %i1 + %o1, #one_read)
+	EX_ST(STORE_INIT(%o5, %o0 + 0x00)) ! initializes cache line
+	EX_ST(STORE_INIT(%o2, %o0 + 0x08))
+	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5))
+	EX_ST(STORE_INIT(%o3, %o0 + 0x10))
+	EX_ST(STORE_INIT(%o4, %o0 + 0x18))
+	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3))
+	EX_ST(STORE_INIT(%o5, %o0 + 0x20))
+	EX_ST(STORE_INIT(%o2, %o0 + 0x28))
+	EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5))
+	add %i1, 64, %i1
+	EX_ST(STORE_INIT(%o3, %o0 + 0x30))
+	EX_ST(STORE_INIT(%o4, %o0 + 0x38))
+	subcc %g1, 64, %g1
+	bne,pt %XCC, 1b
+	add %o0, 64, %o0
+
+	ba,pt %XCC, 60f
+	add %i1, 0x8, %i1
+
+50:	/* Destination is 64-byte aligned, and source is 16-byte
+	 * aligned.
+	 */
+	mov 16, %o7
+	mov 32, %g2
+	mov 48, %g3
+	mov 64, %o1
+1:	EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5))
+	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
+	LOAD(prefetch, %i1 + %o1, #one_read)
+	EX_ST(STORE_INIT(%o4, %o0 + 0x00)) ! initializes cache line
+	EX_ST(STORE_INIT(%o5, %o0 + 0x08))
+	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5))
+	EX_ST(STORE_INIT(%o2, %o0 + 0x10))
+	EX_ST(STORE_INIT(%o3, %o0 + 0x18))
+	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3))
+	add %i1, 64, %i1
+	EX_ST(STORE_INIT(%o4, %o0 + 0x20))
+	EX_ST(STORE_INIT(%o5, %o0 + 0x28))
+	EX_ST(STORE_INIT(%o2, %o0 + 0x30))
+	EX_ST(STORE_INIT(%o3, %o0 + 0x38))
+	subcc %g1, 64, %g1
+	bne,pt %XCC, 1b
+	add %o0, 64, %o0
+	/* fall through */
+
+60:
+	membar #Sync
+
+	/* %i2 contains any final bytes still needed to be copied
+	 * over. If anything is left, we copy it one byte at a time.
+	 */
+	RESTORE_ASI(%i3)
+	brz,pt %i2, 85f
+	sub %o0, %i1, %i3
+	ba,a,pt %XCC, 90f
+
+	.align 64
+70: /* 16 < len <= 64 */
+	bne,pn %XCC, 75f
+	sub %o0, %i1, %i3
+
+72:
+	andn %i2, 0xf, %i4
+	and %i2, 0xf, %i2
+1:	subcc %i4, 0x10, %i4
+	EX_LD(LOAD(ldx, %i1, %o4))
+	add %i1, 0x08, %i1
+	EX_LD(LOAD(ldx, %i1, %g1))
+	sub %i1, 0x08, %i1
+	EX_ST(STORE(stx, %o4, %i1 + %i3))
+	add %i1, 0x8, %i1
+	EX_ST(STORE(stx, %g1, %i1 + %i3))
+	bgu,pt %XCC, 1b
+	add %i1, 0x8, %i1
+73:	andcc %i2, 0x8, %g0
+	be,pt %XCC, 1f
+	nop
+	sub %i2, 0x8, %i2
+	EX_LD(LOAD(ldx, %i1, %o4))
+	EX_ST(STORE(stx, %o4, %i1 + %i3))
+	add %i1, 0x8, %i1
+1:	andcc %i2, 0x4, %g0
+	be,pt %XCC, 1f
+	nop
+	sub %i2, 0x4, %i2
+	EX_LD(LOAD(lduw, %i1, %i5))
+	EX_ST(STORE(stw, %i5, %i1 + %i3))
+	add %i1, 0x4, %i1
+1:	cmp %i2, 0
+	be,pt %XCC, 85f
+	nop
+	ba,pt %xcc, 90f
+	nop
+
+75:
+	andcc %o0, 0x7, %g1
+	sub %g1, 0x8, %g1
+	be,pn %icc, 2f
+	sub %g0, %g1, %g1
+	sub %i2, %g1, %i2
+
+1:	subcc %g1, 1, %g1
+	EX_LD(LOAD(ldub, %i1, %i5))
+	EX_ST(STORE(stb, %i5, %i1 + %i3))
+	bgu,pt %icc, 1b
+	add %i1, 1, %i1
+
+2:	add %i1, %i3, %o0
+	andcc %i1, 0x7, %g1
368 | bne,pt %icc, 8f | ||
369 | sll %g1, 3, %g1 | ||
370 | |||
371 | cmp %i2, 16 | ||
372 | bgeu,pt %icc, 72b | ||
373 | nop | ||
374 | ba,a,pt %xcc, 73b | ||
375 | |||
376 | 8: mov 64, %i3 | ||
377 | andn %i1, 0x7, %i1 | ||
378 | EX_LD(LOAD(ldx, %i1, %g2)) | ||
379 | sub %i3, %g1, %i3 | ||
380 | andn %i2, 0x7, %i4 | ||
381 | sllx %g2, %g1, %g2 | ||
382 | 1: add %i1, 0x8, %i1 | ||
383 | EX_LD(LOAD(ldx, %i1, %g3)) | ||
384 | subcc %i4, 0x8, %i4 | ||
385 | srlx %g3, %i3, %i5 | ||
386 | or %i5, %g2, %i5 | ||
387 | EX_ST(STORE(stx, %i5, %o0)) | ||
388 | add %o0, 0x8, %o0 | ||
389 | bgu,pt %icc, 1b | ||
390 | sllx %g3, %g1, %g2 | ||
391 | |||
392 | srl %g1, 3, %g1 | ||
393 | andcc %i2, 0x7, %i2 | ||
394 | be,pn %icc, 85f | ||
395 | add %i1, %g1, %i1 | ||
396 | ba,pt %xcc, 90f | ||
397 | sub %o0, %i1, %i3 | ||
398 | |||
399 | .align 64 | ||
400 | 80: /* 0 < len <= 16 */ | ||
401 | andcc %i3, 0x3, %g0 | ||
402 | bne,pn %XCC, 90f | ||
403 | sub %o0, %i1, %i3 | ||
404 | |||
405 | 1: | ||
406 | subcc %i2, 4, %i2 | ||
407 | EX_LD(LOAD(lduw, %i1, %g1)) | ||
408 | EX_ST(STORE(stw, %g1, %i1 + %i3)) | ||
409 | bgu,pt %XCC, 1b | ||
410 | add %i1, 4, %i1 | ||
411 | |||
412 | 85: ret | ||
413 | restore EX_RETVAL(%i0), %g0, %o0 | ||
414 | |||
415 | .align 32 | ||
416 | 90: | ||
417 | subcc %i2, 1, %i2 | ||
418 | EX_LD(LOAD(ldub, %i1, %g1)) | ||
419 | EX_ST(STORE(stb, %g1, %i1 + %i3)) | ||
420 | bgu,pt %XCC, 90b | ||
421 | add %i1, 1, %i1 | ||
422 | ret | ||
423 | restore EX_RETVAL(%i0), %g0, %o0 | ||
424 | |||
425 | .size FUNC_NAME, .-FUNC_NAME | ||
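
A note on the loops above: when the source is neither 8-byte nor 16-byte
aligned relative to the destination, each 16-byte twin load delivers a
register pair, and MIX_THREE_WORDS splices adjacent doublewords with a pair
of opposing shifts so that every STORE_INIT still writes a full aligned
doubleword. A minimal C sketch of that splice, assuming big-endian
(sparc64) byte order; the function and parameter names are illustrative,
not from this patch:

    #include <stdint.h>
    #include <stddef.h>

    /* off = source byte offset within an aligned doubleword, 1..7.
     * Like the assembly, this reads one doubleword beyond the last
     * merged pair (the load-ahead).
     */
    static void merge_copy(uint64_t *dst, const uint64_t *src_aligned,
                           size_t words, unsigned int off)
    {
        unsigned int post = off * 8;    /* POST_SHIFT in the macro      */
        unsigned int pre  = 64 - post;  /* PRE_SHIFT: bits of next word */
        uint64_t cur = src_aligned[0];

        for (size_t i = 0; i < words; i++) {
            uint64_t next = src_aligned[i + 1];
            dst[i] = (cur << post) | (next >> pre);
            cur = next;
        }
    }
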
diff --git a/arch/sparc/lib/NGpage.S b/arch/sparc/lib/NGpage.S new file mode 100644 index 000000000000..428920de05ba --- /dev/null +++ b/arch/sparc/lib/NGpage.S | |||
@@ -0,0 +1,99 @@ | |||
1 | /* NGpage.S: Niagara optimized clear and copy page. | ||
2 | * | ||
3 | * Copyright (C) 2006 David S. Miller (davem@davemloft.net) | ||
4 | */ | ||
5 | |||
6 | #include <asm/asi.h> | ||
7 | #include <asm/page.h> | ||
8 | |||
9 | .text | ||
10 | .align 32 | ||
11 | |||
12 | /* This is heavily simplified from the sun4u variants | ||
13 | * because Niagara does not have any D-cache aliasing issues | ||
14 | * and we don't need to use the FPU in order to implement | ||
15 | * an optimal page copy/clear. | ||
16 | */ | ||
17 | |||
18 | NGcopy_user_page: /* %o0=dest, %o1=src, %o2=vaddr */ | ||
19 | prefetch [%o1 + 0x00], #one_read | ||
20 | mov 8, %g1 | ||
21 | mov 16, %g2 | ||
22 | mov 24, %g3 | ||
23 | set PAGE_SIZE, %g7 | ||
24 | |||
25 | 1: ldda [%o1 + %g0] ASI_BLK_INIT_QUAD_LDD_P, %o2 | ||
26 | ldda [%o1 + %g2] ASI_BLK_INIT_QUAD_LDD_P, %o4 | ||
27 | prefetch [%o1 + 0x40], #one_read | ||
28 | add %o1, 32, %o1 | ||
29 | stxa %o2, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P | ||
30 | stxa %o3, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P | ||
31 | ldda [%o1 + %g0] ASI_BLK_INIT_QUAD_LDD_P, %o2 | ||
32 | stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P | ||
33 | stxa %o5, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P | ||
34 | ldda [%o1 + %g2] ASI_BLK_INIT_QUAD_LDD_P, %o4 | ||
35 | add %o1, 32, %o1 | ||
36 | add %o0, 32, %o0 | ||
37 | stxa %o2, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P | ||
38 | stxa %o3, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P | ||
39 | stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P | ||
40 | stxa %o5, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P | ||
41 | subcc %g7, 64, %g7 | ||
42 | bne,pt %xcc, 1b | ||
43 | add %o0, 32, %o0 | ||
44 | membar #Sync | ||
45 | retl | ||
46 | nop | ||
47 | |||
48 | .globl NGclear_page, NGclear_user_page | ||
49 | NGclear_page: /* %o0=dest */ | ||
50 | NGclear_user_page: /* %o0=dest, %o1=vaddr */ | ||
51 | mov 8, %g1 | ||
52 | mov 16, %g2 | ||
53 | mov 24, %g3 | ||
54 | set PAGE_SIZE, %g7 | ||
55 | |||
56 | 1: stxa %g0, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P | ||
57 | stxa %g0, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P | ||
58 | stxa %g0, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P | ||
59 | stxa %g0, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P | ||
60 | add %o0, 32, %o0 | ||
61 | stxa %g0, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P | ||
62 | stxa %g0, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P | ||
63 | stxa %g0, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P | ||
64 | stxa %g0, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P | ||
65 | subcc %g7, 64, %g7 | ||
66 | bne,pt %xcc, 1b | ||
67 | add %o0, 32, %o0 | ||
68 | membar #Sync | ||
69 | retl | ||
70 | nop | ||
71 | |||
72 | #define BRANCH_ALWAYS 0x10680000 | ||
73 | #define NOP 0x01000000 | ||
74 | #define NG_DO_PATCH(OLD, NEW) \ | ||
75 | sethi %hi(NEW), %g1; \ | ||
76 | or %g1, %lo(NEW), %g1; \ | ||
77 | sethi %hi(OLD), %g2; \ | ||
78 | or %g2, %lo(OLD), %g2; \ | ||
79 | sub %g1, %g2, %g1; \ | ||
80 | sethi %hi(BRANCH_ALWAYS), %g3; \ | ||
81 | sll %g1, 11, %g1; \ | ||
82 | srl %g1, 11 + 2, %g1; \ | ||
83 | or %g3, %lo(BRANCH_ALWAYS), %g3; \ | ||
84 | or %g3, %g1, %g3; \ | ||
85 | stw %g3, [%g2]; \ | ||
86 | sethi %hi(NOP), %g3; \ | ||
87 | or %g3, %lo(NOP), %g3; \ | ||
88 | stw %g3, [%g2 + 0x4]; \ | ||
89 | flush %g2; | ||
90 | |||
91 | .globl niagara_patch_pageops | ||
92 | .type niagara_patch_pageops,#function | ||
93 | niagara_patch_pageops: | ||
94 | NG_DO_PATCH(copy_user_page, NGcopy_user_page) | ||
95 | NG_DO_PATCH(_clear_page, NGclear_page) | ||
96 | NG_DO_PATCH(clear_user_page, NGclear_user_page) | ||
97 | retl | ||
98 | nop | ||
99 | .size niagara_patch_pageops,.-niagara_patch_pageops | ||
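
The NG_DO_PATCH macro above rewrites the first two instructions of the
generic routine into a "ba,pt %xcc, NEW" plus a nop. The sll-by-11 /
srl-by-13 pair truncates the byte displacement to the branch's signed
19-bit word-displacement field. The same computation in C, as a hedged
sketch (the helper function is not part of this patch):

    #include <linux/types.h>

    #define BRANCH_ALWAYS 0x10680000u  /* ba,pt %xcc, . + disp19 */
    #define NOP           0x01000000u

    static void patch_branch(u32 *old, unsigned long target)
    {
        u32 disp = (u32)(target - (unsigned long)old);   /* byte disp */
        u32 insn = BRANCH_ALWAYS | ((disp << 11) >> 13); /* word disp */

        old[0] = insn;
        old[1] = NOP;
        /* drop the stale instructions from the I-cache, as the
         * assembly's "flush %g2" does */
        __asm__ __volatile__("flush %0" : : "r" (old));
    }

niagara_patch_pageops presumably runs once at boot, after CPU
identification, so every later call to copy_user_page(), _clear_page()
and clear_user_page() falls through to the Niagara versions.
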
diff --git a/arch/sparc/lib/NGpatch.S b/arch/sparc/lib/NGpatch.S new file mode 100644 index 000000000000..3b0674fc3366 --- /dev/null +++ b/arch/sparc/lib/NGpatch.S | |||
@@ -0,0 +1,33 @@ | |||
1 | /* NGpatch.S: Patch Ultra-I routines with Niagara variant. | ||
2 | * | ||
3 | * Copyright (C) 2006 David S. Miller <davem@davemloft.net> | ||
4 | */ | ||
5 | |||
6 | #define BRANCH_ALWAYS 0x10680000 | ||
7 | #define NOP 0x01000000 | ||
8 | #define NG_DO_PATCH(OLD, NEW) \ | ||
9 | sethi %hi(NEW), %g1; \ | ||
10 | or %g1, %lo(NEW), %g1; \ | ||
11 | sethi %hi(OLD), %g2; \ | ||
12 | or %g2, %lo(OLD), %g2; \ | ||
13 | sub %g1, %g2, %g1; \ | ||
14 | sethi %hi(BRANCH_ALWAYS), %g3; \ | ||
15 | sll %g1, 11, %g1; \ | ||
16 | srl %g1, 11 + 2, %g1; \ | ||
17 | or %g3, %lo(BRANCH_ALWAYS), %g3; \ | ||
18 | or %g3, %g1, %g3; \ | ||
19 | stw %g3, [%g2]; \ | ||
20 | sethi %hi(NOP), %g3; \ | ||
21 | or %g3, %lo(NOP), %g3; \ | ||
22 | stw %g3, [%g2 + 0x4]; \ | ||
23 | flush %g2; | ||
24 | |||
25 | .globl niagara_patch_copyops | ||
26 | .type niagara_patch_copyops,#function | ||
27 | niagara_patch_copyops: | ||
28 | NG_DO_PATCH(memcpy, NGmemcpy) | ||
29 | NG_DO_PATCH(___copy_from_user, NGcopy_from_user) | ||
30 | NG_DO_PATCH(___copy_to_user, NGcopy_to_user) | ||
31 | retl | ||
32 | nop | ||
33 | .size niagara_patch_copyops,.-niagara_patch_copyops | ||
diff --git a/arch/sparc/lib/PeeCeeI.c b/arch/sparc/lib/PeeCeeI.c new file mode 100644 index 000000000000..46053e6ddd7b --- /dev/null +++ b/arch/sparc/lib/PeeCeeI.c | |||
@@ -0,0 +1,203 @@ | |||
1 | /* | ||
2 | * PeeCeeI.c: The emerging standard... | ||
3 | * | ||
4 | * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) | ||
5 | */ | ||
6 | |||
7 | #include <asm/io.h> | ||
8 | #include <asm/byteorder.h> | ||
9 | |||
10 | void outsb(unsigned long __addr, const void *src, unsigned long count) | ||
11 | { | ||
12 | void __iomem *addr = (void __iomem *) __addr; | ||
13 | const u8 *p = src; | ||
14 | |||
15 | while (count--) | ||
16 | outb(*p++, addr); | ||
17 | } | ||
18 | |||
19 | void outsw(unsigned long __addr, const void *src, unsigned long count) | ||
20 | { | ||
21 | void __iomem *addr = (void __iomem *) __addr; | ||
22 | |||
23 | while (count--) { | ||
24 | __raw_writew(*(u16 *)src, addr); | ||
25 | src += sizeof(u16); | ||
26 | } | ||
27 | } | ||
28 | |||
29 | void outsl(unsigned long __addr, const void *src, unsigned long count) | ||
30 | { | ||
31 | void __iomem *addr = (void __iomem *) __addr; | ||
32 | u32 l, l2; | ||
33 | |||
34 | if (!count) | ||
35 | return; | ||
36 | |||
37 | switch (((unsigned long)src) & 0x3) { | ||
38 | case 0x0: | ||
39 | /* src is naturally aligned */ | ||
40 | while (count--) { | ||
41 | __raw_writel(*(u32 *)src, addr); | ||
42 | src += sizeof(u32); | ||
43 | } | ||
44 | break; | ||
45 | case 0x2: | ||
46 | /* 2-byte alignment */ | ||
47 | while (count--) { | ||
48 | l = (*(u16 *)src) << 16; | ||
49 | l |= *(u16 *)(src + sizeof(u16)); | ||
50 | __raw_writel(l, addr); | ||
51 | src += sizeof(u32); | ||
52 | } | ||
53 | break; | ||
54 | case 0x1: | ||
55 | /* Hold three bytes in l each time, grab a byte from l2 */ | ||
56 | l = (*(u8 *)src) << 24; | ||
57 | l |= (*(u16 *)(src + sizeof(u8))) << 8; | ||
58 | src += sizeof(u8) + sizeof(u16); | ||
59 | while (count--) { | ||
60 | l2 = *(u32 *)src; | ||
61 | l |= (l2 >> 24); | ||
62 | __raw_writel(l, addr); | ||
63 | l = l2 << 8; | ||
64 | src += sizeof(u32); | ||
65 | } | ||
66 | break; | ||
67 | case 0x3: | ||
68 | /* Hold a byte in l each time, grab 3 bytes from l2 */ | ||
69 | l = (*(u8 *)src) << 24; | ||
70 | src += sizeof(u8); | ||
71 | while (count--) { | ||
72 | l2 = *(u32 *)src; | ||
73 | l |= (l2 >> 8); | ||
74 | __raw_writel(l, addr); | ||
75 | l = l2 << 24; | ||
76 | src += sizeof(u32); | ||
77 | } | ||
78 | break; | ||
79 | } | ||
80 | } | ||
81 | |||
82 | void insb(unsigned long __addr, void *dst, unsigned long count) | ||
83 | { | ||
84 | void __iomem *addr = (void __iomem *) __addr; | ||
85 | |||
86 | if (count) { | ||
87 | u32 *pi; | ||
88 | u8 *pb = dst; | ||
89 | |||
90 | while ((((unsigned long)pb) & 0x3) && count--) | ||
91 | *pb++ = inb(addr); | ||
92 | pi = (u32 *)pb; | ||
93 | while (count >= 4) { | ||
94 | u32 w; | ||
95 | |||
96 | w = (inb(addr) << 24); | ||
97 | w |= (inb(addr) << 16); | ||
98 | w |= (inb(addr) << 8); | ||
99 | w |= (inb(addr) << 0); | ||
100 | *pi++ = w; | ||
101 | count -= 4; | ||
102 | } | ||
103 | pb = (u8 *)pi; | ||
104 | while (count--) | ||
105 | *pb++ = inb(addr); | ||
106 | } | ||
107 | } | ||
108 | |||
109 | void insw(unsigned long __addr, void *dst, unsigned long count) | ||
110 | { | ||
111 | void __iomem *addr = (void __iomem *) __addr; | ||
112 | |||
113 | if (count) { | ||
114 | u16 *ps = dst; | ||
115 | u32 *pi; | ||
116 | |||
117 | if (((unsigned long)ps) & 0x2) { | ||
118 | *ps++ = le16_to_cpu(inw(addr)); | ||
119 | count--; | ||
120 | } | ||
121 | pi = (u32 *)ps; | ||
122 | while (count >= 2) { | ||
123 | u32 w; | ||
124 | |||
125 | w = (le16_to_cpu(inw(addr)) << 16); | ||
126 | w |= (le16_to_cpu(inw(addr)) << 0); | ||
127 | *pi++ = w; | ||
128 | count -= 2; | ||
129 | } | ||
130 | ps = (u16 *)pi; | ||
131 | if (count) | ||
132 | *ps = le16_to_cpu(inw(addr)); | ||
133 | } | ||
134 | } | ||
135 | |||
136 | void insl(unsigned long __addr, void *dst, unsigned long count) | ||
137 | { | ||
138 | void __iomem *addr = (void __iomem *) __addr; | ||
139 | |||
140 | if (count) { | ||
141 | if ((((unsigned long)dst) & 0x3) == 0) { | ||
142 | u32 *pi = dst; | ||
143 | while (count--) | ||
144 | *pi++ = le32_to_cpu(inl(addr)); | ||
145 | } else { | ||
146 | u32 l = 0, l2, *pi; | ||
147 | u16 *ps; | ||
148 | u8 *pb; | ||
149 | |||
150 | switch (((unsigned long)dst) & 3) { | ||
151 | case 0x2: | ||
152 | ps = dst; | ||
153 | count -= 1; | ||
154 | l = le32_to_cpu(inl(addr)); | ||
155 | *ps++ = l; | ||
156 | pi = (u32 *)ps; | ||
157 | while (count--) { | ||
158 | l2 = le32_to_cpu(inl(addr)); | ||
159 | *pi++ = (l << 16) | (l2 >> 16); | ||
160 | l = l2; | ||
161 | } | ||
162 | ps = (u16 *)pi; | ||
163 | *ps = l; | ||
164 | break; | ||
165 | |||
166 | case 0x1: | ||
167 | pb = dst; | ||
168 | count -= 1; | ||
169 | l = le32_to_cpu(inl(addr)); | ||
170 | *pb++ = l >> 24; | ||
171 | ps = (u16 *)pb; | ||
172 | *ps++ = ((l >> 8) & 0xffff); | ||
173 | pi = (u32 *)ps; | ||
174 | while (count--) { | ||
175 | l2 = le32_to_cpu(inl(addr)); | ||
176 | *pi++ = (l << 24) | (l2 >> 8); | ||
177 | l = l2; | ||
178 | } | ||
179 | pb = (u8 *)pi; | ||
180 | *pb = l; | ||
181 | break; | ||
182 | |||
183 | case 0x3: | ||
184 | pb = (u8 *)dst; | ||
185 | count -= 1; | ||
186 | l = le32_to_cpu(inl(addr)); | ||
187 | *pb++ = l >> 24; | ||
188 | pi = (u32 *)pb; | ||
189 | while (count--) { | ||
190 | l2 = le32_to_cpu(inl(addr)); | ||
191 | *pi++ = (l << 8) | (l2 >> 24); | ||
192 | l = l2; | ||
193 | } | ||
194 | ps = (u16 *)pi; | ||
195 | *ps++ = ((l >> 8) & 0xffff); | ||
196 | pb = (u8 *)ps; | ||
197 | *pb = l; | ||
198 | break; | ||
199 | } | ||
200 | } | ||
201 | } | ||
202 | } | ||
203 | |||
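
The misaligned cases of outsl()/insl() above all use the same
hold-and-merge scheme: the bytes of the previous 32-bit word that have
not been emitted yet are parked in `l`, and each aligned access supplies
the rest via `l2`, so the port always sees full 32-bit transfers. A
self-contained sketch of the steady state for a source one byte past
alignment (the 0x1 case); `emit32` stands in for __raw_writel and the
names are illustrative:

    #include <stdint.h>
    #include <stddef.h>

    static void write_words_misaligned_by_1(const uint8_t *src, size_t count,
                                            void (*emit32)(uint32_t))
    {
        /* three leading bytes, big-endian order as on sparc */
        uint32_t hold = (uint32_t)src[0] << 24 |
                        (uint32_t)src[1] << 16 |
                        (uint32_t)src[2] << 8;
        const uint32_t *p = (const uint32_t *)(src + 3); /* now aligned */

        while (count--) {
            uint32_t next = *p++;
            emit32(hold | (next >> 24)); /* 3 held bytes + 1 new byte */
            hold = next << 8;            /* keep the remaining 3      */
        }
    }
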
diff --git a/arch/sparc/lib/U1copy_from_user.S b/arch/sparc/lib/U1copy_from_user.S new file mode 100644 index 000000000000..3192b0bf4fab --- /dev/null +++ b/arch/sparc/lib/U1copy_from_user.S | |||
@@ -0,0 +1,33 @@ | |||
1 | /* U1copy_from_user.S: UltraSparc-I/II/IIi/IIe optimized copy from userspace. | ||
2 | * | ||
3 | * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com) | ||
4 | */ | ||
5 | |||
6 | #define EX_LD(x) \ | ||
7 | 98: x; \ | ||
8 | .section .fixup; \ | ||
9 | .align 4; \ | ||
10 | 99: retl; \ | ||
11 | mov 1, %o0; \ | ||
12 | .section __ex_table,"a";\ | ||
13 | .align 4; \ | ||
14 | .word 98b, 99b; \ | ||
15 | .text; \ | ||
16 | .align 4; | ||
17 | |||
18 | #define FUNC_NAME ___copy_from_user | ||
19 | #define LOAD(type,addr,dest) type##a [addr] %asi, dest | ||
20 | #define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_AIUS, dest | ||
21 | #define EX_RETVAL(x) 0 | ||
22 | |||
23 | /* Writing to %asi is _expensive_ so we hardcode it. | ||
24 | * Reading %asi to check for KERNEL_DS is comparatively | ||
25 | * cheap. | ||
26 | */ | ||
27 | #define PREAMBLE \ | ||
28 | rd %asi, %g1; \ | ||
29 | cmp %g1, ASI_AIUS; \ | ||
30 | bne,pn %icc, memcpy_user_stub; \ | ||
31 | nop; \ | ||
32 | |||
33 | #include "U1memcpy.S" | ||
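
The EX_LD wrapper above is what makes the copy fault-tolerant: label 98
marks the potentially faulting load, label 99 a stub that returns
nonzero, and the ".word 98b, 99b" pair lands in the __ex_table section.
Conceptually, the trap handler does something like the following sketch
(the real kernel sorts the table and binary-searches it; this function
is illustrative only):

    struct exception_table_entry {
        unsigned int insn;   /* address of the faulting access (98:) */
        unsigned int fixup;  /* address to resume at           (99:) */
    };

    static unsigned long fixup_for(unsigned long fault_pc,
                                   const struct exception_table_entry *tbl,
                                   int n)
    {
        for (int i = 0; i < n; i++)
            if (tbl[i].insn == fault_pc)
                return tbl[i].fixup; /* resume here instead of oopsing */
        return 0;                    /* genuine fault */
    }
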
diff --git a/arch/sparc/lib/U1copy_to_user.S b/arch/sparc/lib/U1copy_to_user.S new file mode 100644 index 000000000000..d1210ffb0b82 --- /dev/null +++ b/arch/sparc/lib/U1copy_to_user.S | |||
@@ -0,0 +1,33 @@ | |||
1 | /* U1copy_to_user.S: UltraSparc-I/II/IIi/IIe optimized copy to userspace. | ||
2 | * | ||
3 | * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com) | ||
4 | */ | ||
5 | |||
6 | #define EX_ST(x) \ | ||
7 | 98: x; \ | ||
8 | .section .fixup; \ | ||
9 | .align 4; \ | ||
10 | 99: retl; \ | ||
11 | mov 1, %o0; \ | ||
12 | .section __ex_table,"a";\ | ||
13 | .align 4; \ | ||
14 | .word 98b, 99b; \ | ||
15 | .text; \ | ||
16 | .align 4; | ||
17 | |||
18 | #define FUNC_NAME ___copy_to_user | ||
19 | #define STORE(type,src,addr) type##a src, [addr] ASI_AIUS | ||
20 | #define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_AIUS | ||
21 | #define EX_RETVAL(x) 0 | ||
22 | |||
23 | /* Writing to %asi is _expensive_ so we hardcode it. | ||
24 | * Reading %asi to check for KERNEL_DS is comparatively | ||
25 | * cheap. | ||
26 | */ | ||
27 | #define PREAMBLE \ | ||
28 | rd %asi, %g1; \ | ||
29 | cmp %g1, ASI_AIUS; \ | ||
30 | bne,pn %icc, memcpy_user_stub; \ | ||
31 | nop; \ | ||
32 | |||
33 | #include "U1memcpy.S" | ||
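
On the PREAMBLE comment above: because the user-space ASI is hard-coded
into every load/store, a caller running under set_fs(KERNEL_DS) cannot
use this body, so the routine reads %asi once and bails out to
memcpy_user_stub for the plain kernel-to-kernel case. A toy model of
that dispatch; read_asi() and the two copy helpers are hypothetical
stand-ins for assembly:

    #include <asm/asi.h>

    extern unsigned long read_asi(void);                 /* rd %asi */
    extern unsigned long kernel_memcpy_stub(void *, const void *,
                                            unsigned long);
    extern unsigned long user_asi_copy(void *, const void *,
                                       unsigned long);

    unsigned long copy_to_user_model(void *to, const void *from,
                                     unsigned long n)
    {
        if (read_asi() != ASI_AIUS)  /* set_fs(KERNEL_DS) in effect */
            return kernel_memcpy_stub(to, from, n);
        return user_asi_copy(to, from, n); /* hard-coded ASI_AIUS path */
    }
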
diff --git a/arch/sparc/lib/U1memcpy.S b/arch/sparc/lib/U1memcpy.S new file mode 100644 index 000000000000..bafd2fc07acb --- /dev/null +++ b/arch/sparc/lib/U1memcpy.S | |||
@@ -0,0 +1,563 @@ | |||
1 | /* U1memcpy.S: UltraSPARC-I/II/IIi/IIe optimized memcpy. | ||
2 | * | ||
3 | * Copyright (C) 1997, 2004 David S. Miller (davem@redhat.com) | ||
4 | * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek (jj@ultra.linux.cz) | ||
5 | */ | ||
6 | |||
7 | #ifdef __KERNEL__ | ||
8 | #include <asm/visasm.h> | ||
9 | #include <asm/asi.h> | ||
10 | #define GLOBAL_SPARE g7 | ||
11 | #else | ||
12 | #define GLOBAL_SPARE g5 | ||
13 | #define ASI_BLK_P 0xf0 | ||
14 | #define FPRS_FEF 0x04 | ||
15 | #ifdef MEMCPY_DEBUG | ||
16 | #define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \ | ||
17 | clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0; | ||
18 | #define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs | ||
19 | #else | ||
20 | #define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs | ||
21 | #define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs | ||
22 | #endif | ||
23 | #endif | ||
24 | |||
25 | #ifndef EX_LD | ||
26 | #define EX_LD(x) x | ||
27 | #endif | ||
28 | |||
29 | #ifndef EX_ST | ||
30 | #define EX_ST(x) x | ||
31 | #endif | ||
32 | |||
33 | #ifndef EX_RETVAL | ||
34 | #define EX_RETVAL(x) x | ||
35 | #endif | ||
36 | |||
37 | #ifndef LOAD | ||
38 | #define LOAD(type,addr,dest) type [addr], dest | ||
39 | #endif | ||
40 | |||
41 | #ifndef LOAD_BLK | ||
42 | #define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_P, dest | ||
43 | #endif | ||
44 | |||
45 | #ifndef STORE | ||
46 | #define STORE(type,src,addr) type src, [addr] | ||
47 | #endif | ||
48 | |||
49 | #ifndef STORE_BLK | ||
50 | #define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P | ||
51 | #endif | ||
52 | |||
53 | #ifndef FUNC_NAME | ||
54 | #define FUNC_NAME memcpy | ||
55 | #endif | ||
56 | |||
57 | #ifndef PREAMBLE | ||
58 | #define PREAMBLE | ||
59 | #endif | ||
60 | |||
61 | #ifndef XCC | ||
62 | #define XCC xcc | ||
63 | #endif | ||
64 | |||
65 | #define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9) \ | ||
66 | faligndata %f1, %f2, %f48; \ | ||
67 | faligndata %f2, %f3, %f50; \ | ||
68 | faligndata %f3, %f4, %f52; \ | ||
69 | faligndata %f4, %f5, %f54; \ | ||
70 | faligndata %f5, %f6, %f56; \ | ||
71 | faligndata %f6, %f7, %f58; \ | ||
72 | faligndata %f7, %f8, %f60; \ | ||
73 | faligndata %f8, %f9, %f62; | ||
74 | |||
75 | #define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, len, jmptgt) \ | ||
76 | EX_LD(LOAD_BLK(%src, %fdest)); \ | ||
77 | EX_ST(STORE_BLK(%fsrc, %dest)); \ | ||
78 | add %src, 0x40, %src; \ | ||
79 | subcc %len, 0x40, %len; \ | ||
80 | be,pn %xcc, jmptgt; \ | ||
81 | add %dest, 0x40, %dest; \ | ||
82 | |||
83 | #define LOOP_CHUNK1(src, dest, len, branch_dest) \ | ||
84 | MAIN_LOOP_CHUNK(src, dest, f0, f48, len, branch_dest) | ||
85 | #define LOOP_CHUNK2(src, dest, len, branch_dest) \ | ||
86 | MAIN_LOOP_CHUNK(src, dest, f16, f48, len, branch_dest) | ||
87 | #define LOOP_CHUNK3(src, dest, len, branch_dest) \ | ||
88 | MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest) | ||
89 | |||
90 | #define DO_SYNC membar #Sync; | ||
91 | #define STORE_SYNC(dest, fsrc) \ | ||
92 | EX_ST(STORE_BLK(%fsrc, %dest)); \ | ||
93 | add %dest, 0x40, %dest; \ | ||
94 | DO_SYNC | ||
95 | |||
96 | #define STORE_JUMP(dest, fsrc, target) \ | ||
97 | EX_ST(STORE_BLK(%fsrc, %dest)); \ | ||
98 | add %dest, 0x40, %dest; \ | ||
99 | ba,pt %xcc, target; \ | ||
100 | nop; | ||
101 | |||
102 | #define FINISH_VISCHUNK(dest, f0, f1, left) \ | ||
103 | subcc %left, 8, %left;\ | ||
104 | bl,pn %xcc, 95f; \ | ||
105 | faligndata %f0, %f1, %f48; \ | ||
106 | EX_ST(STORE(std, %f48, %dest)); \ | ||
107 | add %dest, 8, %dest; | ||
108 | |||
109 | #define UNEVEN_VISCHUNK_LAST(dest, f0, f1, left) \ | ||
110 | subcc %left, 8, %left; \ | ||
111 | bl,pn %xcc, 95f; \ | ||
112 | fsrc1 %f0, %f1; | ||
113 | |||
114 | #define UNEVEN_VISCHUNK(dest, f0, f1, left) \ | ||
115 | UNEVEN_VISCHUNK_LAST(dest, f0, f1, left) \ | ||
116 | ba,a,pt %xcc, 93f; | ||
117 | |||
118 | .register %g2,#scratch | ||
119 | .register %g3,#scratch | ||
120 | |||
121 | .text | ||
122 | .align 64 | ||
123 | |||
124 | .globl FUNC_NAME | ||
125 | .type FUNC_NAME,#function | ||
126 | FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ | ||
127 | srlx %o2, 31, %g2 | ||
128 | cmp %g2, 0 | ||
129 | tne %xcc, 5 | ||
130 | PREAMBLE | ||
131 | mov %o0, %o4 | ||
132 | cmp %o2, 0 | ||
133 | be,pn %XCC, 85f | ||
134 | or %o0, %o1, %o3 | ||
135 | cmp %o2, 16 | ||
136 | blu,a,pn %XCC, 80f | ||
137 | or %o3, %o2, %o3 | ||
138 | |||
139 | cmp %o2, (5 * 64) | ||
140 | blu,pt %XCC, 70f | ||
141 | andcc %o3, 0x7, %g0 | ||
142 | |||
143 | /* Clobbers o5/g1/g2/g3/g7/icc/xcc. */ | ||
144 | VISEntry | ||
145 | |||
146 | /* Is 'dst' already aligned on a 64-byte boundary? */ | ||
147 | andcc %o0, 0x3f, %g2 | ||
148 | be,pt %XCC, 2f | ||
149 | |||
150 | /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number | ||
151 | * of bytes to copy to make 'dst' 64-byte aligned. We pre- | ||
152 | * subtract this from 'len'. | ||
153 | */ | ||
154 | sub %o0, %o1, %GLOBAL_SPARE | ||
155 | sub %g2, 0x40, %g2 | ||
156 | sub %g0, %g2, %g2 | ||
157 | sub %o2, %g2, %o2 | ||
158 | andcc %g2, 0x7, %g1 | ||
159 | be,pt %icc, 2f | ||
160 | and %g2, 0x38, %g2 | ||
161 | |||
162 | 1: subcc %g1, 0x1, %g1 | ||
163 | EX_LD(LOAD(ldub, %o1 + 0x00, %o3)) | ||
164 | EX_ST(STORE(stb, %o3, %o1 + %GLOBAL_SPARE)) | ||
165 | bgu,pt %XCC, 1b | ||
166 | add %o1, 0x1, %o1 | ||
167 | |||
168 | add %o1, %GLOBAL_SPARE, %o0 | ||
169 | |||
170 | 2: cmp %g2, 0x0 | ||
171 | and %o1, 0x7, %g1 | ||
172 | be,pt %icc, 3f | ||
173 | alignaddr %o1, %g0, %o1 | ||
174 | |||
175 | EX_LD(LOAD(ldd, %o1, %f4)) | ||
176 | 1: EX_LD(LOAD(ldd, %o1 + 0x8, %f6)) | ||
177 | add %o1, 0x8, %o1 | ||
178 | subcc %g2, 0x8, %g2 | ||
179 | faligndata %f4, %f6, %f0 | ||
180 | EX_ST(STORE(std, %f0, %o0)) | ||
181 | be,pn %icc, 3f | ||
182 | add %o0, 0x8, %o0 | ||
183 | |||
184 | EX_LD(LOAD(ldd, %o1 + 0x8, %f4)) | ||
185 | add %o1, 0x8, %o1 | ||
186 | subcc %g2, 0x8, %g2 | ||
187 | faligndata %f6, %f4, %f0 | ||
188 | EX_ST(STORE(std, %f0, %o0)) | ||
189 | bne,pt %icc, 1b | ||
190 | add %o0, 0x8, %o0 | ||
191 | |||
192 | /* Destination is 64-byte aligned. */ | ||
193 | 3: | ||
194 | membar #LoadStore | #StoreStore | #StoreLoad | ||
195 | |||
196 | subcc %o2, 0x40, %GLOBAL_SPARE | ||
197 | add %o1, %g1, %g1 | ||
198 | andncc %GLOBAL_SPARE, (0x40 - 1), %GLOBAL_SPARE | ||
199 | srl %g1, 3, %g2 | ||
200 | sub %o2, %GLOBAL_SPARE, %g3 | ||
201 | andn %o1, (0x40 - 1), %o1 | ||
202 | and %g2, 7, %g2 | ||
203 | andncc %g3, 0x7, %g3 | ||
204 | fmovd %f0, %f2 | ||
205 | sub %g3, 0x8, %g3 | ||
206 | sub %o2, %GLOBAL_SPARE, %o2 | ||
207 | |||
208 | add %g1, %GLOBAL_SPARE, %g1 | ||
209 | subcc %o2, %g3, %o2 | ||
210 | |||
211 | EX_LD(LOAD_BLK(%o1, %f0)) | ||
212 | add %o1, 0x40, %o1 | ||
213 | add %g1, %g3, %g1 | ||
214 | EX_LD(LOAD_BLK(%o1, %f16)) | ||
215 | add %o1, 0x40, %o1 | ||
216 | sub %GLOBAL_SPARE, 0x80, %GLOBAL_SPARE | ||
217 | EX_LD(LOAD_BLK(%o1, %f32)) | ||
218 | add %o1, 0x40, %o1 | ||
219 | |||
220 | /* There are 8 instances of the unrolled loop, | ||
221 | * one for each possible alignment of the | ||
222 | * source buffer. Each loop instance is 452 | ||
223 | * bytes. | ||
224 | */ | ||
225 | sll %g2, 3, %o3 | ||
226 | sub %o3, %g2, %o3 | ||
227 | sllx %o3, 4, %o3 | ||
228 | add %o3, %g2, %o3 | ||
229 | sllx %o3, 2, %g2 | ||
230 | 1: rd %pc, %o3 | ||
231 | add %o3, %lo(1f - 1b), %o3 | ||
232 | jmpl %o3 + %g2, %g0 | ||
233 | nop | ||
234 | |||
235 | .align 64 | ||
236 | 1: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) | ||
237 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | ||
238 | FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) | ||
239 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | ||
240 | FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) | ||
241 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | ||
242 | ba,pt %xcc, 1b+4 | ||
243 | faligndata %f0, %f2, %f48 | ||
244 | 1: FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) | ||
245 | STORE_SYNC(o0, f48) | ||
246 | FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) | ||
247 | STORE_JUMP(o0, f48, 40f) | ||
248 | 2: FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) | ||
249 | STORE_SYNC(o0, f48) | ||
250 | FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) | ||
251 | STORE_JUMP(o0, f48, 48f) | ||
252 | 3: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) | ||
253 | STORE_SYNC(o0, f48) | ||
254 | FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) | ||
255 | STORE_JUMP(o0, f48, 56f) | ||
256 | |||
257 | 1: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) | ||
258 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | ||
259 | FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) | ||
260 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | ||
261 | FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) | ||
262 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | ||
263 | ba,pt %xcc, 1b+4 | ||
264 | faligndata %f2, %f4, %f48 | ||
265 | 1: FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) | ||
266 | STORE_SYNC(o0, f48) | ||
267 | FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) | ||
268 | STORE_JUMP(o0, f48, 41f) | ||
269 | 2: FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) | ||
270 | STORE_SYNC(o0, f48) | ||
271 | FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) | ||
272 | STORE_JUMP(o0, f48, 49f) | ||
273 | 3: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) | ||
274 | STORE_SYNC(o0, f48) | ||
275 | FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) | ||
276 | STORE_JUMP(o0, f48, 57f) | ||
277 | |||
278 | 1: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) | ||
279 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | ||
280 | FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) | ||
281 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | ||
282 | FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) | ||
283 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | ||
284 | ba,pt %xcc, 1b+4 | ||
285 | faligndata %f4, %f6, %f48 | ||
286 | 1: FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) | ||
287 | STORE_SYNC(o0, f48) | ||
288 | FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) | ||
289 | STORE_JUMP(o0, f48, 42f) | ||
290 | 2: FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) | ||
291 | STORE_SYNC(o0, f48) | ||
292 | FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) | ||
293 | STORE_JUMP(o0, f48, 50f) | ||
294 | 3: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) | ||
295 | STORE_SYNC(o0, f48) | ||
296 | FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) | ||
297 | STORE_JUMP(o0, f48, 58f) | ||
298 | |||
299 | 1: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) | ||
300 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | ||
301 | FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) | ||
302 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | ||
303 | FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) | ||
304 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | ||
305 | ba,pt %xcc, 1b+4 | ||
306 | faligndata %f6, %f8, %f48 | ||
307 | 1: FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) | ||
308 | STORE_SYNC(o0, f48) | ||
309 | FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) | ||
310 | STORE_JUMP(o0, f48, 43f) | ||
311 | 2: FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) | ||
312 | STORE_SYNC(o0, f48) | ||
313 | FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) | ||
314 | STORE_JUMP(o0, f48, 51f) | ||
315 | 3: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) | ||
316 | STORE_SYNC(o0, f48) | ||
317 | FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) | ||
318 | STORE_JUMP(o0, f48, 59f) | ||
319 | |||
320 | 1: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) | ||
321 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | ||
322 | FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) | ||
323 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | ||
324 | FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) | ||
325 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | ||
326 | ba,pt %xcc, 1b+4 | ||
327 | faligndata %f8, %f10, %f48 | ||
328 | 1: FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) | ||
329 | STORE_SYNC(o0, f48) | ||
330 | FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) | ||
331 | STORE_JUMP(o0, f48, 44f) | ||
332 | 2: FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) | ||
333 | STORE_SYNC(o0, f48) | ||
334 | FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) | ||
335 | STORE_JUMP(o0, f48, 52f) | ||
336 | 3: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) | ||
337 | STORE_SYNC(o0, f48) | ||
338 | FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) | ||
339 | STORE_JUMP(o0, f48, 60f) | ||
340 | |||
341 | 1: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) | ||
342 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | ||
343 | FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) | ||
344 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | ||
345 | FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) | ||
346 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | ||
347 | ba,pt %xcc, 1b+4 | ||
348 | faligndata %f10, %f12, %f48 | ||
349 | 1: FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) | ||
350 | STORE_SYNC(o0, f48) | ||
351 | FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) | ||
352 | STORE_JUMP(o0, f48, 45f) | ||
353 | 2: FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) | ||
354 | STORE_SYNC(o0, f48) | ||
355 | FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) | ||
356 | STORE_JUMP(o0, f48, 53f) | ||
357 | 3: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) | ||
358 | STORE_SYNC(o0, f48) | ||
359 | FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) | ||
360 | STORE_JUMP(o0, f48, 61f) | ||
361 | |||
362 | 1: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) | ||
363 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | ||
364 | FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) | ||
365 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | ||
366 | FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) | ||
367 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | ||
368 | ba,pt %xcc, 1b+4 | ||
369 | faligndata %f12, %f14, %f48 | ||
370 | 1: FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) | ||
371 | STORE_SYNC(o0, f48) | ||
372 | FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) | ||
373 | STORE_JUMP(o0, f48, 46f) | ||
374 | 2: FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) | ||
375 | STORE_SYNC(o0, f48) | ||
376 | FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) | ||
377 | STORE_JUMP(o0, f48, 54f) | ||
378 | 3: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) | ||
379 | STORE_SYNC(o0, f48) | ||
380 | FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) | ||
381 | STORE_JUMP(o0, f48, 62f) | ||
382 | |||
383 | 1: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) | ||
384 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | ||
385 | FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) | ||
386 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | ||
387 | FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) | ||
388 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | ||
389 | ba,pt %xcc, 1b+4 | ||
390 | faligndata %f14, %f16, %f48 | ||
391 | 1: FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) | ||
392 | STORE_SYNC(o0, f48) | ||
393 | FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) | ||
394 | STORE_JUMP(o0, f48, 47f) | ||
395 | 2: FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) | ||
396 | STORE_SYNC(o0, f48) | ||
397 | FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) | ||
398 | STORE_JUMP(o0, f48, 55f) | ||
399 | 3: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) | ||
400 | STORE_SYNC(o0, f48) | ||
401 | FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) | ||
402 | STORE_JUMP(o0, f48, 63f) | ||
403 | |||
404 | 40: FINISH_VISCHUNK(o0, f0, f2, g3) | ||
405 | 41: FINISH_VISCHUNK(o0, f2, f4, g3) | ||
406 | 42: FINISH_VISCHUNK(o0, f4, f6, g3) | ||
407 | 43: FINISH_VISCHUNK(o0, f6, f8, g3) | ||
408 | 44: FINISH_VISCHUNK(o0, f8, f10, g3) | ||
409 | 45: FINISH_VISCHUNK(o0, f10, f12, g3) | ||
410 | 46: FINISH_VISCHUNK(o0, f12, f14, g3) | ||
411 | 47: UNEVEN_VISCHUNK(o0, f14, f0, g3) | ||
412 | 48: FINISH_VISCHUNK(o0, f16, f18, g3) | ||
413 | 49: FINISH_VISCHUNK(o0, f18, f20, g3) | ||
414 | 50: FINISH_VISCHUNK(o0, f20, f22, g3) | ||
415 | 51: FINISH_VISCHUNK(o0, f22, f24, g3) | ||
416 | 52: FINISH_VISCHUNK(o0, f24, f26, g3) | ||
417 | 53: FINISH_VISCHUNK(o0, f26, f28, g3) | ||
418 | 54: FINISH_VISCHUNK(o0, f28, f30, g3) | ||
419 | 55: UNEVEN_VISCHUNK(o0, f30, f0, g3) | ||
420 | 56: FINISH_VISCHUNK(o0, f32, f34, g3) | ||
421 | 57: FINISH_VISCHUNK(o0, f34, f36, g3) | ||
422 | 58: FINISH_VISCHUNK(o0, f36, f38, g3) | ||
423 | 59: FINISH_VISCHUNK(o0, f38, f40, g3) | ||
424 | 60: FINISH_VISCHUNK(o0, f40, f42, g3) | ||
425 | 61: FINISH_VISCHUNK(o0, f42, f44, g3) | ||
426 | 62: FINISH_VISCHUNK(o0, f44, f46, g3) | ||
427 | 63: UNEVEN_VISCHUNK_LAST(o0, f46, f0, g3) | ||
428 | |||
429 | 93: EX_LD(LOAD(ldd, %o1, %f2)) | ||
430 | add %o1, 8, %o1 | ||
431 | subcc %g3, 8, %g3 | ||
432 | faligndata %f0, %f2, %f8 | ||
433 | EX_ST(STORE(std, %f8, %o0)) | ||
434 | bl,pn %xcc, 95f | ||
435 | add %o0, 8, %o0 | ||
436 | EX_LD(LOAD(ldd, %o1, %f0)) | ||
437 | add %o1, 8, %o1 | ||
438 | subcc %g3, 8, %g3 | ||
439 | faligndata %f2, %f0, %f8 | ||
440 | EX_ST(STORE(std, %f8, %o0)) | ||
441 | bge,pt %xcc, 93b | ||
442 | add %o0, 8, %o0 | ||
443 | |||
444 | 95: brz,pt %o2, 2f | ||
445 | mov %g1, %o1 | ||
446 | |||
447 | 1: EX_LD(LOAD(ldub, %o1, %o3)) | ||
448 | add %o1, 1, %o1 | ||
449 | subcc %o2, 1, %o2 | ||
450 | EX_ST(STORE(stb, %o3, %o0)) | ||
451 | bne,pt %xcc, 1b | ||
452 | add %o0, 1, %o0 | ||
453 | |||
454 | 2: membar #StoreLoad | #StoreStore | ||
455 | VISExit | ||
456 | retl | ||
457 | mov EX_RETVAL(%o4), %o0 | ||
458 | |||
459 | .align 64 | ||
460 | 70: /* 16 < len <= (5 * 64) */ | ||
461 | bne,pn %XCC, 75f | ||
462 | sub %o0, %o1, %o3 | ||
463 | |||
464 | 72: andn %o2, 0xf, %GLOBAL_SPARE | ||
465 | and %o2, 0xf, %o2 | ||
466 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %o5)) | ||
467 | EX_LD(LOAD(ldx, %o1 + 0x08, %g1)) | ||
468 | subcc %GLOBAL_SPARE, 0x10, %GLOBAL_SPARE | ||
469 | EX_ST(STORE(stx, %o5, %o1 + %o3)) | ||
470 | add %o1, 0x8, %o1 | ||
471 | EX_ST(STORE(stx, %g1, %o1 + %o3)) | ||
472 | bgu,pt %XCC, 1b | ||
473 | add %o1, 0x8, %o1 | ||
474 | 73: andcc %o2, 0x8, %g0 | ||
475 | be,pt %XCC, 1f | ||
476 | nop | ||
477 | EX_LD(LOAD(ldx, %o1, %o5)) | ||
478 | sub %o2, 0x8, %o2 | ||
479 | EX_ST(STORE(stx, %o5, %o1 + %o3)) | ||
480 | add %o1, 0x8, %o1 | ||
481 | 1: andcc %o2, 0x4, %g0 | ||
482 | be,pt %XCC, 1f | ||
483 | nop | ||
484 | EX_LD(LOAD(lduw, %o1, %o5)) | ||
485 | sub %o2, 0x4, %o2 | ||
486 | EX_ST(STORE(stw, %o5, %o1 + %o3)) | ||
487 | add %o1, 0x4, %o1 | ||
488 | 1: cmp %o2, 0 | ||
489 | be,pt %XCC, 85f | ||
490 | nop | ||
491 | ba,pt %xcc, 90f | ||
492 | nop | ||
493 | |||
494 | 75: andcc %o0, 0x7, %g1 | ||
495 | sub %g1, 0x8, %g1 | ||
496 | be,pn %icc, 2f | ||
497 | sub %g0, %g1, %g1 | ||
498 | sub %o2, %g1, %o2 | ||
499 | |||
500 | 1: EX_LD(LOAD(ldub, %o1, %o5)) | ||
501 | subcc %g1, 1, %g1 | ||
502 | EX_ST(STORE(stb, %o5, %o1 + %o3)) | ||
503 | bgu,pt %icc, 1b | ||
504 | add %o1, 1, %o1 | ||
505 | |||
506 | 2: add %o1, %o3, %o0 | ||
507 | andcc %o1, 0x7, %g1 | ||
508 | bne,pt %icc, 8f | ||
509 | sll %g1, 3, %g1 | ||
510 | |||
511 | cmp %o2, 16 | ||
512 | bgeu,pt %icc, 72b | ||
513 | nop | ||
514 | ba,a,pt %xcc, 73b | ||
515 | |||
516 | 8: mov 64, %o3 | ||
517 | andn %o1, 0x7, %o1 | ||
518 | EX_LD(LOAD(ldx, %o1, %g2)) | ||
519 | sub %o3, %g1, %o3 | ||
520 | andn %o2, 0x7, %GLOBAL_SPARE | ||
521 | sllx %g2, %g1, %g2 | ||
522 | 1: EX_LD(LOAD(ldx, %o1 + 0x8, %g3)) | ||
523 | subcc %GLOBAL_SPARE, 0x8, %GLOBAL_SPARE | ||
524 | add %o1, 0x8, %o1 | ||
525 | srlx %g3, %o3, %o5 | ||
526 | or %o5, %g2, %o5 | ||
527 | EX_ST(STORE(stx, %o5, %o0)) | ||
528 | add %o0, 0x8, %o0 | ||
529 | bgu,pt %icc, 1b | ||
530 | sllx %g3, %g1, %g2 | ||
531 | |||
532 | srl %g1, 3, %g1 | ||
533 | andcc %o2, 0x7, %o2 | ||
534 | be,pn %icc, 85f | ||
535 | add %o1, %g1, %o1 | ||
536 | ba,pt %xcc, 90f | ||
537 | sub %o0, %o1, %o3 | ||
538 | |||
539 | .align 64 | ||
540 | 80: /* 0 < len <= 16 */ | ||
541 | andcc %o3, 0x3, %g0 | ||
542 | bne,pn %XCC, 90f | ||
543 | sub %o0, %o1, %o3 | ||
544 | |||
545 | 1: EX_LD(LOAD(lduw, %o1, %g1)) | ||
546 | subcc %o2, 4, %o2 | ||
547 | EX_ST(STORE(stw, %g1, %o1 + %o3)) | ||
548 | bgu,pt %XCC, 1b | ||
549 | add %o1, 4, %o1 | ||
550 | |||
551 | 85: retl | ||
552 | mov EX_RETVAL(%o4), %o0 | ||
553 | |||
554 | .align 32 | ||
555 | 90: EX_LD(LOAD(ldub, %o1, %g1)) | ||
556 | subcc %o2, 1, %o2 | ||
557 | EX_ST(STORE(stb, %g1, %o1 + %o3)) | ||
558 | bgu,pt %XCC, 90b | ||
559 | add %o1, 1, %o1 | ||
560 | retl | ||
561 | mov EX_RETVAL(%o4), %o0 | ||
562 | |||
563 | .size FUNC_NAME, .-FUNC_NAME | ||
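
The dispatch just ahead of the unrolled loops deserves a note: the
source-alignment index (0..7) is turned into a byte offset into the
eight 452-byte loop instances without a multiply, via the
sll/sub/sllx/add/sllx sequence. The same arithmetic, checked in C:

    #include <assert.h>

    /* align = (src >> 3) & 7, per the code above */
    static unsigned long instance_offset(unsigned long align)
    {
        unsigned long o3 = (align << 3) - align; /* 7   * align */
        o3 = (o3 << 4) + align;                  /* 113 * align */
        return o3 << 2;                          /* 452 * align */
    }

    int main(void)
    {
        for (unsigned long a = 0; a < 8; a++)
            assert(instance_offset(a) == 452 * a);
        return 0;
    }
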
diff --git a/arch/sparc/lib/U3copy_from_user.S b/arch/sparc/lib/U3copy_from_user.S new file mode 100644 index 000000000000..f5bfc8d9d216 --- /dev/null +++ b/arch/sparc/lib/U3copy_from_user.S | |||
@@ -0,0 +1,22 @@ | |||
1 | /* U3copy_from_user.S: UltraSparc-III optimized copy from userspace. | ||
2 | * | ||
3 | * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com) | ||
4 | */ | ||
5 | |||
6 | #define EX_LD(x) \ | ||
7 | 98: x; \ | ||
8 | .section .fixup; \ | ||
9 | .align 4; \ | ||
10 | 99: retl; \ | ||
11 | mov 1, %o0; \ | ||
12 | .section __ex_table,"a";\ | ||
13 | .align 4; \ | ||
14 | .word 98b, 99b; \ | ||
15 | .text; \ | ||
16 | .align 4; | ||
17 | |||
18 | #define FUNC_NAME U3copy_from_user | ||
19 | #define LOAD(type,addr,dest) type##a [addr] %asi, dest | ||
20 | #define EX_RETVAL(x) 0 | ||
21 | |||
22 | #include "U3memcpy.S" | ||
diff --git a/arch/sparc/lib/U3copy_to_user.S b/arch/sparc/lib/U3copy_to_user.S new file mode 100644 index 000000000000..2334f111bb0c --- /dev/null +++ b/arch/sparc/lib/U3copy_to_user.S | |||
@@ -0,0 +1,33 @@ | |||
1 | /* U3copy_to_user.S: UltraSparc-III optimized copy to userspace. | ||
2 | * | ||
3 | * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com) | ||
4 | */ | ||
5 | |||
6 | #define EX_ST(x) \ | ||
7 | 98: x; \ | ||
8 | .section .fixup; \ | ||
9 | .align 4; \ | ||
10 | 99: retl; \ | ||
11 | mov 1, %o0; \ | ||
12 | .section __ex_table,"a";\ | ||
13 | .align 4; \ | ||
14 | .word 98b, 99b; \ | ||
15 | .text; \ | ||
16 | .align 4; | ||
17 | |||
18 | #define FUNC_NAME U3copy_to_user | ||
19 | #define STORE(type,src,addr) type##a src, [addr] ASI_AIUS | ||
20 | #define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_AIUS | ||
21 | #define EX_RETVAL(x) 0 | ||
22 | |||
23 | /* Writing to %asi is _expensive_ so we hardcode it. | ||
24 | * Reading %asi to check for KERNEL_DS is comparatively | ||
25 | * cheap. | ||
26 | */ | ||
27 | #define PREAMBLE \ | ||
28 | rd %asi, %g1; \ | ||
29 | cmp %g1, ASI_AIUS; \ | ||
30 | bne,pn %icc, memcpy_user_stub; \ | ||
31 | nop; \ | ||
32 | |||
33 | #include "U3memcpy.S" | ||
diff --git a/arch/sparc/lib/U3memcpy.S b/arch/sparc/lib/U3memcpy.S new file mode 100644 index 000000000000..7cae9cc6a204 --- /dev/null +++ b/arch/sparc/lib/U3memcpy.S | |||
@@ -0,0 +1,422 @@ | |||
1 | /* U3memcpy.S: UltraSparc-III optimized memcpy. | ||
2 | * | ||
3 | * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com) | ||
4 | */ | ||
5 | |||
6 | #ifdef __KERNEL__ | ||
7 | #include <asm/visasm.h> | ||
8 | #include <asm/asi.h> | ||
9 | #define GLOBAL_SPARE %g7 | ||
10 | #else | ||
11 | #define ASI_BLK_P 0xf0 | ||
12 | #define FPRS_FEF 0x04 | ||
13 | #ifdef MEMCPY_DEBUG | ||
14 | #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \ | ||
15 | clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0; | ||
16 | #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs | ||
17 | #else | ||
18 | #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs | ||
19 | #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs | ||
20 | #endif | ||
21 | #define GLOBAL_SPARE %g5 | ||
22 | #endif | ||
23 | |||
24 | #ifndef EX_LD | ||
25 | #define EX_LD(x) x | ||
26 | #endif | ||
27 | |||
28 | #ifndef EX_ST | ||
29 | #define EX_ST(x) x | ||
30 | #endif | ||
31 | |||
32 | #ifndef EX_RETVAL | ||
33 | #define EX_RETVAL(x) x | ||
34 | #endif | ||
35 | |||
36 | #ifndef LOAD | ||
37 | #define LOAD(type,addr,dest) type [addr], dest | ||
38 | #endif | ||
39 | |||
40 | #ifndef STORE | ||
41 | #define STORE(type,src,addr) type src, [addr] | ||
42 | #endif | ||
43 | |||
44 | #ifndef STORE_BLK | ||
45 | #define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P | ||
46 | #endif | ||
47 | |||
48 | #ifndef FUNC_NAME | ||
49 | #define FUNC_NAME U3memcpy | ||
50 | #endif | ||
51 | |||
52 | #ifndef PREAMBLE | ||
53 | #define PREAMBLE | ||
54 | #endif | ||
55 | |||
56 | #ifndef XCC | ||
57 | #define XCC xcc | ||
58 | #endif | ||
59 | |||
60 | .register %g2,#scratch | ||
61 | .register %g3,#scratch | ||
62 | |||
63 | /* Special/non-trivial issues of this code: | ||
64 | * | ||
65 | * 1) %o5 is preserved from VISEntryHalf to VISExitHalf | ||
66 | * 2) Only low 32 FPU registers are used so that only the | ||
67 | * lower half of the FPU register set is dirtied by this | ||
68 | * code. This is especially important in the kernel. | ||
69 | * 3) This code never prefetches cachelines past the end | ||
70 | * of the source buffer. | ||
71 | */ | ||
72 | |||
73 | .text | ||
74 | .align 64 | ||
75 | |||
76 | /* The cheetah's flexible spine, oversized liver, enlarged heart, | ||
77 | * slender muscular body, and claws make it the swiftest hunter | ||
78 | * in Africa and the fastest animal on land. Can reach speeds | ||
79 | * of up to 2.4GB per second. | ||
80 | */ | ||
81 | |||
82 | .globl FUNC_NAME | ||
83 | .type FUNC_NAME,#function | ||
84 | FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ | ||
85 | srlx %o2, 31, %g2 | ||
86 | cmp %g2, 0 | ||
87 | tne %xcc, 5 | ||
88 | PREAMBLE | ||
89 | mov %o0, %o4 | ||
90 | cmp %o2, 0 | ||
91 | be,pn %XCC, 85f | ||
92 | or %o0, %o1, %o3 | ||
93 | cmp %o2, 16 | ||
94 | blu,a,pn %XCC, 80f | ||
95 | or %o3, %o2, %o3 | ||
96 | |||
97 | cmp %o2, (3 * 64) | ||
98 | blu,pt %XCC, 70f | ||
99 | andcc %o3, 0x7, %g0 | ||
100 | |||
101 | /* Clobbers o5/g1/g2/g3/g7/icc/xcc. We must preserve | ||
102 | * o5 from here until we hit VISExitHalf. | ||
103 | */ | ||
104 | VISEntryHalf | ||
105 | |||
106 | /* Is 'dst' already aligned on a 64-byte boundary? */ | ||
107 | andcc %o0, 0x3f, %g2 | ||
108 | be,pt %XCC, 2f | ||
109 | |||
110 | /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number | ||
111 | * of bytes to copy to make 'dst' 64-byte aligned. We pre- | ||
112 | * subtract this from 'len'. | ||
113 | */ | ||
114 | sub %o0, %o1, GLOBAL_SPARE | ||
115 | sub %g2, 0x40, %g2 | ||
116 | sub %g0, %g2, %g2 | ||
117 | sub %o2, %g2, %o2 | ||
118 | andcc %g2, 0x7, %g1 | ||
119 | be,pt %icc, 2f | ||
120 | and %g2, 0x38, %g2 | ||
121 | |||
122 | 1: subcc %g1, 0x1, %g1 | ||
123 | EX_LD(LOAD(ldub, %o1 + 0x00, %o3)) | ||
124 | EX_ST(STORE(stb, %o3, %o1 + GLOBAL_SPARE)) | ||
125 | bgu,pt %XCC, 1b | ||
126 | add %o1, 0x1, %o1 | ||
127 | |||
128 | add %o1, GLOBAL_SPARE, %o0 | ||
129 | |||
130 | 2: cmp %g2, 0x0 | ||
131 | and %o1, 0x7, %g1 | ||
132 | be,pt %icc, 3f | ||
133 | alignaddr %o1, %g0, %o1 | ||
134 | |||
135 | EX_LD(LOAD(ldd, %o1, %f4)) | ||
136 | 1: EX_LD(LOAD(ldd, %o1 + 0x8, %f6)) | ||
137 | add %o1, 0x8, %o1 | ||
138 | subcc %g2, 0x8, %g2 | ||
139 | faligndata %f4, %f6, %f0 | ||
140 | EX_ST(STORE(std, %f0, %o0)) | ||
141 | be,pn %icc, 3f | ||
142 | add %o0, 0x8, %o0 | ||
143 | |||
144 | EX_LD(LOAD(ldd, %o1 + 0x8, %f4)) | ||
145 | add %o1, 0x8, %o1 | ||
146 | subcc %g2, 0x8, %g2 | ||
147 | faligndata %f6, %f4, %f2 | ||
148 | EX_ST(STORE(std, %f2, %o0)) | ||
149 | bne,pt %icc, 1b | ||
150 | add %o0, 0x8, %o0 | ||
151 | |||
152 | 3: LOAD(prefetch, %o1 + 0x000, #one_read) | ||
153 | LOAD(prefetch, %o1 + 0x040, #one_read) | ||
154 | andn %o2, (0x40 - 1), GLOBAL_SPARE | ||
155 | LOAD(prefetch, %o1 + 0x080, #one_read) | ||
156 | LOAD(prefetch, %o1 + 0x0c0, #one_read) | ||
157 | LOAD(prefetch, %o1 + 0x100, #one_read) | ||
158 | EX_LD(LOAD(ldd, %o1 + 0x000, %f0)) | ||
159 | LOAD(prefetch, %o1 + 0x140, #one_read) | ||
160 | EX_LD(LOAD(ldd, %o1 + 0x008, %f2)) | ||
161 | LOAD(prefetch, %o1 + 0x180, #one_read) | ||
162 | EX_LD(LOAD(ldd, %o1 + 0x010, %f4)) | ||
163 | LOAD(prefetch, %o1 + 0x1c0, #one_read) | ||
164 | faligndata %f0, %f2, %f16 | ||
165 | EX_LD(LOAD(ldd, %o1 + 0x018, %f6)) | ||
166 | faligndata %f2, %f4, %f18 | ||
167 | EX_LD(LOAD(ldd, %o1 + 0x020, %f8)) | ||
168 | faligndata %f4, %f6, %f20 | ||
169 | EX_LD(LOAD(ldd, %o1 + 0x028, %f10)) | ||
170 | faligndata %f6, %f8, %f22 | ||
171 | |||
172 | EX_LD(LOAD(ldd, %o1 + 0x030, %f12)) | ||
173 | faligndata %f8, %f10, %f24 | ||
174 | EX_LD(LOAD(ldd, %o1 + 0x038, %f14)) | ||
175 | faligndata %f10, %f12, %f26 | ||
176 | EX_LD(LOAD(ldd, %o1 + 0x040, %f0)) | ||
177 | |||
178 | subcc GLOBAL_SPARE, 0x80, GLOBAL_SPARE | ||
179 | add %o1, 0x40, %o1 | ||
180 | bgu,pt %XCC, 1f | ||
181 | srl GLOBAL_SPARE, 6, %o3 | ||
182 | ba,pt %xcc, 2f | ||
183 | nop | ||
184 | |||
185 | .align 64 | ||
186 | 1: | ||
187 | EX_LD(LOAD(ldd, %o1 + 0x008, %f2)) | ||
188 | faligndata %f12, %f14, %f28 | ||
189 | EX_LD(LOAD(ldd, %o1 + 0x010, %f4)) | ||
190 | faligndata %f14, %f0, %f30 | ||
191 | EX_ST(STORE_BLK(%f16, %o0)) | ||
192 | EX_LD(LOAD(ldd, %o1 + 0x018, %f6)) | ||
193 | faligndata %f0, %f2, %f16 | ||
194 | add %o0, 0x40, %o0 | ||
195 | |||
196 | EX_LD(LOAD(ldd, %o1 + 0x020, %f8)) | ||
197 | faligndata %f2, %f4, %f18 | ||
198 | EX_LD(LOAD(ldd, %o1 + 0x028, %f10)) | ||
199 | faligndata %f4, %f6, %f20 | ||
200 | EX_LD(LOAD(ldd, %o1 + 0x030, %f12)) | ||
201 | subcc %o3, 0x01, %o3 | ||
202 | faligndata %f6, %f8, %f22 | ||
203 | EX_LD(LOAD(ldd, %o1 + 0x038, %f14)) | ||
204 | |||
205 | faligndata %f8, %f10, %f24 | ||
206 | EX_LD(LOAD(ldd, %o1 + 0x040, %f0)) | ||
207 | LOAD(prefetch, %o1 + 0x1c0, #one_read) | ||
208 | faligndata %f10, %f12, %f26 | ||
209 | bg,pt %XCC, 1b | ||
210 | add %o1, 0x40, %o1 | ||
211 | |||
212 | /* Finally we copy the last full 64-byte block. */ | ||
213 | 2: | ||
214 | EX_LD(LOAD(ldd, %o1 + 0x008, %f2)) | ||
215 | faligndata %f12, %f14, %f28 | ||
216 | EX_LD(LOAD(ldd, %o1 + 0x010, %f4)) | ||
217 | faligndata %f14, %f0, %f30 | ||
218 | EX_ST(STORE_BLK(%f16, %o0)) | ||
219 | EX_LD(LOAD(ldd, %o1 + 0x018, %f6)) | ||
220 | faligndata %f0, %f2, %f16 | ||
221 | EX_LD(LOAD(ldd, %o1 + 0x020, %f8)) | ||
222 | faligndata %f2, %f4, %f18 | ||
223 | EX_LD(LOAD(ldd, %o1 + 0x028, %f10)) | ||
224 | faligndata %f4, %f6, %f20 | ||
225 | EX_LD(LOAD(ldd, %o1 + 0x030, %f12)) | ||
226 | faligndata %f6, %f8, %f22 | ||
227 | EX_LD(LOAD(ldd, %o1 + 0x038, %f14)) | ||
228 | faligndata %f8, %f10, %f24 | ||
229 | cmp %g1, 0 | ||
230 | be,pt %XCC, 1f | ||
231 | add %o0, 0x40, %o0 | ||
232 | EX_LD(LOAD(ldd, %o1 + 0x040, %f0)) | ||
233 | 1: faligndata %f10, %f12, %f26 | ||
234 | faligndata %f12, %f14, %f28 | ||
235 | faligndata %f14, %f0, %f30 | ||
236 | EX_ST(STORE_BLK(%f16, %o0)) | ||
237 | add %o0, 0x40, %o0 | ||
238 | add %o1, 0x40, %o1 | ||
239 | membar #Sync | ||
240 | |||
241 | /* Now we copy the (len modulo 64) bytes at the end. | ||
242 | * Note how we borrow the %f0 loaded above. | ||
243 | * | ||
244 | * Also notice how this code is careful not to perform a | ||
245 | * load past the end of the src buffer. | ||
246 | */ | ||
247 | and %o2, 0x3f, %o2 | ||
248 | andcc %o2, 0x38, %g2 | ||
249 | be,pn %XCC, 2f | ||
250 | subcc %g2, 0x8, %g2 | ||
251 | be,pn %XCC, 2f | ||
252 | cmp %g1, 0 | ||
253 | |||
254 | sub %o2, %g2, %o2 | ||
255 | be,a,pt %XCC, 1f | ||
256 | EX_LD(LOAD(ldd, %o1 + 0x00, %f0)) | ||
257 | |||
258 | 1: EX_LD(LOAD(ldd, %o1 + 0x08, %f2)) | ||
259 | add %o1, 0x8, %o1 | ||
260 | subcc %g2, 0x8, %g2 | ||
261 | faligndata %f0, %f2, %f8 | ||
262 | EX_ST(STORE(std, %f8, %o0)) | ||
263 | be,pn %XCC, 2f | ||
264 | add %o0, 0x8, %o0 | ||
265 | EX_LD(LOAD(ldd, %o1 + 0x08, %f0)) | ||
266 | add %o1, 0x8, %o1 | ||
267 | subcc %g2, 0x8, %g2 | ||
268 | faligndata %f2, %f0, %f8 | ||
269 | EX_ST(STORE(std, %f8, %o0)) | ||
270 | bne,pn %XCC, 1b | ||
271 | add %o0, 0x8, %o0 | ||
272 | |||
273 | /* If anything is left, we copy it one byte at a time. | ||
274 | * Note that %g1 is (src & 0x7) saved above before the | ||
275 | * alignaddr was performed. | ||
276 | */ | ||
277 | 2: | ||
278 | cmp %o2, 0 | ||
279 | add %o1, %g1, %o1 | ||
280 | VISExitHalf | ||
281 | be,pn %XCC, 85f | ||
282 | sub %o0, %o1, %o3 | ||
283 | |||
284 | andcc %g1, 0x7, %g0 | ||
285 | bne,pn %icc, 90f | ||
286 | andcc %o2, 0x8, %g0 | ||
287 | be,pt %icc, 1f | ||
288 | nop | ||
289 | EX_LD(LOAD(ldx, %o1, %o5)) | ||
290 | EX_ST(STORE(stx, %o5, %o1 + %o3)) | ||
291 | add %o1, 0x8, %o1 | ||
292 | |||
293 | 1: andcc %o2, 0x4, %g0 | ||
294 | be,pt %icc, 1f | ||
295 | nop | ||
296 | EX_LD(LOAD(lduw, %o1, %o5)) | ||
297 | EX_ST(STORE(stw, %o5, %o1 + %o3)) | ||
298 | add %o1, 0x4, %o1 | ||
299 | |||
300 | 1: andcc %o2, 0x2, %g0 | ||
301 | be,pt %icc, 1f | ||
302 | nop | ||
303 | EX_LD(LOAD(lduh, %o1, %o5)) | ||
304 | EX_ST(STORE(sth, %o5, %o1 + %o3)) | ||
305 | add %o1, 0x2, %o1 | ||
306 | |||
307 | 1: andcc %o2, 0x1, %g0 | ||
308 | be,pt %icc, 85f | ||
309 | nop | ||
310 | EX_LD(LOAD(ldub, %o1, %o5)) | ||
311 | ba,pt %xcc, 85f | ||
312 | EX_ST(STORE(stb, %o5, %o1 + %o3)) | ||
313 | |||
314 | .align 64 | ||
315 | 70: /* 16 < len <= 64 */ | ||
316 | bne,pn %XCC, 75f | ||
317 | sub %o0, %o1, %o3 | ||
318 | |||
319 | 72: | ||
320 | andn %o2, 0xf, GLOBAL_SPARE | ||
321 | and %o2, 0xf, %o2 | ||
322 | 1: subcc GLOBAL_SPARE, 0x10, GLOBAL_SPARE | ||
323 | EX_LD(LOAD(ldx, %o1 + 0x00, %o5)) | ||
324 | EX_LD(LOAD(ldx, %o1 + 0x08, %g1)) | ||
325 | EX_ST(STORE(stx, %o5, %o1 + %o3)) | ||
326 | add %o1, 0x8, %o1 | ||
327 | EX_ST(STORE(stx, %g1, %o1 + %o3)) | ||
328 | bgu,pt %XCC, 1b | ||
329 | add %o1, 0x8, %o1 | ||
330 | 73: andcc %o2, 0x8, %g0 | ||
331 | be,pt %XCC, 1f | ||
332 | nop | ||
333 | sub %o2, 0x8, %o2 | ||
334 | EX_LD(LOAD(ldx, %o1, %o5)) | ||
335 | EX_ST(STORE(stx, %o5, %o1 + %o3)) | ||
336 | add %o1, 0x8, %o1 | ||
337 | 1: andcc %o2, 0x4, %g0 | ||
338 | be,pt %XCC, 1f | ||
339 | nop | ||
340 | sub %o2, 0x4, %o2 | ||
341 | EX_LD(LOAD(lduw, %o1, %o5)) | ||
342 | EX_ST(STORE(stw, %o5, %o1 + %o3)) | ||
343 | add %o1, 0x4, %o1 | ||
344 | 1: cmp %o2, 0 | ||
345 | be,pt %XCC, 85f | ||
346 | nop | ||
347 | ba,pt %xcc, 90f | ||
348 | nop | ||
349 | |||
350 | 75: | ||
351 | andcc %o0, 0x7, %g1 | ||
352 | sub %g1, 0x8, %g1 | ||
353 | be,pn %icc, 2f | ||
354 | sub %g0, %g1, %g1 | ||
355 | sub %o2, %g1, %o2 | ||
356 | |||
357 | 1: subcc %g1, 1, %g1 | ||
358 | EX_LD(LOAD(ldub, %o1, %o5)) | ||
359 | EX_ST(STORE(stb, %o5, %o1 + %o3)) | ||
360 | bgu,pt %icc, 1b | ||
361 | add %o1, 1, %o1 | ||
362 | |||
363 | 2: add %o1, %o3, %o0 | ||
364 | andcc %o1, 0x7, %g1 | ||
365 | bne,pt %icc, 8f | ||
366 | sll %g1, 3, %g1 | ||
367 | |||
368 | cmp %o2, 16 | ||
369 | bgeu,pt %icc, 72b | ||
370 | nop | ||
371 | ba,a,pt %xcc, 73b | ||
372 | |||
373 | 8: mov 64, %o3 | ||
374 | andn %o1, 0x7, %o1 | ||
375 | EX_LD(LOAD(ldx, %o1, %g2)) | ||
376 | sub %o3, %g1, %o3 | ||
377 | andn %o2, 0x7, GLOBAL_SPARE | ||
378 | sllx %g2, %g1, %g2 | ||
379 | 1: EX_LD(LOAD(ldx, %o1 + 0x8, %g3)) | ||
380 | subcc GLOBAL_SPARE, 0x8, GLOBAL_SPARE | ||
381 | add %o1, 0x8, %o1 | ||
382 | srlx %g3, %o3, %o5 | ||
383 | or %o5, %g2, %o5 | ||
384 | EX_ST(STORE(stx, %o5, %o0)) | ||
385 | add %o0, 0x8, %o0 | ||
386 | bgu,pt %icc, 1b | ||
387 | sllx %g3, %g1, %g2 | ||
388 | |||
389 | srl %g1, 3, %g1 | ||
390 | andcc %o2, 0x7, %o2 | ||
391 | be,pn %icc, 85f | ||
392 | add %o1, %g1, %o1 | ||
393 | ba,pt %xcc, 90f | ||
394 | sub %o0, %o1, %o3 | ||
395 | |||
396 | .align 64 | ||
397 | 80: /* 0 < len <= 16 */ | ||
398 | andcc %o3, 0x3, %g0 | ||
399 | bne,pn %XCC, 90f | ||
400 | sub %o0, %o1, %o3 | ||
401 | |||
402 | 1: | ||
403 | subcc %o2, 4, %o2 | ||
404 | EX_LD(LOAD(lduw, %o1, %g1)) | ||
405 | EX_ST(STORE(stw, %g1, %o1 + %o3)) | ||
406 | bgu,pt %XCC, 1b | ||
407 | add %o1, 4, %o1 | ||
408 | |||
409 | 85: retl | ||
410 | mov EX_RETVAL(%o4), %o0 | ||
411 | |||
412 | .align 32 | ||
413 | 90: | ||
414 | subcc %o2, 1, %o2 | ||
415 | EX_LD(LOAD(ldub, %o1, %g1)) | ||
416 | EX_ST(STORE(stb, %g1, %o1 + %o3)) | ||
417 | bgu,pt %XCC, 90b | ||
418 | add %o1, 1, %o1 | ||
419 | retl | ||
420 | mov EX_RETVAL(%o4), %o0 | ||
421 | |||
422 | .size FUNC_NAME, .-FUNC_NAME | ||
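
A small idiom from the entry path above worth spelling out: dst, src and
(on the short path) len are folded together with or's so that a single
AND answers the whole alignment question: "andcc %o3, 0x7" for the
word-copy paths, "andcc %o3, 0x3" for the 0-16 byte path. In C, as a
sketch:

    #include <stdint.h>
    #include <stddef.h>

    /* short-path variant: both pointers and the length must be
     * 4-byte multiples for the word loop at label 80 */
    static int short_word_copy_ok(uintptr_t dst, uintptr_t src, size_t len)
    {
        return ((dst | src | len) & 0x3) == 0;
    }
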
diff --git a/arch/sparc/lib/U3patch.S b/arch/sparc/lib/U3patch.S new file mode 100644 index 000000000000..ecc302619a6e --- /dev/null +++ b/arch/sparc/lib/U3patch.S | |||
@@ -0,0 +1,33 @@ | |||
1 | /* U3patch.S: Patch Ultra-I routines with Ultra-III variant. | ||
2 | * | ||
3 | * Copyright (C) 2004 David S. Miller <davem@redhat.com> | ||
4 | */ | ||
5 | |||
6 | #define BRANCH_ALWAYS 0x10680000 | ||
7 | #define NOP 0x01000000 | ||
8 | #define ULTRA3_DO_PATCH(OLD, NEW) \ | ||
9 | sethi %hi(NEW), %g1; \ | ||
10 | or %g1, %lo(NEW), %g1; \ | ||
11 | sethi %hi(OLD), %g2; \ | ||
12 | or %g2, %lo(OLD), %g2; \ | ||
13 | sub %g1, %g2, %g1; \ | ||
14 | sethi %hi(BRANCH_ALWAYS), %g3; \ | ||
15 | sll %g1, 11, %g1; \ | ||
16 | srl %g1, 11 + 2, %g1; \ | ||
17 | or %g3, %lo(BRANCH_ALWAYS), %g3; \ | ||
18 | or %g3, %g1, %g3; \ | ||
19 | stw %g3, [%g2]; \ | ||
20 | sethi %hi(NOP), %g3; \ | ||
21 | or %g3, %lo(NOP), %g3; \ | ||
22 | stw %g3, [%g2 + 0x4]; \ | ||
23 | flush %g2; | ||
24 | |||
25 | .globl cheetah_patch_copyops | ||
26 | .type cheetah_patch_copyops,#function | ||
27 | cheetah_patch_copyops: | ||
28 | ULTRA3_DO_PATCH(memcpy, U3memcpy) | ||
29 | ULTRA3_DO_PATCH(___copy_from_user, U3copy_from_user) | ||
30 | ULTRA3_DO_PATCH(___copy_to_user, U3copy_to_user) | ||
31 | retl | ||
32 | nop | ||
33 | .size cheetah_patch_copyops,.-cheetah_patch_copyops | ||
diff --git a/arch/sparc/lib/VISsave.S b/arch/sparc/lib/VISsave.S new file mode 100644 index 000000000000..b320ae9e2e2e --- /dev/null +++ b/arch/sparc/lib/VISsave.S | |||
@@ -0,0 +1,144 @@ | |||
1 | /* | ||
2 | * VISsave.S: Code for saving FPU register state for | ||
3 | * VIS routines. One should not call this directly, | ||
4 | * but use macros provided in <asm/visasm.h>. | ||
5 | * | ||
6 | * Copyright (C) 1998 Jakub Jelinek (jj@ultra.linux.cz) | ||
7 | */ | ||
8 | |||
9 | #include <asm/asi.h> | ||
10 | #include <asm/page.h> | ||
11 | #include <asm/ptrace.h> | ||
12 | #include <asm/visasm.h> | ||
13 | #include <asm/thread_info.h> | ||
14 | |||
15 | .text | ||
16 | .globl VISenter, VISenterhalf | ||
17 | |||
18 | /* On entry: %o5=current FPRS value, %g7 is callers address */ | ||
19 | /* May clobber %o5, %g1, %g2, %g3, %g7, %icc, %xcc */ | ||
20 | |||
21 | /* Nothing special needs to be done here to handle preemption; this | ||
22 | * FPU save/restore mechanism is already preemption safe. | ||
23 | */ | ||
24 | |||
25 | .align 32 | ||
26 | VISenter: | ||
27 | ldub [%g6 + TI_FPDEPTH], %g1 | ||
28 | brnz,a,pn %g1, 1f | ||
29 | cmp %g1, 1 | ||
30 | stb %g0, [%g6 + TI_FPSAVED] | ||
31 | stx %fsr, [%g6 + TI_XFSR] | ||
32 | 9: jmpl %g7 + %g0, %g0 | ||
33 | nop | ||
34 | 1: bne,pn %icc, 2f | ||
35 | |||
36 | srl %g1, 1, %g1 | ||
37 | vis1: ldub [%g6 + TI_FPSAVED], %g3 | ||
38 | stx %fsr, [%g6 + TI_XFSR] | ||
39 | or %g3, %o5, %g3 | ||
40 | stb %g3, [%g6 + TI_FPSAVED] | ||
41 | rd %gsr, %g3 | ||
42 | clr %g1 | ||
43 | ba,pt %xcc, 3f | ||
44 | |||
45 | stx %g3, [%g6 + TI_GSR] | ||
46 | 2: add %g6, %g1, %g3 | ||
47 | cmp %o5, FPRS_DU | ||
48 | be,pn %icc, 6f | ||
49 | sll %g1, 3, %g1 | ||
50 | stb %o5, [%g3 + TI_FPSAVED] | ||
51 | rd %gsr, %g2 | ||
52 | add %g6, %g1, %g3 | ||
53 | stx %g2, [%g3 + TI_GSR] | ||
54 | |||
55 | add %g6, %g1, %g2 | ||
56 | stx %fsr, [%g2 + TI_XFSR] | ||
57 | sll %g1, 5, %g1 | ||
58 | 3: andcc %o5, FPRS_DL|FPRS_DU, %g0 | ||
59 | be,pn %icc, 9b | ||
60 | add %g6, TI_FPREGS, %g2 | ||
61 | andcc %o5, FPRS_DL, %g0 | ||
62 | |||
63 | be,pn %icc, 4f | ||
64 | add %g6, TI_FPREGS+0x40, %g3 | ||
65 | membar #Sync | ||
66 | stda %f0, [%g2 + %g1] ASI_BLK_P | ||
67 | stda %f16, [%g3 + %g1] ASI_BLK_P | ||
68 | membar #Sync | ||
69 | andcc %o5, FPRS_DU, %g0 | ||
70 | be,pn %icc, 5f | ||
71 | 4: add %g1, 128, %g1 | ||
72 | membar #Sync | ||
73 | stda %f32, [%g2 + %g1] ASI_BLK_P | ||
74 | |||
75 | stda %f48, [%g3 + %g1] ASI_BLK_P | ||
76 | 5: membar #Sync | ||
77 | ba,pt %xcc, 80f | ||
78 | nop | ||
79 | |||
80 | .align 32 | ||
81 | 80: jmpl %g7 + %g0, %g0 | ||
82 | nop | ||
83 | |||
84 | 6: ldub [%g3 + TI_FPSAVED], %o5 | ||
85 | or %o5, FPRS_DU, %o5 | ||
86 | add %g6, TI_FPREGS+0x80, %g2 | ||
87 | stb %o5, [%g3 + TI_FPSAVED] | ||
88 | |||
89 | sll %g1, 5, %g1 | ||
90 | add %g6, TI_FPREGS+0xc0, %g3 | ||
91 | wr %g0, FPRS_FEF, %fprs | ||
92 | membar #Sync | ||
93 | stda %f32, [%g2 + %g1] ASI_BLK_P | ||
94 | stda %f48, [%g3 + %g1] ASI_BLK_P | ||
95 | membar #Sync | ||
96 | ba,pt %xcc, 80f | ||
97 | nop | ||
98 | |||
99 | .align 32 | ||
100 | 80: jmpl %g7 + %g0, %g0 | ||
101 | nop | ||
102 | |||
103 | .align 32 | ||
104 | VISenterhalf: | ||
105 | ldub [%g6 + TI_FPDEPTH], %g1 | ||
106 | brnz,a,pn %g1, 1f | ||
107 | cmp %g1, 1 | ||
108 | stb %g0, [%g6 + TI_FPSAVED] | ||
109 | stx %fsr, [%g6 + TI_XFSR] | ||
110 | clr %o5 | ||
111 | jmpl %g7 + %g0, %g0 | ||
112 | wr %g0, FPRS_FEF, %fprs | ||
113 | |||
114 | 1: bne,pn %icc, 2f | ||
115 | srl %g1, 1, %g1 | ||
116 | ba,pt %xcc, vis1 | ||
117 | sub %g7, 8, %g7 | ||
118 | 2: addcc %g6, %g1, %g3 | ||
119 | sll %g1, 3, %g1 | ||
120 | andn %o5, FPRS_DU, %g2 | ||
121 | stb %g2, [%g3 + TI_FPSAVED] | ||
122 | |||
123 | rd %gsr, %g2 | ||
124 | add %g6, %g1, %g3 | ||
125 | stx %g2, [%g3 + TI_GSR] | ||
126 | add %g6, %g1, %g2 | ||
127 | stx %fsr, [%g2 + TI_XFSR] | ||
128 | sll %g1, 5, %g1 | ||
129 | 3: andcc %o5, FPRS_DL, %g0 | ||
130 | be,pn %icc, 4f | ||
131 | add %g6, TI_FPREGS, %g2 | ||
132 | |||
133 | add %g6, TI_FPREGS+0x40, %g3 | ||
134 | membar #Sync | ||
135 | stda %f0, [%g2 + %g1] ASI_BLK_P | ||
136 | stda %f16, [%g3 + %g1] ASI_BLK_P | ||
137 | membar #Sync | ||
138 | ba,pt %xcc, 4f | ||
139 | nop | ||
140 | |||
141 | .align 32 | ||
142 | 4: and %o5, FPRS_DU, %o5 | ||
143 | jmpl %g7 + %g0, %g0 | ||
144 | wr %o5, FPRS_FEF, %fprs | ||
diff --git a/arch/sparc/lib/atomic_64.S b/arch/sparc/lib/atomic_64.S new file mode 100644 index 000000000000..0268210ca168 --- /dev/null +++ b/arch/sparc/lib/atomic_64.S | |||
@@ -0,0 +1,138 @@ | |||
1 | /* atomic.S: These things are too big to do inline. | ||
2 | * | ||
3 | * Copyright (C) 1999, 2007 David S. Miller (davem@davemloft.net) | ||
4 | */ | ||
5 | |||
6 | #include <asm/asi.h> | ||
7 | #include <asm/backoff.h> | ||
8 | |||
9 | .text | ||
10 | |||
11 | /* There are two versions of the atomic routines: one | ||
12 | * that does not return a value and does not perform | ||
13 | * memory barriers, and a second that returns a value | ||
14 | * and does the barriers. | ||
15 | */ | ||
16 | .globl atomic_add | ||
17 | .type atomic_add,#function | ||
18 | atomic_add: /* %o0 = increment, %o1 = atomic_ptr */ | ||
19 | BACKOFF_SETUP(%o2) | ||
20 | 1: lduw [%o1], %g1 | ||
21 | add %g1, %o0, %g7 | ||
22 | cas [%o1], %g1, %g7 | ||
23 | cmp %g1, %g7 | ||
24 | bne,pn %icc, 2f | ||
25 | nop | ||
26 | retl | ||
27 | nop | ||
28 | 2: BACKOFF_SPIN(%o2, %o3, 1b) | ||
29 | .size atomic_add, .-atomic_add | ||
30 | |||
31 | .globl atomic_sub | ||
32 | .type atomic_sub,#function | ||
33 | atomic_sub: /* %o0 = decrement, %o1 = atomic_ptr */ | ||
34 | BACKOFF_SETUP(%o2) | ||
35 | 1: lduw [%o1], %g1 | ||
36 | sub %g1, %o0, %g7 | ||
37 | cas [%o1], %g1, %g7 | ||
38 | cmp %g1, %g7 | ||
39 | bne,pn %icc, 2f | ||
40 | nop | ||
41 | retl | ||
42 | nop | ||
43 | 2: BACKOFF_SPIN(%o2, %o3, 1b) | ||
44 | .size atomic_sub, .-atomic_sub | ||
45 | |||
46 | .globl atomic_add_ret | ||
47 | .type atomic_add_ret,#function | ||
48 | atomic_add_ret: /* %o0 = increment, %o1 = atomic_ptr */ | ||
49 | BACKOFF_SETUP(%o2) | ||
50 | 1: lduw [%o1], %g1 | ||
51 | add %g1, %o0, %g7 | ||
52 | cas [%o1], %g1, %g7 | ||
53 | cmp %g1, %g7 | ||
54 | bne,pn %icc, 2f | ||
55 | add %g7, %o0, %g7 | ||
56 | sra %g7, 0, %o0 | ||
57 | retl | ||
58 | nop | ||
59 | 2: BACKOFF_SPIN(%o2, %o3, 1b) | ||
60 | .size atomic_add_ret, .-atomic_add_ret | ||
61 | |||
62 | .globl atomic_sub_ret | ||
63 | .type atomic_sub_ret,#function | ||
64 | atomic_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */ | ||
65 | BACKOFF_SETUP(%o2) | ||
66 | 1: lduw [%o1], %g1 | ||
67 | sub %g1, %o0, %g7 | ||
68 | cas [%o1], %g1, %g7 | ||
69 | cmp %g1, %g7 | ||
70 | bne,pn %icc, 2f | ||
71 | sub %g7, %o0, %g7 | ||
72 | sra %g7, 0, %o0 | ||
73 | retl | ||
74 | nop | ||
75 | 2: BACKOFF_SPIN(%o2, %o3, 1b) | ||
76 | .size atomic_sub_ret, .-atomic_sub_ret | ||
77 | |||
78 | .globl atomic64_add | ||
79 | .type atomic64_add,#function | ||
80 | atomic64_add: /* %o0 = increment, %o1 = atomic_ptr */ | ||
81 | BACKOFF_SETUP(%o2) | ||
82 | 1: ldx [%o1], %g1 | ||
83 | add %g1, %o0, %g7 | ||
84 | casx [%o1], %g1, %g7 | ||
85 | cmp %g1, %g7 | ||
86 | bne,pn %xcc, 2f | ||
87 | nop | ||
88 | retl | ||
89 | nop | ||
90 | 2: BACKOFF_SPIN(%o2, %o3, 1b) | ||
91 | .size atomic64_add, .-atomic64_add | ||
92 | |||
93 | .globl atomic64_sub | ||
94 | .type atomic64_sub,#function | ||
95 | atomic64_sub: /* %o0 = decrement, %o1 = atomic_ptr */ | ||
96 | BACKOFF_SETUP(%o2) | ||
97 | 1: ldx [%o1], %g1 | ||
98 | sub %g1, %o0, %g7 | ||
99 | casx [%o1], %g1, %g7 | ||
100 | cmp %g1, %g7 | ||
101 | bne,pn %xcc, 2f | ||
102 | nop | ||
103 | retl | ||
104 | nop | ||
105 | 2: BACKOFF_SPIN(%o2, %o3, 1b) | ||
106 | .size atomic64_sub, .-atomic64_sub | ||
107 | |||
108 | .globl atomic64_add_ret | ||
109 | .type atomic64_add_ret,#function | ||
110 | atomic64_add_ret: /* %o0 = increment, %o1 = atomic_ptr */ | ||
111 | BACKOFF_SETUP(%o2) | ||
112 | 1: ldx [%o1], %g1 | ||
113 | add %g1, %o0, %g7 | ||
114 | casx [%o1], %g1, %g7 | ||
115 | cmp %g1, %g7 | ||
116 | bne,pn %xcc, 2f | ||
117 | add %g7, %o0, %g7 | ||
118 | mov %g7, %o0 | ||
119 | retl | ||
120 | nop | ||
121 | 2: BACKOFF_SPIN(%o2, %o3, 1b) | ||
122 | .size atomic64_add_ret, .-atomic64_add_ret | ||
123 | |||
124 | .globl atomic64_sub_ret | ||
125 | .type atomic64_sub_ret,#function | ||
126 | atomic64_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */ | ||
127 | BACKOFF_SETUP(%o2) | ||
128 | 1: ldx [%o1], %g1 | ||
129 | sub %g1, %o0, %g7 | ||
130 | casx [%o1], %g1, %g7 | ||
131 | cmp %g1, %g7 | ||
132 | bne,pn %xcc, 2f | ||
133 | sub %g7, %o0, %g7 | ||
134 | mov %g7, %o0 | ||
135 | retl | ||
136 | nop | ||
137 | 2: BACKOFF_SPIN(%o2, %o3, 1b) | ||
138 | .size atomic64_sub_ret, .-atomic64_sub_ret | ||
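[Editor's note: every routine above is the same compare-and-swap retry loop: load the old value, compute the new one, attempt the cas, and spin (via BACKOFF_SPIN) when another CPU got there first. A C sketch of atomic_add_ret using the GCC builtin, with the backoff omitted for brevity:

    /* Sketch of the cas retry loop in atomic_add_ret; no backoff. */
    static int atomic_add_ret_sketch(int increment, int *ptr)
    {
            int old, new_val;

            do {
                    old = *ptr;                  /* lduw [%o1], %g1    */
                    new_val = old + increment;   /* add  %g1, %o0, %g7 */
            } while (!__sync_bool_compare_and_swap(ptr, old, new_val));

            return new_val;                      /* value after the add */
    }
]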
diff --git a/arch/sparc/lib/bitops.S b/arch/sparc/lib/bitops.S new file mode 100644 index 000000000000..2b7228cb8c22 --- /dev/null +++ b/arch/sparc/lib/bitops.S | |||
@@ -0,0 +1,141 @@ | |||
1 | /* bitops.S: Sparc64 atomic bit operations. | ||
2 | * | ||
3 | * Copyright (C) 2000, 2007 David S. Miller (davem@davemloft.net) | ||
4 | */ | ||
5 | |||
6 | #include <asm/asi.h> | ||
7 | #include <asm/backoff.h> | ||
8 | |||
9 | .text | ||
10 | |||
11 | .globl test_and_set_bit | ||
12 | .type test_and_set_bit,#function | ||
13 | test_and_set_bit: /* %o0=nr, %o1=addr */ | ||
14 | BACKOFF_SETUP(%o3) | ||
15 | srlx %o0, 6, %g1 | ||
16 | mov 1, %o2 | ||
17 | sllx %g1, 3, %g3 | ||
18 | and %o0, 63, %g2 | ||
19 | sllx %o2, %g2, %o2 | ||
20 | add %o1, %g3, %o1 | ||
21 | 1: ldx [%o1], %g7 | ||
22 | or %g7, %o2, %g1 | ||
23 | casx [%o1], %g7, %g1 | ||
24 | cmp %g7, %g1 | ||
25 | bne,pn %xcc, 2f | ||
26 | and %g7, %o2, %g2 | ||
27 | clr %o0 | ||
28 | movrne %g2, 1, %o0 | ||
29 | retl | ||
30 | nop | ||
31 | 2: BACKOFF_SPIN(%o3, %o4, 1b) | ||
32 | .size test_and_set_bit, .-test_and_set_bit | ||
33 | |||
34 | .globl test_and_clear_bit | ||
35 | .type test_and_clear_bit,#function | ||
36 | test_and_clear_bit: /* %o0=nr, %o1=addr */ | ||
37 | BACKOFF_SETUP(%o3) | ||
38 | srlx %o0, 6, %g1 | ||
39 | mov 1, %o2 | ||
40 | sllx %g1, 3, %g3 | ||
41 | and %o0, 63, %g2 | ||
42 | sllx %o2, %g2, %o2 | ||
43 | add %o1, %g3, %o1 | ||
44 | 1: ldx [%o1], %g7 | ||
45 | andn %g7, %o2, %g1 | ||
46 | casx [%o1], %g7, %g1 | ||
47 | cmp %g7, %g1 | ||
48 | bne,pn %xcc, 2f | ||
49 | and %g7, %o2, %g2 | ||
50 | clr %o0 | ||
51 | movrne %g2, 1, %o0 | ||
52 | retl | ||
53 | nop | ||
54 | 2: BACKOFF_SPIN(%o3, %o4, 1b) | ||
55 | .size test_and_clear_bit, .-test_and_clear_bit | ||
56 | |||
57 | .globl test_and_change_bit | ||
58 | .type test_and_change_bit,#function | ||
59 | test_and_change_bit: /* %o0=nr, %o1=addr */ | ||
60 | BACKOFF_SETUP(%o3) | ||
61 | srlx %o0, 6, %g1 | ||
62 | mov 1, %o2 | ||
63 | sllx %g1, 3, %g3 | ||
64 | and %o0, 63, %g2 | ||
65 | sllx %o2, %g2, %o2 | ||
66 | add %o1, %g3, %o1 | ||
67 | 1: ldx [%o1], %g7 | ||
68 | xor %g7, %o2, %g1 | ||
69 | casx [%o1], %g7, %g1 | ||
70 | cmp %g7, %g1 | ||
71 | bne,pn %xcc, 2f | ||
72 | and %g7, %o2, %g2 | ||
73 | clr %o0 | ||
74 | movrne %g2, 1, %o0 | ||
75 | retl | ||
76 | nop | ||
77 | 2: BACKOFF_SPIN(%o3, %o4, 1b) | ||
78 | .size test_and_change_bit, .-test_and_change_bit | ||
79 | |||
80 | .globl set_bit | ||
81 | .type set_bit,#function | ||
82 | set_bit: /* %o0=nr, %o1=addr */ | ||
83 | BACKOFF_SETUP(%o3) | ||
84 | srlx %o0, 6, %g1 | ||
85 | mov 1, %o2 | ||
86 | sllx %g1, 3, %g3 | ||
87 | and %o0, 63, %g2 | ||
88 | sllx %o2, %g2, %o2 | ||
89 | add %o1, %g3, %o1 | ||
90 | 1: ldx [%o1], %g7 | ||
91 | or %g7, %o2, %g1 | ||
92 | casx [%o1], %g7, %g1 | ||
93 | cmp %g7, %g1 | ||
94 | bne,pn %xcc, 2f | ||
95 | nop | ||
96 | retl | ||
97 | nop | ||
98 | 2: BACKOFF_SPIN(%o3, %o4, 1b) | ||
99 | .size set_bit, .-set_bit | ||
100 | |||
101 | .globl clear_bit | ||
102 | .type clear_bit,#function | ||
103 | clear_bit: /* %o0=nr, %o1=addr */ | ||
104 | BACKOFF_SETUP(%o3) | ||
105 | srlx %o0, 6, %g1 | ||
106 | mov 1, %o2 | ||
107 | sllx %g1, 3, %g3 | ||
108 | and %o0, 63, %g2 | ||
109 | sllx %o2, %g2, %o2 | ||
110 | add %o1, %g3, %o1 | ||
111 | 1: ldx [%o1], %g7 | ||
112 | andn %g7, %o2, %g1 | ||
113 | casx [%o1], %g7, %g1 | ||
114 | cmp %g7, %g1 | ||
115 | bne,pn %xcc, 2f | ||
116 | nop | ||
117 | retl | ||
118 | nop | ||
119 | 2: BACKOFF_SPIN(%o3, %o4, 1b) | ||
120 | .size clear_bit, .-clear_bit | ||
121 | |||
122 | .globl change_bit | ||
123 | .type change_bit,#function | ||
124 | change_bit: /* %o0=nr, %o1=addr */ | ||
125 | BACKOFF_SETUP(%o3) | ||
126 | srlx %o0, 6, %g1 | ||
127 | mov 1, %o2 | ||
128 | sllx %g1, 3, %g3 | ||
129 | and %o0, 63, %g2 | ||
130 | sllx %o2, %g2, %o2 | ||
131 | add %o1, %g3, %o1 | ||
132 | 1: ldx [%o1], %g7 | ||
133 | xor %g7, %o2, %g1 | ||
134 | casx [%o1], %g7, %g1 | ||
135 | cmp %g7, %g1 | ||
136 | bne,pn %xcc, 2f | ||
137 | nop | ||
138 | retl | ||
139 | nop | ||
140 | 2: BACKOFF_SPIN(%o3, %o4, 1b) | ||
141 | .size change_bit, .-change_bit | ||
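[Editor's note: each bit operation above opens with the same address arithmetic: nr / 64 selects a 64-bit word and 1 << (nr % 64) builds the mask, after which the familiar cas retry loop runs. A sketch of test_and_set_bit along those lines, backoff again omitted:

    /* Sketch of test_and_set_bit: word index, mask, cas retry. */
    static int test_and_set_bit_sketch(unsigned long nr, unsigned long *addr)
    {
            unsigned long *word = addr + (nr >> 6);  /* srlx 6 + sllx 3 */
            unsigned long mask = 1UL << (nr & 63);   /* and 63 + sllx   */
            unsigned long old;

            do {
                    old = *word;
            } while (!__sync_bool_compare_and_swap(word, old, old | mask));

            return (old & mask) != 0;                /* the final movrne */
    }
]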
diff --git a/arch/sparc/lib/bzero.S b/arch/sparc/lib/bzero.S new file mode 100644 index 000000000000..c7bbae8c590f --- /dev/null +++ b/arch/sparc/lib/bzero.S | |||
@@ -0,0 +1,158 @@ | |||
1 | /* bzero.S: Simple prefetching memset, bzero, and clear_user | ||
2 | * implementations. | ||
3 | * | ||
4 | * Copyright (C) 2005 David S. Miller <davem@davemloft.net> | ||
5 | */ | ||
6 | |||
7 | .text | ||
8 | |||
9 | .globl __memset | ||
10 | .type __memset, #function | ||
11 | __memset: /* %o0=buf, %o1=pat, %o2=len */ | ||
12 | |||
13 | .globl memset | ||
14 | .type memset, #function | ||
15 | memset: /* %o0=buf, %o1=pat, %o2=len */ | ||
16 | and %o1, 0xff, %o3 | ||
17 | mov %o2, %o1 | ||
18 | sllx %o3, 8, %g1 | ||
19 | or %g1, %o3, %o2 | ||
20 | sllx %o2, 16, %g1 | ||
21 | or %g1, %o2, %o2 | ||
22 | sllx %o2, 32, %g1 | ||
23 | ba,pt %xcc, 1f | ||
24 | or %g1, %o2, %o2 | ||
25 | |||
26 | .globl __bzero | ||
27 | .type __bzero, #function | ||
28 | __bzero: /* %o0=buf, %o1=len */ | ||
29 | clr %o2 | ||
30 | 1: mov %o0, %o3 | ||
31 | brz,pn %o1, __bzero_done | ||
32 | cmp %o1, 16 | ||
33 | bl,pn %icc, __bzero_tiny | ||
34 | prefetch [%o0 + 0x000], #n_writes | ||
35 | andcc %o0, 0x3, %g0 | ||
36 | be,pt %icc, 2f | ||
37 | 1: stb %o2, [%o0 + 0x00] | ||
38 | add %o0, 1, %o0 | ||
39 | andcc %o0, 0x3, %g0 | ||
40 | bne,pn %icc, 1b | ||
41 | sub %o1, 1, %o1 | ||
42 | 2: andcc %o0, 0x7, %g0 | ||
43 | be,pt %icc, 3f | ||
44 | stw %o2, [%o0 + 0x00] | ||
45 | sub %o1, 4, %o1 | ||
46 | add %o0, 4, %o0 | ||
47 | 3: and %o1, 0x38, %g1 | ||
48 | cmp %o1, 0x40 | ||
49 | andn %o1, 0x3f, %o4 | ||
50 | bl,pn %icc, 5f | ||
51 | and %o1, 0x7, %o1 | ||
52 | prefetch [%o0 + 0x040], #n_writes | ||
53 | prefetch [%o0 + 0x080], #n_writes | ||
54 | prefetch [%o0 + 0x0c0], #n_writes | ||
55 | prefetch [%o0 + 0x100], #n_writes | ||
56 | prefetch [%o0 + 0x140], #n_writes | ||
57 | 4: prefetch [%o0 + 0x180], #n_writes | ||
58 | stx %o2, [%o0 + 0x00] | ||
59 | stx %o2, [%o0 + 0x08] | ||
60 | stx %o2, [%o0 + 0x10] | ||
61 | stx %o2, [%o0 + 0x18] | ||
62 | stx %o2, [%o0 + 0x20] | ||
63 | stx %o2, [%o0 + 0x28] | ||
64 | stx %o2, [%o0 + 0x30] | ||
65 | stx %o2, [%o0 + 0x38] | ||
66 | subcc %o4, 0x40, %o4 | ||
67 | bne,pt %icc, 4b | ||
68 | add %o0, 0x40, %o0 | ||
69 | brz,pn %g1, 6f | ||
70 | nop | ||
71 | 5: stx %o2, [%o0 + 0x00] | ||
72 | subcc %g1, 8, %g1 | ||
73 | bne,pt %icc, 5b | ||
74 | add %o0, 0x8, %o0 | ||
75 | 6: brz,pt %o1, __bzero_done | ||
76 | nop | ||
77 | __bzero_tiny: | ||
78 | 1: stb %o2, [%o0 + 0x00] | ||
79 | subcc %o1, 1, %o1 | ||
80 | bne,pt %icc, 1b | ||
81 | add %o0, 1, %o0 | ||
82 | __bzero_done: | ||
83 | retl | ||
84 | mov %o3, %o0 | ||
85 | .size __bzero, .-__bzero | ||
86 | .size __memset, .-__memset | ||
87 | .size memset, .-memset | ||
88 | |||
89 | #define EX_ST(x,y) \ | ||
90 | 98: x,y; \ | ||
91 | .section .fixup; \ | ||
92 | .align 4; \ | ||
93 | 99: retl; \ | ||
94 | mov %o1, %o0; \ | ||
95 | .section __ex_table,"a";\ | ||
96 | .align 4; \ | ||
97 | .word 98b, 99b; \ | ||
98 | .text; \ | ||
99 | .align 4; | ||
100 | |||
101 | .globl __clear_user | ||
102 | .type __clear_user, #function | ||
103 | __clear_user: /* %o0=buf, %o1=len */ | ||
104 | brz,pn %o1, __clear_user_done | ||
105 | cmp %o1, 16 | ||
106 | bl,pn %icc, __clear_user_tiny | ||
107 | EX_ST(prefetcha [%o0 + 0x00] %asi, #n_writes) | ||
108 | andcc %o0, 0x3, %g0 | ||
109 | be,pt %icc, 2f | ||
110 | 1: EX_ST(stba %g0, [%o0 + 0x00] %asi) | ||
111 | add %o0, 1, %o0 | ||
112 | andcc %o0, 0x3, %g0 | ||
113 | bne,pn %icc, 1b | ||
114 | sub %o1, 1, %o1 | ||
115 | 2: andcc %o0, 0x7, %g0 | ||
116 | be,pt %icc, 3f | ||
117 | EX_ST(stwa %g0, [%o0 + 0x00] %asi) | ||
118 | sub %o1, 4, %o1 | ||
119 | add %o0, 4, %o0 | ||
120 | 3: and %o1, 0x38, %g1 | ||
121 | cmp %o1, 0x40 | ||
122 | andn %o1, 0x3f, %o4 | ||
123 | bl,pn %icc, 5f | ||
124 | and %o1, 0x7, %o1 | ||
125 | EX_ST(prefetcha [%o0 + 0x040] %asi, #n_writes) | ||
126 | EX_ST(prefetcha [%o0 + 0x080] %asi, #n_writes) | ||
127 | EX_ST(prefetcha [%o0 + 0x0c0] %asi, #n_writes) | ||
128 | EX_ST(prefetcha [%o0 + 0x100] %asi, #n_writes) | ||
129 | EX_ST(prefetcha [%o0 + 0x140] %asi, #n_writes) | ||
130 | 4: EX_ST(prefetcha [%o0 + 0x180] %asi, #n_writes) | ||
131 | EX_ST(stxa %g0, [%o0 + 0x00] %asi) | ||
132 | EX_ST(stxa %g0, [%o0 + 0x08] %asi) | ||
133 | EX_ST(stxa %g0, [%o0 + 0x10] %asi) | ||
134 | EX_ST(stxa %g0, [%o0 + 0x18] %asi) | ||
135 | EX_ST(stxa %g0, [%o0 + 0x20] %asi) | ||
136 | EX_ST(stxa %g0, [%o0 + 0x28] %asi) | ||
137 | EX_ST(stxa %g0, [%o0 + 0x30] %asi) | ||
138 | EX_ST(stxa %g0, [%o0 + 0x38] %asi) | ||
139 | subcc %o4, 0x40, %o4 | ||
140 | bne,pt %icc, 4b | ||
141 | add %o0, 0x40, %o0 | ||
142 | brz,pn %g1, 6f | ||
143 | nop | ||
144 | 5: EX_ST(stxa %g0, [%o0 + 0x00] %asi) | ||
145 | subcc %g1, 8, %g1 | ||
146 | bne,pt %icc, 5b | ||
147 | add %o0, 0x8, %o0 | ||
148 | 6: brz,pt %o1, __clear_user_done | ||
149 | nop | ||
150 | __clear_user_tiny: | ||
151 | 1: EX_ST(stba %g0, [%o0 + 0x00] %asi) | ||
152 | subcc %o1, 1, %o1 | ||
153 | bne,pt %icc, 1b | ||
154 | add %o0, 1, %o0 | ||
155 | __clear_user_done: | ||
156 | retl | ||
157 | clr %o0 | ||
158 | .size __clear_user, .-__clear_user | ||
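[Editor's note: memset's prologue turns the fill byte into a full 64-bit pattern with three shift-and-or steps (the sllx/or sequence at the top of the file), so the bulk loops can issue 8-byte stx stores. The same replication in C:

    /* Sketch of the fill-byte replication done by memset's prologue. */
    static unsigned long spread_fill_byte(unsigned char pat)
    {
            unsigned long v = pat;

            v |= v << 8;    /* pattern now fills 16 bits */
            v |= v << 16;   /* ... 32 bits */
            v |= v << 32;   /* ... all 64 bits */
            return v;
    }
]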
diff --git a/arch/sparc/lib/checksum_64.S b/arch/sparc/lib/checksum_64.S new file mode 100644 index 000000000000..1d230f693dc4 --- /dev/null +++ b/arch/sparc/lib/checksum_64.S | |||
@@ -0,0 +1,173 @@ | |||
1 | /* checksum.S: Sparc V9 optimized checksum code. | ||
2 | * | ||
3 | * Copyright(C) 1995 Linus Torvalds | ||
4 | * Copyright(C) 1995 Miguel de Icaza | ||
5 | * Copyright(C) 1996, 2000 David S. Miller | ||
6 | * Copyright(C) 1997 Jakub Jelinek | ||
7 | * | ||
8 | * derived from: | ||
9 | * Linux/Alpha checksum c-code | ||
10 | * Linux/ix86 inline checksum assembly | ||
11 | * RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code) | ||
12 | * David Mosberger-Tang for optimized reference c-code | ||
13 | * BSD4.4 portable checksum routine | ||
14 | */ | ||
15 | |||
16 | .text | ||
17 | |||
18 | csum_partial_fix_alignment: | ||
19 | /* We checked for zero length already, so there must be | ||
20 | * at least one byte. | ||
21 | */ | ||
22 | be,pt %icc, 1f | ||
23 | nop | ||
24 | ldub [%o0 + 0x00], %o4 | ||
25 | add %o0, 1, %o0 | ||
26 | sub %o1, 1, %o1 | ||
27 | 1: andcc %o0, 0x2, %g0 | ||
28 | be,pn %icc, csum_partial_post_align | ||
29 | cmp %o1, 2 | ||
30 | blu,pn %icc, csum_partial_end_cruft | ||
31 | nop | ||
32 | lduh [%o0 + 0x00], %o5 | ||
33 | add %o0, 2, %o0 | ||
34 | sub %o1, 2, %o1 | ||
35 | ba,pt %xcc, csum_partial_post_align | ||
36 | add %o5, %o4, %o4 | ||
37 | |||
38 | .align 32 | ||
39 | .globl csum_partial | ||
40 | csum_partial: /* %o0=buff, %o1=len, %o2=sum */ | ||
41 | prefetch [%o0 + 0x000], #n_reads | ||
42 | clr %o4 | ||
43 | prefetch [%o0 + 0x040], #n_reads | ||
44 | brz,pn %o1, csum_partial_finish | ||
45 | andcc %o0, 0x3, %g0 | ||
46 | |||
47 | /* We "remember" in %g7 whether the lowest bit of the | ||
48 | * address was set, because if it was we have to swap the | ||
49 | * upper and lower 8-bit halves of the sum we calculate. | ||
50 | */ | ||
51 | bne,pn %icc, csum_partial_fix_alignment | ||
52 | andcc %o0, 0x1, %g7 | ||
53 | |||
54 | csum_partial_post_align: | ||
55 | prefetch [%o0 + 0x080], #n_reads | ||
56 | andncc %o1, 0x3f, %o3 | ||
57 | |||
58 | prefetch [%o0 + 0x0c0], #n_reads | ||
59 | sub %o1, %o3, %o1 | ||
60 | brz,pn %o3, 2f | ||
61 | prefetch [%o0 + 0x100], #n_reads | ||
62 | |||
63 | /* So that we don't need to use the non-pairing | ||
64 | * add-with-carry instructions, we accumulate 32-bit | ||
65 | * values into a 64-bit register. At the end of the | ||
66 | * loop we fold it down to 32 bits, and then to 16. | ||
67 | */ | ||
68 | prefetch [%o0 + 0x140], #n_reads | ||
69 | 1: lduw [%o0 + 0x00], %o5 | ||
70 | lduw [%o0 + 0x04], %g1 | ||
71 | lduw [%o0 + 0x08], %g2 | ||
72 | add %o4, %o5, %o4 | ||
73 | lduw [%o0 + 0x0c], %g3 | ||
74 | add %o4, %g1, %o4 | ||
75 | lduw [%o0 + 0x10], %o5 | ||
76 | add %o4, %g2, %o4 | ||
77 | lduw [%o0 + 0x14], %g1 | ||
78 | add %o4, %g3, %o4 | ||
79 | lduw [%o0 + 0x18], %g2 | ||
80 | add %o4, %o5, %o4 | ||
81 | lduw [%o0 + 0x1c], %g3 | ||
82 | add %o4, %g1, %o4 | ||
83 | lduw [%o0 + 0x20], %o5 | ||
84 | add %o4, %g2, %o4 | ||
85 | lduw [%o0 + 0x24], %g1 | ||
86 | add %o4, %g3, %o4 | ||
87 | lduw [%o0 + 0x28], %g2 | ||
88 | add %o4, %o5, %o4 | ||
89 | lduw [%o0 + 0x2c], %g3 | ||
90 | add %o4, %g1, %o4 | ||
91 | lduw [%o0 + 0x30], %o5 | ||
92 | add %o4, %g2, %o4 | ||
93 | lduw [%o0 + 0x34], %g1 | ||
94 | add %o4, %g3, %o4 | ||
95 | lduw [%o0 + 0x38], %g2 | ||
96 | add %o4, %o5, %o4 | ||
97 | lduw [%o0 + 0x3c], %g3 | ||
98 | add %o4, %g1, %o4 | ||
99 | prefetch [%o0 + 0x180], #n_reads | ||
100 | add %o4, %g2, %o4 | ||
101 | subcc %o3, 0x40, %o3 | ||
102 | add %o0, 0x40, %o0 | ||
103 | bne,pt %icc, 1b | ||
104 | add %o4, %g3, %o4 | ||
105 | |||
106 | 2: and %o1, 0x3c, %o3 | ||
107 | brz,pn %o3, 2f | ||
108 | sub %o1, %o3, %o1 | ||
109 | 1: lduw [%o0 + 0x00], %o5 | ||
110 | subcc %o3, 0x4, %o3 | ||
111 | add %o0, 0x4, %o0 | ||
112 | bne,pt %icc, 1b | ||
113 | add %o4, %o5, %o4 | ||
114 | |||
115 | 2: | ||
116 | /* fold 64-->32 */ | ||
117 | srlx %o4, 32, %o5 | ||
118 | srl %o4, 0, %o4 | ||
119 | add %o4, %o5, %o4 | ||
120 | srlx %o4, 32, %o5 | ||
121 | srl %o4, 0, %o4 | ||
122 | add %o4, %o5, %o4 | ||
123 | |||
124 | /* fold 32-->16 */ | ||
125 | sethi %hi(0xffff0000), %g1 | ||
126 | srl %o4, 16, %o5 | ||
127 | andn %o4, %g1, %g2 | ||
128 | add %o5, %g2, %o4 | ||
129 | srl %o4, 16, %o5 | ||
130 | andn %o4, %g1, %g2 | ||
131 | add %o5, %g2, %o4 | ||
132 | |||
133 | csum_partial_end_cruft: | ||
134 | /* %o4 has the 16-bit sum we have calculated so-far. */ | ||
135 | cmp %o1, 2 | ||
136 | blu,pt %icc, 1f | ||
137 | nop | ||
138 | lduh [%o0 + 0x00], %o5 | ||
139 | sub %o1, 2, %o1 | ||
140 | add %o0, 2, %o0 | ||
141 | add %o4, %o5, %o4 | ||
142 | 1: brz,pt %o1, 1f | ||
143 | nop | ||
144 | ldub [%o0 + 0x00], %o5 | ||
145 | sub %o1, 1, %o1 | ||
146 | add %o0, 1, %o0 | ||
147 | sllx %o5, 8, %o5 | ||
148 | add %o4, %o5, %o4 | ||
149 | 1: | ||
150 | /* fold 32-->16 */ | ||
151 | sethi %hi(0xffff0000), %g1 | ||
152 | srl %o4, 16, %o5 | ||
153 | andn %o4, %g1, %g2 | ||
154 | add %o5, %g2, %o4 | ||
155 | srl %o4, 16, %o5 | ||
156 | andn %o4, %g1, %g2 | ||
157 | add %o5, %g2, %o4 | ||
158 | |||
159 | 1: brz,pt %g7, 1f | ||
160 | nop | ||
161 | |||
162 | /* We started with an odd byte, byte-swap the result. */ | ||
163 | srl %o4, 8, %o5 | ||
164 | and %o4, 0xff, %g1 | ||
165 | sll %g1, 8, %g1 | ||
166 | or %o5, %g1, %o4 | ||
167 | |||
168 | 1: addcc %o2, %o4, %o2 | ||
169 | addc %g0, %o2, %o2 | ||
170 | |||
171 | csum_partial_finish: | ||
172 | retl | ||
173 | srl %o2, 0, %o0 | ||
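[Editor's note: the two fold blocks implement the standard one's-complement reduction: the 64-bit accumulator is folded to 32 bits and then to 16, with each step applied twice so the carry generated by the first add is absorbed by the second. In C:

    /* Sketch of the 64->32 and 32->16 folds used above. */
    static unsigned int csum_fold_sketch(unsigned long long sum)
    {
            sum = (sum & 0xffffffffull) + (sum >> 32);  /* 64 -> 32 */
            sum = (sum & 0xffffffffull) + (sum >> 32);  /* absorb the carry */

            sum = (sum & 0xffff) + (sum >> 16);         /* 32 -> 16 */
            sum = (sum & 0xffff) + (sum >> 16);         /* absorb the carry */
            return (unsigned int)sum;
    }
]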
diff --git a/arch/sparc/lib/clear_page.S b/arch/sparc/lib/clear_page.S new file mode 100644 index 000000000000..77e531f6c2a7 --- /dev/null +++ b/arch/sparc/lib/clear_page.S | |||
@@ -0,0 +1,103 @@ | |||
1 | /* clear_page.S: UltraSparc optimized clear page. | ||
2 | * | ||
3 | * Copyright (C) 1996, 1998, 1999, 2000, 2004 David S. Miller (davem@redhat.com) | ||
4 | * Copyright (C) 1997 Jakub Jelinek (jakub@redhat.com) | ||
5 | */ | ||
6 | |||
7 | #include <asm/visasm.h> | ||
8 | #include <asm/thread_info.h> | ||
9 | #include <asm/page.h> | ||
10 | #include <asm/pgtable.h> | ||
11 | #include <asm/spitfire.h> | ||
12 | #include <asm/head.h> | ||
13 | |||
14 | /* What we used to do was lock a TLB entry into a specific | ||
15 | * TLB slot, clear the page with interrupts disabled, then | ||
16 | * restore the original TLB entry. This was great for | ||
17 | * disturbing the TLB as little as possible, but it meant | ||
18 | * we had to keep interrupts disabled for a long time. | ||
19 | * | ||
20 | * Now, we simply use the normal TLB loading mechanism, | ||
21 | * and this makes the cpu choose a slot all by itself. | ||
22 | * Then we do a normal TLB flush on exit. We need only | ||
23 | * disable preemption during the clear. | ||
24 | */ | ||
25 | |||
26 | .text | ||
27 | |||
28 | .globl _clear_page | ||
29 | _clear_page: /* %o0=dest */ | ||
30 | ba,pt %xcc, clear_page_common | ||
31 | clr %o4 | ||
32 | |||
33 | /* This thing is pretty important, it shows up | ||
34 | * on the profiles via do_anonymous_page(). | ||
35 | */ | ||
36 | .align 32 | ||
37 | .globl clear_user_page | ||
38 | clear_user_page: /* %o0=dest, %o1=vaddr */ | ||
39 | lduw [%g6 + TI_PRE_COUNT], %o2 | ||
40 | sethi %uhi(PAGE_OFFSET), %g2 | ||
41 | sethi %hi(PAGE_SIZE), %o4 | ||
42 | |||
43 | sllx %g2, 32, %g2 | ||
44 | sethi %hi(PAGE_KERNEL_LOCKED), %g3 | ||
45 | |||
46 | ldx [%g3 + %lo(PAGE_KERNEL_LOCKED)], %g3 | ||
47 | sub %o0, %g2, %g1 ! paddr | ||
48 | |||
49 | and %o1, %o4, %o0 ! vaddr D-cache alias bit | ||
50 | |||
51 | or %g1, %g3, %g1 ! TTE data | ||
52 | sethi %hi(TLBTEMP_BASE), %o3 | ||
53 | |||
54 | add %o2, 1, %o4 | ||
55 | add %o0, %o3, %o0 ! TTE vaddr | ||
56 | |||
57 | /* Disable preemption. */ | ||
58 | mov TLB_TAG_ACCESS, %g3 | ||
59 | stw %o4, [%g6 + TI_PRE_COUNT] | ||
60 | |||
61 | /* Load TLB entry. */ | ||
62 | rdpr %pstate, %o4 | ||
63 | wrpr %o4, PSTATE_IE, %pstate | ||
64 | stxa %o0, [%g3] ASI_DMMU | ||
65 | stxa %g1, [%g0] ASI_DTLB_DATA_IN | ||
66 | sethi %hi(KERNBASE), %g1 | ||
67 | flush %g1 | ||
68 | wrpr %o4, 0x0, %pstate | ||
69 | |||
70 | mov 1, %o4 | ||
71 | |||
72 | clear_page_common: | ||
73 | VISEntryHalf | ||
74 | membar #StoreLoad | #StoreStore | #LoadStore | ||
75 | fzero %f0 | ||
76 | sethi %hi(PAGE_SIZE/64), %o1 | ||
77 | mov %o0, %g1 ! remember vaddr for tlbflush | ||
78 | fzero %f2 | ||
79 | or %o1, %lo(PAGE_SIZE/64), %o1 | ||
80 | faddd %f0, %f2, %f4 | ||
81 | fmuld %f0, %f2, %f6 | ||
82 | faddd %f0, %f2, %f8 | ||
83 | fmuld %f0, %f2, %f10 | ||
84 | |||
85 | faddd %f0, %f2, %f12 | ||
86 | fmuld %f0, %f2, %f14 | ||
87 | 1: stda %f0, [%o0 + %g0] ASI_BLK_P | ||
88 | subcc %o1, 1, %o1 | ||
89 | bne,pt %icc, 1b | ||
90 | add %o0, 0x40, %o0 | ||
91 | membar #Sync | ||
92 | VISExitHalf | ||
93 | |||
94 | brz,pn %o4, out | ||
95 | nop | ||
96 | |||
97 | stxa %g0, [%g1] ASI_DMMU_DEMAP | ||
98 | membar #Sync | ||
99 | stw %o2, [%g6 + TI_PRE_COUNT] | ||
100 | |||
101 | out: retl | ||
102 | nop | ||
103 | |||
diff --git a/arch/sparc/lib/copy_in_user.S b/arch/sparc/lib/copy_in_user.S new file mode 100644 index 000000000000..650af3f21f78 --- /dev/null +++ b/arch/sparc/lib/copy_in_user.S | |||
@@ -0,0 +1,119 @@ | |||
1 | /* copy_in_user.S: Copy from userspace to userspace. | ||
2 | * | ||
3 | * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com) | ||
4 | */ | ||
5 | |||
6 | #include <asm/asi.h> | ||
7 | |||
8 | #define XCC xcc | ||
9 | |||
10 | #define EX(x,y) \ | ||
11 | 98: x,y; \ | ||
12 | .section .fixup; \ | ||
13 | .align 4; \ | ||
14 | 99: retl; \ | ||
15 | mov 1, %o0; \ | ||
16 | .section __ex_table,"a";\ | ||
17 | .align 4; \ | ||
18 | .word 98b, 99b; \ | ||
19 | .text; \ | ||
20 | .align 4; | ||
21 | |||
22 | .register %g2,#scratch | ||
23 | .register %g3,#scratch | ||
24 | |||
25 | .text | ||
26 | .align 32 | ||
27 | |||
28 | /* Don't try to get too fancy here; keep it nice and | ||
29 | * simple. This is predominantly used for well-aligned | ||
30 | * small copies in the compat layer. It is also used | ||
31 | * to copy register windows around during thread cloning. | ||
32 | */ | ||
33 | |||
34 | .globl ___copy_in_user | ||
35 | .type ___copy_in_user,#function | ||
36 | ___copy_in_user: /* %o0=dst, %o1=src, %o2=len */ | ||
37 | /* Writing to %asi is _expensive_, so we hardcode it. | ||
38 | * Reading %asi to check for KERNEL_DS is comparatively | ||
39 | * cheap. | ||
40 | */ | ||
41 | rd %asi, %g1 | ||
42 | cmp %g1, ASI_AIUS | ||
43 | bne,pn %icc, memcpy_user_stub | ||
44 | nop | ||
45 | |||
46 | cmp %o2, 0 | ||
47 | be,pn %XCC, 85f | ||
48 | or %o0, %o1, %o3 | ||
49 | cmp %o2, 16 | ||
50 | bleu,a,pn %XCC, 80f | ||
51 | or %o3, %o2, %o3 | ||
52 | |||
53 | /* 16 < len <= 64 */ | ||
54 | andcc %o3, 0x7, %g0 | ||
55 | bne,pn %XCC, 90f | ||
56 | sub %o0, %o1, %o3 | ||
57 | |||
58 | andn %o2, 0x7, %o4 | ||
59 | and %o2, 0x7, %o2 | ||
60 | 1: subcc %o4, 0x8, %o4 | ||
61 | EX(ldxa [%o1] %asi, %o5) | ||
62 | EX(stxa %o5, [%o1 + %o3] ASI_AIUS) | ||
63 | bgu,pt %XCC, 1b | ||
64 | add %o1, 0x8, %o1 | ||
65 | andcc %o2, 0x4, %g0 | ||
66 | be,pt %XCC, 1f | ||
67 | nop | ||
68 | sub %o2, 0x4, %o2 | ||
69 | EX(lduwa [%o1] %asi, %o5) | ||
70 | EX(stwa %o5, [%o1 + %o3] ASI_AIUS) | ||
71 | add %o1, 0x4, %o1 | ||
72 | 1: cmp %o2, 0 | ||
73 | be,pt %XCC, 85f | ||
74 | nop | ||
75 | ba,pt %xcc, 90f | ||
76 | nop | ||
77 | |||
78 | 80: /* 0 < len <= 16 */ | ||
79 | andcc %o3, 0x3, %g0 | ||
80 | bne,pn %XCC, 90f | ||
81 | sub %o0, %o1, %o3 | ||
82 | |||
83 | 82: | ||
84 | subcc %o2, 4, %o2 | ||
85 | EX(lduwa [%o1] %asi, %g1) | ||
86 | EX(stwa %g1, [%o1 + %o3] ASI_AIUS) | ||
87 | bgu,pt %XCC, 82b | ||
88 | add %o1, 4, %o1 | ||
89 | |||
90 | 85: retl | ||
91 | clr %o0 | ||
92 | |||
93 | .align 32 | ||
94 | 90: | ||
95 | subcc %o2, 1, %o2 | ||
96 | EX(lduba [%o1] %asi, %g1) | ||
97 | EX(stba %g1, [%o1 + %o3] ASI_AIUS) | ||
98 | bgu,pt %XCC, 90b | ||
99 | add %o1, 1, %o1 | ||
100 | retl | ||
101 | clr %o0 | ||
102 | |||
103 | .size ___copy_in_user, .-___copy_in_user | ||
104 | |||
105 | /* Act like copy_{to,in}_user(), i.e. return zero instead | ||
106 | * of the original destination pointer. This is invoked | ||
107 | * when copy_{to,in}_user() finds that %asi is kernel space. | ||
108 | */ | ||
109 | .globl memcpy_user_stub | ||
110 | .type memcpy_user_stub,#function | ||
111 | memcpy_user_stub: | ||
112 | save %sp, -192, %sp | ||
113 | mov %i0, %o0 | ||
114 | mov %i1, %o1 | ||
115 | call memcpy | ||
116 | mov %i2, %o2 | ||
117 | ret | ||
118 | restore %g0, %g0, %o0 | ||
119 | .size memcpy_user_stub, .-memcpy_user_stub | ||
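[Editor's note: the entry check is the whole trick: reading %asi is cheap, so when it does not hold ASI_AIUS (the task is running with kernel addressing, KERNEL_DS) the routine branches to memcpy_user_stub, which does a plain memcpy and returns zero. A hedged C sketch; current_asi() and raw_copy_loop() are hypothetical stand-ins for rd %asi and the faulting copy body:

    #include <string.h>

    #define ASI_AIUS 0x11   /* secondary-user ASI, from <asm/asi.h> */

    extern unsigned char current_asi(void);                 /* hypothetical */
    extern unsigned long raw_copy_loop(void *dst, const void *src,
                                       unsigned long len);  /* hypothetical */

    static unsigned long copy_in_user_sketch(void *dst, const void *src,
                                             unsigned long len)
    {
            if (current_asi() != ASI_AIUS) {  /* KERNEL_DS: cannot fault */
                    memcpy(dst, src, len);    /* memcpy_user_stub path */
                    return 0;
            }
            return raw_copy_loop(dst, src, len);  /* nonzero on fault */
    }
]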
diff --git a/arch/sparc/lib/copy_page.S b/arch/sparc/lib/copy_page.S new file mode 100644 index 000000000000..b243d3b606ba --- /dev/null +++ b/arch/sparc/lib/copy_page.S | |||
@@ -0,0 +1,250 @@ | |||
1 | /* copy_page.S: UltraSparc optimized copy page. | ||
2 | * | ||
3 | * Copyright (C) 1996, 1998, 1999, 2000, 2004 David S. Miller (davem@redhat.com) | ||
4 | * Copyright (C) 1997 Jakub Jelinek (jakub@redhat.com) | ||
5 | */ | ||
6 | |||
7 | #include <asm/visasm.h> | ||
8 | #include <asm/thread_info.h> | ||
9 | #include <asm/page.h> | ||
10 | #include <asm/pgtable.h> | ||
11 | #include <asm/spitfire.h> | ||
12 | #include <asm/head.h> | ||
13 | |||
14 | /* What we used to do was lock a TLB entry into a specific | ||
15 | * TLB slot, copy the page with interrupts disabled, then | ||
16 | * restore the original TLB entry. This was great for | ||
17 | * disturbing the TLB as little as possible, but it meant | ||
18 | * we had to keep interrupts disabled for a long time. | ||
19 | * | ||
20 | * Now, we simply use the normal TLB loading mechanism, | ||
21 | * and this makes the cpu choose a slot all by itself. | ||
22 | * Then we do a normal TLB flush on exit. We need only | ||
23 | * disable preemption during the copy. | ||
24 | */ | ||
25 | |||
26 | #define DCACHE_SIZE (PAGE_SIZE * 2) | ||
27 | |||
28 | #if (PAGE_SHIFT == 13) | ||
29 | #define PAGE_SIZE_REM 0x80 | ||
30 | #elif (PAGE_SHIFT == 16) | ||
31 | #define PAGE_SIZE_REM 0x100 | ||
32 | #else | ||
33 | #error Wrong PAGE_SHIFT specified | ||
34 | #endif | ||
35 | |||
36 | #define TOUCH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7) \ | ||
37 | fmovd %reg0, %f48; fmovd %reg1, %f50; \ | ||
38 | fmovd %reg2, %f52; fmovd %reg3, %f54; \ | ||
39 | fmovd %reg4, %f56; fmovd %reg5, %f58; \ | ||
40 | fmovd %reg6, %f60; fmovd %reg7, %f62; | ||
41 | |||
42 | .text | ||
43 | |||
44 | .align 32 | ||
45 | .globl copy_user_page | ||
46 | .type copy_user_page,#function | ||
47 | copy_user_page: /* %o0=dest, %o1=src, %o2=vaddr */ | ||
48 | lduw [%g6 + TI_PRE_COUNT], %o4 | ||
49 | sethi %uhi(PAGE_OFFSET), %g2 | ||
50 | sethi %hi(PAGE_SIZE), %o3 | ||
51 | |||
52 | sllx %g2, 32, %g2 | ||
53 | sethi %hi(PAGE_KERNEL_LOCKED), %g3 | ||
54 | |||
55 | ldx [%g3 + %lo(PAGE_KERNEL_LOCKED)], %g3 | ||
56 | sub %o0, %g2, %g1 ! dest paddr | ||
57 | |||
58 | sub %o1, %g2, %g2 ! src paddr | ||
59 | |||
60 | and %o2, %o3, %o0 ! vaddr D-cache alias bit | ||
61 | or %g1, %g3, %g1 ! dest TTE data | ||
62 | |||
63 | or %g2, %g3, %g2 ! src TTE data | ||
64 | sethi %hi(TLBTEMP_BASE), %o3 | ||
65 | |||
66 | sethi %hi(DCACHE_SIZE), %o1 | ||
67 | add %o0, %o3, %o0 ! dest TTE vaddr | ||
68 | |||
69 | add %o4, 1, %o2 | ||
70 | add %o0, %o1, %o1 ! src TTE vaddr | ||
71 | |||
72 | /* Disable preemption. */ | ||
73 | mov TLB_TAG_ACCESS, %g3 | ||
74 | stw %o2, [%g6 + TI_PRE_COUNT] | ||
75 | |||
76 | /* Load TLB entries. */ | ||
77 | rdpr %pstate, %o2 | ||
78 | wrpr %o2, PSTATE_IE, %pstate | ||
79 | stxa %o0, [%g3] ASI_DMMU | ||
80 | stxa %g1, [%g0] ASI_DTLB_DATA_IN | ||
81 | membar #Sync | ||
82 | stxa %o1, [%g3] ASI_DMMU | ||
83 | stxa %g2, [%g0] ASI_DTLB_DATA_IN | ||
84 | membar #Sync | ||
85 | wrpr %o2, 0x0, %pstate | ||
86 | |||
87 | cheetah_copy_page_insn: | ||
88 | ba,pt %xcc, 9f | ||
89 | nop | ||
90 | |||
91 | 1: | ||
92 | VISEntryHalf | ||
93 | membar #StoreLoad | #StoreStore | #LoadStore | ||
94 | sethi %hi((PAGE_SIZE/64)-2), %o2 | ||
95 | mov %o0, %g1 | ||
96 | prefetch [%o1 + 0x000], #one_read | ||
97 | or %o2, %lo((PAGE_SIZE/64)-2), %o2 | ||
98 | prefetch [%o1 + 0x040], #one_read | ||
99 | prefetch [%o1 + 0x080], #one_read | ||
100 | prefetch [%o1 + 0x0c0], #one_read | ||
101 | ldd [%o1 + 0x000], %f0 | ||
102 | prefetch [%o1 + 0x100], #one_read | ||
103 | ldd [%o1 + 0x008], %f2 | ||
104 | prefetch [%o1 + 0x140], #one_read | ||
105 | ldd [%o1 + 0x010], %f4 | ||
106 | prefetch [%o1 + 0x180], #one_read | ||
107 | fmovd %f0, %f16 | ||
108 | ldd [%o1 + 0x018], %f6 | ||
109 | fmovd %f2, %f18 | ||
110 | ldd [%o1 + 0x020], %f8 | ||
111 | fmovd %f4, %f20 | ||
112 | ldd [%o1 + 0x028], %f10 | ||
113 | fmovd %f6, %f22 | ||
114 | ldd [%o1 + 0x030], %f12 | ||
115 | fmovd %f8, %f24 | ||
116 | ldd [%o1 + 0x038], %f14 | ||
117 | fmovd %f10, %f26 | ||
118 | ldd [%o1 + 0x040], %f0 | ||
119 | 1: ldd [%o1 + 0x048], %f2 | ||
120 | fmovd %f12, %f28 | ||
121 | ldd [%o1 + 0x050], %f4 | ||
122 | fmovd %f14, %f30 | ||
123 | stda %f16, [%o0] ASI_BLK_P | ||
124 | ldd [%o1 + 0x058], %f6 | ||
125 | fmovd %f0, %f16 | ||
126 | ldd [%o1 + 0x060], %f8 | ||
127 | fmovd %f2, %f18 | ||
128 | ldd [%o1 + 0x068], %f10 | ||
129 | fmovd %f4, %f20 | ||
130 | ldd [%o1 + 0x070], %f12 | ||
131 | fmovd %f6, %f22 | ||
132 | ldd [%o1 + 0x078], %f14 | ||
133 | fmovd %f8, %f24 | ||
134 | ldd [%o1 + 0x080], %f0 | ||
135 | prefetch [%o1 + 0x180], #one_read | ||
136 | fmovd %f10, %f26 | ||
137 | subcc %o2, 1, %o2 | ||
138 | add %o0, 0x40, %o0 | ||
139 | bne,pt %xcc, 1b | ||
140 | add %o1, 0x40, %o1 | ||
141 | |||
142 | ldd [%o1 + 0x048], %f2 | ||
143 | fmovd %f12, %f28 | ||
144 | ldd [%o1 + 0x050], %f4 | ||
145 | fmovd %f14, %f30 | ||
146 | stda %f16, [%o0] ASI_BLK_P | ||
147 | ldd [%o1 + 0x058], %f6 | ||
148 | fmovd %f0, %f16 | ||
149 | ldd [%o1 + 0x060], %f8 | ||
150 | fmovd %f2, %f18 | ||
151 | ldd [%o1 + 0x068], %f10 | ||
152 | fmovd %f4, %f20 | ||
153 | ldd [%o1 + 0x070], %f12 | ||
154 | fmovd %f6, %f22 | ||
155 | add %o0, 0x40, %o0 | ||
156 | ldd [%o1 + 0x078], %f14 | ||
157 | fmovd %f8, %f24 | ||
158 | fmovd %f10, %f26 | ||
159 | fmovd %f12, %f28 | ||
160 | fmovd %f14, %f30 | ||
161 | stda %f16, [%o0] ASI_BLK_P | ||
162 | membar #Sync | ||
163 | VISExitHalf | ||
164 | ba,pt %xcc, 5f | ||
165 | nop | ||
166 | |||
167 | 9: | ||
168 | VISEntry | ||
169 | ldub [%g6 + TI_FAULT_CODE], %g3 | ||
170 | mov %o0, %g1 | ||
171 | cmp %g3, 0 | ||
172 | rd %asi, %g3 | ||
173 | be,a,pt %icc, 1f | ||
174 | wr %g0, ASI_BLK_P, %asi | ||
175 | wr %g0, ASI_BLK_COMMIT_P, %asi | ||
176 | 1: ldda [%o1] ASI_BLK_P, %f0 | ||
177 | add %o1, 0x40, %o1 | ||
178 | ldda [%o1] ASI_BLK_P, %f16 | ||
179 | add %o1, 0x40, %o1 | ||
180 | sethi %hi(PAGE_SIZE), %o2 | ||
181 | 1: TOUCH(f0, f2, f4, f6, f8, f10, f12, f14) | ||
182 | ldda [%o1] ASI_BLK_P, %f32 | ||
183 | stda %f48, [%o0] %asi | ||
184 | add %o1, 0x40, %o1 | ||
185 | sub %o2, 0x40, %o2 | ||
186 | add %o0, 0x40, %o0 | ||
187 | TOUCH(f16, f18, f20, f22, f24, f26, f28, f30) | ||
188 | ldda [%o1] ASI_BLK_P, %f0 | ||
189 | stda %f48, [%o0] %asi | ||
190 | add %o1, 0x40, %o1 | ||
191 | sub %o2, 0x40, %o2 | ||
192 | add %o0, 0x40, %o0 | ||
193 | TOUCH(f32, f34, f36, f38, f40, f42, f44, f46) | ||
194 | ldda [%o1] ASI_BLK_P, %f16 | ||
195 | stda %f48, [%o0] %asi | ||
196 | sub %o2, 0x40, %o2 | ||
197 | add %o1, 0x40, %o1 | ||
198 | cmp %o2, PAGE_SIZE_REM | ||
199 | bne,pt %xcc, 1b | ||
200 | add %o0, 0x40, %o0 | ||
201 | #if (PAGE_SHIFT == 16) | ||
202 | TOUCH(f0, f2, f4, f6, f8, f10, f12, f14) | ||
203 | ldda [%o1] ASI_BLK_P, %f32 | ||
204 | stda %f48, [%o0] %asi | ||
205 | add %o1, 0x40, %o1 | ||
206 | sub %o2, 0x40, %o2 | ||
207 | add %o0, 0x40, %o0 | ||
208 | TOUCH(f16, f18, f20, f22, f24, f26, f28, f30) | ||
209 | ldda [%o1] ASI_BLK_P, %f0 | ||
210 | stda %f48, [%o0] %asi | ||
211 | add %o1, 0x40, %o1 | ||
212 | sub %o2, 0x40, %o2 | ||
213 | add %o0, 0x40, %o0 | ||
214 | membar #Sync | ||
215 | stda %f32, [%o0] %asi | ||
216 | add %o0, 0x40, %o0 | ||
217 | stda %f0, [%o0] %asi | ||
218 | #else | ||
219 | membar #Sync | ||
220 | stda %f0, [%o0] %asi | ||
221 | add %o0, 0x40, %o0 | ||
222 | stda %f16, [%o0] %asi | ||
223 | #endif | ||
224 | membar #Sync | ||
225 | wr %g3, 0x0, %asi | ||
226 | VISExit | ||
227 | |||
228 | 5: | ||
229 | stxa %g0, [%g1] ASI_DMMU_DEMAP | ||
230 | membar #Sync | ||
231 | |||
232 | sethi %hi(DCACHE_SIZE), %g2 | ||
233 | stxa %g0, [%g1 + %g2] ASI_DMMU_DEMAP | ||
234 | membar #Sync | ||
235 | |||
236 | retl | ||
237 | stw %o4, [%g6 + TI_PRE_COUNT] | ||
238 | |||
239 | .size copy_user_page, .-copy_user_page | ||
240 | |||
241 | .globl cheetah_patch_copy_page | ||
242 | cheetah_patch_copy_page: | ||
243 | sethi %hi(0x01000000), %o1 ! NOP | ||
244 | sethi %hi(cheetah_copy_page_insn), %o0 | ||
245 | or %o0, %lo(cheetah_copy_page_insn), %o0 | ||
246 | stw %o1, [%o0] | ||
247 | membar #StoreStore | ||
248 | flush %o0 | ||
249 | retl | ||
250 | nop | ||
diff --git a/arch/sparc/lib/csum_copy.S b/arch/sparc/lib/csum_copy.S new file mode 100644 index 000000000000..e566c770a0f6 --- /dev/null +++ b/arch/sparc/lib/csum_copy.S | |||
@@ -0,0 +1,309 @@ | |||
1 | /* csum_copy.S: Checksum+copy code for sparc64 | ||
2 | * | ||
3 | * Copyright (C) 2005 David S. Miller <davem@davemloft.net> | ||
4 | */ | ||
5 | |||
6 | #ifdef __KERNEL__ | ||
7 | #define GLOBAL_SPARE %g7 | ||
8 | #else | ||
9 | #define GLOBAL_SPARE %g5 | ||
10 | #endif | ||
11 | |||
12 | #ifndef EX_LD | ||
13 | #define EX_LD(x) x | ||
14 | #endif | ||
15 | |||
16 | #ifndef EX_ST | ||
17 | #define EX_ST(x) x | ||
18 | #endif | ||
19 | |||
20 | #ifndef EX_RETVAL | ||
21 | #define EX_RETVAL(x) x | ||
22 | #endif | ||
23 | |||
24 | #ifndef LOAD | ||
25 | #define LOAD(type,addr,dest) type [addr], dest | ||
26 | #endif | ||
27 | |||
28 | #ifndef STORE | ||
29 | #define STORE(type,src,addr) type src, [addr] | ||
30 | #endif | ||
31 | |||
32 | #ifndef FUNC_NAME | ||
33 | #define FUNC_NAME csum_partial_copy_nocheck | ||
34 | #endif | ||
35 | |||
36 | .register %g2, #scratch | ||
37 | .register %g3, #scratch | ||
38 | |||
39 | .text | ||
40 | |||
41 | 90: | ||
42 | /* We checked for zero length already, so there must be | ||
43 | * at least one byte. | ||
44 | */ | ||
45 | be,pt %icc, 1f | ||
46 | nop | ||
47 | EX_LD(LOAD(ldub, %o0 + 0x00, %o4)) | ||
48 | add %o0, 1, %o0 | ||
49 | sub %o2, 1, %o2 | ||
50 | EX_ST(STORE(stb, %o4, %o1 + 0x00)) | ||
51 | add %o1, 1, %o1 | ||
52 | 1: andcc %o0, 0x2, %g0 | ||
53 | be,pn %icc, 80f | ||
54 | cmp %o2, 2 | ||
55 | blu,pn %icc, 60f | ||
56 | nop | ||
57 | EX_LD(LOAD(lduh, %o0 + 0x00, %o5)) | ||
58 | add %o0, 2, %o0 | ||
59 | sub %o2, 2, %o2 | ||
60 | EX_ST(STORE(sth, %o5, %o1 + 0x00)) | ||
61 | add %o1, 2, %o1 | ||
62 | ba,pt %xcc, 80f | ||
63 | add %o5, %o4, %o4 | ||
64 | |||
65 | .globl FUNC_NAME | ||
66 | FUNC_NAME: /* %o0=src, %o1=dst, %o2=len, %o3=sum */ | ||
67 | LOAD(prefetch, %o0 + 0x000, #n_reads) | ||
68 | xor %o0, %o1, %g1 | ||
69 | clr %o4 | ||
70 | andcc %g1, 0x3, %g0 | ||
71 | bne,pn %icc, 95f | ||
72 | LOAD(prefetch, %o0 + 0x040, #n_reads) | ||
73 | |||
74 | brz,pn %o2, 70f | ||
75 | andcc %o0, 0x3, %g0 | ||
76 | |||
77 | /* We "remember" in GLOBAL_SPARE whether the lowest bit of | ||
78 | * the address was set, because if it was we have to swap the | ||
79 | * upper and lower 8-bit halves of the sum we calculate. | ||
80 | */ | ||
81 | bne,pn %icc, 90b | ||
82 | andcc %o0, 0x1, GLOBAL_SPARE | ||
83 | |||
84 | 80: | ||
85 | LOAD(prefetch, %o0 + 0x080, #n_reads) | ||
86 | andncc %o2, 0x3f, %g3 | ||
87 | |||
88 | LOAD(prefetch, %o0 + 0x0c0, #n_reads) | ||
89 | sub %o2, %g3, %o2 | ||
90 | brz,pn %g3, 2f | ||
91 | LOAD(prefetch, %o0 + 0x100, #n_reads) | ||
92 | |||
93 | /* So that we don't need to use the non-pairing | ||
94 | * add-with-carry instructions, we accumulate 32-bit | ||
95 | * values into a 64-bit register. At the end of the | ||
96 | * loop we fold it down to 32 bits, and then to 16. | ||
97 | */ | ||
98 | ba,pt %xcc, 1f | ||
99 | LOAD(prefetch, %o0 + 0x140, #n_reads) | ||
100 | |||
101 | .align 32 | ||
102 | 1: EX_LD(LOAD(lduw, %o0 + 0x00, %o5)) | ||
103 | EX_LD(LOAD(lduw, %o0 + 0x04, %g1)) | ||
104 | EX_LD(LOAD(lduw, %o0 + 0x08, %g2)) | ||
105 | add %o4, %o5, %o4 | ||
106 | EX_ST(STORE(stw, %o5, %o1 + 0x00)) | ||
107 | EX_LD(LOAD(lduw, %o0 + 0x0c, %o5)) | ||
108 | add %o4, %g1, %o4 | ||
109 | EX_ST(STORE(stw, %g1, %o1 + 0x04)) | ||
110 | EX_LD(LOAD(lduw, %o0 + 0x10, %g1)) | ||
111 | add %o4, %g2, %o4 | ||
112 | EX_ST(STORE(stw, %g2, %o1 + 0x08)) | ||
113 | EX_LD(LOAD(lduw, %o0 + 0x14, %g2)) | ||
114 | add %o4, %o5, %o4 | ||
115 | EX_ST(STORE(stw, %o5, %o1 + 0x0c)) | ||
116 | EX_LD(LOAD(lduw, %o0 + 0x18, %o5)) | ||
117 | add %o4, %g1, %o4 | ||
118 | EX_ST(STORE(stw, %g1, %o1 + 0x10)) | ||
119 | EX_LD(LOAD(lduw, %o0 + 0x1c, %g1)) | ||
120 | add %o4, %g2, %o4 | ||
121 | EX_ST(STORE(stw, %g2, %o1 + 0x14)) | ||
122 | EX_LD(LOAD(lduw, %o0 + 0x20, %g2)) | ||
123 | add %o4, %o5, %o4 | ||
124 | EX_ST(STORE(stw, %o5, %o1 + 0x18)) | ||
125 | EX_LD(LOAD(lduw, %o0 + 0x24, %o5)) | ||
126 | add %o4, %g1, %o4 | ||
127 | EX_ST(STORE(stw, %g1, %o1 + 0x1c)) | ||
128 | EX_LD(LOAD(lduw, %o0 + 0x28, %g1)) | ||
129 | add %o4, %g2, %o4 | ||
130 | EX_ST(STORE(stw, %g2, %o1 + 0x20)) | ||
131 | EX_LD(LOAD(lduw, %o0 + 0x2c, %g2)) | ||
132 | add %o4, %o5, %o4 | ||
133 | EX_ST(STORE(stw, %o5, %o1 + 0x24)) | ||
134 | EX_LD(LOAD(lduw, %o0 + 0x30, %o5)) | ||
135 | add %o4, %g1, %o4 | ||
136 | EX_ST(STORE(stw, %g1, %o1 + 0x28)) | ||
137 | EX_LD(LOAD(lduw, %o0 + 0x34, %g1)) | ||
138 | add %o4, %g2, %o4 | ||
139 | EX_ST(STORE(stw, %g2, %o1 + 0x2c)) | ||
140 | EX_LD(LOAD(lduw, %o0 + 0x38, %g2)) | ||
141 | add %o4, %o5, %o4 | ||
142 | EX_ST(STORE(stw, %o5, %o1 + 0x30)) | ||
143 | EX_LD(LOAD(lduw, %o0 + 0x3c, %o5)) | ||
144 | add %o4, %g1, %o4 | ||
145 | EX_ST(STORE(stw, %g1, %o1 + 0x34)) | ||
146 | LOAD(prefetch, %o0 + 0x180, #n_reads) | ||
147 | add %o4, %g2, %o4 | ||
148 | EX_ST(STORE(stw, %g2, %o1 + 0x38)) | ||
149 | subcc %g3, 0x40, %g3 | ||
150 | add %o0, 0x40, %o0 | ||
151 | add %o4, %o5, %o4 | ||
152 | EX_ST(STORE(stw, %o5, %o1 + 0x3c)) | ||
153 | bne,pt %icc, 1b | ||
154 | add %o1, 0x40, %o1 | ||
155 | |||
156 | 2: and %o2, 0x3c, %g3 | ||
157 | brz,pn %g3, 2f | ||
158 | sub %o2, %g3, %o2 | ||
159 | 1: EX_LD(LOAD(lduw, %o0 + 0x00, %o5)) | ||
160 | subcc %g3, 0x4, %g3 | ||
161 | add %o0, 0x4, %o0 | ||
162 | add %o4, %o5, %o4 | ||
163 | EX_ST(STORE(stw, %o5, %o1 + 0x00)) | ||
164 | bne,pt %icc, 1b | ||
165 | add %o1, 0x4, %o1 | ||
166 | |||
167 | 2: | ||
168 | /* fold 64-->32 */ | ||
169 | srlx %o4, 32, %o5 | ||
170 | srl %o4, 0, %o4 | ||
171 | add %o4, %o5, %o4 | ||
172 | srlx %o4, 32, %o5 | ||
173 | srl %o4, 0, %o4 | ||
174 | add %o4, %o5, %o4 | ||
175 | |||
176 | /* fold 32-->16 */ | ||
177 | sethi %hi(0xffff0000), %g1 | ||
178 | srl %o4, 16, %o5 | ||
179 | andn %o4, %g1, %g2 | ||
180 | add %o5, %g2, %o4 | ||
181 | srl %o4, 16, %o5 | ||
182 | andn %o4, %g1, %g2 | ||
183 | add %o5, %g2, %o4 | ||
184 | |||
185 | 60: | ||
186 | /* %o4 has the 16-bit sum we have calculated so-far. */ | ||
187 | cmp %o2, 2 | ||
188 | blu,pt %icc, 1f | ||
189 | nop | ||
190 | EX_LD(LOAD(lduh, %o0 + 0x00, %o5)) | ||
191 | sub %o2, 2, %o2 | ||
192 | add %o0, 2, %o0 | ||
193 | add %o4, %o5, %o4 | ||
194 | EX_ST(STORE(sth, %o5, %o1 + 0x00)) | ||
195 | add %o1, 0x2, %o1 | ||
196 | 1: brz,pt %o2, 1f | ||
197 | nop | ||
198 | EX_LD(LOAD(ldub, %o0 + 0x00, %o5)) | ||
199 | sub %o2, 1, %o2 | ||
200 | add %o0, 1, %o0 | ||
201 | EX_ST(STORE(stb, %o5, %o1 + 0x00)) | ||
202 | sllx %o5, 8, %o5 | ||
203 | add %o1, 1, %o1 | ||
204 | add %o4, %o5, %o4 | ||
205 | 1: | ||
206 | /* fold 32-->16 */ | ||
207 | sethi %hi(0xffff0000), %g1 | ||
208 | srl %o4, 16, %o5 | ||
209 | andn %o4, %g1, %g2 | ||
210 | add %o5, %g2, %o4 | ||
211 | srl %o4, 16, %o5 | ||
212 | andn %o4, %g1, %g2 | ||
213 | add %o5, %g2, %o4 | ||
214 | |||
215 | 1: brz,pt GLOBAL_SPARE, 1f | ||
216 | nop | ||
217 | |||
218 | /* We started with an odd byte, byte-swap the result. */ | ||
219 | srl %o4, 8, %o5 | ||
220 | and %o4, 0xff, %g1 | ||
221 | sll %g1, 8, %g1 | ||
222 | or %o5, %g1, %o4 | ||
223 | |||
224 | 1: addcc %o3, %o4, %o3 | ||
225 | addc %g0, %o3, %o3 | ||
226 | |||
227 | 70: | ||
228 | retl | ||
229 | srl %o3, 0, %o0 | ||
230 | |||
231 | 95: mov 0, GLOBAL_SPARE | ||
232 | brlez,pn %o2, 4f | ||
233 | andcc %o0, 1, %o5 | ||
234 | be,a,pt %icc, 1f | ||
235 | srl %o2, 1, %g1 | ||
236 | sub %o2, 1, %o2 | ||
237 | EX_LD(LOAD(ldub, %o0, GLOBAL_SPARE)) | ||
238 | add %o0, 1, %o0 | ||
239 | EX_ST(STORE(stb, GLOBAL_SPARE, %o1)) | ||
240 | srl %o2, 1, %g1 | ||
241 | add %o1, 1, %o1 | ||
242 | 1: brz,a,pn %g1, 3f | ||
243 | andcc %o2, 1, %g0 | ||
244 | andcc %o0, 2, %g0 | ||
245 | be,a,pt %icc, 1f | ||
246 | srl %g1, 1, %g1 | ||
247 | EX_LD(LOAD(lduh, %o0, %o4)) | ||
248 | sub %o2, 2, %o2 | ||
249 | srl %o4, 8, %g2 | ||
250 | sub %g1, 1, %g1 | ||
251 | EX_ST(STORE(stb, %g2, %o1)) | ||
252 | add %o4, GLOBAL_SPARE, GLOBAL_SPARE | ||
253 | EX_ST(STORE(stb, %o4, %o1 + 1)) | ||
254 | add %o0, 2, %o0 | ||
255 | srl %g1, 1, %g1 | ||
256 | add %o1, 2, %o1 | ||
257 | 1: brz,a,pn %g1, 2f | ||
258 | andcc %o2, 2, %g0 | ||
259 | EX_LD(LOAD(lduw, %o0, %o4)) | ||
260 | 5: srl %o4, 24, %g2 | ||
261 | srl %o4, 16, %g3 | ||
262 | EX_ST(STORE(stb, %g2, %o1)) | ||
263 | srl %o4, 8, %g2 | ||
264 | EX_ST(STORE(stb, %g3, %o1 + 1)) | ||
265 | add %o0, 4, %o0 | ||
266 | EX_ST(STORE(stb, %g2, %o1 + 2)) | ||
267 | addcc %o4, GLOBAL_SPARE, GLOBAL_SPARE | ||
268 | EX_ST(STORE(stb, %o4, %o1 + 3)) | ||
269 | addc GLOBAL_SPARE, %g0, GLOBAL_SPARE | ||
270 | add %o1, 4, %o1 | ||
271 | subcc %g1, 1, %g1 | ||
272 | bne,a,pt %icc, 5b | ||
273 | EX_LD(LOAD(lduw, %o0, %o4)) | ||
274 | sll GLOBAL_SPARE, 16, %g2 | ||
275 | srl GLOBAL_SPARE, 16, GLOBAL_SPARE | ||
276 | srl %g2, 16, %g2 | ||
277 | andcc %o2, 2, %g0 | ||
278 | add %g2, GLOBAL_SPARE, GLOBAL_SPARE | ||
279 | 2: be,a,pt %icc, 3f | ||
280 | andcc %o2, 1, %g0 | ||
281 | EX_LD(LOAD(lduh, %o0, %o4)) | ||
282 | andcc %o2, 1, %g0 | ||
283 | srl %o4, 8, %g2 | ||
284 | add %o0, 2, %o0 | ||
285 | EX_ST(STORE(stb, %g2, %o1)) | ||
286 | add GLOBAL_SPARE, %o4, GLOBAL_SPARE | ||
287 | EX_ST(STORE(stb, %o4, %o1 + 1)) | ||
288 | add %o1, 2, %o1 | ||
289 | 3: be,a,pt %icc, 1f | ||
290 | sll GLOBAL_SPARE, 16, %o4 | ||
291 | EX_LD(LOAD(ldub, %o0, %g2)) | ||
292 | sll %g2, 8, %o4 | ||
293 | EX_ST(STORE(stb, %g2, %o1)) | ||
294 | add GLOBAL_SPARE, %o4, GLOBAL_SPARE | ||
295 | sll GLOBAL_SPARE, 16, %o4 | ||
296 | 1: addcc %o4, GLOBAL_SPARE, GLOBAL_SPARE | ||
297 | srl GLOBAL_SPARE, 16, %o4 | ||
298 | addc %g0, %o4, GLOBAL_SPARE | ||
299 | brz,pt %o5, 4f | ||
300 | srl GLOBAL_SPARE, 8, %o4 | ||
301 | and GLOBAL_SPARE, 0xff, %g2 | ||
302 | and %o4, 0xff, %o4 | ||
303 | sll %g2, 8, %g2 | ||
304 | or %g2, %o4, GLOBAL_SPARE | ||
305 | 4: addcc %o3, GLOBAL_SPARE, %o3 | ||
306 | addc %g0, %o3, %o0 | ||
307 | retl | ||
308 | srl %o0, 0, %o0 | ||
309 | .size FUNC_NAME, .-FUNC_NAME | ||
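[Editor's note: the byte-swap block near the end handles the case flagged in GLOBAL_SPARE: when the copy started at an odd address, each 16-bit word entered the accumulator with its bytes exchanged, so the folded 16-bit result needs its two halves swapped back. In C:

    /* Sketch of the odd-start fixup applied to the folded 16-bit sum. */
    static unsigned int fixup_odd_start(unsigned int sum16, int started_odd)
    {
            if (started_odd)
                    sum16 = ((sum16 >> 8) & 0xff) | ((sum16 & 0xff) << 8);
            return sum16 & 0xffff;
    }
]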
diff --git a/arch/sparc/lib/csum_copy_from_user.S b/arch/sparc/lib/csum_copy_from_user.S new file mode 100644 index 000000000000..a22eddbe5dba --- /dev/null +++ b/arch/sparc/lib/csum_copy_from_user.S | |||
@@ -0,0 +1,21 @@ | |||
1 | /* csum_copy_from_user.S: Checksum+copy from userspace. | ||
2 | * | ||
3 | * Copyright (C) 2005 David S. Miller (davem@davemloft.net) | ||
4 | */ | ||
5 | |||
6 | #define EX_LD(x) \ | ||
7 | 98: x; \ | ||
8 | .section .fixup; \ | ||
9 | .align 4; \ | ||
10 | 99: retl; \ | ||
11 | mov -1, %o0; \ | ||
12 | .section __ex_table,"a";\ | ||
13 | .align 4; \ | ||
14 | .word 98b, 99b; \ | ||
15 | .text; \ | ||
16 | .align 4; | ||
17 | |||
18 | #define FUNC_NAME __csum_partial_copy_from_user | ||
19 | #define LOAD(type,addr,dest) type##a [addr] %asi, dest | ||
20 | |||
21 | #include "csum_copy.S" | ||
diff --git a/arch/sparc/lib/csum_copy_to_user.S b/arch/sparc/lib/csum_copy_to_user.S new file mode 100644 index 000000000000..d5b12f441f02 --- /dev/null +++ b/arch/sparc/lib/csum_copy_to_user.S | |||
@@ -0,0 +1,21 @@ | |||
1 | /* csum_copy_to_user.S: Checksum+copy to userspace. | ||
2 | * | ||
3 | * Copyright (C) 2005 David S. Miller (davem@davemloft.net) | ||
4 | */ | ||
5 | |||
6 | #define EX_ST(x) \ | ||
7 | 98: x; \ | ||
8 | .section .fixup; \ | ||
9 | .align 4; \ | ||
10 | 99: retl; \ | ||
11 | mov -1, %o0; \ | ||
12 | .section __ex_table,"a";\ | ||
13 | .align 4; \ | ||
14 | .word 98b, 99b; \ | ||
15 | .text; \ | ||
16 | .align 4; | ||
17 | |||
18 | #define FUNC_NAME __csum_partial_copy_to_user | ||
19 | #define STORE(type,src,addr) type##a src, [addr] %asi | ||
20 | |||
21 | #include "csum_copy.S" | ||
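[Editor's note: both wrapper files show the template pattern: define FUNC_NAME plus a faulting LOAD or STORE, then include csum_copy.S, so a single body yields the nocheck, from-user, and to-user variants. A deliberately minimal C-preprocessor illustration of the same idiom; both "files" below are hypothetical and shown inline:

    /* sum_template.h -- the shared body; the includer picks the name
     * and the accessor, just as the .S wrappers above do. */
    static int FUNC_NAME(const int *buf, int n)
    {
            int i, s = 0;
            for (i = 0; i < n; i++)
                    s += LOAD(buf + i);
            return s;
    }

    /* a wrapper, in the style of csum_copy_from_user.S: */
    #define FUNC_NAME sum_from_user
    #define LOAD(p)   checked_load(p)   /* hypothetical faulting load */
    #include "sum_template.h"
]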
diff --git a/arch/sparc/lib/ipcsum.S b/arch/sparc/lib/ipcsum.S new file mode 100644 index 000000000000..58ca5b9a8778 --- /dev/null +++ b/arch/sparc/lib/ipcsum.S | |||
@@ -0,0 +1,34 @@ | |||
1 | .text | ||
2 | .align 32 | ||
3 | .globl ip_fast_csum | ||
4 | .type ip_fast_csum,#function | ||
5 | ip_fast_csum: /* %o0 = iph, %o1 = ihl */ | ||
6 | sub %o1, 4, %g7 | ||
7 | lduw [%o0 + 0x00], %o2 | ||
8 | lduw [%o0 + 0x04], %g2 | ||
9 | lduw [%o0 + 0x08], %g3 | ||
10 | addcc %g2, %o2, %o2 | ||
11 | lduw [%o0 + 0x0c], %g2 | ||
12 | addccc %g3, %o2, %o2 | ||
13 | lduw [%o0 + 0x10], %g3 | ||
14 | |||
15 | addccc %g2, %o2, %o2 | ||
16 | addc %o2, %g0, %o2 | ||
17 | 1: addcc %g3, %o2, %o2 | ||
18 | add %o0, 4, %o0 | ||
19 | addccc %o2, %g0, %o2 | ||
20 | subcc %g7, 1, %g7 | ||
21 | be,a,pt %icc, 2f | ||
22 | sll %o2, 16, %g2 | ||
23 | |||
24 | lduw [%o0 + 0x10], %g3 | ||
25 | ba,pt %xcc, 1b | ||
26 | nop | ||
27 | 2: addcc %o2, %g2, %g2 | ||
28 | srl %g2, 16, %o2 | ||
29 | addc %o2, %g0, %o2 | ||
30 | xnor %g0, %o2, %o2 | ||
31 | set 0xffff, %o1 | ||
32 | retl | ||
33 | and %o2, %o1, %o0 | ||
34 | .size ip_fast_csum, .-ip_fast_csum | ||
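[Editor's note: ip_fast_csum sums the header as 32-bit words with end-around carry (the addcc/addccc/addc chain), folds the result to 16 bits and complements it. A C reference of the same computation; ihl is the header length in 32-bit words, and the words are consumed in memory order just as the big-endian loads above do:

    #include <stdint.h>

    /* Sketch of ip_fast_csum. */
    static uint16_t ip_fast_csum_sketch(const void *iph, unsigned int ihl)
    {
            const uint32_t *p = iph;
            uint64_t sum = 0;
            unsigned int i;

            for (i = 0; i < ihl; i++)
                    sum += p[i];                     /* the addcc chain */

            sum = (sum & 0xffffffff) + (sum >> 32);  /* fold the carries */
            sum = (sum & 0xffff) + (sum >> 16);
            sum = (sum & 0xffff) + (sum >> 16);
            return (uint16_t)~sum;                   /* the final xnor */
    }
]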
diff --git a/arch/sparc/lib/mcount.S b/arch/sparc/lib/mcount.S new file mode 100644 index 000000000000..7ce9c65f3592 --- /dev/null +++ b/arch/sparc/lib/mcount.S | |||
@@ -0,0 +1,143 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2000 Anton Blanchard (anton@linuxcare.com) | ||
3 | * | ||
4 | * This file implements mcount(), which is used to collect profiling data. | ||
5 | * This can also be tweaked for kernel stack overflow detection. | ||
6 | */ | ||
7 | |||
8 | #include <linux/linkage.h> | ||
9 | |||
10 | #include <asm/ptrace.h> | ||
11 | #include <asm/thread_info.h> | ||
12 | |||
13 | /* | ||
14 | * This is the main variant and is called by C code. GCC's -pg option | ||
15 | * automatically instruments every C function with a call to this. | ||
16 | */ | ||
17 | |||
18 | #ifdef CONFIG_STACK_DEBUG | ||
19 | |||
20 | #define OVSTACKSIZE 4096 /* let's hope this is enough */ | ||
21 | |||
22 | .data | ||
23 | .align 8 | ||
24 | panicstring: | ||
25 | .asciz "Stack overflow\n" | ||
26 | .align 8 | ||
27 | ovstack: | ||
28 | .skip OVSTACKSIZE | ||
29 | #endif | ||
30 | .text | ||
31 | .align 32 | ||
32 | .globl _mcount | ||
33 | .type _mcount,#function | ||
34 | .globl mcount | ||
35 | .type mcount,#function | ||
36 | _mcount: | ||
37 | mcount: | ||
38 | #ifdef CONFIG_STACK_DEBUG | ||
39 | /* | ||
40 | * Check whether %sp is dangerously low. | ||
41 | */ | ||
42 | ldub [%g6 + TI_FPDEPTH], %g1 | ||
43 | srl %g1, 1, %g3 | ||
44 | add %g3, 1, %g3 | ||
45 | sllx %g3, 8, %g3 ! each fpregs frame is 256b | ||
46 | add %g3, 192, %g3 | ||
47 | add %g6, %g3, %g3 ! where does task_struct+frame end? | ||
48 | sub %g3, STACK_BIAS, %g3 | ||
49 | cmp %sp, %g3 | ||
50 | bg,pt %xcc, 1f | ||
51 | nop | ||
52 | lduh [%g6 + TI_CPU], %g1 | ||
53 | sethi %hi(hardirq_stack), %g3 | ||
54 | or %g3, %lo(hardirq_stack), %g3 | ||
55 | sllx %g1, 3, %g1 | ||
56 | ldx [%g3 + %g1], %g7 | ||
57 | sub %g7, STACK_BIAS, %g7 | ||
58 | cmp %sp, %g7 | ||
59 | bleu,pt %xcc, 2f | ||
60 | sethi %hi(THREAD_SIZE), %g3 | ||
61 | add %g7, %g3, %g7 | ||
62 | cmp %sp, %g7 | ||
63 | blu,pn %xcc, 1f | ||
64 | 2: sethi %hi(softirq_stack), %g3 | ||
65 | or %g3, %lo(softirq_stack), %g3 | ||
66 | ldx [%g3 + %g1], %g7 | ||
67 | cmp %sp, %g7 | ||
68 | bleu,pt %xcc, 2f | ||
69 | sethi %hi(THREAD_SIZE), %g3 | ||
70 | add %g7, %g3, %g7 | ||
71 | cmp %sp, %g7 | ||
72 | blu,pn %xcc, 1f | ||
73 | nop | ||
74 | /* If we are already on ovstack, don't hop onto it | ||
75 | * again; we are already trying to output the stack | ||
76 | * overflow message. | ||
77 | */ | ||
78 | sethi %hi(ovstack), %g7 ! can't move to panic stack fast enough | ||
79 | or %g7, %lo(ovstack), %g7 | ||
80 | add %g7, OVSTACKSIZE, %g3 | ||
81 | sub %g3, STACK_BIAS + 192, %g3 | ||
82 | sub %g7, STACK_BIAS, %g7 | ||
83 | cmp %sp, %g7 | ||
84 | blu,pn %xcc, 2f | ||
85 | cmp %sp, %g3 | ||
86 | bleu,pn %xcc, 1f | ||
87 | nop | ||
88 | 2: mov %g3, %sp | ||
89 | sethi %hi(panicstring), %g3 | ||
90 | call prom_printf | ||
91 | or %g3, %lo(panicstring), %o0 | ||
92 | call prom_halt | ||
93 | nop | ||
94 | 1: | ||
95 | #endif | ||
96 | #ifdef CONFIG_FUNCTION_TRACER | ||
97 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
98 | mov %o7, %o0 | ||
99 | .globl mcount_call | ||
100 | mcount_call: | ||
101 | call ftrace_stub | ||
102 | mov %o0, %o7 | ||
103 | #else | ||
104 | sethi %hi(ftrace_trace_function), %g1 | ||
105 | sethi %hi(ftrace_stub), %g2 | ||
106 | ldx [%g1 + %lo(ftrace_trace_function)], %g1 | ||
107 | or %g2, %lo(ftrace_stub), %g2 | ||
108 | cmp %g1, %g2 | ||
109 | be,pn %icc, 1f | ||
110 | mov %i7, %o1 | ||
111 | jmpl %g1, %g0 | ||
112 | mov %o7, %o0 | ||
113 | /* not reached */ | ||
114 | 1: | ||
115 | #endif | ||
116 | #endif | ||
117 | retl | ||
118 | nop | ||
119 | .size _mcount,.-_mcount | ||
120 | .size mcount,.-mcount | ||
121 | |||
122 | #ifdef CONFIG_FUNCTION_TRACER | ||
123 | .globl ftrace_stub | ||
124 | .type ftrace_stub,#function | ||
125 | ftrace_stub: | ||
126 | retl | ||
127 | nop | ||
128 | .size ftrace_stub,.-ftrace_stub | ||
129 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
130 | .globl ftrace_caller | ||
131 | .type ftrace_caller,#function | ||
132 | ftrace_caller: | ||
133 | mov %i7, %o1 | ||
134 | mov %o7, %o0 | ||
135 | .globl ftrace_call | ||
136 | ftrace_call: | ||
137 | call ftrace_stub | ||
138 | mov %o0, %o7 | ||
139 | retl | ||
140 | nop | ||
141 | .size ftrace_caller,.-ftrace_caller | ||
142 | #endif | ||
143 | #endif | ||
diff --git a/arch/sparc/lib/memcmp_64.S b/arch/sparc/lib/memcmp_64.S new file mode 100644 index 000000000000..d3fdaa898566 --- /dev/null +++ b/arch/sparc/lib/memcmp_64.S | |||
@@ -0,0 +1,28 @@ | |||
1 | /* | ||
2 | * Sparc64 optimized memcmp code. | ||
3 | * | ||
4 | * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) | ||
5 | * Copyright (C) 2000 David S. Miller (davem@redhat.com) | ||
6 | */ | ||
7 | |||
8 | .text | ||
9 | .align 32 | ||
10 | .globl __memcmp, memcmp | ||
11 | __memcmp: | ||
12 | memcmp: | ||
13 | cmp %o2, 0 ! IEU1 Group | ||
14 | loop: be,pn %icc, ret_0 ! CTI | ||
15 | nop ! IEU0 | ||
16 | ldub [%o0], %g7 ! LSU Group | ||
17 | ldub [%o1], %g3 ! LSU Group | ||
18 | sub %o2, 1, %o2 ! IEU0 | ||
19 | add %o0, 1, %o0 ! IEU1 | ||
20 | add %o1, 1, %o1 ! IEU0 Group | ||
21 | subcc %g7, %g3, %g3 ! IEU1 Group | ||
22 | be,pt %icc, loop ! CTI | ||
23 | cmp %o2, 0 ! IEU1 Group | ||
24 | |||
25 | ret_n0: retl | ||
26 | mov %g3, %o0 | ||
27 | ret_0: retl | ||
28 | mov 0, %o0 | ||
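[Editor's note: the instruction-level annotations (IEU/LSU/CTI groups) describe how the loop schedules on the UltraSPARC pipeline; functionally it is the plain byte-at-a-time compare below, returning the difference of the first mismatching bytes:

    #include <stddef.h>

    /* C equivalent of the loop above. */
    static int memcmp_sketch(const void *a, const void *b, size_t n)
    {
            const unsigned char *p = a, *q = b;

            while (n--) {
                    int d = *p++ - *q++;   /* subcc %g7, %g3, %g3 */
                    if (d)
                            return d;      /* ret_n0 */
            }
            return 0;                      /* ret_0 */
    }
]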
diff --git a/arch/sparc/lib/memmove.S b/arch/sparc/lib/memmove.S new file mode 100644 index 000000000000..97395802c23c --- /dev/null +++ b/arch/sparc/lib/memmove.S | |||
@@ -0,0 +1,31 @@ | |||
1 | /* memmove.S: Simple memmove implementation. | ||
2 | * | ||
3 | * Copyright (C) 1997, 2004 David S. Miller (davem@redhat.com) | ||
4 | * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek (jj@ultra.linux.cz) | ||
5 | */ | ||
6 | |||
7 | .text | ||
8 | .align 32 | ||
9 | .globl memmove | ||
10 | .type memmove,#function | ||
11 | memmove: /* o0=dst o1=src o2=len */ | ||
12 | mov %o0, %g1 | ||
13 | cmp %o0, %o1 | ||
14 | bleu,pt %xcc, memcpy | ||
15 | add %o1, %o2, %g7 | ||
16 | cmp %g7, %o0 | ||
17 | bleu,pt %xcc, memcpy | ||
18 | add %o0, %o2, %o5 | ||
19 | sub %g7, 1, %o1 | ||
20 | |||
21 | sub %o5, 1, %o0 | ||
22 | 1: ldub [%o1], %g7 | ||
23 | subcc %o2, 1, %o2 | ||
24 | sub %o1, 1, %o1 | ||
25 | stb %g7, [%o0] | ||
26 | bne,pt %icc, 1b | ||
27 | sub %o0, 1, %o0 | ||
28 | |||
29 | retl | ||
30 | mov %g1, %o0 | ||
31 | .size memmove, .-memmove | ||
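memmove above only falls back to a byte-by-byte backward copy when the regions overlap destructively; if dst <= src, or src + len <= dst, a forward copy is safe and it branches straight to memcpy. The same decision in illustrative C (memmove_ref is a hypothetical name; memcpy is assumed to copy forwards):

    #include <stddef.h>
    #include <string.h>

    static void *memmove_ref(void *dst, const void *src, size_t len)
    {
            unsigned char *d = dst;
            const unsigned char *s = src;

            if (d <= s || s + len <= d)     /* forward copy is safe */
                    return memcpy(dst, src, len);

            d += len;                       /* overlapping: copy backwards */
            s += len;
            while (len--)
                    *--d = *--s;
            return dst;
    }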
diff --git a/arch/sparc/lib/memscan_64.S b/arch/sparc/lib/memscan_64.S new file mode 100644 index 000000000000..5686dfa5dc15 --- /dev/null +++ b/arch/sparc/lib/memscan_64.S | |||
@@ -0,0 +1,129 @@ | |||
1 | /* | ||
2 | * memscan.S: Optimized memscan for Sparc64. | ||
3 | * | ||
4 | * Copyright (C) 1997,1998 Jakub Jelinek (jj@ultra.linux.cz) | ||
5 | * Copyright (C) 1998 David S. Miller (davem@redhat.com) | ||
6 | */ | ||
7 | |||
8 | #define HI_MAGIC 0x8080808080808080 | ||
9 | #define LO_MAGIC 0x0101010101010101 | ||
10 | #define ASI_PL 0x88 | ||
11 | |||
12 | .text | ||
13 | .align 32 | ||
14 | .globl __memscan_zero, __memscan_generic | ||
15 | .globl memscan | ||
16 | |||
17 | __memscan_zero: | ||
18 | /* %o0 = bufp, %o1 = size */ | ||
19 | brlez,pn %o1, szzero | ||
20 | andcc %o0, 7, %g0 | ||
21 | be,pt %icc, we_are_aligned | ||
22 | sethi %hi(HI_MAGIC), %o4 | ||
23 | ldub [%o0], %o5 | ||
24 | 1: subcc %o1, 1, %o1 | ||
25 | brz,pn %o5, 10f | ||
26 | add %o0, 1, %o0 | ||
27 | |||
28 | be,pn %xcc, szzero | ||
29 | andcc %o0, 7, %g0 | ||
30 | bne,a,pn %icc, 1b | ||
31 | ldub [%o0], %o5 | ||
32 | we_are_aligned: | ||
33 | ldxa [%o0] ASI_PL, %o5 | ||
34 | or %o4, %lo(HI_MAGIC), %o3 | ||
35 | sllx %o3, 32, %o4 | ||
36 | or %o4, %o3, %o3 | ||
37 | |||
38 | srlx %o3, 7, %o2 | ||
39 | msloop: | ||
40 | sub %o1, 8, %o1 | ||
41 | add %o0, 8, %o0 | ||
42 | sub %o5, %o2, %o4 | ||
43 | xor %o4, %o5, %o4 | ||
44 | andcc %o4, %o3, %g3 | ||
45 | bne,pn %xcc, check_bytes | ||
46 | srlx %o4, 32, %g3 | ||
47 | |||
48 | brgz,a,pt %o1, msloop | ||
49 | ldxa [%o0] ASI_PL, %o5 | ||
50 | check_bytes: | ||
51 | bne,a,pn %icc, 2f | ||
52 | andcc %o5, 0xff, %g0 | ||
53 | add %o0, -5, %g2 | ||
54 | ba,pt %xcc, 3f | ||
55 | srlx %o5, 32, %g7 | ||
56 | |||
57 | 2: srlx %o5, 8, %g7 | ||
58 | be,pn %icc, 1f | ||
59 | add %o0, -8, %g2 | ||
60 | andcc %g7, 0xff, %g0 | ||
61 | srlx %g7, 8, %g7 | ||
62 | be,pn %icc, 1f | ||
63 | inc %g2 | ||
64 | andcc %g7, 0xff, %g0 | ||
65 | |||
66 | srlx %g7, 8, %g7 | ||
67 | be,pn %icc, 1f | ||
68 | inc %g2 | ||
69 | andcc %g7, 0xff, %g0 | ||
70 | srlx %g7, 8, %g7 | ||
71 | be,pn %icc, 1f | ||
72 | inc %g2 | ||
73 | andcc %g3, %o3, %g0 | ||
74 | |||
75 | be,a,pn %icc, 2f | ||
76 | mov %o0, %g2 | ||
77 | 3: andcc %g7, 0xff, %g0 | ||
78 | srlx %g7, 8, %g7 | ||
79 | be,pn %icc, 1f | ||
80 | inc %g2 | ||
81 | andcc %g7, 0xff, %g0 | ||
82 | srlx %g7, 8, %g7 | ||
83 | |||
84 | be,pn %icc, 1f | ||
85 | inc %g2 | ||
86 | andcc %g7, 0xff, %g0 | ||
87 | srlx %g7, 8, %g7 | ||
88 | be,pn %icc, 1f | ||
89 | inc %g2 | ||
90 | andcc %g7, 0xff, %g0 | ||
91 | srlx %g7, 8, %g7 | ||
92 | |||
93 | be,pn %icc, 1f | ||
94 | inc %g2 | ||
95 | 2: brgz,a,pt %o1, msloop | ||
96 | ldxa [%o0] ASI_PL, %o5 | ||
97 | inc %g2 | ||
98 | 1: add %o0, %o1, %o0 | ||
99 | cmp %g2, %o0 | ||
100 | retl | ||
101 | |||
102 | movle %xcc, %g2, %o0 | ||
103 | 10: retl | ||
104 | sub %o0, 1, %o0 | ||
105 | szzero: retl | ||
106 | nop | ||
107 | |||
108 | memscan: | ||
109 | __memscan_generic: | ||
110 | /* %o0 = addr, %o1 = c, %o2 = size */ | ||
111 | brz,pn %o2, 3f | ||
112 | add %o0, %o2, %o3 | ||
113 | ldub [%o0], %o5 | ||
114 | sub %g0, %o2, %o4 | ||
115 | 1: | ||
116 | cmp %o5, %o1 | ||
117 | be,pn %icc, 2f | ||
118 | addcc %o4, 1, %o4 | ||
119 | bne,a,pt %xcc, 1b | ||
120 | ldub [%o3 + %o4], %o5 | ||
121 | retl | ||
122 | /* The delay slot of this retl is the insn at 2: below; sharing it saves an instruction but makes the code harder to read */ | ||
123 | 2: | ||
124 | add %o3, %o4, %o0 | ||
125 | retl | ||
126 | sub %o0, 1, %o0 | ||
127 | 3: | ||
128 | retl | ||
129 | nop | ||
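msloop above screens eight bytes per iteration with the classic magic-constant trick: ((x - LO_MAGIC) ^ x) & HI_MAGIC is nonzero whenever the word may contain a zero byte. The screen can report false positives (a 0x80 byte trips it), which is why check_bytes then re-examines the word byte by byte. A small self-contained C demonstration of the screen:

    #include <stdint.h>
    #include <stdio.h>

    #define LO_MAGIC 0x0101010101010101ULL
    #define HI_MAGIC 0x8080808080808080ULL

    /* Nonzero whenever the 8-byte word may contain a zero byte.
     * False positives are possible, so a byte-wise check must follow,
     * exactly as check_bytes does in the assembly. */
    static int maybe_has_zero_byte(uint64_t x)
    {
            return (((x - LO_MAGIC) ^ x) & HI_MAGIC) ? 1 : 0;
    }

    int main(void)
    {
            printf("%d\n", maybe_has_zero_byte(0x1122330044556677ULL)); /* 1: real zero byte */
            printf("%d\n", maybe_has_zero_byte(0x1122334455667788ULL)); /* 0: no zero byte   */
            printf("%d\n", maybe_has_zero_byte(0x8011223344556677ULL)); /* 1: false positive */
            return 0;
    }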
diff --git a/arch/sparc/lib/rwsem_64.S b/arch/sparc/lib/rwsem_64.S new file mode 100644 index 000000000000..91a7d29a79d5 --- /dev/null +++ b/arch/sparc/lib/rwsem_64.S | |||
@@ -0,0 +1,163 @@ | |||
1 | /* rwsem.S: RW semaphore assembly. | ||
2 | * | ||
3 | * Written by David S. Miller (davem@redhat.com), 2001. | ||
4 | * Derived from asm-i386/rwsem.h | ||
5 | */ | ||
6 | |||
7 | #include <asm/rwsem-const.h> | ||
8 | |||
9 | .section .sched.text, "ax" | ||
10 | |||
11 | .globl __down_read | ||
12 | __down_read: | ||
13 | 1: lduw [%o0], %g1 | ||
14 | add %g1, 1, %g7 | ||
15 | cas [%o0], %g1, %g7 | ||
16 | cmp %g1, %g7 | ||
17 | bne,pn %icc, 1b | ||
18 | add %g7, 1, %g7 | ||
19 | cmp %g7, 0 | ||
20 | bl,pn %icc, 3f | ||
21 | nop | ||
22 | 2: | ||
23 | retl | ||
24 | nop | ||
25 | 3: | ||
26 | save %sp, -192, %sp | ||
27 | call rwsem_down_read_failed | ||
28 | mov %i0, %o0 | ||
29 | ret | ||
30 | restore | ||
31 | .size __down_read, .-__down_read | ||
32 | |||
33 | .globl __down_read_trylock | ||
34 | __down_read_trylock: | ||
35 | 1: lduw [%o0], %g1 | ||
36 | add %g1, 1, %g7 | ||
37 | cmp %g7, 0 | ||
38 | bl,pn %icc, 2f | ||
39 | mov 0, %o1 | ||
40 | cas [%o0], %g1, %g7 | ||
41 | cmp %g1, %g7 | ||
42 | bne,pn %icc, 1b | ||
43 | mov 1, %o1 | ||
44 | 2: retl | ||
45 | mov %o1, %o0 | ||
46 | .size __down_read_trylock, .-__down_read_trylock | ||
47 | |||
48 | .globl __down_write | ||
49 | __down_write: | ||
50 | sethi %hi(RWSEM_ACTIVE_WRITE_BIAS), %g1 | ||
51 | or %g1, %lo(RWSEM_ACTIVE_WRITE_BIAS), %g1 | ||
52 | 1: | ||
53 | lduw [%o0], %g3 | ||
54 | add %g3, %g1, %g7 | ||
55 | cas [%o0], %g3, %g7 | ||
56 | cmp %g3, %g7 | ||
57 | bne,pn %icc, 1b | ||
58 | cmp %g7, 0 | ||
59 | bne,pn %icc, 3f | ||
60 | nop | ||
61 | 2: retl | ||
62 | nop | ||
63 | 3: | ||
64 | save %sp, -192, %sp | ||
65 | call rwsem_down_write_failed | ||
66 | mov %i0, %o0 | ||
67 | ret | ||
68 | restore | ||
69 | .size __down_write, .-__down_write | ||
70 | |||
71 | .globl __down_write_trylock | ||
72 | __down_write_trylock: | ||
73 | sethi %hi(RWSEM_ACTIVE_WRITE_BIAS), %g1 | ||
74 | or %g1, %lo(RWSEM_ACTIVE_WRITE_BIAS), %g1 | ||
75 | 1: | ||
76 | lduw [%o0], %g3 | ||
77 | cmp %g3, 0 | ||
78 | bne,pn %icc, 2f | ||
79 | mov 0, %o1 | ||
80 | add %g3, %g1, %g7 | ||
81 | cas [%o0], %g3, %g7 | ||
82 | cmp %g3, %g7 | ||
83 | bne,pn %icc, 1b | ||
84 | mov 1, %o1 | ||
85 | 2: retl | ||
86 | mov %o1, %o0 | ||
87 | .size __down_write_trylock, .-__down_write_trylock | ||
88 | |||
89 | .globl __up_read | ||
90 | __up_read: | ||
91 | 1: | ||
92 | lduw [%o0], %g1 | ||
93 | sub %g1, 1, %g7 | ||
94 | cas [%o0], %g1, %g7 | ||
95 | cmp %g1, %g7 | ||
96 | bne,pn %icc, 1b | ||
97 | cmp %g7, 0 | ||
98 | bl,pn %icc, 3f | ||
99 | nop | ||
100 | 2: retl | ||
101 | nop | ||
102 | 3: sethi %hi(RWSEM_ACTIVE_MASK), %g1 | ||
103 | sub %g7, 1, %g7 | ||
104 | or %g1, %lo(RWSEM_ACTIVE_MASK), %g1 | ||
105 | andcc %g7, %g1, %g0 | ||
106 | bne,pn %icc, 2b | ||
107 | nop | ||
108 | save %sp, -192, %sp | ||
109 | call rwsem_wake | ||
110 | mov %i0, %o0 | ||
111 | ret | ||
112 | restore | ||
113 | .size __up_read, .-__up_read | ||
114 | |||
115 | .globl __up_write | ||
116 | __up_write: | ||
117 | sethi %hi(RWSEM_ACTIVE_WRITE_BIAS), %g1 | ||
118 | or %g1, %lo(RWSEM_ACTIVE_WRITE_BIAS), %g1 | ||
119 | 1: | ||
120 | lduw [%o0], %g3 | ||
121 | sub %g3, %g1, %g7 | ||
122 | cas [%o0], %g3, %g7 | ||
123 | cmp %g3, %g7 | ||
124 | bne,pn %icc, 1b | ||
125 | sub %g7, %g1, %g7 | ||
126 | cmp %g7, 0 | ||
127 | bl,pn %icc, 3f | ||
128 | nop | ||
129 | 2: | ||
130 | retl | ||
131 | nop | ||
132 | 3: | ||
133 | save %sp, -192, %sp | ||
134 | call rwsem_wake | ||
135 | mov %i0, %o0 | ||
136 | ret | ||
137 | restore | ||
138 | .size __up_write, .-__up_write | ||
139 | |||
140 | .globl __downgrade_write | ||
141 | __downgrade_write: | ||
142 | sethi %hi(RWSEM_WAITING_BIAS), %g1 | ||
143 | or %g1, %lo(RWSEM_WAITING_BIAS), %g1 | ||
144 | 1: | ||
145 | lduw [%o0], %g3 | ||
146 | sub %g3, %g1, %g7 | ||
147 | cas [%o0], %g3, %g7 | ||
148 | cmp %g3, %g7 | ||
149 | bne,pn %icc, 1b | ||
150 | sub %g7, %g1, %g7 | ||
151 | cmp %g7, 0 | ||
152 | bl,pn %icc, 3f | ||
153 | nop | ||
154 | 2: | ||
155 | retl | ||
156 | nop | ||
157 | 3: | ||
158 | save %sp, -192, %sp | ||
159 | call rwsem_downgrade_wake | ||
160 | mov %i0, %o0 | ||
161 | ret | ||
162 | restore | ||
163 | .size __downgrade_write, .-__downgrade_write | ||
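Each primitive above is a compare-and-swap retry loop on the 32-bit counter, falling into a register-window save and a C slow-path call only when the new count's sign or active mask indicates contention. A hedged C11 sketch of the __down_read fast path (rwsem_down_read_failed is the kernel's slow path; the struct and function names here are illustrative):

    #include <stdatomic.h>

    struct rw_semaphore { atomic_int count; };

    extern void rwsem_down_read_failed(struct rw_semaphore *sem);

    static void down_read_ref(struct rw_semaphore *sem)
    {
            int old, new;

            do {
                    old = atomic_load_explicit(&sem->count,
                                               memory_order_relaxed);
                    new = old + 1;
            } while (!atomic_compare_exchange_weak(&sem->count, &old, new));

            if (new < 0)            /* writer active or waiting: slow path */
                    rwsem_down_read_failed(sem);
    }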
diff --git a/arch/sparc/lib/strlen_64.S b/arch/sparc/lib/strlen_64.S new file mode 100644 index 000000000000..e9ba1920d818 --- /dev/null +++ b/arch/sparc/lib/strlen_64.S | |||
@@ -0,0 +1,80 @@ | |||
1 | /* strlen.S: Sparc64 optimized strlen code | ||
2 | * Hand optimized from GNU libc's strlen | ||
3 | * Copyright (C) 1991,1996 Free Software Foundation | ||
4 | * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) | ||
5 | * Copyright (C) 1996, 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) | ||
6 | */ | ||
7 | |||
8 | #define LO_MAGIC 0x01010101 | ||
9 | #define HI_MAGIC 0x80808080 | ||
10 | |||
11 | .align 32 | ||
12 | .globl strlen | ||
13 | .type strlen,#function | ||
14 | strlen: | ||
15 | mov %o0, %o1 | ||
16 | andcc %o0, 3, %g0 | ||
17 | be,pt %icc, 9f | ||
18 | sethi %hi(HI_MAGIC), %o4 | ||
19 | ldub [%o0], %o5 | ||
20 | brz,pn %o5, 11f | ||
21 | add %o0, 1, %o0 | ||
22 | andcc %o0, 3, %g0 | ||
23 | be,pn %icc, 4f | ||
24 | or %o4, %lo(HI_MAGIC), %o3 | ||
25 | ldub [%o0], %o5 | ||
26 | brz,pn %o5, 12f | ||
27 | add %o0, 1, %o0 | ||
28 | andcc %o0, 3, %g0 | ||
29 | be,pt %icc, 5f | ||
30 | sethi %hi(LO_MAGIC), %o4 | ||
31 | ldub [%o0], %o5 | ||
32 | brz,pn %o5, 13f | ||
33 | add %o0, 1, %o0 | ||
34 | ba,pt %icc, 8f | ||
35 | or %o4, %lo(LO_MAGIC), %o2 | ||
36 | 9: | ||
37 | or %o4, %lo(HI_MAGIC), %o3 | ||
38 | 4: | ||
39 | sethi %hi(LO_MAGIC), %o4 | ||
40 | 5: | ||
41 | or %o4, %lo(LO_MAGIC), %o2 | ||
42 | 8: | ||
43 | ld [%o0], %o5 | ||
44 | 2: | ||
45 | sub %o5, %o2, %o4 | ||
46 | andcc %o4, %o3, %g0 | ||
47 | be,pt %icc, 8b | ||
48 | add %o0, 4, %o0 | ||
49 | |||
50 | /* Check every byte. */ | ||
51 | srl %o5, 24, %g7 | ||
52 | andcc %g7, 0xff, %g0 | ||
53 | be,pn %icc, 1f | ||
54 | add %o0, -4, %o4 | ||
55 | srl %o5, 16, %g7 | ||
56 | andcc %g7, 0xff, %g0 | ||
57 | be,pn %icc, 1f | ||
58 | add %o4, 1, %o4 | ||
59 | srl %o5, 8, %g7 | ||
60 | andcc %g7, 0xff, %g0 | ||
61 | be,pn %icc, 1f | ||
62 | add %o4, 1, %o4 | ||
63 | andcc %o5, 0xff, %g0 | ||
64 | bne,a,pt %icc, 2b | ||
65 | ld [%o0], %o5 | ||
66 | add %o4, 1, %o4 | ||
67 | 1: | ||
68 | retl | ||
69 | sub %o4, %o1, %o0 | ||
70 | 11: | ||
71 | retl | ||
72 | mov 0, %o0 | ||
73 | 12: | ||
74 | retl | ||
75 | mov 1, %o0 | ||
76 | 13: | ||
77 | retl | ||
78 | mov 2, %o0 | ||
79 | |||
80 | .size strlen, .-strlen | ||
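After the byte-at-a-time alignment prologue, strlen scans a 32-bit word per iteration using the same zero-byte screen demonstrated after memscan_64.S above, then verifies each byte on a hit. An illustrative C version (strlen_ref is a hypothetical name; like the assembly, it reads whole aligned words, which is fine on this hardware but not strictly portable C):

    #include <stddef.h>
    #include <stdint.h>

    #define LO_MAGIC 0x01010101u
    #define HI_MAGIC 0x80808080u

    static size_t strlen_ref(const char *s)
    {
            const char *p = s;

            /* align to a 4-byte boundary one byte at a time */
            while ((uintptr_t)p & 3) {
                    if (*p == '\0')
                            return p - s;
                    p++;
            }

            for (;;) {
                    uint32_t w = *(const uint32_t *)p;

                    /* screen: word may contain a zero byte (false
                     * positives possible), so verify each byte */
                    if (((w - LO_MAGIC) ^ w) & HI_MAGIC) {
                            const char *q;

                            for (q = p; q < p + 4; q++)
                                    if (*q == '\0')
                                            return q - s;
                    }
                    p += 4;
            }
    }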
diff --git a/arch/sparc/lib/strlen_user_64.S b/arch/sparc/lib/strlen_user_64.S new file mode 100644 index 000000000000..114ed111e251 --- /dev/null +++ b/arch/sparc/lib/strlen_user_64.S | |||
@@ -0,0 +1,95 @@ | |||
1 | /* strlen_user.S: Sparc64 optimized strlen_user code | ||
2 | * | ||
3 | * Return the length of a string in userspace, including the | ||
4 | * terminating NUL, or 0 on a fault | ||
5 | * | ||
6 | * Copyright (C) 1991,1996 Free Software Foundation | ||
7 | * Copyright (C) 1996,1999 David S. Miller (davem@redhat.com) | ||
8 | * Copyright (C) 1996,1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) | ||
9 | */ | ||
10 | |||
11 | #include <asm/asi.h> | ||
12 | |||
13 | #define LO_MAGIC 0x01010101 | ||
14 | #define HI_MAGIC 0x80808080 | ||
15 | |||
16 | .align 4 | ||
17 | .global __strlen_user, __strnlen_user | ||
18 | __strlen_user: | ||
19 | sethi %hi(32768), %o1 | ||
20 | __strnlen_user: | ||
21 | mov %o1, %g1 | ||
22 | mov %o0, %o1 | ||
23 | andcc %o0, 3, %g0 | ||
24 | be,pt %icc, 9f | ||
25 | sethi %hi(HI_MAGIC), %o4 | ||
26 | 10: lduba [%o0] %asi, %o5 | ||
27 | brz,pn %o5, 21f | ||
28 | add %o0, 1, %o0 | ||
29 | andcc %o0, 3, %g0 | ||
30 | be,pn %icc, 4f | ||
31 | or %o4, %lo(HI_MAGIC), %o3 | ||
32 | 11: lduba [%o0] %asi, %o5 | ||
33 | brz,pn %o5, 22f | ||
34 | add %o0, 1, %o0 | ||
35 | andcc %o0, 3, %g0 | ||
36 | be,pt %icc, 13f | ||
37 | srl %o3, 7, %o2 | ||
38 | 12: lduba [%o0] %asi, %o5 | ||
39 | brz,pn %o5, 23f | ||
40 | add %o0, 1, %o0 | ||
41 | ba,pt %icc, 2f | ||
42 | 15: lda [%o0] %asi, %o5 | ||
43 | 9: or %o4, %lo(HI_MAGIC), %o3 | ||
44 | 4: srl %o3, 7, %o2 | ||
45 | 13: lda [%o0] %asi, %o5 | ||
46 | 2: sub %o5, %o2, %o4 | ||
47 | andcc %o4, %o3, %g0 | ||
48 | bne,pn %icc, 82f | ||
49 | add %o0, 4, %o0 | ||
50 | sub %o0, %o1, %g2 | ||
51 | 81: cmp %g2, %g1 | ||
52 | blu,pt %icc, 13b | ||
53 | mov %o0, %o4 | ||
54 | ba,a,pt %xcc, 1f | ||
55 | |||
56 | /* Check every byte. */ | ||
57 | 82: srl %o5, 24, %g7 | ||
58 | andcc %g7, 0xff, %g0 | ||
59 | be,pn %icc, 1f | ||
60 | add %o0, -3, %o4 | ||
61 | srl %o5, 16, %g7 | ||
62 | andcc %g7, 0xff, %g0 | ||
63 | be,pn %icc, 1f | ||
64 | add %o4, 1, %o4 | ||
65 | srl %o5, 8, %g7 | ||
66 | andcc %g7, 0xff, %g0 | ||
67 | be,pn %icc, 1f | ||
68 | add %o4, 1, %o4 | ||
69 | andcc %o5, 0xff, %g0 | ||
70 | bne,pt %icc, 81b | ||
71 | sub %o0, %o1, %g2 | ||
72 | add %o4, 1, %o4 | ||
73 | 1: retl | ||
74 | sub %o4, %o1, %o0 | ||
75 | 21: retl | ||
76 | mov 1, %o0 | ||
77 | 22: retl | ||
78 | mov 2, %o0 | ||
79 | 23: retl | ||
80 | mov 3, %o0 | ||
81 | |||
82 | .section .fixup,#alloc,#execinstr | ||
83 | .align 4 | ||
84 | 30: | ||
85 | retl | ||
86 | clr %o0 | ||
87 | |||
88 | .section __ex_table,"a" | ||
89 | .align 4 | ||
90 | |||
91 | .word 10b, 30b | ||
92 | .word 11b, 30b | ||
93 | .word 12b, 30b | ||
94 | .word 15b, 30b | ||
95 | .word 13b, 30b | ||
diff --git a/arch/sparc/lib/strncmp_64.S b/arch/sparc/lib/strncmp_64.S new file mode 100644 index 000000000000..980e83751556 --- /dev/null +++ b/arch/sparc/lib/strncmp_64.S | |||
@@ -0,0 +1,32 @@ | |||
1 | /* | ||
2 | * Sparc64 optimized strncmp code. | ||
3 | * | ||
4 | * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) | ||
5 | */ | ||
6 | |||
7 | #include <asm/asi.h> | ||
8 | |||
9 | .text | ||
10 | .align 32 | ||
11 | .globl strncmp | ||
12 | .type strncmp,#function | ||
13 | strncmp: | ||
14 | brlez,pn %o2, 3f | ||
15 | lduba [%o0] (ASI_PNF), %o3 | ||
16 | 1: | ||
17 | add %o0, 1, %o0 | ||
18 | ldub [%o1], %o4 | ||
19 | brz,pn %o3, 2f | ||
20 | add %o1, 1, %o1 | ||
21 | cmp %o3, %o4 | ||
22 | bne,pn %icc, 2f | ||
23 | subcc %o2, 1, %o2 | ||
24 | bne,a,pt %xcc, 1b | ||
25 | ldub [%o0], %o3 | ||
26 | 2: | ||
27 | retl | ||
28 | sub %o3, %o4, %o0 | ||
29 | 3: | ||
30 | retl | ||
31 | clr %o0 | ||
32 | .size strncmp, .-strncmp | ||
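The strncmp loop stops at a NUL in the first string, at the first mismatch, or after %o2 bytes, and returns the difference of the last bytes compared. An illustrative C analogue (the assembly additionally reads the first string through ASI_PNF, the non-faulting ASI, which plain C cannot express):

    #include <stddef.h>

    static int strncmp_ref(const char *a, const char *b, size_t n)
    {
            unsigned char ca = 0, cb = 0;

            while (n--) {
                    ca = *a++;
                    cb = *b++;
                    if (ca == '\0' || ca != cb)
                            break;
            }
            return ca - cb;     /* 0 when n was 0 or all bytes matched */
    }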
diff --git a/arch/sparc/lib/strncpy_from_user_64.S b/arch/sparc/lib/strncpy_from_user_64.S new file mode 100644 index 000000000000..511c8f136f95 --- /dev/null +++ b/arch/sparc/lib/strncpy_from_user_64.S | |||
@@ -0,0 +1,135 @@ | |||
1 | /* | ||
2 | * strncpy_from_user.S: Sparc64 strncpy from userspace. | ||
3 | * | ||
4 | * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) | ||
5 | */ | ||
6 | |||
7 | #include <asm/asi.h> | ||
8 | #include <asm/errno.h> | ||
9 | |||
10 | .data | ||
11 | .align 8 | ||
12 | 0: .xword 0x0101010101010101 | ||
13 | |||
14 | .text | ||
15 | .align 32 | ||
16 | |||
17 | /* Must return: | ||
18 | * | ||
19 | * -EFAULT for an exception | ||
20 | * count if we hit the buffer limit | ||
21 | * the number of bytes copied if we hit a NUL byte | ||
22 | * (the NUL itself is not counted) | ||
23 | * | ||
24 | * This implementation assumes: | ||
25 | * %o1 is 8 aligned => !(%o2 & 7) | ||
26 | * %o0 is 8 aligned (if not, it will be slow, but will work) | ||
27 | * | ||
28 | * This is optimized for the common case: | ||
29 | * in measurements, 90% of src pointers are 8 aligned (even on sparc32) | ||
30 | * and the average length is about 18. | ||
31 | */ | ||
32 | |||
33 | .globl __strncpy_from_user | ||
34 | .type __strncpy_from_user,#function | ||
35 | __strncpy_from_user: | ||
36 | /* %o0=dest, %o1=src, %o2=count */ | ||
37 | andcc %o1, 7, %g0 ! IEU1 Group | ||
38 | bne,pn %icc, 30f ! CTI | ||
39 | add %o0, %o2, %g3 ! IEU0 | ||
40 | 60: ldxa [%o1] %asi, %g1 ! Load Group | ||
41 | brlez,pn %o2, 10f ! CTI | ||
42 | mov %o0, %o3 ! IEU0 | ||
43 | 50: sethi %hi(0b), %o4 ! IEU0 Group | ||
44 | ldx [%o4 + %lo(0b)], %o4 ! Load | ||
45 | sllx %o4, 7, %o5 ! IEU1 Group | ||
46 | 1: sub %g1, %o4, %g2 ! IEU0 Group | ||
47 | stx %g1, [%o0] ! Store | ||
48 | add %o0, 8, %o0 ! IEU1 | ||
49 | andcc %g2, %o5, %g0 ! IEU1 Group | ||
50 | bne,pn %xcc, 5f ! CTI | ||
51 | add %o1, 8, %o1 ! IEU0 | ||
52 | cmp %o0, %g3 ! IEU1 Group | ||
53 | bl,a,pt %xcc, 1b ! CTI | ||
54 | 61: ldxa [%o1] %asi, %g1 ! Load | ||
55 | 10: retl ! CTI Group | ||
56 | mov %o2, %o0 ! IEU0 | ||
57 | 5: srlx %g2, 32, %g7 ! IEU0 Group | ||
58 | sethi %hi(0xff00), %o4 ! IEU1 | ||
59 | andcc %g7, %o5, %g0 ! IEU1 Group | ||
60 | be,pn %icc, 2f ! CTI | ||
61 | or %o4, %lo(0xff00), %o4 ! IEU0 | ||
62 | srlx %g1, 48, %g7 ! IEU0 Group | ||
63 | andcc %g7, %o4, %g0 ! IEU1 Group | ||
64 | be,pn %icc, 50f ! CTI | ||
65 | andcc %g7, 0xff, %g0 ! IEU1 Group | ||
66 | be,pn %icc, 51f ! CTI | ||
67 | srlx %g1, 32, %g7 ! IEU0 | ||
68 | andcc %g7, %o4, %g0 ! IEU1 Group | ||
69 | be,pn %icc, 52f ! CTI | ||
70 | andcc %g7, 0xff, %g0 ! IEU1 Group | ||
71 | be,pn %icc, 53f ! CTI | ||
72 | 2: andcc %g2, %o5, %g0 ! IEU1 Group | ||
73 | be,pn %icc, 2f ! CTI | ||
74 | srl %g1, 16, %g7 ! IEU0 | ||
75 | andcc %g7, %o4, %g0 ! IEU1 Group | ||
76 | be,pn %icc, 54f ! CTI | ||
77 | andcc %g7, 0xff, %g0 ! IEU1 Group | ||
78 | be,pn %icc, 55f ! CTI | ||
79 | andcc %g1, %o4, %g0 ! IEU1 Group | ||
80 | be,pn %icc, 56f ! CTI | ||
81 | andcc %g1, 0xff, %g0 ! IEU1 Group | ||
82 | be,a,pn %icc, 57f ! CTI | ||
83 | sub %o0, %o3, %o0 ! IEU0 | ||
84 | 2: cmp %o0, %g3 ! IEU1 Group | ||
85 | bl,a,pt %xcc, 50b ! CTI | ||
86 | 62: ldxa [%o1] %asi, %g1 ! Load | ||
87 | retl ! CTI Group | ||
88 | mov %o2, %o0 ! IEU0 | ||
89 | 50: sub %o0, %o3, %o0 | ||
90 | retl | ||
91 | sub %o0, 8, %o0 | ||
92 | 51: sub %o0, %o3, %o0 | ||
93 | retl | ||
94 | sub %o0, 7, %o0 | ||
95 | 52: sub %o0, %o3, %o0 | ||
96 | retl | ||
97 | sub %o0, 6, %o0 | ||
98 | 53: sub %o0, %o3, %o0 | ||
99 | retl | ||
100 | sub %o0, 5, %o0 | ||
101 | 54: sub %o0, %o3, %o0 | ||
102 | retl | ||
103 | sub %o0, 4, %o0 | ||
104 | 55: sub %o0, %o3, %o0 | ||
105 | retl | ||
106 | sub %o0, 3, %o0 | ||
107 | 56: sub %o0, %o3, %o0 | ||
108 | retl | ||
109 | sub %o0, 2, %o0 | ||
110 | 57: retl | ||
111 | sub %o0, 1, %o0 | ||
112 | 30: brlez,pn %o2, 3f | ||
113 | sub %g0, %o2, %o3 | ||
114 | add %o0, %o2, %o0 | ||
115 | 63: lduba [%o1] %asi, %o4 | ||
116 | 1: add %o1, 1, %o1 | ||
117 | brz,pn %o4, 2f | ||
118 | stb %o4, [%o0 + %o3] | ||
119 | addcc %o3, 1, %o3 | ||
120 | bne,pt %xcc, 1b | ||
121 | 64: lduba [%o1] %asi, %o4 | ||
122 | 3: retl | ||
123 | mov %o2, %o0 | ||
124 | 2: retl | ||
125 | add %o2, %o3, %o0 | ||
126 | .size __strncpy_from_user, .-__strncpy_from_user | ||
127 | |||
128 | .section __ex_table,"a" | ||
129 | .align 4 | ||
130 | .word 60b, __retl_efault | ||
131 | .word 61b, __retl_efault | ||
132 | .word 62b, __retl_efault | ||
133 | .word 63b, __retl_efault | ||
134 | .word 64b, __retl_efault | ||
135 | .previous | ||
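A C model of just the return-value contract spelled out in the header comment above; the -EFAULT path comes from the exception table (faulting user loads mapped to __retl_efault) and cannot be expressed in plain C, and this hypothetical per-byte loop ignores the 8-bytes-at-a-time fast path:

    /* Returns the number of bytes copied (NUL excluded) when a NUL is
     * found, or count if the buffer limit is hit first.  In the real
     * routine, a fault on the user load returns -EFAULT instead. */
    static long strncpy_from_user_model(char *dst, const char *src,
                                        long count)
    {
            long i;

            for (i = 0; i < count; i++) {
                    char c = src[i];    /* real code: faulting user load */
                    dst[i] = c;
                    if (c == '\0')
                            return i;   /* bytes copied, NUL excluded */
            }
            return count;               /* hit the buffer limit */
    }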
diff --git a/arch/sparc/lib/user_fixup.c b/arch/sparc/lib/user_fixup.c new file mode 100644 index 000000000000..05a361b0a1a4 --- /dev/null +++ b/arch/sparc/lib/user_fixup.c | |||
@@ -0,0 +1,66 @@ | |||
1 | /* user_fixup.c: Fix up user copy faults. | ||
2 | * | ||
3 | * Copyright (C) 2004 David S. Miller <davem@redhat.com> | ||
4 | */ | ||
5 | |||
6 | #include <linux/compiler.h> | ||
7 | #include <linux/kernel.h> | ||
8 | #include <linux/string.h> | ||
9 | #include <linux/errno.h> | ||
10 | #include <asm/uaccess.h> | ||
11 | |||
12 | /* Calculating the exact fault address when using | ||
13 | * block loads and stores can be very complicated. | ||
14 | * | ||
15 | * Instead of trying to be clever and handling all | ||
16 | * of the cases, just fix things up simply here. | ||
17 | */ | ||
18 | |||
19 | static unsigned long compute_size(unsigned long start, unsigned long size, unsigned long *offset) | ||
20 | { | ||
21 | unsigned long fault_addr = current_thread_info()->fault_address; | ||
22 | unsigned long end = start + size; | ||
23 | |||
24 | if (fault_addr < start || fault_addr >= end) { | ||
25 | *offset = 0; | ||
26 | } else { | ||
27 | *offset = fault_addr - start; | ||
28 | size = end - fault_addr; | ||
29 | } | ||
30 | return size; | ||
31 | } | ||
32 | |||
33 | unsigned long copy_from_user_fixup(void *to, const void __user *from, unsigned long size) | ||
34 | { | ||
35 | unsigned long offset; | ||
36 | |||
37 | size = compute_size((unsigned long) from, size, &offset); | ||
38 | if (likely(size)) | ||
39 | memset(to + offset, 0, size); | ||
40 | |||
41 | return size; | ||
42 | } | ||
43 | |||
44 | unsigned long copy_to_user_fixup(void __user *to, const void *from, unsigned long size) | ||
45 | { | ||
46 | unsigned long offset; | ||
47 | |||
48 | return compute_size((unsigned long) to, size, &offset); | ||
49 | } | ||
50 | |||
51 | unsigned long copy_in_user_fixup(void __user *to, void __user *from, unsigned long size) | ||
52 | { | ||
53 | unsigned long fault_addr = current_thread_info()->fault_address; | ||
54 | unsigned long start = (unsigned long) to; | ||
55 | unsigned long end = start + size; | ||
56 | |||
57 | if (fault_addr >= start && fault_addr < end) | ||
58 | return end - fault_addr; | ||
59 | |||
60 | start = (unsigned long) from; | ||
61 | end = start + size; | ||
62 | if (fault_addr >= start && fault_addr < end) | ||
63 | return end - fault_addr; | ||
64 | |||
65 | return size; | ||
66 | } | ||
diff --git a/arch/sparc/lib/xor.S b/arch/sparc/lib/xor.S new file mode 100644 index 000000000000..f44f58f40234 --- /dev/null +++ b/arch/sparc/lib/xor.S | |||
@@ -0,0 +1,652 @@ | |||
1 | /* | ||
2 | * arch/sparc/lib/xor.S | ||
3 | * | ||
4 | * High speed xor_block operation for RAID4/5 utilizing the | ||
5 | * UltraSparc Visual Instruction Set and Niagara store-init/twin-load. | ||
6 | * | ||
7 | * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) | ||
8 | * Copyright (C) 2006 David S. Miller <davem@davemloft.net> | ||
9 | */ | ||
10 | |||
11 | #include <asm/visasm.h> | ||
12 | #include <asm/asi.h> | ||
13 | #include <asm/dcu.h> | ||
14 | #include <asm/spitfire.h> | ||
15 | |||
16 | /* | ||
17 | * Requirements: | ||
18 | * !(((long)dest | (long)sourceN) & (64 - 1)) && | ||
19 | * !(len & 127) && len >= 256 | ||
20 | */ | ||
21 | .text | ||
22 | .align 32 | ||
23 | |||
24 | /* VIS versions. */ | ||
25 | .globl xor_vis_2 | ||
26 | .type xor_vis_2,#function | ||
27 | xor_vis_2: | ||
28 | rd %fprs, %o5 | ||
29 | andcc %o5, FPRS_FEF|FPRS_DU, %g0 | ||
30 | be,pt %icc, 0f | ||
31 | sethi %hi(VISenter), %g1 | ||
32 | jmpl %g1 + %lo(VISenter), %g7 | ||
33 | add %g7, 8, %g7 | ||
34 | 0: wr %g0, FPRS_FEF, %fprs | ||
35 | rd %asi, %g1 | ||
36 | wr %g0, ASI_BLK_P, %asi | ||
37 | membar #LoadStore|#StoreLoad|#StoreStore | ||
38 | sub %o0, 128, %o0 | ||
39 | ldda [%o1] %asi, %f0 | ||
40 | ldda [%o2] %asi, %f16 | ||
41 | |||
42 | 2: ldda [%o1 + 64] %asi, %f32 | ||
43 | fxor %f0, %f16, %f16 | ||
44 | fxor %f2, %f18, %f18 | ||
45 | fxor %f4, %f20, %f20 | ||
46 | fxor %f6, %f22, %f22 | ||
47 | fxor %f8, %f24, %f24 | ||
48 | fxor %f10, %f26, %f26 | ||
49 | fxor %f12, %f28, %f28 | ||
50 | fxor %f14, %f30, %f30 | ||
51 | stda %f16, [%o1] %asi | ||
52 | ldda [%o2 + 64] %asi, %f48 | ||
53 | ldda [%o1 + 128] %asi, %f0 | ||
54 | fxor %f32, %f48, %f48 | ||
55 | fxor %f34, %f50, %f50 | ||
56 | add %o1, 128, %o1 | ||
57 | fxor %f36, %f52, %f52 | ||
58 | add %o2, 128, %o2 | ||
59 | fxor %f38, %f54, %f54 | ||
60 | subcc %o0, 128, %o0 | ||
61 | fxor %f40, %f56, %f56 | ||
62 | fxor %f42, %f58, %f58 | ||
63 | fxor %f44, %f60, %f60 | ||
64 | fxor %f46, %f62, %f62 | ||
65 | stda %f48, [%o1 - 64] %asi | ||
66 | bne,pt %xcc, 2b | ||
67 | ldda [%o2] %asi, %f16 | ||
68 | |||
69 | ldda [%o1 + 64] %asi, %f32 | ||
70 | fxor %f0, %f16, %f16 | ||
71 | fxor %f2, %f18, %f18 | ||
72 | fxor %f4, %f20, %f20 | ||
73 | fxor %f6, %f22, %f22 | ||
74 | fxor %f8, %f24, %f24 | ||
75 | fxor %f10, %f26, %f26 | ||
76 | fxor %f12, %f28, %f28 | ||
77 | fxor %f14, %f30, %f30 | ||
78 | stda %f16, [%o1] %asi | ||
79 | ldda [%o2 + 64] %asi, %f48 | ||
80 | membar #Sync | ||
81 | fxor %f32, %f48, %f48 | ||
82 | fxor %f34, %f50, %f50 | ||
83 | fxor %f36, %f52, %f52 | ||
84 | fxor %f38, %f54, %f54 | ||
85 | fxor %f40, %f56, %f56 | ||
86 | fxor %f42, %f58, %f58 | ||
87 | fxor %f44, %f60, %f60 | ||
88 | fxor %f46, %f62, %f62 | ||
89 | stda %f48, [%o1 + 64] %asi | ||
90 | membar #Sync|#StoreStore|#StoreLoad | ||
91 | wr %g1, %g0, %asi | ||
92 | retl | ||
93 | wr %g0, 0, %fprs | ||
94 | .size xor_vis_2, .-xor_vis_2 | ||
95 | |||
96 | .globl xor_vis_3 | ||
97 | .type xor_vis_3,#function | ||
98 | xor_vis_3: | ||
99 | rd %fprs, %o5 | ||
100 | andcc %o5, FPRS_FEF|FPRS_DU, %g0 | ||
101 | be,pt %icc, 0f | ||
102 | sethi %hi(VISenter), %g1 | ||
103 | jmpl %g1 + %lo(VISenter), %g7 | ||
104 | add %g7, 8, %g7 | ||
105 | 0: wr %g0, FPRS_FEF, %fprs | ||
106 | rd %asi, %g1 | ||
107 | wr %g0, ASI_BLK_P, %asi | ||
108 | membar #LoadStore|#StoreLoad|#StoreStore | ||
109 | sub %o0, 64, %o0 | ||
110 | ldda [%o1] %asi, %f0 | ||
111 | ldda [%o2] %asi, %f16 | ||
112 | |||
113 | 3: ldda [%o3] %asi, %f32 | ||
114 | fxor %f0, %f16, %f48 | ||
115 | fxor %f2, %f18, %f50 | ||
116 | add %o1, 64, %o1 | ||
117 | fxor %f4, %f20, %f52 | ||
118 | fxor %f6, %f22, %f54 | ||
119 | add %o2, 64, %o2 | ||
120 | fxor %f8, %f24, %f56 | ||
121 | fxor %f10, %f26, %f58 | ||
122 | fxor %f12, %f28, %f60 | ||
123 | fxor %f14, %f30, %f62 | ||
124 | ldda [%o1] %asi, %f0 | ||
125 | fxor %f48, %f32, %f48 | ||
126 | fxor %f50, %f34, %f50 | ||
127 | fxor %f52, %f36, %f52 | ||
128 | fxor %f54, %f38, %f54 | ||
129 | add %o3, 64, %o3 | ||
130 | fxor %f56, %f40, %f56 | ||
131 | fxor %f58, %f42, %f58 | ||
132 | subcc %o0, 64, %o0 | ||
133 | fxor %f60, %f44, %f60 | ||
134 | fxor %f62, %f46, %f62 | ||
135 | stda %f48, [%o1 - 64] %asi | ||
136 | bne,pt %xcc, 3b | ||
137 | ldda [%o2] %asi, %f16 | ||
138 | |||
139 | ldda [%o3] %asi, %f32 | ||
140 | fxor %f0, %f16, %f48 | ||
141 | fxor %f2, %f18, %f50 | ||
142 | fxor %f4, %f20, %f52 | ||
143 | fxor %f6, %f22, %f54 | ||
144 | fxor %f8, %f24, %f56 | ||
145 | fxor %f10, %f26, %f58 | ||
146 | fxor %f12, %f28, %f60 | ||
147 | fxor %f14, %f30, %f62 | ||
148 | membar #Sync | ||
149 | fxor %f48, %f32, %f48 | ||
150 | fxor %f50, %f34, %f50 | ||
151 | fxor %f52, %f36, %f52 | ||
152 | fxor %f54, %f38, %f54 | ||
153 | fxor %f56, %f40, %f56 | ||
154 | fxor %f58, %f42, %f58 | ||
155 | fxor %f60, %f44, %f60 | ||
156 | fxor %f62, %f46, %f62 | ||
157 | stda %f48, [%o1] %asi | ||
158 | membar #Sync|#StoreStore|#StoreLoad | ||
159 | wr %g1, %g0, %asi | ||
160 | retl | ||
161 | wr %g0, 0, %fprs | ||
162 | .size xor_vis_3, .-xor_vis_3 | ||
163 | |||
164 | .globl xor_vis_4 | ||
165 | .type xor_vis_4,#function | ||
166 | xor_vis_4: | ||
167 | rd %fprs, %o5 | ||
168 | andcc %o5, FPRS_FEF|FPRS_DU, %g0 | ||
169 | be,pt %icc, 0f | ||
170 | sethi %hi(VISenter), %g1 | ||
171 | jmpl %g1 + %lo(VISenter), %g7 | ||
172 | add %g7, 8, %g7 | ||
173 | 0: wr %g0, FPRS_FEF, %fprs | ||
174 | rd %asi, %g1 | ||
175 | wr %g0, ASI_BLK_P, %asi | ||
176 | membar #LoadStore|#StoreLoad|#StoreStore | ||
177 | sub %o0, 64, %o0 | ||
178 | ldda [%o1] %asi, %f0 | ||
179 | ldda [%o2] %asi, %f16 | ||
180 | |||
181 | 4: ldda [%o3] %asi, %f32 | ||
182 | fxor %f0, %f16, %f16 | ||
183 | fxor %f2, %f18, %f18 | ||
184 | add %o1, 64, %o1 | ||
185 | fxor %f4, %f20, %f20 | ||
186 | fxor %f6, %f22, %f22 | ||
187 | add %o2, 64, %o2 | ||
188 | fxor %f8, %f24, %f24 | ||
189 | fxor %f10, %f26, %f26 | ||
190 | fxor %f12, %f28, %f28 | ||
191 | fxor %f14, %f30, %f30 | ||
192 | ldda [%o4] %asi, %f48 | ||
193 | fxor %f16, %f32, %f32 | ||
194 | fxor %f18, %f34, %f34 | ||
195 | fxor %f20, %f36, %f36 | ||
196 | fxor %f22, %f38, %f38 | ||
197 | add %o3, 64, %o3 | ||
198 | fxor %f24, %f40, %f40 | ||
199 | fxor %f26, %f42, %f42 | ||
200 | fxor %f28, %f44, %f44 | ||
201 | fxor %f30, %f46, %f46 | ||
202 | ldda [%o1] %asi, %f0 | ||
203 | fxor %f32, %f48, %f48 | ||
204 | fxor %f34, %f50, %f50 | ||
205 | fxor %f36, %f52, %f52 | ||
206 | add %o4, 64, %o4 | ||
207 | fxor %f38, %f54, %f54 | ||
208 | fxor %f40, %f56, %f56 | ||
209 | fxor %f42, %f58, %f58 | ||
210 | subcc %o0, 64, %o0 | ||
211 | fxor %f44, %f60, %f60 | ||
212 | fxor %f46, %f62, %f62 | ||
213 | stda %f48, [%o1 - 64] %asi | ||
214 | bne,pt %xcc, 4b | ||
215 | ldda [%o2] %asi, %f16 | ||
216 | |||
217 | ldda [%o3] %asi, %f32 | ||
218 | fxor %f0, %f16, %f16 | ||
219 | fxor %f2, %f18, %f18 | ||
220 | fxor %f4, %f20, %f20 | ||
221 | fxor %f6, %f22, %f22 | ||
222 | fxor %f8, %f24, %f24 | ||
223 | fxor %f10, %f26, %f26 | ||
224 | fxor %f12, %f28, %f28 | ||
225 | fxor %f14, %f30, %f30 | ||
226 | ldda [%o4] %asi, %f48 | ||
227 | fxor %f16, %f32, %f32 | ||
228 | fxor %f18, %f34, %f34 | ||
229 | fxor %f20, %f36, %f36 | ||
230 | fxor %f22, %f38, %f38 | ||
231 | fxor %f24, %f40, %f40 | ||
232 | fxor %f26, %f42, %f42 | ||
233 | fxor %f28, %f44, %f44 | ||
234 | fxor %f30, %f46, %f46 | ||
235 | membar #Sync | ||
236 | fxor %f32, %f48, %f48 | ||
237 | fxor %f34, %f50, %f50 | ||
238 | fxor %f36, %f52, %f52 | ||
239 | fxor %f38, %f54, %f54 | ||
240 | fxor %f40, %f56, %f56 | ||
241 | fxor %f42, %f58, %f58 | ||
242 | fxor %f44, %f60, %f60 | ||
243 | fxor %f46, %f62, %f62 | ||
244 | stda %f48, [%o1] %asi | ||
245 | membar #Sync|#StoreStore|#StoreLoad | ||
246 | wr %g1, %g0, %asi | ||
247 | retl | ||
248 | wr %g0, 0, %fprs | ||
249 | .size xor_vis_4, .-xor_vis_4 | ||
250 | |||
251 | .globl xor_vis_5 | ||
252 | .type xor_vis_5,#function | ||
253 | xor_vis_5: | ||
254 | save %sp, -192, %sp | ||
255 | rd %fprs, %o5 | ||
256 | andcc %o5, FPRS_FEF|FPRS_DU, %g0 | ||
257 | be,pt %icc, 0f | ||
258 | sethi %hi(VISenter), %g1 | ||
259 | jmpl %g1 + %lo(VISenter), %g7 | ||
260 | add %g7, 8, %g7 | ||
261 | 0: wr %g0, FPRS_FEF, %fprs | ||
262 | rd %asi, %g1 | ||
263 | wr %g0, ASI_BLK_P, %asi | ||
264 | membar #LoadStore|#StoreLoad|#StoreStore | ||
265 | sub %i0, 64, %i0 | ||
266 | ldda [%i1] %asi, %f0 | ||
267 | ldda [%i2] %asi, %f16 | ||
268 | |||
269 | 5: ldda [%i3] %asi, %f32 | ||
270 | fxor %f0, %f16, %f48 | ||
271 | fxor %f2, %f18, %f50 | ||
272 | add %i1, 64, %i1 | ||
273 | fxor %f4, %f20, %f52 | ||
274 | fxor %f6, %f22, %f54 | ||
275 | add %i2, 64, %i2 | ||
276 | fxor %f8, %f24, %f56 | ||
277 | fxor %f10, %f26, %f58 | ||
278 | fxor %f12, %f28, %f60 | ||
279 | fxor %f14, %f30, %f62 | ||
280 | ldda [%i4] %asi, %f16 | ||
281 | fxor %f48, %f32, %f48 | ||
282 | fxor %f50, %f34, %f50 | ||
283 | fxor %f52, %f36, %f52 | ||
284 | fxor %f54, %f38, %f54 | ||
285 | add %i3, 64, %i3 | ||
286 | fxor %f56, %f40, %f56 | ||
287 | fxor %f58, %f42, %f58 | ||
288 | fxor %f60, %f44, %f60 | ||
289 | fxor %f62, %f46, %f62 | ||
290 | ldda [%i5] %asi, %f32 | ||
291 | fxor %f48, %f16, %f48 | ||
292 | fxor %f50, %f18, %f50 | ||
293 | add %i4, 64, %i4 | ||
294 | fxor %f52, %f20, %f52 | ||
295 | fxor %f54, %f22, %f54 | ||
296 | add %i5, 64, %i5 | ||
297 | fxor %f56, %f24, %f56 | ||
298 | fxor %f58, %f26, %f58 | ||
299 | fxor %f60, %f28, %f60 | ||
300 | fxor %f62, %f30, %f62 | ||
301 | ldda [%i1] %asi, %f0 | ||
302 | fxor %f48, %f32, %f48 | ||
303 | fxor %f50, %f34, %f50 | ||
304 | fxor %f52, %f36, %f52 | ||
305 | fxor %f54, %f38, %f54 | ||
306 | fxor %f56, %f40, %f56 | ||
307 | fxor %f58, %f42, %f58 | ||
308 | subcc %i0, 64, %i0 | ||
309 | fxor %f60, %f44, %f60 | ||
310 | fxor %f62, %f46, %f62 | ||
311 | stda %f48, [%i1 - 64] %asi | ||
312 | bne,pt %xcc, 5b | ||
313 | ldda [%i2] %asi, %f16 | ||
314 | |||
315 | ldda [%i3] %asi, %f32 | ||
316 | fxor %f0, %f16, %f48 | ||
317 | fxor %f2, %f18, %f50 | ||
318 | fxor %f4, %f20, %f52 | ||
319 | fxor %f6, %f22, %f54 | ||
320 | fxor %f8, %f24, %f56 | ||
321 | fxor %f10, %f26, %f58 | ||
322 | fxor %f12, %f28, %f60 | ||
323 | fxor %f14, %f30, %f62 | ||
324 | ldda [%i4] %asi, %f16 | ||
325 | fxor %f48, %f32, %f48 | ||
326 | fxor %f50, %f34, %f50 | ||
327 | fxor %f52, %f36, %f52 | ||
328 | fxor %f54, %f38, %f54 | ||
329 | fxor %f56, %f40, %f56 | ||
330 | fxor %f58, %f42, %f58 | ||
331 | fxor %f60, %f44, %f60 | ||
332 | fxor %f62, %f46, %f62 | ||
333 | ldda [%i5] %asi, %f32 | ||
334 | fxor %f48, %f16, %f48 | ||
335 | fxor %f50, %f18, %f50 | ||
336 | fxor %f52, %f20, %f52 | ||
337 | fxor %f54, %f22, %f54 | ||
338 | fxor %f56, %f24, %f56 | ||
339 | fxor %f58, %f26, %f58 | ||
340 | fxor %f60, %f28, %f60 | ||
341 | fxor %f62, %f30, %f62 | ||
342 | membar #Sync | ||
343 | fxor %f48, %f32, %f48 | ||
344 | fxor %f50, %f34, %f50 | ||
345 | fxor %f52, %f36, %f52 | ||
346 | fxor %f54, %f38, %f54 | ||
347 | fxor %f56, %f40, %f56 | ||
348 | fxor %f58, %f42, %f58 | ||
349 | fxor %f60, %f44, %f60 | ||
350 | fxor %f62, %f46, %f62 | ||
351 | stda %f48, [%i1] %asi | ||
352 | membar #Sync|#StoreStore|#StoreLoad | ||
353 | wr %g1, %g0, %asi | ||
354 | wr %g0, 0, %fprs | ||
355 | ret | ||
356 | restore | ||
357 | .size xor_vis_5, .-xor_vis_5 | ||
358 | |||
359 | /* Niagara versions. */ | ||
360 | .globl xor_niagara_2 | ||
361 | .type xor_niagara_2,#function | ||
362 | xor_niagara_2: /* %o0=bytes, %o1=dest, %o2=src */ | ||
363 | save %sp, -192, %sp | ||
364 | prefetch [%i1], #n_writes | ||
365 | prefetch [%i2], #one_read | ||
366 | rd %asi, %g7 | ||
367 | wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi | ||
368 | srlx %i0, 6, %g1 | ||
369 | mov %i1, %i0 | ||
370 | mov %i2, %i1 | ||
371 | 1: ldda [%i1 + 0x00] %asi, %i2 /* %i2/%i3 = src + 0x00 */ | ||
372 | ldda [%i1 + 0x10] %asi, %i4 /* %i4/%i5 = src + 0x10 */ | ||
373 | ldda [%i1 + 0x20] %asi, %g2 /* %g2/%g3 = src + 0x20 */ | ||
374 | ldda [%i1 + 0x30] %asi, %l0 /* %l0/%l1 = src + 0x30 */ | ||
375 | prefetch [%i1 + 0x40], #one_read | ||
376 | ldda [%i0 + 0x00] %asi, %o0 /* %o0/%o1 = dest + 0x00 */ | ||
377 | ldda [%i0 + 0x10] %asi, %o2 /* %o2/%o3 = dest + 0x10 */ | ||
378 | ldda [%i0 + 0x20] %asi, %o4 /* %o4/%o5 = dest + 0x20 */ | ||
379 | ldda [%i0 + 0x30] %asi, %l2 /* %l2/%l3 = dest + 0x30 */ | ||
380 | prefetch [%i0 + 0x40], #n_writes | ||
381 | xor %o0, %i2, %o0 | ||
382 | xor %o1, %i3, %o1 | ||
383 | stxa %o0, [%i0 + 0x00] %asi | ||
384 | stxa %o1, [%i0 + 0x08] %asi | ||
385 | xor %o2, %i4, %o2 | ||
386 | xor %o3, %i5, %o3 | ||
387 | stxa %o2, [%i0 + 0x10] %asi | ||
388 | stxa %o3, [%i0 + 0x18] %asi | ||
389 | xor %o4, %g2, %o4 | ||
390 | xor %o5, %g3, %o5 | ||
391 | stxa %o4, [%i0 + 0x20] %asi | ||
392 | stxa %o5, [%i0 + 0x28] %asi | ||
393 | xor %l2, %l0, %l2 | ||
394 | xor %l3, %l1, %l3 | ||
395 | stxa %l2, [%i0 + 0x30] %asi | ||
396 | stxa %l3, [%i0 + 0x38] %asi | ||
397 | add %i0, 0x40, %i0 | ||
398 | subcc %g1, 1, %g1 | ||
399 | bne,pt %xcc, 1b | ||
400 | add %i1, 0x40, %i1 | ||
401 | membar #Sync | ||
402 | wr %g7, 0x0, %asi | ||
403 | ret | ||
404 | restore | ||
405 | .size xor_niagara_2, .-xor_niagara_2 | ||
406 | |||
407 | .globl xor_niagara_3 | ||
408 | .type xor_niagara_3,#function | ||
409 | xor_niagara_3: /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2 */ | ||
410 | save %sp, -192, %sp | ||
411 | prefetch [%i1], #n_writes | ||
412 | prefetch [%i2], #one_read | ||
413 | prefetch [%i3], #one_read | ||
414 | rd %asi, %g7 | ||
415 | wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi | ||
416 | srlx %i0, 6, %g1 | ||
417 | mov %i1, %i0 | ||
418 | mov %i2, %i1 | ||
419 | mov %i3, %l7 | ||
420 | 1: ldda [%i1 + 0x00] %asi, %i2 /* %i2/%i3 = src1 + 0x00 */ | ||
421 | ldda [%i1 + 0x10] %asi, %i4 /* %i4/%i5 = src1 + 0x10 */ | ||
422 | ldda [%l7 + 0x00] %asi, %g2 /* %g2/%g3 = src2 + 0x00 */ | ||
423 | ldda [%l7 + 0x10] %asi, %l0 /* %l0/%l1 = src2 + 0x10 */ | ||
424 | ldda [%i0 + 0x00] %asi, %o0 /* %o0/%o1 = dest + 0x00 */ | ||
425 | ldda [%i0 + 0x10] %asi, %o2 /* %o2/%o3 = dest + 0x10 */ | ||
426 | xor %g2, %i2, %g2 | ||
427 | xor %g3, %i3, %g3 | ||
428 | xor %o0, %g2, %o0 | ||
429 | xor %o1, %g3, %o1 | ||
430 | stxa %o0, [%i0 + 0x00] %asi | ||
431 | stxa %o1, [%i0 + 0x08] %asi | ||
432 | ldda [%i1 + 0x20] %asi, %i2 /* %i2/%i3 = src1 + 0x20 */ | ||
433 | ldda [%l7 + 0x20] %asi, %g2 /* %g2/%g3 = src2 + 0x20 */ | ||
434 | ldda [%i0 + 0x20] %asi, %o0 /* %o0/%o1 = dest + 0x20 */ | ||
435 | xor %l0, %i4, %l0 | ||
436 | xor %l1, %i5, %l1 | ||
437 | xor %o2, %l0, %o2 | ||
438 | xor %o3, %l1, %o3 | ||
439 | stxa %o2, [%i0 + 0x10] %asi | ||
440 | stxa %o3, [%i0 + 0x18] %asi | ||
441 | ldda [%i1 + 0x30] %asi, %i4 /* %i4/%i5 = src1 + 0x30 */ | ||
442 | ldda [%l7 + 0x30] %asi, %l0 /* %l0/%l1 = src2 + 0x30 */ | ||
443 | ldda [%i0 + 0x30] %asi, %o2 /* %o2/%o3 = dest + 0x30 */ | ||
444 | prefetch [%i1 + 0x40], #one_read | ||
445 | prefetch [%l7 + 0x40], #one_read | ||
446 | prefetch [%i0 + 0x40], #n_writes | ||
447 | xor %g2, %i2, %g2 | ||
448 | xor %g3, %i3, %g3 | ||
449 | xor %o0, %g2, %o0 | ||
450 | xor %o1, %g3, %o1 | ||
451 | stxa %o0, [%i0 + 0x20] %asi | ||
452 | stxa %o1, [%i0 + 0x28] %asi | ||
453 | xor %l0, %i4, %l0 | ||
454 | xor %l1, %i5, %l1 | ||
455 | xor %o2, %l0, %o2 | ||
456 | xor %o3, %l1, %o3 | ||
457 | stxa %o2, [%i0 + 0x30] %asi | ||
458 | stxa %o3, [%i0 + 0x38] %asi | ||
459 | add %i0, 0x40, %i0 | ||
460 | add %i1, 0x40, %i1 | ||
461 | subcc %g1, 1, %g1 | ||
462 | bne,pt %xcc, 1b | ||
463 | add %l7, 0x40, %l7 | ||
464 | membar #Sync | ||
465 | wr %g7, 0x0, %asi | ||
466 | ret | ||
467 | restore | ||
468 | .size xor_niagara_3, .-xor_niagara_3 | ||
469 | |||
470 | .globl xor_niagara_4 | ||
471 | .type xor_niagara_4,#function | ||
472 | xor_niagara_4: /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3 */ | ||
473 | save %sp, -192, %sp | ||
474 | prefetch [%i1], #n_writes | ||
475 | prefetch [%i2], #one_read | ||
476 | prefetch [%i3], #one_read | ||
477 | prefetch [%i4], #one_read | ||
478 | rd %asi, %g7 | ||
479 | wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi | ||
480 | srlx %i0, 6, %g1 | ||
481 | mov %i1, %i0 | ||
482 | mov %i2, %i1 | ||
483 | mov %i3, %l7 | ||
484 | mov %i4, %l6 | ||
485 | 1: ldda [%i1 + 0x00] %asi, %i2 /* %i2/%i3 = src1 + 0x00 */ | ||
486 | ldda [%l7 + 0x00] %asi, %i4 /* %i4/%i5 = src2 + 0x00 */ | ||
487 | ldda [%l6 + 0x00] %asi, %g2 /* %g2/%g3 = src3 + 0x00 */ | ||
488 | ldda [%i0 + 0x00] %asi, %l0 /* %l0/%l1 = dest + 0x00 */ | ||
489 | xor %i4, %i2, %i4 | ||
490 | xor %i5, %i3, %i5 | ||
491 | ldda [%i1 + 0x10] %asi, %i2 /* %i2/%i3 = src1 + 0x10 */ | ||
492 | xor %g2, %i4, %g2 | ||
493 | xor %g3, %i5, %g3 | ||
494 | ldda [%l7 + 0x10] %asi, %i4 /* %i4/%i5 = src2 + 0x10 */ | ||
495 | xor %l0, %g2, %l0 | ||
496 | xor %l1, %g3, %l1 | ||
497 | stxa %l0, [%i0 + 0x00] %asi | ||
498 | stxa %l1, [%i0 + 0x08] %asi | ||
499 | ldda [%l6 + 0x10] %asi, %g2 /* %g2/%g3 = src3 + 0x10 */ | ||
500 | ldda [%i0 + 0x10] %asi, %l0 /* %l0/%l1 = dest + 0x10 */ | ||
501 | |||
502 | xor %i4, %i2, %i4 | ||
503 | xor %i5, %i3, %i5 | ||
504 | ldda [%i1 + 0x20] %asi, %i2 /* %i2/%i3 = src1 + 0x20 */ | ||
505 | xor %g2, %i4, %g2 | ||
506 | xor %g3, %i5, %g3 | ||
507 | ldda [%l7 + 0x20] %asi, %i4 /* %i4/%i5 = src2 + 0x20 */ | ||
508 | xor %l0, %g2, %l0 | ||
509 | xor %l1, %g3, %l1 | ||
510 | stxa %l0, [%i0 + 0x10] %asi | ||
511 | stxa %l1, [%i0 + 0x18] %asi | ||
512 | ldda [%l6 + 0x20] %asi, %g2 /* %g2/%g3 = src3 + 0x20 */ | ||
513 | ldda [%i0 + 0x20] %asi, %l0 /* %l0/%l1 = dest + 0x20 */ | ||
514 | |||
515 | xor %i4, %i2, %i4 | ||
516 | xor %i5, %i3, %i5 | ||
517 | ldda [%i1 + 0x30] %asi, %i2 /* %i2/%i3 = src1 + 0x30 */ | ||
518 | xor %g2, %i4, %g2 | ||
519 | xor %g3, %i5, %g3 | ||
520 | ldda [%l7 + 0x30] %asi, %i4 /* %i4/%i5 = src2 + 0x30 */ | ||
521 | xor %l0, %g2, %l0 | ||
522 | xor %l1, %g3, %l1 | ||
523 | stxa %l0, [%i0 + 0x20] %asi | ||
524 | stxa %l1, [%i0 + 0x28] %asi | ||
525 | ldda [%l6 + 0x30] %asi, %g2 /* %g2/%g3 = src3 + 0x30 */ | ||
526 | ldda [%i0 + 0x30] %asi, %l0 /* %l0/%l1 = dest + 0x30 */ | ||
527 | |||
528 | prefetch [%i1 + 0x40], #one_read | ||
529 | prefetch [%l7 + 0x40], #one_read | ||
530 | prefetch [%l6 + 0x40], #one_read | ||
531 | prefetch [%i0 + 0x40], #n_writes | ||
532 | |||
533 | xor %i4, %i2, %i4 | ||
534 | xor %i5, %i3, %i5 | ||
535 | xor %g2, %i4, %g2 | ||
536 | xor %g3, %i5, %g3 | ||
537 | xor %l0, %g2, %l0 | ||
538 | xor %l1, %g3, %l1 | ||
539 | stxa %l0, [%i0 + 0x30] %asi | ||
540 | stxa %l1, [%i0 + 0x38] %asi | ||
541 | |||
542 | add %i0, 0x40, %i0 | ||
543 | add %i1, 0x40, %i1 | ||
544 | add %l7, 0x40, %l7 | ||
545 | subcc %g1, 1, %g1 | ||
546 | bne,pt %xcc, 1b | ||
547 | add %l6, 0x40, %l6 | ||
548 | membar #Sync | ||
549 | wr %g7, 0x0, %asi | ||
550 | ret | ||
551 | restore | ||
552 | .size xor_niagara_4, .-xor_niagara_4 | ||
553 | |||
554 | .globl xor_niagara_5 | ||
555 | .type xor_niagara_5,#function | ||
556 | xor_niagara_5: /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3, %o5=src4 */ | ||
557 | save %sp, -192, %sp | ||
558 | prefetch [%i1], #n_writes | ||
559 | prefetch [%i2], #one_read | ||
560 | prefetch [%i3], #one_read | ||
561 | prefetch [%i4], #one_read | ||
562 | prefetch [%i5], #one_read | ||
563 | rd %asi, %g7 | ||
564 | wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi | ||
565 | srlx %i0, 6, %g1 | ||
566 | mov %i1, %i0 | ||
567 | mov %i2, %i1 | ||
568 | mov %i3, %l7 | ||
569 | mov %i4, %l6 | ||
570 | mov %i5, %l5 | ||
571 | 1: ldda [%i1 + 0x00] %asi, %i2 /* %i2/%i3 = src1 + 0x00 */ | ||
572 | ldda [%l7 + 0x00] %asi, %i4 /* %i4/%i5 = src2 + 0x00 */ | ||
573 | ldda [%l6 + 0x00] %asi, %g2 /* %g2/%g3 = src3 + 0x00 */ | ||
574 | ldda [%l5 + 0x00] %asi, %l0 /* %l0/%l1 = src4 + 0x00 */ | ||
575 | ldda [%i0 + 0x00] %asi, %l2 /* %l2/%l3 = dest + 0x00 */ | ||
576 | xor %i4, %i2, %i4 | ||
577 | xor %i5, %i3, %i5 | ||
578 | ldda [%i1 + 0x10] %asi, %i2 /* %i2/%i3 = src1 + 0x10 */ | ||
579 | xor %g2, %i4, %g2 | ||
580 | xor %g3, %i5, %g3 | ||
581 | ldda [%l7 + 0x10] %asi, %i4 /* %i4/%i5 = src2 + 0x10 */ | ||
582 | xor %l0, %g2, %l0 | ||
583 | xor %l1, %g3, %l1 | ||
584 | ldda [%l6 + 0x10] %asi, %g2 /* %g2/%g3 = src3 + 0x10 */ | ||
585 | xor %l2, %l0, %l2 | ||
586 | xor %l3, %l1, %l3 | ||
587 | stxa %l2, [%i0 + 0x00] %asi | ||
588 | stxa %l3, [%i0 + 0x08] %asi | ||
589 | ldda [%l5 + 0x10] %asi, %l0 /* %l0/%l1 = src4 + 0x10 */ | ||
590 | ldda [%i0 + 0x10] %asi, %l2 /* %l2/%l3 = dest + 0x10 */ | ||
591 | |||
592 | xor %i4, %i2, %i4 | ||
593 | xor %i5, %i3, %i5 | ||
594 | ldda [%i1 + 0x20] %asi, %i2 /* %i2/%i3 = src1 + 0x20 */ | ||
595 | xor %g2, %i4, %g2 | ||
596 | xor %g3, %i5, %g3 | ||
597 | ldda [%l7 + 0x20] %asi, %i4 /* %i4/%i5 = src2 + 0x20 */ | ||
598 | xor %l0, %g2, %l0 | ||
599 | xor %l1, %g3, %l1 | ||
600 | ldda [%l6 + 0x20] %asi, %g2 /* %g2/%g3 = src3 + 0x20 */ | ||
601 | xor %l2, %l0, %l2 | ||
602 | xor %l3, %l1, %l3 | ||
603 | stxa %l2, [%i0 + 0x10] %asi | ||
604 | stxa %l3, [%i0 + 0x18] %asi | ||
605 | ldda [%l5 + 0x20] %asi, %l0 /* %l0/%l1 = src4 + 0x20 */ | ||
606 | ldda [%i0 + 0x20] %asi, %l2 /* %l2/%l3 = dest + 0x20 */ | ||
607 | |||
608 | xor %i4, %i2, %i4 | ||
609 | xor %i5, %i3, %i5 | ||
610 | ldda [%i1 + 0x30] %asi, %i2 /* %i2/%i3 = src1 + 0x30 */ | ||
611 | xor %g2, %i4, %g2 | ||
612 | xor %g3, %i5, %g3 | ||
613 | ldda [%l7 + 0x30] %asi, %i4 /* %i4/%i5 = src2 + 0x30 */ | ||
614 | xor %l0, %g2, %l0 | ||
615 | xor %l1, %g3, %l1 | ||
616 | ldda [%l6 + 0x30] %asi, %g2 /* %g2/%g3 = src3 + 0x30 */ | ||
617 | xor %l2, %l0, %l2 | ||
618 | xor %l3, %l1, %l3 | ||
619 | stxa %l2, [%i0 + 0x20] %asi | ||
620 | stxa %l3, [%i0 + 0x28] %asi | ||
621 | ldda [%l5 + 0x30] %asi, %l0 /* %l0/%l1 = src4 + 0x30 */ | ||
622 | ldda [%i0 + 0x30] %asi, %l2 /* %l2/%l3 = dest + 0x30 */ | ||
623 | |||
624 | prefetch [%i1 + 0x40], #one_read | ||
625 | prefetch [%l7 + 0x40], #one_read | ||
626 | prefetch [%l6 + 0x40], #one_read | ||
627 | prefetch [%l5 + 0x40], #one_read | ||
628 | prefetch [%i0 + 0x40], #n_writes | ||
629 | |||
630 | xor %i4, %i2, %i4 | ||
631 | xor %i5, %i3, %i5 | ||
632 | xor %g2, %i4, %g2 | ||
633 | xor %g3, %i5, %g3 | ||
634 | xor %l0, %g2, %l0 | ||
635 | xor %l1, %g3, %l1 | ||
636 | xor %l2, %l0, %l2 | ||
637 | xor %l3, %l1, %l3 | ||
638 | stxa %l2, [%i0 + 0x30] %asi | ||
639 | stxa %l3, [%i0 + 0x38] %asi | ||
640 | |||
641 | add %i0, 0x40, %i0 | ||
642 | add %i1, 0x40, %i1 | ||
643 | add %l7, 0x40, %l7 | ||
644 | add %l6, 0x40, %l6 | ||
645 | subcc %g1, 1, %g1 | ||
646 | bne,pt %xcc, 1b | ||
647 | add %l5, 0x40, %l5 | ||
648 | membar #Sync | ||
649 | wr %g7, 0x0, %asi | ||
650 | ret | ||
651 | restore | ||
652 | .size xor_niagara_5, .-xor_niagara_5 | ||
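Stripped of the VIS register file and the block-load/store ASIs, every routine in this file computes the same thing: the destination buffer XORed with each source buffer, 64 bytes per unrolled line. A plain C equivalent of the two-source case (xor_2_ref is a hypothetical name; the assembly variants additionally require dest and sources 64-byte aligned and len a multiple of 128, per the requirements comment at the top of the file):

    #include <stdint.h>

    /* dest ^= src over the whole buffer; bytes is assumed to be a
     * multiple of 8 here, versus the stricter alignment and length
     * requirements of the assembly routines. */
    static void xor_2_ref(unsigned long bytes, uint64_t *dest,
                          const uint64_t *src)
    {
            unsigned long i;

            for (i = 0; i < bytes / sizeof(uint64_t); i++)
                    dest[i] ^= src[i];
    }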