author     Linus Torvalds <torvalds@ppc970.osdl.org>    2005-04-16 18:20:36 -0400
committer  Linus Torvalds <torvalds@ppc970.osdl.org>    2005-04-16 18:20:36 -0400
commit     1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree       0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/alpha/lib
tag        v2.6.12-rc2 (Linux-2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'arch/alpha/lib')
53 files changed, 7901 insertions, 0 deletions
diff --git a/arch/alpha/lib/Makefile b/arch/alpha/lib/Makefile
new file mode 100644
index 000000000000..21cf624d7329
--- /dev/null
+++ b/arch/alpha/lib/Makefile
@@ -0,0 +1,58 @@
1 | # | ||
2 | # Makefile for alpha-specific library files.. | ||
3 | # | ||
4 | |||
5 | EXTRA_AFLAGS := $(CFLAGS) | ||
6 | EXTRA_CFLAGS := -Werror | ||
7 | |||
8 | # Many of these routines have implementations tuned for ev6. | ||
9 | # Choose them iff we're targeting ev6 specifically. | ||
10 | ev6-$(CONFIG_ALPHA_EV6) := ev6- | ||
11 | |||
12 | # Several make use of the cttz instruction introduced in ev67. | ||
13 | ev67-$(CONFIG_ALPHA_EV67) := ev67- | ||
14 | |||
15 | lib-y = __divqu.o __remqu.o __divlu.o __remlu.o \ | ||
16 | udelay.o \ | ||
17 | $(ev6-y)memset.o \ | ||
18 | $(ev6-y)memcpy.o \ | ||
19 | memmove.o \ | ||
20 | checksum.o \ | ||
21 | csum_partial_copy.o \ | ||
22 | $(ev67-y)strlen.o \ | ||
23 | $(ev67-y)strcat.o \ | ||
24 | strcpy.o \ | ||
25 | $(ev67-y)strncat.o \ | ||
26 | strncpy.o \ | ||
27 | $(ev6-y)stxcpy.o \ | ||
28 | $(ev6-y)stxncpy.o \ | ||
29 | $(ev67-y)strchr.o \ | ||
30 | $(ev67-y)strrchr.o \ | ||
31 | $(ev6-y)memchr.o \ | ||
32 | $(ev6-y)copy_user.o \ | ||
33 | $(ev6-y)clear_user.o \ | ||
34 | $(ev6-y)strncpy_from_user.o \ | ||
35 | $(ev67-y)strlen_user.o \ | ||
36 | $(ev6-y)csum_ipv6_magic.o \ | ||
37 | $(ev6-y)clear_page.o \ | ||
38 | $(ev6-y)copy_page.o \ | ||
39 | strcasecmp.o \ | ||
40 | fpreg.o \ | ||
41 | callback_srm.o srm_puts.o srm_printk.o | ||
42 | |||
43 | lib-$(CONFIG_SMP) += dec_and_lock.o | ||
44 | |||
45 | # The division routines are built from single source, with different defines. | ||
46 | AFLAGS___divqu.o = -DDIV | ||
47 | AFLAGS___remqu.o = -DREM | ||
48 | AFLAGS___divlu.o = -DDIV -DINTSIZE | ||
49 | AFLAGS___remlu.o = -DREM -DINTSIZE | ||
50 | |||
51 | $(obj)/__divqu.o: $(obj)/$(ev6-y)divide.S | ||
52 | $(cmd_as_o_S) | ||
53 | $(obj)/__remqu.o: $(obj)/$(ev6-y)divide.S | ||
54 | $(cmd_as_o_S) | ||
55 | $(obj)/__divlu.o: $(obj)/$(ev6-y)divide.S | ||
56 | $(cmd_as_o_S) | ||
57 | $(obj)/__remlu.o: $(obj)/$(ev6-y)divide.S | ||
58 | $(cmd_as_o_S) | ||
diff --git a/arch/alpha/lib/callback_srm.S b/arch/alpha/lib/callback_srm.S
new file mode 100644
index 000000000000..0528acd0d9ad
--- /dev/null
+++ b/arch/alpha/lib/callback_srm.S
@@ -0,0 +1,104 @@
1 | /* | ||
2 | * arch/alpha/lib/callback_srm.S | ||
3 | */ | ||
4 | |||
5 | #include <linux/config.h> | ||
6 | #include <asm/console.h> | ||
7 | |||
8 | .text | ||
9 | #define HWRPB_CRB_OFFSET 0xc0 | ||
10 | |||
11 | #if defined(CONFIG_ALPHA_SRM) || defined(CONFIG_ALPHA_GENERIC) | ||
12 | .align 4 | ||
13 | srm_dispatch: | ||
14 | #if defined(CONFIG_ALPHA_GENERIC) | ||
15 | ldl $4,alpha_using_srm | ||
16 | beq $4,nosrm | ||
17 | #endif | ||
18 | ldq $0,hwrpb # gp is set up by CALLBACK macro. | ||
19 | ldl $25,0($25) # Pick up the wrapper data. | ||
20 | mov $20,$21 # Shift arguments right. | ||
21 | mov $19,$20 | ||
22 | ldq $1,HWRPB_CRB_OFFSET($0) | ||
23 | mov $18,$19 | ||
24 | mov $17,$18 | ||
25 | mov $16,$17 | ||
26 | addq $0,$1,$2 # CRB address | ||
27 | ldq $27,0($2) # DISPATCH procedure descriptor (VMS call std) | ||
28 | extwl $25,0,$16 # SRM callback function code | ||
29 | ldq $3,8($27) # call address | ||
30 | extwl $25,2,$25 # argument information (VMS calling std) | ||
31 | jmp ($3) # Return directly to caller of wrapper. | ||
32 | |||
33 | .align 4 | ||
34 | .globl srm_fixup | ||
35 | .ent srm_fixup | ||
36 | srm_fixup: | ||
37 | ldgp $29,0($27) | ||
38 | #if defined(CONFIG_ALPHA_GENERIC) | ||
39 | ldl $4,alpha_using_srm | ||
40 | beq $4,nosrm | ||
41 | #endif | ||
42 | ldq $0,hwrpb | ||
43 | ldq $1,HWRPB_CRB_OFFSET($0) | ||
44 | addq $0,$1,$2 # CRB address | ||
45 | ldq $27,16($2) # VA of FIXUP procedure descriptor | ||
46 | ldq $3,8($27) # call address | ||
47 | lda $25,2($31) # two integer arguments | ||
48 | jmp ($3) # Return directly to caller of srm_fixup. | ||
49 | .end srm_fixup | ||
50 | |||
51 | #if defined(CONFIG_ALPHA_GENERIC) | ||
52 | .align 3 | ||
53 | nosrm: | ||
54 | lda $0,-1($31) | ||
55 | ret | ||
56 | #endif | ||
57 | |||
58 | #define CALLBACK(NAME, CODE, ARG_CNT) \ | ||
59 | .align 4; .globl callback_##NAME; .ent callback_##NAME; callback_##NAME##: \ | ||
60 | ldgp $29,0($27); br $25,srm_dispatch; .word CODE, ARG_CNT; .end callback_##NAME | ||
61 | |||
62 | #else /* defined(CONFIG_ALPHA_SRM) || defined(CONFIG_ALPHA_GENERIC) */ | ||
63 | |||
64 | #define CALLBACK(NAME, CODE, ARG_CNT) \ | ||
65 | .align 3; .globl callback_##NAME; .ent callback_##NAME; callback_##NAME##: \ | ||
66 | lda $0,-1($31); ret; .end callback_##NAME | ||
67 | |||
68 | .align 3 | ||
69 | .globl srm_fixup | ||
70 | .ent srm_fixup | ||
71 | srm_fixup: | ||
72 | lda $0,-1($31) | ||
73 | ret | ||
74 | .end srm_fixup | ||
75 | #endif /* defined(CONFIG_ALPHA_SRM) || defined(CONFIG_ALPHA_GENERIC) */ | ||
76 | |||
77 | CALLBACK(puts, CCB_PUTS, 4) | ||
78 | CALLBACK(open, CCB_OPEN, 3) | ||
79 | CALLBACK(close, CCB_CLOSE, 2) | ||
80 | CALLBACK(read, CCB_READ, 5) | ||
81 | CALLBACK(open_console, CCB_OPEN_CONSOLE, 1) | ||
82 | CALLBACK(close_console, CCB_CLOSE_CONSOLE, 1) | ||
83 | CALLBACK(getenv, CCB_GET_ENV, 4) | ||
84 | CALLBACK(setenv, CCB_SET_ENV, 4) | ||
85 | CALLBACK(getc, CCB_GETC, 2) | ||
86 | CALLBACK(reset_term, CCB_RESET_TERM, 2) | ||
87 | CALLBACK(term_int, CCB_SET_TERM_INT, 3) | ||
88 | CALLBACK(term_ctl, CCB_SET_TERM_CTL, 3) | ||
89 | CALLBACK(process_keycode, CCB_PROCESS_KEYCODE, 3) | ||
90 | CALLBACK(ioctl, CCB_IOCTL, 6) | ||
91 | CALLBACK(write, CCB_WRITE, 5) | ||
92 | CALLBACK(reset_env, CCB_RESET_ENV, 4) | ||
93 | CALLBACK(save_env, CCB_SAVE_ENV, 1) | ||
94 | CALLBACK(pswitch, CCB_PSWITCH, 3) | ||
95 | CALLBACK(bios_emul, CCB_BIOS_EMUL, 5) | ||
96 | |||
97 | .data | ||
98 | __alpha_using_srm: # For use by bootpheader | ||
99 | .long 7 # value is not 1 for link debugging | ||
100 | .weak alpha_using_srm; alpha_using_srm = __alpha_using_srm | ||
101 | __callback_init_done: # For use by bootpheader | ||
102 | .long 7 # value is not 1 for link debugging | ||
103 | .weak callback_init_done; callback_init_done = __callback_init_done | ||
104 | |||
diff --git a/arch/alpha/lib/checksum.c b/arch/alpha/lib/checksum.c
new file mode 100644
index 000000000000..89044e6385fe
--- /dev/null
+++ b/arch/alpha/lib/checksum.c
@@ -0,0 +1,186 @@
1 | /* | ||
2 | * arch/alpha/lib/checksum.c | ||
3 | * | ||
4 | * This file contains network checksum routines that are better done | ||
5 | * in an architecture-specific manner due to speed.. | ||
6 | * Comments in other versions indicate that the algorithms are from RFC1071 | ||
7 | * | ||
8 | * accelerated versions (and 21264 assembly versions) contributed by | ||
9 | * Rick Gorton <rick.gorton@alpha-processor.com> | ||
10 | */ | ||
11 | |||
12 | #include <linux/module.h> | ||
13 | #include <linux/string.h> | ||
14 | |||
15 | #include <asm/byteorder.h> | ||
16 | |||
17 | static inline unsigned short from64to16(unsigned long x) | ||
18 | { | ||
19 | /* Using extract instructions is a bit more efficient | ||
20 | than the original shift/bitmask version. */ | ||
21 | |||
22 | union { | ||
23 | unsigned long ul; | ||
24 | unsigned int ui[2]; | ||
25 | unsigned short us[4]; | ||
26 | } in_v, tmp_v, out_v; | ||
27 | |||
28 | in_v.ul = x; | ||
29 | tmp_v.ul = (unsigned long) in_v.ui[0] + (unsigned long) in_v.ui[1]; | ||
30 | |||
31 | /* Since the bits of tmp_v.sh[3] are going to always be zero, | ||
32 | we don't have to bother to add that in. */ | ||
33 | out_v.ul = (unsigned long) tmp_v.us[0] + (unsigned long) tmp_v.us[1] | ||
34 | + (unsigned long) tmp_v.us[2]; | ||
35 | |||
36 | /* Similarly, out_v.us[2] is always zero for the final add. */ | ||
37 | return out_v.us[0] + out_v.us[1]; | ||
38 | } | ||
39 | |||
40 | /* | ||
41 | * computes the checksum of the TCP/UDP pseudo-header | ||
42 | * returns a 16-bit checksum, already complemented. | ||
43 | */ | ||
44 | unsigned short int csum_tcpudp_magic(unsigned long saddr, | ||
45 | unsigned long daddr, | ||
46 | unsigned short len, | ||
47 | unsigned short proto, | ||
48 | unsigned int sum) | ||
49 | { | ||
50 | return ~from64to16(saddr + daddr + sum + | ||
51 | ((unsigned long) ntohs(len) << 16) + | ||
52 | ((unsigned long) proto << 8)); | ||
53 | } | ||
54 | |||
55 | unsigned int csum_tcpudp_nofold(unsigned long saddr, | ||
56 | unsigned long daddr, | ||
57 | unsigned short len, | ||
58 | unsigned short proto, | ||
59 | unsigned int sum) | ||
60 | { | ||
61 | unsigned long result; | ||
62 | |||
63 | result = (saddr + daddr + sum + | ||
64 | ((unsigned long) ntohs(len) << 16) + | ||
65 | ((unsigned long) proto << 8)); | ||
66 | |||
67 | /* Fold down to 32-bits so we don't lose in the typedef-less | ||
68 | network stack. */ | ||
69 | /* 64 to 33 */ | ||
70 | result = (result & 0xffffffff) + (result >> 32); | ||
71 | /* 33 to 32 */ | ||
72 | result = (result & 0xffffffff) + (result >> 32); | ||
73 | return result; | ||
74 | } | ||
75 | |||
76 | /* | ||
77 | * Do a 64-bit checksum on an arbitrary memory area.. | ||
78 | * | ||
79 | * This isn't a great routine, but it's not _horrible_ either. The | ||
80 | * inner loop could be unrolled a bit further, and there are better | ||
81 | * ways to do the carry, but this is reasonable. | ||
82 | */ | ||
83 | static inline unsigned long do_csum(const unsigned char * buff, int len) | ||
84 | { | ||
85 | int odd, count; | ||
86 | unsigned long result = 0; | ||
87 | |||
88 | if (len <= 0) | ||
89 | goto out; | ||
90 | odd = 1 & (unsigned long) buff; | ||
91 | if (odd) { | ||
92 | result = *buff << 8; | ||
93 | len--; | ||
94 | buff++; | ||
95 | } | ||
96 | count = len >> 1; /* nr of 16-bit words.. */ | ||
97 | if (count) { | ||
98 | if (2 & (unsigned long) buff) { | ||
99 | result += *(unsigned short *) buff; | ||
100 | count--; | ||
101 | len -= 2; | ||
102 | buff += 2; | ||
103 | } | ||
104 | count >>= 1; /* nr of 32-bit words.. */ | ||
105 | if (count) { | ||
106 | if (4 & (unsigned long) buff) { | ||
107 | result += *(unsigned int *) buff; | ||
108 | count--; | ||
109 | len -= 4; | ||
110 | buff += 4; | ||
111 | } | ||
112 | count >>= 1; /* nr of 64-bit words.. */ | ||
113 | if (count) { | ||
114 | unsigned long carry = 0; | ||
115 | do { | ||
116 | unsigned long w = *(unsigned long *) buff; | ||
117 | count--; | ||
118 | buff += 8; | ||
119 | result += carry; | ||
120 | result += w; | ||
121 | carry = (w > result); | ||
122 | } while (count); | ||
123 | result += carry; | ||
124 | result = (result & 0xffffffff) + (result >> 32); | ||
125 | } | ||
126 | if (len & 4) { | ||
127 | result += *(unsigned int *) buff; | ||
128 | buff += 4; | ||
129 | } | ||
130 | } | ||
131 | if (len & 2) { | ||
132 | result += *(unsigned short *) buff; | ||
133 | buff += 2; | ||
134 | } | ||
135 | } | ||
136 | if (len & 1) | ||
137 | result += *buff; | ||
138 | result = from64to16(result); | ||
139 | if (odd) | ||
140 | result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); | ||
141 | out: | ||
142 | return result; | ||
143 | } | ||
144 | |||
145 | /* | ||
146 | * This is a version of ip_compute_csum() optimized for IP headers, | ||
147 | * which always checksum on 4 octet boundaries. | ||
148 | */ | ||
149 | unsigned short ip_fast_csum(unsigned char * iph, unsigned int ihl) | ||
150 | { | ||
151 | return ~do_csum(iph,ihl*4); | ||
152 | } | ||
153 | |||
154 | /* | ||
155 | * computes the checksum of a memory block at buff, length len, | ||
156 | * and adds in "sum" (32-bit) | ||
157 | * | ||
158 | * returns a 32-bit number suitable for feeding into itself | ||
159 | * or csum_tcpudp_magic | ||
160 | * | ||
161 | * this function must be called with even lengths, except | ||
162 | * for the last fragment, which may be odd | ||
163 | * | ||
164 | * it's best to have buff aligned on a 32-bit boundary | ||
165 | */ | ||
166 | unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) | ||
167 | { | ||
168 | unsigned long result = do_csum(buff, len); | ||
169 | |||
170 | /* add in old sum, and carry.. */ | ||
171 | result += sum; | ||
172 | /* 32+c bits -> 32 bits */ | ||
173 | result = (result & 0xffffffff) + (result >> 32); | ||
174 | return result; | ||
175 | } | ||
176 | |||
177 | EXPORT_SYMBOL(csum_partial); | ||
178 | |||
179 | /* | ||
180 | * this routine is used for miscellaneous IP-like checksums, mainly | ||
181 | * in icmp.c | ||
182 | */ | ||
183 | unsigned short ip_compute_csum(unsigned char * buff, int len) | ||
184 | { | ||
185 | return ~from64to16(do_csum(buff,len)); | ||
186 | } | ||
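
The comment inside from64to16() above contrasts its union/extract trick with "the original shift/bitmask version". For readers who want to see what that refers to, here is a minimal sketch of the conventional shift-and-mask fold; the helper name is invented and the code is illustrative only, not part of the kernel file.

/* Illustrative only: the plain shift/mask fold that from64to16()'s
 * union trick replaces.  Each step adds the high part back into the
 * low part, so no carries are lost.
 */
static unsigned short fold64to16_plain(unsigned long x)
{
	x = (x & 0xffffffff) + (x >> 32);	/* 64 -> 33 bits */
	x = (x & 0xffff) + (x >> 16);		/* 33 -> 18 bits */
	x = (x & 0xffff) + (x >> 16);		/* 18 -> 17 bits */
	x = (x & 0xffff) + (x >> 16);		/* 17 -> 16 bits */
	return (unsigned short) x;
}

The union form reaches the same result while letting the compiler use Alpha's extract instructions instead of explicit shifts and masks, which is the "bit more efficient" claim in the comment.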
diff --git a/arch/alpha/lib/clear_page.S b/arch/alpha/lib/clear_page.S
new file mode 100644
index 000000000000..a221ae266e29
--- /dev/null
+++ b/arch/alpha/lib/clear_page.S
@@ -0,0 +1,39 @@
1 | /* | ||
2 | * arch/alpha/lib/clear_page.S | ||
3 | * | ||
4 | * Zero an entire page. | ||
5 | */ | ||
6 | |||
7 | .text | ||
8 | .align 4 | ||
9 | .global clear_page | ||
10 | .ent clear_page | ||
11 | clear_page: | ||
12 | .prologue 0 | ||
13 | |||
14 | lda $0,128 | ||
15 | nop | ||
16 | unop | ||
17 | nop | ||
18 | |||
19 | 1: stq $31,0($16) | ||
20 | stq $31,8($16) | ||
21 | stq $31,16($16) | ||
22 | stq $31,24($16) | ||
23 | |||
24 | stq $31,32($16) | ||
25 | stq $31,40($16) | ||
26 | stq $31,48($16) | ||
27 | subq $0,1,$0 | ||
28 | |||
29 | stq $31,56($16) | ||
30 | addq $16,64,$16 | ||
31 | unop | ||
32 | bne $0,1b | ||
33 | |||
34 | ret | ||
35 | nop | ||
36 | unop | ||
37 | nop | ||
38 | |||
39 | .end clear_page | ||
diff --git a/arch/alpha/lib/clear_user.S b/arch/alpha/lib/clear_user.S
new file mode 100644
index 000000000000..8860316c1957
--- /dev/null
+++ b/arch/alpha/lib/clear_user.S
@@ -0,0 +1,113 @@
1 | /* | ||
2 | * arch/alpha/lib/clear_user.S | ||
3 | * Contributed by Richard Henderson <rth@tamu.edu> | ||
4 | * | ||
5 | * Zero user space, handling exceptions as we go. | ||
6 | * | ||
7 | * We have to make sure that $0 is always up-to-date and contains the | ||
8 | * right "bytes left to zero" value (and that it is updated only _after_ | ||
9 | * a successful copy). There is also some rather minor exception setup | ||
10 | * stuff. | ||
11 | * | ||
12 | * NOTE! This is not directly C-callable, because the calling semantics | ||
13 | * are different: | ||
14 | * | ||
15 | * Inputs: | ||
16 | * length in $0 | ||
17 | * destination address in $6 | ||
18 | * exception pointer in $7 | ||
19 | * return address in $28 (exceptions expect it there) | ||
20 | * | ||
21 | * Outputs: | ||
22 | * bytes left to copy in $0 | ||
23 | * | ||
24 | * Clobbers: | ||
25 | * $1,$2,$3,$4,$5,$6 | ||
26 | */ | ||
27 | |||
28 | /* Allow an exception for an insn; exit if we get one. */ | ||
29 | #define EX(x,y...) \ | ||
30 | 99: x,##y; \ | ||
31 | .section __ex_table,"a"; \ | ||
32 | .long 99b - .; \ | ||
33 | lda $31, $exception-99b($31); \ | ||
34 | .previous | ||
35 | |||
36 | .set noat | ||
37 | .set noreorder | ||
38 | .align 4 | ||
39 | |||
40 | .globl __do_clear_user | ||
41 | .ent __do_clear_user | ||
42 | .frame $30, 0, $28 | ||
43 | .prologue 0 | ||
44 | |||
45 | $loop: | ||
46 | and $1, 3, $4 # e0 : | ||
47 | beq $4, 1f # .. e1 : | ||
48 | |||
49 | 0: EX( stq_u $31, 0($6) ) # e0 : zero one word | ||
50 | subq $0, 8, $0 # .. e1 : | ||
51 | subq $4, 1, $4 # e0 : | ||
52 | addq $6, 8, $6 # .. e1 : | ||
53 | bne $4, 0b # e1 : | ||
54 | unop # : | ||
55 | |||
56 | 1: bic $1, 3, $1 # e0 : | ||
57 | beq $1, $tail # .. e1 : | ||
58 | |||
59 | 2: EX( stq_u $31, 0($6) ) # e0 : zero four words | ||
60 | subq $0, 8, $0 # .. e1 : | ||
61 | EX( stq_u $31, 8($6) ) # e0 : | ||
62 | subq $0, 8, $0 # .. e1 : | ||
63 | EX( stq_u $31, 16($6) ) # e0 : | ||
64 | subq $0, 8, $0 # .. e1 : | ||
65 | EX( stq_u $31, 24($6) ) # e0 : | ||
66 | subq $0, 8, $0 # .. e1 : | ||
67 | subq $1, 4, $1 # e0 : | ||
68 | addq $6, 32, $6 # .. e1 : | ||
69 | bne $1, 2b # e1 : | ||
70 | |||
71 | $tail: | ||
72 | bne $2, 1f # e1 : is there a tail to do? | ||
73 | ret $31, ($28), 1 # .. e1 : | ||
74 | |||
75 | 1: EX( ldq_u $5, 0($6) ) # e0 : | ||
76 | clr $0 # .. e1 : | ||
77 | nop # e1 : | ||
78 | mskqh $5, $0, $5 # e0 : | ||
79 | EX( stq_u $5, 0($6) ) # e0 : | ||
80 | ret $31, ($28), 1 # .. e1 : | ||
81 | |||
82 | __do_clear_user: | ||
83 | and $6, 7, $4 # e0 : find dest misalignment | ||
84 | beq $0, $zerolength # .. e1 : | ||
85 | addq $0, $4, $1 # e0 : bias counter | ||
86 | and $1, 7, $2 # e1 : number of bytes in tail | ||
87 | srl $1, 3, $1 # e0 : | ||
88 | beq $4, $loop # .. e1 : | ||
89 | |||
90 | EX( ldq_u $5, 0($6) ) # e0 : load dst word to mask back in | ||
91 | beq $1, $oneword # .. e1 : sub-word store? | ||
92 | |||
93 | mskql $5, $6, $5 # e0 : take care of misaligned head | ||
94 | addq $6, 8, $6 # .. e1 : | ||
95 | EX( stq_u $5, -8($6) ) # e0 : | ||
96 | addq $0, $4, $0 # .. e1 : bytes left -= 8 - misalignment | ||
97 | subq $1, 1, $1 # e0 : | ||
98 | subq $0, 8, $0 # .. e1 : | ||
99 | br $loop # e1 : | ||
100 | unop # : | ||
101 | |||
102 | $oneword: | ||
103 | mskql $5, $6, $4 # e0 : | ||
104 | mskqh $5, $2, $5 # e0 : | ||
105 | or $5, $4, $5 # e1 : | ||
106 | EX( stq_u $5, 0($6) ) # e0 : | ||
107 | clr $0 # .. e1 : | ||
108 | |||
109 | $zerolength: | ||
110 | $exception: | ||
111 | ret $31, ($28), 1 # .. e1 : | ||
112 | |||
113 | .end __do_clear_user | ||
diff --git a/arch/alpha/lib/copy_page.S b/arch/alpha/lib/copy_page.S
new file mode 100644
index 000000000000..9f3b97459cc6
--- /dev/null
+++ b/arch/alpha/lib/copy_page.S
@@ -0,0 +1,49 @@
1 | /* | ||
2 | * arch/alpha/lib/copy_page.S | ||
3 | * | ||
4 | * Copy an entire page. | ||
5 | */ | ||
6 | |||
7 | .text | ||
8 | .align 4 | ||
9 | .global copy_page | ||
10 | .ent copy_page | ||
11 | copy_page: | ||
12 | .prologue 0 | ||
13 | |||
14 | lda $18,128 | ||
15 | nop | ||
16 | unop | ||
17 | nop | ||
18 | |||
19 | 1: ldq $0,0($17) | ||
20 | ldq $1,8($17) | ||
21 | ldq $2,16($17) | ||
22 | ldq $3,24($17) | ||
23 | |||
24 | ldq $4,32($17) | ||
25 | ldq $5,40($17) | ||
26 | ldq $6,48($17) | ||
27 | ldq $7,56($17) | ||
28 | |||
29 | stq $0,0($16) | ||
30 | subq $18,1,$18 | ||
31 | stq $1,8($16) | ||
32 | addq $17,64,$17 | ||
33 | |||
34 | stq $2,16($16) | ||
35 | stq $3,24($16) | ||
36 | stq $4,32($16) | ||
37 | stq $5,40($16) | ||
38 | |||
39 | stq $6,48($16) | ||
40 | stq $7,56($16) | ||
41 | addq $16,64,$16 | ||
42 | bne $18, 1b | ||
43 | |||
44 | ret | ||
45 | nop | ||
46 | unop | ||
47 | nop | ||
48 | |||
49 | .end copy_page | ||
diff --git a/arch/alpha/lib/copy_user.S b/arch/alpha/lib/copy_user.S
new file mode 100644
index 000000000000..6f3fab9eb434
--- /dev/null
+++ b/arch/alpha/lib/copy_user.S
@@ -0,0 +1,145 @@
1 | /* | ||
2 | * arch/alpha/lib/copy_user.S | ||
3 | * | ||
4 | * Copy to/from user space, handling exceptions as we go.. This | ||
5 | * isn't exactly pretty. | ||
6 | * | ||
7 | * This is essentially the same as "memcpy()", but with a few twists. | ||
8 | * Notably, we have to make sure that $0 is always up-to-date and | ||
9 | * contains the right "bytes left to copy" value (and that it is updated | ||
10 | * only _after_ a successful copy). There is also some rather minor | ||
11 | * exception setup stuff.. | ||
12 | * | ||
13 | * NOTE! This is not directly C-callable, because the calling semantics are | ||
14 | * different: | ||
15 | * | ||
16 | * Inputs: | ||
17 | * length in $0 | ||
18 | * destination address in $6 | ||
19 | * source address in $7 | ||
20 | * return address in $28 | ||
21 | * | ||
22 | * Outputs: | ||
23 | * bytes left to copy in $0 | ||
24 | * | ||
25 | * Clobbers: | ||
26 | * $1,$2,$3,$4,$5,$6,$7 | ||
27 | */ | ||
28 | |||
29 | /* Allow an exception for an insn; exit if we get one. */ | ||
30 | #define EXI(x,y...) \ | ||
31 | 99: x,##y; \ | ||
32 | .section __ex_table,"a"; \ | ||
33 | .long 99b - .; \ | ||
34 | lda $31, $exitin-99b($31); \ | ||
35 | .previous | ||
36 | |||
37 | #define EXO(x,y...) \ | ||
38 | 99: x,##y; \ | ||
39 | .section __ex_table,"a"; \ | ||
40 | .long 99b - .; \ | ||
41 | lda $31, $exitout-99b($31); \ | ||
42 | .previous | ||
43 | |||
44 | .set noat | ||
45 | .align 4 | ||
46 | .globl __copy_user | ||
47 | .ent __copy_user | ||
48 | __copy_user: | ||
49 | .prologue 0 | ||
50 | and $6,7,$3 | ||
51 | beq $0,$35 | ||
52 | beq $3,$36 | ||
53 | subq $3,8,$3 | ||
54 | .align 4 | ||
55 | $37: | ||
56 | EXI( ldq_u $1,0($7) ) | ||
57 | EXO( ldq_u $2,0($6) ) | ||
58 | extbl $1,$7,$1 | ||
59 | mskbl $2,$6,$2 | ||
60 | insbl $1,$6,$1 | ||
61 | addq $3,1,$3 | ||
62 | bis $1,$2,$1 | ||
63 | EXO( stq_u $1,0($6) ) | ||
64 | subq $0,1,$0 | ||
65 | addq $6,1,$6 | ||
66 | addq $7,1,$7 | ||
67 | beq $0,$41 | ||
68 | bne $3,$37 | ||
69 | $36: | ||
70 | and $7,7,$1 | ||
71 | bic $0,7,$4 | ||
72 | beq $1,$43 | ||
73 | beq $4,$48 | ||
74 | EXI( ldq_u $3,0($7) ) | ||
75 | .align 4 | ||
76 | $50: | ||
77 | EXI( ldq_u $2,8($7) ) | ||
78 | subq $4,8,$4 | ||
79 | extql $3,$7,$3 | ||
80 | extqh $2,$7,$1 | ||
81 | bis $3,$1,$1 | ||
82 | EXO( stq $1,0($6) ) | ||
83 | addq $7,8,$7 | ||
84 | subq $0,8,$0 | ||
85 | addq $6,8,$6 | ||
86 | bis $2,$2,$3 | ||
87 | bne $4,$50 | ||
88 | $48: | ||
89 | beq $0,$41 | ||
90 | .align 4 | ||
91 | $57: | ||
92 | EXI( ldq_u $1,0($7) ) | ||
93 | EXO( ldq_u $2,0($6) ) | ||
94 | extbl $1,$7,$1 | ||
95 | mskbl $2,$6,$2 | ||
96 | insbl $1,$6,$1 | ||
97 | bis $1,$2,$1 | ||
98 | EXO( stq_u $1,0($6) ) | ||
99 | subq $0,1,$0 | ||
100 | addq $6,1,$6 | ||
101 | addq $7,1,$7 | ||
102 | bne $0,$57 | ||
103 | br $31,$41 | ||
104 | .align 4 | ||
105 | $43: | ||
106 | beq $4,$65 | ||
107 | .align 4 | ||
108 | $66: | ||
109 | EXI( ldq $1,0($7) ) | ||
110 | subq $4,8,$4 | ||
111 | EXO( stq $1,0($6) ) | ||
112 | addq $7,8,$7 | ||
113 | subq $0,8,$0 | ||
114 | addq $6,8,$6 | ||
115 | bne $4,$66 | ||
116 | $65: | ||
117 | beq $0,$41 | ||
118 | EXI( ldq $2,0($7) ) | ||
119 | EXO( ldq $1,0($6) ) | ||
120 | mskql $2,$0,$2 | ||
121 | mskqh $1,$0,$1 | ||
122 | bis $2,$1,$2 | ||
123 | EXO( stq $2,0($6) ) | ||
124 | bis $31,$31,$0 | ||
125 | $41: | ||
126 | $35: | ||
127 | $exitout: | ||
128 | ret $31,($28),1 | ||
129 | |||
130 | $exitin: | ||
131 | /* A stupid byte-by-byte zeroing of the rest of the output | ||
132 | buffer. This cures security holes by never leaving | ||
133 | random kernel data around to be copied elsewhere. */ | ||
134 | |||
135 | mov $0,$1 | ||
136 | $101: | ||
137 | EXO ( ldq_u $2,0($6) ) | ||
138 | subq $1,1,$1 | ||
139 | mskbl $2,$6,$2 | ||
140 | EXO ( stq_u $2,0($6) ) | ||
141 | addq $6,1,$6 | ||
142 | bgt $1,$101 | ||
143 | ret $31,($28),1 | ||
144 | |||
145 | .end __copy_user | ||
diff --git a/arch/alpha/lib/csum_ipv6_magic.S b/arch/alpha/lib/csum_ipv6_magic.S
new file mode 100644
index 000000000000..e09748dbf2ed
--- /dev/null
+++ b/arch/alpha/lib/csum_ipv6_magic.S
@@ -0,0 +1,92 @@
1 | /* | ||
2 | * arch/alpha/lib/csum_ipv6_magic.S | ||
3 | * Contributed by Richard Henderson <rth@tamu.edu> | ||
4 | * | ||
5 | * unsigned short csum_ipv6_magic(struct in6_addr *saddr, | ||
6 | * struct in6_addr *daddr, | ||
7 | * __u32 len, | ||
8 | * unsigned short proto, | ||
9 | * unsigned int csum); | ||
10 | */ | ||
11 | |||
12 | .globl csum_ipv6_magic | ||
13 | .align 4 | ||
14 | .ent csum_ipv6_magic | ||
15 | .frame $30,0,$26,0 | ||
16 | csum_ipv6_magic: | ||
17 | .prologue 0 | ||
18 | |||
19 | ldq $0,0($16) # e0 : load src & dst addr words | ||
20 | zapnot $20,15,$20 # .. e1 : zero extend incoming csum | ||
21 | extqh $18,1,$4 # e0 : byte swap len & proto while we wait | ||
22 | ldq $1,8($16) # .. e1 : | ||
23 | |||
24 | extbl $18,1,$5 # e0 : | ||
25 | ldq $2,0($17) # .. e1 : | ||
26 | extbl $18,2,$6 # e0 : | ||
27 | ldq $3,8($17) # .. e1 : | ||
28 | |||
29 | extbl $18,3,$18 # e0 : | ||
30 | sra $4,32,$4 # e0 : | ||
31 | sll $5,16,$5 # e0 : | ||
32 | addq $20,$0,$20 # .. e1 : begin summing the words | ||
33 | |||
34 | sll $6,8,$6 # e0 : | ||
35 | cmpult $20,$0,$0 # .. e1 : | ||
36 | extwh $19,7,$7 # e0 : | ||
37 | or $4,$18,$18 # .. e1 : | ||
38 | |||
39 | extbl $19,1,$19 # e0 : | ||
40 | or $5,$6,$5 # .. e1 : | ||
41 | or $18,$5,$18 # e0 : len complete | ||
42 | or $19,$7,$19 # .. e1 : | ||
43 | |||
44 | sll $19,48,$19 # e0 : | ||
45 | addq $20,$1,$20 # .. e1 : | ||
46 | sra $19,32,$19 # e0 : proto complete | ||
47 | cmpult $20,$1,$1 # .. e1 : | ||
48 | |||
49 | nop # e0 : | ||
50 | addq $20,$2,$20 # .. e1 : | ||
51 | cmpult $20,$2,$2 # e0 : | ||
52 | addq $20,$3,$20 # .. e1 : | ||
53 | |||
54 | cmpult $20,$3,$3 # e0 : | ||
55 | addq $20,$18,$20 # .. e1 : | ||
56 | cmpult $20,$18,$18 # e0 : | ||
57 | addq $20,$19,$20 # .. e1 : | ||
58 | |||
59 | cmpult $20,$19,$19 # e0 : | ||
60 | addq $0,$1,$0 # .. e1 : merge the carries back into the csum | ||
61 | addq $2,$3,$2 # e0 : | ||
62 | addq $18,$19,$18 # .. e1 : | ||
63 | |||
64 | addq $0,$2,$0 # e0 : | ||
65 | addq $20,$18,$20 # .. e1 : | ||
66 | addq $0,$20,$0 # e0 : | ||
67 | unop # : | ||
68 | |||
69 | extwl $0,2,$2 # e0 : begin folding the 64-bit value | ||
70 | zapnot $0,3,$3 # .. e1 : | ||
71 | extwl $0,4,$1 # e0 : | ||
72 | addq $2,$3,$3 # .. e1 : | ||
73 | |||
74 | extwl $0,6,$0 # e0 : | ||
75 | addq $3,$1,$3 # .. e1 : | ||
76 | addq $0,$3,$0 # e0 : | ||
77 | unop # : | ||
78 | |||
79 | extwl $0,2,$1 # e0 : fold 18-bit value | ||
80 | zapnot $0,3,$0 # .. e1 : | ||
81 | addq $0,$1,$0 # e0 : | ||
82 | unop # : | ||
83 | |||
84 | extwl $0,2,$1 # e0 : fold 17-bit value | ||
85 | zapnot $0,3,$0 # .. e1 : | ||
86 | addq $0,$1,$0 # e0 : | ||
87 | not $0,$0 # e1 : and complement. | ||
88 | |||
89 | zapnot $0,3,$0 # e0 : | ||
90 | ret # .. e1 : | ||
91 | |||
92 | .end csum_ipv6_magic | ||
diff --git a/arch/alpha/lib/csum_partial_copy.c b/arch/alpha/lib/csum_partial_copy.c
new file mode 100644
index 000000000000..a37948f3037a
--- /dev/null
+++ b/arch/alpha/lib/csum_partial_copy.c
@@ -0,0 +1,391 @@
1 | /* | ||
2 | * csum_partial_copy - do IP checksumming and copy | ||
3 | * | ||
4 | * (C) Copyright 1996 Linus Torvalds | ||
5 | * accelerated versions (and 21264 assembly versions) contributed by | ||
6 | * Rick Gorton <rick.gorton@alpha-processor.com> | ||
7 | * | ||
8 | * Don't look at this too closely - you'll go mad. The things | ||
9 | * we do for performance.. | ||
10 | */ | ||
11 | |||
12 | #include <linux/types.h> | ||
13 | #include <linux/string.h> | ||
14 | #include <asm/uaccess.h> | ||
15 | |||
16 | |||
17 | #define ldq_u(x,y) \ | ||
18 | __asm__ __volatile__("ldq_u %0,%1":"=r" (x):"m" (*(const unsigned long *)(y))) | ||
19 | |||
20 | #define stq_u(x,y) \ | ||
21 | __asm__ __volatile__("stq_u %1,%0":"=m" (*(unsigned long *)(y)):"r" (x)) | ||
22 | |||
23 | #define extql(x,y,z) \ | ||
24 | __asm__ __volatile__("extql %1,%2,%0":"=r" (z):"r" (x),"r" (y)) | ||
25 | |||
26 | #define extqh(x,y,z) \ | ||
27 | __asm__ __volatile__("extqh %1,%2,%0":"=r" (z):"r" (x),"r" (y)) | ||
28 | |||
29 | #define mskql(x,y,z) \ | ||
30 | __asm__ __volatile__("mskql %1,%2,%0":"=r" (z):"r" (x),"r" (y)) | ||
31 | |||
32 | #define mskqh(x,y,z) \ | ||
33 | __asm__ __volatile__("mskqh %1,%2,%0":"=r" (z):"r" (x),"r" (y)) | ||
34 | |||
35 | #define insql(x,y,z) \ | ||
36 | __asm__ __volatile__("insql %1,%2,%0":"=r" (z):"r" (x),"r" (y)) | ||
37 | |||
38 | #define insqh(x,y,z) \ | ||
39 | __asm__ __volatile__("insqh %1,%2,%0":"=r" (z):"r" (x),"r" (y)) | ||
40 | |||
41 | |||
42 | #define __get_user_u(x,ptr) \ | ||
43 | ({ \ | ||
44 | long __guu_err; \ | ||
45 | __asm__ __volatile__( \ | ||
46 | "1: ldq_u %0,%2\n" \ | ||
47 | "2:\n" \ | ||
48 | ".section __ex_table,\"a\"\n" \ | ||
49 | " .long 1b - .\n" \ | ||
50 | " lda %0,2b-1b(%1)\n" \ | ||
51 | ".previous" \ | ||
52 | : "=r"(x), "=r"(__guu_err) \ | ||
53 | : "m"(__m(ptr)), "1"(0)); \ | ||
54 | __guu_err; \ | ||
55 | }) | ||
56 | |||
57 | #define __put_user_u(x,ptr) \ | ||
58 | ({ \ | ||
59 | long __puu_err; \ | ||
60 | __asm__ __volatile__( \ | ||
61 | "1: stq_u %2,%1\n" \ | ||
62 | "2:\n" \ | ||
63 | ".section __ex_table,\"a\"\n" \ | ||
64 | " .long 1b - ." \ | ||
65 | " lda $31,2b-1b(%0)\n" \ | ||
66 | ".previous" \ | ||
67 | : "=r"(__puu_err) \ | ||
68 | : "m"(__m(addr)), "rJ"(x), "0"(0)); \ | ||
69 | __puu_err; \ | ||
70 | }) | ||
71 | |||
72 | |||
73 | static inline unsigned short from64to16(unsigned long x) | ||
74 | { | ||
75 | /* Using extract instructions is a bit more efficient | ||
76 | than the original shift/bitmask version. */ | ||
77 | |||
78 | union { | ||
79 | unsigned long ul; | ||
80 | unsigned int ui[2]; | ||
81 | unsigned short us[4]; | ||
82 | } in_v, tmp_v, out_v; | ||
83 | |||
84 | in_v.ul = x; | ||
85 | tmp_v.ul = (unsigned long) in_v.ui[0] + (unsigned long) in_v.ui[1]; | ||
86 | |||
87 | /* Since the bits of tmp_v.sh[3] are going to always be zero, | ||
88 | we don't have to bother to add that in. */ | ||
89 | out_v.ul = (unsigned long) tmp_v.us[0] + (unsigned long) tmp_v.us[1] | ||
90 | + (unsigned long) tmp_v.us[2]; | ||
91 | |||
92 | /* Similarly, out_v.us[2] is always zero for the final add. */ | ||
93 | return out_v.us[0] + out_v.us[1]; | ||
94 | } | ||
95 | |||
96 | |||
97 | |||
98 | /* | ||
99 | * Ok. This isn't fun, but this is the EASY case. | ||
100 | */ | ||
101 | static inline unsigned long | ||
102 | csum_partial_cfu_aligned(const unsigned long __user *src, unsigned long *dst, | ||
103 | long len, unsigned long checksum, | ||
104 | int *errp) | ||
105 | { | ||
106 | unsigned long carry = 0; | ||
107 | int err = 0; | ||
108 | |||
109 | while (len >= 0) { | ||
110 | unsigned long word; | ||
111 | err |= __get_user(word, src); | ||
112 | checksum += carry; | ||
113 | src++; | ||
114 | checksum += word; | ||
115 | len -= 8; | ||
116 | carry = checksum < word; | ||
117 | *dst = word; | ||
118 | dst++; | ||
119 | } | ||
120 | len += 8; | ||
121 | checksum += carry; | ||
122 | if (len) { | ||
123 | unsigned long word, tmp; | ||
124 | err |= __get_user(word, src); | ||
125 | tmp = *dst; | ||
126 | mskql(word, len, word); | ||
127 | checksum += word; | ||
128 | mskqh(tmp, len, tmp); | ||
129 | carry = checksum < word; | ||
130 | *dst = word | tmp; | ||
131 | checksum += carry; | ||
132 | } | ||
133 | if (err) *errp = err; | ||
134 | return checksum; | ||
135 | } | ||
136 | |||
137 | /* | ||
138 | * This is even less fun, but this is still reasonably | ||
139 | * easy. | ||
140 | */ | ||
141 | static inline unsigned long | ||
142 | csum_partial_cfu_dest_aligned(const unsigned long __user *src, | ||
143 | unsigned long *dst, | ||
144 | unsigned long soff, | ||
145 | long len, unsigned long checksum, | ||
146 | int *errp) | ||
147 | { | ||
148 | unsigned long first; | ||
149 | unsigned long word, carry; | ||
150 | unsigned long lastsrc = 7+len+(unsigned long)src; | ||
151 | int err = 0; | ||
152 | |||
153 | err |= __get_user_u(first,src); | ||
154 | carry = 0; | ||
155 | while (len >= 0) { | ||
156 | unsigned long second; | ||
157 | |||
158 | err |= __get_user_u(second, src+1); | ||
159 | extql(first, soff, word); | ||
160 | len -= 8; | ||
161 | src++; | ||
162 | extqh(second, soff, first); | ||
163 | checksum += carry; | ||
164 | word |= first; | ||
165 | first = second; | ||
166 | checksum += word; | ||
167 | *dst = word; | ||
168 | dst++; | ||
169 | carry = checksum < word; | ||
170 | } | ||
171 | len += 8; | ||
172 | checksum += carry; | ||
173 | if (len) { | ||
174 | unsigned long tmp; | ||
175 | unsigned long second; | ||
176 | err |= __get_user_u(second, lastsrc); | ||
177 | tmp = *dst; | ||
178 | extql(first, soff, word); | ||
179 | extqh(second, soff, first); | ||
180 | word |= first; | ||
181 | mskql(word, len, word); | ||
182 | checksum += word; | ||
183 | mskqh(tmp, len, tmp); | ||
184 | carry = checksum < word; | ||
185 | *dst = word | tmp; | ||
186 | checksum += carry; | ||
187 | } | ||
188 | if (err) *errp = err; | ||
189 | return checksum; | ||
190 | } | ||
191 | |||
192 | /* | ||
193 | * This is slightly less fun than the above.. | ||
194 | */ | ||
195 | static inline unsigned long | ||
196 | csum_partial_cfu_src_aligned(const unsigned long __user *src, | ||
197 | unsigned long *dst, | ||
198 | unsigned long doff, | ||
199 | long len, unsigned long checksum, | ||
200 | unsigned long partial_dest, | ||
201 | int *errp) | ||
202 | { | ||
203 | unsigned long carry = 0; | ||
204 | unsigned long word; | ||
205 | unsigned long second_dest; | ||
206 | int err = 0; | ||
207 | |||
208 | mskql(partial_dest, doff, partial_dest); | ||
209 | while (len >= 0) { | ||
210 | err |= __get_user(word, src); | ||
211 | len -= 8; | ||
212 | insql(word, doff, second_dest); | ||
213 | checksum += carry; | ||
214 | stq_u(partial_dest | second_dest, dst); | ||
215 | src++; | ||
216 | checksum += word; | ||
217 | insqh(word, doff, partial_dest); | ||
218 | carry = checksum < word; | ||
219 | dst++; | ||
220 | } | ||
221 | len += 8; | ||
222 | if (len) { | ||
223 | checksum += carry; | ||
224 | err |= __get_user(word, src); | ||
225 | mskql(word, len, word); | ||
226 | len -= 8; | ||
227 | checksum += word; | ||
228 | insql(word, doff, second_dest); | ||
229 | len += doff; | ||
230 | carry = checksum < word; | ||
231 | partial_dest |= second_dest; | ||
232 | if (len >= 0) { | ||
233 | stq_u(partial_dest, dst); | ||
234 | if (!len) goto out; | ||
235 | dst++; | ||
236 | insqh(word, doff, partial_dest); | ||
237 | } | ||
238 | doff = len; | ||
239 | } | ||
240 | ldq_u(second_dest, dst); | ||
241 | mskqh(second_dest, doff, second_dest); | ||
242 | stq_u(partial_dest | second_dest, dst); | ||
243 | out: | ||
244 | checksum += carry; | ||
245 | if (err) *errp = err; | ||
246 | return checksum; | ||
247 | } | ||
248 | |||
249 | /* | ||
250 | * This is so totally un-fun that it's frightening. Don't | ||
251 | * look at this too closely, you'll go blind. | ||
252 | */ | ||
253 | static inline unsigned long | ||
254 | csum_partial_cfu_unaligned(const unsigned long __user * src, | ||
255 | unsigned long * dst, | ||
256 | unsigned long soff, unsigned long doff, | ||
257 | long len, unsigned long checksum, | ||
258 | unsigned long partial_dest, | ||
259 | int *errp) | ||
260 | { | ||
261 | unsigned long carry = 0; | ||
262 | unsigned long first; | ||
263 | unsigned long lastsrc; | ||
264 | int err = 0; | ||
265 | |||
266 | err |= __get_user_u(first, src); | ||
267 | lastsrc = 7+len+(unsigned long)src; | ||
268 | mskql(partial_dest, doff, partial_dest); | ||
269 | while (len >= 0) { | ||
270 | unsigned long second, word; | ||
271 | unsigned long second_dest; | ||
272 | |||
273 | err |= __get_user_u(second, src+1); | ||
274 | extql(first, soff, word); | ||
275 | checksum += carry; | ||
276 | len -= 8; | ||
277 | extqh(second, soff, first); | ||
278 | src++; | ||
279 | word |= first; | ||
280 | first = second; | ||
281 | insql(word, doff, second_dest); | ||
282 | checksum += word; | ||
283 | stq_u(partial_dest | second_dest, dst); | ||
284 | carry = checksum < word; | ||
285 | insqh(word, doff, partial_dest); | ||
286 | dst++; | ||
287 | } | ||
288 | len += doff; | ||
289 | checksum += carry; | ||
290 | if (len >= 0) { | ||
291 | unsigned long second, word; | ||
292 | unsigned long second_dest; | ||
293 | |||
294 | err |= __get_user_u(second, lastsrc); | ||
295 | extql(first, soff, word); | ||
296 | extqh(second, soff, first); | ||
297 | word |= first; | ||
298 | first = second; | ||
299 | mskql(word, len-doff, word); | ||
300 | checksum += word; | ||
301 | insql(word, doff, second_dest); | ||
302 | carry = checksum < word; | ||
303 | stq_u(partial_dest | second_dest, dst); | ||
304 | if (len) { | ||
305 | ldq_u(second_dest, dst+1); | ||
306 | insqh(word, doff, partial_dest); | ||
307 | mskqh(second_dest, len, second_dest); | ||
308 | stq_u(partial_dest | second_dest, dst+1); | ||
309 | } | ||
310 | checksum += carry; | ||
311 | } else { | ||
312 | unsigned long second, word; | ||
313 | unsigned long second_dest; | ||
314 | |||
315 | err |= __get_user_u(second, lastsrc); | ||
316 | extql(first, soff, word); | ||
317 | extqh(second, soff, first); | ||
318 | word |= first; | ||
319 | ldq_u(second_dest, dst); | ||
320 | mskql(word, len-doff, word); | ||
321 | checksum += word; | ||
322 | mskqh(second_dest, len, second_dest); | ||
323 | carry = checksum < word; | ||
324 | insql(word, doff, word); | ||
325 | stq_u(partial_dest | word | second_dest, dst); | ||
326 | checksum += carry; | ||
327 | } | ||
328 | if (err) *errp = err; | ||
329 | return checksum; | ||
330 | } | ||
331 | |||
332 | static unsigned int | ||
333 | do_csum_partial_copy_from_user(const char __user *src, char *dst, int len, | ||
334 | unsigned int sum, int *errp) | ||
335 | { | ||
336 | unsigned long checksum = (unsigned) sum; | ||
337 | unsigned long soff = 7 & (unsigned long) src; | ||
338 | unsigned long doff = 7 & (unsigned long) dst; | ||
339 | |||
340 | if (len) { | ||
341 | if (!doff) { | ||
342 | if (!soff) | ||
343 | checksum = csum_partial_cfu_aligned( | ||
344 | (const unsigned long __user *) src, | ||
345 | (unsigned long *) dst, | ||
346 | len-8, checksum, errp); | ||
347 | else | ||
348 | checksum = csum_partial_cfu_dest_aligned( | ||
349 | (const unsigned long __user *) src, | ||
350 | (unsigned long *) dst, | ||
351 | soff, len-8, checksum, errp); | ||
352 | } else { | ||
353 | unsigned long partial_dest; | ||
354 | ldq_u(partial_dest, dst); | ||
355 | if (!soff) | ||
356 | checksum = csum_partial_cfu_src_aligned( | ||
357 | (const unsigned long __user *) src, | ||
358 | (unsigned long *) dst, | ||
359 | doff, len-8, checksum, | ||
360 | partial_dest, errp); | ||
361 | else | ||
362 | checksum = csum_partial_cfu_unaligned( | ||
363 | (const unsigned long __user *) src, | ||
364 | (unsigned long *) dst, | ||
365 | soff, doff, len-8, checksum, | ||
366 | partial_dest, errp); | ||
367 | } | ||
368 | checksum = from64to16 (checksum); | ||
369 | } | ||
370 | return checksum; | ||
371 | } | ||
372 | |||
373 | unsigned int | ||
374 | csum_partial_copy_from_user(const char __user *src, char *dst, int len, | ||
375 | unsigned int sum, int *errp) | ||
376 | { | ||
377 | if (!access_ok(VERIFY_READ, src, len)) { | ||
378 | *errp = -EFAULT; | ||
379 | memset(dst, 0, len); | ||
380 | return sum; | ||
381 | } | ||
382 | |||
383 | return do_csum_partial_copy_from_user(src, dst, len, sum, errp); | ||
384 | } | ||
385 | |||
386 | unsigned int | ||
387 | csum_partial_copy_nocheck(const char __user *src, char *dst, int len, | ||
388 | unsigned int sum) | ||
389 | { | ||
390 | return do_csum_partial_copy_from_user(src, dst, len, sum, NULL); | ||
391 | } | ||
diff --git a/arch/alpha/lib/dbg_current.S b/arch/alpha/lib/dbg_current.S
new file mode 100644
index 000000000000..e6d071015f9b
--- /dev/null
+++ b/arch/alpha/lib/dbg_current.S
@@ -0,0 +1,29 @@
1 | /* | ||
2 | * arch/alpha/lib/dbg_current.S | ||
3 | * Contributed by Richard Henderson (rth@cygnus.com) | ||
4 | * | ||
5 | * Trap if we find current not correct. | ||
6 | */ | ||
7 | |||
8 | #include <asm/pal.h> | ||
9 | |||
10 | .text | ||
11 | .set noat | ||
12 | |||
13 | .globl _mcount | ||
14 | .ent _mcount | ||
15 | _mcount: | ||
16 | .frame $30, 0, $28, 0 | ||
17 | .prologue 0 | ||
18 | |||
19 | lda $0, -0x4000($30) | ||
20 | cmpult $8, $30, $1 | ||
21 | cmpule $0, $30, $2 | ||
22 | and $1, $2, $3 | ||
23 | bne $3, 1f | ||
24 | |||
25 | call_pal PAL_bugchk | ||
26 | |||
27 | 1: ret $31, ($28), 1 | ||
28 | |||
29 | .end _mcount | ||
diff --git a/arch/alpha/lib/dbg_stackcheck.S b/arch/alpha/lib/dbg_stackcheck.S
new file mode 100644
index 000000000000..cc5ce3a5fcad
--- /dev/null
+++ b/arch/alpha/lib/dbg_stackcheck.S
@@ -0,0 +1,27 @@
1 | /* | ||
2 | * arch/alpha/lib/stackcheck.S | ||
3 | * Contributed by Richard Henderson (rth@tamu.edu) | ||
4 | * | ||
5 | * Verify that we have not overflowed the stack. Oops if we have. | ||
6 | */ | ||
7 | |||
8 | #include <asm/asm_offsets.h> | ||
9 | |||
10 | .text | ||
11 | .set noat | ||
12 | |||
13 | .align 3 | ||
14 | .globl _mcount | ||
15 | .ent _mcount | ||
16 | _mcount: | ||
17 | .frame $30, 0, $28, 0 | ||
18 | .prologue 0 | ||
19 | |||
20 | lda $0, TASK_SIZE($8) | ||
21 | cmpult $30, $0, $0 | ||
22 | bne $0, 1f | ||
23 | ret ($28) | ||
24 | 1: stq $31, -8($31) # oops me, damn it. | ||
25 | br 1b | ||
26 | |||
27 | .end _mcount | ||
diff --git a/arch/alpha/lib/dbg_stackkill.S b/arch/alpha/lib/dbg_stackkill.S
new file mode 100644
index 000000000000..e09f2ae1e09e
--- /dev/null
+++ b/arch/alpha/lib/dbg_stackkill.S
@@ -0,0 +1,35 @@
1 | /* | ||
2 | * arch/alpha/lib/killstack.S | ||
3 | * Contributed by Richard Henderson (rth@cygnus.com) | ||
4 | * | ||
5 | * Clobber the balance of the kernel stack, hoping to catch | ||
6 | * uninitialized local variables in the act. | ||
7 | */ | ||
8 | |||
9 | #include <asm/asm_offsets.h> | ||
10 | |||
11 | .text | ||
12 | .set noat | ||
13 | |||
14 | .align 5 | ||
15 | .globl _mcount | ||
16 | .ent _mcount | ||
17 | _mcount: | ||
18 | .frame $30, 0, $28, 0 | ||
19 | .prologue 0 | ||
20 | |||
21 | ldi $0, 0xdeadbeef | ||
22 | lda $2, -STACK_SIZE | ||
23 | sll $0, 32, $1 | ||
24 | and $30, $2, $2 | ||
25 | or $0, $1, $0 | ||
26 | lda $2, TASK_SIZE($2) | ||
27 | cmpult $2, $30, $1 | ||
28 | beq $1, 2f | ||
29 | 1: stq $0, 0($2) | ||
30 | addq $2, 8, $2 | ||
31 | cmpult $2, $30, $1 | ||
32 | bne $1, 1b | ||
33 | 2: ret ($28) | ||
34 | |||
35 | .end _mcount | ||
diff --git a/arch/alpha/lib/dec_and_lock.c b/arch/alpha/lib/dec_and_lock.c
new file mode 100644
index 000000000000..6ae2500a9d9e
--- /dev/null
+++ b/arch/alpha/lib/dec_and_lock.c
@@ -0,0 +1,42 @@
1 | /* | ||
2 | * arch/alpha/lib/dec_and_lock.c | ||
3 | * | ||
4 | * ll/sc version of atomic_dec_and_lock() | ||
5 | * | ||
6 | */ | ||
7 | |||
8 | #include <linux/spinlock.h> | ||
9 | #include <asm/atomic.h> | ||
10 | |||
11 | asm (".text \n\ | ||
12 | .global _atomic_dec_and_lock \n\ | ||
13 | .ent _atomic_dec_and_lock \n\ | ||
14 | .align 4 \n\ | ||
15 | _atomic_dec_and_lock: \n\ | ||
16 | .prologue 0 \n\ | ||
17 | 1: ldl_l $1, 0($16) \n\ | ||
18 | subl $1, 1, $1 \n\ | ||
19 | beq $1, 2f \n\ | ||
20 | stl_c $1, 0($16) \n\ | ||
21 | beq $1, 4f \n\ | ||
22 | mb \n\ | ||
23 | clr $0 \n\ | ||
24 | ret \n\ | ||
25 | 2: br $29, 3f \n\ | ||
26 | 3: ldgp $29, 0($29) \n\ | ||
27 | br $atomic_dec_and_lock_1..ng \n\ | ||
28 | .subsection 2 \n\ | ||
29 | 4: br 1b \n\ | ||
30 | .previous \n\ | ||
31 | .end _atomic_dec_and_lock"); | ||
32 | |||
33 | static int __attribute_used__ | ||
34 | atomic_dec_and_lock_1(atomic_t *atomic, spinlock_t *lock) | ||
35 | { | ||
36 | /* Slow path */ | ||
37 | spin_lock(lock); | ||
38 | if (atomic_dec_and_test(atomic)) | ||
39 | return 1; | ||
40 | spin_unlock(lock); | ||
41 | return 0; | ||
42 | } | ||
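
dec_and_lock.c above hand-codes the fast path of atomic_dec_and_lock() with ldl_l/stl_c so the common case never touches the spinlock. The outline below restates that logic in C purely as a reading aid; it borrows the later atomic_cmpxchg() helper for illustration and is not how this kernel implements it.

/* Reading aid only: the semantics of the ll/sc fast path above,
 * expressed with atomic_cmpxchg() (a helper from later kernels).
 * Relies on the same <linux/spinlock.h> and <asm/atomic.h> headers
 * that dec_and_lock.c already includes.
 */
static int outline_atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
{
	int old = atomic_read(atomic);

	/* Fast path: decrement lock-free as long as the counter will
	 * not reach zero (the assembly's "beq $1, 2f" test). */
	while (old != 1) {
		int seen = atomic_cmpxchg(atomic, old, old - 1);
		if (seen == old)
			return 0;	/* decremented; lock not taken */
		old = seen;		/* raced with another CPU, retry */
	}

	/* Slow path: the counter may hit zero, so take the lock first,
	 * exactly as atomic_dec_and_lock_1() does above. */
	spin_lock(lock);
	if (atomic_dec_and_test(atomic))
		return 1;		/* reached zero; caller holds the lock */
	spin_unlock(lock);
	return 0;
}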
diff --git a/arch/alpha/lib/divide.S b/arch/alpha/lib/divide.S
new file mode 100644
index 000000000000..2d1a0484a99e
--- /dev/null
+++ b/arch/alpha/lib/divide.S
@@ -0,0 +1,195 @@
1 | /* | ||
2 | * arch/alpha/lib/divide.S | ||
3 | * | ||
4 | * (C) 1995 Linus Torvalds | ||
5 | * | ||
6 | * Alpha division.. | ||
7 | */ | ||
8 | |||
9 | /* | ||
10 | * The alpha chip doesn't provide hardware division, so we have to do it | ||
11 | * by hand. The compiler expects the functions | ||
12 | * | ||
13 | * __divqu: 64-bit unsigned long divide | ||
14 | * __remqu: 64-bit unsigned long remainder | ||
15 | * __divqs/__remqs: signed 64-bit | ||
16 | * __divlu/__remlu: unsigned 32-bit | ||
17 | * __divls/__remls: signed 32-bit | ||
18 | * | ||
19 | * These are not normal C functions: instead of the normal | ||
20 | * calling sequence, these expect their arguments in registers | ||
21 | * $24 and $25, and return the result in $27. Register $28 may | ||
22 | * be clobbered (assembly temporary), anything else must be saved. | ||
23 | * | ||
24 | * In short: painful. | ||
25 | * | ||
26 | * This is a rather simple bit-at-a-time algorithm: it's very good | ||
27 | * at dividing random 64-bit numbers, but the more usual case where | ||
28 | * the divisor is small is handled better by the DEC algorithm | ||
29 | * using lookup tables. This uses much less memory, though, and is | ||
30 | * nicer on the cache.. Besides, I don't know the copyright status | ||
31 | * of the DEC code. | ||
32 | */ | ||
33 | |||
34 | /* | ||
35 | * My temporaries: | ||
36 | * $0 - current bit | ||
37 | * $1 - shifted divisor | ||
38 | * $2 - modulus/quotient | ||
39 | * | ||
40 | * $23 - return address | ||
41 | * $24 - dividend | ||
42 | * $25 - divisor | ||
43 | * | ||
44 | * $27 - quotient/modulus | ||
45 | * $28 - compare status | ||
46 | */ | ||
47 | |||
48 | #define halt .long 0 | ||
49 | |||
50 | /* | ||
51 | * Select function type and registers | ||
52 | */ | ||
53 | #define mask $0 | ||
54 | #define divisor $1 | ||
55 | #define compare $28 | ||
56 | #define tmp1 $3 | ||
57 | #define tmp2 $4 | ||
58 | |||
59 | #ifdef DIV | ||
60 | #define DIV_ONLY(x,y...) x,##y | ||
61 | #define MOD_ONLY(x,y...) | ||
62 | #define func(x) __div##x | ||
63 | #define modulus $2 | ||
64 | #define quotient $27 | ||
65 | #define GETSIGN(x) xor $24,$25,x | ||
66 | #define STACK 48 | ||
67 | #else | ||
68 | #define DIV_ONLY(x,y...) | ||
69 | #define MOD_ONLY(x,y...) x,##y | ||
70 | #define func(x) __rem##x | ||
71 | #define modulus $27 | ||
72 | #define quotient $2 | ||
73 | #define GETSIGN(x) bis $24,$24,x | ||
74 | #define STACK 32 | ||
75 | #endif | ||
76 | |||
77 | /* | ||
78 | * For 32-bit operations, we need to extend to 64-bit | ||
79 | */ | ||
80 | #ifdef INTSIZE | ||
81 | #define ufunction func(lu) | ||
82 | #define sfunction func(l) | ||
83 | #define LONGIFY(x) zapnot x,15,x | ||
84 | #define SLONGIFY(x) addl x,0,x | ||
85 | #else | ||
86 | #define ufunction func(qu) | ||
87 | #define sfunction func(q) | ||
88 | #define LONGIFY(x) | ||
89 | #define SLONGIFY(x) | ||
90 | #endif | ||
91 | |||
92 | .set noat | ||
93 | .align 3 | ||
94 | .globl ufunction | ||
95 | .ent ufunction | ||
96 | ufunction: | ||
97 | subq $30,STACK,$30 | ||
98 | .frame $30,STACK,$23 | ||
99 | .prologue 0 | ||
100 | |||
101 | 7: stq $1, 0($30) | ||
102 | bis $25,$25,divisor | ||
103 | stq $2, 8($30) | ||
104 | bis $24,$24,modulus | ||
105 | stq $0,16($30) | ||
106 | bis $31,$31,quotient | ||
107 | LONGIFY(divisor) | ||
108 | stq tmp1,24($30) | ||
109 | LONGIFY(modulus) | ||
110 | bis $31,1,mask | ||
111 | DIV_ONLY(stq tmp2,32($30)) | ||
112 | beq divisor, 9f /* div by zero */ | ||
113 | |||
114 | #ifdef INTSIZE | ||
115 | /* | ||
116 | * shift divisor left, using 3-bit shifts for | ||
117 | * 32-bit divides as we can't overflow. Three-bit | ||
118 | * shifts will result in looping three times less | ||
119 | * here, but can result in two loops more later. | ||
120 | * Thus using a large shift isn't worth it (and | ||
121 | * s8add pairs better than a sll..) | ||
122 | */ | ||
123 | 1: cmpult divisor,modulus,compare | ||
124 | s8addq divisor,$31,divisor | ||
125 | s8addq mask,$31,mask | ||
126 | bne compare,1b | ||
127 | #else | ||
128 | 1: cmpult divisor,modulus,compare | ||
129 | blt divisor, 2f | ||
130 | addq divisor,divisor,divisor | ||
131 | addq mask,mask,mask | ||
132 | bne compare,1b | ||
133 | unop | ||
134 | #endif | ||
135 | |||
136 | /* ok, start to go right again.. */ | ||
137 | 2: DIV_ONLY(addq quotient,mask,tmp2) | ||
138 | srl mask,1,mask | ||
139 | cmpule divisor,modulus,compare | ||
140 | subq modulus,divisor,tmp1 | ||
141 | DIV_ONLY(cmovne compare,tmp2,quotient) | ||
142 | srl divisor,1,divisor | ||
143 | cmovne compare,tmp1,modulus | ||
144 | bne mask,2b | ||
145 | |||
146 | 9: ldq $1, 0($30) | ||
147 | ldq $2, 8($30) | ||
148 | ldq $0,16($30) | ||
149 | ldq tmp1,24($30) | ||
150 | DIV_ONLY(ldq tmp2,32($30)) | ||
151 | addq $30,STACK,$30 | ||
152 | ret $31,($23),1 | ||
153 | .end ufunction | ||
154 | |||
155 | /* | ||
156 | * Uhh.. Ugly signed division. I'd rather not have it at all, but | ||
157 | * it's needed in some circumstances. There are different ways to | ||
158 | * handle this, really. This does: | ||
159 | * -a / b = a / -b = -(a / b) | ||
160 | * -a % b = -(a % b) | ||
161 | * a % -b = a % b | ||
162 | * which is probably not the best solution, but at least should | ||
163 | * have the property that (x/y)*y + (x%y) = x. | ||
164 | */ | ||
165 | .align 3 | ||
166 | .globl sfunction | ||
167 | .ent sfunction | ||
168 | sfunction: | ||
169 | subq $30,STACK,$30 | ||
170 | .frame $30,STACK,$23 | ||
171 | .prologue 0 | ||
172 | bis $24,$25,$28 | ||
173 | SLONGIFY($28) | ||
174 | bge $28,7b | ||
175 | stq $24,0($30) | ||
176 | subq $31,$24,$28 | ||
177 | stq $25,8($30) | ||
178 | cmovlt $24,$28,$24 /* abs($24) */ | ||
179 | stq $23,16($30) | ||
180 | subq $31,$25,$28 | ||
181 | stq tmp1,24($30) | ||
182 | cmovlt $25,$28,$25 /* abs($25) */ | ||
183 | unop | ||
184 | bsr $23,ufunction | ||
185 | ldq $24,0($30) | ||
186 | ldq $25,8($30) | ||
187 | GETSIGN($28) | ||
188 | subq $31,$27,tmp1 | ||
189 | SLONGIFY($28) | ||
190 | ldq $23,16($30) | ||
191 | cmovlt $28,tmp1,$27 | ||
192 | ldq tmp1,24($30) | ||
193 | addq $30,STACK,$30 | ||
194 | ret $31,($23),1 | ||
195 | .end sfunction | ||
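
divide.S's header comment describes a simple bit-at-a-time (shift-and-subtract) algorithm, the unusual $24/$25/$27 register convention, and the sign rules -a/b = a/-b = -(a/b) and -a%b = -(a%b). The C model below is a reading aid under those stated assumptions only: the names are invented, the C calling convention obviously differs, and the assembly interleaves the same steps differently so they dual-issue well.

/* Reading aid only: a C model of the shift-and-subtract loop in divide.S
 * (unsigned long is 64-bit here, as on Alpha).  Not kernel code.
 */
static unsigned long model_divqu(unsigned long dividend, unsigned long divisor,
				 unsigned long *remainder)
{
	unsigned long mask = 1, quotient = 0;

	if (divisor == 0) {		/* divide.S likewise returns without trapping */
		*remainder = dividend;
		return 0;
	}

	/* Shift the divisor (and the quotient bit mask) left until the
	 * divisor reaches the dividend or its top bit is set. */
	while (divisor < dividend && !(divisor >> 63)) {
		divisor <<= 1;
		mask <<= 1;
	}

	/* Walk back down one bit at a time, subtracting where it fits. */
	while (mask) {
		if (divisor <= dividend) {
			dividend -= divisor;
			quotient |= mask;
		}
		divisor >>= 1;
		mask >>= 1;
	}

	*remainder = dividend;
	return quotient;
}

/* Sign handling per the comment: -a/b = a/-b = -(a/b).  Corner cases
 * such as LONG_MIN are glossed over in this sketch. */
static long model_divqs(long a, long b)
{
	unsigned long r;
	unsigned long ua = a < 0 ? 0UL - (unsigned long) a : (unsigned long) a;
	unsigned long ub = b < 0 ? 0UL - (unsigned long) b : (unsigned long) b;
	unsigned long q = model_divqu(ua, ub, &r);

	return (a ^ b) < 0 ? (long) (0UL - q) : (long) q;
}

The Makefile above builds __divqu, __remqu, __divlu and __remlu from this single source by defining DIV, REM and INTSIZE, which is what the func()/DIV_ONLY()/MOD_ONLY() macros in divide.S select between.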
diff --git a/arch/alpha/lib/ev6-clear_page.S b/arch/alpha/lib/ev6-clear_page.S
new file mode 100644
index 000000000000..adf4f7be0e2b
--- /dev/null
+++ b/arch/alpha/lib/ev6-clear_page.S
@@ -0,0 +1,54 @@
1 | /* | ||
2 | * arch/alpha/lib/ev6-clear_page.S | ||
3 | * | ||
4 | * Zero an entire page. | ||
5 | */ | ||
6 | |||
7 | .text | ||
8 | .align 4 | ||
9 | .global clear_page | ||
10 | .ent clear_page | ||
11 | clear_page: | ||
12 | .prologue 0 | ||
13 | |||
14 | lda $0,128 | ||
15 | lda $1,125 | ||
16 | addq $16,64,$2 | ||
17 | addq $16,128,$3 | ||
18 | |||
19 | addq $16,192,$17 | ||
20 | wh64 ($16) | ||
21 | wh64 ($2) | ||
22 | wh64 ($3) | ||
23 | |||
24 | 1: wh64 ($17) | ||
25 | stq $31,0($16) | ||
26 | subq $0,1,$0 | ||
27 | subq $1,1,$1 | ||
28 | |||
29 | stq $31,8($16) | ||
30 | stq $31,16($16) | ||
31 | addq $17,64,$2 | ||
32 | nop | ||
33 | |||
34 | stq $31,24($16) | ||
35 | stq $31,32($16) | ||
36 | cmovgt $1,$2,$17 | ||
37 | nop | ||
38 | |||
39 | stq $31,40($16) | ||
40 | stq $31,48($16) | ||
41 | nop | ||
42 | nop | ||
43 | |||
44 | stq $31,56($16) | ||
45 | addq $16,64,$16 | ||
46 | nop | ||
47 | bne $0,1b | ||
48 | |||
49 | ret | ||
50 | nop | ||
51 | nop | ||
52 | nop | ||
53 | |||
54 | .end clear_page | ||
diff --git a/arch/alpha/lib/ev6-clear_user.S b/arch/alpha/lib/ev6-clear_user.S
new file mode 100644
index 000000000000..4f42a16b7f53
--- /dev/null
+++ b/arch/alpha/lib/ev6-clear_user.S
@@ -0,0 +1,225 @@
1 | /* | ||
2 | * arch/alpha/lib/ev6-clear_user.S | ||
3 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
4 | * | ||
5 | * Zero user space, handling exceptions as we go. | ||
6 | * | ||
7 | * We have to make sure that $0 is always up-to-date and contains the | ||
8 | * right "bytes left to zero" value (and that it is updated only _after_ | ||
9 | * a successful copy). There is also some rather minor exception setup | ||
10 | * stuff. | ||
11 | * | ||
12 | * NOTE! This is not directly C-callable, because the calling semantics | ||
13 | * are different: | ||
14 | * | ||
15 | * Inputs: | ||
16 | * length in $0 | ||
17 | * destination address in $6 | ||
18 | * exception pointer in $7 | ||
19 | * return address in $28 (exceptions expect it there) | ||
20 | * | ||
21 | * Outputs: | ||
22 | * bytes left to copy in $0 | ||
23 | * | ||
24 | * Clobbers: | ||
25 | * $1,$2,$3,$4,$5,$6 | ||
26 | * | ||
27 | * Much of the information about 21264 scheduling/coding comes from: | ||
28 | * Compiler Writer's Guide for the Alpha 21264 | ||
29 | * abbreviated as 'CWG' in other comments here | ||
30 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
31 | * Scheduling notation: | ||
32 | * E - either cluster | ||
33 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
34 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
35 | * Try not to change the actual algorithm if possible for consistency. | ||
36 | * Determining actual stalls (other than slotting) doesn't appear to be easy to do. | ||
37 | * From perusing the source code context where this routine is called, it is | ||
38 | * a fair assumption that significant fractions of entire pages are zeroed, so | ||
39 | * it's going to be worth the effort to hand-unroll a big loop, and use wh64. | ||
40 | * ASSUMPTION: | ||
41 | * The believed purpose of only updating $0 after a store is that a signal | ||
42 | * may come along during the execution of this chunk of code, and we don't | ||
43 | * want to leave a hole (and we also want to avoid repeating lots of work) | ||
44 | */ | ||
45 | |||
46 | /* Allow an exception for an insn; exit if we get one. */ | ||
47 | #define EX(x,y...) \ | ||
48 | 99: x,##y; \ | ||
49 | .section __ex_table,"a"; \ | ||
50 | .long 99b - .; \ | ||
51 | lda $31, $exception-99b($31); \ | ||
52 | .previous | ||
53 | |||
54 | .set noat | ||
55 | .set noreorder | ||
56 | .align 4 | ||
57 | |||
58 | .globl __do_clear_user | ||
59 | .ent __do_clear_user | ||
60 | .frame $30, 0, $28 | ||
61 | .prologue 0 | ||
62 | |||
63 | # Pipeline info : Slotting & Comments | ||
64 | __do_clear_user: | ||
65 | and $6, 7, $4 # .. E .. .. : find dest head misalignment | ||
66 | beq $0, $zerolength # U .. .. .. : U L U L | ||
67 | |||
68 | addq $0, $4, $1 # .. .. .. E : bias counter | ||
69 | and $1, 7, $2 # .. .. E .. : number of misaligned bytes in tail | ||
70 | # Note - we never actually use $2, so this is a moot computation | ||
71 | # and we can rewrite this later... | ||
72 | srl $1, 3, $1 # .. E .. .. : number of quadwords to clear | ||
73 | beq $4, $headalign # U .. .. .. : U L U L | ||
74 | |||
75 | /* | ||
76 | * Head is not aligned. Write (8 - $4) bytes to head of destination | ||
77 | * This means $6 is known to be misaligned | ||
78 | */ | ||
79 | EX( ldq_u $5, 0($6) ) # .. .. .. L : load dst word to mask back in | ||
80 | beq $1, $onebyte # .. .. U .. : sub-word store? | ||
81 | mskql $5, $6, $5 # .. U .. .. : take care of misaligned head | ||
82 | addq $6, 8, $6 # E .. .. .. : L U U L | ||
83 | |||
84 | EX( stq_u $5, -8($6) ) # .. .. .. L : | ||
85 | subq $1, 1, $1 # .. .. E .. : | ||
86 | addq $0, $4, $0 # .. E .. .. : bytes left -= 8 - misalignment | ||
87 | subq $0, 8, $0 # E .. .. .. : U L U L | ||
88 | |||
89 | .align 4 | ||
90 | /* | ||
91 | * (The .align directive ought to be a moot point) | ||
92 | * values upon initial entry to the loop | ||
93 | * $1 is number of quadwords to clear (zero is a valid value) | ||
94 | * $2 is number of trailing bytes (0..7) ($2 never used...) | ||
95 | * $6 is known to be aligned 0mod8 | ||
96 | */ | ||
97 | $headalign: | ||
98 | subq $1, 16, $4 # .. .. .. E : If < 16, we can not use the huge loop | ||
99 | and $6, 0x3f, $2 # .. .. E .. : Forward work for huge loop | ||
100 | subq $2, 0x40, $3 # .. E .. .. : bias counter (huge loop) | ||
101 | blt $4, $trailquad # U .. .. .. : U L U L | ||
102 | |||
103 | /* | ||
104 | * We know that we're going to do at least 16 quads, which means we are | ||
105 | * going to be able to use the large block clear loop at least once. | ||
106 | * Figure out how many quads we need to clear before we are 0mod64 aligned | ||
107 | * so we can use the wh64 instruction. | ||
108 | */ | ||
109 | |||
110 | nop # .. .. .. E | ||
111 | nop # .. .. E .. | ||
112 | nop # .. E .. .. | ||
113 | beq $3, $bigalign # U .. .. .. : U L U L : Aligned 0mod64 | ||
114 | |||
115 | $alignmod64: | ||
116 | EX( stq_u $31, 0($6) ) # .. .. .. L | ||
117 | addq $3, 8, $3 # .. .. E .. | ||
118 | subq $0, 8, $0 # .. E .. .. | ||
119 | nop # E .. .. .. : U L U L | ||
120 | |||
121 | nop # .. .. .. E | ||
122 | subq $1, 1, $1 # .. .. E .. | ||
123 | addq $6, 8, $6 # .. E .. .. | ||
124 | blt $3, $alignmod64 # U .. .. .. : U L U L | ||
125 | |||
126 | $bigalign: | ||
127 | /* | ||
128 | * $0 is the number of bytes left | ||
129 | * $1 is the number of quads left | ||
130 | * $6 is aligned 0mod64 | ||
131 | * we know that we'll be taking a minimum of one trip through | ||
132 | * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle | ||
133 | * We are _not_ going to update $0 after every single store. That | ||
134 | * would be silly, because there will be cross-cluster dependencies | ||
135 | * no matter how the code is scheduled. By doing it in slightly | ||
136 | * staggered fashion, we can still do this loop in 5 fetches | ||
137 | * The worst case will be doing two extra quads in some future execution, | ||
138 | * in the event of an interrupted clear. | ||
139 | * Assumes the wh64 needs to be for 2 trips through the loop in the future | ||
140 | * The wh64 is issued for the starting destination address for trip +2 | ||
141 | * through the loop, and if there are fewer than two trips left, the target | ||
142 | * address will be for the current trip. | ||
143 | */ | ||
144 | nop # E : | ||
145 | nop # E : | ||
146 | nop # E : | ||
147 | bis $6,$6,$3 # E : U L U L : Initial wh64 address is dest | ||
148 | /* This might actually help for the current trip... */ | ||
149 | |||
150 | $do_wh64: | ||
151 | wh64 ($3) # .. .. .. L1 : memory subsystem hint | ||
152 | subq $1, 16, $4 # .. .. E .. : Forward calculation - repeat the loop? | ||
153 | EX( stq_u $31, 0($6) ) # .. L .. .. | ||
154 | subq $0, 8, $0 # E .. .. .. : U L U L | ||
155 | |||
156 | addq $6, 128, $3 # E : Target address of wh64 | ||
157 | EX( stq_u $31, 8($6) ) # L : | ||
158 | EX( stq_u $31, 16($6) ) # L : | ||
159 | subq $0, 16, $0 # E : U L L U | ||
160 | |||
161 | nop # E : | ||
162 | EX( stq_u $31, 24($6) ) # L : | ||
163 | EX( stq_u $31, 32($6) ) # L : | ||
164 | subq $0, 168, $5 # E : U L L U : two trips through the loop left? | ||
165 | /* 168 = 192 - 24, since we've already completed some stores */ | ||
166 | |||
167 | subq $0, 16, $0 # E : | ||
168 | EX( stq_u $31, 40($6) ) # L : | ||
169 | EX( stq_u $31, 48($6) ) # L : | ||
170 | cmovlt $5, $6, $3 # E : U L L U : Latency 2, extra mapping cycle | ||
171 | |||
172 | subq $1, 8, $1 # E : | ||
173 | subq $0, 16, $0 # E : | ||
174 | EX( stq_u $31, 56($6) ) # L : | ||
175 | nop # E : U L U L | ||
176 | |||
177 | nop # E : | ||
178 | subq $0, 8, $0 # E : | ||
179 | addq $6, 64, $6 # E : | ||
180 | bge $4, $do_wh64 # U : U L U L | ||
181 | |||
182 | $trailquad: | ||
183 | # zero to 16 quadwords left to store, plus any trailing bytes | ||
184 | # $1 is the number of quadwords left to go. | ||
185 | # | ||
186 | nop # .. .. .. E | ||
187 | nop # .. .. E .. | ||
188 | nop # .. E .. .. | ||
189 | beq $1, $trailbytes # U .. .. .. : U L U L : Only 0..7 bytes to go | ||
190 | |||
191 | $onequad: | ||
192 | EX( stq_u $31, 0($6) ) # .. .. .. L | ||
193 | subq $1, 1, $1 # .. .. E .. | ||
194 | subq $0, 8, $0 # .. E .. .. | ||
195 | nop # E .. .. .. : U L U L | ||
196 | |||
197 | nop # .. .. .. E | ||
198 | nop # .. .. E .. | ||
199 | addq $6, 8, $6 # .. E .. .. | ||
200 | bgt $1, $onequad # U .. .. .. : U L U L | ||
201 | |||
202 | # We have an unknown number of bytes left to go. | ||
203 | $trailbytes: | ||
204 | nop # .. .. .. E | ||
205 | nop # .. .. E .. | ||
206 | nop # .. E .. .. | ||
207 | beq $0, $zerolength # U .. .. .. : U L U L | ||
208 | |||
209 | # $0 contains the number of bytes left to clear (0..31) | ||
210 | # so we will use $0 as the loop counter | ||
211 | # We know for a fact that $0 > 0 due to previous context | ||
212 | $onebyte: | ||
213 | EX( stb $31, 0($6) ) # .. .. .. L | ||
214 | subq $0, 1, $0 # .. .. E .. : | ||
215 | addq $6, 1, $6 # .. E .. .. : | ||
216 | bgt $0, $onebyte # U .. .. .. : U L U L | ||
217 | |||
218 | $zerolength: | ||
219 | $exception: # Destination for exception recovery(?) | ||
220 | nop # .. .. .. E : | ||
221 | nop # .. .. E .. : | ||
222 | nop # .. E .. .. : | ||
223 | ret $31, ($28), 1 # L0 .. .. .. : L U L U | ||
224 | .end __do_clear_user | ||
225 | |||
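The structure of __do_clear_user above is a classic head/body/tail split. As an illustration only, here is a minimal, hypothetical C sketch of that structure; it ignores the __ex_table exception handling, the wh64 write hints and the EV6 slotting, and it clears the misaligned head a byte at a time rather than with the masked ldq_u/mskql/stq_u sequence.

        #include <stddef.h>
        #include <stdint.h>
        #include <string.h>

        /* hypothetical helper, not the kernel's __do_clear_user */
        static void clear_bytes_sketch(unsigned char *dst, size_t len)
        {
                /* head: advance to an 8-byte boundary (mskql/stq_u above) */
                while (len && ((uintptr_t)dst & 7)) {
                        *dst++ = 0;
                        len--;
                }
                /* body: whole quadwords, which the assembly does with stq_u $31
                 * stores grouped into 64-byte blocks behind a wh64 hint */
                while (len >= 8) {
                        memset(dst, 0, 8);
                        dst += 8;
                        len -= 8;
                }
                /* tail: 0..7 trailing bytes (the $onebyte loop) */
                while (len--)
                        *dst++ = 0;
        }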
diff --git a/arch/alpha/lib/ev6-copy_page.S b/arch/alpha/lib/ev6-copy_page.S new file mode 100644 index 000000000000..b789db192754 --- /dev/null +++ b/arch/alpha/lib/ev6-copy_page.S | |||
@@ -0,0 +1,203 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/ev6-copy_page.S | ||
3 | * | ||
4 | * Copy an entire page. | ||
5 | */ | ||
6 | |||
7 | /* The following comparison of this routine vs the normal copy_page.S | ||
8 | was written by an unnamed ev6 hardware designer and forwarded to me | ||
9 | via Steven Hobbs <hobbs@steven.zko.dec.com>. | ||
10 | |||
11 | First Problem: STQ overflows. | ||
12 | ----------------------------- | ||
13 | |||
14 | It would be nice if EV6 handled every resource overflow efficiently, | ||
15 | but for some, including store queue overflows, it doesn't: they cause | ||
16 | a trap and a restart of the pipe. | ||
17 | |||
18 | To get around this we sometimes use (to borrow a term from a VSSAD | ||
19 | researcher) "aeration". The idea is to slow the rate at which the | ||
20 | processor receives valid instructions by inserting nops in the fetch | ||
21 | path. In doing so, you can prevent the overflow and actually make | ||
22 | the code run faster. You can, of course, take advantage of the fact | ||
23 | that the processor can fetch at most 4 aligned instructions per cycle. | ||
24 | |||
25 | I inserted enough nops to force it to take 10 cycles to fetch the | ||
26 | loop code. In theory, EV6 should be able to execute this loop in | ||
27 | 9 cycles but I was not able to get it to run that fast -- the initial | ||
28 | conditions were such that I could not reach this optimum rate on | ||
29 | (chaotic) EV6. I wrote the code such that everything would issue | ||
30 | in order. | ||
31 | |||
32 | Second Problem: Dcache index matches. | ||
33 | ------------------------------------- | ||
34 | |||
35 | If you are going to use this routine on random aligned pages, there | ||
36 | is a 25% chance that the pages will be at the same dcache indices. | ||
37 | Without care, this results in many nasty memory traps. | ||
38 | |||
39 | The solution is to schedule the prefetches to avoid the memory | ||
40 | conflicts. I schedule the wh64 prefetches farther ahead of the | ||
41 | read prefetches to avoid this problem. | ||
42 | |||
43 | Third Problem: Needs more prefetching. | ||
44 | -------------------------------------- | ||
45 | |||
46 | In order to improve the code I added deeper prefetching to take the | ||
47 | most advantage of EV6's bandwidth. | ||
48 | |||
49 | I also prefetched the read stream. Note that adding the read prefetch | ||
50 | forced me to add another cycle to the inner-most kernel - up to 11 | ||
51 | from the original 8 cycles per iteration. We could improve performance | ||
52 | further by unrolling the loop and doing multiple prefetches per cycle. | ||
53 | |||
54 | I think that the code below will be very robust and fast code for the | ||
55 | purposes of copying aligned pages. It is slower when both source and | ||
56 | destination pages are in the dcache, but it is my guess that this is | ||
57 | less important than the dcache miss case. */ | ||
58 | |||
59 | |||
60 | .text | ||
61 | .align 4 | ||
62 | .global copy_page | ||
63 | .ent copy_page | ||
64 | copy_page: | ||
65 | .prologue 0 | ||
66 | |||
67 | /* Prefetch 5 read cachelines; write-hint 10 cache lines. */ | ||
68 | wh64 ($16) | ||
69 | ldl $31,0($17) | ||
70 | ldl $31,64($17) | ||
71 | lda $1,1*64($16) | ||
72 | |||
73 | wh64 ($1) | ||
74 | ldl $31,128($17) | ||
75 | ldl $31,192($17) | ||
76 | lda $1,2*64($16) | ||
77 | |||
78 | wh64 ($1) | ||
79 | ldl $31,256($17) | ||
80 | lda $18,118 | ||
81 | lda $1,3*64($16) | ||
82 | |||
83 | wh64 ($1) | ||
84 | nop | ||
85 | lda $1,4*64($16) | ||
86 | lda $2,5*64($16) | ||
87 | |||
88 | wh64 ($1) | ||
89 | wh64 ($2) | ||
90 | lda $1,6*64($16) | ||
91 | lda $2,7*64($16) | ||
92 | |||
93 | wh64 ($1) | ||
94 | wh64 ($2) | ||
95 | lda $1,8*64($16) | ||
96 | lda $2,9*64($16) | ||
97 | |||
98 | wh64 ($1) | ||
99 | wh64 ($2) | ||
100 | lda $19,10*64($16) | ||
101 | nop | ||
102 | |||
103 | /* Main prefetching/write-hinting loop. */ | ||
104 | 1: ldq $0,0($17) | ||
105 | ldq $1,8($17) | ||
106 | unop | ||
107 | unop | ||
108 | |||
109 | unop | ||
110 | unop | ||
111 | ldq $2,16($17) | ||
112 | ldq $3,24($17) | ||
113 | |||
114 | ldq $4,32($17) | ||
115 | ldq $5,40($17) | ||
116 | unop | ||
117 | unop | ||
118 | |||
119 | unop | ||
120 | unop | ||
121 | ldq $6,48($17) | ||
122 | ldq $7,56($17) | ||
123 | |||
124 | ldl $31,320($17) | ||
125 | unop | ||
126 | unop | ||
127 | unop | ||
128 | |||
129 | /* This gives the extra cycle of aeration above the minimum. */ | ||
130 | unop | ||
131 | unop | ||
132 | unop | ||
133 | unop | ||
134 | |||
135 | wh64 ($19) | ||
136 | unop | ||
137 | unop | ||
138 | unop | ||
139 | |||
140 | stq $0,0($16) | ||
141 | subq $18,1,$18 | ||
142 | stq $1,8($16) | ||
143 | unop | ||
144 | |||
145 | unop | ||
146 | stq $2,16($16) | ||
147 | addq $17,64,$17 | ||
148 | stq $3,24($16) | ||
149 | |||
150 | stq $4,32($16) | ||
151 | stq $5,40($16) | ||
152 | addq $19,64,$19 | ||
153 | unop | ||
154 | |||
155 | stq $6,48($16) | ||
156 | stq $7,56($16) | ||
157 | addq $16,64,$16 | ||
158 | bne $18, 1b | ||
159 | |||
160 | /* Prefetch the final 5 cache lines of the read stream. */ | ||
161 | lda $18,10 | ||
162 | ldl $31,320($17) | ||
163 | ldl $31,384($17) | ||
164 | ldl $31,448($17) | ||
165 | |||
166 | ldl $31,512($17) | ||
167 | ldl $31,576($17) | ||
168 | nop | ||
169 | nop | ||
170 | |||
171 | /* Non-prefetching, non-write-hinting cleanup loop for the | ||
172 | final 10 cache lines. */ | ||
173 | 2: ldq $0,0($17) | ||
174 | ldq $1,8($17) | ||
175 | ldq $2,16($17) | ||
176 | ldq $3,24($17) | ||
177 | |||
178 | ldq $4,32($17) | ||
179 | ldq $5,40($17) | ||
180 | ldq $6,48($17) | ||
181 | ldq $7,56($17) | ||
182 | |||
183 | stq $0,0($16) | ||
184 | subq $18,1,$18 | ||
185 | stq $1,8($16) | ||
186 | addq $17,64,$17 | ||
187 | |||
188 | stq $2,16($16) | ||
189 | stq $3,24($16) | ||
190 | stq $4,32($16) | ||
191 | stq $5,40($16) | ||
192 | |||
193 | stq $6,48($16) | ||
194 | stq $7,56($16) | ||
195 | addq $16,64,$16 | ||
196 | bne $18, 2b | ||
197 | |||
198 | ret | ||
199 | nop | ||
200 | unop | ||
201 | nop | ||
202 | |||
203 | .end copy_page | ||
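As a rough illustration of the comment at the top of this file (prefetch the read stream several cache lines ahead, write-hint the destination, then move 64 bytes per iteration), the loop structure might be sketched in C as below. GCC's __builtin_prefetch is only a stand-in for the "ldl $31,N($17)" read prefetches and the wh64 write hints, and PAGE_SIZE_SKETCH/LINE are names introduced here; the routine the kernel runs is the scheduled assembly above.

        #include <stddef.h>
        #include <string.h>

        #define PAGE_SIZE_SKETCH 8192   /* Alpha page size */
        #define LINE               64   /* cache line / wh64 granularity */

        /* hypothetical sketch, not the kernel's copy_page */
        static void copy_page_sketch(void *to, const void *from)
        {
                char *dst = to;
                const char *src = from;
                size_t off;

                for (off = 0; off < PAGE_SIZE_SKETCH; off += LINE) {
                        if (off + 5 * LINE < PAGE_SIZE_SKETCH)
                                __builtin_prefetch(src + off + 5 * LINE, 0); /* read stream */
                        __builtin_prefetch(dst + off, 1);                    /* write hint */
                        memcpy(dst + off, src + off, LINE);
                }
        }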
diff --git a/arch/alpha/lib/ev6-copy_user.S b/arch/alpha/lib/ev6-copy_user.S new file mode 100644 index 000000000000..db42ffe9c350 --- /dev/null +++ b/arch/alpha/lib/ev6-copy_user.S | |||
@@ -0,0 +1,259 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/ev6-copy_user.S | ||
3 | * | ||
4 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
5 | * | ||
6 | * Copy to/from user space, handling exceptions as we go.. This | ||
7 | * isn't exactly pretty. | ||
8 | * | ||
9 | * This is essentially the same as "memcpy()", but with a few twists. | ||
10 | * Notably, we have to make sure that $0 is always up-to-date and | ||
11 | * contains the right "bytes left to copy" value (and that it is updated | ||
12 | * only _after_ a successful copy). There is also some rather minor | ||
13 | * exception setup stuff.. | ||
14 | * | ||
15 | * NOTE! This is not directly C-callable, because the calling semantics are | ||
16 | * different: | ||
17 | * | ||
18 | * Inputs: | ||
19 | * length in $0 | ||
20 | * destination address in $6 | ||
21 | * source address in $7 | ||
22 | * return address in $28 | ||
23 | * | ||
24 | * Outputs: | ||
25 | * bytes left to copy in $0 | ||
26 | * | ||
27 | * Clobbers: | ||
28 | * $1,$2,$3,$4,$5,$6,$7 | ||
29 | * | ||
30 | * Much of the information about 21264 scheduling/coding comes from: | ||
31 | * Compiler Writer's Guide for the Alpha 21264 | ||
32 | * abbreviated as 'CWG' in other comments here | ||
33 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
34 | * Scheduling notation: | ||
35 | * E - either cluster | ||
36 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
37 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
38 | */ | ||
39 | |||
40 | /* Allow an exception for an insn; exit if we get one. */ | ||
41 | #define EXI(x,y...) \ | ||
42 | 99: x,##y; \ | ||
43 | .section __ex_table,"a"; \ | ||
44 | .long 99b - .; \ | ||
45 | lda $31, $exitin-99b($31); \ | ||
46 | .previous | ||
47 | |||
48 | #define EXO(x,y...) \ | ||
49 | 99: x,##y; \ | ||
50 | .section __ex_table,"a"; \ | ||
51 | .long 99b - .; \ | ||
52 | lda $31, $exitout-99b($31); \ | ||
53 | .previous | ||
54 | |||
55 | .set noat | ||
56 | .align 4 | ||
57 | .globl __copy_user | ||
58 | .ent __copy_user | ||
59 | # Pipeline info: Slotting & Comments | ||
60 | __copy_user: | ||
61 | .prologue 0 | ||
62 | subq $0, 32, $1 # .. E .. .. : Is this going to be a small copy? | ||
63 | beq $0, $zerolength # U .. .. .. : U L U L | ||
64 | |||
65 | and $6,7,$3 # .. .. .. E : is leading dest misalignment | ||
66 | ble $1, $onebyteloop # .. .. U .. : 1st branch : small amount of data | ||
67 | beq $3, $destaligned # .. U .. .. : 2nd (one cycle fetcher stall) | ||
68 | subq $3, 8, $3 # E .. .. .. : L U U L : trip counter | ||
69 | /* | ||
70 | * The fetcher stall also hides the 1 cycle cross-cluster stall for $3 (L --> U) | ||
71 | * This loop aligns the destination a byte at a time | ||
72 | * We know we have at least one trip through this loop | ||
73 | */ | ||
74 | $aligndest: | ||
75 | EXI( ldbu $1,0($7) ) # .. .. .. L : Keep loads separate from stores | ||
76 | addq $6,1,$6 # .. .. E .. : Section 3.8 in the CWG | ||
77 | addq $3,1,$3 # .. E .. .. : | ||
78 | nop # E .. .. .. : U L U L | ||
79 | |||
80 | /* | ||
81 | * the -1 is to compensate for the inc($6) done in a previous quadpack | ||
83 | * which allows us to have zero dependencies within either quadpack in the loop | ||
83 | */ | ||
84 | EXO( stb $1,-1($6) ) # .. .. .. L : | ||
85 | addq $7,1,$7 # .. .. E .. : Section 3.8 in the CWG | ||
86 | subq $0,1,$0 # .. E .. .. : | ||
87 | bne $3, $aligndest # U .. .. .. : U L U L | ||
88 | |||
89 | /* | ||
90 | * If we fell through into here, we have a minimum of 33 - 7 bytes | ||
91 | * If we arrived via branch, we have a minimum of 32 bytes | ||
92 | */ | ||
93 | $destaligned: | ||
94 | and $7,7,$1 # .. .. .. E : Check _current_ source alignment | ||
95 | bic $0,7,$4 # .. .. E .. : number of bytes to move as quadwords | ||
96 | EXI( ldq_u $3,0($7) ) # .. L .. .. : Forward fetch for fallthrough code | ||
97 | beq $1,$quadaligned # U .. .. .. : U L U L | ||
98 | |||
99 | /* | ||
100 | * In the worst case, we've just executed an ldq_u here from 0($7) | ||
101 | * and we'll repeat it once if we take the branch | ||
102 | */ | ||
103 | |||
104 | /* Misaligned quadword loop - not unrolled. Leave it that way. */ | ||
105 | $misquad: | ||
106 | EXI( ldq_u $2,8($7) ) # .. .. .. L : | ||
107 | subq $4,8,$4 # .. .. E .. : | ||
108 | extql $3,$7,$3 # .. U .. .. : | ||
109 | extqh $2,$7,$1 # U .. .. .. : U U L L | ||
110 | |||
111 | bis $3,$1,$1 # .. .. .. E : | ||
112 | EXO( stq $1,0($6) ) # .. .. L .. : | ||
113 | addq $7,8,$7 # .. E .. .. : | ||
114 | subq $0,8,$0 # E .. .. .. : U L L U | ||
115 | |||
116 | addq $6,8,$6 # .. .. .. E : | ||
117 | bis $2,$2,$3 # .. .. E .. : | ||
118 | nop # .. E .. .. : | ||
119 | bne $4,$misquad # U .. .. .. : U L U L | ||
120 | |||
121 | nop # .. .. .. E | ||
122 | nop # .. .. E .. | ||
123 | nop # .. E .. .. | ||
124 | beq $0,$zerolength # U .. .. .. : U L U L | ||
125 | |||
126 | /* We know we have at least one trip through the byte loop */ | ||
127 | EXI ( ldbu $2,0($7) ) # .. .. .. L : No loads in the same quad | ||
128 | addq $6,1,$6 # .. .. E .. : as the store (Section 3.8 in CWG) | ||
129 | nop # .. E .. .. : | ||
130 | br $31, $dirtyentry # L0 .. .. .. : L U U L | ||
131 | /* Do the trailing byte loop load, then hop into the store part of the loop */ | ||
132 | |||
133 | /* | ||
134 | * A minimum of (33 - 7) bytes to do a quad at a time. | ||
135 | * Based upon the usage context, it's worth the effort to unroll this loop | ||
136 | * $0 - number of bytes to be moved | ||
137 | * $4 - number of bytes to move as quadwords | ||
138 | * $6 is current destination address | ||
139 | * $7 is current source address | ||
140 | */ | ||
141 | $quadaligned: | ||
142 | subq $4, 32, $2 # .. .. .. E : do not unroll for small stuff | ||
143 | nop # .. .. E .. | ||
144 | nop # .. E .. .. | ||
145 | blt $2, $onequad # U .. .. .. : U L U L | ||
146 | |||
147 | /* | ||
148 | * There is a significant assumption here that the source and destination | ||
149 | * addresses differ by more than 32 bytes. In this particular case, a | ||
150 | * sparsity of registers further bounds this to be a minimum of 8 bytes. | ||
151 | * But if this isn't met, then the output result will be incorrect. | ||
152 | * Furthermore, due to a lack of available registers, we really can't | ||
153 | * unroll this to be an 8x loop (which would enable us to use the wh64 | ||
154 | * memory hint instruction). | ||
155 | */ | ||
156 | $unroll4: | ||
157 | EXI( ldq $1,0($7) ) # .. .. .. L | ||
158 | EXI( ldq $2,8($7) ) # .. .. L .. | ||
159 | subq $4,32,$4 # .. E .. .. | ||
160 | nop # E .. .. .. : U U L L | ||
161 | |||
162 | addq $7,16,$7 # .. .. .. E | ||
163 | EXO( stq $1,0($6) ) # .. .. L .. | ||
164 | EXO( stq $2,8($6) ) # .. L .. .. | ||
165 | subq $0,16,$0 # E .. .. .. : U L L U | ||
166 | |||
167 | addq $6,16,$6 # .. .. .. E | ||
168 | EXI( ldq $1,0($7) ) # .. .. L .. | ||
169 | EXI( ldq $2,8($7) ) # .. L .. .. | ||
170 | subq $4, 32, $3 # E .. .. .. : U U L L : is there enough for another trip? | ||
171 | |||
172 | EXO( stq $1,0($6) ) # .. .. .. L | ||
173 | EXO( stq $2,8($6) ) # .. .. L .. | ||
174 | subq $0,16,$0 # .. E .. .. | ||
175 | addq $7,16,$7 # E .. .. .. : U L L U | ||
176 | |||
177 | nop # .. .. .. E | ||
178 | nop # .. .. E .. | ||
179 | addq $6,16,$6 # .. E .. .. | ||
180 | bgt $3,$unroll4 # U .. .. .. : U L U L | ||
181 | |||
182 | nop | ||
183 | nop | ||
184 | nop | ||
185 | beq $4, $noquads | ||
186 | |||
187 | $onequad: | ||
188 | EXI( ldq $1,0($7) ) | ||
189 | subq $4,8,$4 | ||
190 | addq $7,8,$7 | ||
191 | nop | ||
192 | |||
193 | EXO( stq $1,0($6) ) | ||
194 | subq $0,8,$0 | ||
195 | addq $6,8,$6 | ||
196 | bne $4,$onequad | ||
197 | |||
198 | $noquads: | ||
199 | nop | ||
200 | nop | ||
201 | nop | ||
202 | beq $0,$zerolength | ||
203 | |||
204 | /* | ||
205 | * For small copies (or the tail of a larger copy), do a very simple byte loop. | ||
206 | * There's no point in doing a lot of complex alignment calculations to try to | ||
207 | * do quadword stuff for a small amount of data. | ||
208 | * $0 - remaining number of bytes left to copy | ||
209 | * $6 - current dest addr | ||
210 | * $7 - current source addr | ||
211 | */ | ||
212 | |||
213 | $onebyteloop: | ||
214 | EXI ( ldbu $2,0($7) ) # .. .. .. L : No loads in the same quad | ||
215 | addq $6,1,$6 # .. .. E .. : as the store (Section 3.8 in CWG) | ||
216 | nop # .. E .. .. : | ||
217 | nop # E .. .. .. : U L U L | ||
218 | |||
219 | $dirtyentry: | ||
220 | /* | ||
221 | * the -1 is to compensate for the inc($6) done in a previous quadpack | ||
222 | * which allows us to have zero dependencies within either quadpack in the loop | ||
223 | */ | ||
224 | EXO ( stb $2,-1($6) ) # .. .. .. L : | ||
225 | addq $7,1,$7 # .. .. E .. : quadpack as the load | ||
226 | subq $0,1,$0 # .. E .. .. : change count _after_ copy | ||
227 | bgt $0,$onebyteloop # U .. .. .. : U L U L | ||
228 | |||
229 | $zerolength: | ||
230 | $exitout: # Destination for exception recovery(?) | ||
231 | nop # .. .. .. E | ||
232 | nop # .. .. E .. | ||
233 | nop # .. E .. .. | ||
234 | ret $31,($28),1 # L0 .. .. .. : L U L U | ||
235 | |||
236 | $exitin: | ||
237 | |||
238 | /* A stupid byte-by-byte zeroing of the rest of the output | ||
239 | buffer. This cures security holes by never leaving | ||
240 | random kernel data around to be copied elsewhere. */ | ||
241 | |||
242 | nop | ||
243 | nop | ||
244 | nop | ||
245 | mov $0,$1 | ||
246 | |||
247 | $101: | ||
248 | EXO ( stb $31,0($6) ) # L | ||
249 | subq $1,1,$1 # E | ||
250 | addq $6,1,$6 # E | ||
251 | bgt $1,$101 # U | ||
252 | |||
253 | nop | ||
254 | nop | ||
255 | nop | ||
256 | ret $31,($28),1 # L0 | ||
257 | |||
258 | .end __copy_user | ||
259 | |||
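The contract stated in the header above is easier to see in a simplified C model: the return value is the number of bytes not copied, the count is updated only after successful accesses, and a faulting read causes the untouched tail of the destination to be zeroed (the $exitin path) so no stale kernel data leaks. The sketch below covers only the copy-from-user, read-fault case; read_user_byte() is a purely hypothetical helper standing in for the EXI/EXO exception-checked accesses, and a faulting write would simply return the remaining count.

        #include <stddef.h>

        /* hypothetical helper: returns nonzero if reading *src faulted */
        extern int read_user_byte(unsigned char *val, const unsigned char *src);

        static size_t copy_user_sketch(unsigned char *dst, const unsigned char *src,
                                       size_t len)
        {
                while (len) {
                        unsigned char c;

                        if (read_user_byte(&c, src)) {
                                /* $exitin: zero what we could not fill, then
                                 * report how many bytes were left over */
                                size_t left = len;
                                unsigned char *p = dst;

                                while (left--)
                                        *p++ = 0;
                                return len;
                        }
                        *dst++ = c;
                        src++;
                        len--;
                }
                return 0;       /* everything copied */
        }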
diff --git a/arch/alpha/lib/ev6-csum_ipv6_magic.S b/arch/alpha/lib/ev6-csum_ipv6_magic.S new file mode 100644 index 000000000000..de1948a69118 --- /dev/null +++ b/arch/alpha/lib/ev6-csum_ipv6_magic.S | |||
@@ -0,0 +1,126 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/ev6-csum_ipv6_magic.S | ||
3 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
4 | * | ||
5 | * unsigned short csum_ipv6_magic(struct in6_addr *saddr, | ||
6 | * struct in6_addr *daddr, | ||
7 | * __u32 len, | ||
8 | * unsigned short proto, | ||
9 | * unsigned int csum); | ||
10 | * | ||
11 | * Much of the information about 21264 scheduling/coding comes from: | ||
12 | * Compiler Writer's Guide for the Alpha 21264 | ||
13 | * abbreviated as 'CWG' in other comments here | ||
14 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
15 | * Scheduling notation: | ||
16 | * E - either cluster | ||
17 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
18 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
19 | * Try not to change the actual algorithm if possible for consistency. | ||
20 | * Determining actual stalls (other than slotting) doesn't appear to be easy to do. | ||
21 | * | ||
22 | * unsigned short csum_ipv6_magic(struct in6_addr *saddr, | ||
23 | * struct in6_addr *daddr, | ||
24 | * __u32 len, | ||
25 | * unsigned short proto, | ||
26 | * unsigned int csum); | ||
27 | * | ||
28 | * Swap <proto> (takes form 0xaabb) | ||
29 | * Then shift it left by 48, so result is: | ||
30 | * 0xbbaa0000 00000000 | ||
31 | * Then turn it back into a sign extended 32-bit item | ||
32 | * 0xbbaa0000 | ||
33 | * | ||
34 | * Swap <len> (an unsigned int) using Mike Burrows' 7-instruction sequence | ||
35 | * (we can't hide the 3-cycle latency of the unpkbw in the 6-instruction sequence) | ||
36 | * Assume input takes form 0xAABBCCDD | ||
37 | * | ||
38 | * Finally, original 'folding' approach is to split the long into 4 unsigned shorts | ||
39 | * add 4 ushorts, resulting in ushort/carry | ||
40 | * add carry bits + ushort --> ushort | ||
41 | * add carry bits + ushort --> ushort (in case the carry results in an overflow) | ||
42 | * Truncate to a ushort. (took 13 instructions) | ||
43 | * From doing some testing, using the approach in checksum.c:from64to16() | ||
44 | * results in the same outcome: | ||
45 | * split into 2 uints, add those, generating a ulong | ||
46 | * add the 3 low ushorts together, generating a uint | ||
47 | * a final add of the 2 lower ushorts | ||
48 | * truncating the result. | ||
49 | */ | ||
50 | |||
51 | .globl csum_ipv6_magic | ||
52 | .align 4 | ||
53 | .ent csum_ipv6_magic | ||
54 | .frame $30,0,$26,0 | ||
55 | csum_ipv6_magic: | ||
56 | .prologue 0 | ||
57 | |||
58 | ldq $0,0($16) # L : Latency: 3 | ||
59 | inslh $18,7,$4 # U : 0000000000AABBCC | ||
60 | ldq $1,8($16) # L : Latency: 3 | ||
61 | sll $19,8,$7 # U : U L U L : 0x00000000 00aabb00 | ||
62 | |||
63 | zapnot $20,15,$20 # U : zero extend incoming csum | ||
64 | ldq $2,0($17) # L : Latency: 3 | ||
65 | sll $19,24,$19 # U : U L L U : 0x000000aa bb000000 | ||
66 | inswl $18,3,$18 # U : 000000CCDD000000 | ||
67 | |||
68 | ldq $3,8($17) # L : Latency: 3 | ||
69 | bis $18,$4,$18 # E : 000000CCDDAABBCC | ||
70 | addl $19,$7,$19 # E : <sign bits>bbaabb00 | ||
71 | nop # E : U L U L | ||
72 | |||
73 | addq $20,$0,$20 # E : begin summing the words | ||
74 | srl $18,16,$4 # U : 0000000000CCDDAA | ||
75 | zap $19,0x3,$19 # U : <sign bits>bbaa0000 | ||
76 | nop # E : L U U L | ||
77 | |||
78 | cmpult $20,$0,$0 # E : | ||
79 | addq $20,$1,$20 # E : | ||
80 | zapnot $18,0xa,$18 # U : 00000000DD00BB00 | ||
81 | zap $4,0xa,$4 # U : U U L L : 0000000000CC00AA | ||
82 | |||
83 | or $18,$4,$18 # E : 00000000DDCCBBAA | ||
84 | nop # E : | ||
85 | cmpult $20,$1,$1 # E : | ||
86 | addq $20,$2,$20 # E : U L U L | ||
87 | |||
88 | cmpult $20,$2,$2 # E : | ||
89 | addq $20,$3,$20 # E : | ||
90 | cmpult $20,$3,$3 # E : (1 cycle stall on $20) | ||
91 | addq $20,$18,$20 # E : U L U L (1 cycle stall on $20) | ||
92 | |||
93 | cmpult $20,$18,$18 # E : | ||
94 | addq $20,$19,$20 # E : (1 cycle stall on $20) | ||
95 | addq $0,$1,$0 # E : merge the carries back into the csum | ||
96 | addq $2,$3,$2 # E : | ||
97 | |||
98 | cmpult $20,$19,$19 # E : | ||
99 | addq $18,$19,$18 # E : (1 cycle stall on $19) | ||
100 | addq $0,$2,$0 # E : | ||
101 | addq $20,$18,$20 # E : U L U L : | ||
102 | /* (1 cycle stall on $18, 2 cycles on $20) */ | ||
103 | |||
104 | addq $0,$20,$0 # E : | ||
105 | zapnot $0,15,$1 # U : Start folding output (1 cycle stall on $0) | ||
106 | nop # E : | ||
107 | srl $0,32,$0 # U : U L U L : (1 cycle stall on $0) | ||
108 | |||
109 | addq $1,$0,$1 # E : Finished generating ulong | ||
110 | extwl $1,2,$2 # U : ushort[1] (1 cycle stall on $1) | ||
111 | zapnot $1,3,$0 # U : ushort[0] (1 cycle stall on $1) | ||
112 | extwl $1,4,$1 # U : ushort[2] (1 cycle stall on $1) | ||
113 | |||
114 | addq $0,$2,$0 # E | ||
115 | addq $0,$1,$3 # E : Finished generating uint | ||
116 | /* (1 cycle stall on $0) */ | ||
117 | extwl $3,2,$1 # U : ushort[1] (1 cycle stall on $3) | ||
118 | nop # E : L U L U | ||
119 | |||
120 | addq $1,$3,$0 # E : Final carry | ||
121 | not $0,$4 # E : complement (1 cycle stall on $0) | ||
122 | zapnot $4,3,$0 # U : clear upper garbage bits | ||
123 | /* (1 cycle stall on $4) */ | ||
124 | ret # L0 : L U L U | ||
125 | |||
126 | .end csum_ipv6_magic | ||
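The folding strategy described in the header comment (the checksum.c:from64to16() approach) can be written out in C. The sketch below follows that description rather than the exact instruction sequence above, and the function name is introduced here only for illustration.

        #include <stdint.h>

        /* hypothetical rendering of the fold described above */
        static uint16_t from64to16_sketch(uint64_t sum)
        {
                /* split into two 32-bit halves and add them: at most 33 bits */
                sum = (sum & 0xffffffffULL) + (sum >> 32);
                /* add the three low 16-bit words together */
                sum = (sum & 0xffff) + ((sum >> 16) & 0xffff) + ((sum >> 32) & 0xffff);
                /* a final add of the two low 16-bit words, then truncate */
                sum = (sum & 0xffff) + ((sum >> 16) & 0xffff);
                return (uint16_t)sum;
        }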
diff --git a/arch/alpha/lib/ev6-divide.S b/arch/alpha/lib/ev6-divide.S new file mode 100644 index 000000000000..2a82b9be93fa --- /dev/null +++ b/arch/alpha/lib/ev6-divide.S | |||
@@ -0,0 +1,259 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/ev6-divide.S | ||
3 | * | ||
4 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
5 | * | ||
6 | * Alpha division.. | ||
7 | */ | ||
8 | |||
9 | /* | ||
10 | * The alpha chip doesn't provide hardware division, so we have to do it | ||
11 | * by hand. The compiler expects the functions | ||
12 | * | ||
13 | * __divqu: 64-bit unsigned long divide | ||
14 | * __remqu: 64-bit unsigned long remainder | ||
15 | * __divqs/__remqs: signed 64-bit | ||
16 | * __divlu/__remlu: unsigned 32-bit | ||
17 | * __divls/__remls: signed 32-bit | ||
18 | * | ||
19 | * These are not normal C functions: instead of the normal | ||
20 | * calling sequence, these expect their arguments in registers | ||
21 | * $24 and $25, and return the result in $27. Register $28 may | ||
22 | * be clobbered (assembly temporary), anything else must be saved. | ||
23 | * | ||
24 | * In short: painful. | ||
25 | * | ||
26 | * This is a rather simple bit-at-a-time algorithm: it's very good | ||
27 | * at dividing random 64-bit numbers, but the more usual case where | ||
28 | * the divisor is small is handled better by the DEC algorithm | ||
29 | * using lookup tables. This uses much less memory, though, and is | ||
30 | * nicer on the cache.. Besides, I don't know the copyright status | ||
31 | * of the DEC code. | ||
32 | */ | ||
33 | |||
34 | /* | ||
35 | * My temporaries: | ||
36 | * $0 - current bit | ||
37 | * $1 - shifted divisor | ||
38 | * $2 - modulus/quotient | ||
39 | * | ||
40 | * $23 - return address | ||
41 | * $24 - dividend | ||
42 | * $25 - divisor | ||
43 | * | ||
44 | * $27 - quotient/modulus | ||
45 | * $28 - compare status | ||
46 | * | ||
47 | * Much of the information about 21264 scheduling/coding comes from: | ||
48 | * Compiler Writer's Guide for the Alpha 21264 | ||
49 | * abbreviated as 'CWG' in other comments here | ||
50 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
51 | * Scheduling notation: | ||
52 | * E - either cluster | ||
53 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
54 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
55 | * Try not to change the actual algorithm if possible for consistency. | ||
56 | */ | ||
57 | |||
58 | #define halt .long 0 | ||
59 | |||
60 | /* | ||
61 | * Select function type and registers | ||
62 | */ | ||
63 | #define mask $0 | ||
64 | #define divisor $1 | ||
65 | #define compare $28 | ||
66 | #define tmp1 $3 | ||
67 | #define tmp2 $4 | ||
68 | |||
69 | #ifdef DIV | ||
70 | #define DIV_ONLY(x,y...) x,##y | ||
71 | #define MOD_ONLY(x,y...) | ||
72 | #define func(x) __div##x | ||
73 | #define modulus $2 | ||
74 | #define quotient $27 | ||
75 | #define GETSIGN(x) xor $24,$25,x | ||
76 | #define STACK 48 | ||
77 | #else | ||
78 | #define DIV_ONLY(x,y...) | ||
79 | #define MOD_ONLY(x,y...) x,##y | ||
80 | #define func(x) __rem##x | ||
81 | #define modulus $27 | ||
82 | #define quotient $2 | ||
83 | #define GETSIGN(x) bis $24,$24,x | ||
84 | #define STACK 32 | ||
85 | #endif | ||
86 | |||
87 | /* | ||
88 | * For 32-bit operations, we need to extend to 64-bit | ||
89 | */ | ||
90 | #ifdef INTSIZE | ||
91 | #define ufunction func(lu) | ||
92 | #define sfunction func(l) | ||
93 | #define LONGIFY(x) zapnot x,15,x | ||
94 | #define SLONGIFY(x) addl x,0,x | ||
95 | #else | ||
96 | #define ufunction func(qu) | ||
97 | #define sfunction func(q) | ||
98 | #define LONGIFY(x) | ||
99 | #define SLONGIFY(x) | ||
100 | #endif | ||
101 | |||
102 | .set noat | ||
103 | .align 4 | ||
104 | .globl ufunction | ||
105 | .ent ufunction | ||
106 | ufunction: | ||
107 | subq $30,STACK,$30 # E : | ||
108 | .frame $30,STACK,$23 | ||
109 | .prologue 0 | ||
110 | |||
111 | 7: stq $1, 0($30) # L : | ||
112 | bis $25,$25,divisor # E : | ||
113 | stq $2, 8($30) # L : L U L U | ||
114 | |||
115 | bis $24,$24,modulus # E : | ||
116 | stq $0,16($30) # L : | ||
117 | bis $31,$31,quotient # E : | ||
118 | LONGIFY(divisor) # E : U L L U | ||
119 | |||
120 | stq tmp1,24($30) # L : | ||
121 | LONGIFY(modulus) # E : | ||
122 | bis $31,1,mask # E : | ||
123 | DIV_ONLY(stq tmp2,32($30)) # L : L U U L | ||
124 | |||
125 | beq divisor, 9f /* div by zero */ | ||
126 | /* | ||
127 | * In spite of the DIV_ONLY being either a non-instruction | ||
128 | * or an actual stq, the addition of the .align directive | ||
129 | * below ensures that label 1 is going to be nicely aligned | ||
130 | */ | ||
131 | |||
132 | .align 4 | ||
133 | #ifdef INTSIZE | ||
134 | /* | ||
135 | * shift divisor left, using 3-bit shifts for | ||
136 | * 32-bit divides as we can't overflow. Three-bit | ||
137 | * shifts will result in looping three times less | ||
138 | * here, but can result in two loops more later. | ||
139 | * Thus using a large shift isn't worth it (and | ||
140 | * s8add pairs better than a sll..) | ||
141 | */ | ||
142 | 1: cmpult divisor,modulus,compare # E : | ||
143 | s8addq divisor,$31,divisor # E : | ||
144 | s8addq mask,$31,mask # E : | ||
145 | bne compare,1b # U : U L U L | ||
146 | #else | ||
147 | 1: cmpult divisor,modulus,compare # E : | ||
148 | nop # E : | ||
149 | nop # E : | ||
150 | blt divisor, 2f # U : U L U L | ||
151 | |||
152 | addq divisor,divisor,divisor # E : | ||
153 | addq mask,mask,mask # E : | ||
154 | unop # E : | ||
155 | bne compare,1b # U : U L U L | ||
156 | #endif | ||
157 | |||
158 | /* ok, start to go right again.. */ | ||
159 | 2: | ||
160 | /* | ||
161 | * Keep things nicely bundled... use a nop instead of not | ||
162 | * having an instruction for DIV_ONLY | ||
163 | */ | ||
164 | #ifdef DIV | ||
165 | DIV_ONLY(addq quotient,mask,tmp2) # E : | ||
166 | #else | ||
167 | nop # E : | ||
168 | #endif | ||
169 | srl mask,1,mask # U : | ||
170 | cmpule divisor,modulus,compare # E : | ||
171 | subq modulus,divisor,tmp1 # E : | ||
172 | |||
173 | #ifdef DIV | ||
174 | DIV_ONLY(cmovne compare,tmp2,quotient) # E : Latency 2, extra map slot | ||
175 | nop # E : as part of the cmovne | ||
176 | srl divisor,1,divisor # U : | ||
177 | nop # E : L U L U | ||
178 | |||
179 | nop # E : | ||
180 | cmovne compare,tmp1,modulus # E : Latency 2, extra map slot | ||
181 | nop # E : as part of the cmovne | ||
182 | bne mask,2b # U : U L U L | ||
183 | #else | ||
184 | srl divisor,1,divisor # U : | ||
185 | cmovne compare,tmp1,modulus # E : Latency 2, extra map slot | ||
186 | nop # E : as part of the cmovne | ||
187 | bne mask,2b # U : U L L U | ||
188 | #endif | ||
189 | |||
190 | 9: ldq $1, 0($30) # L : | ||
191 | ldq $2, 8($30) # L : | ||
192 | nop # E : | ||
193 | nop # E : U U L L | ||
194 | |||
195 | ldq $0,16($30) # L : | ||
196 | ldq tmp1,24($30) # L : | ||
197 | nop # E : | ||
198 | nop # E : | ||
199 | |||
200 | #ifdef DIV | ||
201 | DIV_ONLY(ldq tmp2,32($30)) # L : | ||
202 | #else | ||
203 | nop # E : | ||
204 | #endif | ||
205 | addq $30,STACK,$30 # E : | ||
206 | ret $31,($23),1 # L0 : L U U L | ||
207 | .end ufunction | ||
208 | |||
209 | /* | ||
210 | * Uhh.. Ugly signed division. I'd rather not have it at all, but | ||
211 | * it's needed in some circumstances. There are different ways to | ||
212 | * handle this, really. This does: | ||
213 | * -a / b = a / -b = -(a / b) | ||
214 | * -a % b = -(a % b) | ||
215 | * a % -b = a % b | ||
216 | * which is probably not the best solution, but at least should | ||
217 | * have the property that (x/y)*y + (x%y) = x. | ||
218 | */ | ||
219 | .align 4 | ||
220 | .globl sfunction | ||
221 | .ent sfunction | ||
222 | sfunction: | ||
223 | subq $30,STACK,$30 # E : | ||
224 | .frame $30,STACK,$23 | ||
225 | .prologue 0 | ||
226 | bis $24,$25,$28 # E : | ||
227 | SLONGIFY($28) # E : | ||
228 | bge $28,7b # U : | ||
229 | |||
230 | stq $24,0($30) # L : | ||
231 | subq $31,$24,$28 # E : | ||
232 | stq $25,8($30) # L : | ||
233 | nop # E : U L U L | ||
234 | |||
235 | cmovlt $24,$28,$24 /* abs($24) */ # E : Latency 2, extra map slot | ||
236 | nop # E : as part of the cmov | ||
237 | stq $23,16($30) # L : | ||
238 | subq $31,$25,$28 # E : U L U L | ||
239 | |||
240 | stq tmp1,24($30) # L : | ||
241 | cmovlt $25,$28,$25 /* abs($25) */ # E : Latency 2, extra map slot | ||
242 | nop # E : | ||
243 | bsr $23,ufunction # L0: L U L U | ||
244 | |||
245 | ldq $24,0($30) # L : | ||
246 | ldq $25,8($30) # L : | ||
247 | GETSIGN($28) # E : | ||
248 | subq $31,$27,tmp1 # E : U U L L | ||
249 | |||
250 | SLONGIFY($28) # E : | ||
251 | ldq $23,16($30) # L : | ||
252 | cmovlt $28,tmp1,$27 # E : Latency 2, extra map slot | ||
253 | nop # E : U L L U : as part of the cmov | ||
254 | |||
255 | ldq tmp1,24($30) # L : | ||
256 | nop # E : as part of the cmov | ||
257 | addq $30,STACK,$30 # E : | ||
258 | ret $31,($23),1 # L0 : L U U L | ||
259 | .end sfunction | ||
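The bit-at-a-time algorithm the header comment describes is, in effect, classic shift-and-subtract division. A hedged C sketch of the unsigned 64-bit case follows; the register conventions, the 32-bit variants and the signed wrapper are omitted, and divide-by-zero is reported as quotient 0 with the dividend as remainder, matching the assembly's branch straight to the exit.

        #include <stdint.h>

        /* hypothetical sketch, not the kernel's __divqu/__remqu */
        static uint64_t divqu_sketch(uint64_t dividend, uint64_t divisor,
                                     uint64_t *remainder)
        {
                uint64_t mask = 1, quotient = 0;

                if (divisor == 0) {
                        *remainder = dividend;
                        return 0;
                }
                /* shift divisor (and the mask bit) left while it is still below
                 * the dividend and its top bit is clear (the "blt divisor" test) */
                while (divisor < dividend && !(divisor >> 63)) {
                        divisor <<= 1;
                        mask <<= 1;
                }
                /* "ok, start to go right again": accumulate quotient bits */
                while (mask) {
                        if (divisor <= dividend) {
                                dividend -= divisor;
                                quotient |= mask;
                        }
                        divisor >>= 1;
                        mask >>= 1;
                }
                *remainder = dividend;
                return quotient;
        }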
diff --git a/arch/alpha/lib/ev6-memchr.S b/arch/alpha/lib/ev6-memchr.S new file mode 100644 index 000000000000..a8e843dbcc23 --- /dev/null +++ b/arch/alpha/lib/ev6-memchr.S | |||
@@ -0,0 +1,191 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/ev6-memchr.S | ||
3 | * | ||
4 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
5 | * | ||
6 | * Finds characters in a memory area. Optimized for the Alpha: | ||
7 | * | ||
8 | * - memory accessed as aligned quadwords only | ||
9 | * - uses cmpbge to compare 8 bytes in parallel | ||
10 | * - does binary search to find 0 byte in last | ||
11 | * quadword (HAKMEM needed 12 instructions to | ||
12 | * do this instead of the 9 instructions that | ||
13 | * binary search needs). | ||
14 | * | ||
15 | * For correctness consider that: | ||
16 | * | ||
17 | * - only the minimum number of quadwords may be accessed | ||
18 | * - the third argument is an unsigned long | ||
19 | * | ||
20 | * Much of the information about 21264 scheduling/coding comes from: | ||
21 | * Compiler Writer's Guide for the Alpha 21264 | ||
22 | * abbreviated as 'CWG' in other comments here | ||
23 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
24 | * Scheduling notation: | ||
25 | * E - either cluster | ||
26 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
27 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
28 | * Try not to change the actual algorithm if possible for consistency. | ||
29 | */ | ||
30 | |||
31 | .set noreorder | ||
32 | .set noat | ||
33 | |||
34 | .align 4 | ||
35 | .globl memchr | ||
36 | .ent memchr | ||
37 | memchr: | ||
38 | .frame $30,0,$26,0 | ||
39 | .prologue 0 | ||
40 | |||
41 | # Hack -- if someone passes in (size_t)-1, hoping to just | ||
42 | # search til the end of the address space, we will overflow | ||
43 | # below when we find the address of the last byte. Given | ||
44 | # that we will never have a 56-bit address space, cropping | ||
45 | # the length is the easiest way to avoid trouble. | ||
46 | zap $18, 0x80, $5 # U : Bound length | ||
47 | beq $18, $not_found # U : | ||
48 | ldq_u $1, 0($16) # L : load first quadword Latency=3 | ||
49 | and $17, 0xff, $17 # E : L L U U : 00000000000000ch | ||
50 | |||
51 | insbl $17, 1, $2 # U : 000000000000ch00 | ||
52 | cmpult $18, 9, $4 # E : small (< 1 quad) string? | ||
53 | or $2, $17, $17 # E : 000000000000chch | ||
54 | lda $3, -1($31) # E : U L L U | ||
55 | |||
56 | sll $17, 16, $2 # U : 00000000chch0000 | ||
57 | addq $16, $5, $5 # E : Max search address | ||
58 | or $2, $17, $17 # E : 00000000chchchch | ||
59 | sll $17, 32, $2 # U : U L L U : chchchch00000000 | ||
60 | |||
61 | or $2, $17, $17 # E : chchchchchchchch | ||
62 | extql $1, $16, $7 # U : $7 is upper bits | ||
63 | beq $4, $first_quad # U : | ||
64 | ldq_u $6, -1($5) # L : L U U L : eight or less bytes to search Latency=3 | ||
65 | |||
66 | extqh $6, $16, $6 # U : 2 cycle stall for $6 | ||
67 | mov $16, $0 # E : | ||
68 | nop # E : | ||
69 | or $7, $6, $1 # E : L U L U $1 = quadword starting at $16 | ||
70 | |||
71 | # Deal with the case where at most 8 bytes remain to be searched | ||
72 | # in $1. E.g.: | ||
73 | # $18 = 6 | ||
74 | # $1 = ????c6c5c4c3c2c1 | ||
75 | $last_quad: | ||
76 | negq $18, $6 # E : | ||
77 | xor $17, $1, $1 # E : | ||
78 | srl $3, $6, $6 # U : $6 = mask of $18 bits set | ||
79 | cmpbge $31, $1, $2 # E : L U L U | ||
80 | |||
81 | nop | ||
82 | nop | ||
83 | and $2, $6, $2 # E : | ||
84 | beq $2, $not_found # U : U L U L | ||
85 | |||
86 | $found_it: | ||
87 | #if defined(__alpha_fix__) && defined(__alpha_cix__) | ||
88 | /* | ||
89 | * Since we are guaranteed to have set one of the bits, we don't | ||
90 | * have to worry about coming back with a 0x40 out of cttz... | ||
91 | */ | ||
92 | cttz $2, $3 # U0 : | ||
93 | addq $0, $3, $0 # E : All done | ||
94 | nop # E : | ||
95 | ret # L0 : L U L U | ||
96 | #else | ||
97 | /* | ||
98 | * Slow and clunky. It can probably be improved. | ||
99 | * An exercise left for others. | ||
100 | */ | ||
101 | negq $2, $3 # E : | ||
102 | and $2, $3, $2 # E : | ||
103 | and $2, 0x0f, $1 # E : | ||
104 | addq $0, 4, $3 # E : | ||
105 | |||
106 | cmoveq $1, $3, $0 # E : Latency 2, extra map cycle | ||
107 | nop # E : keep with cmov | ||
108 | and $2, 0x33, $1 # E : | ||
109 | addq $0, 2, $3 # E : U L U L : 2 cycle stall on $0 | ||
110 | |||
111 | cmoveq $1, $3, $0 # E : Latency 2, extra map cycle | ||
112 | nop # E : keep with cmov | ||
113 | and $2, 0x55, $1 # E : | ||
114 | addq $0, 1, $3 # E : U L U L : 2 cycle stall on $0 | ||
115 | |||
116 | cmoveq $1, $3, $0 # E : Latency 2, extra map cycle | ||
117 | nop | ||
118 | nop | ||
119 | ret # L0 : L U L U | ||
120 | #endif | ||
121 | |||
122 | # Deal with the case where $18 > 8 bytes remain to be | ||
123 | # searched. $16 may not be aligned. | ||
124 | .align 4 | ||
125 | $first_quad: | ||
126 | andnot $16, 0x7, $0 # E : | ||
127 | insqh $3, $16, $2 # U : $2 = 0000ffffffffffff ($16<0:2> ff) | ||
128 | xor $1, $17, $1 # E : | ||
129 | or $1, $2, $1 # E : U L U L $1 = ====ffffffffffff | ||
130 | |||
131 | cmpbge $31, $1, $2 # E : | ||
132 | bne $2, $found_it # U : | ||
133 | # At least one byte left to process. | ||
134 | ldq $1, 8($0) # L : | ||
135 | subq $5, 1, $18 # E : U L U L | ||
136 | |||
137 | addq $0, 8, $0 # E : | ||
138 | # Make $18 point to last quad to be accessed (the | ||
139 | # last quad may or may not be partial). | ||
140 | andnot $18, 0x7, $18 # E : | ||
141 | cmpult $0, $18, $2 # E : | ||
142 | beq $2, $final # U : U L U L | ||
143 | |||
144 | # At least two quads remain to be accessed. | ||
145 | |||
146 | subq $18, $0, $4 # E : $4 <- nr quads to be processed | ||
147 | and $4, 8, $4 # E : odd number of quads? | ||
148 | bne $4, $odd_quad_count # U : | ||
149 | # At least three quads remain to be accessed | ||
150 | mov $1, $4 # E : L U L U : move prefetched value to correct reg | ||
151 | |||
152 | .align 4 | ||
153 | $unrolled_loop: | ||
154 | ldq $1, 8($0) # L : prefetch $1 | ||
155 | xor $17, $4, $2 # E : | ||
156 | cmpbge $31, $2, $2 # E : | ||
157 | bne $2, $found_it # U : U L U L | ||
158 | |||
159 | addq $0, 8, $0 # E : | ||
160 | nop # E : | ||
161 | nop # E : | ||
162 | nop # E : | ||
163 | |||
164 | $odd_quad_count: | ||
165 | xor $17, $1, $2 # E : | ||
166 | ldq $4, 8($0) # L : prefetch $4 | ||
167 | cmpbge $31, $2, $2 # E : | ||
168 | addq $0, 8, $6 # E : | ||
169 | |||
170 | bne $2, $found_it # U : | ||
171 | cmpult $6, $18, $6 # E : | ||
172 | addq $0, 8, $0 # E : | ||
173 | nop # E : | ||
174 | |||
175 | bne $6, $unrolled_loop # U : | ||
176 | mov $4, $1 # E : move prefetched value into $1 | ||
177 | nop # E : | ||
178 | nop # E : | ||
179 | |||
180 | $final: subq $5, $0, $18 # E : $18 <- number of bytes left to do | ||
181 | nop # E : | ||
182 | nop # E : | ||
183 | bne $18, $last_quad # U : | ||
184 | |||
185 | $not_found: | ||
186 | mov $31, $0 # E : | ||
187 | nop # E : | ||
188 | nop # E : | ||
189 | ret # L0 : | ||
190 | |||
191 | .end memchr | ||
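The core trick above is to replicate the target byte across a quadword, XOR it against each aligned quadword of the buffer, and let cmpbge report whether any byte became zero. A minimal C sketch of that idea follows; the head/tail masking, the (size_t)-1 length cropping and the unrolled loop are omitted, the helper names are introduced here, and the bit trick in has_zero_byte() merely stands in for cmpbge.

        #include <stddef.h>
        #include <stdint.h>
        #include <string.h>

        /* stand-in for "cmpbge $31, v": is any byte of v zero? */
        static int has_zero_byte(uint64_t v)
        {
                return ((v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL) != 0;
        }

        static const void *memchr_sketch(const void *s, int c, size_t n)
        {
                const unsigned char *p = s;
                uint64_t pattern = (unsigned char)c;

                pattern |= pattern << 8;
                pattern |= pattern << 16;
                pattern |= pattern << 32;       /* chchchchchchchch */

                /* whole aligned quadwords (alignment handling omitted) */
                while (n >= 8 && ((uintptr_t)p & 7) == 0) {
                        uint64_t q;

                        memcpy(&q, p, 8);
                        if (has_zero_byte(q ^ pattern))
                                break;          /* the match is in this quad */
                        p += 8;
                        n -= 8;
                }
                /* finish, or locate the byte within the matching quad */
                while (n--) {
                        if (*p == (unsigned char)c)
                                return p;
                        p++;
                }
                return NULL;
        }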
diff --git a/arch/alpha/lib/ev6-memcpy.S b/arch/alpha/lib/ev6-memcpy.S new file mode 100644 index 000000000000..52b37b0f2af5 --- /dev/null +++ b/arch/alpha/lib/ev6-memcpy.S | |||
@@ -0,0 +1,248 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/ev6-memcpy.S | ||
3 | * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com> | ||
4 | * | ||
5 | * Reasonably optimized memcpy() routine for the Alpha 21264 | ||
6 | * | ||
7 | * - memory accessed as aligned quadwords only | ||
8 | * - uses cmpbge to compare 8 bytes in parallel | ||
9 | * | ||
10 | * Much of the information about 21264 scheduling/coding comes from: | ||
11 | * Compiler Writer's Guide for the Alpha 21264 | ||
12 | * abbreviated as 'CWG' in other comments here | ||
13 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
14 | * Scheduling notation: | ||
15 | * E - either cluster | ||
16 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
17 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
18 | * | ||
19 | * Temp usage notes: | ||
20 | * $1,$2, - scratch | ||
21 | */ | ||
22 | |||
23 | .set noreorder | ||
24 | .set noat | ||
25 | |||
26 | .align 4 | ||
27 | .globl memcpy | ||
28 | .ent memcpy | ||
29 | memcpy: | ||
30 | .frame $30,0,$26,0 | ||
31 | .prologue 0 | ||
32 | |||
33 | mov $16, $0 # E : copy dest to return | ||
34 | ble $18, $nomoredata # U : done with the copy? | ||
35 | xor $16, $17, $1 # E : are source and dest alignments the same? | ||
36 | and $1, 7, $1 # E : are they the same mod 8? | ||
37 | |||
38 | bne $1, $misaligned # U : Nope - gotta do this the slow way | ||
39 | /* source and dest are same mod 8 address */ | ||
40 | and $16, 7, $1 # E : Are both 0mod8? | ||
41 | beq $1, $both_0mod8 # U : Yes | ||
42 | nop # E : | ||
43 | |||
44 | /* | ||
45 | * source and dest are same misalignment. move a byte at a time | ||
46 | * until a 0mod8 alignment for both is reached. | ||
47 | * At least one byte more to move | ||
48 | */ | ||
49 | |||
50 | $head_align: | ||
51 | ldbu $1, 0($17) # L : grab a byte | ||
52 | subq $18, 1, $18 # E : count-- | ||
53 | addq $17, 1, $17 # E : src++ | ||
54 | stb $1, 0($16) # L : | ||
55 | addq $16, 1, $16 # E : dest++ | ||
56 | and $16, 7, $1 # E : Are we at 0mod8 yet? | ||
57 | ble $18, $nomoredata # U : done with the copy? | ||
58 | bne $1, $head_align # U : | ||
59 | |||
60 | $both_0mod8: | ||
61 | cmple $18, 127, $1 # E : Can we unroll the loop? | ||
62 | bne $1, $no_unroll # U : | ||
63 | and $16, 63, $1 # E : get mod64 alignment | ||
64 | beq $1, $do_unroll # U : no single quads to fiddle | ||
65 | |||
66 | $single_head_quad: | ||
67 | ldq $1, 0($17) # L : get 8 bytes | ||
68 | subq $18, 8, $18 # E : count -= 8 | ||
69 | addq $17, 8, $17 # E : src += 8 | ||
70 | nop # E : | ||
71 | |||
72 | stq $1, 0($16) # L : store | ||
73 | addq $16, 8, $16 # E : dest += 8 | ||
74 | and $16, 63, $1 # E : get mod64 alignment | ||
75 | bne $1, $single_head_quad # U : still not fully aligned | ||
76 | |||
77 | $do_unroll: | ||
78 | addq $16, 64, $7 # E : Initial (+1 trip) wh64 address | ||
79 | cmple $18, 127, $1 # E : Can we go through the unrolled loop? | ||
80 | bne $1, $tail_quads # U : Nope | ||
81 | nop # E : | ||
82 | |||
83 | $unroll_body: | ||
84 | wh64 ($7) # L1 : memory subsystem hint: 64 bytes at | ||
85 | # ($7) are about to be over-written | ||
86 | ldq $6, 0($17) # L0 : bytes 0..7 | ||
87 | nop # E : | ||
88 | nop # E : | ||
89 | |||
90 | ldq $4, 8($17) # L : bytes 8..15 | ||
91 | ldq $5, 16($17) # L : bytes 16..23 | ||
92 | addq $7, 64, $7 # E : Update next wh64 address | ||
93 | nop # E : | ||
94 | |||
95 | ldq $3, 24($17) # L : bytes 24..31 | ||
96 | addq $16, 64, $1 # E : fallback value for wh64 | ||
97 | nop # E : | ||
98 | nop # E : | ||
99 | |||
100 | addq $17, 32, $17 # E : src += 32 bytes | ||
101 | stq $6, 0($16) # L : bytes 0..7 | ||
102 | nop # E : | ||
103 | nop # E : | ||
104 | |||
105 | stq $4, 8($16) # L : bytes 8..15 | ||
106 | stq $5, 16($16) # L : bytes 16..23 | ||
107 | subq $18, 192, $2 # E : At least two more trips to go? | ||
108 | nop # E : | ||
109 | |||
110 | stq $3, 24($16) # L : bytes 24..31 | ||
111 | addq $16, 32, $16 # E : dest += 32 bytes | ||
112 | nop # E : | ||
113 | nop # E : | ||
114 | |||
115 | ldq $6, 0($17) # L : bytes 0..7 | ||
116 | ldq $4, 8($17) # L : bytes 8..15 | ||
117 | cmovlt $2, $1, $7 # E : Latency 2, extra map slot - Use | ||
118 | # fallback wh64 address if < 2 more trips | ||
119 | nop # E : | ||
120 | |||
121 | ldq $5, 16($17) # L : bytes 16..23 | ||
122 | ldq $3, 24($17) # L : bytes 24..31 | ||
123 | addq $16, 32, $16 # E : dest += 32 | ||
124 | subq $18, 64, $18 # E : count -= 64 | ||
125 | |||
126 | addq $17, 32, $17 # E : src += 32 | ||
127 | stq $6, -32($16) # L : bytes 0..7 | ||
128 | stq $4, -24($16) # L : bytes 8..15 | ||
129 | cmple $18, 63, $1 # E : At least one more trip? | ||
130 | |||
131 | stq $5, -16($16) # L : bytes 16..23 | ||
132 | stq $3, -8($16) # L : bytes 24..31 | ||
133 | nop # E : | ||
134 | beq $1, $unroll_body | ||
135 | |||
136 | $tail_quads: | ||
137 | $no_unroll: | ||
138 | .align 4 | ||
139 | subq $18, 8, $18 # E : At least a quad left? | ||
140 | blt $18, $less_than_8 # U : Nope | ||
141 | nop # E : | ||
142 | nop # E : | ||
143 | |||
144 | $move_a_quad: | ||
145 | ldq $1, 0($17) # L : fetch 8 | ||
146 | subq $18, 8, $18 # E : count -= 8 | ||
147 | addq $17, 8, $17 # E : src += 8 | ||
148 | nop # E : | ||
149 | |||
150 | stq $1, 0($16) # L : store 8 | ||
151 | addq $16, 8, $16 # E : dest += 8 | ||
152 | bge $18, $move_a_quad # U : | ||
153 | nop # E : | ||
154 | |||
155 | $less_than_8: | ||
156 | .align 4 | ||
157 | addq $18, 8, $18 # E : add back for trailing bytes | ||
158 | ble $18, $nomoredata # U : All-done | ||
159 | nop # E : | ||
160 | nop # E : | ||
161 | |||
162 | /* Trailing bytes */ | ||
163 | $tail_bytes: | ||
164 | subq $18, 1, $18 # E : count-- | ||
165 | ldbu $1, 0($17) # L : fetch a byte | ||
166 | addq $17, 1, $17 # E : src++ | ||
167 | nop # E : | ||
168 | |||
169 | stb $1, 0($16) # L : store a byte | ||
170 | addq $16, 1, $16 # E : dest++ | ||
171 | bgt $18, $tail_bytes # U : more to be done? | ||
172 | nop # E : | ||
173 | |||
174 | /* branching to exit takes 3 extra cycles, so replicate exit here */ | ||
175 | ret $31, ($26), 1 # L0 : | ||
176 | nop # E : | ||
177 | nop # E : | ||
178 | nop # E : | ||
179 | |||
180 | $misaligned: | ||
181 | mov $0, $4 # E : dest temp | ||
182 | and $0, 7, $1 # E : dest alignment mod8 | ||
183 | beq $1, $dest_0mod8 # U : life doesn't totally suck | ||
184 | nop | ||
185 | |||
186 | $aligndest: | ||
187 | ble $18, $nomoredata # U : | ||
188 | ldbu $1, 0($17) # L : fetch a byte | ||
189 | subq $18, 1, $18 # E : count-- | ||
190 | addq $17, 1, $17 # E : src++ | ||
191 | |||
192 | stb $1, 0($4) # L : store it | ||
193 | addq $4, 1, $4 # E : dest++ | ||
194 | and $4, 7, $1 # E : dest 0mod8 yet? | ||
195 | bne $1, $aligndest # U : go until we are aligned. | ||
196 | |||
197 | /* Source has unknown alignment, but dest is known to be 0mod8 */ | ||
198 | $dest_0mod8: | ||
199 | subq $18, 8, $18 # E : At least a quad left? | ||
200 | blt $18, $misalign_tail # U : Nope | ||
201 | ldq_u $3, 0($17) # L : seed (rotating load) of 8 bytes | ||
202 | nop # E : | ||
203 | |||
204 | $mis_quad: | ||
205 | ldq_u $16, 8($17) # L : Fetch next 8 | ||
206 | extql $3, $17, $3 # U : masking | ||
207 | extqh $16, $17, $1 # U : masking | ||
208 | bis $3, $1, $1 # E : merged bytes to store | ||
209 | |||
210 | subq $18, 8, $18 # E : count -= 8 | ||
211 | addq $17, 8, $17 # E : src += 8 | ||
212 | stq $1, 0($4) # L : store 8 (aligned) | ||
213 | mov $16, $3 # E : "rotate" source data | ||
214 | |||
215 | addq $4, 8, $4 # E : dest += 8 | ||
216 | bge $18, $mis_quad # U : More quads to move | ||
217 | nop | ||
218 | nop | ||
219 | |||
220 | $misalign_tail: | ||
221 | addq $18, 8, $18 # E : account for tail stuff | ||
222 | ble $18, $nomoredata # U : | ||
223 | nop | ||
224 | nop | ||
225 | |||
226 | $misalign_byte: | ||
227 | ldbu $1, 0($17) # L : fetch 1 | ||
228 | subq $18, 1, $18 # E : count-- | ||
229 | addq $17, 1, $17 # E : src++ | ||
230 | nop # E : | ||
231 | |||
232 | stb $1, 0($4) # L : store | ||
233 | addq $4, 1, $4 # E : dest++ | ||
234 | bgt $18, $misalign_byte # U : more to go? | ||
235 | nop | ||
236 | |||
237 | |||
238 | $nomoredata: | ||
239 | ret $31, ($26), 1 # L0 : | ||
240 | nop # E : | ||
241 | nop # E : | ||
242 | nop # E : | ||
243 | |||
244 | .end memcpy | ||
245 | |||
246 | /* For backwards module compatibility. */ | ||
247 | __memcpy = memcpy | ||
248 | .globl __memcpy | ||
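The distinctive part of the routine above is the $mis_quad path: when the destination is 0mod8 but the source is not, each stored quadword is merged from two successive source quadwords, shifted into place, which is the job extql/extqh do. A hypothetical little-endian C sketch of just that loop is given below; like ldq_u, the aligned reads may touch a few bytes outside the source range, and the shift is nonzero on this path because the source is known to be misaligned.

        #include <stddef.h>
        #include <stdint.h>

        static void misaligned_quads_sketch(uint64_t *dst, const unsigned char *src,
                                            size_t nquads)
        {
                unsigned shift = ((uintptr_t)src & 7) * 8;      /* bits, 8..56 */
                const uint64_t *q = (const uint64_t *)((uintptr_t)src & ~(uintptr_t)7);
                uint64_t lo = *q++;              /* seed load, like ldq_u 0($17) */

                while (nquads--) {
                        uint64_t hi = *q++;      /* next quad, like ldq_u 8($17) */
                        /* extql/extqh equivalent on a little-endian machine */
                        *dst++ = (lo >> shift) | (hi << (64 - shift));
                        lo = hi;                 /* "rotate" the source data */
                }
        }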
diff --git a/arch/alpha/lib/ev6-memset.S b/arch/alpha/lib/ev6-memset.S new file mode 100644 index 000000000000..d8b94e1c7fca --- /dev/null +++ b/arch/alpha/lib/ev6-memset.S | |||
@@ -0,0 +1,597 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/ev6-memset.S | ||
3 | * | ||
4 | * This is an efficient (and relatively small) implementation of the C library | ||
5 | * "memset()" function for the 21264 implementation of Alpha. | ||
6 | * | ||
7 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
8 | * | ||
9 | * Much of the information about 21264 scheduling/coding comes from: | ||
10 | * Compiler Writer's Guide for the Alpha 21264 | ||
11 | * abbreviated as 'CWG' in other comments here | ||
12 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
13 | * Scheduling notation: | ||
14 | * E - either cluster | ||
15 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
16 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
17 | * The algorithm for the leading and trailing quadwords remains the same; | ||
18 | * however, the loop has been unrolled to enable better memory throughput, | ||
19 | * and the code has been replicated for each of the entry points: __memset | ||
20 | * and __memsetw to permit better scheduling to eliminate the stalling | ||
21 | * encountered during the mask replication. | ||
22 | * A future enhancement might be to put in a byte store loop for really | ||
23 | * small (say < 32 bytes) memset()s. Whether or not that change would be | ||
24 | * a win in the kernel would depend upon the contextual usage. | ||
25 | * WARNING: Maintaining this is going to be more work than the original version, | ||
26 | * as fixes will need to be made in multiple places. The performance gain | ||
27 | * is worth it. | ||
28 | */ | ||
29 | |||
30 | .set noat | ||
31 | .set noreorder | ||
32 | .text | ||
33 | .globl __memset | ||
34 | .globl __memsetw | ||
35 | .globl __constant_c_memset | ||
36 | .globl memset | ||
37 | |||
38 | .ent __memset | ||
39 | .align 5 | ||
40 | __memset: | ||
41 | .frame $30,0,$26,0 | ||
42 | .prologue 0 | ||
43 | |||
44 | /* | ||
45 | * Serious stalling happens. The only way to mitigate this is to | ||
46 | * undertake a major re-write to interleave the constant materialization | ||
47 | * with other parts of the fall-through code. This is important, even | ||
48 | * though it makes maintenance tougher. | ||
49 | * Do this later. | ||
50 | */ | ||
51 | and $17,255,$1 # E : 00000000000000ch | ||
52 | insbl $17,1,$2 # U : 000000000000ch00 | ||
53 | bis $16,$16,$0 # E : return value | ||
54 | ble $18,end_b # U : zero length requested? | ||
55 | |||
56 | addq $18,$16,$6 # E : max address to write to | ||
57 | bis $1,$2,$17 # E : 000000000000chch | ||
58 | insbl $1,2,$3 # U : 0000000000ch0000 | ||
59 | insbl $1,3,$4 # U : 00000000ch000000 | ||
60 | |||
61 | or $3,$4,$3 # E : 00000000chch0000 | ||
62 | inswl $17,4,$5 # U : 0000chch00000000 | ||
63 | xor $16,$6,$1 # E : will complete write be within one quadword? | ||
64 | inswl $17,6,$2 # U : chch000000000000 | ||
65 | |||
66 | or $17,$3,$17 # E : 00000000chchchch | ||
67 | or $2,$5,$2 # E : chchchch00000000 | ||
68 | bic $1,7,$1 # E : fit within a single quadword? | ||
69 | and $16,7,$3 # E : Target addr misalignment | ||
70 | |||
71 | or $17,$2,$17 # E : chchchchchchchch | ||
72 | beq $1,within_quad_b # U : | ||
73 | nop # E : | ||
74 | beq $3,aligned_b # U : target is 0mod8 | ||
75 | |||
76 | /* | ||
77 | * Target address is misaligned, and won't fit within a quadword | ||
78 | */ | ||
79 | ldq_u $4,0($16) # L : Fetch first partial | ||
80 | bis $16,$16,$5 # E : Save the address | ||
81 | insql $17,$16,$2 # U : Insert new bytes | ||
82 | subq $3,8,$3 # E : Invert (for addressing uses) | ||
83 | |||
84 | addq $18,$3,$18 # E : $18 is new count ($3 is negative) | ||
85 | mskql $4,$16,$4 # U : clear relevant parts of the quad | ||
86 | subq $16,$3,$16 # E : $16 is new aligned destination | ||
87 | bis $2,$4,$1 # E : Final bytes | ||
88 | |||
89 | nop | ||
90 | stq_u $1,0($5) # L : Store result | ||
91 | nop | ||
92 | nop | ||
93 | |||
94 | .align 4 | ||
95 | aligned_b: | ||
96 | /* | ||
97 | * We are now guaranteed to be quad aligned, with at least | ||
98 | * one partial quad to write. | ||
99 | */ | ||
100 | |||
101 | sra $18,3,$3 # U : Number of remaining quads to write | ||
102 | and $18,7,$18 # E : Number of trailing bytes to write | ||
103 | bis $16,$16,$5 # E : Save dest address | ||
104 | beq $3,no_quad_b # U : tail stuff only | ||
105 | |||
106 | /* | ||
107 | * it's worth the effort to unroll this and use wh64 if possible | ||
108 | * Lifted a bunch of code from clear_user.S | ||
109 | * At this point, entry values are: | ||
110 | * $16 Current destination address | ||
111 | * $5 A copy of $16 | ||
112 | * $6 The max quadword address to write to | ||
113 | * $18 Number trailer bytes | ||
114 | * $3 Number quads to write | ||
115 | */ | ||
116 | |||
117 | and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop) | ||
118 | subq $3, 16, $4 # E : Only try to unroll if > 128 bytes | ||
119 | subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64) | ||
120 | blt $4, loop_b # U : | ||
121 | |||
122 | /* | ||
123 | * We know we've got at least 16 quads, minimum of one trip | ||
124 | * through unrolled loop. Do a quad at a time to get us 0mod64 | ||
125 | * aligned. | ||
126 | */ | ||
127 | |||
128 | nop # E : | ||
129 | nop # E : | ||
130 | nop # E : | ||
131 | beq $1, $bigalign_b # U : | ||
132 | |||
133 | $alignmod64_b: | ||
134 | stq $17, 0($5) # L : | ||
135 | subq $3, 1, $3 # E : For consistency later | ||
136 | addq $1, 8, $1 # E : Increment towards zero for alignment | ||
137 | addq $5, 8, $4 # E : Initial wh64 address (filler instruction) | ||
138 | |||
139 | nop | ||
140 | nop | ||
141 | addq $5, 8, $5 # E : Inc address | ||
142 | blt $1, $alignmod64_b # U : | ||
143 | |||
144 | $bigalign_b: | ||
145 | /* | ||
146 | * $3 - number quads left to go | ||
147 | * $5 - target address (aligned 0mod64) | ||
148 | * $17 - mask of stuff to store | ||
149 | * Scratch registers available: $7, $2, $4, $1 | ||
150 | * we know that we'll be taking a minimum of one trip through | ||
151 | * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle | ||
152 | * Assumes the wh64 needs to be for 2 trips through the loop in the future | ||
153 | * The wh64 is issued on for the starting destination address for trip +2 | ||
154 | * through the loop, and if there are less than two trips left, the target | ||
155 | * address will be for the current trip. | ||
156 | */ | ||
157 | |||
158 | $do_wh64_b: | ||
159 | wh64 ($4) # L1 : memory subsystem write hint | ||
160 | subq $3, 24, $2 # E : For determining future wh64 addresses | ||
161 | stq $17, 0($5) # L : | ||
162 | nop # E : | ||
163 | |||
164 | addq $5, 128, $4 # E : speculative target of next wh64 | ||
165 | stq $17, 8($5) # L : | ||
166 | stq $17, 16($5) # L : | ||
167 | addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr) | ||
168 | |||
169 | stq $17, 24($5) # L : | ||
170 | stq $17, 32($5) # L : | ||
171 | cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle | ||
172 | nop | ||
173 | |||
174 | stq $17, 40($5) # L : | ||
175 | stq $17, 48($5) # L : | ||
176 | subq $3, 16, $2 # E : Repeat the loop at least once more? | ||
177 | nop | ||
178 | |||
179 | stq $17, 56($5) # L : | ||
180 | addq $5, 64, $5 # E : | ||
181 | subq $3, 8, $3 # E : | ||
182 | bge $2, $do_wh64_b # U : | ||
183 | |||
184 | nop | ||
185 | nop | ||
186 | nop | ||
187 | beq $3, no_quad_b # U : Might have finished already | ||
188 | |||
189 | .align 4 | ||
190 | /* | ||
191 | * Simple loop for trailing quadwords, or for small amounts | ||
192 | * of data (where we can't use an unrolled loop and wh64) | ||
193 | */ | ||
194 | loop_b: | ||
195 | stq $17,0($5) # L : | ||
196 | subq $3,1,$3 # E : Decrement number quads left | ||
197 | addq $5,8,$5 # E : Inc address | ||
198 | bne $3,loop_b # U : more? | ||
199 | |||
200 | no_quad_b: | ||
201 | /* | ||
202 | * Write 0..7 trailing bytes. | ||
203 | */ | ||
204 | nop # E : | ||
205 | beq $18,end_b # U : All done? | ||
206 | ldq $7,0($5) # L : | ||
207 | mskqh $7,$6,$2 # U : Mask final quad | ||
208 | |||
209 | insqh $17,$6,$4 # U : New bits | ||
210 | bis $2,$4,$1 # E : Put it all together | ||
211 | stq $1,0($5) # L : And back to memory | ||
212 | ret $31,($26),1 # L0 : | ||
213 | |||
214 | within_quad_b: | ||
215 | ldq_u $1,0($16) # L : | ||
216 | insql $17,$16,$2 # U : New bits | ||
217 | mskql $1,$16,$4 # U : Clear old | ||
218 | bis $2,$4,$2 # E : New result | ||
219 | |||
220 | mskql $2,$6,$4 # U : | ||
221 | mskqh $1,$6,$2 # U : | ||
222 | bis $2,$4,$1 # E : | ||
223 | stq_u $1,0($16) # L : | ||
224 | |||
225 | end_b: | ||
226 | nop | ||
227 | nop | ||
228 | nop | ||
229 | ret $31,($26),1 # L0 : | ||
230 | .end __memset | ||
231 | |||
232 | /* | ||
233 | * This is the original body of code, prior to replication and | ||
234 | * rescheduling. Leave it here, as there may be calls to this | ||
235 | * entry point. | ||
236 | */ | ||
237 | .align 4 | ||
238 | .ent __constant_c_memset | ||
239 | __constant_c_memset: | ||
240 | .frame $30,0,$26,0 | ||
241 | .prologue 0 | ||
242 | |||
243 | addq $18,$16,$6 # E : max address to write to | ||
244 | bis $16,$16,$0 # E : return value | ||
245 | xor $16,$6,$1 # E : will complete write be within one quadword? | ||
246 | ble $18,end # U : zero length requested? | ||
247 | |||
248 | bic $1,7,$1 # E : fit within a single quadword | ||
249 | beq $1,within_one_quad # U : | ||
250 | and $16,7,$3 # E : Target addr misalignment | ||
251 | beq $3,aligned # U : target is 0mod8 | ||
252 | |||
253 | /* | ||
254 | * Target address is misaligned, and won't fit within a quadword | ||
255 | */ | ||
256 | ldq_u $4,0($16) # L : Fetch first partial | ||
257 | bis $16,$16,$5 # E : Save the address | ||
258 | insql $17,$16,$2 # U : Insert new bytes | ||
259 | subq $3,8,$3 # E : Invert (for addressing uses) | ||
260 | |||
261 | addq $18,$3,$18 # E : $18 is new count ($3 is negative) | ||
262 | mskql $4,$16,$4 # U : clear relevant parts of the quad | ||
263 | subq $16,$3,$16 # E : $16 is new aligned destination | ||
264 | bis $2,$4,$1 # E : Final bytes | ||
265 | |||
266 | nop | ||
267 | stq_u $1,0($5) # L : Store result | ||
268 | nop | ||
269 | nop | ||
270 | |||
271 | .align 4 | ||
272 | aligned: | ||
273 | /* | ||
274 | * We are now guaranteed to be quad aligned, with at least | ||
275 | * one partial quad to write. | ||
276 | */ | ||
277 | |||
278 | sra $18,3,$3 # U : Number of remaining quads to write | ||
279 | and $18,7,$18 # E : Number of trailing bytes to write | ||
280 | bis $16,$16,$5 # E : Save dest address | ||
281 | beq $3,no_quad # U : tail stuff only | ||
282 | |||
283 | /* | ||
284 | * it's worth the effort to unroll this and use wh64 if possible | ||
285 | * Lifted a bunch of code from clear_user.S | ||
286 | * At this point, entry values are: | ||
287 | * $16 Current destination address | ||
288 | * $5 A copy of $16 | ||
289 | * $6 The max quadword address to write to | ||
290 | * $18 Number trailer bytes | ||
291 | * $3 Number quads to write | ||
292 | */ | ||
293 | |||
294 | and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop) | ||
295 | subq $3, 16, $4 # E : Only try to unroll if > 128 bytes | ||
296 | subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64) | ||
297 | blt $4, loop # U : | ||
298 | |||
299 | /* | ||
300 | * We know we've got at least 16 quads, minimum of one trip | ||
301 | * through unrolled loop. Do a quad at a time to get us 0mod64 | ||
302 | * aligned. | ||
303 | */ | ||
304 | |||
305 | nop # E : | ||
306 | nop # E : | ||
307 | nop # E : | ||
308 | beq $1, $bigalign # U : | ||
309 | |||
310 | $alignmod64: | ||
311 | stq $17, 0($5) # L : | ||
312 | subq $3, 1, $3 # E : For consistency later | ||
313 | addq $1, 8, $1 # E : Increment towards zero for alignment | ||
314 | addq $5, 8, $4 # E : Initial wh64 address (filler instruction) | ||
315 | |||
316 | nop | ||
317 | nop | ||
318 | addq $5, 8, $5 # E : Inc address | ||
319 | blt $1, $alignmod64 # U : | ||
320 | |||
321 | $bigalign: | ||
322 | /* | ||
323 | * $3 - number quads left to go | ||
324 | * $5 - target address (aligned 0mod64) | ||
325 | * $17 - mask of stuff to store | ||
326 | * Scratch registers available: $7, $2, $4, $1 | ||
327 | * We know that we'll be taking a minimum of one trip through the loop. | ||
328 | * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle. | ||
329 | * Assumes the wh64 should cover 2 trips through the loop in the future. | ||
330 | * The wh64 is issued for the starting destination address of trip +2 | ||
331 | * through the loop; if there are fewer than two trips left, the target | ||
332 | * address will be for the current trip. | ||
333 | */ | ||
334 | |||
335 | $do_wh64: | ||
336 | wh64 ($4) # L1 : memory subsystem write hint | ||
337 | subq $3, 24, $2 # E : For determining future wh64 addresses | ||
338 | stq $17, 0($5) # L : | ||
339 | nop # E : | ||
340 | |||
341 | addq $5, 128, $4 # E : speculative target of next wh64 | ||
342 | stq $17, 8($5) # L : | ||
343 | stq $17, 16($5) # L : | ||
344 | addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr) | ||
345 | |||
346 | stq $17, 24($5) # L : | ||
347 | stq $17, 32($5) # L : | ||
348 | cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle | ||
349 | nop | ||
350 | |||
351 | stq $17, 40($5) # L : | ||
352 | stq $17, 48($5) # L : | ||
353 | subq $3, 16, $2 # E : Repeat the loop at least once more? | ||
354 | nop | ||
355 | |||
356 | stq $17, 56($5) # L : | ||
357 | addq $5, 64, $5 # E : | ||
358 | subq $3, 8, $3 # E : | ||
359 | bge $2, $do_wh64 # U : | ||
360 | |||
361 | nop | ||
362 | nop | ||
363 | nop | ||
364 | beq $3, no_quad # U : Might have finished already | ||
365 | |||
366 | .align 4 | ||
367 | /* | ||
368 | * Simple loop for trailing quadwords, or for small amounts | ||
369 | * of data (where we can't use an unrolled loop and wh64) | ||
370 | */ | ||
371 | loop: | ||
372 | stq $17,0($5) # L : | ||
373 | subq $3,1,$3 # E : Decrement number quads left | ||
374 | addq $5,8,$5 # E : Inc address | ||
375 | bne $3,loop # U : more? | ||
376 | |||
377 | no_quad: | ||
378 | /* | ||
379 | * Write 0..7 trailing bytes. | ||
380 | */ | ||
381 | nop # E : | ||
382 | beq $18,end # U : All done? | ||
383 | ldq $7,0($5) # L : | ||
384 | mskqh $7,$6,$2 # U : Mask final quad | ||
385 | |||
386 | insqh $17,$6,$4 # U : New bits | ||
387 | bis $2,$4,$1 # E : Put it all together | ||
388 | stq $1,0($5) # L : And back to memory | ||
389 | ret $31,($26),1 # L0 : | ||
390 | |||
391 | within_one_quad: | ||
392 | ldq_u $1,0($16) # L : | ||
393 | insql $17,$16,$2 # U : New bits | ||
394 | mskql $1,$16,$4 # U : Clear old | ||
395 | bis $2,$4,$2 # E : New result | ||
396 | |||
397 | mskql $2,$6,$4 # U : | ||
398 | mskqh $1,$6,$2 # U : | ||
399 | bis $2,$4,$1 # E : | ||
400 | stq_u $1,0($16) # L : | ||
401 | |||
402 | end: | ||
403 | nop | ||
404 | nop | ||
405 | nop | ||
406 | ret $31,($26),1 # L0 : | ||
407 | .end __constant_c_memset | ||
408 | |||
409 | /* | ||
410 | * This is a replica of the __constant_c_memset code, rescheduled | ||
411 | * to mask stalls. Note that the entry point names also had to change. | ||
412 | */ | ||
413 | .align 5 | ||
414 | .ent __memsetw | ||
415 | |||
416 | __memsetw: | ||
417 | .frame $30,0,$26,0 | ||
418 | .prologue 0 | ||
419 | |||
420 | inswl $17,0,$5 # U : 000000000000c1c2 | ||
421 | inswl $17,2,$2 # U : 00000000c1c20000 | ||
422 | bis $16,$16,$0 # E : return value | ||
423 | addq $18,$16,$6 # E : max address to write to | ||
424 | |||
425 | ble $18, end_w # U : zero length requested? | ||
426 | inswl $17,4,$3 # U : 0000c1c200000000 | ||
427 | inswl $17,6,$4 # U : c1c2000000000000 | ||
428 | xor $16,$6,$1 # E : will complete write be within one quadword? | ||
429 | |||
430 | or $2,$5,$2 # E : 00000000c1c2c1c2 | ||
431 | or $3,$4,$17 # E : c1c2c1c200000000 | ||
432 | bic $1,7,$1 # E : fit within a single quadword | ||
433 | and $16,7,$3 # E : Target addr misalignment | ||
434 | |||
435 | or $17,$2,$17 # E : c1c2c1c2c1c2c1c2 | ||
436 | beq $1,within_quad_w # U : | ||
437 | nop | ||
438 | beq $3,aligned_w # U : target is 0mod8 | ||
439 | |||
440 | /* | ||
441 | * Target address is misaligned, and won't fit within a quadword | ||
442 | */ | ||
443 | ldq_u $4,0($16) # L : Fetch first partial | ||
444 | bis $16,$16,$5 # E : Save the address | ||
445 | insql $17,$16,$2 # U : Insert new bytes | ||
446 | subq $3,8,$3 # E : Invert (for addressing uses) | ||
447 | |||
448 | addq $18,$3,$18 # E : $18 is new count ($3 is negative) | ||
449 | mskql $4,$16,$4 # U : clear relevant parts of the quad | ||
450 | subq $16,$3,$16 # E : $16 is new aligned destination | ||
451 | bis $2,$4,$1 # E : Final bytes | ||
452 | |||
453 | nop | ||
454 | stq_u $1,0($5) # L : Store result | ||
455 | nop | ||
456 | nop | ||
457 | |||
458 | .align 4 | ||
459 | aligned_w: | ||
460 | /* | ||
461 | * We are now guaranteed to be quad aligned, with at least | ||
462 | * one partial quad to write. | ||
463 | */ | ||
464 | |||
465 | sra $18,3,$3 # U : Number of remaining quads to write | ||
466 | and $18,7,$18 # E : Number of trailing bytes to write | ||
467 | bis $16,$16,$5 # E : Save dest address | ||
468 | beq $3,no_quad_w # U : tail stuff only | ||
469 | |||
470 | /* | ||
471 | * it's worth the effort to unroll this and use wh64 if possible | ||
472 | * Lifted a bunch of code from clear_user.S | ||
473 | * At this point, entry values are: | ||
474 | * $16 Current destination address | ||
475 | * $5 A copy of $16 | ||
476 | * $6 The max quadword address to write to | ||
477 | * $18 Number trailer bytes | ||
478 | * $3 Number quads to write | ||
479 | */ | ||
480 | |||
481 | and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop) | ||
482 | subq $3, 16, $4 # E : Only try to unroll if > 128 bytes | ||
483 | subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64) | ||
484 | blt $4, loop_w # U : | ||
485 | |||
486 | /* | ||
487 | * We know we've got at least 16 quads, minimum of one trip | ||
488 | * through unrolled loop. Do a quad at a time to get us 0mod64 | ||
489 | * aligned. | ||
490 | */ | ||
491 | |||
492 | nop # E : | ||
493 | nop # E : | ||
494 | nop # E : | ||
495 | beq $1, $bigalign_w # U : | ||
496 | |||
497 | $alignmod64_w: | ||
498 | stq $17, 0($5) # L : | ||
499 | subq $3, 1, $3 # E : For consistency later | ||
500 | addq $1, 8, $1 # E : Increment towards zero for alignment | ||
501 | addq $5, 8, $4 # E : Initial wh64 address (filler instruction) | ||
502 | |||
503 | nop | ||
504 | nop | ||
505 | addq $5, 8, $5 # E : Inc address | ||
506 | blt $1, $alignmod64_w # U : | ||
507 | |||
508 | $bigalign_w: | ||
509 | /* | ||
510 | * $3 - number quads left to go | ||
511 | * $5 - target address (aligned 0mod64) | ||
512 | * $17 - mask of stuff to store | ||
513 | * Scratch registers available: $7, $2, $4, $1 | ||
514 | * We know that we'll be taking a minimum of one trip through the loop. | ||
515 | * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle. | ||
516 | * Assumes the wh64 should cover 2 trips through the loop in the future. | ||
517 | * The wh64 is issued for the starting destination address of trip +2 | ||
518 | * through the loop; if there are fewer than two trips left, the target | ||
519 | * address will be for the current trip. | ||
520 | */ | ||
521 | |||
522 | $do_wh64_w: | ||
523 | wh64 ($4) # L1 : memory subsystem write hint | ||
524 | subq $3, 24, $2 # E : For determining future wh64 addresses | ||
525 | stq $17, 0($5) # L : | ||
526 | nop # E : | ||
527 | |||
528 | addq $5, 128, $4 # E : speculative target of next wh64 | ||
529 | stq $17, 8($5) # L : | ||
530 | stq $17, 16($5) # L : | ||
531 | addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr) | ||
532 | |||
533 | stq $17, 24($5) # L : | ||
534 | stq $17, 32($5) # L : | ||
535 | cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle | ||
536 | nop | ||
537 | |||
538 | stq $17, 40($5) # L : | ||
539 | stq $17, 48($5) # L : | ||
540 | subq $3, 16, $2 # E : Repeat the loop at least once more? | ||
541 | nop | ||
542 | |||
543 | stq $17, 56($5) # L : | ||
544 | addq $5, 64, $5 # E : | ||
545 | subq $3, 8, $3 # E : | ||
546 | bge $2, $do_wh64_w # U : | ||
547 | |||
548 | nop | ||
549 | nop | ||
550 | nop | ||
551 | beq $3, no_quad_w # U : Might have finished already | ||
552 | |||
553 | .align 4 | ||
554 | /* | ||
555 | * Simple loop for trailing quadwords, or for small amounts | ||
556 | * of data (where we can't use an unrolled loop and wh64) | ||
557 | */ | ||
558 | loop_w: | ||
559 | stq $17,0($5) # L : | ||
560 | subq $3,1,$3 # E : Decrement number quads left | ||
561 | addq $5,8,$5 # E : Inc address | ||
562 | bne $3,loop_w # U : more? | ||
563 | |||
564 | no_quad_w: | ||
565 | /* | ||
566 | * Write 0..7 trailing bytes. | ||
567 | */ | ||
568 | nop # E : | ||
569 | beq $18,end_w # U : All done? | ||
570 | ldq $7,0($5) # L : | ||
571 | mskqh $7,$6,$2 # U : Mask final quad | ||
572 | |||
573 | insqh $17,$6,$4 # U : New bits | ||
574 | bis $2,$4,$1 # E : Put it all together | ||
575 | stq $1,0($5) # L : And back to memory | ||
576 | ret $31,($26),1 # L0 : | ||
577 | |||
578 | within_quad_w: | ||
579 | ldq_u $1,0($16) # L : | ||
580 | insql $17,$16,$2 # U : New bits | ||
581 | mskql $1,$16,$4 # U : Clear old | ||
582 | bis $2,$4,$2 # E : New result | ||
583 | |||
584 | mskql $2,$6,$4 # U : | ||
585 | mskqh $1,$6,$2 # U : | ||
586 | bis $2,$4,$1 # E : | ||
587 | stq_u $1,0($16) # L : | ||
588 | |||
589 | end_w: | ||
590 | nop | ||
591 | nop | ||
592 | nop | ||
593 | ret $31,($26),1 # L0 : | ||
594 | |||
595 | .end __memsetw | ||
596 | |||
597 | memset = __memset | ||
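The inswl/or sequence at the top of __memsetw builds its quadword fill pattern by replicating the 16-bit argument into all four halfwords (__memset does the same with a single byte). Below is a minimal C sketch of just that replication step, illustrative only and not part of the kernel source: the helper name is invented, and the insql/insqh and mskql/mskqh head/tail handling done by the assembly is omitted.

        #include <stdint.h>

        /* Illustrative sketch, not kernel code: build the c1c2c1c2c1c2c1c2
         * quadword that __memsetw stores, mirroring the inswl/or sequence. */
        static uint64_t replicate16(uint16_t c)
        {
                uint64_t q;

                q  = (uint64_t)c;         /* 000000000000c1c2 */
                q |= (uint64_t)c << 16;   /* 00000000c1c2c1c2 */
                q |= (uint64_t)c << 32;   /* 0000c1c2c1c2c1c2 */
                q |= (uint64_t)c << 48;   /* c1c2c1c2c1c2c1c2 */
                return q;
        }

The unrolled $do_wh64 loops then store this quadword eight times per iteration, issuing wh64 for the cache line two iterations ahead whenever at least two trips remain.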
diff --git a/arch/alpha/lib/ev6-strncpy_from_user.S b/arch/alpha/lib/ev6-strncpy_from_user.S new file mode 100644 index 000000000000..d2e28178cacc --- /dev/null +++ b/arch/alpha/lib/ev6-strncpy_from_user.S | |||
@@ -0,0 +1,424 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/ev6-strncpy_from_user.S | ||
3 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
4 | * | ||
5 | * Just like strncpy except in the return value: | ||
6 | * | ||
7 | * -EFAULT if an exception occurs before the terminator is copied. | ||
8 | * N if the buffer is filled. | ||
9 | * | ||
10 | * Otherwise the length of the string is returned. | ||
11 | * | ||
12 | * Much of the information about 21264 scheduling/coding comes from: | ||
13 | * Compiler Writer's Guide for the Alpha 21264 | ||
14 | * abbreviated as 'CWG' in other comments here | ||
15 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
16 | * Scheduling notation: | ||
17 | * E - either cluster | ||
18 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
19 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
20 | * A bunch of instructions got moved and temp registers were changed | ||
21 | * to aid in scheduling. Control flow was also re-arranged to eliminate | ||
22 | * branches, and to provide longer code sequences to enable better scheduling. | ||
23 | * A total rewrite (using byte loads/stores for the start & tail sequences) | ||
24 | * is desirable, but would amount to starting over from scratch. | ||
25 | * Save that for the future. | ||
26 | */ | ||
27 | |||
28 | |||
29 | #include <asm/errno.h> | ||
30 | #include <asm/regdef.h> | ||
31 | |||
32 | |||
33 | /* Allow an exception for an insn; exit if we get one. */ | ||
34 | #define EX(x,y...) \ | ||
35 | 99: x,##y; \ | ||
36 | .section __ex_table,"a"; \ | ||
37 | .long 99b - .; \ | ||
38 | lda $31, $exception-99b($0); \ | ||
39 | .previous | ||
40 | |||
41 | |||
42 | .set noat | ||
43 | .set noreorder | ||
44 | .text | ||
45 | |||
46 | .globl __strncpy_from_user | ||
47 | .ent __strncpy_from_user | ||
48 | .frame $30, 0, $26 | ||
49 | .prologue 0 | ||
50 | |||
51 | .align 4 | ||
52 | __strncpy_from_user: | ||
53 | and a0, 7, t3 # E : find dest misalignment | ||
54 | beq a2, $zerolength # U : | ||
55 | |||
56 | /* Are source and destination co-aligned? */ | ||
57 | mov a0, v0 # E : save the string start | ||
58 | xor a0, a1, t4 # E : | ||
59 | EX( ldq_u t1, 0(a1) ) # L : Latency=3 load first quadword | ||
60 | ldq_u t0, 0(a0) # L : load first (partial) aligned dest quadword | ||
61 | |||
62 | addq a2, t3, a2 # E : bias count by dest misalignment | ||
63 | subq a2, 1, a3 # E : | ||
64 | addq zero, 1, t10 # E : | ||
65 | and t4, 7, t4 # E : misalignment between the two | ||
66 | |||
67 | and a3, 7, t6 # E : number of tail bytes | ||
68 | sll t10, t6, t10 # E : t10 = bitmask of last count byte | ||
69 | bne t4, $unaligned # U : | ||
70 | lda t2, -1 # E : build a mask against false zero | ||
71 | |||
72 | /* | ||
73 | * We are co-aligned; take care of a partial first word. | ||
74 | * On entry to this basic block: | ||
75 | * t0 == the first destination word for masking back in | ||
76 | * t1 == the first source word. | ||
77 | */ | ||
78 | |||
79 | srl a3, 3, a2 # E : a2 = loop counter = (count - 1)/8 | ||
80 | addq a1, 8, a1 # E : | ||
81 | mskqh t2, a1, t2 # U : detection in the src word | ||
82 | nop | ||
83 | |||
84 | /* Create the 1st output word and detect 0's in the 1st input word. */ | ||
85 | mskqh t1, a1, t3 # U : | ||
86 | mskql t0, a1, t0 # U : assemble the first output word | ||
87 | ornot t1, t2, t2 # E : | ||
88 | nop | ||
89 | |||
90 | cmpbge zero, t2, t8 # E : bits set iff null found | ||
91 | or t0, t3, t0 # E : | ||
92 | beq a2, $a_eoc # U : | ||
93 | bne t8, $a_eos # U : 2nd branch in a quad. Bad. | ||
94 | |||
95 | /* On entry to this basic block: | ||
96 | * t0 == a source quad not containing a null. | ||
97 | * a0 - current aligned destination address | ||
98 | * a1 - current aligned source address | ||
99 | * a2 - count of quadwords to move. | ||
100 | * NOTE: Loop improvement - unrolling this is going to be | ||
101 | * a huge win, since we're going to stall otherwise. | ||
102 | * Fix this later. For _really_ large copies, look | ||
103 | * at using wh64 on a look-ahead basis. See the code | ||
104 | * in clear_user.S and copy_user.S. | ||
105 | * Presumably such look-ahead is safe, since (a0) and (a1) do not overlap (by C definition). | ||
106 | * Lots of nops here: | ||
107 | * - Separate loads from stores | ||
108 | * - Keep it to 1 branch/quadpack so the branch predictor | ||
109 | * can train. | ||
110 | */ | ||
111 | $a_loop: | ||
112 | stq_u t0, 0(a0) # L : | ||
113 | addq a0, 8, a0 # E : | ||
114 | nop | ||
115 | subq a2, 1, a2 # E : | ||
116 | |||
117 | EX( ldq_u t0, 0(a1) ) # L : | ||
118 | addq a1, 8, a1 # E : | ||
119 | cmpbge zero, t0, t8 # E : Stall 2 cycles on t0 | ||
120 | beq a2, $a_eoc # U : | ||
121 | |||
122 | beq t8, $a_loop # U : | ||
123 | nop | ||
124 | nop | ||
125 | nop | ||
126 | |||
127 | /* Take care of the final (partial) word store. At this point | ||
128 | * the end-of-count bit is set in t8 iff it applies. | ||
129 | * | ||
130 | * On entry to this basic block we have: | ||
131 | * t0 == the source word containing the null | ||
132 | * t8 == the cmpbge mask that found it. | ||
133 | */ | ||
134 | $a_eos: | ||
135 | negq t8, t12 # E : find low bit set | ||
136 | and t8, t12, t12 # E : | ||
137 | |||
138 | /* We're doing a partial word store and so need to combine | ||
139 | our source and original destination words. */ | ||
140 | ldq_u t1, 0(a0) # L : | ||
141 | subq t12, 1, t6 # E : | ||
142 | |||
143 | or t12, t6, t8 # E : | ||
144 | zapnot t0, t8, t0 # U : clear src bytes > null | ||
145 | zap t1, t8, t1 # U : clear dst bytes <= null | ||
146 | or t0, t1, t0 # E : | ||
147 | |||
148 | stq_u t0, 0(a0) # L : | ||
149 | br $finish_up # L0 : | ||
150 | nop | ||
151 | nop | ||
152 | |||
153 | /* Add the end-of-count bit to the eos detection bitmask. */ | ||
154 | .align 4 | ||
155 | $a_eoc: | ||
156 | or t10, t8, t8 | ||
157 | br $a_eos | ||
158 | nop | ||
159 | nop | ||
160 | |||
161 | |||
162 | /* The source and destination are not co-aligned. Align the destination | ||
163 | and cope. We have to be very careful about not reading too much and | ||
164 | causing a SEGV. */ | ||
165 | |||
166 | .align 4 | ||
167 | $u_head: | ||
168 | /* We know just enough now to be able to assemble the first | ||
169 | full source word. We can still find a zero at the end of it | ||
170 | that prevents us from outputting the whole thing. | ||
171 | |||
172 | On entry to this basic block: | ||
173 | t0 == the first dest word, unmasked | ||
174 | t1 == the shifted low bits of the first source word | ||
175 | t6 == bytemask that is -1 in dest word bytes */ | ||
176 | |||
177 | EX( ldq_u t2, 8(a1) ) # L : load second src word | ||
178 | addq a1, 8, a1 # E : | ||
179 | mskql t0, a0, t0 # U : mask trailing garbage in dst | ||
180 | extqh t2, a1, t4 # U : | ||
181 | |||
182 | or t1, t4, t1 # E : first aligned src word complete | ||
183 | mskqh t1, a0, t1 # U : mask leading garbage in src | ||
184 | or t0, t1, t0 # E : first output word complete | ||
185 | or t0, t6, t6 # E : mask original data for zero test | ||
186 | |||
187 | cmpbge zero, t6, t8 # E : | ||
188 | beq a2, $u_eocfin # U : | ||
189 | bne t8, $u_final # U : bad news - 2nd branch in a quad | ||
190 | lda t6, -1 # E : mask out the bits we have | ||
191 | |||
192 | mskql t6, a1, t6 # U : already seen | ||
193 | stq_u t0, 0(a0) # L : store first output word | ||
194 | or t6, t2, t2 # E : | ||
195 | cmpbge zero, t2, t8 # E : find nulls in second partial | ||
196 | |||
197 | addq a0, 8, a0 # E : | ||
198 | subq a2, 1, a2 # E : | ||
199 | bne t8, $u_late_head_exit # U : | ||
200 | nop | ||
201 | |||
202 | /* Finally, we've got all the stupid leading edge cases taken care | ||
203 | of and we can set up to enter the main loop. */ | ||
204 | |||
205 | extql t2, a1, t1 # U : position hi-bits of lo word | ||
206 | EX( ldq_u t2, 8(a1) ) # L : read next high-order source word | ||
207 | addq a1, 8, a1 # E : | ||
208 | cmpbge zero, t2, t8 # E : | ||
209 | |||
210 | beq a2, $u_eoc # U : | ||
211 | bne t8, $u_eos # U : | ||
212 | nop | ||
213 | nop | ||
214 | |||
215 | /* Unaligned copy main loop. In order to avoid reading too much, | ||
216 | the loop is structured to detect zeros in aligned source words. | ||
217 | This has, unfortunately, effectively pulled half of a loop | ||
218 | iteration out into the head and half into the tail, but it does | ||
219 | prevent nastiness from accumulating in the very thing we want | ||
220 | to run as fast as possible. | ||
221 | |||
222 | On entry to this basic block: | ||
223 | t1 == the shifted high-order bits from the previous source word | ||
224 | t2 == the unshifted current source word | ||
225 | |||
226 | We further know that t2 does not contain a null terminator. */ | ||
227 | |||
228 | /* | ||
229 | * Extra nops here: | ||
230 | * separate load quads from store quads | ||
231 | * only one branch/quad to permit predictor training | ||
232 | */ | ||
233 | |||
234 | .align 4 | ||
235 | $u_loop: | ||
236 | extqh t2, a1, t0 # U : extract high bits for current word | ||
237 | addq a1, 8, a1 # E : | ||
238 | extql t2, a1, t3 # U : extract low bits for next time | ||
239 | addq a0, 8, a0 # E : | ||
240 | |||
241 | or t0, t1, t0 # E : current dst word now complete | ||
242 | EX( ldq_u t2, 0(a1) ) # L : load high word for next time | ||
243 | subq a2, 1, a2 # E : | ||
244 | nop | ||
245 | |||
246 | stq_u t0, -8(a0) # L : save the current word | ||
247 | mov t3, t1 # E : | ||
248 | cmpbge zero, t2, t8 # E : test new word for eos | ||
249 | beq a2, $u_eoc # U : | ||
250 | |||
251 | beq t8, $u_loop # U : | ||
252 | nop | ||
253 | nop | ||
254 | nop | ||
255 | |||
256 | /* We've found a zero somewhere in the source word we just read. | ||
257 | If it resides in the lower half, we have one (probably partial) | ||
258 | word to write out, and if it resides in the upper half, we | ||
259 | have one full and one partial word left to write out. | ||
260 | |||
261 | On entry to this basic block: | ||
262 | t1 == the shifted high-order bits from the previous source word | ||
263 | t2 == the unshifted current source word. */ | ||
264 | .align 4 | ||
265 | $u_eos: | ||
266 | extqh t2, a1, t0 # U : | ||
267 | or t0, t1, t0 # E : first (partial) source word complete | ||
268 | cmpbge zero, t0, t8 # E : is the null in this first bit? | ||
269 | nop | ||
270 | |||
271 | bne t8, $u_final # U : | ||
272 | stq_u t0, 0(a0) # L : the null was in the high-order bits | ||
273 | addq a0, 8, a0 # E : | ||
274 | subq a2, 1, a2 # E : | ||
275 | |||
276 | .align 4 | ||
277 | $u_late_head_exit: | ||
278 | extql t2, a1, t0 # U : | ||
279 | cmpbge zero, t0, t8 # E : | ||
280 | or t8, t10, t6 # E : | ||
281 | cmoveq a2, t6, t8 # E : | ||
282 | |||
283 | /* Take care of a final (probably partial) result word. | ||
284 | On entry to this basic block: | ||
285 | t0 == assembled source word | ||
286 | t8 == cmpbge mask that found the null. */ | ||
287 | .align 4 | ||
288 | $u_final: | ||
289 | negq t8, t6 # E : isolate low bit set | ||
290 | and t6, t8, t12 # E : | ||
291 | ldq_u t1, 0(a0) # L : | ||
292 | subq t12, 1, t6 # E : | ||
293 | |||
294 | or t6, t12, t8 # E : | ||
295 | zapnot t0, t8, t0 # U : kill source bytes > null | ||
296 | zap t1, t8, t1 # U : kill dest bytes <= null | ||
297 | or t0, t1, t0 # E : | ||
298 | |||
299 | stq_u t0, 0(a0) # L : | ||
300 | br $finish_up # L0 : | ||
301 | nop | ||
302 | nop | ||
303 | |||
304 | .align 4 | ||
305 | $u_eoc: # end-of-count | ||
306 | extqh t2, a1, t0 # U : | ||
307 | or t0, t1, t0 # E : | ||
308 | cmpbge zero, t0, t8 # E : | ||
309 | nop | ||
310 | |||
311 | .align 4 | ||
312 | $u_eocfin: # end-of-count, final word | ||
313 | or t10, t8, t8 # E : | ||
314 | br $u_final # U : | ||
315 | nop | ||
316 | nop | ||
317 | |||
318 | /* Unaligned copy entry point. */ | ||
319 | .align 4 | ||
320 | $unaligned: | ||
321 | |||
322 | srl a3, 3, a2 # U : a2 = loop counter = (count - 1)/8 | ||
323 | and a0, 7, t4 # E : find dest misalignment | ||
324 | and a1, 7, t5 # E : find src misalignment | ||
325 | mov zero, t0 # E : | ||
326 | |||
327 | /* Conditionally load the first destination word and a bytemask | ||
328 | with 0xff indicating that the destination byte is sacrosanct. */ | ||
329 | |||
330 | mov zero, t6 # E : | ||
331 | beq t4, 1f # U : | ||
332 | ldq_u t0, 0(a0) # L : | ||
333 | lda t6, -1 # E : | ||
334 | |||
335 | mskql t6, a0, t6 # E : | ||
336 | nop | ||
337 | nop | ||
338 | nop | ||
339 | |||
340 | .align 4 | ||
341 | 1: | ||
342 | subq a1, t4, a1 # E : sub dest misalignment from src addr | ||
343 | /* If source misalignment is larger than dest misalignment, we need | ||
344 | extra startup checks to avoid SEGV. */ | ||
345 | cmplt t4, t5, t12 # E : | ||
346 | extql t1, a1, t1 # U : shift src into place | ||
347 | lda t2, -1 # E : for creating masks later | ||
348 | |||
349 | beq t12, $u_head # U : | ||
350 | mskqh t2, t5, t2 # U : begin src byte validity mask | ||
351 | cmpbge zero, t1, t8 # E : is there a zero? | ||
352 | nop | ||
353 | |||
354 | extql t2, a1, t2 # U : | ||
355 | or t8, t10, t5 # E : test for end-of-count too | ||
356 | cmpbge zero, t2, t3 # E : | ||
357 | cmoveq a2, t5, t8 # E : Latency=2, extra map slot | ||
358 | |||
359 | nop # E : goes with cmov | ||
360 | andnot t8, t3, t8 # E : | ||
361 | beq t8, $u_head # U : | ||
362 | nop | ||
363 | |||
364 | /* At this point we've found a zero in the first partial word of | ||
365 | the source. We need to isolate the valid source data and mask | ||
366 | it into the original destination data. (Incidentally, we know | ||
367 | that we'll need at least one byte of that original dest word.) */ | ||
368 | |||
369 | ldq_u t0, 0(a0) # L : | ||
370 | negq t8, t6 # E : build bitmask of bytes <= zero | ||
371 | mskqh t1, t4, t1 # U : | ||
372 | and t6, t8, t12 # E : | ||
373 | |||
374 | subq t12, 1, t6 # E : | ||
375 | or t6, t12, t8 # E : | ||
376 | zapnot t2, t8, t2 # U : prepare source word; mirror changes | ||
377 | zapnot t1, t8, t1 # U : to source validity mask | ||
378 | |||
379 | andnot t0, t2, t0 # E : zero place for source to reside | ||
380 | or t0, t1, t0 # E : and put it there | ||
381 | stq_u t0, 0(a0) # L : | ||
382 | nop | ||
383 | |||
384 | .align 4 | ||
385 | $finish_up: | ||
386 | zapnot t0, t12, t4 # U : was last byte written null? | ||
387 | and t12, 0xf0, t3 # E : binary search for the address of the | ||
388 | cmovne t4, 1, t4 # E : Latency=2, extra map slot | ||
389 | nop # E : with cmovne | ||
390 | |||
391 | and t12, 0xcc, t2 # E : last byte written | ||
392 | and t12, 0xaa, t1 # E : | ||
393 | cmovne t3, 4, t3 # E : Latency=2, extra map slot | ||
394 | nop # E : with cmovne | ||
395 | |||
396 | bic a0, 7, t0 | ||
397 | cmovne t2, 2, t2 # E : Latency=2, extra map slot | ||
398 | nop # E : with cmovne | ||
399 | nop | ||
400 | |||
401 | cmovne t1, 1, t1 # E : Latency=2, extra map slot | ||
402 | nop # E : with cmovne | ||
403 | addq t0, t3, t0 # E : | ||
404 | addq t1, t2, t1 # E : | ||
405 | |||
406 | addq t0, t1, t0 # E : | ||
407 | addq t0, t4, t0 # add one if we filled the buffer | ||
408 | subq t0, v0, v0 # find string length | ||
409 | ret # L0 : | ||
410 | |||
411 | .align 4 | ||
412 | $zerolength: | ||
413 | nop | ||
414 | nop | ||
415 | nop | ||
416 | clr v0 | ||
417 | |||
418 | $exception: | ||
419 | nop | ||
420 | nop | ||
421 | nop | ||
422 | ret | ||
423 | |||
424 | .end __strncpy_from_user | ||
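The $finish_up block above turns t12, a mask with a single bit set at the position of the last byte written, into a byte index using the 0xf0/0xcc/0xaa tests and cmovne, then combines it with the quad-aligned address and the saved start to compute the returned length. A small C sketch of that branch-free bit-index step, illustrative only (the function name is invented):

        #include <stdint.h>

        /* Sketch of the branch-free "binary search" in $finish_up: recover
         * the index of the single set bit in an 8-bit one-hot mask.  The
         * assembly uses cmovne where C uses the ?: operator. */
        static unsigned bit_index(uint8_t onehot)
        {
                unsigned idx = 0;

                idx += (onehot & 0xf0) ? 4 : 0;   /* in the upper nibble?  */
                idx += (onehot & 0xcc) ? 2 : 0;   /* in an odd byte pair?  */
                idx += (onehot & 0xaa) ? 1 : 0;   /* at an odd position?   */
                return idx;
        }

The final addq of t4 adds one when the last byte written was not the terminator, i.e. when the buffer filled.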
diff --git a/arch/alpha/lib/ev6-stxcpy.S b/arch/alpha/lib/ev6-stxcpy.S new file mode 100644 index 000000000000..4643ff2ffc8d --- /dev/null +++ b/arch/alpha/lib/ev6-stxcpy.S | |||
@@ -0,0 +1,321 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/ev6-stxcpy.S | ||
3 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
4 | * | ||
5 | * Copy a null-terminated string from SRC to DST. | ||
6 | * | ||
7 | * This is an internal routine used by strcpy, stpcpy, and strcat. | ||
8 | * As such, it uses special linkage conventions to make implementation | ||
9 | * of these public functions more efficient. | ||
10 | * | ||
11 | * On input: | ||
12 | * t9 = return address | ||
13 | * a0 = DST | ||
14 | * a1 = SRC | ||
15 | * | ||
16 | * On output: | ||
17 | * t12 = bitmask (with one bit set) indicating the last byte written | ||
18 | * a0 = unaligned address of the last *word* written | ||
19 | * | ||
20 | * Furthermore, v0, a3-a5, and t11 are untouched. | ||
21 | * | ||
22 | * Much of the information about 21264 scheduling/coding comes from: | ||
23 | * Compiler Writer's Guide for the Alpha 21264 | ||
24 | * abbreviated as 'CWG' in other comments here | ||
25 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
26 | * Scheduling notation: | ||
27 | * E - either cluster | ||
28 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
29 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
30 | * Try not to change the actual algorithm if possible for consistency. | ||
31 | */ | ||
32 | |||
33 | #include <asm/regdef.h> | ||
34 | |||
35 | .set noat | ||
36 | .set noreorder | ||
37 | |||
38 | .text | ||
39 | |||
40 | /* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that | ||
41 | doesn't like putting the entry point for a procedure somewhere in the | ||
42 | middle of the procedure descriptor. Work around this by putting the | ||
43 | aligned copy in its own procedure descriptor */ | ||
44 | |||
45 | |||
46 | .ent stxcpy_aligned | ||
47 | .align 4 | ||
48 | stxcpy_aligned: | ||
49 | .frame sp, 0, t9 | ||
50 | .prologue 0 | ||
51 | |||
52 | /* On entry to this basic block: | ||
53 | t0 == the first destination word for masking back in | ||
54 | t1 == the first source word. */ | ||
55 | |||
56 | /* Create the 1st output word and detect 0's in the 1st input word. */ | ||
57 | lda t2, -1 # E : build a mask against false zero | ||
58 | mskqh t2, a1, t2 # U : detection in the src word (stall) | ||
59 | mskqh t1, a1, t3 # U : | ||
60 | ornot t1, t2, t2 # E : (stall) | ||
61 | |||
62 | mskql t0, a1, t0 # U : assemble the first output word | ||
63 | cmpbge zero, t2, t8 # E : bits set iff null found | ||
64 | or t0, t3, t1 # E : (stall) | ||
65 | bne t8, $a_eos # U : (stall) | ||
66 | |||
67 | /* On entry to this basic block: | ||
68 | t0 == the first destination word for masking back in | ||
69 | t1 == a source word not containing a null. */ | ||
70 | /* Nops here to separate store quads from load quads */ | ||
71 | |||
72 | $a_loop: | ||
73 | stq_u t1, 0(a0) # L : | ||
74 | addq a0, 8, a0 # E : | ||
75 | nop | ||
76 | nop | ||
77 | |||
78 | ldq_u t1, 0(a1) # L : Latency=3 | ||
79 | addq a1, 8, a1 # E : | ||
80 | cmpbge zero, t1, t8 # E : (3 cycle stall) | ||
81 | beq t8, $a_loop # U : (stall for t8) | ||
82 | |||
83 | /* Take care of the final (partial) word store. | ||
84 | On entry to this basic block we have: | ||
85 | t1 == the source word containing the null | ||
86 | t8 == the cmpbge mask that found it. */ | ||
87 | $a_eos: | ||
88 | negq t8, t6 # E : find low bit set | ||
89 | and t8, t6, t12 # E : (stall) | ||
90 | /* For the sake of the cache, don't read a destination word | ||
91 | if we're not going to need it. */ | ||
92 | and t12, 0x80, t6 # E : (stall) | ||
93 | bne t6, 1f # U : (stall) | ||
94 | |||
95 | /* We're doing a partial word store and so need to combine | ||
96 | our source and original destination words. */ | ||
97 | ldq_u t0, 0(a0) # L : Latency=3 | ||
98 | subq t12, 1, t6 # E : | ||
99 | zapnot t1, t6, t1 # U : clear src bytes >= null (stall) | ||
100 | or t12, t6, t8 # E : (stall) | ||
101 | |||
102 | zap t0, t8, t0 # E : clear dst bytes <= null | ||
103 | or t0, t1, t1 # E : (stall) | ||
104 | nop | ||
105 | nop | ||
106 | |||
107 | 1: stq_u t1, 0(a0) # L : | ||
108 | ret (t9) # L0 : Latency=3 | ||
109 | nop | ||
110 | nop | ||
111 | |||
112 | .end stxcpy_aligned | ||
113 | |||
114 | .align 4 | ||
115 | .ent __stxcpy | ||
116 | .globl __stxcpy | ||
117 | __stxcpy: | ||
118 | .frame sp, 0, t9 | ||
119 | .prologue 0 | ||
120 | |||
121 | /* Are source and destination co-aligned? */ | ||
122 | xor a0, a1, t0 # E : | ||
123 | unop # E : | ||
124 | and t0, 7, t0 # E : (stall) | ||
125 | bne t0, $unaligned # U : (stall) | ||
126 | |||
127 | /* We are co-aligned; take care of a partial first word. */ | ||
128 | ldq_u t1, 0(a1) # L : load first src word | ||
129 | and a0, 7, t0 # E : take care not to load a word ... | ||
130 | addq a1, 8, a1 # E : | ||
131 | beq t0, stxcpy_aligned # U : ... if we won't need it (stall) | ||
132 | |||
133 | ldq_u t0, 0(a0) # L : | ||
134 | br stxcpy_aligned # L0 : Latency=3 | ||
135 | nop | ||
136 | nop | ||
137 | |||
138 | |||
139 | /* The source and destination are not co-aligned. Align the destination | ||
140 | and cope. We have to be very careful about not reading too much and | ||
141 | causing a SEGV. */ | ||
142 | |||
143 | .align 4 | ||
144 | $u_head: | ||
145 | /* We know just enough now to be able to assemble the first | ||
146 | full source word. We can still find a zero at the end of it | ||
147 | that prevents us from outputting the whole thing. | ||
148 | |||
149 | On entry to this basic block: | ||
150 | t0 == the first dest word, for masking back in, if needed else 0 | ||
151 | t1 == the low bits of the first source word | ||
152 | t6 == bytemask that is -1 in dest word bytes */ | ||
153 | |||
154 | ldq_u t2, 8(a1) # L : | ||
155 | addq a1, 8, a1 # E : | ||
156 | extql t1, a1, t1 # U : (stall on a1) | ||
157 | extqh t2, a1, t4 # U : (stall on a1) | ||
158 | |||
159 | mskql t0, a0, t0 # U : | ||
160 | or t1, t4, t1 # E : | ||
161 | mskqh t1, a0, t1 # U : (stall on t1) | ||
162 | or t0, t1, t1 # E : (stall on t1) | ||
163 | |||
164 | or t1, t6, t6 # E : | ||
165 | cmpbge zero, t6, t8 # E : (stall) | ||
166 | lda t6, -1 # E : for masking just below | ||
167 | bne t8, $u_final # U : (stall) | ||
168 | |||
169 | mskql t6, a1, t6 # U : mask out the bits we have | ||
170 | or t6, t2, t2 # E : already extracted before (stall) | ||
171 | cmpbge zero, t2, t8 # E : testing eos (stall) | ||
172 | bne t8, $u_late_head_exit # U : (stall) | ||
173 | |||
174 | /* Finally, we've got all the stupid leading edge cases taken care | ||
175 | of and we can set up to enter the main loop. */ | ||
176 | |||
177 | stq_u t1, 0(a0) # L : store first output word | ||
178 | addq a0, 8, a0 # E : | ||
179 | extql t2, a1, t0 # U : position hi-bits of lo word | ||
180 | ldq_u t2, 8(a1) # L : read next high-order source word | ||
181 | |||
182 | addq a1, 8, a1 # E : | ||
183 | cmpbge zero, t2, t8 # E : (stall for t2) | ||
184 | nop # E : | ||
185 | bne t8, $u_eos # U : (stall) | ||
186 | |||
187 | /* Unaligned copy main loop. In order to avoid reading too much, | ||
188 | the loop is structured to detect zeros in aligned source words. | ||
189 | This has, unfortunately, effectively pulled half of a loop | ||
190 | iteration out into the head and half into the tail, but it does | ||
191 | prevent nastiness from accumulating in the very thing we want | ||
192 | to run as fast as possible. | ||
193 | |||
194 | On entry to this basic block: | ||
195 | t0 == the shifted high-order bits from the previous source word | ||
196 | t2 == the unshifted current source word | ||
197 | |||
198 | We further know that t2 does not contain a null terminator. */ | ||
199 | |||
200 | .align 3 | ||
201 | $u_loop: | ||
202 | extqh t2, a1, t1 # U : extract high bits for current word | ||
203 | addq a1, 8, a1 # E : (stall) | ||
204 | extql t2, a1, t3 # U : extract low bits for next time (stall) | ||
205 | addq a0, 8, a0 # E : | ||
206 | |||
207 | or t0, t1, t1 # E : current dst word now complete | ||
208 | ldq_u t2, 0(a1) # L : Latency=3 load high word for next time | ||
209 | stq_u t1, -8(a0) # L : save the current word (stall) | ||
210 | mov t3, t0 # E : | ||
211 | |||
212 | cmpbge zero, t2, t8 # E : test new word for eos | ||
213 | beq t8, $u_loop # U : (stall) | ||
214 | nop | ||
215 | nop | ||
216 | |||
217 | /* We've found a zero somewhere in the source word we just read. | ||
218 | If it resides in the lower half, we have one (probably partial) | ||
219 | word to write out, and if it resides in the upper half, we | ||
220 | have one full and one partial word left to write out. | ||
221 | |||
222 | On entry to this basic block: | ||
223 | t0 == the shifted high-order bits from the previous source word | ||
224 | t2 == the unshifted current source word. */ | ||
225 | $u_eos: | ||
226 | extqh t2, a1, t1 # U : | ||
227 | or t0, t1, t1 # E : first (partial) source word complete (stall) | ||
228 | cmpbge zero, t1, t8 # E : is the null in this first bit? (stall) | ||
229 | bne t8, $u_final # U : (stall) | ||
230 | |||
231 | $u_late_head_exit: | ||
232 | stq_u t1, 0(a0) # L : the null was in the high-order bits | ||
233 | addq a0, 8, a0 # E : | ||
234 | extql t2, a1, t1 # U : | ||
235 | cmpbge zero, t1, t8 # E : (stall) | ||
236 | |||
237 | /* Take care of a final (probably partial) result word. | ||
238 | On entry to this basic block: | ||
239 | t1 == assembled source word | ||
240 | t8 == cmpbge mask that found the null. */ | ||
241 | $u_final: | ||
242 | negq t8, t6 # E : isolate low bit set | ||
243 | and t6, t8, t12 # E : (stall) | ||
244 | and t12, 0x80, t6 # E : avoid dest word load if we can (stall) | ||
245 | bne t6, 1f # U : (stall) | ||
246 | |||
247 | ldq_u t0, 0(a0) # L : | ||
248 | subq t12, 1, t6 # E : | ||
249 | or t6, t12, t8 # E : (stall) | ||
250 | zapnot t1, t6, t1 # U : kill source bytes >= null (stall) | ||
251 | |||
252 | zap t0, t8, t0 # U : kill dest bytes <= null (2 cycle data stall) | ||
253 | or t0, t1, t1 # E : (stall) | ||
254 | nop | ||
255 | nop | ||
256 | |||
257 | 1: stq_u t1, 0(a0) # L : | ||
258 | ret (t9) # L0 : Latency=3 | ||
259 | nop | ||
260 | nop | ||
261 | |||
262 | /* Unaligned copy entry point. */ | ||
263 | .align 4 | ||
264 | $unaligned: | ||
265 | |||
266 | ldq_u t1, 0(a1) # L : load first source word | ||
267 | and a0, 7, t4 # E : find dest misalignment | ||
268 | and a1, 7, t5 # E : find src misalignment | ||
269 | /* Conditionally load the first destination word and a bytemask | ||
270 | with 0xff indicating that the destination byte is sacrosanct. */ | ||
271 | mov zero, t0 # E : | ||
272 | |||
273 | mov zero, t6 # E : | ||
274 | beq t4, 1f # U : | ||
275 | ldq_u t0, 0(a0) # L : | ||
276 | lda t6, -1 # E : | ||
277 | |||
278 | mskql t6, a0, t6 # U : | ||
279 | nop | ||
280 | nop | ||
281 | nop | ||
282 | 1: | ||
283 | subq a1, t4, a1 # E : sub dest misalignment from src addr | ||
284 | /* If source misalignment is larger than dest misalignment, we need | ||
285 | extra startup checks to avoid SEGV. */ | ||
286 | cmplt t4, t5, t12 # E : | ||
287 | beq t12, $u_head # U : | ||
288 | lda t2, -1 # E : mask out leading garbage in source | ||
289 | |||
290 | mskqh t2, t5, t2 # U : | ||
291 | ornot t1, t2, t3 # E : (stall) | ||
292 | cmpbge zero, t3, t8 # E : is there a zero? (stall) | ||
293 | beq t8, $u_head # U : (stall) | ||
294 | |||
295 | /* At this point we've found a zero in the first partial word of | ||
296 | the source. We need to isolate the valid source data and mask | ||
297 | it into the original destination data. (Incidentally, we know | ||
298 | that we'll need at least one byte of that original dest word.) */ | ||
299 | |||
300 | ldq_u t0, 0(a0) # L : | ||
301 | negq t8, t6 # E : build bitmask of bytes <= zero | ||
302 | and t6, t8, t12 # E : (stall) | ||
303 | and a1, 7, t5 # E : | ||
304 | |||
305 | subq t12, 1, t6 # E : | ||
306 | or t6, t12, t8 # E : (stall) | ||
307 | srl t12, t5, t12 # U : adjust final null return value | ||
308 | zapnot t2, t8, t2 # U : prepare source word; mirror changes (stall) | ||
309 | |||
310 | and t1, t2, t1 # E : to source validity mask | ||
311 | extql t2, a1, t2 # U : | ||
312 | extql t1, a1, t1 # U : (stall) | ||
313 | andnot t0, t2, t0 # E : zero place for source to reside (stall) | ||
314 | |||
315 | or t0, t1, t1 # E : and put it there | ||
316 | stq_u t1, 0(a0) # L : (stall) | ||
317 | ret (t9) # L0 : Latency=3 | ||
318 | nop | ||
319 | |||
320 | .end __stxcpy | ||
321 | |||
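Both the aligned and unaligned paths rely on a single trick to scan a whole quadword for the terminator: cmpbge zero, x, t8 sets bit i of t8 exactly when byte i of x is zero, and the lda t2, -1 / mskqh / ornot preamble forces the bytes that precede the start of the string to be non-zero so they cannot produce a false hit (the "mask against false zero"). A rough C model of the cmpbge test, illustrative only and not kernel code:

        #include <stdint.h>

        /* Model of "cmpbge zero, x, mask": bit i of the result is set iff
         * 0 >= byte i of x (unsigned), i.e. iff byte i is zero.  One
         * instruction on Alpha; spelled out as a loop here. */
        static unsigned cmpbge_zero(uint64_t x)
        {
                unsigned mask = 0;

                for (int i = 0; i < 8; i++)
                        if (((x >> (8 * i)) & 0xff) == 0)
                                mask |= 1u << i;
                return mask;
        }

A non-zero mask terminates the word loop; negq and and then isolate its lowest set bit so the tail code can build the byte mask for the final partial store.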
diff --git a/arch/alpha/lib/ev6-stxncpy.S b/arch/alpha/lib/ev6-stxncpy.S new file mode 100644 index 000000000000..b581a7af2456 --- /dev/null +++ b/arch/alpha/lib/ev6-stxncpy.S | |||
@@ -0,0 +1,397 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/ev6-stxncpy.S | ||
3 | * 21264 version contributed by Rick Gorton <rick.gorton@api-networks.com> | ||
4 | * | ||
5 | * Copy no more than COUNT bytes of the null-terminated string from | ||
6 | * SRC to DST. | ||
7 | * | ||
8 | * This is an internal routine used by strncpy, stpncpy, and strncat. | ||
9 | * As such, it uses special linkage conventions to make implementation | ||
10 | * of these public functions more efficient. | ||
11 | * | ||
12 | * On input: | ||
13 | * t9 = return address | ||
14 | * a0 = DST | ||
15 | * a1 = SRC | ||
16 | * a2 = COUNT | ||
17 | * | ||
18 | * Furthermore, COUNT may not be zero. | ||
19 | * | ||
20 | * On output: | ||
21 | * t0 = last word written | ||
22 | * t10 = bitmask (with one bit set) indicating the byte position of | ||
23 | * the end of the range specified by COUNT | ||
24 | * t12 = bitmask (with one bit set) indicating the last byte written | ||
25 | * a0 = unaligned address of the last *word* written | ||
26 | * a2 = the number of full words left in COUNT | ||
27 | * | ||
28 | * Furthermore, v0, a3-a5, t11, and $at are untouched. | ||
29 | * | ||
30 | * Much of the information about 21264 scheduling/coding comes from: | ||
31 | * Compiler Writer's Guide for the Alpha 21264 | ||
32 | * abbreviated as 'CWG' in other comments here | ||
33 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
34 | * Scheduling notation: | ||
35 | * E - either cluster | ||
36 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
37 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
38 | * Try not to change the actual algorithm if possible for consistency. | ||
39 | */ | ||
40 | |||
41 | #include <asm/regdef.h> | ||
42 | |||
43 | .set noat | ||
44 | .set noreorder | ||
45 | |||
46 | .text | ||
47 | |||
48 | /* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that | ||
49 | doesn't like putting the entry point for a procedure somewhere in the | ||
50 | middle of the procedure descriptor. Work around this by putting the | ||
51 | aligned copy in its own procedure descriptor */ | ||
52 | |||
53 | |||
54 | .ent stxncpy_aligned | ||
55 | .align 4 | ||
56 | stxncpy_aligned: | ||
57 | .frame sp, 0, t9, 0 | ||
58 | .prologue 0 | ||
59 | |||
60 | /* On entry to this basic block: | ||
61 | t0 == the first destination word for masking back in | ||
62 | t1 == the first source word. */ | ||
63 | |||
64 | /* Create the 1st output word and detect 0's in the 1st input word. */ | ||
65 | lda t2, -1 # E : build a mask against false zero | ||
66 | mskqh t2, a1, t2 # U : detection in the src word (stall) | ||
67 | mskqh t1, a1, t3 # U : | ||
68 | ornot t1, t2, t2 # E : (stall) | ||
69 | |||
70 | mskql t0, a1, t0 # U : assemble the first output word | ||
71 | cmpbge zero, t2, t8 # E : bits set iff null found | ||
72 | or t0, t3, t0 # E : (stall) | ||
73 | beq a2, $a_eoc # U : | ||
74 | |||
75 | bne t8, $a_eos # U : | ||
76 | nop | ||
77 | nop | ||
78 | nop | ||
79 | |||
80 | /* On entry to this basic block: | ||
81 | t0 == a source word not containing a null. */ | ||
82 | |||
83 | /* | ||
84 | * nops here to: | ||
85 | * separate store quads from load quads | ||
86 | * limit of 1 bcond/quad to permit training | ||
87 | */ | ||
88 | $a_loop: | ||
89 | stq_u t0, 0(a0) # L : | ||
90 | addq a0, 8, a0 # E : | ||
91 | subq a2, 1, a2 # E : | ||
92 | nop | ||
93 | |||
94 | ldq_u t0, 0(a1) # L : | ||
95 | addq a1, 8, a1 # E : | ||
96 | cmpbge zero, t0, t8 # E : | ||
97 | beq a2, $a_eoc # U : | ||
98 | |||
99 | beq t8, $a_loop # U : | ||
100 | nop | ||
101 | nop | ||
102 | nop | ||
103 | |||
104 | /* Take care of the final (partial) word store. At this point | ||
105 | the end-of-count bit is set in t8 iff it applies. | ||
106 | |||
107 | On entry to this basic block we have: | ||
108 | t0 == the source word containing the null | ||
109 | t8 == the cmpbge mask that found it. */ | ||
110 | |||
111 | $a_eos: | ||
112 | negq t8, t12 # E : find low bit set | ||
113 | and t8, t12, t12 # E : (stall) | ||
114 | /* For the sake of the cache, don't read a destination word | ||
115 | if we're not going to need it. */ | ||
116 | and t12, 0x80, t6 # E : (stall) | ||
117 | bne t6, 1f # U : (stall) | ||
118 | |||
119 | /* We're doing a partial word store and so need to combine | ||
120 | our source and original destination words. */ | ||
121 | ldq_u t1, 0(a0) # L : | ||
122 | subq t12, 1, t6 # E : | ||
123 | or t12, t6, t8 # E : (stall) | ||
124 | zapnot t0, t8, t0 # U : clear src bytes > null (stall) | ||
125 | |||
126 | zap t1, t8, t1 # U : clear dst bytes <= null | ||
127 | or t0, t1, t0 # E : (stall) | ||
128 | nop | ||
129 | nop | ||
130 | |||
131 | 1: stq_u t0, 0(a0) # L : | ||
132 | ret (t9) # L0 : Latency=3 | ||
133 | nop | ||
134 | nop | ||
135 | |||
136 | /* Add the end-of-count bit to the eos detection bitmask. */ | ||
137 | $a_eoc: | ||
138 | or t10, t8, t8 # E : | ||
139 | br $a_eos # L0 : Latency=3 | ||
140 | nop | ||
141 | nop | ||
142 | |||
143 | .end stxncpy_aligned | ||
144 | |||
145 | .align 4 | ||
146 | .ent __stxncpy | ||
147 | .globl __stxncpy | ||
148 | __stxncpy: | ||
149 | .frame sp, 0, t9, 0 | ||
150 | .prologue 0 | ||
151 | |||
152 | /* Are source and destination co-aligned? */ | ||
153 | xor a0, a1, t1 # E : | ||
154 | and a0, 7, t0 # E : find dest misalignment | ||
155 | and t1, 7, t1 # E : (stall) | ||
156 | addq a2, t0, a2 # E : bias count by dest misalignment (stall) | ||
157 | |||
158 | subq a2, 1, a2 # E : | ||
159 | and a2, 7, t2 # E : (stall) | ||
160 | srl a2, 3, a2 # U : a2 = loop counter = (count - 1)/8 (stall) | ||
161 | addq zero, 1, t10 # E : | ||
162 | |||
163 | sll t10, t2, t10 # U : t10 = bitmask of last count byte | ||
164 | bne t1, $unaligned # U : | ||
165 | /* We are co-aligned; take care of a partial first word. */ | ||
166 | ldq_u t1, 0(a1) # L : load first src word | ||
167 | addq a1, 8, a1 # E : | ||
168 | |||
169 | beq t0, stxncpy_aligned # U : avoid loading dest word if not needed | ||
170 | ldq_u t0, 0(a0) # L : | ||
171 | nop | ||
172 | nop | ||
173 | |||
174 | br stxncpy_aligned # L0 : Latency=3 | ||
175 | nop | ||
176 | nop | ||
177 | nop | ||
178 | |||
179 | |||
180 | |||
181 | /* The source and destination are not co-aligned. Align the destination | ||
182 | and cope. We have to be very careful about not reading too much and | ||
183 | causing a SEGV. */ | ||
184 | |||
185 | .align 4 | ||
186 | $u_head: | ||
187 | /* We know just enough now to be able to assemble the first | ||
188 | full source word. We can still find a zero at the end of it | ||
189 | that prevents us from outputting the whole thing. | ||
190 | |||
191 | On entry to this basic block: | ||
192 | t0 == the first dest word, unmasked | ||
193 | t1 == the shifted low bits of the first source word | ||
194 | t6 == bytemask that is -1 in dest word bytes */ | ||
195 | |||
196 | ldq_u t2, 8(a1) # L : Latency=3 load second src word | ||
197 | addq a1, 8, a1 # E : | ||
198 | mskql t0, a0, t0 # U : mask trailing garbage in dst | ||
199 | extqh t2, a1, t4 # U : (3 cycle stall on t2) | ||
200 | |||
201 | or t1, t4, t1 # E : first aligned src word complete (stall) | ||
202 | mskqh t1, a0, t1 # U : mask leading garbage in src (stall) | ||
203 | or t0, t1, t0 # E : first output word complete (stall) | ||
204 | or t0, t6, t6 # E : mask original data for zero test (stall) | ||
205 | |||
206 | cmpbge zero, t6, t8 # E : | ||
207 | beq a2, $u_eocfin # U : | ||
208 | lda t6, -1 # E : | ||
209 | nop | ||
210 | |||
211 | bne t8, $u_final # U : | ||
212 | mskql t6, a1, t6 # U : mask out bits already seen | ||
213 | stq_u t0, 0(a0) # L : store first output word | ||
214 | or t6, t2, t2 # E : (stall) | ||
215 | |||
216 | cmpbge zero, t2, t8 # E : find nulls in second partial | ||
217 | addq a0, 8, a0 # E : | ||
218 | subq a2, 1, a2 # E : | ||
219 | bne t8, $u_late_head_exit # U : | ||
220 | |||
221 | /* Finally, we've got all the stupid leading edge cases taken care | ||
222 | of and we can set up to enter the main loop. */ | ||
223 | extql t2, a1, t1 # U : position hi-bits of lo word | ||
224 | beq a2, $u_eoc # U : | ||
225 | ldq_u t2, 8(a1) # L : read next high-order source word | ||
226 | addq a1, 8, a1 # E : | ||
227 | |||
228 | extqh t2, a1, t0 # U : position lo-bits of hi word (stall) | ||
229 | cmpbge zero, t2, t8 # E : | ||
230 | nop | ||
231 | bne t8, $u_eos # U : | ||
232 | |||
233 | /* Unaligned copy main loop. In order to avoid reading too much, | ||
234 | the loop is structured to detect zeros in aligned source words. | ||
235 | This has, unfortunately, effectively pulled half of a loop | ||
236 | iteration out into the head and half into the tail, but it does | ||
237 | prevent nastiness from accumulating in the very thing we want | ||
238 | to run as fast as possible. | ||
239 | |||
240 | On entry to this basic block: | ||
241 | t0 == the shifted low-order bits from the current source word | ||
242 | t1 == the shifted high-order bits from the previous source word | ||
243 | t2 == the unshifted current source word | ||
244 | |||
245 | We further know that t2 does not contain a null terminator. */ | ||
246 | |||
247 | .align 4 | ||
248 | $u_loop: | ||
249 | or t0, t1, t0 # E : current dst word now complete | ||
250 | subq a2, 1, a2 # E : decrement word count | ||
251 | extql t2, a1, t1 # U : extract low bits for next time | ||
252 | addq a0, 8, a0 # E : | ||
253 | |||
254 | stq_u t0, -8(a0) # L : save the current word | ||
255 | beq a2, $u_eoc # U : | ||
256 | ldq_u t2, 8(a1) # L : Latency=3 load high word for next time | ||
257 | addq a1, 8, a1 # E : | ||
258 | |||
259 | extqh t2, a1, t0 # U : extract low bits (2 cycle stall) | ||
260 | cmpbge zero, t2, t8 # E : test new word for eos | ||
261 | nop | ||
262 | beq t8, $u_loop # U : | ||
263 | |||
264 | /* We've found a zero somewhere in the source word we just read. | ||
265 | If it resides in the lower half, we have one (probably partial) | ||
266 | word to write out, and if it resides in the upper half, we | ||
267 | have one full and one partial word left to write out. | ||
268 | |||
269 | On entry to this basic block: | ||
270 | t0 == the shifted low-order bits from the current source word | ||
271 | t1 == the shifted high-order bits from the previous source word | ||
272 | t2 == the unshifted current source word. */ | ||
273 | $u_eos: | ||
274 | or t0, t1, t0 # E : first (partial) source word complete | ||
275 | nop | ||
276 | cmpbge zero, t0, t8 # E : is the null in this first bit? (stall) | ||
277 | bne t8, $u_final # U : (stall) | ||
278 | |||
279 | stq_u t0, 0(a0) # L : the null was in the high-order bits | ||
280 | addq a0, 8, a0 # E : | ||
281 | subq a2, 1, a2 # E : | ||
282 | nop | ||
283 | |||
284 | $u_late_head_exit: | ||
285 | extql t2, a1, t0 # U : | ||
286 | cmpbge zero, t0, t8 # E : | ||
287 | or t8, t10, t6 # E : (stall) | ||
288 | cmoveq a2, t6, t8 # E : Latency=2, extra map slot (stall) | ||
289 | |||
290 | /* Take care of a final (probably partial) result word. | ||
291 | On entry to this basic block: | ||
292 | t0 == assembled source word | ||
293 | t8 == cmpbge mask that found the null. */ | ||
294 | $u_final: | ||
295 | negq t8, t6 # E : isolate low bit set | ||
296 | and t6, t8, t12 # E : (stall) | ||
297 | and t12, 0x80, t6 # E : avoid dest word load if we can (stall) | ||
298 | bne t6, 1f # U : (stall) | ||
299 | |||
300 | ldq_u t1, 0(a0) # L : | ||
301 | subq t12, 1, t6 # E : | ||
302 | or t6, t12, t8 # E : (stall) | ||
303 | zapnot t0, t8, t0 # U : kill source bytes > null | ||
304 | |||
305 | zap t1, t8, t1 # U : kill dest bytes <= null | ||
306 | or t0, t1, t0 # E : (stall) | ||
307 | nop | ||
308 | nop | ||
309 | |||
310 | 1: stq_u t0, 0(a0) # L : | ||
311 | ret (t9) # L0 : Latency=3 | ||
312 | |||
313 | /* Got to end-of-count before end of string. | ||
314 | On entry to this basic block: | ||
315 | t1 == the shifted high-order bits from the previous source word */ | ||
316 | $u_eoc: | ||
317 | and a1, 7, t6 # E : avoid final load if possible | ||
318 | sll t10, t6, t6 # U : (stall) | ||
319 | and t6, 0xff, t6 # E : (stall) | ||
320 | bne t6, 1f # U : (stall) | ||
321 | |||
322 | ldq_u t2, 8(a1) # L : load final src word | ||
323 | nop | ||
324 | extqh t2, a1, t0 # U : extract low bits for last word (stall) | ||
325 | or t1, t0, t1 # E : (stall) | ||
326 | |||
327 | 1: cmpbge zero, t1, t8 # E : | ||
328 | mov t1, t0 # E : | ||
329 | |||
330 | $u_eocfin: # end-of-count, final word | ||
331 | or t10, t8, t8 # E : | ||
332 | br $u_final # L0 : Latency=3 | ||
333 | |||
334 | /* Unaligned copy entry point. */ | ||
335 | .align 4 | ||
336 | $unaligned: | ||
337 | |||
338 | ldq_u t1, 0(a1) # L : load first source word | ||
339 | and a0, 7, t4 # E : find dest misalignment | ||
340 | and a1, 7, t5 # E : find src misalignment | ||
341 | /* Conditionally load the first destination word and a bytemask | ||
342 | with 0xff indicating that the destination byte is sacrosanct. */ | ||
343 | mov zero, t0 # E : | ||
344 | |||
345 | mov zero, t6 # E : | ||
346 | beq t4, 1f # U : | ||
347 | ldq_u t0, 0(a0) # L : | ||
348 | lda t6, -1 # E : | ||
349 | |||
350 | mskql t6, a0, t6 # U : | ||
351 | nop | ||
352 | nop | ||
353 | subq a1, t4, a1 # E : sub dest misalignment from src addr | ||
354 | |||
355 | /* If source misalignment is larger than dest misalignment, we need | ||
356 | extra startup checks to avoid SEGV. */ | ||
357 | |||
358 | 1: cmplt t4, t5, t12 # E : | ||
359 | extql t1, a1, t1 # U : shift src into place | ||
360 | lda t2, -1 # E : for creating masks later | ||
361 | beq t12, $u_head # U : (stall) | ||
362 | |||
363 | extql t2, a1, t2 # U : | ||
364 | cmpbge zero, t1, t8 # E : is there a zero? | ||
365 | andnot t2, t6, t12 # E : dest mask for a single word copy | ||
366 | or t8, t10, t5 # E : test for end-of-count too | ||
367 | |||
368 | cmpbge zero, t12, t3 # E : | ||
369 | cmoveq a2, t5, t8 # E : Latency=2, extra map slot | ||
370 | nop # E : keep with cmoveq | ||
371 | andnot t8, t3, t8 # E : (stall) | ||
372 | |||
373 | beq t8, $u_head # U : | ||
374 | /* At this point we've found a zero in the first partial word of | ||
375 | the source. We need to isolate the valid source data and mask | ||
376 | it into the original destination data. (Incidentally, we know | ||
377 | that we'll need at least one byte of that original dest word.) */ | ||
378 | ldq_u t0, 0(a0) # L : | ||
379 | negq t8, t6 # E : build bitmask of bytes <= zero | ||
380 | mskqh t1, t4, t1 # U : | ||
381 | |||
382 | and t6, t8, t2 # E : | ||
383 | subq t2, 1, t6 # E : (stall) | ||
384 | or t6, t2, t8 # E : (stall) | ||
385 | zapnot t12, t8, t12 # U : prepare source word; mirror changes (stall) | ||
386 | |||
387 | zapnot t1, t8, t1 # U : to source validity mask | ||
388 | andnot t0, t12, t0 # E : zero place for source to reside | ||
389 | or t0, t1, t0 # E : and put it there (stall both t0, t1) | ||
390 | stq_u t0, 0(a0) # L : (stall) | ||
391 | |||
392 | ret (t9) # L0 : Latency=3 | ||
393 | nop | ||
394 | nop | ||
395 | nop | ||
396 | |||
397 | .end __stxncpy | ||
diff --git a/arch/alpha/lib/ev67-strcat.S b/arch/alpha/lib/ev67-strcat.S new file mode 100644 index 000000000000..c426fe3ed72f --- /dev/null +++ b/arch/alpha/lib/ev67-strcat.S | |||
@@ -0,0 +1,54 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/ev67-strcat.S | ||
3 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
4 | * | ||
5 | * Append a null-terminated string from SRC to DST. | ||
6 | * | ||
7 | * Much of the information about 21264 scheduling/coding comes from: | ||
8 | * Compiler Writer's Guide for the Alpha 21264 | ||
9 | * abbreviated as 'CWG' in other comments here | ||
10 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
11 | * Scheduling notation: | ||
12 | * E - either cluster | ||
13 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
14 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
15 | * Try not to change the actual algorithm if possible for consistency. | ||
16 | * Commentary: It seems bogus to walk the input string twice - once | ||
17 | * to determine the length, and then again while doing the copy. | ||
18 | * A significant (future) enhancement would be to only read the input | ||
19 | * string once. | ||
20 | */ | ||
21 | |||
22 | |||
23 | .text | ||
24 | |||
25 | .align 4 | ||
26 | .globl strcat | ||
27 | .ent strcat | ||
28 | strcat: | ||
29 | .frame $30, 0, $26 | ||
30 | .prologue 0 | ||
31 | |||
32 | mov $16, $0 # E : set up return value | ||
33 | /* Find the end of the string. */ | ||
34 | ldq_u $1, 0($16) # L : load first quadword (a0 may be misaligned) | ||
35 | lda $2, -1 # E : | ||
36 | insqh $2, $16, $2 # U : | ||
37 | |||
38 | andnot $16, 7, $16 # E : | ||
39 | or $2, $1, $1 # E : | ||
40 | cmpbge $31, $1, $2 # E : bits set iff byte == 0 | ||
41 | bne $2, $found # U : | ||
42 | |||
43 | $loop: ldq $1, 8($16) # L : | ||
44 | addq $16, 8, $16 # E : | ||
45 | cmpbge $31, $1, $2 # E : | ||
46 | beq $2, $loop # U : | ||
47 | |||
48 | $found: cttz $2, $3 # U0 : | ||
49 | addq $16, $3, $16 # E : | ||
50 | /* Now do the append. */ | ||
51 | mov $26, $23 # E : | ||
52 | br __stxcpy # L0 : | ||
53 | |||
54 | .end strcat | ||
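For readers not fluent in Alpha assembly, the routine above has the same two-step shape as a plain C strcat: walk DST to its terminator, then copy SRC (terminator included) starting there. The sketch below is only a portable illustration of that structure; the name sketch_strcat is invented here and is not part of this code.

/* Two-step structure mirrored from the assembly above: find the end of
   DST, then append SRC including its terminator.  Illustration only. */
static char *sketch_strcat(char *dst, const char *src)
{
        char *p = dst;

        while (*p)                        /* find the terminator of DST */
                ++p;
        while ((*p++ = *src++) != '\0')   /* copy SRC, NUL included */
                ;
        return dst;                       /* strcat returns the original DST */
}

In the assembly the copy step is delegated to __stxcpy, which is why the routine only has to locate the terminator before branching.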
diff --git a/arch/alpha/lib/ev67-strchr.S b/arch/alpha/lib/ev67-strchr.S new file mode 100644 index 000000000000..fbb7b4ffade9 --- /dev/null +++ b/arch/alpha/lib/ev67-strchr.S | |||
@@ -0,0 +1,88 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/ev67-strchr.S | ||
3 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
4 | * | ||
5 | * Return the address of a given character within a null-terminated | ||
6 | * string, or null if it is not found. | ||
7 | * | ||
8 | * Much of the information about 21264 scheduling/coding comes from: | ||
9 | * Compiler Writer's Guide for the Alpha 21264 | ||
10 | * abbreviated as 'CWG' in other comments here | ||
11 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
12 | * Scheduling notation: | ||
13 | * E - either cluster | ||
14 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
15 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
16 | * Try not to change the actual algorithm if possible for consistency. | ||
17 | */ | ||
18 | |||
19 | #include <asm/regdef.h> | ||
20 | |||
21 | .set noreorder | ||
22 | .set noat | ||
23 | |||
24 | .align 4 | ||
25 | .globl strchr | ||
26 | .ent strchr | ||
27 | strchr: | ||
28 | .frame sp, 0, ra | ||
29 | .prologue 0 | ||
30 | |||
31 | ldq_u t0, 0(a0) # L : load first quadword Latency=3 | ||
32 | and a1, 0xff, t3 # E : 00000000000000ch | ||
33 | insbl a1, 1, t5 # U : 000000000000ch00 | ||
34 | insbl a1, 7, a2 # U : ch00000000000000 | ||
35 | |||
36 | insbl t3, 6, a3 # U : 00ch000000000000 | ||
37 | or t5, t3, a1 # E : 000000000000chch | ||
38 | andnot a0, 7, v0 # E : align our loop pointer | ||
39 | lda t4, -1 # E : build garbage mask | ||
40 | |||
41 | mskqh t4, a0, t4 # U : only want relevant part of first quad | ||
42 | or a2, a3, a2 # E : chch000000000000 | ||
43 | inswl a1, 2, t5 # E : 00000000chch0000 | ||
44 | inswl a1, 4, a3 # E : 0000chch00000000 | ||
45 | |||
46 | or a1, a2, a1 # E : chch00000000chch | ||
47 | or a3, t5, t5 # E : 0000chchchch0000 | ||
48 | cmpbge zero, t0, t2 # E : bits set iff byte == zero | ||
49 | cmpbge zero, t4, t4 # E : bits set iff byte is garbage | ||
50 | |||
51 | /* This quad is _very_ serialized. Lots of stalling happens */ | ||
52 | or t5, a1, a1 # E : chchchchchchchch | ||
53 | xor t0, a1, t1 # E : make bytes == c zero | ||
54 | cmpbge zero, t1, t3 # E : bits set iff byte == c | ||
55 | or t2, t3, t0 # E : bits set iff char match or zero match | ||
56 | |||
57 | andnot t0, t4, t0 # E : clear garbage bits | ||
58 | cttz t0, a2 # U0 : speculative (in case we get a match) | ||
59 | nop # E : | ||
60 | bne t0, $found # U : | ||
61 | |||
62 | /* | ||
63 | * Yuk. This loop is going to stall like crazy waiting for the | ||
64 | * data to be loaded. Not much can be done about it unless it's | ||
65 | * unrolled multiple times - is that safe to do in kernel space? | ||
66 | * Or would exception handling recovery code do the trick here? | ||
67 | */ | ||
68 | $loop: ldq t0, 8(v0) # L : Latency=3 | ||
69 | addq v0, 8, v0 # E : | ||
70 | xor t0, a1, t1 # E : | ||
71 | cmpbge zero, t0, t2 # E : bits set iff byte == 0 | ||
72 | |||
73 | cmpbge zero, t1, t3 # E : bits set iff byte == c | ||
74 | or t2, t3, t0 # E : | ||
75 | cttz t3, a2 # U0 : speculative (in case we get a match) | ||
76 | beq t0, $loop # U : | ||
77 | |||
78 | $found: negq t0, t1 # E : clear all but least set bit | ||
79 | and t0, t1, t0 # E : | ||
80 | and t0, t3, t1 # E : bit set iff byte was the char | ||
81 | addq v0, a2, v0 # E : Add in the bit number from above | ||
82 | |||
83 | cmoveq t1, $31, v0 # E : Two mapping slots, latency = 2 | ||
84 | nop | ||
85 | nop | ||
86 | ret # L0 : | ||
87 | |||
88 | .end strchr | ||
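The replication comments above (building chchchchchchchch) and the cmpbge tests amount to a word-at-a-time search: replicate the character into every byte, XOR it against each loaded quadword, and test for a zero byte alongside the terminator test. Below is a portable C sketch of that idea, assuming 64-bit words and using the well-known (v - 0x01..01) & ~v & 0x80..80 zero-byte trick in place of cmpbge; the helper names are invented for illustration.

#include <stdint.h>

#define ONES  0x0101010101010101ULL
#define HIGHS 0x8080808080808080ULL

/* Nonzero iff some byte of v is zero. */
static inline uint64_t has_zero_byte(uint64_t v)
{
        return (v - ONES) & ~v & HIGHS;
}

/* Nonzero iff some byte of v equals c. */
static inline uint64_t has_byte(uint64_t v, unsigned char c)
{
        uint64_t pattern = c * ONES;      /* chchchchchchchch */
        return has_zero_byte(v ^ pattern);
}

The assembly then picks out the matching byte with cttz on the combined match/terminator mask, a step the sketch leaves out.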
diff --git a/arch/alpha/lib/ev67-strlen.S b/arch/alpha/lib/ev67-strlen.S new file mode 100644 index 000000000000..503928072523 --- /dev/null +++ b/arch/alpha/lib/ev67-strlen.S | |||
@@ -0,0 +1,49 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/ev67-strlen.S | ||
3 | * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com> | ||
4 | * | ||
5 | * Finds length of a 0-terminated string. Optimized for the | ||
6 | * Alpha architecture: | ||
7 | * | ||
8 | * - memory accessed as aligned quadwords only | ||
9 | * - uses cmpbge to compare 8 bytes in parallel | ||
10 | * | ||
11 | * Much of the information about 21264 scheduling/coding comes from: | ||
12 | * Compiler Writer's Guide for the Alpha 21264 | ||
13 | * abbreviated as 'CWG' in other comments here | ||
14 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
15 | * Scheduling notation: | ||
16 | * E - either cluster | ||
17 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
18 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
19 | */ | ||
20 | |||
21 | .set noreorder | ||
22 | .set noat | ||
23 | |||
24 | .globl strlen | ||
25 | .ent strlen | ||
26 | .align 4 | ||
27 | strlen: | ||
28 | ldq_u $1, 0($16) # L : load first quadword ($16 may be misaligned) | ||
29 | lda $2, -1($31) # E : | ||
30 | insqh $2, $16, $2 # U : | ||
31 | andnot $16, 7, $0 # E : | ||
32 | |||
33 | or $2, $1, $1 # E : | ||
34 | cmpbge $31, $1, $2 # E : $2 <- bitmask: bit i == 1 <==> i-th byte == 0 | ||
35 | nop # E : | ||
36 | bne $2, $found # U : | ||
37 | |||
38 | $loop: ldq $1, 8($0) # L : | ||
39 | addq $0, 8, $0 # E : addr += 8 | ||
40 | cmpbge $31, $1, $2 # E : | ||
41 | beq $2, $loop # U : | ||
42 | |||
43 | $found: | ||
44 | cttz $2, $3 # U0 : | ||
45 | addq $0, $3, $0 # E : | ||
46 | subq $0, $16, $0 # E : | ||
47 | ret $31, ($26) # L0 : | ||
48 | |||
49 | .end strlen | ||
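A rough C equivalent of the loop above, assuming the string may be read a full aligned quadword at a time (true for the kernel routine, which never crosses into an unmapped page, but not guaranteed in portable C), little-endian byte order as on Alpha, and GCC's __builtin_ctzll standing in for cttz. The function name is invented for illustration.

#include <stddef.h>
#include <stdint.h>

static size_t sketch_strlen_aligned(const char *s)
{
        const uint64_t *p = (const uint64_t *)s;

        for (;;) {
                uint64_t v = *p;
                uint64_t zmask = (v - 0x0101010101010101ULL) & ~v &
                                 0x8080808080808080ULL;
                if (zmask)      /* lowest set bit marks the first zero byte */
                        return ((const char *)p - s) +
                               (__builtin_ctzll(zmask) >> 3);
                ++p;
        }
}

The real routine also masks off the garbage bytes before the start of a misaligned string (the lda/insqh/or sequence), which the sketch sidesteps by assuming alignment.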
diff --git a/arch/alpha/lib/ev67-strlen_user.S b/arch/alpha/lib/ev67-strlen_user.S new file mode 100644 index 000000000000..57e0d77b81a6 --- /dev/null +++ b/arch/alpha/lib/ev67-strlen_user.S | |||
@@ -0,0 +1,107 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/ev67-strlen_user.S | ||
3 | * 21264 version contributed by Rick Gorton <rick.gorton@api-networks.com> | ||
4 | * | ||
5 | * Return the length of the string including the NULL terminator | ||
6 | * (strlen+1) or zero if an error occurred. | ||
7 | * | ||
8 | * In places where it is critical to limit the processing time, | ||
9 | * and the data is not trusted, strnlen_user() should be used. | ||
10 | * It will return a value greater than its second argument if | ||
11 | * that limit would be exceeded. This implementation is allowed | ||
12 | * to access memory beyond the limit, but will not cross a page | ||
13 | * boundary when doing so. | ||
14 | * | ||
15 | * Much of the information about 21264 scheduling/coding comes from: | ||
16 | * Compiler Writer's Guide for the Alpha 21264 | ||
17 | * abbreviated as 'CWG' in other comments here | ||
18 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
19 | * Scheduling notation: | ||
20 | * E - either cluster | ||
21 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
22 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
23 | * Try not to change the actual algorithm if possible for consistency. | ||
24 | */ | ||
25 | |||
26 | #include <asm/regdef.h> | ||
27 | |||
28 | |||
29 | /* Allow an exception for an insn; exit if we get one. */ | ||
30 | #define EX(x,y...) \ | ||
31 | 99: x,##y; \ | ||
32 | .section __ex_table,"a"; \ | ||
33 | .long 99b - .; \ | ||
34 | lda v0, $exception-99b(zero); \ | ||
35 | .previous | ||
36 | |||
37 | |||
38 | .set noreorder | ||
39 | .set noat | ||
40 | .text | ||
41 | |||
42 | .globl __strlen_user | ||
43 | .ent __strlen_user | ||
44 | .frame sp, 0, ra | ||
45 | |||
46 | .align 4 | ||
47 | __strlen_user: | ||
48 | ldah a1, 32767(zero) # do not use plain strlen_user() for strings | ||
49 | # that might be almost 2 GB long; you should | ||
50 | # be using strnlen_user() instead | ||
51 | nop | ||
52 | nop | ||
53 | nop | ||
54 | |||
55 | .globl __strnlen_user | ||
56 | |||
57 | .align 4 | ||
58 | __strnlen_user: | ||
59 | .prologue 0 | ||
60 | EX( ldq_u t0, 0(a0) ) # L : load first quadword (a0 may be misaligned) | ||
61 | lda t1, -1(zero) # E : | ||
62 | |||
63 | insqh t1, a0, t1 # U : | ||
64 | andnot a0, 7, v0 # E : | ||
65 | or t1, t0, t0 # E : | ||
66 | subq a0, 1, a0 # E : get our +1 for the return | ||
67 | |||
68 | cmpbge zero, t0, t1 # E : t1 <- bitmask: bit i == 1 <==> i-th byte == 0 | ||
69 | subq a1, 7, t2 # E : | ||
70 | subq a0, v0, t0 # E : | ||
71 | bne t1, $found # U : | ||
72 | |||
73 | addq t2, t0, t2 # E : | ||
74 | addq a1, 1, a1 # E : | ||
75 | nop # E : | ||
76 | nop # E : | ||
77 | |||
78 | .align 4 | ||
79 | $loop: ble t2, $limit # U : | ||
80 | EX( ldq t0, 8(v0) ) # L : | ||
81 | nop # E : | ||
82 | nop # E : | ||
83 | |||
84 | cmpbge zero, t0, t1 # E : | ||
85 | subq t2, 8, t2 # E : | ||
86 | addq v0, 8, v0 # E : addr += 8 | ||
87 | beq t1, $loop # U : | ||
88 | |||
89 | $found: cttz t1, t2 # U0 : | ||
90 | addq v0, t2, v0 # E : | ||
91 | subq v0, a0, v0 # E : | ||
92 | ret # L0 : | ||
93 | |||
94 | $exception: | ||
95 | nop | ||
96 | nop | ||
97 | nop | ||
98 | ret | ||
99 | |||
100 | .align 4 # currently redundant | ||
101 | $limit: | ||
102 | nop | ||
103 | nop | ||
104 | subq a1, t2, v0 | ||
105 | ret | ||
106 | |||
107 | .end __strlen_user | ||
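The return-value contract described in the header (length including the terminator, or a value larger than the bound when the limit would be exceeded) can be summarized with a small self-contained model. This is a plain C stand-in with no user-space access or fault handling, and the name is invented for illustration; the fault case (return 0) has no analogue here since the model never touches user memory.

static long sketch_strnlen(const char *s, long bound)
{
        long i;

        for (i = 0; i < bound; i++)
                if (s[i] == '\0')
                        return i + 1;   /* length including the terminator */
        return bound + 1;               /* no terminator within bound bytes */
}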
diff --git a/arch/alpha/lib/ev67-strncat.S b/arch/alpha/lib/ev67-strncat.S new file mode 100644 index 000000000000..4ae716cd2bfb --- /dev/null +++ b/arch/alpha/lib/ev67-strncat.S | |||
@@ -0,0 +1,94 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/ev67-strncat.S | ||
3 | * 21264 version contributed by Rick Gorton <rick.gorton@api-networks.com> | ||
4 | * | ||
5 | * Append no more than COUNT characters from the null-terminated string SRC | ||
6 | * to the null-terminated string DST. Always null-terminate the new DST. | ||
7 | * | ||
8 | * This differs slightly from the semantics in libc in that we never write | ||
9 | * past count, whereas libc may write to count+1. This follows the generic | ||
10 | * implementation in lib/string.c and is, IMHO, more sensible. | ||
11 | * | ||
12 | * Much of the information about 21264 scheduling/coding comes from: | ||
13 | * Compiler Writer's Guide for the Alpha 21264 | ||
14 | * abbreviated as 'CWG' in other comments here | ||
15 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
16 | * Scheduling notation: | ||
17 | * E - either cluster | ||
18 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
19 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
20 | * Try not to change the actual algorithm if possible for consistency. | ||
21 | */ | ||
22 | |||
23 | |||
24 | .text | ||
25 | |||
26 | .align 4 | ||
27 | .globl strncat | ||
28 | .ent strncat | ||
29 | strncat: | ||
30 | .frame $30, 0, $26 | ||
31 | .prologue 0 | ||
32 | |||
33 | mov $16, $0 # set up return value | ||
34 | beq $18, $zerocount # U : | ||
35 | /* Find the end of the string. */ | ||
36 | ldq_u $1, 0($16) # L : load first quadword ($16 may be misaligned) | ||
37 | lda $2, -1($31) # E : | ||
38 | |||
39 | insqh $2, $0, $2 # U : | ||
40 | andnot $16, 7, $16 # E : | ||
41 | nop # E : | ||
42 | or $2, $1, $1 # E : | ||
43 | |||
44 | nop # E : | ||
45 | nop # E : | ||
46 | cmpbge $31, $1, $2 # E : bits set iff byte == 0 | ||
47 | bne $2, $found # U : | ||
48 | |||
49 | $loop: ldq $1, 8($16) # L : | ||
50 | addq $16, 8, $16 # E : | ||
51 | cmpbge $31, $1, $2 # E : | ||
52 | beq $2, $loop # U : | ||
53 | |||
54 | $found: cttz $2, $3 # U0 : | ||
55 | addq $16, $3, $16 # E : | ||
56 | nop # E : | ||
57 | bsr $23, __stxncpy # L0 :/* Now do the append. */ | ||
58 | |||
59 | /* Worry about the null termination. */ | ||
60 | |||
61 | zapnot $1, $27, $2 # U : was last byte a null? | ||
62 | cmplt $27, $24, $5 # E : did we fill the buffer completely? | ||
63 | bne $2, 0f # U : | ||
64 | ret # L0 : | ||
65 | |||
66 | 0: or $5, $18, $2 # E : | ||
67 | nop | ||
68 | bne $2, 2f # U : | ||
69 | and $24, 0x80, $3 # E : no zero next byte | ||
70 | |||
71 | nop # E : | ||
72 | bne $3, 1f # U : | ||
73 | /* Here there are bytes left in the current word. Clear one. */ | ||
74 | addq $24, $24, $24 # E : end-of-count bit <<= 1 | ||
75 | nop # E : | ||
76 | |||
77 | 2: zap $1, $24, $1 # U : | ||
78 | nop # E : | ||
79 | stq_u $1, 0($16) # L : | ||
80 | ret # L0 : | ||
81 | |||
82 | 1: /* Here we must clear the first byte of the next DST word */ | ||
83 | stb $31, 8($16) # L : | ||
84 | nop # E : | ||
85 | nop # E : | ||
86 | ret # L0 : | ||
87 | |||
88 | $zerocount: | ||
89 | nop # E : | ||
90 | nop # E : | ||
91 | nop # E : | ||
92 | ret # L0 : | ||
93 | |||
94 | .end strncat | ||
diff --git a/arch/alpha/lib/ev67-strrchr.S b/arch/alpha/lib/ev67-strrchr.S new file mode 100644 index 000000000000..3fd8bf414c7b --- /dev/null +++ b/arch/alpha/lib/ev67-strrchr.S | |||
@@ -0,0 +1,109 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/ev67-strrchr.S | ||
3 | * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com> | ||
4 | * | ||
5 | * Return the address of the last occurrence of a given character within a | ||
6 | * null-terminated string, or null if not found. Optimized for the Alpha: | ||
7 | * | ||
8 | * - memory accessed as aligned quadwords only | ||
9 | * - uses cmpbge to compare 8 bytes in parallel | ||
10 | * | ||
11 | * Much of the information about 21264 scheduling/coding comes from: | ||
12 | * Compiler Writer's Guide for the Alpha 21264 | ||
13 | * abbreviated as 'CWG' in other comments here | ||
14 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
15 | * Scheduling notation: | ||
16 | * E - either cluster | ||
17 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
18 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
19 | */ | ||
20 | |||
21 | |||
22 | #include <asm/regdef.h> | ||
23 | |||
24 | .set noreorder | ||
25 | .set noat | ||
26 | |||
27 | .align 4 | ||
28 | .ent strrchr | ||
29 | .globl strrchr | ||
30 | strrchr: | ||
31 | .frame sp, 0, ra | ||
32 | .prologue 0 | ||
33 | |||
34 | and a1, 0xff, t2 # E : 00000000000000ch | ||
35 | insbl a1, 1, t4 # U : 000000000000ch00 | ||
36 | insbl a1, 2, t5 # U : 0000000000ch0000 | ||
37 | ldq_u t0, 0(a0) # L : load first quadword Latency=3 | ||
38 | |||
39 | mov zero, t6 # E : t6 is last match aligned addr | ||
40 | or t2, t4, a1 # E : 000000000000chch | ||
41 | sll t5, 8, t3 # U : 00000000ch000000 | ||
42 | mov zero, t8 # E : t8 is last match byte compare mask | ||
43 | |||
44 | andnot a0, 7, v0 # E : align source addr | ||
45 | or t5, t3, t3 # E : 00000000chch0000 | ||
46 | sll a1, 32, t2 # U : 0000chch00000000 | ||
47 | sll a1, 48, t4 # U : chch000000000000 | ||
48 | |||
49 | or t4, a1, a1 # E : chch00000000chch | ||
50 | or t2, t3, t2 # E : 0000chchchch0000 | ||
51 | or a1, t2, a1 # E : chchchchchchchch | ||
52 | lda t5, -1 # E : build garbage mask | ||
53 | |||
54 | cmpbge zero, t0, t1 # E : bits set iff byte == zero | ||
55 | mskqh t5, a0, t4 # E : Complete garbage mask | ||
56 | xor t0, a1, t2 # E : make bytes == c zero | ||
57 | cmpbge zero, t4, t4 # E : bits set iff byte is garbage | ||
58 | |||
59 | cmpbge zero, t2, t3 # E : bits set iff byte == c | ||
60 | andnot t1, t4, t1 # E : clear garbage from null test | ||
61 | andnot t3, t4, t3 # E : clear garbage from char test | ||
62 | bne t1, $eos # U : did we already hit the terminator? | ||
63 | |||
64 | /* Character search main loop */ | ||
65 | $loop: | ||
66 | ldq t0, 8(v0) # L : load next quadword | ||
67 | cmovne t3, v0, t6 # E : save previous comparisons match | ||
68 | nop # : Latency=2, extra map slot (keep nop with cmov) | ||
69 | nop | ||
70 | |||
71 | cmovne t3, t3, t8 # E : Latency=2, extra map slot | ||
72 | nop # : keep with cmovne | ||
73 | addq v0, 8, v0 # E : | ||
74 | xor t0, a1, t2 # E : | ||
75 | |||
76 | cmpbge zero, t0, t1 # E : bits set iff byte == zero | ||
77 | cmpbge zero, t2, t3 # E : bits set iff byte == c | ||
78 | beq t1, $loop # U : if we haven't seen a null, loop | ||
79 | nop | ||
80 | |||
81 | /* Mask out character matches after terminator */ | ||
82 | $eos: | ||
83 | negq t1, t4 # E : isolate first null byte match | ||
84 | and t1, t4, t4 # E : | ||
85 | subq t4, 1, t5 # E : build a mask of the bytes up to... | ||
86 | or t4, t5, t4 # E : ... and including the null | ||
87 | |||
88 | and t3, t4, t3 # E : mask out char matches after null | ||
89 | cmovne t3, t3, t8 # E : save it, if match found Latency=2, extra map slot | ||
90 | nop # : Keep with cmovne | ||
91 | nop | ||
92 | |||
93 | cmovne t3, v0, t6 # E : | ||
94 | nop # : Keep with cmovne | ||
95 | /* Locate the address of the last matched character */ | ||
96 | ctlz t8, t2 # U0 : Latency=3 (0x40 for t8=0) | ||
97 | nop | ||
98 | |||
99 | cmoveq t8, 0x3f, t2 # E : Compensate for case when no match is seen | ||
100 | nop # E : hide the cmov latency (2) behind ctlz latency | ||
101 | lda t5, 0x3f($31) # E : | ||
102 | subq t5, t2, t5 # E : Normalize leading zero count | ||
103 | |||
104 | addq t6, t5, v0 # E : and add to quadword address | ||
105 | ret # L0 : Latency=3 | ||
106 | nop | ||
107 | nop | ||
108 | |||
109 | .end strrchr | ||
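Stripped of the quadword masks and the cmov bookkeeping, the algorithm above is the usual single forward scan that remembers the last match until the terminator is reached. A byte-at-a-time C sketch follows; the function name is invented for illustration.

static char *sketch_strrchr(const char *s, int c)
{
        const char *last = 0;
        char ch = (char)c;

        for (;; s++) {
                if (*s == ch)
                        last = s;
                if (*s == '\0')
                        return (char *)last;    /* null if never seen */
        }
}

The assembly keeps the same state a quadword at a time: t6 holds the address of the last quadword containing a match, t8 its match mask, and ctlz then selects the last matching byte inside that quadword.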
diff --git a/arch/alpha/lib/fpreg.c b/arch/alpha/lib/fpreg.c new file mode 100644 index 000000000000..97c4d9d7a4d5 --- /dev/null +++ b/arch/alpha/lib/fpreg.c | |||
@@ -0,0 +1,193 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/fpreg.c | ||
3 | * | ||
4 | * (C) Copyright 1998 Linus Torvalds | ||
5 | */ | ||
6 | |||
7 | #if defined(__alpha_cix__) || defined(__alpha_fix__) | ||
8 | #define STT(reg,val) asm volatile ("ftoit $f"#reg",%0" : "=r"(val)); | ||
9 | #else | ||
10 | #define STT(reg,val) asm volatile ("stt $f"#reg",%0" : "=m"(val)); | ||
11 | #endif | ||
12 | |||
13 | unsigned long | ||
14 | alpha_read_fp_reg (unsigned long reg) | ||
15 | { | ||
16 | unsigned long val; | ||
17 | |||
18 | switch (reg) { | ||
19 | case 0: STT( 0, val); break; | ||
20 | case 1: STT( 1, val); break; | ||
21 | case 2: STT( 2, val); break; | ||
22 | case 3: STT( 3, val); break; | ||
23 | case 4: STT( 4, val); break; | ||
24 | case 5: STT( 5, val); break; | ||
25 | case 6: STT( 6, val); break; | ||
26 | case 7: STT( 7, val); break; | ||
27 | case 8: STT( 8, val); break; | ||
28 | case 9: STT( 9, val); break; | ||
29 | case 10: STT(10, val); break; | ||
30 | case 11: STT(11, val); break; | ||
31 | case 12: STT(12, val); break; | ||
32 | case 13: STT(13, val); break; | ||
33 | case 14: STT(14, val); break; | ||
34 | case 15: STT(15, val); break; | ||
35 | case 16: STT(16, val); break; | ||
36 | case 17: STT(17, val); break; | ||
37 | case 18: STT(18, val); break; | ||
38 | case 19: STT(19, val); break; | ||
39 | case 20: STT(20, val); break; | ||
40 | case 21: STT(21, val); break; | ||
41 | case 22: STT(22, val); break; | ||
42 | case 23: STT(23, val); break; | ||
43 | case 24: STT(24, val); break; | ||
44 | case 25: STT(25, val); break; | ||
45 | case 26: STT(26, val); break; | ||
46 | case 27: STT(27, val); break; | ||
47 | case 28: STT(28, val); break; | ||
48 | case 29: STT(29, val); break; | ||
49 | case 30: STT(30, val); break; | ||
50 | case 31: STT(31, val); break; | ||
51 | default: return 0; | ||
52 | } | ||
53 | return val; | ||
54 | } | ||
55 | |||
56 | #if defined(__alpha_cix__) || defined(__alpha_fix__) | ||
57 | #define LDT(reg,val) asm volatile ("itoft %0,$f"#reg : : "r"(val)); | ||
58 | #else | ||
59 | #define LDT(reg,val) asm volatile ("ldt $f"#reg",%0" : : "m"(val)); | ||
60 | #endif | ||
61 | |||
62 | void | ||
63 | alpha_write_fp_reg (unsigned long reg, unsigned long val) | ||
64 | { | ||
65 | switch (reg) { | ||
66 | case 0: LDT( 0, val); break; | ||
67 | case 1: LDT( 1, val); break; | ||
68 | case 2: LDT( 2, val); break; | ||
69 | case 3: LDT( 3, val); break; | ||
70 | case 4: LDT( 4, val); break; | ||
71 | case 5: LDT( 5, val); break; | ||
72 | case 6: LDT( 6, val); break; | ||
73 | case 7: LDT( 7, val); break; | ||
74 | case 8: LDT( 8, val); break; | ||
75 | case 9: LDT( 9, val); break; | ||
76 | case 10: LDT(10, val); break; | ||
77 | case 11: LDT(11, val); break; | ||
78 | case 12: LDT(12, val); break; | ||
79 | case 13: LDT(13, val); break; | ||
80 | case 14: LDT(14, val); break; | ||
81 | case 15: LDT(15, val); break; | ||
82 | case 16: LDT(16, val); break; | ||
83 | case 17: LDT(17, val); break; | ||
84 | case 18: LDT(18, val); break; | ||
85 | case 19: LDT(19, val); break; | ||
86 | case 20: LDT(20, val); break; | ||
87 | case 21: LDT(21, val); break; | ||
88 | case 22: LDT(22, val); break; | ||
89 | case 23: LDT(23, val); break; | ||
90 | case 24: LDT(24, val); break; | ||
91 | case 25: LDT(25, val); break; | ||
92 | case 26: LDT(26, val); break; | ||
93 | case 27: LDT(27, val); break; | ||
94 | case 28: LDT(28, val); break; | ||
95 | case 29: LDT(29, val); break; | ||
96 | case 30: LDT(30, val); break; | ||
97 | case 31: LDT(31, val); break; | ||
98 | } | ||
99 | } | ||
100 | |||
101 | #if defined(__alpha_cix__) || defined(__alpha_fix__) | ||
102 | #define STS(reg,val) asm volatile ("ftois $f"#reg",%0" : "=r"(val)); | ||
103 | #else | ||
104 | #define STS(reg,val) asm volatile ("sts $f"#reg",%0" : "=m"(val)); | ||
105 | #endif | ||
106 | |||
107 | unsigned long | ||
108 | alpha_read_fp_reg_s (unsigned long reg) | ||
109 | { | ||
110 | unsigned long val; | ||
111 | |||
112 | switch (reg) { | ||
113 | case 0: STS( 0, val); break; | ||
114 | case 1: STS( 1, val); break; | ||
115 | case 2: STS( 2, val); break; | ||
116 | case 3: STS( 3, val); break; | ||
117 | case 4: STS( 4, val); break; | ||
118 | case 5: STS( 5, val); break; | ||
119 | case 6: STS( 6, val); break; | ||
120 | case 7: STS( 7, val); break; | ||
121 | case 8: STS( 8, val); break; | ||
122 | case 9: STS( 9, val); break; | ||
123 | case 10: STS(10, val); break; | ||
124 | case 11: STS(11, val); break; | ||
125 | case 12: STS(12, val); break; | ||
126 | case 13: STS(13, val); break; | ||
127 | case 14: STS(14, val); break; | ||
128 | case 15: STS(15, val); break; | ||
129 | case 16: STS(16, val); break; | ||
130 | case 17: STS(17, val); break; | ||
131 | case 18: STS(18, val); break; | ||
132 | case 19: STS(19, val); break; | ||
133 | case 20: STS(20, val); break; | ||
134 | case 21: STS(21, val); break; | ||
135 | case 22: STS(22, val); break; | ||
136 | case 23: STS(23, val); break; | ||
137 | case 24: STS(24, val); break; | ||
138 | case 25: STS(25, val); break; | ||
139 | case 26: STS(26, val); break; | ||
140 | case 27: STS(27, val); break; | ||
141 | case 28: STS(28, val); break; | ||
142 | case 29: STS(29, val); break; | ||
143 | case 30: STS(30, val); break; | ||
144 | case 31: STS(31, val); break; | ||
145 | default: return 0; | ||
146 | } | ||
147 | return val; | ||
148 | } | ||
149 | |||
150 | #if defined(__alpha_cix__) || defined(__alpha_fix__) | ||
151 | #define LDS(reg,val) asm volatile ("itofs %0,$f"#reg : : "r"(val)); | ||
152 | #else | ||
153 | #define LDS(reg,val) asm volatile ("lds $f"#reg",%0" : : "m"(val)); | ||
154 | #endif | ||
155 | |||
156 | void | ||
157 | alpha_write_fp_reg_s (unsigned long reg, unsigned long val) | ||
158 | { | ||
159 | switch (reg) { | ||
160 | case 0: LDS( 0, val); break; | ||
161 | case 1: LDS( 1, val); break; | ||
162 | case 2: LDS( 2, val); break; | ||
163 | case 3: LDS( 3, val); break; | ||
164 | case 4: LDS( 4, val); break; | ||
165 | case 5: LDS( 5, val); break; | ||
166 | case 6: LDS( 6, val); break; | ||
167 | case 7: LDS( 7, val); break; | ||
168 | case 8: LDS( 8, val); break; | ||
169 | case 9: LDS( 9, val); break; | ||
170 | case 10: LDS(10, val); break; | ||
171 | case 11: LDS(11, val); break; | ||
172 | case 12: LDS(12, val); break; | ||
173 | case 13: LDS(13, val); break; | ||
174 | case 14: LDS(14, val); break; | ||
175 | case 15: LDS(15, val); break; | ||
176 | case 16: LDS(16, val); break; | ||
177 | case 17: LDS(17, val); break; | ||
178 | case 18: LDS(18, val); break; | ||
179 | case 19: LDS(19, val); break; | ||
180 | case 20: LDS(20, val); break; | ||
181 | case 21: LDS(21, val); break; | ||
182 | case 22: LDS(22, val); break; | ||
183 | case 23: LDS(23, val); break; | ||
184 | case 24: LDS(24, val); break; | ||
185 | case 25: LDS(25, val); break; | ||
186 | case 26: LDS(26, val); break; | ||
187 | case 27: LDS(27, val); break; | ||
188 | case 28: LDS(28, val); break; | ||
189 | case 29: LDS(29, val); break; | ||
190 | case 30: LDS(30, val); break; | ||
191 | case 31: LDS(31, val); break; | ||
192 | } | ||
193 | } | ||
diff --git a/arch/alpha/lib/memchr.S b/arch/alpha/lib/memchr.S new file mode 100644 index 000000000000..14427eeb555e --- /dev/null +++ b/arch/alpha/lib/memchr.S | |||
@@ -0,0 +1,164 @@ | |||
1 | /* Copyright (C) 1996 Free Software Foundation, Inc. | ||
2 | This file is part of the GNU C Library. | ||
3 | Contributed by David Mosberger (davidm@cs.arizona.edu). | ||
4 | |||
5 | The GNU C Library is free software; you can redistribute it and/or | ||
6 | modify it under the terms of the GNU Library General Public License as | ||
7 | published by the Free Software Foundation; either version 2 of the | ||
8 | License, or (at your option) any later version. | ||
9 | |||
10 | The GNU C Library is distributed in the hope that it will be useful, | ||
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
13 | Library General Public License for more details. | ||
14 | |||
15 | You should have received a copy of the GNU Library General Public | ||
16 | License along with the GNU C Library; see the file COPYING.LIB. If not, | ||
17 | write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
18 | Boston, MA 02111-1307, USA. */ | ||
19 | |||
20 | /* Finds characters in a memory area. Optimized for the Alpha: | ||
21 | |||
22 | - memory accessed as aligned quadwords only | ||
23 | - uses cmpbge to compare 8 bytes in parallel | ||
24 | - does binary search to find 0 byte in last | ||
25 | quadword (HAKMEM needed 12 instructions to | ||
26 | do this instead of the 9 instructions that | ||
27 | binary search needs). | ||
28 | |||
29 | For correctness consider that: | ||
30 | |||
31 | - only the minimum number of quadwords may be accessed | ||
32 | - the third argument is an unsigned long | ||
33 | */ | ||
34 | |||
35 | .set noreorder | ||
36 | .set noat | ||
37 | |||
38 | .globl memchr | ||
39 | .ent memchr | ||
40 | memchr: | ||
41 | .frame $30,0,$26,0 | ||
42 | .prologue 0 | ||
43 | |||
44 | # Hack -- if someone passes in (size_t)-1, hoping to just | ||
45 | # search til the end of the address space, we will overflow | ||
46 | # below when we find the address of the last byte. Given | ||
47 | # that we will never have a 56-bit address space, cropping | ||
48 | # the length is the easiest way to avoid trouble. | ||
49 | zap $18, 0x80, $5 #-e0 : | ||
50 | |||
51 | beq $18, $not_found # .. e1 : | ||
52 | ldq_u $1, 0($16) # e1 : load first quadword | ||
53 | insbl $17, 1, $2 # .. e0 : $2 = 000000000000ch00 | ||
54 | and $17, 0xff, $17 #-e0 : $17 = 00000000000000ch | ||
55 | cmpult $18, 9, $4 # .. e1 : | ||
56 | or $2, $17, $17 # e0 : $17 = 000000000000chch | ||
57 | lda $3, -1($31) # .. e1 : | ||
58 | sll $17, 16, $2 #-e0 : $2 = 00000000chch0000 | ||
59 | addq $16, $5, $5 # .. e1 : | ||
60 | or $2, $17, $17 # e1 : $17 = 00000000chchchch | ||
61 | unop # : | ||
62 | sll $17, 32, $2 #-e0 : $2 = chchchch00000000 | ||
63 | or $2, $17, $17 # e1 : $17 = chchchchchchchch | ||
64 | extql $1, $16, $7 # e0 : | ||
65 | beq $4, $first_quad # .. e1 : | ||
66 | |||
67 | ldq_u $6, -1($5) #-e1 : eight or less bytes to search | ||
68 | extqh $6, $16, $6 # .. e0 : | ||
69 | mov $16, $0 # e0 : | ||
70 | or $7, $6, $1 # .. e1 : $1 = quadword starting at $16 | ||
71 | |||
72 | # Deal with the case where at most 8 bytes remain to be searched | ||
73 | # in $1. E.g.: | ||
74 | # $18 = 6 | ||
75 | # $1 = ????c6c5c4c3c2c1 | ||
76 | $last_quad: | ||
77 | negq $18, $6 #-e0 : | ||
78 | xor $17, $1, $1 # .. e1 : | ||
79 | srl $3, $6, $6 # e0 : $6 = mask of $18 bits set | ||
80 | cmpbge $31, $1, $2 # .. e1 : | ||
81 | and $2, $6, $2 #-e0 : | ||
82 | beq $2, $not_found # .. e1 : | ||
83 | |||
84 | $found_it: | ||
85 | # Now, determine which byte matched: | ||
86 | negq $2, $3 # e0 : | ||
87 | and $2, $3, $2 # e1 : | ||
88 | |||
89 | and $2, 0x0f, $1 #-e0 : | ||
90 | addq $0, 4, $3 # .. e1 : | ||
91 | cmoveq $1, $3, $0 # e0 : | ||
92 | |||
93 | addq $0, 2, $3 # .. e1 : | ||
94 | and $2, 0x33, $1 #-e0 : | ||
95 | cmoveq $1, $3, $0 # .. e1 : | ||
96 | |||
97 | and $2, 0x55, $1 # e0 : | ||
98 | addq $0, 1, $3 # .. e1 : | ||
99 | cmoveq $1, $3, $0 #-e0 : | ||
100 | |||
101 | $done: ret # .. e1 : | ||
102 | |||
103 | # Deal with the case where $18 > 8 bytes remain to be | ||
104 | # searched. $16 may not be aligned. | ||
105 | .align 4 | ||
106 | $first_quad: | ||
107 | andnot $16, 0x7, $0 #-e1 : | ||
108 | insqh $3, $16, $2 # .. e0 : $2 = 0000ffffffffffff ($16<0:2> ff) | ||
109 | xor $1, $17, $1 # e0 : | ||
110 | or $1, $2, $1 # e1 : $1 = ====ffffffffffff | ||
111 | cmpbge $31, $1, $2 #-e0 : | ||
112 | bne $2, $found_it # .. e1 : | ||
113 | |||
114 | # At least one byte left to process. | ||
115 | |||
116 | ldq $1, 8($0) # e0 : | ||
117 | subq $5, 1, $18 # .. e1 : | ||
118 | addq $0, 8, $0 #-e0 : | ||
119 | |||
120 | # Make $18 point to last quad to be accessed (the | ||
121 | # last quad may or may not be partial). | ||
122 | |||
123 | andnot $18, 0x7, $18 # .. e1 : | ||
124 | cmpult $0, $18, $2 # e0 : | ||
125 | beq $2, $final # .. e1 : | ||
126 | |||
127 | # At least two quads remain to be accessed. | ||
128 | |||
129 | subq $18, $0, $4 #-e0 : $4 <- nr quads to be processed | ||
130 | and $4, 8, $4 # e1 : odd number of quads? | ||
131 | bne $4, $odd_quad_count # e1 : | ||
132 | |||
133 | # At least three quads remain to be accessed | ||
134 | |||
135 | mov $1, $4 # e0 : move prefetched value to correct reg | ||
136 | |||
137 | .align 4 | ||
138 | $unrolled_loop: | ||
139 | ldq $1, 8($0) #-e0 : prefetch $1 | ||
140 | xor $17, $4, $2 # .. e1 : | ||
141 | cmpbge $31, $2, $2 # e0 : | ||
142 | bne $2, $found_it # .. e1 : | ||
143 | |||
144 | addq $0, 8, $0 #-e0 : | ||
145 | $odd_quad_count: | ||
146 | xor $17, $1, $2 # .. e1 : | ||
147 | ldq $4, 8($0) # e0 : prefetch $4 | ||
148 | cmpbge $31, $2, $2 # .. e1 : | ||
149 | addq $0, 8, $6 #-e0 : | ||
150 | bne $2, $found_it # .. e1 : | ||
151 | |||
152 | cmpult $6, $18, $6 # e0 : | ||
153 | addq $0, 8, $0 # .. e1 : | ||
154 | bne $6, $unrolled_loop #-e1 : | ||
155 | |||
156 | mov $4, $1 # e0 : move prefetched value into $1 | ||
157 | $final: subq $5, $0, $18 # .. e1 : $18 <- number of bytes left to do | ||
158 | bne $18, $last_quad # e1 : | ||
159 | |||
160 | $not_found: | ||
161 | mov $31, $0 #-e0 : | ||
162 | ret # .. e1 : | ||
163 | |||
164 | .end memchr | ||
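The $found_it sequence above locates the matching byte inside the cmpbge result without a loop: isolate the lowest set bit, then decide its position with three masked tests (the binary search mentioned in the header comment). A portable C sketch of that step, assuming the mask is nonzero; the function name is invented for illustration.

#include <stdint.h>

static unsigned lowest_set_byte(uint8_t mask)
{
        unsigned idx = 0;

        mask &= (uint8_t)-mask;         /* isolate the lowest set bit */
        if (mask & 0xf0)                /* bit lives in positions 4-7 */
                idx += 4;
        if (mask & 0xcc)                /* bit lives in positions 2-3 or 6-7 */
                idx += 2;
        if (mask & 0xaa)                /* bit lives at an odd position */
                idx += 1;
        return idx;
}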
diff --git a/arch/alpha/lib/memcpy.c b/arch/alpha/lib/memcpy.c new file mode 100644 index 000000000000..64083fc73238 --- /dev/null +++ b/arch/alpha/lib/memcpy.c | |||
@@ -0,0 +1,163 @@ | |||
1 | /* | ||
2 | * linux/arch/alpha/lib/memcpy.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | */ | ||
6 | |||
7 | /* | ||
8 | * This is a reasonably optimized memcpy() routine. | ||
9 | */ | ||
10 | |||
11 | /* | ||
12 | * Note that the C code is written to be optimized into good assembly. However, | ||
13 | * at this point gcc is unable to sanely compile "if (n >= 0)", resulting in an | ||
14 | * explicit compare against 0 (instead of just using the proper "blt reg, xx" or | ||
15 | * "bge reg, xx"). I hope alpha-gcc will be fixed to notice this eventually.. | ||
16 | */ | ||
17 | |||
18 | #include <linux/types.h> | ||
19 | |||
20 | /* | ||
21 | * This should be done in one go with ldq_u*2/mask/stq_u. Do it | ||
22 | * with a macro so that we can fix it up later.. | ||
23 | */ | ||
24 | #define ALIGN_DEST_TO8_UP(d,s,n) \ | ||
25 | while (d & 7) { \ | ||
26 | if (n <= 0) return; \ | ||
27 | n--; \ | ||
28 | *(char *) d = *(char *) s; \ | ||
29 | d++; s++; \ | ||
30 | } | ||
31 | #define ALIGN_DEST_TO8_DN(d,s,n) \ | ||
32 | while (d & 7) { \ | ||
33 | if (n <= 0) return; \ | ||
34 | n--; \ | ||
35 | d--; s--; \ | ||
36 | *(char *) d = *(char *) s; \ | ||
37 | } | ||
38 | |||
39 | /* | ||
40 | * This should similarly be done with ldq_u*2/mask/stq. The destination | ||
41 | * is aligned, but we don't fill in a full quad-word | ||
42 | */ | ||
43 | #define DO_REST_UP(d,s,n) \ | ||
44 | while (n > 0) { \ | ||
45 | n--; \ | ||
46 | *(char *) d = *(char *) s; \ | ||
47 | d++; s++; \ | ||
48 | } | ||
49 | #define DO_REST_DN(d,s,n) \ | ||
50 | while (n > 0) { \ | ||
51 | n--; \ | ||
52 | d--; s--; \ | ||
53 | *(char *) d = *(char *) s; \ | ||
54 | } | ||
55 | |||
56 | /* | ||
57 | * This should be done with ldq/mask/stq. The source and destination are | ||
58 | * aligned, but we don't fill in a full quad-word | ||
59 | */ | ||
60 | #define DO_REST_ALIGNED_UP(d,s,n) DO_REST_UP(d,s,n) | ||
61 | #define DO_REST_ALIGNED_DN(d,s,n) DO_REST_DN(d,s,n) | ||
62 | |||
63 | /* | ||
64 | * This does unaligned memory copies. We want to avoid storing to | ||
65 | * an unaligned address, as that would do a read-modify-write cycle. | ||
66 | * We also want to avoid double-reading the unaligned reads. | ||
67 | * | ||
68 | * Note the ordering to try to avoid load (and address generation) latencies. | ||
69 | */ | ||
70 | static inline void __memcpy_unaligned_up (unsigned long d, unsigned long s, | ||
71 | long n) | ||
72 | { | ||
73 | ALIGN_DEST_TO8_UP(d,s,n); | ||
74 | n -= 8; /* to avoid compare against 8 in the loop */ | ||
75 | if (n >= 0) { | ||
76 | unsigned long low_word, high_word; | ||
77 | __asm__("ldq_u %0,%1":"=r" (low_word):"m" (*(unsigned long *) s)); | ||
78 | do { | ||
79 | unsigned long tmp; | ||
80 | __asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long *)(s+8))); | ||
81 | n -= 8; | ||
82 | __asm__("extql %1,%2,%0" | ||
83 | :"=r" (low_word) | ||
84 | :"r" (low_word), "r" (s)); | ||
85 | __asm__("extqh %1,%2,%0" | ||
86 | :"=r" (tmp) | ||
87 | :"r" (high_word), "r" (s)); | ||
88 | s += 8; | ||
89 | *(unsigned long *) d = low_word | tmp; | ||
90 | d += 8; | ||
91 | low_word = high_word; | ||
92 | } while (n >= 0); | ||
93 | } | ||
94 | n += 8; | ||
95 | DO_REST_UP(d,s,n); | ||
96 | } | ||
97 | |||
98 | static inline void __memcpy_unaligned_dn (unsigned long d, unsigned long s, | ||
99 | long n) | ||
100 | { | ||
101 | /* I don't understand AXP assembler well enough for this. -Tim */ | ||
102 | s += n; | ||
103 | d += n; | ||
104 | while (n--) | ||
105 | * (char *) --d = * (char *) --s; | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * Hmm.. Strange. The __asm__ here is there to make gcc use an integer register | ||
110 | * for the load-store. I don't know why, but it would seem that using a floating | ||
111 | * point register for the move slows things down (very small difference, | ||
112 | * though). | ||
113 | * | ||
114 | * Note the ordering to try to avoid load (and address generation) latencies. | ||
115 | */ | ||
116 | static inline void __memcpy_aligned_up (unsigned long d, unsigned long s, | ||
117 | long n) | ||
118 | { | ||
119 | ALIGN_DEST_TO8_UP(d,s,n); | ||
120 | n -= 8; | ||
121 | while (n >= 0) { | ||
122 | unsigned long tmp; | ||
123 | __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s)); | ||
124 | n -= 8; | ||
125 | s += 8; | ||
126 | *(unsigned long *) d = tmp; | ||
127 | d += 8; | ||
128 | } | ||
129 | n += 8; | ||
130 | DO_REST_ALIGNED_UP(d,s,n); | ||
131 | } | ||
132 | static inline void __memcpy_aligned_dn (unsigned long d, unsigned long s, | ||
133 | long n) | ||
134 | { | ||
135 | s += n; | ||
136 | d += n; | ||
137 | ALIGN_DEST_TO8_DN(d,s,n); | ||
138 | n -= 8; | ||
139 | while (n >= 0) { | ||
140 | unsigned long tmp; | ||
141 | s -= 8; | ||
142 | __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s)); | ||
143 | n -= 8; | ||
144 | d -= 8; | ||
145 | *(unsigned long *) d = tmp; | ||
146 | } | ||
147 | n += 8; | ||
148 | DO_REST_ALIGNED_DN(d,s,n); | ||
149 | } | ||
150 | |||
151 | void * memcpy(void * dest, const void *src, size_t n) | ||
152 | { | ||
153 | if (!(((unsigned long) dest ^ (unsigned long) src) & 7)) { | ||
154 | __memcpy_aligned_up ((unsigned long) dest, (unsigned long) src, | ||
155 | n); | ||
156 | return dest; | ||
157 | } | ||
158 | __memcpy_unaligned_up ((unsigned long) dest, (unsigned long) src, n); | ||
159 | return dest; | ||
160 | } | ||
161 | |||
162 | /* For backward modules compatibility, define __memcpy. */ | ||
163 | asm("__memcpy = memcpy; .globl __memcpy"); | ||
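The unaligned path above merges two aligned quadword loads with extql/extqh so that every store is aligned and no byte is loaded twice. In portable terms the merge is a pair of shifts; the sketch below assumes a little-endian machine (as the Alpha is) and special-cases offset zero, which the real extqh handles for free. The function name is invented for illustration.

#include <stdint.h>

static uint64_t merge_unaligned(uint64_t lo_word, uint64_t hi_word,
                                unsigned offset /* source address & 7 */)
{
        if (offset == 0)
                return lo_word;         /* source already aligned */
        return (lo_word >> (8 * offset)) |
               (hi_word << (64 - 8 * offset));
}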
diff --git a/arch/alpha/lib/memmove.S b/arch/alpha/lib/memmove.S new file mode 100644 index 000000000000..eb3b6e02242f --- /dev/null +++ b/arch/alpha/lib/memmove.S | |||
@@ -0,0 +1,181 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/memmove.S | ||
3 | * | ||
4 | * Barely optimized memmove routine for Alpha EV5. | ||
5 | * | ||
6 | * This is hand-massaged output from the original memcpy.c. We defer to | ||
7 | * memcpy whenever possible; the backwards copy loops are not unrolled. | ||
8 | */ | ||
9 | |||
10 | .set noat | ||
11 | .set noreorder | ||
12 | .text | ||
13 | |||
14 | .align 4 | ||
15 | .globl memmove | ||
16 | .ent memmove | ||
17 | memmove: | ||
18 | ldgp $29, 0($27) | ||
19 | unop | ||
20 | nop | ||
21 | .prologue 1 | ||
22 | |||
23 | addq $16,$18,$4 | ||
24 | addq $17,$18,$5 | ||
25 | cmpule $4,$17,$1 /* dest + n <= src */ | ||
26 | cmpule $5,$16,$2 /* dest >= src + n */ | ||
27 | |||
28 | bis $1,$2,$1 | ||
29 | mov $16,$0 | ||
30 | xor $16,$17,$2 | ||
31 | bne $1,memcpy !samegp | ||
32 | |||
33 | and $2,7,$2 /* Test for src/dest co-alignment. */ | ||
34 | and $16,7,$1 | ||
35 | cmpule $16,$17,$3 | ||
36 | bne $3,$memmove_up /* dest < src */ | ||
37 | |||
38 | and $4,7,$1 | ||
39 | bne $2,$misaligned_dn | ||
40 | unop | ||
41 | beq $1,$skip_aligned_byte_loop_head_dn | ||
42 | |||
43 | $aligned_byte_loop_head_dn: | ||
44 | lda $4,-1($4) | ||
45 | lda $5,-1($5) | ||
46 | unop | ||
47 | ble $18,$egress | ||
48 | |||
49 | ldq_u $3,0($5) | ||
50 | ldq_u $2,0($4) | ||
51 | lda $18,-1($18) | ||
52 | extbl $3,$5,$1 | ||
53 | |||
54 | insbl $1,$4,$1 | ||
55 | mskbl $2,$4,$2 | ||
56 | bis $1,$2,$1 | ||
57 | and $4,7,$6 | ||
58 | |||
59 | stq_u $1,0($4) | ||
60 | bne $6,$aligned_byte_loop_head_dn | ||
61 | |||
62 | $skip_aligned_byte_loop_head_dn: | ||
63 | lda $18,-8($18) | ||
64 | blt $18,$skip_aligned_word_loop_dn | ||
65 | |||
66 | $aligned_word_loop_dn: | ||
67 | ldq $1,-8($5) | ||
68 | nop | ||
69 | lda $5,-8($5) | ||
70 | lda $18,-8($18) | ||
71 | |||
72 | stq $1,-8($4) | ||
73 | nop | ||
74 | lda $4,-8($4) | ||
75 | bge $18,$aligned_word_loop_dn | ||
76 | |||
77 | $skip_aligned_word_loop_dn: | ||
78 | lda $18,8($18) | ||
79 | bgt $18,$byte_loop_tail_dn | ||
80 | unop | ||
81 | ret $31,($26),1 | ||
82 | |||
83 | .align 4 | ||
84 | $misaligned_dn: | ||
85 | nop | ||
86 | fnop | ||
87 | unop | ||
88 | beq $18,$egress | ||
89 | |||
90 | $byte_loop_tail_dn: | ||
91 | ldq_u $3,-1($5) | ||
92 | ldq_u $2,-1($4) | ||
93 | lda $5,-1($5) | ||
94 | lda $4,-1($4) | ||
95 | |||
96 | lda $18,-1($18) | ||
97 | extbl $3,$5,$1 | ||
98 | insbl $1,$4,$1 | ||
99 | mskbl $2,$4,$2 | ||
100 | |||
101 | bis $1,$2,$1 | ||
102 | stq_u $1,0($4) | ||
103 | bgt $18,$byte_loop_tail_dn | ||
104 | br $egress | ||
105 | |||
106 | $memmove_up: | ||
107 | mov $16,$4 | ||
108 | mov $17,$5 | ||
109 | bne $2,$misaligned_up | ||
110 | beq $1,$skip_aligned_byte_loop_head_up | ||
111 | |||
112 | $aligned_byte_loop_head_up: | ||
113 | unop | ||
114 | ble $18,$egress | ||
115 | ldq_u $3,0($5) | ||
116 | ldq_u $2,0($4) | ||
117 | |||
118 | lda $18,-1($18) | ||
119 | extbl $3,$5,$1 | ||
120 | insbl $1,$4,$1 | ||
121 | mskbl $2,$4,$2 | ||
122 | |||
123 | bis $1,$2,$1 | ||
124 | lda $5,1($5) | ||
125 | stq_u $1,0($4) | ||
126 | lda $4,1($4) | ||
127 | |||
128 | and $4,7,$6 | ||
129 | bne $6,$aligned_byte_loop_head_up | ||
130 | |||
131 | $skip_aligned_byte_loop_head_up: | ||
132 | lda $18,-8($18) | ||
133 | blt $18,$skip_aligned_word_loop_up | ||
134 | |||
135 | $aligned_word_loop_up: | ||
136 | ldq $1,0($5) | ||
137 | nop | ||
138 | lda $5,8($5) | ||
139 | lda $18,-8($18) | ||
140 | |||
141 | stq $1,0($4) | ||
142 | nop | ||
143 | lda $4,8($4) | ||
144 | bge $18,$aligned_word_loop_up | ||
145 | |||
146 | $skip_aligned_word_loop_up: | ||
147 | lda $18,8($18) | ||
148 | bgt $18,$byte_loop_tail_up | ||
149 | unop | ||
150 | ret $31,($26),1 | ||
151 | |||
152 | .align 4 | ||
153 | $misaligned_up: | ||
154 | nop | ||
155 | fnop | ||
156 | unop | ||
157 | beq $18,$egress | ||
158 | |||
159 | $byte_loop_tail_up: | ||
160 | ldq_u $3,0($5) | ||
161 | ldq_u $2,0($4) | ||
162 | lda $18,-1($18) | ||
163 | extbl $3,$5,$1 | ||
164 | |||
165 | insbl $1,$4,$1 | ||
166 | mskbl $2,$4,$2 | ||
167 | bis $1,$2,$1 | ||
168 | stq_u $1,0($4) | ||
169 | |||
170 | lda $5,1($5) | ||
171 | lda $4,1($4) | ||
172 | nop | ||
173 | bgt $18,$byte_loop_tail_up | ||
174 | |||
175 | $egress: | ||
176 | ret $31,($26),1 | ||
177 | nop | ||
178 | nop | ||
179 | nop | ||
180 | |||
181 | .end memmove | ||
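The prologue above computes dest+n <= src and src+n <= dest and tail-calls memcpy when either holds, falling back to a forward or backward copy otherwise. Here is a C sketch of that dispatch, using byte loops only and comparing pointers that may belong to unrelated objects, which is fine as an illustration but not strictly portable; the function name is invented.

#include <stddef.h>
#include <string.h>

static void *sketch_memmove(void *dst, const void *src, size_t n)
{
        unsigned char *d = dst;
        const unsigned char *s = src;

        if (d + n <= s || s + n <= d)   /* disjoint: plain memcpy is safe */
                return memcpy(dst, src, n);

        if (d < s) {                    /* dst below src: copy forwards */
                while (n--)
                        *d++ = *s++;
        } else {                        /* dst above src: copy backwards */
                d += n;
                s += n;
                while (n--)
                        *--d = *--s;
        }
        return dst;
}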
diff --git a/arch/alpha/lib/memset.S b/arch/alpha/lib/memset.S new file mode 100644 index 000000000000..8ff6e7e1773e --- /dev/null +++ b/arch/alpha/lib/memset.S | |||
@@ -0,0 +1,124 @@ | |||
1 | /* | ||
2 | * linux/arch/alpha/lib/memset.S | ||
3 | * | ||
4 | * This is an efficient (and small) implementation of the C library "memset()" | ||
5 | * function for the alpha. | ||
6 | * | ||
7 | * (C) Copyright 1996 Linus Torvalds | ||
8 | * | ||
9 | * This routine is "moral-ware": you are free to use it any way you wish, and | ||
10 | * the only obligation I put on you is a moral one: if you make any improvements | ||
11 | * to the routine, please send me your improvements for me to use similarly. | ||
12 | * | ||
13 | * The scheduling comments are according to the EV5 documentation (and done by | ||
14 | * hand, so they might well be incorrect, please do tell me about it..) | ||
15 | */ | ||
16 | |||
17 | .set noat | ||
18 | .set noreorder | ||
19 | .text | ||
20 | .globl memset | ||
21 | .globl __memset | ||
22 | .globl __memsetw | ||
23 | .globl __constant_c_memset | ||
24 | .ent __memset | ||
25 | .align 5 | ||
26 | __memset: | ||
27 | .frame $30,0,$26,0 | ||
28 | .prologue 0 | ||
29 | |||
30 | and $17,255,$1 /* E1 */ | ||
31 | insbl $17,1,$17 /* .. E0 */ | ||
32 | bis $17,$1,$17 /* E0 (p-c latency, next cycle) */ | ||
33 | sll $17,16,$1 /* E1 (p-c latency, next cycle) */ | ||
34 | |||
35 | bis $17,$1,$17 /* E0 (p-c latency, next cycle) */ | ||
36 | sll $17,32,$1 /* E1 (p-c latency, next cycle) */ | ||
37 | bis $17,$1,$17 /* E0 (p-c latency, next cycle) */ | ||
38 | ldq_u $31,0($30) /* .. E1 */ | ||
39 | |||
40 | .align 5 | ||
41 | __constant_c_memset: | ||
42 | addq $18,$16,$6 /* E0 */ | ||
43 | bis $16,$16,$0 /* .. E1 */ | ||
44 | xor $16,$6,$1 /* E0 */ | ||
45 | ble $18,end /* .. E1 */ | ||
46 | |||
47 | bic $1,7,$1 /* E0 */ | ||
48 | beq $1,within_one_quad /* .. E1 (note EV5 zero-latency forwarding) */ | ||
49 | and $16,7,$3 /* E0 */ | ||
50 | beq $3,aligned /* .. E1 (note EV5 zero-latency forwarding) */ | ||
51 | |||
52 | ldq_u $4,0($16) /* E0 */ | ||
53 | bis $16,$16,$5 /* .. E1 */ | ||
54 | insql $17,$16,$2 /* E0 */ | ||
55 | subq $3,8,$3 /* .. E1 */ | ||
56 | |||
57 | addq $18,$3,$18 /* E0 $18 is new count ($3 is negative) */ | ||
58 | mskql $4,$16,$4 /* .. E1 (and possible load stall) */ | ||
59 | subq $16,$3,$16 /* E0 $16 is new aligned destination */ | ||
60 | bis $2,$4,$1 /* .. E1 */ | ||
61 | |||
62 | bis $31,$31,$31 /* E0 */ | ||
63 | ldq_u $31,0($30) /* .. E1 */ | ||
64 | stq_u $1,0($5) /* E0 */ | ||
65 | bis $31,$31,$31 /* .. E1 */ | ||
66 | |||
67 | .align 4 | ||
68 | aligned: | ||
69 | sra $18,3,$3 /* E0 */ | ||
70 | and $18,7,$18 /* .. E1 */ | ||
71 | bis $16,$16,$5 /* E0 */ | ||
72 | beq $3,no_quad /* .. E1 */ | ||
73 | |||
74 | .align 3 | ||
75 | loop: | ||
76 | stq $17,0($5) /* E0 */ | ||
77 | subq $3,1,$3 /* .. E1 */ | ||
78 | addq $5,8,$5 /* E0 */ | ||
79 | bne $3,loop /* .. E1 */ | ||
80 | |||
81 | no_quad: | ||
82 | bis $31,$31,$31 /* E0 */ | ||
83 | beq $18,end /* .. E1 */ | ||
84 | ldq $7,0($5) /* E0 */ | ||
85 | mskqh $7,$6,$2 /* .. E1 (and load stall) */ | ||
86 | |||
87 | insqh $17,$6,$4 /* E0 */ | ||
88 | bis $2,$4,$1 /* .. E1 */ | ||
89 | stq $1,0($5) /* E0 */ | ||
90 | ret $31,($26),1 /* .. E1 */ | ||
91 | |||
92 | .align 3 | ||
93 | within_one_quad: | ||
94 | ldq_u $1,0($16) /* E0 */ | ||
95 | insql $17,$16,$2 /* E1 */ | ||
96 | mskql $1,$16,$4 /* E0 (after load stall) */ | ||
97 | bis $2,$4,$2 /* E0 */ | ||
98 | |||
99 | mskql $2,$6,$4 /* E0 */ | ||
100 | mskqh $1,$6,$2 /* .. E1 */ | ||
101 | bis $2,$4,$1 /* E0 */ | ||
102 | stq_u $1,0($16) /* E0 */ | ||
103 | |||
104 | end: | ||
105 | ret $31,($26),1 /* E1 */ | ||
106 | .end __memset | ||
107 | |||
108 | .align 5 | ||
109 | .ent __memsetw | ||
110 | __memsetw: | ||
111 | .prologue 0 | ||
112 | |||
113 | inswl $17,0,$1 /* E0 */ | ||
114 | inswl $17,2,$2 /* E0 */ | ||
115 | inswl $17,4,$3 /* E0 */ | ||
116 | or $1,$2,$1 /* .. E1 */ | ||
117 | inswl $17,6,$4 /* E0 */ | ||
118 | or $1,$3,$1 /* .. E1 */ | ||
119 | or $1,$4,$17 /* E0 */ | ||
120 | br __constant_c_memset /* .. E1 */ | ||
121 | |||
122 | .end __memsetw | ||
123 | |||
124 | memset = __memset | ||
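The insbl/sll/bis sequence at the top of __memset, and the inswl sequence in __memsetw, simply replicate the low byte (or low 16 bits) of the argument across a 64-bit fill pattern so the main loop can store eight bytes per iteration. A portable C sketch of that pattern construction; the helper names are invented for illustration.

#include <stdint.h>

static uint64_t spread_byte(uint64_t c)
{
        c &= 0xff;
        c |= c << 8;            /* 000000000000chch */
        c |= c << 16;           /* 00000000chchchch */
        c |= c << 32;           /* chchchchchchchch */
        return c;
}

static uint64_t spread_halfword(uint64_t w)
{
        w &= 0xffff;
        w |= w << 16;           /* two copies of the 16-bit pattern */
        w |= w << 32;           /* four copies, filling the quadword */
        return w;
}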
diff --git a/arch/alpha/lib/srm_printk.c b/arch/alpha/lib/srm_printk.c new file mode 100644 index 000000000000..31b53c49435e --- /dev/null +++ b/arch/alpha/lib/srm_printk.c | |||
@@ -0,0 +1,41 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/srm_printk.c | ||
3 | */ | ||
4 | |||
5 | #include <linux/kernel.h> | ||
6 | #include <asm/console.h> | ||
7 | |||
8 | long | ||
9 | srm_printk(const char *fmt, ...) | ||
10 | { | ||
11 | static char buf[1024]; | ||
12 | va_list args; | ||
13 | long len, num_lf; | ||
14 | char *src, *dst; | ||
15 | |||
16 | va_start(args, fmt); | ||
17 | len = vsprintf(buf, fmt, args); | ||
18 | va_end(args); | ||
19 | |||
20 | /* count number of linefeeds in string: */ | ||
21 | |||
22 | num_lf = 0; | ||
23 | for (src = buf; *src; ++src) { | ||
24 | if (*src == '\n') { | ||
25 | ++num_lf; | ||
26 | } | ||
27 | } | ||
28 | |||
29 | if (num_lf) { | ||
30 | /* expand each linefeed into carriage-return/linefeed: */ | ||
31 | for (dst = src + num_lf; src >= buf; ) { | ||
32 | if (*src == '\n') { | ||
33 | *dst-- = '\r'; | ||
34 | } | ||
35 | *dst-- = *src--; | ||
36 | } | ||
37 | } | ||
38 | |||
39 | srm_puts(buf, num_lf+len); | ||
40 | return len; | ||
41 | } | ||
diff --git a/arch/alpha/lib/srm_puts.c b/arch/alpha/lib/srm_puts.c new file mode 100644 index 000000000000..7b60a6f75a78 --- /dev/null +++ b/arch/alpha/lib/srm_puts.c | |||
@@ -0,0 +1,23 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/srm_puts.c | ||
3 | */ | ||
4 | |||
5 | #include <linux/string.h> | ||
6 | #include <asm/console.h> | ||
7 | |||
8 | long | ||
9 | srm_puts(const char *str, long len) | ||
10 | { | ||
11 | long remaining, written; | ||
12 | |||
13 | if (!callback_init_done) | ||
14 | return len; | ||
15 | |||
16 | for (remaining = len; remaining > 0; remaining -= written) | ||
17 | { | ||
18 | written = callback_puts(0, str, remaining); | ||
19 | written &= 0xffffffff; | ||
20 | str += written; | ||
21 | } | ||
22 | return len; | ||
23 | } | ||
diff --git a/arch/alpha/lib/stacktrace.c b/arch/alpha/lib/stacktrace.c new file mode 100644 index 000000000000..6d432e42aedc --- /dev/null +++ b/arch/alpha/lib/stacktrace.c | |||
@@ -0,0 +1,103 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <asm/system.h> | ||
3 | |||
4 | typedef unsigned int instr; | ||
5 | |||
6 | #define MAJOR_OP 0xfc000000 | ||
7 | #define LDA_OP 0x20000000 | ||
8 | #define STQ_OP 0xb4000000 | ||
9 | #define BR_OP 0xc0000000 | ||
10 | |||
11 | #define STK_ALLOC_1 0x23de8000 /* lda $30,-X($30) */ | ||
12 | #define STK_ALLOC_1M 0xffff8000 | ||
13 | #define STK_ALLOC_2 0x43c0153e /* subq $30,X,$30 */ | ||
14 | #define STK_ALLOC_2M 0xffe01fff | ||
15 | |||
16 | #define MEM_REG 0x03e00000 | ||
17 | #define MEM_BASE 0x001f0000 | ||
18 | #define MEM_OFF 0x0000ffff | ||
19 | #define MEM_OFF_SIGN 0x00008000 | ||
20 | #define BASE_SP 0x001e0000 | ||
21 | |||
22 | #define STK_ALLOC_MATCH(INSTR) \ | ||
23 | (((INSTR) & STK_ALLOC_1M) == STK_ALLOC_1 \ | ||
24 | || ((INSTR) & STK_ALLOC_2M) == STK_ALLOC_2) | ||
25 | #define STK_PUSH_MATCH(INSTR) \ | ||
26 | (((INSTR) & (MAJOR_OP | MEM_BASE | MEM_OFF_SIGN)) == (STQ_OP | BASE_SP)) | ||
27 | #define MEM_OP_OFFSET(INSTR) \ | ||
28 | (((long)((INSTR) & MEM_OFF) << 48) >> 48) | ||
29 | #define MEM_OP_REG(INSTR) \ | ||
30 | (((INSTR) & MEM_REG) >> 22) | ||
31 | |||
32 | /* Branches, jumps, PAL calls, and illegal opcodes end a basic block. */ | ||
33 | #define BB_END(INSTR) \ | ||
34 | (((instr)(INSTR) >= BR_OP) | ((instr)(INSTR) < LDA_OP) | \ | ||
35 | ((((instr)(INSTR) ^ 0x60000000) < 0x20000000) & \ | ||
36 | (((instr)(INSTR) & 0x0c000000) != 0))) | ||
37 | |||
38 | #define IS_KERNEL_TEXT(PC) ((unsigned long)(PC) > START_ADDR) | ||
39 | |||
40 | static char reg_name[][4] = { | ||
41 | "v0 ", "t0 ", "t1 ", "t2 ", "t3 ", "t4 ", "t5 ", "t6 ", "t7 ", | ||
42 | "s0 ", "s1 ", "s2 ", "s3 ", "s4 ", "s5 ", "s6 ", "a0 ", "a1 ", | ||
43 | "a2 ", "a3 ", "a4 ", "a5 ", "t8 ", "t9 ", "t10", "t11", "ra ", | ||
44 | "pv ", "at ", "gp ", "sp ", "0" | ||
45 | }; | ||
46 | |||
47 | |||
48 | static instr * | ||
49 | display_stored_regs(instr * pro_pc, unsigned char * sp) | ||
50 | { | ||
51 | instr * ret_pc = 0; | ||
52 | int reg; | ||
53 | unsigned long value; | ||
54 | |||
55 | printk("Prologue [<%p>], Frame %p:\n", pro_pc, sp); | ||
56 | while (!BB_END(*pro_pc)) | ||
57 | if (STK_PUSH_MATCH(*pro_pc)) { | ||
58 | reg = (*pro_pc & MEM_REG) >> 21; | ||
59 | value = *(unsigned long *)(sp + (*pro_pc & MEM_OFF)); | ||
60 | if (reg == 26) | ||
61 | ret_pc = (instr *)value; | ||
62 | printk("\t\t%s / 0x%016lx\n", reg_name[reg], value); | ||
63 | } | ||
64 | return ret_pc; | ||
65 | } | ||
66 | |||
67 | static instr * | ||
68 | seek_prologue(instr * pc) | ||
69 | { | ||
70 | while (!STK_ALLOC_MATCH(*pc)) | ||
71 | --pc; | ||
72 | while (!BB_END(*(pc - 1))) | ||
73 | --pc; | ||
74 | return pc; | ||
75 | } | ||
76 | |||
77 | static long | ||
78 | stack_increment(instr * prologue_pc) | ||
79 | { | ||
80 | while (!STK_ALLOC_MATCH(*prologue_pc)) | ||
81 | ++prologue_pc; | ||
82 | |||
83 | /* Count the bytes allocated. */ | ||
84 | if ((*prologue_pc & STK_ALLOC_1M) == STK_ALLOC_1M) | ||
85 | return -(((long)(*prologue_pc) << 48) >> 48); | ||
86 | else | ||
87 | return (*prologue_pc >> 13) & 0xff; | ||
88 | } | ||
89 | |||
90 | void | ||
91 | stacktrace(void) | ||
92 | { | ||
93 | instr * ret_pc; | ||
94 | instr * prologue = (instr *)stacktrace; | ||
95 | register unsigned char * sp __asm__ ("$30"); | ||
96 | |||
97 | printk("\tstack trace:\n"); | ||
98 | do { | ||
99 | ret_pc = display_stored_regs(prologue, sp); | ||
100 | sp += stack_increment(prologue); | ||
101 | prologue = seek_prologue(ret_pc); | ||
102 | } while (IS_KERNEL_TEXT(ret_pc)); | ||
103 | } | ||
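MEM_OP_OFFSET and the STK_ALLOC_1 branch of stack_increment recover a signed 16-bit displacement from the low bits of an instruction by shifting it to the top of a long and arithmetic-shifting it back down. A small C sketch of that idiom, assuming a 64-bit long and an arithmetic right shift for negative signed values (which the kernel relies on, though strictly it is implementation-defined); the function name is invented for illustration.

#include <stdint.h>

/* Recover the signed 16-bit displacement field of an Alpha memory-format
   instruction by sign-extending its low 16 bits. */
static long sign_extend_16(uint32_t insn)
{
        return ((long)(insn & 0xffff) << 48) >> 48;
}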
diff --git a/arch/alpha/lib/strcasecmp.c b/arch/alpha/lib/strcasecmp.c new file mode 100644 index 000000000000..4e57a216feaf --- /dev/null +++ b/arch/alpha/lib/strcasecmp.c | |||
@@ -0,0 +1,26 @@ | |||
1 | /* | ||
2 | * linux/arch/alpha/lib/strcasecmp.c | ||
3 | */ | ||
4 | |||
5 | #include <linux/string.h> | ||
6 | |||
7 | |||
8 | /* We handle nothing here except the C locale. Since this is used in | ||
9 | only one place, on strings known to contain only 7 bit ASCII, this | ||
10 | is ok. */ | ||
11 | |||
12 | int strcasecmp(const char *a, const char *b) | ||
13 | { | ||
14 | int ca, cb; | ||
15 | |||
16 | do { | ||
17 | ca = *a++ & 0xff; | ||
18 | cb = *b++ & 0xff; | ||
19 | if (ca >= 'A' && ca <= 'Z') | ||
20 | ca += 'a' - 'A'; | ||
21 | if (cb >= 'A' && cb <= 'Z') | ||
22 | cb += 'a' - 'A'; | ||
23 | } while (ca == cb && ca != '\0'); | ||
24 | |||
25 | return ca - cb; | ||
26 | } | ||
diff --git a/arch/alpha/lib/strcat.S b/arch/alpha/lib/strcat.S new file mode 100644 index 000000000000..393f50384878 --- /dev/null +++ b/arch/alpha/lib/strcat.S | |||
@@ -0,0 +1,52 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/strcat.S | ||
3 | * Contributed by Richard Henderson (rth@tamu.edu) | ||
4 | * | ||
5 | * Append a null-terminated string from SRC to DST. | ||
6 | */ | ||
7 | |||
8 | .text | ||
9 | |||
10 | .align 3 | ||
11 | .globl strcat | ||
12 | .ent strcat | ||
13 | strcat: | ||
14 | .frame $30, 0, $26 | ||
15 | .prologue 0 | ||
16 | |||
17 | mov $16, $0 # set up return value | ||
18 | |||
19 | /* Find the end of the string. */ | ||
20 | |||
21 | ldq_u $1, 0($16) # load first quadword (a0 may be misaligned) | ||
22 | lda $2, -1 | ||
23 | insqh $2, $16, $2 | ||
24 | andnot $16, 7, $16 | ||
25 | or $2, $1, $1 | ||
26 | cmpbge $31, $1, $2 # bits set iff byte == 0 | ||
27 | bne $2, $found | ||
28 | |||
29 | $loop: ldq $1, 8($16) | ||
30 | addq $16, 8, $16 | ||
31 | cmpbge $31, $1, $2 | ||
32 | beq $2, $loop | ||
33 | |||
34 | $found: negq $2, $3 # clear all but least set bit | ||
35 | and $2, $3, $2 | ||
36 | |||
37 | and $2, 0xf0, $3 # binary search for that set bit | ||
38 | and $2, 0xcc, $4 | ||
39 | and $2, 0xaa, $5 | ||
40 | cmovne $3, 4, $3 | ||
41 | cmovne $4, 2, $4 | ||
42 | cmovne $5, 1, $5 | ||
43 | addq $3, $4, $3 | ||
44 | addq $16, $5, $16 | ||
45 | addq $16, $3, $16 | ||
46 | |||
47 | /* Now do the append. */ | ||
48 | |||
49 | mov $26, $23 | ||
50 | br __stxcpy | ||
51 | |||
52 | .end strcat | ||
diff --git a/arch/alpha/lib/strchr.S b/arch/alpha/lib/strchr.S new file mode 100644 index 000000000000..011a175e8329 --- /dev/null +++ b/arch/alpha/lib/strchr.S | |||
@@ -0,0 +1,70 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/strchr.S | ||
3 | * Contributed by Richard Henderson (rth@tamu.edu) | ||
4 | * | ||
5 | * Return the address of a given character within a null-terminated | ||
6 | * string, or null if it is not found. | ||
7 | */ | ||
8 | |||
9 | #include <asm/regdef.h> | ||
10 | |||
11 | .set noreorder | ||
12 | .set noat | ||
13 | |||
14 | .align 3 | ||
15 | .globl strchr | ||
16 | .ent strchr | ||
17 | strchr: | ||
18 | .frame sp, 0, ra | ||
19 | .prologue 0 | ||
20 | |||
21 | zapnot a1, 1, a1 # e0 : zero extend the search character | ||
22 | ldq_u t0, 0(a0) # .. e1 : load first quadword | ||
23 | sll a1, 8, t5 # e0 : replicate the search character | ||
24 | andnot a0, 7, v0 # .. e1 : align our loop pointer | ||
25 | or t5, a1, a1 # e0 : | ||
26 | lda t4, -1 # .. e1 : build garbage mask | ||
27 | sll a1, 16, t5 # e0 : | ||
28 | cmpbge zero, t0, t2 # .. e1 : bits set iff byte == zero | ||
29 | mskqh t4, a0, t4 # e0 : | ||
30 | or t5, a1, a1 # .. e1 : | ||
31 | sll a1, 32, t5 # e0 : | ||
32 | cmpbge zero, t4, t4 # .. e1 : bits set iff byte is garbage | ||
33 | or t5, a1, a1 # e0 : | ||
34 | xor t0, a1, t1 # .. e1 : make bytes == c zero | ||
35 | cmpbge zero, t1, t3 # e0 : bits set iff byte == c | ||
36 | or t2, t3, t0 # e1 : bits set iff char match or zero match | ||
37 | andnot t0, t4, t0 # e0 : clear garbage bits | ||
38 | bne t0, $found # .. e1 (zdb) | ||
39 | |||
40 | $loop: ldq t0, 8(v0) # e0 : | ||
41 | addq v0, 8, v0 # .. e1 : | ||
42 | nop # e0 : | ||
43 | xor t0, a1, t1 # .. e1 (ev5 data stall) | ||
44 | cmpbge zero, t0, t2 # e0 : bits set iff byte == 0 | ||
45 | cmpbge zero, t1, t3 # .. e1 : bits set iff byte == c | ||
46 | or t2, t3, t0 # e0 : | ||
47 | beq t0, $loop # .. e1 (zdb) | ||
48 | |||
49 | $found: negq t0, t1 # e0 : clear all but least set bit | ||
50 | and t0, t1, t0 # e1 (stall) | ||
51 | |||
52 | and t0, t3, t1 # e0 : bit set iff byte was the char | ||
53 | beq t1, $retnull # .. e1 (zdb) | ||
54 | |||
55 | and t0, 0xf0, t2 # e0 : binary search for that set bit | ||
56 | and t0, 0xcc, t3 # .. e1 : | ||
57 | and t0, 0xaa, t4 # e0 : | ||
58 | cmovne t2, 4, t2 # .. e1 : | ||
59 | cmovne t3, 2, t3 # e0 : | ||
60 | cmovne t4, 1, t4 # .. e1 : | ||
61 | addq t2, t3, t2 # e0 : | ||
62 | addq v0, t4, v0 # .. e1 : | ||
63 | addq v0, t2, v0 # e0 : | ||
64 | ret # .. e1 : | ||
65 | |||
66 | $retnull: | ||
67 | mov zero, v0 # e0 : | ||
68 | ret # .. e1 : | ||
69 | |||
70 | .end strchr | ||
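
The replicate-and-XOR idea in the prologue above — spread the search character into every byte lane, XOR it with the loaded quadword, and reuse the same zero-byte test that finds the terminator — can be sketched in C. This is an illustration only: cmpbge_zero() is a hypothetical stand-in for what "cmpbge zero, x" computes, and the byte/lane mapping assumes a little-endian target like the Alpha.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Bit i of the result is set iff byte i of x is zero. */
static unsigned cmpbge_zero(uint64_t x)
{
        unsigned m = 0;
        for (int i = 0; i < 8; i++)
                if (((x >> (8 * i)) & 0xff) == 0)
                        m |= 1u << i;
        return m;
}

/* Spread one byte into all eight byte lanes, as the sll/or pairs above do. */
static uint64_t replicate(unsigned char c)
{
        uint64_t v = c;
        v |= v << 8;
        v |= v << 16;
        v |= v << 32;
        return v;
}

int main(void)
{
        uint64_t word;
        memcpy(&word, "find me!", 8);                   /* one quadword of text */

        unsigned hit  = cmpbge_zero(word ^ replicate('m'));    /* bytes == 'm' */
        unsigned term = cmpbge_zero(word);                      /* bytes == 0  */
        printf("match mask %02x, zero mask %02x\n", hit, term);
        return 0;
}
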
diff --git a/arch/alpha/lib/strcpy.S b/arch/alpha/lib/strcpy.S new file mode 100644 index 000000000000..e0728e4ad21f --- /dev/null +++ b/arch/alpha/lib/strcpy.S | |||
@@ -0,0 +1,23 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/strcpy.S | ||
3 | * Contributed by Richard Henderson (rth@tamu.edu) | ||
4 | * | ||
5 | * Copy a null-terminated string from SRC to DST. Return a pointer | ||
6 | * to the null-terminator in the source. | ||
7 | */ | ||
8 | |||
9 | .text | ||
10 | |||
11 | .align 3 | ||
12 | .globl strcpy | ||
13 | .ent strcpy | ||
14 | strcpy: | ||
15 | .frame $30, 0, $26 | ||
16 | .prologue 0 | ||
17 | |||
18 | mov $16, $0 # set up return value | ||
19 | mov $26, $23 # set up return address | ||
20 | unop | ||
21 | br __stxcpy # do the copy | ||
22 | |||
23 | .end strcpy | ||
diff --git a/arch/alpha/lib/strlen.S b/arch/alpha/lib/strlen.S new file mode 100644 index 000000000000..fe63353de152 --- /dev/null +++ b/arch/alpha/lib/strlen.S | |||
@@ -0,0 +1,57 @@ | |||
1 | /* | ||
2 | * strlen.S (c) 1995 David Mosberger (davidm@cs.arizona.edu) | ||
3 | * | ||
4 | * Finds length of a 0-terminated string. Optimized for the | ||
5 | * Alpha architecture: | ||
6 | * | ||
7 | * - memory accessed as aligned quadwords only | ||
8 | * - uses cmpbge to compare 8 bytes in parallel | ||
9 | * - does binary search to find 0 byte in last | ||
10 | * quadword (HAKMEM needed 12 instructions to | ||
11 | * do this instead of the 9 instructions that | ||
12 | * binary search needs). | ||
13 | */ | ||
14 | |||
15 | .set noreorder | ||
16 | .set noat | ||
17 | |||
18 | .align 3 | ||
19 | |||
20 | .globl strlen | ||
21 | .ent strlen | ||
22 | |||
23 | strlen: | ||
24 | ldq_u $1, 0($16) # load first quadword ($16 may be misaligned) | ||
25 | lda $2, -1($31) | ||
26 | insqh $2, $16, $2 | ||
27 | andnot $16, 7, $0 | ||
28 | or $2, $1, $1 | ||
29 | cmpbge $31, $1, $2 # $2 <- bitmask: bit i == 1 <==> i-th byte == 0 | ||
30 | bne $2, found | ||
31 | |||
32 | loop: ldq $1, 8($0) | ||
33 | addq $0, 8, $0 # addr += 8 | ||
34 | nop # helps dual issue last two insns | ||
35 | cmpbge $31, $1, $2 | ||
36 | beq $2, loop | ||
37 | |||
38 | found: blbs $2, done # make aligned case fast | ||
39 | negq $2, $3 | ||
40 | and $2, $3, $2 | ||
41 | |||
42 | and $2, 0x0f, $1 | ||
43 | addq $0, 4, $3 | ||
44 | cmoveq $1, $3, $0 | ||
45 | |||
46 | and $2, 0x33, $1 | ||
47 | addq $0, 2, $3 | ||
48 | cmoveq $1, $3, $0 | ||
49 | |||
50 | and $2, 0x55, $1 | ||
51 | addq $0, 1, $3 | ||
52 | cmoveq $1, $3, $0 | ||
53 | |||
54 | done: subq $0, $16, $0 | ||
55 | ret $31, ($26) | ||
56 | |||
57 | .end strlen | ||
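
A rough C rendering of the strategy described in the header comment: whole-quadword loads, a cmpbge-style zero-byte mask, and the three-step binary search from the found: block. Like the assembly, it reads aligned quadwords that may extend past the string, so it is an illustration for a little-endian 64-bit target rather than strictly portable C.

#include <stdint.h>
#include <stddef.h>

/* Bit i set iff byte i of x is zero (what "cmpbge $31, x" computes). */
static unsigned zero_bytes(uint64_t x)
{
        unsigned m = 0;
        for (int i = 0; i < 8; i++)
                if (((x >> (8 * i)) & 0xff) == 0)
                        m |= 1u << i;
        return m;
}

size_t strlen_sketch(const char *s)
{
        const uint64_t *p = (const uint64_t *)((uintptr_t)s & ~7ul);
        unsigned skip = (uintptr_t)s & 7;

        /* Force the bytes before the string start to be non-zero, playing
           the role of the lda/insqh garbage mask in the prologue above. */
        uint64_t w = *p | (((uint64_t)1 << (8 * skip)) - 1);
        unsigned m = zero_bytes(w);

        while (!m)
                m = zero_bytes(*++p);

        /* Binary search for the lowest set bit, as in the found: block. */
        unsigned bit = m & -m, idx = 0;
        if (!(bit & 0x0f)) idx += 4;
        if (!(bit & 0x33)) idx += 2;
        if (!(bit & 0x55)) idx += 1;

        return (const char *)p + idx - s;       /* terminator addr - start */
}
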
diff --git a/arch/alpha/lib/strlen_user.S b/arch/alpha/lib/strlen_user.S new file mode 100644 index 000000000000..508a18e96479 --- /dev/null +++ b/arch/alpha/lib/strlen_user.S | |||
@@ -0,0 +1,91 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/strlen_user.S | ||
3 | * | ||
4 | * Return the length of the string including the NUL terminator | ||
5 | * (strlen+1) or zero if an error occurred. | ||
6 | * | ||
7 | * In places where it is critical to limit the processing time, | ||
8 | * and the data is not trusted, strnlen_user() should be used. | ||
9 | * It will return a value greater than its second argument if | ||
10 | * that limit would be exceeded. This implementation is allowed | ||
11 | * to access memory beyond the limit, but will not cross a page | ||
12 | * boundary when doing so. | ||
13 | */ | ||
14 | |||
15 | #include <asm/regdef.h> | ||
16 | |||
17 | |||
18 | /* Allow an exception for an insn; exit if we get one. */ | ||
19 | #define EX(x,y...) \ | ||
20 | 99: x,##y; \ | ||
21 | .section __ex_table,"a"; \ | ||
22 | .long 99b - .; \ | ||
23 | lda v0, $exception-99b(zero); \ | ||
24 | .previous | ||
25 | |||
26 | |||
27 | .set noreorder | ||
28 | .set noat | ||
29 | .text | ||
30 | |||
31 | .globl __strlen_user | ||
32 | .ent __strlen_user | ||
33 | .frame sp, 0, ra | ||
34 | |||
35 | .align 3 | ||
36 | __strlen_user: | ||
37 | ldah a1, 32767(zero) # do not use plain strlen_user() for strings | ||
38 | # that might be almost 2 GB long; you should | ||
39 | # be using strnlen_user() instead | ||
40 | |||
41 | .globl __strnlen_user | ||
42 | |||
43 | .align 3 | ||
44 | __strnlen_user: | ||
45 | .prologue 0 | ||
46 | |||
47 | EX( ldq_u t0, 0(a0) ) # load first quadword (a0 may be misaligned) | ||
48 | lda t1, -1(zero) | ||
49 | insqh t1, a0, t1 | ||
50 | andnot a0, 7, v0 | ||
51 | or t1, t0, t0 | ||
52 | subq a0, 1, a0 # get our +1 for the return | ||
53 | cmpbge zero, t0, t1 # t1 <- bitmask: bit i == 1 <==> i-th byte == 0 | ||
54 | subq a1, 7, t2 | ||
55 | subq a0, v0, t0 | ||
56 | bne t1, $found | ||
57 | |||
58 | addq t2, t0, t2 | ||
59 | addq a1, 1, a1 | ||
60 | |||
61 | .align 3 | ||
62 | $loop: ble t2, $limit | ||
63 | EX( ldq t0, 8(v0) ) | ||
64 | subq t2, 8, t2 | ||
65 | addq v0, 8, v0 # addr += 8 | ||
66 | cmpbge zero, t0, t1 | ||
67 | beq t1, $loop | ||
68 | |||
69 | $found: negq t1, t2 # clear all but least set bit | ||
70 | and t1, t2, t1 | ||
71 | |||
72 | and t1, 0xf0, t2 # binary search for that set bit | ||
73 | and t1, 0xcc, t3 | ||
74 | and t1, 0xaa, t4 | ||
75 | cmovne t2, 4, t2 | ||
76 | cmovne t3, 2, t3 | ||
77 | cmovne t4, 1, t4 | ||
78 | addq t2, t3, t2 | ||
79 | addq v0, t4, v0 | ||
80 | addq v0, t2, v0 | ||
81 | nop # dual issue next two on ev4 and ev5 | ||
82 | subq v0, a0, v0 | ||
83 | $exception: | ||
84 | ret | ||
85 | |||
86 | .align 3 # currently redundant | ||
87 | $limit: | ||
88 | subq a1, t2, v0 | ||
89 | ret | ||
90 | |||
91 | .end __strlen_user | ||
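
The EX() macro above is what lets a plain ldq_u touch user memory safely: each guarded load gets a table entry pairing the load's location with a fixup ("resume at $exception with v0 = 0"), and the fault handler consults that table instead of treating the access as fatal. The fragment below is only a conceptual model of that lookup; the offsets and the entry layout are made up for illustration and are not the kernel's actual exception-table format.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative entry: where the guarded instruction is, where to resume. */
struct ex_entry {
        int32_t insn_offset;
        int32_t fixup_offset;
};

static const struct ex_entry ex_table[] = {
        { 0x40, 0x90 },         /* made-up offsets for the demo */
        { 0x58, 0x90 },
};

/* What a trap handler conceptually does with the faulting PC. */
static int find_fixup(int32_t fault)
{
        for (size_t i = 0; i < sizeof(ex_table) / sizeof(ex_table[0]); i++)
                if (ex_table[i].insn_offset == fault)
                        return ex_table[i].fixup_offset;
        return -1;              /* no entry: the fault is genuinely fatal */
}

int main(void)
{
        printf("fixup for a fault at 0x58: %#x\n", find_fixup(0x58));
        return 0;
}
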
diff --git a/arch/alpha/lib/strncat.S b/arch/alpha/lib/strncat.S new file mode 100644 index 000000000000..a8278163c972 --- /dev/null +++ b/arch/alpha/lib/strncat.S | |||
@@ -0,0 +1,84 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/strncat.S | ||
3 | * Contributed by Richard Henderson (rth@tamu.edu) | ||
4 | * | ||
5 | * Append no more than COUNT characters from the null-terminated string SRC | ||
6 | * to the null-terminated string DST. Always null-terminate the new DST. | ||
7 | * | ||
8 | * This differs slightly from the semantics in libc in that we never write | ||
9 | * past count, whereas libc may write to count+1. This follows the generic | ||
10 | * implementation in lib/string.c and is, IMHO, more sensible. | ||
11 | */ | ||
12 | |||
13 | .text | ||
14 | |||
15 | .align 3 | ||
16 | .globl strncat | ||
17 | .ent strncat | ||
18 | strncat: | ||
19 | .frame $30, 0, $26 | ||
20 | .prologue 0 | ||
21 | |||
22 | mov $16, $0 # set up return value | ||
23 | beq $18, $zerocount | ||
24 | |||
25 | /* Find the end of the string. */ | ||
26 | |||
27 | ldq_u $1, 0($16) # load first quadword ($16 may be misaligned) | ||
28 | lda $2, -1($31) | ||
29 | insqh $2, $16, $2 | ||
30 | andnot $16, 7, $16 | ||
31 | or $2, $1, $1 | ||
32 | cmpbge $31, $1, $2 # bits set iff byte == 0 | ||
33 | bne $2, $found | ||
34 | |||
35 | $loop: ldq $1, 8($16) | ||
36 | addq $16, 8, $16 | ||
37 | cmpbge $31, $1, $2 | ||
38 | beq $2, $loop | ||
39 | |||
40 | $found: negq $2, $3 # clear all but least set bit | ||
41 | and $2, $3, $2 | ||
42 | |||
43 | and $2, 0xf0, $3 # binary search for that set bit | ||
44 | and $2, 0xcc, $4 | ||
45 | and $2, 0xaa, $5 | ||
46 | cmovne $3, 4, $3 | ||
47 | cmovne $4, 2, $4 | ||
48 | cmovne $5, 1, $5 | ||
49 | addq $3, $4, $3 | ||
50 | addq $16, $5, $16 | ||
51 | addq $16, $3, $16 | ||
52 | |||
53 | /* Now do the append. */ | ||
54 | |||
55 | bsr $23, __stxncpy | ||
56 | |||
57 | /* Worry about the null termination. */ | ||
58 | |||
59 | zapnot $1, $27, $2 # was last byte a null? | ||
60 | bne $2, 0f | ||
61 | ret | ||
62 | |||
63 | 0: cmplt $27, $24, $2 # did we fill the buffer completely? | ||
64 | or $2, $18, $2 | ||
65 | bne $2, 2f | ||
66 | |||
67 | and $24, 0x80, $2 # no zero next byte | ||
68 | bne $2, 1f | ||
69 | |||
70 | /* Here there are bytes left in the current word. Clear one. */ | ||
71 | addq $24, $24, $24 # end-of-count bit <<= 1 | ||
72 | 2: zap $1, $24, $1 | ||
73 | stq_u $1, 0($16) | ||
74 | ret | ||
75 | |||
76 | 1: /* Here we must read the next DST word and clear the first byte. */ | ||
77 | ldq_u $1, 8($16) | ||
78 | zap $1, 1, $1 | ||
79 | stq_u $1, 8($16) | ||
80 | |||
81 | $zerocount: | ||
82 | ret | ||
83 | |||
84 | .end strncat | ||
diff --git a/arch/alpha/lib/strncpy.S b/arch/alpha/lib/strncpy.S new file mode 100644 index 000000000000..338551c7113c --- /dev/null +++ b/arch/alpha/lib/strncpy.S | |||
@@ -0,0 +1,81 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/strncpy.S | ||
3 | * Contributed by Richard Henderson (rth@tamu.edu) | ||
4 | * | ||
5 | * Copy no more than COUNT bytes of the null-terminated string from | ||
6 | * SRC to DST. If SRC does not cover all of COUNT, the balance is | ||
7 | * zeroed. | ||
8 | * | ||
9 | * Or, rather, if the kernel cared about that weird ANSI quirk. This | ||
10 | * version has cropped that bit o' nastiness as well as assuming that | ||
11 | * __stxncpy is in range of a branch. | ||
12 | */ | ||
13 | |||
14 | .set noat | ||
15 | .set noreorder | ||
16 | |||
17 | .text | ||
18 | |||
19 | .align 4 | ||
20 | .globl strncpy | ||
21 | .ent strncpy | ||
22 | strncpy: | ||
23 | .frame $30, 0, $26 | ||
24 | .prologue 0 | ||
25 | |||
26 | mov $16, $0 # set return value now | ||
27 | beq $18, $zerolen | ||
28 | unop | ||
29 | bsr $23, __stxncpy # do the work of the copy | ||
30 | |||
31 | unop | ||
32 | bne $18, $multiword # do we have full words left? | ||
33 | subq $24, 1, $3 # nope | ||
34 | subq $27, 1, $4 | ||
35 | |||
36 | or $3, $24, $3 # clear the bits between the last | ||
37 | or $4, $27, $4 # written byte and the last byte in COUNT | ||
38 | andnot $4, $3, $4 | ||
39 | zap $1, $4, $1 | ||
40 | |||
41 | stq_u $1, 0($16) | ||
42 | ret | ||
43 | |||
44 | .align 4 | ||
45 | $multiword: | ||
46 | subq $24, 1, $2 # clear the final bits in the prev word | ||
47 | or $2, $24, $2 | ||
48 | zapnot $1, $2, $1 | ||
49 | subq $18, 1, $18 | ||
50 | |||
51 | stq_u $1, 0($16) | ||
52 | addq $16, 8, $16 | ||
53 | unop | ||
54 | beq $18, 1f | ||
55 | |||
56 | nop | ||
57 | unop | ||
58 | nop | ||
59 | blbc $18, 0f | ||
60 | |||
61 | stq_u $31, 0($16) # zero one word | ||
62 | subq $18, 1, $18 | ||
63 | addq $16, 8, $16 | ||
64 | beq $18, 1f | ||
65 | |||
66 | 0: stq_u $31, 0($16) # zero two words | ||
67 | subq $18, 2, $18 | ||
68 | stq_u $31, 8($16) | ||
69 | addq $16, 16, $16 | ||
70 | bne $18, 0b | ||
71 | |||
72 | 1: ldq_u $1, 0($16) # clear the leading bits in the final word | ||
73 | subq $27, 1, $2 | ||
74 | or $2, $27, $2 | ||
75 | |||
76 | zap $1, $2, $1 | ||
77 | stq_u $1, 0($16) | ||
78 | $zerolen: | ||
79 | ret | ||
80 | |||
81 | .end strncpy | ||
diff --git a/arch/alpha/lib/strncpy_from_user.S b/arch/alpha/lib/strncpy_from_user.S new file mode 100644 index 000000000000..73ee21160ff7 --- /dev/null +++ b/arch/alpha/lib/strncpy_from_user.S | |||
@@ -0,0 +1,339 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/strncpy_from_user.S | ||
3 | * Contributed by Richard Henderson (rth@tamu.edu) | ||
4 | * | ||
5 | * Just like strncpy except in the return value: | ||
6 | * | ||
7 | * -EFAULT if an exception occurs before the terminator is copied. | ||
8 | * N if the buffer filled before the terminator was found. | ||
9 | * | ||
10 | * Otherwise the length of the string is returned. | ||
11 | */ | ||
12 | |||
13 | |||
14 | #include <asm/errno.h> | ||
15 | #include <asm/regdef.h> | ||
16 | |||
17 | |||
18 | /* Allow an exception for an insn; exit if we get one. */ | ||
19 | #define EX(x,y...) \ | ||
20 | 99: x,##y; \ | ||
21 | .section __ex_table,"a"; \ | ||
22 | .long 99b - .; \ | ||
23 | lda $31, $exception-99b($0); \ | ||
24 | .previous | ||
25 | |||
26 | |||
27 | .set noat | ||
28 | .set noreorder | ||
29 | .text | ||
30 | |||
31 | .globl __strncpy_from_user | ||
32 | .ent __strncpy_from_user | ||
33 | .frame $30, 0, $26 | ||
34 | .prologue 0 | ||
35 | |||
36 | .align 3 | ||
37 | $aligned: | ||
38 | /* On entry to this basic block: | ||
39 | t0 == the first destination word for masking back in | ||
40 | t1 == the first source word. */ | ||
41 | |||
42 | /* Create the 1st output word and detect 0's in the 1st input word. */ | ||
43 | lda t2, -1 # e1 : build a mask against false zero | ||
44 | mskqh t2, a1, t2 # e0 : detection in the src word | ||
45 | mskqh t1, a1, t3 # e0 : | ||
46 | ornot t1, t2, t2 # .. e1 : | ||
47 | mskql t0, a1, t0 # e0 : assemble the first output word | ||
48 | cmpbge zero, t2, t8 # .. e1 : bits set iff null found | ||
49 | or t0, t3, t0 # e0 : | ||
50 | beq a2, $a_eoc # .. e1 : | ||
51 | bne t8, $a_eos # .. e1 : | ||
52 | |||
53 | /* On entry to this basic block: | ||
54 | t0 == a source word not containing a null. */ | ||
55 | |||
56 | $a_loop: | ||
57 | stq_u t0, 0(a0) # e0 : | ||
58 | addq a0, 8, a0 # .. e1 : | ||
59 | EX( ldq_u t0, 0(a1) ) # e0 : | ||
60 | addq a1, 8, a1 # .. e1 : | ||
61 | subq a2, 1, a2 # e0 : | ||
62 | cmpbge zero, t0, t8 # .. e1 (stall) | ||
63 | beq a2, $a_eoc # e1 : | ||
64 | beq t8, $a_loop # e1 : | ||
65 | |||
66 | /* Take care of the final (partial) word store. At this point | ||
67 | the end-of-count bit is set in t8 iff it applies. | ||
68 | |||
69 | On entry to this basic block we have: | ||
70 | t0 == the source word containing the null | ||
71 | t8 == the cmpbge mask that found it. */ | ||
72 | |||
73 | $a_eos: | ||
74 | negq t8, t12 # e0 : find low bit set | ||
75 | and t8, t12, t12 # e1 (stall) | ||
76 | |||
77 | /* For the sake of the cache, don't read a destination word | ||
78 | if we're not going to need it. */ | ||
79 | and t12, 0x80, t6 # e0 : | ||
80 | bne t6, 1f # .. e1 (zdb) | ||
81 | |||
82 | /* We're doing a partial word store and so need to combine | ||
83 | our source and original destination words. */ | ||
84 | ldq_u t1, 0(a0) # e0 : | ||
85 | subq t12, 1, t6 # .. e1 : | ||
86 | or t12, t6, t8 # e0 : | ||
87 | unop # | ||
88 | zapnot t0, t8, t0 # e0 : clear src bytes > null | ||
89 | zap t1, t8, t1 # .. e1 : clear dst bytes <= null | ||
90 | or t0, t1, t0 # e1 : | ||
91 | |||
92 | 1: stq_u t0, 0(a0) | ||
93 | br $finish_up | ||
94 | |||
95 | /* Add the end-of-count bit to the eos detection bitmask. */ | ||
96 | $a_eoc: | ||
97 | or t10, t8, t8 | ||
98 | br $a_eos | ||
99 | |||
100 | /*** The Function Entry Point ***/ | ||
101 | .align 3 | ||
102 | __strncpy_from_user: | ||
103 | mov a0, v0 # save the string start | ||
104 | beq a2, $zerolength | ||
105 | |||
106 | /* Are source and destination co-aligned? */ | ||
107 | xor a0, a1, t1 # e0 : | ||
108 | and a0, 7, t0 # .. e1 : find dest misalignment | ||
109 | and t1, 7, t1 # e0 : | ||
110 | addq a2, t0, a2 # .. e1 : bias count by dest misalignment | ||
111 | subq a2, 1, a2 # e0 : | ||
112 | and a2, 7, t2 # e1 : | ||
113 | srl a2, 3, a2 # e0 : a2 = loop counter = (count - 1)/8 | ||
114 | addq zero, 1, t10 # .. e1 : | ||
115 | sll t10, t2, t10 # e0 : t10 = bitmask of last count byte | ||
116 | bne t1, $unaligned # .. e1 : | ||
117 | |||
118 | /* We are co-aligned; take care of a partial first word. */ | ||
119 | |||
120 | EX( ldq_u t1, 0(a1) ) # e0 : load first src word | ||
121 | addq a1, 8, a1 # .. e1 : | ||
122 | |||
123 | beq t0, $aligned # avoid loading dest word if not needed | ||
124 | ldq_u t0, 0(a0) # e0 : | ||
125 | br $aligned # .. e1 : | ||
126 | |||
127 | |||
128 | /* The source and destination are not co-aligned. Align the destination | ||
129 | and cope. We have to be very careful about not reading too much and | ||
130 | causing a SEGV. */ | ||
131 | |||
132 | .align 3 | ||
133 | $u_head: | ||
134 | /* We know just enough now to be able to assemble the first | ||
135 | full source word. We can still find a zero at the end of it | ||
136 | that prevents us from outputting the whole thing. | ||
137 | |||
138 | On entry to this basic block: | ||
139 | t0 == the first dest word, unmasked | ||
140 | t1 == the shifted low bits of the first source word | ||
141 | t6 == bytemask that is -1 in dest word bytes */ | ||
142 | |||
143 | EX( ldq_u t2, 8(a1) ) # e0 : load second src word | ||
144 | addq a1, 8, a1 # .. e1 : | ||
145 | mskql t0, a0, t0 # e0 : mask trailing garbage in dst | ||
146 | extqh t2, a1, t4 # e0 : | ||
147 | or t1, t4, t1 # e1 : first aligned src word complete | ||
148 | mskqh t1, a0, t1 # e0 : mask leading garbage in src | ||
149 | or t0, t1, t0 # e0 : first output word complete | ||
150 | or t0, t6, t6 # e1 : mask original data for zero test | ||
151 | cmpbge zero, t6, t8 # e0 : | ||
152 | beq a2, $u_eocfin # .. e1 : | ||
153 | bne t8, $u_final # e1 : | ||
154 | |||
155 | lda t6, -1 # e1 : mask out the bits we have | ||
156 | mskql t6, a1, t6 # e0 : already seen | ||
157 | stq_u t0, 0(a0) # e0 : store first output word | ||
158 | or t6, t2, t2 # .. e1 : | ||
159 | cmpbge zero, t2, t8 # e0 : find nulls in second partial | ||
160 | addq a0, 8, a0 # .. e1 : | ||
161 | subq a2, 1, a2 # e0 : | ||
162 | bne t8, $u_late_head_exit # .. e1 : | ||
163 | |||
164 | /* Finally, we've got all the stupid leading edge cases taken care | ||
165 | of and we can set up to enter the main loop. */ | ||
166 | |||
167 | extql t2, a1, t1 # e0 : position hi-bits of lo word | ||
168 | EX( ldq_u t2, 8(a1) ) # .. e1 : read next high-order source word | ||
169 | addq a1, 8, a1 # e0 : | ||
170 | cmpbge zero, t2, t8 # e1 (stall) | ||
171 | beq a2, $u_eoc # e1 : | ||
172 | bne t8, $u_eos # e1 : | ||
173 | |||
174 | /* Unaligned copy main loop. In order to avoid reading too much, | ||
175 | the loop is structured to detect zeros in aligned source words. | ||
176 | This has, unfortunately, effectively pulled half of a loop | ||
177 | iteration out into the head and half into the tail, but it does | ||
178 | prevent nastiness from accumulating in the very thing we want | ||
179 | to run as fast as possible. | ||
180 | |||
181 | On entry to this basic block: | ||
182 | t1 == the shifted high-order bits from the previous source word | ||
183 | t2 == the unshifted current source word | ||
184 | |||
185 | We further know that t2 does not contain a null terminator. */ | ||
186 | |||
187 | .align 3 | ||
188 | $u_loop: | ||
189 | extqh t2, a1, t0 # e0 : extract high bits for current word | ||
190 | addq a1, 8, a1 # .. e1 : | ||
191 | extql t2, a1, t3 # e0 : extract low bits for next time | ||
192 | addq a0, 8, a0 # .. e1 : | ||
193 | or t0, t1, t0 # e0 : current dst word now complete | ||
194 | EX( ldq_u t2, 0(a1) ) # .. e1 : load high word for next time | ||
195 | stq_u t0, -8(a0) # e0 : save the current word | ||
196 | mov t3, t1 # .. e1 : | ||
197 | subq a2, 1, a2 # e0 : | ||
198 | cmpbge zero, t2, t8 # .. e1 : test new word for eos | ||
199 | beq a2, $u_eoc # e1 : | ||
200 | beq t8, $u_loop # e1 : | ||
201 | |||
202 | /* We've found a zero somewhere in the source word we just read. | ||
203 | If it resides in the lower half, we have one (probably partial) | ||
204 | word to write out, and if it resides in the upper half, we | ||
205 | have one full and one partial word left to write out. | ||
206 | |||
207 | On entry to this basic block: | ||
208 | t1 == the shifted high-order bits from the previous source word | ||
209 | t2 == the unshifted current source word. */ | ||
210 | $u_eos: | ||
211 | extqh t2, a1, t0 # e0 : | ||
212 | or t0, t1, t0 # e1 : first (partial) source word complete | ||
213 | |||
214 | cmpbge zero, t0, t8 # e0 : is the null in this first bit? | ||
215 | bne t8, $u_final # .. e1 (zdb) | ||
216 | |||
217 | stq_u t0, 0(a0) # e0 : the null was in the high-order bits | ||
218 | addq a0, 8, a0 # .. e1 : | ||
219 | subq a2, 1, a2 # e1 : | ||
220 | |||
221 | $u_late_head_exit: | ||
222 | extql t2, a1, t0 # .. e0 : | ||
223 | cmpbge zero, t0, t8 # e0 : | ||
224 | or t8, t10, t6 # e1 : | ||
225 | cmoveq a2, t6, t8 # e0 : | ||
226 | nop # .. e1 : | ||
227 | |||
228 | /* Take care of a final (probably partial) result word. | ||
229 | On entry to this basic block: | ||
230 | t0 == assembled source word | ||
231 | t8 == cmpbge mask that found the null. */ | ||
232 | $u_final: | ||
233 | negq t8, t6 # e0 : isolate low bit set | ||
234 | and t6, t8, t12 # e1 : | ||
235 | |||
236 | and t12, 0x80, t6 # e0 : avoid dest word load if we can | ||
237 | bne t6, 1f # .. e1 (zdb) | ||
238 | |||
239 | ldq_u t1, 0(a0) # e0 : | ||
240 | subq t12, 1, t6 # .. e1 : | ||
241 | or t6, t12, t8 # e0 : | ||
242 | zapnot t0, t8, t0 # .. e1 : kill source bytes > null | ||
243 | zap t1, t8, t1 # e0 : kill dest bytes <= null | ||
244 | or t0, t1, t0 # e1 : | ||
245 | |||
246 | 1: stq_u t0, 0(a0) # e0 : | ||
247 | br $finish_up | ||
248 | |||
249 | $u_eoc: # end-of-count | ||
250 | extqh t2, a1, t0 | ||
251 | or t0, t1, t0 | ||
252 | cmpbge zero, t0, t8 | ||
253 | |||
254 | $u_eocfin: # end-of-count, final word | ||
255 | or t10, t8, t8 | ||
256 | br $u_final | ||
257 | |||
258 | /* Unaligned copy entry point. */ | ||
259 | .align 3 | ||
260 | $unaligned: | ||
261 | |||
262 | EX( ldq_u t1, 0(a1) ) # e0 : load first source word | ||
263 | |||
264 | and a0, 7, t4 # .. e1 : find dest misalignment | ||
265 | and a1, 7, t5 # e0 : find src misalignment | ||
266 | |||
267 | /* Conditionally load the first destination word and a bytemask | ||
268 | with 0xff indicating that the destination byte is sacrosanct. */ | ||
269 | |||
270 | mov zero, t0 # .. e1 : | ||
271 | mov zero, t6 # e0 : | ||
272 | beq t4, 1f # .. e1 : | ||
273 | ldq_u t0, 0(a0) # e0 : | ||
274 | lda t6, -1 # .. e1 : | ||
275 | mskql t6, a0, t6 # e0 : | ||
276 | 1: | ||
277 | subq a1, t4, a1 # .. e1 : sub dest misalignment from src addr | ||
278 | |||
279 | /* If source misalignment is larger than dest misalignment, we need | ||
280 | extra startup checks to avoid SEGV. */ | ||
281 | |||
282 | cmplt t4, t5, t12 # e1 : | ||
283 | extql t1, a1, t1 # .. e0 : shift src into place | ||
284 | lda t2, -1 # e0 : for creating masks later | ||
285 | beq t12, $u_head # e1 : | ||
286 | |||
287 | mskqh t2, t5, t2 # e0 : begin src byte validity mask | ||
288 | cmpbge zero, t1, t8 # .. e1 : is there a zero? | ||
289 | extql t2, a1, t2 # e0 : | ||
290 | or t8, t10, t5 # .. e1 : test for end-of-count too | ||
291 | cmpbge zero, t2, t3 # e0 : | ||
292 | cmoveq a2, t5, t8 # .. e1 : | ||
293 | andnot t8, t3, t8 # e0 : | ||
294 | beq t8, $u_head # .. e1 (zdb) | ||
295 | |||
296 | /* At this point we've found a zero in the first partial word of | ||
297 | the source. We need to isolate the valid source data and mask | ||
298 | it into the original destination data. (Incidentally, we know | ||
299 | that we'll need at least one byte of that original dest word.) */ | ||
300 | |||
301 | ldq_u t0, 0(a0) # e0 : | ||
302 | negq t8, t6 # .. e1 : build bitmask of bytes <= zero | ||
303 | mskqh t1, t4, t1 # e0 : | ||
304 | and t6, t8, t12 # .. e1 : | ||
305 | subq t12, 1, t6 # e0 : | ||
306 | or t6, t12, t8 # e1 : | ||
307 | |||
308 | zapnot t2, t8, t2 # e0 : prepare source word; mirror changes | ||
309 | zapnot t1, t8, t1 # .. e1 : to source validity mask | ||
310 | |||
311 | andnot t0, t2, t0 # e0 : zero place for source to reside | ||
312 | or t0, t1, t0 # e1 : and put it there | ||
313 | stq_u t0, 0(a0) # e0 : | ||
314 | |||
315 | $finish_up: | ||
316 | zapnot t0, t12, t4 # was last byte written null? | ||
317 | cmovne t4, 1, t4 | ||
318 | |||
319 | and t12, 0xf0, t3 # binary search for the address of the | ||
320 | and t12, 0xcc, t2 # last byte written | ||
321 | and t12, 0xaa, t1 | ||
322 | bic a0, 7, t0 | ||
323 | cmovne t3, 4, t3 | ||
324 | cmovne t2, 2, t2 | ||
325 | cmovne t1, 1, t1 | ||
326 | addq t0, t3, t0 | ||
327 | addq t1, t2, t1 | ||
328 | addq t0, t1, t0 | ||
329 | addq t0, t4, t0 # add one if we filled the buffer | ||
330 | |||
331 | subq t0, v0, v0 # find string length | ||
332 | ret | ||
333 | |||
334 | $zerolength: | ||
335 | clr v0 | ||
336 | $exception: | ||
337 | ret | ||
338 | |||
339 | .end __strncpy_from_user | ||
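
A hypothetical caller sketch of the three return-value cases documented in the header; the prototype, the wrapper name, and the termination fix-up are assumptions for illustration, not code from this tree.

extern long __strncpy_from_user(char *dst, const char *src, long count);

static long fetch_user_string(char *buf, const char *ubuf, long len)
{
        long n = __strncpy_from_user(buf, ubuf, len);

        if (n < 0)
                return n;               /* -EFAULT: faulted before the terminator */
        if (n == len)
                buf[len - 1] = '\0';    /* buffer filled: force termination */
        return n;                       /* otherwise n is the string length */
}
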
diff --git a/arch/alpha/lib/strrchr.S b/arch/alpha/lib/strrchr.S new file mode 100644 index 000000000000..82cfd0ac907b --- /dev/null +++ b/arch/alpha/lib/strrchr.S | |||
@@ -0,0 +1,87 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/strrchr.S | ||
3 | * Contributed by Richard Henderson (rth@tamu.edu) | ||
4 | * | ||
5 | * Return the address of the last occurrence of a given character | ||
6 | * within a null-terminated string, or null if it is not found. | ||
7 | */ | ||
8 | |||
9 | #include <asm/regdef.h> | ||
10 | |||
11 | .set noreorder | ||
12 | .set noat | ||
13 | |||
14 | .align 3 | ||
15 | .ent strrchr | ||
16 | .globl strrchr | ||
17 | strrchr: | ||
18 | .frame sp, 0, ra | ||
19 | .prologue 0 | ||
20 | |||
21 | zapnot a1, 1, a1 # e0 : zero extend our test character | ||
22 | mov zero, t6 # .. e1 : t6 is last match aligned addr | ||
23 | sll a1, 8, t5 # e0 : replicate our test character | ||
24 | mov zero, t8 # .. e1 : t8 is last match byte compare mask | ||
25 | or t5, a1, a1 # e0 : | ||
26 | ldq_u t0, 0(a0) # .. e1 : load first quadword | ||
27 | sll a1, 16, t5 # e0 : | ||
28 | andnot a0, 7, v0 # .. e1 : align source addr | ||
29 | or t5, a1, a1 # e0 : | ||
30 | lda t4, -1 # .. e1 : build garbage mask | ||
31 | sll a1, 32, t5 # e0 : | ||
32 | cmpbge zero, t0, t1 # .. e1 : bits set iff byte == zero | ||
33 | mskqh t4, a0, t4 # e0 : | ||
34 | or t5, a1, a1 # .. e1 : character replication complete | ||
35 | xor t0, a1, t2 # e0 : make bytes == c zero | ||
36 | cmpbge zero, t4, t4 # .. e1 : bits set iff byte is garbage | ||
37 | cmpbge zero, t2, t3 # e0 : bits set iff byte == c | ||
38 | andnot t1, t4, t1 # .. e1 : clear garbage from null test | ||
39 | andnot t3, t4, t3 # e0 : clear garbage from char test | ||
40 | bne t1, $eos # .. e1 : did we already hit the terminator? | ||
41 | |||
42 | /* Character search main loop */ | ||
43 | $loop: | ||
44 | ldq t0, 8(v0) # e0 : load next quadword | ||
45 | cmovne t3, v0, t6 # .. e1 : save previous comparison's match | ||
46 | cmovne t3, t3, t8 # e0 : | ||
47 | addq v0, 8, v0 # .. e1 : | ||
48 | xor t0, a1, t2 # e0 : | ||
49 | cmpbge zero, t0, t1 # .. e1 : bits set iff byte == zero | ||
50 | cmpbge zero, t2, t3 # e0 : bits set iff byte == c | ||
51 | beq t1, $loop # .. e1 : if we haven't seen a null, loop | ||
52 | |||
53 | /* Mask out character matches after terminator */ | ||
54 | $eos: | ||
55 | negq t1, t4 # e0 : isolate first null byte match | ||
56 | and t1, t4, t4 # e1 : | ||
57 | subq t4, 1, t5 # e0 : build a mask of the bytes up to... | ||
58 | or t4, t5, t4 # e1 : ... and including the null | ||
59 | |||
60 | and t3, t4, t3 # e0 : mask out char matches after null | ||
61 | cmovne t3, t3, t8 # .. e1 : save it, if match found | ||
62 | cmovne t3, v0, t6 # e0 : | ||
63 | |||
64 | /* Locate the address of the last matched character */ | ||
65 | |||
66 | /* Retain the early exit for the ev4 -- the ev5 mispredict penalty | ||
67 | is 5 cycles -- the same as just falling through. */ | ||
68 | beq t8, $retnull # .. e1 : | ||
69 | |||
70 | and t8, 0xf0, t2 # e0 : binary search for the high bit set | ||
71 | cmovne t2, t2, t8 # .. e1 (zdb) | ||
72 | cmovne t2, 4, t2 # e0 : | ||
73 | and t8, 0xcc, t1 # .. e1 : | ||
74 | cmovne t1, t1, t8 # e0 : | ||
75 | cmovne t1, 2, t1 # .. e1 : | ||
76 | and t8, 0xaa, t0 # e0 : | ||
77 | cmovne t0, 1, t0 # .. e1 (zdb) | ||
78 | addq t2, t1, t1 # e0 : | ||
79 | addq t6, t0, v0 # .. e1 : add our aligned base ptr to the mix | ||
80 | addq v0, t1, v0 # e0 : | ||
81 | ret # .. e1 : | ||
82 | |||
83 | $retnull: | ||
84 | mov zero, v0 # e0 : | ||
85 | ret # .. e1 : | ||
86 | |||
87 | .end strrchr | ||
diff --git a/arch/alpha/lib/stxcpy.S b/arch/alpha/lib/stxcpy.S new file mode 100644 index 000000000000..2a8d51bfc05d --- /dev/null +++ b/arch/alpha/lib/stxcpy.S | |||
@@ -0,0 +1,289 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/stxcpy.S | ||
3 | * Contributed by Richard Henderson (rth@tamu.edu) | ||
4 | * | ||
5 | * Copy a null-terminated string from SRC to DST. | ||
6 | * | ||
7 | * This is an internal routine used by strcpy, stpcpy, and strcat. | ||
8 | * As such, it uses special linkage conventions to make implementation | ||
9 | * of these public functions more efficient. | ||
10 | * | ||
11 | * On input: | ||
12 | * t9 = return address | ||
13 | * a0 = DST | ||
14 | * a1 = SRC | ||
15 | * | ||
16 | * On output: | ||
17 | * t12 = bitmask (with one bit set) indicating the last byte written | ||
18 | * a0 = unaligned address of the last *word* written | ||
19 | * | ||
20 | * Furthermore, v0, a3-a5, t11, and t12 are untouched. | ||
21 | */ | ||
22 | |||
23 | #include <asm/regdef.h> | ||
24 | |||
25 | .set noat | ||
26 | .set noreorder | ||
27 | |||
28 | .text | ||
29 | |||
30 | /* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that | ||
31 | doesn't like putting the entry point for a procedure somewhere in the | ||
32 | middle of the procedure descriptor. Work around this by putting the | ||
33 | aligned copy in its own procedure descriptor */ | ||
34 | |||
35 | .ent stxcpy_aligned | ||
36 | .align 3 | ||
37 | stxcpy_aligned: | ||
38 | .frame sp, 0, t9 | ||
39 | .prologue 0 | ||
40 | |||
41 | /* On entry to this basic block: | ||
42 | t0 == the first destination word for masking back in | ||
43 | t1 == the first source word. */ | ||
44 | |||
45 | /* Create the 1st output word and detect 0's in the 1st input word. */ | ||
46 | lda t2, -1 # e1 : build a mask against false zero | ||
47 | mskqh t2, a1, t2 # e0 : detection in the src word | ||
48 | mskqh t1, a1, t3 # e0 : | ||
49 | ornot t1, t2, t2 # .. e1 : | ||
50 | mskql t0, a1, t0 # e0 : assemble the first output word | ||
51 | cmpbge zero, t2, t8 # .. e1 : bits set iff null found | ||
52 | or t0, t3, t1 # e0 : | ||
53 | bne t8, $a_eos # .. e1 : | ||
54 | |||
55 | /* On entry to this basic block: | ||
56 | t0 == the first destination word for masking back in | ||
57 | t1 == a source word not containing a null. */ | ||
58 | |||
59 | $a_loop: | ||
60 | stq_u t1, 0(a0) # e0 : | ||
61 | addq a0, 8, a0 # .. e1 : | ||
62 | ldq_u t1, 0(a1) # e0 : | ||
63 | addq a1, 8, a1 # .. e1 : | ||
64 | cmpbge zero, t1, t8 # e0 (stall) | ||
65 | beq t8, $a_loop # .. e1 (zdb) | ||
66 | |||
67 | /* Take care of the final (partial) word store. | ||
68 | On entry to this basic block we have: | ||
69 | t1 == the source word containing the null | ||
70 | t8 == the cmpbge mask that found it. */ | ||
71 | $a_eos: | ||
72 | negq t8, t6 # e0 : find low bit set | ||
73 | and t8, t6, t12 # e1 (stall) | ||
74 | |||
75 | /* For the sake of the cache, don't read a destination word | ||
76 | if we're not going to need it. */ | ||
77 | and t12, 0x80, t6 # e0 : | ||
78 | bne t6, 1f # .. e1 (zdb) | ||
79 | |||
80 | /* We're doing a partial word store and so need to combine | ||
81 | our source and original destination words. */ | ||
82 | ldq_u t0, 0(a0) # e0 : | ||
83 | subq t12, 1, t6 # .. e1 : | ||
84 | zapnot t1, t6, t1 # e0 : clear src bytes >= null | ||
85 | or t12, t6, t8 # .. e1 : | ||
86 | zap t0, t8, t0 # e0 : clear dst bytes <= null | ||
87 | or t0, t1, t1 # e1 : | ||
88 | |||
89 | 1: stq_u t1, 0(a0) # e0 : | ||
90 | ret (t9) # .. e1 : | ||
91 | |||
92 | .end stxcpy_aligned | ||
93 | |||
94 | .align 3 | ||
95 | .ent __stxcpy | ||
96 | .globl __stxcpy | ||
97 | __stxcpy: | ||
98 | .frame sp, 0, t9 | ||
99 | .prologue 0 | ||
100 | |||
101 | /* Are source and destination co-aligned? */ | ||
102 | xor a0, a1, t0 # e0 : | ||
103 | unop # : | ||
104 | and t0, 7, t0 # e0 : | ||
105 | bne t0, $unaligned # .. e1 : | ||
106 | |||
107 | /* We are co-aligned; take care of a partial first word. */ | ||
108 | ldq_u t1, 0(a1) # e0 : load first src word | ||
109 | and a0, 7, t0 # .. e1 : take care not to load a word ... | ||
110 | addq a1, 8, a1 # e0 : | ||
111 | beq t0, stxcpy_aligned # .. e1 : ... if we won't need it | ||
112 | ldq_u t0, 0(a0) # e0 : | ||
113 | br stxcpy_aligned # .. e1 : | ||
114 | |||
115 | |||
116 | /* The source and destination are not co-aligned. Align the destination | ||
117 | and cope. We have to be very careful about not reading too much and | ||
118 | causing a SEGV. */ | ||
119 | |||
120 | .align 3 | ||
121 | $u_head: | ||
122 | /* We know just enough now to be able to assemble the first | ||
123 | full source word. We can still find a zero at the end of it | ||
124 | that prevents us from outputting the whole thing. | ||
125 | |||
126 | On entry to this basic block: | ||
127 | t0 == the first dest word, for masking back in, if needed else 0 | ||
128 | t1 == the low bits of the first source word | ||
129 | t6 == bytemask that is -1 in dest word bytes */ | ||
130 | |||
131 | ldq_u t2, 8(a1) # e0 : | ||
132 | addq a1, 8, a1 # .. e1 : | ||
133 | |||
134 | extql t1, a1, t1 # e0 : | ||
135 | extqh t2, a1, t4 # e0 : | ||
136 | mskql t0, a0, t0 # e0 : | ||
137 | or t1, t4, t1 # .. e1 : | ||
138 | mskqh t1, a0, t1 # e0 : | ||
139 | or t0, t1, t1 # e1 : | ||
140 | |||
141 | or t1, t6, t6 # e0 : | ||
142 | cmpbge zero, t6, t8 # .. e1 : | ||
143 | lda t6, -1 # e0 : for masking just below | ||
144 | bne t8, $u_final # .. e1 : | ||
145 | |||
146 | mskql t6, a1, t6 # e0 : mask out the bits we have | ||
147 | or t6, t2, t2 # e1 : already extracted before | ||
148 | cmpbge zero, t2, t8 # e0 : testing eos | ||
149 | bne t8, $u_late_head_exit # .. e1 (zdb) | ||
150 | |||
151 | /* Finally, we've got all the stupid leading edge cases taken care | ||
152 | of and we can set up to enter the main loop. */ | ||
153 | |||
154 | stq_u t1, 0(a0) # e0 : store first output word | ||
155 | addq a0, 8, a0 # .. e1 : | ||
156 | extql t2, a1, t0 # e0 : position hi-bits of lo word | ||
157 | ldq_u t2, 8(a1) # .. e1 : read next high-order source word | ||
158 | addq a1, 8, a1 # e0 : | ||
159 | cmpbge zero, t2, t8 # .. e1 : | ||
160 | nop # e0 : | ||
161 | bne t8, $u_eos # .. e1 : | ||
162 | |||
163 | /* Unaligned copy main loop. In order to avoid reading too much, | ||
164 | the loop is structured to detect zeros in aligned source words. | ||
165 | This has, unfortunately, effectively pulled half of a loop | ||
166 | iteration out into the head and half into the tail, but it does | ||
167 | prevent nastiness from accumulating in the very thing we want | ||
168 | to run as fast as possible. | ||
169 | |||
170 | On entry to this basic block: | ||
171 | t0 == the shifted high-order bits from the previous source word | ||
172 | t2 == the unshifted current source word | ||
173 | |||
174 | We further know that t2 does not contain a null terminator. */ | ||
175 | |||
176 | .align 3 | ||
177 | $u_loop: | ||
178 | extqh t2, a1, t1 # e0 : extract high bits for current word | ||
179 | addq a1, 8, a1 # .. e1 : | ||
180 | extql t2, a1, t3 # e0 : extract low bits for next time | ||
181 | addq a0, 8, a0 # .. e1 : | ||
182 | or t0, t1, t1 # e0 : current dst word now complete | ||
183 | ldq_u t2, 0(a1) # .. e1 : load high word for next time | ||
184 | stq_u t1, -8(a0) # e0 : save the current word | ||
185 | mov t3, t0 # .. e1 : | ||
186 | cmpbge zero, t2, t8 # e0 : test new word for eos | ||
187 | beq t8, $u_loop # .. e1 : | ||
188 | |||
189 | /* We've found a zero somewhere in the source word we just read. | ||
190 | If it resides in the lower half, we have one (probably partial) | ||
191 | word to write out, and if it resides in the upper half, we | ||
192 | have one full and one partial word left to write out. | ||
193 | |||
194 | On entry to this basic block: | ||
195 | t0 == the shifted high-order bits from the previous source word | ||
196 | t2 == the unshifted current source word. */ | ||
197 | $u_eos: | ||
198 | extqh t2, a1, t1 # e0 : | ||
199 | or t0, t1, t1 # e1 : first (partial) source word complete | ||
200 | |||
201 | cmpbge zero, t1, t8 # e0 : is the null in this first bit? | ||
202 | bne t8, $u_final # .. e1 (zdb) | ||
203 | |||
204 | $u_late_head_exit: | ||
205 | stq_u t1, 0(a0) # e0 : the null was in the high-order bits | ||
206 | addq a0, 8, a0 # .. e1 : | ||
207 | extql t2, a1, t1 # e0 : | ||
208 | cmpbge zero, t1, t8 # .. e1 : | ||
209 | |||
210 | /* Take care of a final (probably partial) result word. | ||
211 | On entry to this basic block: | ||
212 | t1 == assembled source word | ||
213 | t8 == cmpbge mask that found the null. */ | ||
214 | $u_final: | ||
215 | negq t8, t6 # e0 : isolate low bit set | ||
216 | and t6, t8, t12 # e1 : | ||
217 | |||
218 | and t12, 0x80, t6 # e0 : avoid dest word load if we can | ||
219 | bne t6, 1f # .. e1 (zdb) | ||
220 | |||
221 | ldq_u t0, 0(a0) # e0 : | ||
222 | subq t12, 1, t6 # .. e1 : | ||
223 | or t6, t12, t8 # e0 : | ||
224 | zapnot t1, t6, t1 # .. e1 : kill source bytes >= null | ||
225 | zap t0, t8, t0 # e0 : kill dest bytes <= null | ||
226 | or t0, t1, t1 # e1 : | ||
227 | |||
228 | 1: stq_u t1, 0(a0) # e0 : | ||
229 | ret (t9) # .. e1 : | ||
230 | |||
231 | /* Unaligned copy entry point. */ | ||
232 | .align 3 | ||
233 | $unaligned: | ||
234 | |||
235 | ldq_u t1, 0(a1) # e0 : load first source word | ||
236 | |||
237 | and a0, 7, t4 # .. e1 : find dest misalignment | ||
238 | and a1, 7, t5 # e0 : find src misalignment | ||
239 | |||
240 | /* Conditionally load the first destination word and a bytemask | ||
241 | with 0xff indicating that the destination byte is sacrosanct. */ | ||
242 | |||
243 | mov zero, t0 # .. e1 : | ||
244 | mov zero, t6 # e0 : | ||
245 | beq t4, 1f # .. e1 : | ||
246 | ldq_u t0, 0(a0) # e0 : | ||
247 | lda t6, -1 # .. e1 : | ||
248 | mskql t6, a0, t6 # e0 : | ||
249 | 1: | ||
250 | subq a1, t4, a1 # .. e1 : sub dest misalignment from src addr | ||
251 | |||
252 | /* If source misalignment is larger than dest misalignment, we need | ||
253 | extra startup checks to avoid SEGV. */ | ||
254 | |||
255 | cmplt t4, t5, t12 # e0 : | ||
256 | beq t12, $u_head # .. e1 (zdb) | ||
257 | |||
258 | lda t2, -1 # e1 : mask out leading garbage in source | ||
259 | mskqh t2, t5, t2 # e0 : | ||
260 | nop # e0 : | ||
261 | ornot t1, t2, t3 # .. e1 : | ||
262 | cmpbge zero, t3, t8 # e0 : is there a zero? | ||
263 | beq t8, $u_head # .. e1 (zdb) | ||
264 | |||
265 | /* At this point we've found a zero in the first partial word of | ||
266 | the source. We need to isolate the valid source data and mask | ||
267 | it into the original destination data. (Incidentally, we know | ||
268 | that we'll need at least one byte of that original dest word.) */ | ||
269 | |||
270 | ldq_u t0, 0(a0) # e0 : | ||
271 | |||
272 | negq t8, t6 # .. e1 : build bitmask of bytes <= zero | ||
273 | and t6, t8, t12 # e0 : | ||
274 | and a1, 7, t5 # .. e1 : | ||
275 | subq t12, 1, t6 # e0 : | ||
276 | or t6, t12, t8 # e1 : | ||
277 | srl t12, t5, t12 # e0 : adjust final null return value | ||
278 | |||
279 | zapnot t2, t8, t2 # .. e1 : prepare source word; mirror changes | ||
280 | and t1, t2, t1 # e1 : to source validity mask | ||
281 | extql t2, a1, t2 # .. e0 : | ||
282 | extql t1, a1, t1 # e0 : | ||
283 | |||
284 | andnot t0, t2, t0 # .. e1 : zero place for source to reside | ||
285 | or t0, t1, t1 # e1 : and put it there | ||
286 | stq_u t1, 0(a0) # .. e0 : | ||
287 | ret (t9) # e1 : | ||
288 | |||
289 | .end __stxcpy | ||
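
The $u_loop above is the classic shift-and-merge unaligned copy: each aligned source quadword is split with extql/extqh into a piece that completes the current destination word and a carry for the next one, so the zero test always runs on whole aligned words. A rough little-endian C analogue, assuming a byte shift of 1 to 7 (the co-aligned case is handled separately, as in the code above):

#include <stdint.h>
#include <stddef.h>

/* Illustrative shift-and-merge copy of nwords destination quadwords when
   the source data starts 'shift' bytes (1..7) into its first aligned
   quadword.  src[] must expose nwords + 1 aligned quadwords, mirroring the
   one word of lookahead the assembly keeps in flight. */
static void merge_copy(uint64_t *dst, const uint64_t *src,
                       unsigned shift, size_t nwords)
{
        unsigned s = 8 * shift;
        uint64_t carry = src[0] >> s;                   /* roughly extql */

        for (size_t i = 0; i < nwords; i++) {
                uint64_t cur = src[i + 1];
                dst[i] = carry | (cur << (64 - s));     /* extqh + or */
                carry = cur >> s;
        }
}
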
diff --git a/arch/alpha/lib/stxncpy.S b/arch/alpha/lib/stxncpy.S new file mode 100644 index 000000000000..da1a72740d29 --- /dev/null +++ b/arch/alpha/lib/stxncpy.S | |||
@@ -0,0 +1,345 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/stxncpy.S | ||
3 | * Contributed by Richard Henderson (rth@tamu.edu) | ||
4 | * | ||
5 | * Copy no more than COUNT bytes of the null-terminated string from | ||
6 | * SRC to DST. | ||
7 | * | ||
8 | * This is an internal routine used by strncpy, stpncpy, and strncat. | ||
9 | * As such, it uses special linkage conventions to make implementation | ||
10 | * of these public functions more efficient. | ||
11 | * | ||
12 | * On input: | ||
13 | * t9 = return address | ||
14 | * a0 = DST | ||
15 | * a1 = SRC | ||
16 | * a2 = COUNT | ||
17 | * | ||
18 | * Furthermore, COUNT may not be zero. | ||
19 | * | ||
20 | * On output: | ||
21 | * t0 = last word written | ||
22 | * t10 = bitmask (with one bit set) indicating the byte position of | ||
23 | * the end of the range specified by COUNT | ||
24 | * t12 = bitmask (with one bit set) indicating the last byte written | ||
25 | * a0 = unaligned address of the last *word* written | ||
26 | * a2 = the number of full words left in COUNT | ||
27 | * | ||
28 | * Furthermore, v0, a3-a5, t11, and $at are untouched. | ||
29 | */ | ||
30 | |||
31 | #include <asm/regdef.h> | ||
32 | |||
33 | .set noat | ||
34 | .set noreorder | ||
35 | |||
36 | .text | ||
37 | |||
38 | /* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that | ||
39 | doesn't like putting the entry point for a procedure somewhere in the | ||
40 | middle of the procedure descriptor. Work around this by putting the | ||
41 | aligned copy in its own procedure descriptor */ | ||
42 | |||
43 | .ent stxncpy_aligned | ||
44 | .align 3 | ||
45 | stxncpy_aligned: | ||
46 | .frame sp, 0, t9, 0 | ||
47 | .prologue 0 | ||
48 | |||
49 | /* On entry to this basic block: | ||
50 | t0 == the first destination word for masking back in | ||
51 | t1 == the first source word. */ | ||
52 | |||
53 | /* Create the 1st output word and detect 0's in the 1st input word. */ | ||
54 | lda t2, -1 # e1 : build a mask against false zero | ||
55 | mskqh t2, a1, t2 # e0 : detection in the src word | ||
56 | mskqh t1, a1, t3 # e0 : | ||
57 | ornot t1, t2, t2 # .. e1 : | ||
58 | mskql t0, a1, t0 # e0 : assemble the first output word | ||
59 | cmpbge zero, t2, t8 # .. e1 : bits set iff null found | ||
60 | or t0, t3, t0 # e0 : | ||
61 | beq a2, $a_eoc # .. e1 : | ||
62 | bne t8, $a_eos # .. e1 : | ||
63 | |||
64 | /* On entry to this basic block: | ||
65 | t0 == a source word not containing a null. */ | ||
66 | |||
67 | $a_loop: | ||
68 | stq_u t0, 0(a0) # e0 : | ||
69 | addq a0, 8, a0 # .. e1 : | ||
70 | ldq_u t0, 0(a1) # e0 : | ||
71 | addq a1, 8, a1 # .. e1 : | ||
72 | subq a2, 1, a2 # e0 : | ||
73 | cmpbge zero, t0, t8 # .. e1 (stall) | ||
74 | beq a2, $a_eoc # e1 : | ||
75 | beq t8, $a_loop # e1 : | ||
76 | |||
77 | /* Take care of the final (partial) word store. At this point | ||
78 | the end-of-count bit is set in t8 iff it applies. | ||
79 | |||
80 | On entry to this basic block we have: | ||
81 | t0 == the source word containing the null | ||
82 | t8 == the cmpbge mask that found it. */ | ||
83 | |||
84 | $a_eos: | ||
85 | negq t8, t12 # e0 : find low bit set | ||
86 | and t8, t12, t12 # e1 (stall) | ||
87 | |||
88 | /* For the sake of the cache, don't read a destination word | ||
89 | if we're not going to need it. */ | ||
90 | and t12, 0x80, t6 # e0 : | ||
91 | bne t6, 1f # .. e1 (zdb) | ||
92 | |||
93 | /* We're doing a partial word store and so need to combine | ||
94 | our source and original destination words. */ | ||
95 | ldq_u t1, 0(a0) # e0 : | ||
96 | subq t12, 1, t6 # .. e1 : | ||
97 | or t12, t6, t8 # e0 : | ||
98 | unop # | ||
99 | zapnot t0, t8, t0 # e0 : clear src bytes > null | ||
100 | zap t1, t8, t1 # .. e1 : clear dst bytes <= null | ||
101 | or t0, t1, t0 # e1 : | ||
102 | |||
103 | 1: stq_u t0, 0(a0) # e0 : | ||
104 | ret (t9) # e1 : | ||
105 | |||
106 | /* Add the end-of-count bit to the eos detection bitmask. */ | ||
107 | $a_eoc: | ||
108 | or t10, t8, t8 | ||
109 | br $a_eos | ||
110 | |||
111 | .end stxncpy_aligned | ||
112 | |||
113 | .align 3 | ||
114 | .ent __stxncpy | ||
115 | .globl __stxncpy | ||
116 | __stxncpy: | ||
117 | .frame sp, 0, t9, 0 | ||
118 | .prologue 0 | ||
119 | |||
120 | /* Are source and destination co-aligned? */ | ||
121 | xor a0, a1, t1 # e0 : | ||
122 | and a0, 7, t0 # .. e1 : find dest misalignment | ||
123 | and t1, 7, t1 # e0 : | ||
124 | addq a2, t0, a2 # .. e1 : bias count by dest misalignment | ||
125 | subq a2, 1, a2 # e0 : | ||
126 | and a2, 7, t2 # e1 : | ||
127 | srl a2, 3, a2 # e0 : a2 = loop counter = (count - 1)/8 | ||
128 | addq zero, 1, t10 # .. e1 : | ||
129 | sll t10, t2, t10 # e0 : t10 = bitmask of last count byte | ||
130 | bne t1, $unaligned # .. e1 : | ||
131 | |||
132 | /* We are co-aligned; take care of a partial first word. */ | ||
133 | |||
134 | ldq_u t1, 0(a1) # e0 : load first src word | ||
135 | addq a1, 8, a1 # .. e1 : | ||
136 | |||
137 | beq t0, stxncpy_aligned # avoid loading dest word if not needed | ||
138 | ldq_u t0, 0(a0) # e0 : | ||
139 | br stxncpy_aligned # .. e1 : | ||
140 | |||
141 | |||
142 | /* The source and destination are not co-aligned. Align the destination | ||
143 | and cope. We have to be very careful about not reading too much and | ||
144 | causing a SEGV. */ | ||
145 | |||
146 | .align 3 | ||
147 | $u_head: | ||
148 | /* We know just enough now to be able to assemble the first | ||
149 | full source word. We can still find a zero at the end of it | ||
150 | that prevents us from outputting the whole thing. | ||
151 | |||
152 | On entry to this basic block: | ||
153 | t0 == the first dest word, unmasked | ||
154 | t1 == the shifted low bits of the first source word | ||
155 | t6 == bytemask that is -1 in dest word bytes */ | ||
156 | |||
157 | ldq_u t2, 8(a1) # e0 : load second src word | ||
158 | addq a1, 8, a1 # .. e1 : | ||
159 | mskql t0, a0, t0 # e0 : mask trailing garbage in dst | ||
160 | extqh t2, a1, t4 # e0 : | ||
161 | or t1, t4, t1 # e1 : first aligned src word complete | ||
162 | mskqh t1, a0, t1 # e0 : mask leading garbage in src | ||
163 | or t0, t1, t0 # e0 : first output word complete | ||
164 | or t0, t6, t6 # e1 : mask original data for zero test | ||
165 | cmpbge zero, t6, t8 # e0 : | ||
166 | beq a2, $u_eocfin # .. e1 : | ||
167 | lda t6, -1 # e0 : | ||
168 | bne t8, $u_final # .. e1 : | ||
169 | |||
170 | mskql t6, a1, t6 # e0 : mask out bits already seen | ||
171 | nop # .. e1 : | ||
172 | stq_u t0, 0(a0) # e0 : store first output word | ||
173 | or t6, t2, t2 # .. e1 : | ||
174 | cmpbge zero, t2, t8 # e0 : find nulls in second partial | ||
175 | addq a0, 8, a0 # .. e1 : | ||
176 | subq a2, 1, a2 # e0 : | ||
177 | bne t8, $u_late_head_exit # .. e1 : | ||
178 | |||
179 | /* Finally, we've got all the stupid leading edge cases taken care | ||
180 | of and we can set up to enter the main loop. */ | ||
181 | |||
182 | extql t2, a1, t1 # e0 : position hi-bits of lo word | ||
183 | beq a2, $u_eoc # .. e1 : | ||
184 | ldq_u t2, 8(a1) # e0 : read next high-order source word | ||
185 | addq a1, 8, a1 # .. e1 : | ||
186 | extqh t2, a1, t0 # e0 : position lo-bits of hi word (stall) | ||
187 | cmpbge zero, t2, t8 # .. e1 : | ||
188 | nop # e0 : | ||
189 | bne t8, $u_eos # .. e1 : | ||
190 | |||
191 | /* Unaligned copy main loop. In order to avoid reading too much, | ||
192 | the loop is structured to detect zeros in aligned source words. | ||
193 | This has, unfortunately, effectively pulled half of a loop | ||
194 | iteration out into the head and half into the tail, but it does | ||
195 | prevent nastiness from accumulating in the very thing we want | ||
196 | to run as fast as possible. | ||
197 | |||
198 | On entry to this basic block: | ||
199 | t0 == the shifted low-order bits from the current source word | ||
200 | t1 == the shifted high-order bits from the previous source word | ||
201 | t2 == the unshifted current source word | ||
202 | |||
203 | We further know that t2 does not contain a null terminator. */ | ||
204 | |||
205 | .align 3 | ||
206 | $u_loop: | ||
207 | or t0, t1, t0 # e0 : current dst word now complete | ||
208 | subq a2, 1, a2 # .. e1 : decrement word count | ||
209 | stq_u t0, 0(a0) # e0 : save the current word | ||
210 | addq a0, 8, a0 # .. e1 : | ||
211 | extql t2, a1, t1 # e0 : extract high bits for next time | ||
212 | beq a2, $u_eoc # .. e1 : | ||
213 | ldq_u t2, 8(a1) # e0 : load high word for next time | ||
214 | addq a1, 8, a1 # .. e1 : | ||
215 | nop # e0 : | ||
216 | cmpbge zero, t2, t8 # e1 : test new word for eos (stall) | ||
217 | extqh t2, a1, t0 # e0 : extract low bits for current word | ||
218 | beq t8, $u_loop # .. e1 : | ||
219 | |||
220 | /* We've found a zero somewhere in the source word we just read. | ||
221 | If it resides in the lower half, we have one (probably partial) | ||
222 | word to write out, and if it resides in the upper half, we | ||
223 | have one full and one partial word left to write out. | ||
224 | |||
225 | On entry to this basic block: | ||
226 | t0 == the shifted low-order bits from the current source word | ||
227 | t1 == the shifted high-order bits from the previous source word | ||
228 | t2 == the unshifted current source word. */ | ||
229 | $u_eos: | ||
230 | or t0, t1, t0 # e0 : first (partial) source word complete | ||
231 | nop # .. e1 : | ||
232 | cmpbge zero, t0, t8 # e0 : is the null in this first bit? | ||
233 | bne t8, $u_final # .. e1 (zdb) | ||
234 | |||
235 | stq_u t0, 0(a0) # e0 : the null was in the high-order bits | ||
236 | addq a0, 8, a0 # .. e1 : | ||
237 | subq a2, 1, a2 # e1 : | ||
238 | |||
239 | $u_late_head_exit: | ||
240 | extql t2, a1, t0 # .. e0 : | ||
241 | cmpbge zero, t0, t8 # e0 : | ||
242 | or t8, t10, t6 # e1 : | ||
243 | cmoveq a2, t6, t8 # e0 : | ||
244 | nop # .. e1 : | ||
245 | |||
246 | /* Take care of a final (probably partial) result word. | ||
247 | On entry to this basic block: | ||
248 | t0 == assembled source word | ||
249 | t8 == cmpbge mask that found the null. */ | ||
250 | $u_final: | ||
251 | negq t8, t6 # e0 : isolate low bit set | ||
252 | and t6, t8, t12 # e1 : | ||
253 | |||
254 | and t12, 0x80, t6 # e0 : avoid dest word load if we can | ||
255 | bne t6, 1f # .. e1 (zdb) | ||
256 | |||
257 | ldq_u t1, 0(a0) # e0 : | ||
258 | subq t12, 1, t6 # .. e1 : | ||
259 | or t6, t12, t8 # e0 : | ||
260 | zapnot t0, t8, t0 # .. e1 : kill source bytes > null | ||
261 | zap t1, t8, t1 # e0 : kill dest bytes <= null | ||
262 | or t0, t1, t0 # e1 : | ||
263 | |||
264 | 1: stq_u t0, 0(a0) # e0 : | ||
265 | ret (t9) # .. e1 : | ||
266 | |||
267 | /* Got to end-of-count before end of string. | ||
268 | On entry to this basic block: | ||
269 | t1 == the shifted high-order bits from the previous source word */ | ||
270 | $u_eoc: | ||
271 | and a1, 7, t6 # e1 : | ||
272 | sll t10, t6, t6 # e0 : | ||
273 | and t6, 0xff, t6 # e0 : | ||
274 | bne t6, 1f # .. e1 : | ||
275 | |||
276 | ldq_u t2, 8(a1) # e0 : load final src word | ||
277 | nop # .. e1 : | ||
278 | extqh t2, a1, t0 # e0 : extract low bits for last word | ||
279 | or t1, t0, t1 # e1 : | ||
280 | |||
281 | 1: cmpbge zero, t1, t8 | ||
282 | mov t1, t0 | ||
283 | |||
284 | $u_eocfin: # end-of-count, final word | ||
285 | or t10, t8, t8 | ||
286 | br $u_final | ||
287 | |||
288 | /* Unaligned copy entry point. */ | ||
289 | .align 3 | ||
290 | $unaligned: | ||
291 | |||
292 | ldq_u t1, 0(a1) # e0 : load first source word | ||
293 | |||
294 | and a0, 7, t4 # .. e1 : find dest misalignment | ||
295 | and a1, 7, t5 # e0 : find src misalignment | ||
296 | |||
297 | /* Conditionally load the first destination word and a bytemask | ||
298 | with 0xff indicating that the destination byte is sacrosanct. */ | ||
299 | |||
300 | mov zero, t0 # .. e1 : | ||
301 | mov zero, t6 # e0 : | ||
302 | beq t4, 1f # .. e1 : | ||
303 | ldq_u t0, 0(a0) # e0 : | ||
304 | lda t6, -1 # .. e1 : | ||
305 | mskql t6, a0, t6 # e0 : | ||
306 | subq a1, t4, a1 # .. e1 : sub dest misalignment from src addr | ||
307 | |||
308 | /* If source misalignment is larger than dest misalignment, we need | ||
309 | extra startup checks to avoid SEGV. */ | ||
310 | |||
311 | 1: cmplt t4, t5, t12 # e1 : | ||
312 | extql t1, a1, t1 # .. e0 : shift src into place | ||
313 | lda t2, -1 # e0 : for creating masks later | ||
314 | beq t12, $u_head # .. e1 : | ||
315 | |||
316 | extql t2, a1, t2 # e0 : | ||
317 | cmpbge zero, t1, t8 # .. e1 : is there a zero? | ||
318 | andnot t2, t6, t12 # e0 : dest mask for a single word copy | ||
319 | or t8, t10, t5 # .. e1 : test for end-of-count too | ||
320 | cmpbge zero, t12, t3 # e0 : | ||
321 | cmoveq a2, t5, t8 # .. e1 : | ||
322 | andnot t8, t3, t8 # e0 : | ||
323 | beq t8, $u_head # .. e1 (zdb) | ||
324 | |||
325 | /* At this point we've found a zero in the first partial word of | ||
326 | the source. We need to isolate the valid source data and mask | ||
327 | it into the original destination data. (Incidentally, we know | ||
328 | that we'll need at least one byte of that original dest word.) */ | ||
329 | |||
330 | ldq_u t0, 0(a0) # e0 : | ||
331 | negq t8, t6 # .. e1 : build bitmask of bytes <= zero | ||
332 | mskqh t1, t4, t1 # e0 : | ||
333 | and t6, t8, t2 # .. e1 : | ||
334 | subq t2, 1, t6 # e0 : | ||
335 | or t6, t2, t8 # e1 : | ||
336 | |||
337 | zapnot t12, t8, t12 # e0 : prepare source word; mirror changes | ||
338 | zapnot t1, t8, t1 # .. e1 : to source validity mask | ||
339 | |||
340 | andnot t0, t12, t0 # e0 : zero place for source to reside | ||
341 | or t0, t1, t0 # e1 : and put it there | ||
342 | stq_u t0, 0(a0) # e0 : | ||
343 | ret (t9) # .. e1 : | ||
344 | |||
345 | .end __stxncpy | ||
diff --git a/arch/alpha/lib/udelay.c b/arch/alpha/lib/udelay.c new file mode 100644 index 000000000000..1c879bbce419 --- /dev/null +++ b/arch/alpha/lib/udelay.c | |||
@@ -0,0 +1,55 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1993, 2000 Linus Torvalds | ||
3 | * | ||
4 | * Delay routines, using a pre-computed "loops_per_jiffy" value. | ||
5 | */ | ||
6 | |||
7 | #include <linux/config.h> | ||
8 | #include <linux/module.h> | ||
9 | #include <linux/sched.h> /* for udelay's use of smp_processor_id */ | ||
10 | #include <asm/param.h> | ||
11 | #include <asm/smp.h> | ||
12 | #include <linux/delay.h> | ||
13 | |||
14 | /* | ||
15 | * Use only for very small delays (< 1 msec). | ||
16 | * | ||
17 | * The active part of our cycle counter is only 32 bits wide, and | ||
18 | * we're treating the difference between two marks as signed. On | ||
19 | * a 1GHz box, that's about 2 seconds. | ||
20 | */ | ||
21 | |||
22 | void | ||
23 | __delay(int loops) | ||
24 | { | ||
25 | int tmp; | ||
26 | __asm__ __volatile__( | ||
27 | " rpcc %0\n" | ||
28 | " addl %1,%0,%1\n" | ||
29 | "1: rpcc %0\n" | ||
30 | " subl %1,%0,%0\n" | ||
31 | " bgt %0,1b" | ||
32 | : "=&r" (tmp), "=r" (loops) : "1"(loops)); | ||
33 | } | ||
34 | |||
35 | #ifdef CONFIG_SMP | ||
36 | #define LPJ cpu_data[smp_processor_id()].loops_per_jiffy | ||
37 | #else | ||
38 | #define LPJ loops_per_jiffy | ||
39 | #endif | ||
40 | |||
41 | void | ||
42 | udelay(unsigned long usecs) | ||
43 | { | ||
44 | usecs *= (((unsigned long)HZ << 32) / 1000000) * LPJ; | ||
45 | __delay((long)usecs >> 32); | ||
46 | } | ||
47 | EXPORT_SYMBOL(udelay); | ||
48 | |||
49 | void | ||
50 | ndelay(unsigned long nsecs) | ||
51 | { | ||
52 | nsecs *= (((unsigned long)HZ << 32) / 1000000000) * LPJ; | ||
53 | __delay((long)nsecs >> 32); | ||
54 | } | ||
55 | EXPORT_SYMBOL(ndelay); | ||
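
The multiplications in udelay()/ndelay() are 32.32 fixed point: (HZ << 32)/10^6 is "jiffies per microsecond" scaled by 2^32, so after multiplying by loops_per_jiffy and the requested delay, shifting right by 32 leaves the loop count for __delay() with no run-time division. A quick check of that arithmetic with assumed values (HZ and loops_per_jiffy below are made up for the example):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const uint64_t HZ = 1024, lpj = 500000, usecs = 100;

        /* Fixed-point path, as in udelay() above.  The result can come out
           a few loops short because the scale factor is truncated. */
        uint64_t scaled = usecs * (((HZ << 32) / 1000000) * lpj);
        uint64_t loops  = scaled >> 32;

        /* Straightforward reference that uses a division. */
        uint64_t ref = usecs * lpj * HZ / 1000000;

        printf("fixed point: %llu loops, reference: %llu loops\n",
               (unsigned long long)loops, (unsigned long long)ref);
        return 0;
}
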