Diffstat (limited to 'arch/alpha/lib')
-rw-r--r--  arch/alpha/lib/Makefile | 58
-rw-r--r--  arch/alpha/lib/callback_srm.S | 104
-rw-r--r--  arch/alpha/lib/checksum.c | 186
-rw-r--r--  arch/alpha/lib/clear_page.S | 39
-rw-r--r--  arch/alpha/lib/clear_user.S | 113
-rw-r--r--  arch/alpha/lib/copy_page.S | 49
-rw-r--r--  arch/alpha/lib/copy_user.S | 145
-rw-r--r--  arch/alpha/lib/csum_ipv6_magic.S | 92
-rw-r--r--  arch/alpha/lib/csum_partial_copy.c | 391
-rw-r--r--  arch/alpha/lib/dbg_current.S | 29
-rw-r--r--  arch/alpha/lib/dbg_stackcheck.S | 27
-rw-r--r--  arch/alpha/lib/dbg_stackkill.S | 35
-rw-r--r--  arch/alpha/lib/dec_and_lock.c | 42
-rw-r--r--  arch/alpha/lib/divide.S | 195
-rw-r--r--  arch/alpha/lib/ev6-clear_page.S | 54
-rw-r--r--  arch/alpha/lib/ev6-clear_user.S | 225
-rw-r--r--  arch/alpha/lib/ev6-copy_page.S | 203
-rw-r--r--  arch/alpha/lib/ev6-copy_user.S | 259
-rw-r--r--  arch/alpha/lib/ev6-csum_ipv6_magic.S | 126
-rw-r--r--  arch/alpha/lib/ev6-divide.S | 259
-rw-r--r--  arch/alpha/lib/ev6-memchr.S | 191
-rw-r--r--  arch/alpha/lib/ev6-memcpy.S | 248
-rw-r--r--  arch/alpha/lib/ev6-memset.S | 597
-rw-r--r--  arch/alpha/lib/ev6-strncpy_from_user.S | 424
-rw-r--r--  arch/alpha/lib/ev6-stxcpy.S | 321
-rw-r--r--  arch/alpha/lib/ev6-stxncpy.S | 397
-rw-r--r--  arch/alpha/lib/ev67-strcat.S | 54
-rw-r--r--  arch/alpha/lib/ev67-strchr.S | 88
-rw-r--r--  arch/alpha/lib/ev67-strlen.S | 49
-rw-r--r--  arch/alpha/lib/ev67-strlen_user.S | 107
-rw-r--r--  arch/alpha/lib/ev67-strncat.S | 94
-rw-r--r--  arch/alpha/lib/ev67-strrchr.S | 109
-rw-r--r--  arch/alpha/lib/fpreg.c | 193
-rw-r--r--  arch/alpha/lib/memchr.S | 164
-rw-r--r--  arch/alpha/lib/memcpy.c | 163
-rw-r--r--  arch/alpha/lib/memmove.S | 181
-rw-r--r--  arch/alpha/lib/memset.S | 124
-rw-r--r--  arch/alpha/lib/srm_printk.c | 41
-rw-r--r--  arch/alpha/lib/srm_puts.c | 23
-rw-r--r--  arch/alpha/lib/stacktrace.c | 103
-rw-r--r--  arch/alpha/lib/strcasecmp.c | 26
-rw-r--r--  arch/alpha/lib/strcat.S | 52
-rw-r--r--  arch/alpha/lib/strchr.S | 70
-rw-r--r--  arch/alpha/lib/strcpy.S | 23
-rw-r--r--  arch/alpha/lib/strlen.S | 57
-rw-r--r--  arch/alpha/lib/strlen_user.S | 91
-rw-r--r--  arch/alpha/lib/strncat.S | 84
-rw-r--r--  arch/alpha/lib/strncpy.S | 81
-rw-r--r--  arch/alpha/lib/strncpy_from_user.S | 339
-rw-r--r--  arch/alpha/lib/strrchr.S | 87
-rw-r--r--  arch/alpha/lib/stxcpy.S | 289
-rw-r--r--  arch/alpha/lib/stxncpy.S | 345
-rw-r--r--  arch/alpha/lib/udelay.c | 55
53 files changed, 7901 insertions, 0 deletions
diff --git a/arch/alpha/lib/Makefile b/arch/alpha/lib/Makefile
new file mode 100644
index 000000000000..21cf624d7329
--- /dev/null
+++ b/arch/alpha/lib/Makefile
@@ -0,0 +1,58 @@
1#
2# Makefile for alpha-specific library files..
3#
4
5EXTRA_AFLAGS := $(CFLAGS)
6EXTRA_CFLAGS := -Werror
7
8# Many of these routines have implementations tuned for ev6.
9# Choose them iff we're targeting ev6 specifically.
10ev6-$(CONFIG_ALPHA_EV6) := ev6-
11
12# Several make use of the cttz instruction introduced in ev67.
13ev67-$(CONFIG_ALPHA_EV67) := ev67-
14
15lib-y = __divqu.o __remqu.o __divlu.o __remlu.o \
16 udelay.o \
17 $(ev6-y)memset.o \
18 $(ev6-y)memcpy.o \
19 memmove.o \
20 checksum.o \
21 csum_partial_copy.o \
22 $(ev67-y)strlen.o \
23 $(ev67-y)strcat.o \
24 strcpy.o \
25 $(ev67-y)strncat.o \
26 strncpy.o \
27 $(ev6-y)stxcpy.o \
28 $(ev6-y)stxncpy.o \
29 $(ev67-y)strchr.o \
30 $(ev67-y)strrchr.o \
31 $(ev6-y)memchr.o \
32 $(ev6-y)copy_user.o \
33 $(ev6-y)clear_user.o \
34 $(ev6-y)strncpy_from_user.o \
35 $(ev67-y)strlen_user.o \
36 $(ev6-y)csum_ipv6_magic.o \
37 $(ev6-y)clear_page.o \
38 $(ev6-y)copy_page.o \
39 strcasecmp.o \
40 fpreg.o \
41 callback_srm.o srm_puts.o srm_printk.o
42
43lib-$(CONFIG_SMP) += dec_and_lock.o
44
45# The division routines are built from single source, with different defines.
46AFLAGS___divqu.o = -DDIV
47AFLAGS___remqu.o = -DREM
48AFLAGS___divlu.o = -DDIV -DINTSIZE
49AFLAGS___remlu.o = -DREM -DINTSIZE
50
51$(obj)/__divqu.o: $(obj)/$(ev6-y)divide.S
52 $(cmd_as_o_S)
53$(obj)/__remqu.o: $(obj)/$(ev6-y)divide.S
54 $(cmd_as_o_S)
55$(obj)/__divlu.o: $(obj)/$(ev6-y)divide.S
56 $(cmd_as_o_S)
57$(obj)/__remlu.o: $(obj)/$(ev6-y)divide.S
58 $(cmd_as_o_S)
diff --git a/arch/alpha/lib/callback_srm.S b/arch/alpha/lib/callback_srm.S
new file mode 100644
index 000000000000..0528acd0d9ad
--- /dev/null
+++ b/arch/alpha/lib/callback_srm.S
@@ -0,0 +1,104 @@
1/*
2 * arch/alpha/lib/callback_srm.S
3 */
4
5#include <linux/config.h>
6#include <asm/console.h>
7
8.text
9#define HWRPB_CRB_OFFSET 0xc0
10
11#if defined(CONFIG_ALPHA_SRM) || defined(CONFIG_ALPHA_GENERIC)
12.align 4
13srm_dispatch:
14#if defined(CONFIG_ALPHA_GENERIC)
15 ldl $4,alpha_using_srm
16 beq $4,nosrm
17#endif
18 ldq $0,hwrpb # gp is set up by CALLBACK macro.
19 ldl $25,0($25) # Pick up the wrapper data.
20 mov $20,$21 # Shift arguments right.
21 mov $19,$20
22 ldq $1,HWRPB_CRB_OFFSET($0)
23 mov $18,$19
24 mov $17,$18
25 mov $16,$17
26 addq $0,$1,$2 # CRB address
27 ldq $27,0($2) # DISPATCH procedure descriptor (VMS call std)
28 extwl $25,0,$16 # SRM callback function code
29 ldq $3,8($27) # call address
30 extwl $25,2,$25 # argument information (VMS calling std)
31 jmp ($3) # Return directly to caller of wrapper.
32
33.align 4
34.globl srm_fixup
35.ent srm_fixup
36srm_fixup:
37 ldgp $29,0($27)
38#if defined(CONFIG_ALPHA_GENERIC)
39 ldl $4,alpha_using_srm
40 beq $4,nosrm
41#endif
42 ldq $0,hwrpb
43 ldq $1,HWRPB_CRB_OFFSET($0)
44 addq $0,$1,$2 # CRB address
45 ldq $27,16($2) # VA of FIXUP procedure descriptor
46 ldq $3,8($27) # call address
47 lda $25,2($31) # two integer arguments
48 jmp ($3) # Return directly to caller of srm_fixup.
49.end srm_fixup
50
51#if defined(CONFIG_ALPHA_GENERIC)
52.align 3
53nosrm:
54 lda $0,-1($31)
55 ret
56#endif
57
58#define CALLBACK(NAME, CODE, ARG_CNT) \
59.align 4; .globl callback_##NAME; .ent callback_##NAME; callback_##NAME##: \
60ldgp $29,0($27); br $25,srm_dispatch; .word CODE, ARG_CNT; .end callback_##NAME
61
62#else /* defined(CONFIG_ALPHA_SRM) || defined(CONFIG_ALPHA_GENERIC) */
63
64#define CALLBACK(NAME, CODE, ARG_CNT) \
65.align 3; .globl callback_##NAME; .ent callback_##NAME; callback_##NAME##: \
66lda $0,-1($31); ret; .end callback_##NAME
67
68.align 3
69.globl srm_fixup
70.ent srm_fixup
71srm_fixup:
72 lda $0,-1($31)
73 ret
74.end srm_fixup
75#endif /* defined(CONFIG_ALPHA_SRM) || defined(CONFIG_ALPHA_GENERIC) */
76
77CALLBACK(puts, CCB_PUTS, 4)
78CALLBACK(open, CCB_OPEN, 3)
79CALLBACK(close, CCB_CLOSE, 2)
80CALLBACK(read, CCB_READ, 5)
81CALLBACK(open_console, CCB_OPEN_CONSOLE, 1)
82CALLBACK(close_console, CCB_CLOSE_CONSOLE, 1)
83CALLBACK(getenv, CCB_GET_ENV, 4)
84CALLBACK(setenv, CCB_SET_ENV, 4)
85CALLBACK(getc, CCB_GETC, 2)
86CALLBACK(reset_term, CCB_RESET_TERM, 2)
87CALLBACK(term_int, CCB_SET_TERM_INT, 3)
88CALLBACK(term_ctl, CCB_SET_TERM_CTL, 3)
89CALLBACK(process_keycode, CCB_PROCESS_KEYCODE, 3)
90CALLBACK(ioctl, CCB_IOCTL, 6)
91CALLBACK(write, CCB_WRITE, 5)
92CALLBACK(reset_env, CCB_RESET_ENV, 4)
93CALLBACK(save_env, CCB_SAVE_ENV, 1)
94CALLBACK(pswitch, CCB_PSWITCH, 3)
95CALLBACK(bios_emul, CCB_BIOS_EMUL, 5)
96
97.data
98__alpha_using_srm: # For use by bootpheader
99 .long 7 # value is not 1 for link debugging
100 .weak alpha_using_srm; alpha_using_srm = __alpha_using_srm
101__callback_init_done: # For use by bootpheader
102 .long 7 # value is not 1 for link debugging
103 .weak callback_init_done; callback_init_done = __callback_init_done
104
diff --git a/arch/alpha/lib/checksum.c b/arch/alpha/lib/checksum.c
new file mode 100644
index 000000000000..89044e6385fe
--- /dev/null
+++ b/arch/alpha/lib/checksum.c
@@ -0,0 +1,186 @@
1/*
2 * arch/alpha/lib/checksum.c
3 *
4 * This file contains network checksum routines that are better done
5 * in an architecture-specific manner due to speed..
6 * Comments in other versions indicate that the algorithms are from RFC1071
7 *
 8 * accelerated versions (and 21264 assembly versions) contributed by
9 * Rick Gorton <rick.gorton@alpha-processor.com>
10 */
11
12#include <linux/module.h>
13#include <linux/string.h>
14
15#include <asm/byteorder.h>
16
17static inline unsigned short from64to16(unsigned long x)
18{
19 /* Using extract instructions is a bit more efficient
20 than the original shift/bitmask version. */
21
22 union {
23 unsigned long ul;
24 unsigned int ui[2];
25 unsigned short us[4];
26 } in_v, tmp_v, out_v;
27
28 in_v.ul = x;
29 tmp_v.ul = (unsigned long) in_v.ui[0] + (unsigned long) in_v.ui[1];
30
31 /* Since the bits of tmp_v.sh[3] are going to always be zero,
32 we don't have to bother to add that in. */
33 out_v.ul = (unsigned long) tmp_v.us[0] + (unsigned long) tmp_v.us[1]
34 + (unsigned long) tmp_v.us[2];
35
36 /* Similarly, out_v.us[2] is always zero for the final add. */
37 return out_v.us[0] + out_v.us[1];
38}
39
40/*
41 * computes the checksum of the TCP/UDP pseudo-header
42 * returns a 16-bit checksum, already complemented.
43 */
44unsigned short int csum_tcpudp_magic(unsigned long saddr,
45 unsigned long daddr,
46 unsigned short len,
47 unsigned short proto,
48 unsigned int sum)
49{
50 return ~from64to16(saddr + daddr + sum +
51 ((unsigned long) ntohs(len) << 16) +
52 ((unsigned long) proto << 8));
53}
54
55unsigned int csum_tcpudp_nofold(unsigned long saddr,
56 unsigned long daddr,
57 unsigned short len,
58 unsigned short proto,
59 unsigned int sum)
60{
61 unsigned long result;
62
63 result = (saddr + daddr + sum +
64 ((unsigned long) ntohs(len) << 16) +
65 ((unsigned long) proto << 8));
66
67 /* Fold down to 32-bits so we don't lose in the typedef-less
68 network stack. */
69 /* 64 to 33 */
70 result = (result & 0xffffffff) + (result >> 32);
71 /* 33 to 32 */
72 result = (result & 0xffffffff) + (result >> 32);
73 return result;
74}
75
76/*
77 * Do a 64-bit checksum on an arbitrary memory area..
78 *
79 * This isn't a great routine, but it's not _horrible_ either. The
80 * inner loop could be unrolled a bit further, and there are better
81 * ways to do the carry, but this is reasonable.
82 */
83static inline unsigned long do_csum(const unsigned char * buff, int len)
84{
85 int odd, count;
86 unsigned long result = 0;
87
88 if (len <= 0)
89 goto out;
90 odd = 1 & (unsigned long) buff;
91 if (odd) {
92 result = *buff << 8;
93 len--;
94 buff++;
95 }
96 count = len >> 1; /* nr of 16-bit words.. */
97 if (count) {
98 if (2 & (unsigned long) buff) {
99 result += *(unsigned short *) buff;
100 count--;
101 len -= 2;
102 buff += 2;
103 }
104 count >>= 1; /* nr of 32-bit words.. */
105 if (count) {
106 if (4 & (unsigned long) buff) {
107 result += *(unsigned int *) buff;
108 count--;
109 len -= 4;
110 buff += 4;
111 }
112 count >>= 1; /* nr of 64-bit words.. */
113 if (count) {
114 unsigned long carry = 0;
115 do {
116 unsigned long w = *(unsigned long *) buff;
117 count--;
118 buff += 8;
119 result += carry;
120 result += w;
121 carry = (w > result);
122 } while (count);
123 result += carry;
124 result = (result & 0xffffffff) + (result >> 32);
125 }
126 if (len & 4) {
127 result += *(unsigned int *) buff;
128 buff += 4;
129 }
130 }
131 if (len & 2) {
132 result += *(unsigned short *) buff;
133 buff += 2;
134 }
135 }
136 if (len & 1)
137 result += *buff;
138 result = from64to16(result);
139 if (odd)
140 result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
141out:
142 return result;
143}
144
145/*
146 * This is a version of ip_compute_csum() optimized for IP headers,
147 * which always checksum on 4 octet boundaries.
148 */
149unsigned short ip_fast_csum(unsigned char * iph, unsigned int ihl)
150{
151 return ~do_csum(iph,ihl*4);
152}
153
154/*
155 * computes the checksum of a memory block at buff, length len,
156 * and adds in "sum" (32-bit)
157 *
158 * returns a 32-bit number suitable for feeding into itself
159 * or csum_tcpudp_magic
160 *
161 * this function must be called with even lengths, except
162 * for the last fragment, which may be odd
163 *
164 * it's best to have buff aligned on a 32-bit boundary
165 */
166unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
167{
168 unsigned long result = do_csum(buff, len);
169
170 /* add in old sum, and carry.. */
171 result += sum;
172 /* 32+c bits -> 32 bits */
173 result = (result & 0xffffffff) + (result >> 32);
174 return result;
175}
176
177EXPORT_SYMBOL(csum_partial);
178
179/*
180 * this routine is used for miscellaneous IP-like checksums, mainly
181 * in icmp.c
182 */
183unsigned short ip_compute_csum(unsigned char * buff, int len)
184{
185 return ~from64to16(do_csum(buff,len));
186}
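
For reference, the 64-to-16-bit fold that from64to16() performs with extract
instructions can equally be written with the plain shift/mask sequence its
comment alludes to. The standalone C sketch below is illustrative only (it is
not part of this commit, and the helper name fold_to_16 is invented); it shows
the same reduction that csum_partial() and ip_compute_csum() rely on:

/* Fold a 64-bit running ones'-complement sum down to 16 bits using
 * shifts and masks (the "original shift/bitmask version" that the
 * from64to16() comment mentions).  Illustrative sketch, not kernel code. */
#include <stdio.h>

static unsigned short fold_to_16(unsigned long sum)
{
        sum = (sum & 0xffffffffUL) + (sum >> 32);       /* 64 -> 33 bits */
        sum = (sum & 0xffffffffUL) + (sum >> 32);       /* 33 -> 32 bits */
        sum = (sum & 0xffffUL) + (sum >> 16);           /* 32 -> 17 bits */
        sum = (sum & 0xffffUL) + (sum >> 16);           /* 17 -> 16 bits */
        return (unsigned short) sum;
}

int main(void)
{
        printf("%#x\n", fold_to_16(0x1ffff0001UL));     /* arbitrary test sum */
        return 0;
}

The caller then complements the folded value to obtain the final checksum,
as ip_fast_csum() and ip_compute_csum() do above.
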
diff --git a/arch/alpha/lib/clear_page.S b/arch/alpha/lib/clear_page.S
new file mode 100644
index 000000000000..a221ae266e29
--- /dev/null
+++ b/arch/alpha/lib/clear_page.S
@@ -0,0 +1,39 @@
1/*
2 * arch/alpha/lib/clear_page.S
3 *
4 * Zero an entire page.
5 */
6
7 .text
8 .align 4
9 .global clear_page
10 .ent clear_page
11clear_page:
12 .prologue 0
13
14 lda $0,128
15 nop
16 unop
17 nop
18
191: stq $31,0($16)
20 stq $31,8($16)
21 stq $31,16($16)
22 stq $31,24($16)
23
24 stq $31,32($16)
25 stq $31,40($16)
26 stq $31,48($16)
27 subq $0,1,$0
28
29 stq $31,56($16)
30 addq $16,64,$16
31 unop
32 bne $0,1b
33
34 ret
35 nop
36 unop
37 nop
38
39 .end clear_page
diff --git a/arch/alpha/lib/clear_user.S b/arch/alpha/lib/clear_user.S
new file mode 100644
index 000000000000..8860316c1957
--- /dev/null
+++ b/arch/alpha/lib/clear_user.S
@@ -0,0 +1,113 @@
1/*
2 * arch/alpha/lib/clear_user.S
3 * Contributed by Richard Henderson <rth@tamu.edu>
4 *
5 * Zero user space, handling exceptions as we go.
6 *
7 * We have to make sure that $0 is always up-to-date and contains the
8 * right "bytes left to zero" value (and that it is updated only _after_
9 * a successful copy). There is also some rather minor exception setup
10 * stuff.
11 *
12 * NOTE! This is not directly C-callable, because the calling semantics
13 * are different:
14 *
15 * Inputs:
16 * length in $0
17 * destination address in $6
18 * exception pointer in $7
19 * return address in $28 (exceptions expect it there)
20 *
21 * Outputs:
22 * bytes left to copy in $0
23 *
24 * Clobbers:
25 * $1,$2,$3,$4,$5,$6
26 */
27
28/* Allow an exception for an insn; exit if we get one. */
29#define EX(x,y...) \
30 99: x,##y; \
31 .section __ex_table,"a"; \
32 .long 99b - .; \
33 lda $31, $exception-99b($31); \
34 .previous
35
36 .set noat
37 .set noreorder
38 .align 4
39
40 .globl __do_clear_user
41 .ent __do_clear_user
42 .frame $30, 0, $28
43 .prologue 0
44
45$loop:
46 and $1, 3, $4 # e0 :
47 beq $4, 1f # .. e1 :
48
490: EX( stq_u $31, 0($6) ) # e0 : zero one word
50 subq $0, 8, $0 # .. e1 :
51 subq $4, 1, $4 # e0 :
52 addq $6, 8, $6 # .. e1 :
53 bne $4, 0b # e1 :
54 unop # :
55
561: bic $1, 3, $1 # e0 :
57 beq $1, $tail # .. e1 :
58
592: EX( stq_u $31, 0($6) ) # e0 : zero four words
60 subq $0, 8, $0 # .. e1 :
61 EX( stq_u $31, 8($6) ) # e0 :
62 subq $0, 8, $0 # .. e1 :
63 EX( stq_u $31, 16($6) ) # e0 :
64 subq $0, 8, $0 # .. e1 :
65 EX( stq_u $31, 24($6) ) # e0 :
66 subq $0, 8, $0 # .. e1 :
67 subq $1, 4, $1 # e0 :
68 addq $6, 32, $6 # .. e1 :
69 bne $1, 2b # e1 :
70
71$tail:
72 bne $2, 1f # e1 : is there a tail to do?
73 ret $31, ($28), 1 # .. e1 :
74
751: EX( ldq_u $5, 0($6) ) # e0 :
76 clr $0 # .. e1 :
77 nop # e1 :
78 mskqh $5, $0, $5 # e0 :
79 EX( stq_u $5, 0($6) ) # e0 :
80 ret $31, ($28), 1 # .. e1 :
81
82__do_clear_user:
83 and $6, 7, $4 # e0 : find dest misalignment
84 beq $0, $zerolength # .. e1 :
85 addq $0, $4, $1 # e0 : bias counter
86 and $1, 7, $2 # e1 : number of bytes in tail
87 srl $1, 3, $1 # e0 :
88 beq $4, $loop # .. e1 :
89
90 EX( ldq_u $5, 0($6) ) # e0 : load dst word to mask back in
91 beq $1, $oneword # .. e1 : sub-word store?
92
93 mskql $5, $6, $5 # e0 : take care of misaligned head
94 addq $6, 8, $6 # .. e1 :
95 EX( stq_u $5, -8($6) ) # e0 :
96 addq $0, $4, $0 # .. e1 : bytes left -= 8 - misalignment
97 subq $1, 1, $1 # e0 :
98 subq $0, 8, $0 # .. e1 :
99 br $loop # e1 :
100 unop # :
101
102$oneword:
103 mskql $5, $6, $4 # e0 :
104 mskqh $5, $2, $5 # e0 :
105 or $5, $4, $5 # e1 :
106 EX( stq_u $5, 0($6) ) # e0 :
107 clr $0 # .. e1 :
108
109$zerolength:
110$exception:
111 ret $31, ($28), 1 # .. e1 :
112
113 .end __do_clear_user
diff --git a/arch/alpha/lib/copy_page.S b/arch/alpha/lib/copy_page.S
new file mode 100644
index 000000000000..9f3b97459cc6
--- /dev/null
+++ b/arch/alpha/lib/copy_page.S
@@ -0,0 +1,49 @@
1/*
2 * arch/alpha/lib/copy_page.S
3 *
4 * Copy an entire page.
5 */
6
7 .text
8 .align 4
9 .global copy_page
10 .ent copy_page
11copy_page:
12 .prologue 0
13
14 lda $18,128
15 nop
16 unop
17 nop
18
191: ldq $0,0($17)
20 ldq $1,8($17)
21 ldq $2,16($17)
22 ldq $3,24($17)
23
24 ldq $4,32($17)
25 ldq $5,40($17)
26 ldq $6,48($17)
27 ldq $7,56($17)
28
29 stq $0,0($16)
30 subq $18,1,$18
31 stq $1,8($16)
32 addq $17,64,$17
33
34 stq $2,16($16)
35 stq $3,24($16)
36 stq $4,32($16)
37 stq $5,40($16)
38
39 stq $6,48($16)
40 stq $7,56($16)
41 addq $16,64,$16
42 bne $18, 1b
43
44 ret
45 nop
46 unop
47 nop
48
49 .end copy_page
diff --git a/arch/alpha/lib/copy_user.S b/arch/alpha/lib/copy_user.S
new file mode 100644
index 000000000000..6f3fab9eb434
--- /dev/null
+++ b/arch/alpha/lib/copy_user.S
@@ -0,0 +1,145 @@
1/*
2 * arch/alpha/lib/copy_user.S
3 *
4 * Copy to/from user space, handling exceptions as we go.. This
5 * isn't exactly pretty.
6 *
7 * This is essentially the same as "memcpy()", but with a few twists.
8 * Notably, we have to make sure that $0 is always up-to-date and
9 * contains the right "bytes left to copy" value (and that it is updated
10 * only _after_ a successful copy). There is also some rather minor
11 * exception setup stuff..
12 *
13 * NOTE! This is not directly C-callable, because the calling semantics are
14 * different:
15 *
16 * Inputs:
17 * length in $0
18 * destination address in $6
19 * source address in $7
20 * return address in $28
21 *
22 * Outputs:
23 * bytes left to copy in $0
24 *
25 * Clobbers:
26 * $1,$2,$3,$4,$5,$6,$7
27 */
28
29/* Allow an exception for an insn; exit if we get one. */
30#define EXI(x,y...) \
31 99: x,##y; \
32 .section __ex_table,"a"; \
33 .long 99b - .; \
34 lda $31, $exitin-99b($31); \
35 .previous
36
37#define EXO(x,y...) \
38 99: x,##y; \
39 .section __ex_table,"a"; \
40 .long 99b - .; \
41 lda $31, $exitout-99b($31); \
42 .previous
43
44 .set noat
45 .align 4
46 .globl __copy_user
47 .ent __copy_user
48__copy_user:
49 .prologue 0
50 and $6,7,$3
51 beq $0,$35
52 beq $3,$36
53 subq $3,8,$3
54 .align 4
55$37:
56 EXI( ldq_u $1,0($7) )
57 EXO( ldq_u $2,0($6) )
58 extbl $1,$7,$1
59 mskbl $2,$6,$2
60 insbl $1,$6,$1
61 addq $3,1,$3
62 bis $1,$2,$1
63 EXO( stq_u $1,0($6) )
64 subq $0,1,$0
65 addq $6,1,$6
66 addq $7,1,$7
67 beq $0,$41
68 bne $3,$37
69$36:
70 and $7,7,$1
71 bic $0,7,$4
72 beq $1,$43
73 beq $4,$48
74 EXI( ldq_u $3,0($7) )
75 .align 4
76$50:
77 EXI( ldq_u $2,8($7) )
78 subq $4,8,$4
79 extql $3,$7,$3
80 extqh $2,$7,$1
81 bis $3,$1,$1
82 EXO( stq $1,0($6) )
83 addq $7,8,$7
84 subq $0,8,$0
85 addq $6,8,$6
86 bis $2,$2,$3
87 bne $4,$50
88$48:
89 beq $0,$41
90 .align 4
91$57:
92 EXI( ldq_u $1,0($7) )
93 EXO( ldq_u $2,0($6) )
94 extbl $1,$7,$1
95 mskbl $2,$6,$2
96 insbl $1,$6,$1
97 bis $1,$2,$1
98 EXO( stq_u $1,0($6) )
99 subq $0,1,$0
100 addq $6,1,$6
101 addq $7,1,$7
102 bne $0,$57
103 br $31,$41
104 .align 4
105$43:
106 beq $4,$65
107 .align 4
108$66:
109 EXI( ldq $1,0($7) )
110 subq $4,8,$4
111 EXO( stq $1,0($6) )
112 addq $7,8,$7
113 subq $0,8,$0
114 addq $6,8,$6
115 bne $4,$66
116$65:
117 beq $0,$41
118 EXI( ldq $2,0($7) )
119 EXO( ldq $1,0($6) )
120 mskql $2,$0,$2
121 mskqh $1,$0,$1
122 bis $2,$1,$2
123 EXO( stq $2,0($6) )
124 bis $31,$31,$0
125$41:
126$35:
127$exitout:
128 ret $31,($28),1
129
130$exitin:
131 /* A stupid byte-by-byte zeroing of the rest of the output
132 buffer. This cures security holes by never leaving
133 random kernel data around to be copied elsewhere. */
134
135 mov $0,$1
136$101:
137 EXO ( ldq_u $2,0($6) )
138 subq $1,1,$1
139 mskbl $2,$6,$2
140 EXO ( stq_u $2,0($6) )
141 addq $6,1,$6
142 bgt $1,$101
143 ret $31,($28),1
144
145 .end __copy_user
diff --git a/arch/alpha/lib/csum_ipv6_magic.S b/arch/alpha/lib/csum_ipv6_magic.S
new file mode 100644
index 000000000000..e09748dbf2ed
--- /dev/null
+++ b/arch/alpha/lib/csum_ipv6_magic.S
@@ -0,0 +1,92 @@
1/*
2 * arch/alpha/lib/csum_ipv6_magic.S
3 * Contributed by Richard Henderson <rth@tamu.edu>
4 *
5 * unsigned short csum_ipv6_magic(struct in6_addr *saddr,
6 * struct in6_addr *daddr,
7 * __u32 len,
8 * unsigned short proto,
9 * unsigned int csum);
10 */
11
12 .globl csum_ipv6_magic
13 .align 4
14 .ent csum_ipv6_magic
15 .frame $30,0,$26,0
16csum_ipv6_magic:
17 .prologue 0
18
19 ldq $0,0($16) # e0 : load src & dst addr words
20 zapnot $20,15,$20 # .. e1 : zero extend incoming csum
21 extqh $18,1,$4 # e0 : byte swap len & proto while we wait
22 ldq $1,8($16) # .. e1 :
23
24 extbl $18,1,$5 # e0 :
25 ldq $2,0($17) # .. e1 :
26 extbl $18,2,$6 # e0 :
27 ldq $3,8($17) # .. e1 :
28
29 extbl $18,3,$18 # e0 :
30 sra $4,32,$4 # e0 :
31 sll $5,16,$5 # e0 :
32 addq $20,$0,$20 # .. e1 : begin summing the words
33
34 sll $6,8,$6 # e0 :
35 cmpult $20,$0,$0 # .. e1 :
36 extwh $19,7,$7 # e0 :
37 or $4,$18,$18 # .. e1 :
38
39 extbl $19,1,$19 # e0 :
40 or $5,$6,$5 # .. e1 :
41 or $18,$5,$18 # e0 : len complete
42 or $19,$7,$19 # .. e1 :
43
44 sll $19,48,$19 # e0 :
45 addq $20,$1,$20 # .. e1 :
46 sra $19,32,$19 # e0 : proto complete
47 cmpult $20,$1,$1 # .. e1 :
48
49 nop # e0 :
50 addq $20,$2,$20 # .. e1 :
51 cmpult $20,$2,$2 # e0 :
52 addq $20,$3,$20 # .. e1 :
53
54 cmpult $20,$3,$3 # e0 :
55 addq $20,$18,$20 # .. e1 :
56 cmpult $20,$18,$18 # e0 :
57 addq $20,$19,$20 # .. e1 :
58
59 cmpult $20,$19,$19 # e0 :
60 addq $0,$1,$0 # .. e1 : merge the carries back into the csum
61 addq $2,$3,$2 # e0 :
62 addq $18,$19,$18 # .. e1 :
63
64 addq $0,$2,$0 # e0 :
65 addq $20,$18,$20 # .. e1 :
66 addq $0,$20,$0 # e0 :
67 unop # :
68
69 extwl $0,2,$2 # e0 : begin folding the 64-bit value
70 zapnot $0,3,$3 # .. e1 :
71 extwl $0,4,$1 # e0 :
72 addq $2,$3,$3 # .. e1 :
73
74 extwl $0,6,$0 # e0 :
75 addq $3,$1,$3 # .. e1 :
76 addq $0,$3,$0 # e0 :
77 unop # :
78
79 extwl $0,2,$1 # e0 : fold 18-bit value
80 zapnot $0,3,$0 # .. e1 :
81 addq $0,$1,$0 # e0 :
82 unop # :
83
84 extwl $0,2,$1 # e0 : fold 17-bit value
85 zapnot $0,3,$0 # .. e1 :
86 addq $0,$1,$0 # e0 :
87 not $0,$0 # e1 : and complement.
88
89 zapnot $0,3,$0 # e0 :
90 ret # .. e1 :
91
92 .end csum_ipv6_magic
diff --git a/arch/alpha/lib/csum_partial_copy.c b/arch/alpha/lib/csum_partial_copy.c
new file mode 100644
index 000000000000..a37948f3037a
--- /dev/null
+++ b/arch/alpha/lib/csum_partial_copy.c
@@ -0,0 +1,391 @@
1/*
2 * csum_partial_copy - do IP checksumming and copy
3 *
4 * (C) Copyright 1996 Linus Torvalds
 5 * accelerated versions (and 21264 assembly versions) contributed by
6 * Rick Gorton <rick.gorton@alpha-processor.com>
7 *
8 * Don't look at this too closely - you'll go mad. The things
9 * we do for performance..
10 */
11
12#include <linux/types.h>
13#include <linux/string.h>
14#include <asm/uaccess.h>
15
16
17#define ldq_u(x,y) \
18__asm__ __volatile__("ldq_u %0,%1":"=r" (x):"m" (*(const unsigned long *)(y)))
19
20#define stq_u(x,y) \
21__asm__ __volatile__("stq_u %1,%0":"=m" (*(unsigned long *)(y)):"r" (x))
22
23#define extql(x,y,z) \
24__asm__ __volatile__("extql %1,%2,%0":"=r" (z):"r" (x),"r" (y))
25
26#define extqh(x,y,z) \
27__asm__ __volatile__("extqh %1,%2,%0":"=r" (z):"r" (x),"r" (y))
28
29#define mskql(x,y,z) \
30__asm__ __volatile__("mskql %1,%2,%0":"=r" (z):"r" (x),"r" (y))
31
32#define mskqh(x,y,z) \
33__asm__ __volatile__("mskqh %1,%2,%0":"=r" (z):"r" (x),"r" (y))
34
35#define insql(x,y,z) \
36__asm__ __volatile__("insql %1,%2,%0":"=r" (z):"r" (x),"r" (y))
37
38#define insqh(x,y,z) \
39__asm__ __volatile__("insqh %1,%2,%0":"=r" (z):"r" (x),"r" (y))
40
41
42#define __get_user_u(x,ptr) \
43({ \
44 long __guu_err; \
45 __asm__ __volatile__( \
46 "1: ldq_u %0,%2\n" \
47 "2:\n" \
48 ".section __ex_table,\"a\"\n" \
49 " .long 1b - .\n" \
50 " lda %0,2b-1b(%1)\n" \
51 ".previous" \
52 : "=r"(x), "=r"(__guu_err) \
53 : "m"(__m(ptr)), "1"(0)); \
54 __guu_err; \
55})
56
57#define __put_user_u(x,ptr) \
58({ \
59 long __puu_err; \
60 __asm__ __volatile__( \
61 "1: stq_u %2,%1\n" \
62 "2:\n" \
63 ".section __ex_table,\"a\"\n" \
64 " .long 1b - ." \
65 " lda $31,2b-1b(%0)\n" \
66 ".previous" \
67 : "=r"(__puu_err) \
68 : "m"(__m(addr)), "rJ"(x), "0"(0)); \
69 __puu_err; \
70})
71
72
73static inline unsigned short from64to16(unsigned long x)
74{
75 /* Using extract instructions is a bit more efficient
76 than the original shift/bitmask version. */
77
78 union {
79 unsigned long ul;
80 unsigned int ui[2];
81 unsigned short us[4];
82 } in_v, tmp_v, out_v;
83
84 in_v.ul = x;
85 tmp_v.ul = (unsigned long) in_v.ui[0] + (unsigned long) in_v.ui[1];
86
87 /* Since the bits of tmp_v.sh[3] are going to always be zero,
88 we don't have to bother to add that in. */
89 out_v.ul = (unsigned long) tmp_v.us[0] + (unsigned long) tmp_v.us[1]
90 + (unsigned long) tmp_v.us[2];
91
92 /* Similarly, out_v.us[2] is always zero for the final add. */
93 return out_v.us[0] + out_v.us[1];
94}
95
96
97
98/*
99 * Ok. This isn't fun, but this is the EASY case.
100 */
101static inline unsigned long
102csum_partial_cfu_aligned(const unsigned long __user *src, unsigned long *dst,
103 long len, unsigned long checksum,
104 int *errp)
105{
106 unsigned long carry = 0;
107 int err = 0;
108
109 while (len >= 0) {
110 unsigned long word;
111 err |= __get_user(word, src);
112 checksum += carry;
113 src++;
114 checksum += word;
115 len -= 8;
116 carry = checksum < word;
117 *dst = word;
118 dst++;
119 }
120 len += 8;
121 checksum += carry;
122 if (len) {
123 unsigned long word, tmp;
124 err |= __get_user(word, src);
125 tmp = *dst;
126 mskql(word, len, word);
127 checksum += word;
128 mskqh(tmp, len, tmp);
129 carry = checksum < word;
130 *dst = word | tmp;
131 checksum += carry;
132 }
133 if (err) *errp = err;
134 return checksum;
135}
136
137/*
138 * This is even less fun, but this is still reasonably
139 * easy.
140 */
141static inline unsigned long
142csum_partial_cfu_dest_aligned(const unsigned long __user *src,
143 unsigned long *dst,
144 unsigned long soff,
145 long len, unsigned long checksum,
146 int *errp)
147{
148 unsigned long first;
149 unsigned long word, carry;
150 unsigned long lastsrc = 7+len+(unsigned long)src;
151 int err = 0;
152
153 err |= __get_user_u(first,src);
154 carry = 0;
155 while (len >= 0) {
156 unsigned long second;
157
158 err |= __get_user_u(second, src+1);
159 extql(first, soff, word);
160 len -= 8;
161 src++;
162 extqh(second, soff, first);
163 checksum += carry;
164 word |= first;
165 first = second;
166 checksum += word;
167 *dst = word;
168 dst++;
169 carry = checksum < word;
170 }
171 len += 8;
172 checksum += carry;
173 if (len) {
174 unsigned long tmp;
175 unsigned long second;
176 err |= __get_user_u(second, lastsrc);
177 tmp = *dst;
178 extql(first, soff, word);
179 extqh(second, soff, first);
180 word |= first;
181 mskql(word, len, word);
182 checksum += word;
183 mskqh(tmp, len, tmp);
184 carry = checksum < word;
185 *dst = word | tmp;
186 checksum += carry;
187 }
188 if (err) *errp = err;
189 return checksum;
190}
191
192/*
193 * This is slightly less fun than the above..
194 */
195static inline unsigned long
196csum_partial_cfu_src_aligned(const unsigned long __user *src,
197 unsigned long *dst,
198 unsigned long doff,
199 long len, unsigned long checksum,
200 unsigned long partial_dest,
201 int *errp)
202{
203 unsigned long carry = 0;
204 unsigned long word;
205 unsigned long second_dest;
206 int err = 0;
207
208 mskql(partial_dest, doff, partial_dest);
209 while (len >= 0) {
210 err |= __get_user(word, src);
211 len -= 8;
212 insql(word, doff, second_dest);
213 checksum += carry;
214 stq_u(partial_dest | second_dest, dst);
215 src++;
216 checksum += word;
217 insqh(word, doff, partial_dest);
218 carry = checksum < word;
219 dst++;
220 }
221 len += 8;
222 if (len) {
223 checksum += carry;
224 err |= __get_user(word, src);
225 mskql(word, len, word);
226 len -= 8;
227 checksum += word;
228 insql(word, doff, second_dest);
229 len += doff;
230 carry = checksum < word;
231 partial_dest |= second_dest;
232 if (len >= 0) {
233 stq_u(partial_dest, dst);
234 if (!len) goto out;
235 dst++;
236 insqh(word, doff, partial_dest);
237 }
238 doff = len;
239 }
240 ldq_u(second_dest, dst);
241 mskqh(second_dest, doff, second_dest);
242 stq_u(partial_dest | second_dest, dst);
243out:
244 checksum += carry;
245 if (err) *errp = err;
246 return checksum;
247}
248
249/*
250 * This is so totally un-fun that it's frightening. Don't
251 * look at this too closely, you'll go blind.
252 */
253static inline unsigned long
254csum_partial_cfu_unaligned(const unsigned long __user * src,
255 unsigned long * dst,
256 unsigned long soff, unsigned long doff,
257 long len, unsigned long checksum,
258 unsigned long partial_dest,
259 int *errp)
260{
261 unsigned long carry = 0;
262 unsigned long first;
263 unsigned long lastsrc;
264 int err = 0;
265
266 err |= __get_user_u(first, src);
267 lastsrc = 7+len+(unsigned long)src;
268 mskql(partial_dest, doff, partial_dest);
269 while (len >= 0) {
270 unsigned long second, word;
271 unsigned long second_dest;
272
273 err |= __get_user_u(second, src+1);
274 extql(first, soff, word);
275 checksum += carry;
276 len -= 8;
277 extqh(second, soff, first);
278 src++;
279 word |= first;
280 first = second;
281 insql(word, doff, second_dest);
282 checksum += word;
283 stq_u(partial_dest | second_dest, dst);
284 carry = checksum < word;
285 insqh(word, doff, partial_dest);
286 dst++;
287 }
288 len += doff;
289 checksum += carry;
290 if (len >= 0) {
291 unsigned long second, word;
292 unsigned long second_dest;
293
294 err |= __get_user_u(second, lastsrc);
295 extql(first, soff, word);
296 extqh(second, soff, first);
297 word |= first;
298 first = second;
299 mskql(word, len-doff, word);
300 checksum += word;
301 insql(word, doff, second_dest);
302 carry = checksum < word;
303 stq_u(partial_dest | second_dest, dst);
304 if (len) {
305 ldq_u(second_dest, dst+1);
306 insqh(word, doff, partial_dest);
307 mskqh(second_dest, len, second_dest);
308 stq_u(partial_dest | second_dest, dst+1);
309 }
310 checksum += carry;
311 } else {
312 unsigned long second, word;
313 unsigned long second_dest;
314
315 err |= __get_user_u(second, lastsrc);
316 extql(first, soff, word);
317 extqh(second, soff, first);
318 word |= first;
319 ldq_u(second_dest, dst);
320 mskql(word, len-doff, word);
321 checksum += word;
322 mskqh(second_dest, len, second_dest);
323 carry = checksum < word;
324 insql(word, doff, word);
325 stq_u(partial_dest | word | second_dest, dst);
326 checksum += carry;
327 }
328 if (err) *errp = err;
329 return checksum;
330}
331
332static unsigned int
333do_csum_partial_copy_from_user(const char __user *src, char *dst, int len,
334 unsigned int sum, int *errp)
335{
336 unsigned long checksum = (unsigned) sum;
337 unsigned long soff = 7 & (unsigned long) src;
338 unsigned long doff = 7 & (unsigned long) dst;
339
340 if (len) {
341 if (!doff) {
342 if (!soff)
343 checksum = csum_partial_cfu_aligned(
344 (const unsigned long __user *) src,
345 (unsigned long *) dst,
346 len-8, checksum, errp);
347 else
348 checksum = csum_partial_cfu_dest_aligned(
349 (const unsigned long __user *) src,
350 (unsigned long *) dst,
351 soff, len-8, checksum, errp);
352 } else {
353 unsigned long partial_dest;
354 ldq_u(partial_dest, dst);
355 if (!soff)
356 checksum = csum_partial_cfu_src_aligned(
357 (const unsigned long __user *) src,
358 (unsigned long *) dst,
359 doff, len-8, checksum,
360 partial_dest, errp);
361 else
362 checksum = csum_partial_cfu_unaligned(
363 (const unsigned long __user *) src,
364 (unsigned long *) dst,
365 soff, doff, len-8, checksum,
366 partial_dest, errp);
367 }
368 checksum = from64to16 (checksum);
369 }
370 return checksum;
371}
372
373unsigned int
374csum_partial_copy_from_user(const char __user *src, char *dst, int len,
375 unsigned int sum, int *errp)
376{
377 if (!access_ok(VERIFY_READ, src, len)) {
378 *errp = -EFAULT;
379 memset(dst, 0, len);
380 return sum;
381 }
382
383 return do_csum_partial_copy_from_user(src, dst, len, sum, errp);
384}
385
386unsigned int
387csum_partial_copy_nocheck(const char __user *src, char *dst, int len,
388 unsigned int sum)
389{
390 return do_csum_partial_copy_from_user(src, dst, len, sum, NULL);
391}
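
Every csum_partial_cfu_* helper above relies on the same end-around-carry
idiom: add the incoming word, detect 64-bit wrap-around by comparing the sum
against that word, and fold the resulting carry into the next addition. A
minimal C sketch of that accumulation pattern over a plain kernel buffer
(the function name is invented, and the user-space access and masking details
are omitted):

/* Accumulate 'quads' 64-bit words with end-around carry, the same
 * pattern the copy-and-checksum helpers above use.  Sketch only. */
static unsigned long csum_add_quads(const unsigned long *buf, long quads)
{
        unsigned long checksum = 0, carry = 0;

        while (quads-- > 0) {
                unsigned long word = *buf++;

                checksum += carry;              /* carry from the previous add */
                checksum += word;
                carry = checksum < word;        /* unsigned wrap-around detected */
        }
        return checksum + carry;
}

The 64-bit total is then reduced with from64to16(), just as
do_csum_partial_copy_from_user() does once the copy loop has finished.
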
diff --git a/arch/alpha/lib/dbg_current.S b/arch/alpha/lib/dbg_current.S
new file mode 100644
index 000000000000..e6d071015f9b
--- /dev/null
+++ b/arch/alpha/lib/dbg_current.S
@@ -0,0 +1,29 @@
1/*
2 * arch/alpha/lib/dbg_current.S
3 * Contributed by Richard Henderson (rth@cygnus.com)
4 *
5 * Trap if we find current not correct.
6 */
7
8#include <asm/pal.h>
9
10 .text
11 .set noat
12
13 .globl _mcount
14 .ent _mcount
15_mcount:
16 .frame $30, 0, $28, 0
17 .prologue 0
18
19 lda $0, -0x4000($30)
20 cmpult $8, $30, $1
21 cmpule $0, $30, $2
22 and $1, $2, $3
23 bne $3, 1f
24
25 call_pal PAL_bugchk
26
271: ret $31, ($28), 1
28
29 .end _mcount
diff --git a/arch/alpha/lib/dbg_stackcheck.S b/arch/alpha/lib/dbg_stackcheck.S
new file mode 100644
index 000000000000..cc5ce3a5fcad
--- /dev/null
+++ b/arch/alpha/lib/dbg_stackcheck.S
@@ -0,0 +1,27 @@
1/*
2 * arch/alpha/lib/stackcheck.S
3 * Contributed by Richard Henderson (rth@tamu.edu)
4 *
5 * Verify that we have not overflowed the stack. Oops if we have.
6 */
7
8#include <asm/asm_offsets.h>
9
10 .text
11 .set noat
12
13 .align 3
14 .globl _mcount
15 .ent _mcount
16_mcount:
17 .frame $30, 0, $28, 0
18 .prologue 0
19
20 lda $0, TASK_SIZE($8)
21 cmpult $30, $0, $0
22 bne $0, 1f
23 ret ($28)
241: stq $31, -8($31) # oops me, damn it.
25 br 1b
26
27 .end _mcount
diff --git a/arch/alpha/lib/dbg_stackkill.S b/arch/alpha/lib/dbg_stackkill.S
new file mode 100644
index 000000000000..e09f2ae1e09e
--- /dev/null
+++ b/arch/alpha/lib/dbg_stackkill.S
@@ -0,0 +1,35 @@
1/*
2 * arch/alpha/lib/killstack.S
3 * Contributed by Richard Henderson (rth@cygnus.com)
4 *
5 * Clobber the balance of the kernel stack, hoping to catch
6 * uninitialized local variables in the act.
7 */
8
9#include <asm/asm_offsets.h>
10
11 .text
12 .set noat
13
14 .align 5
15 .globl _mcount
16 .ent _mcount
17_mcount:
18 .frame $30, 0, $28, 0
19 .prologue 0
20
21 ldi $0, 0xdeadbeef
22 lda $2, -STACK_SIZE
23 sll $0, 32, $1
24 and $30, $2, $2
25 or $0, $1, $0
26 lda $2, TASK_SIZE($2)
27 cmpult $2, $30, $1
28 beq $1, 2f
291: stq $0, 0($2)
30 addq $2, 8, $2
31 cmpult $2, $30, $1
32 bne $1, 1b
332: ret ($28)
34
35 .end _mcount
diff --git a/arch/alpha/lib/dec_and_lock.c b/arch/alpha/lib/dec_and_lock.c
new file mode 100644
index 000000000000..6ae2500a9d9e
--- /dev/null
+++ b/arch/alpha/lib/dec_and_lock.c
@@ -0,0 +1,42 @@
1/*
2 * arch/alpha/lib/dec_and_lock.c
3 *
4 * ll/sc version of atomic_dec_and_lock()
5 *
6 */
7
8#include <linux/spinlock.h>
9#include <asm/atomic.h>
10
11 asm (".text \n\
12 .global _atomic_dec_and_lock \n\
13 .ent _atomic_dec_and_lock \n\
14 .align 4 \n\
15_atomic_dec_and_lock: \n\
16 .prologue 0 \n\
171: ldl_l $1, 0($16) \n\
18 subl $1, 1, $1 \n\
19 beq $1, 2f \n\
20 stl_c $1, 0($16) \n\
21 beq $1, 4f \n\
22 mb \n\
23 clr $0 \n\
24 ret \n\
252: br $29, 3f \n\
263: ldgp $29, 0($29) \n\
27 br $atomic_dec_and_lock_1..ng \n\
28 .subsection 2 \n\
294: br 1b \n\
30 .previous \n\
31 .end _atomic_dec_and_lock");
32
33static int __attribute_used__
34atomic_dec_and_lock_1(atomic_t *atomic, spinlock_t *lock)
35{
36 /* Slow path */
37 spin_lock(lock);
38 if (atomic_dec_and_test(atomic))
39 return 1;
40 spin_unlock(lock);
41 return 0;
42}
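
The inline assembly above is the ll/sc fast path: decrement the counter
unless it is about to reach zero, and only in that case fall into the locked
slow path (atomic_dec_and_lock_1). Purely as an illustration of that control
flow (this is not the kernel implementation; the names and the toy
test-and-set lock standing in for spinlock_t are invented), the logic looks
roughly like this in C11:

#include <stdatomic.h>
#include <stdbool.h>

struct toy_lock { atomic_flag held; };          /* stand-in for spinlock_t */

static void toy_lock_acquire(struct toy_lock *l)
{
        while (atomic_flag_test_and_set_explicit(&l->held, memory_order_acquire))
                ;                               /* spin */
}

static void toy_lock_release(struct toy_lock *l)
{
        atomic_flag_clear_explicit(&l->held, memory_order_release);
}

static bool toy_dec_and_lock(atomic_int *count, struct toy_lock *lock)
{
        int old = atomic_load(count);

        /* Fast path, mirroring the ldl_l/stl_c loop: retry until the
         * decrement commits, but bail out if the count would hit zero. */
        while (old != 1) {
                if (atomic_compare_exchange_weak(count, &old, old - 1))
                        return false;           /* decremented, lock not taken */
        }

        /* Slow path: take the lock, then perform the final decrement. */
        toy_lock_acquire(lock);
        if (atomic_fetch_sub(count, 1) == 1)
                return true;                    /* reached zero; caller holds the lock */
        toy_lock_release(lock);
        return false;
}
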
diff --git a/arch/alpha/lib/divide.S b/arch/alpha/lib/divide.S
new file mode 100644
index 000000000000..2d1a0484a99e
--- /dev/null
+++ b/arch/alpha/lib/divide.S
@@ -0,0 +1,195 @@
1/*
2 * arch/alpha/lib/divide.S
3 *
4 * (C) 1995 Linus Torvalds
5 *
6 * Alpha division..
7 */
8
9/*
10 * The alpha chip doesn't provide hardware division, so we have to do it
11 * by hand. The compiler expects the functions
12 *
13 * __divqu: 64-bit unsigned long divide
14 * __remqu: 64-bit unsigned long remainder
15 * __divqs/__remqs: signed 64-bit
16 * __divlu/__remlu: unsigned 32-bit
17 * __divls/__remls: signed 32-bit
18 *
19 * These are not normal C functions: instead of the normal
20 * calling sequence, these expect their arguments in registers
21 * $24 and $25, and return the result in $27. Register $28 may
22 * be clobbered (assembly temporary), anything else must be saved.
23 *
24 * In short: painful.
25 *
26 * This is a rather simple bit-at-a-time algorithm: it's very good
27 * at dividing random 64-bit numbers, but the more usual case where
28 * the divisor is small is handled better by the DEC algorithm
29 * using lookup tables. This uses much less memory, though, and is
30 * nicer on the cache.. Besides, I don't know the copyright status
31 * of the DEC code.
32 */
33
34/*
35 * My temporaries:
36 * $0 - current bit
37 * $1 - shifted divisor
38 * $2 - modulus/quotient
39 *
40 * $23 - return address
41 * $24 - dividend
42 * $25 - divisor
43 *
44 * $27 - quotient/modulus
45 * $28 - compare status
46 */
47
48#define halt .long 0
49
50/*
51 * Select function type and registers
52 */
53#define mask $0
54#define divisor $1
55#define compare $28
56#define tmp1 $3
57#define tmp2 $4
58
59#ifdef DIV
60#define DIV_ONLY(x,y...) x,##y
61#define MOD_ONLY(x,y...)
62#define func(x) __div##x
63#define modulus $2
64#define quotient $27
65#define GETSIGN(x) xor $24,$25,x
66#define STACK 48
67#else
68#define DIV_ONLY(x,y...)
69#define MOD_ONLY(x,y...) x,##y
70#define func(x) __rem##x
71#define modulus $27
72#define quotient $2
73#define GETSIGN(x) bis $24,$24,x
74#define STACK 32
75#endif
76
77/*
78 * For 32-bit operations, we need to extend to 64-bit
79 */
80#ifdef INTSIZE
81#define ufunction func(lu)
82#define sfunction func(l)
83#define LONGIFY(x) zapnot x,15,x
84#define SLONGIFY(x) addl x,0,x
85#else
86#define ufunction func(qu)
87#define sfunction func(q)
88#define LONGIFY(x)
89#define SLONGIFY(x)
90#endif
91
92.set noat
93.align 3
94.globl ufunction
95.ent ufunction
96ufunction:
97 subq $30,STACK,$30
98 .frame $30,STACK,$23
99 .prologue 0
100
1017: stq $1, 0($30)
102 bis $25,$25,divisor
103 stq $2, 8($30)
104 bis $24,$24,modulus
105 stq $0,16($30)
106 bis $31,$31,quotient
107 LONGIFY(divisor)
108 stq tmp1,24($30)
109 LONGIFY(modulus)
110 bis $31,1,mask
111 DIV_ONLY(stq tmp2,32($30))
112 beq divisor, 9f /* div by zero */
113
114#ifdef INTSIZE
115 /*
116 * shift divisor left, using 3-bit shifts for
117 * 32-bit divides as we can't overflow. Three-bit
118 * shifts will result in looping three times less
119 * here, but can result in two loops more later.
120 * Thus using a large shift isn't worth it (and
121 * s8add pairs better than a sll..)
122 */
1231: cmpult divisor,modulus,compare
124 s8addq divisor,$31,divisor
125 s8addq mask,$31,mask
126 bne compare,1b
127#else
1281: cmpult divisor,modulus,compare
129 blt divisor, 2f
130 addq divisor,divisor,divisor
131 addq mask,mask,mask
132 bne compare,1b
133 unop
134#endif
135
136 /* ok, start to go right again.. */
1372: DIV_ONLY(addq quotient,mask,tmp2)
138 srl mask,1,mask
139 cmpule divisor,modulus,compare
140 subq modulus,divisor,tmp1
141 DIV_ONLY(cmovne compare,tmp2,quotient)
142 srl divisor,1,divisor
143 cmovne compare,tmp1,modulus
144 bne mask,2b
145
1469: ldq $1, 0($30)
147 ldq $2, 8($30)
148 ldq $0,16($30)
149 ldq tmp1,24($30)
150 DIV_ONLY(ldq tmp2,32($30))
151 addq $30,STACK,$30
152 ret $31,($23),1
153 .end ufunction
154
155/*
156 * Uhh.. Ugly signed division. I'd rather not have it at all, but
157 * it's needed in some circumstances. There are different ways to
158 * handle this, really. This does:
159 * -a / b = a / -b = -(a / b)
160 * -a % b = -(a % b)
161 * a % -b = a % b
162 * which is probably not the best solution, but at least should
163 * have the property that (x/y)*y + (x%y) = x.
164 */
165.align 3
166.globl sfunction
167.ent sfunction
168sfunction:
169 subq $30,STACK,$30
170 .frame $30,STACK,$23
171 .prologue 0
172 bis $24,$25,$28
173 SLONGIFY($28)
174 bge $28,7b
175 stq $24,0($30)
176 subq $31,$24,$28
177 stq $25,8($30)
178 cmovlt $24,$28,$24 /* abs($24) */
179 stq $23,16($30)
180 subq $31,$25,$28
181 stq tmp1,24($30)
182 cmovlt $25,$28,$25 /* abs($25) */
183 unop
184 bsr $23,ufunction
185 ldq $24,0($30)
186 ldq $25,8($30)
187 GETSIGN($28)
188 subq $31,$27,tmp1
189 SLONGIFY($28)
190 ldq $23,16($30)
191 cmovlt $28,tmp1,$27
192 ldq tmp1,24($30)
193 addq $30,STACK,$30
194 ret $31,($23),1
195 .end sfunction
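
The header comment calls this "a rather simple bit-at-a-time algorithm". For
readers who would rather not trace the register shuffling, the following is an
illustrative C rendering of the same shift-and-subtract scheme (the helper
name is invented, and it assumes the 64-bit unsigned long that Alpha provides;
the real entry points use the $24/$25/$27 register convention described in the
comment above):

/* Shift-and-subtract division: shift the divisor left until it covers
 * the dividend, then walk back to the right, subtracting wherever it
 * still fits and setting the matching quotient bit.  Sketch only. */
static unsigned long bitwise_divmod(unsigned long dividend,
                                    unsigned long divisor,
                                    unsigned long *remainder)
{
        unsigned long quotient = 0;
        unsigned long mask = 1;

        if (divisor == 0) {             /* mirrors the "div by zero" bail-out */
                *remainder = dividend;
                return 0;
        }

        while (divisor < dividend && !(divisor >> 63)) {
                divisor <<= 1;
                mask <<= 1;
        }

        while (mask) {
                if (divisor <= dividend) {
                        dividend -= divisor;
                        quotient |= mask;
                }
                divisor >>= 1;
                mask >>= 1;
        }

        *remainder = dividend;
        return quotient;
}

The signed entry point then reduces to the unsigned case by taking absolute
values and fixing up the sign of the result afterwards, which is exactly what
sfunction does around its bsr to ufunction.
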
diff --git a/arch/alpha/lib/ev6-clear_page.S b/arch/alpha/lib/ev6-clear_page.S
new file mode 100644
index 000000000000..adf4f7be0e2b
--- /dev/null
+++ b/arch/alpha/lib/ev6-clear_page.S
@@ -0,0 +1,54 @@
1/*
2 * arch/alpha/lib/ev6-clear_page.S
3 *
4 * Zero an entire page.
5 */
6
7 .text
8 .align 4
9 .global clear_page
10 .ent clear_page
11clear_page:
12 .prologue 0
13
14 lda $0,128
15 lda $1,125
16 addq $16,64,$2
17 addq $16,128,$3
18
19 addq $16,192,$17
20 wh64 ($16)
21 wh64 ($2)
22 wh64 ($3)
23
241: wh64 ($17)
25 stq $31,0($16)
26 subq $0,1,$0
27 subq $1,1,$1
28
29 stq $31,8($16)
30 stq $31,16($16)
31 addq $17,64,$2
32 nop
33
34 stq $31,24($16)
35 stq $31,32($16)
36 cmovgt $1,$2,$17
37 nop
38
39 stq $31,40($16)
40 stq $31,48($16)
41 nop
42 nop
43
44 stq $31,56($16)
45 addq $16,64,$16
46 nop
47 bne $0,1b
48
49 ret
50 nop
51 nop
52 nop
53
54 .end clear_page
diff --git a/arch/alpha/lib/ev6-clear_user.S b/arch/alpha/lib/ev6-clear_user.S
new file mode 100644
index 000000000000..4f42a16b7f53
--- /dev/null
+++ b/arch/alpha/lib/ev6-clear_user.S
@@ -0,0 +1,225 @@
1/*
2 * arch/alpha/lib/ev6-clear_user.S
3 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
4 *
5 * Zero user space, handling exceptions as we go.
6 *
7 * We have to make sure that $0 is always up-to-date and contains the
8 * right "bytes left to zero" value (and that it is updated only _after_
9 * a successful copy). There is also some rather minor exception setup
10 * stuff.
11 *
12 * NOTE! This is not directly C-callable, because the calling semantics
13 * are different:
14 *
15 * Inputs:
16 * length in $0
17 * destination address in $6
18 * exception pointer in $7
19 * return address in $28 (exceptions expect it there)
20 *
21 * Outputs:
22 * bytes left to copy in $0
23 *
24 * Clobbers:
25 * $1,$2,$3,$4,$5,$6
26 *
27 * Much of the information about 21264 scheduling/coding comes from:
28 * Compiler Writer's Guide for the Alpha 21264
29 * abbreviated as 'CWG' in other comments here
30 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
31 * Scheduling notation:
32 * E - either cluster
33 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
34 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
35 * Try not to change the actual algorithm if possible for consistency.
36 * Determining actual stalls (other than slotting) doesn't appear to be easy to do.
37 * From perusing the source code context where this routine is called, it is
38 * a fair assumption that significant fractions of entire pages are zeroed, so
39 * it's going to be worth the effort to hand-unroll a big loop, and use wh64.
40 * ASSUMPTION:
41 * The believed purpose of only updating $0 after a store is that a signal
42 * may come along during the execution of this chunk of code, and we don't
43 * want to leave a hole (and we also want to avoid repeating lots of work)
44 */
45
46/* Allow an exception for an insn; exit if we get one. */
47#define EX(x,y...) \
48 99: x,##y; \
49 .section __ex_table,"a"; \
50 .long 99b - .; \
51 lda $31, $exception-99b($31); \
52 .previous
53
54 .set noat
55 .set noreorder
56 .align 4
57
58 .globl __do_clear_user
59 .ent __do_clear_user
60 .frame $30, 0, $28
61 .prologue 0
62
63 # Pipeline info : Slotting & Comments
64__do_clear_user:
65 and $6, 7, $4 # .. E .. .. : find dest head misalignment
66 beq $0, $zerolength # U .. .. .. : U L U L
67
68 addq $0, $4, $1 # .. .. .. E : bias counter
69 and $1, 7, $2 # .. .. E .. : number of misaligned bytes in tail
70# Note - we never actually use $2, so this is a moot computation
71# and we can rewrite this later...
72 srl $1, 3, $1 # .. E .. .. : number of quadwords to clear
73 beq $4, $headalign # U .. .. .. : U L U L
74
75/*
76 * Head is not aligned. Write (8 - $4) bytes to head of destination
77 * This means $6 is known to be misaligned
78 */
79 EX( ldq_u $5, 0($6) ) # .. .. .. L : load dst word to mask back in
80 beq $1, $onebyte # .. .. U .. : sub-word store?
81 mskql $5, $6, $5 # .. U .. .. : take care of misaligned head
82 addq $6, 8, $6 # E .. .. .. : L U U L
83
84 EX( stq_u $5, -8($6) ) # .. .. .. L :
85 subq $1, 1, $1 # .. .. E .. :
86 addq $0, $4, $0 # .. E .. .. : bytes left -= 8 - misalignment
87 subq $0, 8, $0 # E .. .. .. : U L U L
88
89 .align 4
90/*
91 * (The .align directive ought to be a moot point)
92 * values upon initial entry to the loop
93 * $1 is number of quadwords to clear (zero is a valid value)
94 * $2 is number of trailing bytes (0..7) ($2 never used...)
95 * $6 is known to be aligned 0mod8
96 */
97$headalign:
98 subq $1, 16, $4 # .. .. .. E : If < 16, we can not use the huge loop
99 and $6, 0x3f, $2 # .. .. E .. : Forward work for huge loop
100 subq $2, 0x40, $3 # .. E .. .. : bias counter (huge loop)
101 blt $4, $trailquad # U .. .. .. : U L U L
102
103/*
104 * We know that we're going to do at least 16 quads, which means we are
105 * going to be able to use the large block clear loop at least once.
106 * Figure out how many quads we need to clear before we are 0mod64 aligned
107 * so we can use the wh64 instruction.
108 */
109
110 nop # .. .. .. E
111 nop # .. .. E ..
112 nop # .. E .. ..
113 beq $3, $bigalign # U .. .. .. : U L U L : Aligned 0mod64
114
115$alignmod64:
116 EX( stq_u $31, 0($6) ) # .. .. .. L
117 addq $3, 8, $3 # .. .. E ..
118 subq $0, 8, $0 # .. E .. ..
119 nop # E .. .. .. : U L U L
120
121 nop # .. .. .. E
122 subq $1, 1, $1 # .. .. E ..
123 addq $6, 8, $6 # .. E .. ..
124 blt $3, $alignmod64 # U .. .. .. : U L U L
125
126$bigalign:
127/*
128 * $0 is the number of bytes left
129 * $1 is the number of quads left
130 * $6 is aligned 0mod64
131 * we know that we'll be taking a minimum of one trip through
132 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
133 * We are _not_ going to update $0 after every single store. That
134 * would be silly, because there will be cross-cluster dependencies
135 * no matter how the code is scheduled. By doing it in slightly
136 * staggered fashion, we can still do this loop in 5 fetches
 137 * The worst case will be doing two extra quads in some future execution,
138 * in the event of an interrupted clear.
139 * Assumes the wh64 needs to be for 2 trips through the loop in the future
 140 * The wh64 is issued for the starting destination address for trip +2
141 * through the loop, and if there are less than two trips left, the target
142 * address will be for the current trip.
143 */
144 nop # E :
145 nop # E :
146 nop # E :
147 bis $6,$6,$3 # E : U L U L : Initial wh64 address is dest
148 /* This might actually help for the current trip... */
149
150$do_wh64:
151 wh64 ($3) # .. .. .. L1 : memory subsystem hint
152 subq $1, 16, $4 # .. .. E .. : Forward calculation - repeat the loop?
153 EX( stq_u $31, 0($6) ) # .. L .. ..
154 subq $0, 8, $0 # E .. .. .. : U L U L
155
156 addq $6, 128, $3 # E : Target address of wh64
157 EX( stq_u $31, 8($6) ) # L :
158 EX( stq_u $31, 16($6) ) # L :
159 subq $0, 16, $0 # E : U L L U
160
161 nop # E :
162 EX( stq_u $31, 24($6) ) # L :
163 EX( stq_u $31, 32($6) ) # L :
164 subq $0, 168, $5 # E : U L L U : two trips through the loop left?
165 /* 168 = 192 - 24, since we've already completed some stores */
166
167 subq $0, 16, $0 # E :
168 EX( stq_u $31, 40($6) ) # L :
169 EX( stq_u $31, 48($6) ) # L :
170 cmovlt $5, $6, $3 # E : U L L U : Latency 2, extra mapping cycle
171
172 subq $1, 8, $1 # E :
173 subq $0, 16, $0 # E :
174 EX( stq_u $31, 56($6) ) # L :
175 nop # E : U L U L
176
177 nop # E :
178 subq $0, 8, $0 # E :
179 addq $6, 64, $6 # E :
180 bge $4, $do_wh64 # U : U L U L
181
182$trailquad:
183 # zero to 16 quadwords left to store, plus any trailing bytes
184 # $1 is the number of quadwords left to go.
185 #
186 nop # .. .. .. E
187 nop # .. .. E ..
188 nop # .. E .. ..
189 beq $1, $trailbytes # U .. .. .. : U L U L : Only 0..7 bytes to go
190
191$onequad:
192 EX( stq_u $31, 0($6) ) # .. .. .. L
193 subq $1, 1, $1 # .. .. E ..
194 subq $0, 8, $0 # .. E .. ..
195 nop # E .. .. .. : U L U L
196
197 nop # .. .. .. E
198 nop # .. .. E ..
199 addq $6, 8, $6 # .. E .. ..
200 bgt $1, $onequad # U .. .. .. : U L U L
201
202 # We have an unknown number of bytes left to go.
203$trailbytes:
204 nop # .. .. .. E
205 nop # .. .. E ..
206 nop # .. E .. ..
207 beq $0, $zerolength # U .. .. .. : U L U L
208
209 # $0 contains the number of bytes left to copy (0..31)
210 # so we will use $0 as the loop counter
 211 # We know for a fact that $0 > 0 due to previous context
212$onebyte:
213 EX( stb $31, 0($6) ) # .. .. .. L
214 subq $0, 1, $0 # .. .. E .. :
215 addq $6, 1, $6 # .. E .. .. :
216 bgt $0, $onebyte # U .. .. .. : U L U L
217
218$zerolength:
219$exception: # Destination for exception recovery(?)
220 nop # .. .. .. E :
221 nop # .. .. E .. :
222 nop # .. E .. .. :
223 ret $31, ($28), 1 # L0 .. .. .. : L U L U
224 .end __do_clear_user
225
diff --git a/arch/alpha/lib/ev6-copy_page.S b/arch/alpha/lib/ev6-copy_page.S
new file mode 100644
index 000000000000..b789db192754
--- /dev/null
+++ b/arch/alpha/lib/ev6-copy_page.S
@@ -0,0 +1,203 @@
1/*
2 * arch/alpha/lib/ev6-copy_page.S
3 *
4 * Copy an entire page.
5 */
6
7/* The following comparison of this routine vs the normal copy_page.S
8 was written by an unnamed ev6 hardware designer and forwarded to me
9 via Steven Hobbs <hobbs@steven.zko.dec.com>.
10
11 First Problem: STQ overflows.
12 -----------------------------
13
14 It would be nice if EV6 handled every resource overflow efficiently,
15 but for some it doesn't. Including store queue overflows. It causes
16 a trap and a restart of the pipe.
17
18 To get around this we sometimes use (to borrow a term from a VSSAD
19 researcher) "aeration". The idea is to slow the rate at which the
20 processor receives valid instructions by inserting nops in the fetch
21 path. In doing so, you can prevent the overflow and actually make
22 the code run faster. You can, of course, take advantage of the fact
23 that the processor can fetch at most 4 aligned instructions per cycle.
24
25 I inserted enough nops to force it to take 10 cycles to fetch the
26 loop code. In theory, EV6 should be able to execute this loop in
27 9 cycles but I was not able to get it to run that fast -- the initial
28 conditions were such that I could not reach this optimum rate on
29 (chaotic) EV6. I wrote the code such that everything would issue
30 in order.
31
32 Second Problem: Dcache index matches.
33 -------------------------------------
34
35 If you are going to use this routine on random aligned pages, there
36 is a 25% chance that the pages will be at the same dcache indices.
37 This results in many nasty memory traps without care.
38
39 The solution is to schedule the prefetches to avoid the memory
40 conflicts. I schedule the wh64 prefetches farther ahead of the
41 read prefetches to avoid this problem.
42
43 Third Problem: Needs more prefetching.
44 --------------------------------------
45
46 In order to improve the code I added deeper prefetching to take the
47 most advantage of EV6's bandwidth.
48
49 I also prefetched the read stream. Note that adding the read prefetch
50 forced me to add another cycle to the inner-most kernel - up to 11
51 from the original 8 cycles per iteration. We could improve performance
52 further by unrolling the loop and doing multiple prefetches per cycle.
53
54 I think that the code below will be very robust and fast code for the
55 purposes of copying aligned pages. It is slower when both source and
56 destination pages are in the dcache, but it is my guess that this is
57 less important than the dcache miss case. */
58
59
60 .text
61 .align 4
62 .global copy_page
63 .ent copy_page
64copy_page:
65 .prologue 0
66
67 /* Prefetch 5 read cachelines; write-hint 10 cache lines. */
68 wh64 ($16)
69 ldl $31,0($17)
70 ldl $31,64($17)
71 lda $1,1*64($16)
72
73 wh64 ($1)
74 ldl $31,128($17)
75 ldl $31,192($17)
76 lda $1,2*64($16)
77
78 wh64 ($1)
79 ldl $31,256($17)
80 lda $18,118
81 lda $1,3*64($16)
82
83 wh64 ($1)
84 nop
85 lda $1,4*64($16)
86 lda $2,5*64($16)
87
88 wh64 ($1)
89 wh64 ($2)
90 lda $1,6*64($16)
91 lda $2,7*64($16)
92
93 wh64 ($1)
94 wh64 ($2)
95 lda $1,8*64($16)
96 lda $2,9*64($16)
97
98 wh64 ($1)
99 wh64 ($2)
100 lda $19,10*64($16)
101 nop
102
103 /* Main prefetching/write-hinting loop. */
1041: ldq $0,0($17)
105 ldq $1,8($17)
106 unop
107 unop
108
109 unop
110 unop
111 ldq $2,16($17)
112 ldq $3,24($17)
113
114 ldq $4,32($17)
115 ldq $5,40($17)
116 unop
117 unop
118
119 unop
120 unop
121 ldq $6,48($17)
122 ldq $7,56($17)
123
124 ldl $31,320($17)
125 unop
126 unop
127 unop
128
129 /* This gives the extra cycle of aeration above the minimum. */
130 unop
131 unop
132 unop
133 unop
134
135 wh64 ($19)
136 unop
137 unop
138 unop
139
140 stq $0,0($16)
141 subq $18,1,$18
142 stq $1,8($16)
143 unop
144
145 unop
146 stq $2,16($16)
147 addq $17,64,$17
148 stq $3,24($16)
149
150 stq $4,32($16)
151 stq $5,40($16)
152 addq $19,64,$19
153 unop
154
155 stq $6,48($16)
156 stq $7,56($16)
157 addq $16,64,$16
158 bne $18, 1b
159
160 /* Prefetch the final 5 cache lines of the read stream. */
161 lda $18,10
162 ldl $31,320($17)
163 ldl $31,384($17)
164 ldl $31,448($17)
165
166 ldl $31,512($17)
167 ldl $31,576($17)
168 nop
169 nop
170
171 /* Non-prefetching, non-write-hinting cleanup loop for the
172 final 10 cache lines. */
1732: ldq $0,0($17)
174 ldq $1,8($17)
175 ldq $2,16($17)
176 ldq $3,24($17)
177
178 ldq $4,32($17)
179 ldq $5,40($17)
180 ldq $6,48($17)
181 ldq $7,56($17)
182
183 stq $0,0($16)
184 subq $18,1,$18
185 stq $1,8($16)
186 addq $17,64,$17
187
188 stq $2,16($16)
189 stq $3,24($16)
190 stq $4,32($16)
191 stq $5,40($16)
192
193 stq $6,48($16)
194 stq $7,56($16)
195 addq $16,64,$16
196 bne $18, 2b
197
198 ret
199 nop
200 unop
201 nop
202
203 .end copy_page
diff --git a/arch/alpha/lib/ev6-copy_user.S b/arch/alpha/lib/ev6-copy_user.S
new file mode 100644
index 000000000000..db42ffe9c350
--- /dev/null
+++ b/arch/alpha/lib/ev6-copy_user.S
@@ -0,0 +1,259 @@
1/*
2 * arch/alpha/lib/ev6-copy_user.S
3 *
4 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
5 *
6 * Copy to/from user space, handling exceptions as we go.. This
7 * isn't exactly pretty.
8 *
9 * This is essentially the same as "memcpy()", but with a few twists.
10 * Notably, we have to make sure that $0 is always up-to-date and
11 * contains the right "bytes left to copy" value (and that it is updated
12 * only _after_ a successful copy). There is also some rather minor
13 * exception setup stuff..
14 *
15 * NOTE! This is not directly C-callable, because the calling semantics are
16 * different:
17 *
18 * Inputs:
19 * length in $0
20 * destination address in $6
21 * source address in $7
22 * return address in $28
23 *
24 * Outputs:
25 * bytes left to copy in $0
26 *
27 * Clobbers:
28 * $1,$2,$3,$4,$5,$6,$7
29 *
30 * Much of the information about 21264 scheduling/coding comes from:
31 * Compiler Writer's Guide for the Alpha 21264
32 * abbreviated as 'CWG' in other comments here
33 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
34 * Scheduling notation:
35 * E - either cluster
36 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
37 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
38 */
39
40/* Allow an exception for an insn; exit if we get one. */
41#define EXI(x,y...) \
42 99: x,##y; \
43 .section __ex_table,"a"; \
44 .long 99b - .; \
45 lda $31, $exitin-99b($31); \
46 .previous
47
48#define EXO(x,y...) \
49 99: x,##y; \
50 .section __ex_table,"a"; \
51 .long 99b - .; \
52 lda $31, $exitout-99b($31); \
53 .previous
54
55 .set noat
56 .align 4
57 .globl __copy_user
58 .ent __copy_user
59 # Pipeline info: Slotting & Comments
60__copy_user:
61 .prologue 0
62 subq $0, 32, $1 # .. E .. .. : Is this going to be a small copy?
63 beq $0, $zerolength # U .. .. .. : U L U L
64
65 and $6,7,$3 # .. .. .. E : is leading dest misalignment
66 ble $1, $onebyteloop # .. .. U .. : 1st branch : small amount of data
67 beq $3, $destaligned # .. U .. .. : 2nd (one cycle fetcher stall)
68 subq $3, 8, $3 # E .. .. .. : L U U L : trip counter
69/*
70 * The fetcher stall also hides the 1 cycle cross-cluster stall for $3 (L --> U)
71 * This loop aligns the destination a byte at a time
72 * We know we have at least one trip through this loop
73 */
74$aligndest:
75 EXI( ldbu $1,0($7) ) # .. .. .. L : Keep loads separate from stores
76 addq $6,1,$6 # .. .. E .. : Section 3.8 in the CWG
77 addq $3,1,$3 # .. E .. .. :
78 nop # E .. .. .. : U L U L
79
80/*
81 * the -1 is to compensate for the inc($6) done in a previous quadpack
82 * which allows us to have zero dependencies within either quadpack in the loop
83 */
84 EXO( stb $1,-1($6) ) # .. .. .. L :
85 addq $7,1,$7 # .. .. E .. : Section 3.8 in the CWG
86 subq $0,1,$0 # .. E .. .. :
87 bne $3, $aligndest # U .. .. .. : U L U L
88
89/*
90 * If we fell through into here, we have a minimum of 33 - 7 bytes
91 * If we arrived via branch, we have a minimum of 32 bytes
92 */
93$destaligned:
94 and $7,7,$1 # .. .. .. E : Check _current_ source alignment
95 bic $0,7,$4 # .. .. E .. : number bytes as a quadword loop
96 EXI( ldq_u $3,0($7) ) # .. L .. .. : Forward fetch for fallthrough code
97 beq $1,$quadaligned # U .. .. .. : U L U L
98
99/*
100 * In the worst case, we've just executed an ldq_u here from 0($7)
101 * and we'll repeat it once if we take the branch
102 */
103
104/* Misaligned quadword loop - not unrolled. Leave it that way. */
105$misquad:
106 EXI( ldq_u $2,8($7) ) # .. .. .. L :
107 subq $4,8,$4 # .. .. E .. :
108 extql $3,$7,$3 # .. U .. .. :
109 extqh $2,$7,$1 # U .. .. .. : U U L L
110
111 bis $3,$1,$1 # .. .. .. E :
112 EXO( stq $1,0($6) ) # .. .. L .. :
113 addq $7,8,$7 # .. E .. .. :
114 subq $0,8,$0 # E .. .. .. : U L L U
115
116 addq $6,8,$6 # .. .. .. E :
117 bis $2,$2,$3 # .. .. E .. :
118 nop # .. E .. .. :
119 bne $4,$misquad # U .. .. .. : U L U L
120
121 nop # .. .. .. E
122 nop # .. .. E ..
123 nop # .. E .. ..
124 beq $0,$zerolength # U .. .. .. : U L U L
125
126/* We know we have at least one trip through the byte loop */
127 EXI ( ldbu $2,0($7) ) # .. .. .. L : No loads in the same quad
128 addq $6,1,$6 # .. .. E .. : as the store (Section 3.8 in CWG)
129 nop # .. E .. .. :
130 br $31, $dirtyentry # L0 .. .. .. : L U U L
131/* Do the trailing byte loop load, then hop into the store part of the loop */
132
133/*
134 * A minimum of (33 - 7) bytes to do a quad at a time.
135 * Based upon the usage context, it's worth the effort to unroll this loop
136 * $0 - number of bytes to be moved
137 * $4 - number of bytes to move as quadwords
138 * $6 is current destination address
139 * $7 is current source address
140 */
141$quadaligned:
142 subq $4, 32, $2 # .. .. .. E : do not unroll for small stuff
143 nop # .. .. E ..
144 nop # .. E .. ..
145 blt $2, $onequad # U .. .. .. : U L U L
146
147/*
148 * There is a significant assumption here that the source and destination
149 * addresses differ by more than 32 bytes. In this particular case, a
150 * sparsity of registers further bounds this to be a minimum of 8 bytes.
151 * But if this isn't met, then the output result will be incorrect.
152 * Furthermore, due to a lack of available registers, we really can't
153 * unroll this to be an 8x loop (which would enable us to use the wh64
154 * memory hint instruction).
155 */
156$unroll4:
157 EXI( ldq $1,0($7) ) # .. .. .. L
158 EXI( ldq $2,8($7) ) # .. .. L ..
159 subq $4,32,$4 # .. E .. ..
160 nop # E .. .. .. : U U L L
161
162 addq $7,16,$7 # .. .. .. E
163 EXO( stq $1,0($6) ) # .. .. L ..
164 EXO( stq $2,8($6) ) # .. L .. ..
165 subq $0,16,$0 # E .. .. .. : U L L U
166
167 addq $6,16,$6 # .. .. .. E
168 EXI( ldq $1,0($7) ) # .. .. L ..
169 EXI( ldq $2,8($7) ) # .. L .. ..
170 subq $4, 32, $3 # E .. .. .. : U U L L : is there enough for another trip?
171
172 EXO( stq $1,0($6) ) # .. .. .. L
173 EXO( stq $2,8($6) ) # .. .. L ..
174 subq $0,16,$0 # .. E .. ..
175 addq $7,16,$7 # E .. .. .. : U L L U
176
177 nop # .. .. .. E
178 nop # .. .. E ..
179 addq $6,16,$6 # .. E .. ..
180 bgt $3,$unroll4 # U .. .. .. : U L U L
181
182 nop
183 nop
184 nop
185 beq $4, $noquads
186
187$onequad:
188 EXI( ldq $1,0($7) )
189 subq $4,8,$4
190 addq $7,8,$7
191 nop
192
193 EXO( stq $1,0($6) )
194 subq $0,8,$0
195 addq $6,8,$6
196 bne $4,$onequad
197
198$noquads:
199 nop
200 nop
201 nop
202 beq $0,$zerolength
203
204/*
205 * For small copies (or the tail of a larger copy), do a very simple byte loop.
206 * There's no point in doing a lot of complex alignment calculations to try to
207 * do quadword stuff for a small amount of data.
208 * $0 - remaining number of bytes left to copy
209 * $6 - current dest addr
210 * $7 - current source addr
211 */
212
213$onebyteloop:
214 EXI ( ldbu $2,0($7) ) # .. .. .. L : No loads in the same quad
215 addq $6,1,$6 # .. .. E .. : as the store (Section 3.8 in CWG)
216 nop # .. E .. .. :
217 nop # E .. .. .. : U L U L
218
219$dirtyentry:
220/*
221 * the -1 is to compensate for the inc($6) done in a previous quadpack
222 * which allows us to have zero dependencies within either quadpack in the loop
223 */
224 EXO ( stb $2,-1($6) ) # .. .. .. L :
225 addq $7,1,$7 # .. .. E .. : quadpack as the load
226 subq $0,1,$0 # .. E .. .. : change count _after_ copy
227 bgt $0,$onebyteloop # U .. .. .. : U L U L
228
229$zerolength:
230$exitout: # Destination for exception recovery(?)
231 nop # .. .. .. E
232 nop # .. .. E ..
233 nop # .. E .. ..
234 ret $31,($28),1 # L0 .. .. .. : L U L U
235
236$exitin:
237
238 /* A stupid byte-by-byte zeroing of the rest of the output
239 buffer. This cures security holes by never leaving
240 random kernel data around to be copied elsewhere. */
241
242 nop
243 nop
244 nop
245 mov $0,$1
246
247$101:
248 EXO ( stb $31,0($6) ) # L
249 subq $1,1,$1 # E
250 addq $6,1,$6 # E
251 bgt $1,$101 # U
252
253 nop
254 nop
255 nop
256 ret $31,($28),1 # L0
257
258 .end __copy_user
259
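
The $exitin path above can be read as the following sketch, assuming $6 still points at the first unwritten destination byte and $0 holds the bytes not yet copied (the helper name is illustrative):

/* Zero the unwritten tail of the destination after a faulting source
 * load, so stale kernel data is never handed back to the caller.
 * Mirrors the $101 byte loop. */
static void exitin_fixup_model(unsigned char *dst, unsigned long left)
{
        while (left--)
                *dst++ = 0;       /* stb $31,0($6) */
}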
diff --git a/arch/alpha/lib/ev6-csum_ipv6_magic.S b/arch/alpha/lib/ev6-csum_ipv6_magic.S
new file mode 100644
index 000000000000..de1948a69118
--- /dev/null
+++ b/arch/alpha/lib/ev6-csum_ipv6_magic.S
@@ -0,0 +1,126 @@
1/*
2 * arch/alpha/lib/ev6-csum_ipv6_magic.S
3 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
4 *
5 * unsigned short csum_ipv6_magic(struct in6_addr *saddr,
6 * struct in6_addr *daddr,
7 * __u32 len,
8 * unsigned short proto,
9 * unsigned int csum);
10 *
11 * Much of the information about 21264 scheduling/coding comes from:
12 * Compiler Writer's Guide for the Alpha 21264
13 * abbreviated as 'CWG' in other comments here
14 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
15 * Scheduling notation:
16 * E - either cluster
17 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
18 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
19 * Try not to change the actual algorithm if possible for consistency.
20 * Determining actual stalls (other than slotting) doesn't appear to be easy to do.
21 *
28 * Swap <proto> (takes form 0xaabb)
29 * Then shift it left by 48, so result is:
30 * 0xbbaa0000 00000000
31 * Then turn it back into a sign extended 32-bit item
32 * 0xbbaa0000
33 *
34 * Swap <len> (an unsigned int) using Mike Burrows' 7-instruction sequence
35 * (we can't hide the 3-cycle latency of the unpkbw in the 6-instruction sequence)
36 * Assume input takes form 0xAABBCCDD
37 *
38 * Finally, original 'folding' approach is to split the long into 4 unsigned shorts
39 * add 4 ushorts, resulting in ushort/carry
40 * add carry bits + ushort --> ushort
41 * add carry bits + ushort --> ushort (in case the carry results in an overflow)
42 * Truncate to a ushort. (took 13 instructions)
43 * From doing some testing, using the approach in checksum.c:from64to16()
44 * results in the same outcome:
45 * split into 2 uints, add those, generating a ulong
46 * add the 3 low ushorts together, generating a uint
47 * a final add of the 2 lower ushorts
48 * truncating the result.
49 */
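
A C sketch of that from64to16()-style fold (not the kernel helper itself, just the word-splitting steps made explicit):

#include <stdint.h>

/* Fold a 64-bit ones-complement sum down to 16 bits:
 *   - add the two 32-bit halves            (fits in 33 bits)
 *   - add the three low 16-bit words       (fits in 17 bits)
 *   - one final fold of the two low words  (fits in 16 bits)
 */
static uint16_t fold64_model(uint64_t x)
{
        uint64_t t = (x & 0xffffffffu) + (x >> 32);

        t = (t & 0xffff) + ((t >> 16) & 0xffff) + (t >> 32);
        t = (t & 0xffff) + (t >> 16);
        return (uint16_t)t;
}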
50
51 .globl csum_ipv6_magic
52 .align 4
53 .ent csum_ipv6_magic
54 .frame $30,0,$26,0
55csum_ipv6_magic:
56 .prologue 0
57
58 ldq $0,0($16) # L : Latency: 3
59 inslh $18,7,$4 # U : 0000000000AABBCC
60 ldq $1,8($16) # L : Latency: 3
61 sll $19,8,$7 # U : U L U L : 0x00000000 00aabb00
62
63 zapnot $20,15,$20 # U : zero extend incoming csum
64 ldq $2,0($17) # L : Latency: 3
65 sll $19,24,$19 # U : U L L U : 0x000000aa bb000000
66 inswl $18,3,$18 # U : 000000CCDD000000
67
68 ldq $3,8($17) # L : Latency: 3
69 bis $18,$4,$18 # E : 000000CCDDAABBCC
70 addl $19,$7,$19 # E : <sign bits>bbaabb00
71 nop # E : U L U L
72
73 addq $20,$0,$20 # E : begin summing the words
74 srl $18,16,$4 # U : 0000000000CCDDAA
75 zap $19,0x3,$19 # U : <sign bits>bbaa0000
76 nop # E : L U U L
77
78 cmpult $20,$0,$0 # E :
79 addq $20,$1,$20 # E :
80 zapnot $18,0xa,$18 # U : 00000000DD00BB00
81 zap $4,0xa,$4 # U : U U L L : 0000000000CC00AA
82
83 or $18,$4,$18 # E : 00000000DDCCBBAA
84 nop # E :
85 cmpult $20,$1,$1 # E :
86 addq $20,$2,$20 # E : U L U L
87
88 cmpult $20,$2,$2 # E :
89 addq $20,$3,$20 # E :
90 cmpult $20,$3,$3 # E : (1 cycle stall on $20)
91 addq $20,$18,$20 # E : U L U L (1 cycle stall on $20)
92
93 cmpult $20,$18,$18 # E :
94 addq $20,$19,$20 # E : (1 cycle stall on $20)
95 addq $0,$1,$0 # E : merge the carries back into the csum
96 addq $2,$3,$2 # E :
97
98 cmpult $20,$19,$19 # E :
99 addq $18,$19,$18 # E : (1 cycle stall on $19)
100 addq $0,$2,$0 # E :
101 addq $20,$18,$20 # E : U L U L :
102 /* (1 cycle stall on $18, 2 cycles on $20) */
103
104 addq $0,$20,$0 # E :
105 zapnot $0,15,$1 # U : Start folding output (1 cycle stall on $0)
106 nop # E :
107 srl $0,32,$0 # U : U L U L : (1 cycle stall on $0)
108
109 addq $1,$0,$1 # E : Finished generating ulong
110 extwl $1,2,$2 # U : ushort[1] (1 cycle stall on $1)
111 zapnot $1,3,$0 # U : ushort[0] (1 cycle stall on $1)
112 extwl $1,4,$1 # U : ushort[2] (1 cycle stall on $1)
113
114 addq $0,$2,$0 # E
115 addq $0,$1,$3 # E : Finished generating uint
116 /* (1 cycle stall on $0) */
117 extwl $3,2,$1 # U : ushort[1] (1 cycle stall on $3)
118 nop # E : L U L U
119
120 addq $1,$3,$0 # E : Final carry
121 not $0,$4 # E : complement (1 cycle stall on $0)
122 zapnot $4,3,$0 # U : clear upper garbage bits
123 /* (1 cycle stall on $4) */
124 ret # L0 : L U L U
125
126 .end csum_ipv6_magic
diff --git a/arch/alpha/lib/ev6-divide.S b/arch/alpha/lib/ev6-divide.S
new file mode 100644
index 000000000000..2a82b9be93fa
--- /dev/null
+++ b/arch/alpha/lib/ev6-divide.S
@@ -0,0 +1,259 @@
1/*
2 * arch/alpha/lib/ev6-divide.S
3 *
4 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
5 *
6 * Alpha division..
7 */
8
9/*
10 * The alpha chip doesn't provide hardware division, so we have to do it
11 * by hand. The compiler expects the functions
12 *
13 * __divqu: 64-bit unsigned long divide
14 * __remqu: 64-bit unsigned long remainder
15 * __divqs/__remqs: signed 64-bit
16 * __divlu/__remlu: unsigned 32-bit
17 * __divls/__remls: signed 32-bit
18 *
19 * These are not normal C functions: instead of the normal
20 * calling sequence, these expect their arguments in registers
21 * $24 and $25, and return the result in $27. Register $28 may
22 * be clobbered (assembly temporary), anything else must be saved.
23 *
24 * In short: painful.
25 *
26 * This is a rather simple bit-at-a-time algorithm: it's very good
27 * at dividing random 64-bit numbers, but the more usual case where
28 * the divisor is small is handled better by the DEC algorithm
29 * using lookup tables. This uses much less memory, though, and is
30 * nicer on the cache.. Besides, I don't know the copyright status
31 * of the DEC code.
32 */
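
A C sketch of the bit-at-a-time algorithm (illustrative only, assuming a 64-bit unsigned long as on Alpha; the real code keeps everything in $24/$25/$27 and, on a zero divisor, branches straight to the epilogue, which amounts to quotient 0 and remainder == dividend):

/* Shift-and-subtract unsigned divide, mirroring the mask/divisor walk. */
static unsigned long divqu_model(unsigned long dividend, unsigned long divisor,
                                 unsigned long *rem)
{
        unsigned long quotient = 0, mask = 1;

        if (divisor == 0) {                     /* "beq divisor, 9f" */
                *rem = dividend;
                return 0;
        }

        /* scale up until divisor >= dividend or its top bit is set */
        while (divisor < dividend && !(divisor >> 63)) {
                divisor <<= 1;
                mask <<= 1;
        }

        /* walk back down one bit per iteration, subtracting where possible */
        while (mask) {
                if (divisor <= dividend) {
                        dividend -= divisor;
                        quotient |= mask;
                }
                divisor >>= 1;
                mask >>= 1;
        }

        *rem = dividend;                        /* the modulus */
        return quotient;
}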
33
34/*
35 * My temporaries:
36 * $0 - current bit
37 * $1 - shifted divisor
38 * $2 - modulus/quotient
39 *
40 * $23 - return address
41 * $24 - dividend
42 * $25 - divisor
43 *
44 * $27 - quotient/modulus
45 * $28 - compare status
46 *
47 * Much of the information about 21264 scheduling/coding comes from:
48 * Compiler Writer's Guide for the Alpha 21264
49 * abbreviated as 'CWG' in other comments here
50 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
51 * Scheduling notation:
52 * E - either cluster
53 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
54 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
55 * Try not to change the actual algorithm if possible for consistency.
56 */
57
58#define halt .long 0
59
60/*
61 * Select function type and registers
62 */
63#define mask $0
64#define divisor $1
65#define compare $28
66#define tmp1 $3
67#define tmp2 $4
68
69#ifdef DIV
70#define DIV_ONLY(x,y...) x,##y
71#define MOD_ONLY(x,y...)
72#define func(x) __div##x
73#define modulus $2
74#define quotient $27
75#define GETSIGN(x) xor $24,$25,x
76#define STACK 48
77#else
78#define DIV_ONLY(x,y...)
79#define MOD_ONLY(x,y...) x,##y
80#define func(x) __rem##x
81#define modulus $27
82#define quotient $2
83#define GETSIGN(x) bis $24,$24,x
84#define STACK 32
85#endif
86
87/*
88 * For 32-bit operations, we need to extend to 64-bit
89 */
90#ifdef INTSIZE
91#define ufunction func(lu)
92#define sfunction func(l)
93#define LONGIFY(x) zapnot x,15,x
94#define SLONGIFY(x) addl x,0,x
95#else
96#define ufunction func(qu)
97#define sfunction func(q)
98#define LONGIFY(x)
99#define SLONGIFY(x)
100#endif
101
102.set noat
103.align 4
104.globl ufunction
105.ent ufunction
106ufunction:
107 subq $30,STACK,$30 # E :
108 .frame $30,STACK,$23
109 .prologue 0
110
1117: stq $1, 0($30) # L :
112 bis $25,$25,divisor # E :
113 stq $2, 8($30) # L : L U L U
114
115 bis $24,$24,modulus # E :
116 stq $0,16($30) # L :
117 bis $31,$31,quotient # E :
118 LONGIFY(divisor) # E : U L L U
119
120 stq tmp1,24($30) # L :
121 LONGIFY(modulus) # E :
122 bis $31,1,mask # E :
123 DIV_ONLY(stq tmp2,32($30)) # L : L U U L
124
125 beq divisor, 9f /* div by zero */
126 /*
127 * In spite of the DIV_ONLY being either a non-instruction
128 * or an actual stq, the addition of the .align directive
129 * below ensures that label 1 is going to be nicely aligned
130 */
131
132 .align 4
133#ifdef INTSIZE
134 /*
135 * shift divisor left, using 3-bit shifts for
136 * 32-bit divides as we can't overflow. Three-bit
137 * shifts will result in looping three times less
138 * here, but can result in two loops more later.
139 * Thus using a large shift isn't worth it (and
140 * s8add pairs better than a sll..)
141 */
1421: cmpult divisor,modulus,compare # E :
143 s8addq divisor,$31,divisor # E :
144 s8addq mask,$31,mask # E :
145 bne compare,1b # U : U L U L
146#else
1471: cmpult divisor,modulus,compare # E :
148 nop # E :
149 nop # E :
150 blt divisor, 2f # U : U L U L
151
152 addq divisor,divisor,divisor # E :
153 addq mask,mask,mask # E :
154 unop # E :
155 bne compare,1b # U : U L U L
156#endif
157
158 /* ok, start to go right again.. */
1592:
160 /*
161 * Keep things nicely bundled... use a nop instead of not
162 * having an instruction for DIV_ONLY
163 */
164#ifdef DIV
165 DIV_ONLY(addq quotient,mask,tmp2) # E :
166#else
167 nop # E :
168#endif
169 srl mask,1,mask # U :
170 cmpule divisor,modulus,compare # E :
171 subq modulus,divisor,tmp1 # E :
172
173#ifdef DIV
174 DIV_ONLY(cmovne compare,tmp2,quotient) # E : Latency 2, extra map slot
175 nop # E : as part of the cmovne
176 srl divisor,1,divisor # U :
177 nop # E : L U L U
178
179 nop # E :
180 cmovne compare,tmp1,modulus # E : Latency 2, extra map slot
181 nop # E : as part of the cmovne
182 bne mask,2b # U : U L U L
183#else
184 srl divisor,1,divisor # U :
185 cmovne compare,tmp1,modulus # E : Latency 2, extra map slot
186 nop # E : as part of the cmovne
187 bne mask,2b # U : U L L U
188#endif
189
1909: ldq $1, 0($30) # L :
191 ldq $2, 8($30) # L :
192 nop # E :
193 nop # E : U U L L
194
195 ldq $0,16($30) # L :
196 ldq tmp1,24($30) # L :
197 nop # E :
198 nop # E :
199
200#ifdef DIV
201 DIV_ONLY(ldq tmp2,32($30)) # L :
202#else
203 nop # E :
204#endif
205 addq $30,STACK,$30 # E :
206 ret $31,($23),1 # L0 : L U U L
207 .end ufunction
208
209/*
210 * Uhh.. Ugly signed division. I'd rather not have it at all, but
211 * it's needed in some circumstances. There are different ways to
212 * handle this, really. This does:
213 * -a / b = a / -b = -(a / b)
214 * -a % b = -(a % b)
215 * a % -b = a % b
216 * which is probably not the best solution, but at least should
217 * have the property that (x/y)*y + (x%y) = x.
218 */
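
Those identities translate directly into C. A sketch, reusing the hypothetical divqu_model() from the unsigned sketch earlier in this file: divide the absolute values, then give the quotient the sign of a^b (the DIV GETSIGN) and the remainder the sign of the dividend (the MOD GETSIGN). The LONG_MIN corners simply wrap, much as the two's-complement assembly does.

unsigned long divqu_model(unsigned long dividend, unsigned long divisor,
                          unsigned long *rem);          /* sketch above */

static long divqs_model(long a, long b)
{
        unsigned long r;
        unsigned long q = divqu_model(a < 0 ? 0UL - (unsigned long)a : (unsigned long)a,
                                      b < 0 ? 0UL - (unsigned long)b : (unsigned long)b,
                                      &r);

        return ((a ^ b) < 0) ? -(long)q : (long)q;      /* -a/b = a/-b = -(a/b) */
}

static long remqs_model(long a, long b)
{
        unsigned long r;

        divqu_model(a < 0 ? 0UL - (unsigned long)a : (unsigned long)a,
                    b < 0 ? 0UL - (unsigned long)b : (unsigned long)b,
                    &r);
        return (a < 0) ? -(long)r : (long)r;            /* sign follows the dividend */
}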
219.align 4
220.globl sfunction
221.ent sfunction
222sfunction:
223 subq $30,STACK,$30 # E :
224 .frame $30,STACK,$23
225 .prologue 0
226 bis $24,$25,$28 # E :
227 SLONGIFY($28) # E :
228 bge $28,7b # U :
229
230 stq $24,0($30) # L :
231 subq $31,$24,$28 # E :
232 stq $25,8($30) # L :
233 nop # E : U L U L
234
235 cmovlt $24,$28,$24 /* abs($24) */ # E : Latency 2, extra map slot
236 nop # E : as part of the cmov
237 stq $23,16($30) # L :
238 subq $31,$25,$28 # E : U L U L
239
240 stq tmp1,24($30) # L :
241 cmovlt $25,$28,$25 /* abs($25) */ # E : Latency 2, extra map slot
242 nop # E :
243 bsr $23,ufunction # L0: L U L U
244
245 ldq $24,0($30) # L :
246 ldq $25,8($30) # L :
247 GETSIGN($28) # E :
248 subq $31,$27,tmp1 # E : U U L L
249
250 SLONGIFY($28) # E :
251 ldq $23,16($30) # L :
252 cmovlt $28,tmp1,$27 # E : Latency 2, extra map slot
253 nop # E : U L L U : as part of the cmov
254
255 ldq tmp1,24($30) # L :
256 nop # E : as part of the cmov
257 addq $30,STACK,$30 # E :
258 ret $31,($23),1 # L0 : L U U L
259 .end sfunction
diff --git a/arch/alpha/lib/ev6-memchr.S b/arch/alpha/lib/ev6-memchr.S
new file mode 100644
index 000000000000..a8e843dbcc23
--- /dev/null
+++ b/arch/alpha/lib/ev6-memchr.S
@@ -0,0 +1,191 @@
1/*
2 * arch/alpha/lib/ev6-memchr.S
3 *
4 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
5 *
6 * Finds characters in a memory area. Optimized for the Alpha:
7 *
8 * - memory accessed as aligned quadwords only
9 * - uses cmpbge to compare 8 bytes in parallel
10 * - does binary search to find 0 byte in last
11 * quadword (HAKMEM needed 12 instructions to
12 * do this instead of the 9 instructions that
13 * binary search needs).
14 *
15 * For correctness consider that:
16 *
17 * - only minimum number of quadwords may be accessed
18 * - the third argument is an unsigned long
19 *
20 * Much of the information about 21264 scheduling/coding comes from:
21 * Compiler Writer's Guide for the Alpha 21264
22 * abbreviated as 'CWG' in other comments here
23 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
24 * Scheduling notation:
25 * E - either cluster
26 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
27 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
28 * Try not to change the actual algorithm if possible for consistency.
29 */
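
A C sketch of the same idea (illustrative only; assumes a little-endian machine like Alpha and a GCC-style __builtin_ctz standing in for the ev67 cttz or the 0x0f/0x33/0x55 binary search below; the ldq_u/extql/extqh alignment fixups and the masking of the final quad are left out):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Model of "cmpbge $31, x": bit i is set iff byte i of x is zero,
 * since 0 >= byte only when the byte is 0. */
static unsigned int zero_byte_mask(uint64_t x)
{
        unsigned int mask = 0;
        int i;

        for (i = 0; i < 8; i++)
                if (((x >> (8 * i)) & 0xff) == 0)
                        mask |= 1u << i;
        return mask;
}

void *memchr_model(const void *s, int c, size_t n)
{
        const unsigned char *p = s;
        uint64_t pattern = (uint64_t)(unsigned char)c * 0x0101010101010101ULL;

        while (n >= 8) {
                uint64_t q;
                unsigned int mask;

                memcpy(&q, p, 8);                     /* stands in for ldq */
                mask = zero_byte_mask(q ^ pattern);   /* bytes equal to c */
                if (mask)
                        return (void *)(p + __builtin_ctz(mask));
                p += 8;
                n -= 8;
        }
        while (n--) {                                 /* short/tail case */
                if (*p == (unsigned char)c)
                        return (void *)p;
                p++;
        }
        return NULL;
}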
30
31 .set noreorder
32 .set noat
33
34 .align 4
35 .globl memchr
36 .ent memchr
37memchr:
38 .frame $30,0,$26,0
39 .prologue 0
40
41 # Hack -- if someone passes in (size_t)-1, hoping to just
42 # search til the end of the address space, we will overflow
43 # below when we find the address of the last byte. Given
44 # that we will never have a 56-bit address space, cropping
45 # the length is the easiest way to avoid trouble.
46 zap $18, 0x80, $5 # U : Bound length
47 beq $18, $not_found # U :
48 ldq_u $1, 0($16) # L : load first quadword Latency=3
49 and $17, 0xff, $17 # E : L L U U : 00000000000000ch
50
51 insbl $17, 1, $2 # U : 000000000000ch00
52 cmpult $18, 9, $4 # E : small (< 1 quad) string?
53 or $2, $17, $17 # E : 000000000000chch
54 lda $3, -1($31) # E : U L L U
55
56 sll $17, 16, $2 # U : 00000000chch0000
57 addq $16, $5, $5 # E : Max search address
58 or $2, $17, $17 # E : 00000000chchchch
59 sll $17, 32, $2 # U : U L L U : chchchch00000000
60
61 or $2, $17, $17 # E : chchchchchchchch
62 extql $1, $16, $7 # U : $7 is upper bits
63 beq $4, $first_quad # U :
64 ldq_u $6, -1($5) # L : L U U L : eight or less bytes to search Latency=3
65
66 extqh $6, $16, $6 # U : 2 cycle stall for $6
67 mov $16, $0 # E :
68 nop # E :
69 or $7, $6, $1 # E : L U L U $1 = quadword starting at $16
70
71 # Deal with the case where at most 8 bytes remain to be searched
72 # in $1. E.g.:
73 # $18 = 6
74 # $1 = ????c6c5c4c3c2c1
75$last_quad:
76 negq $18, $6 # E :
77 xor $17, $1, $1 # E :
78 srl $3, $6, $6 # U : $6 = mask of $18 bits set
79 cmpbge $31, $1, $2 # E : L U L U
80
81 nop
82 nop
83 and $2, $6, $2 # E :
84 beq $2, $not_found # U : U L U L
85
86$found_it:
87#if defined(__alpha_fix__) && defined(__alpha_cix__)
88 /*
89 * Since we are guaranteed to have set one of the bits, we don't
90 * have to worry about coming back with a 0x40 out of cttz...
91 */
92 cttz $2, $3 # U0 :
93 addq $0, $3, $0 # E : All done
94 nop # E :
95 ret # L0 : L U L U
96#else
97 /*
98 * Slow and clunky. It can probably be improved.
99 * An exercise left for others.
100 */
101 negq $2, $3 # E :
102 and $2, $3, $2 # E :
103 and $2, 0x0f, $1 # E :
104 addq $0, 4, $3 # E :
105
106 cmoveq $1, $3, $0 # E : Latency 2, extra map cycle
107 nop # E : keep with cmov
108 and $2, 0x33, $1 # E :
109 addq $0, 2, $3 # E : U L U L : 2 cycle stall on $0
110
111 cmoveq $1, $3, $0 # E : Latency 2, extra map cycle
112 nop # E : keep with cmov
113 and $2, 0x55, $1 # E :
114 addq $0, 1, $3 # E : U L U L : 2 cycle stall on $0
115
116 cmoveq $1, $3, $0 # E : Latency 2, extra map cycle
117 nop
118 nop
119 ret # L0 : L U L U
120#endif
121
122 # Deal with the case where $18 > 8 bytes remain to be
123 # searched. $16 may not be aligned.
124 .align 4
125$first_quad:
126 andnot $16, 0x7, $0 # E :
127 insqh $3, $16, $2 # U : $2 = 0000ffffffffffff ($16<0:2> ff)
128 xor $1, $17, $1 # E :
129 or $1, $2, $1 # E : U L U L $1 = ====ffffffffffff
130
131 cmpbge $31, $1, $2 # E :
132 bne $2, $found_it # U :
133 # At least one byte left to process.
134 ldq $1, 8($0) # L :
135 subq $5, 1, $18 # E : U L U L
136
137 addq $0, 8, $0 # E :
138 # Make $18 point to last quad to be accessed (the
139 # last quad may or may not be partial).
140 andnot $18, 0x7, $18 # E :
141 cmpult $0, $18, $2 # E :
142 beq $2, $final # U : U L U L
143
144 # At least two quads remain to be accessed.
145
146 subq $18, $0, $4 # E : $4 <- nr quads to be processed
147 and $4, 8, $4 # E : odd number of quads?
148 bne $4, $odd_quad_count # U :
149 # At least three quads remain to be accessed
150 mov $1, $4 # E : L U L U : move prefetched value to correct reg
151
152 .align 4
153$unrolled_loop:
154 ldq $1, 8($0) # L : prefetch $1
155 xor $17, $4, $2 # E :
156 cmpbge $31, $2, $2 # E :
157 bne $2, $found_it # U : U L U L
158
159 addq $0, 8, $0 # E :
160 nop # E :
161 nop # E :
162 nop # E :
163
164$odd_quad_count:
165 xor $17, $1, $2 # E :
166 ldq $4, 8($0) # L : prefetch $4
167 cmpbge $31, $2, $2 # E :
168 addq $0, 8, $6 # E :
169
170 bne $2, $found_it # U :
171 cmpult $6, $18, $6 # E :
172 addq $0, 8, $0 # E :
173 nop # E :
174
175 bne $6, $unrolled_loop # U :
176 mov $4, $1 # E : move prefetched value into $1
177 nop # E :
178 nop # E :
179
180$final: subq $5, $0, $18 # E : $18 <- number of bytes left to do
181 nop # E :
182 nop # E :
183 bne $18, $last_quad # U :
184
185$not_found:
186 mov $31, $0 # E :
187 nop # E :
188 nop # E :
189 ret # L0 :
190
191 .end memchr
diff --git a/arch/alpha/lib/ev6-memcpy.S b/arch/alpha/lib/ev6-memcpy.S
new file mode 100644
index 000000000000..52b37b0f2af5
--- /dev/null
+++ b/arch/alpha/lib/ev6-memcpy.S
@@ -0,0 +1,248 @@
1/*
2 * arch/alpha/lib/ev6-memcpy.S
3 * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
4 *
5 * Reasonably optimized memcpy() routine for the Alpha 21264
6 *
7 * - memory accessed as aligned quadwords only
8 * - uses cmpbge to compare 8 bytes in parallel
9 *
10 * Much of the information about 21264 scheduling/coding comes from:
11 * Compiler Writer's Guide for the Alpha 21264
12 * abbreviated as 'CWG' in other comments here
13 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
14 * Scheduling notation:
15 * E - either cluster
16 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
17 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
18 *
19 * Temp usage notes:
20 * $1,$2, - scratch
21 */
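
A structural C sketch of the strategy (illustrative only; no wh64 hints, no 64-byte unrolling, and the misaligned case falls back to bytes here instead of the ldq_u/extql/extqh merge used below):

#include <stdint.h>
#include <string.h>

void *memcpy_model(void *dst, const void *src, long n)
{
        unsigned char *d = dst;
        const unsigned char *s = src;

        if (((uintptr_t)d ^ (uintptr_t)s) & 7) {
                /* alignments differ mod 8: real code merges unaligned quads */
                while (n-- > 0)
                        *d++ = *s++;
                return dst;
        }

        /* head: byte copy until both pointers are 0mod8 */
        while (n > 0 && ((uintptr_t)d & 7)) {
                *d++ = *s++;
                n--;
        }

        /* body: one quadword per step (the real loop is unrolled with wh64) */
        while (n >= 8) {
                uint64_t q;

                memcpy(&q, s, 8);
                memcpy(d, &q, 8);
                d += 8;
                s += 8;
                n -= 8;
        }

        /* trailing bytes */
        while (n-- > 0)
                *d++ = *s++;
        return dst;
}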
22
23 .set noreorder
24 .set noat
25
26 .align 4
27 .globl memcpy
28 .ent memcpy
29memcpy:
30 .frame $30,0,$26,0
31 .prologue 0
32
33 mov $16, $0 # E : copy dest to return
34 ble $18, $nomoredata # U : done with the copy?
35 xor $16, $17, $1 # E : are source and dest alignments the same?
36 and $1, 7, $1 # E : are they the same mod 8?
37
38 bne $1, $misaligned # U : Nope - gotta do this the slow way
39 /* source and dest are same mod 8 address */
40 and $16, 7, $1 # E : Are both 0mod8?
41 beq $1, $both_0mod8 # U : Yes
42 nop # E :
43
44 /*
45 * source and dest are same misalignment. move a byte at a time
46 * until a 0mod8 alignment for both is reached.
47 * At least one byte more to move
48 */
49
50$head_align:
51 ldbu $1, 0($17) # L : grab a byte
52 subq $18, 1, $18 # E : count--
53 addq $17, 1, $17 # E : src++
54 stb $1, 0($16) # L :
55 addq $16, 1, $16 # E : dest++
56 and $16, 7, $1 # E : Are we at 0mod8 yet?
57 ble $18, $nomoredata # U : done with the copy?
58 bne $1, $head_align # U :
59
60$both_0mod8:
61 cmple $18, 127, $1 # E : Can we unroll the loop?
62 bne $1, $no_unroll # U :
63 and $16, 63, $1 # E : get mod64 alignment
64 beq $1, $do_unroll # U : no single quads to fiddle
65
66$single_head_quad:
67 ldq $1, 0($17) # L : get 8 bytes
68 subq $18, 8, $18 # E : count -= 8
69 addq $17, 8, $17 # E : src += 8
70 nop # E :
71
72 stq $1, 0($16) # L : store
73 addq $16, 8, $16 # E : dest += 8
74 and $16, 63, $1 # E : get mod64 alignment
75 bne $1, $single_head_quad # U : still not fully aligned
76
77$do_unroll:
78 addq $16, 64, $7 # E : Initial (+1 trip) wh64 address
79 cmple $18, 127, $1 # E : Can we go through the unrolled loop?
80 bne $1, $tail_quads # U : Nope
81 nop # E :
82
83$unroll_body:
84 wh64 ($7) # L1 : memory subsystem hint: 64 bytes at
85 # ($7) are about to be over-written
86 ldq $6, 0($17) # L0 : bytes 0..7
87 nop # E :
88 nop # E :
89
90 ldq $4, 8($17) # L : bytes 8..15
91 ldq $5, 16($17) # L : bytes 16..23
92 addq $7, 64, $7 # E : Update next wh64 address
93 nop # E :
94
95 ldq $3, 24($17) # L : bytes 24..31
96 addq $16, 64, $1 # E : fallback value for wh64
97 nop # E :
98 nop # E :
99
100 addq $17, 32, $17 # E : src += 32 bytes
101 stq $6, 0($16) # L : bytes 0..7
102 nop # E :
103 nop # E :
104
105 stq $4, 8($16) # L : bytes 8..15
106 stq $5, 16($16) # L : bytes 16..23
107 subq $18, 192, $2 # E : At least two more trips to go?
108 nop # E :
109
110 stq $3, 24($16) # L : bytes 24..31
111 addq $16, 32, $16 # E : dest += 32 bytes
112 nop # E :
113 nop # E :
114
115 ldq $6, 0($17) # L : bytes 0..7
116 ldq $4, 8($17) # L : bytes 8..15
117 cmovlt $2, $1, $7 # E : Latency 2, extra map slot - Use
118 # fallback wh64 address if < 2 more trips
119 nop # E :
120
121 ldq $5, 16($17) # L : bytes 16..23
122 ldq $3, 24($17) # L : bytes 24..31
123 addq $16, 32, $16 # E : dest += 32
124 subq $18, 64, $18 # E : count -= 64
125
126 addq $17, 32, $17 # E : src += 32
127 stq $6, -32($16) # L : bytes 0..7
128 stq $4, -24($16) # L : bytes 8..15
129 cmple $18, 63, $1 # E : At least one more trip?
130
131 stq $5, -16($16) # L : bytes 16..23
132 stq $3, -8($16) # L : bytes 24..31
133 nop # E :
134 beq $1, $unroll_body
135
136$tail_quads:
137$no_unroll:
138 .align 4
139 subq $18, 8, $18 # E : At least a quad left?
140 blt $18, $less_than_8 # U : Nope
141 nop # E :
142 nop # E :
143
144$move_a_quad:
145 ldq $1, 0($17) # L : fetch 8
146 subq $18, 8, $18 # E : count -= 8
147 addq $17, 8, $17 # E : src += 8
148 nop # E :
149
150 stq $1, 0($16) # L : store 8
151 addq $16, 8, $16 # E : dest += 8
152 bge $18, $move_a_quad # U :
153 nop # E :
154
155$less_than_8:
156 .align 4
157 addq $18, 8, $18 # E : add back for trailing bytes
158 ble $18, $nomoredata # U : All-done
159 nop # E :
160 nop # E :
161
162 /* Trailing bytes */
163$tail_bytes:
164 subq $18, 1, $18 # E : count--
165 ldbu $1, 0($17) # L : fetch a byte
166 addq $17, 1, $17 # E : src++
167 nop # E :
168
169 stb $1, 0($16) # L : store a byte
170 addq $16, 1, $16 # E : dest++
171 bgt $18, $tail_bytes # U : more to be done?
172 nop # E :
173
174 /* branching to exit takes 3 extra cycles, so replicate exit here */
175 ret $31, ($26), 1 # L0 :
176 nop # E :
177 nop # E :
178 nop # E :
179
180$misaligned:
181 mov $0, $4 # E : dest temp
182 and $0, 7, $1 # E : dest alignment mod8
183 beq $1, $dest_0mod8 # U : life doesn't totally suck
184 nop
185
186$aligndest:
187 ble $18, $nomoredata # U :
188 ldbu $1, 0($17) # L : fetch a byte
189 subq $18, 1, $18 # E : count--
190 addq $17, 1, $17 # E : src++
191
192 stb $1, 0($4) # L : store it
193 addq $4, 1, $4 # E : dest++
194 and $4, 7, $1 # E : dest 0mod8 yet?
195 bne $1, $aligndest # U : go until we are aligned.
196
197 /* Source has unknown alignment, but dest is known to be 0mod8 */
198$dest_0mod8:
199 subq $18, 8, $18 # E : At least a quad left?
200 blt $18, $misalign_tail # U : Nope
201 ldq_u $3, 0($17) # L : seed (rotating load) of 8 bytes
202 nop # E :
203
204$mis_quad:
205 ldq_u $16, 8($17) # L : Fetch next 8
206 extql $3, $17, $3 # U : masking
207 extqh $16, $17, $1 # U : masking
208 bis $3, $1, $1 # E : merged bytes to store
209
210 subq $18, 8, $18 # E : count -= 8
211 addq $17, 8, $17 # E : src += 8
212 stq $1, 0($4) # L : store 8 (aligned)
213 mov $16, $3 # E : "rotate" source data
214
215 addq $4, 8, $4 # E : dest += 8
216 bge $18, $mis_quad # U : More quads to move
217 nop
218 nop
219
220$misalign_tail:
221 addq $18, 8, $18 # E : account for tail stuff
222 ble $18, $nomoredata # U :
223 nop
224 nop
225
226$misalign_byte:
227 ldbu $1, 0($17) # L : fetch 1
228 subq $18, 1, $18 # E : count--
229 addq $17, 1, $17 # E : src++
230 nop # E :
231
232 stb $1, 0($4) # L : store
233 addq $4, 1, $4 # E : dest++
234 bgt $18, $misalign_byte # U : more to go?
235 nop
236
237
238$nomoredata:
239 ret $31, ($26), 1 # L0 :
240 nop # E :
241 nop # E :
242 nop # E :
243
244 .end memcpy
245
246/* For backwards module compatibility. */
247__memcpy = memcpy
248.globl __memcpy
diff --git a/arch/alpha/lib/ev6-memset.S b/arch/alpha/lib/ev6-memset.S
new file mode 100644
index 000000000000..d8b94e1c7fca
--- /dev/null
+++ b/arch/alpha/lib/ev6-memset.S
@@ -0,0 +1,597 @@
1/*
2 * arch/alpha/lib/ev6-memset.S
3 *
4 * This is an efficient (and relatively small) implementation of the C library
5 * "memset()" function for the 21264 implementation of Alpha.
6 *
7 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
8 *
9 * Much of the information about 21264 scheduling/coding comes from:
10 * Compiler Writer's Guide for the Alpha 21264
11 * abbreviated as 'CWG' in other comments here
12 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
13 * Scheduling notation:
14 * E - either cluster
15 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
16 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
17 * The algorithm for the leading and trailing quadwords remains the same;
18 * however the loop has been unrolled to enable better memory throughput,
19 * and the code has been replicated for each of the entry points: __memset
20 * and __memsetw to permit better scheduling to eliminate the stalling
21 * encountered during the mask replication.
22 * A future enhancement might be to put in a byte store loop for really
23 * small (say < 32 bytes) memset()s. Whether or not that change would be
24 * a win in the kernel would depend upon the contextual usage.
25 * WARNING: Maintaining this is going to be more work than the above version,
26 * as fixes will need to be made in multiple places. The performance gain
27 * is worth it.
28 */
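
A structural C sketch of the fill strategy (illustrative only; the ragged head and tail are done a byte at a time here instead of with the ldq_u/insql/mskql read-modify-write sequences, and the 16-bit __memsetw pattern is not covered):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

void *memset_model(void *dst, int c, size_t n)
{
        unsigned char *p = dst;
        /* replicate the byte across a quadword -- the insbl/inswl chain */
        uint64_t pattern = (uint64_t)(unsigned char)c * 0x0101010101010101ULL;

        while (n && ((uintptr_t)p & 7)) {       /* head: reach 0mod8 */
                *p++ = (unsigned char)c;
                n--;
        }
        while (n >= 8) {                        /* body: one stq per 8 bytes;
                                                   the real loop is unrolled
                                                   and uses wh64 */
                memcpy(p, &pattern, 8);
                p += 8;
                n -= 8;
        }
        while (n--)                             /* tail */
                *p++ = (unsigned char)c;
        return dst;
}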
29
30 .set noat
31 .set noreorder
32.text
33 .globl __memset
34 .globl __memsetw
35 .globl __constant_c_memset
36 .globl memset
37
38 .ent __memset
39.align 5
40__memset:
41 .frame $30,0,$26,0
42 .prologue 0
43
44 /*
45 * Serious stalling happens. The only way to mitigate this is to
46 * undertake a major re-write to interleave the constant materialization
47 * with other parts of the fall-through code. This is important, even
48 * though it makes maintenance tougher.
49 * Do this later.
50 */
51 and $17,255,$1 # E : 00000000000000ch
52 insbl $17,1,$2 # U : 000000000000ch00
53 bis $16,$16,$0 # E : return value
54 ble $18,end_b # U : zero length requested?
55
56 addq $18,$16,$6 # E : max address to write to
57 bis $1,$2,$17 # E : 000000000000chch
58 insbl $1,2,$3 # U : 0000000000ch0000
59 insbl $1,3,$4 # U : 00000000ch000000
60
61 or $3,$4,$3 # E : 00000000chch0000
62 inswl $17,4,$5 # U : 0000chch00000000
63 xor $16,$6,$1 # E : will complete write be within one quadword?
64 inswl $17,6,$2 # U : chch000000000000
65
66 or $17,$3,$17 # E : 00000000chchchch
67 or $2,$5,$2 # E : chchchch00000000
68 bic $1,7,$1 # E : fit within a single quadword?
69 and $16,7,$3 # E : Target addr misalignment
70
71 or $17,$2,$17 # E : chchchchchchchch
72 beq $1,within_quad_b # U :
73 nop # E :
74 beq $3,aligned_b # U : target is 0mod8
75
76 /*
77 * Target address is misaligned, and won't fit within a quadword
78 */
79 ldq_u $4,0($16) # L : Fetch first partial
80 bis $16,$16,$5 # E : Save the address
81 insql $17,$16,$2 # U : Insert new bytes
82 subq $3,8,$3 # E : Invert (for addressing uses)
83
84 addq $18,$3,$18 # E : $18 is new count ($3 is negative)
85 mskql $4,$16,$4 # U : clear relevant parts of the quad
86 subq $16,$3,$16 # E : $16 is new aligned destination
87 bis $2,$4,$1 # E : Final bytes
88
89 nop
90 stq_u $1,0($5) # L : Store result
91 nop
92 nop
93
94.align 4
95aligned_b:
96 /*
97 * We are now guaranteed to be quad aligned, with at least
98 * one partial quad to write.
99 */
100
101 sra $18,3,$3 # U : Number of remaining quads to write
102 and $18,7,$18 # E : Number of trailing bytes to write
103 bis $16,$16,$5 # E : Save dest address
104 beq $3,no_quad_b # U : tail stuff only
105
106 /*
107 * it's worth the effort to unroll this and use wh64 if possible
108 * Lifted a bunch of code from clear_user.S
109 * At this point, entry values are:
110 * $16 Current destination address
111 * $5 A copy of $16
112 * $6 The max quadword address to write to
113 * $18 Number trailer bytes
114 * $3 Number quads to write
115 */
116
117 and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
118 subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
119 subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
120 blt $4, loop_b # U :
121
122 /*
123 * We know we've got at least 16 quads, minimum of one trip
124 * through unrolled loop. Do a quad at a time to get us 0mod64
125 * aligned.
126 */
127
128 nop # E :
129 nop # E :
130 nop # E :
131 beq $1, $bigalign_b # U :
132
133$alignmod64_b:
134 stq $17, 0($5) # L :
135 subq $3, 1, $3 # E : For consistency later
136 addq $1, 8, $1 # E : Increment towards zero for alignment
137 addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
138
139 nop
140 nop
141 addq $5, 8, $5 # E : Inc address
142 blt $1, $alignmod64_b # U :
143
144$bigalign_b:
145 /*
146 * $3 - number quads left to go
147 * $5 - target address (aligned 0mod64)
148 * $17 - mask of stuff to store
149 * Scratch registers available: $7, $2, $4, $1
150 * we know that we'll be taking a minimum of one trip through the loop
151 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
152 * Assumes the wh64 needs to be for 2 trips through the loop in the future
153 * The wh64 is issued for the starting destination address for trip +2
154 * through the loop, and if there are less than two trips left, the target
155 * address will be for the current trip.
156 */
157
158$do_wh64_b:
159 wh64 ($4) # L1 : memory subsystem write hint
160 subq $3, 24, $2 # E : For determining future wh64 addresses
161 stq $17, 0($5) # L :
162 nop # E :
163
164 addq $5, 128, $4 # E : speculative target of next wh64
165 stq $17, 8($5) # L :
166 stq $17, 16($5) # L :
167 addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
168
169 stq $17, 24($5) # L :
170 stq $17, 32($5) # L :
171 cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle
172 nop
173
174 stq $17, 40($5) # L :
175 stq $17, 48($5) # L :
176 subq $3, 16, $2 # E : Repeat the loop at least once more?
177 nop
178
179 stq $17, 56($5) # L :
180 addq $5, 64, $5 # E :
181 subq $3, 8, $3 # E :
182 bge $2, $do_wh64_b # U :
183
184 nop
185 nop
186 nop
187 beq $3, no_quad_b # U : Might have finished already
188
189.align 4
190 /*
191 * Simple loop for trailing quadwords, or for small amounts
192 * of data (where we can't use an unrolled loop and wh64)
193 */
194loop_b:
195 stq $17,0($5) # L :
196 subq $3,1,$3 # E : Decrement number quads left
197 addq $5,8,$5 # E : Inc address
198 bne $3,loop_b # U : more?
199
200no_quad_b:
201 /*
202 * Write 0..7 trailing bytes.
203 */
204 nop # E :
205 beq $18,end_b # U : All done?
206 ldq $7,0($5) # L :
207 mskqh $7,$6,$2 # U : Mask final quad
208
209 insqh $17,$6,$4 # U : New bits
210 bis $2,$4,$1 # E : Put it all together
211 stq $1,0($5) # L : And back to memory
212 ret $31,($26),1 # L0 :
213
214within_quad_b:
215 ldq_u $1,0($16) # L :
216 insql $17,$16,$2 # U : New bits
217 mskql $1,$16,$4 # U : Clear old
218 bis $2,$4,$2 # E : New result
219
220 mskql $2,$6,$4 # U :
221 mskqh $1,$6,$2 # U :
222 bis $2,$4,$1 # E :
223 stq_u $1,0($16) # L :
224
225end_b:
226 nop
227 nop
228 nop
229 ret $31,($26),1 # L0 :
230 .end __memset
231
232 /*
233 * This is the original body of code, prior to replication and
234 * rescheduling. Leave it here, as there may be calls to this
235 * entry point.
236 */
237.align 4
238 .ent __constant_c_memset
239__constant_c_memset:
240 .frame $30,0,$26,0
241 .prologue 0
242
243 addq $18,$16,$6 # E : max address to write to
244 bis $16,$16,$0 # E : return value
245 xor $16,$6,$1 # E : will complete write be within one quadword?
246 ble $18,end # U : zero length requested?
247
248 bic $1,7,$1 # E : fit within a single quadword
249 beq $1,within_one_quad # U :
250 and $16,7,$3 # E : Target addr misalignment
251 beq $3,aligned # U : target is 0mod8
252
253 /*
254 * Target address is misaligned, and won't fit within a quadword
255 */
256 ldq_u $4,0($16) # L : Fetch first partial
257 bis $16,$16,$5 # E : Save the address
258 insql $17,$16,$2 # U : Insert new bytes
259 subq $3,8,$3 # E : Invert (for addressing uses)
260
261 addq $18,$3,$18 # E : $18 is new count ($3 is negative)
262 mskql $4,$16,$4 # U : clear relevant parts of the quad
263 subq $16,$3,$16 # E : $16 is new aligned destination
264 bis $2,$4,$1 # E : Final bytes
265
266 nop
267 stq_u $1,0($5) # L : Store result
268 nop
269 nop
270
271.align 4
272aligned:
273 /*
274 * We are now guaranteed to be quad aligned, with at least
275 * one partial quad to write.
276 */
277
278 sra $18,3,$3 # U : Number of remaining quads to write
279 and $18,7,$18 # E : Number of trailing bytes to write
280 bis $16,$16,$5 # E : Save dest address
281 beq $3,no_quad # U : tail stuff only
282
283 /*
284 * it's worth the effort to unroll this and use wh64 if possible
285 * Lifted a bunch of code from clear_user.S
286 * At this point, entry values are:
287 * $16 Current destination address
288 * $5 A copy of $16
289 * $6 The max quadword address to write to
290 * $18 Number trailer bytes
291 * $3 Number quads to write
292 */
293
294 and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
295 subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
296 subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
297 blt $4, loop # U :
298
299 /*
300 * We know we've got at least 16 quads, minimum of one trip
301 * through unrolled loop. Do a quad at a time to get us 0mod64
302 * aligned.
303 */
304
305 nop # E :
306 nop # E :
307 nop # E :
308 beq $1, $bigalign # U :
309
310$alignmod64:
311 stq $17, 0($5) # L :
312 subq $3, 1, $3 # E : For consistency later
313 addq $1, 8, $1 # E : Increment towards zero for alignment
314 addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
315
316 nop
317 nop
318 addq $5, 8, $5 # E : Inc address
319 blt $1, $alignmod64 # U :
320
321$bigalign:
322 /*
323 * $3 - number quads left to go
324 * $5 - target address (aligned 0mod64)
325 * $17 - mask of stuff to store
326 * Scratch registers available: $7, $2, $4, $1
327 * we know that we'll be taking a minimum of one trip through the loop
328 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
329 * Assumes the wh64 needs to be for 2 trips through the loop in the future
330 * The wh64 is issued for the starting destination address for trip +2
331 * through the loop, and if there are less than two trips left, the target
332 * address will be for the current trip.
333 */
334
335$do_wh64:
336 wh64 ($4) # L1 : memory subsystem write hint
337 subq $3, 24, $2 # E : For determining future wh64 addresses
338 stq $17, 0($5) # L :
339 nop # E :
340
341 addq $5, 128, $4 # E : speculative target of next wh64
342 stq $17, 8($5) # L :
343 stq $17, 16($5) # L :
344 addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
345
346 stq $17, 24($5) # L :
347 stq $17, 32($5) # L :
348 cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle
349 nop
350
351 stq $17, 40($5) # L :
352 stq $17, 48($5) # L :
353 subq $3, 16, $2 # E : Repeat the loop at least once more?
354 nop
355
356 stq $17, 56($5) # L :
357 addq $5, 64, $5 # E :
358 subq $3, 8, $3 # E :
359 bge $2, $do_wh64 # U :
360
361 nop
362 nop
363 nop
364 beq $3, no_quad # U : Might have finished already
365
366.align 4
367 /*
368 * Simple loop for trailing quadwords, or for small amounts
369 * of data (where we can't use an unrolled loop and wh64)
370 */
371loop:
372 stq $17,0($5) # L :
373 subq $3,1,$3 # E : Decrement number quads left
374 addq $5,8,$5 # E : Inc address
375 bne $3,loop # U : more?
376
377no_quad:
378 /*
379 * Write 0..7 trailing bytes.
380 */
381 nop # E :
382 beq $18,end # U : All done?
383 ldq $7,0($5) # L :
384 mskqh $7,$6,$2 # U : Mask final quad
385
386 insqh $17,$6,$4 # U : New bits
387 bis $2,$4,$1 # E : Put it all together
388 stq $1,0($5) # L : And back to memory
389 ret $31,($26),1 # L0 :
390
391within_one_quad:
392 ldq_u $1,0($16) # L :
393 insql $17,$16,$2 # U : New bits
394 mskql $1,$16,$4 # U : Clear old
395 bis $2,$4,$2 # E : New result
396
397 mskql $2,$6,$4 # U :
398 mskqh $1,$6,$2 # U :
399 bis $2,$4,$1 # E :
400 stq_u $1,0($16) # L :
401
402end:
403 nop
404 nop
405 nop
406 ret $31,($26),1 # L0 :
407 .end __constant_c_memset
408
409 /*
410 * This is a replica of the __constant_c_memset code, rescheduled
411 * to mask stalls. Note that the entry point names also had to change.
412 */
413 .align 5
414 .ent __memsetw
415
416__memsetw:
417 .frame $30,0,$26,0
418 .prologue 0
419
420 inswl $17,0,$5 # U : 000000000000c1c2
421 inswl $17,2,$2 # U : 00000000c1c20000
422 bis $16,$16,$0 # E : return value
423 addq $18,$16,$6 # E : max address to write to
424
425 ble $18, end_w # U : zero length requested?
426 inswl $17,4,$3 # U : 0000c1c200000000
427 inswl $17,6,$4 # U : c1c2000000000000
428 xor $16,$6,$1 # E : will complete write be within one quadword?
429
430 or $2,$5,$2 # E : 00000000c1c2c1c2
431 or $3,$4,$17 # E : c1c2c1c200000000
432 bic $1,7,$1 # E : fit within a single quadword
433 and $16,7,$3 # E : Target addr misalignment
434
435 or $17,$2,$17 # E : c1c2c1c2c1c2c1c2
436 beq $1,within_quad_w # U :
437 nop
438 beq $3,aligned_w # U : target is 0mod8
439
440 /*
441 * Target address is misaligned, and won't fit within a quadword
442 */
443 ldq_u $4,0($16) # L : Fetch first partial
444 bis $16,$16,$5 # E : Save the address
445 insql $17,$16,$2 # U : Insert new bytes
446 subq $3,8,$3 # E : Invert (for addressing uses)
447
448 addq $18,$3,$18 # E : $18 is new count ($3 is negative)
449 mskql $4,$16,$4 # U : clear relevant parts of the quad
450 subq $16,$3,$16 # E : $16 is new aligned destination
451 bis $2,$4,$1 # E : Final bytes
452
453 nop
454 stq_u $1,0($5) # L : Store result
455 nop
456 nop
457
458.align 4
459aligned_w:
460 /*
461 * We are now guaranteed to be quad aligned, with at least
462 * one partial quad to write.
463 */
464
465 sra $18,3,$3 # U : Number of remaining quads to write
466 and $18,7,$18 # E : Number of trailing bytes to write
467 bis $16,$16,$5 # E : Save dest address
468 beq $3,no_quad_w # U : tail stuff only
469
470 /*
471 * it's worth the effort to unroll this and use wh64 if possible
472 * Lifted a bunch of code from clear_user.S
473 * At this point, entry values are:
474 * $16 Current destination address
475 * $5 A copy of $16
476 * $6 The max quadword address to write to
477 * $18 Number trailer bytes
478 * $3 Number quads to write
479 */
480
481 and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
482 subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
483 subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
484 blt $4, loop_w # U :
485
486 /*
487 * We know we've got at least 16 quads, minimum of one trip
488 * through unrolled loop. Do a quad at a time to get us 0mod64
489 * aligned.
490 */
491
492 nop # E :
493 nop # E :
494 nop # E :
495 beq $1, $bigalign_w # U :
496
497$alignmod64_w:
498 stq $17, 0($5) # L :
499 subq $3, 1, $3 # E : For consistency later
500 addq $1, 8, $1 # E : Increment towards zero for alignment
501 addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
502
503 nop
504 nop
505 addq $5, 8, $5 # E : Inc address
506 blt $1, $alignmod64_w # U :
507
508$bigalign_w:
509 /*
510 * $3 - number quads left to go
511 * $5 - target address (aligned 0mod64)
512 * $17 - mask of stuff to store
513 * Scratch registers available: $7, $2, $4, $1
514 * we know that we'll be taking a minimum of one trip through the loop
515 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
516 * Assumes the wh64 needs to be for 2 trips through the loop in the future
517 * The wh64 is issued for the starting destination address for trip +2
518 * through the loop, and if there are less than two trips left, the target
519 * address will be for the current trip.
520 */
521
522$do_wh64_w:
523 wh64 ($4) # L1 : memory subsystem write hint
524 subq $3, 24, $2 # E : For determining future wh64 addresses
525 stq $17, 0($5) # L :
526 nop # E :
527
528 addq $5, 128, $4 # E : speculative target of next wh64
529 stq $17, 8($5) # L :
530 stq $17, 16($5) # L :
531 addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
532
533 stq $17, 24($5) # L :
534 stq $17, 32($5) # L :
535 cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle
536 nop
537
538 stq $17, 40($5) # L :
539 stq $17, 48($5) # L :
540 subq $3, 16, $2 # E : Repeat the loop at least once more?
541 nop
542
543 stq $17, 56($5) # L :
544 addq $5, 64, $5 # E :
545 subq $3, 8, $3 # E :
546 bge $2, $do_wh64_w # U :
547
548 nop
549 nop
550 nop
551 beq $3, no_quad_w # U : Might have finished already
552
553.align 4
554 /*
555 * Simple loop for trailing quadwords, or for small amounts
556 * of data (where we can't use an unrolled loop and wh64)
557 */
558loop_w:
559 stq $17,0($5) # L :
560 subq $3,1,$3 # E : Decrement number quads left
561 addq $5,8,$5 # E : Inc address
562 bne $3,loop_w # U : more?
563
564no_quad_w:
565 /*
566 * Write 0..7 trailing bytes.
567 */
568 nop # E :
569 beq $18,end_w # U : All done?
570 ldq $7,0($5) # L :
571 mskqh $7,$6,$2 # U : Mask final quad
572
573 insqh $17,$6,$4 # U : New bits
574 bis $2,$4,$1 # E : Put it all together
575 stq $1,0($5) # L : And back to memory
576 ret $31,($26),1 # L0 :
577
578within_quad_w:
579 ldq_u $1,0($16) # L :
580 insql $17,$16,$2 # U : New bits
581 mskql $1,$16,$4 # U : Clear old
582 bis $2,$4,$2 # E : New result
583
584 mskql $2,$6,$4 # U :
585 mskqh $1,$6,$2 # U :
586 bis $2,$4,$1 # E :
587 stq_u $1,0($16) # L :
588
589end_w:
590 nop
591 nop
592 nop
593 ret $31,($26),1 # L0 :
594
595 .end __memsetw
596
597memset = __memset
diff --git a/arch/alpha/lib/ev6-strncpy_from_user.S b/arch/alpha/lib/ev6-strncpy_from_user.S
new file mode 100644
index 000000000000..d2e28178cacc
--- /dev/null
+++ b/arch/alpha/lib/ev6-strncpy_from_user.S
@@ -0,0 +1,424 @@
1/*
2 * arch/alpha/lib/ev6-strncpy_from_user.S
3 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
4 *
5 * Just like strncpy except in the return value:
6 *
7 * -EFAULT if an exception occurs before the terminator is copied.
8 * N if the buffer filled.
9 *
10 * Otherwise the length of the string is returned.
11 *
12 * Much of the information about 21264 scheduling/coding comes from:
13 * Compiler Writer's Guide for the Alpha 21264
14 * abbreviated as 'CWG' in other comments here
15 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
16 * Scheduling notation:
17 * E - either cluster
18 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
19 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
20 * A bunch of instructions got moved and temp registers were changed
21 * to aid in scheduling. Control flow was also re-arranged to eliminate
22 * branches, and to provide longer code sequences to enable better scheduling.
23 * A total rewrite (using byte load/stores for start & tail sequences)
25 * is desirable, but amounts to starting over from scratch.
25 * Save that for the future.
26 */
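
A C model of just the return-value rule above (illustrative; an ordinary pointer stands in for the user pointer, so the -EFAULT fault path and the strncpy-style padding are not modelled):

static long strncpy_from_user_model(char *dst, const char *src, long count)
{
        long i;

        for (i = 0; i < count; i++) {
                char c = src[i];        /* the EX(ldq_u)-protected load */

                dst[i] = c;
                if (c == '\0')
                        return i;       /* string length, NUL not counted */
        }
        return count;                   /* buffer filled before a NUL */
}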
27
28
29#include <asm/errno.h>
30#include <asm/regdef.h>
31
32
33/* Allow an exception for an insn; exit if we get one. */
34#define EX(x,y...) \
35 99: x,##y; \
36 .section __ex_table,"a"; \
37 .long 99b - .; \
38 lda $31, $exception-99b($0); \
39 .previous
40
41
42 .set noat
43 .set noreorder
44 .text
45
46 .globl __strncpy_from_user
47 .ent __strncpy_from_user
48 .frame $30, 0, $26
49 .prologue 0
50
51 .align 4
52__strncpy_from_user:
53 and a0, 7, t3 # E : find dest misalignment
54 beq a2, $zerolength # U :
55
56 /* Are source and destination co-aligned? */
57 mov a0, v0 # E : save the string start
58 xor a0, a1, t4 # E :
59 EX( ldq_u t1, 0(a1) ) # L : Latency=3 load first quadword
60 ldq_u t0, 0(a0) # L : load first (partial) aligned dest quadword
61
62 addq a2, t3, a2 # E : bias count by dest misalignment
63 subq a2, 1, a3 # E :
64 addq zero, 1, t10 # E :
65 and t4, 7, t4 # E : misalignment between the two
66
67 and a3, 7, t6 # E : number of tail bytes
68 sll t10, t6, t10 # E : t10 = bitmask of last count byte
69 bne t4, $unaligned # U :
70 lda t2, -1 # E : build a mask against false zero
71
72 /*
73 * We are co-aligned; take care of a partial first word.
74 * On entry to this basic block:
75 * t0 == the first destination word for masking back in
76 * t1 == the first source word.
77 */
78
79 srl a3, 3, a2 # E : a2 = loop counter = (count - 1)/8
80 addq a1, 8, a1 # E :
81 mskqh t2, a1, t2 # U : detection in the src word
82 nop
83
84 /* Create the 1st output word and detect 0's in the 1st input word. */
85 mskqh t1, a1, t3 # U :
86 mskql t0, a1, t0 # U : assemble the first output word
87 ornot t1, t2, t2 # E :
88 nop
89
90 cmpbge zero, t2, t8 # E : bits set iff null found
91 or t0, t3, t0 # E :
92 beq a2, $a_eoc # U :
93 bne t8, $a_eos # U : 2nd branch in a quad. Bad.
94
95 /* On entry to this basic block:
96 * t0 == a source quad not containing a null.
97 * a0 - current aligned destination address
98 * a1 - current aligned source address
99 * a2 - count of quadwords to move.
100 * NOTE: Loop improvement - unrolling this is going to be
101 * a huge win, since we're going to stall otherwise.
102 * Fix this later. For _really_ large copies, look
103 * at using wh64 on a look-ahead basis. See the code
104 * in clear_user.S and copy_user.S.
105 * Presumably, since (a0) and (a1) do not overlap (by C definition)
106 * Lots of nops here:
107 * - Separate loads from stores
108 * - Keep it to 1 branch/quadpack so the branch predictor
109 * can train.
110 */
111$a_loop:
112 stq_u t0, 0(a0) # L :
113 addq a0, 8, a0 # E :
114 nop
115 subq a2, 1, a2 # E :
116
117 EX( ldq_u t0, 0(a1) ) # L :
118 addq a1, 8, a1 # E :
119 cmpbge zero, t0, t8 # E : Stall 2 cycles on t0
120 beq a2, $a_eoc # U :
121
122 beq t8, $a_loop # U :
123 nop
124 nop
125 nop
126
127 /* Take care of the final (partial) word store. At this point
128 * the end-of-count bit is set in t8 iff it applies.
129 *
130 * On entry to this basic block we have:
131 * t0 == the source word containing the null
132 * t8 == the cmpbge mask that found it.
133 */
134$a_eos:
135 negq t8, t12 # E : find low bit set
136 and t8, t12, t12 # E :
137
138 /* We're doing a partial word store and so need to combine
139 our source and original destination words. */
140 ldq_u t1, 0(a0) # L :
141 subq t12, 1, t6 # E :
142
143 or t12, t6, t8 # E :
144 zapnot t0, t8, t0 # U : clear src bytes > null
145 zap t1, t8, t1 # U : clear dst bytes <= null
146 or t0, t1, t0 # E :
147
148 stq_u t0, 0(a0) # L :
149 br $finish_up # L0 :
150 nop
151 nop
152
153 /* Add the end-of-count bit to the eos detection bitmask. */
154 .align 4
155$a_eoc:
156 or t10, t8, t8
157 br $a_eos
158 nop
159 nop
160
161
162/* The source and destination are not co-aligned. Align the destination
163 and cope. We have to be very careful about not reading too much and
164 causing a SEGV. */
165
166 .align 4
167$u_head:
168 /* We know just enough now to be able to assemble the first
169 full source word. We can still find a zero at the end of it
170 that prevents us from outputting the whole thing.
171
172 On entry to this basic block:
173 t0 == the first dest word, unmasked
174 t1 == the shifted low bits of the first source word
175 t6 == bytemask that is -1 in dest word bytes */
176
177 EX( ldq_u t2, 8(a1) ) # L : load second src word
178 addq a1, 8, a1 # E :
179 mskql t0, a0, t0 # U : mask trailing garbage in dst
180 extqh t2, a1, t4 # U :
181
182 or t1, t4, t1 # E : first aligned src word complete
183 mskqh t1, a0, t1 # U : mask leading garbage in src
184 or t0, t1, t0 # E : first output word complete
185 or t0, t6, t6 # E : mask original data for zero test
186
187 cmpbge zero, t6, t8 # E :
188 beq a2, $u_eocfin # U :
189 bne t8, $u_final # U : bad news - 2nd branch in a quad
190 lda t6, -1 # E : mask out the bits we have
191
192 mskql t6, a1, t6 # U : already seen
193 stq_u t0, 0(a0) # L : store first output word
194 or t6, t2, t2 # E :
195 cmpbge zero, t2, t8 # E : find nulls in second partial
196
197 addq a0, 8, a0 # E :
198 subq a2, 1, a2 # E :
199 bne t8, $u_late_head_exit # U :
200 nop
201
202 /* Finally, we've got all the stupid leading edge cases taken care
203 of and we can set up to enter the main loop. */
204
205 extql t2, a1, t1 # U : position hi-bits of lo word
206 EX( ldq_u t2, 8(a1) ) # L : read next high-order source word
207 addq a1, 8, a1 # E :
208 cmpbge zero, t2, t8 # E :
209
210 beq a2, $u_eoc # U :
211 bne t8, $u_eos # U :
212 nop
213 nop
214
215 /* Unaligned copy main loop. In order to avoid reading too much,
216 the loop is structured to detect zeros in aligned source words.
217 This has, unfortunately, effectively pulled half of a loop
218 iteration out into the head and half into the tail, but it does
219 prevent nastiness from accumulating in the very thing we want
220 to run as fast as possible.
221
222 On entry to this basic block:
223 t1 == the shifted high-order bits from the previous source word
224 t2 == the unshifted current source word
225
226 We further know that t2 does not contain a null terminator. */
227
228 /*
229 * Extra nops here:
230 * separate load quads from store quads
231 * only one branch/quad to permit predictor training
232 */
233
234 .align 4
235$u_loop:
236 extqh t2, a1, t0 # U : extract high bits for current word
237 addq a1, 8, a1 # E :
238 extql t2, a1, t3 # U : extract low bits for next time
239 addq a0, 8, a0 # E :
240
241 or t0, t1, t0 # E : current dst word now complete
242 EX( ldq_u t2, 0(a1) ) # L : load high word for next time
243 subq a2, 1, a2 # E :
244 nop
245
246 stq_u t0, -8(a0) # L : save the current word
247 mov t3, t1 # E :
248 cmpbge zero, t2, t8 # E : test new word for eos
249 beq a2, $u_eoc # U :
250
251 beq t8, $u_loop # U :
252 nop
253 nop
254 nop
255
256 /* We've found a zero somewhere in the source word we just read.
257 If it resides in the lower half, we have one (probably partial)
258 word to write out, and if it resides in the upper half, we
259 have one full and one partial word left to write out.
260
261 On entry to this basic block:
262 t1 == the shifted high-order bits from the previous source word
263 t2 == the unshifted current source word. */
264 .align 4
265$u_eos:
266 extqh t2, a1, t0 # U :
267 or t0, t1, t0 # E : first (partial) source word complete
268 cmpbge zero, t0, t8 # E : is the null in this first bit?
269 nop
270
271 bne t8, $u_final # U :
272 stq_u t0, 0(a0) # L : the null was in the high-order bits
273 addq a0, 8, a0 # E :
274 subq a2, 1, a2 # E :
275
276 .align 4
277$u_late_head_exit:
278 extql t2, a1, t0 # U :
279 cmpbge zero, t0, t8 # E :
280 or t8, t10, t6 # E :
281 cmoveq a2, t6, t8 # E :
282
283 /* Take care of a final (probably partial) result word.
284 On entry to this basic block:
285 t0 == assembled source word
286 t8 == cmpbge mask that found the null. */
287 .align 4
288$u_final:
289 negq t8, t6 # E : isolate low bit set
290 and t6, t8, t12 # E :
291 ldq_u t1, 0(a0) # L :
292 subq t12, 1, t6 # E :
293
294 or t6, t12, t8 # E :
295 zapnot t0, t8, t0 # U : kill source bytes > null
296 zap t1, t8, t1 # U : kill dest bytes <= null
297 or t0, t1, t0 # E :
298
299 stq_u t0, 0(a0) # E :
300 br $finish_up # U :
301 nop
302 nop
303
304 .align 4
305$u_eoc: # end-of-count
306 extqh t2, a1, t0 # U :
307 or t0, t1, t0 # E :
308 cmpbge zero, t0, t8 # E :
309 nop
310
311 .align 4
312$u_eocfin: # end-of-count, final word
313 or t10, t8, t8 # E :
314 br $u_final # U :
315 nop
316 nop
317
318 /* Unaligned copy entry point. */
319 .align 4
320$unaligned:
321
322 srl a3, 3, a2 # U : a2 = loop counter = (count - 1)/8
323 and a0, 7, t4 # E : find dest misalignment
324 and a1, 7, t5 # E : find src misalignment
325 mov zero, t0 # E :
326
327 /* Conditionally load the first destination word and a bytemask
328 with 0xff indicating that the destination byte is sacrosanct. */
329
330 mov zero, t6 # E :
331 beq t4, 1f # U :
332 ldq_u t0, 0(a0) # L :
333 lda t6, -1 # E :
334
335 mskql t6, a0, t6 # E :
336 nop
337 nop
338 nop
339
340 .align 4
3411:
342 subq a1, t4, a1 # E : sub dest misalignment from src addr
343 /* If source misalignment is larger than dest misalignment, we need
344 extra startup checks to avoid SEGV. */
345 cmplt t4, t5, t12 # E :
346 extql t1, a1, t1 # U : shift src into place
347 lda t2, -1 # E : for creating masks later
348
349 beq t12, $u_head # U :
350 mskqh t2, t5, t2 # U : begin src byte validity mask
351 cmpbge zero, t1, t8 # E : is there a zero?
352 nop
353
354 extql t2, a1, t2 # U :
355 or t8, t10, t5 # E : test for end-of-count too
356 cmpbge zero, t2, t3 # E :
357 cmoveq a2, t5, t8 # E : Latency=2, extra map slot
358
359 nop # E : goes with cmov
360 andnot t8, t3, t8 # E :
361 beq t8, $u_head # U :
362 nop
363
364 /* At this point we've found a zero in the first partial word of
365 the source. We need to isolate the valid source data and mask
366 it into the original destination data. (Incidentally, we know
367 that we'll need at least one byte of that original dest word.) */
368
369 ldq_u t0, 0(a0) # L :
370 negq t8, t6 # E : build bitmask of bytes <= zero
371 mskqh t1, t4, t1 # U :
372 and t6, t8, t12 # E :
373
374 subq t12, 1, t6 # E :
375 or t6, t12, t8 # E :
376 zapnot t2, t8, t2 # U : prepare source word; mirror changes
377 zapnot t1, t8, t1 # U : to source validity mask
378
379 andnot t0, t2, t0 # E : zero place for source to reside
380 or t0, t1, t0 # E : and put it there
381 stq_u t0, 0(a0) # L :
382 nop
383
384 .align 4
385$finish_up:
386 zapnot t0, t12, t4 # U : was last byte written null?
387 and t12, 0xf0, t3 # E : binary search for the address of the
388 cmovne t4, 1, t4 # E : Latency=2, extra map slot
389 nop # E : with cmovne
390
391 and t12, 0xcc, t2 # E : last byte written
392 and t12, 0xaa, t1 # E :
393 cmovne t3, 4, t3 # E : Latency=2, extra map slot
394 nop # E : with cmovne
395
396 bic a0, 7, t0
397 cmovne t2, 2, t2 # E : Latency=2, extra map slot
398 nop # E : with cmovne
399 nop
400
401 cmovne t1, 1, t1 # E : Latency=2, extra map slot
402 nop # E : with cmovne
403 addq t0, t3, t0 # E :
404 addq t1, t2, t1 # E :
405
406 addq t0, t1, t0 # E :
407 addq t0, t4, t0 # add one if we filled the buffer
408 subq t0, v0, v0 # find string length
409 ret # L0 :
410
411 .align 4
412$zerolength:
413 nop
414 nop
415 nop
416 clr v0
417
418$exception:
419 nop
420 nop
421 nop
422 ret
423
424 .end __strncpy_from_user
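
The $finish_up block above turns the single set bit in t12 into a byte index with three mask tests (0xf0, 0xcc, 0xaa) rather than a loop; memchr further down in this series uses the same trick with the complementary masks. A minimal C model of that binary search, with illustrative names only, might look like this:

#include <assert.h>
#include <stdio.h>

/* mask has exactly one bit set; bit i corresponds to byte i of a quadword */
static unsigned byte_index_from_mask(unsigned mask)
{
	unsigned idx = 0;

	if (mask & 0xf0)	/* the bit lies in byte lanes 4-7 */
		idx += 4;
	if (mask & 0xcc)	/* the bit lies in lanes 2-3 or 6-7 */
		idx += 2;
	if (mask & 0xaa)	/* the bit lies in an odd lane */
		idx += 1;
	return idx;
}

int main(void)
{
	for (unsigned i = 0; i < 8; i++)
		assert(byte_index_from_mask(1u << i) == i);
	printf("mask 0x20 -> byte %u\n", byte_index_from_mask(0x20u));
	return 0;
}
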
diff --git a/arch/alpha/lib/ev6-stxcpy.S b/arch/alpha/lib/ev6-stxcpy.S
new file mode 100644
index 000000000000..4643ff2ffc8d
--- /dev/null
+++ b/arch/alpha/lib/ev6-stxcpy.S
@@ -0,0 +1,321 @@
1/*
2 * arch/alpha/lib/ev6-stxcpy.S
3 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
4 *
5 * Copy a null-terminated string from SRC to DST.
6 *
7 * This is an internal routine used by strcpy, stpcpy, and strcat.
8 * As such, it uses special linkage conventions to make implementation
9 * of these public functions more efficient.
10 *
11 * On input:
12 * t9 = return address
13 * a0 = DST
14 * a1 = SRC
15 *
16 * On output:
17 * t12 = bitmask (with one bit set) indicating the last byte written
18 * a0 = unaligned address of the last *word* written
19 *
20 * Furthermore, v0, a3-a5, t11, and t12 are untouched.
21 *
22 * Much of the information about 21264 scheduling/coding comes from:
23 * Compiler Writer's Guide for the Alpha 21264
24 * abbreviated as 'CWG' in other comments here
25 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
26 * Scheduling notation:
27 * E - either cluster
28 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
29 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
30 * Try not to change the actual algorithm if possible for consistency.
31 */
32
33#include <asm/regdef.h>
34
35 .set noat
36 .set noreorder
37
38 .text
39
40/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
41 doesn't like putting the entry point for a procedure somewhere in the
42 middle of the procedure descriptor. Work around this by putting the
43 aligned copy in its own procedure descriptor */
44
45
46 .ent stxcpy_aligned
47 .align 4
48stxcpy_aligned:
49 .frame sp, 0, t9
50 .prologue 0
51
52 /* On entry to this basic block:
53 t0 == the first destination word for masking back in
54 t1 == the first source word. */
55
56 /* Create the 1st output word and detect 0's in the 1st input word. */
57 lda t2, -1 # E : build a mask against false zero
58 mskqh t2, a1, t2 # U : detection in the src word (stall)
59 mskqh t1, a1, t3 # U :
60 ornot t1, t2, t2 # E : (stall)
61
62 mskql t0, a1, t0 # U : assemble the first output word
63 cmpbge zero, t2, t8 # E : bits set iff null found
64 or t0, t3, t1 # E : (stall)
65 bne t8, $a_eos # U : (stall)
66
67 /* On entry to this basic block:
68 t0 == the first destination word for masking back in
69 t1 == a source word not containing a null. */
70 /* Nops here to separate store quads from load quads */
71
72$a_loop:
73 stq_u t1, 0(a0) # L :
74 addq a0, 8, a0 # E :
75 nop
76 nop
77
78 ldq_u t1, 0(a1) # L : Latency=3
79 addq a1, 8, a1 # E :
80 cmpbge zero, t1, t8 # E : (3 cycle stall)
81 beq t8, $a_loop # U : (stall for t8)
82
83 /* Take care of the final (partial) word store.
84 On entry to this basic block we have:
85 t1 == the source word containing the null
86 t8 == the cmpbge mask that found it. */
87$a_eos:
88 negq t8, t6 # E : find low bit set
89 and t8, t6, t12 # E : (stall)
90 /* For the sake of the cache, don't read a destination word
91 if we're not going to need it. */
92 and t12, 0x80, t6 # E : (stall)
93 bne t6, 1f # U : (stall)
94
95 /* We're doing a partial word store and so need to combine
96 our source and original destination words. */
97 ldq_u t0, 0(a0) # L : Latency=3
98 subq t12, 1, t6 # E :
99 zapnot t1, t6, t1 # U : clear src bytes >= null (stall)
100 or t12, t6, t8 # E : (stall)
101
102 zap t0, t8, t0 # E : clear dst bytes <= null
103 or t0, t1, t1 # E : (stall)
104 nop
105 nop
106
1071: stq_u t1, 0(a0) # L :
108 ret (t9) # L0 : Latency=3
109 nop
110 nop
111
112 .end stxcpy_aligned
113
114 .align 4
115 .ent __stxcpy
116 .globl __stxcpy
117__stxcpy:
118 .frame sp, 0, t9
119 .prologue 0
120
121 /* Are source and destination co-aligned? */
122 xor a0, a1, t0 # E :
123 unop # E :
124 and t0, 7, t0 # E : (stall)
125 bne t0, $unaligned # U : (stall)
126
127 /* We are co-aligned; take care of a partial first word. */
128 ldq_u t1, 0(a1) # L : load first src word
129 and a0, 7, t0 # E : take care not to load a word ...
130 addq a1, 8, a1 # E :
131	beq t0, stxcpy_aligned	# U : ... if we won't need it (stall)
132
133 ldq_u t0, 0(a0) # L :
134 br stxcpy_aligned # L0 : Latency=3
135 nop
136 nop
137
138
139/* The source and destination are not co-aligned. Align the destination
140 and cope. We have to be very careful about not reading too much and
141 causing a SEGV. */
142
143 .align 4
144$u_head:
145 /* We know just enough now to be able to assemble the first
146 full source word. We can still find a zero at the end of it
147 that prevents us from outputting the whole thing.
148
149 On entry to this basic block:
150 t0 == the first dest word, for masking back in, if needed else 0
151 t1 == the low bits of the first source word
152 t6 == bytemask that is -1 in dest word bytes */
153
154 ldq_u t2, 8(a1) # L :
155 addq a1, 8, a1 # E :
156 extql t1, a1, t1 # U : (stall on a1)
157 extqh t2, a1, t4 # U : (stall on a1)
158
159 mskql t0, a0, t0 # U :
160 or t1, t4, t1 # E :
161 mskqh t1, a0, t1 # U : (stall on t1)
162 or t0, t1, t1 # E : (stall on t1)
163
164 or t1, t6, t6 # E :
165 cmpbge zero, t6, t8 # E : (stall)
166 lda t6, -1 # E : for masking just below
167 bne t8, $u_final # U : (stall)
168
169 mskql t6, a1, t6 # U : mask out the bits we have
170 or t6, t2, t2 # E : already extracted before (stall)
171 cmpbge zero, t2, t8 # E : testing eos (stall)
172 bne t8, $u_late_head_exit # U : (stall)
173
174 /* Finally, we've got all the stupid leading edge cases taken care
175 of and we can set up to enter the main loop. */
176
177 stq_u t1, 0(a0) # L : store first output word
178 addq a0, 8, a0 # E :
179	extql t2, a1, t0	# U : position hi-bits of lo word
180 ldq_u t2, 8(a1) # U : read next high-order source word
181
182 addq a1, 8, a1 # E :
183 cmpbge zero, t2, t8 # E : (stall for t2)
184 nop # E :
185 bne t8, $u_eos # U : (stall)
186
187 /* Unaligned copy main loop. In order to avoid reading too much,
188 the loop is structured to detect zeros in aligned source words.
189 This has, unfortunately, effectively pulled half of a loop
190 iteration out into the head and half into the tail, but it does
191 prevent nastiness from accumulating in the very thing we want
192 to run as fast as possible.
193
194 On entry to this basic block:
195 t0 == the shifted high-order bits from the previous source word
196 t2 == the unshifted current source word
197
198 We further know that t2 does not contain a null terminator. */
199
200 .align 3
201$u_loop:
202 extqh t2, a1, t1 # U : extract high bits for current word
203 addq a1, 8, a1 # E : (stall)
204 extql t2, a1, t3 # U : extract low bits for next time (stall)
205 addq a0, 8, a0 # E :
206
207 or t0, t1, t1 # E : current dst word now complete
208 ldq_u t2, 0(a1) # L : Latency=3 load high word for next time
209 stq_u t1, -8(a0) # L : save the current word (stall)
210 mov t3, t0 # E :
211
212 cmpbge zero, t2, t8 # E : test new word for eos
213 beq t8, $u_loop # U : (stall)
214 nop
215 nop
216
217 /* We've found a zero somewhere in the source word we just read.
218 If it resides in the lower half, we have one (probably partial)
219 word to write out, and if it resides in the upper half, we
220 have one full and one partial word left to write out.
221
222 On entry to this basic block:
223 t0 == the shifted high-order bits from the previous source word
224 t2 == the unshifted current source word. */
225$u_eos:
226 extqh t2, a1, t1 # U :
227 or t0, t1, t1 # E : first (partial) source word complete (stall)
228 cmpbge zero, t1, t8 # E : is the null in this first bit? (stall)
229 bne t8, $u_final # U : (stall)
230
231$u_late_head_exit:
232 stq_u t1, 0(a0) # L : the null was in the high-order bits
233 addq a0, 8, a0 # E :
234 extql t2, a1, t1 # U :
235 cmpbge zero, t1, t8 # E : (stall)
236
237 /* Take care of a final (probably partial) result word.
238 On entry to this basic block:
239 t1 == assembled source word
240 t8 == cmpbge mask that found the null. */
241$u_final:
242 negq t8, t6 # E : isolate low bit set
243 and t6, t8, t12 # E : (stall)
244 and t12, 0x80, t6 # E : avoid dest word load if we can (stall)
245 bne t6, 1f # U : (stall)
246
247 ldq_u t0, 0(a0) # E :
248 subq t12, 1, t6 # E :
249 or t6, t12, t8 # E : (stall)
250 zapnot t1, t6, t1 # U : kill source bytes >= null (stall)
251
252 zap t0, t8, t0 # U : kill dest bytes <= null (2 cycle data stall)
253 or t0, t1, t1 # E : (stall)
254 nop
255 nop
256
2571: stq_u t1, 0(a0) # L :
258 ret (t9) # L0 : Latency=3
259 nop
260 nop
261
262 /* Unaligned copy entry point. */
263 .align 4
264$unaligned:
265
266 ldq_u t1, 0(a1) # L : load first source word
267 and a0, 7, t4 # E : find dest misalignment
268 and a1, 7, t5 # E : find src misalignment
269 /* Conditionally load the first destination word and a bytemask
270 with 0xff indicating that the destination byte is sacrosanct. */
271 mov zero, t0 # E :
272
273 mov zero, t6 # E :
274 beq t4, 1f # U :
275 ldq_u t0, 0(a0) # L :
276 lda t6, -1 # E :
277
278 mskql t6, a0, t6 # U :
279 nop
280 nop
281 nop
2821:
283 subq a1, t4, a1 # E : sub dest misalignment from src addr
284 /* If source misalignment is larger than dest misalignment, we need
285 extra startup checks to avoid SEGV. */
286 cmplt t4, t5, t12 # E :
287 beq t12, $u_head # U :
288 lda t2, -1 # E : mask out leading garbage in source
289
290 mskqh t2, t5, t2 # U :
291 ornot t1, t2, t3 # E : (stall)
292 cmpbge zero, t3, t8 # E : is there a zero? (stall)
293 beq t8, $u_head # U : (stall)
294
295 /* At this point we've found a zero in the first partial word of
296 the source. We need to isolate the valid source data and mask
297 it into the original destination data. (Incidentally, we know
298 that we'll need at least one byte of that original dest word.) */
299
300 ldq_u t0, 0(a0) # L :
301 negq t8, t6 # E : build bitmask of bytes <= zero
302 and t6, t8, t12 # E : (stall)
303 and a1, 7, t5 # E :
304
305 subq t12, 1, t6 # E :
306 or t6, t12, t8 # E : (stall)
307 srl t12, t5, t12 # U : adjust final null return value
308 zapnot t2, t8, t2 # U : prepare source word; mirror changes (stall)
309
310 and t1, t2, t1 # E : to source validity mask
311 extql t2, a1, t2 # U :
312 extql t1, a1, t1 # U : (stall)
313 andnot t0, t2, t0 # .. e1 : zero place for source to reside (stall)
314
315 or t0, t1, t1 # e1 : and put it there
316 stq_u t1, 0(a0) # .. e0 : (stall)
317 ret (t9) # e1 :
318 nop
319
320 .end __stxcpy
321
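
Nearly every step of __stxcpy leans on the cmpbge-against-zero idiom ("bits set iff null found") and on zap/zapnot to keep or discard the bytes selected by such a mask. A rough C model of those two primitives, with made-up helper names, is sketched below:

#include <stdint.h>
#include <stdio.h>

/* models "cmpbge zero, x": bit i of the result is set iff byte i of x is 0 */
static unsigned cmpbge_zero(uint64_t x)
{
	unsigned mask = 0;

	for (int i = 0; i < 8; i++)
		if (((x >> (8 * i)) & 0xff) == 0)
			mask |= 1u << i;
	return mask;
}

/* models zapnot: keep only the bytes whose bit is set in mask */
static uint64_t zapnot_model(uint64_t x, unsigned mask)
{
	uint64_t r = 0;

	for (int i = 0; i < 8; i++)
		if (mask & (1u << i))
			r |= x & (0xffull << (8 * i));
	return r;
}

int main(void)
{
	uint64_t word = 0x00216f6c6c656800ull;	/* "\0hello!\0", little-endian */
	unsigned nulls = cmpbge_zero(word);

	printf("null-byte mask: 0x%02x\n", nulls);	/* bits 0 and 7 set */
	printf("low half only:  0x%016llx\n",
	       (unsigned long long)zapnot_model(word, 0x0f));
	return 0;
}
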
diff --git a/arch/alpha/lib/ev6-stxncpy.S b/arch/alpha/lib/ev6-stxncpy.S
new file mode 100644
index 000000000000..b581a7af2456
--- /dev/null
+++ b/arch/alpha/lib/ev6-stxncpy.S
@@ -0,0 +1,397 @@
1/*
2 * arch/alpha/lib/ev6-stxncpy.S
3 * 21264 version contributed by Rick Gorton <rick.gorton@api-networks.com>
4 *
5 * Copy no more than COUNT bytes of the null-terminated string from
6 * SRC to DST.
7 *
8 * This is an internal routine used by strncpy, stpncpy, and strncat.
9 * As such, it uses special linkage conventions to make implementation
10 * of these public functions more efficient.
11 *
12 * On input:
13 * t9 = return address
14 * a0 = DST
15 * a1 = SRC
16 * a2 = COUNT
17 *
18 * Furthermore, COUNT may not be zero.
19 *
20 * On output:
21 * t0 = last word written
22 * t10 = bitmask (with one bit set) indicating the byte position of
23 * the end of the range specified by COUNT
24 * t12 = bitmask (with one bit set) indicating the last byte written
25 * a0 = unaligned address of the last *word* written
26 * a2 = the number of full words left in COUNT
27 *
28 * Furthermore, v0, a3-a5, t11, and $at are untouched.
29 *
30 * Much of the information about 21264 scheduling/coding comes from:
31 * Compiler Writer's Guide for the Alpha 21264
32 * abbreviated as 'CWG' in other comments here
33 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
34 * Scheduling notation:
35 * E - either cluster
36 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
37 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
38 * Try not to change the actual algorithm if possible for consistency.
39 */
40
41#include <asm/regdef.h>
42
43 .set noat
44 .set noreorder
45
46 .text
47
48/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
49 doesn't like putting the entry point for a procedure somewhere in the
50 middle of the procedure descriptor. Work around this by putting the
51 aligned copy in its own procedure descriptor */
52
53
54 .ent stxncpy_aligned
55 .align 4
56stxncpy_aligned:
57 .frame sp, 0, t9, 0
58 .prologue 0
59
60 /* On entry to this basic block:
61 t0 == the first destination word for masking back in
62 t1 == the first source word. */
63
64 /* Create the 1st output word and detect 0's in the 1st input word. */
65 lda t2, -1 # E : build a mask against false zero
66 mskqh t2, a1, t2 # U : detection in the src word (stall)
67 mskqh t1, a1, t3 # U :
68 ornot t1, t2, t2 # E : (stall)
69
70 mskql t0, a1, t0 # U : assemble the first output word
71 cmpbge zero, t2, t8 # E : bits set iff null found
72 or t0, t3, t0 # E : (stall)
73 beq a2, $a_eoc # U :
74
75 bne t8, $a_eos # U :
76 nop
77 nop
78 nop
79
80 /* On entry to this basic block:
81 t0 == a source word not containing a null. */
82
83 /*
84 * nops here to:
85 * separate store quads from load quads
86 * limit of 1 bcond/quad to permit training
87 */
88$a_loop:
89 stq_u t0, 0(a0) # L :
90 addq a0, 8, a0 # E :
91 subq a2, 1, a2 # E :
92 nop
93
94 ldq_u t0, 0(a1) # L :
95 addq a1, 8, a1 # E :
96 cmpbge zero, t0, t8 # E :
97 beq a2, $a_eoc # U :
98
99 beq t8, $a_loop # U :
100 nop
101 nop
102 nop
103
104 /* Take care of the final (partial) word store. At this point
105 the end-of-count bit is set in t8 iff it applies.
106
107 On entry to this basic block we have:
108 t0 == the source word containing the null
109 t8 == the cmpbge mask that found it. */
110
111$a_eos:
112 negq t8, t12 # E : find low bit set
113 and t8, t12, t12 # E : (stall)
114 /* For the sake of the cache, don't read a destination word
115 if we're not going to need it. */
116 and t12, 0x80, t6 # E : (stall)
117 bne t6, 1f # U : (stall)
118
119 /* We're doing a partial word store and so need to combine
120 our source and original destination words. */
121 ldq_u t1, 0(a0) # L :
122 subq t12, 1, t6 # E :
123 or t12, t6, t8 # E : (stall)
124 zapnot t0, t8, t0 # U : clear src bytes > null (stall)
125
126 zap t1, t8, t1 # .. e1 : clear dst bytes <= null
127 or t0, t1, t0 # e1 : (stall)
128 nop
129 nop
130
1311: stq_u t0, 0(a0) # L :
132 ret (t9) # L0 : Latency=3
133 nop
134 nop
135
136 /* Add the end-of-count bit to the eos detection bitmask. */
137$a_eoc:
138 or t10, t8, t8 # E :
139 br $a_eos # L0 : Latency=3
140 nop
141 nop
142
143 .end stxncpy_aligned
144
145 .align 4
146 .ent __stxncpy
147 .globl __stxncpy
148__stxncpy:
149 .frame sp, 0, t9, 0
150 .prologue 0
151
152 /* Are source and destination co-aligned? */
153 xor a0, a1, t1 # E :
154 and a0, 7, t0 # E : find dest misalignment
155 and t1, 7, t1 # E : (stall)
156 addq a2, t0, a2 # E : bias count by dest misalignment (stall)
157
158 subq a2, 1, a2 # E :
159 and a2, 7, t2 # E : (stall)
160 srl a2, 3, a2 # U : a2 = loop counter = (count - 1)/8 (stall)
161 addq zero, 1, t10 # E :
162
163 sll t10, t2, t10 # U : t10 = bitmask of last count byte
164 bne t1, $unaligned # U :
165 /* We are co-aligned; take care of a partial first word. */
166 ldq_u t1, 0(a1) # L : load first src word
167 addq a1, 8, a1 # E :
168
169 beq t0, stxncpy_aligned # U : avoid loading dest word if not needed
170 ldq_u t0, 0(a0) # L :
171 nop
172 nop
173
174 br stxncpy_aligned # .. e1 :
175 nop
176 nop
177 nop
178
179
180
181/* The source and destination are not co-aligned. Align the destination
182 and cope. We have to be very careful about not reading too much and
183 causing a SEGV. */
184
185 .align 4
186$u_head:
187 /* We know just enough now to be able to assemble the first
188 full source word. We can still find a zero at the end of it
189 that prevents us from outputting the whole thing.
190
191 On entry to this basic block:
192 t0 == the first dest word, unmasked
193 t1 == the shifted low bits of the first source word
194 t6 == bytemask that is -1 in dest word bytes */
195
196 ldq_u t2, 8(a1) # L : Latency=3 load second src word
197 addq a1, 8, a1 # E :
198 mskql t0, a0, t0 # U : mask trailing garbage in dst
199 extqh t2, a1, t4 # U : (3 cycle stall on t2)
200
201 or t1, t4, t1 # E : first aligned src word complete (stall)
202 mskqh t1, a0, t1 # U : mask leading garbage in src (stall)
203 or t0, t1, t0 # E : first output word complete (stall)
204 or t0, t6, t6 # E : mask original data for zero test (stall)
205
206 cmpbge zero, t6, t8 # E :
207 beq a2, $u_eocfin # U :
208 lda t6, -1 # E :
209 nop
210
211 bne t8, $u_final # U :
212 mskql t6, a1, t6 # U : mask out bits already seen
213 stq_u t0, 0(a0) # L : store first output word
214 or t6, t2, t2 # E : (stall)
215
216 cmpbge zero, t2, t8 # E : find nulls in second partial
217 addq a0, 8, a0 # E :
218 subq a2, 1, a2 # E :
219 bne t8, $u_late_head_exit # U :
220
221 /* Finally, we've got all the stupid leading edge cases taken care
222 of and we can set up to enter the main loop. */
223 extql t2, a1, t1 # U : position hi-bits of lo word
224 beq a2, $u_eoc # U :
225 ldq_u t2, 8(a1) # L : read next high-order source word
226 addq a1, 8, a1 # E :
227
228 extqh t2, a1, t0 # U : position lo-bits of hi word (stall)
229 cmpbge zero, t2, t8 # E :
230 nop
231 bne t8, $u_eos # U :
232
233 /* Unaligned copy main loop. In order to avoid reading too much,
234 the loop is structured to detect zeros in aligned source words.
235 This has, unfortunately, effectively pulled half of a loop
236 iteration out into the head and half into the tail, but it does
237 prevent nastiness from accumulating in the very thing we want
238 to run as fast as possible.
239
240 On entry to this basic block:
241 t0 == the shifted low-order bits from the current source word
242 t1 == the shifted high-order bits from the previous source word
243 t2 == the unshifted current source word
244
245 We further know that t2 does not contain a null terminator. */
246
247 .align 4
248$u_loop:
249 or t0, t1, t0 # E : current dst word now complete
250 subq a2, 1, a2 # E : decrement word count
251 extql t2, a1, t1 # U : extract low bits for next time
252 addq a0, 8, a0 # E :
253
254 stq_u t0, -8(a0) # U : save the current word
255 beq a2, $u_eoc # U :
256 ldq_u t2, 8(a1) # U : Latency=3 load high word for next time
257 addq a1, 8, a1 # E :
258
259 extqh t2, a1, t0 # U : extract low bits (2 cycle stall)
260 cmpbge zero, t2, t8 # E : test new word for eos
261 nop
262 beq t8, $u_loop # U :
263
264 /* We've found a zero somewhere in the source word we just read.
265 If it resides in the lower half, we have one (probably partial)
266 word to write out, and if it resides in the upper half, we
267 have one full and one partial word left to write out.
268
269 On entry to this basic block:
270 t0 == the shifted low-order bits from the current source word
271 t1 == the shifted high-order bits from the previous source word
272 t2 == the unshifted current source word. */
273$u_eos:
274 or t0, t1, t0 # E : first (partial) source word complete
275 nop
276 cmpbge zero, t0, t8 # E : is the null in this first bit? (stall)
277 bne t8, $u_final # U : (stall)
278
279 stq_u t0, 0(a0) # L : the null was in the high-order bits
280 addq a0, 8, a0 # E :
281 subq a2, 1, a2 # E :
282 nop
283
284$u_late_head_exit:
285 extql t2, a1, t0 # U :
286 cmpbge zero, t0, t8 # E :
287 or t8, t10, t6 # E : (stall)
288 cmoveq a2, t6, t8 # E : Latency=2, extra map slot (stall)
289
290 /* Take care of a final (probably partial) result word.
291 On entry to this basic block:
292 t0 == assembled source word
293 t8 == cmpbge mask that found the null. */
294$u_final:
295 negq t8, t6 # E : isolate low bit set
296 and t6, t8, t12 # E : (stall)
297 and t12, 0x80, t6 # E : avoid dest word load if we can (stall)
298 bne t6, 1f # U : (stall)
299
300 ldq_u t1, 0(a0) # L :
301 subq t12, 1, t6 # E :
302 or t6, t12, t8 # E : (stall)
303 zapnot t0, t8, t0 # U : kill source bytes > null
304
305 zap t1, t8, t1 # U : kill dest bytes <= null
306 or t0, t1, t0 # E : (stall)
307 nop
308 nop
309
3101: stq_u t0, 0(a0) # L :
311 ret (t9) # L0 : Latency=3
312
313 /* Got to end-of-count before end of string.
314 On entry to this basic block:
315 t1 == the shifted high-order bits from the previous source word */
316$u_eoc:
317 and a1, 7, t6 # E : avoid final load if possible
318 sll t10, t6, t6 # U : (stall)
319 and t6, 0xff, t6 # E : (stall)
320 bne t6, 1f # U : (stall)
321
322 ldq_u t2, 8(a1) # L : load final src word
323 nop
324 extqh t2, a1, t0 # U : extract low bits for last word (stall)
325 or t1, t0, t1 # E : (stall)
326
3271: cmpbge zero, t1, t8 # E :
328 mov t1, t0 # E :
329
330$u_eocfin: # end-of-count, final word
331 or t10, t8, t8 # E :
332 br $u_final # L0 : Latency=3
333
334 /* Unaligned copy entry point. */
335 .align 4
336$unaligned:
337
338 ldq_u t1, 0(a1) # L : load first source word
339 and a0, 7, t4 # E : find dest misalignment
340 and a1, 7, t5 # E : find src misalignment
341 /* Conditionally load the first destination word and a bytemask
342 with 0xff indicating that the destination byte is sacrosanct. */
343 mov zero, t0 # E :
344
345 mov zero, t6 # E :
346 beq t4, 1f # U :
347 ldq_u t0, 0(a0) # L :
348 lda t6, -1 # E :
349
350 mskql t6, a0, t6 # U :
351 nop
352 nop
353 subq a1, t4, a1 # E : sub dest misalignment from src addr
354
355 /* If source misalignment is larger than dest misalignment, we need
356 extra startup checks to avoid SEGV. */
357
3581: cmplt t4, t5, t12 # E :
359 extql t1, a1, t1 # U : shift src into place
360 lda t2, -1 # E : for creating masks later
361 beq t12, $u_head # U : (stall)
362
363 extql t2, a1, t2 # U :
364 cmpbge zero, t1, t8 # E : is there a zero?
365 andnot t2, t6, t12 # E : dest mask for a single word copy
366 or t8, t10, t5 # E : test for end-of-count too
367
368 cmpbge zero, t12, t3 # E :
369 cmoveq a2, t5, t8 # E : Latency=2, extra map slot
370 nop # E : keep with cmoveq
371 andnot t8, t3, t8 # E : (stall)
372
373 beq t8, $u_head # U :
374 /* At this point we've found a zero in the first partial word of
375 the source. We need to isolate the valid source data and mask
376 it into the original destination data. (Incidentally, we know
377 that we'll need at least one byte of that original dest word.) */
378 ldq_u t0, 0(a0) # L :
379 negq t8, t6 # E : build bitmask of bytes <= zero
380 mskqh t1, t4, t1 # U :
381
382 and t6, t8, t2 # E :
383 subq t2, 1, t6 # E : (stall)
384 or t6, t2, t8 # E : (stall)
385 zapnot t12, t8, t12 # U : prepare source word; mirror changes (stall)
386
387 zapnot t1, t8, t1 # U : to source validity mask
388 andnot t0, t12, t0 # E : zero place for source to reside
389 or t0, t1, t0 # E : and put it there (stall both t0, t1)
390 stq_u t0, 0(a0) # L : (stall)
391
392 ret (t9) # L0 : Latency=3
393 nop
394 nop
395 nop
396
397 .end __stxncpy
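
The entry sequence of __stxncpy above biases COUNT by the destination misalignment and splits it into a whole-word loop count plus the single-bit end-of-count mask kept in t10. A small C sketch of that bookkeeping, with variable names only loosely mirroring the registers:

#include <stdio.h>

int main(void)
{
	unsigned long count = 13;	/* COUNT on entry (must be nonzero) */
	unsigned long dst = 0x1003;	/* example destination address */

	unsigned long misalign = dst & 7;		/* t0 */
	unsigned long biased = count + misalign - 1;	/* a2 after bias and -1 */
	unsigned long last_byte = biased & 7;		/* t2 */
	unsigned long eoc_mask = 1ul << last_byte;	/* t10 */
	unsigned long loop_words = biased >> 3;		/* a2 = (count - 1) / 8 */

	printf("loop words = %lu, end-of-count mask = 0x%02lx\n",
	       loop_words, eoc_mask);
	return 0;
}
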
diff --git a/arch/alpha/lib/ev67-strcat.S b/arch/alpha/lib/ev67-strcat.S
new file mode 100644
index 000000000000..c426fe3ed72f
--- /dev/null
+++ b/arch/alpha/lib/ev67-strcat.S
@@ -0,0 +1,54 @@
1/*
2 * arch/alpha/lib/ev67-strcat.S
3 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
4 *
5 * Append a null-terminated string from SRC to DST.
6 *
7 * Much of the information about 21264 scheduling/coding comes from:
8 * Compiler Writer's Guide for the Alpha 21264
9 * abbreviated as 'CWG' in other comments here
10 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
11 * Scheduling notation:
12 * E - either cluster
13 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
14 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
15 * Try not to change the actual algorithm if possible for consistency.
16 * Commentary: It seems bogus to walk the input string twice - once
17 * to determine the length, and then again while doing the copy.
18 * A significant (future) enhancement would be to only read the input
19 * string once.
20 */
21
22
23 .text
24
25 .align 4
26 .globl strcat
27 .ent strcat
28strcat:
29 .frame $30, 0, $26
30 .prologue 0
31
32 mov $16, $0 # E : set up return value
33 /* Find the end of the string. */
34 ldq_u $1, 0($16) # L : load first quadword (a0 may be misaligned)
35 lda $2, -1 # E :
36 insqh $2, $16, $2 # U :
37
38 andnot $16, 7, $16 # E :
39 or $2, $1, $1 # E :
40 cmpbge $31, $1, $2 # E : bits set iff byte == 0
41 bne $2, $found # U :
42
43$loop: ldq $1, 8($16) # L :
44 addq $16, 8, $16 # E :
45 cmpbge $31, $1, $2 # E :
46 beq $2, $loop # U :
47
48$found: cttz $2, $3 # U0 :
49 addq $16, $3, $16 # E :
50 /* Now do the append. */
51 mov $26, $23 # E :
52 br __stxcpy # L0 :
53
54 .end strcat
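
The insqh/or pair at the top of strcat guards against "false zeros": junk bytes that sit in the first aligned quadword before the string actually starts. On a little-endian quadword it amounts to forcing those low bytes to 0xff so the zero-byte scan cannot trip over them. A sketch with illustrative names:

#include <stdint.h>
#include <stdio.h>

static uint64_t mask_leading_garbage(uint64_t first_quad, unsigned misalign)
{
	/* 0xff in every byte lane below the string start, 0x00 elsewhere;
	   this is what insqh of an all-ones value produces */
	uint64_t garbage = misalign ? ~0ull >> (64 - 8 * misalign) : 0;

	return first_quad | garbage;	/* garbage bytes can no longer read as 0x00 */
}

int main(void)
{
	/* bytes 0-2 are junk (two of them 0x00); "HI" starts at byte 3 */
	uint64_t quad = 0x7a7a004948007a00ull;

	printf("0x%016llx\n",
	       (unsigned long long)mask_leading_garbage(quad, 3));
	return 0;
}
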
diff --git a/arch/alpha/lib/ev67-strchr.S b/arch/alpha/lib/ev67-strchr.S
new file mode 100644
index 000000000000..fbb7b4ffade9
--- /dev/null
+++ b/arch/alpha/lib/ev67-strchr.S
@@ -0,0 +1,88 @@
1/*
2 * arch/alpha/lib/ev67-strchr.S
3 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
4 *
5 * Return the address of a given character within a null-terminated
6 * string, or null if it is not found.
7 *
8 * Much of the information about 21264 scheduling/coding comes from:
9 * Compiler Writer's Guide for the Alpha 21264
10 * abbreviated as 'CWG' in other comments here
11 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
12 * Scheduling notation:
13 * E - either cluster
14 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
15 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
16 * Try not to change the actual algorithm if possible for consistency.
17 */
18
19#include <asm/regdef.h>
20
21 .set noreorder
22 .set noat
23
24 .align 4
25 .globl strchr
26 .ent strchr
27strchr:
28 .frame sp, 0, ra
29 .prologue 0
30
31 ldq_u t0, 0(a0) # L : load first quadword Latency=3
32 and a1, 0xff, t3 # E : 00000000000000ch
33 insbl a1, 1, t5 # U : 000000000000ch00
34 insbl a1, 7, a2 # U : ch00000000000000
35
36 insbl t3, 6, a3 # U : 00ch000000000000
37 or t5, t3, a1 # E : 000000000000chch
38 andnot a0, 7, v0 # E : align our loop pointer
39 lda t4, -1 # E : build garbage mask
40
41 mskqh t4, a0, t4 # U : only want relevant part of first quad
42 or a2, a3, a2 # E : chch000000000000
43 inswl a1, 2, t5 # E : 00000000chch0000
44 inswl a1, 4, a3 # E : 0000chch00000000
45
46 or a1, a2, a1 # E : chch00000000chch
47 or a3, t5, t5 # E : 0000chchchch0000
48 cmpbge zero, t0, t2 # E : bits set iff byte == zero
49 cmpbge zero, t4, t4 # E : bits set iff byte is garbage
50
51 /* This quad is _very_ serialized. Lots of stalling happens */
52 or t5, a1, a1 # E : chchchchchchchch
53 xor t0, a1, t1 # E : make bytes == c zero
54 cmpbge zero, t1, t3 # E : bits set iff byte == c
55 or t2, t3, t0 # E : bits set iff char match or zero match
56
57 andnot t0, t4, t0 # E : clear garbage bits
58 cttz t0, a2 # U0 : speculative (in case we get a match)
59 nop # E :
60 bne t0, $found # U :
61
62 /*
63 * Yuk. This loop is going to stall like crazy waiting for the
64 * data to be loaded. Not much can be done about it unless it's
65 * unrolled multiple times - is that safe to do in kernel space?
66 * Or would exception handling recovery code do the trick here?
67 */
68$loop: ldq t0, 8(v0) # L : Latency=3
69 addq v0, 8, v0 # E :
70 xor t0, a1, t1 # E :
71 cmpbge zero, t0, t2 # E : bits set iff byte == 0
72
73 cmpbge zero, t1, t3 # E : bits set iff byte == c
74 or t2, t3, t0 # E :
75 cttz t3, a2 # U0 : speculative (in case we get a match)
76 beq t0, $loop # U :
77
78$found: negq t0, t1 # E : clear all but least set bit
79 and t0, t1, t0 # E :
80 and t0, t3, t1 # E : bit set iff byte was the char
81 addq v0, a2, v0 # E : Add in the bit number from above
82
83 cmoveq t1, $31, v0 # E : Two mapping slots, latency = 2
84 nop
85 nop
86 ret # L0 :
87
88 .end strchr
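
The insbl/inswl/sll dance at the top of strchr simply replicates the wanted character into all eight byte lanes, so a single xor turns matching bytes into 0x00 and the ordinary zero-byte scan finds them. The same trick reappears in strrchr and memchr below. A compact C rendering (helper names are made up):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t replicate_byte(unsigned char c)
{
	uint64_t v = c;

	v |= v << 8;	/* 000000000000chch */
	v |= v << 16;	/* 00000000chchchch */
	v |= v << 32;	/* chchchchchchchch */
	return v;
}

/* stands in for "cmpbge zero, x": bit i set iff byte i of x is zero */
static unsigned zero_byte_mask(uint64_t x)
{
	unsigned mask = 0;

	for (int i = 0; i < 8; i++)
		if (((x >> (8 * i)) & 0xff) == 0)
			mask |= 1u << i;
	return mask;
}

int main(void)
{
	const char text[8] = "seashore";
	uint64_t quad;

	memcpy(&quad, text, 8);
	/* bytes equal to 'h' become zero; everything else stays nonzero */
	printf("match mask: 0x%02x\n",
	       zero_byte_mask(quad ^ replicate_byte('h')));	/* 0x10: byte 4 */
	return 0;
}
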
diff --git a/arch/alpha/lib/ev67-strlen.S b/arch/alpha/lib/ev67-strlen.S
new file mode 100644
index 000000000000..503928072523
--- /dev/null
+++ b/arch/alpha/lib/ev67-strlen.S
@@ -0,0 +1,49 @@
1/*
2 * arch/alpha/lib/ev67-strlen.S
3 * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
4 *
5 * Finds length of a 0-terminated string. Optimized for the
6 * Alpha architecture:
7 *
8 * - memory accessed as aligned quadwords only
9 *	- uses cmpbge to compare 8 bytes in parallel
10 *
11 * Much of the information about 21264 scheduling/coding comes from:
12 * Compiler Writer's Guide for the Alpha 21264
13 * abbreviated as 'CWG' in other comments here
14 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
15 * Scheduling notation:
16 * E - either cluster
17 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
18 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
19 */
20
21 .set noreorder
22 .set noat
23
24 .globl strlen
25 .ent strlen
26 .align 4
27strlen:
28 ldq_u $1, 0($16) # L : load first quadword ($16 may be misaligned)
29 lda $2, -1($31) # E :
30 insqh $2, $16, $2 # U :
31 andnot $16, 7, $0 # E :
32
33 or $2, $1, $1 # E :
34 cmpbge $31, $1, $2 # E : $2 <- bitmask: bit i == 1 <==> i-th byte == 0
35 nop # E :
36 bne $2, $found # U :
37
38$loop: ldq $1, 8($0) # L :
39 addq $0, 8, $0 # E : addr += 8
40 cmpbge $31, $1, $2 # E :
41 beq $2, $loop # U :
42
43$found:
44 cttz $2, $3 # U0 :
45 addq $0, $3, $0 # E :
46 subq $0, $16, $0 # E :
47 ret $31, ($26) # L0 :
48
49 .end strlen
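
The same strlen strategy has a compact C shape: scan aligned words for a zero byte, then add the in-word byte index (cttz above, __builtin_ctz in the sketch) to the aligned base and subtract the original pointer. Like the assembly, the sketch may read bytes beyond the terminator but never beyond the containing aligned word; names are illustrative only.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static size_t strlen_wordwise(const char *s)
{
	const uintptr_t start = (uintptr_t)s;
	const char *p = (const char *)(start & ~(uintptr_t)7);	/* aligned base */
	unsigned skip = start & 7;	/* bytes before the string proper */
	unsigned mask;

	for (;;) {
		uint64_t w;

		memcpy(&w, p, 8);		/* one aligned 8-byte load */
		mask = 0;
		for (int i = 0; i < 8; i++)	/* stands in for cmpbge */
			if (((w >> (8 * i)) & 0xff) == 0)
				mask |= 1u << i;
		mask &= ~0u << skip;		/* ignore bytes before the string */
		if (mask)
			break;
		p += 8;
		skip = 0;
	}
	/* trailing zero count of the mask == byte index of the terminator */
	return (uintptr_t)p + (unsigned)__builtin_ctz(mask) - start;
}

int main(void)
{
	/* keep the buffer word-aligned so the sketch never reads out of bounds */
	static const union { char s[16]; uint64_t align[2]; } buf =
		{ "hello, alpha" };

	printf("%zu\n", strlen_wordwise(buf.s));	/* prints 12 */
	return 0;
}
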
diff --git a/arch/alpha/lib/ev67-strlen_user.S b/arch/alpha/lib/ev67-strlen_user.S
new file mode 100644
index 000000000000..57e0d77b81a6
--- /dev/null
+++ b/arch/alpha/lib/ev67-strlen_user.S
@@ -0,0 +1,107 @@
1/*
2 * arch/alpha/lib/ev67-strlen_user.S
3 * 21264 version contributed by Rick Gorton <rick.gorton@api-networks.com>
4 *
5 * Return the length of the string including the NULL terminator
6 * (strlen+1) or zero if an error occurred.
7 *
8 * In places where it is critical to limit the processing time,
9 * and the data is not trusted, strnlen_user() should be used.
10 * It will return a value greater than its second argument if
11 * that limit would be exceeded. This implementation is allowed
12 * to access memory beyond the limit, but will not cross a page
13 * boundary when doing so.
14 *
15 * Much of the information about 21264 scheduling/coding comes from:
16 * Compiler Writer's Guide for the Alpha 21264
17 * abbreviated as 'CWG' in other comments here
18 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
19 * Scheduling notation:
20 * E - either cluster
21 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
22 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
23 * Try not to change the actual algorithm if possible for consistency.
24 */
25
26#include <asm/regdef.h>
27
28
29/* Allow an exception for an insn; exit if we get one. */
30#define EX(x,y...) \
31 99: x,##y; \
32 .section __ex_table,"a"; \
33 .long 99b - .; \
34 lda v0, $exception-99b(zero); \
35 .previous
36
37
38 .set noreorder
39 .set noat
40 .text
41
42 .globl __strlen_user
43 .ent __strlen_user
44 .frame sp, 0, ra
45
46 .align 4
47__strlen_user:
48 ldah a1, 32767(zero) # do not use plain strlen_user() for strings
49 # that might be almost 2 GB long; you should
50 # be using strnlen_user() instead
51 nop
52 nop
53 nop
54
55 .globl __strnlen_user
56
57 .align 4
58__strnlen_user:
59 .prologue 0
60 EX( ldq_u t0, 0(a0) ) # L : load first quadword (a0 may be misaligned)
61 lda t1, -1(zero) # E :
62
63 insqh t1, a0, t1 # U :
64 andnot a0, 7, v0 # E :
65 or t1, t0, t0 # E :
66 subq a0, 1, a0 # E : get our +1 for the return
67
68 cmpbge zero, t0, t1 # E : t1 <- bitmask: bit i == 1 <==> i-th byte == 0
69 subq a1, 7, t2 # E :
70 subq a0, v0, t0 # E :
71 bne t1, $found # U :
72
73 addq t2, t0, t2 # E :
74 addq a1, 1, a1 # E :
75 nop # E :
76 nop # E :
77
78 .align 4
79$loop: ble t2, $limit # U :
80 EX( ldq t0, 8(v0) ) # L :
81 nop # E :
82 nop # E :
83
84 cmpbge zero, t0, t1 # E :
85 subq t2, 8, t2 # E :
86 addq v0, 8, v0 # E : addr += 8
87 beq t1, $loop # U :
88
89$found: cttz t1, t2 # U0 :
90 addq v0, t2, v0 # E :
91 subq v0, a0, v0 # E :
92 ret # L0 :
93
94$exception:
95 nop
96 nop
97 nop
98 ret
99
100 .align 4 # currently redundant
101$limit:
102 nop
103 nop
104 subq a1, t2, v0
105 ret
106
107 .end __strlen_user
diff --git a/arch/alpha/lib/ev67-strncat.S b/arch/alpha/lib/ev67-strncat.S
new file mode 100644
index 000000000000..4ae716cd2bfb
--- /dev/null
+++ b/arch/alpha/lib/ev67-strncat.S
@@ -0,0 +1,94 @@
1/*
2 * arch/alpha/lib/ev67-strncat.S
3 * 21264 version contributed by Rick Gorton <rick.gorton@api-networks.com>
4 *
5 * Append no more than COUNT characters from the null-terminated string SRC
6 * to the null-terminated string DST. Always null-terminate the new DST.
7 *
8 * This differs slightly from the semantics in libc in that we never write
9 * past count, whereas libc may write to count+1. This follows the generic
10 * implementation in lib/string.c and is, IMHO, more sensible.
11 *
12 * Much of the information about 21264 scheduling/coding comes from:
13 * Compiler Writer's Guide for the Alpha 21264
14 * abbreviated as 'CWG' in other comments here
15 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
16 * Scheduling notation:
17 * E - either cluster
18 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
19 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
20 * Try not to change the actual algorithm if possible for consistency.
21 */
22
23
24 .text
25
26 .align 4
27 .globl strncat
28 .ent strncat
29strncat:
30 .frame $30, 0, $26
31 .prologue 0
32
33 mov $16, $0 # set up return value
34 beq $18, $zerocount # U :
35 /* Find the end of the string. */
36 ldq_u $1, 0($16) # L : load first quadword ($16 may be misaligned)
37 lda $2, -1($31) # E :
38
39 insqh $2, $0, $2 # U :
40 andnot $16, 7, $16 # E :
41 nop # E :
42 or $2, $1, $1 # E :
43
44 nop # E :
45 nop # E :
46 cmpbge $31, $1, $2 # E : bits set iff byte == 0
47 bne $2, $found # U :
48
49$loop: ldq $1, 8($16) # L :
50 addq $16, 8, $16 # E :
51 cmpbge $31, $1, $2 # E :
52 beq $2, $loop # U :
53
54$found: cttz $2, $3 # U0 :
55 addq $16, $3, $16 # E :
56 nop # E :
57 bsr $23, __stxncpy # L0 :/* Now do the append. */
58
59 /* Worry about the null termination. */
60
61 zapnot $1, $27, $2 # U : was last byte a null?
62 cmplt $27, $24, $5 # E : did we fill the buffer completely?
63 bne $2, 0f # U :
64 ret # L0 :
65
660: or $5, $18, $2 # E :
67 nop
68 bne $2, 2f # U :
69 and $24, 0x80, $3 # E : no zero next byte
70
71 nop # E :
72 bne $3, 1f # U :
73 /* Here there are bytes left in the current word. Clear one. */
74 addq $24, $24, $24 # E : end-of-count bit <<= 1
75 nop # E :
76
772: zap $1, $24, $1 # U :
78 nop # E :
79 stq_u $1, 0($16) # L :
80 ret # L0 :
81
821: /* Here we must clear the first byte of the next DST word */
83 stb $31, 8($16) # L :
84 nop # E :
85 nop # E :
86 ret # L0 :
87
88$zerocount:
89 nop # E :
90 nop # E :
91 nop # E :
92 ret # L0 :
93
94 .end strncat
diff --git a/arch/alpha/lib/ev67-strrchr.S b/arch/alpha/lib/ev67-strrchr.S
new file mode 100644
index 000000000000..3fd8bf414c7b
--- /dev/null
+++ b/arch/alpha/lib/ev67-strrchr.S
@@ -0,0 +1,109 @@
1/*
2 * arch/alpha/lib/ev67-strrchr.S
3 * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
4 *
 5 * Finds the last occurrence of a given character in a 0-terminated string,
 6 * or null if it is not found.  Optimized for the Alpha architecture:
7 *
8 * - memory accessed as aligned quadwords only
 9 *	- uses cmpbge to compare 8 bytes in parallel
10 *
11 * Much of the information about 21264 scheduling/coding comes from:
12 * Compiler Writer's Guide for the Alpha 21264
13 * abbreviated as 'CWG' in other comments here
14 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
15 * Scheduling notation:
16 * E - either cluster
17 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
18 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
19 */
20
21
22#include <asm/regdef.h>
23
24 .set noreorder
25 .set noat
26
27 .align 4
28 .ent strrchr
29 .globl strrchr
30strrchr:
31 .frame sp, 0, ra
32 .prologue 0
33
34 and a1, 0xff, t2 # E : 00000000000000ch
35 insbl a1, 1, t4 # U : 000000000000ch00
36 insbl a1, 2, t5 # U : 0000000000ch0000
37 ldq_u t0, 0(a0) # L : load first quadword Latency=3
38
39 mov zero, t6 # E : t6 is last match aligned addr
40 or t2, t4, a1 # E : 000000000000chch
41 sll t5, 8, t3 # U : 00000000ch000000
42 mov zero, t8 # E : t8 is last match byte compare mask
43
44 andnot a0, 7, v0 # E : align source addr
45 or t5, t3, t3 # E : 00000000chch0000
46 sll a1, 32, t2 # U : 0000chch00000000
47 sll a1, 48, t4 # U : chch000000000000
48
49 or t4, a1, a1 # E : chch00000000chch
50 or t2, t3, t2 # E : 0000chchchch0000
51 or a1, t2, a1 # E : chchchchchchchch
52 lda t5, -1 # E : build garbage mask
53
54 cmpbge zero, t0, t1 # E : bits set iff byte == zero
55 mskqh t5, a0, t4 # E : Complete garbage mask
56 xor t0, a1, t2 # E : make bytes == c zero
57 cmpbge zero, t4, t4 # E : bits set iff byte is garbage
58
59 cmpbge zero, t2, t3 # E : bits set iff byte == c
60 andnot t1, t4, t1 # E : clear garbage from null test
61 andnot t3, t4, t3 # E : clear garbage from char test
62 bne t1, $eos # U : did we already hit the terminator?
63
64 /* Character search main loop */
65$loop:
66 ldq t0, 8(v0) # L : load next quadword
67 cmovne t3, v0, t6 # E : save previous comparisons match
68 nop # : Latency=2, extra map slot (keep nop with cmov)
69 nop
70
71 cmovne t3, t3, t8 # E : Latency=2, extra map slot
72 nop # : keep with cmovne
73 addq v0, 8, v0 # E :
74 xor t0, a1, t2 # E :
75
76 cmpbge zero, t0, t1 # E : bits set iff byte == zero
77 cmpbge zero, t2, t3 # E : bits set iff byte == c
78	beq t1, $loop	# U : if we haven't seen a null, loop
79 nop
80
81 /* Mask out character matches after terminator */
82$eos:
83 negq t1, t4 # E : isolate first null byte match
84 and t1, t4, t4 # E :
85	subq t4, 1, t5	# E : build a mask of the bytes up to...
86 or t4, t5, t4 # E : ... and including the null
87
88 and t3, t4, t3 # E : mask out char matches after null
89 cmovne t3, t3, t8 # E : save it, if match found Latency=2, extra map slot
90 nop # : Keep with cmovne
91 nop
92
93 cmovne t3, v0, t6 # E :
94 nop # : Keep with cmovne
95 /* Locate the address of the last matched character */
96 ctlz t8, t2 # U0 : Latency=3 (0x40 for t8=0)
97 nop
98
99 cmoveq t8, 0x3f, t2 # E : Compensate for case when no match is seen
100 nop # E : hide the cmov latency (2) behind ctlz latency
101 lda t5, 0x3f($31) # E :
102 subq t5, t2, t5 # E : Normalize leading zero count
103
104 addq t6, t5, v0 # E : and add to quadword address
105 ret # L0 : Latency=3
106 nop
107 nop
108
109 .end strrchr
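
The closing ctlz sequence picks the right-most match: t8 holds a byte mask of character matches in the last quadword that contained any (already clipped at the terminator), and the highest set bit gives the byte offset of the final occurrence; a zero mask means the character was never seen. A small C rendering using __builtin_clzll, with made-up names:

#include <stdio.h>

static const char *last_match(const char *quad_base, unsigned match_mask)
{
	if (match_mask == 0)
		return NULL;		/* character never seen: return null */
	/* index of the highest set bit == byte offset of the last match */
	return quad_base + (63 - __builtin_clzll(match_mask));
}

int main(void)
{
	const char quad[8] = "banana!!";
	/* 'a' matches bytes 1, 3 and 5 of this quadword -> mask 0x2a */
	const char *p = last_match(quad, 0x2a);

	printf("last 'a' at offset %ld\n", (long)(p - quad));	/* prints 5 */
	return 0;
}
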
diff --git a/arch/alpha/lib/fpreg.c b/arch/alpha/lib/fpreg.c
new file mode 100644
index 000000000000..97c4d9d7a4d5
--- /dev/null
+++ b/arch/alpha/lib/fpreg.c
@@ -0,0 +1,193 @@
1/*
2 * arch/alpha/lib/fpreg.c
3 *
4 * (C) Copyright 1998 Linus Torvalds
5 */
6
7#if defined(__alpha_cix__) || defined(__alpha_fix__)
8#define STT(reg,val) asm volatile ("ftoit $f"#reg",%0" : "=r"(val));
9#else
10#define STT(reg,val) asm volatile ("stt $f"#reg",%0" : "=m"(val));
11#endif
12
13unsigned long
14alpha_read_fp_reg (unsigned long reg)
15{
16 unsigned long val;
17
18 switch (reg) {
19 case 0: STT( 0, val); break;
20 case 1: STT( 1, val); break;
21 case 2: STT( 2, val); break;
22 case 3: STT( 3, val); break;
23 case 4: STT( 4, val); break;
24 case 5: STT( 5, val); break;
25 case 6: STT( 6, val); break;
26 case 7: STT( 7, val); break;
27 case 8: STT( 8, val); break;
28 case 9: STT( 9, val); break;
29 case 10: STT(10, val); break;
30 case 11: STT(11, val); break;
31 case 12: STT(12, val); break;
32 case 13: STT(13, val); break;
33 case 14: STT(14, val); break;
34 case 15: STT(15, val); break;
35 case 16: STT(16, val); break;
36 case 17: STT(17, val); break;
37 case 18: STT(18, val); break;
38 case 19: STT(19, val); break;
39 case 20: STT(20, val); break;
40 case 21: STT(21, val); break;
41 case 22: STT(22, val); break;
42 case 23: STT(23, val); break;
43 case 24: STT(24, val); break;
44 case 25: STT(25, val); break;
45 case 26: STT(26, val); break;
46 case 27: STT(27, val); break;
47 case 28: STT(28, val); break;
48 case 29: STT(29, val); break;
49 case 30: STT(30, val); break;
50 case 31: STT(31, val); break;
51 default: return 0;
52 }
53 return val;
54}
55
56#if defined(__alpha_cix__) || defined(__alpha_fix__)
57#define LDT(reg,val) asm volatile ("itoft %0,$f"#reg : : "r"(val));
58#else
59#define LDT(reg,val) asm volatile ("ldt $f"#reg",%0" : : "m"(val));
60#endif
61
62void
63alpha_write_fp_reg (unsigned long reg, unsigned long val)
64{
65 switch (reg) {
66 case 0: LDT( 0, val); break;
67 case 1: LDT( 1, val); break;
68 case 2: LDT( 2, val); break;
69 case 3: LDT( 3, val); break;
70 case 4: LDT( 4, val); break;
71 case 5: LDT( 5, val); break;
72 case 6: LDT( 6, val); break;
73 case 7: LDT( 7, val); break;
74 case 8: LDT( 8, val); break;
75 case 9: LDT( 9, val); break;
76 case 10: LDT(10, val); break;
77 case 11: LDT(11, val); break;
78 case 12: LDT(12, val); break;
79 case 13: LDT(13, val); break;
80 case 14: LDT(14, val); break;
81 case 15: LDT(15, val); break;
82 case 16: LDT(16, val); break;
83 case 17: LDT(17, val); break;
84 case 18: LDT(18, val); break;
85 case 19: LDT(19, val); break;
86 case 20: LDT(20, val); break;
87 case 21: LDT(21, val); break;
88 case 22: LDT(22, val); break;
89 case 23: LDT(23, val); break;
90 case 24: LDT(24, val); break;
91 case 25: LDT(25, val); break;
92 case 26: LDT(26, val); break;
93 case 27: LDT(27, val); break;
94 case 28: LDT(28, val); break;
95 case 29: LDT(29, val); break;
96 case 30: LDT(30, val); break;
97 case 31: LDT(31, val); break;
98 }
99}
100
101#if defined(__alpha_cix__) || defined(__alpha_fix__)
102#define STS(reg,val) asm volatile ("ftois $f"#reg",%0" : "=r"(val));
103#else
104#define STS(reg,val) asm volatile ("sts $f"#reg",%0" : "=m"(val));
105#endif
106
107unsigned long
108alpha_read_fp_reg_s (unsigned long reg)
109{
110 unsigned long val;
111
112 switch (reg) {
113 case 0: STS( 0, val); break;
114 case 1: STS( 1, val); break;
115 case 2: STS( 2, val); break;
116 case 3: STS( 3, val); break;
117 case 4: STS( 4, val); break;
118 case 5: STS( 5, val); break;
119 case 6: STS( 6, val); break;
120 case 7: STS( 7, val); break;
121 case 8: STS( 8, val); break;
122 case 9: STS( 9, val); break;
123 case 10: STS(10, val); break;
124 case 11: STS(11, val); break;
125 case 12: STS(12, val); break;
126 case 13: STS(13, val); break;
127 case 14: STS(14, val); break;
128 case 15: STS(15, val); break;
129 case 16: STS(16, val); break;
130 case 17: STS(17, val); break;
131 case 18: STS(18, val); break;
132 case 19: STS(19, val); break;
133 case 20: STS(20, val); break;
134 case 21: STS(21, val); break;
135 case 22: STS(22, val); break;
136 case 23: STS(23, val); break;
137 case 24: STS(24, val); break;
138 case 25: STS(25, val); break;
139 case 26: STS(26, val); break;
140 case 27: STS(27, val); break;
141 case 28: STS(28, val); break;
142 case 29: STS(29, val); break;
143 case 30: STS(30, val); break;
144 case 31: STS(31, val); break;
145 default: return 0;
146 }
147 return val;
148}
149
150#if defined(__alpha_cix__) || defined(__alpha_fix__)
151#define LDS(reg,val) asm volatile ("itofs %0,$f"#reg : : "r"(val));
152#else
153#define LDS(reg,val) asm volatile ("lds $f"#reg",%0" : : "m"(val));
154#endif
155
156void
157alpha_write_fp_reg_s (unsigned long reg, unsigned long val)
158{
159 switch (reg) {
160 case 0: LDS( 0, val); break;
161 case 1: LDS( 1, val); break;
162 case 2: LDS( 2, val); break;
163 case 3: LDS( 3, val); break;
164 case 4: LDS( 4, val); break;
165 case 5: LDS( 5, val); break;
166 case 6: LDS( 6, val); break;
167 case 7: LDS( 7, val); break;
168 case 8: LDS( 8, val); break;
169 case 9: LDS( 9, val); break;
170 case 10: LDS(10, val); break;
171 case 11: LDS(11, val); break;
172 case 12: LDS(12, val); break;
173 case 13: LDS(13, val); break;
174 case 14: LDS(14, val); break;
175 case 15: LDS(15, val); break;
176 case 16: LDS(16, val); break;
177 case 17: LDS(17, val); break;
178 case 18: LDS(18, val); break;
179 case 19: LDS(19, val); break;
180 case 20: LDS(20, val); break;
181 case 21: LDS(21, val); break;
182 case 22: LDS(22, val); break;
183 case 23: LDS(23, val); break;
184 case 24: LDS(24, val); break;
185 case 25: LDS(25, val); break;
186 case 26: LDS(26, val); break;
187 case 27: LDS(27, val); break;
188 case 28: LDS(28, val); break;
189 case 29: LDS(29, val); break;
190 case 30: LDS(30, val); break;
191 case 31: LDS(31, val); break;
192 }
193}
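
The reason for the 32-way switches: the floating-point register number has to appear literally in the instruction text, so each STT/LDT/STS/LDS use pastes it in with the preprocessor and one case is instantiated per register. The same shape is sketched below without any Alpha asm (everything here is illustrative):

#include <stdio.h>

/* paste the register number into the text at compile time, just as the
   STT/LDT macros above paste it into the asm mnemonic */
#define DESCRIBE(reg)	puts("would access $f" #reg)

static void describe_fp_reg(unsigned long reg)
{
	switch (reg) {
	case 0:  DESCRIBE(0);  break;
	case 1:  DESCRIBE(1);  break;
	case 2:  DESCRIBE(2);  break;
	/* ... cases 3 through 30 follow the same pattern ... */
	case 31: DESCRIBE(31); break;
	default: break;		/* out-of-range register number: do nothing */
	}
}

int main(void)
{
	describe_fp_reg(2);
	describe_fp_reg(31);
	return 0;
}
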
diff --git a/arch/alpha/lib/memchr.S b/arch/alpha/lib/memchr.S
new file mode 100644
index 000000000000..14427eeb555e
--- /dev/null
+++ b/arch/alpha/lib/memchr.S
@@ -0,0 +1,164 @@
1/* Copyright (C) 1996 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by David Mosberger (davidm@cs.arizona.edu).
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
14
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
19
20/* Finds a character in a memory area.  Optimized for the Alpha:
21
22 - memory accessed as aligned quadwords only
23 - uses cmpbge to compare 8 bytes in parallel
24 - does binary search to find 0 byte in last
25 quadword (HAKMEM needed 12 instructions to
26 do this instead of the 9 instructions that
27 binary search needs).
28
29For correctness consider that:
30
31 - only minimum number of quadwords may be accessed
32 - the third argument is an unsigned long
33*/
34
35 .set noreorder
36 .set noat
37
38 .globl memchr
39 .ent memchr
40memchr:
41 .frame $30,0,$26,0
42 .prologue 0
43
44 # Hack -- if someone passes in (size_t)-1, hoping to just
45 # search til the end of the address space, we will overflow
46 # below when we find the address of the last byte. Given
47 # that we will never have a 56-bit address space, cropping
48 # the length is the easiest way to avoid trouble.
49 zap $18, 0x80, $5 #-e0 :
50
51 beq $18, $not_found # .. e1 :
52 ldq_u $1, 0($16) # e1 : load first quadword
53 insbl $17, 1, $2 # .. e0 : $2 = 000000000000ch00
54 and $17, 0xff, $17 #-e0 : $17 = 00000000000000ch
55 cmpult $18, 9, $4 # .. e1 :
56 or $2, $17, $17 # e0 : $17 = 000000000000chch
57 lda $3, -1($31) # .. e1 :
58 sll $17, 16, $2 #-e0 : $2 = 00000000chch0000
59 addq $16, $5, $5 # .. e1 :
60 or $2, $17, $17 # e1 : $17 = 00000000chchchch
61 unop # :
62 sll $17, 32, $2 #-e0 : $2 = chchchch00000000
63 or $2, $17, $17 # e1 : $17 = chchchchchchchch
64 extql $1, $16, $7 # e0 :
65 beq $4, $first_quad # .. e1 :
66
67 ldq_u $6, -1($5) #-e1 : eight or less bytes to search
68 extqh $6, $16, $6 # .. e0 :
69 mov $16, $0 # e0 :
70 or $7, $6, $1 # .. e1 : $1 = quadword starting at $16
71
72 # Deal with the case where at most 8 bytes remain to be searched
73 # in $1. E.g.:
74 # $18 = 6
75 # $1 = ????c6c5c4c3c2c1
76$last_quad:
77 negq $18, $6 #-e0 :
78 xor $17, $1, $1 # .. e1 :
79 srl $3, $6, $6 # e0 : $6 = mask of $18 bits set
80 cmpbge $31, $1, $2 # .. e1 :
81 and $2, $6, $2 #-e0 :
82 beq $2, $not_found # .. e1 :
83
84$found_it:
85 # Now, determine which byte matched:
86 negq $2, $3 # e0 :
87 and $2, $3, $2 # e1 :
88
89 and $2, 0x0f, $1 #-e0 :
90 addq $0, 4, $3 # .. e1 :
91 cmoveq $1, $3, $0 # e0 :
92
93 addq $0, 2, $3 # .. e1 :
94 and $2, 0x33, $1 #-e0 :
95 cmoveq $1, $3, $0 # .. e1 :
96
97 and $2, 0x55, $1 # e0 :
98 addq $0, 1, $3 # .. e1 :
99 cmoveq $1, $3, $0 #-e0 :
100
101$done: ret # .. e1 :
102
103 # Deal with the case where $18 > 8 bytes remain to be
104 # searched. $16 may not be aligned.
105 .align 4
106$first_quad:
107 andnot $16, 0x7, $0 #-e1 :
108 insqh $3, $16, $2 # .. e0 : $2 = 0000ffffffffffff ($16<0:2> ff)
109 xor $1, $17, $1 # e0 :
110 or $1, $2, $1 # e1 : $1 = ====ffffffffffff
111 cmpbge $31, $1, $2 #-e0 :
112 bne $2, $found_it # .. e1 :
113
114 # At least one byte left to process.
115
116 ldq $1, 8($0) # e0 :
117 subq $5, 1, $18 # .. e1 :
118 addq $0, 8, $0 #-e0 :
119
120 # Make $18 point to last quad to be accessed (the
121 # last quad may or may not be partial).
122
123 andnot $18, 0x7, $18 # .. e1 :
124 cmpult $0, $18, $2 # e0 :
125 beq $2, $final # .. e1 :
126
127 # At least two quads remain to be accessed.
128
129 subq $18, $0, $4 #-e0 : $4 <- nr quads to be processed
130 and $4, 8, $4 # e1 : odd number of quads?
131 bne $4, $odd_quad_count # e1 :
132
133 # At least three quads remain to be accessed
134
135 mov $1, $4 # e0 : move prefetched value to correct reg
136
137 .align 4
138$unrolled_loop:
139 ldq $1, 8($0) #-e0 : prefetch $1
140 xor $17, $4, $2 # .. e1 :
141 cmpbge $31, $2, $2 # e0 :
142 bne $2, $found_it # .. e1 :
143
144 addq $0, 8, $0 #-e0 :
145$odd_quad_count:
146 xor $17, $1, $2 # .. e1 :
147 ldq $4, 8($0) # e0 : prefetch $4
148 cmpbge $31, $2, $2 # .. e1 :
149 addq $0, 8, $6 #-e0 :
150 bne $2, $found_it # .. e1 :
151
152 cmpult $6, $18, $6 # e0 :
153 addq $0, 8, $0 # .. e1 :
154 bne $6, $unrolled_loop #-e1 :
155
156 mov $4, $1 # e0 : move prefetched value into $1
157$final: subq $5, $0, $18 # .. e1 : $18 <- number of bytes left to do
158 bne $18, $last_quad # e1 :
159
160$not_found:
161 mov $31, $0 #-e0 :
162 ret # .. e1 :
163
164 .end memchr
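
The zap at the top of memchr is worth a second look: with a length like (size_t)-1 the end-address computation would wrap, so the top byte of the length is cleared first (Alpha addresses never need more than 56 bits). A tiny model of that crop, with made-up values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t start = 0x0000000120000000ull;	/* some buffer address */
	uint64_t len = ~0ull;			/* "search to the end of memory" */

	uint64_t cropped = len & 0x00ffffffffffffffull;	/* zap byte 7 */
	uint64_t end = start + cropped;		/* no longer wraps past zero */

	printf("end of search window: 0x%016llx\n", (unsigned long long)end);
	return 0;
}
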
diff --git a/arch/alpha/lib/memcpy.c b/arch/alpha/lib/memcpy.c
new file mode 100644
index 000000000000..64083fc73238
--- /dev/null
+++ b/arch/alpha/lib/memcpy.c
@@ -0,0 +1,163 @@
1/*
2 * linux/arch/alpha/lib/memcpy.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 */
6
7/*
8 * This is a reasonably optimized memcpy() routine.
9 */
10
11/*
12 * Note that the C code is written to be optimized into good assembly. However,
13 * at this point gcc is unable to sanely compile "if (n >= 0)", resulting in an
14 * explicit compare against 0 (instead of just using the proper "blt reg, xx" or
15 * "bge reg, xx"). I hope alpha-gcc will be fixed to notice this eventually..
16 */
17
18#include <linux/types.h>
19
20/*
21 * This should be done in one go with ldq_u*2/mask/stq_u. Do it
22 * with a macro so that we can fix it up later..
23 */
24#define ALIGN_DEST_TO8_UP(d,s,n) \
25 while (d & 7) { \
26 if (n <= 0) return; \
27 n--; \
28 *(char *) d = *(char *) s; \
29 d++; s++; \
30 }
31#define ALIGN_DEST_TO8_DN(d,s,n) \
32 while (d & 7) { \
33 if (n <= 0) return; \
34 n--; \
35 d--; s--; \
36 *(char *) d = *(char *) s; \
37 }
38
39/*
40 * This should similarly be done with ldq_u*2/mask/stq. The destination
41 * is aligned, but we don't fill in a full quad-word
42 */
43#define DO_REST_UP(d,s,n) \
44 while (n > 0) { \
45 n--; \
46 *(char *) d = *(char *) s; \
47 d++; s++; \
48 }
49#define DO_REST_DN(d,s,n) \
50 while (n > 0) { \
51 n--; \
52 d--; s--; \
53 *(char *) d = *(char *) s; \
54 }
55
56/*
57 * This should be done with ldq/mask/stq. The source and destination are
58 * aligned, but we don't fill in a full quad-word
59 */
60#define DO_REST_ALIGNED_UP(d,s,n) DO_REST_UP(d,s,n)
61#define DO_REST_ALIGNED_DN(d,s,n) DO_REST_DN(d,s,n)
62
63/*
64 * This does unaligned memory copies. We want to avoid storing to
65 * an unaligned address, as that would do a read-modify-write cycle.
66 * We also want to avoid double-reading the unaligned reads.
67 *
68 * Note the ordering to try to avoid load (and address generation) latencies.
69 */
70static inline void __memcpy_unaligned_up (unsigned long d, unsigned long s,
71 long n)
72{
73 ALIGN_DEST_TO8_UP(d,s,n);
74 n -= 8; /* to avoid compare against 8 in the loop */
75 if (n >= 0) {
76 unsigned long low_word, high_word;
77 __asm__("ldq_u %0,%1":"=r" (low_word):"m" (*(unsigned long *) s));
78 do {
79 unsigned long tmp;
80 __asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long *)(s+8)));
81 n -= 8;
82 __asm__("extql %1,%2,%0"
83 :"=r" (low_word)
84 :"r" (low_word), "r" (s));
85 __asm__("extqh %1,%2,%0"
86 :"=r" (tmp)
87 :"r" (high_word), "r" (s));
88 s += 8;
89 *(unsigned long *) d = low_word | tmp;
90 d += 8;
91 low_word = high_word;
92 } while (n >= 0);
93 }
94 n += 8;
95 DO_REST_UP(d,s,n);
96}
97
98static inline void __memcpy_unaligned_dn (unsigned long d, unsigned long s,
99 long n)
100{
101 /* I don't understand AXP assembler well enough for this. -Tim */
102 s += n;
103 d += n;
104 while (n--)
105 * (char *) --d = * (char *) --s;
106}
107
108/*
109 * Hmm.. Strange. The __asm__ here is there to make gcc use an integer register
110 * for the load-store. I don't know why, but it would seem that using a floating
111 * point register for the move slows things down (very small difference,
112 * though).
113 *
114 * Note the ordering to try to avoid load (and address generation) latencies.
115 */
116static inline void __memcpy_aligned_up (unsigned long d, unsigned long s,
117 long n)
118{
119 ALIGN_DEST_TO8_UP(d,s,n);
120 n -= 8;
121 while (n >= 0) {
122 unsigned long tmp;
123 __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
124 n -= 8;
125 s += 8;
126 *(unsigned long *) d = tmp;
127 d += 8;
128 }
129 n += 8;
130 DO_REST_ALIGNED_UP(d,s,n);
131}
132static inline void __memcpy_aligned_dn (unsigned long d, unsigned long s,
133 long n)
134{
135 s += n;
136 d += n;
137 ALIGN_DEST_TO8_DN(d,s,n);
138 n -= 8;
139 while (n >= 0) {
140 unsigned long tmp;
141 s -= 8;
142 __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
143 n -= 8;
144 d -= 8;
145 *(unsigned long *) d = tmp;
146 }
147 n += 8;
148 DO_REST_ALIGNED_DN(d,s,n);
149}
150
151void * memcpy(void * dest, const void *src, size_t n)
152{
153 if (!(((unsigned long) dest ^ (unsigned long) src) & 7)) {
154 __memcpy_aligned_up ((unsigned long) dest, (unsigned long) src,
155 n);
156 return dest;
157 }
158 __memcpy_unaligned_up ((unsigned long) dest, (unsigned long) src, n);
159 return dest;
160}
161
162/* For backward modules compatibility, define __memcpy. */
163asm("__memcpy = memcpy; .globl __memcpy");
diff --git a/arch/alpha/lib/memmove.S b/arch/alpha/lib/memmove.S
new file mode 100644
index 000000000000..eb3b6e02242f
--- /dev/null
+++ b/arch/alpha/lib/memmove.S
@@ -0,0 +1,181 @@
1/*
2 * arch/alpha/lib/memmove.S
3 *
4 * Barely optimized memmove routine for Alpha EV5.
5 *
6 * This is hand-massaged output from the original memcpy.c. We defer to
7 * memcpy whenever possible; the backwards copy loops are not unrolled.
8 */
9
10 .set noat
11 .set noreorder
12 .text
13
14 .align 4
15 .globl memmove
16 .ent memmove
17memmove:
18 ldgp $29, 0($27)
19 unop
20 nop
21 .prologue 1
22
23 addq $16,$18,$4
24 addq $17,$18,$5
25 cmpule $4,$17,$1 /* dest + n <= src */
26 cmpule $5,$16,$2 /* dest >= src + n */
27
28 bis $1,$2,$1
29 mov $16,$0
30 xor $16,$17,$2
31 bne $1,memcpy !samegp
32
33 and $2,7,$2 /* Test for src/dest co-alignment. */
34 and $16,7,$1
35 cmpule $16,$17,$3
36 bne $3,$memmove_up /* dest < src */
37
38 and $4,7,$1
39 bne $2,$misaligned_dn
40 unop
41 beq $1,$skip_aligned_byte_loop_head_dn
42
43$aligned_byte_loop_head_dn:
44 lda $4,-1($4)
45 lda $5,-1($5)
46 unop
47 ble $18,$egress
48
49 ldq_u $3,0($5)
50 ldq_u $2,0($4)
51 lda $18,-1($18)
52 extbl $3,$5,$1
53
54 insbl $1,$4,$1
55 mskbl $2,$4,$2
56 bis $1,$2,$1
57 and $4,7,$6
58
59 stq_u $1,0($4)
60 bne $6,$aligned_byte_loop_head_dn
61
62$skip_aligned_byte_loop_head_dn:
63 lda $18,-8($18)
64 blt $18,$skip_aligned_word_loop_dn
65
66$aligned_word_loop_dn:
67 ldq $1,-8($5)
68 nop
69 lda $5,-8($5)
70 lda $18,-8($18)
71
72 stq $1,-8($4)
73 nop
74 lda $4,-8($4)
75 bge $18,$aligned_word_loop_dn
76
77$skip_aligned_word_loop_dn:
78 lda $18,8($18)
79 bgt $18,$byte_loop_tail_dn
80 unop
81 ret $31,($26),1
82
83 .align 4
84$misaligned_dn:
85 nop
86 fnop
87 unop
88 beq $18,$egress
89
90$byte_loop_tail_dn:
91 ldq_u $3,-1($5)
92 ldq_u $2,-1($4)
93 lda $5,-1($5)
94 lda $4,-1($4)
95
96 lda $18,-1($18)
97 extbl $3,$5,$1
98 insbl $1,$4,$1
99 mskbl $2,$4,$2
100
101 bis $1,$2,$1
102 stq_u $1,0($4)
103 bgt $18,$byte_loop_tail_dn
104 br $egress
105
106$memmove_up:
107 mov $16,$4
108 mov $17,$5
109 bne $2,$misaligned_up
110 beq $1,$skip_aligned_byte_loop_head_up
111
112$aligned_byte_loop_head_up:
113 unop
114 ble $18,$egress
115 ldq_u $3,0($5)
116 ldq_u $2,0($4)
117
118 lda $18,-1($18)
119 extbl $3,$5,$1
120 insbl $1,$4,$1
121 mskbl $2,$4,$2
122
123 bis $1,$2,$1
124 lda $5,1($5)
125 stq_u $1,0($4)
126 lda $4,1($4)
127
128 and $4,7,$6
129 bne $6,$aligned_byte_loop_head_up
130
131$skip_aligned_byte_loop_head_up:
132 lda $18,-8($18)
133 blt $18,$skip_aligned_word_loop_up
134
135$aligned_word_loop_up:
136 ldq $1,0($5)
137 nop
138 lda $5,8($5)
139 lda $18,-8($18)
140
141 stq $1,0($4)
142 nop
143 lda $4,8($4)
144 bge $18,$aligned_word_loop_up
145
146$skip_aligned_word_loop_up:
147 lda $18,8($18)
148 bgt $18,$byte_loop_tail_up
149 unop
150 ret $31,($26),1
151
152 .align 4
153$misaligned_up:
154 nop
155 fnop
156 unop
157 beq $18,$egress
158
159$byte_loop_tail_up:
160 ldq_u $3,0($5)
161 ldq_u $2,0($4)
162 lda $18,-1($18)
163 extbl $3,$5,$1
164
165 insbl $1,$4,$1
166 mskbl $2,$4,$2
167 bis $1,$2,$1
168 stq_u $1,0($4)
169
170 lda $5,1($5)
171 lda $4,1($4)
172 nop
173 bgt $18,$byte_loop_tail_up
174
175$egress:
176 ret $31,($26),1
177 nop
178 nop
179 nop
180
181 .end memmove
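
The prologue's two cmpule tests simply check that the regions are disjoint; if they are, memmove tail-calls memcpy, and only otherwise does it fall into the byte/word loops above. A C sketch of that dispatch, assuming a flat address space (memmove_sketch is a hypothetical name, not a kernel symbol):

#include <stddef.h>
#include <string.h>

void *memmove_sketch(void *dest, const void *src, size_t n)
{
        char *d = dest;
        const char *s = src;

        if (d + n <= s || s + n <= d)           /* the cmpule tests: no overlap */
                return memcpy(dest, src, n);    /* defer to the fast routine */

        if (d < s) {                            /* overlapping, copy forwards */
                while (n--)
                        *d++ = *s++;
        } else {                                /* overlapping, copy backwards */
                while (n--)
                        d[n] = s[n];
        }
        return dest;
}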
diff --git a/arch/alpha/lib/memset.S b/arch/alpha/lib/memset.S
new file mode 100644
index 000000000000..8ff6e7e1773e
--- /dev/null
+++ b/arch/alpha/lib/memset.S
@@ -0,0 +1,124 @@
1/*
2 * linux/arch/alpha/memset.S
3 *
4 * This is an efficient (and small) implementation of the C library "memset()"
5 * function for the alpha.
6 *
7 * (C) Copyright 1996 Linus Torvalds
8 *
9 * This routine is "moral-ware": you are free to use it any way you wish, and
10 * the only obligation I put on you is a moral one: if you make any improvements
11 * to the routine, please send me your improvements for me to use similarly.
12 *
13 * The scheduling comments are according to the EV5 documentation (and done by
14 * hand, so they might well be incorrect, please do tell me about it..)
15 */
16
17 .set noat
18 .set noreorder
19.text
20 .globl memset
21 .globl __memset
22 .globl __memsetw
23 .globl __constant_c_memset
24 .ent __memset
25.align 5
26__memset:
27 .frame $30,0,$26,0
28 .prologue 0
29
30 and $17,255,$1 /* E1 */
31 insbl $17,1,$17 /* .. E0 */
32 bis $17,$1,$17 /* E0 (p-c latency, next cycle) */
33 sll $17,16,$1 /* E1 (p-c latency, next cycle) */
34
35 bis $17,$1,$17 /* E0 (p-c latency, next cycle) */
36 sll $17,32,$1 /* E1 (p-c latency, next cycle) */
37 bis $17,$1,$17 /* E0 (p-c latency, next cycle) */
38 ldq_u $31,0($30) /* .. E1 */
39
40.align 5
41__constant_c_memset:
42 addq $18,$16,$6 /* E0 */
43 bis $16,$16,$0 /* .. E1 */
44 xor $16,$6,$1 /* E0 */
45 ble $18,end /* .. E1 */
46
47 bic $1,7,$1 /* E0 */
48 beq $1,within_one_quad /* .. E1 (note EV5 zero-latency forwarding) */
49 and $16,7,$3 /* E0 */
50 beq $3,aligned /* .. E1 (note EV5 zero-latency forwarding) */
51
52 ldq_u $4,0($16) /* E0 */
53 bis $16,$16,$5 /* .. E1 */
54 insql $17,$16,$2 /* E0 */
55 subq $3,8,$3 /* .. E1 */
56
57 addq $18,$3,$18 /* E0 $18 is new count ($3 is negative) */
58 mskql $4,$16,$4 /* .. E1 (and possible load stall) */
59 subq $16,$3,$16 /* E0 $16 is new aligned destination */
60 bis $2,$4,$1 /* .. E1 */
61
62 bis $31,$31,$31 /* E0 */
63 ldq_u $31,0($30) /* .. E1 */
64 stq_u $1,0($5) /* E0 */
65 bis $31,$31,$31 /* .. E1 */
66
67.align 4
68aligned:
69 sra $18,3,$3 /* E0 */
70 and $18,7,$18 /* .. E1 */
71 bis $16,$16,$5 /* E0 */
72 beq $3,no_quad /* .. E1 */
73
74.align 3
75loop:
76 stq $17,0($5) /* E0 */
77 subq $3,1,$3 /* .. E1 */
78 addq $5,8,$5 /* E0 */
79 bne $3,loop /* .. E1 */
80
81no_quad:
82 bis $31,$31,$31 /* E0 */
83 beq $18,end /* .. E1 */
84 ldq $7,0($5) /* E0 */
85 mskqh $7,$6,$2 /* .. E1 (and load stall) */
86
87 insqh $17,$6,$4 /* E0 */
88 bis $2,$4,$1 /* .. E1 */
89 stq $1,0($5) /* E0 */
90 ret $31,($26),1 /* .. E1 */
91
92.align 3
93within_one_quad:
94 ldq_u $1,0($16) /* E0 */
95 insql $17,$16,$2 /* E1 */
96 mskql $1,$16,$4 /* E0 (after load stall) */
97 bis $2,$4,$2 /* E0 */
98
99 mskql $2,$6,$4 /* E0 */
100 mskqh $1,$6,$2 /* .. E1 */
101 bis $2,$4,$1 /* E0 */
102 stq_u $1,0($16) /* E0 */
103
104end:
105 ret $31,($26),1 /* E1 */
106 .end __memset
107
108 .align 5
109 .ent __memsetw
110__memsetw:
111 .prologue 0
112
113 inswl $17,0,$1 /* E0 */
114 inswl $17,2,$2 /* E0 */
115 inswl $17,4,$3 /* E0 */
116 or $1,$2,$1 /* .. E1 */
117 inswl $17,6,$4 /* E0 */
118 or $1,$3,$1 /* .. E1 */
119 or $1,$4,$17 /* E0 */
120 br __constant_c_memset /* .. E1 */
121
122 .end __memsetw
123
124memset = __memset
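
The and/insbl/sll/bis sequence at the top of __memset only smears the fill byte across all eight byte lanes of $17 before __constant_c_memset stores whole quadwords. The same computation in C (a sketch; replicate_byte is a hypothetical helper, not a kernel symbol):

#include <stdint.h>

static uint64_t replicate_byte(uint64_t c)
{
        c &= 0xff;              /* and  $17,255,$1         */
        c |= c << 8;            /* insbl + bis: two copies */
        c |= c << 16;           /* sll 16 + bis: four      */
        c |= c << 32;           /* sll 32 + bis: eight     */
        return c;               /* e.g. 0x41 -> 0x4141414141414141 */
}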
diff --git a/arch/alpha/lib/srm_printk.c b/arch/alpha/lib/srm_printk.c
new file mode 100644
index 000000000000..31b53c49435e
--- /dev/null
+++ b/arch/alpha/lib/srm_printk.c
@@ -0,0 +1,41 @@
1/*
2 * arch/alpha/lib/srm_printk.c
3 */
4
5#include <linux/kernel.h>
6#include <asm/console.h>
7
8long
9srm_printk(const char *fmt, ...)
10{
11 static char buf[1024];
12 va_list args;
13 long len, num_lf;
14 char *src, *dst;
15
16 va_start(args, fmt);
17 len = vsprintf(buf, fmt, args);
18 va_end(args);
19
20 /* count number of linefeeds in string: */
21
22 num_lf = 0;
23 for (src = buf; *src; ++src) {
24 if (*src == '\n') {
25 ++num_lf;
26 }
27 }
28
29 if (num_lf) {
30 /* expand each linefeed into carriage-return/linefeed: */
31 for (dst = src + num_lf; src >= buf; ) {
32 if (*src == '\n') {
33 *dst-- = '\r';
34 }
35 *dst-- = *src--;
36 }
37 }
38
39 srm_puts(buf, num_lf+len);
40 return len;
41}
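
srm_printk walks the formatted buffer backwards so that every byte is copied before its old location can be overwritten, inserting a carriage return beside each linefeed as it goes, and then hands srm_puts the new length (len + num_lf). A stand-alone sketch of that backward expansion (hypothetical helper; it assumes buf has at least len + num_lf + 1 bytes of room, which the static 1024-byte buffer above provides in practice):

/* Sketch of the backward CR insertion done by srm_printk. */
static long expand_linefeeds(char *buf, long len, long num_lf)
{
        char *src = buf + len;          /* the terminating NUL */
        char *dst = src + num_lf;

        while (src >= buf) {
                if (*src == '\n')
                        *dst-- = '\r';  /* slot a CR in beside the LF */
                *dst-- = *src--;
        }
        return len + num_lf;            /* new length handed to srm_puts */
}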
diff --git a/arch/alpha/lib/srm_puts.c b/arch/alpha/lib/srm_puts.c
new file mode 100644
index 000000000000..7b60a6f75a78
--- /dev/null
+++ b/arch/alpha/lib/srm_puts.c
@@ -0,0 +1,23 @@
1/*
2 * arch/alpha/lib/srm_puts.c
3 */
4
5#include <linux/string.h>
6#include <asm/console.h>
7
8long
9srm_puts(const char *str, long len)
10{
11 long remaining, written;
12
13 if (!callback_init_done)
14 return len;
15
16 for (remaining = len; remaining > 0; remaining -= written)
17 {
18 written = callback_puts(0, str, remaining);
19 written &= 0xffffffff;
20 str += written;
21 }
22 return len;
23}
diff --git a/arch/alpha/lib/stacktrace.c b/arch/alpha/lib/stacktrace.c
new file mode 100644
index 000000000000..6d432e42aedc
--- /dev/null
+++ b/arch/alpha/lib/stacktrace.c
@@ -0,0 +1,103 @@
1#include <linux/kernel.h>
2#include <asm/system.h>
3
4typedef unsigned int instr;
5
6#define MAJOR_OP 0xfc000000
7#define LDA_OP 0x20000000
8#define STQ_OP 0xb4000000
9#define BR_OP 0xc0000000
10
11#define STK_ALLOC_1 0x23de8000 /* lda $30,-X($30) */
12#define STK_ALLOC_1M 0xffff8000
13#define STK_ALLOC_2 0x43c0153e /* subq $30,X,$30 */
14#define STK_ALLOC_2M 0xffe01fff
15
16#define MEM_REG 0x03e00000
17#define MEM_BASE 0x001f0000
18#define MEM_OFF 0x0000ffff
19#define MEM_OFF_SIGN 0x00008000
20#define BASE_SP 0x001e0000
21
22#define STK_ALLOC_MATCH(INSTR) \
23 (((INSTR) & STK_ALLOC_1M) == STK_ALLOC_1 \
24 || ((INSTR) & STK_ALLOC_2M) == STK_ALLOC_2)
25#define STK_PUSH_MATCH(INSTR) \
26 (((INSTR) & (MAJOR_OP | MEM_BASE | MEM_OFF_SIGN)) == (STQ_OP | BASE_SP))
27#define MEM_OP_OFFSET(INSTR) \
28 (((long)((INSTR) & MEM_OFF) << 48) >> 48)
29#define MEM_OP_REG(INSTR) \
30 (((INSTR) & MEM_REG) >> 22)
31
32/* Branches, jumps, PAL calls, and illegal opcodes end a basic block. */
33#define BB_END(INSTR) \
34 (((instr)(INSTR) >= BR_OP) | ((instr)(INSTR) < LDA_OP) | \
35 ((((instr)(INSTR) ^ 0x60000000) < 0x20000000) & \
36 (((instr)(INSTR) & 0x0c000000) != 0)))
37
38#define IS_KERNEL_TEXT(PC) ((unsigned long)(PC) > START_ADDR)
39
40static char reg_name[][4] = {
41 "v0 ", "t0 ", "t1 ", "t2 ", "t3 ", "t4 ", "t5 ", "t6 ", "t7 ",
42 "s0 ", "s1 ", "s2 ", "s3 ", "s4 ", "s5 ", "s6 ", "a0 ", "a1 ",
43 "a2 ", "a3 ", "a4 ", "a5 ", "t8 ", "t9 ", "t10", "t11", "ra ",
44 "pv ", "at ", "gp ", "sp ", "0"
45};
46
47
48static instr *
49display_stored_regs(instr * pro_pc, unsigned char * sp)
50{
51 instr * ret_pc = 0;
52 int reg;
53 unsigned long value;
54
55 printk("Prologue [<%p>], Frame %p:\n", pro_pc, sp);
56 while (!BB_END(*pro_pc))
57 if (STK_PUSH_MATCH(*pro_pc)) {
58 reg = (*pro_pc & MEM_REG) >> 21;
59 value = *(unsigned long *)(sp + (*pro_pc & MEM_OFF));
60 if (reg == 26)
61 ret_pc = (instr *)value;
62 printk("\t\t%s / 0x%016lx\n", reg_name[reg], value);
63 }
64 return ret_pc;
65}
66
67static instr *
68seek_prologue(instr * pc)
69{
70 while (!STK_ALLOC_MATCH(*pc))
71 --pc;
72 while (!BB_END(*(pc - 1)))
73 --pc;
74 return pc;
75}
76
77static long
78stack_increment(instr * prologue_pc)
79{
80 while (!STK_ALLOC_MATCH(*prologue_pc))
81 ++prologue_pc;
82
83 /* Count the bytes allocated. */
84 if ((*prologue_pc & STK_ALLOC_1M) == STK_ALLOC_1M)
85 return -(((long)(*prologue_pc) << 48) >> 48);
86 else
87 return (*prologue_pc >> 13) & 0xff;
88}
89
90void
91stacktrace(void)
92{
93 instr * ret_pc;
94 instr * prologue = (instr *)stacktrace;
95 register unsigned char * sp __asm__ ("$30");
96
97 printk("\tstack trace:\n");
98 do {
99 ret_pc = display_stored_regs(prologue, sp);
100 sp += stack_increment(prologue);
101 prologue = seek_prologue(ret_pc);
102 } while (IS_KERNEL_TEXT(ret_pc));
103}
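
stack_increment() recovers the frame size by decoding whichever stack-adjust form the prologue used: the signed 16-bit displacement of "lda $30,-X($30)" or the 8-bit literal operand of "subq $30,X,$30". A sketch of that field extraction for a single instruction word, assuming the word has already passed STK_ALLOC_MATCH (decode_stack_bytes is a hypothetical name):

#include <stdint.h>

/* Field extraction only; 0xfc000000/0x20000000 are the MAJOR_OP/LDA_OP
   values defined above. */
static long decode_stack_bytes(uint32_t insn)
{
        if ((insn & 0xfc000000u) == 0x20000000u)        /* lda $30,-X($30)      */
                return -(long)(int16_t)(insn & 0xffff); /* signed 16-bit disp   */
        return (insn >> 13) & 0xff;                     /* subq literal, bits 20..13 */
}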
diff --git a/arch/alpha/lib/strcasecmp.c b/arch/alpha/lib/strcasecmp.c
new file mode 100644
index 000000000000..4e57a216feaf
--- /dev/null
+++ b/arch/alpha/lib/strcasecmp.c
@@ -0,0 +1,26 @@
1/*
2 * linux/arch/alpha/lib/strcasecmp.c
3 */
4
5#include <linux/string.h>
6
7
8/* We handle nothing here except the C locale. Since this is used in
9 only one place, on strings known to contain only 7 bit ASCII, this
10 is ok. */
11
12int strcasecmp(const char *a, const char *b)
13{
14 int ca, cb;
15
16 do {
17 ca = *a++ & 0xff;
18 cb = *b++ & 0xff;
19 if (ca >= 'A' && ca <= 'Z')
20 ca += 'a' - 'A';
21 if (cb >= 'A' && cb <= 'Z')
22 cb += 'a' - 'A';
23 } while (ca == cb && ca != '\0');
24
25 return ca - cb;
26}
diff --git a/arch/alpha/lib/strcat.S b/arch/alpha/lib/strcat.S
new file mode 100644
index 000000000000..393f50384878
--- /dev/null
+++ b/arch/alpha/lib/strcat.S
@@ -0,0 +1,52 @@
1/*
2 * arch/alpha/lib/strcat.S
3 * Contributed by Richard Henderson (rth@tamu.edu)
4 *
5 * Append a null-terminated string from SRC to DST.
6 */
7
8 .text
9
10 .align 3
11 .globl strcat
12 .ent strcat
13strcat:
14 .frame $30, 0, $26
15 .prologue 0
16
17 mov $16, $0 # set up return value
18
19 /* Find the end of the string. */
20
21 ldq_u $1, 0($16) # load first quadword (a0 may be misaligned)
22 lda $2, -1
23 insqh $2, $16, $2
24 andnot $16, 7, $16
25 or $2, $1, $1
26 cmpbge $31, $1, $2 # bits set iff byte == 0
27 bne $2, $found
28
29$loop: ldq $1, 8($16)
30 addq $16, 8, $16
31 cmpbge $31, $1, $2
32 beq $2, $loop
33
34$found: negq $2, $3 # clear all but least set bit
35 and $2, $3, $2
36
37 and $2, 0xf0, $3 # binary search for that set bit
38 and $2, 0xcc, $4
39 and $2, 0xaa, $5
40 cmovne $3, 4, $3
41 cmovne $4, 2, $4
42 cmovne $5, 1, $5
43 addq $3, $4, $3
44 addq $16, $5, $16
45 addq $16, $3, $16
46
47 /* Now do the append. */
48
49 mov $26, $23
50 br __stxcpy
51
52 .end strcat
diff --git a/arch/alpha/lib/strchr.S b/arch/alpha/lib/strchr.S
new file mode 100644
index 000000000000..011a175e8329
--- /dev/null
+++ b/arch/alpha/lib/strchr.S
@@ -0,0 +1,70 @@
1/*
2 * arch/alpha/lib/strchr.S
3 * Contributed by Richard Henderson (rth@tamu.edu)
4 *
5 * Return the address of a given character within a null-terminated
6 * string, or null if it is not found.
7 */
8
9#include <asm/regdef.h>
10
11 .set noreorder
12 .set noat
13
14 .align 3
15 .globl strchr
16 .ent strchr
17strchr:
18 .frame sp, 0, ra
19 .prologue 0
20
21 zapnot a1, 1, a1 # e0 : zero extend the search character
22 ldq_u t0, 0(a0) # .. e1 : load first quadword
23 sll a1, 8, t5 # e0 : replicate the search character
24 andnot a0, 7, v0 # .. e1 : align our loop pointer
25 or t5, a1, a1 # e0 :
26 lda t4, -1 # .. e1 : build garbage mask
27 sll a1, 16, t5 # e0 :
28 cmpbge zero, t0, t2 # .. e1 : bits set iff byte == zero
29 mskqh t4, a0, t4 # e0 :
30 or t5, a1, a1 # .. e1 :
31 sll a1, 32, t5 # e0 :
32 cmpbge zero, t4, t4 # .. e1 : bits set iff byte is garbage
33 or t5, a1, a1 # e0 :
34 xor t0, a1, t1 # .. e1 : make bytes == c zero
35 cmpbge zero, t1, t3 # e0 : bits set iff byte == c
36 or t2, t3, t0 # e1 : bits set iff char match or zero match
37 andnot t0, t4, t0 # e0 : clear garbage bits
38 bne t0, $found # .. e1 (zdb)
39
40$loop: ldq t0, 8(v0) # e0 :
41 addq v0, 8, v0 # .. e1 :
42 nop # e0 :
43 xor t0, a1, t1 # .. e1 (ev5 data stall)
44 cmpbge zero, t0, t2 # e0 : bits set iff byte == 0
45 cmpbge zero, t1, t3 # .. e1 : bits set iff byte == c
46 or t2, t3, t0 # e0 :
47 beq t0, $loop # .. e1 (zdb)
48
49$found: negq t0, t1 # e0 : clear all but least set bit
50 and t0, t1, t0 # e1 (stall)
51
52 and t0, t3, t1 # e0 : bit set iff byte was the char
53 beq t1, $retnull # .. e1 (zdb)
54
55 and t0, 0xf0, t2 # e0 : binary search for that set bit
56 and t0, 0xcc, t3 # .. e1 :
57 and t0, 0xaa, t4 # e0 :
58 cmovne t2, 4, t2 # .. e1 :
59 cmovne t3, 2, t3 # e0 :
60 cmovne t4, 1, t4 # .. e1 :
61 addq t2, t3, t2 # e0 :
62 addq v0, t4, v0 # .. e1 :
63 addq v0, t2, v0 # e0 :
64 ret # .. e1 :
65
66$retnull:
67 mov zero, v0 # e0 :
68 ret # .. e1 :
69
70 .end strchr
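
The setup code replicates the search character into all eight byte lanes of a1 and then, on every iteration, xors it with the quadword so that matching bytes become zero; a pair of cmpbge instructions then flags both matches and the terminator in one pass. A C model of one such step (a sketch: zero_byte_mask stands in for "cmpbge zero, x", which the hardware computes in a single instruction):

#include <stdint.h>

/* One bit per byte of x that is zero -- the effect of "cmpbge zero, x". */
static unsigned zero_byte_mask(uint64_t x)
{
        unsigned mask = 0;
        for (int i = 0; i < 8; i++)
                if (((x >> (8 * i)) & 0xff) == 0)
                        mask |= 1u << i;
        return mask;
}

/* One strchr loop step over an aligned quadword. */
static unsigned strchr_step(uint64_t quad, uint64_t c_in_every_byte)
{
        unsigned eos   = zero_byte_mask(quad);                      /* '\0' bytes */
        unsigned match = zero_byte_mask(quad ^ c_in_every_byte);    /* bytes == c */
        return eos | match;     /* non-zero: stop and isolate the lowest set bit */
}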
diff --git a/arch/alpha/lib/strcpy.S b/arch/alpha/lib/strcpy.S
new file mode 100644
index 000000000000..e0728e4ad21f
--- /dev/null
+++ b/arch/alpha/lib/strcpy.S
@@ -0,0 +1,23 @@
1/*
2 * arch/alpha/lib/strcpy.S
3 * Contributed by Richard Henderson (rth@tamu.edu)
4 *
5 * Copy a null-terminated string from SRC to DST. Return a pointer
6 * to the null-terminator in the source.
7 */
8
9 .text
10
11 .align 3
12 .globl strcpy
13 .ent strcpy
14strcpy:
15 .frame $30, 0, $26
16 .prologue 0
17
18 mov $16, $0 # set up return value
19 mov $26, $23 # set up return address
20 unop
21 br __stxcpy # do the copy
22
23 .end strcpy
diff --git a/arch/alpha/lib/strlen.S b/arch/alpha/lib/strlen.S
new file mode 100644
index 000000000000..fe63353de152
--- /dev/null
+++ b/arch/alpha/lib/strlen.S
@@ -0,0 +1,57 @@
1/*
2 * strlen.S (c) 1995 David Mosberger (davidm@cs.arizona.edu)
3 *
4 * Finds length of a 0-terminated string. Optimized for the
5 * Alpha architecture:
6 *
7 * - memory accessed as aligned quadwords only
 8 * - uses cmpbge to compare 8 bytes in parallel
9 * - does binary search to find 0 byte in last
10 * quadword (HAKMEM needed 12 instructions to
11 * do this instead of the 9 instructions that
12 * binary search needs).
13 */
14
15 .set noreorder
16 .set noat
17
18 .align 3
19
20 .globl strlen
21 .ent strlen
22
23strlen:
24 ldq_u $1, 0($16) # load first quadword ($16 may be misaligned)
25 lda $2, -1($31)
26 insqh $2, $16, $2
27 andnot $16, 7, $0
28 or $2, $1, $1
29 cmpbge $31, $1, $2 # $2 <- bitmask: bit i == 1 <==> i-th byte == 0
30 bne $2, found
31
32loop: ldq $1, 8($0)
33 addq $0, 8, $0 # addr += 8
34 nop # helps dual issue last two insns
35 cmpbge $31, $1, $2
36 beq $2, loop
37
38found: blbs $2, done # make aligned case fast
39 negq $2, $3
40 and $2, $3, $2
41
42 and $2, 0x0f, $1
43 addq $0, 4, $3
44 cmoveq $1, $3, $0
45
46 and $2, 0x33, $1
47 addq $0, 2, $3
48 cmoveq $1, $3, $0
49
50 and $2, 0x55, $1
51 addq $0, 1, $3
52 cmoveq $1, $3, $0
53
54done: subq $0, $16, $0
55 ret $31, ($26)
56
57 .end strlen
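
The "binary search" in the header comment is the three cmoveq steps after $found: once the lowest set bit of the cmpbge mask is isolated, testing it against 0x0f, 0x33 and 0x55 yields the byte index in three conditional adds (strcat and friends do the same with the complementary 0xf0/0xcc/0xaa masks and cmovne). The same computation in C (a sketch; first_zero_byte is a hypothetical name):

static unsigned long first_zero_byte(unsigned mask)
{
        unsigned bit = mask & -mask;    /* negq/and: isolate the lowest set bit */
        unsigned long idx = 0;

        if (!(bit & 0x0f)) idx += 4;    /* bit lies in the upper nibble      */
        if (!(bit & 0x33)) idx += 2;    /* bit lies in byte 2, 3, 6 or 7     */
        if (!(bit & 0x55)) idx += 1;    /* bit lies in an odd byte position  */
        return idx;                     /* strlen adds this to the aligned base */
}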
diff --git a/arch/alpha/lib/strlen_user.S b/arch/alpha/lib/strlen_user.S
new file mode 100644
index 000000000000..508a18e96479
--- /dev/null
+++ b/arch/alpha/lib/strlen_user.S
@@ -0,0 +1,91 @@
1/*
2 * arch/alpha/lib/strlen_user.S
3 *
4 * Return the length of the string including the NUL terminator
5 * (strlen+1) or zero if an error occurred.
6 *
7 * In places where it is critical to limit the processing time,
8 * and the data is not trusted, strnlen_user() should be used.
9 * It will return a value greater than its second argument if
10 * that limit would be exceeded. This implementation is allowed
11 * to access memory beyond the limit, but will not cross a page
12 * boundary when doing so.
13 */
14
15#include <asm/regdef.h>
16
17
18/* Allow an exception for an insn; exit if we get one. */
19#define EX(x,y...) \
20 99: x,##y; \
21 .section __ex_table,"a"; \
22 .long 99b - .; \
23 lda v0, $exception-99b(zero); \
24 .previous
25
26
27 .set noreorder
28 .set noat
29 .text
30
31 .globl __strlen_user
32 .ent __strlen_user
33 .frame sp, 0, ra
34
35 .align 3
36__strlen_user:
37 ldah a1, 32767(zero) # do not use plain strlen_user() for strings
38 # that might be almost 2 GB long; you should
39 # be using strnlen_user() instead
40
41 .globl __strnlen_user
42
43 .align 3
44__strnlen_user:
45 .prologue 0
46
47 EX( ldq_u t0, 0(a0) ) # load first quadword (a0 may be misaligned)
48 lda t1, -1(zero)
49 insqh t1, a0, t1
50 andnot a0, 7, v0
51 or t1, t0, t0
52 subq a0, 1, a0 # get our +1 for the return
53 cmpbge zero, t0, t1 # t1 <- bitmask: bit i == 1 <==> i-th byte == 0
54 subq a1, 7, t2
55 subq a0, v0, t0
56 bne t1, $found
57
58 addq t2, t0, t2
59 addq a1, 1, a1
60
61 .align 3
62$loop: ble t2, $limit
63 EX( ldq t0, 8(v0) )
64 subq t2, 8, t2
65 addq v0, 8, v0 # addr += 8
66 cmpbge zero, t0, t1
67 beq t1, $loop
68
69$found: negq t1, t2 # clear all but least set bit
70 and t1, t2, t1
71
72 and t1, 0xf0, t2 # binary search for that set bit
73 and t1, 0xcc, t3
74 and t1, 0xaa, t4
75 cmovne t2, 4, t2
76 cmovne t3, 2, t3
77 cmovne t4, 1, t4
78 addq t2, t3, t2
79 addq v0, t4, v0
80 addq v0, t2, v0
81 nop # dual issue next two on ev4 and ev5
82 subq v0, a0, v0
83$exception:
84 ret
85
86 .align 3 # currently redundant
87$limit:
88 subq a1, t2, v0
89 ret
90
91 .end __strlen_user
diff --git a/arch/alpha/lib/strncat.S b/arch/alpha/lib/strncat.S
new file mode 100644
index 000000000000..a8278163c972
--- /dev/null
+++ b/arch/alpha/lib/strncat.S
@@ -0,0 +1,84 @@
1/*
2 * arch/alpha/lib/strncat.S
3 * Contributed by Richard Henderson (rth@tamu.edu)
4 *
5 * Append no more than COUNT characters from the null-terminated string SRC
6 * to the null-terminated string DST. Always null-terminate the new DST.
7 *
8 * This differs slightly from the semantics in libc in that we never write
9 * past count, whereas libc may write to count+1. This follows the generic
10 * implementation in lib/string.c and is, IMHO, more sensible.
11 */
12
13 .text
14
15 .align 3
16 .globl strncat
17 .ent strncat
18strncat:
19 .frame $30, 0, $26
20 .prologue 0
21
22 mov $16, $0 # set up return value
23 beq $18, $zerocount
24
25 /* Find the end of the string. */
26
27 ldq_u $1, 0($16) # load first quadword ($16 may be misaligned)
28 lda $2, -1($31)
29 insqh $2, $16, $2
30 andnot $16, 7, $16
31 or $2, $1, $1
32 cmpbge $31, $1, $2 # bits set iff byte == 0
33 bne $2, $found
34
35$loop: ldq $1, 8($16)
36 addq $16, 8, $16
37 cmpbge $31, $1, $2
38 beq $2, $loop
39
40$found: negq $2, $3 # clear all but least set bit
41 and $2, $3, $2
42
43 and $2, 0xf0, $3 # binary search for that set bit
44 and $2, 0xcc, $4
45 and $2, 0xaa, $5
46 cmovne $3, 4, $3
47 cmovne $4, 2, $4
48 cmovne $5, 1, $5
49 addq $3, $4, $3
50 addq $16, $5, $16
51 addq $16, $3, $16
52
53 /* Now do the append. */
54
55 bsr $23, __stxncpy
56
57 /* Worry about the null termination. */
58
59 zapnot $1, $27, $2 # was last byte a null?
60 bne $2, 0f
61 ret
62
630: cmplt $27, $24, $2 # did we fill the buffer completely?
64 or $2, $18, $2
65 bne $2, 2f
66
67 and $24, 0x80, $2 # no zero next byte
68 bne $2, 1f
69
70 /* Here there are bytes left in the current word. Clear one. */
71 addq $24, $24, $24 # end-of-count bit <<= 1
722: zap $1, $24, $1
73 stq_u $1, 0($16)
74 ret
75
761: /* Here we must read the next DST word and clear the first byte. */
77 ldq_u $1, 8($16)
78 zap $1, 1, $1
79 stq_u $1, 8($16)
80
81$zerocount:
82 ret
83
84 .end strncat
diff --git a/arch/alpha/lib/strncpy.S b/arch/alpha/lib/strncpy.S
new file mode 100644
index 000000000000..338551c7113c
--- /dev/null
+++ b/arch/alpha/lib/strncpy.S
@@ -0,0 +1,81 @@
1/*
2 * arch/alpha/lib/strncpy.S
3 * Contributed by Richard Henderson (rth@tamu.edu)
4 *
5 * Copy no more than COUNT bytes of the null-terminated string from
6 * SRC to DST. If SRC does not cover all of COUNT, the balance is
7 * zeroed.
8 *
9 * Or, rather, if the kernel cared about that weird ANSI quirk. This
10 * version has cropped that bit o' nastiness as well as assuming that
11 * __stxncpy is in range of a branch.
12 */
13
14 .set noat
15 .set noreorder
16
17 .text
18
19 .align 4
20 .globl strncpy
21 .ent strncpy
22strncpy:
23 .frame $30, 0, $26
24 .prologue 0
25
26 mov $16, $0 # set return value now
27 beq $18, $zerolen
28 unop
29 bsr $23, __stxncpy # do the work of the copy
30
31 unop
32 bne $18, $multiword # do we have full words left?
33 subq $24, 1, $3 # nope
34 subq $27, 1, $4
35
36 or $3, $24, $3 # clear the bits between the last
37 or $4, $27, $4 # written byte and the last byte in COUNT
38 andnot $4, $3, $4
39 zap $1, $4, $1
40
41 stq_u $1, 0($16)
42 ret
43
44 .align 4
45$multiword:
46 subq $24, 1, $2 # clear the final bits in the prev word
47 or $2, $24, $2
48 zapnot $1, $2, $1
49 subq $18, 1, $18
50
51 stq_u $1, 0($16)
52 addq $16, 8, $16
53 unop
54 beq $18, 1f
55
56 nop
57 unop
58 nop
59 blbc $18, 0f
60
61 stq_u $31, 0($16) # zero one word
62 subq $18, 1, $18
63 addq $16, 8, $16
64 beq $18, 1f
65
660: stq_u $31, 0($16) # zero two words
67 subq $18, 2, $18
68 stq_u $31, 8($16)
69 addq $16, 16, $16
70 bne $18, 0b
71
721: ldq_u $1, 0($16) # clear the leading bits in the final word
73 subq $27, 1, $2
74 or $2, $27, $2
75
76 zap $1, $2, $1
77 stq_u $1, 0($16)
78$zerolen:
79 ret
80
81 .end strncpy
diff --git a/arch/alpha/lib/strncpy_from_user.S b/arch/alpha/lib/strncpy_from_user.S
new file mode 100644
index 000000000000..73ee21160ff7
--- /dev/null
+++ b/arch/alpha/lib/strncpy_from_user.S
@@ -0,0 +1,339 @@
1/*
2 * arch/alpha/lib/strncpy_from_user.S
3 * Contributed by Richard Henderson (rth@tamu.edu)
4 *
5 * Just like strncpy except in the return value:
6 *
7 * -EFAULT if an exception occurs before the terminator is copied.
8 * N if the buffer filled.
9 *
10 * Otherwise the length of the string is returned.
11 */
12
13
14#include <asm/errno.h>
15#include <asm/regdef.h>
16
17
18/* Allow an exception for an insn; exit if we get one. */
19#define EX(x,y...) \
20 99: x,##y; \
21 .section __ex_table,"a"; \
22 .long 99b - .; \
23 lda $31, $exception-99b($0); \
24 .previous
25
26
27 .set noat
28 .set noreorder
29 .text
30
31 .globl __strncpy_from_user
32 .ent __strncpy_from_user
33 .frame $30, 0, $26
34 .prologue 0
35
36 .align 3
37$aligned:
38 /* On entry to this basic block:
39 t0 == the first destination word for masking back in
40 t1 == the first source word. */
41
42 /* Create the 1st output word and detect 0's in the 1st input word. */
43 lda t2, -1 # e1 : build a mask against false zero
44 mskqh t2, a1, t2 # e0 : detection in the src word
45 mskqh t1, a1, t3 # e0 :
46 ornot t1, t2, t2 # .. e1 :
47 mskql t0, a1, t0 # e0 : assemble the first output word
48 cmpbge zero, t2, t8 # .. e1 : bits set iff null found
49 or t0, t3, t0 # e0 :
50 beq a2, $a_eoc # .. e1 :
51 bne t8, $a_eos # .. e1 :
52
53 /* On entry to this basic block:
54 t0 == a source word not containing a null. */
55
56$a_loop:
57 stq_u t0, 0(a0) # e0 :
58 addq a0, 8, a0 # .. e1 :
59 EX( ldq_u t0, 0(a1) ) # e0 :
60 addq a1, 8, a1 # .. e1 :
61 subq a2, 1, a2 # e0 :
62 cmpbge zero, t0, t8 # .. e1 (stall)
63 beq a2, $a_eoc # e1 :
64 beq t8, $a_loop # e1 :
65
66 /* Take care of the final (partial) word store. At this point
67 the end-of-count bit is set in t8 iff it applies.
68
69 On entry to this basic block we have:
70 t0 == the source word containing the null
71 t8 == the cmpbge mask that found it. */
72
73$a_eos:
74 negq t8, t12 # e0 : find low bit set
75 and t8, t12, t12 # e1 (stall)
76
77 /* For the sake of the cache, don't read a destination word
78 if we're not going to need it. */
79 and t12, 0x80, t6 # e0 :
80 bne t6, 1f # .. e1 (zdb)
81
82 /* We're doing a partial word store and so need to combine
83 our source and original destination words. */
84 ldq_u t1, 0(a0) # e0 :
85 subq t12, 1, t6 # .. e1 :
86 or t12, t6, t8 # e0 :
87 unop #
88 zapnot t0, t8, t0 # e0 : clear src bytes > null
89 zap t1, t8, t1 # .. e1 : clear dst bytes <= null
90 or t0, t1, t0 # e1 :
91
921: stq_u t0, 0(a0)
93 br $finish_up
94
95 /* Add the end-of-count bit to the eos detection bitmask. */
96$a_eoc:
97 or t10, t8, t8
98 br $a_eos
99
100 /*** The Function Entry Point ***/
101 .align 3
102__strncpy_from_user:
103 mov a0, v0 # save the string start
104 beq a2, $zerolength
105
106 /* Are source and destination co-aligned? */
107 xor a0, a1, t1 # e0 :
108 and a0, 7, t0 # .. e1 : find dest misalignment
109 and t1, 7, t1 # e0 :
110 addq a2, t0, a2 # .. e1 : bias count by dest misalignment
111 subq a2, 1, a2 # e0 :
112 and a2, 7, t2 # e1 :
113 srl a2, 3, a2 # e0 : a2 = loop counter = (count - 1)/8
114 addq zero, 1, t10 # .. e1 :
115 sll t10, t2, t10 # e0 : t10 = bitmask of last count byte
116 bne t1, $unaligned # .. e1 :
117
118 /* We are co-aligned; take care of a partial first word. */
119
120 EX( ldq_u t1, 0(a1) ) # e0 : load first src word
121 addq a1, 8, a1 # .. e1 :
122
123 beq t0, $aligned # avoid loading dest word if not needed
124 ldq_u t0, 0(a0) # e0 :
125 br $aligned # .. e1 :
126
127
128/* The source and destination are not co-aligned. Align the destination
129 and cope. We have to be very careful about not reading too much and
130 causing a SEGV. */
131
132 .align 3
133$u_head:
134 /* We know just enough now to be able to assemble the first
135 full source word. We can still find a zero at the end of it
136 that prevents us from outputting the whole thing.
137
138 On entry to this basic block:
139 t0 == the first dest word, unmasked
140 t1 == the shifted low bits of the first source word
141 t6 == bytemask that is -1 in dest word bytes */
142
143 EX( ldq_u t2, 8(a1) ) # e0 : load second src word
144 addq a1, 8, a1 # .. e1 :
145 mskql t0, a0, t0 # e0 : mask trailing garbage in dst
146 extqh t2, a1, t4 # e0 :
147 or t1, t4, t1 # e1 : first aligned src word complete
148 mskqh t1, a0, t1 # e0 : mask leading garbage in src
149 or t0, t1, t0 # e0 : first output word complete
150 or t0, t6, t6 # e1 : mask original data for zero test
151 cmpbge zero, t6, t8 # e0 :
152 beq a2, $u_eocfin # .. e1 :
153 bne t8, $u_final # e1 :
154
155 lda t6, -1 # e1 : mask out the bits we have
156 mskql t6, a1, t6 # e0 : already seen
157 stq_u t0, 0(a0) # e0 : store first output word
158 or t6, t2, t2 # .. e1 :
159 cmpbge zero, t2, t8 # e0 : find nulls in second partial
160 addq a0, 8, a0 # .. e1 :
161 subq a2, 1, a2 # e0 :
162 bne t8, $u_late_head_exit # .. e1 :
163
164 /* Finally, we've got all the stupid leading edge cases taken care
165 of and we can set up to enter the main loop. */
166
167 extql t2, a1, t1 # e0 : position hi-bits of lo word
168 EX( ldq_u t2, 8(a1) ) # .. e1 : read next high-order source word
169 addq a1, 8, a1 # e0 :
170 cmpbge zero, t2, t8 # e1 (stall)
171 beq a2, $u_eoc # e1 :
172 bne t8, $u_eos # e1 :
173
174 /* Unaligned copy main loop. In order to avoid reading too much,
175 the loop is structured to detect zeros in aligned source words.
176 This has, unfortunately, effectively pulled half of a loop
177 iteration out into the head and half into the tail, but it does
178 prevent nastiness from accumulating in the very thing we want
179 to run as fast as possible.
180
181 On entry to this basic block:
182 t1 == the shifted high-order bits from the previous source word
183 t2 == the unshifted current source word
184
185 We further know that t2 does not contain a null terminator. */
186
187 .align 3
188$u_loop:
189 extqh t2, a1, t0 # e0 : extract high bits for current word
190 addq a1, 8, a1 # .. e1 :
191 extql t2, a1, t3 # e0 : extract low bits for next time
192 addq a0, 8, a0 # .. e1 :
193 or t0, t1, t0 # e0 : current dst word now complete
194 EX( ldq_u t2, 0(a1) ) # .. e1 : load high word for next time
195 stq_u t0, -8(a0) # e0 : save the current word
196 mov t3, t1 # .. e1 :
197 subq a2, 1, a2 # e0 :
198 cmpbge zero, t2, t8 # .. e1 : test new word for eos
199 beq a2, $u_eoc # e1 :
200 beq t8, $u_loop # e1 :
201
202 /* We've found a zero somewhere in the source word we just read.
203 If it resides in the lower half, we have one (probably partial)
204 word to write out, and if it resides in the upper half, we
205 have one full and one partial word left to write out.
206
207 On entry to this basic block:
208 t1 == the shifted high-order bits from the previous source word
209 t2 == the unshifted current source word. */
210$u_eos:
211 extqh t2, a1, t0 # e0 :
212 or t0, t1, t0 # e1 : first (partial) source word complete
213
214 cmpbge zero, t0, t8 # e0 : is the null in this first bit?
215 bne t8, $u_final # .. e1 (zdb)
216
217 stq_u t0, 0(a0) # e0 : the null was in the high-order bits
218 addq a0, 8, a0 # .. e1 :
219 subq a2, 1, a2 # e1 :
220
221$u_late_head_exit:
222 extql t2, a1, t0 # .. e0 :
223 cmpbge zero, t0, t8 # e0 :
224 or t8, t10, t6 # e1 :
225 cmoveq a2, t6, t8 # e0 :
226 nop # .. e1 :
227
228 /* Take care of a final (probably partial) result word.
229 On entry to this basic block:
230 t0 == assembled source word
231 t8 == cmpbge mask that found the null. */
232$u_final:
233 negq t8, t6 # e0 : isolate low bit set
234 and t6, t8, t12 # e1 :
235
236 and t12, 0x80, t6 # e0 : avoid dest word load if we can
237 bne t6, 1f # .. e1 (zdb)
238
239 ldq_u t1, 0(a0) # e0 :
240 subq t12, 1, t6 # .. e1 :
241 or t6, t12, t8 # e0 :
242 zapnot t0, t8, t0 # .. e1 : kill source bytes > null
243 zap t1, t8, t1 # e0 : kill dest bytes <= null
244 or t0, t1, t0 # e1 :
245
2461: stq_u t0, 0(a0) # e0 :
247 br $finish_up
248
249$u_eoc: # end-of-count
250 extqh t2, a1, t0
251 or t0, t1, t0
252 cmpbge zero, t0, t8
253
254$u_eocfin: # end-of-count, final word
255 or t10, t8, t8
256 br $u_final
257
258 /* Unaligned copy entry point. */
259 .align 3
260$unaligned:
261
262 EX( ldq_u t1, 0(a1) ) # e0 : load first source word
263
264 and a0, 7, t4 # .. e1 : find dest misalignment
265 and a1, 7, t5 # e0 : find src misalignment
266
267 /* Conditionally load the first destination word and a bytemask
268 with 0xff indicating that the destination byte is sacrosanct. */
269
270 mov zero, t0 # .. e1 :
271 mov zero, t6 # e0 :
272 beq t4, 1f # .. e1 :
273 ldq_u t0, 0(a0) # e0 :
274 lda t6, -1 # .. e1 :
275 mskql t6, a0, t6 # e0 :
2761:
277 subq a1, t4, a1 # .. e1 : sub dest misalignment from src addr
278
279 /* If source misalignment is larger than dest misalignment, we need
280 extra startup checks to avoid SEGV. */
281
282 cmplt t4, t5, t12 # e1 :
283 extql t1, a1, t1 # .. e0 : shift src into place
284 lda t2, -1 # e0 : for creating masks later
285 beq t12, $u_head # e1 :
286
287 mskqh t2, t5, t2 # e0 : begin src byte validity mask
288 cmpbge zero, t1, t8 # .. e1 : is there a zero?
289 extql t2, a1, t2 # e0 :
290 or t8, t10, t5 # .. e1 : test for end-of-count too
291 cmpbge zero, t2, t3 # e0 :
292 cmoveq a2, t5, t8 # .. e1 :
293 andnot t8, t3, t8 # e0 :
294 beq t8, $u_head # .. e1 (zdb)
295
296 /* At this point we've found a zero in the first partial word of
297 the source. We need to isolate the valid source data and mask
298 it into the original destination data. (Incidentally, we know
299 that we'll need at least one byte of that original dest word.) */
300
301 ldq_u t0, 0(a0) # e0 :
302 negq t8, t6 # .. e1 : build bitmask of bytes <= zero
303 mskqh t1, t4, t1 # e0 :
304 and t6, t8, t12 # .. e1 :
305 subq t12, 1, t6 # e0 :
306 or t6, t12, t8 # e1 :
307
308 zapnot t2, t8, t2 # e0 : prepare source word; mirror changes
309 zapnot t1, t8, t1 # .. e1 : to source validity mask
310
311 andnot t0, t2, t0 # e0 : zero place for source to reside
312 or t0, t1, t0 # e1 : and put it there
313 stq_u t0, 0(a0) # e0 :
314
315$finish_up:
316 zapnot t0, t12, t4 # was last byte written null?
317 cmovne t4, 1, t4
318
319 and t12, 0xf0, t3 # binary search for the address of the
320 and t12, 0xcc, t2 # last byte written
321 and t12, 0xaa, t1
322 bic a0, 7, t0
323 cmovne t3, 4, t3
324 cmovne t2, 2, t2
325 cmovne t1, 1, t1
326 addq t0, t3, t0
327 addq t1, t2, t1
328 addq t0, t1, t0
329 addq t0, t4, t0 # add one if we filled the buffer
330
331 subq t0, v0, v0 # find string length
332 ret
333
334$zerolength:
335 clr v0
336$exception:
337 ret
338
339 .end __strncpy_from_user
diff --git a/arch/alpha/lib/strrchr.S b/arch/alpha/lib/strrchr.S
new file mode 100644
index 000000000000..82cfd0ac907b
--- /dev/null
+++ b/arch/alpha/lib/strrchr.S
@@ -0,0 +1,87 @@
1/*
2 * arch/alpha/lib/strrchr.S
3 * Contributed by Richard Henderson (rth@tamu.edu)
4 *
5 * Return the address of the last occurrence of a given character
6 * within a null-terminated string, or null if it is not found.
7 */
8
9#include <asm/regdef.h>
10
11 .set noreorder
12 .set noat
13
14 .align 3
15 .ent strrchr
16 .globl strrchr
17strrchr:
18 .frame sp, 0, ra
19 .prologue 0
20
21 zapnot a1, 1, a1 # e0 : zero extend our test character
22 mov zero, t6 # .. e1 : t6 is last match aligned addr
23 sll a1, 8, t5 # e0 : replicate our test character
24 mov zero, t8 # .. e1 : t8 is last match byte compare mask
25 or t5, a1, a1 # e0 :
26 ldq_u t0, 0(a0) # .. e1 : load first quadword
27 sll a1, 16, t5 # e0 :
28 andnot a0, 7, v0 # .. e1 : align source addr
29 or t5, a1, a1 # e0 :
30 lda t4, -1 # .. e1 : build garbage mask
31 sll a1, 32, t5 # e0 :
32 cmpbge zero, t0, t1 # .. e1 : bits set iff byte == zero
33 mskqh t4, a0, t4 # e0 :
34 or t5, a1, a1 # .. e1 : character replication complete
35 xor t0, a1, t2 # e0 : make bytes == c zero
36 cmpbge zero, t4, t4 # .. e1 : bits set iff byte is garbage
37 cmpbge zero, t2, t3 # e0 : bits set iff byte == c
38 andnot t1, t4, t1 # .. e1 : clear garbage from null test
39 andnot t3, t4, t3 # e0 : clear garbage from char test
40 bne t1, $eos # .. e1 : did we already hit the terminator?
41
42 /* Character search main loop */
43$loop:
44 ldq t0, 8(v0) # e0 : load next quadword
45	cmovne t3, v0, t6	# .. e1 : save previous comparison's match
46 cmovne t3, t3, t8 # e0 :
47 addq v0, 8, v0 # .. e1 :
48 xor t0, a1, t2 # e0 :
49 cmpbge zero, t0, t1 # .. e1 : bits set iff byte == zero
50 cmpbge zero, t2, t3 # e0 : bits set iff byte == c
51	beq t1, $loop		# .. e1 : if we haven't seen a null, loop
52
53 /* Mask out character matches after terminator */
54$eos:
55 negq t1, t4 # e0 : isolate first null byte match
56 and t1, t4, t4 # e1 :
57	subq t4, 1, t5		# e0 : build a mask of the bytes up to...
58 or t4, t5, t4 # e1 : ... and including the null
59
60 and t3, t4, t3 # e0 : mask out char matches after null
61 cmovne t3, t3, t8 # .. e1 : save it, if match found
62 cmovne t3, v0, t6 # e0 :
63
64 /* Locate the address of the last matched character */
65
66 /* Retain the early exit for the ev4 -- the ev5 mispredict penalty
67 is 5 cycles -- the same as just falling through. */
68 beq t8, $retnull # .. e1 :
69
70 and t8, 0xf0, t2 # e0 : binary search for the high bit set
71 cmovne t2, t2, t8 # .. e1 (zdb)
72 cmovne t2, 4, t2 # e0 :
73 and t8, 0xcc, t1 # .. e1 :
74 cmovne t1, t1, t8 # e0 :
75 cmovne t1, 2, t1 # .. e1 :
76 and t8, 0xaa, t0 # e0 :
77 cmovne t0, 1, t0 # .. e1 (zdb)
78 addq t2, t1, t1 # e0 :
79 addq t6, t0, v0 # .. e1 : add our aligned base ptr to the mix
80 addq v0, t1, v0 # e0 :
81 ret # .. e1 :
82
83$retnull:
84 mov zero, v0 # e0 :
85 ret # .. e1 :
86
87 .end strrchr
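
Unlike strchr, strrchr cannot stop at the first hit, so the loop keeps the most recent quadword that held a match (t6, with its cmpbge mask in t8, refreshed via cmovne) and only converts it into a byte address once the terminator appears, after discarding any matches beyond the null. A model of that bookkeeping (a sketch; the struct and function names are hypothetical, and the two masks stand in for the cmpbge results):

#include <stdint.h>

struct rrchr_state {
        const uint64_t *last_quad;      /* t6: aligned address of the last match */
        unsigned        last_mask;      /* t8: its cmpbge match mask             */
};

/* Returns non-zero once the quadword containing the terminator is handled. */
static int strrchr_step(struct rrchr_state *st, const uint64_t *quad_addr,
                        unsigned match_mask, unsigned zero_mask)
{
        if (zero_mask) {
                /* Keep only matches at or before the first null byte. */
                unsigned first_null = zero_mask & -zero_mask;
                match_mask &= first_null | (first_null - 1);
        }
        if (match_mask) {               /* cmovne: remember this quadword */
                st->last_quad = quad_addr;
                st->last_mask = match_mask;
        }
        return zero_mask != 0;
}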
diff --git a/arch/alpha/lib/stxcpy.S b/arch/alpha/lib/stxcpy.S
new file mode 100644
index 000000000000..2a8d51bfc05d
--- /dev/null
+++ b/arch/alpha/lib/stxcpy.S
@@ -0,0 +1,289 @@
1/*
2 * arch/alpha/lib/stxcpy.S
3 * Contributed by Richard Henderson (rth@tamu.edu)
4 *
5 * Copy a null-terminated string from SRC to DST.
6 *
7 * This is an internal routine used by strcpy, stpcpy, and strcat.
8 * As such, it uses special linkage conventions to make implementation
9 * of these public functions more efficient.
10 *
11 * On input:
12 * t9 = return address
13 * a0 = DST
14 * a1 = SRC
15 *
16 * On output:
17 * t12 = bitmask (with one bit set) indicating the last byte written
18 * a0 = unaligned address of the last *word* written
19 *
20 * Furthermore, v0, a3-a5, t11, and t12 are untouched.
21 */
22
23#include <asm/regdef.h>
24
25 .set noat
26 .set noreorder
27
28 .text
29
30/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
31 doesn't like putting the entry point for a procedure somewhere in the
32 middle of the procedure descriptor. Work around this by putting the
33 aligned copy in its own procedure descriptor */
34
35 .ent stxcpy_aligned
36 .align 3
37stxcpy_aligned:
38 .frame sp, 0, t9
39 .prologue 0
40
41 /* On entry to this basic block:
42 t0 == the first destination word for masking back in
43 t1 == the first source word. */
44
45 /* Create the 1st output word and detect 0's in the 1st input word. */
46 lda t2, -1 # e1 : build a mask against false zero
47 mskqh t2, a1, t2 # e0 : detection in the src word
48 mskqh t1, a1, t3 # e0 :
49 ornot t1, t2, t2 # .. e1 :
50 mskql t0, a1, t0 # e0 : assemble the first output word
51 cmpbge zero, t2, t8 # .. e1 : bits set iff null found
52 or t0, t3, t1 # e0 :
53 bne t8, $a_eos # .. e1 :
54
55 /* On entry to this basic block:
56 t0 == the first destination word for masking back in
57 t1 == a source word not containing a null. */
58
59$a_loop:
60 stq_u t1, 0(a0) # e0 :
61 addq a0, 8, a0 # .. e1 :
62 ldq_u t1, 0(a1) # e0 :
63 addq a1, 8, a1 # .. e1 :
64 cmpbge zero, t1, t8 # e0 (stall)
65 beq t8, $a_loop # .. e1 (zdb)
66
67 /* Take care of the final (partial) word store.
68 On entry to this basic block we have:
69 t1 == the source word containing the null
70 t8 == the cmpbge mask that found it. */
71$a_eos:
72 negq t8, t6 # e0 : find low bit set
73 and t8, t6, t12 # e1 (stall)
74
75 /* For the sake of the cache, don't read a destination word
76 if we're not going to need it. */
77 and t12, 0x80, t6 # e0 :
78 bne t6, 1f # .. e1 (zdb)
79
80 /* We're doing a partial word store and so need to combine
81 our source and original destination words. */
82 ldq_u t0, 0(a0) # e0 :
83 subq t12, 1, t6 # .. e1 :
84 zapnot t1, t6, t1 # e0 : clear src bytes >= null
85 or t12, t6, t8 # .. e1 :
86 zap t0, t8, t0 # e0 : clear dst bytes <= null
87 or t0, t1, t1 # e1 :
88
891: stq_u t1, 0(a0) # e0 :
90 ret (t9) # .. e1 :
91
92 .end stxcpy_aligned
93
94 .align 3
95 .ent __stxcpy
96 .globl __stxcpy
97__stxcpy:
98 .frame sp, 0, t9
99 .prologue 0
100
101 /* Are source and destination co-aligned? */
102 xor a0, a1, t0 # e0 :
103 unop # :
104 and t0, 7, t0 # e0 :
105 bne t0, $unaligned # .. e1 :
106
107 /* We are co-aligned; take care of a partial first word. */
108 ldq_u t1, 0(a1) # e0 : load first src word
109 and a0, 7, t0 # .. e1 : take care not to load a word ...
110 addq a1, 8, a1 # e0 :
111	beq t0, stxcpy_aligned	# .. e1 : ... if we won't need it
112 ldq_u t0, 0(a0) # e0 :
113 br stxcpy_aligned # .. e1 :
114
115
116/* The source and destination are not co-aligned. Align the destination
117 and cope. We have to be very careful about not reading too much and
118 causing a SEGV. */
119
120 .align 3
121$u_head:
122 /* We know just enough now to be able to assemble the first
123 full source word. We can still find a zero at the end of it
124 that prevents us from outputting the whole thing.
125
126 On entry to this basic block:
127 t0 == the first dest word, for masking back in, if needed else 0
128 t1 == the low bits of the first source word
129 t6 == bytemask that is -1 in dest word bytes */
130
131 ldq_u t2, 8(a1) # e0 :
132 addq a1, 8, a1 # .. e1 :
133
134 extql t1, a1, t1 # e0 :
135 extqh t2, a1, t4 # e0 :
136 mskql t0, a0, t0 # e0 :
137 or t1, t4, t1 # .. e1 :
138 mskqh t1, a0, t1 # e0 :
139 or t0, t1, t1 # e1 :
140
141 or t1, t6, t6 # e0 :
142 cmpbge zero, t6, t8 # .. e1 :
143 lda t6, -1 # e0 : for masking just below
144 bne t8, $u_final # .. e1 :
145
146 mskql t6, a1, t6 # e0 : mask out the bits we have
147 or t6, t2, t2 # e1 : already extracted before
148 cmpbge zero, t2, t8 # e0 : testing eos
149 bne t8, $u_late_head_exit # .. e1 (zdb)
150
151 /* Finally, we've got all the stupid leading edge cases taken care
152 of and we can set up to enter the main loop. */
153
154 stq_u t1, 0(a0) # e0 : store first output word
155 addq a0, 8, a0 # .. e1 :
156	extql t2, a1, t0	# e0 : position hi-bits of lo word
157 ldq_u t2, 8(a1) # .. e1 : read next high-order source word
158 addq a1, 8, a1 # e0 :
159 cmpbge zero, t2, t8 # .. e1 :
160 nop # e0 :
161 bne t8, $u_eos # .. e1 :
162
163 /* Unaligned copy main loop. In order to avoid reading too much,
164 the loop is structured to detect zeros in aligned source words.
165 This has, unfortunately, effectively pulled half of a loop
166 iteration out into the head and half into the tail, but it does
167 prevent nastiness from accumulating in the very thing we want
168 to run as fast as possible.
169
170 On entry to this basic block:
171 t0 == the shifted high-order bits from the previous source word
172 t2 == the unshifted current source word
173
174 We further know that t2 does not contain a null terminator. */
175
176 .align 3
177$u_loop:
178 extqh t2, a1, t1 # e0 : extract high bits for current word
179 addq a1, 8, a1 # .. e1 :
180 extql t2, a1, t3 # e0 : extract low bits for next time
181 addq a0, 8, a0 # .. e1 :
182 or t0, t1, t1 # e0 : current dst word now complete
183 ldq_u t2, 0(a1) # .. e1 : load high word for next time
184 stq_u t1, -8(a0) # e0 : save the current word
185 mov t3, t0 # .. e1 :
186 cmpbge zero, t2, t8 # e0 : test new word for eos
187 beq t8, $u_loop # .. e1 :
188
189 /* We've found a zero somewhere in the source word we just read.
190 If it resides in the lower half, we have one (probably partial)
191 word to write out, and if it resides in the upper half, we
192 have one full and one partial word left to write out.
193
194 On entry to this basic block:
195 t0 == the shifted high-order bits from the previous source word
196 t2 == the unshifted current source word. */
197$u_eos:
198 extqh t2, a1, t1 # e0 :
199 or t0, t1, t1 # e1 : first (partial) source word complete
200
201 cmpbge zero, t1, t8 # e0 : is the null in this first bit?
202 bne t8, $u_final # .. e1 (zdb)
203
204$u_late_head_exit:
205 stq_u t1, 0(a0) # e0 : the null was in the high-order bits
206 addq a0, 8, a0 # .. e1 :
207 extql t2, a1, t1 # e0 :
208 cmpbge zero, t1, t8 # .. e1 :
209
210 /* Take care of a final (probably partial) result word.
211 On entry to this basic block:
212 t1 == assembled source word
213 t8 == cmpbge mask that found the null. */
214$u_final:
215 negq t8, t6 # e0 : isolate low bit set
216 and t6, t8, t12 # e1 :
217
218 and t12, 0x80, t6 # e0 : avoid dest word load if we can
219 bne t6, 1f # .. e1 (zdb)
220
221 ldq_u t0, 0(a0) # e0 :
222 subq t12, 1, t6 # .. e1 :
223 or t6, t12, t8 # e0 :
224 zapnot t1, t6, t1 # .. e1 : kill source bytes >= null
225 zap t0, t8, t0 # e0 : kill dest bytes <= null
226 or t0, t1, t1 # e1 :
227
2281: stq_u t1, 0(a0) # e0 :
229 ret (t9) # .. e1 :
230
231 /* Unaligned copy entry point. */
232 .align 3
233$unaligned:
234
235 ldq_u t1, 0(a1) # e0 : load first source word
236
237 and a0, 7, t4 # .. e1 : find dest misalignment
238 and a1, 7, t5 # e0 : find src misalignment
239
240 /* Conditionally load the first destination word and a bytemask
241 with 0xff indicating that the destination byte is sacrosanct. */
242
243 mov zero, t0 # .. e1 :
244 mov zero, t6 # e0 :
245 beq t4, 1f # .. e1 :
246 ldq_u t0, 0(a0) # e0 :
247 lda t6, -1 # .. e1 :
248 mskql t6, a0, t6 # e0 :
2491:
250 subq a1, t4, a1 # .. e1 : sub dest misalignment from src addr
251
252 /* If source misalignment is larger than dest misalignment, we need
253 extra startup checks to avoid SEGV. */
254
255 cmplt t4, t5, t12 # e0 :
256 beq t12, $u_head # .. e1 (zdb)
257
258 lda t2, -1 # e1 : mask out leading garbage in source
259 mskqh t2, t5, t2 # e0 :
260 nop # e0 :
261 ornot t1, t2, t3 # .. e1 :
262 cmpbge zero, t3, t8 # e0 : is there a zero?
263 beq t8, $u_head # .. e1 (zdb)
264
265 /* At this point we've found a zero in the first partial word of
266 the source. We need to isolate the valid source data and mask
267 it into the original destination data. (Incidentally, we know
268 that we'll need at least one byte of that original dest word.) */
269
270 ldq_u t0, 0(a0) # e0 :
271
272 negq t8, t6 # .. e1 : build bitmask of bytes <= zero
273 and t6, t8, t12 # e0 :
274 and a1, 7, t5 # .. e1 :
275 subq t12, 1, t6 # e0 :
276 or t6, t12, t8 # e1 :
277 srl t12, t5, t12 # e0 : adjust final null return value
278
279 zapnot t2, t8, t2 # .. e1 : prepare source word; mirror changes
280 and t1, t2, t1 # e1 : to source validity mask
281 extql t2, a1, t2 # .. e0 :
282 extql t1, a1, t1 # e0 :
283
284 andnot t0, t2, t0 # .. e1 : zero place for source to reside
285 or t0, t1, t1 # e1 : and put it there
286 stq_u t1, 0(a0) # .. e0 :
287 ret (t9) # e1 :
288
289 .end __stxcpy
diff --git a/arch/alpha/lib/stxncpy.S b/arch/alpha/lib/stxncpy.S
new file mode 100644
index 000000000000..da1a72740d29
--- /dev/null
+++ b/arch/alpha/lib/stxncpy.S
@@ -0,0 +1,345 @@
1/*
2 * arch/alpha/lib/stxncpy.S
3 * Contributed by Richard Henderson (rth@tamu.edu)
4 *
5 * Copy no more than COUNT bytes of the null-terminated string from
6 * SRC to DST.
7 *
8 * This is an internal routine used by strncpy, stpncpy, and strncat.
9 * As such, it uses special linkage conventions to make implementation
10 * of these public functions more efficient.
11 *
12 * On input:
13 * t9 = return address
14 * a0 = DST
15 * a1 = SRC
16 * a2 = COUNT
17 *
18 * Furthermore, COUNT may not be zero.
19 *
20 * On output:
21 * t0 = last word written
22 * t10 = bitmask (with one bit set) indicating the byte position of
23 * the end of the range specified by COUNT
24 * t12 = bitmask (with one bit set) indicating the last byte written
25 * a0 = unaligned address of the last *word* written
26 * a2 = the number of full words left in COUNT
27 *
28 * Furthermore, v0, a3-a5, t11, and $at are untouched.
29 */
30
31#include <asm/regdef.h>
32
33 .set noat
34 .set noreorder
35
36 .text
37
38/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
39 doesn't like putting the entry point for a procedure somewhere in the
40 middle of the procedure descriptor. Work around this by putting the
41 aligned copy in its own procedure descriptor */
42
43 .ent stxncpy_aligned
44 .align 3
45stxncpy_aligned:
46 .frame sp, 0, t9, 0
47 .prologue 0
48
49 /* On entry to this basic block:
50 t0 == the first destination word for masking back in
51 t1 == the first source word. */
52
53 /* Create the 1st output word and detect 0's in the 1st input word. */
54 lda t2, -1 # e1 : build a mask against false zero
55 mskqh t2, a1, t2 # e0 : detection in the src word
56 mskqh t1, a1, t3 # e0 :
57 ornot t1, t2, t2 # .. e1 :
58 mskql t0, a1, t0 # e0 : assemble the first output word
59 cmpbge zero, t2, t8 # .. e1 : bits set iff null found
60 or t0, t3, t0 # e0 :
61 beq a2, $a_eoc # .. e1 :
62 bne t8, $a_eos # .. e1 :
63
64 /* On entry to this basic block:
65 t0 == a source word not containing a null. */
66
67$a_loop:
68 stq_u t0, 0(a0) # e0 :
69 addq a0, 8, a0 # .. e1 :
70 ldq_u t0, 0(a1) # e0 :
71 addq a1, 8, a1 # .. e1 :
72 subq a2, 1, a2 # e0 :
73 cmpbge zero, t0, t8 # .. e1 (stall)
74 beq a2, $a_eoc # e1 :
75 beq t8, $a_loop # e1 :
76
77 /* Take care of the final (partial) word store. At this point
78 the end-of-count bit is set in t8 iff it applies.
79
80 On entry to this basic block we have:
81 t0 == the source word containing the null
82 t8 == the cmpbge mask that found it. */
83
84$a_eos:
85 negq t8, t12 # e0 : find low bit set
86 and t8, t12, t12 # e1 (stall)
87
88 /* For the sake of the cache, don't read a destination word
89 if we're not going to need it. */
90 and t12, 0x80, t6 # e0 :
91 bne t6, 1f # .. e1 (zdb)
92
93 /* We're doing a partial word store and so need to combine
94 our source and original destination words. */
95 ldq_u t1, 0(a0) # e0 :
96 subq t12, 1, t6 # .. e1 :
97 or t12, t6, t8 # e0 :
98 unop #
99 zapnot t0, t8, t0 # e0 : clear src bytes > null
100 zap t1, t8, t1 # .. e1 : clear dst bytes <= null
101 or t0, t1, t0 # e1 :
102
1031: stq_u t0, 0(a0) # e0 :
104 ret (t9) # e1 :
105
106 /* Add the end-of-count bit to the eos detection bitmask. */
107$a_eoc:
108 or t10, t8, t8
109 br $a_eos
110
111 .end stxncpy_aligned
112
113 .align 3
114 .ent __stxncpy
115 .globl __stxncpy
116__stxncpy:
117 .frame sp, 0, t9, 0
118 .prologue 0
119
120 /* Are source and destination co-aligned? */
121 xor a0, a1, t1 # e0 :
122 and a0, 7, t0 # .. e1 : find dest misalignment
123 and t1, 7, t1 # e0 :
124 addq a2, t0, a2 # .. e1 : bias count by dest misalignment
125 subq a2, 1, a2 # e0 :
126 and a2, 7, t2 # e1 :
127 srl a2, 3, a2 # e0 : a2 = loop counter = (count - 1)/8
128 addq zero, 1, t10 # .. e1 :
129 sll t10, t2, t10 # e0 : t10 = bitmask of last count byte
130 bne t1, $unaligned # .. e1 :
131
132 /* We are co-aligned; take care of a partial first word. */
133
134 ldq_u t1, 0(a1) # e0 : load first src word
135 addq a1, 8, a1 # .. e1 :
136
137 beq t0, stxncpy_aligned # avoid loading dest word if not needed
138 ldq_u t0, 0(a0) # e0 :
139 br stxncpy_aligned # .. e1 :
140
141
142/* The source and destination are not co-aligned. Align the destination
143 and cope. We have to be very careful about not reading too much and
144 causing a SEGV. */
145
146 .align 3
147$u_head:
148 /* We know just enough now to be able to assemble the first
149 full source word. We can still find a zero at the end of it
150 that prevents us from outputting the whole thing.
151
152 On entry to this basic block:
153 t0 == the first dest word, unmasked
154 t1 == the shifted low bits of the first source word
155 t6 == bytemask that is -1 in dest word bytes */
156
157 ldq_u t2, 8(a1) # e0 : load second src word
158 addq a1, 8, a1 # .. e1 :
159 mskql t0, a0, t0 # e0 : mask trailing garbage in dst
160 extqh t2, a1, t4 # e0 :
161 or t1, t4, t1 # e1 : first aligned src word complete
162 mskqh t1, a0, t1 # e0 : mask leading garbage in src
163 or t0, t1, t0 # e0 : first output word complete
164 or t0, t6, t6 # e1 : mask original data for zero test
165 cmpbge zero, t6, t8 # e0 :
166 beq a2, $u_eocfin # .. e1 :
167 lda t6, -1 # e0 :
168 bne t8, $u_final # .. e1 :
169
170 mskql t6, a1, t6 # e0 : mask out bits already seen
171 nop # .. e1 :
172 stq_u t0, 0(a0) # e0 : store first output word
173 or t6, t2, t2 # .. e1 :
174 cmpbge zero, t2, t8 # e0 : find nulls in second partial
175 addq a0, 8, a0 # .. e1 :
176 subq a2, 1, a2 # e0 :
177 bne t8, $u_late_head_exit # .. e1 :
178
179 /* Finally, we've got all the stupid leading edge cases taken care
180 of and we can set up to enter the main loop. */
181
182 extql t2, a1, t1 # e0 : position hi-bits of lo word
183 beq a2, $u_eoc # .. e1 :
184 ldq_u t2, 8(a1) # e0 : read next high-order source word
185 addq a1, 8, a1 # .. e1 :
186 extqh t2, a1, t0 # e0 : position lo-bits of hi word (stall)
187 cmpbge zero, t2, t8 # .. e1 :
188 nop # e0 :
189 bne t8, $u_eos # .. e1 :
190
191 /* Unaligned copy main loop. In order to avoid reading too much,
192 the loop is structured to detect zeros in aligned source words.
193 This has, unfortunately, effectively pulled half of a loop
194 iteration out into the head and half into the tail, but it does
195 prevent nastiness from accumulating in the very thing we want
196 to run as fast as possible.
197
198 On entry to this basic block:
199 t0 == the shifted low-order bits from the current source word
200 t1 == the shifted high-order bits from the previous source word
201 t2 == the unshifted current source word
202
203 We further know that t2 does not contain a null terminator. */
204
205 .align 3
206$u_loop:
207 or t0, t1, t0 # e0 : current dst word now complete
208 subq a2, 1, a2 # .. e1 : decrement word count
209 stq_u t0, 0(a0) # e0 : save the current word
210 addq a0, 8, a0 # .. e1 :
211 extql t2, a1, t1 # e0 : extract high bits for next time
212 beq a2, $u_eoc # .. e1 :
213 ldq_u t2, 8(a1) # e0 : load high word for next time
214 addq a1, 8, a1 # .. e1 :
215 nop # e0 :
216 cmpbge zero, t2, t8 # e1 : test new word for eos (stall)
217 extqh t2, a1, t0 # e0 : extract low bits for current word
218 beq t8, $u_loop # .. e1 :
219
220 /* We've found a zero somewhere in the source word we just read.
221 If it resides in the lower half, we have one (probably partial)
222 word to write out, and if it resides in the upper half, we
223 have one full and one partial word left to write out.
224
225 On entry to this basic block:
226 t0 == the shifted low-order bits from the current source word
227 t1 == the shifted high-order bits from the previous source word
228 t2 == the unshifted current source word. */
229$u_eos:
230 or t0, t1, t0 # e0 : first (partial) source word complete
231 nop # .. e1 :
232 cmpbge zero, t0, t8 # e0 : is the null in this first bit?
233 bne t8, $u_final # .. e1 (zdb)
234
235 stq_u t0, 0(a0) # e0 : the null was in the high-order bits
236 addq a0, 8, a0 # .. e1 :
237 subq a2, 1, a2 # e1 :
238
239$u_late_head_exit:
240 extql t2, a1, t0 # .. e0 :
241 cmpbge zero, t0, t8 # e0 :
242 or t8, t10, t6 # e1 :
243 cmoveq a2, t6, t8 # e0 :
244 nop # .. e1 :
245
246 /* Take care of a final (probably partial) result word.
247 On entry to this basic block:
248 t0 == assembled source word
249 t8 == cmpbge mask that found the null. */
250$u_final:
251 negq t8, t6 # e0 : isolate low bit set
252 and t6, t8, t12 # e1 :
253
254 and t12, 0x80, t6 # e0 : avoid dest word load if we can
255 bne t6, 1f # .. e1 (zdb)
256
257 ldq_u t1, 0(a0) # e0 :
258 subq t12, 1, t6 # .. e1 :
259 or t6, t12, t8 # e0 :
260 zapnot t0, t8, t0 # .. e1 : kill source bytes > null
261 zap t1, t8, t1 # e0 : kill dest bytes <= null
262 or t0, t1, t0 # e1 :
263
2641: stq_u t0, 0(a0) # e0 :
265 ret (t9) # .. e1 :
266
267 /* Got to end-of-count before end of string.
268 On entry to this basic block:
269 t1 == the shifted high-order bits from the previous source word */
270$u_eoc:
271 and a1, 7, t6 # e1 :
272 sll t10, t6, t6 # e0 :
273 and t6, 0xff, t6 # e0 :
274 bne t6, 1f # .. e1 :
275
276 ldq_u t2, 8(a1) # e0 : load final src word
277 nop # .. e1 :
278 extqh t2, a1, t0 # e0 : extract low bits for last word
279 or t1, t0, t1 # e1 :
280
2811: cmpbge zero, t1, t8
282 mov t1, t0
283
284$u_eocfin: # end-of-count, final word
285 or t10, t8, t8
286 br $u_final
287
288 /* Unaligned copy entry point. */
289 .align 3
290$unaligned:
291
292 ldq_u t1, 0(a1) # e0 : load first source word
293
294 and a0, 7, t4 # .. e1 : find dest misalignment
295 and a1, 7, t5 # e0 : find src misalignment
296
297 /* Conditionally load the first destination word and a bytemask
298 with 0xff indicating that the destination byte is sacrosanct. */
299
300 mov zero, t0 # .. e1 :
301 mov zero, t6 # e0 :
302 beq t4, 1f # .. e1 :
303 ldq_u t0, 0(a0) # e0 :
304 lda t6, -1 # .. e1 :
305 mskql t6, a0, t6 # e0 :
306 subq a1, t4, a1 # .. e1 : sub dest misalignment from src addr
307
308 /* If source misalignment is larger than dest misalignment, we need
309 extra startup checks to avoid SEGV. */
310
3111: cmplt t4, t5, t12 # e1 :
312 extql t1, a1, t1 # .. e0 : shift src into place
313 lda t2, -1 # e0 : for creating masks later
314 beq t12, $u_head # .. e1 :
315
316 extql t2, a1, t2 # e0 :
317 cmpbge zero, t1, t8 # .. e1 : is there a zero?
318 andnot t2, t6, t12 # e0 : dest mask for a single word copy
319 or t8, t10, t5 # .. e1 : test for end-of-count too
320 cmpbge zero, t12, t3 # e0 :
321 cmoveq a2, t5, t8 # .. e1 :
322 andnot t8, t3, t8 # e0 :
323 beq t8, $u_head # .. e1 (zdb)
324
325 /* At this point we've found a zero in the first partial word of
326 the source. We need to isolate the valid source data and mask
327 it into the original destination data. (Incidentally, we know
328 that we'll need at least one byte of that original dest word.) */
329
330 ldq_u t0, 0(a0) # e0 :
331 negq t8, t6 # .. e1 : build bitmask of bytes <= zero
332 mskqh t1, t4, t1 # e0 :
333 and t6, t8, t2 # .. e1 :
334 subq t2, 1, t6 # e0 :
335 or t6, t2, t8 # e1 :
336
337 zapnot t12, t8, t12 # e0 : prepare source word; mirror changes
338 zapnot t1, t8, t1 # .. e1 : to source validity mask
339
340 andnot t0, t12, t0 # e0 : zero place for source to reside
341 or t0, t1, t0 # e1 : and put it there
342 stq_u t0, 0(a0) # e0 :
343 ret (t9) # .. e1 :
344
345 .end __stxncpy
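
The comment blocks inside __stxncpy above (before $u_loop, at $u_eos and at $u_final) describe the whole scheme in prose: read one aligned source quadword per iteration, merge its low bytes with the bytes carried over from the previous word, stop as soon as cmpbge reports a zero byte or the word count runs out, and finally splice the last partial word into the existing destination bytes using a byte mask derived from the cmpbge result. The C below is only a sketch of that idea, not the kernel routine: the names zero_byte_mask(), unaligned_copy_loop(), splice_final_word() and the shift_bytes parameter are invented for the example, and the genuinely machine-specific parts (ldq_u/stq_u, extql/extqh, mskql, the head code that protects the first destination word) are reduced to comments.

#include <stdint.h>
#include <stddef.h>

/* Model of "cmpbge zero, x": bit i of the result is set iff byte i of x
   is zero, so a nonzero result means the word contains a terminator. */
static unsigned int zero_byte_mask(uint64_t x)
{
	unsigned int mask = 0;
	for (int i = 0; i < 8; i++)
		if (((x >> (8 * i)) & 0xff) == 0)
			mask |= 1u << i;
	return mask;
}

/* Shape of the unaligned main loop ($u_loop).  shift_bytes stands for the
   relative source/destination byte offset and is assumed to be 1..7; the
   aligned case takes a different path entirely.  The tail work done at
   $u_eos/$u_eoc/$u_final is only summarized in comments. */
static void unaligned_copy_loop(uint64_t *dst, const uint64_t *src,
                                size_t nwords, unsigned int shift_bytes)
{
	unsigned int lo = 8 * shift_bytes;      /* "extql" shift amount         */
	unsigned int hi = 64 - lo;              /* "extqh" shift amount         */
	uint64_t carry = *src++ >> lo;          /* t1: bytes carried over from
	                                           the previous source word    */
	while (nwords--) {
		uint64_t cur = *src++;          /* t2: ldq_u 8(a1)              */
		if (zero_byte_mask(cur))        /* cmpbge: terminator found     */
			break;                  /* -> $u_eos handles the rest   */
		*dst++ = carry | (cur << hi);   /* or t0, t1, t0 / stq_u        */
		carry = cur >> lo;              /* extql: carry for next round  */
	}
	/* On exit, `carry` (and, when the zero byte sits in the upper half
	   of the last word read, part of that word too) still has to be
	   merged into the destination byte-granularly, as sketched below. */
}

/* The $u_final trick: from the cmpbge result, isolate the first zero byte,
   build a mask covering it and every byte below it, keep those source
   bytes and preserve the destination's bytes above them.  zmask must be
   nonzero (a real terminator, or the fake bit merged in for the count
   limit at $u_eocfin). */
static uint64_t splice_final_word(uint64_t src_word, uint64_t old_dst,
                                  unsigned int zmask)
{
	unsigned int lowbit = zmask & -zmask;        /* negq/and: first zero byte  */
	unsigned int keep   = lowbit | (lowbit - 1); /* subq/or: it and all below  */
	uint64_t byte_mask = 0;

	for (int i = 0; i < 8; i++)                  /* expand to a per-byte mask  */
		if (keep & (1u << i))
			byte_mask |= (uint64_t)0xff << (8 * i);

	/* zapnot src / zap old_dst / or t0, t1, t0 */
	return (src_word & byte_mask) | (old_dst & ~byte_mask);
}

The point the original comment makes about pulling half an iteration into the head and tail shows up in the sketch as well: the loop body only loads, tests and stores whole words, while everything byte-granular happens once, outside the loop, through the mask built in splice_final_word().
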
diff --git a/arch/alpha/lib/udelay.c b/arch/alpha/lib/udelay.c
new file mode 100644
index 000000000000..1c879bbce419
--- /dev/null
+++ b/arch/alpha/lib/udelay.c
@@ -0,0 +1,55 @@
1/*
2 * Copyright (C) 1993, 2000 Linus Torvalds
3 *
4 * Delay routines, using a pre-computed "loops_per_jiffy" value.
5 */
6
7#include <linux/config.h>
8#include <linux/module.h>
9#include <linux/sched.h> /* for udelay's use of smp_processor_id */
10#include <asm/param.h>
11#include <asm/smp.h>
12#include <linux/delay.h>
13
14/*
15 * Use only for very small delays (< 1 msec).
16 *
17 * The active part of our cycle counter is only 32-bits wide, and
18 * we're treating the difference between two marks as signed. On
19 * a 1GHz box, that's about 2 seconds. (A worked numeric example of the
19 * conversion follows after this file.)
20 */
21
22void
23__delay(int loops)
24{
25 int tmp;
26 __asm__ __volatile__(
27 " rpcc %0\n"
28 " addl %1,%0,%1\n"
29 "1: rpcc %0\n"
30 " subl %1,%0,%0\n"
31 " bgt %0,1b"
32 : "=&r" (tmp), "=r" (loops) : "1"(loops));
33}
34
35#ifdef CONFIG_SMP
36#define LPJ cpu_data[smp_processor_id()].loops_per_jiffy
37#else
38#define LPJ loops_per_jiffy
39#endif
40
41void
42udelay(unsigned long usecs)
43{
44 usecs *= (((unsigned long)HZ << 32) / 1000000) * LPJ;
45 __delay((long)usecs >> 32);
46}
47EXPORT_SYMBOL(udelay);
48
49void
50ndelay(unsigned long nsecs)
51{
52 nsecs *= (((unsigned long)HZ << 32) / 1000000000) * LPJ;
53 __delay((long)nsecs >> 32);
54}
55EXPORT_SYMBOL(ndelay);
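
The conversion in udelay()/ndelay() above is a fixed-point multiply: (HZ << 32) / 1000000 is "jiffies per microsecond" scaled by 2^32, so multiplying that by loops_per_jiffy and by the requested microseconds, then shifting right by 32, yields the loop count for __delay() without any run-time division. (The header comment's "about 2 seconds" is just the largest signed difference the 32-bit cycle counter can represent: 2^31 cycles at 1 GHz is roughly 2.1 s.) The standalone program below replays that arithmetic with made-up numbers; HZ = 1024 and loops_per_jiffy = 488281 are illustrative assumptions, not values read from any particular machine.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t hz = 1024;                 /* assumed tick rate            */
	const uint64_t loops_per_jiffy = 488281;  /* assumed calibration result   */
	const uint64_t usecs = 100;               /* delay being requested        */

	/* (HZ << 32) / 1000000, as in udelay(): jiffies per microsecond in
	   32.32 fixed point, scaled by loops_per_jiffy to give delay loops
	   per microsecond (still carrying the 2^32 factor). */
	uint64_t scale = ((hz << 32) / 1000000) * loops_per_jiffy;

	/* udelay() then does: usecs *= scale; __delay(usecs >> 32); */
	uint64_t loops = (usecs * scale) >> 32;

	/* The same quantity computed directly; the two can differ only by
	   the rounding lost in the truncated scale factor. */
	uint64_t direct = usecs * hz * loops_per_jiffy / 1000000;

	printf("fixed point: %llu loops, direct: %llu loops\n",
	       (unsigned long long)loops, (unsigned long long)direct);
	return 0;
}

Doing it this way keeps the per-call cost of udelay() to one multiply and one shift; the only division is on the constant part of the scale factor, which the compiler folds away.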