diff options
| author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400 |
| commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
| tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/alpha/lib | |
Linux-2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'arch/alpha/lib')
53 files changed, 7901 insertions, 0 deletions
diff --git a/arch/alpha/lib/Makefile b/arch/alpha/lib/Makefile new file mode 100644 index 00000000000..21cf624d732 --- /dev/null +++ b/arch/alpha/lib/Makefile | |||
| @@ -0,0 +1,58 @@ | |||
| 1 | # | ||
| 2 | # Makefile for alpha-specific library files.. | ||
| 3 | # | ||
| 4 | |||
| 5 | EXTRA_AFLAGS := $(CFLAGS) | ||
| 6 | EXTRA_CFLAGS := -Werror | ||
| 7 | |||
| 8 | # Many of these routines have implementations tuned for ev6. | ||
| 9 | # Choose them iff we're targeting ev6 specifically. | ||
| 10 | ev6-$(CONFIG_ALPHA_EV6) := ev6- | ||
| 11 | |||
| 12 | # Several make use of the cttz instruction introduced in ev67. | ||
| 13 | ev67-$(CONFIG_ALPHA_EV67) := ev67- | ||
| 14 | |||
| 15 | lib-y = __divqu.o __remqu.o __divlu.o __remlu.o \ | ||
| 16 | udelay.o \ | ||
| 17 | $(ev6-y)memset.o \ | ||
| 18 | $(ev6-y)memcpy.o \ | ||
| 19 | memmove.o \ | ||
| 20 | checksum.o \ | ||
| 21 | csum_partial_copy.o \ | ||
| 22 | $(ev67-y)strlen.o \ | ||
| 23 | $(ev67-y)strcat.o \ | ||
| 24 | strcpy.o \ | ||
| 25 | $(ev67-y)strncat.o \ | ||
| 26 | strncpy.o \ | ||
| 27 | $(ev6-y)stxcpy.o \ | ||
| 28 | $(ev6-y)stxncpy.o \ | ||
| 29 | $(ev67-y)strchr.o \ | ||
| 30 | $(ev67-y)strrchr.o \ | ||
| 31 | $(ev6-y)memchr.o \ | ||
| 32 | $(ev6-y)copy_user.o \ | ||
| 33 | $(ev6-y)clear_user.o \ | ||
| 34 | $(ev6-y)strncpy_from_user.o \ | ||
| 35 | $(ev67-y)strlen_user.o \ | ||
| 36 | $(ev6-y)csum_ipv6_magic.o \ | ||
| 37 | $(ev6-y)clear_page.o \ | ||
| 38 | $(ev6-y)copy_page.o \ | ||
| 39 | strcasecmp.o \ | ||
| 40 | fpreg.o \ | ||
| 41 | callback_srm.o srm_puts.o srm_printk.o | ||
| 42 | |||
| 43 | lib-$(CONFIG_SMP) += dec_and_lock.o | ||
| 44 | |||
| 45 | # The division routines are built from single source, with different defines. | ||
| 46 | AFLAGS___divqu.o = -DDIV | ||
| 47 | AFLAGS___remqu.o = -DREM | ||
| 48 | AFLAGS___divlu.o = -DDIV -DINTSIZE | ||
| 49 | AFLAGS___remlu.o = -DREM -DINTSIZE | ||
| 50 | |||
| 51 | $(obj)/__divqu.o: $(obj)/$(ev6-y)divide.S | ||
| 52 | $(cmd_as_o_S) | ||
| 53 | $(obj)/__remqu.o: $(obj)/$(ev6-y)divide.S | ||
| 54 | $(cmd_as_o_S) | ||
| 55 | $(obj)/__divlu.o: $(obj)/$(ev6-y)divide.S | ||
| 56 | $(cmd_as_o_S) | ||
| 57 | $(obj)/__remlu.o: $(obj)/$(ev6-y)divide.S | ||
| 58 | $(cmd_as_o_S) | ||
diff --git a/arch/alpha/lib/callback_srm.S b/arch/alpha/lib/callback_srm.S new file mode 100644 index 00000000000..0528acd0d9a --- /dev/null +++ b/arch/alpha/lib/callback_srm.S | |||
| @@ -0,0 +1,104 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/callback_srm.S | ||
| 3 | */ | ||
| 4 | |||
| 5 | #include <linux/config.h> | ||
| 6 | #include <asm/console.h> | ||
| 7 | |||
| 8 | .text | ||
| 9 | #define HWRPB_CRB_OFFSET 0xc0 | ||
| 10 | |||
| 11 | #if defined(CONFIG_ALPHA_SRM) || defined(CONFIG_ALPHA_GENERIC) | ||
| 12 | .align 4 | ||
| 13 | srm_dispatch: | ||
| 14 | #if defined(CONFIG_ALPHA_GENERIC) | ||
| 15 | ldl $4,alpha_using_srm | ||
| 16 | beq $4,nosrm | ||
| 17 | #endif | ||
| 18 | ldq $0,hwrpb # gp is set up by CALLBACK macro. | ||
| 19 | ldl $25,0($25) # Pick up the wrapper data. | ||
| 20 | mov $20,$21 # Shift arguments right. | ||
| 21 | mov $19,$20 | ||
| 22 | ldq $1,HWRPB_CRB_OFFSET($0) | ||
| 23 | mov $18,$19 | ||
| 24 | mov $17,$18 | ||
| 25 | mov $16,$17 | ||
| 26 | addq $0,$1,$2 # CRB address | ||
| 27 | ldq $27,0($2) # DISPATCH procedure descriptor (VMS call std) | ||
| 28 | extwl $25,0,$16 # SRM callback function code | ||
| 29 | ldq $3,8($27) # call address | ||
| 30 | extwl $25,2,$25 # argument information (VMS calling std) | ||
| 31 | jmp ($3) # Return directly to caller of wrapper. | ||
| 32 | |||
| 33 | .align 4 | ||
| 34 | .globl srm_fixup | ||
| 35 | .ent srm_fixup | ||
| 36 | srm_fixup: | ||
| 37 | ldgp $29,0($27) | ||
| 38 | #if defined(CONFIG_ALPHA_GENERIC) | ||
| 39 | ldl $4,alpha_using_srm | ||
| 40 | beq $4,nosrm | ||
| 41 | #endif | ||
| 42 | ldq $0,hwrpb | ||
| 43 | ldq $1,HWRPB_CRB_OFFSET($0) | ||
| 44 | addq $0,$1,$2 # CRB address | ||
| 45 | ldq $27,16($2) # VA of FIXUP procedure descriptor | ||
| 46 | ldq $3,8($27) # call address | ||
| 47 | lda $25,2($31) # two integer arguments | ||
| 48 | jmp ($3) # Return directly to caller of srm_fixup. | ||
| 49 | .end srm_fixup | ||
| 50 | |||
| 51 | #if defined(CONFIG_ALPHA_GENERIC) | ||
| 52 | .align 3 | ||
| 53 | nosrm: | ||
| 54 | lda $0,-1($31) | ||
| 55 | ret | ||
| 56 | #endif | ||
| 57 | |||
| 58 | #define CALLBACK(NAME, CODE, ARG_CNT) \ | ||
| 59 | .align 4; .globl callback_##NAME; .ent callback_##NAME; callback_##NAME##: \ | ||
| 60 | ldgp $29,0($27); br $25,srm_dispatch; .word CODE, ARG_CNT; .end callback_##NAME | ||
| 61 | |||
| 62 | #else /* defined(CONFIG_ALPHA_SRM) || defined(CONFIG_ALPHA_GENERIC) */ | ||
| 63 | |||
| 64 | #define CALLBACK(NAME, CODE, ARG_CNT) \ | ||
| 65 | .align 3; .globl callback_##NAME; .ent callback_##NAME; callback_##NAME##: \ | ||
| 66 | lda $0,-1($31); ret; .end callback_##NAME | ||
| 67 | |||
| 68 | .align 3 | ||
| 69 | .globl srm_fixup | ||
| 70 | .ent srm_fixup | ||
| 71 | srm_fixup: | ||
| 72 | lda $0,-1($31) | ||
| 73 | ret | ||
| 74 | .end srm_fixup | ||
| 75 | #endif /* defined(CONFIG_ALPHA_SRM) || defined(CONFIG_ALPHA_GENERIC) */ | ||
| 76 | |||
| 77 | CALLBACK(puts, CCB_PUTS, 4) | ||
| 78 | CALLBACK(open, CCB_OPEN, 3) | ||
| 79 | CALLBACK(close, CCB_CLOSE, 2) | ||
| 80 | CALLBACK(read, CCB_READ, 5) | ||
| 81 | CALLBACK(open_console, CCB_OPEN_CONSOLE, 1) | ||
| 82 | CALLBACK(close_console, CCB_CLOSE_CONSOLE, 1) | ||
| 83 | CALLBACK(getenv, CCB_GET_ENV, 4) | ||
| 84 | CALLBACK(setenv, CCB_SET_ENV, 4) | ||
| 85 | CALLBACK(getc, CCB_GETC, 2) | ||
| 86 | CALLBACK(reset_term, CCB_RESET_TERM, 2) | ||
| 87 | CALLBACK(term_int, CCB_SET_TERM_INT, 3) | ||
| 88 | CALLBACK(term_ctl, CCB_SET_TERM_CTL, 3) | ||
| 89 | CALLBACK(process_keycode, CCB_PROCESS_KEYCODE, 3) | ||
| 90 | CALLBACK(ioctl, CCB_IOCTL, 6) | ||
| 91 | CALLBACK(write, CCB_WRITE, 5) | ||
| 92 | CALLBACK(reset_env, CCB_RESET_ENV, 4) | ||
| 93 | CALLBACK(save_env, CCB_SAVE_ENV, 1) | ||
| 94 | CALLBACK(pswitch, CCB_PSWITCH, 3) | ||
| 95 | CALLBACK(bios_emul, CCB_BIOS_EMUL, 5) | ||
| 96 | |||
| 97 | .data | ||
| 98 | __alpha_using_srm: # For use by bootpheader | ||
| 99 | .long 7 # value is not 1 for link debugging | ||
| 100 | .weak alpha_using_srm; alpha_using_srm = __alpha_using_srm | ||
| 101 | __callback_init_done: # For use by bootpheader | ||
| 102 | .long 7 # value is not 1 for link debugging | ||
| 103 | .weak callback_init_done; callback_init_done = __callback_init_done | ||
| 104 | |||
diff --git a/arch/alpha/lib/checksum.c b/arch/alpha/lib/checksum.c new file mode 100644 index 00000000000..89044e6385f --- /dev/null +++ b/arch/alpha/lib/checksum.c | |||
| @@ -0,0 +1,186 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/checksum.c | ||
| 3 | * | ||
| 4 | * This file contains network checksum routines that are better done | ||
| 5 | * in an architecture-specific manner due to speed.. | ||
| 6 | * Comments in other versions indicate that the algorithms are from RFC1071 | ||
| 7 | * | ||
| 8 | accelerated versions (and 21264 assembly versions) contributed by | ||
| 9 | * Rick Gorton <rick.gorton@alpha-processor.com> | ||
| 10 | */ | ||
| 11 | |||
| 12 | #include <linux/module.h> | ||
| 13 | #include <linux/string.h> | ||
| 14 | |||
| 15 | #include <asm/byteorder.h> | ||
| 16 | |||
| 17 | static inline unsigned short from64to16(unsigned long x) | ||
| 18 | { | ||
| 19 | /* Using extract instructions is a bit more efficient | ||
| 20 | than the original shift/bitmask version. */ | ||
| 21 | |||
| 22 | union { | ||
| 23 | unsigned long ul; | ||
| 24 | unsigned int ui[2]; | ||
| 25 | unsigned short us[4]; | ||
| 26 | } in_v, tmp_v, out_v; | ||
| 27 | |||
| 28 | in_v.ul = x; | ||
| 29 | tmp_v.ul = (unsigned long) in_v.ui[0] + (unsigned long) in_v.ui[1]; | ||
| 30 | |||
| 31 | /* Since the bits of tmp_v.us[3] are going to always be zero, | ||
| 32 | we don't have to bother to add that in. */ | ||
| 33 | out_v.ul = (unsigned long) tmp_v.us[0] + (unsigned long) tmp_v.us[1] | ||
| 34 | + (unsigned long) tmp_v.us[2]; | ||
| 35 | |||
| 36 | /* Similarly, out_v.us[2] is always zero for the final add. */ | ||
| 37 | return out_v.us[0] + out_v.us[1]; | ||
| 38 | } | ||
| 39 | |||
| 40 | /* | ||
| 41 | * computes the checksum of the TCP/UDP pseudo-header | ||
| 42 | * returns a 16-bit checksum, already complemented. | ||
| 43 | */ | ||
| 44 | unsigned short int csum_tcpudp_magic(unsigned long saddr, | ||
| 45 | unsigned long daddr, | ||
| 46 | unsigned short len, | ||
| 47 | unsigned short proto, | ||
| 48 | unsigned int sum) | ||
| 49 | { | ||
| 50 | return ~from64to16(saddr + daddr + sum + | ||
| 51 | ((unsigned long) ntohs(len) << 16) + | ||
| 52 | ((unsigned long) proto << 8)); | ||
| 53 | } | ||
| 54 | |||
| 55 | unsigned int csum_tcpudp_nofold(unsigned long saddr, | ||
| 56 | unsigned long daddr, | ||
| 57 | unsigned short len, | ||
| 58 | unsigned short proto, | ||
| 59 | unsigned int sum) | ||
| 60 | { | ||
| 61 | unsigned long result; | ||
| 62 | |||
| 63 | result = (saddr + daddr + sum + | ||
| 64 | ((unsigned long) ntohs(len) << 16) + | ||
| 65 | ((unsigned long) proto << 8)); | ||
| 66 | |||
| 67 | /* Fold down to 32-bits so we don't lose in the typedef-less | ||
| 68 | network stack. */ | ||
| 69 | /* 64 to 33 */ | ||
| 70 | result = (result & 0xffffffff) + (result >> 32); | ||
| 71 | /* 33 to 32 */ | ||
| 72 | result = (result & 0xffffffff) + (result >> 32); | ||
| 73 | return result; | ||
| 74 | } | ||
| 75 | |||
| 76 | /* | ||
| 77 | * Do a 64-bit checksum on an arbitrary memory area.. | ||
| 78 | * | ||
| 79 | * This isn't a great routine, but it's not _horrible_ either. The | ||
| 80 | * inner loop could be unrolled a bit further, and there are better | ||
| 81 | * ways to do the carry, but this is reasonable. | ||
| 82 | */ | ||
| 83 | static inline unsigned long do_csum(const unsigned char * buff, int len) | ||
| 84 | { | ||
| 85 | int odd, count; | ||
| 86 | unsigned long result = 0; | ||
| 87 | |||
| 88 | if (len <= 0) | ||
| 89 | goto out; | ||
| 90 | odd = 1 & (unsigned long) buff; | ||
| 91 | if (odd) { | ||
| 92 | result = *buff << 8; | ||
| 93 | len--; | ||
| 94 | buff++; | ||
| 95 | } | ||
| 96 | count = len >> 1; /* nr of 16-bit words.. */ | ||
| 97 | if (count) { | ||
| 98 | if (2 & (unsigned long) buff) { | ||
| 99 | result += *(unsigned short *) buff; | ||
| 100 | count--; | ||
| 101 | len -= 2; | ||
| 102 | buff += 2; | ||
| 103 | } | ||
| 104 | count >>= 1; /* nr of 32-bit words.. */ | ||
| 105 | if (count) { | ||
| 106 | if (4 & (unsigned long) buff) { | ||
| 107 | result += *(unsigned int *) buff; | ||
| 108 | count--; | ||
| 109 | len -= 4; | ||
| 110 | buff += 4; | ||
| 111 | } | ||
| 112 | count >>= 1; /* nr of 64-bit words.. */ | ||
| 113 | if (count) { | ||
| 114 | unsigned long carry = 0; | ||
| 115 | do { | ||
| 116 | unsigned long w = *(unsigned long *) buff; | ||
| 117 | count--; | ||
| 118 | buff += 8; | ||
| 119 | result += carry; | ||
| 120 | result += w; | ||
| 121 | carry = (w > result); | ||
| 122 | } while (count); | ||
| 123 | result += carry; | ||
| 124 | result = (result & 0xffffffff) + (result >> 32); | ||
| 125 | } | ||
| 126 | if (len & 4) { | ||
| 127 | result += *(unsigned int *) buff; | ||
| 128 | buff += 4; | ||
| 129 | } | ||
| 130 | } | ||
| 131 | if (len & 2) { | ||
| 132 | result += *(unsigned short *) buff; | ||
| 133 | buff += 2; | ||
| 134 | } | ||
| 135 | } | ||
| 136 | if (len & 1) | ||
| 137 | result += *buff; | ||
| 138 | result = from64to16(result); | ||
| 139 | if (odd) | ||
| 140 | result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); | ||
| 141 | out: | ||
| 142 | return result; | ||
| 143 | } | ||
| 144 | |||
| 145 | /* | ||
| 146 | * This is a version of ip_compute_csum() optimized for IP headers, | ||
| 147 | * which always checksum on 4 octet boundaries. | ||
| 148 | */ | ||
| 149 | unsigned short ip_fast_csum(unsigned char * iph, unsigned int ihl) | ||
| 150 | { | ||
| 151 | return ~do_csum(iph,ihl*4); | ||
| 152 | } | ||
| 153 | |||
| 154 | /* | ||
| 155 | * computes the checksum of a memory block at buff, length len, | ||
| 156 | * and adds in "sum" (32-bit) | ||
| 157 | * | ||
| 158 | * returns a 32-bit number suitable for feeding into itself | ||
| 159 | * or csum_tcpudp_magic | ||
| 160 | * | ||
| 161 | * this function must be called with even lengths, except | ||
| 162 | * for the last fragment, which may be odd | ||
| 163 | * | ||
| 164 | * it's best to have buff aligned on a 32-bit boundary | ||
| 165 | */ | ||
| 166 | unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) | ||
| 167 | { | ||
| 168 | unsigned long result = do_csum(buff, len); | ||
| 169 | |||
| 170 | /* add in old sum, and carry.. */ | ||
| 171 | result += sum; | ||
| 172 | /* 32+c bits -> 32 bits */ | ||
| 173 | result = (result & 0xffffffff) + (result >> 32); | ||
| 174 | return result; | ||
| 175 | } | ||
| 176 | |||
| 177 | EXPORT_SYMBOL(csum_partial); | ||
| 178 | |||
| 179 | /* | ||
| 180 | * this routine is used for miscellaneous IP-like checksums, mainly | ||
| 181 | * in icmp.c | ||
| 182 | */ | ||
| 183 | unsigned short ip_compute_csum(unsigned char * buff, int len) | ||
| 184 | { | ||
| 185 | return ~from64to16(do_csum(buff,len)); | ||
| 186 | } | ||
diff --git a/arch/alpha/lib/clear_page.S b/arch/alpha/lib/clear_page.S new file mode 100644 index 00000000000..a221ae266e2 --- /dev/null +++ b/arch/alpha/lib/clear_page.S | |||
| @@ -0,0 +1,39 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/clear_page.S | ||
| 3 | * | ||
| 4 | * Zero an entire page. | ||
| 5 | */ | ||
| 6 | |||
| 7 | .text | ||
| 8 | .align 4 | ||
| 9 | .global clear_page | ||
| 10 | .ent clear_page | ||
| 11 | clear_page: | ||
| 12 | .prologue 0 | ||
| 13 | |||
| 14 | lda $0,128 | ||
| 15 | nop | ||
| 16 | unop | ||
| 17 | nop | ||
| 18 | |||
| 19 | 1: stq $31,0($16) | ||
| 20 | stq $31,8($16) | ||
| 21 | stq $31,16($16) | ||
| 22 | stq $31,24($16) | ||
| 23 | |||
| 24 | stq $31,32($16) | ||
| 25 | stq $31,40($16) | ||
| 26 | stq $31,48($16) | ||
| 27 | subq $0,1,$0 | ||
| 28 | |||
| 29 | stq $31,56($16) | ||
| 30 | addq $16,64,$16 | ||
| 31 | unop | ||
| 32 | bne $0,1b | ||
| 33 | |||
| 34 | ret | ||
| 35 | nop | ||
| 36 | unop | ||
| 37 | nop | ||
| 38 | |||
| 39 | .end clear_page | ||
diff --git a/arch/alpha/lib/clear_user.S b/arch/alpha/lib/clear_user.S new file mode 100644 index 00000000000..8860316c195 --- /dev/null +++ b/arch/alpha/lib/clear_user.S | |||
| @@ -0,0 +1,113 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/clear_user.S | ||
| 3 | * Contributed by Richard Henderson <rth@tamu.edu> | ||
| 4 | * | ||
| 5 | * Zero user space, handling exceptions as we go. | ||
| 6 | * | ||
| 7 | * We have to make sure that $0 is always up-to-date and contains the | ||
| 8 | * right "bytes left to zero" value (and that it is updated only _after_ | ||
| 9 | * a successful copy). There is also some rather minor exception setup | ||
| 10 | * stuff. | ||
| 11 | * | ||
| 12 | * NOTE! This is not directly C-callable, because the calling semantics | ||
| 13 | * are different: | ||
| 14 | * | ||
| 15 | * Inputs: | ||
| 16 | * length in $0 | ||
| 17 | * destination address in $6 | ||
| 18 | * exception pointer in $7 | ||
| 19 | * return address in $28 (exceptions expect it there) | ||
| 20 | * | ||
| 21 | * Outputs: | ||
| 22 | * bytes left to copy in $0 | ||
| 23 | * | ||
| 24 | * Clobbers: | ||
| 25 | * $1,$2,$3,$4,$5,$6 | ||
| 26 | */ | ||
| 27 | |||
| 28 | /* Allow an exception for an insn; exit if we get one. */ | ||
| 29 | #define EX(x,y...) \ | ||
| 30 | 99: x,##y; \ | ||
| 31 | .section __ex_table,"a"; \ | ||
| 32 | .long 99b - .; \ | ||
| 33 | lda $31, $exception-99b($31); \ | ||
| 34 | .previous | ||
| 35 | |||
| 36 | .set noat | ||
| 37 | .set noreorder | ||
| 38 | .align 4 | ||
| 39 | |||
| 40 | .globl __do_clear_user | ||
| 41 | .ent __do_clear_user | ||
| 42 | .frame $30, 0, $28 | ||
| 43 | .prologue 0 | ||
| 44 | |||
| 45 | $loop: | ||
| 46 | and $1, 3, $4 # e0 : | ||
| 47 | beq $4, 1f # .. e1 : | ||
| 48 | |||
| 49 | 0: EX( stq_u $31, 0($6) ) # e0 : zero one word | ||
| 50 | subq $0, 8, $0 # .. e1 : | ||
| 51 | subq $4, 1, $4 # e0 : | ||
| 52 | addq $6, 8, $6 # .. e1 : | ||
| 53 | bne $4, 0b # e1 : | ||
| 54 | unop # : | ||
| 55 | |||
| 56 | 1: bic $1, 3, $1 # e0 : | ||
| 57 | beq $1, $tail # .. e1 : | ||
| 58 | |||
| 59 | 2: EX( stq_u $31, 0($6) ) # e0 : zero four words | ||
| 60 | subq $0, 8, $0 # .. e1 : | ||
| 61 | EX( stq_u $31, 8($6) ) # e0 : | ||
| 62 | subq $0, 8, $0 # .. e1 : | ||
| 63 | EX( stq_u $31, 16($6) ) # e0 : | ||
| 64 | subq $0, 8, $0 # .. e1 : | ||
| 65 | EX( stq_u $31, 24($6) ) # e0 : | ||
| 66 | subq $0, 8, $0 # .. e1 : | ||
| 67 | subq $1, 4, $1 # e0 : | ||
| 68 | addq $6, 32, $6 # .. e1 : | ||
| 69 | bne $1, 2b # e1 : | ||
| 70 | |||
| 71 | $tail: | ||
| 72 | bne $2, 1f # e1 : is there a tail to do? | ||
| 73 | ret $31, ($28), 1 # .. e1 : | ||
| 74 | |||
| 75 | 1: EX( ldq_u $5, 0($6) ) # e0 : | ||
| 76 | clr $0 # .. e1 : | ||
| 77 | nop # e1 : | ||
| 78 | mskqh $5, $0, $5 # e0 : | ||
| 79 | EX( stq_u $5, 0($6) ) # e0 : | ||
| 80 | ret $31, ($28), 1 # .. e1 : | ||
| 81 | |||
| 82 | __do_clear_user: | ||
| 83 | and $6, 7, $4 # e0 : find dest misalignment | ||
| 84 | beq $0, $zerolength # .. e1 : | ||
| 85 | addq $0, $4, $1 # e0 : bias counter | ||
| 86 | and $1, 7, $2 # e1 : number of bytes in tail | ||
| 87 | srl $1, 3, $1 # e0 : | ||
| 88 | beq $4, $loop # .. e1 : | ||
| 89 | |||
| 90 | EX( ldq_u $5, 0($6) ) # e0 : load dst word to mask back in | ||
| 91 | beq $1, $oneword # .. e1 : sub-word store? | ||
| 92 | |||
| 93 | mskql $5, $6, $5 # e0 : take care of misaligned head | ||
| 94 | addq $6, 8, $6 # .. e1 : | ||
| 95 | EX( stq_u $5, -8($6) ) # e0 : | ||
| 96 | addq $0, $4, $0 # .. e1 : bytes left -= 8 - misalignment | ||
| 97 | subq $1, 1, $1 # e0 : | ||
| 98 | subq $0, 8, $0 # .. e1 : | ||
| 99 | br $loop # e1 : | ||
| 100 | unop # : | ||
| 101 | |||
| 102 | $oneword: | ||
| 103 | mskql $5, $6, $4 # e0 : | ||
| 104 | mskqh $5, $2, $5 # e0 : | ||
| 105 | or $5, $4, $5 # e1 : | ||
| 106 | EX( stq_u $5, 0($6) ) # e0 : | ||
| 107 | clr $0 # .. e1 : | ||
| 108 | |||
| 109 | $zerolength: | ||
| 110 | $exception: | ||
| 111 | ret $31, ($28), 1 # .. e1 : | ||
| 112 | |||
| 113 | .end __do_clear_user | ||
diff --git a/arch/alpha/lib/copy_page.S b/arch/alpha/lib/copy_page.S new file mode 100644 index 00000000000..9f3b97459cc --- /dev/null +++ b/arch/alpha/lib/copy_page.S | |||
| @@ -0,0 +1,49 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/copy_page.S | ||
| 3 | * | ||
| 4 | * Copy an entire page. | ||
| 5 | */ | ||
| 6 | |||
| 7 | .text | ||
| 8 | .align 4 | ||
| 9 | .global copy_page | ||
| 10 | .ent copy_page | ||
| 11 | copy_page: | ||
| 12 | .prologue 0 | ||
| 13 | |||
| 14 | lda $18,128 | ||
| 15 | nop | ||
| 16 | unop | ||
| 17 | nop | ||
| 18 | |||
| 19 | 1: ldq $0,0($17) | ||
| 20 | ldq $1,8($17) | ||
| 21 | ldq $2,16($17) | ||
| 22 | ldq $3,24($17) | ||
| 23 | |||
| 24 | ldq $4,32($17) | ||
| 25 | ldq $5,40($17) | ||
| 26 | ldq $6,48($17) | ||
| 27 | ldq $7,56($17) | ||
| 28 | |||
| 29 | stq $0,0($16) | ||
| 30 | subq $18,1,$18 | ||
| 31 | stq $1,8($16) | ||
| 32 | addq $17,64,$17 | ||
| 33 | |||
| 34 | stq $2,16($16) | ||
| 35 | stq $3,24($16) | ||
| 36 | stq $4,32($16) | ||
| 37 | stq $5,40($16) | ||
| 38 | |||
| 39 | stq $6,48($16) | ||
| 40 | stq $7,56($16) | ||
| 41 | addq $16,64,$16 | ||
| 42 | bne $18, 1b | ||
| 43 | |||
| 44 | ret | ||
| 45 | nop | ||
| 46 | unop | ||
| 47 | nop | ||
| 48 | |||
| 49 | .end copy_page | ||
diff --git a/arch/alpha/lib/copy_user.S b/arch/alpha/lib/copy_user.S new file mode 100644 index 00000000000..6f3fab9eb43 --- /dev/null +++ b/arch/alpha/lib/copy_user.S | |||
| @@ -0,0 +1,145 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/copy_user.S | ||
| 3 | * | ||
| 4 | * Copy to/from user space, handling exceptions as we go.. This | ||
| 5 | * isn't exactly pretty. | ||
| 6 | * | ||
| 7 | * This is essentially the same as "memcpy()", but with a few twists. | ||
| 8 | * Notably, we have to make sure that $0 is always up-to-date and | ||
| 9 | * contains the right "bytes left to copy" value (and that it is updated | ||
| 10 | * only _after_ a successful copy). There is also some rather minor | ||
| 11 | * exception setup stuff.. | ||
| 12 | * | ||
| 13 | * NOTE! This is not directly C-callable, because the calling semantics are | ||
| 14 | * different: | ||
| 15 | * | ||
| 16 | * Inputs: | ||
| 17 | * length in $0 | ||
| 18 | * destination address in $6 | ||
| 19 | * source address in $7 | ||
| 20 | * return address in $28 | ||
| 21 | * | ||
| 22 | * Outputs: | ||
| 23 | * bytes left to copy in $0 | ||
| 24 | * | ||
| 25 | * Clobbers: | ||
| 26 | * $1,$2,$3,$4,$5,$6,$7 | ||
| 27 | */ | ||
| 28 | |||
| 29 | /* Allow an exception for an insn; exit if we get one. */ | ||
| 30 | #define EXI(x,y...) \ | ||
| 31 | 99: x,##y; \ | ||
| 32 | .section __ex_table,"a"; \ | ||
| 33 | .long 99b - .; \ | ||
| 34 | lda $31, $exitin-99b($31); \ | ||
| 35 | .previous | ||
| 36 | |||
| 37 | #define EXO(x,y...) \ | ||
| 38 | 99: x,##y; \ | ||
| 39 | .section __ex_table,"a"; \ | ||
| 40 | .long 99b - .; \ | ||
| 41 | lda $31, $exitout-99b($31); \ | ||
| 42 | .previous | ||
| 43 | |||
| 44 | .set noat | ||
| 45 | .align 4 | ||
| 46 | .globl __copy_user | ||
| 47 | .ent __copy_user | ||
| 48 | __copy_user: | ||
| 49 | .prologue 0 | ||
| 50 | and $6,7,$3 | ||
| 51 | beq $0,$35 | ||
| 52 | beq $3,$36 | ||
| 53 | subq $3,8,$3 | ||
| 54 | .align 4 | ||
| 55 | $37: | ||
| 56 | EXI( ldq_u $1,0($7) ) | ||
| 57 | EXO( ldq_u $2,0($6) ) | ||
| 58 | extbl $1,$7,$1 | ||
| 59 | mskbl $2,$6,$2 | ||
| 60 | insbl $1,$6,$1 | ||
| 61 | addq $3,1,$3 | ||
| 62 | bis $1,$2,$1 | ||
| 63 | EXO( stq_u $1,0($6) ) | ||
| 64 | subq $0,1,$0 | ||
| 65 | addq $6,1,$6 | ||
| 66 | addq $7,1,$7 | ||
| 67 | beq $0,$41 | ||
| 68 | bne $3,$37 | ||
| 69 | $36: | ||
| 70 | and $7,7,$1 | ||
| 71 | bic $0,7,$4 | ||
| 72 | beq $1,$43 | ||
| 73 | beq $4,$48 | ||
| 74 | EXI( ldq_u $3,0($7) ) | ||
| 75 | .align 4 | ||
| 76 | $50: | ||
| 77 | EXI( ldq_u $2,8($7) ) | ||
| 78 | subq $4,8,$4 | ||
| 79 | extql $3,$7,$3 | ||
| 80 | extqh $2,$7,$1 | ||
| 81 | bis $3,$1,$1 | ||
| 82 | EXO( stq $1,0($6) ) | ||
| 83 | addq $7,8,$7 | ||
| 84 | subq $0,8,$0 | ||
| 85 | addq $6,8,$6 | ||
| 86 | bis $2,$2,$3 | ||
| 87 | bne $4,$50 | ||
| 88 | $48: | ||
| 89 | beq $0,$41 | ||
| 90 | .align 4 | ||
| 91 | $57: | ||
| 92 | EXI( ldq_u $1,0($7) ) | ||
| 93 | EXO( ldq_u $2,0($6) ) | ||
| 94 | extbl $1,$7,$1 | ||
| 95 | mskbl $2,$6,$2 | ||
| 96 | insbl $1,$6,$1 | ||
| 97 | bis $1,$2,$1 | ||
| 98 | EXO( stq_u $1,0($6) ) | ||
| 99 | subq $0,1,$0 | ||
| 100 | addq $6,1,$6 | ||
| 101 | addq $7,1,$7 | ||
| 102 | bne $0,$57 | ||
| 103 | br $31,$41 | ||
| 104 | .align 4 | ||
| 105 | $43: | ||
| 106 | beq $4,$65 | ||
| 107 | .align 4 | ||
| 108 | $66: | ||
| 109 | EXI( ldq $1,0($7) ) | ||
| 110 | subq $4,8,$4 | ||
| 111 | EXO( stq $1,0($6) ) | ||
| 112 | addq $7,8,$7 | ||
| 113 | subq $0,8,$0 | ||
| 114 | addq $6,8,$6 | ||
| 115 | bne $4,$66 | ||
| 116 | $65: | ||
| 117 | beq $0,$41 | ||
| 118 | EXI( ldq $2,0($7) ) | ||
| 119 | EXO( ldq $1,0($6) ) | ||
| 120 | mskql $2,$0,$2 | ||
| 121 | mskqh $1,$0,$1 | ||
| 122 | bis $2,$1,$2 | ||
| 123 | EXO( stq $2,0($6) ) | ||
| 124 | bis $31,$31,$0 | ||
| 125 | $41: | ||
| 126 | $35: | ||
| 127 | $exitout: | ||
| 128 | ret $31,($28),1 | ||
| 129 | |||
| 130 | $exitin: | ||
| 131 | /* A stupid byte-by-byte zeroing of the rest of the output | ||
| 132 | buffer. This cures security holes by never leaving | ||
| 133 | random kernel data around to be copied elsewhere. */ | ||
| 134 | |||
| 135 | mov $0,$1 | ||
| 136 | $101: | ||
| 137 | EXO ( ldq_u $2,0($6) ) | ||
| 138 | subq $1,1,$1 | ||
| 139 | mskbl $2,$6,$2 | ||
| 140 | EXO ( stq_u $2,0($6) ) | ||
| 141 | addq $6,1,$6 | ||
| 142 | bgt $1,$101 | ||
| 143 | ret $31,($28),1 | ||
| 144 | |||
| 145 | .end __copy_user | ||
diff --git a/arch/alpha/lib/csum_ipv6_magic.S b/arch/alpha/lib/csum_ipv6_magic.S new file mode 100644 index 00000000000..e09748dbf2e --- /dev/null +++ b/arch/alpha/lib/csum_ipv6_magic.S | |||
| @@ -0,0 +1,92 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/csum_ipv6_magic.S | ||
| 3 | * Contributed by Richard Henderson <rth@tamu.edu> | ||
| 4 | * | ||
| 5 | * unsigned short csum_ipv6_magic(struct in6_addr *saddr, | ||
| 6 | * struct in6_addr *daddr, | ||
| 7 | * __u32 len, | ||
| 8 | * unsigned short proto, | ||
| 9 | * unsigned int csum); | ||
| 10 | */ | ||
| 11 | |||
| 12 | .globl csum_ipv6_magic | ||
| 13 | .align 4 | ||
| 14 | .ent csum_ipv6_magic | ||
| 15 | .frame $30,0,$26,0 | ||
| 16 | csum_ipv6_magic: | ||
| 17 | .prologue 0 | ||
| 18 | |||
| 19 | ldq $0,0($16) # e0 : load src & dst addr words | ||
| 20 | zapnot $20,15,$20 # .. e1 : zero extend incoming csum | ||
| 21 | extqh $18,1,$4 # e0 : byte swap len & proto while we wait | ||
| 22 | ldq $1,8($16) # .. e1 : | ||
| 23 | |||
| 24 | extbl $18,1,$5 # e0 : | ||
| 25 | ldq $2,0($17) # .. e1 : | ||
| 26 | extbl $18,2,$6 # e0 : | ||
| 27 | ldq $3,8($17) # .. e1 : | ||
| 28 | |||
| 29 | extbl $18,3,$18 # e0 : | ||
| 30 | sra $4,32,$4 # e0 : | ||
| 31 | sll $5,16,$5 # e0 : | ||
| 32 | addq $20,$0,$20 # .. e1 : begin summing the words | ||
| 33 | |||
| 34 | sll $6,8,$6 # e0 : | ||
| 35 | cmpult $20,$0,$0 # .. e1 : | ||
| 36 | extwh $19,7,$7 # e0 : | ||
| 37 | or $4,$18,$18 # .. e1 : | ||
| 38 | |||
| 39 | extbl $19,1,$19 # e0 : | ||
| 40 | or $5,$6,$5 # .. e1 : | ||
| 41 | or $18,$5,$18 # e0 : len complete | ||
| 42 | or $19,$7,$19 # .. e1 : | ||
| 43 | |||
| 44 | sll $19,48,$19 # e0 : | ||
| 45 | addq $20,$1,$20 # .. e1 : | ||
| 46 | sra $19,32,$19 # e0 : proto complete | ||
| 47 | cmpult $20,$1,$1 # .. e1 : | ||
| 48 | |||
| 49 | nop # e0 : | ||
| 50 | addq $20,$2,$20 # .. e1 : | ||
| 51 | cmpult $20,$2,$2 # e0 : | ||
| 52 | addq $20,$3,$20 # .. e1 : | ||
| 53 | |||
| 54 | cmpult $20,$3,$3 # e0 : | ||
| 55 | addq $20,$18,$20 # .. e1 : | ||
| 56 | cmpult $20,$18,$18 # e0 : | ||
| 57 | addq $20,$19,$20 # .. e1 : | ||
| 58 | |||
| 59 | cmpult $20,$19,$19 # e0 : | ||
| 60 | addq $0,$1,$0 # .. e1 : merge the carries back into the csum | ||
| 61 | addq $2,$3,$2 # e0 : | ||
| 62 | addq $18,$19,$18 # .. e1 : | ||
| 63 | |||
| 64 | addq $0,$2,$0 # e0 : | ||
| 65 | addq $20,$18,$20 # .. e1 : | ||
| 66 | addq $0,$20,$0 # e0 : | ||
| 67 | unop # : | ||
| 68 | |||
| 69 | extwl $0,2,$2 # e0 : begin folding the 64-bit value | ||
| 70 | zapnot $0,3,$3 # .. e1 : | ||
| 71 | extwl $0,4,$1 # e0 : | ||
| 72 | addq $2,$3,$3 # .. e1 : | ||
| 73 | |||
| 74 | extwl $0,6,$0 # e0 : | ||
| 75 | addq $3,$1,$3 # .. e1 : | ||
| 76 | addq $0,$3,$0 # e0 : | ||
| 77 | unop # : | ||
| 78 | |||
| 79 | extwl $0,2,$1 # e0 : fold 18-bit value | ||
| 80 | zapnot $0,3,$0 # .. e1 : | ||
| 81 | addq $0,$1,$0 # e0 : | ||
| 82 | unop # : | ||
| 83 | |||
| 84 | extwl $0,2,$1 # e0 : fold 17-bit value | ||
| 85 | zapnot $0,3,$0 # .. e1 : | ||
| 86 | addq $0,$1,$0 # e0 : | ||
| 87 | not $0,$0 # e1 : and complement. | ||
| 88 | |||
| 89 | zapnot $0,3,$0 # e0 : | ||
| 90 | ret # .. e1 : | ||
| 91 | |||
| 92 | .end csum_ipv6_magic | ||
diff --git a/arch/alpha/lib/csum_partial_copy.c b/arch/alpha/lib/csum_partial_copy.c new file mode 100644 index 00000000000..a37948f3037 --- /dev/null +++ b/arch/alpha/lib/csum_partial_copy.c | |||
| @@ -0,0 +1,391 @@ | |||
| 1 | /* | ||
| 2 | * csum_partial_copy - do IP checksumming and copy | ||
| 3 | * | ||
| 4 | * (C) Copyright 1996 Linus Torvalds | ||
| 5 | accelerated versions (and 21264 assembly versions) contributed by | ||
| 6 | * Rick Gorton <rick.gorton@alpha-processor.com> | ||
| 7 | * | ||
| 8 | * Don't look at this too closely - you'll go mad. The things | ||
| 9 | * we do for performance.. | ||
| 10 | */ | ||
| 11 | |||
| 12 | #include <linux/types.h> | ||
| 13 | #include <linux/string.h> | ||
| 14 | #include <asm/uaccess.h> | ||
| 15 | |||
| 16 | |||
| 17 | #define ldq_u(x,y) \ | ||
| 18 | __asm__ __volatile__("ldq_u %0,%1":"=r" (x):"m" (*(const unsigned long *)(y))) | ||
| 19 | |||
| 20 | #define stq_u(x,y) \ | ||
| 21 | __asm__ __volatile__("stq_u %1,%0":"=m" (*(unsigned long *)(y)):"r" (x)) | ||
| 22 | |||
| 23 | #define extql(x,y,z) \ | ||
| 24 | __asm__ __volatile__("extql %1,%2,%0":"=r" (z):"r" (x),"r" (y)) | ||
| 25 | |||
| 26 | #define extqh(x,y,z) \ | ||
| 27 | __asm__ __volatile__("extqh %1,%2,%0":"=r" (z):"r" (x),"r" (y)) | ||
| 28 | |||
| 29 | #define mskql(x,y,z) \ | ||
| 30 | __asm__ __volatile__("mskql %1,%2,%0":"=r" (z):"r" (x),"r" (y)) | ||
| 31 | |||
| 32 | #define mskqh(x,y,z) \ | ||
| 33 | __asm__ __volatile__("mskqh %1,%2,%0":"=r" (z):"r" (x),"r" (y)) | ||
| 34 | |||
| 35 | #define insql(x,y,z) \ | ||
| 36 | __asm__ __volatile__("insql %1,%2,%0":"=r" (z):"r" (x),"r" (y)) | ||
| 37 | |||
| 38 | #define insqh(x,y,z) \ | ||
| 39 | __asm__ __volatile__("insqh %1,%2,%0":"=r" (z):"r" (x),"r" (y)) | ||
| 40 | |||
| 41 | |||
| 42 | #define __get_user_u(x,ptr) \ | ||
| 43 | ({ \ | ||
| 44 | long __guu_err; \ | ||
| 45 | __asm__ __volatile__( \ | ||
| 46 | "1: ldq_u %0,%2\n" \ | ||
| 47 | "2:\n" \ | ||
| 48 | ".section __ex_table,\"a\"\n" \ | ||
| 49 | " .long 1b - .\n" \ | ||
| 50 | " lda %0,2b-1b(%1)\n" \ | ||
| 51 | ".previous" \ | ||
| 52 | : "=r"(x), "=r"(__guu_err) \ | ||
| 53 | : "m"(__m(ptr)), "1"(0)); \ | ||
| 54 | __guu_err; \ | ||
| 55 | }) | ||
| 56 | |||
| 57 | #define __put_user_u(x,ptr) \ | ||
| 58 | ({ \ | ||
| 59 | long __puu_err; \ | ||
| 60 | __asm__ __volatile__( \ | ||
| 61 | "1: stq_u %2,%1\n" \ | ||
| 62 | "2:\n" \ | ||
| 63 | ".section __ex_table,\"a\"\n" \ | ||
| 64 | " .long 1b - ." \ | ||
| 65 | " lda $31,2b-1b(%0)\n" \ | ||
| 66 | ".previous" \ | ||
| 67 | : "=r"(__puu_err) \ | ||
| 68 | : "m"(__m(addr)), "rJ"(x), "0"(0)); \ | ||
| 69 | __puu_err; \ | ||
| 70 | }) | ||
| 71 | |||
| 72 | |||
| 73 | static inline unsigned short from64to16(unsigned long x) | ||
| 74 | { | ||
| 75 | /* Using extract instructions is a bit more efficient | ||
| 76 | than the original shift/bitmask version. */ | ||
| 77 | |||
| 78 | union { | ||
| 79 | unsigned long ul; | ||
| 80 | unsigned int ui[2]; | ||
| 81 | unsigned short us[4]; | ||
| 82 | } in_v, tmp_v, out_v; | ||
| 83 | |||
| 84 | in_v.ul = x; | ||
| 85 | tmp_v.ul = (unsigned long) in_v.ui[0] + (unsigned long) in_v.ui[1]; | ||
| 86 | |||
| 87 | /* Since the bits of tmp_v.us[3] are going to always be zero, | ||
| 88 | we don't have to bother to add that in. */ | ||
| 89 | out_v.ul = (unsigned long) tmp_v.us[0] + (unsigned long) tmp_v.us[1] | ||
| 90 | + (unsigned long) tmp_v.us[2]; | ||
| 91 | |||
| 92 | /* Similarly, out_v.us[2] is always zero for the final add. */ | ||
| 93 | return out_v.us[0] + out_v.us[1]; | ||
| 94 | } | ||
| 95 | |||
| 96 | |||
| 97 | |||
| 98 | /* | ||
| 99 | * Ok. This isn't fun, but this is the EASY case. | ||
| 100 | */ | ||
/*
 * Copy quadwords from user space while accumulating them into a
 * 64-bit ones-complement sum.  Both src and dst are 8-byte aligned;
 * 'len' arrives biased by -8 (see do_csum_partial_copy_from_user),
 * so the loop runs while at least one whole quadword remains.
 * Wraparound of the 64-bit sum is caught by the unsigned
 * 'checksum < word' test and re-added as 'carry' (end-around carry).
 * On any __get_user fault, *errp receives the nonzero error; the
 * (partial) checksum is returned regardless.
 */
static inline unsigned long
csum_partial_cfu_aligned(const unsigned long __user *src, unsigned long *dst,
			 long len, unsigned long checksum,
			 int *errp)
{
	unsigned long carry = 0;
	int err = 0;

	while (len >= 0) {
		unsigned long word;
		err |= __get_user(word, src);
		checksum += carry;
		src++;
		checksum += word;
		len -= 8;
		carry = checksum < word;	/* did the add wrap? */
		*dst = word;
		dst++;
	}
	len += 8;	/* undo the bias: 0..7 tail bytes remain */
	checksum += carry;
	if (len) {
		unsigned long word, tmp;
		err |= __get_user(word, src);
		tmp = *dst;
		/* Keep only the 'len' low (valid) source bytes... */
		mskql(word, len, word);
		checksum += word;
		/* ...and preserve the bytes of *dst beyond them. */
		mskqh(tmp, len, tmp);
		carry = checksum < word;
		*dst = word | tmp;
		checksum += carry;
	}
	if (err) *errp = err;
	return checksum;
}
| 136 | |||
| 137 | /* | ||
| 138 | * This is even less fun, but this is still reasonably | ||
| 139 | * easy. | ||
| 140 | */ | ||
/*
 * dst is quadword-aligned but src is not: soff (1..7) is src's byte
 * offset within its quadword.  Each output word is assembled from two
 * successive aligned source loads using the Alpha byte-extract
 * primitives extql/extqh.  'lastsrc' addresses the last source byte,
 * so the final partial load hits the correct quadword without reading
 * past the buffer.  Carries fold end-around as in
 * csum_partial_cfu_aligned; faults accumulate into *errp.
 */
static inline unsigned long
csum_partial_cfu_dest_aligned(const unsigned long __user *src,
			      unsigned long *dst,
			      unsigned long soff,
			      long len, unsigned long checksum,
			      int *errp)
{
	unsigned long first;
	unsigned long word, carry;
	unsigned long lastsrc = 7+len+(unsigned long)src;	/* last byte's addr */
	int err = 0;

	err |= __get_user_u(first,src);
	carry = 0;
	while (len >= 0) {
		unsigned long second;

		err |= __get_user_u(second, src+1);
		extql(first, soff, word);	/* low piece from 1st quad */
		len -= 8;
		src++;
		extqh(second, soff, first);	/* high piece from 2nd quad */
		checksum += carry;
		word |= first;
		first = second;			/* reuse this load next round */
		checksum += word;
		*dst = word;
		dst++;
		carry = checksum < word;
	}
	len += 8;	/* undo the bias: 0..7 tail bytes remain */
	checksum += carry;
	if (len) {
		unsigned long tmp;
		unsigned long second;
		err |= __get_user_u(second, lastsrc);
		tmp = *dst;
		extql(first, soff, word);
		extqh(second, soff, first);
		word |= first;
		mskql(word, len, word);		/* keep only the valid bytes */
		checksum += word;
		mskqh(tmp, len, tmp);		/* preserve dst's tail bytes */
		carry = checksum < word;
		*dst = word | tmp;
		checksum += carry;
	}
	if (err) *errp = err;
	return checksum;
}
| 191 | |||
| 192 | /* | ||
| 193 | * This is slightly less fun than the above.. | ||
| 194 | */ | ||
/*
 * src is quadword-aligned but dst is not: doff (1..7) is dst's byte
 * offset.  Each source word is split with insql/insqh across two
 * destination quadwords; 'partial_dest' carries the pending high part
 * between iterations and is primed by the caller with dst's current
 * first quadword, so the bytes below doff survive.  The tail handling
 * distinguishes whether the last bytes spill into one more
 * destination quadword or not.
 */
static inline unsigned long
csum_partial_cfu_src_aligned(const unsigned long __user *src,
			     unsigned long *dst,
			     unsigned long doff,
			     long len, unsigned long checksum,
			     unsigned long partial_dest,
			     int *errp)
{
	unsigned long carry = 0;
	unsigned long word;
	unsigned long second_dest;
	int err = 0;

	mskql(partial_dest, doff, partial_dest);	/* keep bytes below doff */
	while (len >= 0) {
		err |= __get_user(word, src);
		len -= 8;
		insql(word, doff, second_dest);		/* low piece -> this quad */
		checksum += carry;
		stq_u(partial_dest | second_dest, dst);
		src++;
		checksum += word;
		insqh(word, doff, partial_dest);	/* high piece -> next quad */
		carry = checksum < word;
		dst++;
	}
	len += 8;	/* undo the bias: 0..7 tail source bytes remain */
	if (len) {
		checksum += carry;
		err |= __get_user(word, src);
		mskql(word, len, word);		/* only 'len' source bytes count */
		len -= 8;
		checksum += word;
		insql(word, doff, second_dest);
		len += doff;	/* >= 0: tail extends past this dest quad */
		carry = checksum < word;
		partial_dest |= second_dest;
		if (len >= 0) {
			stq_u(partial_dest, dst);
			if (!len) goto out;	/* ended exactly on a boundary */
			dst++;
			insqh(word, doff, partial_dest);
		}
		doff = len;
	}
	/* Merge the pending partial quadword with dst's existing tail. */
	ldq_u(second_dest, dst);
	mskqh(second_dest, doff, second_dest);
	stq_u(partial_dest | second_dest, dst);
out:
	checksum += carry;
	if (err) *errp = err;
	return checksum;
}
| 248 | |||
| 249 | /* | ||
| 250 | * This is so totally un-fun that it's frightening. Don't | ||
| 251 | * look at this too closely, you'll go blind. | ||
| 252 | */ | ||
/*
 * Neither src nor dst is quadword-aligned: combine both techniques
 * above -- extql/extqh to assemble each word from two unaligned
 * source loads, then insql/insqh to scatter it across two unaligned
 * destination quadwords.  After the loop, len+doff decides whether
 * the tail spans two destination quadwords (if-branch) or fits in
 * one (else-branch).
 */
static inline unsigned long
csum_partial_cfu_unaligned(const unsigned long __user * src,
			   unsigned long * dst,
			   unsigned long soff, unsigned long doff,
			   long len, unsigned long checksum,
			   unsigned long partial_dest,
			   int *errp)
{
	unsigned long carry = 0;
	unsigned long first;
	unsigned long lastsrc;
	int err = 0;

	err |= __get_user_u(first, src);
	lastsrc = 7+len+(unsigned long)src;	/* last source byte's addr */
	mskql(partial_dest, doff, partial_dest);	/* keep bytes below doff */
	while (len >= 0) {
		unsigned long second, word;
		unsigned long second_dest;

		err |= __get_user_u(second, src+1);
		extql(first, soff, word);
		checksum += carry;
		len -= 8;
		extqh(second, soff, first);
		src++;
		word |= first;
		first = second;
		insql(word, doff, second_dest);
		checksum += word;
		stq_u(partial_dest | second_dest, dst);
		carry = checksum < word;
		insqh(word, doff, partial_dest);
		dst++;
	}
	/* len is still biased by -8 here, so len+doff >= 0 exactly when
	   the tail bytes spill into a second destination quadword.  The
	   later len-doff recovers the biased count; the msk* macros use
	   only its low bits, which equal the tail byte count mod 8. */
	len += doff;
	checksum += carry;
	if (len >= 0) {
		unsigned long second, word;
		unsigned long second_dest;

		err |= __get_user_u(second, lastsrc);
		extql(first, soff, word);
		extqh(second, soff, first);
		word |= first;
		first = second;
		mskql(word, len-doff, word);
		checksum += word;
		insql(word, doff, second_dest);
		carry = checksum < word;
		stq_u(partial_dest | second_dest, dst);
		if (len) {
			/* Spill into the next quad, preserving its tail. */
			ldq_u(second_dest, dst+1);
			insqh(word, doff, partial_dest);
			mskqh(second_dest, len, second_dest);
			stq_u(partial_dest | second_dest, dst+1);
		}
		checksum += carry;
	} else {
		unsigned long second, word;
		unsigned long second_dest;

		err |= __get_user_u(second, lastsrc);
		extql(first, soff, word);
		extqh(second, soff, first);
		word |= first;
		ldq_u(second_dest, dst);
		mskql(word, len-doff, word);
		checksum += word;
		mskqh(second_dest, len, second_dest);
		carry = checksum < word;
		insql(word, doff, word);
		stq_u(partial_dest | word | second_dest, dst);
		checksum += carry;
	}
	if (err) *errp = err;
	return checksum;
}
| 331 | |||
/*
 * Copy len bytes from user space while computing the 64-bit
 * ones-complement partial checksum of the data, then fold it to
 * 16 bits.  Dispatches on the low 3 bits of src and dst to one of
 * the four alignment-specialized helpers above; the helpers take a
 * byte count biased by -8 so their main loops can test 'len >= 0'
 * for "a whole quadword remains".  For len == 0 nothing is copied
 * and the (truncated) incoming sum is returned unchanged.
 */
static unsigned int
do_csum_partial_copy_from_user(const char __user *src, char *dst, int len,
			       unsigned int sum, int *errp)
{
	unsigned long checksum = (unsigned) sum;
	unsigned long soff = 7 & (unsigned long) src;	/* src misalignment */
	unsigned long doff = 7 & (unsigned long) dst;	/* dst misalignment */

	if (len) {
		if (!doff) {
			if (!soff)
				checksum = csum_partial_cfu_aligned(
					(const unsigned long __user *) src,
					(unsigned long *) dst,
					len-8, checksum, errp);
			else
				checksum = csum_partial_cfu_dest_aligned(
					(const unsigned long __user *) src,
					(unsigned long *) dst,
					soff, len-8, checksum, errp);
		} else {
			unsigned long partial_dest;
			/* Preload dst's first quadword so the helpers can
			   merge around the bytes they must not disturb. */
			ldq_u(partial_dest, dst);
			if (!soff)
				checksum = csum_partial_cfu_src_aligned(
					(const unsigned long __user *) src,
					(unsigned long *) dst,
					doff, len-8, checksum,
					partial_dest, errp);
			else
				checksum = csum_partial_cfu_unaligned(
					(const unsigned long __user *) src,
					(unsigned long *) dst,
					soff, doff, len-8, checksum,
					partial_dest, errp);
		}
		checksum = from64to16 (checksum);
	}
	return checksum;
}
| 372 | |||
| 373 | unsigned int | ||
| 374 | csum_partial_copy_from_user(const char __user *src, char *dst, int len, | ||
| 375 | unsigned int sum, int *errp) | ||
| 376 | { | ||
| 377 | if (!access_ok(VERIFY_READ, src, len)) { | ||
| 378 | *errp = -EFAULT; | ||
| 379 | memset(dst, 0, len); | ||
| 380 | return sum; | ||
| 381 | } | ||
| 382 | |||
| 383 | return do_csum_partial_copy_from_user(src, dst, len, sum, errp); | ||
| 384 | } | ||
| 385 | |||
/*
 * Same as csum_partial_copy_from_user but without the access_ok
 * check, for callers whose source pointer is already known valid.
 *
 * NOTE(review): errp is passed as NULL; the helpers write *errp only
 * when a __get_user faults, which presumably cannot happen for the
 * addresses used here -- verify, since a fault would dereference NULL.
 */
unsigned int
csum_partial_copy_nocheck(const char __user *src, char *dst, int len,
			  unsigned int sum)
{
	return do_csum_partial_copy_from_user(src, dst, len, sum, NULL);
}
diff --git a/arch/alpha/lib/dbg_current.S b/arch/alpha/lib/dbg_current.S new file mode 100644 index 00000000000..e6d071015f9 --- /dev/null +++ b/arch/alpha/lib/dbg_current.S | |||
| @@ -0,0 +1,29 @@ | |||
/*
 * arch/alpha/lib/dbg_current.S
 * Contributed by Richard Henderson (rth@cygnus.com)
 *
 * Trap if we find current not correct.
 *
 * Linked in as the profiling hook (_mcount), so this runs on entry
 * to every profiled function.  On Alpha, $8 holds the current task
 * pointer, which sits at the base of the 16KB kernel stack; sanity
 * check it against the stack pointer and PAL_bugchk on mismatch.
 */

#include <asm/pal.h>

	.text
	.set noat

	.globl _mcount
	.ent _mcount
_mcount:
	.frame $30, 0, $28, 0
	.prologue 0

	lda	$0, -0x4000($30)	# $0 = sp - 16KB
	cmpult	$8, $30, $1		# $1 = (current < sp)
	cmpule	$0, $30, $2		# NOTE(review): compares sp-16KB with
					# sp, which looks trivially true; one
					# would expect $8 here -- verify.
	and	$1, $2, $3
	bne	$3, 1f			# both hold: current looks sane

	call_pal PAL_bugchk		# otherwise trap to the firmware

1:	ret	$31, ($28), 1		# _mcount returns via $28, not $26

	.end _mcount
diff --git a/arch/alpha/lib/dbg_stackcheck.S b/arch/alpha/lib/dbg_stackcheck.S new file mode 100644 index 00000000000..cc5ce3a5fca --- /dev/null +++ b/arch/alpha/lib/dbg_stackcheck.S | |||
| @@ -0,0 +1,27 @@ | |||
/*
 * arch/alpha/lib/dbg_stackcheck.S
 * Contributed by Richard Henderson (rth@tamu.edu)
 *
 * Verify that we have not overflowed the stack.  Oops if we have.
 *
 * Installed as the profiling hook (_mcount), so it runs on entry to
 * every profiled function.  $8 is the current task pointer; TASK_SIZE
 * is an asm-offsets constant -- presumably the lowest legal stack
 * offset above the task struct (verify against asm_offsets).
 */

#include <asm/asm_offsets.h>

	.text
	.set noat

	.align 3
	.globl _mcount
	.ent _mcount
_mcount:
	.frame $30, 0, $28, 0
	.prologue 0

	lda	$0, TASK_SIZE($8)	# $0 = task pointer + TASK_SIZE
	cmpult	$30, $0, $0		# has sp dipped below that limit?
	bne	$0, 1f
	ret	($28)			# no: return to caller via $28
1:	stq	$31, -8($31)		# yes: store to a bad address -> oops
	br	1b			# and keep faulting if it returns

	.end _mcount
diff --git a/arch/alpha/lib/dbg_stackkill.S b/arch/alpha/lib/dbg_stackkill.S new file mode 100644 index 00000000000..e09f2ae1e09 --- /dev/null +++ b/arch/alpha/lib/dbg_stackkill.S | |||
| @@ -0,0 +1,35 @@ | |||
/*
 * arch/alpha/lib/dbg_stackkill.S
 * Contributed by Richard Henderson (rth@cygnus.com)
 *
 * Clobber the balance of the kernel stack, hoping to catch
 * uninitialized local variables in the act.
 *
 * Installed as the profiling hook (_mcount).  Fills every unused
 * stack quadword -- from just above the task struct up to the current
 * stack pointer -- with an obviously-bogus poison pattern built from
 * 0xdeadbeef.
 */

#include <asm/asm_offsets.h>

	.text
	.set noat

	.align 5
	.globl _mcount
	.ent _mcount
_mcount:
	.frame $30, 0, $28, 0
	.prologue 0

	ldi	$0, 0xdeadbeef		# build the poison pattern...
	lda	$2, -STACK_SIZE		# mask to round sp down to stack base
	sll	$0, 32, $1
	and	$30, $2, $2		# $2 = base of the current stack
	or	$0, $1, $0		# ...pattern complete
	lda	$2, TASK_SIZE($2)	# skip the task struct at the base
	cmpult	$2, $30, $1		# anything between there and sp?
	beq	$1, 2f
1:	stq	$0, 0($2)		# poison one quadword
	addq	$2, 8, $2
	cmpult	$2, $30, $1
	bne	$1, 1b
2:	ret	($28)

	.end _mcount
diff --git a/arch/alpha/lib/dec_and_lock.c b/arch/alpha/lib/dec_and_lock.c new file mode 100644 index 00000000000..6ae2500a9d9 --- /dev/null +++ b/arch/alpha/lib/dec_and_lock.c | |||
| @@ -0,0 +1,42 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/dec_and_lock.c | ||
| 3 | * | ||
| 4 | * ll/sc version of atomic_dec_and_lock() | ||
| 5 | * | ||
| 6 | */ | ||
| 7 | |||
| 8 | #include <linux/spinlock.h> | ||
| 9 | #include <asm/atomic.h> | ||
| 10 | |||
/*
 * Fast path for atomic_dec_and_lock(), written as toplevel asm so it
 * can branch straight into the C slow path below.  LL/SC-decrement
 * the counter; as long as the new count is nonzero, publish it
 * (stl_c), issue the memory barrier, and return 0 without touching
 * the lock.  If the count would reach zero, re-establish the GP and
 * branch to atomic_dec_and_lock_1's "..ng" (no-GP) entry, which
 * takes the spinlock and redoes the decrement under it.  Label 4:
 * retries the LL/SC on store-conditional failure; it lives in
 * .subsection 2 to keep the cold retry branch off the hot path.
 */
asm (".text					\n\
	.global _atomic_dec_and_lock		\n\
	.ent _atomic_dec_and_lock		\n\
	.align	4				\n\
_atomic_dec_and_lock:				\n\
	.prologue 0				\n\
1:	ldl_l	$1, 0($16)			\n\
	subl	$1, 1, $1			\n\
	beq	$1, 2f				\n\
	stl_c	$1, 0($16)			\n\
	beq	$1, 4f				\n\
	mb					\n\
	clr	$0				\n\
	ret					\n\
2:	br	$29, 3f				\n\
3:	ldgp	$29, 0($29)			\n\
	br	$atomic_dec_and_lock_1..ng	\n\
	.subsection 2				\n\
4:	br	1b				\n\
	.previous				\n\
	.end _atomic_dec_and_lock");
| 32 | |||
/*
 * Slow path, reached from the assembly above only when the count is
 * about to hit zero: take the lock and redo the decrement under it.
 * Returns 1 with the lock held if the count reached zero, otherwise
 * drops the lock and returns 0.  __attribute_used__ keeps this
 * "unreferenced" static function from being discarded, since it is
 * only ever entered from the asm stub.
 */
static int __attribute_used__
atomic_dec_and_lock_1(atomic_t *atomic, spinlock_t *lock)
{
	/* Slow path */
	spin_lock(lock);
	if (atomic_dec_and_test(atomic))
		return 1;	/* hit zero: return with the lock held */
	spin_unlock(lock);
	return 0;
}
diff --git a/arch/alpha/lib/divide.S b/arch/alpha/lib/divide.S new file mode 100644 index 00000000000..2d1a0484a99 --- /dev/null +++ b/arch/alpha/lib/divide.S | |||
| @@ -0,0 +1,195 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/divide.S | ||
| 3 | * | ||
| 4 | * (C) 1995 Linus Torvalds | ||
| 5 | * | ||
| 6 | * Alpha division.. | ||
| 7 | */ | ||
| 8 | |||
| 9 | /* | ||
| 10 | * The alpha chip doesn't provide hardware division, so we have to do it | ||
| 11 | * by hand. The compiler expects the functions | ||
| 12 | * | ||
| 13 | * __divqu: 64-bit unsigned long divide | ||
| 14 | * __remqu: 64-bit unsigned long remainder | ||
| 15 | * __divqs/__remqs: signed 64-bit | ||
| 16 | * __divlu/__remlu: unsigned 32-bit | ||
| 17 | * __divls/__remls: signed 32-bit | ||
| 18 | * | ||
| 19 | * These are not normal C functions: instead of the normal | ||
| 20 | * calling sequence, these expect their arguments in registers | ||
| 21 | * $24 and $25, and return the result in $27. Register $28 may | ||
| 22 | * be clobbered (assembly temporary), anything else must be saved. | ||
| 23 | * | ||
| 24 | * In short: painful. | ||
| 25 | * | ||
| 26 | * This is a rather simple bit-at-a-time algorithm: it's very good | ||
| 27 | * at dividing random 64-bit numbers, but the more usual case where | ||
| 28 | * the divisor is small is handled better by the DEC algorithm | ||
| 29 | * using lookup tables. This uses much less memory, though, and is | ||
| 30 | * nicer on the cache.. Besides, I don't know the copyright status | ||
| 31 | * of the DEC code. | ||
| 32 | */ | ||
| 33 | |||
| 34 | /* | ||
| 35 | * My temporaries: | ||
| 36 | * $0 - current bit | ||
| 37 | * $1 - shifted divisor | ||
| 38 | * $2 - modulus/quotient | ||
| 39 | * | ||
| 40 | * $23 - return address | ||
| 41 | * $24 - dividend | ||
| 42 | * $25 - divisor | ||
| 43 | * | ||
| 44 | * $27 - quotient/modulus | ||
| 45 | * $28 - compare status | ||
| 46 | */ | ||
| 47 | |||
/* "halt": an all-zero instruction word, which is an illegal opcode.  */
#define halt .long 0

/*
 * Select function type and registers.
 * This file is assembled several times: with/without DIV to produce
 * the divide vs. remainder entry points, and with/without INTSIZE
 * for the 32-bit vs. 64-bit variants (see the Makefile).
 */
#define mask	$0
#define divisor	$1
#define compare $28
#define tmp1	$3
#define tmp2	$4

#ifdef DIV
/* Divide entry points: the quotient is the result register $27.  */
#define DIV_ONLY(x,y...) x,##y
#define MOD_ONLY(x,y...)
#define func(x) __div##x
#define modulus $2
#define quotient $27
#define GETSIGN(x) xor $24,$25,x	/* quotient sign = sign(a)^sign(b) */
#define STACK 48
#else
/* Remainder entry points: the modulus is the result register $27.  */
#define DIV_ONLY(x,y...)
#define MOD_ONLY(x,y...) x,##y
#define func(x) __rem##x
#define modulus $27
#define quotient $2
#define GETSIGN(x) bis $24,$24,x	/* remainder takes the sign of a */
#define STACK 32
#endif

/*
 * For 32-bit operations, we need to extend to 64-bit
 */
#ifdef INTSIZE
#define ufunction func(lu)
#define sfunction func(l)
#define LONGIFY(x) zapnot x,15,x	/* zero-extend the low 32 bits */
#define SLONGIFY(x) addl x,0,x		/* sign-extend the low 32 bits */
#else
#define ufunction func(qu)
#define sfunction func(q)
#define LONGIFY(x)
#define SLONGIFY(x)
#endif
| 91 | |||
/*
 * Unsigned divide/remainder millicode (__divqu/__remqu/__divlu/
 * __remlu, depending on DIV/INTSIZE).  Arguments arrive in $24/$25,
 * result returns in $27, return address in $23; only $28 may be
 * clobbered, so every other register used is saved on a small frame.
 * Classic bit-at-a-time algorithm: shift the divisor (and a mask bit)
 * left past the dividend, then walk back down subtracting.
 */
	.set noat
	.align	3
	.globl	ufunction
	.ent	ufunction
ufunction:
	subq	$30,STACK,$30
	.frame	$30,STACK,$23
	.prologue 0

	/* Label 7 is also the entry used by sfunction below (bge ..,7b)
	   once it has allocated an identical frame.  */
7:	stq	$1, 0($30)
	bis	$25,$25,divisor
	stq	$2, 8($30)
	bis	$24,$24,modulus
	stq	$0,16($30)
	bis	$31,$31,quotient
	LONGIFY(divisor)
	stq	tmp1,24($30)
	LONGIFY(modulus)
	bis	$31,1,mask
	DIV_ONLY(stq tmp2,32($30))
	beq	divisor, 9f			/* div by zero */

#ifdef INTSIZE
	/*
	 * shift divisor left, using 3-bit shifts for
	 * 32-bit divides as we can't overflow. Three-bit
	 * shifts will result in looping three times less
	 * here, but can result in two loops more later.
	 * Thus using a large shift isn't worth it (and
	 * s8add pairs better than a sll..)
	 */
1:	cmpult	divisor,modulus,compare
	s8addq	divisor,$31,divisor
	s8addq	mask,$31,mask
	bne	compare,1b
#else
1:	cmpult	divisor,modulus,compare
	blt	divisor, 2f
	addq	divisor,divisor,divisor
	addq	mask,mask,mask
	bne	compare,1b
	unop
#endif

	/* ok, start to go right again.. */
	/* Each step: if divisor <= modulus, subtract it and set the
	   current mask bit in the quotient; then shift both right.  */
2:	DIV_ONLY(addq quotient,mask,tmp2)
	srl	mask,1,mask
	cmpule	divisor,modulus,compare
	subq	modulus,divisor,tmp1
	DIV_ONLY(cmovne compare,tmp2,quotient)
	srl	divisor,1,divisor
	cmovne	compare,tmp1,modulus
	bne	mask,2b

	/* Restore the saved registers and return.  On divide-by-zero we
	   land here directly, leaving the result register untouched.  */
9:	ldq	$1, 0($30)
	ldq	$2, 8($30)
	ldq	$0,16($30)
	ldq	tmp1,24($30)
	DIV_ONLY(ldq tmp2,32($30))
	addq	$30,STACK,$30
	ret	$31,($23),1
	.end	ufunction
| 154 | |||
/*
 * Uhh.. Ugly signed division. I'd rather not have it at all, but
 * it's needed in some circumstances. There are different ways to
 * handle this, really. This does:
 *	-a / b = a / -b = -(a / b)
 *	-a % b = -(a % b)
 *	a % -b = a % b
 * which is probably not the best solution, but at least should
 * have the property that (x/y)*y + (x%y) = x.
 *
 * Strategy: if both operands are non-negative, fall straight into the
 * unsigned routine's body (7b), reusing the frame allocated here.
 * Otherwise take absolute values, call ufunction, and patch the sign
 * of the result according to GETSIGN.
 */
	.align	3
	.globl	sfunction
	.ent	sfunction
sfunction:
	subq	$30,STACK,$30
	.frame	$30,STACK,$23
	.prologue 0
	bis	$24,$25,$28		/* sign bit set iff either is negative */
	SLONGIFY($28)
	bge	$28,7b			/* both non-negative: pure unsigned case */
	stq	$24,0($30)
	subq	$31,$24,$28
	stq	$25,8($30)
	cmovlt	$24,$28,$24		/* abs($24) */
	stq	$23,16($30)
	subq	$31,$25,$28
	stq	tmp1,24($30)
	cmovlt	$25,$28,$25		/* abs($25) */
	unop
	bsr	$23,ufunction
	ldq	$24,0($30)
	ldq	$25,8($30)
	GETSIGN($28)			/* sign the result should carry */
	subq	$31,$27,tmp1
	SLONGIFY($28)
	ldq	$23,16($30)
	cmovlt	$28,tmp1,$27		/* negate the result if needed */
	ldq	tmp1,24($30)
	addq	$30,STACK,$30
	ret	$31,($23),1
	.end	sfunction
diff --git a/arch/alpha/lib/ev6-clear_page.S b/arch/alpha/lib/ev6-clear_page.S new file mode 100644 index 00000000000..adf4f7be0e2 --- /dev/null +++ b/arch/alpha/lib/ev6-clear_page.S | |||
| @@ -0,0 +1,54 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/ev6-clear_page.S | ||
| 3 | * | ||
| 4 | * Zero an entire page. | ||
| 5 | */ | ||
| 6 | |||
	/*
	 * Zero one 8KB page ($16 = page address): 128 iterations of
	 * eight stq's = 64 bytes per trip.  wh64 (write-hint) tells
	 * the ev6 memory system the whole cache block will be written,
	 * avoiding the read-for-ownership.  $17 is the wh64 target,
	 * kept 192 bytes (3 blocks) ahead; the $1/cmovgt pair stops
	 * advancing it for the last few trips so the hint never
	 * touches past the end of the page.
	 */
	.text
	.align 4
	.global clear_page
	.ent clear_page
clear_page:
	.prologue 0

	lda	$0,128		# loop count: 128 x 64 bytes = 8KB
	lda	$1,125		# trips after which $17 stops advancing
	addq	$16,64,$2
	addq	$16,128,$3

	addq	$16,192,$17
	wh64	($16)		# hint the first three blocks up front
	wh64	($2)
	wh64	($3)

1:	wh64	($17)		# hint the block three trips ahead
	stq	$31,0($16)
	subq	$0,1,$0
	subq	$1,1,$1

	stq	$31,8($16)
	stq	$31,16($16)
	addq	$17,64,$2
	nop

	stq	$31,24($16)
	stq	$31,32($16)
	cmovgt	$1,$2,$17	# advance the hint target only while $1 > 0
	nop

	stq	$31,40($16)
	stq	$31,48($16)
	nop
	nop

	stq	$31,56($16)
	addq	$16,64,$16
	nop
	bne	$0,1b

	ret
	nop
	nop
	nop

	.end clear_page
diff --git a/arch/alpha/lib/ev6-clear_user.S b/arch/alpha/lib/ev6-clear_user.S new file mode 100644 index 00000000000..4f42a16b7f5 --- /dev/null +++ b/arch/alpha/lib/ev6-clear_user.S | |||
| @@ -0,0 +1,225 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/ev6-clear_user.S | ||
| 3 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
| 4 | * | ||
| 5 | * Zero user space, handling exceptions as we go. | ||
| 6 | * | ||
| 7 | * We have to make sure that $0 is always up-to-date and contains the | ||
| 8 | * right "bytes left to zero" value (and that it is updated only _after_ | ||
| 9 | * a successful copy). There is also some rather minor exception setup | ||
| 10 | * stuff. | ||
| 11 | * | ||
| 12 | * NOTE! This is not directly C-callable, because the calling semantics | ||
| 13 | * are different: | ||
| 14 | * | ||
| 15 | * Inputs: | ||
| 16 | * length in $0 | ||
| 17 | * destination address in $6 | ||
| 18 | * exception pointer in $7 | ||
| 19 | * return address in $28 (exceptions expect it there) | ||
| 20 | * | ||
| 21 | * Outputs: | ||
| 22 | * bytes left to copy in $0 | ||
| 23 | * | ||
| 24 | * Clobbers: | ||
| 25 | * $1,$2,$3,$4,$5,$6 | ||
| 26 | * | ||
| 27 | * Much of the information about 21264 scheduling/coding comes from: | ||
| 28 | * Compiler Writer's Guide for the Alpha 21264 | ||
| 29 | * abbreviated as 'CWG' in other comments here | ||
| 30 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
| 31 | * Scheduling notation: | ||
| 32 | * E - either cluster | ||
| 33 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
| 34 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
| 35 | * Try not to change the actual algorithm if possible for consistency. | ||
| 36 | * Determining actual stalls (other than slotting) doesn't appear to be easy to do. | ||
| 37 | * From perusing the source code context where this routine is called, it is | ||
| 38 | * a fair assumption that significant fractions of entire pages are zeroed, so | ||
| 39 | * it's going to be worth the effort to hand-unroll a big loop, and use wh64. | ||
| 40 | * ASSUMPTION: | ||
| 41 | * The believed purpose of only updating $0 after a store is that a signal | ||
| 42 | * may come along during the execution of this chunk of code, and we don't | ||
| 43 | * want to leave a hole (and we also want to avoid repeating lots of work) | ||
| 44 | */ | ||
| 45 | |||
/* Allow an exception for an insn; exit if we get one.  The __ex_table
   entry records the PC-relative location of the trapping insn
   (.long 99b - .) and, in the lda slot, the displacement from the
   insn to the $exception fixup label; the $31 destination means no
   register needs patching ($0 already holds the bytes-left count). */
#define EX(x,y...)			\
	99: x,##y;			\
	.section __ex_table,"a";	\
	.long 99b - .;			\
	lda $31, $exception-99b($31); 	\
	.previous
| 53 | |||
	.set noat
	.set noreorder
	.align 4

	.globl __do_clear_user
	.ent __do_clear_user
	.frame	$30, 0, $28
	.prologue 0

				# Pipeline info : Slotting & Comments
__do_clear_user:
	and	$6, 7, $4	# .. E  .. ..	: find dest head misalignment
	beq	$0, $zerolength # U  .. .. ..	:  U L U L

	addq	$0, $4, $1	# .. .. .. E	: bias counter
	and	$1, 7, $2	# .. .. E  ..	: number of misaligned bytes in tail
# Note - we never actually use $2, so this is a moot computation
# and we can rewrite this later...
	srl	$1, 3, $1	# .. E  .. ..	: number of quadwords to clear
	beq	$4, $headalign	# U  .. .. ..	: U L U L

/*
 * Head is not aligned.  Write (8 - $4) bytes to head of destination
 * This means $6 is known to be misaligned
 */
	EX( ldq_u $5, 0($6) )	# .. .. .. L	: load dst word to mask back in
	beq	$1, $onebyte	# .. .. U  ..	: sub-word store?
	mskql	$5, $6, $5	# .. U  .. ..	: take care of misaligned head
	addq	$6, 8, $6	# E  .. .. ..	: L U U L

	EX( stq_u $5, -8($6) )	# .. .. .. L	:
	subq	$1, 1, $1	# .. .. E  ..	:
	addq	$0, $4, $0	# .. E  .. ..	: bytes left -= 8 - misalignment
	subq	$0, 8, $0	# E  .. .. ..	: U L U L

	.align	4
/*
 * (The .align directive ought to be a moot point)
 * values upon initial entry to the loop
 * $1 is number of quadwords to clear (zero is a valid value)
 * $2 is number of trailing bytes (0..7) ($2 never used...)
 * $6 is known to be aligned 0mod8
 */
$headalign:
	subq	$1, 16, $4	# .. .. .. E	: If < 16, we can not use the huge loop
	and	$6, 0x3f, $2	# .. .. E  ..	: Forward work for huge loop
	subq	$2, 0x40, $3	# .. E  .. ..	: bias counter (huge loop)
	blt	$4, $trailquad	# U  .. .. ..	: U L U L

/*
 * We know that we're going to do at least 16 quads, which means we are
 * going to be able to use the large block clear loop at least once.
 * Figure out how many quads we need to clear before we are 0mod64 aligned
 * so we can use the wh64 instruction.
 */

	nop			# .. .. .. E
	nop			# .. .. E  ..
	nop			# .. E  .. ..
	beq	$3, $bigalign	# U  .. .. ..	: U L U L : Aligned 0mod64

$alignmod64:
	EX( stq_u $31, 0($6) )	# .. .. .. L
	addq	$3, 8, $3	# .. .. E  ..
	subq	$0, 8, $0	# .. E  .. ..
	nop			# E  .. .. ..	: U L U L

	nop			# .. .. .. E
	subq	$1, 1, $1	# .. .. E  ..
	addq	$6, 8, $6	# .. E  .. ..
	blt	$3, $alignmod64	# U  .. .. ..	: U L U L

$bigalign:
/*
 * $0 is the number of bytes left
 * $1 is the number of quads left
 * $6 is aligned 0mod64
 * we know that we'll be taking a minimum of one trip through
 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
 * We are _not_ going to update $0 after every single store.  That
 * would be silly, because there will be cross-cluster dependencies
 * no matter how the code is scheduled.  By doing it in slightly
 * staggered fashion, we can still do this loop in 5 fetches
 * The worse case will be doing two extra quads in some future execution,
 * in the event of an interrupted clear.
 * Assumes the wh64 needs to be for 2 trips through the loop in the future
 * The wh64 is issued on for the starting destination address for trip +2
 * through the loop, and if there are less than two trips left, the target
 * address will be for the current trip.
 */
	nop			# E :
	nop			# E :
	nop			# E :
	bis	$6,$6,$3	# E : U L U L : Initial wh64 address is dest
	/* This might actually help for the current trip... */

$do_wh64:
	wh64	($3)		# .. .. .. L1	: memory subsystem hint
	subq	$1, 16, $4	# .. .. E  ..	: Forward calculation - repeat the loop?
	EX( stq_u $31, 0($6) )	# .. L  .. ..
	subq	$0, 8, $0	# E  .. .. ..	: U L U L

	addq	$6, 128, $3	# E : Target address of wh64
	EX( stq_u $31, 8($6) )	# L :
	EX( stq_u $31, 16($6) )	# L :
	subq	$0, 16, $0	# E : U L L U

	nop			# E :
	EX( stq_u $31, 24($6) )	# L :
	EX( stq_u $31, 32($6) )	# L :
	subq	$0, 168, $5	# E : U L L U : two trips through the loop left?
	/* 168 = 192 - 24, since we've already completed some stores */

	subq	$0, 16, $0	# E :
	EX( stq_u $31, 40($6) )	# L :
	EX( stq_u $31, 48($6) )	# L :
	cmovlt	$5, $6, $3	# E : U L L U : Latency 2, extra mapping cycle

	subq	$1, 8, $1	# E :
	subq	$0, 16, $0	# E :
	EX( stq_u $31, 56($6) )	# L :
	nop			# E : U L U L

	nop			# E :
	subq	$0, 8, $0	# E :
	addq	$6, 64, $6	# E :
	bge	$4, $do_wh64	# U : U L U L

$trailquad:
	# zero to 16 quadwords left to store, plus any trailing bytes
	# $1 is the number of quadwords left to go.
	#
	nop			# .. .. .. E
	nop			# .. .. E  ..
	nop			# .. E  .. ..
	beq	$1, $trailbytes	# U  .. .. ..	: U L U L : Only 0..7 bytes to go

$onequad:
	EX( stq_u $31, 0($6) )	# .. .. .. L
	subq	$1, 1, $1	# .. .. E  ..
	subq	$0, 8, $0	# .. E  .. ..
	nop			# E  .. .. ..	: U L U L

	nop			# .. .. .. E
	nop			# .. .. E  ..
	addq	$6, 8, $6	# .. E  .. ..
	bgt	$1, $onequad	# U  .. .. ..	: U L U L

	# We have an unknown number of bytes left to go.
$trailbytes:
	nop			# .. .. .. E
	nop			# .. .. E  ..
	nop			# .. E  .. ..
	beq	$0, $zerolength	# U  .. .. ..	: U L U L

	# $0 contains the number of bytes left to copy (0..31)
	# so we will use $0 as the loop counter
	# We know for a fact that $0 > 0 zero due to previous context
$onebyte:
	EX( stb $31, 0($6) )	# .. .. .. L
	subq	$0, 1, $0	# .. .. E  ..	:
	addq	$6, 1, $6	# .. E  .. ..	:
	bgt	$0, $onebyte	# U  .. .. ..	: U L U L

$zerolength:
$exception:			# Destination for exception recovery(?)
	# The fault fixups in EX branch here with no register patching:
	# $0 was only ever decremented after a successful store, so it
	# already holds the correct "bytes not cleared" return value.
	nop			# .. .. .. E :
	nop			# .. .. E  .. :
	nop			# .. E  .. .. :
	ret	$31, ($28), 1	# L0 .. .. .. : L U L U
	.end __do_clear_user
| 225 | |||
diff --git a/arch/alpha/lib/ev6-copy_page.S b/arch/alpha/lib/ev6-copy_page.S new file mode 100644 index 00000000000..b789db19275 --- /dev/null +++ b/arch/alpha/lib/ev6-copy_page.S | |||
| @@ -0,0 +1,203 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/ev6-copy_page.S | ||
| 3 | * | ||
| 4 | * Copy an entire page. | ||
| 5 | */ | ||
| 6 | |||
| 7 | /* The following comparison of this routine vs the normal copy_page.S | ||
| 8 | was written by an unnamed ev6 hardware designer and forwarded to me | ||
| 9 | via Steven Hobbs <hobbs@steven.zko.dec.com>. | ||
| 10 | |||
| 11 | First Problem: STQ overflows. | ||
| 12 | ----------------------------- | ||
| 13 | |||
| 14 | It would be nice if EV6 handled every resource overflow efficiently, | ||
| 15 | but for some it doesn't. Including store queue overflows. It causes | ||
| 16 | a trap and a restart of the pipe. | ||
| 17 | |||
| 18 | To get around this we sometimes use (to borrow a term from a VSSAD | ||
| 19 | researcher) "aeration". The idea is to slow the rate at which the | ||
| 20 | processor receives valid instructions by inserting nops in the fetch | ||
| 21 | path. In doing so, you can prevent the overflow and actually make | ||
| 22 | the code run faster. You can, of course, take advantage of the fact | ||
| 23 | that the processor can fetch at most 4 aligned instructions per cycle. | ||
| 24 | |||
| 25 | I inserted enough nops to force it to take 10 cycles to fetch the | ||
| 26 | loop code. In theory, EV6 should be able to execute this loop in | ||
| 27 | 9 cycles but I was not able to get it to run that fast -- the initial | ||
| 28 | conditions were such that I could not reach this optimum rate on | ||
| 29 | (chaotic) EV6. I wrote the code such that everything would issue | ||
| 30 | in order. | ||
| 31 | |||
| 32 | Second Problem: Dcache index matches. | ||
| 33 | ------------------------------------- | ||
| 34 | |||
| 35 | If you are going to use this routine on random aligned pages, there | ||
| 36 | is a 25% chance that the pages will be at the same dcache indices. | ||
| 37 | This results in many nasty memory traps without care. | ||
| 38 | |||
| 39 | The solution is to schedule the prefetches to avoid the memory | ||
| 40 | conflicts. I schedule the wh64 prefetches farther ahead of the | ||
| 41 | read prefetches to avoid this problem. | ||
| 42 | |||
| 43 | Third Problem: Needs more prefetching. | ||
| 44 | -------------------------------------- | ||
| 45 | |||
| 46 | In order to improve the code I added deeper prefetching to take the | ||
| 47 | most advantage of EV6's bandwidth. | ||
| 48 | |||
| 49 | I also prefetched the read stream. Note that adding the read prefetch | ||
| 50 | forced me to add another cycle to the inner-most kernel - up to 11 | ||
| 51 | from the original 8 cycles per iteration. We could improve performance | ||
| 52 | further by unrolling the loop and doing multiple prefetches per cycle. | ||
| 53 | |||
| 54 | I think that the code below will be very robust and fast code for the | ||
| 55 | purposes of copying aligned pages. It is slower when both source and | ||
| 56 | destination pages are in the dcache, but it is my guess that this is | ||
| 57 | less important than the dcache miss case. */ | ||
| 58 | |||
| 59 | |||
	.text
	.align 4
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0
	/* Inputs (standard Alpha argument registers):
	   $16 = destination page, $17 = source page.
	   NOTE(review): loop counts assume an 8KB page (128 64-byte
	   cache lines) -- confirm PAGE_SIZE for this configuration. */

	/* Prefetch 5 read cachelines; write-hint 10 cache lines. */
	wh64 ($16)
	ldl $31,0($17)
	ldl $31,64($17)
	lda $1,1*64($16)

	wh64 ($1)
	ldl $31,128($17)
	ldl $31,192($17)
	lda $1,2*64($16)

	wh64 ($1)
	ldl $31,256($17)
	lda $18,118	# main-loop trip count: 118 + 10 cleanup = 128 lines
	lda $1,3*64($16)

	wh64 ($1)
	nop
	lda $1,4*64($16)
	lda $2,5*64($16)

	wh64 ($1)
	wh64 ($2)
	lda $1,6*64($16)
	lda $2,7*64($16)

	wh64 ($1)
	wh64 ($2)
	lda $1,8*64($16)
	lda $2,9*64($16)

	wh64 ($1)
	wh64 ($2)
	lda $19,10*64($16)	# $19 trails $16 by 10 lines for wh64 in the loop
	nop

	/* Main prefetching/write-hinting loop.
	   Copies one 64-byte cache line per iteration; the unops are
	   deliberate "aeration" to pace the fetcher (see file header). */
1:	ldq $0,0($17)
	ldq $1,8($17)
	unop
	unop

	unop
	unop
	ldq $2,16($17)
	ldq $3,24($17)

	ldq $4,32($17)
	ldq $5,40($17)
	unop
	unop

	unop
	unop
	ldq $6,48($17)
	ldq $7,56($17)

	ldl $31,320($17)	# read-stream prefetch, 5 lines ahead
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum. */
	unop
	unop
	unop
	unop

	wh64 ($19)
	unop
	unop
	unop

	stq $0,0($16)
	subq $18,1,$18
	stq $1,8($16)
	unop

	unop
	stq $2,16($16)
	addq $17,64,$17
	stq $3,24($16)

	stq $4,32($16)
	stq $5,40($16)
	addq $19,64,$19
	unop

	stq $6,48($16)
	stq $7,56($16)
	addq $16,64,$16
	bne $18, 1b

	/* Prefetch the final 5 cache lines of the read stream. */
	lda $18,10	# cleanup loop handles the last 10 lines
	ldl $31,320($17)
	ldl $31,384($17)
	ldl $31,448($17)

	ldl $31,512($17)
	ldl $31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines. */
2:	ldq $0,0($17)
	ldq $1,8($17)
	ldq $2,16($17)
	ldq $3,24($17)

	ldq $4,32($17)
	ldq $5,40($17)
	ldq $6,48($17)
	ldq $7,56($17)

	stq $0,0($16)
	subq $18,1,$18
	stq $1,8($16)
	addq $17,64,$17

	stq $2,16($16)
	stq $3,24($16)
	stq $4,32($16)
	stq $5,40($16)

	stq $6,48($16)
	stq $7,56($16)
	addq $16,64,$16
	bne $18, 2b

	ret
	nop
	unop
	nop

	.end copy_page
diff --git a/arch/alpha/lib/ev6-copy_user.S b/arch/alpha/lib/ev6-copy_user.S new file mode 100644 index 00000000000..db42ffe9c35 --- /dev/null +++ b/arch/alpha/lib/ev6-copy_user.S | |||
| @@ -0,0 +1,259 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/ev6-copy_user.S | ||
| 3 | * | ||
| 4 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
| 5 | * | ||
| 6 | * Copy to/from user space, handling exceptions as we go.. This | ||
| 7 | * isn't exactly pretty. | ||
| 8 | * | ||
| 9 | * This is essentially the same as "memcpy()", but with a few twists. | ||
| 10 | * Notably, we have to make sure that $0 is always up-to-date and | ||
| 11 | * contains the right "bytes left to copy" value (and that it is updated | ||
| 12 | * only _after_ a successful copy). There is also some rather minor | ||
| 13 | * exception setup stuff.. | ||
| 14 | * | ||
| 15 | * NOTE! This is not directly C-callable, because the calling semantics are | ||
| 16 | * different: | ||
| 17 | * | ||
| 18 | * Inputs: | ||
| 19 | * length in $0 | ||
| 20 | * destination address in $6 | ||
| 21 | * source address in $7 | ||
| 22 | * return address in $28 | ||
| 23 | * | ||
| 24 | * Outputs: | ||
| 25 | * bytes left to copy in $0 | ||
| 26 | * | ||
| 27 | * Clobbers: | ||
| 28 | * $1,$2,$3,$4,$5,$6,$7 | ||
| 29 | * | ||
| 30 | * Much of the information about 21264 scheduling/coding comes from: | ||
| 31 | * Compiler Writer's Guide for the Alpha 21264 | ||
| 32 | * abbreviated as 'CWG' in other comments here | ||
| 33 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
| 34 | * Scheduling notation: | ||
| 35 | * E - either cluster | ||
| 36 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
| 37 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
| 38 | */ | ||
| 39 | |||
| 40 | /* Allow an exception for an insn; exit if we get one. */ | ||
| 41 | #define EXI(x,y...) \ | ||
| 42 | 99: x,##y; \ | ||
| 43 | .section __ex_table,"a"; \ | ||
| 44 | .long 99b - .; \ | ||
| 45 | lda $31, $exitin-99b($31); \ | ||
| 46 | .previous | ||
| 47 | |||
| 48 | #define EXO(x,y...) \ | ||
| 49 | 99: x,##y; \ | ||
| 50 | .section __ex_table,"a"; \ | ||
| 51 | .long 99b - .; \ | ||
| 52 | lda $31, $exitout-99b($31); \ | ||
| 53 | .previous | ||
| 54 | |||
	.set noat
	.align 4
	.globl __copy_user
	.ent __copy_user
				# Pipeline info: Slotting & Comments
__copy_user:
	.prologue 0
	# Contract (see file header): not C-callable.
	# $0 = byte count (kept live: always "bytes left to copy"),
	# $6 = dest, $7 = src, $28 = return address.
	subq $0, 32, $1		# .. E .. .. : Is this going to be a small copy?
	beq $0, $zerolength	# U .. .. .. : U L U L

	and $6,7,$3		# .. .. .. E : is leading dest misalignment
	ble $1, $onebyteloop	# .. .. U .. : 1st branch : small amount of data
	beq $3, $destaligned	# .. U .. .. : 2nd (one cycle fetcher stall)
	subq $3, 8, $3		# E .. .. .. : L U U L : trip counter
/*
 * The fetcher stall also hides the 1 cycle cross-cluster stall for $3 (L --> U)
 * This loop aligns the destination a byte at a time
 * We know we have at least one trip through this loop
 */
$aligndest:
	EXI( ldbu $1,0($7) )	# .. .. .. L : Keep loads separate from stores
	addq $6,1,$6		# .. .. E .. : Section 3.8 in the CWG
	addq $3,1,$3		# .. E .. .. :
	nop			# E .. .. .. : U L U L

/*
 * the -1 is to compensate for the inc($6) done in a previous quadpack
 * which allows us zero dependencies within either quadpack in the loop
 */
	EXO( stb $1,-1($6) )	# .. .. .. L :
	addq $7,1,$7		# .. .. E .. : Section 3.8 in the CWG
	subq $0,1,$0		# .. E .. .. :
	bne $3, $aligndest	# U .. .. .. : U L U L

/*
 * If we fell through into here, we have a minimum of 33 - 7 bytes
 * If we arrived via branch, we have a minimum of 32 bytes
 */
$destaligned:
	and $7,7,$1		# .. .. .. E : Check _current_ source alignment
	bic $0,7,$4		# .. .. E .. : number bytes as a quadword loop
	EXI( ldq_u $3,0($7) )	# .. L .. .. : Forward fetch for fallthrough code
	beq $1,$quadaligned	# U .. .. .. : U L U L

/*
 * In the worst case, we've just executed an ldq_u here from 0($7)
 * and we'll repeat it once if we take the branch
 */

/* Misaligned quadword loop - not unrolled.  Leave it that way. */
$misquad:
	EXI( ldq_u $2,8($7) )	# .. .. .. L :
	subq $4,8,$4		# .. .. E .. :
	extql $3,$7,$3		# .. U .. .. : low part of previous quad
	extqh $2,$7,$1		# U .. .. .. : U U L L : high part of next quad

	bis $3,$1,$1		# .. .. .. E : merge into one aligned quadword
	EXO( stq $1,0($6) )	# .. .. L .. :
	addq $7,8,$7		# .. E .. .. :
	subq $0,8,$0		# E .. .. .. : U L L U

	addq $6,8,$6		# .. .. .. E :
	bis $2,$2,$3		# .. .. E .. : carry fetched quad to next trip
	nop			# .. E .. .. :
	bne $4,$misquad		# U .. .. .. : U L U L

	nop			# .. .. .. E
	nop			# .. .. E ..
	nop			# .. E .. ..
	beq $0,$zerolength	# U .. .. .. : U L U L

/* We know we have at least one trip through the byte loop */
	EXI ( ldbu $2,0($7) )	# .. .. .. L : No loads in the same quad
	addq $6,1,$6		# .. .. E .. : as the store (Section 3.8 in CWG)
	nop			# .. E .. .. :
	br $31, $dirtyentry	# L0 .. .. .. : L U U L
/* Do the trailing byte loop load, then hop into the store part of the loop */

/*
 * A minimum of (33 - 7) bytes to do a quad at a time.
 * Based upon the usage context, it's worth the effort to unroll this loop
 * $0 - number of bytes to be moved
 * $4 - number of bytes to move as quadwords
 * $6 is current destination address
 * $7 is current source address
 */
$quadaligned:
	subq $4, 32, $2		# .. .. .. E : do not unroll for small stuff
	nop			# .. .. E ..
	nop			# .. E .. ..
	blt $2, $onequad	# U .. .. .. : U L U L

/*
 * There is a significant assumption here that the source and destination
 * addresses differ by more than 32 bytes.  In this particular case, a
 * sparsity of registers further bounds this to be a minimum of 8 bytes.
 * But if this isn't met, then the output result will be incorrect.
 * Furthermore, due to a lack of available registers, we really can't
 * unroll this to be an 8x loop (which would enable us to use the wh64
 * instruction memory hint instruction).
 */
$unroll4:
	EXI( ldq $1,0($7) )	# .. .. .. L
	EXI( ldq $2,8($7) )	# .. .. L ..
	subq $4,32,$4		# .. E .. ..
	nop			# E .. .. .. : U U L L

	addq $7,16,$7		# .. .. .. E
	EXO( stq $1,0($6) )	# .. .. L ..
	EXO( stq $2,8($6) )	# .. L .. ..
	subq $0,16,$0		# E .. .. .. : U L L U

	addq $6,16,$6		# .. .. .. E
	EXI( ldq $1,0($7) )	# .. .. L ..
	EXI( ldq $2,8($7) )	# .. L .. ..
	subq $4, 32, $3		# E .. .. .. : U U L L : is there enough for another trip?

	EXO( stq $1,0($6) )	# .. .. .. L
	EXO( stq $2,8($6) )	# .. .. L ..
	subq $0,16,$0		# .. E .. ..
	addq $7,16,$7		# E .. .. .. : U L L U

	nop			# .. .. .. E
	nop			# .. .. E ..
	addq $6,16,$6		# .. E .. ..
	bgt $3,$unroll4		# U .. .. .. : U L U L

	nop
	nop
	nop
	beq $4, $noquads

$onequad:
	EXI( ldq $1,0($7) )
	subq $4,8,$4
	addq $7,8,$7
	nop

	EXO( stq $1,0($6) )
	subq $0,8,$0
	addq $6,8,$6
	bne $4,$onequad

$noquads:
	nop
	nop
	nop
	beq $0,$zerolength

/*
 * For small copies (or the tail of a larger copy), do a very simple byte loop.
 * There's no point in doing a lot of complex alignment calculations to try to
 * do quadword stuff for a small amount of data.
 *	$0 - remaining number of bytes left to copy
 *	$6 - current dest addr
 *	$7 - current source addr
 */

$onebyteloop:
	EXI ( ldbu $2,0($7) )	# .. .. .. L : No loads in the same quad
	addq $6,1,$6		# .. .. E .. : as the store (Section 3.8 in CWG)
	nop			# .. E .. .. :
	nop			# E .. .. .. : U L U L

$dirtyentry:
/*
 * the -1 is to compensate for the inc($6) done in a previous quadpack
 * which allows us zero dependencies within either quadpack in the loop
 */
	EXO ( stb $2,-1($6) )	# .. .. .. L :
	addq $7,1,$7		# .. .. E .. : quadpack as the load
	subq $0,1,$0		# .. E .. .. : change count _after_ copy
	bgt $0,$onebyteloop	# U .. .. .. : U L U L

$zerolength:
$exitout:			# Destination for exception recovery(?)
	nop			# .. .. .. E
	nop			# .. .. E ..
	nop			# .. E .. ..
	ret $31,($28),1		# L0 .. .. .. : L U L U

$exitin:

	/* A stupid byte-by-byte zeroing of the rest of the output
	   buffer.  This cures security holes by never leaving
	   random kernel data around to be copied elsewhere. */

	nop
	nop
	nop
	mov $0,$1	# $0 = bytes not copied; zero that many dest bytes

$101:
	EXO ( stb $31,0($6) )	# L
	subq $1,1,$1		# E
	addq $6,1,$6		# E
	bgt $1,$101		# U

	nop
	nop
	nop
	ret $31,($28),1		# L0

	.end __copy_user
diff --git a/arch/alpha/lib/ev6-csum_ipv6_magic.S b/arch/alpha/lib/ev6-csum_ipv6_magic.S new file mode 100644 index 00000000000..de1948a6911 --- /dev/null +++ b/arch/alpha/lib/ev6-csum_ipv6_magic.S | |||
| @@ -0,0 +1,126 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/ev6-csum_ipv6_magic.S | ||
| 3 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
| 4 | * | ||
| 5 | * unsigned short csum_ipv6_magic(struct in6_addr *saddr, | ||
| 6 | * struct in6_addr *daddr, | ||
| 7 | * __u32 len, | ||
| 8 | * unsigned short proto, | ||
| 9 | * unsigned int csum); | ||
| 10 | * | ||
| 11 | * Much of the information about 21264 scheduling/coding comes from: | ||
| 12 | * Compiler Writer's Guide for the Alpha 21264 | ||
| 13 | * abbreviated as 'CWG' in other comments here | ||
| 14 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
| 15 | * Scheduling notation: | ||
| 16 | * E - either cluster | ||
| 17 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
| 18 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
| 19 | * Try not to change the actual algorithm if possible for consistency. | ||
| 20 | * Determining actual stalls (other than slotting) doesn't appear to be easy to do. | ||
| 21 | * | ||
| 22 | * unsigned short csum_ipv6_magic(struct in6_addr *saddr, | ||
| 23 | * struct in6_addr *daddr, | ||
| 24 | * __u32 len, | ||
| 25 | * unsigned short proto, | ||
| 26 | * unsigned int csum); | ||
| 27 | * | ||
| 28 | * Swap <proto> (takes form 0xaabb) | ||
| 29 | * Then shift it left by 48, so result is: | ||
| 30 | * 0xbbaa0000 00000000 | ||
| 31 | * Then turn it back into a sign extended 32-bit item | ||
| 32 | * 0xbbaa0000 | ||
| 33 | * | ||
| 34 | * Swap <len> (an unsigned int) using Mike Burrows' 7-instruction sequence | ||
| 35 | * (we can't hide the 3-cycle latency of the unpkbw in the 6-instruction sequence) | ||
| 36 | * Assume input takes form 0xAABBCCDD | ||
| 37 | * | ||
| 38 | * Finally, original 'folding' approach is to split the long into 4 unsigned shorts | ||
| 39 | * add 4 ushorts, resulting in ushort/carry | ||
| 40 | * add carry bits + ushort --> ushort | ||
| 41 | * add carry bits + ushort --> ushort (in case the carry results in an overflow) | ||
| 42 | * Truncate to a ushort. (took 13 instructions) | ||
| 43 | * From doing some testing, using the approach in checksum.c:from64to16() | ||
| 44 | * results in the same outcome: | ||
| 45 | * split into 2 uints, add those, generating a ulong | ||
| 46 | * add the 3 low ushorts together, generating a uint | ||
| 47 | * a final add of the 2 lower ushorts | ||
| 48 | * truncating the result. | ||
| 49 | */ | ||
| 50 | |||
	.globl csum_ipv6_magic
	.align 4
	.ent csum_ipv6_magic
	.frame $30,0,$26,0
csum_ipv6_magic:
	.prologue 0
	# Arguments per the prototype in the file header:
	# $16 = saddr, $17 = daddr, $18 = len, $19 = proto, $20 = csum.
	# NOTE(review): the ldq accesses assume saddr/daddr are 8-byte
	# aligned -- confirm callers pass naturally aligned in6_addr.

	ldq $0,0($16)		# L : Latency: 3
	inslh $18,7,$4		# U : 0000000000AABBCC
	ldq $1,8($16)		# L : Latency: 3
	sll $19,8,$7		# U : U L U L : 0x00000000 00aabb00

	zapnot $20,15,$20	# U : zero extend incoming csum
	ldq $2,0($17)		# L : Latency: 3
	sll $19,24,$19		# U : U L L U : 0x000000aa bb000000
	inswl $18,3,$18		# U : 000000CCDD000000

	ldq $3,8($17)		# L : Latency: 3
	bis $18,$4,$18		# E : 000000CCDDAABBCC
	addl $19,$7,$19		# E : <sign bits>bbaabb00
	nop			# E : U L U L

	addq $20,$0,$20		# E : begin summing the words
	srl $18,16,$4		# U : 0000000000CCDDAA
	zap $19,0x3,$19		# U : <sign bits>bbaa0000
	nop			# E : L U U L

	cmpult $20,$0,$0	# E : carry from the first add
	addq $20,$1,$20		# E :
	zapnot $18,0xa,$18	# U : 00000000DD00BB00
	zap $4,0xa,$4		# U : U U L L : 0000000000CC00AA

	or $18,$4,$18		# E : 00000000DDCCBBAA (byte-swapped len)
	nop			# E :
	cmpult $20,$1,$1	# E :
	addq $20,$2,$20		# E : U L U L

	cmpult $20,$2,$2	# E :
	addq $20,$3,$20		# E :
	cmpult $20,$3,$3	# E : (1 cycle stall on $20)
	addq $20,$18,$20	# E : U L U L (1 cycle stall on $20)

	cmpult $20,$18,$18	# E :
	addq $20,$19,$20	# E : (1 cycle stall on $20)
	addq $0,$1,$0		# E : merge the carries back into the csum
	addq $2,$3,$2		# E :

	cmpult $20,$19,$19	# E :
	addq $18,$19,$18	# E : (1 cycle stall on $19)
	addq $0,$2,$0		# E :
	addq $20,$18,$20	# E : U L U L :
				/* (1 cycle stall on $18, 2 cycles on $20) */

	addq $0,$20,$0		# E :
	zapnot $0,15,$1		# U : Start folding output (1 cycle stall on $0)
	nop			# E :
	srl $0,32,$0		# U : U L U L : (1 cycle stall on $0)

	/* Fold 64 -> 32 -> 16 bits, as in checksum.c:from64to16()
	   (see the strategy discussion in the file header). */
	addq $1,$0,$1		# E : Finished generating ulong
	extwl $1,2,$2		# U : ushort[1] (1 cycle stall on $1)
	zapnot $1,3,$0		# U : ushort[0] (1 cycle stall on $1)
	extwl $1,4,$1		# U : ushort[2] (1 cycle stall on $1)

	addq $0,$2,$0		# E
	addq $0,$1,$3		# E : Finished generating uint
				/* (1 cycle stall on $0) */
	extwl $3,2,$1		# U : ushort[1] (1 cycle stall on $3)
	nop			# E : L U L U

	addq $1,$3,$0		# E : Final carry
	not $0,$4		# E : complement (1 cycle stall on $0)
	zapnot $4,3,$0		# U : clear upper garbage bits
				/* (1 cycle stall on $4) */
	ret			# L0 : L U L U

	.end csum_ipv6_magic
diff --git a/arch/alpha/lib/ev6-divide.S b/arch/alpha/lib/ev6-divide.S new file mode 100644 index 00000000000..2a82b9be93f --- /dev/null +++ b/arch/alpha/lib/ev6-divide.S | |||
| @@ -0,0 +1,259 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/ev6-divide.S | ||
| 3 | * | ||
| 4 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
| 5 | * | ||
| 6 | * Alpha division.. | ||
| 7 | */ | ||
| 8 | |||
| 9 | /* | ||
| 10 | * The alpha chip doesn't provide hardware division, so we have to do it | ||
| 11 | * by hand. The compiler expects the functions | ||
| 12 | * | ||
| 13 | * __divqu: 64-bit unsigned long divide | ||
| 14 | * __remqu: 64-bit unsigned long remainder | ||
| 15 | * __divqs/__remqs: signed 64-bit | ||
| 16 | * __divlu/__remlu: unsigned 32-bit | ||
| 17 | * __divls/__remls: signed 32-bit | ||
| 18 | * | ||
| 19 | * These are not normal C functions: instead of the normal | ||
| 20 | * calling sequence, these expect their arguments in registers | ||
| 21 | * $24 and $25, and return the result in $27. Register $28 may | ||
| 22 | * be clobbered (assembly temporary), anything else must be saved. | ||
| 23 | * | ||
| 24 | * In short: painful. | ||
| 25 | * | ||
| 26 | * This is a rather simple bit-at-a-time algorithm: it's very good | ||
| 27 | * at dividing random 64-bit numbers, but the more usual case where | ||
| 28 | * the divisor is small is handled better by the DEC algorithm | ||
| 29 | * using lookup tables. This uses much less memory, though, and is | ||
| 30 | * nicer on the cache.. Besides, I don't know the copyright status | ||
| 31 | * of the DEC code. | ||
| 32 | */ | ||
| 33 | |||
| 34 | /* | ||
| 35 | * My temporaries: | ||
| 36 | * $0 - current bit | ||
| 37 | * $1 - shifted divisor | ||
| 38 | * $2 - modulus/quotient | ||
| 39 | * | ||
| 40 | * $23 - return address | ||
| 41 | * $24 - dividend | ||
| 42 | * $25 - divisor | ||
| 43 | * | ||
| 44 | * $27 - quotient/modulus | ||
| 45 | * $28 - compare status | ||
| 46 | * | ||
| 47 | * Much of the information about 21264 scheduling/coding comes from: | ||
| 48 | * Compiler Writer's Guide for the Alpha 21264 | ||
| 49 | * abbreviated as 'CWG' in other comments here | ||
| 50 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
| 51 | * Scheduling notation: | ||
| 52 | * E - either cluster | ||
| 53 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
| 54 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
| 55 | * Try not to change the actual algorithm if possible for consistency. | ||
| 56 | */ | ||
| 57 | |||
| 58 | #define halt .long 0 | ||
| 59 | |||
| 60 | /* | ||
| 61 | * Select function type and registers | ||
| 62 | */ | ||
| 63 | #define mask $0 | ||
| 64 | #define divisor $1 | ||
| 65 | #define compare $28 | ||
| 66 | #define tmp1 $3 | ||
| 67 | #define tmp2 $4 | ||
| 68 | |||
| 69 | #ifdef DIV | ||
| 70 | #define DIV_ONLY(x,y...) x,##y | ||
| 71 | #define MOD_ONLY(x,y...) | ||
| 72 | #define func(x) __div##x | ||
| 73 | #define modulus $2 | ||
| 74 | #define quotient $27 | ||
| 75 | #define GETSIGN(x) xor $24,$25,x | ||
| 76 | #define STACK 48 | ||
| 77 | #else | ||
| 78 | #define DIV_ONLY(x,y...) | ||
| 79 | #define MOD_ONLY(x,y...) x,##y | ||
| 80 | #define func(x) __rem##x | ||
| 81 | #define modulus $27 | ||
| 82 | #define quotient $2 | ||
| 83 | #define GETSIGN(x) bis $24,$24,x | ||
| 84 | #define STACK 32 | ||
| 85 | #endif | ||
| 86 | |||
| 87 | /* | ||
| 88 | * For 32-bit operations, we need to extend to 64-bit | ||
| 89 | */ | ||
| 90 | #ifdef INTSIZE | ||
| 91 | #define ufunction func(lu) | ||
| 92 | #define sfunction func(l) | ||
| 93 | #define LONGIFY(x) zapnot x,15,x | ||
| 94 | #define SLONGIFY(x) addl x,0,x | ||
| 95 | #else | ||
| 96 | #define ufunction func(qu) | ||
| 97 | #define sfunction func(q) | ||
| 98 | #define LONGIFY(x) | ||
| 99 | #define SLONGIFY(x) | ||
| 100 | #endif | ||
| 101 | |||
	.set noat
	.align 4
	.globl ufunction
	.ent ufunction
# Unsigned divide/remainder (name and role selected by the DIV/INTSIZE
# macros above): dividend in $24, divisor in $25, result in $27,
# return address in $23; $28 is the assembly temporary.
# Label "7" is also the shared entry used by sfunction once both
# operands have been made non-negative.
ufunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0

7:	stq	$1, 0($30)		# L : save caller's registers on the stack
	bis	$25,$25,divisor		# E :
	stq	$2, 8($30)		# L : L U L U

	bis	$24,$24,modulus		# E :
	stq	$0,16($30)		# L :
	bis	$31,$31,quotient	# E : quotient starts at zero
	LONGIFY(divisor)		# E : U L L U : zero-extend for 32-bit variants

	stq	tmp1,24($30)		# L :
	LONGIFY(modulus)		# E :
	bis	$31,1,mask		# E : current quotient bit
	DIV_ONLY(stq tmp2,32($30))	# L : L U U L

	beq	divisor, 9f			/* div by zero */
/*
 * In spite of the DIV_ONLY being either a non-instruction
 * or an actual stq, the addition of the .align directive
 * below ensures that label 1 is going to be nicely aligned
 */

	.align	4
#ifdef INTSIZE
	/*
	 * shift divisor left, using 3-bit shifts for
	 * 32-bit divides as we can't overflow. Three-bit
	 * shifts will result in looping three times less
	 * here, but can result in two loops more later.
	 * Thus using a large shift isn't worth it (and
	 * s8add pairs better than a sll..)
	 */
1:	cmpult	divisor,modulus,compare	# E :
	s8addq	divisor,$31,divisor	# E : divisor *= 8
	s8addq	mask,$31,mask		# E : mask *= 8
	bne	compare,1b		# U : U L U L
#else
1:	cmpult	divisor,modulus,compare	# E :
	nop				# E :
	nop				# E :
	blt     divisor, 2f		# U : U L U L : stop before shifting into the sign bit

	addq	divisor,divisor,divisor	# E : divisor *= 2
	addq	mask,mask,mask		# E : mask *= 2
	unop				# E :
	bne	compare,1b		# U : U L U L
#endif

	/* ok, start to go right again.. */
2:
	/*
	 * Keep things nicely bundled... use a nop instead of not
	 * having an instruction for DIV_ONLY
	 */
#ifdef DIV
	DIV_ONLY(addq quotient,mask,tmp2)	# E : speculative quotient | mask
#else
	nop					# E :
#endif
	srl	mask,1,mask		# U :
	cmpule	divisor,modulus,compare	# E : does this bit divide?
	subq	modulus,divisor,tmp1	# E : speculative remainder

#ifdef DIV
	DIV_ONLY(cmovne compare,tmp2,quotient)	# E : Latency 2, extra map slot
	nop					# E : as part of the cmovne
	srl	divisor,1,divisor	# U :
	nop				# E : L U L U

	nop				# E :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L U L
#else
	srl	divisor,1,divisor	# U :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L L U
#endif

9:	ldq	$1, 0($30)		# L : restore caller's registers; result stays in $27
	ldq	$2, 8($30)		# L :
	nop				# E :
	nop				# E : U U L L

	ldq	$0,16($30)		# L :
	ldq	tmp1,24($30)		# L :
	nop				# E :
	nop				# E :

#ifdef DIV
	DIV_ONLY(ldq tmp2,32($30))	# L :
#else
	nop				# E :
#endif
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	ufunction
| 208 | |||
| 209 | /* | ||
| 210 | * Uhh.. Ugly signed division. I'd rather not have it at all, but | ||
| 211 | * it's needed in some circumstances. There are different ways to | ||
| 212 | * handle this, really. This does: | ||
| 213 | * -a / b = a / -b = -(a / b) | ||
| 214 | * -a % b = -(a % b) | ||
| 215 | * a % -b = a % b | ||
| 216 | * which is probably not the best solution, but at least should | ||
| 217 | * have the property that (x/y)*y + (x%y) = x. | ||
| 218 | */ | ||
/*
 * Signed divide/remainder: reduce to the unsigned routine by taking
 * absolute values, then fix the sign of the result according to the
 * identities in the comment above.  GETSIGN selects which operand
 * signs determine the result sign (quotient: $24^$25; remainder: $24).
 */
	.align	4
	.globl	sfunction
	.ent	sfunction
sfunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0
	bis	$24,$25,$28		# E : OR of both sign bits
	SLONGIFY($28)			# E :
	bge	$28,7b			# U : both non-negative -> unsigned path directly

	stq	$24,0($30)		# L : save originals; ufunction's "7:" entry would clobber
	subq	$31,$24,$28		# E :
	stq	$25,8($30)		# L :
	nop				# E : U L U L

	cmovlt	$24,$28,$24 /* abs($24) */ # E : Latency 2, extra map slot
	nop				# E : as part of the cmov
	stq	$23,16($30)		# L : save return address across the bsr
	subq	$31,$25,$28		# E : U L U L

	stq	tmp1,24($30)		# L :
	cmovlt	$25,$28,$25 /* abs($25) */ # E : Latency 2, extra map slot
	nop				# E :
	bsr	$23,ufunction		# L0: L U L U

	ldq	$24,0($30)		# L : restore original (signed) operands
	ldq	$25,8($30)		# L :
	GETSIGN($28)			# E :
	subq	$31,$27,tmp1		# E : U U L L : speculative -result

	SLONGIFY($28)			# E :
	ldq	$23,16($30)		# L :
	cmovlt	$28,tmp1,$27		# E : Latency 2, extra map slot : negate if sign set
	nop				# E : U L L U : as part of the cmov

	ldq	tmp1,24($30)		# L :
	nop				# E : as part of the cmov
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	sfunction
diff --git a/arch/alpha/lib/ev6-memchr.S b/arch/alpha/lib/ev6-memchr.S new file mode 100644 index 00000000000..a8e843dbcc2 --- /dev/null +++ b/arch/alpha/lib/ev6-memchr.S | |||
| @@ -0,0 +1,191 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/ev6-memchr.S | ||
| 3 | * | ||
| 4 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
| 5 | * | ||
| 6 | * Finds characters in a memory area. Optimized for the Alpha: | ||
| 7 | * | ||
| 8 | * - memory accessed as aligned quadwords only | ||
| 9 | * - uses cmpbge to compare 8 bytes in parallel | ||
| 10 | * - does binary search to find 0 byte in last | ||
| 11 | * quadword (HAKMEM needed 12 instructions to | ||
| 12 | * do this instead of the 9 instructions that | ||
| 13 | * binary search needs). | ||
| 14 | * | ||
| 15 | * For correctness consider that: | ||
| 16 | * | ||
| 17 | * - only minimum number of quadwords may be accessed | ||
| 18 | * - the third argument is an unsigned long | ||
| 19 | * | ||
| 20 | * Much of the information about 21264 scheduling/coding comes from: | ||
| 21 | * Compiler Writer's Guide for the Alpha 21264 | ||
| 22 | * abbreviated as 'CWG' in other comments here | ||
| 23 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
| 24 | * Scheduling notation: | ||
| 25 | * E - either cluster | ||
| 26 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
| 27 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
| 28 | * Try not to change the actual algorithm if possible for consistency. | ||
| 29 | */ | ||
| 30 | |||
| 31 | .set noreorder | ||
| 32 | .set noat | ||
| 33 | |||
| 34 | .align 4 | ||
| 35 | .globl memchr | ||
| 36 | .ent memchr | ||
| 37 | memchr: | ||
| 38 | .frame $30,0,$26,0 | ||
| 39 | .prologue 0 | ||
| 40 | |||
| 41 | # Hack -- if someone passes in (size_t)-1, hoping to just | ||
| 42 | # search til the end of the address space, we will overflow | ||
| 43 | # below when we find the address of the last byte. Given | ||
| 44 | # that we will never have a 56-bit address space, cropping | ||
| 45 | # the length is the easiest way to avoid trouble. | ||
| 46 | zap $18, 0x80, $5 # U : Bound length | ||
| 47 | beq $18, $not_found # U : | ||
| 48 | ldq_u $1, 0($16) # L : load first quadword Latency=3 | ||
| 49 | and $17, 0xff, $17 # E : L L U U : 00000000000000ch | ||
| 50 | |||
| 51 | insbl $17, 1, $2 # U : 000000000000ch00 | ||
| 52 | cmpult $18, 9, $4 # E : small (< 1 quad) string? | ||
| 53 | or $2, $17, $17 # E : 000000000000chch | ||
| 54 | lda $3, -1($31) # E : U L L U | ||
| 55 | |||
| 56 | sll $17, 16, $2 # U : 00000000chch0000 | ||
| 57 | addq $16, $5, $5 # E : Max search address | ||
| 58 | or $2, $17, $17 # E : 00000000chchchch | ||
| 59 | sll $17, 32, $2 # U : U L L U : chchchch00000000 | ||
| 60 | |||
| 61 | or $2, $17, $17 # E : chchchchchchchch | ||
| 62 | extql $1, $16, $7 # U : $7 is upper bits | ||
| 63 | beq $4, $first_quad # U : | ||
| 64 | ldq_u $6, -1($5) # L : L U U L : eight or less bytes to search Latency=3 | ||
| 65 | |||
| 66 | extqh $6, $16, $6 # U : 2 cycle stall for $6 | ||
| 67 | mov $16, $0 # E : | ||
| 68 | nop # E : | ||
| 69 | or $7, $6, $1 # E : L U L U $1 = quadword starting at $16 | ||
| 70 | |||
| 71 | # Deal with the case where at most 8 bytes remain to be searched | ||
| 72 | # in $1. E.g.: | ||
| 73 | # $18 = 6 | ||
| 74 | # $1 = ????c6c5c4c3c2c1 | ||
| 75 | $last_quad: | ||
| 76 | negq $18, $6 # E : | ||
| 77 | xor $17, $1, $1 # E : | ||
| 78 | srl $3, $6, $6 # U : $6 = mask of $18 bits set | ||
| 79 | cmpbge $31, $1, $2 # E : L U L U | ||
| 80 | |||
| 81 | nop | ||
| 82 | nop | ||
| 83 | and $2, $6, $2 # E : | ||
| 84 | beq $2, $not_found # U : U L U L | ||
| 85 | |||
| 86 | $found_it: | ||
| 87 | #if defined(__alpha_fix__) && defined(__alpha_cix__) | ||
| 88 | /* | ||
| 89 | * Since we are guaranteed to have set one of the bits, we don't | ||
| 90 | * have to worry about coming back with a 0x40 out of cttz... | ||
| 91 | */ | ||
| 92 | cttz $2, $3 # U0 : | ||
| 93 | addq $0, $3, $0 # E : All done | ||
| 94 | nop # E : | ||
| 95 | ret # L0 : L U L U | ||
| 96 | #else | ||
| 97 | /* | ||
| 98 | * Slow and clunky. It can probably be improved. | ||
| 99 | * An exercise left for others. | ||
| 100 | */ | ||
| 101 | negq $2, $3 # E : | ||
| 102 | and $2, $3, $2 # E : | ||
| 103 | and $2, 0x0f, $1 # E : | ||
| 104 | addq $0, 4, $3 # E : | ||
| 105 | |||
| 106 | cmoveq $1, $3, $0 # E : Latency 2, extra map cycle | ||
| 107 | nop # E : keep with cmov | ||
| 108 | and $2, 0x33, $1 # E : | ||
| 109 | addq $0, 2, $3 # E : U L U L : 2 cycle stall on $0 | ||
| 110 | |||
| 111 | cmoveq $1, $3, $0 # E : Latency 2, extra map cycle | ||
| 112 | nop # E : keep with cmov | ||
| 113 | and $2, 0x55, $1 # E : | ||
| 114 | addq $0, 1, $3 # E : U L U L : 2 cycle stall on $0 | ||
| 115 | |||
| 116 | cmoveq $1, $3, $0 # E : Latency 2, extra map cycle | ||
| 117 | nop | ||
| 118 | nop | ||
| 119 | ret # L0 : L U L U | ||
| 120 | #endif | ||
| 121 | |||
| 122 | # Deal with the case where $18 > 8 bytes remain to be | ||
| 123 | # searched. $16 may not be aligned. | ||
| 124 | .align 4 | ||
| 125 | $first_quad: | ||
| 126 | andnot $16, 0x7, $0 # E : | ||
| 127 | insqh $3, $16, $2 # U : $2 = 0000ffffffffffff ($16<0:2> ff) | ||
| 128 | xor $1, $17, $1 # E : | ||
| 129 | or $1, $2, $1 # E : U L U L $1 = ====ffffffffffff | ||
| 130 | |||
| 131 | cmpbge $31, $1, $2 # E : | ||
| 132 | bne $2, $found_it # U : | ||
| 133 | # At least one byte left to process. | ||
| 134 | ldq $1, 8($0) # L : | ||
| 135 | subq $5, 1, $18 # E : U L U L | ||
| 136 | |||
| 137 | addq $0, 8, $0 # E : | ||
| 138 | # Make $18 point to last quad to be accessed (the | ||
| 139 | # last quad may or may not be partial). | ||
| 140 | andnot $18, 0x7, $18 # E : | ||
| 141 | cmpult $0, $18, $2 # E : | ||
| 142 | beq $2, $final # U : U L U L | ||
| 143 | |||
| 144 | # At least two quads remain to be accessed. | ||
| 145 | |||
| 146 | subq $18, $0, $4 # E : $4 <- nr quads to be processed | ||
| 147 | and $4, 8, $4 # E : odd number of quads? | ||
| 148 | bne $4, $odd_quad_count # U : | ||
| 149 | # At least three quads remain to be accessed | ||
| 150 | mov $1, $4 # E : L U L U : move prefetched value to correct reg | ||
| 151 | |||
| 152 | .align 4 | ||
| 153 | $unrolled_loop: | ||
| 154 | ldq $1, 8($0) # L : prefetch $1 | ||
| 155 | xor $17, $4, $2 # E : | ||
| 156 | cmpbge $31, $2, $2 # E : | ||
| 157 | bne $2, $found_it # U : U L U L | ||
| 158 | |||
| 159 | addq $0, 8, $0 # E : | ||
| 160 | nop # E : | ||
| 161 | nop # E : | ||
| 162 | nop # E : | ||
| 163 | |||
| 164 | $odd_quad_count: | ||
| 165 | xor $17, $1, $2 # E : | ||
| 166 | ldq $4, 8($0) # L : prefetch $4 | ||
| 167 | cmpbge $31, $2, $2 # E : | ||
| 168 | addq $0, 8, $6 # E : | ||
| 169 | |||
| 170 | bne $2, $found_it # U : | ||
| 171 | cmpult $6, $18, $6 # E : | ||
| 172 | addq $0, 8, $0 # E : | ||
| 173 | nop # E : | ||
| 174 | |||
| 175 | bne $6, $unrolled_loop # U : | ||
| 176 | mov $4, $1 # E : move prefetched value into $1 | ||
| 177 | nop # E : | ||
| 178 | nop # E : | ||
| 179 | |||
| 180 | $final: subq $5, $0, $18 # E : $18 <- number of bytes left to do | ||
| 181 | nop # E : | ||
| 182 | nop # E : | ||
| 183 | bne $18, $last_quad # U : | ||
| 184 | |||
| 185 | $not_found: | ||
| 186 | mov $31, $0 # E : | ||
| 187 | nop # E : | ||
| 188 | nop # E : | ||
| 189 | ret # L0 : | ||
| 190 | |||
| 191 | .end memchr | ||
diff --git a/arch/alpha/lib/ev6-memcpy.S b/arch/alpha/lib/ev6-memcpy.S new file mode 100644 index 00000000000..52b37b0f2af --- /dev/null +++ b/arch/alpha/lib/ev6-memcpy.S | |||
| @@ -0,0 +1,248 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/ev6-memcpy.S | ||
| 3 | * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com> | ||
| 4 | * | ||
| 5 | * Reasonably optimized memcpy() routine for the Alpha 21264 | ||
| 6 | * | ||
| 7 | * - memory accessed as aligned quadwords only | ||
 | 8 | * - uses cmpbge to compare 8 bytes in parallel | ||
| 9 | * | ||
| 10 | * Much of the information about 21264 scheduling/coding comes from: | ||
| 11 | * Compiler Writer's Guide for the Alpha 21264 | ||
| 12 | * abbreviated as 'CWG' in other comments here | ||
| 13 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
| 14 | * Scheduling notation: | ||
| 15 | * E - either cluster | ||
| 16 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
| 17 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
| 18 | * | ||
| 19 | * Temp usage notes: | ||
| 20 | * $1,$2, - scratch | ||
| 21 | */ | ||
| 22 | |||
| 23 | .set noreorder | ||
| 24 | .set noat | ||
| 25 | |||
| 26 | .align 4 | ||
| 27 | .globl memcpy | ||
| 28 | .ent memcpy | ||
| 29 | memcpy: | ||
| 30 | .frame $30,0,$26,0 | ||
| 31 | .prologue 0 | ||
| 32 | |||
| 33 | mov $16, $0 # E : copy dest to return | ||
| 34 | ble $18, $nomoredata # U : done with the copy? | ||
| 35 | xor $16, $17, $1 # E : are source and dest alignments the same? | ||
| 36 | and $1, 7, $1 # E : are they the same mod 8? | ||
| 37 | |||
| 38 | bne $1, $misaligned # U : Nope - gotta do this the slow way | ||
| 39 | /* source and dest are same mod 8 address */ | ||
| 40 | and $16, 7, $1 # E : Are both 0mod8? | ||
| 41 | beq $1, $both_0mod8 # U : Yes | ||
| 42 | nop # E : | ||
| 43 | |||
| 44 | /* | ||
| 45 | * source and dest are same misalignment. move a byte at a time | ||
| 46 | * until a 0mod8 alignment for both is reached. | ||
| 47 | * At least one byte more to move | ||
| 48 | */ | ||
| 49 | |||
| 50 | $head_align: | ||
| 51 | ldbu $1, 0($17) # L : grab a byte | ||
| 52 | subq $18, 1, $18 # E : count-- | ||
| 53 | addq $17, 1, $17 # E : src++ | ||
| 54 | stb $1, 0($16) # L : | ||
| 55 | addq $16, 1, $16 # E : dest++ | ||
| 56 | and $16, 7, $1 # E : Are we at 0mod8 yet? | ||
| 57 | ble $18, $nomoredata # U : done with the copy? | ||
| 58 | bne $1, $head_align # U : | ||
| 59 | |||
| 60 | $both_0mod8: | ||
| 61 | cmple $18, 127, $1 # E : Can we unroll the loop? | ||
| 62 | bne $1, $no_unroll # U : | ||
| 63 | and $16, 63, $1 # E : get mod64 alignment | ||
| 64 | beq $1, $do_unroll # U : no single quads to fiddle | ||
| 65 | |||
| 66 | $single_head_quad: | ||
| 67 | ldq $1, 0($17) # L : get 8 bytes | ||
| 68 | subq $18, 8, $18 # E : count -= 8 | ||
| 69 | addq $17, 8, $17 # E : src += 8 | ||
| 70 | nop # E : | ||
| 71 | |||
| 72 | stq $1, 0($16) # L : store | ||
| 73 | addq $16, 8, $16 # E : dest += 8 | ||
| 74 | and $16, 63, $1 # E : get mod64 alignment | ||
| 75 | bne $1, $single_head_quad # U : still not fully aligned | ||
| 76 | |||
| 77 | $do_unroll: | ||
| 78 | addq $16, 64, $7 # E : Initial (+1 trip) wh64 address | ||
| 79 | cmple $18, 127, $1 # E : Can we go through the unrolled loop? | ||
| 80 | bne $1, $tail_quads # U : Nope | ||
| 81 | nop # E : | ||
| 82 | |||
| 83 | $unroll_body: | ||
| 84 | wh64 ($7) # L1 : memory subsystem hint: 64 bytes at | ||
| 85 | # ($7) are about to be over-written | ||
| 86 | ldq $6, 0($17) # L0 : bytes 0..7 | ||
| 87 | nop # E : | ||
| 88 | nop # E : | ||
| 89 | |||
| 90 | ldq $4, 8($17) # L : bytes 8..15 | ||
| 91 | ldq $5, 16($17) # L : bytes 16..23 | ||
| 92 | addq $7, 64, $7 # E : Update next wh64 address | ||
| 93 | nop # E : | ||
| 94 | |||
| 95 | ldq $3, 24($17) # L : bytes 24..31 | ||
| 96 | addq $16, 64, $1 # E : fallback value for wh64 | ||
| 97 | nop # E : | ||
| 98 | nop # E : | ||
| 99 | |||
| 100 | addq $17, 32, $17 # E : src += 32 bytes | ||
| 101 | stq $6, 0($16) # L : bytes 0..7 | ||
| 102 | nop # E : | ||
| 103 | nop # E : | ||
| 104 | |||
| 105 | stq $4, 8($16) # L : bytes 8..15 | ||
| 106 | stq $5, 16($16) # L : bytes 16..23 | ||
| 107 | subq $18, 192, $2 # E : At least two more trips to go? | ||
| 108 | nop # E : | ||
| 109 | |||
| 110 | stq $3, 24($16) # L : bytes 24..31 | ||
| 111 | addq $16, 32, $16 # E : dest += 32 bytes | ||
| 112 | nop # E : | ||
| 113 | nop # E : | ||
| 114 | |||
| 115 | ldq $6, 0($17) # L : bytes 0..7 | ||
| 116 | ldq $4, 8($17) # L : bytes 8..15 | ||
| 117 | cmovlt $2, $1, $7 # E : Latency 2, extra map slot - Use | ||
| 118 | # fallback wh64 address if < 2 more trips | ||
| 119 | nop # E : | ||
| 120 | |||
| 121 | ldq $5, 16($17) # L : bytes 16..23 | ||
| 122 | ldq $3, 24($17) # L : bytes 24..31 | ||
| 123 | addq $16, 32, $16 # E : dest += 32 | ||
| 124 | subq $18, 64, $18 # E : count -= 64 | ||
| 125 | |||
| 126 | addq $17, 32, $17 # E : src += 32 | ||
| 127 | stq $6, -32($16) # L : bytes 0..7 | ||
| 128 | stq $4, -24($16) # L : bytes 8..15 | ||
| 129 | cmple $18, 63, $1 # E : At least one more trip? | ||
| 130 | |||
| 131 | stq $5, -16($16) # L : bytes 16..23 | ||
| 132 | stq $3, -8($16) # L : bytes 24..31 | ||
| 133 | nop # E : | ||
| 134 | beq $1, $unroll_body | ||
| 135 | |||
| 136 | $tail_quads: | ||
| 137 | $no_unroll: | ||
| 138 | .align 4 | ||
| 139 | subq $18, 8, $18 # E : At least a quad left? | ||
| 140 | blt $18, $less_than_8 # U : Nope | ||
| 141 | nop # E : | ||
| 142 | nop # E : | ||
| 143 | |||
| 144 | $move_a_quad: | ||
| 145 | ldq $1, 0($17) # L : fetch 8 | ||
| 146 | subq $18, 8, $18 # E : count -= 8 | ||
| 147 | addq $17, 8, $17 # E : src += 8 | ||
| 148 | nop # E : | ||
| 149 | |||
| 150 | stq $1, 0($16) # L : store 8 | ||
| 151 | addq $16, 8, $16 # E : dest += 8 | ||
| 152 | bge $18, $move_a_quad # U : | ||
| 153 | nop # E : | ||
| 154 | |||
| 155 | $less_than_8: | ||
| 156 | .align 4 | ||
| 157 | addq $18, 8, $18 # E : add back for trailing bytes | ||
| 158 | ble $18, $nomoredata # U : All-done | ||
| 159 | nop # E : | ||
| 160 | nop # E : | ||
| 161 | |||
| 162 | /* Trailing bytes */ | ||
| 163 | $tail_bytes: | ||
| 164 | subq $18, 1, $18 # E : count-- | ||
| 165 | ldbu $1, 0($17) # L : fetch a byte | ||
| 166 | addq $17, 1, $17 # E : src++ | ||
| 167 | nop # E : | ||
| 168 | |||
| 169 | stb $1, 0($16) # L : store a byte | ||
| 170 | addq $16, 1, $16 # E : dest++ | ||
| 171 | bgt $18, $tail_bytes # U : more to be done? | ||
| 172 | nop # E : | ||
| 173 | |||
| 174 | /* branching to exit takes 3 extra cycles, so replicate exit here */ | ||
| 175 | ret $31, ($26), 1 # L0 : | ||
| 176 | nop # E : | ||
| 177 | nop # E : | ||
| 178 | nop # E : | ||
| 179 | |||
| 180 | $misaligned: | ||
| 181 | mov $0, $4 # E : dest temp | ||
| 182 | and $0, 7, $1 # E : dest alignment mod8 | ||
 | 183 | beq $1, $dest_0mod8 # U : life doesn't totally suck | ||
| 184 | nop | ||
| 185 | |||
| 186 | $aligndest: | ||
| 187 | ble $18, $nomoredata # U : | ||
| 188 | ldbu $1, 0($17) # L : fetch a byte | ||
| 189 | subq $18, 1, $18 # E : count-- | ||
| 190 | addq $17, 1, $17 # E : src++ | ||
| 191 | |||
| 192 | stb $1, 0($4) # L : store it | ||
| 193 | addq $4, 1, $4 # E : dest++ | ||
| 194 | and $4, 7, $1 # E : dest 0mod8 yet? | ||
| 195 | bne $1, $aligndest # U : go until we are aligned. | ||
| 196 | |||
| 197 | /* Source has unknown alignment, but dest is known to be 0mod8 */ | ||
| 198 | $dest_0mod8: | ||
| 199 | subq $18, 8, $18 # E : At least a quad left? | ||
| 200 | blt $18, $misalign_tail # U : Nope | ||
| 201 | ldq_u $3, 0($17) # L : seed (rotating load) of 8 bytes | ||
| 202 | nop # E : | ||
| 203 | |||
| 204 | $mis_quad: | ||
| 205 | ldq_u $16, 8($17) # L : Fetch next 8 | ||
| 206 | extql $3, $17, $3 # U : masking | ||
| 207 | extqh $16, $17, $1 # U : masking | ||
| 208 | bis $3, $1, $1 # E : merged bytes to store | ||
| 209 | |||
| 210 | subq $18, 8, $18 # E : count -= 8 | ||
| 211 | addq $17, 8, $17 # E : src += 8 | ||
| 212 | stq $1, 0($4) # L : store 8 (aligned) | ||
| 213 | mov $16, $3 # E : "rotate" source data | ||
| 214 | |||
| 215 | addq $4, 8, $4 # E : dest += 8 | ||
| 216 | bge $18, $mis_quad # U : More quads to move | ||
| 217 | nop | ||
| 218 | nop | ||
| 219 | |||
| 220 | $misalign_tail: | ||
| 221 | addq $18, 8, $18 # E : account for tail stuff | ||
| 222 | ble $18, $nomoredata # U : | ||
| 223 | nop | ||
| 224 | nop | ||
| 225 | |||
| 226 | $misalign_byte: | ||
| 227 | ldbu $1, 0($17) # L : fetch 1 | ||
| 228 | subq $18, 1, $18 # E : count-- | ||
| 229 | addq $17, 1, $17 # E : src++ | ||
| 230 | nop # E : | ||
| 231 | |||
| 232 | stb $1, 0($4) # L : store | ||
| 233 | addq $4, 1, $4 # E : dest++ | ||
| 234 | bgt $18, $misalign_byte # U : more to go? | ||
| 235 | nop | ||
| 236 | |||
| 237 | |||
| 238 | $nomoredata: | ||
| 239 | ret $31, ($26), 1 # L0 : | ||
| 240 | nop # E : | ||
| 241 | nop # E : | ||
| 242 | nop # E : | ||
| 243 | |||
| 244 | .end memcpy | ||
| 245 | |||
| 246 | /* For backwards module compatibility. */ | ||
| 247 | __memcpy = memcpy | ||
| 248 | .globl __memcpy | ||
diff --git a/arch/alpha/lib/ev6-memset.S b/arch/alpha/lib/ev6-memset.S new file mode 100644 index 00000000000..d8b94e1c7fc --- /dev/null +++ b/arch/alpha/lib/ev6-memset.S | |||
| @@ -0,0 +1,597 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/ev6-memset.S | ||
| 3 | * | ||
| 4 | * This is an efficient (and relatively small) implementation of the C library | ||
| 5 | * "memset()" function for the 21264 implementation of Alpha. | ||
| 6 | * | ||
| 7 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
| 8 | * | ||
| 9 | * Much of the information about 21264 scheduling/coding comes from: | ||
| 10 | * Compiler Writer's Guide for the Alpha 21264 | ||
| 11 | * abbreviated as 'CWG' in other comments here | ||
| 12 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
| 13 | * Scheduling notation: | ||
| 14 | * E - either cluster | ||
| 15 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
| 16 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
| 17 | * The algorithm for the leading and trailing quadwords remains the same, | ||
| 18 | * however the loop has been unrolled to enable better memory throughput, | ||
| 19 | * and the code has been replicated for each of the entry points: __memset | ||
| 20 | * and __memsetw to permit better scheduling to eliminate the stalling | ||
| 21 | * encountered during the mask replication. | ||
| 22 | * A future enhancement might be to put in a byte store loop for really | ||
| 23 | * small (say < 32 bytes) memset()s. Whether or not that change would be | ||
| 24 | * a win in the kernel would depend upon the contextual usage. | ||
| 25 | * WARNING: Maintaining this is going to be more work than the above version, | ||
| 26 | * as fixes will need to be made in multiple places. The performance gain | ||
| 27 | * is worth it. | ||
| 28 | */ | ||
| 29 | |||
| 30 | .set noat | ||
| 31 | .set noreorder | ||
| 32 | .text | ||
| 33 | .globl __memset | ||
| 34 | .globl __memsetw | ||
| 35 | .globl __constant_c_memset | ||
| 36 | .globl memset | ||
| 37 | |||
| 38 | .ent __memset | ||
| 39 | .align 5 | ||
| 40 | __memset: | ||
| 41 | .frame $30,0,$26,0 | ||
| 42 | .prologue 0 | ||
| 43 | |||
| 44 | /* | ||
| 45 | * Serious stalling happens. The only way to mitigate this is to | ||
| 46 | * undertake a major re-write to interleave the constant materialization | ||
| 47 | * with other parts of the fall-through code. This is important, even | ||
| 48 | * though it makes maintenance tougher. | ||
| 49 | * Do this later. | ||
| 50 | */ | ||
| 51 | and $17,255,$1 # E : 00000000000000ch | ||
| 52 | insbl $17,1,$2 # U : 000000000000ch00 | ||
| 53 | bis $16,$16,$0 # E : return value | ||
| 54 | ble $18,end_b # U : zero length requested? | ||
| 55 | |||
| 56 | addq $18,$16,$6 # E : max address to write to | ||
| 57 | bis $1,$2,$17 # E : 000000000000chch | ||
| 58 | insbl $1,2,$3 # U : 0000000000ch0000 | ||
| 59 | insbl $1,3,$4 # U : 00000000ch000000 | ||
| 60 | |||
| 61 | or $3,$4,$3 # E : 00000000chch0000 | ||
| 62 | inswl $17,4,$5 # U : 0000chch00000000 | ||
| 63 | xor $16,$6,$1 # E : will complete write be within one quadword? | ||
| 64 | inswl $17,6,$2 # U : chch000000000000 | ||
| 65 | |||
| 66 | or $17,$3,$17 # E : 00000000chchchch | ||
| 67 | or $2,$5,$2 # E : chchchch00000000 | ||
| 68 | bic $1,7,$1 # E : fit within a single quadword? | ||
| 69 | and $16,7,$3 # E : Target addr misalignment | ||
| 70 | |||
| 71 | or $17,$2,$17 # E : chchchchchchchch | ||
| 72 | beq $1,within_quad_b # U : | ||
| 73 | nop # E : | ||
| 74 | beq $3,aligned_b # U : target is 0mod8 | ||
| 75 | |||
| 76 | /* | ||
| 77 | * Target address is misaligned, and won't fit within a quadword | ||
| 78 | */ | ||
| 79 | ldq_u $4,0($16) # L : Fetch first partial | ||
| 80 | bis $16,$16,$5 # E : Save the address | ||
| 81 | insql $17,$16,$2 # U : Insert new bytes | ||
| 82 | subq $3,8,$3 # E : Invert (for addressing uses) | ||
| 83 | |||
| 84 | addq $18,$3,$18 # E : $18 is new count ($3 is negative) | ||
| 85 | mskql $4,$16,$4 # U : clear relevant parts of the quad | ||
| 86 | subq $16,$3,$16 # E : $16 is new aligned destination | ||
| 87 | bis $2,$4,$1 # E : Final bytes | ||
| 88 | |||
| 89 | nop | ||
| 90 | stq_u $1,0($5) # L : Store result | ||
| 91 | nop | ||
| 92 | nop | ||
| 93 | |||
| 94 | .align 4 | ||
| 95 | aligned_b: | ||
| 96 | /* | ||
| 97 | * We are now guaranteed to be quad aligned, with at least | ||
| 98 | * one partial quad to write. | ||
| 99 | */ | ||
| 100 | |||
| 101 | sra $18,3,$3 # U : Number of remaining quads to write | ||
| 102 | and $18,7,$18 # E : Number of trailing bytes to write | ||
| 103 | bis $16,$16,$5 # E : Save dest address | ||
| 104 | beq $3,no_quad_b # U : tail stuff only | ||
| 105 | |||
| 106 | /* | ||
| 107 | * it's worth the effort to unroll this and use wh64 if possible | ||
| 108 | * Lifted a bunch of code from clear_user.S | ||
| 109 | * At this point, entry values are: | ||
| 110 | * $16 Current destination address | ||
| 111 | * $5 A copy of $16 | ||
| 112 | * $6 The max quadword address to write to | ||
| 113 | * $18 Number trailer bytes | ||
| 114 | * $3 Number quads to write | ||
| 115 | */ | ||
| 116 | |||
| 117 | and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop) | ||
| 118 | subq $3, 16, $4 # E : Only try to unroll if > 128 bytes | ||
| 119 | subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64) | ||
| 120 | blt $4, loop_b # U : | ||
| 121 | |||
| 122 | /* | ||
| 123 | * We know we've got at least 16 quads, minimum of one trip | ||
| 124 | * through unrolled loop. Do a quad at a time to get us 0mod64 | ||
| 125 | * aligned. | ||
| 126 | */ | ||
| 127 | |||
| 128 | nop # E : | ||
| 129 | nop # E : | ||
| 130 | nop # E : | ||
| 131 | beq $1, $bigalign_b # U : | ||
| 132 | |||
| 133 | $alignmod64_b: | ||
| 134 | stq $17, 0($5) # L : | ||
| 135 | subq $3, 1, $3 # E : For consistency later | ||
| 136 | addq $1, 8, $1 # E : Increment towards zero for alignment | ||
| 137 | addq $5, 8, $4 # E : Initial wh64 address (filler instruction) | ||
| 138 | |||
| 139 | nop | ||
| 140 | nop | ||
| 141 | addq $5, 8, $5 # E : Inc address | ||
| 142 | blt $1, $alignmod64_b # U : | ||
| 143 | |||
| 144 | $bigalign_b: | ||
| 145 | /* | ||
| 146 | * $3 - number quads left to go | ||
| 147 | * $5 - target address (aligned 0mod64) | ||
| 148 | * $17 - mask of stuff to store | ||
| 149 | * Scratch registers available: $7, $2, $4, $1 | ||
| 150 | * we know that we'll be taking a minimum of one trip through | ||
| 151 | * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle | ||
| 152 | * Assumes the wh64 needs to be for 2 trips through the loop in the future | ||
 | 153 | * The wh64 is issued for the starting destination address for trip +2 | ||
| 154 | * through the loop, and if there are less than two trips left, the target | ||
| 155 | * address will be for the current trip. | ||
| 156 | */ | ||
| 157 | |||
| 158 | $do_wh64_b: | ||
| 159 | wh64 ($4) # L1 : memory subsystem write hint | ||
| 160 | subq $3, 24, $2 # E : For determining future wh64 addresses | ||
| 161 | stq $17, 0($5) # L : | ||
| 162 | nop # E : | ||
| 163 | |||
| 164 | addq $5, 128, $4 # E : speculative target of next wh64 | ||
| 165 | stq $17, 8($5) # L : | ||
| 166 | stq $17, 16($5) # L : | ||
| 167 | addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr) | ||
| 168 | |||
| 169 | stq $17, 24($5) # L : | ||
| 170 | stq $17, 32($5) # L : | ||
| 171 | cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle | ||
| 172 | nop | ||
| 173 | |||
| 174 | stq $17, 40($5) # L : | ||
| 175 | stq $17, 48($5) # L : | ||
| 176 | subq $3, 16, $2 # E : Repeat the loop at least once more? | ||
| 177 | nop | ||
| 178 | |||
| 179 | stq $17, 56($5) # L : | ||
| 180 | addq $5, 64, $5 # E : | ||
| 181 | subq $3, 8, $3 # E : | ||
| 182 | bge $2, $do_wh64_b # U : | ||
| 183 | |||
| 184 | nop | ||
| 185 | nop | ||
| 186 | nop | ||
| 187 | beq $3, no_quad_b # U : Might have finished already | ||
| 188 | |||
| 189 | .align 4 | ||
| 190 | /* | ||
| 191 | * Simple loop for trailing quadwords, or for small amounts | ||
| 192 | * of data (where we can't use an unrolled loop and wh64) | ||
| 193 | */ | ||
| 194 | loop_b: | ||
| 195 | stq $17,0($5) # L : | ||
| 196 | subq $3,1,$3 # E : Decrement number quads left | ||
| 197 | addq $5,8,$5 # E : Inc address | ||
| 198 | bne $3,loop_b # U : more? | ||
| 199 | |||
| 200 | no_quad_b: | ||
| 201 | /* | ||
| 202 | * Write 0..7 trailing bytes. | ||
| 203 | */ | ||
| 204 | nop # E : | ||
| 205 | beq $18,end_b # U : All done? | ||
| 206 | ldq $7,0($5) # L : | ||
| 207 | mskqh $7,$6,$2 # U : Mask final quad | ||
| 208 | |||
| 209 | insqh $17,$6,$4 # U : New bits | ||
| 210 | bis $2,$4,$1 # E : Put it all together | ||
| 211 | stq $1,0($5) # L : And back to memory | ||
| 212 | ret $31,($26),1 # L0 : | ||
| 213 | |||
| 214 | within_quad_b: | ||
| 215 | ldq_u $1,0($16) # L : | ||
| 216 | insql $17,$16,$2 # U : New bits | ||
| 217 | mskql $1,$16,$4 # U : Clear old | ||
| 218 | bis $2,$4,$2 # E : New result | ||
| 219 | |||
| 220 | mskql $2,$6,$4 # U : | ||
| 221 | mskqh $1,$6,$2 # U : | ||
| 222 | bis $2,$4,$1 # E : | ||
| 223 | stq_u $1,0($16) # L : | ||
| 224 | |||
| 225 | end_b: | ||
| 226 | nop | ||
| 227 | nop | ||
| 228 | nop | ||
| 229 | ret $31,($26),1 # L0 : | ||
| 230 | .end __memset | ||
| 231 | |||
| 232 | /* | ||
| 233 | * This is the original body of code, prior to replication and | ||
| 234 | * rescheduling. Leave it here, as there may be calls to this | ||
| 235 | * entry point. | ||
| 236 | */ | ||
| 237 | .align 4 | ||
| 238 | .ent __constant_c_memset | ||
| 239 | __constant_c_memset: | ||
| 240 | .frame $30,0,$26,0 | ||
| 241 | .prologue 0 | ||
| 242 | |||
| 243 | addq $18,$16,$6 # E : max address to write to | ||
| 244 | bis $16,$16,$0 # E : return value | ||
| 245 | xor $16,$6,$1 # E : will complete write be within one quadword? | ||
| 246 | ble $18,end # U : zero length requested? | ||
| 247 | |||
| 248 | bic $1,7,$1 # E : fit within a single quadword | ||
| 249 | beq $1,within_one_quad # U : | ||
| 250 | and $16,7,$3 # E : Target addr misalignment | ||
| 251 | beq $3,aligned # U : target is 0mod8 | ||
| 252 | |||
| 253 | /* | ||
| 254 | * Target address is misaligned, and won't fit within a quadword | ||
| 255 | */ | ||
| 256 | ldq_u $4,0($16) # L : Fetch first partial | ||
| 257 | bis $16,$16,$5 # E : Save the address | ||
| 258 | insql $17,$16,$2 # U : Insert new bytes | ||
| 259 | subq $3,8,$3 # E : Invert (for addressing uses) | ||
| 260 | |||
| 261 | addq $18,$3,$18 # E : $18 is new count ($3 is negative) | ||
| 262 | mskql $4,$16,$4 # U : clear relevant parts of the quad | ||
| 263 | subq $16,$3,$16 # E : $16 is new aligned destination | ||
| 264 | bis $2,$4,$1 # E : Final bytes | ||
| 265 | |||
| 266 | nop | ||
| 267 | stq_u $1,0($5) # L : Store result | ||
| 268 | nop | ||
| 269 | nop | ||
| 270 | |||
| 271 | .align 4 | ||
| 272 | aligned: | ||
| 273 | /* | ||
| 274 | * We are now guaranteed to be quad aligned, with at least | ||
| 275 | * one partial quad to write. | ||
| 276 | */ | ||
| 277 | |||
| 278 | sra $18,3,$3 # U : Number of remaining quads to write | ||
| 279 | and $18,7,$18 # E : Number of trailing bytes to write | ||
| 280 | bis $16,$16,$5 # E : Save dest address | ||
| 281 | beq $3,no_quad # U : tail stuff only | ||
| 282 | |||
| 283 | /* | ||
| 284 | * it's worth the effort to unroll this and use wh64 if possible | ||
| 285 | * Lifted a bunch of code from clear_user.S | ||
| 286 | * At this point, entry values are: | ||
| 287 | * $16 Current destination address | ||
| 288 | * $5 A copy of $16 | ||
| 289 | * $6 The max quadword address to write to | ||
| 290 | * $18 Number trailer bytes | ||
| 291 | * $3 Number quads to write | ||
| 292 | */ | ||
| 293 | |||
| 294 | and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop) | ||
| 295 | subq $3, 16, $4 # E : Only try to unroll if > 128 bytes | ||
| 296 | subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64) | ||
| 297 | blt $4, loop # U : | ||
| 298 | |||
| 299 | /* | ||
| 300 | * We know we've got at least 16 quads, minimum of one trip | ||
| 301 | * through unrolled loop. Do a quad at a time to get us 0mod64 | ||
| 302 | * aligned. | ||
| 303 | */ | ||
| 304 | |||
| 305 | nop # E : | ||
| 306 | nop # E : | ||
| 307 | nop # E : | ||
| 308 | beq $1, $bigalign # U : | ||
| 309 | |||
| 310 | $alignmod64: | ||
| 311 | stq $17, 0($5) # L : | ||
| 312 | subq $3, 1, $3 # E : For consistency later | ||
| 313 | addq $1, 8, $1 # E : Increment towards zero for alignment | ||
| 314 | addq $5, 8, $4 # E : Initial wh64 address (filler instruction) | ||
| 315 | |||
| 316 | nop | ||
| 317 | nop | ||
| 318 | addq $5, 8, $5 # E : Inc address | ||
| 319 | blt $1, $alignmod64 # U : | ||
| 320 | |||
| 321 | $bigalign: | ||
| 322 | /* | ||
| 323 | * $3 - number quads left to go | ||
| 324 | * $5 - target address (aligned 0mod64) | ||
| 325 | * $17 - mask of stuff to store | ||
| 326 | * Scratch registers available: $7, $2, $4, $1 | ||
| 327 | * we know that we'll be taking a minimum of one trip through | ||
| 328 | * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle | ||
| 329 | * Assumes the wh64 needs to be for 2 trips through the loop in the future | ||
 | 330 | * The wh64 is issued for the starting destination address for trip +2 | ||
| 331 | * through the loop, and if there are less than two trips left, the target | ||
| 332 | * address will be for the current trip. | ||
| 333 | */ | ||
| 334 | |||
| 335 | $do_wh64: | ||
| 336 | wh64 ($4) # L1 : memory subsystem write hint | ||
| 337 | subq $3, 24, $2 # E : For determining future wh64 addresses | ||
| 338 | stq $17, 0($5) # L : | ||
| 339 | nop # E : | ||
| 340 | |||
| 341 | addq $5, 128, $4 # E : speculative target of next wh64 | ||
| 342 | stq $17, 8($5) # L : | ||
| 343 | stq $17, 16($5) # L : | ||
| 344 | addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr) | ||
| 345 | |||
| 346 | stq $17, 24($5) # L : | ||
| 347 | stq $17, 32($5) # L : | ||
| 348 | cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle | ||
| 349 | nop | ||
| 350 | |||
| 351 | stq $17, 40($5) # L : | ||
| 352 | stq $17, 48($5) # L : | ||
| 353 | subq $3, 16, $2 # E : Repeat the loop at least once more? | ||
| 354 | nop | ||
| 355 | |||
| 356 | stq $17, 56($5) # L : | ||
| 357 | addq $5, 64, $5 # E : | ||
| 358 | subq $3, 8, $3 # E : | ||
| 359 | bge $2, $do_wh64 # U : | ||
| 360 | |||
| 361 | nop | ||
| 362 | nop | ||
| 363 | nop | ||
| 364 | beq $3, no_quad # U : Might have finished already | ||
| 365 | |||
| 366 | .align 4 | ||
| 367 | /* | ||
| 368 | * Simple loop for trailing quadwords, or for small amounts | ||
| 369 | * of data (where we can't use an unrolled loop and wh64) | ||
| 370 | */ | ||
| 371 | loop: | ||
| 372 | stq $17,0($5) # L : | ||
| 373 | subq $3,1,$3 # E : Decrement number quads left | ||
| 374 | addq $5,8,$5 # E : Inc address | ||
| 375 | bne $3,loop # U : more? | ||
| 376 | |||
| 377 | no_quad: | ||
| 378 | /* | ||
| 379 | * Write 0..7 trailing bytes. | ||
| 380 | */ | ||
| 381 | nop # E : | ||
| 382 | beq $18,end # U : All done? | ||
| 383 | ldq $7,0($5) # L : | ||
| 384 | mskqh $7,$6,$2 # U : Mask final quad | ||
| 385 | |||
| 386 | insqh $17,$6,$4 # U : New bits | ||
| 387 | bis $2,$4,$1 # E : Put it all together | ||
| 388 | stq $1,0($5) # L : And back to memory | ||
| 389 | ret $31,($26),1 # L0 : | ||
| 390 | |||
| 391 | within_one_quad: | ||
| 392 | ldq_u $1,0($16) # L : | ||
| 393 | insql $17,$16,$2 # U : New bits | ||
| 394 | mskql $1,$16,$4 # U : Clear old | ||
| 395 | bis $2,$4,$2 # E : New result | ||
| 396 | |||
| 397 | mskql $2,$6,$4 # U : | ||
| 398 | mskqh $1,$6,$2 # U : | ||
| 399 | bis $2,$4,$1 # E : | ||
| 400 | stq_u $1,0($16) # L : | ||
| 401 | |||
| 402 | end: | ||
| 403 | nop | ||
| 404 | nop | ||
| 405 | nop | ||
| 406 | ret $31,($26),1 # L0 : | ||
| 407 | .end __constant_c_memset | ||
| 408 | |||
| 409 | /* | ||
| 410 | * __memsetw: fill $18 bytes at $16 with the replicated 16-bit pattern in $17. | ||
| 411 | * This is a replicant of the __constant_c_memset code, rescheduled | ||
| 412 | * to mask stalls; entry point and label names gained a _w suffix. */ | ||
| 413 | .align 5 | ||
| 414 | .ent __memsetw | ||
| 415 | |||
| 416 | __memsetw: | ||
| 417 | .frame $30,0,$26,0 | ||
| 418 | .prologue 0 | ||
| 419 | |||
| 420 | inswl $17,0,$5 # U : 000000000000c1c2 | ||
| 421 | inswl $17,2,$2 # U : 00000000c1c20000 | ||
| 422 | bis $16,$16,$0 # E : return value | ||
| 423 | addq $18,$16,$6 # E : max address to write to | ||
| 424 | |||
| 425 | ble $18, end_w # U : zero length requested? | ||
| 426 | inswl $17,4,$3 # U : 0000c1c200000000 | ||
| 427 | inswl $17,6,$4 # U : c1c2000000000000 | ||
| 428 | xor $16,$6,$1 # E : will complete write be within one quadword? | ||
| 429 | |||
| 430 | or $2,$5,$2 # E : 00000000c1c2c1c2 | ||
| 431 | or $3,$4,$17 # E : c1c2c1c200000000 | ||
| 432 | bic $1,7,$1 # E : fit within a single quadword | ||
| 433 | and $16,7,$3 # E : Target addr misalignment | ||
| 434 | |||
| 435 | or $17,$2,$17 # E : c1c2c1c2c1c2c1c2 | ||
| 436 | beq $1,within_quad_w # U : | ||
| 437 | nop | ||
| 438 | beq $3,aligned_w # U : target is 0mod8 | ||
| 439 | |||
| 440 | /* | ||
| 441 | * Target address is misaligned, and won't fit within a quadword | ||
| 442 | */ | ||
| 443 | ldq_u $4,0($16) # L : Fetch first partial | ||
| 444 | bis $16,$16,$5 # E : Save the address | ||
| 445 | insql $17,$16,$2 # U : Insert new bytes | ||
| 446 | subq $3,8,$3 # E : Invert (for addressing uses) | ||
| 447 | |||
| 448 | addq $18,$3,$18 # E : $18 is new count ($3 is negative) | ||
| 449 | mskql $4,$16,$4 # U : clear relevant parts of the quad | ||
| 450 | subq $16,$3,$16 # E : $16 is new aligned destination | ||
| 451 | bis $2,$4,$1 # E : Final bytes | ||
| 452 | |||
| 453 | nop | ||
| 454 | stq_u $1,0($5) # L : Store result | ||
| 455 | nop | ||
| 456 | nop | ||
| 457 | |||
| 458 | .align 4 | ||
| 459 | aligned_w: | ||
| 460 | /* | ||
| 461 | * We are now guaranteed to be quad aligned, with at least | ||
| 462 | * one partial quad to write. | ||
| 463 | */ | ||
| 464 | |||
| 465 | sra $18,3,$3 # U : Number of remaining quads to write | ||
| 466 | and $18,7,$18 # E : Number of trailing bytes to write | ||
| 467 | bis $16,$16,$5 # E : Save dest address | ||
| 468 | beq $3,no_quad_w # U : tail stuff only | ||
| 469 | |||
| 470 | /* | ||
| 471 | * it's worth the effort to unroll this and use wh64 if possible | ||
| 472 | * Lifted a bunch of code from clear_user.S | ||
| 473 | * At this point, entry values are: | ||
| 474 | * $16 Current destination address | ||
| 475 | * $5 A copy of $16 | ||
| 476 | * $6 The max quadword address to write to | ||
| 477 | * $18 Number trailer bytes | ||
| 478 | * $3 Number quads to write | ||
| 479 | */ | ||
| 480 | |||
| 481 | and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop) | ||
| 482 | subq $3, 16, $4 # E : Only try to unroll if > 128 bytes | ||
| 483 | subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64) | ||
| 484 | blt $4, loop_w # U : | ||
| 485 | |||
| 486 | /* | ||
| 487 | * We know we've got at least 16 quads, minimum of one trip | ||
| 488 | * through unrolled loop. Do a quad at a time to get us 0mod64 | ||
| 489 | * aligned. | ||
| 490 | */ | ||
| 491 | |||
| 492 | nop # E : | ||
| 493 | nop # E : | ||
| 494 | nop # E : | ||
| 495 | beq $1, $bigalign_w # U : | ||
| 496 | |||
| 497 | $alignmod64_w: | ||
| 498 | stq $17, 0($5) # L : | ||
| 499 | subq $3, 1, $3 # E : For consistency later | ||
| 500 | addq $1, 8, $1 # E : Increment towards zero for alignment | ||
| 501 | addq $5, 8, $4 # E : Initial wh64 address (filler instruction) | ||
| 502 | |||
| 503 | nop | ||
| 504 | nop | ||
| 505 | addq $5, 8, $5 # E : Inc address | ||
| 506 | blt $1, $alignmod64_w # U : | ||
| 507 | |||
| 508 | $bigalign_w: | ||
| 509 | /* | ||
| 510 | * $3 - number quads left to go | ||
| 511 | * $5 - target address (aligned 0mod64) | ||
| 512 | * $17 - mask of stuff to store | ||
| 513 | * Scratch registers available: $7, $2, $4, $1 | ||
| 514 | * We know that we'll be taking a minimum of one trip through this loop. | ||
| 515 | * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle | ||
| 516 | * Assumes the wh64 needs to be for 2 trips through the loop in the future. | ||
| 517 | * The wh64 is issued for the starting destination address of trip +2 | ||
| 518 | * through the loop; if there are fewer than two trips left, the target | ||
| 519 | * address will be for the current trip. | ||
| 520 | */ | ||
| 521 | |||
| 522 | $do_wh64_w: | ||
| 523 | wh64 ($4) # L1 : memory subsystem write hint | ||
| 524 | subq $3, 24, $2 # E : For determining future wh64 addresses | ||
| 525 | stq $17, 0($5) # L : | ||
| 526 | nop # E : | ||
| 527 | |||
| 528 | addq $5, 128, $4 # E : speculative target of next wh64 | ||
| 529 | stq $17, 8($5) # L : | ||
| 530 | stq $17, 16($5) # L : | ||
| 531 | addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr) | ||
| 532 | |||
| 533 | stq $17, 24($5) # L : | ||
| 534 | stq $17, 32($5) # L : | ||
| 535 | cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle | ||
| 536 | nop | ||
| 537 | |||
| 538 | stq $17, 40($5) # L : | ||
| 539 | stq $17, 48($5) # L : | ||
| 540 | subq $3, 16, $2 # E : Repeat the loop at least once more? | ||
| 541 | nop | ||
| 542 | |||
| 543 | stq $17, 56($5) # L : | ||
| 544 | addq $5, 64, $5 # E : | ||
| 545 | subq $3, 8, $3 # E : | ||
| 546 | bge $2, $do_wh64_w # U : | ||
| 547 | |||
| 548 | nop | ||
| 549 | nop | ||
| 550 | nop | ||
| 551 | beq $3, no_quad_w # U : Might have finished already | ||
| 552 | |||
| 553 | .align 4 | ||
| 554 | /* | ||
| 555 | * Simple loop for trailing quadwords, or for small amounts | ||
| 556 | * of data (where we can't use an unrolled loop and wh64) | ||
| 557 | */ | ||
| 558 | loop_w: | ||
| 559 | stq $17,0($5) # L : | ||
| 560 | subq $3,1,$3 # E : Decrement number quads left | ||
| 561 | addq $5,8,$5 # E : Inc address | ||
| 562 | bne $3,loop_w # U : more? | ||
| 563 | |||
| 564 | no_quad_w: | ||
| 565 | /* | ||
| 566 | * Write 0..7 trailing bytes. | ||
| 567 | */ | ||
| 568 | nop # E : | ||
| 569 | beq $18,end_w # U : All done? | ||
| 570 | ldq $7,0($5) # L : | ||
| 571 | mskqh $7,$6,$2 # U : Mask final quad | ||
| 572 | |||
| 573 | insqh $17,$6,$4 # U : New bits | ||
| 574 | bis $2,$4,$1 # E : Put it all together | ||
| 575 | stq $1,0($5) # L : And back to memory | ||
| 576 | ret $31,($26),1 # L0 : | ||
| 577 | |||
| 578 | within_quad_w: | ||
| 579 | ldq_u $1,0($16) # L : entire write fits inside one quadword | ||
| 580 | insql $17,$16,$2 # U : New bits | ||
| 581 | mskql $1,$16,$4 # U : Clear old | ||
| 582 | bis $2,$4,$2 # E : New result | ||
| 583 | |||
| 584 | mskql $2,$6,$4 # U : | ||
| 585 | mskqh $1,$6,$2 # U : | ||
| 586 | bis $2,$4,$1 # E : | ||
| 587 | stq_u $1,0($16) # L : | ||
| 588 | |||
| 589 | end_w: | ||
| 590 | nop | ||
| 591 | nop | ||
| 592 | nop | ||
| 593 | ret $31,($26),1 # L0 : | ||
| 594 | |||
| 595 | .end __memsetw | ||
| 596 | |||
| 597 | memset = __memset | ||
diff --git a/arch/alpha/lib/ev6-strncpy_from_user.S b/arch/alpha/lib/ev6-strncpy_from_user.S new file mode 100644 index 00000000000..d2e28178cac --- /dev/null +++ b/arch/alpha/lib/ev6-strncpy_from_user.S | |||
| @@ -0,0 +1,424 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/ev6-strncpy_from_user.S | ||
| 3 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
| 4 | * | ||
| 5 | * Just like strncpy except in the return value: | ||
| 6 | * | ||
| 7 | * -EFAULT if an exception occurs before the terminator is copied. | ||
| 8 | * N if the buffer filled. | ||
| 9 | * | ||
| 10 | * Otherwise the length of the string is returned. | ||
| 11 | * | ||
| 12 | * Much of the information about 21264 scheduling/coding comes from: | ||
| 13 | * Compiler Writer's Guide for the Alpha 21264 | ||
| 14 | * abbreviated as 'CWG' in other comments here | ||
| 15 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
| 16 | * Scheduling notation: | ||
| 17 | * E - either cluster | ||
| 18 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
| 19 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
| 20 | * A bunch of instructions got moved and temp registers were changed | ||
| 21 | * to aid in scheduling. Control flow was also re-arranged to eliminate | ||
| 22 | * branches, and to provide longer code sequences to enable better scheduling. | ||
| 23 | * A total rewrite (using byte load/stores for start & tail sequences) | ||
| 24 | * is desirable, but very difficult to do without a from-scratch rewrite. | ||
| 25 | * Save that for the future. | ||
| 26 | */ | ||
| 27 | |||
| 28 | |||
| 29 | #include <asm/errno.h> | ||
| 30 | #include <asm/regdef.h> | ||
| 31 | |||
| 32 | |||
| 33 | /* Allow an exception for insn x: register it in __ex_table so a fault branches to $exception (the ($0) operand presumably marks v0 as the register the fixup loads with -EFAULT -- confirm against the arch fault handler). */ | ||
| 34 | #define EX(x,y...) \ | ||
| 35 | 99: x,##y; \ | ||
| 36 | .section __ex_table,"a"; \ | ||
| 37 | .long 99b - .; \ | ||
| 38 | lda $31, $exception-99b($0); \ | ||
| 39 | .previous | ||
| 40 | |||
| 41 | |||
| 42 | .set noat | ||
| 43 | .set noreorder | ||
| 44 | .text | ||
| 45 | |||
| 46 | .globl __strncpy_from_user | ||
| 47 | .ent __strncpy_from_user | ||
| 48 | .frame $30, 0, $26 | ||
| 49 | .prologue 0 | ||
| 50 | |||
| 51 | .align 4 | ||
| 52 | __strncpy_from_user: | ||
| 53 | and a0, 7, t3 # E : find dest misalignment | ||
| 54 | beq a2, $zerolength # U : | ||
| 55 | |||
| 56 | /* Are source and destination co-aligned? */ | ||
| 57 | mov a0, v0 # E : save the string start | ||
| 58 | xor a0, a1, t4 # E : | ||
| 59 | EX( ldq_u t1, 0(a1) ) # L : Latency=3 load first quadword | ||
| 60 | ldq_u t0, 0(a0) # L : load first (partial) aligned dest quadword | ||
| 61 | |||
| 62 | addq a2, t3, a2 # E : bias count by dest misalignment | ||
| 63 | subq a2, 1, a3 # E : | ||
| 64 | addq zero, 1, t10 # E : | ||
| 65 | and t4, 7, t4 # E : misalignment between the two | ||
| 66 | |||
| 67 | and a3, 7, t6 # E : number of tail bytes | ||
| 68 | sll t10, t6, t10 # E : t10 = bitmask of last count byte | ||
| 69 | bne t4, $unaligned # U : | ||
| 70 | lda t2, -1 # E : build a mask against false zero | ||
| 71 | |||
| 72 | /* | ||
| 73 | * We are co-aligned; take care of a partial first word. | ||
| 74 | * On entry to this basic block: | ||
| 75 | * t0 == the first destination word for masking back in | ||
| 76 | * t1 == the first source word. | ||
| 77 | */ | ||
| 78 | |||
| 79 | srl a3, 3, a2 # E : a2 = loop counter = (count - 1)/8 | ||
| 80 | addq a1, 8, a1 # E : | ||
| 81 | mskqh t2, a1, t2 # U : detection in the src word | ||
| 82 | nop | ||
| 83 | |||
| 84 | /* Create the 1st output word and detect 0's in the 1st input word. */ | ||
| 85 | mskqh t1, a1, t3 # U : | ||
| 86 | mskql t0, a1, t0 # U : assemble the first output word | ||
| 87 | ornot t1, t2, t2 # E : | ||
| 88 | nop | ||
| 89 | |||
| 90 | cmpbge zero, t2, t8 # E : bits set iff null found | ||
| 91 | or t0, t3, t0 # E : | ||
| 92 | beq a2, $a_eoc # U : | ||
| 93 | bne t8, $a_eos # U : 2nd branch in a quad. Bad. | ||
| 94 | |||
| 95 | /* On entry to this basic block: | ||
| 96 | * t0 == a source quad not containing a null. | ||
| 97 | * a0 - current aligned destination address | ||
| 98 | * a1 - current aligned source address | ||
| 99 | * a2 - count of quadwords to move. | ||
| 100 | * NOTE: Loop improvement - unrolling this is going to be | ||
| 101 | * a huge win, since we're going to stall otherwise. | ||
| 102 | * Fix this later. For _really_ large copies, look | ||
| 103 | * at using wh64 on a look-ahead basis. See the code | ||
| 104 | * in clear_user.S and copy_user.S. | ||
| 105 | * Presumably, since (a0) and (a1) do not overlap (by C definition), load/store ordering between the two streams is not a correctness concern. | ||
| 106 | * Lots of nops here: | ||
| 107 | * - Separate loads from stores | ||
| 108 | * - Keep it to 1 branch/quadpack so the branch predictor | ||
| 109 | * can train. | ||
| 110 | */ | ||
| 111 | $a_loop: | ||
| 112 | stq_u t0, 0(a0) # L : | ||
| 113 | addq a0, 8, a0 # E : | ||
| 114 | nop | ||
| 115 | subq a2, 1, a2 # E : | ||
| 116 | |||
| 117 | EX( ldq_u t0, 0(a1) ) # L : | ||
| 118 | addq a1, 8, a1 # E : | ||
| 119 | cmpbge zero, t0, t8 # E : Stall 2 cycles on t0 | ||
| 120 | beq a2, $a_eoc # U : | ||
| 121 | |||
| 122 | beq t8, $a_loop # U : | ||
| 123 | nop | ||
| 124 | nop | ||
| 125 | nop | ||
| 126 | |||
| 127 | /* Take care of the final (partial) word store. At this point | ||
| 128 | * the end-of-count bit is set in t8 iff it applies. | ||
| 129 | * | ||
| 130 | * On entry to this basic block we have: | ||
| 131 | * t0 == the source word containing the null | ||
| 132 | * t8 == the cmpbge mask that found it. | ||
| 133 | */ | ||
| 134 | $a_eos: | ||
| 135 | negq t8, t12 # E : find low bit set | ||
| 136 | and t8, t12, t12 # E : | ||
| 137 | |||
| 138 | /* We're doing a partial word store and so need to combine | ||
| 139 | our source and original destination words. */ | ||
| 140 | ldq_u t1, 0(a0) # L : | ||
| 141 | subq t12, 1, t6 # E : | ||
| 142 | |||
| 143 | or t12, t6, t8 # E : | ||
| 144 | zapnot t0, t8, t0 # U : clear src bytes > null | ||
| 145 | zap t1, t8, t1 # U : clear dst bytes <= null | ||
| 146 | or t0, t1, t0 # E : | ||
| 147 | |||
| 148 | stq_u t0, 0(a0) # L : | ||
| 149 | br $finish_up # L0 : | ||
| 150 | nop | ||
| 151 | nop | ||
| 152 | |||
| 153 | /* Add the end-of-count bit to the eos detection bitmask. */ | ||
| 154 | .align 4 | ||
| 155 | $a_eoc: | ||
| 156 | or t10, t8, t8 | ||
| 157 | br $a_eos | ||
| 158 | nop | ||
| 159 | nop | ||
| 160 | |||
| 161 | |||
| 162 | /* The source and destination are not co-aligned. Align the destination | ||
| 163 | and cope. We have to be very careful about not reading too much and | ||
| 164 | causing a SEGV. */ | ||
| 165 | |||
| 166 | .align 4 | ||
| 167 | $u_head: | ||
| 168 | /* We know just enough now to be able to assemble the first | ||
| 169 | full source word. We can still find a zero at the end of it | ||
| 170 | that prevents us from outputting the whole thing. | ||
| 171 | |||
| 172 | On entry to this basic block: | ||
| 173 | t0 == the first dest word, unmasked | ||
| 174 | t1 == the shifted low bits of the first source word | ||
| 175 | t6 == bytemask that is -1 in dest word bytes */ | ||
| 176 | |||
| 177 | EX( ldq_u t2, 8(a1) ) # L : load second src word | ||
| 178 | addq a1, 8, a1 # E : | ||
| 179 | mskql t0, a0, t0 # U : mask trailing garbage in dst | ||
| 180 | extqh t2, a1, t4 # U : | ||
| 181 | |||
| 182 | or t1, t4, t1 # E : first aligned src word complete | ||
| 183 | mskqh t1, a0, t1 # U : mask leading garbage in src | ||
| 184 | or t0, t1, t0 # E : first output word complete | ||
| 185 | or t0, t6, t6 # E : mask original data for zero test | ||
| 186 | |||
| 187 | cmpbge zero, t6, t8 # E : | ||
| 188 | beq a2, $u_eocfin # U : | ||
| 189 | bne t8, $u_final # U : bad news - 2nd branch in a quad | ||
| 190 | lda t6, -1 # E : mask out the bits we have | ||
| 191 | |||
| 192 | mskql t6, a1, t6 # U : already seen | ||
| 193 | stq_u t0, 0(a0) # L : store first output word | ||
| 194 | or t6, t2, t2 # E : | ||
| 195 | cmpbge zero, t2, t8 # E : find nulls in second partial | ||
| 196 | |||
| 197 | addq a0, 8, a0 # E : | ||
| 198 | subq a2, 1, a2 # E : | ||
| 199 | bne t8, $u_late_head_exit # U : | ||
| 200 | nop | ||
| 201 | |||
| 202 | /* Finally, we've got all the stupid leading edge cases taken care | ||
| 203 | of and we can set up to enter the main loop. */ | ||
| 204 | |||
| 205 | extql t2, a1, t1 # U : position hi-bits of lo word | ||
| 206 | EX( ldq_u t2, 8(a1) ) # L : read next high-order source word | ||
| 207 | addq a1, 8, a1 # E : | ||
| 208 | cmpbge zero, t2, t8 # E : | ||
| 209 | |||
| 210 | beq a2, $u_eoc # U : | ||
| 211 | bne t8, $u_eos # U : | ||
| 212 | nop | ||
| 213 | nop | ||
| 214 | |||
| 215 | /* Unaligned copy main loop. In order to avoid reading too much, | ||
| 216 | the loop is structured to detect zeros in aligned source words. | ||
| 217 | This has, unfortunately, effectively pulled half of a loop | ||
| 218 | iteration out into the head and half into the tail, but it does | ||
| 219 | prevent nastiness from accumulating in the very thing we want | ||
| 220 | to run as fast as possible. | ||
| 221 | |||
| 222 | On entry to this basic block: | ||
| 223 | t1 == the shifted high-order bits from the previous source word | ||
| 224 | t2 == the unshifted current source word | ||
| 225 | |||
| 226 | We further know that t2 does not contain a null terminator. */ | ||
| 227 | |||
| 228 | /* | ||
| 229 | * Extra nops here: | ||
| 230 | * separate load quads from store quads | ||
| 231 | * only one branch/quad to permit predictor training | ||
| 232 | */ | ||
| 233 | |||
| 234 | .align 4 | ||
| 235 | $u_loop: | ||
| 236 | extqh t2, a1, t0 # U : extract high bits for current word | ||
| 237 | addq a1, 8, a1 # E : | ||
| 238 | extql t2, a1, t3 # U : extract low bits for next time | ||
| 239 | addq a0, 8, a0 # E : | ||
| 240 | |||
| 241 | or t0, t1, t0 # E : current dst word now complete | ||
| 242 | EX( ldq_u t2, 0(a1) ) # L : load high word for next time | ||
| 243 | subq a2, 1, a2 # E : | ||
| 244 | nop | ||
| 245 | |||
| 246 | stq_u t0, -8(a0) # L : save the current word | ||
| 247 | mov t3, t1 # E : | ||
| 248 | cmpbge zero, t2, t8 # E : test new word for eos | ||
| 249 | beq a2, $u_eoc # U : | ||
| 250 | |||
| 251 | beq t8, $u_loop # U : | ||
| 252 | nop | ||
| 253 | nop | ||
| 254 | nop | ||
| 255 | |||
| 256 | /* We've found a zero somewhere in the source word we just read. | ||
| 257 | If it resides in the lower half, we have one (probably partial) | ||
| 258 | word to write out, and if it resides in the upper half, we | ||
| 259 | have one full and one partial word left to write out. | ||
| 260 | |||
| 261 | On entry to this basic block: | ||
| 262 | t1 == the shifted high-order bits from the previous source word | ||
| 263 | t2 == the unshifted current source word. */ | ||
| 264 | .align 4 | ||
| 265 | $u_eos: | ||
| 266 | extqh t2, a1, t0 # U : | ||
| 267 | or t0, t1, t0 # E : first (partial) source word complete | ||
| 268 | cmpbge zero, t0, t8 # E : is the null in this first word? | ||
| 269 | nop | ||
| 270 | |||
| 271 | bne t8, $u_final # U : | ||
| 272 | stq_u t0, 0(a0) # L : the null was in the high-order bits | ||
| 273 | addq a0, 8, a0 # E : | ||
| 274 | subq a2, 1, a2 # E : | ||
| 275 | |||
| 276 | .align 4 | ||
| 277 | $u_late_head_exit: | ||
| 278 | extql t2, a1, t0 # U : | ||
| 279 | cmpbge zero, t0, t8 # E : | ||
| 280 | or t8, t10, t6 # E : | ||
| 281 | cmoveq a2, t6, t8 # E : | ||
| 282 | |||
| 283 | /* Take care of a final (probably partial) result word. | ||
| 284 | On entry to this basic block: | ||
| 285 | t0 == assembled source word | ||
| 286 | t8 == cmpbge mask that found the null. */ | ||
| 287 | .align 4 | ||
| 288 | $u_final: | ||
| 289 | negq t8, t6 # E : isolate low bit set | ||
| 290 | and t6, t8, t12 # E : | ||
| 291 | ldq_u t1, 0(a0) # L : | ||
| 292 | subq t12, 1, t6 # E : | ||
| 293 | |||
| 294 | or t6, t12, t8 # E : | ||
| 295 | zapnot t0, t8, t0 # U : kill source bytes > null | ||
| 296 | zap t1, t8, t1 # U : kill dest bytes <= null | ||
| 297 | or t0, t1, t0 # E : | ||
| 298 | |||
| 299 | stq_u t0, 0(a0) # E : | ||
| 300 | br $finish_up # U : | ||
| 301 | nop | ||
| 302 | nop | ||
| 303 | |||
| 304 | .align 4 | ||
| 305 | $u_eoc: # end-of-count | ||
| 306 | extqh t2, a1, t0 # U : | ||
| 307 | or t0, t1, t0 # E : | ||
| 308 | cmpbge zero, t0, t8 # E : | ||
| 309 | nop | ||
| 310 | |||
| 311 | .align 4 | ||
| 312 | $u_eocfin: # end-of-count, final word | ||
| 313 | or t10, t8, t8 # E : | ||
| 314 | br $u_final # U : | ||
| 315 | nop | ||
| 316 | nop | ||
| 317 | |||
| 318 | /* Unaligned copy entry point. */ | ||
| 319 | .align 4 | ||
| 320 | $unaligned: | ||
| 321 | |||
| 322 | srl a3, 3, a2 # U : a2 = loop counter = (count - 1)/8 | ||
| 323 | and a0, 7, t4 # E : find dest misalignment | ||
| 324 | and a1, 7, t5 # E : find src misalignment | ||
| 325 | mov zero, t0 # E : | ||
| 326 | |||
| 327 | /* Conditionally load the first destination word and a bytemask | ||
| 328 | with 0xff indicating that the destination byte is sacrosanct. */ | ||
| 329 | |||
| 330 | mov zero, t6 # E : | ||
| 331 | beq t4, 1f # U : | ||
| 332 | ldq_u t0, 0(a0) # L : | ||
| 333 | lda t6, -1 # E : | ||
| 334 | |||
| 335 | mskql t6, a0, t6 # E : | ||
| 336 | nop | ||
| 337 | nop | ||
| 338 | nop | ||
| 339 | |||
| 340 | .align 4 | ||
| 341 | 1: | ||
| 342 | subq a1, t4, a1 # E : sub dest misalignment from src addr | ||
| 343 | /* If source misalignment is larger than dest misalignment, we need | ||
| 344 | extra startup checks to avoid SEGV. */ | ||
| 345 | cmplt t4, t5, t12 # E : | ||
| 346 | extql t1, a1, t1 # U : shift src into place | ||
| 347 | lda t2, -1 # E : for creating masks later | ||
| 348 | |||
| 349 | beq t12, $u_head # U : | ||
| 350 | mskqh t2, t5, t2 # U : begin src byte validity mask | ||
| 351 | cmpbge zero, t1, t8 # E : is there a zero? | ||
| 352 | nop | ||
| 353 | |||
| 354 | extql t2, a1, t2 # U : | ||
| 355 | or t8, t10, t5 # E : test for end-of-count too | ||
| 356 | cmpbge zero, t2, t3 # E : | ||
| 357 | cmoveq a2, t5, t8 # E : Latency=2, extra map slot | ||
| 358 | |||
| 359 | nop # E : goes with cmov | ||
| 360 | andnot t8, t3, t8 # E : | ||
| 361 | beq t8, $u_head # U : | ||
| 362 | nop | ||
| 363 | |||
| 364 | /* At this point we've found a zero in the first partial word of | ||
| 365 | the source. We need to isolate the valid source data and mask | ||
| 366 | it into the original destination data. (Incidentally, we know | ||
| 367 | that we'll need at least one byte of that original dest word.) */ | ||
| 368 | |||
| 369 | ldq_u t0, 0(a0) # L : | ||
| 370 | negq t8, t6 # E : build bitmask of bytes <= zero | ||
| 371 | mskqh t1, t4, t1 # U : | ||
| 372 | and t6, t8, t12 # E : | ||
| 373 | |||
| 374 | subq t12, 1, t6 # E : | ||
| 375 | or t6, t12, t8 # E : | ||
| 376 | zapnot t2, t8, t2 # U : prepare source word; mirror changes | ||
| 377 | zapnot t1, t8, t1 # U : to source validity mask | ||
| 378 | |||
| 379 | andnot t0, t2, t0 # E : zero place for source to reside | ||
| 380 | or t0, t1, t0 # E : and put it there | ||
| 381 | stq_u t0, 0(a0) # L : | ||
| 382 | nop | ||
| 383 | |||
| 384 | .align 4 | ||
| 385 | $finish_up: | ||
| 386 | zapnot t0, t12, t4 # U : was last byte written null? | ||
| 387 | and t12, 0xf0, t3 # E : binary search for the address of the | ||
| 388 | cmovne t4, 1, t4 # E : Latency=2, extra map slot | ||
| 389 | nop # E : with cmovne | ||
| 390 | |||
| 391 | and t12, 0xcc, t2 # E : last byte written | ||
| 392 | and t12, 0xaa, t1 # E : | ||
| 393 | cmovne t3, 4, t3 # E : Latency=2, extra map slot | ||
| 394 | nop # E : with cmovne | ||
| 395 | |||
| 396 | bic a0, 7, t0 | ||
| 397 | cmovne t2, 2, t2 # E : Latency=2, extra map slot | ||
| 398 | nop # E : with cmovne | ||
| 399 | nop | ||
| 400 | |||
| 401 | cmovne t1, 1, t1 # E : Latency=2, extra map slot | ||
| 402 | nop # E : with cmovne | ||
| 403 | addq t0, t3, t0 # E : | ||
| 404 | addq t1, t2, t1 # E : | ||
| 405 | |||
| 406 | addq t0, t1, t0 # E : | ||
| 407 | addq t0, t4, t0 # add one if we filled the buffer | ||
| 408 | subq t0, v0, v0 # find string length | ||
| 409 | ret # L0 : | ||
| 410 | |||
| 411 | .align 4 | ||
| 412 | $zerolength: | ||
| 413 | nop | ||
| 414 | nop | ||
| 415 | nop | ||
| 416 | clr v0 # E : zero-length request: return 0 and fall through | ||
| 417 | |||
| 418 | $exception: | ||
| 419 | nop # NOTE(review): presumably the fault fixup (see EX) put -EFAULT in v0 -- confirm | ||
| 420 | nop | ||
| 421 | nop | ||
| 422 | ret | ||
| 423 | |||
| 424 | .end __strncpy_from_user | ||
diff --git a/arch/alpha/lib/ev6-stxcpy.S b/arch/alpha/lib/ev6-stxcpy.S new file mode 100644 index 00000000000..4643ff2ffc8 --- /dev/null +++ b/arch/alpha/lib/ev6-stxcpy.S | |||
| @@ -0,0 +1,321 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/ev6-stxcpy.S | ||
| 3 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
| 4 | * | ||
| 5 | * Copy a null-terminated string from SRC to DST. | ||
| 6 | * | ||
| 7 | * This is an internal routine used by strcpy, stpcpy, and strcat. | ||
| 8 | * As such, it uses special linkage conventions to make implementation | ||
| 9 | * of these public functions more efficient. | ||
| 10 | * | ||
| 11 | * On input: | ||
| 12 | * t9 = return address | ||
| 13 | * a0 = DST | ||
| 14 | * a1 = SRC | ||
| 15 | * | ||
| 16 | * On output: | ||
| 17 | * t12 = bitmask (with one bit set) indicating the last byte written | ||
| 18 | * a0 = unaligned address of the last *word* written | ||
| 19 | * | ||
| 20 | * Furthermore, v0, a3-a5, t11, and t12 are untouched. | ||
| 21 | * | ||
| 22 | * Much of the information about 21264 scheduling/coding comes from: | ||
| 23 | * Compiler Writer's Guide for the Alpha 21264 | ||
| 24 | * abbreviated as 'CWG' in other comments here | ||
| 25 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
| 26 | * Scheduling notation: | ||
| 27 | * E - either cluster | ||
| 28 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
| 29 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
| 30 | * Try not to change the actual algorithm if possible for consistency. | ||
| 31 | */ | ||
| 32 | |||
| 33 | #include <asm/regdef.h> | ||
| 34 | |||
| 35 | .set noat | ||
| 36 | .set noreorder | ||
| 37 | |||
| 38 | .text | ||
| 39 | |||
| 40 | /* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that | ||
| 41 | doesn't like putting the entry point for a procedure somewhere in the | ||
| 42 | middle of the procedure descriptor. Work around this by putting the | ||
| 43 | aligned copy in its own procedure descriptor */ | ||
| 44 | |||
| 45 | |||
| 46 | .ent stxcpy_aligned | ||
| 47 | .align 4 | ||
| 48 | stxcpy_aligned: | ||
| 49 | .frame sp, 0, t9 | ||
| 50 | .prologue 0 | ||
| 51 | |||
| 52 | /* On entry to this basic block: | ||
| 53 | t0 == the first destination word for masking back in | ||
| 54 | t1 == the first source word. */ | ||
| 55 | |||
| 56 | /* Create the 1st output word and detect 0's in the 1st input word. */ | ||
| 57 | lda t2, -1 # E : build a mask against false zero | ||
| 58 | mskqh t2, a1, t2 # U : detection in the src word (stall) | ||
| 59 | mskqh t1, a1, t3 # U : | ||
| 60 | ornot t1, t2, t2 # E : (stall) | ||
| 61 | |||
| 62 | mskql t0, a1, t0 # U : assemble the first output word | ||
| 63 | cmpbge zero, t2, t8 # E : bits set iff null found | ||
| 64 | or t0, t3, t1 # E : (stall) | ||
| 65 | bne t8, $a_eos # U : (stall) | ||
| 66 | |||
| 67 | /* On entry to this basic block: | ||
| 68 | t0 == the first destination word for masking back in | ||
| 69 | t1 == a source word not containing a null. */ | ||
| 70 | /* Nops here to separate store quads from load quads */ | ||
| 71 | |||
| 72 | $a_loop: | ||
| 73 | stq_u t1, 0(a0) # L : | ||
| 74 | addq a0, 8, a0 # E : | ||
| 75 | nop | ||
| 76 | nop | ||
| 77 | |||
| 78 | ldq_u t1, 0(a1) # L : Latency=3 | ||
| 79 | addq a1, 8, a1 # E : | ||
| 80 | cmpbge zero, t1, t8 # E : (3 cycle stall) | ||
| 81 | beq t8, $a_loop # U : (stall for t8) | ||
| 82 | |||
| 83 | /* Take care of the final (partial) word store. | ||
| 84 | On entry to this basic block we have: | ||
| 85 | t1 == the source word containing the null | ||
| 86 | t8 == the cmpbge mask that found it. */ | ||
| 87 | $a_eos: | ||
| 88 | negq t8, t6 # E : find low bit set | ||
| 89 | and t8, t6, t12 # E : (stall) | ||
| 90 | /* For the sake of the cache, don't read a destination word | ||
| 91 | if we're not going to need it. */ | ||
| 92 | and t12, 0x80, t6 # E : null in the last byte? (stall) | ||
| 93 | bne t6, 1f # U : whole quad is src data; skip dst merge (stall) | ||
| 94 | |||
| 95 | /* We're doing a partial word store and so need to combine | ||
| 96 | our source and original destination words. */ | ||
| 97 | ldq_u t0, 0(a0) # L : Latency=3 | ||
| 98 | subq t12, 1, t6 # E : | ||
| 99 | zapnot t1, t6, t1 # U : clear src bytes >= null (stall) | ||
| 100 | or t12, t6, t8 # E : (stall) | ||
| 101 | |||
| 102 | zap t0, t8, t0 # E : clear dst bytes <= null | ||
| 103 | or t0, t1, t1 # E : (stall) | ||
| 104 | nop | ||
| 105 | nop | ||
| 106 | |||
| 107 | 1: stq_u t1, 0(a0) # L : | ||
| 108 | ret (t9) # L0 : Latency=3 | ||
| 109 | nop | ||
| 110 | nop | ||
| 111 | |||
| 112 | .end stxcpy_aligned | ||
| 113 | |||
| 114 | .align 4 | ||
| 115 | .ent __stxcpy | ||
| 116 | .globl __stxcpy | ||
| 117 | __stxcpy: | ||
| 118 | .frame sp, 0, t9 | ||
| 119 | .prologue 0 | ||
| 120 | |||
| 121 | /* Are source and destination co-aligned? */ | ||
| 122 | xor a0, a1, t0 # E : | ||
| 123 | unop # E : | ||
| 124 | and t0, 7, t0 # E : (stall) | ||
| 125 | bne t0, $unaligned # U : (stall) | ||
| 126 | |||
| 127 | /* We are co-aligned; take care of a partial first word. */ | ||
| 128 | ldq_u t1, 0(a1) # L : load first src word | ||
| 129 | and a0, 7, t0 # E : take care not to load a word ... | ||
| 130 | addq a1, 8, a1 # E : | ||
| 131 | beq t0, stxcpy_aligned # U : ... if we wont need it (stall) | ||
| 132 | |||
| 133 | ldq_u t0, 0(a0) # L : | ||
| 134 | br stxcpy_aligned # L0 : Latency=3 | ||
| 135 | nop | ||
| 136 | nop | ||
| 137 | |||
| 138 | |||
| 139 | /* The source and destination are not co-aligned. Align the destination | ||
| 140 | and cope. We have to be very careful about not reading too much and | ||
| 141 | causing a SEGV. */ | ||
| 142 | |||
| 143 | .align 4 | ||
| 144 | $u_head: | ||
| 145 | /* We know just enough now to be able to assemble the first | ||
| 146 | full source word. We can still find a zero at the end of it | ||
| 147 | that prevents us from outputting the whole thing. | ||
| 148 | |||
| 149 | On entry to this basic block: | ||
| 150 | t0 == the first dest word, for masking back in, if needed else 0 | ||
| 151 | t1 == the low bits of the first source word | ||
| 152 | t6 == bytemask that is -1 in dest word bytes */ | ||
| 153 | |||
| 154 | ldq_u t2, 8(a1) # L : | ||
| 155 | addq a1, 8, a1 # E : | ||
| 156 | extql t1, a1, t1 # U : (stall on a1) | ||
| 157 | extqh t2, a1, t4 # U : (stall on a1) | ||
| 158 | |||
| 159 | mskql t0, a0, t0 # U : | ||
| 160 | or t1, t4, t1 # E : | ||
| 161 | mskqh t1, a0, t1 # U : (stall on t1) | ||
| 162 | or t0, t1, t1 # E : (stall on t1) | ||
| 163 | |||
| 164 | or t1, t6, t6 # E : | ||
| 165 | cmpbge zero, t6, t8 # E : (stall) | ||
| 166 | lda t6, -1 # E : for masking just below | ||
| 167 | bne t8, $u_final # U : (stall) | ||
| 168 | |||
| 169 | mskql t6, a1, t6 # U : mask out the bits we have | ||
| 170 | or t6, t2, t2 # E : already extracted before (stall) | ||
| 171 | cmpbge zero, t2, t8 # E : testing eos (stall) | ||
| 172 | bne t8, $u_late_head_exit # U : (stall) | ||
| 173 | |||
| 174 | /* Finally, we've got all the stupid leading edge cases taken care | ||
| 175 | of and we can set up to enter the main loop. */ | ||
| 176 | |||
| 177 | stq_u t1, 0(a0) # L : store first output word | ||
| 178 | addq a0, 8, a0 # E : | ||
| 179 | extql t2, a1, t0 # U : position ho-bits of lo word | ||
| 180 | ldq_u t2, 8(a1) # U : read next high-order source word | ||
| 181 | |||
| 182 | addq a1, 8, a1 # E : | ||
| 183 | cmpbge zero, t2, t8 # E : (stall for t2) | ||
| 184 | nop # E : | ||
| 185 | bne t8, $u_eos # U : (stall) | ||
| 186 | |||
| 187 | /* Unaligned copy main loop. In order to avoid reading too much, | ||
| 188 | the loop is structured to detect zeros in aligned source words. | ||
| 189 | This has, unfortunately, effectively pulled half of a loop | ||
| 190 | iteration out into the head and half into the tail, but it does | ||
| 191 | prevent nastiness from accumulating in the very thing we want | ||
| 192 | to run as fast as possible. | ||
| 193 | |||
| 194 | On entry to this basic block: | ||
| 195 | t0 == the shifted high-order bits from the previous source word | ||
| 196 | t2 == the unshifted current source word | ||
| 197 | |||
| 198 | We further know that t2 does not contain a null terminator. */ | ||
| 199 | |||
| 200 | .align 3 | ||
| 201 | $u_loop: | ||
| 202 | extqh t2, a1, t1 # U : extract high bits for current word | ||
| 203 | addq a1, 8, a1 # E : (stall) | ||
| 204 | extql t2, a1, t3 # U : extract low bits for next time (stall) | ||
| 205 | addq a0, 8, a0 # E : | ||
| 206 | |||
| 207 | or t0, t1, t1 # E : current dst word now complete | ||
| 208 | ldq_u t2, 0(a1) # L : Latency=3 load high word for next time | ||
| 209 | stq_u t1, -8(a0) # L : save the current word (stall) | ||
| 210 | mov t3, t0 # E : | ||
| 211 | |||
| 212 | cmpbge zero, t2, t8 # E : test new word for eos | ||
| 213 | beq t8, $u_loop # U : (stall) | ||
| 214 | nop | ||
| 215 | nop | ||
| 216 | |||
| 217 | /* We've found a zero somewhere in the source word we just read. | ||
| 218 | If it resides in the lower half, we have one (probably partial) | ||
| 219 | word to write out, and if it resides in the upper half, we | ||
| 220 | have one full and one partial word left to write out. | ||
| 221 | |||
| 222 | On entry to this basic block: | ||
| 223 | t0 == the shifted high-order bits from the previous source word | ||
| 224 | t2 == the unshifted current source word. */ | ||
| 225 | $u_eos: | ||
| 226 | extqh t2, a1, t1 # U : | ||
| 227 | or t0, t1, t1 # E : first (partial) source word complete (stall) | ||
| 228 | cmpbge zero, t1, t8 # E : is the null in this first bit? (stall) | ||
| 229 | bne t8, $u_final # U : (stall) | ||
| 230 | |||
| 231 | $u_late_head_exit: | ||
| 232 | stq_u t1, 0(a0) # L : the null was in the high-order bits | ||
| 233 | addq a0, 8, a0 # E : | ||
| 234 | extql t2, a1, t1 # U : | ||
| 235 | cmpbge zero, t1, t8 # E : (stall) | ||
| 236 | |||
| 237 | /* Take care of a final (probably partial) result word. | ||
| 238 | On entry to this basic block: | ||
| 239 | t1 == assembled source word | ||
| 240 | t8 == cmpbge mask that found the null. */ | ||
| 241 | $u_final: | ||
| 242 | negq t8, t6 # E : isolate low bit set | ||
| 243 | and t6, t8, t12 # E : (stall) | ||
| 244 | and t12, 0x80, t6 # E : avoid dest word load if we can (stall) | ||
| 245 | bne t6, 1f # U : (stall) | ||
| 246 | |||
| 247 | ldq_u t0, 0(a0) # E : | ||
| 248 | subq t12, 1, t6 # E : | ||
| 249 | or t6, t12, t8 # E : (stall) | ||
| 250 | zapnot t1, t6, t1 # U : kill source bytes >= null (stall) | ||
| 251 | |||
| 252 | zap t0, t8, t0 # U : kill dest bytes <= null (2 cycle data stall) | ||
| 253 | or t0, t1, t1 # E : (stall) | ||
| 254 | nop | ||
| 255 | nop | ||
| 256 | |||
| 257 | 1: stq_u t1, 0(a0) # L : | ||
| 258 | ret (t9) # L0 : Latency=3 | ||
| 259 | nop | ||
| 260 | nop | ||
| 261 | |||
| 262 | /* Unaligned copy entry point. */ | ||
| 263 | .align 4 | ||
| 264 | $unaligned: | ||
| 265 | |||
| 266 | ldq_u t1, 0(a1) # L : load first source word | ||
| 267 | and a0, 7, t4 # E : find dest misalignment | ||
| 268 | and a1, 7, t5 # E : find src misalignment | ||
| 269 | /* Conditionally load the first destination word and a bytemask | ||
| 270 | with 0xff indicating that the destination byte is sacrosanct. */ | ||
| 271 | mov zero, t0 # E : | ||
| 272 | |||
| 273 | mov zero, t6 # E : | ||
| 274 | beq t4, 1f # U : | ||
| 275 | ldq_u t0, 0(a0) # L : | ||
| 276 | lda t6, -1 # E : | ||
| 277 | |||
| 278 | mskql t6, a0, t6 # U : | ||
| 279 | nop | ||
| 280 | nop | ||
| 281 | nop | ||
| 282 | 1: | ||
| 283 | subq a1, t4, a1 # E : sub dest misalignment from src addr | ||
| 284 | /* If source misalignment is larger than dest misalignment, we need | ||
| 285 | extra startup checks to avoid SEGV. */ | ||
| 286 | cmplt t4, t5, t12 # E : | ||
| 287 | beq t12, $u_head # U : | ||
| 288 | lda t2, -1 # E : mask out leading garbage in source | ||
| 289 | |||
| 290 | mskqh t2, t5, t2 # U : | ||
| 291 | ornot t1, t2, t3 # E : (stall) | ||
| 292 | cmpbge zero, t3, t8 # E : is there a zero? (stall) | ||
| 293 | beq t8, $u_head # U : (stall) | ||
| 294 | |||
| 295 | /* At this point we've found a zero in the first partial word of | ||
| 296 | the source. We need to isolate the valid source data and mask | ||
| 297 | it into the original destination data. (Incidentally, we know | ||
| 298 | that we'll need at least one byte of that original dest word.) */ | ||
| 299 | |||
| 300 | ldq_u t0, 0(a0) # L : | ||
| 301 | negq t8, t6 # E : build bitmask of bytes <= zero | ||
| 302 | and t6, t8, t12 # E : (stall) | ||
| 303 | and a1, 7, t5 # E : | ||
| 304 | |||
| 305 | subq t12, 1, t6 # E : | ||
| 306 | or t6, t12, t8 # E : (stall) | ||
| 307 | srl t12, t5, t12 # U : adjust final null return value | ||
| 308 | zapnot t2, t8, t2 # U : prepare source word; mirror changes (stall) | ||
| 309 | |||
| 310 | and t1, t2, t1 # E : to source validity mask | ||
| 311 | extql t2, a1, t2 # U : | ||
| 312 | extql t1, a1, t1 # U : (stall) | ||
| 313 | andnot t0, t2, t0 # .. e1 : zero place for source to reside (stall) | ||
| 314 | |||
| 315 | or t0, t1, t1 # e1 : and put it there | ||
| 316 | stq_u t1, 0(a0) # .. e0 : (stall) | ||
| 317 | ret (t9) # e1 : | ||
| 318 | nop | ||
| 319 | |||
| 320 | .end __stxcpy | ||
| 321 | |||
diff --git a/arch/alpha/lib/ev6-stxncpy.S b/arch/alpha/lib/ev6-stxncpy.S new file mode 100644 index 00000000000..b581a7af245 --- /dev/null +++ b/arch/alpha/lib/ev6-stxncpy.S | |||
| @@ -0,0 +1,397 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/ev6-stxncpy.S | ||
| 3 | * 21264 version contributed by Rick Gorton <rick.gorton@api-networks.com> | ||
| 4 | * | ||
| 5 | * Copy no more than COUNT bytes of the null-terminated string from | ||
| 6 | * SRC to DST. | ||
| 7 | * | ||
| 8 | * This is an internal routine used by strncpy, stpncpy, and strncat. | ||
| 9 | * As such, it uses special linkage conventions to make implementation | ||
| 10 | * of these public functions more efficient. | ||
| 11 | * | ||
| 12 | * On input: | ||
| 13 | * t9 = return address | ||
| 14 | * a0 = DST | ||
| 15 | * a1 = SRC | ||
| 16 | * a2 = COUNT | ||
| 17 | * | ||
| 18 | * Furthermore, COUNT may not be zero. | ||
| 19 | * | ||
| 20 | * On output: | ||
| 21 | * t0 = last word written | ||
| 22 | * t10 = bitmask (with one bit set) indicating the byte position of | ||
| 23 | * the end of the range specified by COUNT | ||
| 24 | * t12 = bitmask (with one bit set) indicating the last byte written | ||
| 25 | * a0 = unaligned address of the last *word* written | ||
| 26 | * a2 = the number of full words left in COUNT | ||
| 27 | * | ||
| 28 | * Furthermore, v0, a3-a5, t11, and $at are untouched. | ||
| 29 | * | ||
| 30 | * Much of the information about 21264 scheduling/coding comes from: | ||
| 31 | * Compiler Writer's Guide for the Alpha 21264 | ||
| 32 | * abbreviated as 'CWG' in other comments here | ||
| 33 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
| 34 | * Scheduling notation: | ||
| 35 | * E - either cluster | ||
| 36 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
| 37 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
| 38 | * Try not to change the actual algorithm if possible for consistency. | ||
| 39 | */ | ||
| 40 | |||
| 41 | #include <asm/regdef.h> | ||
| 42 | |||
| 43 | .set noat | ||
| 44 | .set noreorder | ||
| 45 | |||
| 46 | .text | ||
| 47 | |||
| 48 | /* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that | ||
| 49 | doesn't like putting the entry point for a procedure somewhere in the | ||
| 50 | middle of the procedure descriptor. Work around this by putting the | ||
| 51 | aligned copy in its own procedure descriptor */ | ||
| 52 | |||
| 53 | |||
| 54 | .ent stxncpy_aligned | ||
| 55 | .align 4 | ||
| 56 | stxncpy_aligned: | ||
| 57 | .frame sp, 0, t9, 0 | ||
| 58 | .prologue 0 | ||
| 59 | |||
| 60 | /* On entry to this basic block: | ||
| 61 | t0 == the first destination word for masking back in | ||
| 62 | t1 == the first source word. */ | ||
| 63 | |||
| 64 | /* Create the 1st output word and detect 0's in the 1st input word. */ | ||
| 65 | lda t2, -1 # E : build a mask against false zero | ||
| 66 | mskqh t2, a1, t2 # U : detection in the src word (stall) | ||
| 67 | mskqh t1, a1, t3 # U : zap src bytes before the start | ||
| 68 | ornot t1, t2, t2 # E : (stall) | ||
| 69 | |||
| 70 | mskql t0, a1, t0 # U : assemble the first output word | ||
| 71 | cmpbge zero, t2, t8 # E : bits set iff null found | ||
| 72 | or t0, t3, t0 # E : (stall) | ||
| 73 | beq a2, $a_eoc # U : end of count? | ||
| 74 | |||
| 75 | bne t8, $a_eos # U : null found? | ||
| 76 | nop | ||
| 77 | nop | ||
| 78 | nop | ||
| 79 | |||
| 80 | /* On entry to this basic block: | ||
| 81 | t0 == a source word not containing a null. */ | ||
| 82 | |||
| 83 | /* | ||
| 84 | * nops here to: | ||
| 85 | * separate store quads from load quads | ||
| 86 | * limit of 1 bcond/quad to permit training | ||
| 87 | */ | ||
| 88 | $a_loop: | ||
| 89 | stq_u t0, 0(a0) # L : | ||
| 90 | addq a0, 8, a0 # E : | ||
| 91 | subq a2, 1, a2 # E : | ||
| 92 | nop | ||
| 93 | |||
| 94 | ldq_u t0, 0(a1) # L : | ||
| 95 | addq a1, 8, a1 # E : | ||
| 96 | cmpbge zero, t0, t8 # E : bits set iff byte == 0 | ||
| 97 | beq a2, $a_eoc # U : end of count? | ||
| 98 | |||
| 99 | beq t8, $a_loop # U : | ||
| 100 | nop | ||
| 101 | nop | ||
| 102 | nop | ||
| 103 | |||
| 104 | /* Take care of the final (partial) word store. At this point | ||
| 105 | the end-of-count bit is set in t8 iff it applies. | ||
| 106 | |||
| 107 | On entry to this basic block we have: | ||
| 108 | t0 == the source word containing the null | ||
| 109 | t8 == the cmpbge mask that found it. */ | ||
| 110 | |||
| 111 | $a_eos: | ||
| 112 | negq t8, t12 # E : find low bit set | ||
| 113 | and t8, t12, t12 # E : t12 = bit of the stopping byte (stall) | ||
| 114 | /* For the sake of the cache, don't read a destination word | ||
| 115 | if we're not going to need it. */ | ||
| 116 | and t12, 0x80, t6 # E : (stall) | ||
| 117 | bne t6, 1f # U : (stall) | ||
| 118 | |||
| 119 | /* We're doing a partial word store and so need to combine | ||
| 120 | our source and original destination words. */ | ||
| 121 | ldq_u t1, 0(a0) # L : load original dest word | ||
| 122 | subq t12, 1, t6 # E : t6 = mask of bytes below the null | ||
| 123 | or t12, t6, t8 # E : t8 = mask of bytes thru the null (stall) | ||
| 124 | zapnot t0, t8, t0 # U : clear src bytes > null (stall) | ||
| 125 | |||
| 126 | zap t1, t8, t1 # .. e1 : clear dst bytes <= null | ||
| 127 | or t0, t1, t0 # e1 : (stall) | ||
| 128 | nop | ||
| 129 | nop | ||
| 130 | |||
| 131 | 1: stq_u t0, 0(a0) # L : | ||
| 132 | ret (t9) # L0 : Latency=3 | ||
| 133 | nop | ||
| 134 | nop | ||
| 135 | |||
| 136 | /* Add the end-of-count bit to the eos detection bitmask. */ | ||
| 137 | $a_eoc: | ||
| 138 | or t10, t8, t8 # E : fold end-of-count bit into eos mask | ||
| 139 | br $a_eos # L0 : Latency=3 | ||
| 140 | nop | ||
| 141 | nop | ||
| 142 | |||
| 143 | .end stxncpy_aligned | ||
| 144 | |||
| 145 | .align 4 | ||
| 146 | .ent __stxncpy | ||
| 147 | .globl __stxncpy | ||
| 148 | __stxncpy: | ||
| 149 | .frame sp, 0, t9, 0 | ||
| 150 | .prologue 0 | ||
| 151 | |||
| 152 | /* Are source and destination co-aligned? */ | ||
| 153 | xor a0, a1, t1 # E : t1 = a0 ^ a1 | ||
| 154 | and a0, 7, t0 # E : find dest misalignment | ||
| 155 | and t1, 7, t1 # E : (stall) | ||
| 156 | addq a2, t0, a2 # E : bias count by dest misalignment (stall) | ||
| 157 | |||
| 158 | subq a2, 1, a2 # E : (count - 1) | ||
| 159 | and a2, 7, t2 # E : (stall) | ||
| 160 | srl a2, 3, a2 # U : a2 = loop counter = (count - 1)/8 (stall) | ||
| 161 | addq zero, 1, t10 # E : t10 = 1, seed for last-byte bitmask | ||
| 162 | |||
| 163 | sll t10, t2, t10 # U : t10 = bitmask of last count byte | ||
| 164 | bne t1, $unaligned # U : differing alignments -> slow path | ||
| 165 | /* We are co-aligned; take care of a partial first word. */ | ||
| 166 | ldq_u t1, 0(a1) # L : load first src word | ||
| 167 | addq a1, 8, a1 # E : | ||
| 168 | |||
| 169 | beq t0, stxncpy_aligned # U : avoid loading dest word if not needed | ||
| 170 | ldq_u t0, 0(a0) # L : load first dest word | ||
| 171 | nop | ||
| 172 | nop | ||
| 173 | |||
| 174 | br stxncpy_aligned # .. e1 : | ||
| 175 | nop | ||
| 176 | nop | ||
| 177 | nop | ||
| 178 | |||
| 179 | |||
| 180 | |||
| 181 | /* The source and destination are not co-aligned. Align the destination | ||
| 182 | and cope. We have to be very careful about not reading too much and | ||
| 183 | causing a SEGV. */ | ||
| 184 | |||
| 185 | .align 4 | ||
| 186 | $u_head: | ||
| 187 | /* We know just enough now to be able to assemble the first | ||
| 188 | full source word. We can still find a zero at the end of it | ||
| 189 | that prevents us from outputting the whole thing. | ||
| 190 | |||
| 191 | On entry to this basic block: | ||
| 192 | t0 == the first dest word, unmasked | ||
| 193 | t1 == the shifted low bits of the first source word | ||
| 194 | t6 == bytemask that is -1 in dest word bytes */ | ||
| 195 | |||
| 196 | ldq_u t2, 8(a1) # L : Latency=3 load second src word | ||
| 197 | addq a1, 8, a1 # E : | ||
| 198 | mskql t0, a0, t0 # U : mask trailing garbage in dst | ||
| 199 | extqh t2, a1, t4 # U : (3 cycle stall on t2) | ||
| 200 | |||
| 201 | or t1, t4, t1 # E : first aligned src word complete (stall) | ||
| 202 | mskqh t1, a0, t1 # U : mask leading garbage in src (stall) | ||
| 203 | or t0, t1, t0 # E : first output word complete (stall) | ||
| 204 | or t0, t6, t6 # E : mask original data for zero test (stall) | ||
| 205 | |||
| 206 | cmpbge zero, t6, t8 # E : bits set iff null found | ||
| 207 | beq a2, $u_eocfin # U : end of count? | ||
| 208 | lda t6, -1 # E : all-ones for masking just below | ||
| 209 | nop | ||
| 210 | |||
| 211 | bne t8, $u_final # U : | ||
| 212 | mskql t6, a1, t6 # U : mask out bits already seen | ||
| 213 | stq_u t0, 0(a0) # L : store first output word | ||
| 214 | or t6, t2, t2 # E : (stall) | ||
| 215 | |||
| 216 | cmpbge zero, t2, t8 # E : find nulls in second partial | ||
| 217 | addq a0, 8, a0 # E : | ||
| 218 | subq a2, 1, a2 # E : | ||
| 219 | bne t8, $u_late_head_exit # U : | ||
| 220 | |||
| 221 | /* Finally, we've got all the stupid leading edge cases taken care | ||
| 222 | of and we can set up to enter the main loop. */ | ||
| 223 | extql t2, a1, t1 # U : position hi-bits of lo word | ||
| 224 | beq a2, $u_eoc # U : end of count? | ||
| 225 | ldq_u t2, 8(a1) # L : read next high-order source word | ||
| 226 | addq a1, 8, a1 # E : | ||
| 227 | |||
| 228 | extqh t2, a1, t0 # U : position lo-bits of hi word (stall) | ||
| 229 | cmpbge zero, t2, t8 # E : test new word for eos | ||
| 230 | nop | ||
| 231 | bne t8, $u_eos # U : | ||
| 232 | |||
| 233 | /* Unaligned copy main loop. In order to avoid reading too much, | ||
| 234 | the loop is structured to detect zeros in aligned source words. | ||
| 235 | This has, unfortunately, effectively pulled half of a loop | ||
| 236 | iteration out into the head and half into the tail, but it does | ||
| 237 | prevent nastiness from accumulating in the very thing we want | ||
| 238 | to run as fast as possible. | ||
| 239 | |||
| 240 | On entry to this basic block: | ||
| 241 | t0 == the shifted low-order bits from the current source word | ||
| 242 | t1 == the shifted high-order bits from the previous source word | ||
| 243 | t2 == the unshifted current source word | ||
| 244 | |||
| 245 | We further know that t2 does not contain a null terminator. */ | ||
| 246 | |||
| 247 | .align 4 | ||
| 248 | $u_loop: | ||
| 249 | or t0, t1, t0 # E : current dst word now complete | ||
| 250 | subq a2, 1, a2 # E : decrement word count | ||
| 251 | extql t2, a1, t1 # U : extract low bits for next time | ||
| 252 | addq a0, 8, a0 # E : | ||
| 253 | |||
| 254 | stq_u t0, -8(a0) # U : save the current word | ||
| 255 | beq a2, $u_eoc # U : end of count? | ||
| 256 | ldq_u t2, 8(a1) # U : Latency=3 load high word for next time | ||
| 257 | addq a1, 8, a1 # E : | ||
| 258 | |||
| 259 | extqh t2, a1, t0 # U : position lo-bits of hi word (2 cycle stall) | ||
| 260 | cmpbge zero, t2, t8 # E : test new word for eos | ||
| 261 | nop | ||
| 262 | beq t8, $u_loop # U : | ||
| 263 | |||
| 264 | /* We've found a zero somewhere in the source word we just read. | ||
| 265 | If it resides in the lower half, we have one (probably partial) | ||
| 266 | word to write out, and if it resides in the upper half, we | ||
| 267 | have one full and one partial word left to write out. | ||
| 268 | |||
| 269 | On entry to this basic block: | ||
| 270 | t0 == the shifted low-order bits from the current source word | ||
| 271 | t1 == the shifted high-order bits from the previous source word | ||
| 272 | t2 == the unshifted current source word. */ | ||
| 273 | $u_eos: | ||
| 274 | or t0, t1, t0 # E : first (partial) source word complete | ||
| 275 | nop | ||
| 276 | cmpbge zero, t0, t8 # E : is the null in this first word? (stall) | ||
| 277 | bne t8, $u_final # U : (stall) | ||
| 278 | |||
| 279 | stq_u t0, 0(a0) # L : the null was in the high-order bits | ||
| 280 | addq a0, 8, a0 # E : | ||
| 281 | subq a2, 1, a2 # E : | ||
| 282 | nop | ||
| 283 | |||
| 284 | $u_late_head_exit: | ||
| 285 | extql t2, a1, t0 # U : position hi-bits of lo word | ||
| 286 | cmpbge zero, t0, t8 # E : find the null | ||
| 287 | or t8, t10, t6 # E : fold in end-of-count bit (stall) | ||
| 288 | cmoveq a2, t6, t8 # E : Latency=2, extra map slot (stall); take it iff a2 == 0 | ||
| 289 | |||
| 290 | /* Take care of a final (probably partial) result word. | ||
| 291 | On entry to this basic block: | ||
| 292 | t0 == assembled source word | ||
| 293 | t8 == cmpbge mask that found the null. */ | ||
| 294 | $u_final: | ||
| 295 | negq t8, t6 # E : isolate low bit set | ||
| 296 | and t6, t8, t12 # E : (stall) | ||
| 297 | and t12, 0x80, t6 # E : avoid dest word load if we can (stall) | ||
| 298 | bne t6, 1f # U : (stall) | ||
| 299 | |||
| 300 | ldq_u t1, 0(a0) # L : load original dest word | ||
| 301 | subq t12, 1, t6 # E : t6 = mask of bytes below the null | ||
| 302 | or t6, t12, t8 # E : (stall) | ||
| 303 | zapnot t0, t8, t0 # U : kill source bytes > null | ||
| 304 | |||
| 305 | zap t1, t8, t1 # U : kill dest bytes <= null | ||
| 306 | or t0, t1, t0 # E : (stall) | ||
| 307 | nop | ||
| 308 | nop | ||
| 309 | |||
| 310 | 1: stq_u t0, 0(a0) # L : | ||
| 311 | ret (t9) # L0 : Latency=3 | ||
| 312 | |||
| 313 | /* Got to end-of-count before end of string. | ||
| 314 | On entry to this basic block: | ||
| 315 | t1 == the shifted high-order bits from the previous source word */ | ||
| 316 | $u_eoc: | ||
| 317 | and a1, 7, t6 # E : avoid final load if possible | ||
| 318 | sll t10, t6, t6 # U : shift last-byte bit by src misalignment (stall) | ||
| 319 | and t6, 0xff, t6 # E : nonzero iff count ends in current word (stall) | ||
| 320 | bne t6, 1f # U : (stall) | ||
| 321 | |||
| 322 | ldq_u t2, 8(a1) # L : load final src word | ||
| 323 | nop | ||
| 324 | extqh t2, a1, t0 # U : extract low bits for last word (stall) | ||
| 325 | or t1, t0, t1 # E : (stall) | ||
| 326 | |||
| 327 | 1: cmpbge zero, t1, t8 # E : find the null, if any | ||
| 328 | mov t1, t0 # E : t0 = assembled source word for $u_final | ||
| 329 | |||
| 330 | $u_eocfin: # end-of-count, final word | ||
| 331 | or t10, t8, t8 # E : fold end-of-count bit into eos mask | ||
| 332 | br $u_final # L0 : Latency=3 | ||
| 333 | |||
| 334 | /* Unaligned copy entry point. */ | ||
| 335 | .align 4 | ||
| 336 | $unaligned: | ||
| 337 | |||
| 338 | ldq_u t1, 0(a1) # L : load first source word | ||
| 339 | and a0, 7, t4 # E : find dest misalignment | ||
| 340 | and a1, 7, t5 # E : find src misalignment | ||
| 341 | /* Conditionally load the first destination word and a bytemask | ||
| 342 | with 0xff indicating that the destination byte is sacrosanct. */ | ||
| 343 | mov zero, t0 # E : assume no dest word to mask back in | ||
| 344 | |||
| 345 | mov zero, t6 # E : assume no sacrosanct dest bytes | ||
| 346 | beq t4, 1f # U : | ||
| 347 | ldq_u t0, 0(a0) # L : | ||
| 348 | lda t6, -1 # E : | ||
| 349 | |||
| 350 | mskql t6, a0, t6 # U : mask of dest bytes to preserve | ||
| 351 | nop | ||
| 352 | nop | ||
| 353 | subq a1, t4, a1 # E : sub dest misalignment from src addr | ||
| 354 | |||
| 355 | /* If source misalignment is larger than dest misalignment, we need | ||
| 356 | extra startup checks to avoid SEGV. */ | ||
| 357 | |||
| 358 | 1: cmplt t4, t5, t12 # E : | ||
| 359 | extql t1, a1, t1 # U : shift src into place | ||
| 360 | lda t2, -1 # E : for creating masks later | ||
| 361 | beq t12, $u_head # U : (stall) | ||
| 362 | |||
| 363 | extql t2, a1, t2 # U : position all-ones mask like the src | ||
| 364 | cmpbge zero, t1, t8 # E : is there a zero? | ||
| 365 | andnot t2, t6, t12 # E : dest mask for a single word copy | ||
| 366 | or t8, t10, t5 # E : test for end-of-count too | ||
| 367 | |||
| 368 | cmpbge zero, t12, t3 # E : bits set iff byte not part of this copy | ||
| 369 | cmoveq a2, t5, t8 # E : Latency=2, extra map slot | ||
| 370 | nop # E : keep with cmoveq | ||
| 371 | andnot t8, t3, t8 # E : (stall) | ||
| 372 | |||
| 373 | beq t8, $u_head # U : | ||
| 374 | /* At this point we've found a zero in the first partial word of | ||
| 375 | the source. We need to isolate the valid source data and mask | ||
| 376 | it into the original destination data. (Incidentally, we know | ||
| 377 | that we'll need at least one byte of that original dest word.) */ | ||
| 378 | ldq_u t0, 0(a0) # L : | ||
| 379 | negq t8, t6 # E : build bitmask of bytes <= zero | ||
| 380 | mskqh t1, t4, t1 # U : zap src bytes below the dest start | ||
| 381 | |||
| 382 | and t6, t8, t2 # E : t2 = low bit set, the stopping byte | ||
| 383 | subq t2, 1, t6 # E : (stall) | ||
| 384 | or t6, t2, t8 # E : (stall) | ||
| 385 | zapnot t12, t8, t12 # U : prepare source word; mirror changes (stall) | ||
| 386 | |||
| 387 | zapnot t1, t8, t1 # U : to source validity mask | ||
| 388 | andnot t0, t12, t0 # E : zero place for source to reside | ||
| 389 | or t0, t1, t0 # E : and put it there (stall both t0, t1) | ||
| 390 | stq_u t0, 0(a0) # L : (stall) | ||
| 391 | |||
| 392 | ret (t9) # L0 : Latency=3 | ||
| 393 | nop | ||
| 394 | nop | ||
| 395 | nop | ||
| 396 | |||
| 397 | .end __stxncpy | ||
diff --git a/arch/alpha/lib/ev67-strcat.S b/arch/alpha/lib/ev67-strcat.S new file mode 100644 index 00000000000..c426fe3ed72 --- /dev/null +++ b/arch/alpha/lib/ev67-strcat.S | |||
| @@ -0,0 +1,54 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/ev67-strcat.S | ||
| 3 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
| 4 | * | ||
| 5 | * Append a null-terminated string from SRC to DST. | ||
| 6 | * | ||
| 7 | * Much of the information about 21264 scheduling/coding comes from: | ||
| 8 | * Compiler Writer's Guide for the Alpha 21264 | ||
| 9 | * abbreviated as 'CWG' in other comments here | ||
| 10 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
| 11 | * Scheduling notation: | ||
| 12 | * E - either cluster | ||
| 13 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
| 14 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
| 15 | * Try not to change the actual algorithm if possible for consistency. | ||
| 16 | * Commentary: It seems bogus to walk the input string twice - once | ||
| 17 | * to determine the length, and then again while doing the copy. | ||
| 18 | * A significant (future) enhancement would be to only read the input | ||
| 19 | * string once. | ||
| 20 | */ | ||
| 21 | |||
| 22 | |||
| 23 | .text | ||
| 24 | |||
| 25 | .align 4 | ||
| 26 | .globl strcat | ||
| 27 | .ent strcat | ||
| 28 | strcat: | ||
| 29 | .frame $30, 0, $26 | ||
| 30 | .prologue 0 | ||
| 31 | |||
| 32 | mov $16, $0 # E : set up return value | ||
| 33 | /* Find the end of the string. */ | ||
| 34 | ldq_u $1, 0($16) # L : load first quadword (a0 may be misaligned) | ||
| 35 | lda $2, -1 # E : all-ones for building the garbage mask | ||
| 36 | insqh $2, $16, $2 # U : 0xff in bytes below the misaligned start | ||
| 37 | |||
| 38 | andnot $16, 7, $16 # E : align the loop pointer | ||
| 39 | or $2, $1, $1 # E : force non-zero in the garbage bytes | ||
| 40 | cmpbge $31, $1, $2 # E : bits set iff byte == 0 | ||
| 41 | bne $2, $found # U : | ||
| 42 | |||
| 43 | $loop: ldq $1, 8($16) # L : | ||
| 44 | addq $16, 8, $16 # E : | ||
| 45 | cmpbge $31, $1, $2 # E : bits set iff byte == 0 | ||
| 46 | beq $2, $loop # U : | ||
| 47 | |||
| 48 | $found: cttz $2, $3 # U0 : byte offset of the null within the quad | ||
| 49 | addq $16, $3, $16 # E : $16 now points at the null | ||
| 50 | /* Now do the append. */ | ||
| 51 | mov $26, $23 # E : __stxcpy takes its return address in $23 (t9) | ||
| 52 | br __stxcpy # L0 : tail call; __stxcpy returns for us | ||
| 53 | |||
| 54 | .end strcat | ||
diff --git a/arch/alpha/lib/ev67-strchr.S b/arch/alpha/lib/ev67-strchr.S new file mode 100644 index 00000000000..fbb7b4ffade --- /dev/null +++ b/arch/alpha/lib/ev67-strchr.S | |||
| @@ -0,0 +1,88 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/ev67-strchr.S | ||
| 3 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
| 4 | * | ||
| 5 | * Return the address of a given character within a null-terminated | ||
| 6 | * string, or null if it is not found. | ||
| 7 | * | ||
| 8 | * Much of the information about 21264 scheduling/coding comes from: | ||
| 9 | * Compiler Writer's Guide for the Alpha 21264 | ||
| 10 | * abbreviated as 'CWG' in other comments here | ||
| 11 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
| 12 | * Scheduling notation: | ||
| 13 | * E - either cluster | ||
| 14 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
| 15 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
| 16 | * Try not to change the actual algorithm if possible for consistency. | ||
| 17 | */ | ||
| 18 | |||
| 19 | #include <asm/regdef.h> | ||
| 20 | |||
| 21 | .set noreorder | ||
| 22 | .set noat | ||
| 23 | |||
| 24 | .align 4 | ||
| 25 | .globl strchr | ||
| 26 | .ent strchr | ||
| 27 | strchr: | ||
| 28 | .frame sp, 0, ra | ||
| 29 | .prologue 0 | ||
| 30 | |||
| 31 | ldq_u t0, 0(a0) # L : load first quadword Latency=3 | ||
| 32 | and a1, 0xff, t3 # E : 00000000000000ch | ||
| 33 | insbl a1, 1, t5 # U : 000000000000ch00 | ||
| 34 | insbl a1, 7, a2 # U : ch00000000000000 | ||
| 35 | |||
| 36 | insbl t3, 6, a3 # U : 00ch000000000000 | ||
| 37 | or t5, t3, a1 # E : 000000000000chch | ||
| 38 | andnot a0, 7, v0 # E : align our loop pointer | ||
| 39 | lda t4, -1 # E : build garbage mask | ||
| 40 | |||
| 41 | mskqh t4, a0, t4 # U : only want relevant part of first quad | ||
| 42 | or a2, a3, a2 # E : chch000000000000 | ||
| 43 | inswl a1, 2, t5 # E : 00000000chch0000 | ||
| 44 | inswl a1, 4, a3 # E : 0000chch00000000 | ||
| 45 | |||
| 46 | or a1, a2, a1 # E : chch00000000chch | ||
| 47 | or a3, t5, t5 # E : 0000chchchch0000 | ||
| 48 | cmpbge zero, t0, t2 # E : bits set iff byte == zero | ||
| 49 | cmpbge zero, t4, t4 # E : bits set iff byte is garbage | ||
| 50 | |||
| 51 | /* This quad is _very_ serialized. Lots of stalling happens */ | ||
| 52 | or t5, a1, a1 # E : chchchchchchchch | ||
| 53 | xor t0, a1, t1 # E : make bytes == c zero | ||
| 54 | cmpbge zero, t1, t3 # E : bits set iff byte == c | ||
| 55 | or t2, t3, t0 # E : bits set iff char match or zero match | ||
| 56 | |||
| 57 | andnot t0, t4, t0 # E : clear garbage bits | ||
| 58 | cttz t0, a2 # U0 : speculative (in case we get a match) | ||
| 59 | nop # E : | ||
| 60 | bne t0, $found # U : | ||
| 61 | |||
| 62 | /* | ||
| 63 | * Yuk. This loop is going to stall like crazy waiting for the | ||
| 64 | * data to be loaded. Not much can be done about it unless it's | ||
| 65 | * unrolled multiple times - is that safe to do in kernel space? | ||
| 66 | * Or would exception handling recovery code do the trick here? | ||
| 67 | */ | ||
| 68 | $loop: ldq t0, 8(v0) # L : Latency=3 | ||
| 69 | addq v0, 8, v0 # E : addr += 8 | ||
| 70 | xor t0, a1, t1 # E : make bytes == c zero | ||
| 71 | cmpbge zero, t0, t2 # E : bits set iff byte == 0 | ||
| 72 | |||
| 73 | cmpbge zero, t1, t3 # E : bits set iff byte == c | ||
| 74 | or t2, t3, t0 # E : bits set iff char match or zero match | ||
| 75 | cttz t3, a2 # U0 : speculative (in case we get a match) | ||
| 76 | beq t0, $loop # U : | ||
| 77 | |||
| 78 | $found: negq t0, t1 # E : clear all but least set bit | ||
| 79 | and t0, t1, t0 # E : t0 = first match-or-zero bit | ||
| 80 | and t0, t3, t1 # E : bit set iff byte was the char | ||
| 81 | addq v0, a2, v0 # E : Add in the bit number from above | ||
| 82 | |||
| 83 | cmoveq t1, $31, v0 # E : Two mapping slots, latency = 2; NULL if eos first | ||
| 84 | nop | ||
| 85 | nop | ||
| 86 | ret # L0 : | ||
| 87 | |||
| 88 | .end strchr | ||
diff --git a/arch/alpha/lib/ev67-strlen.S b/arch/alpha/lib/ev67-strlen.S new file mode 100644 index 00000000000..50392807252 --- /dev/null +++ b/arch/alpha/lib/ev67-strlen.S | |||
| @@ -0,0 +1,49 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/ev67-strlen.S | ||
| 3 | * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com> | ||
| 4 | * | ||
| 5 | * Finds length of a 0-terminated string. Optimized for the | ||
| 6 | * Alpha architecture: | ||
| 7 | * | ||
| 8 | * - memory accessed as aligned quadwords only | ||
| 9 | * - uses cmpbge to compare 8 bytes in parallel | ||
| 10 | * | ||
| 11 | * Much of the information about 21264 scheduling/coding comes from: | ||
| 12 | * Compiler Writer's Guide for the Alpha 21264 | ||
| 13 | * abbreviated as 'CWG' in other comments here | ||
| 14 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
| 15 | * Scheduling notation: | ||
| 16 | * E - either cluster | ||
| 17 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
| 18 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
| 19 | */ | ||
| 20 | |||
| 21 | .set noreorder | ||
| 22 | .set noat | ||
| 23 | |||
| 24 | .globl strlen | ||
| 25 | .ent strlen | ||
| 26 | .align 4 | ||
| 27 | strlen: | ||
| 28 | ldq_u $1, 0($16) # L : load first quadword ($16 may be misaligned) | ||
| 29 | lda $2, -1($31) # E : all-ones for building the garbage mask | ||
| 30 | insqh $2, $16, $2 # U : 0xff in bytes below the misaligned start | ||
| 31 | andnot $16, 7, $0 # E : align the loop pointer | ||
| 32 | |||
| 33 | or $2, $1, $1 # E : force non-zero in the garbage bytes | ||
| 34 | cmpbge $31, $1, $2 # E : $2 <- bitmask: bit i == 1 <==> i-th byte == 0 | ||
| 35 | nop # E : | ||
| 36 | bne $2, $found # U : | ||
| 37 | |||
| 38 | $loop: ldq $1, 8($0) # L : | ||
| 39 | addq $0, 8, $0 # E : addr += 8 | ||
| 40 | cmpbge $31, $1, $2 # E : bits set iff byte == 0 | ||
| 41 | beq $2, $loop # U : | ||
| 42 | |||
| 43 | $found: | ||
| 44 | cttz $2, $3 # U0 : byte offset of the null within the quad | ||
| 45 | addq $0, $3, $0 # E : $0 = address of the null | ||
| 46 | subq $0, $16, $0 # E : length = null address - start address | ||
| 47 | ret $31, ($26) # L0 : | ||
| 48 | |||
| 49 | .end strlen | ||
diff --git a/arch/alpha/lib/ev67-strlen_user.S b/arch/alpha/lib/ev67-strlen_user.S new file mode 100644 index 00000000000..57e0d77b81a --- /dev/null +++ b/arch/alpha/lib/ev67-strlen_user.S | |||
| @@ -0,0 +1,107 @@ | |||
/*
 * arch/alpha/lib/ev67-strlen_user.S
 * 21264 version contributed by Rick Gorton <rick.gorton@api-networks.com>
 *
 * Return the length of the string including the NULL terminator
 * (strlen+1) or zero if an error occurred.
 *
 * In places where it is critical to limit the processing time,
 * and the data is not trusted, strnlen_user() should be used.
 * It will return a value greater than its second argument if
 * that limit would be exceeded. This implementation is allowed
 * to access memory beyond the limit, but will not cross a page
 * boundary when doing so.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

#include <asm/regdef.h>


/* Allow an exception for an insn; exit if we get one.  On a fault the
   trap handler uses this __ex_table entry to resume at $exception with
   the error return value in v0 -- NOTE(review): exact fixup semantics
   per the alpha __ex_table format; confirm against the trap code.  */
#define EX(x,y...)			\
	99: x,##y;			\
	.section __ex_table,"a";	\
	.long 99b - .;			\
	lda v0, $exception-99b(zero);	\
	.previous


	.set noreorder
	.set noat
	.text

	.globl __strlen_user
	.ent __strlen_user
	.frame sp, 0, ra

	.align 4
__strlen_user:
	ldah a1, 32767(zero)	# do not use plain strlen_user() for strings
				# that might be almost 2 GB long; you should
				# be using strnlen_user() instead
	nop
	nop
	nop

	.globl __strnlen_user

	.align 4
__strnlen_user:
	.prologue 0
	EX( ldq_u t0, 0(a0) )	# L : load first quadword (a0 may be misaligned)
	lda t1, -1(zero)	# E : all-ones, for building the garbage mask

	insqh t1, a0, t1	# U : 0xff in every byte below a0's offset
	andnot a0, 7, v0	# E : v0 = quad-aligned string address
	or t1, t0, t0		# E : leading garbage bytes can't look like NUL now
	subq a0, 1, a0		# E : get our +1 for the return

	cmpbge zero, t0, t1	# E : t1 <- bitmask: bit i == 1 <==> i-th byte == 0
	subq a1, 7, t2		# E : pre-bias the limit for the first partial quad
	subq a0, v0, t0		# E : bytes of the first quad that precede the string
	bne t1, $found		# U : terminator already in the first quadword?

	addq t2, t0, t2		# E : t2 = remaining byte budget
	addq a1, 1, a1		# E : a1 = limit + 1, used by the $limit return
	nop			# E :
	nop			# E :

	.align 4
$loop:	ble t2, $limit		# U : byte budget exhausted?
	EX( ldq t0, 8(v0) )	# L : aligned from here on, plain ldq is safe
	nop			# E :
	nop			# E :

	cmpbge zero, t0, t1	# E : any zero byte in this quadword?
	subq t2, 8, t2		# E :
	addq v0, 8, v0		# E : addr += 8
	beq t1, $loop		# U :

$found:	cttz t1, t2		# U0 : index of the first zero byte in the quad
	addq v0, t2, v0		# E : v0 = address of the terminator
	subq v0, a0, v0		# E : length incl. NUL (a0 was pre-decremented)
	ret			# L0 :

$exception:			# user-space fault: the EX fixup routed us here
	nop			# with the error return value (0, per the header
	nop			# contract) already in v0
	nop
	ret

	.align 4		# currently redundant
$limit:				# limit exceeded: return a value greater than
	nop			# the limit (a1 holds limit+1, t2 <= 0 here)
	nop
	subq a1, t2, v0
	ret

	.end __strlen_user
diff --git a/arch/alpha/lib/ev67-strncat.S b/arch/alpha/lib/ev67-strncat.S new file mode 100644 index 00000000000..4ae716cd2bf --- /dev/null +++ b/arch/alpha/lib/ev67-strncat.S | |||
| @@ -0,0 +1,94 @@ | |||
/*
 * arch/alpha/lib/ev67-strncat.S
 * 21264 version contributed by Rick Gorton <rick.gorton@api-networks.com>
 *
 * Append no more than COUNT characters from the null-terminated string SRC
 * to the null-terminated string DST. Always null-terminate the new DST.
 *
 * This differs slightly from the semantics in libc in that we never write
 * past count, whereas libc may write to count+1. This follows the generic
 * implementation in lib/string.c and is, IMHO, more sensible.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */


	.text

	.align 4
	.globl strncat
	.ent strncat
strncat:
	.frame $30, 0, $26
	.prologue 0

	mov $16, $0		# set up return value
	beq $18, $zerocount	# U : nothing to do for count == 0
	/* Find the end of the string.  */
	ldq_u $1, 0($16)	# L : load first quadword ($16 may be misaligned)
	lda $2, -1($31)		# E : all-ones, for the garbage mask

	insqh $2, $0, $2	# U : 0xff in every byte below $16's offset
	andnot $16, 7, $16	# E : quad-align the DST pointer
	nop			# E :
	or $2, $1, $1		# E : garbage bytes can no longer look like NUL

	nop			# E :
	nop			# E :
	cmpbge $31, $1, $2	# E : bits set iff byte == 0
	bne $2, $found		# U : terminator already in the first quad?

$loop:	ldq $1, 8($16)		# L : aligned now, plain ldq is safe
	addq $16, 8, $16	# E :
	cmpbge $31, $1, $2	# E :
	beq $2, $loop		# U :

$found:	cttz $2, $3		# U0 : index of DST's NUL within the quad
	addq $16, $3, $16	# E : $16 = address of DST's terminator
	nop			# E :
	bsr $23, __stxncpy	# L0 :/* Now do the append.  */

	/* Worry about the null termination.
	   On return from __stxncpy, $1 is the last quadword written, $24
	   is the end-of-count bit and $27 masks the last byte written --
	   NOTE(review): per __stxncpy's register contract; confirm
	   against stxncpy.S.  */

	zapnot $1, $27, $2	# U : was last byte a null?
	cmplt $27, $24, $5	# E : did we fill the buffer completely?
	bne $2, 0f		# U :
	ret			# L0 : source NUL was copied -- already terminated

0:	or $5, $18, $2		# E :
	nop
	bne $2, 2f		# U :
	and $24, 0x80, $3	# E : no zero next byte

	nop			# E :
	bne $3, 1f		# U :
	/* Here there are bytes left in the current word.  Clear one.  */
	addq $24, $24, $24	# E : end-of-count bit <<= 1
	nop			# E :

2:	zap $1, $24, $1		# U : zero the byte at the end-of-count position
	nop			# E :
	stq_u $1, 0($16)	# L :
	ret			# L0 :

1:	/* Here we must clear the first byte of the next DST word */
	stb $31, 8($16)		# L :
	nop			# E :
	nop			# E :
	ret			# L0 :

$zerocount:
	nop			# E :
	nop			# E :
	nop			# E :
	ret			# L0 :

	.end strncat
diff --git a/arch/alpha/lib/ev67-strrchr.S b/arch/alpha/lib/ev67-strrchr.S new file mode 100644 index 00000000000..3fd8bf414c7 --- /dev/null +++ b/arch/alpha/lib/ev67-strrchr.S | |||
| @@ -0,0 +1,109 @@ | |||
/*
 * arch/alpha/lib/ev67-strrchr.S
 * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Return the address of the last occurrence of a given character in a
 * 0-terminated string, or NULL if it does not occur.  Optimized for
 * the Alpha architecture:
 *
 *      - memory accessed as aligned quadwords only
 *      - uses cmpbge to compare 8 bytes in parallel
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 */


#include <asm/regdef.h>

	.set noreorder
	.set noat

	.align 4
	.ent strrchr
	.globl strrchr
strrchr:
	.frame sp, 0, ra
	.prologue 0

	and a1, 0xff, t2	# E : 00000000000000ch
	insbl a1, 1, t4		# U : 000000000000ch00
	insbl a1, 2, t5		# U : 0000000000ch0000
	ldq_u t0, 0(a0)		# L : load first quadword Latency=3

	mov zero, t6		# E : t6 is last match aligned addr
	or t2, t4, a1		# E : 000000000000chch
	sll t5, 8, t3		# U : 00000000ch000000
	mov zero, t8		# E : t8 is last match byte compare mask

	andnot a0, 7, v0	# E : align source addr
	or t5, t3, t3		# E : 00000000chch0000
	sll a1, 32, t2		# U : 0000chch00000000
	sll a1, 48, t4		# U : chch000000000000

	or t4, a1, a1		# E : chch00000000chch
	or t2, t3, t2		# E : 0000chchchch0000
	or a1, t2, a1		# E : chchchchchchchch
	lda t5, -1		# E : build garbage mask

	cmpbge zero, t0, t1	# E : bits set iff byte == zero
	mskqh t5, a0, t4	# E : Complete garbage mask
	xor t0, a1, t2		# E : make bytes == c zero
	cmpbge zero, t4, t4	# E : bits set iff byte is garbage

	cmpbge zero, t2, t3	# E : bits set iff byte == c
	andnot t1, t4, t1	# E : clear garbage from null test
	andnot t3, t4, t3	# E : clear garbage from char test
	bne t1, $eos		# U : did we already hit the terminator?

	/* Character search main loop */
$loop:
	ldq t0, 8(v0)		# L : load next quadword
	cmovne t3, v0, t6	# E : save previous comparisons match
	nop			# : Latency=2, extra map slot (keep nop with cmov)
	nop

	cmovne t3, t3, t8	# E : Latency=2, extra map slot
	nop			# : keep with cmovne
	addq v0, 8, v0		# E :
	xor t0, a1, t2		# E :

	cmpbge zero, t0, t1	# E : bits set iff byte == zero
	cmpbge zero, t2, t3	# E : bits set iff byte == c
	beq t1, $loop		# U : if we havnt seen a null, loop
	nop

	/* Mask out character matches after terminator */
$eos:
	negq t1, t4		# E : isolate first null byte match
	and t1, t4, t4		# E :
	subq t4, 1, t5		# E : build a mask of the bytes upto...
	or t4, t5, t4		# E : ... and including the null

	and t3, t4, t3		# E : mask out char matches after null
	cmovne t3, t3, t8	# E : save it, if match found Latency=2, extra map slot
	nop			# : Keep with cmovne
	nop

	cmovne t3, v0, t6	# E :
	nop			# : Keep with cmovne
	/* Locate the address of the last matched character */
	ctlz t8, t2		# U0 : Latency=3 (0x40 for t8=0)
	nop

	cmoveq t8, 0x3f, t2	# E : Compensate for case when no match is seen
	nop			# E : hide the cmov latency (2) behind ctlz latency
	lda t5, 0x3f($31)	# E :
	subq t5, t2, t5		# E : Normalize leading zero count; for a match
				#     in byte k, 0x3f - clz(t8) == k; with no
				#     match t5 == 0 and t6 == 0, giving NULL
	addq t6, t5, v0		# E : and add to quadword address
	ret			# L0 : Latency=3
	nop
	nop

	.end strrchr
diff --git a/arch/alpha/lib/fpreg.c b/arch/alpha/lib/fpreg.c new file mode 100644 index 00000000000..97c4d9d7a4d --- /dev/null +++ b/arch/alpha/lib/fpreg.c | |||
| @@ -0,0 +1,193 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/fpreg.c | ||
| 3 | * | ||
| 4 | * (C) Copyright 1998 Linus Torvalds | ||
| 5 | */ | ||
| 6 | |||
/* Copy FP register `reg` into `val`: on CPUs with the CIX/FIX extensions
   via a direct FP->integer register move (ftoit), otherwise by storing
   the FP register through memory.  `reg` must be a literal constant,
   since it is token-pasted into the mnemonic -- hence the 32-way
   switches below.  */
#if defined(__alpha_cix__) || defined(__alpha_fix__)
#define STT(reg,val) asm volatile ("ftoit $f"#reg",%0" : "=r"(val));
#else
#define STT(reg,val) asm volatile ("stt $f"#reg",%0" : "=m"(val));
#endif

/*
 * Return the raw 64-bit contents of floating-point register REG.
 * Returns 0 for an out-of-range register number.
 */
unsigned long
alpha_read_fp_reg (unsigned long reg)
{
	unsigned long val;

	switch (reg) {
	      case 0: STT( 0, val); break;
	      case 1: STT( 1, val); break;
	      case 2: STT( 2, val); break;
	      case 3: STT( 3, val); break;
	      case 4: STT( 4, val); break;
	      case 5: STT( 5, val); break;
	      case 6: STT( 6, val); break;
	      case 7: STT( 7, val); break;
	      case 8: STT( 8, val); break;
	      case 9: STT( 9, val); break;
	      case 10: STT(10, val); break;
	      case 11: STT(11, val); break;
	      case 12: STT(12, val); break;
	      case 13: STT(13, val); break;
	      case 14: STT(14, val); break;
	      case 15: STT(15, val); break;
	      case 16: STT(16, val); break;
	      case 17: STT(17, val); break;
	      case 18: STT(18, val); break;
	      case 19: STT(19, val); break;
	      case 20: STT(20, val); break;
	      case 21: STT(21, val); break;
	      case 22: STT(22, val); break;
	      case 23: STT(23, val); break;
	      case 24: STT(24, val); break;
	      case 25: STT(25, val); break;
	      case 26: STT(26, val); break;
	      case 27: STT(27, val); break;
	      case 28: STT(28, val); break;
	      case 29: STT(29, val); break;
	      case 30: STT(30, val); break;
	      case 31: STT(31, val); break;
	      default: return 0;
	}
	return val;
}
| 55 | |||
/* Load `val` into FP register `reg`: via itoft (integer->FP register
   move) when the CIX/FIX extensions exist, otherwise through memory.
   As with STT, `reg` must be a literal constant.  */
#if defined(__alpha_cix__) || defined(__alpha_fix__)
#define LDT(reg,val) asm volatile ("itoft %0,$f"#reg : : "r"(val));
#else
#define LDT(reg,val) asm volatile ("ldt $f"#reg",%0" : : "m"(val));
#endif

/*
 * Write the raw 64-bit pattern VAL into floating-point register REG.
 * Out-of-range register numbers are silently ignored.
 */
void
alpha_write_fp_reg (unsigned long reg, unsigned long val)
{
	switch (reg) {
	      case 0: LDT( 0, val); break;
	      case 1: LDT( 1, val); break;
	      case 2: LDT( 2, val); break;
	      case 3: LDT( 3, val); break;
	      case 4: LDT( 4, val); break;
	      case 5: LDT( 5, val); break;
	      case 6: LDT( 6, val); break;
	      case 7: LDT( 7, val); break;
	      case 8: LDT( 8, val); break;
	      case 9: LDT( 9, val); break;
	      case 10: LDT(10, val); break;
	      case 11: LDT(11, val); break;
	      case 12: LDT(12, val); break;
	      case 13: LDT(13, val); break;
	      case 14: LDT(14, val); break;
	      case 15: LDT(15, val); break;
	      case 16: LDT(16, val); break;
	      case 17: LDT(17, val); break;
	      case 18: LDT(18, val); break;
	      case 19: LDT(19, val); break;
	      case 20: LDT(20, val); break;
	      case 21: LDT(21, val); break;
	      case 22: LDT(22, val); break;
	      case 23: LDT(23, val); break;
	      case 24: LDT(24, val); break;
	      case 25: LDT(25, val); break;
	      case 26: LDT(26, val); break;
	      case 27: LDT(27, val); break;
	      case 28: LDT(28, val); break;
	      case 29: LDT(29, val); break;
	      case 30: LDT(30, val); break;
	      case 31: LDT(31, val); break;
	}
}
| 100 | |||
/* Single-precision counterpart of STT: ftois (FP->integer move of an
   S-format value) with CIX/FIX, else an S-format store through memory.
   `reg` must again be a literal constant.  */
#if defined(__alpha_cix__) || defined(__alpha_fix__)
#define STS(reg,val) asm volatile ("ftois $f"#reg",%0" : "=r"(val));
#else
#define STS(reg,val) asm volatile ("sts $f"#reg",%0" : "=m"(val));
#endif

/*
 * Read floating-point register REG as a single-precision (S-format)
 * value.  Returns 0 for an out-of-range register number.
 */
unsigned long
alpha_read_fp_reg_s (unsigned long reg)
{
	unsigned long val;

	switch (reg) {
	      case 0: STS( 0, val); break;
	      case 1: STS( 1, val); break;
	      case 2: STS( 2, val); break;
	      case 3: STS( 3, val); break;
	      case 4: STS( 4, val); break;
	      case 5: STS( 5, val); break;
	      case 6: STS( 6, val); break;
	      case 7: STS( 7, val); break;
	      case 8: STS( 8, val); break;
	      case 9: STS( 9, val); break;
	      case 10: STS(10, val); break;
	      case 11: STS(11, val); break;
	      case 12: STS(12, val); break;
	      case 13: STS(13, val); break;
	      case 14: STS(14, val); break;
	      case 15: STS(15, val); break;
	      case 16: STS(16, val); break;
	      case 17: STS(17, val); break;
	      case 18: STS(18, val); break;
	      case 19: STS(19, val); break;
	      case 20: STS(20, val); break;
	      case 21: STS(21, val); break;
	      case 22: STS(22, val); break;
	      case 23: STS(23, val); break;
	      case 24: STS(24, val); break;
	      case 25: STS(25, val); break;
	      case 26: STS(26, val); break;
	      case 27: STS(27, val); break;
	      case 28: STS(28, val); break;
	      case 29: STS(29, val); break;
	      case 30: STS(30, val); break;
	      case 31: STS(31, val); break;
	      default: return 0;
	}
	return val;
}
| 149 | |||
/* Single-precision counterpart of LDT: itofs (integer->FP move of an
   S-format value) with CIX/FIX, else an S-format load through memory.
   `reg` must again be a literal constant.  */
#if defined(__alpha_cix__) || defined(__alpha_fix__)
#define LDS(reg,val) asm volatile ("itofs %0,$f"#reg : : "r"(val));
#else
#define LDS(reg,val) asm volatile ("lds $f"#reg",%0" : : "m"(val));
#endif

/*
 * Write VAL into floating-point register REG as a single-precision
 * (S-format) value.  Out-of-range register numbers are silently ignored.
 */
void
alpha_write_fp_reg_s (unsigned long reg, unsigned long val)
{
	switch (reg) {
	      case 0: LDS( 0, val); break;
	      case 1: LDS( 1, val); break;
	      case 2: LDS( 2, val); break;
	      case 3: LDS( 3, val); break;
	      case 4: LDS( 4, val); break;
	      case 5: LDS( 5, val); break;
	      case 6: LDS( 6, val); break;
	      case 7: LDS( 7, val); break;
	      case 8: LDS( 8, val); break;
	      case 9: LDS( 9, val); break;
	      case 10: LDS(10, val); break;
	      case 11: LDS(11, val); break;
	      case 12: LDS(12, val); break;
	      case 13: LDS(13, val); break;
	      case 14: LDS(14, val); break;
	      case 15: LDS(15, val); break;
	      case 16: LDS(16, val); break;
	      case 17: LDS(17, val); break;
	      case 18: LDS(18, val); break;
	      case 19: LDS(19, val); break;
	      case 20: LDS(20, val); break;
	      case 21: LDS(21, val); break;
	      case 22: LDS(22, val); break;
	      case 23: LDS(23, val); break;
	      case 24: LDS(24, val); break;
	      case 25: LDS(25, val); break;
	      case 26: LDS(26, val); break;
	      case 27: LDS(27, val); break;
	      case 28: LDS(28, val); break;
	      case 29: LDS(29, val); break;
	      case 30: LDS(30, val); break;
	      case 31: LDS(31, val); break;
	}
}
diff --git a/arch/alpha/lib/memchr.S b/arch/alpha/lib/memchr.S new file mode 100644 index 00000000000..14427eeb555 --- /dev/null +++ b/arch/alpha/lib/memchr.S | |||
| @@ -0,0 +1,164 @@ | |||
| 1 | /* Copyright (C) 1996 Free Software Foundation, Inc. | ||
| 2 | This file is part of the GNU C Library. | ||
| 3 | Contributed by David Mosberger (davidm@cs.arizona.edu). | ||
| 4 | |||
| 5 | The GNU C Library is free software; you can redistribute it and/or | ||
| 6 | modify it under the terms of the GNU Library General Public License as | ||
| 7 | published by the Free Software Foundation; either version 2 of the | ||
| 8 | License, or (at your option) any later version. | ||
| 9 | |||
| 10 | The GNU C Library is distributed in the hope that it will be useful, | ||
| 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 13 | Library General Public License for more details. | ||
| 14 | |||
| 15 | You should have received a copy of the GNU Library General Public | ||
| 16 | License along with the GNU C Library; see the file COPYING.LIB. If not, | ||
| 17 | write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
| 18 | Boston, MA 02111-1307, USA. */ | ||
| 19 | |||
| 20 | /* Finds characters in a memory area. Optimized for the Alpha: | ||
| 21 | |||
| 22 | - memory accessed as aligned quadwords only | ||
| 23 | - uses cmpbge to compare 8 bytes in parallel | ||
| 24 | - does binary search to find 0 byte in last | ||
| 25 | quadword (HAKMEM needed 12 instructions to | ||
| 26 | do this instead of the 9 instructions that | ||
| 27 | binary search needs). | ||
| 28 | |||
| 29 | For correctness consider that: | ||
| 30 | |||
| 31 | - only minimum number of quadwords may be accessed | ||
| 32 | - the third argument is an unsigned long | ||
| 33 | */ | ||
| 34 | |||
	.set noreorder
	.set noat

	.globl memchr
	.ent memchr
memchr:
	.frame $30,0,$26,0
	.prologue 0

	# Hack -- if someone passes in (size_t)-1, hoping to just
	# search til the end of the address space, we will overflow
	# below when we find the address of the last byte.  Given
	# that we will never have a 56-bit address space, cropping
	# the length is the easiest way to avoid trouble.
	zap $18, 0x80, $5	#-e0 : drop the top byte of the length

	beq $18, $not_found	# .. e1 : length 0 -> no match
	ldq_u $1, 0($16)	# e1 : load first quadword
	insbl $17, 1, $2	# .. e0 : $2 = 000000000000ch00
	and $17, 0xff, $17	#-e0 : $17 = 00000000000000ch
	cmpult $18, 9, $4	# .. e1 : short (<= 8 byte) search?
	or $2, $17, $17		# e0 : $17 = 000000000000chch
	lda $3, -1($31)		# .. e1 :
	sll $17, 16, $2		#-e0 : $2 = 00000000chch0000
	addq $16, $5, $5	# .. e1 : $5 = one past the last byte
	or $2, $17, $17		# e1 : $17 = 00000000chchchch
	unop			# :
	sll $17, 32, $2		#-e0 : $2 = chchchch00000000
	or $2, $17, $17		# e1 : $17 = chchchchchchchch (ch in every byte)
	extql $1, $16, $7	# e0 :
	beq $4, $first_quad	# .. e1 :

	ldq_u $6, -1($5)	#-e1 : eight or less bytes to search
	extqh $6, $16, $6	# .. e0 :
	mov $16, $0		# e0 :
	or $7, $6, $1		# .. e1 : $1 = quadword starting at $16

	# Deal with the case where at most 8 bytes remain to be searched
	# in $1.  E.g.:
	#	$18 = 6
	#	$1 = ????c6c5c4c3c2c1
$last_quad:
	negq $18, $6		#-e0 :
	xor $17, $1, $1		# .. e1 : bytes equal to ch become zero
	srl $3, $6, $6		# e0 : $6 = mask of $18 bits set
	cmpbge $31, $1, $2	# .. e1 : bit i set iff byte i matched
	and $2, $6, $2		#-e0 : discard matches beyond the length
	beq $2, $not_found	# .. e1 :

$found_it:
	# Now, determine which byte matched.  negq/and isolates the
	# lowest set bit of $2 (= first matching byte); the three
	# cmoveq steps binary-search its position into $0.
	negq $2, $3		# e0 :
	and $2, $3, $2		# e1 :

	and $2, 0x0f, $1	#-e0 :
	addq $0, 4, $3		# .. e1 :
	cmoveq $1, $3, $0	# e0 : match in the upper half? skip 4
	addq $0, 2, $3		# .. e1 :
	and $2, 0x33, $1	#-e0 :
	cmoveq $1, $3, $0	# .. e1 : narrow to a byte pair
	and $2, 0x55, $1	# e0 :
	addq $0, 1, $3		# .. e1 :
	cmoveq $1, $3, $0	#-e0 : and finally to the byte

$done:	ret			# .. e1 :

	# Deal with the case where $18 > 8 bytes remain to be
	# searched.  $16 may not be aligned.
	.align 4
$first_quad:
	andnot $16, 0x7, $0	#-e1 :
	insqh $3, $16, $2	# .. e0 : $2 = 0000ffffffffffff ($16<0:2> ff)
	xor $1, $17, $1		# e0 :
	or $1, $2, $1		# e1 : $1 = ====ffffffffffff
	cmpbge $31, $1, $2	#-e0 :
	bne $2, $found_it	# .. e1 :

	# At least one byte left to process.

	ldq $1, 8($0)		# e0 :
	subq $5, 1, $18		# .. e1 :
	addq $0, 8, $0		#-e0 :

	# Make $18 point to last quad to be accessed (the
	# last quad may or may not be partial).

	andnot $18, 0x7, $18	# .. e1 :
	cmpult $0, $18, $2	# e0 :
	beq $2, $final		# .. e1 :

	# At least two quads remain to be accessed.

	subq $18, $0, $4	#-e0 : $4 <- nr quads to be processed
	and $4, 8, $4		# e1 : odd number of quads?
	bne $4, $odd_quad_count	# e1 :

	# At least three quads remain to be accessed

	mov $1, $4		# e0 : move prefetched value to correct reg

	.align 4
$unrolled_loop:
	ldq $1, 8($0)		#-e0 : prefetch $1
	xor $17, $4, $2		# .. e1 :
	cmpbge $31, $2, $2	# e0 :
	bne $2, $found_it	# .. e1 :

	addq $0, 8, $0		#-e0 :
$odd_quad_count:
	xor $17, $1, $2		# .. e1 :
	ldq $4, 8($0)		# e0 : prefetch $4
	cmpbge $31, $2, $2	# .. e1 :
	addq $0, 8, $6		#-e0 :
	bne $2, $found_it	# .. e1 :

	cmpult $6, $18, $6	# e0 :
	addq $0, 8, $0		# .. e1 :
	bne $6, $unrolled_loop	#-e1 :

	mov $4, $1		# e0 : move prefetched value into $1
$final:	subq $5, $0, $18	# .. e1 : $18 <- number of bytes left to do
	bne $18, $last_quad	# e1 :

$not_found:
	mov $31, $0		#-e0 : return NULL
	ret			# .. e1 :

	.end memchr
diff --git a/arch/alpha/lib/memcpy.c b/arch/alpha/lib/memcpy.c new file mode 100644 index 00000000000..64083fc7323 --- /dev/null +++ b/arch/alpha/lib/memcpy.c | |||
| @@ -0,0 +1,163 @@ | |||
| 1 | /* | ||
| 2 | * linux/arch/alpha/lib/memcpy.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 1995 Linus Torvalds | ||
| 5 | */ | ||
| 6 | |||
| 7 | /* | ||
| 8 | * This is a reasonably optimized memcpy() routine. | ||
| 9 | */ | ||
| 10 | |||
| 11 | /* | ||
| 12 | * Note that the C code is written to be optimized into good assembly. However, | ||
| 13 | * at this point gcc is unable to sanely compile "if (n >= 0)", resulting in a | ||
| 14 | * explicit compare against 0 (instead of just using the proper "blt reg, xx" or | ||
| 15 | * "bge reg, xx"). I hope alpha-gcc will be fixed to notice this eventually.. | ||
| 16 | */ | ||
| 17 | |||
| 18 | #include <linux/types.h> | ||
| 19 | |||
/*
 * This should be done in one go with ldq_u*2/mask/stq_u. Do it
 * with a macro so that we can fix it up later..
 *
 * NOTE: these macros expand a bare `return`, so they are only usable
 * inside functions returning void.  _UP variants copy with ascending
 * addresses, _DN variants with descending addresses.
 */
#define ALIGN_DEST_TO8_UP(d,s,n) \
	while (d & 7) { \
		if (n <= 0) return; \
		n--; \
		*(char *) d = *(char *) s; \
		d++; s++; \
	}
#define ALIGN_DEST_TO8_DN(d,s,n) \
	while (d & 7) { \
		if (n <= 0) return; \
		n--; \
		d--; s--; \
		*(char *) d = *(char *) s; \
	}

/*
 * This should similarly be done with ldq_u*2/mask/stq. The destination
 * is aligned, but we don't fill in a full quad-word
 */
#define DO_REST_UP(d,s,n) \
	while (n > 0) { \
		n--; \
		*(char *) d = *(char *) s; \
		d++; s++; \
	}
#define DO_REST_DN(d,s,n) \
	while (n > 0) { \
		n--; \
		d--; s--; \
		*(char *) d = *(char *) s; \
	}

/*
 * This should be done with ldq/mask/stq. The source and destination are
 * aligned, but we don't fill in a full quad-word
 */
#define DO_REST_ALIGNED_UP(d,s,n) DO_REST_UP(d,s,n)
#define DO_REST_ALIGNED_DN(d,s,n) DO_REST_DN(d,s,n)
| 62 | |||
/*
 * This does unaligned memory copies. We want to avoid storing to
 * an unaligned address, as that would do a read-modify-write cycle.
 * We also want to avoid double-reading the unaligned reads.
 *
 * Note the ordering to try to avoid load (and address generation) latencies.
 */
static inline void __memcpy_unaligned_up (unsigned long d, unsigned long s,
					  long n)
{
	ALIGN_DEST_TO8_UP(d,s,n);
	/* d is now 8-byte aligned; s in general is not. */
	n -= 8;			/* to avoid compare against 8 in the loop */
	if (n >= 0) {
		unsigned long low_word, high_word;
		/* Prime the pipeline with the quad containing the first
		   source bytes; each iteration loads only one new quad. */
		__asm__("ldq_u %0,%1":"=r" (low_word):"m" (*(unsigned long *) s));
		do {
			unsigned long tmp;
			__asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long *)(s+8)));
			n -= 8;
			/* extql/extqh pick out the two halves of the
			   destination quad that straddle the misaligned
			   source boundary; their OR is the full quad. */
			__asm__("extql %1,%2,%0"
				:"=r" (low_word)
				:"r" (low_word), "r" (s));
			__asm__("extqh %1,%2,%0"
				:"=r" (tmp)
				:"r" (high_word), "r" (s));
			s += 8;
			*(unsigned long *) d = low_word | tmp;
			d += 8;
			/* carry the already-loaded quad into the next pass */
			low_word = high_word;
		} while (n >= 0);
	}
	n += 8;
	DO_REST_UP(d,s,n);
}
| 97 | |||
static inline void __memcpy_unaligned_dn (unsigned long d, unsigned long s,
					  long n)
{
	/* Plain backward byte-at-a-time copy (highest address first);
	   no Alpha-specific tuning here. */
	while (n > 0) {
		n--;
		*(char *) (d + n) = *(char *) (s + n);
	}
}
| 107 | |||
/*
 * Quadword-at-a-time copy for mutually 8-aligned src/dst.
 *
 * Hmm.. Strange. The __asm__ here is there to make gcc use an integer register
 * for the load-store. I don't know why, but it would seem that using a floating
 * point register for the move seems to slow things down (very small difference,
 * though).
 *
 * Note the ordering to try to avoid load (and address generation) latencies.
 */
static inline void __memcpy_aligned_up (unsigned long d, unsigned long s,
					long n)
{
	ALIGN_DEST_TO8_UP(d,s,n);
	/* Both pointers are now quad-aligned (they were co-aligned). */
	n -= 8;			/* so the loop test is a simple sign check */
	while (n >= 0) {
		unsigned long tmp;
		__asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
		n -= 8;
		s += 8;
		*(unsigned long *) d = tmp;
		d += 8;
	}
	n += 8;
	DO_REST_ALIGNED_UP(d,s,n);
}
/* Descending (backward) counterpart of __memcpy_aligned_up.
   NOTE(review): not referenced by memcpy() below; presumably kept for
   overlapping (dest > src) copies -- confirm against memmove users. */
static inline void __memcpy_aligned_dn (unsigned long d, unsigned long s,
					long n)
{
	/* Start from one past the last byte and walk downward. */
	s += n;
	d += n;
	ALIGN_DEST_TO8_DN(d,s,n);
	n -= 8;
	while (n >= 0) {
		unsigned long tmp;
		s -= 8;
		__asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
		n -= 8;
		d -= 8;
		*(unsigned long *) d = tmp;
	}
	n += 8;
	DO_REST_ALIGNED_DN(d,s,n);
}
| 150 | |||
| 151 | void * memcpy(void * dest, const void *src, size_t n) | ||
| 152 | { | ||
| 153 | if (!(((unsigned long) dest ^ (unsigned long) src) & 7)) { | ||
| 154 | __memcpy_aligned_up ((unsigned long) dest, (unsigned long) src, | ||
| 155 | n); | ||
| 156 | return dest; | ||
| 157 | } | ||
| 158 | __memcpy_unaligned_up ((unsigned long) dest, (unsigned long) src, n); | ||
| 159 | return dest; | ||
| 160 | } | ||
| 161 | |||
| 162 | /* For backward modules compatibility, define __memcpy. */ | ||
| 163 | asm("__memcpy = memcpy; .globl __memcpy"); | ||
diff --git a/arch/alpha/lib/memmove.S b/arch/alpha/lib/memmove.S new file mode 100644 index 00000000000..eb3b6e02242 --- /dev/null +++ b/arch/alpha/lib/memmove.S | |||
| @@ -0,0 +1,181 @@ | |||
/*
 * arch/alpha/lib/memmove.S
 *
 * Barely optimized memmove routine for Alpha EV5.
 *
 * This is hand-massaged output from the original memcpy.c.  We defer to
 * memcpy whenever possible; the backwards copy loops are not unrolled.
 *
 * Register usage (standard Alpha C convention):
 *	$16/$17/$18	dest, src, count on entry
 *	$4/$5		running dest/src cursors
 *	$0		return value (original dest)
 */

	.set noat
	.set noreorder
	.text

	.align 4
	.globl memmove
	.ent memmove
memmove:
	ldgp $29, 0($27)
	unop
	nop
	.prologue 1

	/* Non-overlapping regions?  Then plain memcpy is safe (and faster). */
	addq $16,$18,$4
	addq $17,$18,$5
	cmpule $4,$17,$1		/* dest + n <= src */
	cmpule $5,$16,$2		/* dest >= src + n */

	bis $1,$2,$1
	mov $16,$0
	xor $16,$17,$2
	bne $1,memcpy !samegp

	and $2,7,$2			/* Test for src/dest co-alignment.  */
	and $16,7,$1
	cmpule $16,$17,$3
	bne $3,$memmove_up		/* dest < src */

	/* Overlap with dest above src: copy backwards from the ends.  */
	and $4,7,$1
	bne $2,$misaligned_dn
	unop
	beq $1,$skip_aligned_byte_loop_head_dn

	/* Byte-copy downwards until the end pointers are quad-aligned.  */
$aligned_byte_loop_head_dn:
	lda $4,-1($4)
	lda $5,-1($5)
	unop
	ble $18,$egress

	ldq_u $3,0($5)
	ldq_u $2,0($4)
	lda $18,-1($18)
	extbl $3,$5,$1

	insbl $1,$4,$1
	mskbl $2,$4,$2
	bis $1,$2,$1
	and $4,7,$6

	stq_u $1,0($4)
	bne $6,$aligned_byte_loop_head_dn

$skip_aligned_byte_loop_head_dn:
	lda $18,-8($18)
	blt $18,$skip_aligned_word_loop_dn

	/* Main loop: one quadword per iteration, descending addresses.  */
$aligned_word_loop_dn:
	ldq $1,-8($5)
	nop
	lda $5,-8($5)
	lda $18,-8($18)

	stq $1,-8($4)
	nop
	lda $4,-8($4)
	bge $18,$aligned_word_loop_dn

$skip_aligned_word_loop_dn:
	lda $18,8($18)			/* undo the bias; leftover byte count */
	bgt $18,$byte_loop_tail_dn
	unop
	ret $31,($26),1

	.align 4
$misaligned_dn:
	nop
	fnop
	unop
	beq $18,$egress

	/* Byte-at-a-time backwards copy: unaligned case and tail bytes.  */
$byte_loop_tail_dn:
	ldq_u $3,-1($5)
	ldq_u $2,-1($4)
	lda $5,-1($5)
	lda $4,-1($4)

	lda $18,-1($18)
	extbl $3,$5,$1
	insbl $1,$4,$1
	mskbl $2,$4,$2

	bis $1,$2,$1
	stq_u $1,0($4)
	bgt $18,$byte_loop_tail_dn
	br $egress

	/* Overlap with dest below src: copy forwards from the starts.  */
$memmove_up:
	mov $16,$4
	mov $17,$5
	bne $2,$misaligned_up
	beq $1,$skip_aligned_byte_loop_head_up

	/* Byte-copy upwards until dest is quad-aligned.  */
$aligned_byte_loop_head_up:
	unop
	ble $18,$egress
	ldq_u $3,0($5)
	ldq_u $2,0($4)

	lda $18,-1($18)
	extbl $3,$5,$1
	insbl $1,$4,$1
	mskbl $2,$4,$2

	bis $1,$2,$1
	lda $5,1($5)
	stq_u $1,0($4)
	lda $4,1($4)

	and $4,7,$6
	bne $6,$aligned_byte_loop_head_up

$skip_aligned_byte_loop_head_up:
	lda $18,-8($18)
	blt $18,$skip_aligned_word_loop_up

	/* Main loop: one quadword per iteration, ascending addresses.  */
$aligned_word_loop_up:
	ldq $1,0($5)
	nop
	lda $5,8($5)
	lda $18,-8($18)

	stq $1,0($4)
	nop
	lda $4,8($4)
	bge $18,$aligned_word_loop_up

$skip_aligned_word_loop_up:
	lda $18,8($18)			/* undo the bias; leftover byte count */
	bgt $18,$byte_loop_tail_up
	unop
	ret $31,($26),1

	.align 4
$misaligned_up:
	nop
	fnop
	unop
	beq $18,$egress

	/* Byte-at-a-time forwards copy: unaligned case and tail bytes.  */
$byte_loop_tail_up:
	ldq_u $3,0($5)
	ldq_u $2,0($4)
	lda $18,-1($18)
	extbl $3,$5,$1

	insbl $1,$4,$1
	mskbl $2,$4,$2
	bis $1,$2,$1
	stq_u $1,0($4)

	lda $5,1($5)
	lda $4,1($4)
	nop
	bgt $18,$byte_loop_tail_up

$egress:
	ret $31,($26),1
	nop
	nop
	nop

	.end memmove
diff --git a/arch/alpha/lib/memset.S b/arch/alpha/lib/memset.S new file mode 100644 index 00000000000..8ff6e7e1773 --- /dev/null +++ b/arch/alpha/lib/memset.S | |||
| @@ -0,0 +1,124 @@ | |||
/*
 * linux/arch/alpha/memset.S
 *
 * This is an efficient (and small) implementation of the C library "memset()"
 * function for the alpha.
 *
 * (C) Copyright 1996 Linus Torvalds
 *
 * This routine is "moral-ware": you are free to use it any way you wish, and
 * the only obligation I put on you is a moral one: if you make any improvements
 * to the routine, please send me your improvements for me to use similarly.
 *
 * The scheduling comments are according to the EV5 documentation (and done by
 * hand, so they might well be incorrect, please do tell me about it..)
 *
 * Register usage:
 *	$16 dest, $17 fill byte, $18 count, $0 return value (dest).
 * __constant_c_memset may be entered directly when $17 already holds the
 * fill byte replicated into all eight byte lanes.
 * Note: loads into $31 (the zero register) are architectural no-ops,
 * used here only as issue-slot fillers.
 */

	.set noat
	.set noreorder
	.text
	.globl memset
	.globl __memset
	.globl __memsetw
	.globl __constant_c_memset
	.ent __memset
	.align 5
__memset:
	.frame $30,0,$26,0
	.prologue 0

	/* Replicate the low byte of $17 into all eight bytes.  */
	and $17,255,$1		/* E1 */
	insbl $17,1,$17		/* .. E0 */
	bis $17,$1,$17		/* E0 (p-c latency, next cycle) */
	sll $17,16,$1		/* E1 (p-c latency, next cycle) */

	bis $17,$1,$17		/* E0 (p-c latency, next cycle) */
	sll $17,32,$1		/* E1 (p-c latency, next cycle) */
	bis $17,$1,$17		/* E0 (p-c latency, next cycle) */
	ldq_u $31,0($30)	/* .. E1 */

	.align 5
__constant_c_memset:
	addq $18,$16,$6		/* E0 */
	bis $16,$16,$0		/* .. E1 */
	xor $16,$6,$1		/* E0 */
	ble $18,end		/* .. E1 */

	bic $1,7,$1		/* E0 */
	beq $1,within_one_quad	/* .. E1 (note EV5 zero-latency forwarding) */
	and $16,7,$3		/* E0 */
	beq $3,aligned		/* .. E1 (note EV5 zero-latency forwarding) */

	/* Unaligned head: merge the fill into the first partial quad.  */
	ldq_u $4,0($16)		/* E0 */
	bis $16,$16,$5		/* .. E1 */
	insql $17,$16,$2	/* E0 */
	subq $3,8,$3		/* .. E1 */

	addq $18,$3,$18		/* E0	$18 is new count ($3 is negative) */
	mskql $4,$16,$4		/* .. E1 (and possible load stall) */
	subq $16,$3,$16		/* E0	$16 is new aligned destination */
	bis $2,$4,$1		/* .. E1 */

	bis $31,$31,$31		/* E0 */
	ldq_u $31,0($30)	/* .. E1 */
	stq_u $1,0($5)		/* E0 */
	bis $31,$31,$31		/* .. E1 */

	.align 4
aligned:
	sra $18,3,$3		/* E0 */
	and $18,7,$18		/* .. E1 */
	bis $16,$16,$5		/* E0 */
	beq $3,no_quad		/* .. E1 */

	/* Main loop: store one full quadword per iteration.  */
	.align 3
loop:
	stq $17,0($5)		/* E0 */
	subq $3,1,$3		/* .. E1 */
	addq $5,8,$5		/* E0 */
	bne $3,loop		/* .. E1 */

	/* Partial trailing quadword, if any.  */
no_quad:
	bis $31,$31,$31		/* E0 */
	beq $18,end		/* .. E1 */
	ldq $7,0($5)		/* E0 */
	mskqh $7,$6,$2		/* .. E1 (and load stall) */

	insqh $17,$6,$4		/* E0 */
	bis $2,$4,$1		/* .. E1 */
	stq $1,0($5)		/* E0 */
	ret $31,($26),1		/* .. E1 */

	/* Entire fill lies within a single quadword.  */
	.align 3
within_one_quad:
	ldq_u $1,0($16)		/* E0 */
	insql $17,$16,$2	/* E1 */
	mskql $1,$16,$4		/* E0 (after load stall) */
	bis $2,$4,$2		/* E0 */

	mskql $2,$6,$4		/* E0 */
	mskqh $1,$6,$2		/* .. E1 */
	bis $2,$4,$1		/* E0 */
	stq_u $1,0($16)		/* E0 */

end:
	ret $31,($26),1		/* E1 */
	.end __memset

	/* memset for 16-bit fill values: replicate the low 16 bits of
	   $17 across the quadword, then share the quadword path above.  */
	.align 5
	.ent __memsetw
__memsetw:
	.prologue 0

	inswl $17,0,$1		/* E0 */
	inswl $17,2,$2		/* E0 */
	inswl $17,4,$3		/* E0 */
	or $1,$2,$1		/* .. E1 */
	inswl $17,6,$4		/* E0 */
	or $1,$3,$1		/* .. E1 */
	or $1,$4,$17		/* E0 */
	br __constant_c_memset	/* .. E1 */

	.end __memsetw

memset = __memset
diff --git a/arch/alpha/lib/srm_printk.c b/arch/alpha/lib/srm_printk.c new file mode 100644 index 00000000000..31b53c49435 --- /dev/null +++ b/arch/alpha/lib/srm_printk.c | |||
| @@ -0,0 +1,41 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/srm_printk.c | ||
| 3 | */ | ||
| 4 | |||
| 5 | #include <linux/kernel.h> | ||
| 6 | #include <asm/console.h> | ||
| 7 | |||
/*
 * printf-style output to the SRM console.
 *
 * Formats into a fixed static buffer, expands each '\n' into "\r\n"
 * (the SRM console wants carriage returns), and hands the result to
 * srm_puts().  Returns the number of characters formatted (after any
 * truncation to the buffer size).
 *
 * Fixes over the original: vsprintf() could overrun the 1024-byte
 * buffer for long messages, and the CR/LF expansion could write up to
 * num_lf bytes past the formatted string with no bounds check.  We now
 * format with vsnprintf() and only expand when the expansion fits.
 *
 * Not reentrant: the buffer is static (matches the original contract).
 */
long
srm_printk(const char *fmt, ...)
{
	static char buf[1024];
	va_list args;
	long len, num_lf;
	char *src, *dst;

	va_start(args, fmt);
	len = vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);

	/* vsnprintf reports the would-be length; clamp to what fit.  */
	if (len > (long)(sizeof(buf) - 1))
		len = sizeof(buf) - 1;

	/* count number of linefeeds in string: */
	num_lf = 0;
	for (src = buf; *src; ++src) {
		if (*src == '\n') {
			++num_lf;
		}
	}

	if (num_lf && len + num_lf <= (long)(sizeof(buf) - 1)) {
		/* Expand each linefeed into carriage-return/linefeed,
		   copying backwards in place (src starts at the NUL).  */
		for (dst = src + num_lf; src >= buf; ) {
			if (*src == '\n') {
				*dst-- = '\r';
			}
			*dst-- = *src--;
		}
	} else {
		/* No room to expand; emit the string as formatted.  */
		num_lf = 0;
	}

	srm_puts(buf, num_lf + len);
	return len;
}
diff --git a/arch/alpha/lib/srm_puts.c b/arch/alpha/lib/srm_puts.c new file mode 100644 index 00000000000..7b60a6f75a7 --- /dev/null +++ b/arch/alpha/lib/srm_puts.c | |||
| @@ -0,0 +1,23 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/srm_puts.c | ||
| 3 | */ | ||
| 4 | |||
| 5 | #include <linux/string.h> | ||
| 6 | #include <asm/console.h> | ||
| 7 | |||
| 8 | long | ||
| 9 | srm_puts(const char *str, long len) | ||
| 10 | { | ||
| 11 | long remaining, written; | ||
| 12 | |||
| 13 | if (!callback_init_done) | ||
| 14 | return len; | ||
| 15 | |||
| 16 | for (remaining = len; remaining > 0; remaining -= written) | ||
| 17 | { | ||
| 18 | written = callback_puts(0, str, remaining); | ||
| 19 | written &= 0xffffffff; | ||
| 20 | str += written; | ||
| 21 | } | ||
| 22 | return len; | ||
| 23 | } | ||
diff --git a/arch/alpha/lib/stacktrace.c b/arch/alpha/lib/stacktrace.c new file mode 100644 index 00000000000..6d432e42aed --- /dev/null +++ b/arch/alpha/lib/stacktrace.c | |||
| @@ -0,0 +1,103 @@ | |||
| 1 | #include <linux/kernel.h> | ||
| 2 | #include <asm/system.h> | ||
| 3 | |||
| 4 | typedef unsigned int instr; | ||
| 5 | |||
| 6 | #define MAJOR_OP 0xfc000000 | ||
| 7 | #define LDA_OP 0x20000000 | ||
| 8 | #define STQ_OP 0xb4000000 | ||
| 9 | #define BR_OP 0xc0000000 | ||
| 10 | |||
| 11 | #define STK_ALLOC_1 0x23de8000 /* lda $30,-X($30) */ | ||
| 12 | #define STK_ALLOC_1M 0xffff8000 | ||
| 13 | #define STK_ALLOC_2 0x43c0153e /* subq $30,X,$30 */ | ||
| 14 | #define STK_ALLOC_2M 0xffe01fff | ||
| 15 | |||
| 16 | #define MEM_REG 0x03e00000 | ||
| 17 | #define MEM_BASE 0x001f0000 | ||
| 18 | #define MEM_OFF 0x0000ffff | ||
| 19 | #define MEM_OFF_SIGN 0x00008000 | ||
| 20 | #define BASE_SP 0x001e0000 | ||
| 21 | |||
| 22 | #define STK_ALLOC_MATCH(INSTR) \ | ||
| 23 | (((INSTR) & STK_ALLOC_1M) == STK_ALLOC_1 \ | ||
| 24 | || ((INSTR) & STK_ALLOC_2M) == STK_ALLOC_2) | ||
| 25 | #define STK_PUSH_MATCH(INSTR) \ | ||
| 26 | (((INSTR) & (MAJOR_OP | MEM_BASE | MEM_OFF_SIGN)) == (STQ_OP | BASE_SP)) | ||
| 27 | #define MEM_OP_OFFSET(INSTR) \ | ||
| 28 | (((long)((INSTR) & MEM_OFF) << 48) >> 48) | ||
| 29 | #define MEM_OP_REG(INSTR) \ | ||
| 30 | (((INSTR) & MEM_REG) >> 22) | ||
| 31 | |||
| 32 | /* Branches, jumps, PAL calls, and illegal opcodes end a basic block. */ | ||
| 33 | #define BB_END(INSTR) \ | ||
| 34 | (((instr)(INSTR) >= BR_OP) | ((instr)(INSTR) < LDA_OP) | \ | ||
| 35 | ((((instr)(INSTR) ^ 0x60000000) < 0x20000000) & \ | ||
| 36 | (((instr)(INSTR) & 0x0c000000) != 0))) | ||
| 37 | |||
| 38 | #define IS_KERNEL_TEXT(PC) ((unsigned long)(PC) > START_ADDR) | ||
| 39 | |||
| 40 | static char reg_name[][4] = { | ||
| 41 | "v0 ", "t0 ", "t1 ", "t2 ", "t3 ", "t4 ", "t5 ", "t6 ", "t7 ", | ||
| 42 | "s0 ", "s1 ", "s2 ", "s3 ", "s4 ", "s5 ", "s6 ", "a0 ", "a1 ", | ||
| 43 | "a2 ", "a3 ", "a4 ", "a5 ", "t8 ", "t9 ", "t10", "t11", "ra ", | ||
| 44 | "pv ", "at ", "gp ", "sp ", "0" | ||
| 45 | }; | ||
| 46 | |||
| 47 | |||
| 48 | static instr * | ||
| 49 | display_stored_regs(instr * pro_pc, unsigned char * sp) | ||
| 50 | { | ||
| 51 | instr * ret_pc = 0; | ||
| 52 | int reg; | ||
| 53 | unsigned long value; | ||
| 54 | |||
| 55 | printk("Prologue [<%p>], Frame %p:\n", pro_pc, sp); | ||
| 56 | while (!BB_END(*pro_pc)) | ||
| 57 | if (STK_PUSH_MATCH(*pro_pc)) { | ||
| 58 | reg = (*pro_pc & MEM_REG) >> 21; | ||
| 59 | value = *(unsigned long *)(sp + (*pro_pc & MEM_OFF)); | ||
| 60 | if (reg == 26) | ||
| 61 | ret_pc = (instr *)value; | ||
| 62 | printk("\t\t%s / 0x%016lx\n", reg_name[reg], value); | ||
| 63 | } | ||
| 64 | return ret_pc; | ||
| 65 | } | ||
| 66 | |||
/*
 * Given a PC inside a function (typically a return address), back up to
 * the start of that function's prologue: first scan backwards to the
 * stack-allocation insn, then keep backing up until the previous insn
 * terminates a basic block.  Assumes the function actually contains an
 * insn matched by STK_ALLOC_MATCH; otherwise the scan runs off the
 * start of the text segment.
 */
static instr *
seek_prologue(instr * pc)
{
	while (!STK_ALLOC_MATCH(*pc))
		--pc;
	while (!BB_END(*(pc - 1)))
		--pc;
	return pc;
}
| 76 | |||
| 77 | static long | ||
| 78 | stack_increment(instr * prologue_pc) | ||
| 79 | { | ||
| 80 | while (!STK_ALLOC_MATCH(*prologue_pc)) | ||
| 81 | ++prologue_pc; | ||
| 82 | |||
| 83 | /* Count the bytes allocated. */ | ||
| 84 | if ((*prologue_pc & STK_ALLOC_1M) == STK_ALLOC_1M) | ||
| 85 | return -(((long)(*prologue_pc) << 48) >> 48); | ||
| 86 | else | ||
| 87 | return (*prologue_pc >> 13) & 0xff; | ||
| 88 | } | ||
| 89 | |||
/*
 * Print a trace of the current kernel stack.  Alpha keeps no frame
 * pointer, so each frame's size and saved return address are recovered
 * by decoding the function prologues themselves: display the registers
 * this frame saved, pop the frame, then locate the caller's prologue
 * from its return address.  Stops once a return address leaves kernel
 * text.
 */
void
stacktrace(void)
{
	instr * ret_pc;
	instr * prologue = (instr *)stacktrace;
	/* Bind sp directly to the hardware stack pointer, register $30.  */
	register unsigned char * sp __asm__ ("$30");

	printk("\tstack trace:\n");
	do {
		ret_pc = display_stored_regs(prologue, sp);
		sp += stack_increment(prologue);	/* pop this frame */
		prologue = seek_prologue(ret_pc);
	} while (IS_KERNEL_TEXT(ret_pc));
}
diff --git a/arch/alpha/lib/strcasecmp.c b/arch/alpha/lib/strcasecmp.c new file mode 100644 index 00000000000..4e57a216fea --- /dev/null +++ b/arch/alpha/lib/strcasecmp.c | |||
| @@ -0,0 +1,26 @@ | |||
| 1 | /* | ||
| 2 | * linux/arch/alpha/lib/strcasecmp.c | ||
| 3 | */ | ||
| 4 | |||
| 5 | #include <linux/string.h> | ||
| 6 | |||
| 7 | |||
| 8 | /* We handle nothing here except the C locale. Since this is used in | ||
| 9 | only one place, on strings known to contain only 7 bit ASCII, this | ||
| 10 | is ok. */ | ||
| 11 | |||
/*
 * Case-insensitive string comparison, C locale only (ASCII A-Z/a-z).
 * Returns <0, 0 or >0 as a sorts before, equal to, or after b.
 */
int strcasecmp(const char *a, const char *b)
{
	int x, y;

	for (;;) {
		x = (unsigned char) *a++;
		y = (unsigned char) *b++;
		/* Fold ASCII upper case to lower case.  */
		if (x >= 'A' && x <= 'Z')
			x += 'a' - 'A';
		if (y >= 'A' && y <= 'Z')
			y += 'a' - 'A';
		if (x != y || x == '\0')
			return x - y;
	}
}
diff --git a/arch/alpha/lib/strcat.S b/arch/alpha/lib/strcat.S new file mode 100644 index 00000000000..393f5038487 --- /dev/null +++ b/arch/alpha/lib/strcat.S | |||
| @@ -0,0 +1,52 @@ | |||
/*
 * arch/alpha/lib/strcat.S
 * Contributed by Richard Henderson (rth@tamu.edu)
 *
 * Append a null-terminated string from SRC to DST.
 *
 * $16 dst, $17 src; $0 returns the original dst.  After locating the
 * terminator we tail-branch into __stxcpy, which returns through $23.
 */

	.text

	.align 3
	.globl strcat
	.ent strcat
strcat:
	.frame $30, 0, $26
	.prologue 0

	mov $16, $0		# set up return value

	/* Find the end of the string.  */

	ldq_u $1, 0($16)	# load first quadword (a0 may be misaligned)
	lda $2, -1
	insqh $2, $16, $2	# ones in the bytes before the string starts,
	andnot $16, 7, $16
	or $2, $1, $1		# so they cannot fake a zero terminator
	cmpbge $31, $1, $2	# bits set iff byte == 0
	bne $2, $found

$loop:	ldq $1, 8($16)
	addq $16, 8, $16
	cmpbge $31, $1, $2
	beq $2, $loop

$found:	negq $2, $3		# clear all but least set bit
	and $2, $3, $2

	and $2, 0xf0, $3	# binary search for that set bit
	and $2, 0xcc, $4
	and $2, 0xaa, $5
	cmovne $3, 4, $3
	cmovne $4, 2, $4
	cmovne $5, 1, $5
	addq $3, $4, $3
	addq $16, $5, $16
	addq $16, $3, $16	# $16 now points at dst's terminator

	/* Now do the append.  */

	mov $26, $23		# __stxcpy returns through $23
	br __stxcpy

	.end strcat
diff --git a/arch/alpha/lib/strchr.S b/arch/alpha/lib/strchr.S new file mode 100644 index 00000000000..011a175e832 --- /dev/null +++ b/arch/alpha/lib/strchr.S | |||
| @@ -0,0 +1,70 @@ | |||
/*
 * arch/alpha/lib/strchr.S
 * Contributed by Richard Henderson (rth@tamu.edu)
 *
 * Return the address of a given character within a null-terminated
 * string, or null if it is not found.
 *
 * Strategy: replicate the search character into all eight byte lanes
 * of a1, then per quadword use cmpbge to find, in parallel, the bytes
 * equal to the character (via t0 ^ a1) and the bytes equal to zero.
 */

#include <asm/regdef.h>

	.set noreorder
	.set noat

	.align 3
	.globl strchr
	.ent strchr
strchr:
	.frame sp, 0, ra
	.prologue 0

	zapnot a1, 1, a1	# e0 : zero extend the search character
	ldq_u  t0, 0(a0)	# .. e1 : load first quadword
	sll    a1, 8, t5	# e0 : replicate the search character
	andnot a0, 7, v0	# .. e1 : align our loop pointer
	or     t5, a1, a1	# e0 :
	lda    t4, -1		# .. e1 : build garbage mask
	sll    a1, 16, t5	# e0 :
	cmpbge zero, t0, t2	# .. e1 : bits set iff byte == zero
	mskqh  t4, a0, t4	# e0 :
	or     t5, a1, a1	# .. e1 :
	sll    a1, 32, t5	# e0 :
	cmpbge zero, t4, t4	# .. e1 : bits set iff byte is garbage
	or     t5, a1, a1	# e0 :
	xor    t0, a1, t1	# .. e1 : make bytes == c zero
	cmpbge zero, t1, t3	# e0 : bits set iff byte == c
	or     t2, t3, t0	# e1 : bits set iff char match or zero match
	andnot t0, t4, t0	# e0 : clear garbage bits (bytes before a0)
	bne    t0, $found	# .. e1 (zdb)

$loop:	ldq    t0, 8(v0)	# e0 :
	addq   v0, 8, v0	# .. e1 :
	nop			# e0 :
	xor    t0, a1, t1	# .. e1 (ev5 data stall)
	cmpbge zero, t0, t2	# e0 : bits set iff byte == 0
	cmpbge zero, t1, t3	# .. e1 : bits set iff byte == c
	or     t2, t3, t0	# e0 :
	beq    t0, $loop	# .. e1 (zdb)

$found:	negq   t0, t1		# e0 : clear all but least set bit
	and    t0, t1, t0	# e1 (stall)

	and    t0, t3, t1	# e0 : bit set iff byte was the char
	beq    t1, $retnull	# .. e1 (zdb) : terminator came first

	and    t0, 0xf0, t2	# e0 : binary search for that set bit
	and    t0, 0xcc, t3	# .. e1 :
	and    t0, 0xaa, t4	# e0 :
	cmovne t2, 4, t2	# .. e1 :
	cmovne t3, 2, t3	# e0 :
	cmovne t4, 1, t4	# .. e1 :
	addq   t2, t3, t2	# e0 :
	addq   v0, t4, v0	# .. e1 :
	addq   v0, t2, v0	# e0 :
	ret			# .. e1 :

$retnull:
	mov    zero, v0		# e0 :
	ret			# .. e1 :

	.end strchr
diff --git a/arch/alpha/lib/strcpy.S b/arch/alpha/lib/strcpy.S new file mode 100644 index 00000000000..e0728e4ad21 --- /dev/null +++ b/arch/alpha/lib/strcpy.S | |||
| @@ -0,0 +1,23 @@ | |||
/*
 * arch/alpha/lib/strcpy.S
 * Contributed by Richard Henderson (rth@tamu.edu)
 *
 * Copy a null-terminated string from SRC to DST.  Return a pointer
 * to the null-terminator in the source.
 *
 * NOTE(review): $0 is set to DST below, which is what C strcpy()
 * returns -- the claim above about the source terminator looks stale;
 * verify against __stxcpy's contract.  This is a thin wrapper: the
 * copy itself is done by __stxcpy, which returns through $23.
 */

	.text

	.align 3
	.globl strcpy
	.ent strcpy
strcpy:
	.frame $30, 0, $26
	.prologue 0

	mov $16, $0	# set up return value
	mov $26, $23	# set up return address
	unop
	br __stxcpy	# do the copy

	.end strcpy
diff --git a/arch/alpha/lib/strlen.S b/arch/alpha/lib/strlen.S new file mode 100644 index 00000000000..fe63353de15 --- /dev/null +++ b/arch/alpha/lib/strlen.S | |||
| @@ -0,0 +1,57 @@ | |||
/*
 * strlen.S (c) 1995 David Mosberger (davidm@cs.arizona.edu)
 *
 * Finds length of a 0-terminated string.  Optimized for the
 * Alpha architecture:
 *
 *	- memory accessed as aligned quadwords only
 *	- uses cmpbge to compare 8 bytes in parallel
 *	- does binary search to find 0 byte in last
 *	  quadword (HAKMEM needed 12 instructions to
 *	  do this instead of the 9 instructions that
 *	  binary search needs).
 */

	.set noreorder
	.set noat

	.align 3

	.globl	strlen
	.ent	strlen

strlen:
	ldq_u	$1, 0($16)	# load first quadword ($16 may be misaligned)
	lda	$2, -1($31)
	insqh	$2, $16, $2	# ones in the bytes before the string starts,
	andnot	$16, 7, $0
	or	$2, $1, $1	# so they cannot fake a zero terminator
	cmpbge	$31, $1, $2	# $2 <- bitmask: bit i == 1 <==> i-th byte == 0
	bne	$2, found

loop:	ldq	$1, 8($0)
	addq	$0, 8, $0	# addr += 8
	nop			# helps dual issue last two insns
	cmpbge	$31, $1, $2
	beq	$2, loop

found:	blbs	$2, done	# make aligned case fast
	negq	$2, $3
	and	$2, $3, $2	# keep only the lowest set bit

	and	$2, 0x0f, $1	# binary search for that bit's position
	addq	$0, 4, $3
	cmoveq	$1, $3, $0

	and	$2, 0x33, $1
	addq	$0, 2, $3
	cmoveq	$1, $3, $0

	and	$2, 0x55, $1
	addq	$0, 1, $3
	cmoveq	$1, $3, $0

done:	subq	$0, $16, $0	# length = terminator address - start
	ret	$31, ($26)

	.end	strlen
diff --git a/arch/alpha/lib/strlen_user.S b/arch/alpha/lib/strlen_user.S new file mode 100644 index 00000000000..508a18e9647 --- /dev/null +++ b/arch/alpha/lib/strlen_user.S | |||
| @@ -0,0 +1,91 @@ | |||
/*
 * arch/alpha/lib/strlen_user.S
 *
 * Return the length of the string including the NUL terminator
 * (strlen+1) or zero if an error occurred.
 *
 * In places where it is critical to limit the processing time,
 * and the data is not trusted, strnlen_user() should be used.
 * It will return a value greater than its second argument if
 * that limit would be exceeded. This implementation is allowed
 * to access memory beyond the limit, but will not cross a page
 * boundary when doing so.
 *
 * a0 = user string, a1 = byte limit (strnlen entry only); v0 = result.
 */

#include <asm/regdef.h>


/* Allow an exception for an insn; exit if we get one.  The exception
   table entry encodes both the fixup target ($exception) and the
   register to zero (v0) in the displacement/register fields of the
   lda pseudo-insn.  */
#define EX(x,y...)			\
	99: x,##y;			\
	.section __ex_table,"a";	\
	.long 99b - .;			\
	lda v0, $exception-99b(zero);	\
	.previous


	.set noreorder
	.set noat
	.text

	.globl __strlen_user
	.ent __strlen_user
	.frame sp, 0, ra

	.align 3
__strlen_user:
	ldah a1, 32767(zero)	# do not use plain strlen_user() for strings
				# that might be almost 2 GB long; you should
				# be using strnlen_user() instead

	.globl __strnlen_user

	.align 3
__strnlen_user:
	.prologue 0

	EX( ldq_u t0, 0(a0) )	# load first quadword (a0 may be misaligned)
	lda t1, -1(zero)
	insqh t1, a0, t1	# mask out the bytes before the string
	andnot a0, 7, v0
	or t1, t0, t0
	subq a0, 1, a0		# get our +1 for the return
	cmpbge zero, t0, t1	# t1 <- bitmask: bit i == 1 <==> i-th byte == 0
	subq a1, 7, t2
	subq a0, v0, t0
	bne t1, $found

	addq t2, t0, t2
	addq a1, 1, a1

	.align 3
$loop:	ble t2, $limit		# stop once the byte budget is spent
	EX( ldq t0, 8(v0) )
	subq t2, 8, t2
	addq v0, 8, v0		# addr += 8
	cmpbge zero, t0, t1
	beq t1, $loop

$found:	negq t1, t2		# clear all but least set bit
	and t1, t2, t1

	and t1, 0xf0, t2	# binary search for that set bit
	and t1, 0xcc, t3
	and t1, 0xaa, t4
	cmovne t2, 4, t2
	cmovne t3, 2, t3
	cmovne t4, 1, t4
	addq t2, t3, t2
	addq v0, t4, v0
	addq v0, t2, v0
	nop			# dual issue next two on ev4 and ev5
	subq v0, a0, v0
$exception:			# on a fault the fixup zeroes v0 and resumes here
	ret

	.align 3		# currently redundant
$limit:				# limit exceeded: return a value > a1
	subq a1, t2, v0
	ret

	.end __strlen_user
diff --git a/arch/alpha/lib/strncat.S b/arch/alpha/lib/strncat.S new file mode 100644 index 00000000000..a8278163c97 --- /dev/null +++ b/arch/alpha/lib/strncat.S | |||
| @@ -0,0 +1,84 @@ | |||
/*
 * arch/alpha/lib/strncat.S
 * Contributed by Richard Henderson (rth@tamu.edu)
 *
 * Append no more than COUNT characters from the null-terminated string SRC
 * to the null-terminated string DST.  Always null-terminate the new DST.
 *
 * This differs slightly from the semantics in libc in that we never write
 * past count, whereas libc may write to count+1.  This follows the generic
 * implementation in lib/string.c and is, IMHO, more sensible.
 *
 * $16 dst, $17 src, $18 count; $0 returns the original dst.
 */

	.text

	.align 3
	.globl strncat
	.ent strncat
strncat:
	.frame $30, 0, $26
	.prologue 0

	mov $16, $0		# set up return value
	beq $18, $zerocount	# nothing to append for a zero count

	/* Find the end of the string.  */

	ldq_u $1, 0($16)	# load first quadword ($16 may be misaligned)
	lda $2, -1($31)
	insqh $2, $16, $2	# ones in the bytes before the string starts,
	andnot $16, 7, $16
	or $2, $1, $1		# so they cannot fake a zero terminator
	cmpbge $31, $1, $2	# bits set iff byte == 0
	bne $2, $found

$loop:	ldq $1, 8($16)
	addq $16, 8, $16
	cmpbge $31, $1, $2
	beq $2, $loop

$found:	negq $2, $3		# clear all but least set bit
	and $2, $3, $2

	and $2, 0xf0, $3	# binary search for that set bit
	and $2, 0xcc, $4
	and $2, 0xaa, $5
	cmovne $3, 4, $3
	cmovne $4, 2, $4
	cmovne $5, 1, $5
	addq $3, $4, $3
	addq $16, $5, $16
	addq $16, $3, $16	# $16 now points at dst's terminator

	/* Now do the append.  */

	bsr $23, __stxncpy

	/* Worry about the null termination.
	   NOTE(review): the cleanup below relies on __stxncpy's register
	   contract ($1 last source word, $24/$27 byte masks, $18 words
	   remaining) -- confirm against stxncpy.S before changing.  */

	zapnot $1, $27, $2	# was last byte a null?
	bne $2, 0f
	ret

0:	cmplt $27, $24, $2	# did we fill the buffer completely?
	or $2, $18, $2
	bne $2, 2f

	and $24, 0x80, $2	# no zero next byte
	bne $2, 1f

	/* Here there are bytes left in the current word.  Clear one.  */
	addq $24, $24, $24	# end-of-count bit <<= 1
2:	zap $1, $24, $1
	stq_u $1, 0($16)
	ret

1:	/* Here we must read the next DST word and clear the first byte.  */
	ldq_u $1, 8($16)
	zap $1, 1, $1
	stq_u $1, 8($16)

$zerocount:
	ret

	.end strncat
diff --git a/arch/alpha/lib/strncpy.S b/arch/alpha/lib/strncpy.S new file mode 100644 index 00000000000..338551c7113 --- /dev/null +++ b/arch/alpha/lib/strncpy.S | |||
| @@ -0,0 +1,81 @@ | |||
/*
 * arch/alpha/lib/strncpy.S
 * Contributed by Richard Henderson (rth@tamu.edu)
 *
 * Copy no more than COUNT bytes of the null-terminated string from
 * SRC to DST.  If SRC does not cover all of COUNT, the balance is
 * zeroed.
 *
 * Or, rather, if the kernel cared about that weird ANSI quirk.  This
 * version has cropped that bit o' nastiness as well as assuming that
 * __stxncpy is in range of a branch.
 *
 * $16 dst, $17 src, $18 count; $0 returns the original dst.
 * NOTE(review): the tail handling relies on __stxncpy's register
 * contract ($1 last word, $24/$27 byte masks, $18 whole words left)
 * -- confirm against stxncpy.S before changing.
 */

	.set noat
	.set noreorder

	.text

	.align 4
	.globl strncpy
	.ent strncpy
strncpy:
	.frame $30, 0, $26
	.prologue 0

	mov $16, $0		# set return value now
	beq $18, $zerolen
	unop
	bsr $23, __stxncpy	# do the work of the copy

	unop
	bne $18, $multiword	# do we have full words left?
	subq $24, 1, $3		# nope
	subq $27, 1, $4

	or $3, $24, $3		# clear the bits between the last
	or $4, $27, $4		# written byte and the last byte in COUNT
	andnot $4, $3, $4
	zap $1, $4, $1

	stq_u $1, 0($16)
	ret

	.align 4
$multiword:
	subq $24, 1, $2		# clear the final bits in the prev word
	or $2, $24, $2
	zapnot $1, $2, $1
	subq $18, 1, $18

	stq_u $1, 0($16)
	addq $16, 8, $16
	unop
	beq $18, 1f

	nop
	unop
	nop
	blbc $18, 0f		# odd word count: zero one word first

	stq_u $31, 0($16)	# zero one word
	subq $18, 1, $18
	addq $16, 8, $16
	beq $18, 1f

0:	stq_u $31, 0($16)	# zero two words
	subq $18, 2, $18
	stq_u $31, 8($16)
	addq $16, 16, $16
	bne $18, 0b

1:	ldq_u $1, 0($16)	# clear the leading bits in the final word
	subq $27, 1, $2
	or $2, $27, $2

	zap $1, $2, $1
	stq_u $1, 0($16)
$zerolen:
	ret

	.end strncpy
diff --git a/arch/alpha/lib/strncpy_from_user.S b/arch/alpha/lib/strncpy_from_user.S new file mode 100644 index 00000000000..73ee21160ff --- /dev/null +++ b/arch/alpha/lib/strncpy_from_user.S | |||
| @@ -0,0 +1,339 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/strncpy_from_user.S | ||
| 3 | * Contributed by Richard Henderson (rth@tamu.edu) | ||
| 4 | * | ||
| 5 | * Just like strncpy except in the return value: | ||
| 6 | * | ||
| 7 | * -EFAULT if an exception occurs before the terminator is copied. | ||
| 8 | * N if the buffer filled. | ||
| 9 | * | ||
| 10 | * Otherwise the length of the string is returned. | ||
| 11 | */ | ||
| 12 | |||
| 13 | |||
| 14 | #include <asm/errno.h> | ||
| 15 | #include <asm/regdef.h> | ||
| 16 | |||
| 17 | |||
| 18 | /* Allow an exception for an insn; exit if we get one. */ | ||
| 19 | #define EX(x,y...) \ | ||
| 20 | 99: x,##y; \ | ||
| 21 | .section __ex_table,"a"; \ | ||
| 22 | .long 99b - .; \ | ||
| 23 | lda $31, $exception-99b($0); \ | ||
| 24 | .previous | ||
| 25 | |||
| 26 | |||
| 27 | .set noat | ||
| 28 | .set noreorder | ||
| 29 | .text | ||
| 30 | |||
| 31 | .globl __strncpy_from_user | ||
| 32 | .ent __strncpy_from_user | ||
| 33 | .frame $30, 0, $26 | ||
| 34 | .prologue 0 | ||
| 35 | |||
| 36 | .align 3 | ||
| 37 | $aligned: | ||
| 38 | /* On entry to this basic block: | ||
| 39 | t0 == the first destination word for masking back in | ||
| 40 | t1 == the first source word. */ | ||
| 41 | |||
| 42 | /* Create the 1st output word and detect 0's in the 1st input word. */ | ||
| 43 | lda t2, -1 # e1 : build a mask against false zero | ||
| 44 | mskqh t2, a1, t2 # e0 : detection in the src word | ||
| 45 | mskqh t1, a1, t3 # e0 : | ||
| 46 | ornot t1, t2, t2 # .. e1 : | ||
| 47 | mskql t0, a1, t0 # e0 : assemble the first output word | ||
| 48 | cmpbge zero, t2, t8 # .. e1 : bits set iff null found | ||
| 49 | or t0, t3, t0 # e0 : | ||
| 50 | beq a2, $a_eoc # .. e1 : | ||
| 51 | bne t8, $a_eos # .. e1 : | ||
| 52 | |||
| 53 | /* On entry to this basic block: | ||
| 54 | t0 == a source word not containing a null. */ | ||
| 55 | |||
| 56 | $a_loop: | ||
| 57 | stq_u t0, 0(a0) # e0 : | ||
| 58 | addq a0, 8, a0 # .. e1 : | ||
| 59 | EX( ldq_u t0, 0(a1) ) # e0 : | ||
| 60 | addq a1, 8, a1 # .. e1 : | ||
| 61 | subq a2, 1, a2 # e0 : | ||
| 62 | cmpbge zero, t0, t8 # .. e1 (stall) | ||
| 63 | beq a2, $a_eoc # e1 : | ||
| 64 | beq t8, $a_loop # e1 : | ||
| 65 | |||
| 66 | /* Take care of the final (partial) word store. At this point | ||
| 67 | the end-of-count bit is set in t8 iff it applies. | ||
| 68 | |||
| 69 | On entry to this basic block we have: | ||
| 70 | t0 == the source word containing the null | ||
| 71 | t8 == the cmpbge mask that found it. */ | ||
| 72 | |||
| 73 | $a_eos: | ||
| 74 | negq t8, t12 # e0 : find low bit set | ||
| 75 | and t8, t12, t12 # e1 (stall) | ||
| 76 | |||
| 77 | /* For the sake of the cache, don't read a destination word | ||
| 78 | if we're not going to need it. */ | ||
| 79 | and t12, 0x80, t6 # e0 : | ||
| 80 | bne t6, 1f # .. e1 (zdb) | ||
| 81 | |||
| 82 | /* We're doing a partial word store and so need to combine | ||
| 83 | our source and original destination words. */ | ||
| 84 | ldq_u t1, 0(a0) # e0 : | ||
| 85 | subq t12, 1, t6 # .. e1 : | ||
| 86 | or t12, t6, t8 # e0 : | ||
| 87 | unop # | ||
| 88 | zapnot t0, t8, t0 # e0 : clear src bytes > null | ||
| 89 | zap t1, t8, t1 # .. e1 : clear dst bytes <= null | ||
| 90 | or t0, t1, t0 # e1 : | ||
| 91 | |||
| 92 | 1: stq_u t0, 0(a0) | ||
| 93 | br $finish_up | ||
| 94 | |||
| 95 | /* Add the end-of-count bit to the eos detection bitmask. */ | ||
| 96 | $a_eoc: | ||
| 97 | or t10, t8, t8 | ||
| 98 | br $a_eos | ||
| 99 | |||
| 100 | /*** The Function Entry Point ***/ | ||
| 101 | .align 3 | ||
| 102 | __strncpy_from_user: | ||
| 103 | mov a0, v0 # save the string start | ||
| 104 | beq a2, $zerolength | ||
| 105 | |||
| 106 | /* Are source and destination co-aligned? */ | ||
| 107 | xor a0, a1, t1 # e0 : | ||
| 108 | and a0, 7, t0 # .. e1 : find dest misalignment | ||
| 109 | and t1, 7, t1 # e0 : | ||
| 110 | addq a2, t0, a2 # .. e1 : bias count by dest misalignment | ||
| 111 | subq a2, 1, a2 # e0 : | ||
| 112 | and a2, 7, t2 # e1 : | ||
| 113 | srl a2, 3, a2 # e0 : a2 = loop counter = (count - 1)/8 | ||
| 114 | addq zero, 1, t10 # .. e1 : | ||
| 115 | sll t10, t2, t10 # e0 : t10 = bitmask of last count byte | ||
| 116 | bne t1, $unaligned # .. e1 : | ||
| 117 | |||
| 118 | /* We are co-aligned; take care of a partial first word. */ | ||
| 119 | |||
| 120 | EX( ldq_u t1, 0(a1) ) # e0 : load first src word | ||
| 121 | addq a1, 8, a1 # .. e1 : | ||
| 122 | |||
| 123 | beq t0, $aligned # avoid loading dest word if not needed | ||
| 124 | ldq_u t0, 0(a0) # e0 : | ||
| 125 | br $aligned # .. e1 : | ||
| 126 | |||
| 127 | |||
| 128 | /* The source and destination are not co-aligned. Align the destination | ||
| 129 | and cope. We have to be very careful about not reading too much and | ||
| 130 | causing a SEGV. */ | ||
| 131 | |||
| 132 | .align 3 | ||
| 133 | $u_head: | ||
| 134 | /* We know just enough now to be able to assemble the first | ||
| 135 | full source word. We can still find a zero at the end of it | ||
| 136 | that prevents us from outputting the whole thing. | ||
| 137 | |||
| 138 | On entry to this basic block: | ||
| 139 | t0 == the first dest word, unmasked | ||
| 140 | t1 == the shifted low bits of the first source word | ||
| 141 | t6 == bytemask that is -1 in dest word bytes */ | ||
| 142 | |||
| 143 | EX( ldq_u t2, 8(a1) ) # e0 : load second src word | ||
| 144 | addq a1, 8, a1 # .. e1 : | ||
| 145 | mskql t0, a0, t0 # e0 : mask trailing garbage in dst | ||
| 146 | extqh t2, a1, t4 # e0 : | ||
| 147 | or t1, t4, t1 # e1 : first aligned src word complete | ||
| 148 | mskqh t1, a0, t1 # e0 : mask leading garbage in src | ||
| 149 | or t0, t1, t0 # e0 : first output word complete | ||
| 150 | or t0, t6, t6 # e1 : mask original data for zero test | ||
| 151 | cmpbge zero, t6, t8 # e0 : | ||
| 152 | beq a2, $u_eocfin # .. e1 : | ||
| 153 | bne t8, $u_final # e1 : | ||
| 154 | |||
| 155 | lda t6, -1 # e1 : mask out the bits we have | ||
| 156 | mskql t6, a1, t6 # e0 : already seen | ||
| 157 | stq_u t0, 0(a0) # e0 : store first output word | ||
| 158 | or t6, t2, t2 # .. e1 : | ||
| 159 | cmpbge zero, t2, t8 # e0 : find nulls in second partial | ||
| 160 | addq a0, 8, a0 # .. e1 : | ||
| 161 | subq a2, 1, a2 # e0 : | ||
| 162 | bne t8, $u_late_head_exit # .. e1 : | ||
| 163 | |||
| 164 | /* Finally, we've got all the stupid leading edge cases taken care | ||
| 165 | of and we can set up to enter the main loop. */ | ||
| 166 | |||
| 167 | extql t2, a1, t1 # e0 : position hi-bits of lo word | ||
| 168 | EX( ldq_u t2, 8(a1) ) # .. e1 : read next high-order source word | ||
| 169 | addq a1, 8, a1 # e0 : | ||
| 170 | cmpbge zero, t2, t8 # e1 (stall) | ||
| 171 | beq a2, $u_eoc # e1 : | ||
| 172 | bne t8, $u_eos # e1 : | ||
| 173 | |||
| 174 | /* Unaligned copy main loop. In order to avoid reading too much, | ||
| 175 | the loop is structured to detect zeros in aligned source words. | ||
| 176 | This has, unfortunately, effectively pulled half of a loop | ||
| 177 | iteration out into the head and half into the tail, but it does | ||
| 178 | prevent nastiness from accumulating in the very thing we want | ||
| 179 | to run as fast as possible. | ||
| 180 | |||
| 181 | On entry to this basic block: | ||
| 182 | t1 == the shifted high-order bits from the previous source word | ||
| 183 | t2 == the unshifted current source word | ||
| 184 | |||
| 185 | We further know that t2 does not contain a null terminator. */ | ||
| 186 | |||
| 187 | .align 3 | ||
| 188 | $u_loop: | ||
| 189 | extqh t2, a1, t0 # e0 : extract high bits for current word | ||
| 190 | addq a1, 8, a1 # .. e1 : | ||
| 191 | extql t2, a1, t3 # e0 : extract low bits for next time | ||
| 192 | addq a0, 8, a0 # .. e1 : | ||
| 193 | or t0, t1, t0 # e0 : current dst word now complete | ||
| 194 | EX( ldq_u t2, 0(a1) ) # .. e1 : load high word for next time | ||
| 195 | stq_u t0, -8(a0) # e0 : save the current word | ||
| 196 | mov t3, t1 # .. e1 : | ||
| 197 | subq a2, 1, a2 # e0 : | ||
| 198 | cmpbge zero, t2, t8 # .. e1 : test new word for eos | ||
| 199 | beq a2, $u_eoc # e1 : | ||
| 200 | beq t8, $u_loop # e1 : | ||
| 201 | |||
| 202 | /* We've found a zero somewhere in the source word we just read. | ||
| 203 | If it resides in the lower half, we have one (probably partial) | ||
| 204 | word to write out, and if it resides in the upper half, we | ||
| 205 | have one full and one partial word left to write out. | ||
| 206 | |||
| 207 | On entry to this basic block: | ||
| 208 | t1 == the shifted high-order bits from the previous source word | ||
| 209 | t2 == the unshifted current source word. */ | ||
| 210 | $u_eos: | ||
| 211 | extqh t2, a1, t0 # e0 : | ||
| 212 | or t0, t1, t0 # e1 : first (partial) source word complete | ||
| 213 | |||
| 214 | cmpbge zero, t0, t8 # e0 : is the null in this first bit? | ||
| 215 | bne t8, $u_final # .. e1 (zdb) | ||
| 216 | |||
| 217 | stq_u t0, 0(a0) # e0 : the null was in the high-order bits | ||
| 218 | addq a0, 8, a0 # .. e1 : | ||
| 219 | subq a2, 1, a2 # e1 : | ||
| 220 | |||
| 221 | $u_late_head_exit: | ||
| 222 | extql t2, a1, t0 # .. e0 : | ||
| 223 | cmpbge zero, t0, t8 # e0 : | ||
| 224 | or t8, t10, t6 # e1 : | ||
| 225 | cmoveq a2, t6, t8 # e0 : | ||
| 226 | nop # .. e1 : | ||
| 227 | |||
| 228 | /* Take care of a final (probably partial) result word. | ||
| 229 | On entry to this basic block: | ||
| 230 | t0 == assembled source word | ||
| 231 | t8 == cmpbge mask that found the null. */ | ||
| 232 | $u_final: | ||
| 233 | negq t8, t6 # e0 : isolate low bit set | ||
| 234 | and t6, t8, t12 # e1 : | ||
| 235 | |||
| 236 | and t12, 0x80, t6 # e0 : avoid dest word load if we can | ||
| 237 | bne t6, 1f # .. e1 (zdb) | ||
| 238 | |||
| 239 | ldq_u t1, 0(a0) # e0 : | ||
| 240 | subq t12, 1, t6 # .. e1 : | ||
| 241 | or t6, t12, t8 # e0 : | ||
| 242 | zapnot t0, t8, t0 # .. e1 : kill source bytes > null | ||
| 243 | zap t1, t8, t1 # e0 : kill dest bytes <= null | ||
| 244 | or t0, t1, t0 # e1 : | ||
| 245 | |||
| 246 | 1: stq_u t0, 0(a0) # e0 : | ||
| 247 | br $finish_up | ||
| 248 | |||
| 249 | $u_eoc: # end-of-count | ||
| 250 | extqh t2, a1, t0 | ||
| 251 | or t0, t1, t0 | ||
| 252 | cmpbge zero, t0, t8 | ||
| 253 | |||
| 254 | $u_eocfin: # end-of-count, final word | ||
| 255 | or t10, t8, t8 | ||
| 256 | br $u_final | ||
| 257 | |||
| 258 | /* Unaligned copy entry point. */ | ||
| 259 | .align 3 | ||
| 260 | $unaligned: | ||
| 261 | |||
| 262 | EX( ldq_u t1, 0(a1) ) # e0 : load first source word | ||
| 263 | |||
| 264 | and a0, 7, t4 # .. e1 : find dest misalignment | ||
| 265 | and a1, 7, t5 # e0 : find src misalignment | ||
| 266 | |||
| 267 | /* Conditionally load the first destination word and a bytemask | ||
| 268 | with 0xff indicating that the destination byte is sacrosanct. */ | ||
| 269 | |||
| 270 | mov zero, t0 # .. e1 : | ||
| 271 | mov zero, t6 # e0 : | ||
| 272 | beq t4, 1f # .. e1 : | ||
| 273 | ldq_u t0, 0(a0) # e0 : | ||
| 274 | lda t6, -1 # .. e1 : | ||
| 275 | mskql t6, a0, t6 # e0 : | ||
| 276 | 1: | ||
| 277 | subq a1, t4, a1 # .. e1 : sub dest misalignment from src addr | ||
| 278 | |||
| 279 | /* If source misalignment is larger than dest misalignment, we need | ||
| 280 | extra startup checks to avoid SEGV. */ | ||
| 281 | |||
| 282 | cmplt t4, t5, t12 # e1 : | ||
| 283 | extql t1, a1, t1 # .. e0 : shift src into place | ||
| 284 | lda t2, -1 # e0 : for creating masks later | ||
| 285 | beq t12, $u_head # e1 : | ||
| 286 | |||
| 287 | mskqh t2, t5, t2 # e0 : begin src byte validity mask | ||
| 288 | cmpbge zero, t1, t8 # .. e1 : is there a zero? | ||
| 289 | extql t2, a1, t2 # e0 : | ||
| 290 | or t8, t10, t5 # .. e1 : test for end-of-count too | ||
| 291 | cmpbge zero, t2, t3 # e0 : | ||
| 292 | cmoveq a2, t5, t8 # .. e1 : | ||
| 293 | andnot t8, t3, t8 # e0 : | ||
| 294 | beq t8, $u_head # .. e1 (zdb) | ||
| 295 | |||
| 296 | /* At this point we've found a zero in the first partial word of | ||
| 297 | the source. We need to isolate the valid source data and mask | ||
| 298 | it into the original destination data. (Incidentally, we know | ||
| 299 | that we'll need at least one byte of that original dest word.) */ | ||
| 300 | |||
| 301 | ldq_u t0, 0(a0) # e0 : | ||
| 302 | negq t8, t6 # .. e1 : build bitmask of bytes <= zero | ||
| 303 | mskqh t1, t4, t1 # e0 : | ||
| 304 | and t6, t8, t12 # .. e1 : | ||
| 305 | subq t12, 1, t6 # e0 : | ||
| 306 | or t6, t12, t8 # e1 : | ||
| 307 | |||
| 308 | zapnot t2, t8, t2 # e0 : prepare source word; mirror changes | ||
| 309 | zapnot t1, t8, t1 # .. e1 : to source validity mask | ||
| 310 | |||
| 311 | andnot t0, t2, t0 # e0 : zero place for source to reside | ||
| 312 | or t0, t1, t0 # e1 : and put it there | ||
| 313 | stq_u t0, 0(a0) # e0 : | ||
| 314 | |||
| 315 | $finish_up: | ||
| 316 | zapnot t0, t12, t4 # was last byte written null? | ||
| 317 | cmovne t4, 1, t4 | ||
| 318 | |||
| 319 | and t12, 0xf0, t3 # binary search for the address of the | ||
| 320 | and t12, 0xcc, t2 # last byte written | ||
| 321 | and t12, 0xaa, t1 | ||
| 322 | bic a0, 7, t0 | ||
| 323 | cmovne t3, 4, t3 | ||
| 324 | cmovne t2, 2, t2 | ||
| 325 | cmovne t1, 1, t1 | ||
| 326 | addq t0, t3, t0 | ||
| 327 | addq t1, t2, t1 | ||
| 328 | addq t0, t1, t0 | ||
| 329 | addq t0, t4, t0 # add one if we filled the buffer | ||
| 330 | |||
| 331 | subq t0, v0, v0 # find string length | ||
| 332 | ret | ||
| 333 | |||
| 334 | $zerolength: | ||
| 335 | clr v0 | ||
| 336 | $exception: | ||
| 337 | ret | ||
| 338 | |||
| 339 | .end __strncpy_from_user | ||
diff --git a/arch/alpha/lib/strrchr.S b/arch/alpha/lib/strrchr.S new file mode 100644 index 00000000000..82cfd0ac907 --- /dev/null +++ b/arch/alpha/lib/strrchr.S | |||
| @@ -0,0 +1,87 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/strrchr.S | ||
| 3 | * Contributed by Richard Henderson (rth@tamu.edu) | ||
| 4 | * | ||
| 5 | * Return the address of the last occurrence of a given character | ||
| 6 | * within a null-terminated string, or null if it is not found. | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include <asm/regdef.h> | ||
| 10 | |||
| 11 | .set noreorder | ||
| 12 | .set noat | ||
| 13 | |||
| 14 | .align 3 | ||
| 15 | .ent strrchr | ||
| 16 | .globl strrchr | ||
| 17 | strrchr: | ||
| 18 | .frame sp, 0, ra | ||
| 19 | .prologue 0 | ||
| 20 | |||
| 21 | zapnot a1, 1, a1 # e0 : zero extend our test character | ||
| 22 | mov zero, t6 # .. e1 : t6 is last match aligned addr | ||
| 23 | sll a1, 8, t5 # e0 : replicate our test character | ||
| 24 | mov zero, t8 # .. e1 : t8 is last match byte compare mask | ||
| 25 | or t5, a1, a1 # e0 : | ||
| 26 | ldq_u t0, 0(a0) # .. e1 : load first quadword | ||
| 27 | sll a1, 16, t5 # e0 : | ||
| 28 | andnot a0, 7, v0 # .. e1 : align source addr | ||
| 29 | or t5, a1, a1 # e0 : | ||
| 30 | lda t4, -1 # .. e1 : build garbage mask | ||
| 31 | sll a1, 32, t5 # e0 : | ||
| 32 | cmpbge zero, t0, t1 # .. e1 : bits set iff byte == zero | ||
| 33 | mskqh t4, a0, t4 # e0 : | ||
| 34 | or t5, a1, a1 # .. e1 : character replication complete | ||
| 35 | xor t0, a1, t2 # e0 : make bytes == c zero | ||
| 36 | cmpbge zero, t4, t4 # .. e1 : bits set iff byte is garbage | ||
| 37 | cmpbge zero, t2, t3 # e0 : bits set iff byte == c | ||
| 38 | andnot t1, t4, t1 # .. e1 : clear garbage from null test | ||
| 39 | andnot t3, t4, t3 # e0 : clear garbage from char test | ||
| 40 | bne t1, $eos # .. e1 : did we already hit the terminator? | ||
| 41 | |||
| 42 | /* Character search main loop */ | ||
| 43 | $loop: | ||
| 44 | ldq t0, 8(v0) # e0 : load next quadword | ||
| 45 | cmovne t3, v0, t6 # .. e1 : save previous comparisons match | ||
| 46 | cmovne t3, t3, t8 # e0 : | ||
| 47 | addq v0, 8, v0 # .. e1 : | ||
| 48 | xor t0, a1, t2 # e0 : | ||
| 49 | cmpbge zero, t0, t1 # .. e1 : bits set iff byte == zero | ||
| 50 | cmpbge zero, t2, t3 # e0 : bits set iff byte == c | ||
| 51 | beq t1, $loop # .. e1 : if we havnt seen a null, loop | ||
| 52 | |||
| 53 | /* Mask out character matches after terminator */ | ||
| 54 | $eos: | ||
| 55 | negq t1, t4 # e0 : isolate first null byte match | ||
| 56 | and t1, t4, t4 # e1 : | ||
| 57 | subq t4, 1, t5 # e0 : build a mask of the bytes upto... | ||
| 58 | or t4, t5, t4 # e1 : ... and including the null | ||
| 59 | |||
| 60 | and t3, t4, t3 # e0 : mask out char matches after null | ||
| 61 | cmovne t3, t3, t8 # .. e1 : save it, if match found | ||
| 62 | cmovne t3, v0, t6 # e0 : | ||
| 63 | |||
| 64 | /* Locate the address of the last matched character */ | ||
| 65 | |||
| 66 | /* Retain the early exit for the ev4 -- the ev5 mispredict penalty | ||
| 67 | is 5 cycles -- the same as just falling through. */ | ||
| 68 | beq t8, $retnull # .. e1 : | ||
| 69 | |||
| 70 | and t8, 0xf0, t2 # e0 : binary search for the high bit set | ||
| 71 | cmovne t2, t2, t8 # .. e1 (zdb) | ||
| 72 | cmovne t2, 4, t2 # e0 : | ||
| 73 | and t8, 0xcc, t1 # .. e1 : | ||
| 74 | cmovne t1, t1, t8 # e0 : | ||
| 75 | cmovne t1, 2, t1 # .. e1 : | ||
| 76 | and t8, 0xaa, t0 # e0 : | ||
| 77 | cmovne t0, 1, t0 # .. e1 (zdb) | ||
| 78 | addq t2, t1, t1 # e0 : | ||
| 79 | addq t6, t0, v0 # .. e1 : add our aligned base ptr to the mix | ||
| 80 | addq v0, t1, v0 # e0 : | ||
| 81 | ret # .. e1 : | ||
| 82 | |||
| 83 | $retnull: | ||
| 84 | mov zero, v0 # e0 : | ||
| 85 | ret # .. e1 : | ||
| 86 | |||
| 87 | .end strrchr | ||
diff --git a/arch/alpha/lib/stxcpy.S b/arch/alpha/lib/stxcpy.S new file mode 100644 index 00000000000..2a8d51bfc05 --- /dev/null +++ b/arch/alpha/lib/stxcpy.S | |||
| @@ -0,0 +1,289 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/stxcpy.S | ||
| 3 | * Contributed by Richard Henderson (rth@tamu.edu) | ||
| 4 | * | ||
| 5 | * Copy a null-terminated string from SRC to DST. | ||
| 6 | * | ||
| 7 | * This is an internal routine used by strcpy, stpcpy, and strcat. | ||
| 8 | * As such, it uses special linkage conventions to make implementation | ||
| 9 | * of these public functions more efficient. | ||
| 10 | * | ||
| 11 | * On input: | ||
| 12 | * t9 = return address | ||
| 13 | * a0 = DST | ||
| 14 | * a1 = SRC | ||
| 15 | * | ||
| 16 | * On output: | ||
| 17 | * t12 = bitmask (with one bit set) indicating the last byte written | ||
| 18 | * a0 = unaligned address of the last *word* written | ||
| 19 | * | ||
| 20 | * Furthermore, v0, a3-a5, t11, and t12 are untouched. | ||
| 21 | */ | ||
| 22 | |||
| 23 | #include <asm/regdef.h> | ||
| 24 | |||
| 25 | .set noat | ||
| 26 | .set noreorder | ||
| 27 | |||
| 28 | .text | ||
| 29 | |||
| 30 | /* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that | ||
| 31 | doesn't like putting the entry point for a procedure somewhere in the | ||
| 32 | middle of the procedure descriptor. Work around this by putting the | ||
| 33 | aligned copy in its own procedure descriptor */ | ||
| 34 | |||
| 35 | .ent stxcpy_aligned | ||
| 36 | .align 3 | ||
| 37 | stxcpy_aligned: | ||
| 38 | .frame sp, 0, t9 | ||
| 39 | .prologue 0 | ||
| 40 | |||
| 41 | /* On entry to this basic block: | ||
| 42 | t0 == the first destination word for masking back in | ||
| 43 | t1 == the first source word. */ | ||
| 44 | |||
| 45 | /* Create the 1st output word and detect 0's in the 1st input word. */ | ||
| 46 | lda t2, -1 # e1 : build a mask against false zero | ||
| 47 | mskqh t2, a1, t2 # e0 : detection in the src word | ||
| 48 | mskqh t1, a1, t3 # e0 : | ||
| 49 | ornot t1, t2, t2 # .. e1 : | ||
| 50 | mskql t0, a1, t0 # e0 : assemble the first output word | ||
| 51 | cmpbge zero, t2, t8 # .. e1 : bits set iff null found | ||
| 52 | or t0, t3, t1 # e0 : | ||
| 53 | bne t8, $a_eos # .. e1 : | ||
| 54 | |||
| 55 | /* On entry to this basic block: | ||
| 56 | t0 == the first destination word for masking back in | ||
| 57 | t1 == a source word not containing a null. */ | ||
| 58 | |||
| 59 | $a_loop: | ||
| 60 | stq_u t1, 0(a0) # e0 : | ||
| 61 | addq a0, 8, a0 # .. e1 : | ||
| 62 | ldq_u t1, 0(a1) # e0 : | ||
| 63 | addq a1, 8, a1 # .. e1 : | ||
| 64 | cmpbge zero, t1, t8 # e0 (stall) | ||
| 65 | beq t8, $a_loop # .. e1 (zdb) | ||
| 66 | |||
| 67 | /* Take care of the final (partial) word store. | ||
| 68 | On entry to this basic block we have: | ||
| 69 | t1 == the source word containing the null | ||
| 70 | t8 == the cmpbge mask that found it. */ | ||
| 71 | $a_eos: | ||
| 72 | negq t8, t6 # e0 : find low bit set | ||
| 73 | and t8, t6, t12 # e1 (stall) | ||
| 74 | |||
| 75 | /* For the sake of the cache, don't read a destination word | ||
| 76 | if we're not going to need it. */ | ||
| 77 | and t12, 0x80, t6 # e0 : | ||
| 78 | bne t6, 1f # .. e1 (zdb) | ||
| 79 | |||
| 80 | /* We're doing a partial word store and so need to combine | ||
| 81 | our source and original destination words. */ | ||
| 82 | ldq_u t0, 0(a0) # e0 : | ||
| 83 | subq t12, 1, t6 # .. e1 : | ||
| 84 | zapnot t1, t6, t1 # e0 : clear src bytes >= null | ||
| 85 | or t12, t6, t8 # .. e1 : | ||
| 86 | zap t0, t8, t0 # e0 : clear dst bytes <= null | ||
| 87 | or t0, t1, t1 # e1 : | ||
| 88 | |||
| 89 | 1: stq_u t1, 0(a0) # e0 : | ||
| 90 | ret (t9) # .. e1 : | ||
| 91 | |||
| 92 | .end stxcpy_aligned | ||
| 93 | |||
| 94 | .align 3 | ||
| 95 | .ent __stxcpy | ||
| 96 | .globl __stxcpy | ||
| 97 | __stxcpy: | ||
| 98 | .frame sp, 0, t9 | ||
| 99 | .prologue 0 | ||
| 100 | |||
| 101 | /* Are source and destination co-aligned? */ | ||
| 102 | xor a0, a1, t0 # e0 : | ||
| 103 | unop # : | ||
| 104 | and t0, 7, t0 # e0 : | ||
| 105 | bne t0, $unaligned # .. e1 : | ||
| 106 | |||
| 107 | /* We are co-aligned; take care of a partial first word. */ | ||
| 108 | ldq_u t1, 0(a1) # e0 : load first src word | ||
| 109 | and a0, 7, t0 # .. e1 : take care not to load a word ... | ||
| 110 | addq a1, 8, a1 # e0 : | ||
| 111 | beq t0, stxcpy_aligned # .. e1 : ... if we wont need it | ||
| 112 | ldq_u t0, 0(a0) # e0 : | ||
| 113 | br stxcpy_aligned # .. e1 : | ||
| 114 | |||
| 115 | |||
| 116 | /* The source and destination are not co-aligned. Align the destination | ||
| 117 | and cope. We have to be very careful about not reading too much and | ||
| 118 | causing a SEGV. */ | ||
| 119 | |||
| 120 | .align 3 | ||
| 121 | $u_head: | ||
| 122 | /* We know just enough now to be able to assemble the first | ||
| 123 | full source word. We can still find a zero at the end of it | ||
| 124 | that prevents us from outputting the whole thing. | ||
| 125 | |||
| 126 | On entry to this basic block: | ||
| 127 | t0 == the first dest word, for masking back in, if needed else 0 | ||
| 128 | t1 == the low bits of the first source word | ||
| 129 | t6 == bytemask that is -1 in dest word bytes */ | ||
| 130 | |||
| 131 | ldq_u t2, 8(a1) # e0 : | ||
| 132 | addq a1, 8, a1 # .. e1 : | ||
| 133 | |||
| 134 | extql t1, a1, t1 # e0 : | ||
| 135 | extqh t2, a1, t4 # e0 : | ||
| 136 | mskql t0, a0, t0 # e0 : | ||
| 137 | or t1, t4, t1 # .. e1 : | ||
| 138 | mskqh t1, a0, t1 # e0 : | ||
| 139 | or t0, t1, t1 # e1 : | ||
| 140 | |||
| 141 | or t1, t6, t6 # e0 : | ||
| 142 | cmpbge zero, t6, t8 # .. e1 : | ||
| 143 | lda t6, -1 # e0 : for masking just below | ||
| 144 | bne t8, $u_final # .. e1 : | ||
| 145 | |||
| 146 | mskql t6, a1, t6 # e0 : mask out the bits we have | ||
| 147 | or t6, t2, t2 # e1 : already extracted before | ||
| 148 | cmpbge zero, t2, t8 # e0 : testing eos | ||
| 149 | bne t8, $u_late_head_exit # .. e1 (zdb) | ||
| 150 | |||
| 151 | /* Finally, we've got all the stupid leading edge cases taken care | ||
| 152 | of and we can set up to enter the main loop. */ | ||
| 153 | |||
| 154 | stq_u t1, 0(a0) # e0 : store first output word | ||
| 155 | addq a0, 8, a0 # .. e1 : | ||
| 156 | extql t2, a1, t0 # e0 : position ho-bits of lo word | ||
| 157 | ldq_u t2, 8(a1) # .. e1 : read next high-order source word | ||
| 158 | addq a1, 8, a1 # e0 : | ||
| 159 | cmpbge zero, t2, t8 # .. e1 : | ||
| 160 | nop # e0 : | ||
| 161 | bne t8, $u_eos # .. e1 : | ||
| 162 | |||
| 163 | /* Unaligned copy main loop. In order to avoid reading too much, | ||
| 164 | the loop is structured to detect zeros in aligned source words. | ||
| 165 | This has, unfortunately, effectively pulled half of a loop | ||
| 166 | iteration out into the head and half into the tail, but it does | ||
| 167 | prevent nastiness from accumulating in the very thing we want | ||
| 168 | to run as fast as possible. | ||
| 169 | |||
| 170 | On entry to this basic block: | ||
| 171 | t0 == the shifted high-order bits from the previous source word | ||
| 172 | t2 == the unshifted current source word | ||
| 173 | |||
| 174 | We further know that t2 does not contain a null terminator. */ | ||
| 175 | |||
| 176 | .align 3 | ||
| 177 | $u_loop: | ||
| 178 | extqh t2, a1, t1 # e0 : extract high bits for current word | ||
| 179 | addq a1, 8, a1 # .. e1 : | ||
| 180 | extql t2, a1, t3 # e0 : extract low bits for next time | ||
| 181 | addq a0, 8, a0 # .. e1 : | ||
| 182 | or t0, t1, t1 # e0 : current dst word now complete | ||
| 183 | ldq_u t2, 0(a1) # .. e1 : load high word for next time | ||
| 184 | stq_u t1, -8(a0) # e0 : save the current word | ||
| 185 | mov t3, t0 # .. e1 : | ||
| 186 | cmpbge zero, t2, t8 # e0 : test new word for eos | ||
| 187 | beq t8, $u_loop # .. e1 : | ||
| 188 | |||
| 189 | /* We've found a zero somewhere in the source word we just read. | ||
| 190 | If it resides in the lower half, we have one (probably partial) | ||
| 191 | word to write out, and if it resides in the upper half, we | ||
| 192 | have one full and one partial word left to write out. | ||
| 193 | |||
| 194 | On entry to this basic block: | ||
| 195 | t0 == the shifted high-order bits from the previous source word | ||
| 196 | t2 == the unshifted current source word. */ | ||
| 197 | $u_eos: | ||
| 198 | extqh t2, a1, t1 # e0 : | ||
| 199 | or t0, t1, t1 # e1 : first (partial) source word complete | ||
| 200 | |||
| 201 | cmpbge zero, t1, t8 # e0 : is the null in this first bit? | ||
| 202 | bne t8, $u_final # .. e1 (zdb) | ||
| 203 | |||
| 204 | $u_late_head_exit: | ||
| 205 | stq_u t1, 0(a0) # e0 : the null was in the high-order bits | ||
| 206 | addq a0, 8, a0 # .. e1 : | ||
| 207 | extql t2, a1, t1 # e0 : | ||
| 208 | cmpbge zero, t1, t8 # .. e1 : | ||
| 209 | |||
| 210 | /* Take care of a final (probably partial) result word. | ||
| 211 | On entry to this basic block: | ||
| 212 | t1 == assembled source word | ||
| 213 | t8 == cmpbge mask that found the null. */ | ||
| 214 | $u_final: | ||
| 215 | negq t8, t6 # e0 : isolate low bit set | ||
| 216 | and t6, t8, t12 # e1 : | ||
| 217 | |||
| 218 | and t12, 0x80, t6 # e0 : avoid dest word load if we can | ||
| 219 | bne t6, 1f # .. e1 (zdb) | ||
| 220 | |||
| 221 | ldq_u t0, 0(a0) # e0 : | ||
| 222 | subq t12, 1, t6 # .. e1 : | ||
| 223 | or t6, t12, t8 # e0 : | ||
| 224 | zapnot t1, t6, t1 # .. e1 : kill source bytes >= null | ||
| 225 | zap t0, t8, t0 # e0 : kill dest bytes <= null | ||
| 226 | or t0, t1, t1 # e1 : | ||
| 227 | |||
| 228 | 1: stq_u t1, 0(a0) # e0 : | ||
| 229 | ret (t9) # .. e1 : | ||
| 230 | |||
| 231 | /* Unaligned copy entry point. */ | ||
| 232 | .align 3 | ||
| 233 | $unaligned: | ||
| 234 | |||
| 235 | ldq_u t1, 0(a1) # e0 : load first source word | ||
| 236 | |||
| 237 | and a0, 7, t4 # .. e1 : find dest misalignment | ||
| 238 | and a1, 7, t5 # e0 : find src misalignment | ||
| 239 | |||
| 240 | /* Conditionally load the first destination word and a bytemask | ||
| 241 | with 0xff indicating that the destination byte is sacrosanct. */ | ||
| 242 | |||
| 243 | mov zero, t0 # .. e1 : | ||
| 244 | mov zero, t6 # e0 : | ||
| 245 | beq t4, 1f # .. e1 : | ||
| 246 | ldq_u t0, 0(a0) # e0 : | ||
| 247 | lda t6, -1 # .. e1 : | ||
| 248 | mskql t6, a0, t6 # e0 : | ||
| 249 | 1: | ||
| 250 | subq a1, t4, a1 # .. e1 : sub dest misalignment from src addr | ||
| 251 | |||
| 252 | /* If source misalignment is larger than dest misalignment, we need | ||
| 253 | extra startup checks to avoid SEGV. */ | ||
| 254 | |||
| 255 | cmplt t4, t5, t12 # e0 : | ||
| 256 | beq t12, $u_head # .. e1 (zdb) | ||
| 257 | |||
| 258 | lda t2, -1 # e1 : mask out leading garbage in source | ||
| 259 | mskqh t2, t5, t2 # e0 : | ||
| 260 | nop # e0 : | ||
| 261 | ornot t1, t2, t3 # .. e1 : | ||
| 262 | cmpbge zero, t3, t8 # e0 : is there a zero? | ||
| 263 | beq t8, $u_head # .. e1 (zdb) | ||
| 264 | |||
| 265 | /* At this point we've found a zero in the first partial word of | ||
| 266 | the source. We need to isolate the valid source data and mask | ||
| 267 | it into the original destination data. (Incidentally, we know | ||
| 268 | that we'll need at least one byte of that original dest word.) */ | ||
| 269 | |||
| 270 | ldq_u t0, 0(a0) # e0 : | ||
| 271 | |||
| 272 | negq t8, t6 # .. e1 : build bitmask of bytes <= zero | ||
| 273 | and t6, t8, t12 # e0 : | ||
| 274 | and a1, 7, t5 # .. e1 : | ||
| 275 | subq t12, 1, t6 # e0 : | ||
| 276 | or t6, t12, t8 # e1 : | ||
| 277 | srl t12, t5, t12 # e0 : adjust final null return value | ||
| 278 | |||
| 279 | zapnot t2, t8, t2 # .. e1 : prepare source word; mirror changes | ||
| 280 | and t1, t2, t1 # e1 : to source validity mask | ||
| 281 | extql t2, a1, t2 # .. e0 : | ||
| 282 | extql t1, a1, t1 # e0 : | ||
| 283 | |||
| 284 | andnot t0, t2, t0 # .. e1 : zero place for source to reside | ||
| 285 | or t0, t1, t1 # e1 : and put it there | ||
| 286 | stq_u t1, 0(a0) # .. e0 : | ||
| 287 | ret (t9) # e1 : | ||
| 288 | |||
| 289 | .end __stxcpy | ||
diff --git a/arch/alpha/lib/stxncpy.S b/arch/alpha/lib/stxncpy.S new file mode 100644 index 00000000000..da1a72740d2 --- /dev/null +++ b/arch/alpha/lib/stxncpy.S | |||
| @@ -0,0 +1,345 @@ | |||
| 1 | /* | ||
| 2 | * arch/alpha/lib/stxncpy.S | ||
| 3 | * Contributed by Richard Henderson (rth@tamu.edu) | ||
| 4 | * | ||
| 5 | * Copy no more than COUNT bytes of the null-terminated string from | ||
| 6 | * SRC to DST. | ||
| 7 | * | ||
| 8 | * This is an internal routine used by strncpy, stpncpy, and strncat. | ||
| 9 | * As such, it uses special linkage conventions to make implementation | ||
| 10 | * of these public functions more efficient. | ||
| 11 | * | ||
| 12 | * On input: | ||
| 13 | * t9 = return address | ||
| 14 | * a0 = DST | ||
| 15 | * a1 = SRC | ||
| 16 | * a2 = COUNT | ||
| 17 | * | ||
| 18 | * Furthermore, COUNT may not be zero. | ||
| 19 | * | ||
| 20 | * On output: | ||
| 21 | * t0 = last word written | ||
| 22 | * t10 = bitmask (with one bit set) indicating the byte position of | ||
| 23 | * the end of the range specified by COUNT | ||
| 24 | * t12 = bitmask (with one bit set) indicating the last byte written | ||
| 25 | * a0 = unaligned address of the last *word* written | ||
| 26 | * a2 = the number of full words left in COUNT | ||
| 27 | * | ||
| 28 | * Furthermore, v0, a3-a5, t11, and $at are untouched. | ||
| 29 | */ | ||
| 30 | |||
| 31 | #include <asm/regdef.h> | ||
| 32 | |||
| 33 | .set noat | ||
| 34 | .set noreorder | ||
| 35 | |||
| 36 | .text | ||
| 37 | |||
| 38 | /* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that | ||
| 39 | doesn't like putting the entry point for a procedure somewhere in the | ||
| 40 | middle of the procedure descriptor. Work around this by putting the | ||
| 41 | aligned copy in its own procedure descriptor */ | ||
| 42 | |||
	/* Co-aligned copy helper.  Entered from __stxncpy when source and
	   destination share the same misalignment within a quadword.

	   Register conventions visible in this code:
	     a0  == destination pointer (quadword-aligned accesses via stq_u)
	     a1  == source pointer (quadword-aligned accesses via ldq_u)
	     a2  == remaining full-word count; zero means last word
	     t9  == return address (see .frame / ret below)
	     t10 == single-bit byte mask marking the end-of-count byte,
	            prepared by __stxncpy before branching here.
	   The "# e0 / e1" annotations are issue-slot scheduling notes for
	   the dual-issue EV5 pipeline, not executable code.  */
	.ent stxncpy_aligned
	.align 3
stxncpy_aligned:
	.frame sp, 0, t9, 0
	.prologue 0

	/* On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == the first source word.  */

	/* Create the 1st output word and detect 0's in the 1st input word.
	   The mskqh of an all-ones word builds a mask that prevents bytes
	   before the start offset from being mistaken for a terminator.  */
	lda	t2, -1		# e1 : build a mask against false zero
	mskqh	t2, a1, t2	# e0 :   detection in the src word
	mskqh	t1, a1, t3	# e0 :
	ornot	t1, t2, t2	# .. e1 :
	mskql	t0, a1, t0	# e0 : assemble the first output word
	cmpbge	zero, t2, t8	# .. e1 : bits set iff null found
	or	t0, t3, t0	# e0 :
	beq	a2, $a_eoc	# .. e1 :
	bne	t8, $a_eos	# .. e1 :

	/* On entry to this basic block:
	   t0 == a source word not containing a null.  */

	/* Main loop: store the previous word, load and test the next.  */
$a_loop:
	stq_u	t0, 0(a0)	# e0 :
	addq	a0, 8, a0	# .. e1 :
	ldq_u	t0, 0(a1)	# e0 :
	addq	a1, 8, a1	# .. e1 :
	subq	a2, 1, a2	# e0 :
	cmpbge	zero, t0, t8	# .. e1 (stall)
	beq	a2, $a_eoc	# e1 :
	beq	t8, $a_loop	# e1 :

	/* Take care of the final (partial) word store.  At this point
	   the end-of-count bit is set in t8 iff it applies.

	   On entry to this basic block we have:
	   t0 == the source word containing the null
	   t8 == the cmpbge mask that found it.  */

$a_eos:
	negq	t8, t12		# e0 : find low bit set
	and	t8, t12, t12	# e1 (stall)

	/* For the sake of the cache, don't read a destination word
	   if we're not going to need it.  Bit 0x80 in t12 means the
	   terminator lands in the last byte, i.e. a full-word store.  */
	and	t12, 0x80, t6	# e0 :
	bne	t6, 1f		# .. e1 (zdb)

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t1, 0(a0)	# e0 :
	subq	t12, 1, t6	# .. e1 :
	or	t12, t6, t8	# e0 :
	unop			#
	zapnot	t0, t8, t0	# e0 : clear src bytes > null
	zap	t1, t8, t1	# .. e1 : clear dst bytes <= null
	or	t0, t1, t0	# e1 :

1:	stq_u	t0, 0(a0)	# e0 :
	ret	(t9)		# e1 :

	/* Add the end-of-count bit to the eos detection bitmask so the
	   final-store code treats running out of count like finding a
	   null terminator.  */
$a_eoc:
	or	t10, t8, t8
	br	$a_eos

	.end stxncpy_aligned
| 112 | |||
/* __stxncpy main entry point.
	   Visible calling convention (from the code below):
	     a0 == destination pointer, a1 == source pointer,
	     a2 == byte count (biased and converted to a word count here),
	     t9 == return address.
	   Dispatches to stxncpy_aligned when src/dst share alignment,
	   otherwise falls through to the unaligned machinery.  */
	.align 3
	.ent __stxncpy
	.globl __stxncpy
__stxncpy:
	.frame sp, 0, t9, 0
	.prologue 0

	/* Are source and destination co-aligned?  */
	xor	a0, a1, t1	# e0 :
	and	a0, 7, t0	# .. e1 : find dest misalignment
	and	t1, 7, t1	# e0 :
	addq	a2, t0, a2	# .. e1 : bias count by dest misalignment
	subq	a2, 1, a2	# e0 :
	and	a2, 7, t2	# e1 :
	srl	a2, 3, a2	# e0 : a2 = loop counter = (count - 1)/8
	addq	zero, 1, t10	# .. e1 :
	sll	t10, t2, t10	# e0 : t10 = bitmask of last count byte
	bne	t1, $unaligned	# .. e1 :

	/* We are co-aligned; take care of a partial first word.  */

	ldq_u	t1, 0(a1)	# e0 : load first src word
	addq	a1, 8, a1	# .. e1 :

	beq	t0, stxncpy_aligned	# avoid loading dest word if not needed
	ldq_u	t0, 0(a0)	# e0 :
	br	stxncpy_aligned	# .. e1 :


	/* The source and destination are not co-aligned.  Align the destination
	   and cope.  We have to be very careful about not reading too much and
	   causing a SEGV.  */

	.align 3
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, unmasked
	   t1 == the shifted low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes */

	ldq_u	t2, 8(a1)	# e0 : load second src word
	addq	a1, 8, a1	# .. e1 :
	mskql	t0, a0, t0	# e0 : mask trailing garbage in dst
	extqh	t2, a1, t4	# e0 :
	or	t1, t4, t1	# e1 : first aligned src word complete
	mskqh	t1, a0, t1	# e0 : mask leading garbage in src
	or	t0, t1, t0	# e0 : first output word complete
	or	t0, t6, t6	# e1 : mask original data for zero test
	cmpbge	zero, t6, t8	# e0 :
	beq	a2, $u_eocfin	# .. e1 :
	lda	t6, -1		# e0 :
	bne	t8, $u_final	# .. e1 :

	mskql	t6, a1, t6		# e0 : mask out bits already seen
	nop				# .. e1 :
	stq_u	t0, 0(a0)		# e0 : store first output word
	or	t6, t2, t2		# .. e1 :
	cmpbge	zero, t2, t8		# e0 : find nulls in second partial
	addq	a0, 8, a0		# .. e1 :
	subq	a2, 1, a2		# e0 :
	bne	t8, $u_late_head_exit	# .. e1 :

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */

	extql	t2, a1, t1	# e0 : position hi-bits of lo word
	beq	a2, $u_eoc	# .. e1 :
	ldq_u	t2, 8(a1)	# e0 : read next high-order source word
	addq	a1, 8, a1	# .. e1 :
	extqh	t2, a1, t0	# e0 : position lo-bits of hi word (stall)
	cmpbge	zero, t2, t8	# .. e1 :
	nop			# e0 :
	bne	t8, $u_eos	# .. e1 :

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t0 == the shifted low-order bits from the current source word
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word

	   We further know that t2 does not contain a null terminator.  */

	.align 3
$u_loop:
	or	t0, t1, t0	# e0 : current dst word now complete
	subq	a2, 1, a2	# .. e1 : decrement word count
	stq_u	t0, 0(a0)	# e0 : save the current word
	addq	a0, 8, a0	# .. e1 :
	extql	t2, a1, t1	# e0 : extract high bits for next time
	beq	a2, $u_eoc	# .. e1 :
	ldq_u	t2, 8(a1)	# e0 : load high word for next time
	addq	a1, 8, a1	# .. e1 :
	nop			# e0 :
	cmpbge	zero, t2, t8	# e1 : test new word for eos (stall)
	extqh	t2, a1, t0	# e0 : extract low bits for current word
	beq	t8, $u_loop	# .. e1 :

	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.

	   On entry to this basic block:
	   t0 == the shifted low-order bits from the current source word
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.  */
$u_eos:
	or	t0, t1, t0	# e0 : first (partial) source word complete
	nop			# .. e1 :
	cmpbge	zero, t0, t8	# e0 : is the null in this first bit?
	bne	t8, $u_final	# .. e1 (zdb)

	stq_u	t0, 0(a0)	# e0 : the null was in the high-order bits
	addq	a0, 8, a0	# .. e1 :
	subq	a2, 1, a2	# e1 :

$u_late_head_exit:
	extql	t2, a1, t0	# .. e0 :
	cmpbge	zero, t0, t8	# e0 :
	or	t8, t10, t6	# e1 :
	cmoveq	a2, t6, t8	# e0 :
	nop			# .. e1 :

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t0 == assembled source word
	   t8 == cmpbge mask that found the null.  */
$u_final:
	negq	t8, t6		# e0 : isolate low bit set
	and	t6, t8, t12	# e1 :

	and	t12, 0x80, t6	# e0 : avoid dest word load if we can
	bne	t6, 1f		# .. e1 (zdb)

	ldq_u	t1, 0(a0)	# e0 :
	subq	t12, 1, t6	# .. e1 :
	or	t6, t12, t8	# e0 :
	zapnot	t0, t8, t0	# .. e1 : kill source bytes > null
	zap	t1, t8, t1	# e0 : kill dest bytes <= null
	or	t0, t1, t0	# e1 :

1:	stq_u	t0, 0(a0)	# e0 :
	ret	(t9)		# .. e1 :

	/* Got to end-of-count before end of string.
	   On entry to this basic block:
	   t1 == the shifted high-order bits from the previous source word.
	   Only read the final source word if the remaining count mask
	   shows we actually need bytes from it (avoids a SEGV past the
	   end of the source buffer).  */
$u_eoc:
	and	a1, 7, t6	# e1 :
	sll	t10, t6, t6	# e0 :
	and	t6, 0xff, t6	# e0 :
	bne	t6, 1f		# .. e1 :

	ldq_u	t2, 8(a1)	# e0 : load final src word
	nop			# .. e1 :
	extqh	t2, a1, t0	# e0 : extract low bits for last word
	or	t1, t0, t1	# e1 :

1:	cmpbge	zero, t1, t8
	mov	t1, t0

$u_eocfin:			# end-of-count, final word
	or	t10, t8, t8
	br	$u_final

	/* Unaligned copy entry point.  */
	.align 3
$unaligned:

	ldq_u	t1, 0(a1)	# e0 : load first source word

	and	a0, 7, t4	# .. e1 : find dest misalignment
	and	a1, 7, t5	# e0 : find src misalignment

	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */

	mov	zero, t0	# .. e1 :
	mov	zero, t6	# e0 :
	beq	t4, 1f		# .. e1 :
	ldq_u	t0, 0(a0)	# e0 :
	lda	t6, -1		# .. e1 :
	mskql	t6, a0, t6	# e0 :
	subq	a1, t4, a1	# .. e1 : sub dest misalignment from src addr

	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */

1:	cmplt	t4, t5, t12	# e1 :
	extql	t1, a1, t1	# .. e0 : shift src into place
	lda	t2, -1		# e0 : for creating masks later
	beq	t12, $u_head	# .. e1 :

	extql	t2, a1, t2	# e0 :
	cmpbge	zero, t1, t8	# .. e1 : is there a zero?
	andnot	t2, t6, t12	# e0 : dest mask for a single word copy
	or	t8, t10, t5	# .. e1 : test for end-of-count too
	cmpbge	zero, t12, t3	# e0 :
	cmoveq	a2, t5, t8	# .. e1 :
	andnot	t8, t3, t8	# e0 :
	beq	t8, $u_head	# .. e1 (zdb)

	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.) */

	ldq_u	t0, 0(a0)	# e0 :
	negq	t8, t6		# .. e1 : build bitmask of bytes <= zero
	mskqh	t1, t4, t1	# e0 :
	and	t6, t8, t2	# .. e1 :
	subq	t2, 1, t6	# e0 :
	or	t6, t2, t8	# e1 :

	zapnot	t12, t8, t12	# e0 : prepare source word; mirror changes
	zapnot	t1, t8, t1	# .. e1 : to source validity mask

	andnot	t0, t12, t0	# e0 : zero place for source to reside
	or	t0, t1, t0	# e1 : and put it there
	stq_u	t0, 0(a0)	# e0 :
	ret	(t9)		# .. e1 :

	.end __stxncpy
diff --git a/arch/alpha/lib/udelay.c b/arch/alpha/lib/udelay.c new file mode 100644 index 00000000000..1c879bbce41 --- /dev/null +++ b/arch/alpha/lib/udelay.c | |||
| @@ -0,0 +1,55 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 1993, 2000 Linus Torvalds | ||
| 3 | * | ||
| 4 | * Delay routines, using a pre-computed "loops_per_jiffy" value. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include <linux/config.h> | ||
| 8 | #include <linux/module.h> | ||
| 9 | #include <linux/sched.h> /* for udelay's use of smp_processor_id */ | ||
| 10 | #include <asm/param.h> | ||
| 11 | #include <asm/smp.h> | ||
| 12 | #include <linux/delay.h> | ||
| 13 | |||
| 14 | /* | ||
| 15 | * Use only for very small delays (< 1 msec). | ||
| 16 | * | ||
| 17 | * The active part of our cycle counter is only 32-bits wide, and | ||
| 18 | * we're treating the difference between two marks as signed. On | ||
| 19 | * a 1GHz box, that's about 2 seconds. | ||
| 20 | */ | ||
| 21 | |||
/*
 * Spin for approximately LOOPS cycles of the processor cycle counter.
 *
 * Reads the cycle counter (rpcc), adds LOOPS to form a target value,
 * then re-reads and subtracts until the difference goes non-positive.
 * The count is a signed 32-bit quantity (addl/subl operate on the low
 * 32 bits), which is why the file-level comment restricts callers to
 * very small delays.
 */
void
__delay(int loops)
{
	int tmp;
	__asm__ __volatile__(
		"	rpcc %0\n"
		"	addl %1,%0,%1\n"
		"1:	rpcc %0\n"
		"	subl %1,%0,%0\n"
		"	bgt %0,1b"
		: "=&r" (tmp), "=r" (loops) : "1"(loops));
}
| 34 | |||
/*
 * Calibrated loops-per-jiffy for the current processor: per-CPU on
 * SMP (CPUs may run at different speeds), the global value otherwise.
 */
#ifdef CONFIG_SMP
#define LPJ	 cpu_data[smp_processor_id()].loops_per_jiffy
#else
#define LPJ	 loops_per_jiffy
#endif
| 40 | |||
| 41 | void | ||
| 42 | udelay(unsigned long usecs) | ||
| 43 | { | ||
| 44 | usecs *= (((unsigned long)HZ << 32) / 1000000) * LPJ; | ||
| 45 | __delay((long)usecs >> 32); | ||
| 46 | } | ||
| 47 | EXPORT_SYMBOL(udelay); | ||
| 48 | |||
| 49 | void | ||
| 50 | ndelay(unsigned long nsecs) | ||
| 51 | { | ||
| 52 | nsecs *= (((unsigned long)HZ << 32) / 1000000000) * LPJ; | ||
| 53 | __delay((long)nsecs >> 32); | ||
| 54 | } | ||
| 55 | EXPORT_SYMBOL(ndelay); | ||
