1 files changed, 424 insertions, 0 deletions
diff --git a/arch/alpha/lib/ev6-strncpy_from_user.S b/arch/alpha/lib/ev6-strncpy_from_user.S
new file mode 100644
index 000000000000..d2e28178cacc
--- /dev/null
+++ b/arch/alpha/lib/ev6-strncpy_from_user.S
@@ -0,0 +1,424 @@
+/*
+ * arch/alpha/lib/ev6-strncpy_from_user.S
+ * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
+ *
+ * Just like strncpy except in the return value:
+ *
+ * -EFAULT       if an exception occurs before the terminator is copied.
+ * N             if the buffer filled.
+ *
+ * Otherwise the length of the string is returned.
+ *
+ * Much of the information about 21264 scheduling/coding comes from:
+ *      Compiler Writer's Guide for the Alpha 21264
+ *      abbreviated as 'CWG' in other comments here
+ *      ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
+ * Scheduling notation:
+ *      E       - either cluster
+ *      U       - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
+ *      L       - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
+ * A bunch of instructions got moved and temp registers were changed
+ * to aid in scheduling.  Control flow was also re-arranged to eliminate
+ * branches, and to provide longer code sequences to enable better scheduling.
+ * Reworking the start & tail sequences to use byte load/stores
+ * is desirable, but very difficult to do without a from-scratch rewrite.
+ * Save that for the future.
+ */
+#include <asm/errno.h>
+#include <asm/regdef.h>
+/*
+ * EX(insn): allow an exception for an insn; exit if we get one.
+ *
+ * The instruction is emitted at local label 99 and registered in the
+ * __ex_table section.  Each table entry holds the self-relative offset
+ * of the instruction (99b - .) followed by an lda whose displacement is
+ * the distance from the instruction to $exception and whose register
+ * fields are $31 and $0.
+ * NOTE(review): presumably the fault fixup code decodes that lda to
+ * resume at $exception with the error code placed in $0 (v0); confirm
+ * against the Alpha exception-table handling.
+ */
+#define EX(x,y...)                      \
+        99: x,##y;                      \
+        .section __ex_table,"a";        \
+        .long 99b - .;                  \
+        lda $31, $exception-99b($0);    \
+        .previous
+        .set noat
+        .set noreorder
+        .text
+        .globl __strncpy_from_user
+        .ent __strncpy_from_user
+        .frame $30, 0, $26
+        .prologue 0
+        .align 4
+__strncpy_from_user:
+        and     a0, 7, t3       # E : t3 = dest misalignment (a0 mod 8)
+        beq     a2, $zerolength # U : zero count: return 0 without touching memory
+        /*
+         * Register roles, as established by the code that follows:
+         * a0 = destination, a1 = source (read through EX, so those loads
+         * may fault), a2 = byte count, v0 = return value.
+         *
+         * Are source and destination co-aligned?
+         */
+        mov     a0, v0          # E : save the string start for the length calc
+        xor     a0, a1, t4      # E : low 3 bits differ iff not co-aligned
+        EX( ldq_u t1, 0(a1) )   # L : Latency=3 load first quadword
+        ldq_u   t0, 0(a0)       # L : load first (partial) aligned dest quadword
+        addq    a2, t3, a2      # E : bias count by dest misalignment
+        subq    a2, 1, a3       # E : a3 = biased count - 1
+        addq    zero, 1, t10    # E : t10 = 1, shifted into place below
+        and     t4, 7, t4       # E : misalignment between the two
+        and     a3, 7, t6       # E : byte index of the last count byte in its quad
+        sll     t10, t6, t10    # E : t10 = bitmask of last count byte
+        bne     t4, $unaligned  # U : misalignments differ: take the unaligned path
+        lda     t2, -1          # E : build a mask against false zero
+        /*
+         * We are co-aligned; take care of a partial first word.
+         * On entry to this basic block:
+         * t0 == the first destination word for masking back in
+         * t1 == the first source word.
+         * t2 == -1, about to become the valid-byte mask for the source word.
+         */
+        srl     a3, 3, a2       # E : a2 = loop counter = (count - 1)/8
+        addq    a1, 8, a1       # E : advance src; its low 3 bits are unchanged
+        mskqh   t2, a1, t2      # U :   detection in the src word
+        nop
+        /* Create the 1st output word and detect 0's in the 1st input word.  */
+        mskqh   t1, a1, t3      # U : t3 = src bytes at or above the start offset
+        mskql   t0, a1, t0      # U : assemble the first output word
+        ornot   t1, t2, t2      # E : force the bytes before the start non-zero
+        nop
+        cmpbge  zero, t2, t8    # E : bits set iff null found
+        or      t0, t3, t0      # E : t0 = kept dest bytes | src bytes
+        beq     a2, $a_eoc      # U : count ends inside this first word
+        bne     t8, $a_eos      # U : 2nd branch in a quad.  Bad.
+        /* On entry to this basic block:
+         * t0 == a source quad not containing a null.
+         * a0 - current aligned destination address
+         * a1 - current aligned source address
+         * a2 - count of quadwords to move.
+         * NOTE: Loop improvement - unrolling this is going to be
+         *      a huge win, since we're going to stall otherwise.
+         *      Fix this later.  For _really_ large copies, look
+         *      at using wh64 on a look-ahead basis.  See the code
+         *      in clear_user.S and copy_user.S.
+         * The buffers at (a0) and (a1) are presumed not to overlap
+         * (by C definition).
+         * Lots of nops here:
+         *      - Separate loads from stores
+         *      - Keep it to 1 branch/quadpack so the branch predictor
+         *        can train.
+         */
+$a_loop:
+        stq_u   t0, 0(a0)       # L : store the null-free word
+        addq    a0, 8, a0       # E : advance destination
+        nop
+        subq    a2, 1, a2       # E : one quadword fewer to move
+        EX( ldq_u t0, 0(a1) )   # L : load the next source word (may fault)
+        addq    a1, 8, a1       # E : advance source
+        cmpbge  zero, t0, t8    # E : Stall 2 cycles on t0
+        beq     a2, $a_eoc      # U : count exhausted: add the end-of-count bit
+        beq     t8, $a_loop     # U : no null in this word: keep copying
+        nop
+        nop
+        nop
+        /* Take care of the final (partial) word store.  At this point
+         * the end-of-count bit is set in t8 iff it applies.
+         *
+         * On entry to this basic block we have:
+         * t0 == the source word containing the null
+         * t8 == the cmpbge mask that found it.
+         *
+         * On exit to $finish_up:
+         * t0 == the merged word that was stored
+         * t12 == the single mask bit of the last byte written.
+         */
+$a_eos:
+        negq    t8, t12         # E : find low bit set
+        and     t8, t12, t12    # E : t12 = lowest set bit of t8
+        /* We're doing a partial word store and so need to combine
+           our source and original destination words.  */
+        ldq_u   t1, 0(a0)       # L : t1 = current destination word
+        subq    t12, 1, t6      # E : t6 = mask bits below the stop byte
+        or      t12, t6, t8     # E : t8 = bytes up to and including the stop byte
+        zapnot  t0, t8, t0      # U : clear src bytes > null
+        zap     t1, t8, t1      # U : clear dst bytes <= null
+        or      t0, t1, t0      # E : merge source and destination bytes
+        stq_u   t0, 0(a0)       # L : write the final word
+        br      $finish_up      # L0 : go compute the return value
+        nop
+        nop
+        /* Add the end-of-count bit to the eos detection bitmask, so that
+           the last count byte stops the copy like a terminator would.  */
+        .align 4
+$a_eoc:
+        or      t10, t8, t8     # t8 = null mask | end-of-count bit
+        br      $a_eos          # finish via the partial-word store
+        nop
+        nop
+/* The source and destination are not co-aligned.  Align the destination
+   and cope.  We have to be very careful about not reading too much and
+   causing a SEGV.  */
+        .align 4
+$u_head:
+        /* We know just enough now to be able to assemble the first
+           full source word.  We can still find a zero at the end of it
+           that prevents us from outputting the whole thing.
+           On entry to this basic block:
+           t0 == the first dest word, unmasked
+           t1 == the shifted low bits of the first source word
+           t6 == bytemask that is -1 in dest word bytes
+           a1 == source address minus the dest misalignment
+           NOTE(review): when the dest misalignment exceeds the source
+           misalignment, a1 lies below the first source quadword, so the
+           load from 8(a1) appears to fetch that same quadword again
+           rather than a second one; confirm.  */
+        EX( ldq_u t2, 8(a1) )   # L : load second src word
+        addq    a1, 8, a1       # E : a1 now refers to the word just loaded
+        mskql   t0, a0, t0      # U : mask trailing garbage in dst
+        extqh   t2, a1, t4      # U : shift the new word's low bytes up into place
+        or      t1, t4, t1      # E : first aligned src word complete
+        mskqh   t1, a0, t1      # U : mask leading garbage in src
+        or      t0, t1, t0      # E : first output word complete
+        or      t0, t6, t6      # E : mask original data for zero test
+        cmpbge  zero, t6, t8    # E : t8 = zero bytes among the copied bytes
+        beq     a2, $u_eocfin   # U : count ends within this first word
+        bne     t8, $u_final    # U : bad news - 2nd branch in a quad
+        lda     t6, -1          # E : mask out the bits we have
+        mskql   t6, a1, t6      # U :   already seen
+        stq_u   t0, 0(a0)       # L : store first output word
+        or      t6, t2, t2      # E : force the already-used bytes non-zero
+        cmpbge  zero, t2, t8    # E : find nulls in second partial
+        addq    a0, 8, a0               # E : advance destination
+        subq    a2, 1, a2               # E : one word fewer to move
+        bne     t8, $u_late_head_exit   # U : null in the rest of this source word
+        nop
+        /* Finally, we've got all the stupid leading edge cases taken care
+           of and we can set up to enter the main loop.  */
+        extql   t2, a1, t1      # U : position hi-bits of lo word
+        EX( ldq_u t2, 8(a1) )   # L : read next high-order source word
+        addq    a1, 8, a1       # E : a1 now refers to the word just loaded
+        cmpbge  zero, t2, t8    # E : any null in the new source word?
+        beq     a2, $u_eoc      # U : count exhausted
+        bne     t8, $u_eos      # U : null found before the loop starts
+        nop
+        nop
+        /* Unaligned copy main loop.  In order to avoid reading too much,
+           the loop is structured to detect zeros in aligned source words.
+           This has, unfortunately, effectively pulled half of a loop
+           iteration out into the head and half into the tail, but it does
+           prevent nastiness from accumulating in the very thing we want
+           to run as fast as possible.
+           On entry to this basic block:
+           t1 == the shifted high-order bits from the previous source word
+           t2 == the unshifted current source word
+           a2 == number of words still to move (non-zero)
+           We further know that t2 does not contain a null terminator.  */
+        /*
+         * Extra nops here:
+         *      separate load quads from store quads
+         *      only one branch/quad to permit predictor training
+         */
+        .align 4
+$u_loop:
+        extqh   t2, a1, t0      # U : extract high bits for current word
+        addq    a1, 8, a1       # E : advance source
+        extql   t2, a1, t3      # U : extract low bits for next time
+        addq    a0, 8, a0       # E : advance destination (store below uses -8)
+        or      t0, t1, t0      # E : current dst word now complete
+        EX( ldq_u t2, 0(a1) )   # L : load high word for next time
+        subq    a2, 1, a2       # E : one word fewer to move
+        nop
+        stq_u   t0, -8(a0)      # L : save the current word
+        mov     t3, t1          # E : carry the low bits into the next iteration
+        cmpbge  zero, t2, t8    # E : test new word for eos
+        beq     a2, $u_eoc      # U : count exhausted
+        beq     t8, $u_loop     # U : no null yet: keep going
+        nop
+        nop
+        nop
+        /* We've found a zero somewhere in the source word we just read.
+           If it resides in the lower half, we have one (probably partial)
+           word to write out, and if it resides in the upper half, we
+           have one full and one partial word left to write out.
+           On entry to this basic block:
+           t1 == the shifted high-order bits from the previous source word
+           t2 == the unshifted current source word.
+           Control falls through $u_late_head_exit into $u_final.  */
+        .align 4
+$u_eos:
+        extqh   t2, a1, t0      # U : shift the new word's low bytes up into place
+        or      t0, t1, t0      # E : first (partial) source word complete
+        cmpbge  zero, t0, t8    # E : is the null in this first bit?
+        nop
+        bne     t8, $u_final    # U : yes: one partial word left to write
+        stq_u   t0, 0(a0)       # L : the null was in the high-order bits
+        addq    a0, 8, a0       # E : advance destination
+        subq    a2, 1, a2       # E : one word fewer to move
+        .align 4
+$u_late_head_exit:
+        extql   t2, a1, t0      # U : t0 = remaining high bytes, shifted down
+        cmpbge  zero, t0, t8    # E : locate the zero bytes in them
+        or      t8, t10, t6     # E : same mask plus the end-of-count bit
+        cmoveq  a2, t6, t8      # E : use that one when the count runs out here
+        /* Take care of a final (probably partial) result word.
+           On entry to this basic block:
+           t0 == assembled source word
+           t8 == cmpbge mask that found the null.
+           On exit to $finish_up:
+           t0 == the merged word that was stored
+           t12 == the single mask bit of the last byte written.  */
+        .align 4
+$u_final:
+        negq    t8, t6          # E : isolate low bit set
+        and     t6, t8, t12     # E : t12 = lowest set bit of t8
+        ldq_u   t1, 0(a0)       # L : t1 = current destination word
+        subq    t12, 1, t6      # E : t6 = mask bits below the stop byte
+        or      t6, t12, t8     # E : t8 = bytes up to and including the stop byte
+        zapnot  t0, t8, t0      # U : kill source bytes > null
+        zap     t1, t8, t1      # U : kill dest bytes <= null
+        or      t0, t1, t0      # E : merge source and destination bytes
+        stq_u   t0, 0(a0)       # L : write the final word
+        br      $finish_up      # U : go compute the return value
+        nop
+        nop
+        .align 4
+$u_eoc:                         # end-of-count
+        extqh   t2, a1, t0      # U : shift the new word's low bytes up into place
+        or      t0, t1, t0      # E : last output word assembled
+        cmpbge  zero, t0, t8    # E : any null before the count ends?
+        nop
+        .align 4
+$u_eocfin:                      # end-of-count, final word
+        or      t10, t8, t8     # E : add the end-of-count bit to the mask
+        br      $u_final        # U : write the final (partial) word
+        nop
+        nop
+        /* Unaligned copy entry point.
+           On entry (all set up at function entry):
+           t1 == the first source quadword
+           a3 == biased count - 1
+           t10 == bitmask of the last count byte.  */
+        .align 4
+$unaligned:
+        srl     a3, 3, a2       # U : a2 = loop counter = (count - 1)/8
+        and     a0, 7, t4       # E : find dest misalignment
+        and     a1, 7, t5       # E : find src misalignment
+        mov     zero, t0        # E : default: no dest bytes to preserve
+        /* Conditionally load the first destination word and a bytemask
+           with 0xff indicating that the destination byte is sacrosanct.  */
+        mov     zero, t6        # E : default: empty sacrosanct-byte mask
+        beq     t4, 1f          # U : dest is aligned: keep the defaults
+        ldq_u   t0, 0(a0)       # L : t0 = first destination word
+        lda     t6, -1          # E : all ones, trimmed by the mskql below
+        mskql   t6, a0, t6      # U : keep 0xff only below the dest offset
+        nop
+        nop
+        nop
+        .align 4
+1:
+        subq    a1, t4, a1      # E : sub dest misalignment from src addr
+        /* If source misalignment is larger than dest misalignment, we need
+           extra startup checks to avoid SEGV.  */
+        cmplt   t4, t5, t12     # E : t12 = (dest misalign < src misalign)
+        extql   t1, a1, t1      # U : shift src into place
+        lda     t2, -1          # E : for creating masks later
+        beq     t12, $u_head    # U : no extra checks needed
+        mskqh   t2, t5, t2      # U : begin src byte validity mask
+        cmpbge  zero, t1, t8    # E : is there a zero?
+        nop
+        extql   t2, a1, t2      # U : shift the validity mask like the data
+        or      t8, t10, t5     # E : test for end-of-count too
+        cmpbge  zero, t2, t3    # E : t3 = byte positions that are not valid
+        cmoveq  a2, t5, t8      # E : Latency=2, extra map slot
+        nop                     # E : goes with cmov
+        andnot  t8, t3, t8      # E : ignore hits in invalid positions
+        beq     t8, $u_head     # U : nothing found: continue at $u_head
+        nop
+        /* At this point we've found a zero in the first partial word of
+           the source.  We need to isolate the valid source data and mask
+           it into the original destination data.  (Incidentally, we know
+           that we'll need at least one byte of that original dest word.)
+           The store below is followed by a fall-through into $finish_up
+           with t0 == the word stored and t12 == the bit of the last byte
+           written.  */
+        ldq_u   t0, 0(a0)       # L : t0 = current destination word
+        negq    t8, t6          # E : build bitmask of bytes <= zero
+        mskqh   t1, t4, t1      # U : drop source bytes below the dest offset
+        and     t6, t8, t12     # E : t12 = lowest set bit (the stop byte)
+        subq    t12, 1, t6      # E : t6 = mask bits below the stop byte
+        or      t6, t12, t8     # E : t8 = bytes up to and including the stop byte
+        zapnot  t2, t8, t2      # U : prepare source word; mirror changes
+        zapnot  t1, t8, t1      # U : to source validity mask
+        andnot  t0, t2, t0      # E : zero place for source to reside
+        or      t0, t1, t0      # E : and put it there
+        stq_u   t0, 0(a0)       # L : write the only word of the result
+        nop
+        .align 4
+$finish_up:                     # in: t0 = last word stored, t12 = bit of last byte written
+        zapnot  t0, t12, t4     # U : was last byte written null?
+        and     t12, 0xf0, t3   # E : binary search for the address of the
+        cmovne  t4, 1, t4       # E : Latency=2, extra map slot (t4 = 1 if non-null)
+        nop                     # E : with cmovne
+        and     t12, 0xcc, t2   # E : last byte written
+        and     t12, 0xaa, t1   # E : non-zero iff the byte index is odd
+        cmovne  t3, 4, t3       # E : Latency=2, extra map slot (4 if index >= 4)
+        nop                     # E : with cmovne
+        bic     a0, 7, t0       # t0 = a0 rounded down to its quadword
+        cmovne  t2, 2, t2       # E : Latency=2, extra map slot (2 if index bit 1 set)
+        nop                     # E : with cmovne
+        nop
+        cmovne  t1, 1, t1       # E : Latency=2, extra map slot (1 if index is odd)
+        nop                     # E : with cmovne
+        addq    t0, t3, t0      # E : quadword base + 4s part of the index
+        addq    t1, t2, t1      # E : 1s part + 2s part of the index
+        addq    t0, t1, t0      # E : t0 = address of the last byte written
+        addq    t0, t4, t0      # add one if we filled the buffer
+        subq    t0, v0, v0      # find string length
+        ret                     # L0 : return the length in v0
+        .align 4
+$zerolength:                    # reached when the count in a2 is zero
+        nop
+        nop
+        nop
+        clr     v0              # return 0
+$exception:                     # target named by every EX table entry; v0 is returned as is
+        nop
+        nop
+        nop
+        ret                     # NOTE(review): on a fault v0 is presumably -EFAULT already; confirm
+        .end __strncpy_from_user

diff --git a/arch/alpha/lib/ev6-strncpy_from_user.S b/arch/alpha/lib/ev6-strncpy_from_user.S new file mode 100644 index 000000000000..d2e28178cacc --- /dev/null +++ b/arch/alpha/lib/ev6-strncpy_from_user.S
@@ -0,0 +1,424 @@
	1	/*
	2	* arch/alpha/lib/ev6-strncpy_from_user.S
	3	* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
	4	*
	5	* Just like strncpy except in the return value:
	6	*
	7	* -EFAULT if an exception occurs before the terminator is copied.
	8	* N if the buffer filled.
	9	*
	10	* Otherwise the length of the string is returned.
	11	*
	12	* Much of the information about 21264 scheduling/coding comes from:
	13	* Compiler Writer's Guide for the Alpha 21264
	14	* abbreviated as 'CWG' in other comments here
	15	* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
	16	* Scheduling notation:
	17	* E - either cluster
	18	* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
	19	* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
	20	* A bunch of instructions got moved and temp registers were changed
	21	* to aid in scheduling. Control flow was also re-arranged to eliminate
	22	* branches, and to provide longer code sequences to enable better scheduling.
	23	* Reworking the start & tail sequences to use byte load/stores
	24	* is desirable, but very difficult to do without a from-scratch rewrite.
	25	* Save that for the future.
	26	*/
	27
	28
	29	#include <asm/errno.h>
	30	#include <asm/regdef.h>
	31
	32
	33	/* Allow an exception for an insn: emit it at local label 99 and record it in __ex_table (self-relative offset plus an lda naming $exception); exit if we get one. */
	34	#define EX(x,y...) \
	35	99: x,##y; \
	36	.section __ex_table,"a"; \
	37	.long 99b - .; \
	38	lda $31, $exception-99b($0); \
	39	.previous
	40
	41
	42	.set noat
	43	.set noreorder
	44	.text
	45
	46	.globl __strncpy_from_user
	47	.ent __strncpy_from_user
	48	.frame $30, 0, $26
	49	.prologue 0
	50
	51	.align 4
	52	__strncpy_from_user:
	53	and a0, 7, t3 # E : t3 = dest misalignment (a0 mod 8)
	54	beq a2, $zerolength # U : zero count: return 0 without touching memory
	55
	56	/* Are source and destination co-aligned? (a0 = destination, a1 = source read through EX, a2 = byte count, v0 = return value) */
	57	mov a0, v0 # E : save the string start for the length calc
	58	xor a0, a1, t4 # E : low 3 bits differ iff not co-aligned
	59	EX( ldq_u t1, 0(a1) ) # L : Latency=3 load first quadword
	60	ldq_u t0, 0(a0) # L : load first (partial) aligned dest quadword
	61
	62	addq a2, t3, a2 # E : bias count by dest misalignment
	63	subq a2, 1, a3 # E : a3 = biased count - 1
	64	addq zero, 1, t10 # E : t10 = 1, shifted into place below
	65	and t4, 7, t4 # E : misalignment between the two
	66
	67	and a3, 7, t6 # E : byte index of the last count byte in its quad
	68	sll t10, t6, t10 # E : t10 = bitmask of last count byte
	69	bne t4, $unaligned # U : misalignments differ: take the unaligned path
	70	lda t2, -1 # E : build a mask against false zero
	71
	72	/*
	73	* We are co-aligned; take care of a partial first word.
	74	* On entry to this basic block:
	75	* t0 == the first destination word for masking back in
	76	* t1 == the first source word.
	77	*/
	78
	79	srl a3, 3, a2 # E : a2 = loop counter = (count - 1)/8
	80	addq a1, 8, a1 # E : advance src; its low 3 bits are unchanged
	81	mskqh t2, a1, t2 # U : detection in the src word
	82	nop
	83
	84	/* Create the 1st output word and detect 0's in the 1st input word. */
	85	mskqh t1, a1, t3 # U : t3 = src bytes at or above the start offset
	86	mskql t0, a1, t0 # U : assemble the first output word
	87	ornot t1, t2, t2 # E : force the bytes before the start non-zero
	88	nop
	89
	90	cmpbge zero, t2, t8 # E : bits set iff null found
	91	or t0, t3, t0 # E : t0 = kept dest bytes | src bytes
	92	beq a2, $a_eoc # U : count ends inside this first word
	93	bne t8, $a_eos # U : 2nd branch in a quad. Bad.
	94
	95	/* On entry to this basic block:
	96	* t0 == a source quad not containing a null.
	97	* a0 - current aligned destination address
	98	* a1 - current aligned source address
	99	* a2 - count of quadwords to move.
	100	* NOTE: Loop improvement - unrolling this is going to be
	101	* a huge win, since we're going to stall otherwise.
	102	* Fix this later. For _really_ large copies, look
	103	* at using wh64 on a look-ahead basis. See the code
	104	* in clear_user.S and copy_user.S.
	105	* The buffers at (a0) and (a1) are presumed not to overlap (by C definition).
	106	* Lots of nops here:
	107	* - Separate loads from stores
	108	* - Keep it to 1 branch/quadpack so the branch predictor
	109	* can train.
	110	*/
	111	$a_loop:
	112	stq_u t0, 0(a0) # L : store the null-free word
	113	addq a0, 8, a0 # E : advance destination
	114	nop
	115	subq a2, 1, a2 # E : one quadword fewer to move
	116
	117	EX( ldq_u t0, 0(a1) ) # L : load the next source word (may fault)
	118	addq a1, 8, a1 # E : advance source
	119	cmpbge zero, t0, t8 # E : Stall 2 cycles on t0
	120	beq a2, $a_eoc # U : count exhausted: add the end-of-count bit
	121
	122	beq t8, $a_loop # U : no null in this word: keep copying
	123	nop
	124	nop
	125	nop
	126
	127	/* Take care of the final (partial) word store. At this point
	128	* the end-of-count bit is set in t8 iff it applies.
	129	*
	130	* On entry to this basic block we have:
	131	* t0 == the source word containing the null
	132	* t8 == the cmpbge mask that found it. On exit to $finish_up, t0 == the word stored and t12 == the bit of the last byte written.
	133	*/
	134	$a_eos:
	135	negq t8, t12 # E : find low bit set
	136	and t8, t12, t12 # E : t12 = lowest set bit of t8
	137
	138	/* We're doing a partial word store and so need to combine
	139	our source and original destination words. */
	140	ldq_u t1, 0(a0) # L : t1 = current destination word
	141	subq t12, 1, t6 # E : t6 = mask bits below the stop byte
	142
	143	or t12, t6, t8 # E : t8 = bytes up to and including the stop byte
	144	zapnot t0, t8, t0 # U : clear src bytes > null
	145	zap t1, t8, t1 # U : clear dst bytes <= null
	146	or t0, t1, t0 # E : merge source and destination bytes
	147
	148	stq_u t0, 0(a0) # L : write the final word
	149	br $finish_up # L0 : go compute the return value
	150	nop
	151	nop
	152
	153	/* Add the end-of-count bit to the eos detection bitmask, so that the last count byte stops the copy like a terminator would. */
	154	.align 4
	155	$a_eoc:
	156	or t10, t8, t8 # t8 = null mask | end-of-count bit
	157	br $a_eos # finish via the partial-word store
	158	nop
	159	nop
	160
	161
	162	/* The source and destination are not co-aligned. Align the destination
	163	and cope. We have to be very careful about not reading too much and
	164	causing a SEGV. */
	165
	166	.align 4
	167	$u_head:
	168	/* We know just enough now to be able to assemble the first
	169	full source word. We can still find a zero at the end of it
	170	that prevents us from outputting the whole thing.
	171
	172	On entry to this basic block:
	173	t0 == the first dest word, unmasked
	174	t1 == the shifted low bits of the first source word
	175	t6 == bytemask that is -1 in dest word bytes */
	176
	177	EX( ldq_u t2, 8(a1) ) # L : load second src word
	178	addq a1, 8, a1 # E : a1 now refers to the word just loaded
	179	mskql t0, a0, t0 # U : mask trailing garbage in dst
	180	extqh t2, a1, t4 # U : shift the new word's low bytes up into place
	181
	182	or t1, t4, t1 # E : first aligned src word complete
	183	mskqh t1, a0, t1 # U : mask leading garbage in src
	184	or t0, t1, t0 # E : first output word complete
	185	or t0, t6, t6 # E : mask original data for zero test
	186
	187	cmpbge zero, t6, t8 # E : t8 = zero bytes among the copied bytes
	188	beq a2, $u_eocfin # U : count ends within this first word
	189	bne t8, $u_final # U : bad news - 2nd branch in a quad
	190	lda t6, -1 # E : mask out the bits we have
	191
	192	mskql t6, a1, t6 # U : already seen
	193	stq_u t0, 0(a0) # L : store first output word
	194	or t6, t2, t2 # E : force the already-used bytes non-zero
	195	cmpbge zero, t2, t8 # E : find nulls in second partial
	196
	197	addq a0, 8, a0 # E : advance destination
	198	subq a2, 1, a2 # E : one word fewer to move
	199	bne t8, $u_late_head_exit # U : null in the rest of this source word
	200	nop
	201
	202	/* Finally, we've got all the stupid leading edge cases taken care
	203	of and we can set up to enter the main loop. */
	204
	205	extql t2, a1, t1 # U : position hi-bits of lo word
	206	EX( ldq_u t2, 8(a1) ) # L : read next high-order source word
	207	addq a1, 8, a1 # E : a1 now refers to the word just loaded
	208	cmpbge zero, t2, t8 # E : any null in the new source word?
	209
	210	beq a2, $u_eoc # U : count exhausted
	211	bne t8, $u_eos # U : null found before the loop starts
	212	nop
	213	nop
	214
	215	/* Unaligned copy main loop. In order to avoid reading too much,
	216	the loop is structured to detect zeros in aligned source words.
	217	This has, unfortunately, effectively pulled half of a loop
	218	iteration out into the head and half into the tail, but it does
	219	prevent nastiness from accumulating in the very thing we want
	220	to run as fast as possible.
	221
	222	On entry to this basic block:
	223	t1 == the shifted high-order bits from the previous source word
	224	t2 == the unshifted current source word
	225
	226	We further know that t2 does not contain a null terminator. */
	227
	228	/*
	229	* Extra nops here:
	230	* separate load quads from store quads
	231	* only one branch/quad to permit predictor training
	232	*/
	233
	234	.align 4
	235	$u_loop:
	236	extqh t2, a1, t0 # U : extract high bits for current word
	237	addq a1, 8, a1 # E : advance source
	238	extql t2, a1, t3 # U : extract low bits for next time
	239	addq a0, 8, a0 # E : advance destination (store below uses -8)
	240
	241	or t0, t1, t0 # E : current dst word now complete
	242	EX( ldq_u t2, 0(a1) ) # L : load high word for next time
	243	subq a2, 1, a2 # E : one word fewer to move
	244	nop
	245
	246	stq_u t0, -8(a0) # L : save the current word
	247	mov t3, t1 # E : carry the low bits into the next iteration
	248	cmpbge zero, t2, t8 # E : test new word for eos
	249	beq a2, $u_eoc # U : count exhausted
	250
	251	beq t8, $u_loop # U : no null yet: keep going
	252	nop
	253	nop
	254	nop
	255
	256	/* We've found a zero somewhere in the source word we just read.
	257	If it resides in the lower half, we have one (probably partial)
	258	word to write out, and if it resides in the upper half, we
	259	have one full and one partial word left to write out.
	260
	261	On entry to this basic block:
	262	t1 == the shifted high-order bits from the previous source word
	263	t2 == the unshifted current source word. */
	264	.align 4
	265	$u_eos:
	266	extqh t2, a1, t0 # U : shift the new word's low bytes up into place
	267	or t0, t1, t0 # E : first (partial) source word complete
	268	cmpbge zero, t0, t8 # E : is the null in this first bit?
	269	nop
	270
	271	bne t8, $u_final # U : yes: one partial word left to write
	272	stq_u t0, 0(a0) # L : the null was in the high-order bits
	273	addq a0, 8, a0 # E : advance destination
	274	subq a2, 1, a2 # E : one word fewer to move
	275
	276	.align 4
	277	$u_late_head_exit:
	278	extql t2, a1, t0 # U : t0 = remaining high bytes, shifted down
	279	cmpbge zero, t0, t8 # E : locate the zero bytes in them
	280	or t8, t10, t6 # E : same mask plus the end-of-count bit
	281	cmoveq a2, t6, t8 # E : use that one when the count runs out here
	282
	283	/* Take care of a final (probably partial) result word.
	284	On entry to this basic block:
	285	t0 == assembled source word
	286	t8 == cmpbge mask that found the null. On exit to $finish_up, t0 == the word stored and t12 == the bit of the last byte written. */
	287	.align 4
	288	$u_final:
	289	negq t8, t6 # E : isolate low bit set
	290	and t6, t8, t12 # E : t12 = lowest set bit of t8
	291	ldq_u t1, 0(a0) # L : t1 = current destination word
	292	subq t12, 1, t6 # E : t6 = mask bits below the stop byte
	293
	294	or t6, t12, t8 # E : t8 = bytes up to and including the stop byte
	295	zapnot t0, t8, t0 # U : kill source bytes > null
	296	zap t1, t8, t1 # U : kill dest bytes <= null
	297	or t0, t1, t0 # E : merge source and destination bytes
	298
	299	stq_u t0, 0(a0) # L : write the final word
	300	br $finish_up # U : go compute the return value
	301	nop
	302	nop
	303
	304	.align 4
	305	$u_eoc: # end-of-count
	306	extqh t2, a1, t0 # U : shift the new word's low bytes up into place
	307	or t0, t1, t0 # E : last output word assembled
	308	cmpbge zero, t0, t8 # E : any null before the count ends?
	309	nop
	310
	311	.align 4
	312	$u_eocfin: # end-of-count, final word
	313	or t10, t8, t8 # E : add the end-of-count bit to the mask
	314	br $u_final # U : write the final (partial) word
	315	nop
	316	nop
	317
	318	/* Unaligned copy entry point. On entry: t1 == first source quadword, a3 == biased count - 1, t10 == bitmask of the last count byte. */
	319	.align 4
	320	$unaligned:
	321
	322	srl a3, 3, a2 # U : a2 = loop counter = (count - 1)/8
	323	and a0, 7, t4 # E : find dest misalignment
	324	and a1, 7, t5 # E : find src misalignment
	325	mov zero, t0 # E : default: no dest bytes to preserve
	326
	327	/* Conditionally load the first destination word and a bytemask
	328	with 0xff indicating that the destination byte is sacrosanct. */
	329
	330	mov zero, t6 # E : default: empty sacrosanct-byte mask
	331	beq t4, 1f # U : dest is aligned: keep the defaults
	332	ldq_u t0, 0(a0) # L : t0 = first destination word
	333	lda t6, -1 # E : all ones, trimmed by the mskql below
	334
	335	mskql t6, a0, t6 # U : keep 0xff only below the dest offset
	336	nop
	337	nop
	338	nop
	339
	340	.align 4
	341	1:
	342	subq a1, t4, a1 # E : sub dest misalignment from src addr
	343	/* If source misalignment is larger than dest misalignment, we need
	344	extra startup checks to avoid SEGV. */
	345	cmplt t4, t5, t12 # E : t12 = (dest misalign < src misalign)
	346	extql t1, a1, t1 # U : shift src into place
	347	lda t2, -1 # E : for creating masks later
	348
	349	beq t12, $u_head # U : no extra checks needed
	350	mskqh t2, t5, t2 # U : begin src byte validity mask
	351	cmpbge zero, t1, t8 # E : is there a zero?
	352	nop
	353
	354	extql t2, a1, t2 # U : shift the validity mask like the data
	355	or t8, t10, t5 # E : test for end-of-count too
	356	cmpbge zero, t2, t3 # E : t3 = byte positions that are not valid
	357	cmoveq a2, t5, t8 # E : Latency=2, extra map slot
	358
	359	nop # E : goes with cmov
	360	andnot t8, t3, t8 # E : ignore hits in invalid positions
	361	beq t8, $u_head # U : nothing found: continue at $u_head
	362	nop
	363
	364	/* At this point we've found a zero in the first partial word of
	365	the source. We need to isolate the valid source data and mask
	366	it into the original destination data. (Incidentally, we know
	367	that we'll need at least one byte of that original dest word.) */
	368
	369	ldq_u t0, 0(a0) # L : t0 = current destination word
	370	negq t8, t6 # E : build bitmask of bytes <= zero
	371	mskqh t1, t4, t1 # U : drop source bytes below the dest offset
	372	and t6, t8, t12 # E : t12 = lowest set bit (the stop byte)
	373
	374	subq t12, 1, t6 # E : t6 = mask bits below the stop byte
	375	or t6, t12, t8 # E : t8 = bytes up to and including the stop byte
	376	zapnot t2, t8, t2 # U : prepare source word; mirror changes
	377	zapnot t1, t8, t1 # U : to source validity mask
	378
	379	andnot t0, t2, t0 # E : zero place for source to reside
	380	or t0, t1, t0 # E : and put it there
	381	stq_u t0, 0(a0) # L : write the only word of the result
	382	nop
	383
	384	.align 4
	385	$finish_up: # in: t0 = last word stored, t12 = bit of last byte written
	386	zapnot t0, t12, t4 # U : was last byte written null?
	387	and t12, 0xf0, t3 # E : binary search for the address of the
	388	cmovne t4, 1, t4 # E : Latency=2, extra map slot (t4 = 1 if non-null)
	389	nop # E : with cmovne
	390
	391	and t12, 0xcc, t2 # E : last byte written
	392	and t12, 0xaa, t1 # E : non-zero iff the byte index is odd
	393	cmovne t3, 4, t3 # E : Latency=2, extra map slot (4 if index >= 4)
	394	nop # E : with cmovne
	395
	396	bic a0, 7, t0 # t0 = a0 rounded down to its quadword
	397	cmovne t2, 2, t2 # E : Latency=2, extra map slot (2 if index bit 1 set)
	398	nop # E : with cmovne
	399	nop
	400
	401	cmovne t1, 1, t1 # E : Latency=2, extra map slot (1 if index is odd)
	402	nop # E : with cmovne
	403	addq t0, t3, t0 # E : quadword base + 4s part of the index
	404	addq t1, t2, t1 # E : 1s part + 2s part of the index
	405
	406	addq t0, t1, t0 # E : t0 = address of the last byte written
	407	addq t0, t4, t0 # add one if we filled the buffer
	408	subq t0, v0, v0 # find string length
	409	ret # L0 : return the length in v0
	410
	411	.align 4
	412	$zerolength: # reached when the count in a2 is zero
	413	nop
	414	nop
	415	nop
	416	clr v0 # return 0
	417
	418	$exception: # target named by every EX table entry; v0 is returned as is
	419	nop
	420	nop
	421	nop
	422	ret # NOTE(review): on a fault v0 is presumably -EFAULT already; confirm
	423
	424	.end __strncpy_from_user