Linux-2.6.12-rc2 v2.6.12-rc2

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
author: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 18:20:36 -0400
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 18:20:36 -0400
commit: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree: 0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/alpha/lib/ev6-stxcpy.S
1 files changed, 321 insertions, 0 deletions
diff --git a/arch/alpha/lib/ev6-stxcpy.S b/arch/alpha/lib/ev6-stxcpy.S
new file mode 100644
index 000000000000..4643ff2ffc8d
--- /dev/null
+++ b/arch/alpha/lib/ev6-stxcpy.S
@@ -0,0 +1,321 @@
+/*
+ * arch/alpha/lib/ev6-stxcpy.S
+ * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
+ *
+ * Copy a null-terminated string from SRC to DST.
+ *
+ * This is an internal routine used by strcpy, stpcpy, and strcat.
+ * As such, it uses special linkage conventions to make implementation
+ * of these public functions more efficient.
+ *
+ * On input:
+ *      t9 = return address
+ *      a0 = DST
+ *      a1 = SRC
+ *
+ * On output:
+ *      t12 = bitmask (with one bit set) indicating the last byte written
+ *      a0  = unaligned address of the last *word* written
+ *
+ * Furthermore, v0, a3-a5, and t11 are untouched (t12 is an output, see above).
+ *
+ * Much of the information about 21264 scheduling/coding comes from:
+ *      Compiler Writer's Guide for the Alpha 21264
+ *      abbreviated as 'CWG' in other comments here
+ *      ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
+ * Scheduling notation:
+ *      E       - either cluster
+ *      U       - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
+ *      L       - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
+ * Try not to change the actual algorithm if possible for consistency.
+ */
+#include <asm/regdef.h>
+        .set noat
+        .set noreorder
+        .text
+/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
+   doesn't like putting the entry point for a procedure somewhere in the
+   middle of the procedure descriptor.  Work around this by putting the
+   aligned copy in its own procedure descriptor */
+        .ent stxcpy_aligned
+        .align 4
+stxcpy_aligned:
+        .frame sp, 0, t9
+        .prologue 0
+        /* Co-aligned copy path: SRC and DST share the same offset within
+           a quadword, so source words are stored without any shifting.
+           On entry to this basic block:
+           t0 == the first destination word for masking back in
+                 (0 when DST is aligned and __stxcpy skipped the load)
+           t1 == the first source word
+           a1 == SRC + 8 (__stxcpy advances it before branching here);
+                 its low bits still give the starting byte offset.  */
+        /* Create the 1st output word and detect 0's in the 1st input word.  */
+        lda     t2, -1          # E : build a mask against false zero
+        mskqh   t2, a1, t2      # U :   detection in the src word (stall)
+        mskqh   t1, a1, t3      # U : src bytes from the start offset upward
+        ornot   t1, t2, t2      # E : bytes before the start forced non-zero (stall)
+        mskql   t0, a1, t0      # U : assemble the first output word
+        cmpbge  zero, t2, t8    # E : bits set iff null found
+        or      t0, t3, t1      # E : (stall)
+        bne     t8, $a_eos      # U : (stall)
+        /* On entry to this basic block:
+           t0 == the first destination word for masking back in
+           t1 == a source word not containing a null.  */
+        /* Nops here to separate store quads from load quads */
+$a_loop:
+        stq_u   t1, 0(a0)       # L :
+        addq    a0, 8, a0       # E :
+        nop
+        nop
+        ldq_u   t1, 0(a1)       # L : Latency=3
+        addq    a1, 8, a1       # E :
+        cmpbge  zero, t1, t8    # E : (3 cycle stall)
+        beq     t8, $a_loop     # U : (stall for t8)
+        /* Take care of the final (partial) word store.
+           On entry to this basic block we have:
+           t1 == the source word containing the null
+           t8 == the cmpbge mask that found it.  */
+$a_eos:
+        negq    t8, t6          # E : find low bit set
+        and     t8, t6, t12     # E : t12 = bit of the first null byte (stall)
+        /* For the sake of the cache, don't read a destination word
+           if we're not going to need it.  */
+        and     t12, 0x80, t6   # E : null in byte 7 => store the whole word (stall)
+        bne     t6, 1f          # U : (stall)
+        /* We're doing a partial word store and so need to combine
+           our source and original destination words.  */
+        ldq_u   t0, 0(a0)       # L : Latency=3
+        subq    t12, 1, t6      # E :
+        zapnot  t1, t6, t1      # U : clear src bytes >= null (stall)
+        or      t12, t6, t8     # E : (stall)
+        zap     t0, t8, t0      # U : clear dst bytes <= null
+        or      t0, t1, t1      # E : (stall)
+        nop
+        nop
+1:      stq_u   t1, 0(a0)       # L :
+        ret     (t9)            # L0 : Latency=3
+        nop
+        nop
+        .end stxcpy_aligned
+        .align 4
+        .ent __stxcpy
+        .globl __stxcpy
+__stxcpy:
+        .frame sp, 0, t9
+        .prologue 0
+        /* Entry point: a0 = DST, a1 = SRC, t9 = return address.
+           Are source and destination co-aligned?  */
+        xor     a0, a1, t0      # E :
+        unop                    # E :
+        and     t0, 7, t0       # E : (stall)
+        bne     t0, $unaligned  # U : (stall)
+        /* We are co-aligned; take care of a partial first word.  */
+        ldq_u   t1, 0(a1)               # L : load first src word
+        and     a0, 7, t0               # E : take care not to load a word ...
+        addq    a1, 8, a1               # E :
+        beq     t0, stxcpy_aligned      # U : ... if we won't need it (stall)
+        ldq_u   t0, 0(a0)       # L :
+        br      stxcpy_aligned  # L0 : Latency=3
+        nop
+        nop
+/* The source and destination are not co-aligned.  Align the destination
+   and cope.  We have to be very careful about not reading too much and
+   causing a SEGV.  */
+        .align 4
+$u_head:
+        /* We know just enough now to be able to assemble the first
+           full source word.  We can still find a zero at the end of it
+           that prevents us from outputting the whole thing.
+           On entry to this basic block:
+           t0 == the first dest word, for masking back in, if needed else 0
+           t1 == the low bits of the first source word
+           t6 == bytemask that is 0xff in the leading dest bytes that must
+                 be preserved (0 if DST is aligned)
+           a1 == SRC minus the dest misalignment */
+        ldq_u   t2, 8(a1)       # L :
+        addq    a1, 8, a1       # E :
+        extql   t1, a1, t1      # U : (stall on a1)
+        extqh   t2, a1, t4      # U : (stall on a1)
+        mskql   t0, a0, t0      # U :
+        or      t1, t4, t1      # E :
+        mskqh   t1, a0, t1      # U : (stall on t1)
+        or      t0, t1, t1      # E : (stall on t1)
+        or      t1, t6, t6      # E :
+        cmpbge  zero, t6, t8    # E : (stall)
+        lda     t6, -1          # E : for masking just below
+        bne     t8, $u_final    # U : (stall)
+        mskql   t6, a1, t6              # U : mask out the bits we have
+        or      t6, t2, t2              # E :   already extracted before (stall)
+        cmpbge  zero, t2, t8            # E :   testing eos (stall)
+        bne     t8, $u_late_head_exit   # U : (stall)
+        /* Finally, we've got all the stupid leading edge cases taken care
+           of and we can set up to enter the main loop.  */
+        stq_u   t1, 0(a0)       # L : store first output word
+        addq    a0, 8, a0       # E :
+        extql   t2, a1, t0      # U : position ho-bits of lo word
+        ldq_u   t2, 8(a1)       # L : read next high-order source word
+        addq    a1, 8, a1       # E :
+        cmpbge  zero, t2, t8    # E : (stall for t2)
+        nop                     # E :
+        bne     t8, $u_eos      # U : (stall)
+        /* Unaligned copy main loop.  In order to avoid reading too much,
+           the loop is structured to detect zeros in aligned source words.
+           This has, unfortunately, effectively pulled half of a loop
+           iteration out into the head and half into the tail, but it does
+           prevent nastiness from accumulating in the very thing we want
+           to run as fast as possible.
+           On entry to this basic block:
+           t0 == the shifted high-order bits from the previous source word
+           t2 == the unshifted current source word
+           We further know that t2 does not contain a null terminator.  */
+        .align 3
+$u_loop:
+        extqh   t2, a1, t1      # U : extract high bits for current word
+        addq    a1, 8, a1       # E : (stall)
+        extql   t2, a1, t3      # U : extract low bits for next time (stall)
+        addq    a0, 8, a0       # E :
+        or      t0, t1, t1      # E : current dst word now complete
+        ldq_u   t2, 0(a1)       # L : Latency=3 load high word for next time
+        stq_u   t1, -8(a0)      # L : save the current word (stall)
+        mov     t3, t0          # E :
+        cmpbge  zero, t2, t8    # E : test new word for eos
+        beq     t8, $u_loop     # U : (stall)
+        nop
+        nop
+        /* We've found a zero somewhere in the source word we just read.
+           If it resides in the lower half, we have one (probably partial)
+           word to write out, and if it resides in the upper half, we
+           have one full and one partial word left to write out.
+           On entry to this basic block:
+           t0 == the shifted high-order bits from the previous source word
+           t2 == the unshifted current source word.  */
+$u_eos:
+        extqh   t2, a1, t1      # U :
+        or      t0, t1, t1      # E : first (partial) source word complete (stall)
+        cmpbge  zero, t1, t8    # E : is the null in this first word? (stall)
+        bne     t8, $u_final    # U : (stall)
+$u_late_head_exit:
+        stq_u   t1, 0(a0)       # L : the null was in the high-order bits
+        addq    a0, 8, a0       # E :
+        extql   t2, a1, t1      # U :
+        cmpbge  zero, t1, t8    # E : (stall)
+        /* Take care of a final (probably partial) result word.
+           On entry to this basic block:
+           t1 == assembled source word
+           t8 == cmpbge mask that found the null.  */
+$u_final:
+        negq    t8, t6          # E : isolate low bit set
+        and     t6, t8, t12     # E : (stall)
+        and     t12, 0x80, t6   # E : avoid dest word load if we can (stall)
+        bne     t6, 1f          # U : (stall)
+        ldq_u   t0, 0(a0)       # L :
+        subq    t12, 1, t6      # E :
+        or      t6, t12, t8     # E : (stall)
+        zapnot  t1, t6, t1      # U : kill source bytes >= null (stall)
+        zap     t0, t8, t0      # U : kill dest bytes <= null (2 cycle data stall)
+        or      t0, t1, t1      # E : (stall)
+        nop
+        nop
+1:      stq_u   t1, 0(a0)       # L :
+        ret     (t9)            # L0 : Latency=3
+        nop
+        nop
+        /* Unaligned copy entry point.  */
+        .align 4
+$unaligned:
+        ldq_u   t1, 0(a1)       # L : load first source word
+        and     a0, 7, t4       # E : find dest misalignment
+        and     a1, 7, t5       # E : find src misalignment
+        /* Conditionally load the first destination word and a bytemask
+           with 0xff indicating that the destination byte is sacrosanct.  */
+        mov     zero, t0        # E :
+        mov     zero, t6        # E :
+        beq     t4, 1f          # U :
+        ldq_u   t0, 0(a0)       # L :
+        lda     t6, -1          # E :
+        mskql   t6, a0, t6      # U :
+        nop
+        nop
+        nop
+1:
+        subq    a1, t4, a1      # E : sub dest misalignment from src addr
+        /* If source misalignment is larger than dest misalignment, we need
+           extra startup checks to avoid SEGV.  */
+        cmplt   t4, t5, t12     # E :
+        beq     t12, $u_head    # U :
+        lda     t2, -1          # E : mask out leading garbage in source
+        mskqh   t2, t5, t2      # U :
+        ornot   t1, t2, t3      # E : (stall)
+        cmpbge  zero, t3, t8    # E : is there a zero? (stall)
+        beq     t8, $u_head     # U : (stall)
+        /* At this point we've found a zero in the first partial word of
+           the source.  We need to isolate the valid source data and mask
+           it into the original destination data.  (Incidentally, we know
+           that we'll need at least one byte of that original dest word.) */
+        ldq_u   t0, 0(a0)       # L :
+        negq    t8, t6          # E : build bitmask of bytes <= zero
+        and     t6, t8, t12     # E : (stall)
+        and     a1, 7, t5       # E :
+        subq    t12, 1, t6      # E :
+        or      t6, t12, t8     # E : (stall)
+        srl     t12, t5, t12    # U : adjust final null return value
+        zapnot  t2, t8, t2      # U : prepare source word; mirror changes (stall)
+        and     t1, t2, t1      # E : to source validity mask
+        extql   t2, a1, t2      # U :
+        extql   t1, a1, t1      # U : (stall)
+        andnot  t0, t2, t0      # E : zero place for source to reside (stall)
+        or      t0, t1, t1      # E : and put it there
+        stq_u   t1, 0(a0)       # L : (stall)
+        ret     (t9)            # L0 :
+        nop
+        .end __stxcpy
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/alpha/lib/ev6-stxcpy.S

diff --git a/arch/alpha/lib/ev6-stxcpy.S b/arch/alpha/lib/ev6-stxcpy.S new file mode 100644 index 000000000000..4643ff2ffc8d --- /dev/null +++ b/arch/alpha/lib/ev6-stxcpy.S
@@ -0,0 +1,321 @@
	1	/*
	2	* arch/alpha/lib/ev6-stxcpy.S
	3	* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
	4	*
	5	* Copy a null-terminated string from SRC to DST.
	6	*
	7	* This is an internal routine used by strcpy, stpcpy, and strcat.
	8	* As such, it uses special linkage conventions to make implementation
	9	* of these public functions more efficient.
	10	*
	11	* On input:
	12	* t9 = return address
	13	* a0 = DST
	14	* a1 = SRC
	15	*
	16	* On output:
	17	* t12 = bitmask (with one bit set) indicating the last byte written
	18	* a0 = unaligned address of the last word written
	19	*
	20	* Furthermore, v0, a3-a5, and t11 are untouched (t12 is an output, see above).
	21	*
	22	* Much of the information about 21264 scheduling/coding comes from:
	23	* Compiler Writer's Guide for the Alpha 21264
	24	* abbreviated as 'CWG' in other comments here
	25	* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
	26	* Scheduling notation:
	27	* E - either cluster
	28	* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
	29	* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
	30	* Try not to change the actual algorithm if possible for consistency.
	31	*/
	32
	33	#include <asm/regdef.h>
	34
	35	.set noat
	36	.set noreorder
	37
	38	.text
	39
	40	/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
	41	doesn't like putting the entry point for a procedure somewhere in the
	42	middle of the procedure descriptor. Work around this by putting the
	43	aligned copy in its own procedure descriptor */
	44
	45
	46	.ent stxcpy_aligned
	47	.align 4
	48	stxcpy_aligned:
	49	.frame sp, 0, t9
	50	.prologue 0
	51
	52	/* Co-aligned copy path. On entry to this basic block:
	53	t0 == the first destination word for masking back in (0 if DST is aligned)
	54	t1 == the first source word; a1 == SRC + 8 (advanced by __stxcpy). */
	55
	56	/* Create the 1st output word and detect 0's in the 1st input word. */
	57	lda t2, -1 # E : build a mask against false zero
	58	mskqh t2, a1, t2 # U : detection in the src word (stall)
	59	mskqh t1, a1, t3 # U : src bytes from the start offset upward
	60	ornot t1, t2, t2 # E : bytes before the start forced non-zero (stall)
	61
	62	mskql t0, a1, t0 # U : assemble the first output word
	63	cmpbge zero, t2, t8 # E : bits set iff null found
	64	or t0, t3, t1 # E : (stall)
	65	bne t8, $a_eos # U : (stall)
	66
	67	/* On entry to this basic block:
	68	t0 == the first destination word for masking back in
	69	t1 == a source word not containing a null. */
	70	/* Nops here to separate store quads from load quads */
	71
	72	$a_loop:
	73	stq_u t1, 0(a0) # L :
	74	addq a0, 8, a0 # E :
	75	nop
	76	nop
	77
	78	ldq_u t1, 0(a1) # L : Latency=3
	79	addq a1, 8, a1 # E :
	80	cmpbge zero, t1, t8 # E : (3 cycle stall)
	81	beq t8, $a_loop # U : (stall for t8)
	82
	83	/* Take care of the final (partial) word store.
	84	On entry to this basic block we have:
	85	t1 == the source word containing the null
	86	t8 == the cmpbge mask that found it. */
	87	$a_eos:
	88	negq t8, t6 # E : find low bit set
	89	and t8, t6, t12 # E : t12 = bit of the first null byte (stall)
	90	/* For the sake of the cache, don't read a destination word
	91	if we're not going to need it. */
	92	and t12, 0x80, t6 # E : null in byte 7 => store the whole word (stall)
	93	bne t6, 1f # U : (stall)
	94
	95	/* We're doing a partial word store and so need to combine
	96	our source and original destination words. */
	97	ldq_u t0, 0(a0) # L : Latency=3
	98	subq t12, 1, t6 # E :
	99	zapnot t1, t6, t1 # U : clear src bytes >= null (stall)
	100	or t12, t6, t8 # E : (stall)
	101
	102	zap t0, t8, t0 # U : clear dst bytes <= null
	103	or t0, t1, t1 # E : (stall)
	104	nop
	105	nop
	106
	107	1: stq_u t1, 0(a0) # L :
	108	ret (t9) # L0 : Latency=3
	109	nop
	110	nop
	111
	112	.end stxcpy_aligned
	113
	114	.align 4
	115	.ent __stxcpy
	116	.globl __stxcpy
	117	__stxcpy:
	118	.frame sp, 0, t9
	119	.prologue 0
	120
	121	/* Entry: a0 = DST, a1 = SRC, t9 = return address. Are source and destination co-aligned? */
	122	xor a0, a1, t0 # E :
	123	unop # E :
	124	and t0, 7, t0 # E : (stall)
	125	bne t0, $unaligned # U : (stall)
	126
	127	/* We are co-aligned; take care of a partial first word. */
	128	ldq_u t1, 0(a1) # L : load first src word
	129	and a0, 7, t0 # E : take care not to load a word ...
	130	addq a1, 8, a1 # E :
	131	beq t0, stxcpy_aligned # U : ... if we won't need it (stall)
	132
	133	ldq_u t0, 0(a0) # L :
	134	br stxcpy_aligned # L0 : Latency=3
	135	nop
	136	nop
	137
	138
	139	/* The source and destination are not co-aligned. Align the destination
	140	and cope. We have to be very careful about not reading too much and
	141	causing a SEGV. */
	142
	143	.align 4
	144	$u_head:
	145	/* We know just enough now to be able to assemble the first
	146	full source word. We can still find a zero at the end of it
	147	that prevents us from outputting the whole thing.
	148
	149	On entry to this basic block:
	150	t0 == the first dest word, for masking back in, if needed else 0
	151	t1 == the low bits of the first source word
	152	t6 == bytemask that is 0xff in the leading dest bytes that must be preserved (0 if DST is aligned) */
	153
	154	ldq_u t2, 8(a1) # L :
	155	addq a1, 8, a1 # E :
	156	extql t1, a1, t1 # U : (stall on a1)
	157	extqh t2, a1, t4 # U : (stall on a1)
	158
	159	mskql t0, a0, t0 # U :
	160	or t1, t4, t1 # E :
	161	mskqh t1, a0, t1 # U : (stall on t1)
	162	or t0, t1, t1 # E : (stall on t1)
	163
	164	or t1, t6, t6 # E :
	165	cmpbge zero, t6, t8 # E : (stall)
	166	lda t6, -1 # E : for masking just below
	167	bne t8, $u_final # U : (stall)
	168
	169	mskql t6, a1, t6 # U : mask out the bits we have
	170	or t6, t2, t2 # E : already extracted before (stall)
	171	cmpbge zero, t2, t8 # E : testing eos (stall)
	172	bne t8, $u_late_head_exit # U : (stall)
	173
	174	/* Finally, we've got all the stupid leading edge cases taken care
	175	of and we can set up to enter the main loop. */
	176
	177	stq_u t1, 0(a0) # L : store first output word
	178	addq a0, 8, a0 # E :
	179	extql t2, a1, t0 # U : position ho-bits of lo word
	180	ldq_u t2, 8(a1) # L : read next high-order source word
	181
	182	addq a1, 8, a1 # E :
	183	cmpbge zero, t2, t8 # E : (stall for t2)
	184	nop # E :
	185	bne t8, $u_eos # U : (stall)
	186
	187	/* Unaligned copy main loop. In order to avoid reading too much,
	188	the loop is structured to detect zeros in aligned source words.
	189	This has, unfortunately, effectively pulled half of a loop
	190	iteration out into the head and half into the tail, but it does
	191	prevent nastiness from accumulating in the very thing we want
	192	to run as fast as possible.
	193
	194	On entry to this basic block:
	195	t0 == the shifted high-order bits from the previous source word
	196	t2 == the unshifted current source word
	197
	198	We further know that t2 does not contain a null terminator. */
	199
	200	.align 3
	201	$u_loop:
	202	extqh t2, a1, t1 # U : extract high bits for current word
	203	addq a1, 8, a1 # E : (stall)
	204	extql t2, a1, t3 # U : extract low bits for next time (stall)
	205	addq a0, 8, a0 # E :
	206
	207	or t0, t1, t1 # E : current dst word now complete
	208	ldq_u t2, 0(a1) # L : Latency=3 load high word for next time
	209	stq_u t1, -8(a0) # L : save the current word (stall)
	210	mov t3, t0 # E :
	211
	212	cmpbge zero, t2, t8 # E : test new word for eos
	213	beq t8, $u_loop # U : (stall)
	214	nop
	215	nop
	216
	217	/* We've found a zero somewhere in the source word we just read.
	218	If it resides in the lower half, we have one (probably partial)
	219	word to write out, and if it resides in the upper half, we
	220	have one full and one partial word left to write out.
	221
	222	On entry to this basic block:
	223	t0 == the shifted high-order bits from the previous source word
	224	t2 == the unshifted current source word. */
	225	$u_eos:
	226	extqh t2, a1, t1 # U :
	227	or t0, t1, t1 # E : first (partial) source word complete (stall)
	228	cmpbge zero, t1, t8 # E : is the null in this first word? (stall)
	229	bne t8, $u_final # U : (stall)
	230
	231	$u_late_head_exit:
	232	stq_u t1, 0(a0) # L : the null was in the high-order bits
	233	addq a0, 8, a0 # E :
	234	extql t2, a1, t1 # U :
	235	cmpbge zero, t1, t8 # E : (stall)
	236
	237	/* Take care of a final (probably partial) result word.
	238	On entry to this basic block:
	239	t1 == assembled source word
	240	t8 == cmpbge mask that found the null. */
	241	$u_final:
	242	negq t8, t6 # E : isolate low bit set
	243	and t6, t8, t12 # E : (stall)
	244	and t12, 0x80, t6 # E : avoid dest word load if we can (stall)
	245	bne t6, 1f # U : (stall)
	246
	247	ldq_u t0, 0(a0) # L :
	248	subq t12, 1, t6 # E :
	249	or t6, t12, t8 # E : (stall)
	250	zapnot t1, t6, t1 # U : kill source bytes >= null (stall)
	251
	252	zap t0, t8, t0 # U : kill dest bytes <= null (2 cycle data stall)
	253	or t0, t1, t1 # E : (stall)
	254	nop
	255	nop
	256
	257	1: stq_u t1, 0(a0) # L :
	258	ret (t9) # L0 : Latency=3
	259	nop
	260	nop
	261
	262	/* Unaligned copy entry point. */
	263	.align 4
	264	$unaligned:
	265
	266	ldq_u t1, 0(a1) # L : load first source word
	267	and a0, 7, t4 # E : find dest misalignment
	268	and a1, 7, t5 # E : find src misalignment
	269	/* Conditionally load the first destination word and a bytemask
	270	with 0xff indicating that the destination byte is sacrosanct. */
	271	mov zero, t0 # E :
	272
	273	mov zero, t6 # E :
	274	beq t4, 1f # U :
	275	ldq_u t0, 0(a0) # L :
	276	lda t6, -1 # E :
	277
	278	mskql t6, a0, t6 # U :
	279	nop
	280	nop
	281	nop
	282	1:
	283	subq a1, t4, a1 # E : sub dest misalignment from src addr
	284	/* If source misalignment is larger than dest misalignment, we need
	285	extra startup checks to avoid SEGV. */
	286	cmplt t4, t5, t12 # E :
	287	beq t12, $u_head # U :
	288	lda t2, -1 # E : mask out leading garbage in source
	289
	290	mskqh t2, t5, t2 # U :
	291	ornot t1, t2, t3 # E : (stall)
	292	cmpbge zero, t3, t8 # E : is there a zero? (stall)
	293	beq t8, $u_head # U : (stall)
	294
	295	/* At this point we've found a zero in the first partial word of
	296	the source. We need to isolate the valid source data and mask
	297	it into the original destination data. (Incidentally, we know
	298	that we'll need at least one byte of that original dest word.) */
	299
	300	ldq_u t0, 0(a0) # L :
	301	negq t8, t6 # E : build bitmask of bytes <= zero
	302	and t6, t8, t12 # E : (stall)
	303	and a1, 7, t5 # E :
	304
	305	subq t12, 1, t6 # E :
	306	or t6, t12, t8 # E : (stall)
	307	srl t12, t5, t12 # U : adjust final null return value
	308	zapnot t2, t8, t2 # U : prepare source word; mirror changes (stall)
	309
	310	and t1, t2, t1 # E : to source validity mask
	311	extql t2, a1, t2 # U :
	312	extql t1, a1, t1 # U : (stall)
	313	andnot t0, t2, t0 # E : zero place for source to reside (stall)
	314
	315	or t0, t1, t1 # E : and put it there
	316	stq_u t1, 0(a0) # L : (stall)
	317	ret (t9) # L0 :
	318	nop
	319
	320	.end __stxcpy
	321