Diffstat (limited to 'arch/alpha/lib/ev6-memset.S')
-rw-r--r-- | arch/alpha/lib/ev6-memset.S | 597
1 files changed, 597 insertions, 0 deletions
diff --git a/arch/alpha/lib/ev6-memset.S b/arch/alpha/lib/ev6-memset.S
new file mode 100644
index 000000000000..d8b94e1c7fca
--- /dev/null
+++ b/arch/alpha/lib/ev6-memset.S
@@ -0,0 +1,597 @@
/*
 * arch/alpha/lib/ev6-memset.S
 *
 * This is an efficient (and relatively small) implementation of the C library
 * "memset()" function for the 21264 implementation of Alpha.
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * The algorithm for the leading and trailing quadwords remains the same,
 * however the loop has been unrolled to enable better memory throughput,
 * and the code has been replicated for each of the entry points: __memset
 * and __memsetw to permit better scheduling to eliminate the stalling
 * encountered during the mask replication.
 * A future enhancement might be to put in a byte store loop for really
 * small (say < 32 bytes) memset()s.  Whether or not that change would be
 * a win in the kernel would depend upon the contextual usage.
 * WARNING: Maintaining this is going to be more work than the above version,
 * as fixes will need to be made in multiple places.  The performance gain
 * is worth it.
 */

	.set noat
	.set noreorder
.text
	.globl __memset
	.globl __memsetw
	.globl __constant_c_memset
	.globl memset

	.ent __memset
	.align 5
__memset:
	.frame $30,0,$26,0
	.prologue 0

	/*
	 * Serious stalling happens.  The only way to mitigate this is to
	 * undertake a major re-write to interleave the constant materialization
	 * with other parts of the fall-through code.  This is important, even
	 * though it makes maintenance tougher.
	 * Do this later.
	 */
	and $17,255,$1		# E : 00000000000000ch
	insbl $17,1,$2		# U : 000000000000ch00
	bis $16,$16,$0		# E : return value
	ble $18,end_b		# U : zero length requested?

	addq $18,$16,$6		# E : max address to write to
	bis $1,$2,$17		# E : 000000000000chch
	insbl $1,2,$3		# U : 0000000000ch0000
	insbl $1,3,$4		# U : 00000000ch000000

	or $3,$4,$3		# E : 00000000chch0000
	inswl $17,4,$5		# U : 0000chch00000000
	xor $16,$6,$1		# E : will complete write be within one quadword?
	inswl $17,6,$2		# U : chch000000000000

	or $17,$3,$17		# E : 00000000chchchch
	or $2,$5,$2		# E : chchchch00000000
	bic $1,7,$1		# E : fit within a single quadword?
	and $16,7,$3		# E : Target addr misalignment

	or $17,$2,$17		# E : chchchchchchchch
	beq $1,within_quad_b	# U :
	nop			# E :
	beq $3,aligned_b	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

	.align 4
aligned_b:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_b	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_b	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_b	# U :

$alignmod64_b:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_b # U :

$bigalign_b:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued on for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_b:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_b	# U :

	nop
	nop
	nop
	beq	$3, no_quad_b	# U : Might have finished already

	.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_b:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_b		# U : more?

no_quad_b:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_b		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_b:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end_b:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end __memset

	/*
	 * This is the original body of code, prior to replication and
	 * rescheduling.  Leave it here, as there may be calls to this
	 * entry point.
	 */
	.align 4
	.ent __constant_c_memset
__constant_c_memset:
	.frame $30,0,$26,0
	.prologue 0

	addq $18,$16,$6		# E : max address to write to
	bis $16,$16,$0		# E : return value
	xor $16,$6,$1		# E : will complete write be within one quadword?
	ble $18,end		# U : zero length requested?

	bic $1,7,$1		# E : fit within a single quadword
	beq $1,within_one_quad	# U :
	and $16,7,$3		# E : Target addr misalignment
	beq $3,aligned		# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

	.align 4
aligned:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad		# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign	# U :

$alignmod64:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64	# U :

$bigalign:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued on for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64	# U :

	nop
	nop
	nop
	beq	$3, no_quad	# U : Might have finished already

	.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop		# U : more?

no_quad:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_one_quad:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end __constant_c_memset

	/*
	 * This is a replicant of the __constant_c_memset code, rescheduled
	 * to mask stalls.  Note that entry point names also had to change
	 */
	.align 5
	.ent __memsetw

__memsetw:
	.frame $30,0,$26,0
	.prologue 0

	inswl $17,0,$5		# U : 000000000000c1c2
	inswl $17,2,$2		# U : 00000000c1c20000
	bis $16,$16,$0		# E : return value
	addq $18,$16,$6		# E : max address to write to

	ble $18, end_w		# U : zero length requested?
	inswl $17,4,$3		# U : 0000c1c200000000
	inswl $17,6,$4		# U : c1c2000000000000
	xor $16,$6,$1		# E : will complete write be within one quadword?

	or $2,$5,$2		# E : 00000000c1c2c1c2
	or $3,$4,$17		# E : c1c2c1c200000000
	bic $1,7,$1		# E : fit within a single quadword
	and $16,7,$3		# E : Target addr misalignment

	or $17,$2,$17		# E : c1c2c1c2c1c2c1c2
	beq $1,within_quad_w	# U :
	nop
	beq $3,aligned_w	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

	.align 4
aligned_w:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_w	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_w	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_w	# U :

$alignmod64_w:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_w # U :

$bigalign_w:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued on for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_w:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_w	# U :

	nop
	nop
	nop
	beq	$3, no_quad_w	# U : Might have finished already

	.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_w:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_w		# U : more?

no_quad_w:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_w		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_w:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end_w:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :

	.end __memsetw

memset = __memset
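
For readers less familiar with Alpha assembly, the constant-replication sequence at the top of __memset (the insbl/inswl/or chain) is, in effect, the following C computation: spread the low byte of the fill argument into all eight byte lanes of a quadword so that one stq can set eight bytes per store. This is an illustrative sketch only, not part of the patch; replicate_byte() and replicate_word() are hypothetical helper names.

#include <stdint.h>

/* What the insbl/inswl/or sequence in __memset computes. */
static uint64_t replicate_byte(unsigned long c)
{
	uint64_t v = c & 0xff;		/* 00000000000000ch */

	v |= v << 8;			/* 000000000000chch */
	v |= v << 16;			/* 00000000chchchch */
	v |= v << 32;			/* chchchchchchchch */
	return v;
}

/* __memsetw does the same thing starting from a 16-bit pattern. */
static uint64_t replicate_word(unsigned long c)
{
	uint64_t v = c & 0xffff;	/* 000000000000c1c2 */

	v |= v << 16;			/* 00000000c1c2c1c2 */
	v |= v << 32;			/* c1c2c1c2c1c2c1c2 */
	return v;
}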
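
The overall store pattern (partial leading quadword, whole aligned quadwords, then 0..7 trailing bytes) can likewise be sketched in C. Note that the real code never stores single bytes: the head and tail are handled with ldq_u/insql/mskql and mskqh/insqh read-modify-write merges of whole quadwords, and the byte loops below only stand in for those merges. memset_model() is a hypothetical name used for illustration.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void *memset_model(void *dst, int c, size_t n)
{
	unsigned char *p = dst;
	uint64_t fill = (uint8_t)c;

	fill |= fill << 8;
	fill |= fill << 16;
	fill |= fill << 32;

	/* Head: bytes up to the next 8-byte boundary (one quadword
	 * read-modify-write in the assembly version). */
	while (n && ((uintptr_t)p & 7)) {
		*p++ = (unsigned char)c;
		n--;
	}

	/* Body: whole quadwords, one 8-byte store each. */
	while (n >= 8) {
		memcpy(p, &fill, 8);	/* stand-in for stq $17,0($5) */
		p += 8;
		n -= 8;
	}

	/* Tail: 0..7 trailing bytes (merged into the final quadword
	 * with mskqh/insqh in the assembly version). */
	while (n--)
		*p++ = (unsigned char)c;

	return dst;
}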
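
Finally, a rough C model of the unrolled $do_wh64 loop and its write-hint scheduling, under the assumption that wh64 can be thought of as a prefetch-for-write of an aligned 64-byte block; prefetch_for_write() and fill_blocks() are hypothetical stand-ins, not kernel interfaces.

#include <stdint.h>

/* wh64 has no portable C equivalent; this is only a placeholder. */
static inline void prefetch_for_write(void *addr)
{
	(void)addr;
}

/* p is 64-byte aligned; quads is the number of quadwords still to fill. */
static void fill_blocks(uint64_t *p, uint64_t fill, long quads)
{
	while (quads >= 8) {
		/* Hint two blocks ahead while plenty of work remains,
		 * otherwise fall back to the next block (the cmovlt). */
		void *hint = (quads - 24 >= 0) ? (void *)(p + 16)
					       : (void *)(p + 8);
		int i;

		prefetch_for_write(hint);
		for (i = 0; i < 8; i++)	/* eight stq's per 64-byte block */
			p[i] = fill;

		p += 8;
		quads -= 8;
	}
	/* Remaining 0..7 quadwords are left to the simple loop_b loop. */
}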