1 files changed, 248 insertions, 0 deletions
diff --git a/arch/alpha/lib/ev6-memcpy.S b/arch/alpha/lib/ev6-memcpy.S
new file mode 100644
index 000000000000..52b37b0f2af5
--- /dev/null
+++ b/arch/alpha/lib/ev6-memcpy.S
@@ -0,0 +1,248 @@
+/*
+ * arch/alpha/lib/ev6-memcpy.S
+ * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
+ *
+ * Reasonably optimized memcpy() routine for the Alpha 21264
+ *
+ *      - bulk data moved as aligned quadwords, head/tail as single bytes
+ *      - uses wh64 to hint 64-byte destination blocks about to be written
+ *
+ * Much of the information about 21264 scheduling/coding comes from:
+ *      Compiler Writer's Guide for the Alpha 21264
+ *      abbreviated as 'CWG' in other comments here
+ *      ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
+ * Scheduling notation:
+ *      E       - either cluster
+ *      U       - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
+ *      L       - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
+ *
+ * Temp usage notes:
+ *      $1,$2 - scratch; $3-$6 - data (aligned path); $7 - wh64 address
+ */
+        .set noreorder
+        .set noat
+        .align  4
+        .globl memcpy
+        .ent memcpy
+memcpy:
+        .frame $30,0,$26,0
+        .prologue 0
+        /* In: $16 = dest, $17 = src, $18 = byte count.  Out: $0 = original dest. */
+        mov     $16, $0                 # E : copy dest to return
+        ble     $18, $nomoredata        # U : count <= 0, nothing to copy
+        xor     $16, $17, $1            # E : are source and dest alignments the same?
+        and     $1, 7, $1               # E : are they the same mod 8?
+        bne     $1, $misaligned         # U : Nope - gotta do this the slow way
+        /* source and dest are same mod 8 address */
+        and     $16, 7, $1              # E : Are both 0mod8?
+        beq     $1, $both_0mod8         # U : Yes
+        nop                             # E :
+        /*
+         * source and dest are same misalignment.  move a byte at a time
+         * until a 0mod8 alignment for both is reached.
+         * At least one byte more to move
+         */
+$head_align:
+        ldbu    $1, 0($17)              # L : grab a byte
+        subq    $18, 1, $18             # E : count--
+        addq    $17, 1, $17             # E : src++
+        stb     $1, 0($16)              # L : store the byte
+        addq    $16, 1, $16             # E : dest++
+        and     $16, 7, $1              # E : Are we at 0mod8 yet?
+        ble     $18, $nomoredata        # U : done with the copy?
+        bne     $1, $head_align         # U : dest not 0mod8 yet, next byte
+        /* Both pointers are 0mod8 here.  Unroll only if count >= 128. */
+$both_0mod8:
+        cmple   $18, 127, $1            # E : Can we unroll the loop?
+        bne     $1, $no_unroll          # U : count <= 127, skip the unrolled loop
+        and     $16, 63, $1             # E : get mod64 alignment
+        beq     $1, $do_unroll          # U : no single quads to fiddle
+        /* Copy single quads until dest is 64-byte aligned */
+$single_head_quad:
+        ldq     $1, 0($17)              # L : get 8 bytes
+        subq    $18, 8, $18             # E : count -= 8
+        addq    $17, 8, $17             # E : src += 8
+        nop                             # E :
+        stq     $1, 0($16)              # L : store
+        addq    $16, 8, $16             # E : dest += 8
+        and     $16, 63, $1             # E : get mod64 alignment
+        bne     $1, $single_head_quad   # U : still not fully aligned
+        /* dest is 0mod64: recheck count, since the head quads consumed some */
+$do_unroll:
+        addq    $16, 64, $7             # E : Initial (+1 trip) wh64 address
+        cmple   $18, 127, $1            # E : Can we go through the unrolled loop?
+        bne     $1, $tail_quads         # U : Nope
+        nop                             # E : 
+        /* Each trip copies 64 bytes as two groups of four quadwords */
+$unroll_body:
+        wh64    ($7)                    # L1 : memory subsystem hint: 64 bytes at
+                                        # ($7) are about to be over-written
+        ldq     $6, 0($17)              # L0 : bytes 0..7
+        nop                             # E :
+        nop                             # E :
+        ldq     $4, 8($17)              # L : bytes 8..15
+        ldq     $5, 16($17)             # L : bytes 16..23
+        addq    $7, 64, $7              # E : Update next wh64 address
+        nop                             # E :
+        ldq     $3, 24($17)             # L : bytes 24..31
+        addq    $16, 64, $1             # E : fallback wh64 address (next trip's dest)
+        nop                             # E :
+        nop                             # E :
+        addq    $17, 32, $17            # E : src += 32 bytes
+        stq     $6, 0($16)              # L : bytes 0..7
+        nop                             # E :
+        nop                             # E :
+        stq     $4, 8($16)              # L : bytes 8..15
+        stq     $5, 16($16)             # L : bytes 16..23
+        subq    $18, 192, $2            # E : At least two more trips to go?
+        nop                             # E :
+        stq     $3, 24($16)             # L : bytes 24..31
+        addq    $16, 32, $16            # E : dest += 32 bytes
+        nop                             # E :
+        nop                             # E :
+        ldq     $6, 0($17)              # L : bytes 32..39 of this trip
+        ldq     $4, 8($17)              # L : bytes 40..47
+        cmovlt  $2, $1, $7              # E : Latency 2, extra map slot - Use
+                                        # fallback wh64 address if < 2 more trips
+        nop                             # E :
+        ldq     $5, 16($17)             # L : bytes 48..55
+        ldq     $3, 24($17)             # L : bytes 56..63
+        addq    $16, 32, $16            # E : dest += 32
+        subq    $18, 64, $18            # E : count -= 64
+        addq    $17, 32, $17            # E : src += 32
+        stq     $6, -32($16)            # L : bytes 32..39 (dest already advanced)
+        stq     $4, -24($16)            # L : bytes 40..47
+        cmple   $18, 63, $1             # E : At least one more trip?
+        stq     $5, -16($16)            # L : bytes 48..55
+        stq     $3, -8($16)             # L : bytes 56..63
+        nop                             # E :
+        beq     $1, $unroll_body        # U : count >= 64, go around again
+        /* Under 128 bytes left: copy remaining whole quads one at a time */
+$tail_quads:
+$no_unroll:
+        .align 4
+        subq    $18, 8, $18             # E : At least a quad left?
+        blt     $18, $less_than_8       # U : Nope
+        nop                             # E :
+        nop                             # E :
+$move_a_quad:
+        ldq     $1, 0($17)              # L : fetch 8
+        subq    $18, 8, $18             # E : count -= 8
+        addq    $17, 8, $17             # E : src += 8
+        nop                             # E :
+        stq     $1, 0($16)              # L : store 8
+        addq    $16, 8, $16             # E : dest += 8
+        bge     $18, $move_a_quad       # U : another full quad left
+        nop                             # E :
+$less_than_8:
+        .align 4
+        addq    $18, 8, $18             # E : add back for trailing bytes
+        ble     $18, $nomoredata        # U : All-done
+        nop                             # E :
+        nop                             # E :
+        /* Trailing bytes */
+$tail_bytes:
+        subq    $18, 1, $18             # E : count--
+        ldbu    $1, 0($17)              # L : fetch a byte
+        addq    $17, 1, $17             # E : src++
+        nop                             # E :
+        stb     $1, 0($16)              # L : store a byte
+        addq    $16, 1, $16             # E : dest++
+        bgt     $18, $tail_bytes        # U : more to be done?
+        nop                             # E :
+        /* branching to exit takes 3 extra cycles, so replicate exit here */
+        ret     $31, ($26), 1           # L0 :
+        nop                             # E :
+        nop                             # E :
+        nop                             # E :
+        /* src and dest differ mod 8: align dest, then merge unaligned src quads */
+$misaligned:
+        mov     $0, $4                  # E : $4 = working dest ($0 = original dest)
+        and     $0, 7, $1               # E : dest alignment mod8
+        beq     $1, $dest_0mod8         # U : dest is already 0mod8
+        nop
+        /* Copy bytes until dest ($4) is 0mod8 or count runs out */
+$aligndest:
+        ble     $18, $nomoredata        # U : count exhausted while aligning
+        ldbu    $1, 0($17)              # L : fetch a byte
+        subq    $18, 1, $18             # E : count--
+        addq    $17, 1, $17             # E : src++
+        stb     $1, 0($4)               # L : store it
+        addq    $4, 1, $4               # E : dest++
+        and     $4, 7, $1               # E : dest 0mod8 yet?
+        bne     $1, $aligndest          # U : go until we are aligned.
+        /* Source has unknown alignment, but dest is known to be 0mod8 */
+$dest_0mod8:
+        subq    $18, 8, $18             # E : At least a quad left?
+        blt     $18, $misalign_tail     # U : Nope
+        ldq_u   $3, 0($17)              # L : seed (rotating load) of 8 bytes
+        nop                             # E :
+        /* $3 holds the previous ldq_u data; combine it with the next quad */
+$mis_quad:
+        ldq_u   $16, 8($17)             # L : Fetch next 8 ($16 is a data temp here)
+        extql   $3, $17, $3             # U : bytes from the current quad
+        extqh   $16, $17, $1            # U : bytes from the next quad
+        bis     $3, $1, $1              # E : merged bytes to store
+        subq    $18, 8, $18             # E : count -= 8
+        addq    $17, 8, $17             # E : src += 8
+        stq     $1, 0($4)               # L : store 8 (aligned)
+        mov     $16, $3                 # E : "rotate" source data
+        addq    $4, 8, $4               # E : dest += 8
+        bge     $18, $mis_quad          # U : More quads to move
+        nop
+        nop
+        /* Fewer than 8 bytes remain: finish a byte at a time */
+$misalign_tail:
+        addq    $18, 8, $18             # E : account for tail stuff
+        ble     $18, $nomoredata        # U : nothing left
+        nop
+        nop
+$misalign_byte:
+        ldbu    $1, 0($17)              # L : fetch 1
+        subq    $18, 1, $18             # E : count--
+        addq    $17, 1, $17             # E : src++
+        nop                             # E :
+        stb     $1, 0($4)               # L : store
+        addq    $4, 1, $4               # E : dest++
+        bgt     $18, $misalign_byte     # U : more to go?
+        nop
+        /* Common exit: $0 still holds the original dest */
+$nomoredata:
+        ret     $31, ($26), 1           # L0 :
+        nop                             # E :
+        nop                             # E :
+        nop                             # E :
+        .end memcpy
+/* For backwards module compatibility.  */
+__memcpy = memcpy
+.globl __memcpy

diff --git a/arch/alpha/lib/ev6-memcpy.S b/arch/alpha/lib/ev6-memcpy.S new file mode 100644 index 000000000000..52b37b0f2af5 --- /dev/null +++ b/arch/alpha/lib/ev6-memcpy.S
@@ -0,0 +1,248 @@
	1	/*
	2	* arch/alpha/lib/ev6-memcpy.S
	3	* 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
	4	*
	5	* Reasonably optimized memcpy() routine for the Alpha 21264
	6	*
	7	* - bulk data moved as aligned quadwords, head/tail as single bytes
	8	* - uses wh64 to hint 64-byte destination blocks about to be written
	9	*
	10	* Much of the information about 21264 scheduling/coding comes from:
	11	* Compiler Writer's Guide for the Alpha 21264
	12	* abbreviated as 'CWG' in other comments here
	13	* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
	14	* Scheduling notation:
	15	* E - either cluster
	16	* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
	17	* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
	18	*
	19	* Temp usage notes:
	20	* $1,$2 - scratch; $3-$6 - data (aligned path); $7 - wh64 address
	21	*/
	22
	23	.set noreorder
	24	.set noat
	25
	26	.align 4
	27	.globl memcpy
	28	.ent memcpy
	29	memcpy:
	30	.frame $30,0,$26,0
	31	.prologue 0
	32	/* In: $16 = dest, $17 = src, $18 = byte count.  Out: $0 = original dest. */
	33	mov $16, $0 # E : copy dest to return
	34	ble $18, $nomoredata # U : count <= 0, nothing to copy
	35	xor $16, $17, $1 # E : are source and dest alignments the same?
	36	and $1, 7, $1 # E : are they the same mod 8?
	37
	38	bne $1, $misaligned # U : Nope - gotta do this the slow way
	39	/* source and dest are same mod 8 address */
	40	and $16, 7, $1 # E : Are both 0mod8?
	41	beq $1, $both_0mod8 # U : Yes
	42	nop # E :
	43
	44	/*
	45	* source and dest are same misalignment. move a byte at a time
	46	* until a 0mod8 alignment for both is reached.
	47	* At least one byte more to move
	48	*/
	49
	50	$head_align:
	51	ldbu $1, 0($17) # L : grab a byte
	52	subq $18, 1, $18 # E : count--
	53	addq $17, 1, $17 # E : src++
	54	stb $1, 0($16) # L : store the byte
	55	addq $16, 1, $16 # E : dest++
	56	and $16, 7, $1 # E : Are we at 0mod8 yet?
	57	ble $18, $nomoredata # U : done with the copy?
	58	bne $1, $head_align # U : dest not 0mod8 yet, next byte
	59	/* Both pointers are 0mod8 here.  Unroll only if count >= 128. */
	60	$both_0mod8:
	61	cmple $18, 127, $1 # E : Can we unroll the loop?
	62	bne $1, $no_unroll # U : count <= 127, skip the unrolled loop
	63	and $16, 63, $1 # E : get mod64 alignment
	64	beq $1, $do_unroll # U : no single quads to fiddle
	65	/* Copy single quads until dest is 64-byte aligned */
	66	$single_head_quad:
	67	ldq $1, 0($17) # L : get 8 bytes
	68	subq $18, 8, $18 # E : count -= 8
	69	addq $17, 8, $17 # E : src += 8
	70	nop # E :
	71
	72	stq $1, 0($16) # L : store
	73	addq $16, 8, $16 # E : dest += 8
	74	and $16, 63, $1 # E : get mod64 alignment
	75	bne $1, $single_head_quad # U : still not fully aligned
	76	/* dest is 0mod64: recheck count, since the head quads consumed some */
	77	$do_unroll:
	78	addq $16, 64, $7 # E : Initial (+1 trip) wh64 address
	79	cmple $18, 127, $1 # E : Can we go through the unrolled loop?
	80	bne $1, $tail_quads # U : Nope
	81	nop # E :
	82	/* Each trip copies 64 bytes as two groups of four quadwords */
	83	$unroll_body:
	84	wh64 ($7) # L1 : memory subsystem hint: 64 bytes at
	85	# ($7) are about to be over-written
	86	ldq $6, 0($17) # L0 : bytes 0..7
	87	nop # E :
	88	nop # E :
	89
	90	ldq $4, 8($17) # L : bytes 8..15
	91	ldq $5, 16($17) # L : bytes 16..23
	92	addq $7, 64, $7 # E : Update next wh64 address
	93	nop # E :
	94
	95	ldq $3, 24($17) # L : bytes 24..31
	96	addq $16, 64, $1 # E : fallback wh64 address (next trip's dest)
	97	nop # E :
	98	nop # E :
	99
	100	addq $17, 32, $17 # E : src += 32 bytes
	101	stq $6, 0($16) # L : bytes 0..7
	102	nop # E :
	103	nop # E :
	104
	105	stq $4, 8($16) # L : bytes 8..15
	106	stq $5, 16($16) # L : bytes 16..23
	107	subq $18, 192, $2 # E : At least two more trips to go?
	108	nop # E :
	109
	110	stq $3, 24($16) # L : bytes 24..31
	111	addq $16, 32, $16 # E : dest += 32 bytes
	112	nop # E :
	113	nop # E :
	114
	115	ldq $6, 0($17) # L : bytes 32..39 of this trip
	116	ldq $4, 8($17) # L : bytes 40..47
	117	cmovlt $2, $1, $7 # E : Latency 2, extra map slot - Use
	118	# fallback wh64 address if < 2 more trips
	119	nop # E :
	120
	121	ldq $5, 16($17) # L : bytes 48..55
	122	ldq $3, 24($17) # L : bytes 56..63
	123	addq $16, 32, $16 # E : dest += 32
	124	subq $18, 64, $18 # E : count -= 64
	125
	126	addq $17, 32, $17 # E : src += 32
	127	stq $6, -32($16) # L : bytes 32..39 (dest already advanced)
	128	stq $4, -24($16) # L : bytes 40..47
	129	cmple $18, 63, $1 # E : At least one more trip?
	130
	131	stq $5, -16($16) # L : bytes 48..55
	132	stq $3, -8($16) # L : bytes 56..63
	133	nop # E :
	134	beq $1, $unroll_body # U : count >= 64, go around again
	135	/* Under 128 bytes left: copy remaining whole quads one at a time */
	136	$tail_quads:
	137	$no_unroll:
	138	.align 4
	139	subq $18, 8, $18 # E : At least a quad left?
	140	blt $18, $less_than_8 # U : Nope
	141	nop # E :
	142	nop # E :
	143
	144	$move_a_quad:
	145	ldq $1, 0($17) # L : fetch 8
	146	subq $18, 8, $18 # E : count -= 8
	147	addq $17, 8, $17 # E : src += 8
	148	nop # E :
	149
	150	stq $1, 0($16) # L : store 8
	151	addq $16, 8, $16 # E : dest += 8
	152	bge $18, $move_a_quad # U : another full quad left
	153	nop # E :
	154
	155	$less_than_8:
	156	.align 4
	157	addq $18, 8, $18 # E : add back for trailing bytes
	158	ble $18, $nomoredata # U : All-done
	159	nop # E :
	160	nop # E :
	161
	162	/* Trailing bytes */
	163	$tail_bytes:
	164	subq $18, 1, $18 # E : count--
	165	ldbu $1, 0($17) # L : fetch a byte
	166	addq $17, 1, $17 # E : src++
	167	nop # E :
	168
	169	stb $1, 0($16) # L : store a byte
	170	addq $16, 1, $16 # E : dest++
	171	bgt $18, $tail_bytes # U : more to be done?
	172	nop # E :
	173
	174	/* branching to exit takes 3 extra cycles, so replicate exit here */
	175	ret $31, ($26), 1 # L0 :
	176	nop # E :
	177	nop # E :
	178	nop # E :
	179	/* src and dest differ mod 8: align dest, then merge unaligned src quads */
	180	$misaligned:
	181	mov $0, $4 # E : $4 = working dest ($0 = original dest)
	182	and $0, 7, $1 # E : dest alignment mod8
	183	beq $1, $dest_0mod8 # U : dest is already 0mod8
	184	nop
	185	/* Copy bytes until dest ($4) is 0mod8 or count runs out */
	186	$aligndest:
	187	ble $18, $nomoredata # U : count exhausted while aligning
	188	ldbu $1, 0($17) # L : fetch a byte
	189	subq $18, 1, $18 # E : count--
	190	addq $17, 1, $17 # E : src++
	191
	192	stb $1, 0($4) # L : store it
	193	addq $4, 1, $4 # E : dest++
	194	and $4, 7, $1 # E : dest 0mod8 yet?
	195	bne $1, $aligndest # U : go until we are aligned.
	196
	197	/* Source has unknown alignment, but dest is known to be 0mod8 */
	198	$dest_0mod8:
	199	subq $18, 8, $18 # E : At least a quad left?
	200	blt $18, $misalign_tail # U : Nope
	201	ldq_u $3, 0($17) # L : seed (rotating load) of 8 bytes
	202	nop # E :
	203	/* $3 holds the previous ldq_u data; combine it with the next quad */
	204	$mis_quad:
	205	ldq_u $16, 8($17) # L : Fetch next 8 ($16 is a data temp here)
	206	extql $3, $17, $3 # U : bytes from the current quad
	207	extqh $16, $17, $1 # U : bytes from the next quad
	208	bis $3, $1, $1 # E : merged bytes to store
	209
	210	subq $18, 8, $18 # E : count -= 8
	211	addq $17, 8, $17 # E : src += 8
	212	stq $1, 0($4) # L : store 8 (aligned)
	213	mov $16, $3 # E : "rotate" source data
	214
	215	addq $4, 8, $4 # E : dest += 8
	216	bge $18, $mis_quad # U : More quads to move
	217	nop
	218	nop
	219	/* Fewer than 8 bytes remain: finish a byte at a time */
	220	$misalign_tail:
	221	addq $18, 8, $18 # E : account for tail stuff
	222	ble $18, $nomoredata # U : nothing left
	223	nop
	224	nop
	225
	226	$misalign_byte:
	227	ldbu $1, 0($17) # L : fetch 1
	228	subq $18, 1, $18 # E : count--
	229	addq $17, 1, $17 # E : src++
	230	nop # E :
	231
	232	stb $1, 0($4) # L : store
	233	addq $4, 1, $4 # E : dest++
	234	bgt $18, $misalign_byte # U : more to go?
	235	nop
	236
	237	/* Common exit: $0 still holds the original dest */
	238	$nomoredata:
	239	ret $31, ($26), 1 # L0 :
	240	nop # E :
	241	nop # E :
	242	nop # E :
	243
	244	.end memcpy
	245
	246	/* For backwards module compatibility. */
	247	__memcpy = memcpy
	248	.globl __memcpy