aboutsummaryrefslogtreecommitdiffstats
path: root/arch/alpha/lib/ev6-copy_user.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/alpha/lib/ev6-copy_user.S')
-rw-r--r--arch/alpha/lib/ev6-copy_user.S259
1 files changed, 259 insertions, 0 deletions
diff --git a/arch/alpha/lib/ev6-copy_user.S b/arch/alpha/lib/ev6-copy_user.S
new file mode 100644
index 000000000000..db42ffe9c350
--- /dev/null
+++ b/arch/alpha/lib/ev6-copy_user.S
@@ -0,0 +1,259 @@
1/*
2 * arch/alpha/lib/ev6-copy_user.S
3 *
4 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
5 *
6 * Copy to/from user space, handling exceptions as we go.. This
7 * isn't exactly pretty.
8 *
9 * This is essentially the same as "memcpy()", but with a few twists.
10 * Notably, we have to make sure that $0 is always up-to-date and
11 * contains the right "bytes left to copy" value (and that it is updated
12 * only _after_ a successful copy). There is also some rather minor
13 * exception setup stuff..
14 *
15 * NOTE! This is not directly C-callable, because the calling semantics are
16 * different:
17 *
18 * Inputs:
19 * length in $0
20 * destination address in $6
21 * source address in $7
22 * return address in $28
23 *
24 * Outputs:
25 * bytes left to copy in $0
26 *
27 * Clobbers:
28 * $1,$2,$3,$4,$5,$6,$7
29 *
30 * Much of the information about 21264 scheduling/coding comes from:
31 * Compiler Writer's Guide for the Alpha 21264
32 * abbreviated as 'CWG' in other comments here
33 * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
34 * Scheduling notation:
35 * E - either cluster
36 * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
37 * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
38 */
39
40/* Allow an exception for an insn; exit if we get one. */
/*
 * EXI wraps a *load* from user space: the insn sits at local label 99,
 * and an __ex_table entry pairs the insn's PC-relative offset
 * (.long 99b - .) with an lda whose displacement encodes the distance
 * to the $exitin fixup.
 * NOTE(review): the exact entry encoding is interpreted by the Alpha
 * fault handler -- confirm against the arch's exception-table format.
 */
41#define EXI(x,y...) \
42 99: x,##y; \
43 .section __ex_table,"a"; \
44 .long 99b - .; \
45 lda $31, $exitin-99b($31); \
46 .previous
47
/* EXO: same shape, for *stores* to user space; faults land at $exitout. */
48#define EXO(x,y...) \
49 99: x,##y; \
50 .section __ex_table,"a"; \
51 .long 99b - .; \
52 lda $31, $exitout-99b($31); \
53 .previous
54
55 .set noat
56 .align 4
57 .globl __copy_user
58 .ent __copy_user
59 # Pipeline info: Slotting & Comments
60__copy_user:
61 .prologue 0
62 subq $0, 32, $1 # .. E .. .. : Is this going to be a small copy?
63 beq $0, $zerolength # U .. .. .. : U L U L
64
65 and $6,7,$3 # .. .. .. E : is leading dest misalignment
66 ble $1, $onebyteloop # .. .. U .. : 1st branch : small amount of data
67 beq $3, $destaligned # .. U .. .. : 2nd (one cycle fetcher stall)
68 subq $3, 8, $3 # E .. .. .. : L U U L : trip counter
69/*
70 * The fetcher stall also hides the 1 cycle cross-cluster stall for $3 (L --> U)
71 * This loop aligns the destination a byte at a time
72 * We know we have at least one trip through this loop
73 */
74$aligndest:
75 EXI( ldbu $1,0($7) ) # .. .. .. L : Keep loads separate from stores
76 addq $6,1,$6 # .. .. E .. : Section 3.8 in the CWG
77 addq $3,1,$3 # .. E .. .. :
78 nop # E .. .. .. : U L U L
79
80/*
81 * the -1 is to compensate for the inc($6) done in a previous quadpack
82 * which allows us zero dependencies within either quadpack in the loop
83 */
84 EXO( stb $1,-1($6) ) # .. .. .. L :
85 addq $7,1,$7 # .. .. E .. : Section 3.8 in the CWG
86 subq $0,1,$0 # .. E .. .. :
87 bne $3, $aligndest # U .. .. .. : U L U L
88
89/*
90 * If we fell through into here, we have a minimum of 33 - 7 bytes
91 * If we arrived via branch, we have a minimum of 32 bytes
92 */
93$destaligned:
94 and $7,7,$1 # .. .. .. E : Check _current_ source alignment
95 bic $0,7,$4 # .. .. E .. : number bytes as a quadword loop
96 EXI( ldq_u $3,0($7) ) # .. L .. .. : Forward fetch for fallthrough code
97 beq $1,$quadaligned # U .. .. .. : U L U L
98
99/*
100 * In the worst case, we've just executed an ldq_u here from 0($7)
101 * and we'll repeat it once if we take the branch
102 */
103
104/* Misaligned quadword loop - not unrolled. Leave it that way. */
105$misquad:
106 EXI( ldq_u $2,8($7) ) # .. .. .. L :
107 subq $4,8,$4 # .. .. E .. :
108 extql $3,$7,$3 # .. U .. .. :
109 extqh $2,$7,$1 # U .. .. .. : U U L L
110
111 bis $3,$1,$1 # .. .. .. E :
112 EXO( stq $1,0($6) ) # .. .. L .. :
113 addq $7,8,$7 # .. E .. .. :
114 subq $0,8,$0 # E .. .. .. : U L L U
115
116 addq $6,8,$6 # .. .. .. E :
117 bis $2,$2,$3 # .. .. E .. : carry high quad to next trip (mov)
118 nop # .. E .. .. :
119 bne $4,$misquad # U .. .. .. : U L U L
120
121 nop # .. .. .. E
122 nop # .. .. E ..
123 nop # .. E .. ..
124 beq $0,$zerolength # U .. .. .. : U L U L
125
126/* We know we have at least one trip through the byte loop */
127 EXI ( ldbu $2,0($7) ) # .. .. .. L : No loads in the same quad
128 addq $6,1,$6 # .. .. E .. : as the store (Section 3.8 in CWG)
129 nop # .. E .. .. :
130 br $31, $dirtyentry # L0 .. .. .. : L U U L
131/* Do the trailing byte loop load, then hop into the store part of the loop */
132
133/*
134 * A minimum of (33 - 7) bytes to do a quad at a time.
135 * Based upon the usage context, it's worth the effort to unroll this loop
136 * $0 - number of bytes to be moved
137 * $4 - number of bytes to move as quadwords
138 * $6 is current destination address
139 * $7 is current source address
140 */
141$quadaligned:
142 subq $4, 32, $2 # .. .. .. E : do not unroll for small stuff
143 nop # .. .. E ..
144 nop # .. E .. ..
145 blt $2, $onequad # U .. .. .. : U L U L
146
147/*
148 * There is a significant assumption here that the source and destination
149 * addresses differ by more than 32 bytes. In this particular case, a
150 * sparsity of registers further bounds this to be a minimum of 8 bytes.
151 * But if this isn't met, then the output result will be incorrect.
152 * Furthermore, due to a lack of available registers, we really can't
153 * unroll this to be an 8x loop (which would enable us to use the wh64
154 * instruction memory hint instruction).
155 */
156$unroll4:
157 EXI( ldq $1,0($7) ) # .. .. .. L
158 EXI( ldq $2,8($7) ) # .. .. L ..
159 subq $4,32,$4 # .. E .. ..
160 nop # E .. .. .. : U U L L
161
162 addq $7,16,$7 # .. .. .. E
163 EXO( stq $1,0($6) ) # .. .. L ..
164 EXO( stq $2,8($6) ) # .. L .. ..
165 subq $0,16,$0 # E .. .. .. : U L L U
166
167 addq $6,16,$6 # .. .. .. E
168 EXI( ldq $1,0($7) ) # .. .. L ..
169 EXI( ldq $2,8($7) ) # .. L .. ..
170 subq $4, 32, $3 # E .. .. .. : U U L L : is there enough for another trip?
171
172 EXO( stq $1,0($6) ) # .. .. .. L
173 EXO( stq $2,8($6) ) # .. .. L ..
174 subq $0,16,$0 # .. E .. ..
175 addq $7,16,$7 # E .. .. .. : U L L U
176
177 nop # .. .. .. E
178 nop # .. .. E ..
179 addq $6,16,$6 # .. E .. ..
180 bgt $3,$unroll4 # U .. .. .. : U L U L
181
182 nop
183 nop
184 nop
185 beq $4, $noquads
186
/* Single-quadword loop: mops up any aligned quads the 4x loop left behind. */
187$onequad:
188 EXI( ldq $1,0($7) )
189 subq $4,8,$4
190 addq $7,8,$7
191 nop
192
193 EXO( stq $1,0($6) )
194 subq $0,8,$0
195 addq $6,8,$6
196 bne $4,$onequad
197
198$noquads:
199 nop
200 nop
201 nop
202 beq $0,$zerolength
203
204/*
205 * For small copies (or the tail of a larger copy), do a very simple byte loop.
206 * There's no point in doing a lot of complex alignment calculations to try to
207 * do quadword stuff for a small amount of data.
208 * $0 - remaining number of bytes left to copy
209 * $6 - current dest addr
210 * $7 - current source addr
211 */
212
213$onebyteloop:
214 EXI ( ldbu $2,0($7) ) # .. .. .. L : No loads in the same quad
215 addq $6,1,$6 # .. .. E .. : as the store (Section 3.8 in CWG)
216 nop # .. E .. .. :
217 nop # E .. .. .. : U L U L
218
219$dirtyentry:
220/*
221 * the -1 is to compensate for the inc($6) done in a previous quadpack
222 * which allows us zero dependencies within either quadpack in the loop
223 */
224 EXO ( stb $2,-1($6) ) # .. .. .. L :
225 addq $7,1,$7 # .. .. E .. : quadpack as the load
226 subq $0,1,$0 # .. E .. .. : change count _after_ copy
227 bgt $0,$onebyteloop # U .. .. .. : U L U L
228
229$zerolength:
230$exitout: # Normal exit; also the EXO (store-fault) fixup target -- $0 = bytes not copied
231 nop # .. .. .. E
232 nop # .. .. E ..
233 nop # .. E .. ..
234 ret $31,($28),1 # L0 .. .. .. : L U L U
235
236$exitin:
237
238 /* A stupid byte-by-byte zeroing of the rest of the output
239 buffer. This cures security holes by never leaving
240 random kernel data around to be copied elsewhere. */
241
242 nop
243 nop
244 nop
245 mov $0,$1 # count down in $1 so $0 (the return value) is preserved
246
247$101:
248 EXO ( stb $31,0($6) ) # L : if this store faults too, we just bail via $exitout
249 subq $1,1,$1 # E
250 addq $6,1,$6 # E
251 bgt $1,$101 # U
252
253 nop
254 nop
255 nop
256 ret $31,($28),1 # L0
257
258 .end __copy_user
259