Linux-2.6.12-rc2 (tag: v2.6.12-rc2)

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
author: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 18:20:36 -0400
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 18:20:36 -0400
commit: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree: 0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/alpha/lib/ev6-copy_page.S
1 files changed, 203 insertions, 0 deletions
diff --git a/arch/alpha/lib/ev6-copy_page.S b/arch/alpha/lib/ev6-copy_page.S
new file mode 100644
index 000000000000..b789db192754
--- /dev/null
+++ b/arch/alpha/lib/ev6-copy_page.S
@@ -0,0 +1,203 @@
+/*
+ * arch/alpha/lib/ev6-copy_page.S
+ *
+ * Copy an entire page.
+ */
+/* The following comparison of this routine vs the normal copy_page.S
+   was written by an unnamed ev6 hardware designer and forwarded to me
+   via Steven Hobbs <hobbs@steven.zko.dec.com>.
+ 
+   First Problem: STQ overflows.
+   -----------------------------
+        It would be nice if EV6 handled every resource overflow efficiently,
+        but for some it doesn't.  Including store queue overflows.  It causes
+        a trap and a restart of the pipe.
+        To get around this we sometimes use (to borrow a term from a VSSAD
+        researcher) "aeration".  The idea is to slow the rate at which the
+        processor receives valid instructions by inserting nops in the fetch
+        path.  In doing so, you can prevent the overflow and actually make
+        the code run faster.  You can, of course, take advantage of the fact
+        that the processor can fetch at most 4 aligned instructions per cycle.
+        I inserted enough nops to force it to take 10 cycles to fetch the
+        loop code.  In theory, EV6 should be able to execute this loop in
+        9 cycles but I was not able to get it to run that fast -- the initial
+        conditions were such that I could not reach this optimum rate on
+        (chaotic) EV6.  I wrote the code such that everything would issue
+        in order. 
+   Second Problem: Dcache index matches.
+   -------------------------------------
+        If you are going to use this routine on random aligned pages, there
+        is a 25% chance that the pages will be at the same dcache indices.
+        This results in many nasty memory traps without care.
+        The solution is to schedule the prefetches to avoid the memory
+        conflicts.  I schedule the wh64 prefetches farther ahead of the
+        read prefetches to avoid this problem.
+   Third Problem: Needs more prefetching.
+   --------------------------------------
+        In order to improve the code I added deeper prefetching to take the
+        most advantage of EV6's bandwidth.
+        I also prefetched the read stream. Note that adding the read prefetch
+        forced me to add another cycle to the inner-most kernel - up to 11
+        from the original 8 cycles per iteration.  We could improve performance
+        further by unrolling the loop and doing multiple prefetches per cycle.
+   I think that the code below will be very robust and fast code for the
+   purposes of copying aligned pages.  It is slower when both source and
+   destination pages are in the dcache, but it is my guess that this is
+   less important than the dcache miss case.  */
+        .text
+        .align 4                        /* NOTE(review): presumably 2^4 = 16 bytes, i.e. one 4-instruction fetch group -- confirm for this assembler */
+        .global copy_page
+        .ent copy_page
+copy_page:      /* Copies 128 cache lines of 64 bytes (118 in the main loop
+                   plus 10 in the cleanup loop, 8192 bytes in total) from
+                   the address in $17 to the address in $16.
+                   Registers written here: $0-$7 (data being copied; $1 and
+                   $2 also hold write-hint addresses during setup), $18
+                   (loop counter), $19 (running write-hint address).
+                   $16 and $17 are advanced by 64 per line copied.  */
+        .prologue 0
+        /* Prefetch 5 read cachelines; write-hint 10 cache lines.
+           The instructions are laid out in groups of four, matching the
+           4-wide aligned fetch described in the header comment; the nops
+           fill out groups that would otherwise be short.  */
+        wh64    ($16)                   /* write-hint destination line 0 */
+        ldl     $31,0($17)              /* loads into $31 serve as the read prefetches */
+        ldl     $31,64($17)
+        lda     $1,1*64($16)            /* $1 = address of destination line 1 */
+        wh64    ($1)
+        ldl     $31,128($17)
+        ldl     $31,192($17)
+        lda     $1,2*64($16)
+        wh64    ($1)
+        ldl     $31,256($17)            /* fifth and last initial read prefetch */
+        lda     $18,118                 /* $18 = main-loop iteration count */
+        lda     $1,3*64($16)
+        wh64    ($1)
+        nop
+        lda     $1,4*64($16)
+        lda     $2,5*64($16)
+        wh64    ($1)
+        wh64    ($2)
+        lda     $1,6*64($16)
+        lda     $2,7*64($16)
+        wh64    ($1)
+        wh64    ($2)
+        lda     $1,8*64($16)
+        lda     $2,9*64($16)
+        wh64    ($1)
+        wh64    ($2)                    /* destination lines 0..9 are now write-hinted */
+        lda     $19,10*64($16)          /* $19 = next line to write-hint, 10 lines ahead of $16 */
+        nop
+        /* Main prefetching/write-hinting loop.  Each pass copies one
+           64-byte line through $0-$7, prefetches the source line 5
+           lines (320 bytes) ahead and write-hints the destination line
+           10 lines ahead via $19.  The 44 instructions form 11 groups
+           of four; the unops are the "aeration" padding explained in
+           the header comment, so their number and position matter.  */
+1:      ldq     $0,0($17)
+        ldq     $1,8($17)
+        unop
+        unop
+        unop
+        unop
+        ldq     $2,16($17)
+        ldq     $3,24($17)
+        ldq     $4,32($17)
+        ldq     $5,40($17)
+        unop
+        unop
+        unop
+        unop
+        ldq     $6,48($17)
+        ldq     $7,56($17)
+        ldl     $31,320($17)            /* read prefetch, 5 lines ahead of the current source line */
+        unop
+        unop
+        unop
+        /* This gives the extra cycle of aeration above the minimum.  */
+        unop                    
+        unop
+        unop
+        unop
+        wh64    ($19)                   /* write-hint, 10 lines ahead of the current destination line */
+        unop
+        unop
+        unop
+        stq     $0,0($16)
+        subq    $18,1,$18               /* one fewer line left for this loop */
+        stq     $1,8($16)
+        unop
+        unop
+        stq     $2,16($16)
+        addq    $17,64,$17              /* advance source; all loads for this line are already issued above */
+        stq     $3,24($16)
+        stq     $4,32($16)
+        stq     $5,40($16)
+        addq    $19,64,$19              /* advance write-hint address */
+        unop
+        stq     $6,48($16)
+        stq     $7,56($16)
+        addq    $16,64,$16              /* advance destination after the last store of the line */
+        bne     $18, 1b                 /* repeat until the counter reaches zero */
+        /* Prefetch the final 5 cache lines of the read stream.
+           $17 has already been advanced past the 118 lines copied
+           above, so offsets 320..576 address source lines 123..127;
+           lines up to 122 were prefetched inside the main loop.  */
+        lda     $18,10                  /* $18 = cleanup-loop iteration count */
+        ldl     $31,320($17)
+        ldl     $31,384($17)
+        ldl     $31,448($17)
+        ldl     $31,512($17)
+        ldl     $31,576($17)
+        nop
+        nop
+        /* Non-prefetching, non-write-hinting cleanup loop for the
+           final 10 cache lines.  */
+2:      ldq     $0,0($17)
+        ldq     $1,8($17)
+        ldq     $2,16($17)
+        ldq     $3,24($17)
+        ldq     $4,32($17)
+        ldq     $5,40($17)
+        ldq     $6,48($17)
+        ldq     $7,56($17)
+        stq     $0,0($16)
+        subq    $18,1,$18               /* one fewer line left for this loop */
+        stq     $1,8($16)
+        addq    $17,64,$17              /* advance source */
+        stq     $2,16($16)
+        stq     $3,24($16)
+        stq     $4,32($16)
+        stq     $5,40($16)
+        stq     $6,48($16)
+        stq     $7,56($16)
+        addq    $16,64,$16              /* advance destination */
+        bne     $18, 2b                 /* repeat until the counter reaches zero */
+        ret
+        nop                             /* the three instructions after ret complete its group of four; presumably alignment padding */
+        unop
+        nop
+        .end copy_page
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/alpha/lib/ev6-copy_page.S

diff --git a/arch/alpha/lib/ev6-copy_page.S b/arch/alpha/lib/ev6-copy_page.S
new file mode 100644
index 000000000000..b789db192754
--- /dev/null
+++ b/arch/alpha/lib/ev6-copy_page.S
@@ -0,0 +1,203 @@
	1	/*
	2	* arch/alpha/lib/ev6-copy_page.S
	3	*
	4	* Copy an entire page.
	5	*/
	6
	7	/* The following comparison of this routine vs the normal copy_page.S
	8	was written by an unnamed ev6 hardware designer and forwarded to me
	9	via Steven Hobbs <hobbs@steven.zko.dec.com>.
	10
	11	First Problem: STQ overflows.
	12	-----------------------------
	13
	14	It would be nice if EV6 handled every resource overflow efficiently,
	15	but for some it doesn't. Including store queue overflows. It causes
	16	a trap and a restart of the pipe.
	17
	18	To get around this we sometimes use (to borrow a term from a VSSAD
	19	researcher) "aeration". The idea is to slow the rate at which the
	20	processor receives valid instructions by inserting nops in the fetch
	21	path. In doing so, you can prevent the overflow and actually make
	22	the code run faster. You can, of course, take advantage of the fact
	23	that the processor can fetch at most 4 aligned instructions per cycle.
	24
	25	I inserted enough nops to force it to take 10 cycles to fetch the
	26	loop code. In theory, EV6 should be able to execute this loop in
	27	9 cycles but I was not able to get it to run that fast -- the initial
	28	conditions were such that I could not reach this optimum rate on
	29	(chaotic) EV6. I wrote the code such that everything would issue
	30	in order.
	31
	32	Second Problem: Dcache index matches.
	33	-------------------------------------
	34
	35	If you are going to use this routine on random aligned pages, there
	36	is a 25% chance that the pages will be at the same dcache indices.
	37	This results in many nasty memory traps without care.
	38
	39	The solution is to schedule the prefetches to avoid the memory
	40	conflicts. I schedule the wh64 prefetches farther ahead of the
	41	read prefetches to avoid this problem.
	42
	43	Third Problem: Needs more prefetching.
	44	--------------------------------------
	45
	46	In order to improve the code I added deeper prefetching to take the
	47	most advantage of EV6's bandwidth.
	48
	49	I also prefetched the read stream. Note that adding the read prefetch
	50	forced me to add another cycle to the inner-most kernel - up to 11
	51	from the original 8 cycles per iteration. We could improve performance
	52	further by unrolling the loop and doing multiple prefetches per cycle.
	53
	54	I think that the code below will be very robust and fast code for the
	55	purposes of copying aligned pages. It is slower when both source and
	56	destination pages are in the dcache, but it is my guess that this is
	57	less important than the dcache miss case. */
	58
	59
	60	.text
	61	.align 4	/* NOTE(review): presumably 2^4 = 16 bytes, i.e. one 4-instruction fetch group -- confirm for this assembler */
	62	.global copy_page
	63	.ent copy_page
	64	copy_page:	/* copies 128 lines of 64 bytes (118 main-loop + 10 cleanup passes) from the address in $17 to the address in $16; writes $0-$7, $18, $19 and advances $16/$17 */
	65	.prologue 0
	66
	67	/* Prefetch 5 read cachelines; write-hint 10 cache lines. */
	68	wh64 ($16)	/* write-hint destination line 0 */
	69	ldl $31,0($17)	/* loads into $31 serve as the read prefetches */
	70	ldl $31,64($17)
	71	lda $1,1*64($16)	/* $1 = address of destination line 1 */
	72
	73	wh64 ($1)
	74	ldl $31,128($17)
	75	ldl $31,192($17)
	76	lda $1,2*64($16)
	77
	78	wh64 ($1)
	79	ldl $31,256($17)	/* fifth and last initial read prefetch */
	80	lda $18,118	/* $18 = main-loop iteration count */
	81	lda $1,3*64($16)
	82
	83	wh64 ($1)
	84	nop
	85	lda $1,4*64($16)
	86	lda $2,5*64($16)
	87
	88	wh64 ($1)
	89	wh64 ($2)
	90	lda $1,6*64($16)
	91	lda $2,7*64($16)
	92
	93	wh64 ($1)
	94	wh64 ($2)
	95	lda $1,8*64($16)
	96	lda $2,9*64($16)
	97
	98	wh64 ($1)
	99	wh64 ($2)	/* destination lines 0..9 are now write-hinted */
	100	lda $19,10*64($16)	/* $19 = next line to write-hint, 10 lines ahead of $16 */
	101	nop
	102
	103	/* Main prefetching/write-hinting loop: one 64-byte line per pass, 44 instructions in 11 groups of four; the unops are deliberate "aeration" padding. */
	104	1: ldq $0,0($17)
	105	ldq $1,8($17)
	106	unop
	107	unop
	108
	109	unop
	110	unop
	111	ldq $2,16($17)
	112	ldq $3,24($17)
	113
	114	ldq $4,32($17)
	115	ldq $5,40($17)
	116	unop
	117	unop
	118
	119	unop
	120	unop
	121	ldq $6,48($17)
	122	ldq $7,56($17)
	123
	124	ldl $31,320($17)	/* read prefetch, 5 lines ahead of the current source line */
	125	unop
	126	unop
	127	unop
	128
	129	/* This gives the extra cycle of aeration above the minimum. */
	130	unop
	131	unop
	132	unop
	133	unop
	134
	135	wh64 ($19)	/* write-hint, 10 lines ahead of the current destination line */
	136	unop
	137	unop
	138	unop
	139
	140	stq $0,0($16)
	141	subq $18,1,$18	/* one fewer line left for this loop */
	142	stq $1,8($16)
	143	unop
	144
	145	unop
	146	stq $2,16($16)
	147	addq $17,64,$17	/* advance source; all loads for this line are already issued above */
	148	stq $3,24($16)
	149
	150	stq $4,32($16)
	151	stq $5,40($16)
	152	addq $19,64,$19	/* advance write-hint address */
	153	unop
	154
	155	stq $6,48($16)
	156	stq $7,56($16)
	157	addq $16,64,$16	/* advance destination after the last store of the line */
	158	bne $18, 1b	/* repeat until the counter reaches zero */
	159
	160	/* Prefetch the final 5 cache lines of the read stream ($17 is already past the 118 copied lines, so offsets 320..576 are source lines 123..127). */
	161	lda $18,10	/* $18 = cleanup-loop iteration count */
	162	ldl $31,320($17)
	163	ldl $31,384($17)
	164	ldl $31,448($17)
	165
	166	ldl $31,512($17)
	167	ldl $31,576($17)
	168	nop
	169	nop
	170
	171	/* Non-prefetching, non-write-hinting cleanup loop for the
	172	final 10 cache lines. */
	173	2: ldq $0,0($17)
	174	ldq $1,8($17)
	175	ldq $2,16($17)
	176	ldq $3,24($17)
	177
	178	ldq $4,32($17)
	179	ldq $5,40($17)
	180	ldq $6,48($17)
	181	ldq $7,56($17)
	182
	183	stq $0,0($16)
	184	subq $18,1,$18	/* one fewer line left for this loop */
	185	stq $1,8($16)
	186	addq $17,64,$17	/* advance source */
	187
	188	stq $2,16($16)
	189	stq $3,24($16)
	190	stq $4,32($16)
	191	stq $5,40($16)
	192
	193	stq $6,48($16)
	194	stq $7,56($16)
	195	addq $16,64,$16	/* advance destination */
	196	bne $18, 2b	/* repeat until the counter reaches zero */
	197
	198	ret
	199	nop	/* the three instructions after ret complete its group of four; presumably alignment padding */
	200	unop
	201	nop
	202
	203	.end copy_page