Diffstat (limited to 'arch/arm/lib')

 arch/arm/lib/copy_template.S | 12 ++----------
 arch/arm/lib/memmove.S       | 14 +++-----------
 arch/arm/lib/memset.S        | 46 ++++++++++++++++++++++++++++++++++++++++++++++++++
 arch/arm/lib/memzero.S       | 44 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 95 insertions(+), 21 deletions(-)
diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
index cab355c0c1f7..139cce646055 100644
--- a/arch/arm/lib/copy_template.S
+++ b/arch/arm/lib/copy_template.S
@@ -13,14 +13,6 @@
  */
 
 /*
- * This can be used to enable code to cacheline align the source pointer.
- * Experiments on tested architectures (StrongARM and XScale) didn't show
- * this a worthwhile thing to do. That might be different in the future.
- */
-//#define CALGN(code...) code
-#define CALGN(code...)
-
-/*
  * Theory of operation
  * -------------------
  *
@@ -82,7 +74,7 @@
 		stmfd	sp!, {r5 - r8}
 		blt	5f
 
-	CALGN(	ands	ip, r1, #31		)
+	CALGN(	ands	ip, r0, #31		)
 	CALGN(	rsb	r3, ip, #32		)	
 	CALGN(	sbcnes	r4, r3, r2		)	@ C is always set here
 	CALGN(	bcs	2f			)
@@ -168,7 +160,7 @@
 		subs	r2, r2, #28
 		blt	14f
 
-	CALGN(	ands	ip, r1, #31		)
+	CALGN(	ands	ip, r0, #31		)
 	CALGN(	rsb	ip, ip, #32		)
 	CALGN(	sbcnes	r4, ip, r2		)	@ C is always set here
 	CALGN(	subcc	r2, r2, ip		)
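
Both copy_template.S hunks change the CALGN probe from r1 (the source pointer) to r0 (the destination). That is consistent with the removed comment: aligning the source bought nothing on StrongARM and XScale, while the new memset/__memzero code below aligns the destination so that whole 32-byte cache lines are written in one burst. The per-file CALGN definitions are dropped here, presumably in favour of a shared one (memmove.S already includes <asm/assembler.h>). A minimal C sketch of what the "ands ip, r0, #31" / "rsb r3, ip, #32" pair computes; bytes_to_cacheline() is a hypothetical name, and the real code uses the flags set by ands to skip the already-aligned case instead of returning 32:

#include <stdint.h>

/* Hypothetical helper mirroring "ands ip, r0, #31; rsb r3, ip, #32":
 * distance from dst up to the next 32-byte cache line boundary. */
static inline uint32_t bytes_to_cacheline(const void *dst)
{
	uint32_t off = (uintptr_t)dst & 31;	/* ands ip, r0, #31 */
	return 32 - off;			/* rsb r3, ip, #32 (yields 32 when already aligned) */
}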
diff --git a/arch/arm/lib/memmove.S b/arch/arm/lib/memmove.S
index ef7fddc14ac9..2e301b7bd8f1 100644
--- a/arch/arm/lib/memmove.S
+++ b/arch/arm/lib/memmove.S
@@ -13,14 +13,6 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-/*
- * This can be used to enable code to cacheline align the source pointer.
- * Experiments on tested architectures (StrongARM and XScale) didn't show
- * this a worthwhile thing to do. That might be different in the future.
- */
-//#define CALGN(code...) code
-#define CALGN(code...)
-
 .text
 
 /*
@@ -55,11 +47,12 @@ ENTRY(memmove)
 		stmfd	sp!, {r5 - r8}
 		blt	5f
 
-	CALGN(	ands	ip, r1, #31		)
+	CALGN(	ands	ip, r0, #31		)
 	CALGN(	sbcnes	r4, ip, r2		)	@ C is always set here
 	CALGN(	bcs	2f			)
 	CALGN(	adr	r4, 6f			)
 	CALGN(	subs	r2, r2, ip		)	@ C is set here
+	CALGN(	rsb	ip, ip, #32		)
 	CALGN(	add	pc, r4, ip		)
 
 	PLD(	pld	[r1, #-4]		)
@@ -138,8 +131,7 @@ ENTRY(memmove)
 		subs	r2, r2, #28
 		blt	14f
 
-	CALGN(	ands	ip, r1, #31		)
-	CALGN(	rsb	ip, ip, #32		)
+	CALGN(	ands	ip, r0, #31		)
 	CALGN(	sbcnes	r4, ip, r2		)	@ C is always set here
 	CALGN(	subcc	r2, r2, ip		)
 	CALGN(	bcc	15f			)
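
One extra subtlety in memmove: this routine copies downwards, so the number of bytes to store before the descending destination hits a 32-byte boundary is addr & 31 itself, not 32 - (addr & 31). The first hunk therefore moves the rsb to after "subs r2, r2, ip", where it only serves the computed-goto offset of "add pc, r4, ip", and the second hunk drops the rsb entirely. A one-line sketch of the descending-case arithmetic, with a hypothetical helper name:

#include <stdint.h>

/* Bytes a descending store pointer must cover before it reaches a
 * 32-byte boundary; mirrors "ands ip, r0, #31" in the memmove hunks. */
static inline uint32_t bytes_to_cacheline_down(const void *dst_end)
{
	return (uintptr_t)dst_end & 31;
}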
diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
index 95b110b07a89..b477d4ac88ef 100644
--- a/arch/arm/lib/memset.S
+++ b/arch/arm/lib/memset.S
@@ -39,6 +39,9 @@ ENTRY(memset)
 	mov	r3, r1
 	cmp	r2, #16
 	blt	4f
+
+#if ! CALGN(1)+0
+
 /*
  * We need an extra register for this loop - save the return address and
  * use the LR
@@ -64,6 +67,49 @@ ENTRY(memset)
 	stmneia	r0!, {r1, r3, ip, lr}
 	ldr	lr, [sp], #4
 
+#else
+
+/*
+ * This version aligns the destination pointer in order to write
+ * whole cache lines at once.
+ */
+
+	stmfd	sp!, {r4-r7, lr}
+	mov	r4, r1
+	mov	r5, r1
+	mov	r6, r1
+	mov	r7, r1
+	mov	ip, r1
+	mov	lr, r1
+
+	cmp	r2, #96
+	tstgt	r0, #31
+	ble	3f
+
+	and	ip, r0, #31
+	rsb	ip, ip, #32
+	sub	r2, r2, ip
+	movs	ip, ip, lsl #(32 - 4)
+	stmcsia	r0!, {r4, r5, r6, r7}
+	stmmiia	r0!, {r4, r5}
+	tst	ip, #(1 << 30)
+	mov	ip, r1
+	strne	r1, [r0], #4
+
+3:	subs	r2, r2, #64
+	stmgeia	r0!, {r1, r3-r7, ip, lr}
+	stmgeia	r0!, {r1, r3-r7, ip, lr}
+	bgt	3b
+	ldmeqfd	sp!, {r4-r7, pc}
+
+	tst	r2, #32
+	stmneia	r0!, {r1, r3-r7, ip, lr}
+	tst	r2, #16
+	stmneia	r0!, {r4-r7}
+	ldmfd	sp!, {r4-r7, lr}
+
+#endif
+
 4:	tst	r2, #8
 	stmneia	r0!, {r1, r3}
 	tst	r2, #4
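
Two tricks in the memset hunks deserve spelling out. First, the preprocessor guard: with "#define CALGN(code...) code" the expression "! CALGN(1)+0" evaluates to !1+0 == 0, so the original loop is compiled out and the #else branch is used; with the empty "#define CALGN(code...)" it becomes !0 == 1 and the original code stays. Second, the alignment head avoids compares entirely: after "movs ip, ip, lsl #(32 - 4)" the carry flag holds bit 4 of the gap to the next cache line and the sign flag holds bit 3, so stmcsia stores 16 bytes and stmmiia stores 8 without a single branch, and "tst ip, #(1 << 30)" picks up what was bit 2 for the final 4-byte store. A C model of that head, under the assumption (true at this point in the routine) that r0 is already word-aligned; memset_align_head() and its interface are illustrative only:

#include <stdint.h>
#include <string.h>

/* Hypothetical C model of the CALGN head; the real code keeps
 * everything in registers and dispatches on the shifted flags. */
static uint8_t *memset_align_head(uint8_t *dst, uint8_t c, uint32_t *n)
{
	/* cmp r2, #96; tstgt r0, #31; ble 3f */
	if (*n > 96 && ((uintptr_t)dst & 31)) {
		uint32_t gap = 32 - ((uintptr_t)dst & 31);	/* 4..28, multiple of 4 */
		*n -= gap;					/* sub r2, r2, ip */
		if (gap & 16) { memset(dst, c, 16); dst += 16; }	/* C flag -> stmcsia */
		if (gap & 8)  { memset(dst, c, 8);  dst += 8;  }	/* N flag -> stmmiia */
		if (gap & 4)  { memset(dst, c, 4);  dst += 4;  }	/* bit 30 -> strne   */
	}
	return dst;	/* now on a 32-byte boundary for the block-store loop */
}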
diff --git a/arch/arm/lib/memzero.S b/arch/arm/lib/memzero.S
index abf2508e8221..b8f79d80ee9b 100644
--- a/arch/arm/lib/memzero.S
+++ b/arch/arm/lib/memzero.S
@@ -39,6 +39,9 @@ ENTRY(__memzero)
  */
 	cmp	r1, #16			@ 1 we can skip this chunk if we
 	blt	4f			@ 1 have < 16 bytes
+
+#if ! CALGN(1)+0
+
 /*
  * We need an extra register for this loop - save the return address and
  * use the LR
@@ -64,6 +67,47 @@ ENTRY(__memzero)
 	stmneia	r0!, {r2, r3, ip, lr}	@ 4
 	ldr	lr, [sp], #4		@ 1
 
+#else
+
+/*
+ * This version aligns the destination pointer in order to write
+ * whole cache lines at once.
+ */
+
+	stmfd	sp!, {r4-r7, lr}
+	mov	r4, r2
+	mov	r5, r2
+	mov	r6, r2
+	mov	r7, r2
+	mov	ip, r2
+	mov	lr, r2
+
+	cmp	r1, #96
+	andgts	ip, r0, #31
+	ble	3f
+
+	rsb	ip, ip, #32
+	sub	r1, r1, ip
+	movs	ip, ip, lsl #(32 - 4)
+	stmcsia	r0!, {r4, r5, r6, r7}
+	stmmiia	r0!, {r4, r5}
+	movs	ip, ip, lsl #2
+	strcs	r2, [r0], #4
+
+3:	subs	r1, r1, #64
+	stmgeia	r0!, {r2-r7, ip, lr}
+	stmgeia	r0!, {r2-r7, ip, lr}
+	bgt	3b
+	ldmeqfd	sp!, {r4-r7, pc}
+
+	tst	r1, #32
+	stmneia	r0!, {r2-r7, ip, lr}
+	tst	r1, #16
+	stmneia	r0!, {r4-r7}
+	ldmfd	sp!, {r4-r7, lr}
+
+#endif
+
 4:	tst	r1, #8			@ 1 8 bytes or more?
 	stmneia	r0!, {r2, r3}		@ 2
 	tst	r1, #4			@ 1 4 bytes or more?
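
__memzero's copy of the head is slightly tighter than memset's. "andgts ip, r0, #31" folds the alignment test into the size check: the and only executes when r1 > 96 (gt after "cmp r1, #96"), and its flags drive the "ble 3f". And instead of memset's "tst ip, #(1 << 30)" plus the "mov ip, r1" reload, the second "movs ip, ip, lsl #2" does double duty: it shifts what was bit 2 of the gap into the carry for strcs, and since the gap is a multiple of 4 here (r0 is word-aligned) the only bits left in ip are zero, so ip already holds the zero value the following block stores expect. The aligned body is then the same shape in both routines: 64 bytes per iteration from two 8-register stores, followed by 32- and 16-byte tail peeling. A C model of that body, with zero_aligned_body() as a hypothetical name and dst assumed 32-byte aligned:

#include <stdint.h>
#include <string.h>

/* Hypothetical C model of the aligned CALGN store loop. */
static void zero_aligned_body(uint8_t *dst, uint32_t n)
{
	while (n >= 64) {	/* subs r1, r1, #64; 2 x stmgeia; bgt 3b */
		memset(dst, 0, 64);	/* two 8-register stores = 64 bytes */
		dst += 64;
		n -= 64;
	}
	if (n & 32) { memset(dst, 0, 32); dst += 32; }	/* tst r1, #32 */
	if (n & 16) { memset(dst, 0, 16); dst += 16; }	/* tst r1, #16 */
	/* the remaining 0..15 bytes fall through to the existing "4:" tail */
}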