author		zhichang.yuan <zhichang.yuan@linaro.org>	2014-04-28 01:11:29 -0400
committer	Catalin Marinas <catalin.marinas@arm.com>	2014-05-23 10:06:53 -0400
commit		808dbac6b51f3441eb5a07724c0b0d1257046d51 (patch)
tree		9b4e842a2c76e0a82eece326ec51718ff9a8555d /arch
parent		74d2eb3cdb7bd6011229aa551bb36d45bcd327f4 (diff)
arm64: lib: Implement optimized memcpy routine
This patch, based on Linaro's Cortex Strings library, improves
the performance of the assembly optimized memcpy() function.
Signed-off-by: Zhichang Yuan <zhichang.yuan@linaro.org>
Signed-off-by: Deepak Saxena <dsaxena@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Diffstat (limited to 'arch')
-rw-r--r--	arch/arm64/lib/memcpy.S	192
1 file changed, 170 insertions, 22 deletions
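
In outline, the new routine first steps the source up to a 16-byte boundary, then moves 64 bytes per iteration of the main loop with four ldp/stp register pairs, and finally drains the remaining 0-63 bytes with progressively smaller copies. The C fragment below is only a rough editorial model of that control flow, not part of the commit: the function name memcpy_sketch and the byte-wise inner loops are illustrative stand-ins for the register-based ldp/stp sequences in the assembly.

	#include <stddef.h>
	#include <stdint.h>

	/*
	 * Illustrative C model of the copy strategy used by the optimized
	 * assembly routine; not the kernel implementation.
	 */
	void *memcpy_sketch(void *dstin, const void *srcin, size_t count)
	{
		unsigned char *dst = dstin;
		const unsigned char *src = srcin;

		if (count >= 16) {
			/* Head: step by 1/2/4/8 bytes until src is 16-byte aligned
			 * (the assembly computes this as (-src) & 15). */
			size_t head = (size_t)(-(uintptr_t)src & 15);

			count -= head;
			while (head--)
				*dst++ = *src++;

			/* Bulk: the assembly moves 64 bytes per iteration with four
			 * ldp/stp pairs, software-pipelined so loads for the next
			 * block overlap stores of the previous one. */
			while (count >= 64) {
				for (int i = 0; i < 64; i++)
					dst[i] = src[i];
				dst += 64;
				src += 64;
				count -= 64;
			}
		}

		/* Tail (.Ltail63/.Ltiny15): the remaining 0..63 bytes, copied in
		 * decreasing power-of-two chunks, simplified here to bytes. */
		while (count--)
			*dst++ = *src++;

		return dstin;
	}
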
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index 27b5003609b6..8a9a96d3ddae 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -1,5 +1,13 @@
 /*
  * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -16,6 +24,7 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 
 /*
  * Copy a buffer from src to dest (alignment handled by the hardware)
@@ -27,27 +36,166 @@
  * Returns:
  *	x0 - dest
  */
+dstin	.req	x0
+src	.req	x1
+count	.req	x2
+tmp1	.req	x3
+tmp1w	.req	w3
+tmp2	.req	x4
+tmp2w	.req	w4
+tmp3	.req	x5
+tmp3w	.req	w5
+dst	.req	x6
+
+A_l	.req	x7
+A_h	.req	x8
+B_l	.req	x9
+B_h	.req	x10
+C_l	.req	x11
+C_h	.req	x12
+D_l	.req	x13
+D_h	.req	x14
+
 ENTRY(memcpy)
-	mov	x4, x0
-	subs	x2, x2, #8
-	b.mi	2f
-1:	ldr	x3, [x1], #8
-	subs	x2, x2, #8
-	str	x3, [x4], #8
-	b.pl	1b
-2:	adds	x2, x2, #4
-	b.mi	3f
-	ldr	w3, [x1], #4
-	sub	x2, x2, #4
-	str	w3, [x4], #4
-3:	adds	x2, x2, #2
-	b.mi	4f
-	ldrh	w3, [x1], #2
-	sub	x2, x2, #2
-	strh	w3, [x4], #2
-4:	adds	x2, x2, #1
-	b.mi	5f
-	ldrb	w3, [x1]
-	strb	w3, [x4]
-5:	ret
+	mov	dst, dstin
+	cmp	count, #16
+	/*When memory length is less than 16, the accessed are not aligned.*/
+	b.lo	.Ltiny15
+
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
+	b.eq	.LSrcAligned
+	sub	count, count, tmp2
+	/*
+	* Copy the leading memory data from src to dst in an increasing
+	* address order.By this way,the risk of overwritting the source
+	* memory data is eliminated when the distance between src and
+	* dst is less than 16. The memory accesses here are alignment.
+	*/
+	tbz	tmp2, #0, 1f
+	ldrb	tmp1w, [src], #1
+	strb	tmp1w, [dst], #1
+1:
+	tbz	tmp2, #1, 2f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+2:
+	tbz	tmp2, #2, 3f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+3:
+	tbz	tmp2, #3, .LSrcAligned
+	ldr	tmp1, [src],#8
+	str	tmp1, [dst],#8
+
+.LSrcAligned:
+	cmp	count, #64
+	b.ge	.Lcpy_over64
+	/*
+	* Deal with small copies quickly by dropping straight into the
+	* exit block.
+	*/
+.Ltail63:
+	/*
+	* Copy up to 48 bytes of data. At this point we only need the
+	* bottom 6 bits of count to be accurate.
+	*/
+	ands	tmp1, count, #0x30
+	b.eq	.Ltiny15
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+1:
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+2:
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+.Ltiny15:
+	/*
+	* Prefer to break one ldp/stp into several load/store to access
+	* memory in an increasing address order,rather than to load/store 16
+	* bytes from (src-16) to (dst-16) and to backward the src to aligned
+	* address,which way is used in original cortex memcpy. If keeping
+	* the original memcpy process here, memmove need to satisfy the
+	* precondition that src address is at least 16 bytes bigger than dst
+	* address,otherwise some source data will be overwritten when memove
+	* call memcpy directly. To make memmove simpler and decouple the
+	* memcpy's dependency on memmove, withdrew the original process.
+	*/
+	tbz	count, #3, 1f
+	ldr	tmp1, [src], #8
+	str	tmp1, [dst], #8
+1:
+	tbz	count, #2, 2f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+2:
+	tbz	count, #1, 3f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+3:
+	tbz	count, #0, .Lexitfunc
+	ldrb	tmp1w, [src]
+	strb	tmp1w, [dst]
+
+.Lexitfunc:
+	ret
+
+.Lcpy_over64:
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large
+	/*
+	* Less than 128 bytes to copy, so handle 64 here and then jump
+	* to the tail.
+	*/
+	ldp	A_l, A_h, [src],#16
+	stp	A_l, A_h, [dst],#16
+	ldp	B_l, B_h, [src],#16
+	ldp	C_l, C_h, [src],#16
+	stp	B_l, B_h, [dst],#16
+	stp	C_l, C_h, [dst],#16
+	ldp	D_l, D_h, [src],#16
+	stp	D_l, D_h, [dst],#16
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	ret
+
+	/*
+	* Critical loop. Start at a new cache line boundary. Assuming
+	* 64 bytes per line this ensures the entire loop is in one line.
+	*/
+	.p2align	L1_CACHE_SHIFT
+.Lcpy_body_large:
+	/* pre-get 64 bytes data. */
+	ldp	A_l, A_h, [src],#16
+	ldp	B_l, B_h, [src],#16
+	ldp	C_l, C_h, [src],#16
+	ldp	D_l, D_h, [src],#16
+1:
+	/*
+	* interlace the load of next 64 bytes data block with store of the last
+	* loaded 64 bytes data.
+	*/
+	stp	A_l, A_h, [dst],#16
+	ldp	A_l, A_h, [src],#16
+	stp	B_l, B_h, [dst],#16
+	ldp	B_l, B_h, [src],#16
+	stp	C_l, C_h, [dst],#16
+	ldp	C_l, C_h, [src],#16
+	stp	D_l, D_h, [dst],#16
+	ldp	D_l, D_h, [src],#16
+	subs	count, count, #64
+	b.ge	1b
+	stp	A_l, A_h, [dst],#16
+	stp	B_l, B_h, [dst],#16
+	stp	C_l, C_h, [dst],#16
+	stp	D_l, D_h, [dst],#16
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	ret
 ENDPROC(memcpy)
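
For anyone porting or re-tuning this routine, a quick user-space cross-check against the C library memcpy over the size classes the code branches on (under 16, 16-63, 64-127, 128 and above) and all 16 head misalignments catches most regressions. The harness below is only a sketch under the assumption that the assembly is linked into a test binary; the symbol name memcpy_under_test and the HAVE_ASM_MEMCPY switch are invented for illustration.

	#include <assert.h>
	#include <stdio.h>
	#include <string.h>

	#ifdef HAVE_ASM_MEMCPY
	/* Hypothetical name for the assembly routine linked into the test. */
	extern void *memcpy_under_test(void *dst, const void *src, size_t n);
	#else
	#define memcpy_under_test memcpy	/* dry run against the C library */
	#endif

	int main(void)
	{
		static unsigned char src[4096 + 64], dst[4096 + 64], ref[4096 + 64];
		const size_t sizes[] = { 0, 1, 7, 15, 16, 31, 63, 64, 127, 128, 4096 };

		for (size_t i = 0; i < sizeof(src); i++)
			src[i] = (unsigned char)(i * 131 + 7);

		for (size_t s = 0; s < sizeof(sizes) / sizeof(sizes[0]); s++) {
			for (size_t off = 0; off < 16; off++) {
				/* Poison both buffers so overruns show up in memcmp. */
				memset(dst, 0xaa, sizeof(dst));
				memset(ref, 0xaa, sizeof(ref));
				memcpy_under_test(dst + off, src + off, sizes[s]);
				memcpy(ref + off, src + off, sizes[s]);
				assert(memcmp(dst, ref, sizeof(dst)) == 0);
			}
		}
		puts("all size/misalignment cases match");
		return 0;
	}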