author      zhichang.yuan <zhichang.yuan@linaro.org>    2014-04-28 01:11:30 -0400
committer   Catalin Marinas <catalin.marinas@arm.com>   2014-05-23 10:07:35 -0400
commit      280adc1951c0c9fc8f2d85571ff563a1c412b1cd (patch)
tree        89be4d00570a72fb8b4c30216f6b45d017181e78 /arch/arm64
parent      808dbac6b51f3441eb5a07724c0b0d1257046d51 (diff)
arm64: lib: Implement optimized memmove routine
This patch, based on Linaro's Cortex Strings library, improves
the performance of the assembly optimized memmove() function.
Signed-off-by: Zhichang Yuan <zhichang.yuan@linaro.org>
Signed-off-by: Deepak Saxena <dsaxena@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Diffstat (limited to 'arch/arm64')
-rw-r--r--    arch/arm64/lib/memmove.S    190
1 file changed, 165 insertions, 25 deletions
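
For readers skimming the patch, the core idea is the usual overlap-safe dispatch: fall through to the optimized memcpy whenever the buffers cannot collide, and copy downwards from the end when the destination overlaps the source from above. A minimal C sketch of that dispatch follows; it is illustrative only (memmove_model and fwd_copy are made-up names for this sketch, and fwd_copy merely stands in for the kernel's optimized forward memcpy), not the kernel code.

/*
 * Illustrative only: a plain C model of the dispatch the patched memmove
 * performs before touching any data. memmove_model and fwd_copy are
 * hypothetical names; fwd_copy stands in for the optimized forward memcpy.
 */
#include <stddef.h>
#include <stdint.h>

static void fwd_copy(unsigned char *d, const unsigned char *s, size_t n)
{
	while (n--)		/* placeholder for the real memcpy */
		*d++ = *s++;
}

void *memmove_model(void *dest, const void *source, size_t count)
{
	unsigned char *d = dest;
	const unsigned char *s = source;

	/* "b.lo memcpy": destination below source, a forward copy is safe. */
	if ((uintptr_t)d < (uintptr_t)s) {
		fwd_copy(d, s, count);
		return dest;
	}

	/* "b.hs memcpy": destination starts at or beyond src + count,
	 * so the buffers do not overlap at all. */
	if ((uintptr_t)d >= (uintptr_t)s + count) {
		fwd_copy(d, s, count);
		return dest;
	}

	/* Overlapping, with dest above src: copy downwards from the end. */
	while (count--)
		d[count] = s[count];
	return dest;
}

In the diff below, this is exactly what the first five instructions after ENTRY(memmove) do: cmp dstin, src / b.lo memcpy handles the first case, and add tmp1, src, count / cmp dstin, tmp1 / b.hs memcpy handles the non-overlapping case.
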
diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S
index b79fdfa42d39..57b19ea2dad4 100644
--- a/arch/arm64/lib/memmove.S
+++ b/arch/arm64/lib/memmove.S
@@ -1,5 +1,13 @@
 /*
  * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -16,6 +24,7 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 
 /*
  * Move a buffer from src to test (alignment handled by the hardware).
@@ -28,30 +37,161 @@
  * Returns:
  *	x0 - dest
  */
+dstin	.req	x0
+src	.req	x1
+count	.req	x2
+tmp1	.req	x3
+tmp1w	.req	w3
+tmp2	.req	x4
+tmp2w	.req	w4
+tmp3	.req	x5
+tmp3w	.req	w5
+dst	.req	x6
+
+A_l	.req	x7
+A_h	.req	x8
+B_l	.req	x9
+B_h	.req	x10
+C_l	.req	x11
+C_h	.req	x12
+D_l	.req	x13
+D_h	.req	x14
+
 ENTRY(memmove)
-	cmp	x0, x1
-	b.ls	memcpy
-	add	x4, x0, x2
-	add	x1, x1, x2
-	subs	x2, x2, #8
-	b.mi	2f
-1:	ldr	x3, [x1, #-8]!
-	subs	x2, x2, #8
-	str	x3, [x4, #-8]!
-	b.pl	1b
-2:	adds	x2, x2, #4
-	b.mi	3f
-	ldr	w3, [x1, #-4]!
-	sub	x2, x2, #4
-	str	w3, [x4, #-4]!
-3:	adds	x2, x2, #2
-	b.mi	4f
-	ldrh	w3, [x1, #-2]!
-	sub	x2, x2, #2
-	strh	w3, [x4, #-2]!
-4:	adds	x2, x2, #1
-	b.mi	5f
-	ldrb	w3, [x1, #-1]
-	strb	w3, [x4, #-1]
-5:	ret
+	cmp	dstin, src
+	b.lo	memcpy
+	add	tmp1, src, count
+	cmp	dstin, tmp1
+	b.hs	memcpy		/* No overlap. */
+
+	add	dst, dstin, count
+	add	src, src, count
+	cmp	count, #16
+	b.lo	.Ltail15  /*probably non-alignment accesses.*/
+
+	ands	tmp2, src, #15     /* Bytes to reach alignment. */
+	b.eq	.LSrcAligned
+	sub	count, count, tmp2
+	/*
+	* process the aligned offset length to make the src aligned firstly.
+	* those extra instructions' cost is acceptable. It also make the
+	* coming accesses are based on aligned address.
+	*/
+	tbz	tmp2, #0, 1f
+	ldrb	tmp1w, [src, #-1]!
+	strb	tmp1w, [dst, #-1]!
+1:
+	tbz	tmp2, #1, 2f
+	ldrh	tmp1w, [src, #-2]!
+	strh	tmp1w, [dst, #-2]!
+2:
+	tbz	tmp2, #2, 3f
+	ldr	tmp1w, [src, #-4]!
+	str	tmp1w, [dst, #-4]!
+3:
+	tbz	tmp2, #3, .LSrcAligned
+	ldr	tmp1, [src, #-8]!
+	str	tmp1, [dst, #-8]!
+
+.LSrcAligned:
+	cmp	count, #64
+	b.ge	.Lcpy_over64
+
+	/*
+	* Deal with small copies quickly by dropping straight into the
+	* exit block.
+	*/
+.Ltail63:
+	/*
+	* Copy up to 48 bytes of data. At this point we only need the
+	* bottom 6 bits of count to be accurate.
+	*/
+	ands	tmp1, count, #0x30
+	b.eq	.Ltail15
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	ldp	A_l, A_h, [src, #-16]!
+	stp	A_l, A_h, [dst, #-16]!
+1:
+	ldp	A_l, A_h, [src, #-16]!
+	stp	A_l, A_h, [dst, #-16]!
+2:
+	ldp	A_l, A_h, [src, #-16]!
+	stp	A_l, A_h, [dst, #-16]!
+
+.Ltail15:
+	tbz	count, #3, 1f
+	ldr	tmp1, [src, #-8]!
+	str	tmp1, [dst, #-8]!
+1:
+	tbz	count, #2, 2f
+	ldr	tmp1w, [src, #-4]!
+	str	tmp1w, [dst, #-4]!
+2:
+	tbz	count, #1, 3f
+	ldrh	tmp1w, [src, #-2]!
+	strh	tmp1w, [dst, #-2]!
+3:
+	tbz	count, #0, .Lexitfunc
+	ldrb	tmp1w, [src, #-1]
+	strb	tmp1w, [dst, #-1]
+
+.Lexitfunc:
+	ret
+
+.Lcpy_over64:
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large
+	/*
+	* Less than 128 bytes to copy, so handle 64 bytes here and then jump
+	* to the tail.
+	*/
+	ldp	A_l, A_h, [src, #-16]
+	stp	A_l, A_h, [dst, #-16]
+	ldp	B_l, B_h, [src, #-32]
+	ldp	C_l, C_h, [src, #-48]
+	stp	B_l, B_h, [dst, #-32]
+	stp	C_l, C_h, [dst, #-48]
+	ldp	D_l, D_h, [src, #-64]!
+	stp	D_l, D_h, [dst, #-64]!
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	ret
+
+	/*
+	* Critical loop. Start at a new cache line boundary. Assuming
+	* 64 bytes per line this ensures the entire loop is in one line.
+	*/
+	.p2align	L1_CACHE_SHIFT
+.Lcpy_body_large:
+	/* pre-load 64 bytes data. */
+	ldp	A_l, A_h, [src, #-16]
+	ldp	B_l, B_h, [src, #-32]
+	ldp	C_l, C_h, [src, #-48]
+	ldp	D_l, D_h, [src, #-64]!
+1:
+	/*
+	* interlace the load of next 64 bytes data block with store of the last
+	* loaded 64 bytes data.
+	*/
+	stp	A_l, A_h, [dst, #-16]
+	ldp	A_l, A_h, [src, #-16]
+	stp	B_l, B_h, [dst, #-32]
+	ldp	B_l, B_h, [src, #-32]
+	stp	C_l, C_h, [dst, #-48]
+	ldp	C_l, C_h, [src, #-48]
+	stp	D_l, D_h, [dst, #-64]!
+	ldp	D_l, D_h, [src, #-64]!
+	subs	count, count, #64
+	b.ge	1b
+	stp	A_l, A_h, [dst, #-16]
+	stp	B_l, B_h, [dst, #-32]
+	stp	C_l, C_h, [dst, #-48]
+	stp	D_l, D_h, [dst, #-64]!
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	ret
 ENDPROC(memmove)
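
For the overlapping case itself, the new body copies from the high end downwards: it first peels 1/2/4/8-byte chunks until src is 16-byte aligned, then moves one 64-byte block per iteration (the four ldp/stp pairs above), and finally mops up the sub-block tail. A rough C model of that shape is sketched below; copy_backward is a hypothetical name for this sketch, the peel step is simplified to byte-at-a-time, and the load/store interleaving, the .Ltail63/.Ltail15 branching, and the cache-line alignment of the hot loop are deliberately left out.

/*
 * Illustrative only: a rough C model of the backward-copy shape used by the
 * patched routine. The 64-byte block is staged through a local buffer to
 * mirror "load the whole block into registers, then store it".
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void copy_backward(unsigned char *dst, const unsigned char *src,
			  size_t count)
{
	unsigned char block[64];

	/* As in the assembly, point both cursors one past the last byte. */
	dst += count;
	src += count;

	/* Peel bytes until the source is 16-byte aligned so the block loop
	 * below runs on aligned addresses. */
	while (count && ((uintptr_t)src & 15)) {
		*--dst = *--src;
		count--;
	}

	/* Main loop: one 64-byte block per iteration, walking downwards.
	 * Loading the block before storing it keeps overlapping src/dst
	 * ranges correct, just as ldp-before-stp does in the assembly. */
	while (count >= 64) {
		src -= 64;
		dst -= 64;
		count -= 64;
		memcpy(block, src, 64);
		memcpy(dst, block, 64);
	}

	/* Tail: whatever is left below one block. */
	while (count--)
		*--dst = *--src;
}

The real routine additionally unrolls the sub-64-byte tail (.Ltail63 and .Ltail15) and aligns the hot loop to a cache line with .p2align L1_CACHE_SHIFT, which is why the patch adds the asm/cache.h include.
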