Diffstat (limited to 'arch/arm64/lib/copy_template.S')
-rw-r--r--	arch/arm64/lib/copy_template.S	193
1 file changed, 193 insertions(+), 0 deletions(-)

diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
new file mode 100644
index 000000000000..410fbdb8163f
--- /dev/null
+++ b/arch/arm64/lib/copy_template.S
@@ -0,0 +1,193 @@
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
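/*
 * Note that every load and store below goes through the ldrb1/strb1,
 * ldrh1/strh1, ldr1/str1 and ldp1/stp1 macros, which the file that
 * includes this template is expected to define first. A minimal
 * sketch of what a plain (non-uaccess) includer might provide --
 * an assumption here, as the definitions are not part of this diff --
 * is a set of simple post-increment wrappers:
 *
 *	.macro ldrb1 ptr, regB, val
 *	ldrb	\ptr, [\regB], \val
 *	.endm
 *
 *	.macro strb1 ptr, regB, val
 *	strb	\ptr, [\regB], \val
 *	.endm
 *
 * and analogously ldrh1/strh1 (ldrh/strh), ldr1/str1 (ldr/str) and
 *
 *	.macro ldp1 ptr, regB, regC, val
 *	ldp	\ptr, \regB, [\regC], \val
 *	.endm
 *
 * A user-copy includer would instead wrap each access with an
 * exception-table fixup.
 */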
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14
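/* dst is a scratch copy of dstin, so that x0 survives as the return value. */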

	mov	dst, dstin
	cmp	count, #16
	/* When the copy is shorter than 16 bytes, the accesses are left unaligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading bytes from src to dst in increasing address
	 * order. This way there is no risk of overwriting source data
	 * even when dst is less than 16 bytes above src, and each access
	 * here is naturally aligned.
	 */
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
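	/*
	 * Worked example: if src is 5 bytes past a 16-byte boundary,
	 * tmp2 = (-src) & 15 = 11 = 0b1011, so the chain above copies
	 * 1 + 2 + 8 = 11 bytes, after which src is 16-byte aligned.
	 */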

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
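	/*
	 * The three ldp1/stp1 pairs above form a fall-through chain:
	 * (count & 0x30) == 0x30 enters at the top and copies 48 bytes,
	 * 0x20 enters at 1: and copies 32, and 0x10 enters at 2: and
	 * copies 16.
	 */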
.Ltiny15:
	/*
	 * Prefer to break one ldp/stp into several loads/stores so that
	 * memory is accessed in increasing address order, rather than
	 * copying 16 bytes from (src - 16) to (dst - 16) after winding
	 * src back to an aligned address, as the original cortex strings
	 * memcpy does. With that scheme, memmove could only call memcpy
	 * directly if src were at least 16 bytes above dst; otherwise
	 * the trailing copy would overwrite source data. Copying strictly
	 * forwards removes that precondition, keeps memmove simpler and
	 * decouples memcpy from memmove.
	 */
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
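	/*
	 * E.g. with 13 (0b1101) bytes remaining, the chain above copies
	 * 8 + 4 + 1 bytes, again in increasing address order.
	 */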

	b	.Lexitfunc

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then
	 * jump to the tail.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

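	/*
	 * count went negative at .Lcpy_over64 (length minus 128), but
	 * since 128 is a multiple of 64 its low six bits still equal
	 * the number of trailing bytes, which is all .Ltail63 needs.
	 */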
	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load 64 bytes of data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave the loads of the next 64-byte block with the stores
	 * of the previously loaded 64 bytes.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc:
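
For context, this template is not assembled on its own: a wrapper file (not shown in this diff) is expected to define the accessor macros and then pull the body in. A hypothetical plain-memcpy wrapper, with names assumed for illustration, might look like:

	ENTRY(memcpy)
	#include "copy_template.S"
		ret
	ENDPROC(memcpy)

Any fault fixup for user-space accesses would likewise live in the wrapper's macro definitions, not in the template itself.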