x86-64, mem: Convert memmove() to assembly file and fix return value bug

memmove_64.c only implements the memmove() function, which is written entirely in inline assembly code. It therefore doesn't make sense to keep the assembly code in a .c file.

Currently memmove() doesn't store the return value in rax. This may cause problems if a caller uses the return value. This patch fixes the issue.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
LKML-Reference: <1295314755-6625-1-git-send-email-fenghua.yu@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
author: Fenghua Yu <fenghua.yu@intel.com> 2011-01-17 20:39:15 -0500
committer: H. Peter Anvin <hpa@linux.intel.com> 2011-01-25 19:58:39 -0500
commit: 9599ec0471deae24044241e2173090d2cbc0e899 (patch)
tree: 7ff508aefdb075ce62ef59e6218588eacedeff7f /arch/x86/lib
parent: 1bae4ce27c9c90344f23c65ea6966c50ffeae2f5 (diff)
2 files changed, 197 insertions, 192 deletions
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
new file mode 100644
index 000000000000..0ecb8433e5a8
--- /dev/null
+++ b/arch/x86/lib/memmove_64.S
@@ -0,0 +1,197 @@
+/*
+ * Normally compiler builtins are used, but sometimes the compiler calls out
+ * of line code. Based on asm-i386/string.h.
+ *
+ * This assembly file is re-written from memmove_64.c file.
+ *      - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
+ */
+#define _STRING_C
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#undef memmove
+/*
+ * Implement memmove(). This can handle overlap between src and dst.
+ *
+ * Input:
+ * rdi: dest
+ * rsi: src
+ * rdx: count
+ *
+ * Output:
+ * rax: dest
+ */
+ENTRY(memmove)
+        CFI_STARTPROC
+        /* Counts of 32 bytes or more are handled by a loop or by movsq */
+        mov %rdi, %rax                  /* rax = dest, the return value */
+        cmp $0x20, %rdx
+        jb      1f                      /* count < 32: only the tail copy at 1: is needed */
+        /* Decide forward/backward copy mode */
+        cmp %rdi, %rsi
+        jb      2f                      /* src below dest (unsigned): copy backward */
+        /*
+         * The movsq instruction has a high startup latency,
+         * so smaller sizes are copied through general registers.
+         */
+        cmp  $680, %rdx
+        jb      3f
+        /*
+         * movsq is only good for the aligned case (equal low address bytes).
+         */
+        cmpb %dil, %sil
+        je 4f
+3:
+        sub $0x20, %rdx                 /* bias count by -32 so the loop's sub borrows on its last pass */
+        /*
+         * We gobble 32 bytes forward in each loop iteration.
+         */
+5:
+        sub $0x20, %rdx                 /* sets CF for the jae below; the movq/leaq in between leave flags alone */
+        movq 0*8(%rsi), %r11
+        movq 1*8(%rsi), %r10
+        movq 2*8(%rsi), %r9
+        movq 3*8(%rsi), %r8
+        leaq 4*8(%rsi), %rsi
+        movq %r11, 0*8(%rdi)
+        movq %r10, 1*8(%rdi)
+        movq %r9, 2*8(%rdi)
+        movq %r8, 3*8(%rdi)
+        leaq 4*8(%rdi), %rdi
+        jae 5b
+        addq $0x20, %rdx                /* undo the bias: rdx = bytes left (fewer than 32) */
+        jmp 1f
+        /*
+         * Handle data forward by movsq.
+         */
+        .p2align 4
+4:
+        movq %rdx, %rcx
+        movq -8(%rsi, %rdx), %r11       /* r11 = last 8 bytes of src, read before the copy */
+        lea -8(%rdi, %rdx), %r10        /* r10 = address of the last 8 bytes of dest */
+        shrq $3, %rcx                   /* rcx = count / 8 = number of quadwords */
+        rep movsq
+        movq %r11, (%r10)               /* store the saved last 8 bytes; covers any count % 8 remainder */
+        jmp 13f
+        /*
+         * Handle data backward by movsq.
+         */
+        .p2align 4
+7:
+        movq %rdx, %rcx
+        movq (%rsi), %r11               /* r11 = first 8 bytes of src, read before the copy */
+        movq %rdi, %r10                 /* r10 = start of dest */
+        leaq -8(%rsi, %rdx), %rsi       /* rsi = last quadword of src */
+        leaq -8(%rdi, %rdx), %rdi       /* rdi = last quadword of dest */
+        shrq $3, %rcx                   /* rcx = count / 8 = number of quadwords */
+        std                             /* set the direction flag: movsq walks downward */
+        rep movsq
+        cld                             /* clear the direction flag again */
+        movq %r11, (%r10)               /* store the saved first 8 bytes; covers any count % 8 remainder */
+        jmp 13f
+        /*
+         * Prepare for the backward copy (src is below dest).
+         */
+        .p2align 4
+2:
+        cmp $680, %rdx
+        jb 6f                           /* count < 680: use the register loop */
+        cmp %dil, %sil
+        je 7b                           /* equal low address bytes: use backward movsq */
+6:
+        /*
+         * Point rsi/rdi just past the end of src/dest.
+         */
+        addq %rdx, %rsi
+        addq %rdx, %rdi
+        subq $0x20, %rdx                /* bias count by -32 so the loop's sub borrows on its last pass */
+        /*
+         * We gobble 32 bytes backward in each loop iteration.
+         */
+8:
+        subq $0x20, %rdx                /* sets CF for the jae below; the movq/leaq in between leave flags alone */
+        movq -1*8(%rsi), %r11
+        movq -2*8(%rsi), %r10
+        movq -3*8(%rsi), %r9
+        movq -4*8(%rsi), %r8
+        leaq -4*8(%rsi), %rsi
+        movq %r11, -1*8(%rdi)
+        movq %r10, -2*8(%rdi)
+        movq %r9, -3*8(%rdi)
+        movq %r8, -4*8(%rdi)
+        leaq -4*8(%rdi), %rdi
+        jae 8b
+        /*
+         * Undo the bias, then step rsi/rdi back to the head of the uncopied bytes.
+         */
+        addq $0x20, %rdx
+        subq %rdx, %rsi
+        subq %rdx, %rdi
+1:
+        cmpq $16, %rdx
+        jb 9f
+        /*
+         * Move 16 to 31 bytes: first 16 and last 16, all loads before any store.
+         */
+        movq 0*8(%rsi), %r11
+        movq 1*8(%rsi), %r10
+        movq -2*8(%rsi, %rdx), %r9
+        movq -1*8(%rsi, %rdx), %r8
+        movq %r11, 0*8(%rdi)
+        movq %r10, 1*8(%rdi)
+        movq %r9, -2*8(%rdi, %rdx)
+        movq %r8, -1*8(%rdi, %rdx)
+        jmp 13f
+        .p2align 4
+9:
+        cmpq $8, %rdx
+        jb 10f
+        /*
+         * Move 8 to 15 bytes: first 8 and last 8, both loads before any store.
+         */
+        movq 0*8(%rsi), %r11
+        movq -1*8(%rsi, %rdx), %r10
+        movq %r11, 0*8(%rdi)
+        movq %r10, -1*8(%rdi, %rdx)
+        jmp 13f
+10:
+        cmpq $4, %rdx
+        jb 11f
+        /*
+         * Move 4 to 7 bytes: first 4 and last 4, both loads before any store.
+         */
+        movl (%rsi), %r11d
+        movl -4(%rsi, %rdx), %r10d
+        movl %r11d, (%rdi)
+        movl %r10d, -4(%rdi, %rdx)
+        jmp 13f
+11:
+        cmp $2, %rdx
+        jb 12f
+        /*
+         * Move 2 to 3 bytes: first 2 and last 2, both loads before any store.
+         */
+        movw (%rsi), %r11w
+        movw -2(%rsi, %rdx), %r10w
+        movw %r11w, (%rdi)
+        movw %r10w, -2(%rdi, %rdx)
+        jmp 13f
+12:
+        cmp $1, %rdx
+        jb 13f                          /* nothing left to copy */
+        /*
+         * Move the single remaining byte.
+         */
+        movb (%rsi), %r11b
+        movb %r11b, (%rdi)
+13:
+        retq                            /* rax still holds the original dest */
+        CFI_ENDPROC
+ENDPROC(memmove)
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
deleted file mode 100644
index 6d0f0ec41b34..000000000000
--- a/arch/x86/lib/memmove_64.c
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Normally compiler builtins are used, but sometimes the compiler calls out
-   of line code. Based on asm-i386/string.h.
- */
-#define _STRING_C
-#include <linux/string.h>
-#include <linux/module.h>
-#undef memmove
-void *memmove(void *dest, const void *src, size_t count)
-{
-        unsigned long d0,d1,d2,d3,d4,d5,d6,d7;
-        char *ret;
-        __asm__ __volatile__(
-                /* Handle more 32bytes in loop */
-                "mov %2, %3\n\t"
-                "cmp $0x20, %0\n\t"
-                "jb     1f\n\t"
-                /* Decide forward/backward copy mode */
-                "cmp %2, %1\n\t"
-                "jb     2f\n\t"
-                /*
-                 * movsq instruction have many startup latency
-                 * so we handle small size by general register.
-                 */
-                "cmp  $680, %0\n\t"
-                "jb 3f\n\t"
-                /*
-                 * movsq instruction is only good for aligned case.
-                 */
-                "cmpb %%dil, %%sil\n\t"
-                "je 4f\n\t"
-                "3:\n\t"
-                "sub $0x20, %0\n\t"
-                /*
-                 * We gobble 32byts forward in each loop.
-                 */
-                "5:\n\t"
-                "sub $0x20, %0\n\t"
-                "movq 0*8(%1), %4\n\t"
-                "movq 1*8(%1), %5\n\t"
-                "movq 2*8(%1), %6\n\t"
-                "movq 3*8(%1), %7\n\t"
-                "leaq 4*8(%1), %1\n\t"
-                "movq %4, 0*8(%2)\n\t"
-                "movq %5, 1*8(%2)\n\t"
-                "movq %6, 2*8(%2)\n\t"
-                "movq %7, 3*8(%2)\n\t"
-                "leaq 4*8(%2), %2\n\t"
-                "jae 5b\n\t"
-                "addq $0x20, %0\n\t"
-                "jmp 1f\n\t"
-                /*
-                 * Handle data forward by movsq.
-                 */
-                ".p2align 4\n\t"
-                "4:\n\t"
-                "movq %0, %8\n\t"
-                "movq -8(%1, %0), %4\n\t"
-                "lea -8(%2, %0), %5\n\t"
-                "shrq $3, %8\n\t"
-                "rep movsq\n\t"
-                "movq %4, (%5)\n\t"
-                "jmp 13f\n\t"
-                /*
-                 * Handle data backward by movsq.
-                 */
-                ".p2align 4\n\t"
-                "7:\n\t"
-                "movq %0, %8\n\t"
-                "movq (%1), %4\n\t"
-                "movq %2, %5\n\t"
-                "leaq -8(%1, %0), %1\n\t"
-                "leaq -8(%2, %0), %2\n\t"
-                "shrq $3, %8\n\t"
-                "std\n\t"
-                "rep movsq\n\t"
-                "cld\n\t"
-                "movq %4, (%5)\n\t"
-                "jmp 13f\n\t"
-                /*
-                 * Start to prepare for backward copy.
-                 */
-                ".p2align 4\n\t"
-                "2:\n\t"
-                "cmp $680, %0\n\t"
-                "jb 6f \n\t"
-                "cmp %%dil, %%sil\n\t"
-                "je 7b \n\t"
-                "6:\n\t"
-                /*
-                 * Calculate copy position to tail.
-                 */
-                "addq %0, %1\n\t"
-                "addq %0, %2\n\t"
-                "subq $0x20, %0\n\t"
-                /*
-                 * We gobble 32byts backward in each loop.
-                 */
-                "8:\n\t"
-                "subq $0x20, %0\n\t"
-                "movq -1*8(%1), %4\n\t"
-                "movq -2*8(%1), %5\n\t"
-                "movq -3*8(%1), %6\n\t"
-                "movq -4*8(%1), %7\n\t"
-                "leaq -4*8(%1), %1\n\t"
-                "movq %4, -1*8(%2)\n\t"
-                "movq %5, -2*8(%2)\n\t"
-                "movq %6, -3*8(%2)\n\t"
-                "movq %7, -4*8(%2)\n\t"
-                "leaq -4*8(%2), %2\n\t"
-                "jae 8b\n\t"
-                /*
-                 * Calculate copy position to head.
-                 */
-                "addq $0x20, %0\n\t"
-                "subq %0, %1\n\t"
-                "subq %0, %2\n\t"
-                "1:\n\t"
-                "cmpq $16, %0\n\t"
-                "jb 9f\n\t"
-                /*
-                 * Move data from 16 bytes to 31 bytes.
-                 */
-                "movq 0*8(%1), %4\n\t"
-                "movq 1*8(%1), %5\n\t"
-                "movq -2*8(%1, %0), %6\n\t"
-                "movq -1*8(%1, %0), %7\n\t"
-                "movq %4, 0*8(%2)\n\t"
-                "movq %5, 1*8(%2)\n\t"
-                "movq %6, -2*8(%2, %0)\n\t"
-                "movq %7, -1*8(%2, %0)\n\t"
-                "jmp 13f\n\t"
-                ".p2align 4\n\t"
-                "9:\n\t"
-                "cmpq $8, %0\n\t"
-                "jb 10f\n\t"
-                /*
-                 * Move data from 8 bytes to 15 bytes.
-                 */
-                "movq 0*8(%1), %4\n\t"
-                "movq -1*8(%1, %0), %5\n\t"
-                "movq %4, 0*8(%2)\n\t"
-                "movq %5, -1*8(%2, %0)\n\t"
-                "jmp 13f\n\t"
-                "10:\n\t"
-                "cmpq $4, %0\n\t"
-                "jb 11f\n\t"
-                /*
-                 * Move data from 4 bytes to 7 bytes.
-                 */
-                "movl (%1), %4d\n\t"
-                "movl -4(%1, %0), %5d\n\t"
-                "movl %4d, (%2)\n\t"
-                "movl %5d, -4(%2, %0)\n\t"
-                "jmp 13f\n\t"
-                "11:\n\t"
-                "cmp $2, %0\n\t"
-                "jb 12f\n\t"
-                /*
-                 * Move data from 2 bytes to 3 bytes.
-                 */
-                "movw (%1), %4w\n\t"
-                "movw -2(%1, %0), %5w\n\t"
-                "movw %4w, (%2)\n\t"
-                "movw %5w, -2(%2, %0)\n\t"
-                "jmp 13f\n\t"
-                "12:\n\t"
-                "cmp $1, %0\n\t"
-                "jb 13f\n\t"
-                /*
-                 * Move data for 1 byte.
-                 */
-                "movb (%1), %4b\n\t"
-                "movb %4b, (%2)\n\t"
-                "13:\n\t"
-                : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
-                  "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
-                :"0" (count),
-                 "1" (src),
-                 "2" (dest)
-                :"memory");
-                return ret;
-}
-EXPORT_SYMBOL(memmove);
author	Fenghua Yu <fenghua.yu@intel.com>	2011-01-17 20:39:15 -0500
committer	H. Peter Anvin <hpa@linux.intel.com>	2011-01-25 19:58:39 -0500
commit	9599ec0471deae24044241e2173090d2cbc0e899 (patch)
tree	7ff508aefdb075ce62ef59e6218588eacedeff7f /arch/x86/lib
parent	1bae4ce27c9c90344f23c65ea6966c50ffeae2f5 (diff)

diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
new file mode 100644
index 000000000000..0ecb8433e5a8
--- /dev/null
+++ b/arch/x86/lib/memmove_64.S
@@ -0,0 +1,197 @@
		1	/*
		2	* Normally compiler builtins are used, but sometimes the compiler calls out
		3	* of line code. Based on asm-i386/string.h.
		4	*
		5	* This assembly file is re-written from memmove_64.c file.
		6	* - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
		7	*/
		8	#define _STRING_C
		9	#include <linux/linkage.h>
		10	#include <asm/dwarf2.h>
		11
		12	#undef memmove
		13
		14	/*
		15	* Implement memmove(). This can handle overlap between src and dst.
		16	*
		17	* Input:
		18	* rdi: dest
		19	* rsi: src
		20	* rdx: count
		21	*
		22	* Output:
		23	* rax: dest
		24	*/
		25	ENTRY(memmove)
		26	CFI_STARTPROC
		27	/* Handle more 32bytes in loop */
		28	mov %rdi, %rax
		29	cmp $0x20, %rdx
		30	jb 1f
		31
		32	/* Decide forward/backward copy mode */
		33	cmp %rdi, %rsi
		34	jb 2f
		35
		36	/*
		37	* movsq instruction have many startup latency
		38	* so we handle small size by general register.
		39	*/
		40	cmp $680, %rdx
		41	jb 3f
		42	/*
		43	* movsq instruction is only good for aligned case.
		44	*/
		45
		46	cmpb %dil, %sil
		47	je 4f
		48	3:
		49	sub $0x20, %rdx
		50	/*
		51	* We gobble 32byts forward in each loop.
		52	*/
		53	5:
		54	sub $0x20, %rdx
		55	movq 0*8(%rsi), %r11
		56	movq 1*8(%rsi), %r10
		57	movq 2*8(%rsi), %r9
		58	movq 3*8(%rsi), %r8
		59	leaq 4*8(%rsi), %rsi
		60
		61	movq %r11, 0*8(%rdi)
		62	movq %r10, 1*8(%rdi)
		63	movq %r9, 2*8(%rdi)
		64	movq %r8, 3*8(%rdi)
		65	leaq 4*8(%rdi), %rdi
		66	jae 5b
		67	addq $0x20, %rdx
		68	jmp 1f
		69	/*
		70	* Handle data forward by movsq.
		71	*/
		72	.p2align 4
		73	4:
		74	movq %rdx, %rcx
		75	movq -8(%rsi, %rdx), %r11
		76	lea -8(%rdi, %rdx), %r10
		77	shrq $3, %rcx
		78	rep movsq
		79	movq %r11, (%r10)
		80	jmp 13f
		81	/*
		82	* Handle data backward by movsq.
		83	*/
		84	.p2align 4
		85	7:
		86	movq %rdx, %rcx
		87	movq (%rsi), %r11
		88	movq %rdi, %r10
		89	leaq -8(%rsi, %rdx), %rsi
		90	leaq -8(%rdi, %rdx), %rdi
		91	shrq $3, %rcx
		92	std
		93	rep movsq
		94	cld
		95	movq %r11, (%r10)
		96	jmp 13f
		97
		98	/*
		99	* Start to prepare for backward copy.
		100	*/
		101	.p2align 4
		102	2:
		103	cmp $680, %rdx
		104	jb 6f
		105	cmp %dil, %sil
		106	je 7b
		107	6:
		108	/*
		109	* Calculate copy position to tail.
		110	*/
		111	addq %rdx, %rsi
		112	addq %rdx, %rdi
		113	subq $0x20, %rdx
		114	/*
		115	* We gobble 32byts backward in each loop.
		116	*/
		117	8:
		118	subq $0x20, %rdx
		119	movq -1*8(%rsi), %r11
		120	movq -2*8(%rsi), %r10
		121	movq -3*8(%rsi), %r9
		122	movq -4*8(%rsi), %r8
		123	leaq -4*8(%rsi), %rsi
		124
		125	movq %r11, -1*8(%rdi)
		126	movq %r10, -2*8(%rdi)
		127	movq %r9, -3*8(%rdi)
		128	movq %r8, -4*8(%rdi)
		129	leaq -4*8(%rdi), %rdi
		130	jae 8b
		131	/*
		132	* Calculate copy position to head.
		133	*/
		134	addq $0x20, %rdx
		135	subq %rdx, %rsi
		136	subq %rdx, %rdi
		137	1:
		138	cmpq $16, %rdx
		139	jb 9f
		140	/*
		141	* Move data from 16 bytes to 31 bytes.
		142	*/
		143	movq 0*8(%rsi), %r11
		144	movq 1*8(%rsi), %r10
		145	movq -2*8(%rsi, %rdx), %r9
		146	movq -1*8(%rsi, %rdx), %r8
		147	movq %r11, 0*8(%rdi)
		148	movq %r10, 1*8(%rdi)
		149	movq %r9, -2*8(%rdi, %rdx)
		150	movq %r8, -1*8(%rdi, %rdx)
		151	jmp 13f
		152	.p2align 4
		153	9:
		154	cmpq $8, %rdx
		155	jb 10f
		156	/*
		157	* Move data from 8 bytes to 15 bytes.
		158	*/
		159	movq 0*8(%rsi), %r11
		160	movq -1*8(%rsi, %rdx), %r10
		161	movq %r11, 0*8(%rdi)
		162	movq %r10, -1*8(%rdi, %rdx)
		163	jmp 13f
		164	10:
		165	cmpq $4, %rdx
		166	jb 11f
		167	/*
		168	* Move data from 4 bytes to 7 bytes.
		169	*/
		170	movl (%rsi), %r11d
		171	movl -4(%rsi, %rdx), %r10d
		172	movl %r11d, (%rdi)
		173	movl %r10d, -4(%rdi, %rdx)
		174	jmp 13f
		175	11:
		176	cmp $2, %rdx
		177	jb 12f
		178	/*
		179	* Move data from 2 bytes to 3 bytes.
		180	*/
		181	movw (%rsi), %r11w
		182	movw -2(%rsi, %rdx), %r10w
		183	movw %r11w, (%rdi)
		184	movw %r10w, -2(%rdi, %rdx)
		185	jmp 13f
		186	12:
		187	cmp $1, %rdx
		188	jb 13f
		189	/*
		190	* Move data for 1 byte.
		191	*/
		192	movb (%rsi), %r11b
		193	movb %r11b, (%rdi)
		194	13:
		195	retq
		196	CFI_ENDPROC
		197	ENDPROC(memmove)


diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
deleted file mode 100644
index 6d0f0ec41b34..000000000000
--- a/arch/x86/lib/memmove_64.c
+++ /dev/null
@@ -1,192 +0,0 @@
1	/* Normally compiler builtins are used, but sometimes the compiler calls out
2	of line code. Based on asm-i386/string.h.
3	*/
4	#define _STRING_C
5	#include <linux/string.h>
6	#include <linux/module.h>
7
8	#undef memmove
9	void *memmove(void *dest, const void *src, size_t count)
10	{
11	unsigned long d0,d1,d2,d3,d4,d5,d6,d7;
12	char *ret;
13
14	__asm__ __volatile__(
15	/* Handle more 32bytes in loop */
16	"mov %2, %3\n\t"
17	"cmp $0x20, %0\n\t"
18	"jb 1f\n\t"
19
20	/* Decide forward/backward copy mode */
21	"cmp %2, %1\n\t"
22	"jb 2f\n\t"
23
24	/*
25	* movsq instruction have many startup latency
26	* so we handle small size by general register.
27	*/
28	"cmp $680, %0\n\t"
29	"jb 3f\n\t"
30	/*
31	* movsq instruction is only good for aligned case.
32	*/
33	"cmpb %%dil, %%sil\n\t"
34	"je 4f\n\t"
35	"3:\n\t"
36	"sub $0x20, %0\n\t"
37	/*
38	* We gobble 32byts forward in each loop.
39	*/
40	"5:\n\t"
41	"sub $0x20, %0\n\t"
42	"movq 0*8(%1), %4\n\t"
43	"movq 1*8(%1), %5\n\t"
44	"movq 2*8(%1), %6\n\t"
45	"movq 3*8(%1), %7\n\t"
46	"leaq 4*8(%1), %1\n\t"
47
48	"movq %4, 0*8(%2)\n\t"
49	"movq %5, 1*8(%2)\n\t"
50	"movq %6, 2*8(%2)\n\t"
51	"movq %7, 3*8(%2)\n\t"
52	"leaq 4*8(%2), %2\n\t"
53	"jae 5b\n\t"
54	"addq $0x20, %0\n\t"
55	"jmp 1f\n\t"
56	/*
57	* Handle data forward by movsq.
58	*/
59	".p2align 4\n\t"
60	"4:\n\t"
61	"movq %0, %8\n\t"
62	"movq -8(%1, %0), %4\n\t"
63	"lea -8(%2, %0), %5\n\t"
64	"shrq $3, %8\n\t"
65	"rep movsq\n\t"
66	"movq %4, (%5)\n\t"
67	"jmp 13f\n\t"
68	/*
69	* Handle data backward by movsq.
70	*/
71	".p2align 4\n\t"
72	"7:\n\t"
73	"movq %0, %8\n\t"
74	"movq (%1), %4\n\t"
75	"movq %2, %5\n\t"
76	"leaq -8(%1, %0), %1\n\t"
77	"leaq -8(%2, %0), %2\n\t"
78	"shrq $3, %8\n\t"
79	"std\n\t"
80	"rep movsq\n\t"
81	"cld\n\t"
82	"movq %4, (%5)\n\t"
83	"jmp 13f\n\t"
84
85	/*
86	* Start to prepare for backward copy.
87	*/
88	".p2align 4\n\t"
89	"2:\n\t"
90	"cmp $680, %0\n\t"
91	"jb 6f \n\t"
92	"cmp %%dil, %%sil\n\t"
93	"je 7b \n\t"
94	"6:\n\t"
95	/*
96	* Calculate copy position to tail.
97	*/
98	"addq %0, %1\n\t"
99	"addq %0, %2\n\t"
100	"subq $0x20, %0\n\t"
101	/*
102	* We gobble 32byts backward in each loop.
103	*/
104	"8:\n\t"
105	"subq $0x20, %0\n\t"
106	"movq -1*8(%1), %4\n\t"
107	"movq -2*8(%1), %5\n\t"
108	"movq -3*8(%1), %6\n\t"
109	"movq -4*8(%1), %7\n\t"
110	"leaq -4*8(%1), %1\n\t"
111
112	"movq %4, -1*8(%2)\n\t"
113	"movq %5, -2*8(%2)\n\t"
114	"movq %6, -3*8(%2)\n\t"
115	"movq %7, -4*8(%2)\n\t"
116	"leaq -4*8(%2), %2\n\t"
117	"jae 8b\n\t"
118	/*
119	* Calculate copy position to head.
120	*/
121	"addq $0x20, %0\n\t"
122	"subq %0, %1\n\t"
123	"subq %0, %2\n\t"
124	"1:\n\t"
125	"cmpq $16, %0\n\t"
126	"jb 9f\n\t"
127	/*
128	* Move data from 16 bytes to 31 bytes.
129	*/
130	"movq 0*8(%1), %4\n\t"
131	"movq 1*8(%1), %5\n\t"
132	"movq -2*8(%1, %0), %6\n\t"
133	"movq -1*8(%1, %0), %7\n\t"
134	"movq %4, 0*8(%2)\n\t"
135	"movq %5, 1*8(%2)\n\t"
136	"movq %6, -2*8(%2, %0)\n\t"
137	"movq %7, -1*8(%2, %0)\n\t"
138	"jmp 13f\n\t"
139	".p2align 4\n\t"
140	"9:\n\t"
141	"cmpq $8, %0\n\t"
142	"jb 10f\n\t"
143	/*
144	* Move data from 8 bytes to 15 bytes.
145	*/
146	"movq 0*8(%1), %4\n\t"
147	"movq -1*8(%1, %0), %5\n\t"
148	"movq %4, 0*8(%2)\n\t"
149	"movq %5, -1*8(%2, %0)\n\t"
150	"jmp 13f\n\t"
151	"10:\n\t"
152	"cmpq $4, %0\n\t"
153	"jb 11f\n\t"
154	/*
155	* Move data from 4 bytes to 7 bytes.
156	*/
157	"movl (%1), %4d\n\t"
158	"movl -4(%1, %0), %5d\n\t"
159	"movl %4d, (%2)\n\t"
160	"movl %5d, -4(%2, %0)\n\t"
161	"jmp 13f\n\t"
162	"11:\n\t"
163	"cmp $2, %0\n\t"
164	"jb 12f\n\t"
165	/*
166	* Move data from 2 bytes to 3 bytes.
167	*/
168	"movw (%1), %4w\n\t"
169	"movw -2(%1, %0), %5w\n\t"
170	"movw %4w, (%2)\n\t"
171	"movw %5w, -2(%2, %0)\n\t"
172	"jmp 13f\n\t"
173	"12:\n\t"
174	"cmp $1, %0\n\t"
175	"jb 13f\n\t"
176	/*
177	* Move data for 1 byte.
178	*/
179	"movb (%1), %4b\n\t"
180	"movb %4b, (%2)\n\t"
181	"13:\n\t"
182	: "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
183	"=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
184	:"0" (count),
185	"1" (src),
186	"2" (dest)
187	:"memory");
188
189	return ret;
190
191	}
192	EXPORT_SYMBOL(memmove);