arm64: lib: Implement optimized memcmp routine

This patch, based on Linaro's Cortex Strings library, adds an assembly-optimized memcmp() function.

Signed-off-by: Zhichang Yuan <zhichang.yuan@linaro.org>
Signed-off-by: Deepak Saxena <dsaxena@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
author: zhichang.yuan <zhichang.yuan@linaro.org> 2014-04-28 01:11:32 -0400
committer: Catalin Marinas <catalin.marinas@arm.com> 2014-05-23 10:07:57 -0400
commit: d875c9b3724083cd2629cd8507e424cd3716cd28 (patch)
tree: 284326e94a19aee3ca39ac90c04fc297221d877a /arch
parent: b29a51fe0e0be63157d8661666be8bbfd8f0c5d7 (diff)
4 files changed, 263 insertions, 1 deletions
diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
index 3ee8b303d9a9..3a43305cda71 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -34,4 +34,7 @@ extern void *memchr(const void *, int, __kernel_size_t);
 #define __HAVE_ARCH_MEMSET
 extern void *memset(void *, int, __kernel_size_t);
+#define __HAVE_ARCH_MEMCMP
+extern int memcmp(const void *, const void *, size_t);
 #endif
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 338b568cd8ae..909c18e155ea 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -48,6 +48,7 @@ EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memmove);
 EXPORT_SYMBOL(memchr);
+EXPORT_SYMBOL(memcmp);
        /* atomic bitops */
 EXPORT_SYMBOL(set_bit);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 328ce1a99daa..112c67f2b109 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,4 @@
 lib-y           := bitops.o clear_user.o delay.o copy_from_user.o       \
                   copy_to_user.o copy_in_user.o copy_page.o            \
                   clear_page.o memchr.o memcpy.o memmove.o memset.o    \
-                   strchr.o strrchr.o
+                   memcmp.o strchr.o strrchr.o
diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S
new file mode 100644
index 000000000000..6ea0776ba6de
--- /dev/null
+++ b/arch/arm64/lib/memcmp.S
@@ -0,0 +1,258 @@
+/*
+ * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+/*
+* Compare memory areas (when the two memory areas have different
+* alignment offsets, the misalignment is handled by the hardware).
+*
+* Parameters:
+*  x0 - const pointer to memory area 1
+*  x1 - const pointer to memory area 2
+*  x2 - the maximum number of bytes to compare
+* Returns:
+*  x0 - the comparison result: less than, equal to, or greater than zero
+*/
+/* Parameters and result.  */
+src1            .req    x0
+src2            .req    x1
+limit           .req    x2
+result          .req    x0 /* Shares x0 with src1.  */
+/* Internal variables.  */
+data1           .req    x3
+data1w          .req    w3
+data2           .req    x4
+data2w          .req    w4
+has_nul         .req    x5 /* Not referenced by the routine below.  */
+diff            .req    x6
+endloop         .req    x7
+tmp1            .req    x8
+tmp2            .req    x9
+tmp3            .req    x10
+pos             .req    x11
+limit_wd        .req    x12
+mask            .req    x13
+ENTRY(memcmp)
+        cbz     limit, .Lret0 /* Zero length: return 0.  */
+        eor     tmp1, src1, src2
+        tst     tmp1, #7
+        b.ne    .Lmisaligned8 /* Low three address bits differ.  */
+        ands    tmp1, src1, #7
+        b.ne    .Lmutual_align /* Same offset, but not on a boundary.  */
+        sub     limit_wd, limit, #1 /* limit != 0, so no underflow.  */
+        lsr     limit_wd, limit_wd, #3 /* Convert to Dwords.  */
+        /*
+        * Both source addresses are on an 8-byte alignment boundary.
+        * Compare eight bytes at a time.
+        */
+.Lloop_aligned:
+        ldr     data1, [src1], #8
+        ldr     data2, [src2], #8
+.Lstart_realigned:
+        subs    limit_wd, limit_wd, #1
+        eor     diff, data1, data2      /* Non-zero if differences found.  */
+        csinv   endloop, diff, xzr, cs  /* Last Dword or differences.  */
+        cbz     endloop, .Lloop_aligned
+        /* Not reached the limit, must have found a diff.  */
+        tbz     limit_wd, #63, .Lnot_limit
+        /* Limit % 8 == 0 => all 8 bytes of the last Dword are valid.  */
+        ands    limit, limit, #7
+        b.eq    .Lnot_limit
+        /*
+        * Fewer than 8 bytes remain. Extract only the valid data
+        * from the last eight bytes of the intended memory range.
+        */
+        lsl     limit, limit, #3        /* bytes-> bits.  */
+        mov     mask, #~0
+CPU_BE( lsr     mask, mask, limit )
+CPU_LE( lsl     mask, mask, limit )
+        bic     data1, data1, mask
+        bic     data2, data2, mask
+        orr     diff, diff, mask /* Mark the end of the valid data.  */
+        b       .Lnot_limit
+.Lmutual_align:
+        /*
+        * Sources are mutually aligned, but are not currently at an
+        * alignment boundary. Round down the addresses and then mask off
+        * the bytes that precede the start point.
+        */
+        bic     src1, src1, #7
+        bic     src2, src2, #7
+        ldr     data1, [src1], #8
+        ldr     data2, [src2], #8
+        /*
+        * Do not add the alignment offset (tmp1) to limit before computing
+        * limit_wd, since that addition could overflow.
+        */
+        sub     limit_wd, limit, #1/* limit != 0, so no underflow.  */
+        and     tmp3, limit_wd, #7
+        lsr     limit_wd, limit_wd, #3
+        add     tmp3, tmp3, tmp1
+        add     limit_wd, limit_wd, tmp3, lsr #3
+        add     limit, limit, tmp1/* Adjust the limit for the extra.  */
+        lsl     tmp1, tmp1, #3/* Bytes beyond alignment -> bits.  */
+        neg     tmp1, tmp1/* Bits to alignment -64.  */
+        mov     tmp2, #~0
+        /* Mask off the unintended bytes before the start address.  */
+CPU_BE( lsl     tmp2, tmp2, tmp1 )/* Big-endian.  Early bytes are at MSB.  */
+        /* Little-endian.  Early bytes are at LSB.  */
+CPU_LE( lsr     tmp2, tmp2, tmp1 )
+        orr     data1, data1, tmp2
+        orr     data2, data2, tmp2
+        b       .Lstart_realigned
+        /* src1 and src2 have different alignment offsets.  */
+.Lmisaligned8:
+        cmp     limit, #8
+        b.lo    .Ltiny8proc /* limit < 8: compare byte by byte.  */
+        and     tmp1, src1, #7
+        neg     tmp1, tmp1
+        add     tmp1, tmp1, #8/* Valid length in the first 8 bytes of src1.  */
+        and     tmp2, src2, #7
+        neg     tmp2, tmp2
+        add     tmp2, tmp2, #8/* Valid length in the first 8 bytes of src2.  */
+        subs    tmp3, tmp1, tmp2
+        csel    pos, tmp1, tmp2, hi /* Choose the maximum.  */
+        sub     limit, limit, pos
+        /* Compare the leading bytes in the first 8-byte segment.  */
+.Ltinycmp:
+        ldrb    data1w, [src1], #1
+        ldrb    data2w, [src2], #1
+        subs    pos, pos, #1
+        ccmp    data1w, data2w, #0, ne  /* NZCV = 0b0000.  */
+        b.eq    .Ltinycmp
+        cbnz    pos, 1f /* A difference occurred before the last byte.  */
+        cmp     data1w, data2w
+        b.eq    .Lstart_align
+1:
+        sub     result, data1, data2 /* Difference of the last bytes loaded.  */
+        ret
+.Lstart_align:
+        lsr     limit_wd, limit, #3
+        cbz     limit_wd, .Lremain8
+        ands    xzr, src1, #7
+        b.eq    .Lrecal_offset
+        /* Process more leading bytes to make src1 aligned.  */
+        add     src1, src1, tmp3 /* Move src1 back to its alignment boundary.  */
+        add     src2, src2, tmp3
+        sub     limit, limit, tmp3
+        lsr     limit_wd, limit, #3
+        cbz     limit_wd, .Lremain8
+        /* Load 8 bytes from the aligned SRC1.  */
+        ldr     data1, [src1], #8
+        ldr     data2, [src2], #8
+        subs    limit_wd, limit_wd, #1
+        eor     diff, data1, data2  /* Non-zero if differences found.  */
+        csinv   endloop, diff, xzr, ne
+        cbnz    endloop, .Lunequal_proc
+        /* How far is the current SRC2 from the alignment boundary?  */
+        and     tmp3, tmp3, #7
+.Lrecal_offset:/* src1 is aligned now.  */
+        neg     pos, tmp3
+.Lloopcmp_proc:
+        /*
+        * Divide the eight bytes into two parts. First, move src2 back
+        * to an alignment boundary, load eight bytes and compare from
+        * the SRC2 alignment boundary. If all 8 bytes are equal, then start
+        * the second part's comparison. Otherwise finish the comparison.
+        * This special handling can guarantee all the accesses are in the
+        * thread/task space, avoiding out-of-range accesses.
+        */
+        ldr     data1, [src1,pos]
+        ldr     data2, [src2,pos]
+        eor     diff, data1, data2  /* Non-zero if differences found.  */
+        cbnz    diff, .Lnot_limit
+        /* Process the second part.  */
+        ldr     data1, [src1], #8
+        ldr     data2, [src2], #8
+        eor     diff, data1, data2  /* Non-zero if differences found.  */
+        subs    limit_wd, limit_wd, #1
+        csinv   endloop, diff, xzr, ne/* If limit_wd is 0, finish the comparison.  */
+        cbz     endloop, .Lloopcmp_proc
+.Lunequal_proc:
+        cbz     diff, .Lremain8
+/* A difference occurred in the latest comparison.  */
+.Lnot_limit:
+/*
+* For little-endian, byte-reverse so that the low-order equal bits move to
+* the MSB end; the following CLZ can then find how many equal bits exist.
+*/
+CPU_LE( rev     diff, diff )
+CPU_LE( rev     data1, data1 )
+CPU_LE( rev     data2, data2 )
+        /*
+        * The MS-non-zero bit of DIFF marks either the first bit
+        * that is different, or the end of the significant data.
+        * Shifting left now will bring the critical information into the
+        * top bits.
+        */
+        clz     pos, diff
+        lsl     data1, data1, pos
+        lsl     data2, data2, pos
+        /*
+        * We need to zero-extend (char is unsigned) the value and then
+        * perform a signed subtraction.
+        */
+        lsr     data1, data1, #56
+        sub     result, data1, data2, lsr #56
+        ret
+.Lremain8:
+        /* Limit % 8 == 0 => all data are equal.  */
+        ands    limit, limit, #7
+        b.eq    .Lret0
+.Ltiny8proc:
+        ldrb    data1w, [src1], #1
+        ldrb    data2w, [src2], #1
+        subs    limit, limit, #1
+        ccmp    data1w, data2w, #0, ne  /* NZCV = 0b0000. */
+        b.eq    .Ltiny8proc
+        sub     result, data1, data2
+        ret
+.Lret0:
+        mov     result, #0
+        ret
+ENDPROC(memcmp)
author	zhichang.yuan <zhichang.yuan@linaro.org>	2014-04-28 01:11:32 -0400
committer	Catalin Marinas <catalin.marinas@arm.com>	2014-05-23 10:07:57 -0400
commit	d875c9b3724083cd2629cd8507e424cd3716cd28 (patch)
tree	284326e94a19aee3ca39ac90c04fc297221d877a /arch
parent	b29a51fe0e0be63157d8661666be8bbfd8f0c5d7 (diff)

diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index 3ee8b303d9a9..3a43305cda71 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h
@@ -34,4 +34,7 @@ extern void memchr(const void , int, __kernel_size_t);
34	#define __HAVE_ARCH_MEMSET	34	#define __HAVE_ARCH_MEMSET
35	extern void memset(void , int, __kernel_size_t);	35	extern void memset(void , int, __kernel_size_t);
36		36
		37	#define __HAVE_ARCH_MEMCMP
		38	extern int memcmp(const void , const void , size_t);
		39
37	#endif	40	#endif


diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index 338b568cd8ae..909c18e155ea 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c
@@ -48,6 +48,7 @@ EXPORT_SYMBOL(memset);
48	EXPORT_SYMBOL(memcpy);	48	EXPORT_SYMBOL(memcpy);
49	EXPORT_SYMBOL(memmove);	49	EXPORT_SYMBOL(memmove);
50	EXPORT_SYMBOL(memchr);	50	EXPORT_SYMBOL(memchr);
		51	EXPORT_SYMBOL(memcmp);
51		52
52	/* atomic bitops */	53	/* atomic bitops */
53	EXPORT_SYMBOL(set_bit);	54	EXPORT_SYMBOL(set_bit);


diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 328ce1a99daa..112c67f2b109 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,4 @@
1	lib-y := bitops.o clear_user.o delay.o copy_from_user.o \	1	lib-y := bitops.o clear_user.o delay.o copy_from_user.o \
2	copy_to_user.o copy_in_user.o copy_page.o \	2	copy_to_user.o copy_in_user.o copy_page.o \
3	clear_page.o memchr.o memcpy.o memmove.o memset.o \	3	clear_page.o memchr.o memcpy.o memmove.o memset.o \
4	strchr.o strrchr.o	4	memcmp.o strchr.o strrchr.o


diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S new file mode 100644 index 000000000000..6ea0776ba6de --- /dev/null +++ b/arch/arm64/lib/memcmp.S
@@ -0,0 +1,258 @@
		1	/*
		2	* Copyright (C) 2013 ARM Ltd.
		3	* Copyright (C) 2013 Linaro.
		4	*
		5	* This code is based on glibc cortex strings work originally authored by Linaro
		6	* and re-licensed under GPLv2 for the Linux kernel. The original code can
		7	* be found @
		8	*
		9	* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
		10	* files/head:/src/aarch64/
		11	*
		12	* This program is free software; you can redistribute it and/or modify
		13	* it under the terms of the GNU General Public License version 2 as
		14	* published by the Free Software Foundation.
		15	*
		16	* This program is distributed in the hope that it will be useful,
		17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
		18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		19	* GNU General Public License for more details.
		20	*
		21	* You should have received a copy of the GNU General Public License
		22	* along with this program. If not, see <http://www.gnu.org/licenses/>.
		23	*/
		24
		25	#include <linux/linkage.h>
		26	#include <asm/assembler.h>
		27
		28	/*
		29	* compare memory areas(when two memory areas' offset are different,
		30	* alignment handled by the hardware)
		31	*
		32	* Parameters:
		33	* x0 - const memory area 1 pointer
		34	* x1 - const memory area 2 pointer
		35	* x2 - the maximal compare byte length
		36	* Returns:
		37	* x0 - a compare result, maybe less than, equal to, or greater than ZERO
		38	*/
		39
		40	/* Parameters and result. */
		41	src1 .req x0
		42	src2 .req x1
		43	limit .req x2
		44	result .req x0
		45
		46	/* Internal variables. */
		47	data1 .req x3
		48	data1w .req w3
		49	data2 .req x4
		50	data2w .req w4
		51	has_nul .req x5
		52	diff .req x6
		53	endloop .req x7
		54	tmp1 .req x8
		55	tmp2 .req x9
		56	tmp3 .req x10
		57	pos .req x11
		58	limit_wd .req x12
		59	mask .req x13
		60
		61	ENTRY(memcmp)
		62	cbz limit, .Lret0
		63	eor tmp1, src1, src2
		64	tst tmp1, #7
		65	b.ne .Lmisaligned8
		66	ands tmp1, src1, #7
		67	b.ne .Lmutual_align
		68	sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
		69	lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
		70	/*
		71	* The input source addresses are at alignment boundary.
		72	* Directly compare eight bytes each time.
		73	*/
		74	.Lloop_aligned:
		75	ldr data1, [src1], #8
		76	ldr data2, [src2], #8
		77	.Lstart_realigned:
		78	subs limit_wd, limit_wd, #1
		79	eor diff, data1, data2 /* Non-zero if differences found. */
		80	csinv endloop, diff, xzr, cs /* Last Dword or differences. */
		81	cbz endloop, .Lloop_aligned
		82
		83	/* Not reached the limit, must have found a diff. */
		84	tbz limit_wd, #63, .Lnot_limit
		85
		86	/* Limit % 8 == 0 => the diff is in the last 8 bytes. */
		87	ands limit, limit, #7
		88	b.eq .Lnot_limit
		89	/*
		90	* The remained bytes less than 8. It is needed to extract valid data
		91	* from last eight bytes of the intended memory range.
		92	*/
		93	lsl limit, limit, #3 /* bytes-> bits. */
		94	mov mask, #~0
		95	CPU_BE( lsr mask, mask, limit )
		96	CPU_LE( lsl mask, mask, limit )
		97	bic data1, data1, mask
		98	bic data2, data2, mask
		99
		100	orr diff, diff, mask
		101	b .Lnot_limit
		102
		103	.Lmutual_align:
		104	/*
		105	* Sources are mutually aligned, but are not currently at an
		106	* alignment boundary. Round down the addresses and then mask off
		107	* the bytes that precede the start point.
		108	*/
		109	bic src1, src1, #7
		110	bic src2, src2, #7
		111	ldr data1, [src1], #8
		112	ldr data2, [src2], #8
		113	/*
		114	* We can not add limit with alignment offset(tmp1) here. Since the
		115	* addition probably make the limit overflown.
		116	*/
		117	sub limit_wd, limit, #1/limit != 0, so no underflow./
		118	and tmp3, limit_wd, #7
		119	lsr limit_wd, limit_wd, #3
		120	add tmp3, tmp3, tmp1
		121	add limit_wd, limit_wd, tmp3, lsr #3
		122	add limit, limit, tmp1/* Adjust the limit for the extra. */
		123
		124	lsl tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/
		125	neg tmp1, tmp1/* Bits to alignment -64. */
		126	mov tmp2, #~0
		127	/* Mask off the unintended bytes before the start address. */
		128	CPU_BE( lsl tmp2, tmp2, tmp1 )/Big-endian.Early bytes are at MSB/
		129	/* Little-endian. Early bytes are at LSB. */
		130	CPU_LE( lsr tmp2, tmp2, tmp1 )
		131
		132	orr data1, data1, tmp2
		133	orr data2, data2, tmp2
		134	b .Lstart_realigned
		135
		136	/* src1 and src2 have different alignment offsets. */
		137	.Lmisaligned8:
		138	cmp limit, #8
		139	b.lo .Ltiny8proc /limit < 8: compare byte by byte/
		140
		141	and tmp1, src1, #7
		142	neg tmp1, tmp1
		143	add tmp1, tmp1, #8/valid length in the first 8 bytes of src1/
		144	and tmp2, src2, #7
		145	neg tmp2, tmp2
		146	add tmp2, tmp2, #8/valid length in the first 8 bytes of src2/
		147	subs tmp3, tmp1, tmp2
		148	csel pos, tmp1, tmp2, hi /Choose the maximum./
		149
		150	sub limit, limit, pos
		151	/* Compare the leading bytes in the first 8-byte segment. */
		152	.Ltinycmp:
		153	ldrb data1w, [src1], #1
		154	ldrb data2w, [src2], #1
		155	subs pos, pos, #1
		156	ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */
		157	b.eq .Ltinycmp
		158	cbnz pos, 1f /diff occurred before the last byte./
		159	cmp data1w, data2w
		160	b.eq .Lstart_align
		161	1:
		162	sub result, data1, data2
		163	ret
		164
		165	.Lstart_align:
		166	lsr limit_wd, limit, #3
		167	cbz limit_wd, .Lremain8
		168
		169	ands xzr, src1, #7
		170	b.eq .Lrecal_offset
		171	/* Process more leading bytes to make src1 aligned. */
		172	add src1, src1, tmp3 /backwards src1 to alignment boundary/
		173	add src2, src2, tmp3
		174	sub limit, limit, tmp3
		175	lsr limit_wd, limit, #3
		176	cbz limit_wd, .Lremain8
		177	/* Load 8 bytes from the aligned SRC1. */
		178	ldr data1, [src1], #8
		179	ldr data2, [src2], #8
		180
		181	subs limit_wd, limit_wd, #1
		182	eor diff, data1, data2 /Non-zero if differences found./
		183	csinv endloop, diff, xzr, ne
		184	cbnz endloop, .Lunequal_proc
		185	/* How far is the current SRC2 from the alignment boundary? */
		186	and tmp3, tmp3, #7
		187
		188	.Lrecal_offset:/src1 is aligned now../
		189	neg pos, tmp3
		190	.Lloopcmp_proc:
		191	/*
		192	* Divide the eight bytes into two parts. First,backwards the src2
		193	* to an alignment boundary,load eight bytes and compare from
		194	* the SRC2 alignment boundary. If all 8 bytes are equal,then start
		195	* the second part's comparison. Otherwise finish the comparison.
		196	* This special handle can garantee all the accesses are in the
		197	* thread/task space in avoid to overrange access.
		198	*/
		199	ldr data1, [src1,pos]
		200	ldr data2, [src2,pos]
		201	eor diff, data1, data2 /* Non-zero if differences found. */
		202	cbnz diff, .Lnot_limit
		203
		204	/* Process the second part. */
		205	ldr data1, [src1], #8
		206	ldr data2, [src2], #8
		207	eor diff, data1, data2 /* Non-zero if differences found. */
		208	subs limit_wd, limit_wd, #1
		209	csinv endloop, diff, xzr, ne/if limit_wd is 0,will finish the cmp/
		210	cbz endloop, .Lloopcmp_proc
		211	.Lunequal_proc:
		212	cbz diff, .Lremain8
		213
		214	/* A difference occurred in the latest comparison. */
		215	.Lnot_limit:
		216	/*
		217	* For little endian,reverse the low significant equal bits into MSB,then
		218	* following CLZ can find how many equal bits exist.
		219	*/
		220	CPU_LE( rev diff, diff )
		221	CPU_LE( rev data1, data1 )
		222	CPU_LE( rev data2, data2 )
		223
		224	/*
		225	* The MS-non-zero bit of DIFF marks either the first bit
		226	* that is different, or the end of the significant data.
		227	* Shifting left now will bring the critical information into the
		228	* top bits.
		229	*/
		230	clz pos, diff
		231	lsl data1, data1, pos
		232	lsl data2, data2, pos
		233	/*
		234	* We need to zero-extend (char is unsigned) the value and then
		235	* perform a signed subtraction.
		236	*/
		237	lsr data1, data1, #56
		238	sub result, data1, data2, lsr #56
		239	ret
		240
		241	.Lremain8:
		242	/* Limit % 8 == 0 =>. all data are equal.*/
		243	ands limit, limit, #7
		244	b.eq .Lret0
		245
		246	.Ltiny8proc:
		247	ldrb data1w, [src1], #1
		248	ldrb data2w, [src2], #1
		249	subs limit, limit, #1
		250
		251	ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */
		252	b.eq .Ltiny8proc
		253	sub result, data1, data2
		254	ret
		255	.Lret0:
		256	mov result, #0
		257	ret
		258	ENDPROC(memcmp)