arm64: lib: Implement optimized string compare routines

This patch, based on Linaro's Cortex Strings library, adds assembly-optimized strcmp() and strncmp() functions. Signed-off-by: Zhichang Yuan <zhichang.yuan@linaro.org> Signed-off-by: Deepak Saxena <dsaxena@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
author: zhichang.yuan <zhichang.yuan@linaro.org> 2014-04-28 01:11:33 -0400
committer: Catalin Marinas <catalin.marinas@arm.com> 2014-05-23 10:16:59 -0400
commit: 192c4d902f19b66902d7aacc19e9b169bebfb2e5 (patch)
tree: 8ab658ea4456b0ccec55e95c2aee40aed028936f /arch
parent: d875c9b3724083cd2629cd8507e424cd3716cd28 (diff)
5 files changed, 553 insertions, 1 deletions
diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
index 3a43305cda71..6133f4970027 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -22,6 +22,12 @@ extern char *strrchr(const char *, int c);
 #define __HAVE_ARCH_STRCHR
 extern char *strchr(const char *, int c);
+#define __HAVE_ARCH_STRCMP
+extern int strcmp(const char *, const char *);
+#define __HAVE_ARCH_STRNCMP
+extern int strncmp(const char *, const char *, __kernel_size_t);
 #define __HAVE_ARCH_MEMCPY
 extern void *memcpy(void *, const void *, __kernel_size_t);
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 909c18e155ea..2784a79dbdd9 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -44,6 +44,8 @@ EXPORT_SYMBOL(memstart_addr);
        /* string / mem functions */
 EXPORT_SYMBOL(strchr);
 EXPORT_SYMBOL(strrchr);
+EXPORT_SYMBOL(strcmp);
+EXPORT_SYMBOL(strncmp);
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memmove);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 112c67f2b109..aaaf6180c558 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,4 @@
 lib-y           := bitops.o clear_user.o delay.o copy_from_user.o       \
                   copy_to_user.o copy_in_user.o copy_page.o            \
                   clear_page.o memchr.o memcpy.o memmove.o memset.o    \
-                   memcmp.o strchr.o strrchr.o
+                   memcmp.o strcmp.o strncmp.o strchr.o strrchr.o
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
new file mode 100644
index 000000000000..42f828b06c59
--- /dev/null
+++ b/arch/arm64/lib/strcmp.S
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+/*
+ * compare two strings
+ *
+ * Parameters:
+ *  x0 - const string 1 pointer
+ *  x1 - const string 2 pointer
+ * Returns:
+ *  x0 - an integer less than, equal to, or greater than zero
+ *     if s1 is found, respectively, to be less than, to match,
+ *     or be greater than s2.
+ * Temporaries: x2-x11 are overwritten (see the .req aliases below).
+ */
+#define REP8_01 0x0101010101010101      /* 0x01 replicated in every byte.  */
+#define REP8_7f 0x7f7f7f7f7f7f7f7f      /* 0x7f replicated in every byte.  */
+#define REP8_80 0x8080808080808080      /* 0x80 in every byte; not referenced by the code shown below.  */
+/* Parameters and result.  */
+src1            .req    x0              /* String 1 pointer; advanced while comparing.  */
+src2            .req    x1              /* String 2 pointer; advanced while comparing.  */
+result          .req    x0              /* Return value; shares x0 with src1.  */
+/* Internal variables.  */
+data1           .req    x2              /* Bytes most recently loaded from src1.  */
+data1w          .req    w2              /* 32-bit view of data1, used for byte loads.  */
+data2           .req    x3              /* Bytes most recently loaded from src2.  */
+data2w          .req    w3              /* 32-bit view of data2, used for byte loads.  */
+has_nul         .req    x4              /* Non-zero if data1 contains a NUL byte.  */
+diff            .req    x5              /* data1 ^ data2: non-zero if the words differ.  */
+syndrome        .req    x6              /* diff | has_nul: non-zero when the compare must stop.  */
+tmp1            .req    x7
+tmp2            .req    x8
+tmp3            .req    x9
+zeroones        .req    x10             /* Holds REP8_01 for the NUL-detection subtraction.  */
+pos             .req    x11             /* Byte counter, load offset or CLZ bit position.  */
+ENTRY(strcmp)
+        eor     tmp1, src1, src2        /* Bits set where the two addresses differ.  */
+        mov     zeroones, #REP8_01
+        tst     tmp1, #7
+        b.ne    .Lmisaligned8           /* Offsets within a dword differ.  */
+        ands    tmp1, src1, #7
+        b.ne    .Lmutual_align          /* Same offset, but not on a dword boundary.  */
+        /*
+        * Both pointers are 8-byte aligned here: compare a dword at a time.
+        *
+        * NUL detection works on the principle that (X - 1) & (~X) & 0x80
+        * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+        * can be done in parallel across the entire word.
+        */
+.Lloop_aligned:
+        ldr     data1, [src1], #8
+        ldr     data2, [src2], #8
+.Lstart_realigned:
+        sub     tmp1, data1, zeroones
+        orr     tmp2, data1, #REP8_7f
+        eor     diff, data1, data2      /* Non-zero if differences found.  */
+        bic     has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
+        orr     syndrome, diff, has_nul
+        cbz     syndrome, .Lloop_aligned        /* No NUL and no difference: next dword.  */
+        b       .Lcal_cmpresult
+.Lmutual_align:
+        /*
+        * Sources are mutually aligned, but are not currently at an
+        * alignment boundary.  Round down the addresses and then mask off
+        * the bytes that precede the start point.
+        */
+        bic     src1, src1, #7
+        bic     src2, src2, #7
+        lsl     tmp1, tmp1, #3          /* Bytes beyond alignment -> bits.  */
+        ldr     data1, [src1], #8
+        neg     tmp1, tmp1              /* Bits to alignment -64.  */
+        ldr     data2, [src2], #8
+        mov     tmp2, #~0
+        /* Big-endian.  Early bytes are at MSB.  */
+CPU_BE( lsl     tmp2, tmp2, tmp1 )      /* Shift (tmp1 & 63).  */
+        /* Little-endian.  Early bytes are at LSB.  */
+CPU_LE( lsr     tmp2, tmp2, tmp1 )      /* Shift (tmp1 & 63).  */
+        orr     data1, data1, tmp2      /* Force the bytes before the start point to 0xff ...  */
+        orr     data2, data2, tmp2      /* ... in both words: they compare equal and are never NUL.  */
+        b       .Lstart_realigned
+.Lmisaligned8:
+        /*
+        * Get the align offset length to compare per byte first.
+        * After this process, one string's address will be aligned.
+        */
+        and     tmp1, src1, #7
+        neg     tmp1, tmp1
+        add     tmp1, tmp1, #8          /* tmp1 = 8 - (src1 & 7).  */
+        and     tmp2, src2, #7
+        neg     tmp2, tmp2
+        add     tmp2, tmp2, #8          /* tmp2 = 8 - (src2 & 7).  */
+        subs    tmp3, tmp1, tmp2        /* tmp3 = tmp1 - tmp2; the flags feed the csel.  */
+        csel    pos, tmp1, tmp2, hi /* Choose the maximum.  */
+.Ltinycmp:
+        ldrb    data1w, [src1], #1      /* Compare the next pos bytes one at a time.  */
+        ldrb    data2w, [src2], #1
+        subs    pos, pos, #1
+        ccmp    data1w, #1, #0, ne  /* If pos != 0: C set iff data1 is not NUL; else NZCV = 0b0000.  */
+        ccmp    data1w, data2w, #0, cs  /* If C set: Z set iff the bytes match; else NZCV = 0b0000.  */
+        b.eq    .Ltinycmp           /* Loop while bytes remain, no NUL seen and the bytes match.  */
+        cbnz    pos, 1f /* Left the loop early: found a NUL or unequal bytes.  */
+        cmp     data1w, #1          /* pos == 0: re-check the last byte pair.  */
+        ccmp    data1w, data2w, #0, cs
+        b.eq    .Lstart_align /* The last bytes are equal and not NUL.  */
+1:
+        sub     result, data1, data2    /* Difference of the two zero-extended bytes.  */
+        ret
+.Lstart_align:
+        ands    xzr, src1, #7
+        b.eq    .Lrecal_offset          /* src1 is already aligned.  */
+        /*
+        * src1 is not aligned yet, which means tmp1 < tmp2 and therefore
+        * tmp3 (= tmp1 - tmp2) is negative: step both pointers back to
+        * src1's alignment boundary.
+        */
+        add     src1, src1, tmp3
+        add     src2, src2, tmp3
+        /* Load 8 bytes from aligned str1 and non-aligned str2.  */
+        ldr     data1, [src1], #8
+        ldr     data2, [src2], #8
+        sub     tmp1, data1, zeroones
+        orr     tmp2, data1, #REP8_7f
+        bic     has_nul, tmp1, tmp2
+        eor     diff, data1, data2 /* Non-zero if differences found.  */
+        orr     syndrome, diff, has_nul
+        cbnz    syndrome, .Lcal_cmpresult
+        /* How far is the current str2 from the alignment boundary?  */
+        and     tmp3, tmp3, #7
+.Lrecal_offset:
+        neg     pos, tmp3               /* Negative offset from src2 back to its alignment boundary.  */
+.Lloopcmp_proc:
+        /*
+        * Each iteration compares in two parts.  First, back both pointers
+        * up by the same amount so that src2 is on an alignment boundary,
+        * load eight bytes from there and compare them with the
+        * corresponding bytes from src1.
+        * If all 8 bytes are equal, then start the second part's comparison.
+        * Otherwise finish the comparison.
+        * This special handling guarantees that all the accesses stay in
+        * the thread/task space and avoids out-of-range accesses.
+        */
+        ldr     data1, [src1,pos]
+        ldr     data2, [src2,pos]
+        sub     tmp1, data1, zeroones
+        orr     tmp2, data1, #REP8_7f
+        bic     has_nul, tmp1, tmp2
+        eor     diff, data1, data2  /* Non-zero if differences found.  */
+        orr     syndrome, diff, has_nul
+        cbnz    syndrome, .Lcal_cmpresult
+        /* The second part: eight bytes at the current (src1-aligned) position.  */
+        ldr     data1, [src1], #8
+        ldr     data2, [src2], #8
+        sub     tmp1, data1, zeroones
+        orr     tmp2, data1, #REP8_7f
+        bic     has_nul, tmp1, tmp2
+        eor     diff, data1, data2  /* Non-zero if differences found.  */
+        orr     syndrome, diff, has_nul
+        cbz     syndrome, .Lloopcmp_proc
+.Lcal_cmpresult:
+        /*
+        * On little-endian, reverse the byte order so that the earliest
+        * string byte is the most significant one; CLZ can then locate
+        * the first marked bit of the syndrome.
+        */
+CPU_LE( rev     syndrome, syndrome )
+CPU_LE( rev     data1, data1 )
+CPU_LE( rev     data2, data2 )
+        /*
+        * For big-endian we cannot use the trick with the syndrome value
+        * as carry-propagation can corrupt the upper bits if the trailing
+        * bytes in the string contain 0x01.
+        * However, if there is no NUL byte in the dword, we can generate
+        * the result directly.  We cannot just subtract the bytes as the
+        * MSB might be significant.
+        */
+CPU_BE( cbnz    has_nul, 1f )
+CPU_BE( cmp     data1, data2 )
+CPU_BE( cset    result, ne )
+CPU_BE( cneg    result, result, lo )
+CPU_BE( ret )
+CPU_BE( 1: )
+        /* Re-compute the NUL-byte detection, using a byte-reversed value.  */
+CPU_BE( rev     tmp3, data1 )
+CPU_BE( sub     tmp1, tmp3, zeroones )
+CPU_BE( orr     tmp2, tmp3, #REP8_7f )
+CPU_BE( bic     has_nul, tmp1, tmp2 )
+CPU_BE( rev     has_nul, has_nul )
+CPU_BE( orr     syndrome, diff, has_nul )
+        clz     pos, syndrome
+        /*
+        * The MS-non-zero bit of the syndrome marks either the first bit
+        * that is different, or the top bit of the first zero byte.
+        * Shifting left now will bring the critical information into the
+        * top bits.
+        */
+        lsl     data1, data1, pos
+        lsl     data2, data2, pos
+        /*
+        * But we need to zero-extend (char is unsigned) the value and then
+        * perform a signed 32-bit subtraction.  (The subtraction below is
+        * done on the 64-bit registers after shifting each value down to
+        * its top byte.)
+        */
+        lsr     data1, data1, #56
+        sub     result, data1, data2, lsr #56
+        ret
+ENDPROC(strcmp)
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
new file mode 100644
index 000000000000..0224cf5a5533
--- /dev/null
+++ b/arch/arm64/lib/strncmp.S
@@ -0,0 +1,310 @@
+/*
+ * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+/*
+ * compare two strings, looking at no more than a given number of bytes
+ *
+ * Parameters:
+ *  x0 - const string 1 pointer
+ *  x1 - const string 2 pointer
+ *  x2 - the maximal length to be compared
+ * Returns:
+ *  x0 - an integer less than, equal to, or greater than zero if s1 is found,
+ *     respectively, to be less than, to match, or be greater than s2.
+ * Temporaries: x3-x15 are overwritten (see the .req aliases below).
+ */
+#define REP8_01 0x0101010101010101      /* 0x01 replicated in every byte.  */
+#define REP8_7f 0x7f7f7f7f7f7f7f7f      /* 0x7f replicated in every byte.  */
+#define REP8_80 0x8080808080808080      /* 0x80 in every byte; not referenced by the code shown below.  */
+/* Parameters and result.  */
+src1            .req    x0              /* String 1 pointer; advanced while comparing.  */
+src2            .req    x1              /* String 2 pointer; advanced while comparing.  */
+limit           .req    x2              /* Maximum number of bytes still to compare.  */
+result          .req    x0              /* Return value; shares x0 with src1.  */
+/* Internal variables.  */
+data1           .req    x3              /* Bytes most recently loaded from src1.  */
+data1w          .req    w3              /* 32-bit view of data1, used for byte loads.  */
+data2           .req    x4              /* Bytes most recently loaded from src2.  */
+data2w          .req    w4              /* 32-bit view of data2, used for byte loads.  */
+has_nul         .req    x5              /* Non-zero if data1 contains a NUL byte.  */
+diff            .req    x6              /* data1 ^ data2: non-zero if the words differ.  */
+syndrome        .req    x7              /* diff | has_nul: marks where the compare stops.  */
+tmp1            .req    x8
+tmp2            .req    x9
+tmp3            .req    x10
+zeroones        .req    x11             /* Holds REP8_01 for the NUL-detection subtraction.  */
+pos             .req    x12             /* Byte counter, load offset or CLZ bit position.  */
+limit_wd        .req    x13             /* Limit expressed as a count of dwords.  */
+mask            .req    x14             /* Covers the bytes of the last dword that lie beyond the limit.  */
+endloop         .req    x15             /* Non-zero when a dword loop must stop (difference or limit).  */
+ENTRY(strncmp)
+        cbz     limit, .Lret0           /* Nothing to compare: the result is 0.  */
+        eor     tmp1, src1, src2        /* Bits set where the two addresses differ.  */
+        mov     zeroones, #REP8_01
+        tst     tmp1, #7
+        b.ne    .Lmisaligned8           /* Offsets within a dword differ.  */
+        ands    tmp1, src1, #7
+        b.ne    .Lmutual_align          /* Same offset, but not on a dword boundary.  */
+        /* Calculate the number of full and partial words -1.  */
+        /*
+        * When limit is a multiple of 8, the last dword would be
+        * misjudged unless 1 is subtracted first.
+        */
+        sub     limit_wd, limit, #1 /* limit != 0, so no underflow.  */
+        lsr     limit_wd, limit_wd, #3  /* Convert to Dwords.  */
+        /*
+        * NUL detection works on the principle that (X - 1) & (~X) & 0x80
+        * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+        * can be done in parallel across the entire word.
+        */
+.Lloop_aligned:
+        ldr     data1, [src1], #8
+        ldr     data2, [src2], #8
+.Lstart_realigned:
+        subs    limit_wd, limit_wd, #1
+        sub     tmp1, data1, zeroones
+        orr     tmp2, data1, #REP8_7f
+        eor     diff, data1, data2  /* Non-zero if differences found.  */
+        csinv   endloop, diff, xzr, pl  /* Last Dword or differences.*/
+        bics    has_nul, tmp1, tmp2 /* Non-zero if NUL terminator.  */
+        ccmp    endloop, #0, #0, eq /* If no NUL: Z set iff endloop == 0; else NZCV = 0b0000.  */
+        b.eq    .Lloop_aligned
+        /* If the limit was not reached, we must have found the end or a diff.  */
+        tbz     limit_wd, #63, .Lnot_limit
+        /* Limit % 8 == 0 => all bytes significant.  */
+        ands    limit, limit, #7
+        b.eq    .Lnot_limit
+        lsl     limit, limit, #3    /* Bytes -> bits.  */
+        mov     mask, #~0
+CPU_BE( lsr     mask, mask, limit )
+CPU_LE( lsl     mask, mask, limit )
+        bic     data1, data1, mask  /* Clear the bytes that lie beyond the limit ...  */
+        bic     data2, data2, mask  /* ... in both words.  */
+        /* Make sure that the NUL byte is marked in the syndrome.  */
+        orr     has_nul, has_nul, mask
+.Lnot_limit:
+        orr     syndrome, diff, has_nul
+        b       .Lcal_cmpresult
+.Lmutual_align:
+        /*
+        * Sources are mutually aligned, but are not currently at an
+        * alignment boundary.  Round down the addresses and then mask off
+        * the bytes that precede the start point.
+        * We also need to adjust the limit calculations, but without
+        * overflowing if the limit is near ULONG_MAX.
+        */
+        bic     src1, src1, #7
+        bic     src2, src2, #7
+        ldr     data1, [src1], #8
+        neg     tmp3, tmp1, lsl #3  /* 64 - bits(bytes beyond align). */
+        ldr     data2, [src2], #8
+        mov     tmp2, #~0
+        sub     limit_wd, limit, #1 /* limit != 0, so no underflow.  */
+        /* Big-endian.  Early bytes are at MSB.  */
+CPU_BE( lsl     tmp2, tmp2, tmp3 )      /* Shift (tmp3 & 63).  */
+        /* Little-endian.  Early bytes are at LSB.  */
+CPU_LE( lsr     tmp2, tmp2, tmp3 )      /* Shift (tmp3 & 63).  */
+        and     tmp3, limit_wd, #7
+        lsr     limit_wd, limit_wd, #3
+        /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/
+        add     limit, limit, tmp1
+        add     tmp3, tmp3, tmp1
+        orr     data1, data1, tmp2  /* Force the bytes before the start point to 0xff ...  */
+        orr     data2, data2, tmp2  /* ... in both words: they compare equal and are never NUL.  */
+        add     limit_wd, limit_wd, tmp3, lsr #3
+        b       .Lstart_realigned
+/* When the src1 offset is not equal to the src2 offset...  */
+.Lmisaligned8:
+        cmp     limit, #8
+        b.lo    .Ltiny8proc /* limit < 8: compare byte by byte.  */
+        /*
+        * Get the align offset length to compare per byte first.
+        * After this process, one string's address will be aligned.*/
+        and     tmp1, src1, #7
+        neg     tmp1, tmp1
+        add     tmp1, tmp1, #8          /* tmp1 = 8 - (src1 & 7).  */
+        and     tmp2, src2, #7
+        neg     tmp2, tmp2
+        add     tmp2, tmp2, #8          /* tmp2 = 8 - (src2 & 7).  */
+        subs    tmp3, tmp1, tmp2        /* tmp3 = tmp1 - tmp2; the flags feed the csel.  */
+        csel    pos, tmp1, tmp2, hi /* Choose the maximum.  */
+        /*
+        * Here, limit is not less than 8, so directly run .Ltinycmp
+        * without checking the limit.*/
+        sub     limit, limit, pos       /* Account for the pos bytes compared below.  */
+.Ltinycmp:
+        ldrb    data1w, [src1], #1
+        ldrb    data2w, [src2], #1
+        subs    pos, pos, #1
+        ccmp    data1w, #1, #0, ne  /* If pos != 0: C set iff data1 is not NUL; else NZCV = 0b0000.  */
+        ccmp    data1w, data2w, #0, cs  /* If C set: Z set iff the bytes match; else NZCV = 0b0000.  */
+        b.eq    .Ltinycmp           /* Loop while bytes remain, no NUL seen and the bytes match.  */
+        cbnz    pos, 1f /* Left the loop early: found a NUL or unequal bytes.  */
+        cmp     data1w, #1          /* pos == 0: re-check the last byte pair.  */
+        ccmp    data1w, data2w, #0, cs
+        b.eq    .Lstart_align /* The last bytes are equal and not NUL.  */
+1:
+        sub     result, data1, data2    /* Difference of the two zero-extended bytes.  */
+        ret
+.Lstart_align:
+        lsr     limit_wd, limit, #3
+        cbz     limit_wd, .Lremain8     /* Fewer than 8 bytes left: finish byte by byte.  */
+        /* Process more leading bytes to make str1 aligned...  */
+        ands    xzr, src1, #7
+        b.eq    .Lrecal_offset
+        add     src1, src1, tmp3        /* tmp3 (= tmp1 - tmp2) is negative in this branch: step back to src1's boundary.  */
+        add     src2, src2, tmp3
+        ldr     data1, [src1], #8
+        ldr     data2, [src2], #8
+        sub     limit, limit, tmp3      /* Subtracting tmp3 re-adds the bytes that were stepped back over.  */
+        lsr     limit_wd, limit, #3
+        subs    limit_wd, limit_wd, #1
+        sub     tmp1, data1, zeroones
+        orr     tmp2, data1, #REP8_7f
+        eor     diff, data1, data2  /* Non-zero if differences found.  */
+        csinv   endloop, diff, xzr, ne/* If limit_wd is 0, force the compare to finish.  */
+        bics    has_nul, tmp1, tmp2
+        ccmp    endloop, #0, #0, eq /* has_nul is zero: no NUL byte, so test endloop.  */
+        b.ne    .Lunequal_proc
+        /* How far is the current str2 from the alignment boundary?  */
+        and     tmp3, tmp3, #7
+.Lrecal_offset:
+        neg     pos, tmp3               /* Negative offset from src2 back to its alignment boundary.  */
+.Lloopcmp_proc:
+        /*
+        * Each iteration compares in two parts.  First, back both pointers
+        * up by the same amount so that src2 is on an alignment boundary,
+        * load eight bytes from there and compare them with the
+        * corresponding bytes from src1.
+        * If all 8 bytes are equal, then start the second part's comparison.
+        * Otherwise finish the comparison.
+        * This special handling guarantees that all the accesses stay in
+        * the thread/task space and avoids out-of-range accesses.
+        */
+        ldr     data1, [src1,pos]
+        ldr     data2, [src2,pos]
+        sub     tmp1, data1, zeroones
+        orr     tmp2, data1, #REP8_7f
+        bics    has_nul, tmp1, tmp2 /* Non-zero if NUL terminator.  */
+        eor     diff, data1, data2  /* Non-zero if differences found.  */
+        csinv   endloop, diff, xzr, eq  /* No NUL: endloop = diff; NUL found: endloop = ~0.  */
+        cbnz    endloop, .Lunequal_proc
+        /* The second part: eight bytes at the current (src1-aligned) position.  */
+        ldr     data1, [src1], #8
+        ldr     data2, [src2], #8
+        subs    limit_wd, limit_wd, #1
+        sub     tmp1, data1, zeroones
+        orr     tmp2, data1, #REP8_7f
+        eor     diff, data1, data2  /* Non-zero if differences found.  */
+        csinv   endloop, diff, xzr, ne/* If limit_wd is 0, force the compare to finish.  */
+        bics    has_nul, tmp1, tmp2
+        ccmp    endloop, #0, #0, eq /* has_nul is zero: no NUL byte, so test endloop.  */
+        b.eq    .Lloopcmp_proc
+.Lunequal_proc:
+        orr     syndrome, diff, has_nul
+        cbz     syndrome, .Lremain8     /* No NUL, no difference: the dword count ran out.  */
+.Lcal_cmpresult:
+        /*
+        * On little-endian, reverse the byte order so that the earliest
+        * string byte is the most significant one; CLZ can then locate
+        * the first marked bit of the syndrome.
+        */
+CPU_LE( rev     syndrome, syndrome )
+CPU_LE( rev     data1, data1 )
+CPU_LE( rev     data2, data2 )
+        /*
+        * For big-endian we cannot use the trick with the syndrome value
+        * as carry-propagation can corrupt the upper bits if the trailing
+        * bytes in the string contain 0x01.
+        * However, if there is no NUL byte in the dword, we can generate
+        * the result directly.  We can't just subtract the bytes as the
+        * MSB might be significant.
+        */
+CPU_BE( cbnz    has_nul, 1f )
+CPU_BE( cmp     data1, data2 )
+CPU_BE( cset    result, ne )
+CPU_BE( cneg    result, result, lo )
+CPU_BE( ret )
+CPU_BE( 1: )
+        /* Re-compute the NUL-byte detection, using a byte-reversed value.*/
+CPU_BE( rev     tmp3, data1 )
+CPU_BE( sub     tmp1, tmp3, zeroones )
+CPU_BE( orr     tmp2, tmp3, #REP8_7f )
+CPU_BE( bic     has_nul, tmp1, tmp2 )
+CPU_BE( rev     has_nul, has_nul )
+CPU_BE( orr     syndrome, diff, has_nul )
+        /*
+        * The MS-non-zero bit of the syndrome marks either the first bit
+        * that is different, or the top bit of the first zero byte.
+        * Shifting left now will bring the critical information into the
+        * top bits.
+        */
+        clz     pos, syndrome
+        lsl     data1, data1, pos
+        lsl     data2, data2, pos
+        /*
+        * But we need to zero-extend (char is unsigned) the value and then
+        * perform a signed 32-bit subtraction.  (The subtraction below is
+        * done on the 64-bit registers after shifting each value down to
+        * its top byte.)
+        */
+        lsr     data1, data1, #56
+        sub     result, data1, data2, lsr #56
+        ret
+.Lremain8:
+        /* Limit % 8 == 0 => all bytes significant.  */
+        ands    limit, limit, #7
+        b.eq    .Lret0
+.Ltiny8proc:
+        ldrb    data1w, [src1], #1      /* Byte-wise compare of the remaining limit bytes.  */
+        ldrb    data2w, [src2], #1
+        subs    limit, limit, #1
+        ccmp    data1w, #1, #0, ne  /* If limit != 0: C set iff data1 is not NUL; else NZCV = 0b0000.  */
+        ccmp    data1w, data2w, #0, cs  /* If C set: Z set iff the bytes match; else NZCV = 0b0000.  */
+        b.eq    .Ltiny8proc
+        sub     result, data1, data2    /* Difference of the last byte pair (0 if they match).  */
+        ret
+.Lret0:
+        mov     result, #0
+        ret
+ENDPROC(strncmp)
author	zhichang.yuan <zhichang.yuan@linaro.org>	2014-04-28 01:11:33 -0400
committer	Catalin Marinas <catalin.marinas@arm.com>	2014-05-23 10:16:59 -0400
commit	192c4d902f19b66902d7aacc19e9b169bebfb2e5 (patch)
tree	8ab658ea4456b0ccec55e95c2aee40aed028936f /arch
parent	d875c9b3724083cd2629cd8507e424cd3716cd28 (diff)

diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index 3a43305cda71..6133f4970027 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h
@@ -22,6 +22,12 @@ extern char *strrchr(const char *, int c);
22	#define __HAVE_ARCH_STRCHR	22	#define __HAVE_ARCH_STRCHR
23	extern char *strchr(const char *, int c);	23	extern char *strchr(const char *, int c);
24		24
		25	#define __HAVE_ARCH_STRCMP
		26	extern int strcmp(const char *, const char *);
		27
		28	#define __HAVE_ARCH_STRNCMP
		29	extern int strncmp(const char *, const char *, __kernel_size_t);
		30
25	#define __HAVE_ARCH_MEMCPY	31	#define __HAVE_ARCH_MEMCPY
26	extern void *memcpy(void *, const void *, __kernel_size_t);	32	extern void *memcpy(void *, const void *, __kernel_size_t);
27		33


diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index 909c18e155ea..2784a79dbdd9 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c
@@ -44,6 +44,8 @@ EXPORT_SYMBOL(memstart_addr);
44	/* string / mem functions */	44	/* string / mem functions */
45	EXPORT_SYMBOL(strchr);	45	EXPORT_SYMBOL(strchr);
46	EXPORT_SYMBOL(strrchr);	46	EXPORT_SYMBOL(strrchr);
		47	EXPORT_SYMBOL(strcmp);
		48	EXPORT_SYMBOL(strncmp);
47	EXPORT_SYMBOL(memset);	49	EXPORT_SYMBOL(memset);
48	EXPORT_SYMBOL(memcpy);	50	EXPORT_SYMBOL(memcpy);
49	EXPORT_SYMBOL(memmove);	51	EXPORT_SYMBOL(memmove);


diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 112c67f2b109..aaaf6180c558 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,4 @@
1	lib-y := bitops.o clear_user.o delay.o copy_from_user.o \	1	lib-y := bitops.o clear_user.o delay.o copy_from_user.o \
2	copy_to_user.o copy_in_user.o copy_page.o \	2	copy_to_user.o copy_in_user.o copy_page.o \
3	clear_page.o memchr.o memcpy.o memmove.o memset.o \	3	clear_page.o memchr.o memcpy.o memmove.o memset.o \
4	memcmp.o strchr.o strrchr.o	4	memcmp.o strcmp.o strncmp.o strchr.o strrchr.o


diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S new file mode 100644 index 000000000000..42f828b06c59 --- /dev/null +++ b/arch/arm64/lib/strcmp.S
@@ -0,0 +1,234 @@
		1	/*
		2	* Copyright (C) 2013 ARM Ltd.
		3	* Copyright (C) 2013 Linaro.
		4	*
		5	* This code is based on glibc cortex strings work originally authored by Linaro
		6	* and re-licensed under GPLv2 for the Linux kernel. The original code can
		7	* be found @
		8	*
		9	* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
		10	* files/head:/src/aarch64/
		11	*
		12	* This program is free software; you can redistribute it and/or modify
		13	* it under the terms of the GNU General Public License version 2 as
		14	* published by the Free Software Foundation.
		15	*
		16	* This program is distributed in the hope that it will be useful,
		17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
		18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		19	* GNU General Public License for more details.
		20	*
		21	* You should have received a copy of the GNU General Public License
		22	* along with this program. If not, see <http://www.gnu.org/licenses/>.
		23	*/
		24
		25	#include <linux/linkage.h>
		26	#include <asm/assembler.h>
		27
		28	/*
		29	* compare two strings
		30	*
		31	* Parameters:
		32	* x0 - const string 1 pointer
		33	* x1 - const string 2 pointer
		34	* Returns:
		35	* x0 - an integer less than, equal to, or greater than zero
		36	* if s1 is found, respectively, to be less than, to match,
		37	* or be greater than s2.
		38	*/
		39
		40	#define REP8_01 0x0101010101010101
		41	#define REP8_7f 0x7f7f7f7f7f7f7f7f
		42	#define REP8_80 0x8080808080808080
		43
		44	/* Parameters and result. */
		45	src1 .req x0
		46	src2 .req x1
		47	result .req x0
		48
		49	/* Internal variables. */
		50	data1 .req x2
		51	data1w .req w2
		52	data2 .req x3
		53	data2w .req w3
		54	has_nul .req x4
		55	diff .req x5
		56	syndrome .req x6
		57	tmp1 .req x7
		58	tmp2 .req x8
		59	tmp3 .req x9
		60	zeroones .req x10
		61	pos .req x11
		62
		63	ENTRY(strcmp)
		64	eor tmp1, src1, src2
		65	mov zeroones, #REP8_01
		66	tst tmp1, #7
		67	b.ne .Lmisaligned8
		68	ands tmp1, src1, #7
		69	b.ne .Lmutual_align
		70
		71	/*
		72	* NUL detection works on the principle that (X - 1) & (~X) & 0x80
		73	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
		74	* can be done in parallel across the entire word.
		75	*/
		76	.Lloop_aligned:
		77	ldr data1, [src1], #8
		78	ldr data2, [src2], #8
		79	.Lstart_realigned:
		80	sub tmp1, data1, zeroones
		81	orr tmp2, data1, #REP8_7f
		82	eor diff, data1, data2 /* Non-zero if differences found. */
		83	bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
		84	orr syndrome, diff, has_nul
		85	cbz syndrome, .Lloop_aligned
		86	b .Lcal_cmpresult
		87
		88	.Lmutual_align:
		89	/*
		90	* Sources are mutually aligned, but are not currently at an
		91	* alignment boundary. Round down the addresses and then mask off
		92	* the bytes that preceed the start point.
		93	*/
		94	bic src1, src1, #7
		95	bic src2, src2, #7
		96	lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
		97	ldr data1, [src1], #8
		98	neg tmp1, tmp1 /* Bits to alignment -64. */
		99	ldr data2, [src2], #8
		100	mov tmp2, #~0
		101	/* Big-endian. Early bytes are at MSB. */
		102	CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
		103	/* Little-endian. Early bytes are at LSB. */
		104	CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
		105
		106	orr data1, data1, tmp2
		107	orr data2, data2, tmp2
		108	b .Lstart_realigned
		109
		110	.Lmisaligned8:
		111	/*
		112	* Get the align offset length to compare per byte first.
		113	* After this process, one string's address will be aligned.
		114	*/
		115	and tmp1, src1, #7
		116	neg tmp1, tmp1
		117	add tmp1, tmp1, #8
		118	and tmp2, src2, #7
		119	neg tmp2, tmp2
		120	add tmp2, tmp2, #8
		121	subs tmp3, tmp1, tmp2
		122	csel pos, tmp1, tmp2, hi /* Choose the maximum. */
		123	.Ltinycmp:
		124	ldrb data1w, [src1], #1
		125	ldrb data2w, [src2], #1
		126	subs pos, pos, #1
		127	ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */
		128	ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
		129	b.eq .Ltinycmp
		130	cbnz pos, 1f /* find the null or unequal... */
		131	cmp data1w, #1
		132	ccmp data1w, data2w, #0, cs
		133	b.eq .Lstart_align /* the last bytes are equal.... */
		134	1:
		135	sub result, data1, data2
		136	ret
		137
		138	.Lstart_align:
		139	ands xzr, src1, #7
		140	b.eq .Lrecal_offset
		141	/* process more leading bytes to make str1 aligned... */
		142	add src1, src1, tmp3
		143	add src2, src2, tmp3
		144	/* load 8 bytes from aligned str1 and non-aligned str2.. */
		145	ldr data1, [src1], #8
		146	ldr data2, [src2], #8
		147
		148	sub tmp1, data1, zeroones
		149	orr tmp2, data1, #REP8_7f
		150	bic has_nul, tmp1, tmp2
		151	eor diff, data1, data2 /* Non-zero if differences found. */
		152	orr syndrome, diff, has_nul
		153	cbnz syndrome, .Lcal_cmpresult
		154	/* How far is the current str2 from the alignment boundary... */
		155	and tmp3, tmp3, #7
		156	.Lrecal_offset:
		157	neg pos, tmp3
		158	.Lloopcmp_proc:
		159	/*
		160	* Divide the eight bytes into two parts. First,backwards the src2
		161	* to an alignment boundary,load eight bytes from the SRC2 alignment
		162	* boundary,then compare with the relative bytes from SRC1.
		163	* If all 8 bytes are equal,then start the second part's comparison.
		164	* Otherwise finish the comparison.
		165	* This special handle can garantee all the accesses are in the
		166	* thread/task space in avoid to overrange access.
		167	*/
		168	ldr data1, [src1,pos]
		169	ldr data2, [src2,pos]
		170	sub tmp1, data1, zeroones
		171	orr tmp2, data1, #REP8_7f
		172	bic has_nul, tmp1, tmp2
		173	eor diff, data1, data2 /* Non-zero if differences found. */
		174	orr syndrome, diff, has_nul
		175	cbnz syndrome, .Lcal_cmpresult
		176
		177	/* The second part process */
		178	ldr data1, [src1], #8
		179	ldr data2, [src2], #8
		180	sub tmp1, data1, zeroones
		181	orr tmp2, data1, #REP8_7f
		182	bic has_nul, tmp1, tmp2
		183	eor diff, data1, data2 /* Non-zero if differences found. */
		184	orr syndrome, diff, has_nul
		185	cbz syndrome, .Lloopcmp_proc
		186
		187	.Lcal_cmpresult:
		188	/*
		189	* reversed the byte-order as big-endian,then CLZ can find the most
		190	* significant zero bits.
		191	*/
		192	CPU_LE( rev syndrome, syndrome )
		193	CPU_LE( rev data1, data1 )
		194	CPU_LE( rev data2, data2 )
		195
		196	/*
		197	* For big-endian we cannot use the trick with the syndrome value
		198	* as carry-propagation can corrupt the upper bits if the trailing
		199	* bytes in the string contain 0x01.
		200	* However, if there is no NUL byte in the dword, we can generate
		201	* the result directly. We ca not just subtract the bytes as the
		202	* MSB might be significant.
		203	*/
		204	CPU_BE( cbnz has_nul, 1f )
		205	CPU_BE( cmp data1, data2 )
		206	CPU_BE( cset result, ne )
		207	CPU_BE( cneg result, result, lo )
		208	CPU_BE( ret )
		209	CPU_BE( 1: )
		210	/* Re-compute the NUL-byte detection, using a byte-reversed value. */
		211	CPU_BE( rev tmp3, data1 )
		212	CPU_BE( sub tmp1, tmp3, zeroones )
		213	CPU_BE( orr tmp2, tmp3, #REP8_7f )
		214	CPU_BE( bic has_nul, tmp1, tmp2 )
		215	CPU_BE( rev has_nul, has_nul )
		216	CPU_BE( orr syndrome, diff, has_nul )
		217
		218	clz pos, syndrome
		219	/*
		220	* The MS-non-zero bit of the syndrome marks either the first bit
		221	* that is different, or the top bit of the first zero byte.
		222	* Shifting left now will bring the critical information into the
		223	* top bits.
		224	*/
		225	lsl data1, data1, pos
		226	lsl data2, data2, pos
		227	/*
		228	* But we need to zero-extend (char is unsigned) the value and then
		229	* perform a signed 32-bit subtraction.
		230	*/
		231	lsr data1, data1, #56
		232	sub result, data1, data2, lsr #56
		233	ret
		234	ENDPROC(strcmp)


diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S new file mode 100644 index 000000000000..0224cf5a5533 --- /dev/null +++ b/arch/arm64/lib/strncmp.S
@@ -0,0 +1,310 @@
		1	/*
		2	* Copyright (C) 2013 ARM Ltd.
		3	* Copyright (C) 2013 Linaro.
		4	*
		5	* This code is based on glibc cortex strings work originally authored by Linaro
		6	* and re-licensed under GPLv2 for the Linux kernel. The original code can
		7	* be found @
		8	*
		9	* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
		10	* files/head:/src/aarch64/
		11	*
		12	* This program is free software; you can redistribute it and/or modify
		13	* it under the terms of the GNU General Public License version 2 as
		14	* published by the Free Software Foundation.
		15	*
		16	* This program is distributed in the hope that it will be useful,
		17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
		18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		19	* GNU General Public License for more details.
		20	*
		21	* You should have received a copy of the GNU General Public License
		22	* along with this program. If not, see <http://www.gnu.org/licenses/>.
		23	*/
		24
		25	#include <linux/linkage.h>
		26	#include <asm/assembler.h>
		27
		28	/*
		29	* compare two strings
		30	*
		31	* Parameters:
		32	* x0 - const string 1 pointer
		33	* x1 - const string 2 pointer
		34	* x2 - the maximal length to be compared
		35	* Returns:
		36	* x0 - an integer less than, equal to, or greater than zero if s1 is found,
		37	* respectively, to be less than, to match, or be greater than s2.
		38	*/
		39
		40	#define REP8_01 0x0101010101010101 /* 0x01 in every byte; loaded into zeroones. */
		41	#define REP8_7f 0x7f7f7f7f7f7f7f7f /* 0x7f in every byte; OR-ed with data1 for NUL detection. */
		42	#define REP8_80 0x8080808080808080 /* 0x80 in every byte; not referenced by strncmp below. */
		43
		44	/* Parameters and result. */
		45	src1 .req x0 /* First string pointer. */
		46	src2 .req x1 /* Second string pointer. */
		47	limit .req x2 /* Maximal number of bytes to compare. */
		48	result .req x0 /* Return value; shares x0 with src1. */
		49
		50	/* Internal variables. */
		51	data1 .req x3 /* Bytes loaded from src1. */
		52	data1w .req w3 /* 32-bit view of data1, used for byte loads. */
		53	data2 .req x4 /* Bytes loaded from src2. */
		54	data2w .req w4 /* 32-bit view of data2, used for byte loads. */
		55	has_nul .req x5 /* Non-zero if data1 contains a NUL byte. */
		56	diff .req x6 /* data1 ^ data2: non-zero if the dwords differ. */
		57	syndrome .req x7 /* diff | has_nul. */
		58	tmp1 .req x8 /* Scratch. */
		59	tmp2 .req x9 /* Scratch. */
		60	tmp3 .req x10 /* Scratch. */
		61	zeroones .req x11 /* Holds REP8_01. */
		62	pos .req x12 /* Byte counter / offset, later the CLZ bit position. */
		63	limit_wd .req x13 /* Limit counted in dwords. */
		64	mask .req x14 /* Covers the bytes of the last dword beyond the limit. */
		65	endloop .req x15 /* Non-zero when the dword loop must stop. */
		66
		67	ENTRY(strncmp) /* x0 = s1, x1 = s2, x2 = limit; result (<0, 0, >0) in x0. */
		68	cbz limit, .Lret0 /* Nothing to compare: return 0. */
		69	eor tmp1, src1, src2
		70	mov zeroones, #REP8_01
		71	tst tmp1, #7 /* Do both pointers share the same offset within a dword? */
		72	b.ne .Lmisaligned8
		73	ands tmp1, src1, #7 /* tmp1 = byte offset of src1 within its dword. */
		74	b.ne .Lmutual_align
		75	/* Calculate the number of full and partial words -1. */
		76	/*
		77	* When limit is a multiple of 8 and we do not subtract 1 first,
		78	* the test for the last dword would be wrong.
		79	*/
		80	sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
		81	lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
		82
		83	/*
		84	* NUL detection works on the principle that (X - 1) & (~X) & 0x80
		85	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
		86	* can be done in parallel across the entire word.
		87	*/
		88	.Lloop_aligned:
		89	ldr data1, [src1], #8
		90	ldr data2, [src2], #8
		91	.Lstart_realigned:
		92	subs limit_wd, limit_wd, #1 /* Goes negative on the last dword. */
		93	sub tmp1, data1, zeroones
		94	orr tmp2, data1, #REP8_7f
		95	eor diff, data1, data2 /* Non-zero if differences found. */
		96	csinv endloop, diff, xzr, pl /* Last Dword or differences.*/
		97	bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
		98	ccmp endloop, #0, #0, eq /* Only test endloop when no NUL was seen. */
		99	b.eq .Lloop_aligned
		100
		101	/* Not reached the limit, must have found the end or a diff. */
		102	tbz limit_wd, #63, .Lnot_limit
		103
		104	/* Limit % 8 == 0 => all bytes significant. */
		105	ands limit, limit, #7
		106	b.eq .Lnot_limit
		107
		108	lsl limit, limit, #3 /* Bytes -> bits. */
		109	mov mask, #~0
		110	CPU_BE( lsr mask, mask, limit )
		111	CPU_LE( lsl mask, mask, limit )
		112	bic data1, data1, mask /* Clear the bytes beyond the limit. */
		113	bic data2, data2, mask
		114
		115	/* Make sure that the NUL byte is marked in the syndrome. */
		116	orr has_nul, has_nul, mask
		117
		118	.Lnot_limit:
		119	orr syndrome, diff, has_nul
		120	b .Lcal_cmpresult
		121
		122	.Lmutual_align:
		123	/*
		124	* Sources are mutually aligned, but are not currently at an
		125	* alignment boundary. Round down the addresses and then mask off
		126	* the bytes that precede the start point.
		127	* We also need to adjust the limit calculations, but without
		128	* overflowing if the limit is near ULONG_MAX.
		129	*/
		130	bic src1, src1, #7
		131	bic src2, src2, #7
		132	ldr data1, [src1], #8
		133	neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */
		134	ldr data2, [src2], #8
		135	mov tmp2, #~0
		136	sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
		137	/* Big-endian. Early bytes are at MSB. */
		138	CPU_BE( lsl tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */
		139	/* Little-endian. Early bytes are at LSB. */
		140	CPU_LE( lsr tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */
		141
		142	and tmp3, limit_wd, #7
		143	lsr limit_wd, limit_wd, #3
		144	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/
		145	add limit, limit, tmp1
		146	add tmp3, tmp3, tmp1
		147	orr data1, data1, tmp2 /* Force the preceding bytes to 0xff in both dwords. */
		148	orr data2, data2, tmp2
		149	add limit_wd, limit_wd, tmp3, lsr #3
		150	b .Lstart_realigned
		151
		152	/* The src1 offset is not equal to the src2 offset... */
		153	.Lmisaligned8:
		154	cmp limit, #8
		155	b.lo .Ltiny8proc /* limit < 8... */
		156	/*
		157	* Get the align offset length to compare per byte first.
		158	* After this process, one string's address will be aligned. */
		159	and tmp1, src1, #7
		160	neg tmp1, tmp1
		161	add tmp1, tmp1, #8 /* tmp1 = 8 - (src1 & 7). */
		162	and tmp2, src2, #7
		163	neg tmp2, tmp2
		164	add tmp2, tmp2, #8 /* tmp2 = 8 - (src2 & 7). */
		165	subs tmp3, tmp1, tmp2
		166	csel pos, tmp1, tmp2, hi /* Choose the maximum. */
		167	/*
		168	* Here, limit is not less than 8, so directly run .Ltinycmp
		169	* without checking the limit. */
		170	sub limit, limit, pos
		171	.Ltinycmp:
		172	ldrb data1w, [src1], #1
		173	ldrb data2w, [src2], #1
		174	subs pos, pos, #1
		175	ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */
		176	ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
		177	b.eq .Ltinycmp
		178	cbnz pos, 1f /* Found a NUL or a difference... */
		179	cmp data1w, #1
		180	ccmp data1w, data2w, #0, cs
		181	b.eq .Lstart_align /* The last bytes are equal... */
		182	1:
		183	sub result, data1, data2
		184	ret
		185
		186	.Lstart_align:
		187	lsr limit_wd, limit, #3
		188	cbz limit_wd, .Lremain8
		189	/* Process more leading bytes to make src1 aligned... */
		190	ands xzr, src1, #7
		191	b.eq .Lrecal_offset
		192	add src1, src1, tmp3 /* tmp3 (tmp1 - tmp2) is negative here: step back to src1's alignment boundary. */
		193	add src2, src2, tmp3
		194	ldr data1, [src1], #8
		195	ldr data2, [src2], #8
		196
		197	sub limit, limit, tmp3 /* Account for the bytes that are compared again. */
		198	lsr limit_wd, limit, #3
		199	subs limit_wd, limit_wd, #1
		200
		201	sub tmp1, data1, zeroones
		202	orr tmp2, data1, #REP8_7f
		203	eor diff, data1, data2 /* Non-zero if differences found. */
		204	csinv endloop, diff, xzr, ne /* If limit_wd is 0, will finish the cmp. */
		205	bics has_nul, tmp1, tmp2
		206	ccmp endloop, #0, #0, eq /* has_nul is zero: no NUL byte. */
		207	b.ne .Lunequal_proc
		208	/* How far is the current src2 from the alignment boundary... */
		209	and tmp3, tmp3, #7
		210	.Lrecal_offset:
		211	neg pos, tmp3
		212	.Lloopcmp_proc:
		213	/*
		214	* Divide the eight bytes into two parts. First, step src2 back
		215	* to an alignment boundary, load eight bytes from that src2 alignment
		216	* boundary, then compare with the corresponding bytes from src1.
		217	* If all 8 bytes are equal, then start the second part's comparison.
		218	* Otherwise finish the comparison.
		219	* This special handling is intended to guarantee that all accesses
		220	* stay within the thread/task space and avoid out-of-range accesses.
		221	*/
		222	ldr data1, [src1,pos]
		223	ldr data2, [src2,pos]
		224	sub tmp1, data1, zeroones
		225	orr tmp2, data1, #REP8_7f
		226	bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
		227	eor diff, data1, data2 /* Non-zero if differences found. */
		228	csinv endloop, diff, xzr, eq /* endloop = diff if no NUL, else all ones. */
		229	cbnz endloop, .Lunequal_proc
		230
		231	/* The second part process. */
		232	ldr data1, [src1], #8
		233	ldr data2, [src2], #8
		234	subs limit_wd, limit_wd, #1
		235	sub tmp1, data1, zeroones
		236	orr tmp2, data1, #REP8_7f
		237	eor diff, data1, data2 /* Non-zero if differences found. */
		238	csinv endloop, diff, xzr, ne /* If limit_wd is 0, will finish the cmp. */
		239	bics has_nul, tmp1, tmp2
		240	ccmp endloop, #0, #0, eq /* has_nul is zero: no NUL byte. */
		241	b.eq .Lloopcmp_proc
		242
		243	.Lunequal_proc:
		244	orr syndrome, diff, has_nul
		245	cbz syndrome, .Lremain8 /* No diff and no NUL: compare the remaining tail bytes. */
		246	.Lcal_cmpresult:
		247	/*
		248	* Reverse the byte order on little-endian so the data reads as
		249	* big-endian; then CLZ can find the most significant set bit.
		250	*/
		251	CPU_LE( rev syndrome, syndrome )
		252	CPU_LE( rev data1, data1 )
		253	CPU_LE( rev data2, data2 )
		254	/*
		255	* For big-endian we cannot use the trick with the syndrome value
		256	* as carry-propagation can corrupt the upper bits if the trailing
		257	* bytes in the string contain 0x01.
		258	* However, if there is no NUL byte in the dword, we can generate
		259	* the result directly. We can't just subtract the bytes as the
		260	* MSB might be significant.
		261	*/
		262	CPU_BE( cbnz has_nul, 1f )
		263	CPU_BE( cmp data1, data2 )
		264	CPU_BE( cset result, ne )
		265	CPU_BE( cneg result, result, lo )
		266	CPU_BE( ret )
		267	CPU_BE( 1: )
		268	/* Re-compute the NUL-byte detection, using a byte-reversed value.*/
		269	CPU_BE( rev tmp3, data1 )
		270	CPU_BE( sub tmp1, tmp3, zeroones )
		271	CPU_BE( orr tmp2, tmp3, #REP8_7f )
		272	CPU_BE( bic has_nul, tmp1, tmp2 )
		273	CPU_BE( rev has_nul, has_nul )
		274	CPU_BE( orr syndrome, diff, has_nul )
		275	/*
		276	* The MS-non-zero bit of the syndrome marks either the first bit
		277	* that is different, or the top bit of the first zero byte.
		278	* Shifting left now will bring the critical information into the
		279	* top bits.
		280	*/
		281	clz pos, syndrome
		282	lsl data1, data1, pos
		283	lsl data2, data2, pos
		284	/*
		285	* But we need to zero-extend (char is unsigned) the value and then
		286	* perform a signed 32-bit subtraction.
		287	*/
		288	lsr data1, data1, #56
		289	sub result, data1, data2, lsr #56
		290	ret
		291
		292	.Lremain8:
		293	/* Limit % 8 == 0 => all bytes significant. */
		294	ands limit, limit, #7
		295	b.eq .Lret0
		296	.Ltiny8proc:
		297	ldrb data1w, [src1], #1
		298	ldrb data2w, [src2], #1
		299	subs limit, limit, #1
		300
		301	ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */
		302	ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
		303	b.eq .Ltiny8proc
		304	sub result, data1, data2
		305	ret
		306
		307	.Lret0:
		308	mov result, #0
		309	ret
		310	ENDPROC(strncmp)