arm64: lib: Implement optimized string length routines

This patch, based on Linaro's Cortex Strings library, adds assembly-optimized strlen() and strnlen() functions.

Signed-off-by: Zhichang Yuan <zhichang.yuan@linaro.org>
Signed-off-by: Deepak Saxena <dsaxena@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
author: zhichang.yuan <zhichang.yuan@linaro.org> 2014-04-28 01:11:34 -0400
committer: Catalin Marinas <catalin.marinas@arm.com> 2014-05-23 10:17:12 -0400
commit: 0a42cb0a6fa64cb17db11164a1ad3511b43acefe (patch)
tree: fc91d4d7a77ff9de03d0ae004b954c99ec98280f /arch/arm64/lib
parent: 192c4d902f19b66902d7aacc19e9b169bebfb2e5 (diff)
3 files changed, 299 insertions, 1 deletions
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index aaaf6180c558..d98d3e39879e 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,5 @@
 lib-y           := bitops.o clear_user.o delay.o copy_from_user.o       \
                   copy_to_user.o copy_in_user.o copy_page.o            \
                   clear_page.o memchr.o memcpy.o memmove.o memset.o    \
-                   memcmp.o strcmp.o strncmp.o strchr.o strrchr.o
+                   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o       \
+                   strchr.o strrchr.o
diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S
new file mode 100644
index 000000000000..987b68b9ce44
--- /dev/null
+++ b/arch/arm64/lib/strlen.S
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+/*
+ * strlen - calculate the length of a NUL-terminated string
+ *
+ * Parameters:
+ *      x0 - const string pointer
+ * Returns:
+ *      x0 - length of the string in bytes, not counting the NUL
+ */
+/* Arguments and results (both names alias x0).  */
+srcin           .req    x0      /* in:  string pointer */
+len             .req    x0      /* out: string length */
+/* Locals and temporaries.  */
+src             .req    x1      /* 16-byte aligned read cursor */
+data1           .req    x2      /* first Dword of the current pair */
+data2           .req    x3      /* second Dword of the current pair */
+data2a          .req    x4      /* data2 with its masked bytes set to 0xff */
+has_nul1        .req    x5      /* NUL syndrome for data1 */
+has_nul2        .req    x6      /* NUL syndrome for data2 */
+tmp1            .req    x7
+tmp2            .req    x8
+tmp3            .req    x9
+tmp4            .req    x10
+zeroones        .req    x11     /* holds REP8_01 */
+pos             .req    x12     /* leading-zero bit count of the syndrome */
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+ENTRY(strlen)
+        mov     zeroones, #REP8_01      /* 0x01 in every byte lane */
+        bic     src, srcin, #15         /* src = srcin rounded down to 16 bytes */
+        ands    tmp1, srcin, #15        /* tmp1 = srcin's offset in that block */
+        b.ne    .Lmisaligned            /* offset != 0: mask leading bytes first */
+        /*
+        * NUL detection works on the principle that (X - 1) & (~X) & 0x80
+        * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+        * can be done in parallel across the entire word.
+        */
+        /*
+        * The inner loop deals with two Dwords at a time. This has a
+        * slightly higher start-up cost, but we should win quite quickly,
+        * especially on cores with a high number of issue slots per
+        * cycle, as we get much better parallelism out of the operations.
+        */
+.Lloop:
+        ldp     data1, data2, [src], #16        /* load a pair, then src += 16 */
+.Lrealigned:
+        sub     tmp1, data1, zeroones
+        orr     tmp2, data1, #REP8_7f
+        sub     tmp3, data2, zeroones
+        orr     tmp4, data2, #REP8_7f
+        bic     has_nul1, tmp1, tmp2    /* (data1 - 0x01..) & ~(data1 | 0x7f..) */
+        bics    has_nul2, tmp3, tmp4    /* same for data2; Z set if result is 0 */
+        ccmp    has_nul1, #0, #0, eq    /* data2 clean: test data1; else NZCV = 0000 */
+        b.eq    .Lloop                  /* both syndromes zero: keep scanning */
+        sub     len, src, srcin         /* bytes from srcin to the end of this pair */
+        cbz     has_nul1, .Lnul_in_data2
+CPU_BE( mov     data2, data1 )  /* prepare data to re-calculate the syndrome */
+        sub     len, len, #8            /* NUL is in data1: step back over data2 */
+        mov     has_nul2, has_nul1      /* the tail below works on has_nul2 only */
+.Lnul_in_data2:
+        /*
+        * For big-endian, carry propagation (if the final byte in the
+        * string is 0x01) means we cannot use has_nul directly.  The
+        * easiest way to get the correct byte is to byte-swap the data
+        * and calculate the syndrome a second time.
+        */
+CPU_BE( rev     data2, data2 )
+CPU_BE( sub     tmp1, data2, zeroones )
+CPU_BE( orr     tmp2, data2, #REP8_7f )
+CPU_BE( bic     has_nul2, tmp1, tmp2 )
+        sub     len, len, #8            /* step back to the start of the NUL's Dword */
+        rev     has_nul2, has_nul2      /* byte-reverse the syndrome before clz */
+        clz     pos, has_nul2           /* zero bits ahead of the first flagged byte */
+        add     len, len, pos, lsr #3           /* Bits to bytes.  */
+        ret
+.Lmisaligned:
+        cmp     tmp1, #8                /* flags are consumed by csinv/csel below */
+        neg     tmp1, tmp1
+        ldp     data1, data2, [src], #16
+        lsl     tmp1, tmp1, #3          /* Bytes beyond alignment -> bits.  */
+        mov     tmp2, #~0
+        /* Big-endian.  Early bytes are at MSB.  */
+CPU_BE( lsl     tmp2, tmp2, tmp1 )      /* Shift (tmp1 & 63).  */
+        /* Little-endian.  Early bytes are at LSB.  */
+CPU_LE( lsr     tmp2, tmp2, tmp1 )      /* Shift (tmp1 & 63).  */
+        orr     data1, data1, tmp2      /* set the masked (leading) bytes to 0xff */
+        orr     data2a, data2, tmp2
+        csinv   data1, data1, xzr, le   /* offset > 8: data1 = all ones */
+        csel    data2, data2, data2a, le        /* offset > 8: use masked data2 */
+        b       .Lrealigned
+ENDPROC(strlen)
diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S
new file mode 100644
index 000000000000..2ca665711bf2
--- /dev/null
+++ b/arch/arm64/lib/strnlen.S
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+/*
+ * strnlen - determine the length of a string, bounded by a maximum
+ *
+ * Parameters:
+ *      x0 - const string pointer
+ *      x1 - maximal string length
+ * Returns:
+ *      x0 - length of the string, or x1 if no NUL is found within x1 bytes
+ */
+/* Arguments and results (srcin and len both alias x0).  */
+srcin           .req    x0      /* in:  string pointer */
+len             .req    x0      /* out: string length */
+limit           .req    x1      /* in:  maximal length */
+/* Locals and temporaries.  */
+src             .req    x2      /* 16-byte aligned read cursor */
+data1           .req    x3      /* first Dword of the current pair */
+data2           .req    x4      /* second Dword of the current pair */
+data2a          .req    x5      /* data2 with its masked bytes set to 0xff */
+has_nul1        .req    x6      /* NUL syndrome for data1 */
+has_nul2        .req    x7      /* NUL syndrome for data2 */
+tmp1            .req    x8
+tmp2            .req    x9
+tmp3            .req    x10
+tmp4            .req    x11
+zeroones        .req    x12     /* holds REP8_01 */
+pos             .req    x13     /* leading-zero bit count of the syndrome */
+limit_wd        .req    x14     /* Qwords (16 bytes) left to scan, minus one */
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+ENTRY(strnlen)
+        cbz     limit, .Lhit_limit      /* limit == 0: return 0 without reading */
+        mov     zeroones, #REP8_01      /* 0x01 in every byte lane */
+        bic     src, srcin, #15         /* src = srcin rounded down to 16 bytes */
+        ands    tmp1, srcin, #15        /* tmp1 = srcin's offset in that block */
+        b.ne    .Lmisaligned            /* offset != 0: mask leading bytes first */
+        /* Calculate the number of full and partial words -1.  */
+        sub     limit_wd, limit, #1 /* Limit != 0, so no underflow.  */
+        lsr     limit_wd, limit_wd, #4  /* Convert to Qwords.  */
+        /*
+        * NUL detection works on the principle that (X - 1) & (~X) & 0x80
+        * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+        * can be done in parallel across the entire word.
+        */
+        /*
+        * The inner loop deals with two Dwords at a time.  This has a
+        * slightly higher start-up cost, but we should win quite quickly,
+        * especially on cores with a high number of issue slots per
+        * cycle, as we get much better parallelism out of the operations.
+        */
+.Lloop:
+        ldp     data1, data2, [src], #16        /* load a pair, then src += 16 */
+.Lrealigned:
+        sub     tmp1, data1, zeroones
+        orr     tmp2, data1, #REP8_7f
+        sub     tmp3, data2, zeroones
+        orr     tmp4, data2, #REP8_7f
+        bic     has_nul1, tmp1, tmp2    /* (data1 - 0x01..) & ~(data1 | 0x7f..) */
+        bic     has_nul2, tmp3, tmp4    /* same for data2 */
+        subs    limit_wd, limit_wd, #1  /* N set once the Qword count goes negative */
+        orr     tmp1, has_nul1, has_nul2        /* non-zero iff either Dword has a NUL */
+        ccmp    tmp1, #0, #0, pl    /* Qwords remain: test tmp1; else NZCV = 0000 */
+        b.eq    .Lloop              /* Qwords remain and no NUL: keep scanning */
+        cbz     tmp1, .Lhit_limit   /* No null in final Qword.  */
+        /*
+        * We know there's a null in the final Qword. The easiest thing
+        * to do now is work out the length of the string and return
+        * MIN (len, limit).
+        */
+        sub     len, src, srcin         /* bytes from srcin to the end of this pair */
+        cbz     has_nul1, .Lnul_in_data2
+CPU_BE( mov     data2, data1 )  /* prepare data to re-calculate the syndrome */
+        sub     len, len, #8            /* NUL is in data1: step back over data2 */
+        mov     has_nul2, has_nul1      /* the tail below works on has_nul2 only */
+.Lnul_in_data2:
+        /*
+        * For big-endian, carry propagation (if the final byte in the
+        * string is 0x01) means we cannot use has_nul directly.  The
+        * easiest way to get the correct byte is to byte-swap the data
+        * and calculate the syndrome a second time.
+        */
+CPU_BE( rev     data2, data2 )
+CPU_BE( sub     tmp1, data2, zeroones )
+CPU_BE( orr     tmp2, data2, #REP8_7f )
+CPU_BE( bic     has_nul2, tmp1, tmp2 )
+        sub     len, len, #8            /* step back to the start of the NUL's Dword */
+        rev     has_nul2, has_nul2      /* byte-reverse the syndrome before clz */
+        clz     pos, has_nul2           /* zero bits ahead of the first flagged byte */
+        add     len, len, pos, lsr #3       /* Bits to bytes.  */
+        cmp     len, limit              /* unsigned comparison (ls below) */
+        csel    len, len, limit, ls     /* Return the lower value.  */
+        ret
+.Lmisaligned:
+        /*
+        * Deal with a partial first word.
+        * We're doing two things in parallel here;
+        * 1) Calculate the number of words (but avoiding overflow if
+        * limit is near ULONG_MAX) - to do this we need to work out
+        * limit + tmp1 - 1 as a 65-bit value before shifting it;
+        * 2) Load and mask the initial data words - we force the bytes
+        * before the ones we are interested in to 0xff - this ensures
+        * early bytes will not hit any zero detection.
+        */
+        ldp     data1, data2, [src], #16
+        sub     limit_wd, limit, #1
+        and     tmp3, limit_wd, #15     /* tmp3 = (limit - 1) % 16 */
+        lsr     limit_wd, limit_wd, #4  /* limit_wd = (limit - 1) / 16 */
+        add     tmp3, tmp3, tmp1        /* add srcin's offset to the remainder */
+        add     limit_wd, limit_wd, tmp3, lsr #4        /* carry into the Qword count */
+        neg     tmp4, tmp1
+        lsl     tmp4, tmp4, #3  /* Bytes beyond alignment -> bits.  */
+        mov     tmp2, #~0
+        /* Big-endian.  Early bytes are at MSB.  */
+CPU_BE( lsl     tmp2, tmp2, tmp4 )      /* Shift (tmp4 & 63).  */
+        /* Little-endian.  Early bytes are at LSB.  */
+CPU_LE( lsr     tmp2, tmp2, tmp4 )      /* Shift (tmp4 & 63).  */
+        cmp     tmp1, #8                /* flags are consumed by csinv/csel below */
+        orr     data1, data1, tmp2      /* set the masked (leading) bytes to 0xff */
+        orr     data2a, data2, tmp2
+        csinv   data1, data1, xzr, le   /* offset > 8: data1 = all ones */
+        csel    data2, data2, data2a, le        /* offset > 8: use masked data2 */
+        b       .Lrealigned
+.Lhit_limit:
+        mov     len, limit              /* no NUL within limit bytes: return limit */
+        ret
+ENDPROC(strnlen)
author	zhichang.yuan <zhichang.yuan@linaro.org>	2014-04-28 01:11:34 -0400
committer	Catalin Marinas <catalin.marinas@arm.com>	2014-05-23 10:17:12 -0400
commit	0a42cb0a6fa64cb17db11164a1ad3511b43acefe (patch)
tree	fc91d4d7a77ff9de03d0ae004b954c99ec98280f /arch/arm64/lib
parent	192c4d902f19b66902d7aacc19e9b169bebfb2e5 (diff)

diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index aaaf6180c558..d98d3e39879e 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,5 @@
1	lib-y := bitops.o clear_user.o delay.o copy_from_user.o \	1	lib-y := bitops.o clear_user.o delay.o copy_from_user.o \
2	copy_to_user.o copy_in_user.o copy_page.o \	2	copy_to_user.o copy_in_user.o copy_page.o \
3	clear_page.o memchr.o memcpy.o memmove.o memset.o \	3	clear_page.o memchr.o memcpy.o memmove.o memset.o \
4	memcmp.o strcmp.o strncmp.o strchr.o strrchr.o	4	memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \
		5	strchr.o strrchr.o


diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S new file mode 100644 index 000000000000..987b68b9ce44 --- /dev/null +++ b/arch/arm64/lib/strlen.S
@@ -0,0 +1,126 @@
		1	/*
		2	* Copyright (C) 2013 ARM Ltd.
		3	* Copyright (C) 2013 Linaro.
		4	*
		5	* This code is based on glibc cortex strings work originally authored by Linaro
		6	* and re-licensed under GPLv2 for the Linux kernel. The original code can
		7	* be found @
		8	*
		9	* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
		10	* files/head:/src/aarch64/
		11	*
		12	* This program is free software; you can redistribute it and/or modify
		13	* it under the terms of the GNU General Public License version 2 as
		14	* published by the Free Software Foundation.
		15	*
		16	* This program is distributed in the hope that it will be useful,
		17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
		18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		19	* GNU General Public License for more details.
		20	*
		21	* You should have received a copy of the GNU General Public License
		22	* along with this program. If not, see <http://www.gnu.org/licenses/>.
		23	*/
		24
		25	#include <linux/linkage.h>
		26	#include <asm/assembler.h>
		27
		28	/*
		29	* calculate the length of a string
		30	*
		31	* Parameters:
		32	* x0 - const string pointer
		33	* Returns:
		34	* x0 - the return length of specific string
		35	*/
		36
		37	/* Arguments and results. */
		38	srcin .req x0
		39	len .req x0
		40
		41	/* Locals and temporaries. */
		42	src .req x1
		43	data1 .req x2
		44	data2 .req x3
		45	data2a .req x4
		46	has_nul1 .req x5
		47	has_nul2 .req x6
		48	tmp1 .req x7
		49	tmp2 .req x8
		50	tmp3 .req x9
		51	tmp4 .req x10
		52	zeroones .req x11
		53	pos .req x12
		54
		55	#define REP8_01 0x0101010101010101
		56	#define REP8_7f 0x7f7f7f7f7f7f7f7f
		57	#define REP8_80 0x8080808080808080
		58
		59	ENTRY(strlen)
		60	mov zeroones, #REP8_01
		61	bic src, srcin, #15
		62	ands tmp1, srcin, #15
		63	b.ne .Lmisaligned
		64	/*
		65	* NUL detection works on the principle that (X - 1) & (~X) & 0x80
		66	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
		67	* can be done in parallel across the entire word.
		68	*/
		69	/*
		70	* The inner loop deals with two Dwords at a time. This has a
		71	* slightly higher start-up cost, but we should win quite quickly,
		72	* especially on cores with a high number of issue slots per
		73	* cycle, as we get much better parallelism out of the operations.
		74	*/
		75	.Lloop:
		76	ldp data1, data2, [src], #16
		77	.Lrealigned:
		78	sub tmp1, data1, zeroones
		79	orr tmp2, data1, #REP8_7f
		80	sub tmp3, data2, zeroones
		81	orr tmp4, data2, #REP8_7f
		82	bic has_nul1, tmp1, tmp2
		83	bics has_nul2, tmp3, tmp4
		84	ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
		85	b.eq .Lloop
		86
		87	sub len, src, srcin
		88	cbz has_nul1, .Lnul_in_data2
		89	CPU_BE( mov data2, data1 ) /* prepare data to re-calculate the syndrome */
		90	sub len, len, #8
		91	mov has_nul2, has_nul1
		92	.Lnul_in_data2:
		93	/*
		94	* For big-endian, carry propagation (if the final byte in the
		95	* string is 0x01) means we cannot use has_nul directly. The
		96	* easiest way to get the correct byte is to byte-swap the data
		97	* and calculate the syndrome a second time.
		98	*/
		99	CPU_BE( rev data2, data2 )
		100	CPU_BE( sub tmp1, data2, zeroones )
		101	CPU_BE( orr tmp2, data2, #REP8_7f )
		102	CPU_BE( bic has_nul2, tmp1, tmp2 )
		103
		104	sub len, len, #8
		105	rev has_nul2, has_nul2
		106	clz pos, has_nul2
		107	add len, len, pos, lsr #3 /* Bits to bytes. */
		108	ret
		109
		110	.Lmisaligned:
		111	cmp tmp1, #8
		112	neg tmp1, tmp1
		113	ldp data1, data2, [src], #16
		114	lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
		115	mov tmp2, #~0
		116	/* Big-endian. Early bytes are at MSB. */
		117	CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
		118	/* Little-endian. Early bytes are at LSB. */
		119	CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
		120
		121	orr data1, data1, tmp2
		122	orr data2a, data2, tmp2
		123	csinv data1, data1, xzr, le
		124	csel data2, data2, data2a, le
		125	b .Lrealigned
		126	ENDPROC(strlen)


diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S new file mode 100644 index 000000000000..2ca665711bf2 --- /dev/null +++ b/arch/arm64/lib/strnlen.S
@@ -0,0 +1,171 @@
		1	/*
		2	* Copyright (C) 2013 ARM Ltd.
		3	* Copyright (C) 2013 Linaro.
		4	*
		5	* This code is based on glibc cortex strings work originally authored by Linaro
		6	* and re-licensed under GPLv2 for the Linux kernel. The original code can
		7	* be found @
		8	*
		9	* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
		10	* files/head:/src/aarch64/
		11	*
		12	* This program is free software; you can redistribute it and/or modify
		13	* it under the terms of the GNU General Public License version 2 as
		14	* published by the Free Software Foundation.
		15	*
		16	* This program is distributed in the hope that it will be useful,
		17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
		18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		19	* GNU General Public License for more details.
		20	*
		21	* You should have received a copy of the GNU General Public License
		22	* along with this program. If not, see <http://www.gnu.org/licenses/>.
		23	*/
		24
		25	#include <linux/linkage.h>
		26	#include <asm/assembler.h>
		27
		28	/*
		29	* determine the length of a fixed-size string
		30	*
		31	* Parameters:
		32	* x0 - const string pointer
		33	* x1 - maximal string length
		34	* Returns:
		35	* x0 - the return length of specific string
		36	*/
		37
		38	/* Arguments and results. */
		39	srcin .req x0
		40	len .req x0
		41	limit .req x1
		42
		43	/* Locals and temporaries. */
		44	src .req x2
		45	data1 .req x3
		46	data2 .req x4
		47	data2a .req x5
		48	has_nul1 .req x6
		49	has_nul2 .req x7
		50	tmp1 .req x8
		51	tmp2 .req x9
		52	tmp3 .req x10
		53	tmp4 .req x11
		54	zeroones .req x12
		55	pos .req x13
		56	limit_wd .req x14
		57
		58	#define REP8_01 0x0101010101010101
		59	#define REP8_7f 0x7f7f7f7f7f7f7f7f
		60	#define REP8_80 0x8080808080808080
		61
		62	ENTRY(strnlen)
		63	cbz limit, .Lhit_limit
		64	mov zeroones, #REP8_01
		65	bic src, srcin, #15
		66	ands tmp1, srcin, #15
		67	b.ne .Lmisaligned
		68	/* Calculate the number of full and partial words -1. */
		69	sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
		70	lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
		71
		72	/*
		73	* NUL detection works on the principle that (X - 1) & (~X) & 0x80
		74	* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
		75	* can be done in parallel across the entire word.
		76	*/
		77	/*
		78	* The inner loop deals with two Dwords at a time. This has a
		79	* slightly higher start-up cost, but we should win quite quickly,
		80	* especially on cores with a high number of issue slots per
		81	* cycle, as we get much better parallelism out of the operations.
		82	*/
		83	.Lloop:
		84	ldp data1, data2, [src], #16
		85	.Lrealigned:
		86	sub tmp1, data1, zeroones
		87	orr tmp2, data1, #REP8_7f
		88	sub tmp3, data2, zeroones
		89	orr tmp4, data2, #REP8_7f
		90	bic has_nul1, tmp1, tmp2
		91	bic has_nul2, tmp3, tmp4
		92	subs limit_wd, limit_wd, #1
		93	orr tmp1, has_nul1, has_nul2
		94	ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
		95	b.eq .Lloop
		96
		97	cbz tmp1, .Lhit_limit /* No null in final Qword. */
		98
		99	/*
		100	* We know there's a null in the final Qword. The easiest thing
		101	* to do now is work out the length of the string and return
		102	* MIN (len, limit).
		103	*/
		104	sub len, src, srcin
		105	cbz has_nul1, .Lnul_in_data2
		106	CPU_BE( mov data2, data1 ) /* prepare data to re-calculate the syndrome */
		107
		108	sub len, len, #8
		109	mov has_nul2, has_nul1
		110	.Lnul_in_data2:
		111	/*
		112	* For big-endian, carry propagation (if the final byte in the
		113	* string is 0x01) means we cannot use has_nul directly. The
		114	* easiest way to get the correct byte is to byte-swap the data
		115	* and calculate the syndrome a second time.
		116	*/
		117	CPU_BE( rev data2, data2 )
		118	CPU_BE( sub tmp1, data2, zeroones )
		119	CPU_BE( orr tmp2, data2, #REP8_7f )
		120	CPU_BE( bic has_nul2, tmp1, tmp2 )
		121
		122	sub len, len, #8
		123	rev has_nul2, has_nul2
		124	clz pos, has_nul2
		125	add len, len, pos, lsr #3 /* Bits to bytes. */
		126	cmp len, limit
		127	csel len, len, limit, ls /* Return the lower value. */
		128	ret
		129
		130	.Lmisaligned:
		131	/*
		132	* Deal with a partial first word.
		133	* We're doing two things in parallel here;
		134	* 1) Calculate the number of words (but avoiding overflow if
		135	* limit is near ULONG_MAX) - to do this we need to work out
		136	* limit + tmp1 - 1 as a 65-bit value before shifting it;
		137	* 2) Load and mask the initial data words - we force the bytes
		138	* before the ones we are interested in to 0xff - this ensures
		139	* early bytes will not hit any zero detection.
		140	*/
		141	ldp data1, data2, [src], #16
		142
		143	sub limit_wd, limit, #1
		144	and tmp3, limit_wd, #15
		145	lsr limit_wd, limit_wd, #4
		146
		147	add tmp3, tmp3, tmp1
		148	add limit_wd, limit_wd, tmp3, lsr #4
		149
		150	neg tmp4, tmp1
		151	lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
		152
		153	mov tmp2, #~0
		154	/* Big-endian. Early bytes are at MSB. */
		155	CPU_BE( lsl tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */
		156	/* Little-endian. Early bytes are at LSB. */
		157	CPU_LE( lsr tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */
		158
		159	cmp tmp1, #8
		160
		161	orr data1, data1, tmp2
		162	orr data2a, data2, tmp2
		163
		164	csinv data1, data1, xzr, le
		165	csel data2, data2, data2a, le
		166	b .Lrealigned
		167
		168	.Lhit_limit:
		169	mov len, limit
		170	ret
		171	ENDPROC(strnlen)