author    Stafford Horne <shorne@gmail.com>    2016-03-21 03:16:46 -0400
committer Stafford Horne <shorne@gmail.com>    2017-02-24 14:14:36 -0500
commit    f5d45dc9116b17ee830d3425ece1e9485c9bab88 (patch)
tree      1ad140d3860d795bf9e425d1aa5d34faf0514c22
parent    d857a1e253498feb231173218df26f5562c70f09 (diff)
openrisc: Add optimized memcpy routine
The generic memcpy routine provided in the kernel only does byte copies.
Using word copies we can lower boot time and the cycles spent in memcpy
quite significantly. Booting on my de0 nano I see boot times go from 7.2
to 5.6 seconds. The avg cycles in memcpy during boot go from 6467 to 1887.

I tested several algorithms (see code in previous patch mails). The
implementations I tested and avg cycles:

 - Word Copies + Loop Unrolls + Non Aligned   1882
 - Word Copies + Loop Unrolls                 1887
 - Word Copies                                2441
 - Byte Copies + Loop Unrolls                 6467
 - Byte Copies                                7600

In the end I ended up going with Word Copies + Loop Unrolls as it
provides the best tradeoff between simplicity and boot speedups.

Signed-off-by: Stafford Horne <shorne@gmail.com>
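[Editor's note: for anyone wanting to reproduce a comparison like the one
above, a minimal user-space harness along the following lines can rank copy
variants. It is only a sketch: the numbers in this commit came from in-kernel
boot measurements on real hardware, and BUF_SIZE, ITERS and the byte_memcpy
reference below are illustrative assumptions, not the harness used for this
patch.]

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

#define BUF_SIZE 4096
#define ITERS    100000

/* Byte-at-a-time reference, mirroring the generic kernel routine */
static void *byte_memcpy(void *dest, const void *src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

	while (n--)
		*d++ = *s++;
	return dest;
}

/* Time ITERS copies of BUF_SIZE bytes, returning elapsed nanoseconds */
static double bench(void *(*copy)(void *, const void *, size_t))
{
	static unsigned char src[BUF_SIZE], dst[BUF_SIZE];
	struct timespec t0, t1;
	int i;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < ITERS; i++)
		copy(dst, src, BUF_SIZE);
	clock_gettime(CLOCK_MONOTONIC, &t1);

	return (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
}

int main(void)
{
	printf("byte copy:   %.0f ns\n", bench(byte_memcpy));
	printf("libc memcpy: %.0f ns\n", bench(memcpy));
	return 0;
}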
-rw-r--r--   arch/openrisc/TODO.openrisc           1
-rw-r--r--   arch/openrisc/include/asm/string.h    3
-rw-r--r--   arch/openrisc/lib/Makefile            2
-rw-r--r--   arch/openrisc/lib/memcpy.c          124
4 files changed, 128 insertions, 2 deletions
diff --git a/arch/openrisc/TODO.openrisc b/arch/openrisc/TODO.openrisc
index 0eb04c8240f9..c43d4e1d14eb 100644
--- a/arch/openrisc/TODO.openrisc
+++ b/arch/openrisc/TODO.openrisc
@@ -10,4 +10,3 @@ that are due for investigation shortly, i.e. our TODO list:
 or1k and this change is slowly trickling through the stack. For the time
 being, or32 is equivalent to or1k.
 
--- Implement optimized version of memcpy and memset
diff --git a/arch/openrisc/include/asm/string.h b/arch/openrisc/include/asm/string.h
index 33470d4d6948..64939ccd7531 100644
--- a/arch/openrisc/include/asm/string.h
+++ b/arch/openrisc/include/asm/string.h
@@ -4,4 +4,7 @@
 #define __HAVE_ARCH_MEMSET
 extern void *memset(void *s, int c, __kernel_size_t n);
 
+#define __HAVE_ARCH_MEMCPY
+extern void *memcpy(void *dest, __const void *src, __kernel_size_t n);
+
 #endif /* __ASM_OPENRISC_STRING_H */
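[Editor's note: defining __HAVE_ARCH_MEMCPY is what disables the generic
fallback. lib/string.c compiles its byte-at-a-time memcpy only when no
architecture override exists, roughly as follows (a paraphrased sketch of the
generic code, not part of this patch):]

#ifndef __HAVE_ARCH_MEMCPY
/* Generic fallback: built only when no arch-optimized memcpy exists */
void *memcpy(void *dest, const void *src, size_t count)
{
	char *tmp = dest;
	const char *s = src;

	while (count--)
		*tmp++ = *s++;
	return dest;
}
EXPORT_SYMBOL(memcpy);
#endif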
diff --git a/arch/openrisc/lib/Makefile b/arch/openrisc/lib/Makefile
index 67c583e0617f..17d9d37f32d2 100644
--- a/arch/openrisc/lib/Makefile
+++ b/arch/openrisc/lib/Makefile
@@ -2,4 +2,4 @@
 # Makefile for or32 specific library files..
 #
 
-obj-y = memset.o string.o delay.o
+obj-y := delay.o string.o memset.o memcpy.o
diff --git a/arch/openrisc/lib/memcpy.c b/arch/openrisc/lib/memcpy.c
new file mode 100644
index 000000000000..4706f01a199a
--- /dev/null
+++ b/arch/openrisc/lib/memcpy.c
@@ -0,0 +1,124 @@
+/*
+ * arch/openrisc/lib/memcpy.c
+ *
+ * Optimized memory copy routines for openrisc. These are mostly copied
+ * from other sources but slightly extended based on ideas discussed in
+ * #openrisc.
+ *
+ * The word unroll implementation is an extension to the arm byte
+ * unrolled implementation, but using word copies (if things are
+ * properly aligned)
+ *
+ * The great arm loop unroll algorithm can be found at:
+ *  arch/arm/boot/compressed/string.c
+ */
+
+#include <linux/export.h>
+
+#include <linux/string.h>
+
+#ifdef CONFIG_OR1200
+/*
+ * Do memcpy with word copies and loop unrolling. This gives the
+ * best performance on the OR1200 and MOR1KX architectures
+ */
+void *memcpy(void *dest, __const void *src, __kernel_size_t n)
+{
+	int i = 0;
+	unsigned char *d, *s;
+	uint32_t *dest_w = (uint32_t *)dest, *src_w = (uint32_t *)src;
+
+	/* If both source and dest are word aligned copy words */
+	if (!((unsigned int)dest_w & 3) && !((unsigned int)src_w & 3)) {
+		/* Copy 32 bytes per loop */
+		for (i = n >> 5; i > 0; i--) {
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+		}
+
+		if (n & 1 << 4) {
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+		}
+
+		if (n & 1 << 3) {
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+		}
+
+		if (n & 1 << 2)
+			*dest_w++ = *src_w++;
+
+		d = (unsigned char *)dest_w;
+		s = (unsigned char *)src_w;
+
+	} else {
+		d = (unsigned char *)dest_w;
+		s = (unsigned char *)src_w;
+
+		for (i = n >> 3; i > 0; i--) {
+			*d++ = *s++;
+			*d++ = *s++;
+			*d++ = *s++;
+			*d++ = *s++;
+			*d++ = *s++;
+			*d++ = *s++;
+			*d++ = *s++;
+			*d++ = *s++;
+		}
+
+		if (n & 1 << 2) {
+			*d++ = *s++;
+			*d++ = *s++;
+			*d++ = *s++;
+			*d++ = *s++;
+		}
+	}
+
+	if (n & 1 << 1) {
+		*d++ = *s++;
+		*d++ = *s++;
+	}
+
+	if (n & 1)
+		*d++ = *s++;
+
+	return dest;
+}
+#else
+/*
+ * Use word copies but no loop unrolling as we cannot assume there
+ * will be benefits on the architecture
+ */
+void *memcpy(void *dest, __const void *src, __kernel_size_t n)
+{
+	unsigned char *d = (unsigned char *)dest, *s = (unsigned char *)src;
+	uint32_t *dest_w = (uint32_t *)dest, *src_w = (uint32_t *)src;
+
+	/* If both source and dest are word aligned copy words */
+	if (!((unsigned int)dest_w & 3) && !((unsigned int)src_w & 3)) {
+		for (; n >= 4; n -= 4)
+			*dest_w++ = *src_w++;
+	}
+
+	d = (unsigned char *)dest_w;
+	s = (unsigned char *)src_w;
+
+	/* For remaining or if not aligned, copy bytes */
+	for (; n >= 1; n -= 1)
+		*d++ = *s++;
+
+	return dest;
+
+}
+#endif
+
+EXPORT_SYMBOL(memcpy);
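[Editor's note on the tail handling in the unrolled version: the main loop
moves 32 bytes per iteration (n >> 5 iterations), and the remainder is then
dispatched by testing individual bits of n -- 16, 8 and 4 bytes as word
copies, 2 and 1 as byte copies -- so every length is covered without a second
loop. A small user-space harness along these lines can sanity-check that
logic across lengths and alignments. This is a sketch only; or1k_memcpy is a
hypothetical stand-in name for the routine above compiled out of tree:]

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Stand-in declaration for the routine above when built out of tree */
void *or1k_memcpy(void *dest, const void *src, size_t n);

int main(void)
{
	static unsigned char src[256], dst[256], ref[256];
	size_t n, off, i;

	/* Exercise every length up to 200 at every word offset */
	for (n = 0; n < 200; n++) {
		for (off = 0; off < 4; off++) {
			for (i = 0; i < sizeof(src); i++) {
				src[i] = (unsigned char)i;
				dst[i] = ref[i] = 0;
			}
			memcpy(ref + off, src + off, n);
			or1k_memcpy(dst + off, src + off, n);
			/* Copied region and surrounding bytes must match */
			assert(memcmp(dst, ref, sizeof(dst)) == 0);
		}
	}
	return 0;
}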