aboutsummaryrefslogtreecommitdiffstats
path: root/arch/openrisc/lib/memcpy.c
diff options
context:
space:
mode:
authorStafford Horne <shorne@gmail.com>2016-03-21 03:16:46 -0400
committerStafford Horne <shorne@gmail.com>2017-02-24 14:14:36 -0500
commitf5d45dc9116b17ee830d3425ece1e9485c9bab88 (patch)
tree1ad140d3860d795bf9e425d1aa5d34faf0514c22 /arch/openrisc/lib/memcpy.c
parentd857a1e253498feb231173218df26f5562c70f09 (diff)
openrisc: Add optimized memcpy routine
The generic memcpy routine provided in the kernel does only byte copies. Using word copies we can lower boot time and the cycles spent in memcpy quite significantly. Booting on my de0 nano I see boot times go from 7.2 to 5.6 seconds. The avg cycles in memcpy during boot go from 6467 to 1887. I tested several algorithms (see code in previous patch mails). The implementations I tested and their avg cycles: - Word Copies + Loop Unrolls + Non Aligned 1882 - Word Copies + Loop Unrolls 1887 - Word Copies 2441 - Byte Copies + Loop Unrolls 6467 - Byte Copies 7600 In the end I went with Word Copies + Loop Unrolls as it provides the best tradeoff between simplicity and boot speedups. Signed-off-by: Stafford Horne <shorne@gmail.com>
Diffstat (limited to 'arch/openrisc/lib/memcpy.c')
-rw-r--r--arch/openrisc/lib/memcpy.c124
1 files changed, 124 insertions, 0 deletions
diff --git a/arch/openrisc/lib/memcpy.c b/arch/openrisc/lib/memcpy.c
new file mode 100644
index 000000000000..4706f01a199a
--- /dev/null
+++ b/arch/openrisc/lib/memcpy.c
@@ -0,0 +1,124 @@
1/*
2 * arch/openrisc/lib/memcpy.c
3 *
4 * Optimized memory copy routines for openrisc. These are mostly copied
5 * from other sources but slightly extended based on ideas discussed in
6 * #openrisc.
7 *
8 * The word unroll implementation is an extension to the arm byte
9 * unrolled implementation, but using word copies (if things are
10 * properly aligned)
11 *
12 * The great arm loop unroll algorithm can be found at:
13 * arch/arm/boot/compressed/string.c
14 */
15
16#include <linux/export.h>
17
18#include <linux/string.h>
19
20#ifdef CONFIG_OR1200
/*
 * memcpy - copy @n bytes from @src to @dest, returning @dest.
 *
 * Do memcpy with word copies and loop unrolling. This gives the
 * best performance on the OR1200 and MOR1KX architectures.
 *
 * When both pointers are 32-bit aligned the bulk is moved in unrolled
 * runs of eight words (32 bytes) and the 16-, 8- and 4-byte remainders
 * are selected by testing the matching bits of @n.  Otherwise the bulk
 * is moved in unrolled runs of eight bytes followed by an optional run
 * of four.  In both cases the last 0-3 bytes are copied one at a time.
 * Data is always copied front to back.
 */
void *memcpy(void *dest, __const void *src, __kernel_size_t n)
{
	__kernel_size_t i;	/* unsigned: it counts chunks derived from n */
	unsigned char *d;
	const unsigned char *s;
	uint32_t *dest_w = dest;
	const uint32_t *src_w = src;

	/* If both source and dest are word aligned copy words */
	if (!(((unsigned long)dest | (unsigned long)src) & 3)) {
		/* Copy 32 bytes per loop */
		for (i = n >> 5; i > 0; i--) {
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
		}

		/* Bit 4 of n set: 16 more bytes (four words) */
		if (n & (1 << 4)) {
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
		}

		/* Bit 3 of n set: 8 more bytes (two words) */
		if (n & (1 << 3)) {
			*dest_w++ = *src_w++;
			*dest_w++ = *src_w++;
		}

		/* Bit 2 of n set: one last whole word */
		if (n & (1 << 2))
			*dest_w++ = *src_w++;

		/* Continue byte-wise from where the word copies stopped */
		d = (unsigned char *)dest_w;
		s = (const unsigned char *)src_w;
	} else {
		d = dest;
		s = src;

		/* Copy 8 bytes per loop */
		for (i = n >> 3; i > 0; i--) {
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
		}

		/* Bit 2 of n set: 4 more bytes */
		if (n & (1 << 2)) {
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
			*d++ = *s++;
		}
	}

	/* Shared tail: bits 1 and 0 of n select the final 2 and 1 bytes */
	if (n & (1 << 1)) {
		*d++ = *s++;
		*d++ = *s++;
	}

	if (n & 1)
		*d++ = *s++;

	return dest;
}
96#else
/*
 * memcpy - copy @n bytes from @src to @dest, returning @dest.
 *
 * Use word copies but no loop unrolling as we cannot assume there
 * will be benefits on the architecture.
 *
 * When both pointers are 32-bit aligned, whole words are copied while
 * at least four bytes remain.  Whatever is left (everything, if either
 * pointer is unaligned) is then copied byte by byte.  Data is always
 * copied front to back.
 */
void *memcpy(void *dest, __const void *src, __kernel_size_t n)
{
	unsigned char *d;
	const unsigned char *s;
	uint32_t *dest_w = dest;
	const uint32_t *src_w = src;

	/* If both source and dest are word aligned copy words */
	if (!(((unsigned long)dest | (unsigned long)src) & 3)) {
		for (; n >= 4; n -= 4)
			*dest_w++ = *src_w++;
	}

	/*
	 * The word pointers still equal dest/src when the loop above was
	 * skipped, so this resumes at the first byte not yet copied.
	 */
	d = (unsigned char *)dest_w;
	s = (const unsigned char *)src_w;

	/* For remaining or if not aligned, copy bytes */
	for (; n > 0; n--)
		*d++ = *s++;

	return dest;
}
122#endif
123
124EXPORT_SYMBOL(memcpy);