about summary refs log tree commit diff stats
path: root/arch/tile/lib/memset_32.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/tile/lib/memset_32.c')
-rw-r--r-- arch/tile/lib/memset_32.c 110
1 file changed, 1 insertion, 109 deletions
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
index 57dbb3a5bff8..2042bfe6595f 100644
--- a/arch/tile/lib/memset_32.c
+++ b/arch/tile/lib/memset_32.c
@@ -12,13 +12,10 @@
12 * more details. 12 * more details.
13 */ 13 */
14 14
15#include <arch/chip.h>
16
17#include <linux/types.h> 15#include <linux/types.h>
18#include <linux/string.h> 16#include <linux/string.h>
19#include <linux/module.h> 17#include <linux/module.h>
20 18#include <arch/chip.h>
21#undef memset
22 19
23void *memset(void *s, int c, size_t n) 20void *memset(void *s, int c, size_t n)
24{ 21{
@@ -26,11 +23,7 @@ void *memset(void *s, int c, size_t n)
26 int n32; 23 int n32;
27 uint32_t v16, v32; 24 uint32_t v16, v32;
28 uint8_t *out8 = s; 25 uint8_t *out8 = s;
29#if !CHIP_HAS_WH64()
30 int ahead32;
31#else
32 int to_align32; 26 int to_align32;
33#endif
34 27
35 /* Experimentation shows that a trivial tight loop is a win up until 28 /* Experimentation shows that a trivial tight loop is a win up until
36 * around a size of 20, where writing a word at a time starts to win. 29 * around a size of 20, where writing a word at a time starts to win.
@@ -61,21 +54,6 @@ void *memset(void *s, int c, size_t n)
61 return s; 54 return s;
62 } 55 }
63 56
64#if !CHIP_HAS_WH64()
65 /* Use a spare issue slot to start prefetching the first cache
66 * line early. This instruction is free as the store can be buried
67 * in otherwise idle issue slots doing ALU ops.
68 */
69 __insn_prefetch(out8);
70
71 /* We prefetch the end so that a short memset that spans two cache
72 * lines gets some prefetching benefit. Again we believe this is free
73 * to issue.
74 */
75 __insn_prefetch(&out8[n - 1]);
76#endif /* !CHIP_HAS_WH64() */
77
78
79 /* Align 'out8'. We know n >= 3 so this won't write past the end. */ 57 /* Align 'out8'. We know n >= 3 so this won't write past the end. */
80 while (((uintptr_t) out8 & 3) != 0) { 58 while (((uintptr_t) out8 & 3) != 0) {
81 *out8++ = c; 59 *out8++ = c;
@@ -96,90 +74,6 @@ void *memset(void *s, int c, size_t n)
96 /* This must be at least 8 or the following loop doesn't work. */ 74 /* This must be at least 8 or the following loop doesn't work. */
97#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4) 75#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
98 76
99#if !CHIP_HAS_WH64()
100
101 ahead32 = CACHE_LINE_SIZE_IN_WORDS;
102
103 /* We already prefetched the first and last cache lines, so
104 * we only need to do more prefetching if we are storing
105 * to more than two cache lines.
106 */
107 if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
108 int i;
109
110 /* Prefetch the next several cache lines.
111 * This is the setup code for the software-pipelined
112 * loop below.
113 */
114#define MAX_PREFETCH 5
115 ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
116 if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
117 ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
118
119 for (i = CACHE_LINE_SIZE_IN_WORDS;
120 i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
121 __insn_prefetch(&out32[i]);
122 }
123
124 if (n32 > ahead32) {
125 while (1) {
126 int j;
127
128 /* Prefetch by reading one word several cache lines
129 * ahead. Since loads are non-blocking this will
130 * cause the full cache line to be read while we are
131 * finishing earlier cache lines. Using a store
132 * here causes microarchitectural performance
133 * problems where a victimizing store miss goes to
134 * the head of the retry FIFO and locks the pipe for
135 * a few cycles. So a few subsequent stores in this
136 * loop go into the retry FIFO, and then later
137 * stores see other stores to the same cache line
138 * are already in the retry FIFO and themselves go
139 * into the retry FIFO, filling it up and grinding
140 * to a halt waiting for the original miss to be
141 * satisfied.
142 */
143 __insn_prefetch(&out32[ahead32]);
144
145#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
146#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
147#endif
148
149 n32 -= CACHE_LINE_SIZE_IN_WORDS;
150
151 /* Save icache space by only partially unrolling
152 * this loop.
153 */
154 for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
155 *out32++ = v32;
156 *out32++ = v32;
157 *out32++ = v32;
158 *out32++ = v32;
159 }
160
161 /* To save compiled code size, reuse this loop even
162 * when we run out of prefetching to do by dropping
163 * ahead32 down.
164 */
165 if (n32 <= ahead32) {
166 /* Not even a full cache line left,
167 * so stop now.
168 */
169 if (n32 < CACHE_LINE_SIZE_IN_WORDS)
170 break;
171
172 /* Choose a small enough value that we don't
173 * prefetch past the end. There's no sense
174 * in touching cache lines we don't have to.
175 */
176 ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
177 }
178 }
179 }
180
181#else /* CHIP_HAS_WH64() */
182
183 /* Determine how many words we need to emit before the 'out32' 77 /* Determine how many words we need to emit before the 'out32'
184 * pointer becomes aligned modulo the cache line size. 78 * pointer becomes aligned modulo the cache line size.
185 */ 79 */
@@ -236,8 +130,6 @@ void *memset(void *s, int c, size_t n)
236 n32 &= CACHE_LINE_SIZE_IN_WORDS - 1; 130 n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
237 } 131 }
238 132
239#endif /* CHIP_HAS_WH64() */
240
241 /* Now handle any leftover values. */ 133 /* Now handle any leftover values. */
242 if (n32 != 0) { 134 if (n32 != 0) {
243 do { 135 do {