about summary refs log tree commit diff stats
path: root/arch/tile/lib/memset_32.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/tile/lib/memset_32.c')
-rw-r--r-- arch/tile/lib/memset_32.c 105
1 files changed, 0 insertions, 105 deletions
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
index 9a7837d11f7d..2042bfe6595f 100644
--- a/arch/tile/lib/memset_32.c
+++ b/arch/tile/lib/memset_32.c
@@ -23,11 +23,7 @@ void *memset(void *s, int c, size_t n)
23 int n32; 23 int n32;
24 uint32_t v16, v32; 24 uint32_t v16, v32;
25 uint8_t *out8 = s; 25 uint8_t *out8 = s;
26#if !CHIP_HAS_WH64()
27 int ahead32;
28#else
29 int to_align32; 26 int to_align32;
30#endif
31 27
32 /* Experimentation shows that a trivial tight loop is a win up until 28 /* Experimentation shows that a trivial tight loop is a win up until
33 * around a size of 20, where writing a word at a time starts to win. 29 * around a size of 20, where writing a word at a time starts to win.
@@ -58,21 +54,6 @@ void *memset(void *s, int c, size_t n)
58 return s; 54 return s;
59 } 55 }
60 56
61#if !CHIP_HAS_WH64()
62 /* Use a spare issue slot to start prefetching the first cache
63 * line early. This instruction is free as the store can be buried
64 * in otherwise idle issue slots doing ALU ops.
65 */
66 __insn_prefetch(out8);
67
68 /* We prefetch the end so that a short memset that spans two cache
69 * lines gets some prefetching benefit. Again we believe this is free
70 * to issue.
71 */
72 __insn_prefetch(&out8[n - 1]);
73#endif /* !CHIP_HAS_WH64() */
74
75
76 /* Align 'out8'. We know n >= 3 so this won't write past the end. */ 57 /* Align 'out8'. We know n >= 3 so this won't write past the end. */
77 while (((uintptr_t) out8 & 3) != 0) { 58 while (((uintptr_t) out8 & 3) != 0) {
78 *out8++ = c; 59 *out8++ = c;
@@ -93,90 +74,6 @@ void *memset(void *s, int c, size_t n)
93 /* This must be at least 8 or the following loop doesn't work. */ 74 /* This must be at least 8 or the following loop doesn't work. */
94#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4) 75#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
95 76
96#if !CHIP_HAS_WH64()
97
98 ahead32 = CACHE_LINE_SIZE_IN_WORDS;
99
100 /* We already prefetched the first and last cache lines, so
101 * we only need to do more prefetching if we are storing
102 * to more than two cache lines.
103 */
104 if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
105 int i;
106
107 /* Prefetch the next several cache lines.
108 * This is the setup code for the software-pipelined
109 * loop below.
110 */
111#define MAX_PREFETCH 5
112 ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
113 if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
114 ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
115
116 for (i = CACHE_LINE_SIZE_IN_WORDS;
117 i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
118 __insn_prefetch(&out32[i]);
119 }
120
121 if (n32 > ahead32) {
122 while (1) {
123 int j;
124
125 /* Prefetch by reading one word several cache lines
126 * ahead. Since loads are non-blocking this will
127 * cause the full cache line to be read while we are
128 * finishing earlier cache lines. Using a store
129 * here causes microarchitectural performance
130 * problems where a victimizing store miss goes to
131 * the head of the retry FIFO and locks the pipe for
132 * a few cycles. So a few subsequent stores in this
133 * loop go into the retry FIFO, and then later
134 * stores see other stores to the same cache line
135 * are already in the retry FIFO and themselves go
136 * into the retry FIFO, filling it up and grinding
137 * to a halt waiting for the original miss to be
138 * satisfied.
139 */
140 __insn_prefetch(&out32[ahead32]);
141
142#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
143#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
144#endif
145
146 n32 -= CACHE_LINE_SIZE_IN_WORDS;
147
148 /* Save icache space by only partially unrolling
149 * this loop.
150 */
151 for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
152 *out32++ = v32;
153 *out32++ = v32;
154 *out32++ = v32;
155 *out32++ = v32;
156 }
157
158 /* To save compiled code size, reuse this loop even
159 * when we run out of prefetching to do by dropping
160 * ahead32 down.
161 */
162 if (n32 <= ahead32) {
163 /* Not even a full cache line left,
164 * so stop now.
165 */
166 if (n32 < CACHE_LINE_SIZE_IN_WORDS)
167 break;
168
169 /* Choose a small enough value that we don't
170 * prefetch past the end. There's no sense
171 * in touching cache lines we don't have to.
172 */
173 ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
174 }
175 }
176 }
177
178#else /* CHIP_HAS_WH64() */
179
180 /* Determine how many words we need to emit before the 'out32' 77 /* Determine how many words we need to emit before the 'out32'
181 * pointer becomes aligned modulo the cache line size. 78 * pointer becomes aligned modulo the cache line size.
182 */ 79 */
@@ -233,8 +130,6 @@ void *memset(void *s, int c, size_t n)
233 n32 &= CACHE_LINE_SIZE_IN_WORDS - 1; 130 n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
234 } 131 }
235 132
236#endif /* CHIP_HAS_WH64() */
237
238 /* Now handle any leftover values. */ 133 /* Now handle any leftover values. */
239 if (n32 != 0) { 134 if (n32 != 0) {
240 do { 135 do {