diff options
Diffstat (limited to 'arch/tile/lib/memset_32.c')
-rw-r--r-- | arch/tile/lib/memset_32.c | 274 |
1 files changed, 274 insertions, 0 deletions
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c new file mode 100644 index 000000000000..8593bc82398a --- /dev/null +++ b/arch/tile/lib/memset_32.c | |||
@@ -0,0 +1,274 @@ | |||
1 | /* | ||
2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <arch/chip.h> | ||
16 | |||
17 | #include <linux/types.h> | ||
18 | #include <linux/string.h> | ||
19 | #include <linux/module.h> | ||
20 | |||
21 | |||
22 | void *memset(void *s, int c, size_t n) | ||
23 | { | ||
24 | uint32_t *out32; | ||
25 | int n32; | ||
26 | uint32_t v16, v32; | ||
27 | uint8_t *out8 = s; | ||
28 | #if !CHIP_HAS_WH64() | ||
29 | int ahead32; | ||
30 | #else | ||
31 | int to_align32; | ||
32 | #endif | ||
33 | |||
34 | /* Experimentation shows that a trivial tight loop is a win up until | ||
35 | * around a size of 20, where writing a word at a time starts to win. | ||
36 | */ | ||
37 | #define BYTE_CUTOFF 20 | ||
38 | |||
39 | #if BYTE_CUTOFF < 3 | ||
40 | /* This must be at least at least this big, or some code later | ||
41 | * on doesn't work. | ||
42 | */ | ||
43 | #error "BYTE_CUTOFF is too small" | ||
44 | #endif | ||
45 | |||
46 | if (n < BYTE_CUTOFF) { | ||
47 | /* Strangely, this turns out to be the tightest way to | ||
48 | * write this loop. | ||
49 | */ | ||
50 | if (n != 0) { | ||
51 | do { | ||
52 | /* Strangely, combining these into one line | ||
53 | * performs worse. | ||
54 | */ | ||
55 | *out8 = c; | ||
56 | out8++; | ||
57 | } while (--n != 0); | ||
58 | } | ||
59 | |||
60 | return s; | ||
61 | } | ||
62 | |||
63 | #if !CHIP_HAS_WH64() | ||
64 | /* Use a spare issue slot to start prefetching the first cache | ||
65 | * line early. This instruction is free as the store can be buried | ||
66 | * in otherwise idle issue slots doing ALU ops. | ||
67 | */ | ||
68 | __insn_prefetch(out8); | ||
69 | |||
70 | /* We prefetch the end so that a short memset that spans two cache | ||
71 | * lines gets some prefetching benefit. Again we believe this is free | ||
72 | * to issue. | ||
73 | */ | ||
74 | __insn_prefetch(&out8[n - 1]); | ||
75 | #endif /* !CHIP_HAS_WH64() */ | ||
76 | |||
77 | |||
78 | /* Align 'out8'. We know n >= 3 so this won't write past the end. */ | ||
79 | while (((uintptr_t) out8 & 3) != 0) { | ||
80 | *out8++ = c; | ||
81 | --n; | ||
82 | } | ||
83 | |||
84 | /* Align 'n'. */ | ||
85 | while (n & 3) | ||
86 | out8[--n] = c; | ||
87 | |||
88 | out32 = (uint32_t *) out8; | ||
89 | n32 = n >> 2; | ||
90 | |||
91 | /* Tile input byte out to 32 bits. */ | ||
92 | v16 = __insn_intlb(c, c); | ||
93 | v32 = __insn_intlh(v16, v16); | ||
94 | |||
95 | /* This must be at least 8 or the following loop doesn't work. */ | ||
96 | #define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4) | ||
97 | |||
98 | #if !CHIP_HAS_WH64() | ||
99 | |||
100 | ahead32 = CACHE_LINE_SIZE_IN_WORDS; | ||
101 | |||
102 | /* We already prefetched the first and last cache lines, so | ||
103 | * we only need to do more prefetching if we are storing | ||
104 | * to more than two cache lines. | ||
105 | */ | ||
106 | if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) { | ||
107 | int i; | ||
108 | |||
109 | /* Prefetch the next several cache lines. | ||
110 | * This is the setup code for the software-pipelined | ||
111 | * loop below. | ||
112 | */ | ||
113 | #define MAX_PREFETCH 5 | ||
114 | ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS; | ||
115 | if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS) | ||
116 | ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS; | ||
117 | |||
118 | for (i = CACHE_LINE_SIZE_IN_WORDS; | ||
119 | i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS) | ||
120 | __insn_prefetch(&out32[i]); | ||
121 | } | ||
122 | |||
123 | if (n32 > ahead32) { | ||
124 | while (1) { | ||
125 | int j; | ||
126 | |||
127 | /* Prefetch by reading one word several cache lines | ||
128 | * ahead. Since loads are non-blocking this will | ||
129 | * cause the full cache line to be read while we are | ||
130 | * finishing earlier cache lines. Using a store | ||
131 | * here causes microarchitectural performance | ||
132 | * problems where a victimizing store miss goes to | ||
133 | * the head of the retry FIFO and locks the pipe for | ||
134 | * a few cycles. So a few subsequent stores in this | ||
135 | * loop go into the retry FIFO, and then later | ||
136 | * stores see other stores to the same cache line | ||
137 | * are already in the retry FIFO and themselves go | ||
138 | * into the retry FIFO, filling it up and grinding | ||
139 | * to a halt waiting for the original miss to be | ||
140 | * satisfied. | ||
141 | */ | ||
142 | __insn_prefetch(&out32[ahead32]); | ||
143 | |||
144 | #if 1 | ||
145 | #if CACHE_LINE_SIZE_IN_WORDS % 4 != 0 | ||
146 | #error "Unhandled CACHE_LINE_SIZE_IN_WORDS" | ||
147 | #endif | ||
148 | |||
149 | n32 -= CACHE_LINE_SIZE_IN_WORDS; | ||
150 | |||
151 | /* Save icache space by only partially unrolling | ||
152 | * this loop. | ||
153 | */ | ||
154 | for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) { | ||
155 | *out32++ = v32; | ||
156 | *out32++ = v32; | ||
157 | *out32++ = v32; | ||
158 | *out32++ = v32; | ||
159 | } | ||
160 | #else | ||
161 | /* Unfortunately, due to a code generator flaw this | ||
162 | * allocates a separate register for each of these | ||
163 | * stores, which requires a large number of spills, | ||
164 | * which makes this procedure enormously bigger | ||
165 | * (something like 70%) | ||
166 | */ | ||
167 | *out32++ = v32; | ||
168 | *out32++ = v32; | ||
169 | *out32++ = v32; | ||
170 | *out32++ = v32; | ||
171 | *out32++ = v32; | ||
172 | *out32++ = v32; | ||
173 | *out32++ = v32; | ||
174 | *out32++ = v32; | ||
175 | *out32++ = v32; | ||
176 | *out32++ = v32; | ||
177 | *out32++ = v32; | ||
178 | *out32++ = v32; | ||
179 | *out32++ = v32; | ||
180 | *out32++ = v32; | ||
181 | *out32++ = v32; | ||
182 | n32 -= 16; | ||
183 | #endif | ||
184 | |||
185 | /* To save compiled code size, reuse this loop even | ||
186 | * when we run out of prefetching to do by dropping | ||
187 | * ahead32 down. | ||
188 | */ | ||
189 | if (n32 <= ahead32) { | ||
190 | /* Not even a full cache line left, | ||
191 | * so stop now. | ||
192 | */ | ||
193 | if (n32 < CACHE_LINE_SIZE_IN_WORDS) | ||
194 | break; | ||
195 | |||
196 | /* Choose a small enough value that we don't | ||
197 | * prefetch past the end. There's no sense | ||
198 | * in touching cache lines we don't have to. | ||
199 | */ | ||
200 | ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1; | ||
201 | } | ||
202 | } | ||
203 | } | ||
204 | |||
205 | #else /* CHIP_HAS_WH64() */ | ||
206 | |||
207 | /* Determine how many words we need to emit before the 'out32' | ||
208 | * pointer becomes aligned modulo the cache line size. | ||
209 | */ | ||
210 | to_align32 = | ||
211 | (-((uintptr_t)out32 >> 2)) & (CACHE_LINE_SIZE_IN_WORDS - 1); | ||
212 | |||
213 | /* Only bother aligning and using wh64 if there is at least | ||
214 | * one full cache line to process. This check also prevents | ||
215 | * overrunning the end of the buffer with alignment words. | ||
216 | */ | ||
217 | if (to_align32 <= n32 - CACHE_LINE_SIZE_IN_WORDS) { | ||
218 | int lines_left; | ||
219 | |||
220 | /* Align out32 mod the cache line size so we can use wh64. */ | ||
221 | n32 -= to_align32; | ||
222 | for (; to_align32 != 0; to_align32--) { | ||
223 | *out32 = v32; | ||
224 | out32++; | ||
225 | } | ||
226 | |||
227 | /* Use unsigned divide to turn this into a right shift. */ | ||
228 | lines_left = (unsigned)n32 / CACHE_LINE_SIZE_IN_WORDS; | ||
229 | |||
230 | do { | ||
231 | /* Only wh64 a few lines at a time, so we don't | ||
232 | * exceed the maximum number of victim lines. | ||
233 | */ | ||
234 | int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS()) | ||
235 | ? lines_left | ||
236 | : CHIP_MAX_OUTSTANDING_VICTIMS()); | ||
237 | uint32_t *wh = out32; | ||
238 | int i = x; | ||
239 | int j; | ||
240 | |||
241 | lines_left -= x; | ||
242 | |||
243 | do { | ||
244 | __insn_wh64(wh); | ||
245 | wh += CACHE_LINE_SIZE_IN_WORDS; | ||
246 | } while (--i); | ||
247 | |||
248 | for (j = x * (CACHE_LINE_SIZE_IN_WORDS / 4); j != 0; j--) { | ||
249 | *out32++ = v32; | ||
250 | *out32++ = v32; | ||
251 | *out32++ = v32; | ||
252 | *out32++ = v32; | ||
253 | } | ||
254 | } while (lines_left != 0); | ||
255 | |||
256 | /* We processed all full lines above, so only this many | ||
257 | * words remain to be processed. | ||
258 | */ | ||
259 | n32 &= CACHE_LINE_SIZE_IN_WORDS - 1; | ||
260 | } | ||
261 | |||
262 | #endif /* CHIP_HAS_WH64() */ | ||
263 | |||
264 | /* Now handle any leftover values. */ | ||
265 | if (n32 != 0) { | ||
266 | do { | ||
267 | *out32 = v32; | ||
268 | out32++; | ||
269 | } while (--n32 != 0); | ||
270 | } | ||
271 | |||
272 | return s; | ||
273 | } | ||
274 | EXPORT_SYMBOL(memset); | ||