diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/alpha/lib/ev6-memset.S |
Linux-2.6.12-rc2 v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'arch/alpha/lib/ev6-memset.S')
-rw-r--r-- | arch/alpha/lib/ev6-memset.S | 597 |
1 files changed, 597 insertions, 0 deletions
diff --git a/arch/alpha/lib/ev6-memset.S b/arch/alpha/lib/ev6-memset.S new file mode 100644 index 000000000000..d8b94e1c7fca --- /dev/null +++ b/arch/alpha/lib/ev6-memset.S | |||
@@ -0,0 +1,597 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/ev6-memset.S | ||
3 | * | ||
4 | * This is an efficient (and relatively small) implementation of the C library | ||
5 | * "memset()" function for the 21264 implementation of Alpha. | ||
6 | * | ||
7 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | ||
8 | * | ||
9 | * Much of the information about 21264 scheduling/coding comes from: | ||
10 | * Compiler Writer's Guide for the Alpha 21264 | ||
11 | * abbreviated as 'CWG' in other comments here | ||
12 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | ||
13 | * Scheduling notation: | ||
14 | * E - either cluster | ||
15 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | ||
16 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | ||
17 | * The algorithm for the leading and trailing quadwords remains the same; | |
18 | * however, the loop has been unrolled to enable better memory throughput, | |
19 | * and the code has been replicated for each of the entry points: __memset | ||
20 | * and __memsetw to permit better scheduling to eliminate the stalling | ||
21 | * encountered during the mask replication. | ||
22 | * A future enhancement might be to put in a byte store loop for really | ||
23 | * small (say < 32 bytes) memset()s. Whether or not that change would be | ||
24 | * a win in the kernel would depend upon the contextual usage. | ||
25 | * WARNING: Maintaining this is going to be more work than the above version, | ||
26 | * as fixes will need to be made in multiple places. The performance gain | ||
27 | * is worth it. | ||
28 | */ | ||
29 | |||
30 | .set noat /* do not let assembler macro expansions use $at */ | |
31 | .set noreorder /* request that instructions stay in the order written; the code below is hand-scheduled */ | |
32 | .text | |
33 | .globl __memset /* byte fill */ | |
34 | .globl __memsetw /* 16-bit pattern fill */ | |
35 | .globl __constant_c_memset /* fill with a caller-supplied 64-bit pattern */ | |
36 | .globl memset /* alias of __memset (assigned at the end of the file) */ | |
37 | |||
38 | .ent __memset | |
39 | .align 5 | |
40 | __memset: | |
41 | .frame $30,0,$26,0 | |
42 | .prologue 0 | |
43 | |
44 | /* | |
45 | * In: $16 = dest, $17 = fill byte (only the low 8 bits are used), $18 = byte count. | |
46 | * Out: $0 = original dest. Serious stalling happens. The only way to mitigate this | |
47 | * is to undertake a major re-write to interleave the constant materialization with | |
48 | * other parts of the fall-through code. This is important, even though it makes | |
49 | * maintenance tougher. Do this later. | |
50 | */ | |
51 | and $17,255,$1 # E : 00000000000000ch | |
52 | insbl $17,1,$2 # U : 000000000000ch00 | |
53 | bis $16,$16,$0 # E : return value = original destination | |
54 | ble $18,end_b # U : zero (or negative) length requested? | |
55 | |
56 | addq $18,$16,$6 # E : end address (dest + count, one past the last byte) | |
57 | bis $1,$2,$17 # E : 000000000000chch | |
58 | insbl $1,2,$3 # U : 0000000000ch0000 | |
59 | insbl $1,3,$4 # U : 00000000ch000000 | |
60 | |
61 | or $3,$4,$3 # E : 00000000chch0000 | |
62 | inswl $17,4,$5 # U : 0000chch00000000 | |
63 | xor $16,$6,$1 # E : will complete write be within one quadword? | |
64 | inswl $17,6,$2 # U : chch000000000000 | |
65 | |
66 | or $17,$3,$17 # E : 00000000chchchch | |
67 | or $2,$5,$2 # E : chchchch00000000 | |
68 | bic $1,7,$1 # E : fit within a single quadword? | |
69 | and $16,7,$3 # E : Target addr misalignment | |
70 | |
71 | or $17,$2,$17 # E : chchchchchchchch | |
72 | beq $1,within_quad_b # U : dest and end address share one quadword | |
73 | nop # E : | |
74 | beq $3,aligned_b # U : target is 0mod8 | |
75 | |
76 | /* | |
77 | * Target address is misaligned, and won't fit within a quadword | |
78 | */ | |
79 | ldq_u $4,0($16) # L : Fetch first partial | |
80 | bis $16,$16,$5 # E : Save the address | |
81 | insql $17,$16,$2 # U : Insert new bytes | |
82 | subq $3,8,$3 # E : Invert: $3 = -(bytes up to the next quad boundary) | |
83 | |
84 | addq $18,$3,$18 # E : $18 is new count ($3 is negative) | |
85 | mskql $4,$16,$4 # U : clear relevant parts of the quad | |
86 | subq $16,$3,$16 # E : $16 is new aligned destination | |
87 | bis $2,$4,$1 # E : Final bytes | |
88 | |
89 | nop | |
90 | stq_u $1,0($5) # L : Store result | |
91 | nop | |
92 | nop | |
93 | |
94 | .align 4 | |
95 | aligned_b: | |
96 | /* | |
97 | * We are now guaranteed to be quad aligned, with at least | |
98 | * one partial quad to write. | |
99 | */ | |
100 | |
101 | sra $18,3,$3 # U : Number of remaining quads to write | |
102 | and $18,7,$18 # E : Number of trailing bytes to write | |
103 | bis $16,$16,$5 # E : Save dest address | |
104 | beq $3,no_quad_b # U : tail stuff only | |
105 | |
106 | /* | |
107 | * It's worth the effort to unroll this and use wh64 if possible. | |
108 | * Lifted a bunch of code from clear_user.S | |
109 | * At this point, entry values are: | |
110 | * $16 Current destination address | |
111 | * $5 A copy of $16 | |
112 | * $6 End address of the write (original dest + count) | |
113 | * $18 Number of trailing bytes | |
114 | * $3 Number of quads to write | |
115 | */ | |
116 | |
117 | and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop) | |
118 | subq $3, 16, $4 # E : Only try to unroll if >= 128 bytes (16 quads) | |
119 | subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64) | |
120 | blt $4, loop_b # U : fewer than 16 quads: use the simple loop | |
121 | |
122 | /* | |
123 | * We know we've got at least 16 quads, minimum of one trip | |
124 | * through unrolled loop. Do a quad at a time to get us 0mod64 | |
125 | * aligned. | |
126 | */ | |
127 | |
128 | nop # E : | |
129 | nop # E : | |
130 | nop # E : | |
131 | beq $1, $bigalign_b # U : NOTE(review): $1 = ($16 & 0x3f) - 0x40 is never 0, so this looks never taken | |
132 | |
133 | $alignmod64_b: | |
134 | stq $17, 0($5) # L : store one quad of the pattern | |
135 | subq $3, 1, $3 # E : For consistency later | |
136 | addq $1, 8, $1 # E : Increment towards zero for alignment | |
137 | addq $5, 8, $4 # E : Initial wh64 address (filler instruction) | |
138 | |
139 | nop | |
140 | nop | |
141 | addq $5, 8, $5 # E : Inc address | |
142 | blt $1, $alignmod64_b # U : loop until $5 is 0mod64 | |
143 | |
144 | $bigalign_b: | |
145 | /* | |
146 | * $3 - number quads left to go | |
147 | * $5 - target address (aligned 0mod64) | |
148 | * $17 - mask of stuff to store | |
149 | * Scratch registers available: $7, $2, $4, $1 | |
150 | * We know that we'll be taking a minimum of one trip through the loop. | |
151 | * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle | |
152 | * Assumes the wh64 needs to be for 2 trips through the loop in the future | |
153 | * The wh64 is issued for the starting destination address of trip +2 | |
154 | * through the loop, and if there are fewer than two trips left, the target | |
155 | * address will be for the current trip. | |
156 | */ | |
157 | |
158 | $do_wh64_b: | |
159 | wh64 ($4) # L1 : memory subsystem write hint | |
160 | subq $3, 24, $2 # E : For determining future wh64 addresses | |
161 | stq $17, 0($5) # L : | |
162 | nop # E : | |
163 | |
164 | addq $5, 128, $4 # E : speculative target of next wh64 | |
165 | stq $17, 8($5) # L : | |
166 | stq $17, 16($5) # L : | |
167 | addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr) | |
168 | |
169 | stq $17, 24($5) # L : | |
170 | stq $17, 32($5) # L : | |
171 | cmovlt $2, $7, $4 # E : fewer than 24 quads left: use fallback (latency 2, extra mapping cycle) | |
172 | nop | |
173 | |
174 | stq $17, 40($5) # L : | |
175 | stq $17, 48($5) # L : | |
176 | subq $3, 16, $2 # E : Repeat the loop at least once more? | |
177 | nop | |
178 | |
179 | stq $17, 56($5) # L : | |
180 | addq $5, 64, $5 # E : advance to the next 64-byte block | |
181 | subq $3, 8, $3 # E : 8 quads were stored this trip | |
182 | bge $2, $do_wh64_b # U : loop while at least 8 quads remain | |
183 | |
184 | nop | |
185 | nop | |
186 | nop | |
187 | beq $3, no_quad_b # U : Might have finished already | |
188 | |
189 | .align 4 | |
190 | /* | |
191 | * Simple loop for trailing quadwords, or for small amounts | |
192 | * of data (where we can't use an unrolled loop and wh64) | |
193 | */ | |
194 | loop_b: | |
195 | stq $17,0($5) # L : store one quad of the pattern | |
196 | subq $3,1,$3 # E : Decrement number quads left | |
197 | addq $5,8,$5 # E : Inc address | |
198 | bne $3,loop_b # U : more? | |
199 | |
200 | no_quad_b: | |
201 | /* | |
202 | * Write 0..7 trailing bytes. | |
203 | */ | |
204 | nop # E : | |
205 | beq $18,end_b # U : All done? | |
206 | ldq $7,0($5) # L : fetch the final (partial) quad | |
207 | mskqh $7,$6,$2 # U : Mask final quad | |
208 | |
209 | insqh $17,$6,$4 # U : New bits | |
210 | bis $2,$4,$1 # E : Put it all together | |
211 | stq $1,0($5) # L : And back to memory | |
212 | ret $31,($26),1 # L0 : | |
213 | |
214 | within_quad_b: | |
215 | ldq_u $1,0($16) # L : fetch the quad containing the whole range | |
216 | insql $17,$16,$2 # U : New bits | |
217 | mskql $1,$16,$4 # U : Clear old | |
218 | bis $2,$4,$2 # E : New result | |
219 | |
220 | mskql $2,$6,$4 # U : keep merged bytes below the end address | |
221 | mskqh $1,$6,$2 # U : keep original bytes from the end address up | |
222 | bis $2,$4,$1 # E : combine both halves | |
223 | stq_u $1,0($16) # L : write the quad back | |
224 | |
225 | end_b: | |
226 | nop | |
227 | nop | |
228 | nop | |
229 | ret $31,($26),1 # L0 : | |
230 | .end __memset | |
231 | |||
232 | /* | |
233 | * This is the original body of code, prior to replication and rescheduling. Leave it | |
234 | * here, as there may be calls to this entry point. In: $16 = dest, $17 = 64-bit fill | |
235 | * pattern (stored as-is; it is not replicated here), $18 = byte count. Out: $0 = dest. | |
236 | */ | |
237 | .align 4 | |
238 | .ent __constant_c_memset | |
239 | __constant_c_memset: | |
240 | .frame $30,0,$26,0 | |
241 | .prologue 0 | |
242 | |
243 | addq $18,$16,$6 # E : end address (dest + count, one past the last byte) | |
244 | bis $16,$16,$0 # E : return value = original destination | |
245 | xor $16,$6,$1 # E : will complete write be within one quadword? | |
246 | ble $18,end # U : zero (or negative) length requested? | |
247 | |
248 | bic $1,7,$1 # E : fit within a single quadword | |
249 | beq $1,within_one_quad # U : dest and end address share one quadword | |
250 | and $16,7,$3 # E : Target addr misalignment | |
251 | beq $3,aligned # U : target is 0mod8 | |
252 | |
253 | /* | |
254 | * Target address is misaligned, and won't fit within a quadword | |
255 | */ | |
256 | ldq_u $4,0($16) # L : Fetch first partial | |
257 | bis $16,$16,$5 # E : Save the address | |
258 | insql $17,$16,$2 # U : Insert new bytes | |
259 | subq $3,8,$3 # E : Invert: $3 = -(bytes up to the next quad boundary) | |
260 | |
261 | addq $18,$3,$18 # E : $18 is new count ($3 is negative) | |
262 | mskql $4,$16,$4 # U : clear relevant parts of the quad | |
263 | subq $16,$3,$16 # E : $16 is new aligned destination | |
264 | bis $2,$4,$1 # E : Final bytes | |
265 | |
266 | nop | |
267 | stq_u $1,0($5) # L : Store result | |
268 | nop | |
269 | nop | |
270 | |
271 | .align 4 | |
272 | aligned: | |
273 | /* | |
274 | * We are now guaranteed to be quad aligned, with at least | |
275 | * one partial quad to write. | |
276 | */ | |
277 | |
278 | sra $18,3,$3 # U : Number of remaining quads to write | |
279 | and $18,7,$18 # E : Number of trailing bytes to write | |
280 | bis $16,$16,$5 # E : Save dest address | |
281 | beq $3,no_quad # U : tail stuff only | |
282 | |
283 | /* | |
284 | * It's worth the effort to unroll this and use wh64 if possible. | |
285 | * Lifted a bunch of code from clear_user.S | |
286 | * At this point, entry values are: | |
287 | * $16 Current destination address | |
288 | * $5 A copy of $16 | |
289 | * $6 End address of the write (original dest + count) | |
290 | * $18 Number of trailing bytes | |
291 | * $3 Number of quads to write | |
292 | */ | |
293 | |
294 | and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop) | |
295 | subq $3, 16, $4 # E : Only try to unroll if >= 128 bytes (16 quads) | |
296 | subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64) | |
297 | blt $4, loop # U : fewer than 16 quads: use the simple loop | |
298 | |
299 | /* | |
300 | * We know we've got at least 16 quads, minimum of one trip | |
301 | * through unrolled loop. Do a quad at a time to get us 0mod64 | |
302 | * aligned. | |
303 | */ | |
304 | |
305 | nop # E : | |
306 | nop # E : | |
307 | nop # E : | |
308 | beq $1, $bigalign # U : NOTE(review): $1 = ($16 & 0x3f) - 0x40 is never 0, so this looks never taken | |
309 | |
310 | $alignmod64: | |
311 | stq $17, 0($5) # L : store one quad of the pattern | |
312 | subq $3, 1, $3 # E : For consistency later | |
313 | addq $1, 8, $1 # E : Increment towards zero for alignment | |
314 | addq $5, 8, $4 # E : Initial wh64 address (filler instruction) | |
315 | |
316 | nop | |
317 | nop | |
318 | addq $5, 8, $5 # E : Inc address | |
319 | blt $1, $alignmod64 # U : loop until $5 is 0mod64 | |
320 | |
321 | $bigalign: | |
322 | /* | |
323 | * $3 - number quads left to go | |
324 | * $5 - target address (aligned 0mod64) | |
325 | * $17 - mask of stuff to store | |
326 | * Scratch registers available: $7, $2, $4, $1 | |
327 | * We know that we'll be taking a minimum of one trip through the loop. | |
328 | * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle | |
329 | * Assumes the wh64 needs to be for 2 trips through the loop in the future | |
330 | * The wh64 is issued for the starting destination address of trip +2 | |
331 | * through the loop, and if there are fewer than two trips left, the target | |
332 | * address will be for the current trip. | |
333 | */ | |
334 | |
335 | $do_wh64: | |
336 | wh64 ($4) # L1 : memory subsystem write hint | |
337 | subq $3, 24, $2 # E : For determining future wh64 addresses | |
338 | stq $17, 0($5) # L : | |
339 | nop # E : | |
340 | |
341 | addq $5, 128, $4 # E : speculative target of next wh64 | |
342 | stq $17, 8($5) # L : | |
343 | stq $17, 16($5) # L : | |
344 | addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr) | |
345 | |
346 | stq $17, 24($5) # L : | |
347 | stq $17, 32($5) # L : | |
348 | cmovlt $2, $7, $4 # E : fewer than 24 quads left: use fallback (latency 2, extra mapping cycle) | |
349 | nop | |
350 | |
351 | stq $17, 40($5) # L : | |
352 | stq $17, 48($5) # L : | |
353 | subq $3, 16, $2 # E : Repeat the loop at least once more? | |
354 | nop | |
355 | |
356 | stq $17, 56($5) # L : | |
357 | addq $5, 64, $5 # E : advance to the next 64-byte block | |
358 | subq $3, 8, $3 # E : 8 quads were stored this trip | |
359 | bge $2, $do_wh64 # U : loop while at least 8 quads remain | |
360 | |
361 | nop | |
362 | nop | |
363 | nop | |
364 | beq $3, no_quad # U : Might have finished already | |
365 | |
366 | .align 4 | |
367 | /* | |
368 | * Simple loop for trailing quadwords, or for small amounts | |
369 | * of data (where we can't use an unrolled loop and wh64) | |
370 | */ | |
371 | loop: | |
372 | stq $17,0($5) # L : store one quad of the pattern | |
373 | subq $3,1,$3 # E : Decrement number quads left | |
374 | addq $5,8,$5 # E : Inc address | |
375 | bne $3,loop # U : more? | |
376 | |
377 | no_quad: | |
378 | /* | |
379 | * Write 0..7 trailing bytes. | |
380 | */ | |
381 | nop # E : | |
382 | beq $18,end # U : All done? | |
383 | ldq $7,0($5) # L : fetch the final (partial) quad | |
384 | mskqh $7,$6,$2 # U : Mask final quad | |
385 | |
386 | insqh $17,$6,$4 # U : New bits | |
387 | bis $2,$4,$1 # E : Put it all together | |
388 | stq $1,0($5) # L : And back to memory | |
389 | ret $31,($26),1 # L0 : | |
390 | |
391 | within_one_quad: | |
392 | ldq_u $1,0($16) # L : fetch the quad containing the whole range | |
393 | insql $17,$16,$2 # U : New bits | |
394 | mskql $1,$16,$4 # U : Clear old | |
395 | bis $2,$4,$2 # E : New result | |
396 | |
397 | mskql $2,$6,$4 # U : keep merged bytes below the end address | |
398 | mskqh $1,$6,$2 # U : keep original bytes from the end address up | |
399 | bis $2,$4,$1 # E : combine both halves | |
400 | stq_u $1,0($16) # L : write the quad back | |
401 | |
402 | end: | |
403 | nop | |
404 | nop | |
405 | nop | |
406 | ret $31,($26),1 # L0 : | |
407 | .end __constant_c_memset | |
408 | |||
409 | /* | |
410 | * This is a replica of the __constant_c_memset code, rescheduled to mask stalls. Note that entry point names also had to change. | |
411 | * In: $16 = dest, $17 = fill value (low 16 bits, replicated to all four words), $18 = byte count. Out: $0 = dest. NOTE(review): presumably dest is 2-byte aligned - confirm against callers. | |
412 | */ | |
413 | .align 5 | |
414 | .ent __memsetw | |
415 | |
416 | __memsetw: | |
417 | .frame $30,0,$26,0 | |
418 | .prologue 0 | |
419 | |
420 | inswl $17,0,$5 # U : 000000000000c1c2 | |
421 | inswl $17,2,$2 # U : 00000000c1c20000 | |
422 | bis $16,$16,$0 # E : return value = original destination | |
423 | addq $18,$16,$6 # E : end address (dest + count, one past the last byte) | |
424 | |
425 | ble $18, end_w # U : zero (or negative) length requested? | |
426 | inswl $17,4,$3 # U : 0000c1c200000000 | |
427 | inswl $17,6,$4 # U : c1c2000000000000 | |
428 | xor $16,$6,$1 # E : will complete write be within one quadword? | |
429 | |
430 | or $2,$5,$2 # E : 00000000c1c2c1c2 | |
431 | or $3,$4,$17 # E : c1c2c1c200000000 | |
432 | bic $1,7,$1 # E : fit within a single quadword | |
433 | and $16,7,$3 # E : Target addr misalignment | |
434 | |
435 | or $17,$2,$17 # E : c1c2c1c2c1c2c1c2 | |
436 | beq $1,within_quad_w # U : dest and end address share one quadword | |
437 | nop | |
438 | beq $3,aligned_w # U : target is 0mod8 | |
439 | |
440 | /* | |
441 | * Target address is misaligned, and won't fit within a quadword | |
442 | */ | |
443 | ldq_u $4,0($16) # L : Fetch first partial | |
444 | bis $16,$16,$5 # E : Save the address | |
445 | insql $17,$16,$2 # U : Insert new bytes | |
446 | subq $3,8,$3 # E : Invert: $3 = -(bytes up to the next quad boundary) | |
447 | |
448 | addq $18,$3,$18 # E : $18 is new count ($3 is negative) | |
449 | mskql $4,$16,$4 # U : clear relevant parts of the quad | |
450 | subq $16,$3,$16 # E : $16 is new aligned destination | |
451 | bis $2,$4,$1 # E : Final bytes | |
452 | |
453 | nop | |
454 | stq_u $1,0($5) # L : Store result | |
455 | nop | |
456 | nop | |
457 | |
458 | .align 4 | |
459 | aligned_w: | |
460 | /* | |
461 | * We are now guaranteed to be quad aligned, with at least | |
462 | * one partial quad to write. | |
463 | */ | |
464 | |
465 | sra $18,3,$3 # U : Number of remaining quads to write | |
466 | and $18,7,$18 # E : Number of trailing bytes to write | |
467 | bis $16,$16,$5 # E : Save dest address | |
468 | beq $3,no_quad_w # U : tail stuff only | |
469 | |
470 | /* | |
471 | * It's worth the effort to unroll this and use wh64 if possible. | |
472 | * Lifted a bunch of code from clear_user.S | |
473 | * At this point, entry values are: | |
474 | * $16 Current destination address | |
475 | * $5 A copy of $16 | |
476 | * $6 End address of the write (original dest + count) | |
477 | * $18 Number of trailing bytes | |
478 | * $3 Number of quads to write | |
479 | */ | |
480 | |
481 | and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop) | |
482 | subq $3, 16, $4 # E : Only try to unroll if >= 128 bytes (16 quads) | |
483 | subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64) | |
484 | blt $4, loop_w # U : fewer than 16 quads: use the simple loop | |
485 | |
486 | /* | |
487 | * We know we've got at least 16 quads, minimum of one trip | |
488 | * through unrolled loop. Do a quad at a time to get us 0mod64 | |
489 | * aligned. | |
490 | */ | |
491 | |
492 | nop # E : | |
493 | nop # E : | |
494 | nop # E : | |
495 | beq $1, $bigalign_w # U : NOTE(review): $1 = ($16 & 0x3f) - 0x40 is never 0, so this looks never taken | |
496 | |
497 | $alignmod64_w: | |
498 | stq $17, 0($5) # L : store one quad of the pattern | |
499 | subq $3, 1, $3 # E : For consistency later | |
500 | addq $1, 8, $1 # E : Increment towards zero for alignment | |
501 | addq $5, 8, $4 # E : Initial wh64 address (filler instruction) | |
502 | |
503 | nop | |
504 | nop | |
505 | addq $5, 8, $5 # E : Inc address | |
506 | blt $1, $alignmod64_w # U : loop until $5 is 0mod64 | |
507 | |
508 | $bigalign_w: | |
509 | /* | |
510 | * $3 - number quads left to go | |
511 | * $5 - target address (aligned 0mod64) | |
512 | * $17 - mask of stuff to store | |
513 | * Scratch registers available: $7, $2, $4, $1 | |
514 | * We know that we'll be taking a minimum of one trip through the loop. | |
515 | * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle | |
516 | * Assumes the wh64 needs to be for 2 trips through the loop in the future | |
517 | * The wh64 is issued for the starting destination address of trip +2 | |
518 | * through the loop, and if there are fewer than two trips left, the target | |
519 | * address will be for the current trip. | |
520 | */ | |
521 | |
522 | $do_wh64_w: | |
523 | wh64 ($4) # L1 : memory subsystem write hint | |
524 | subq $3, 24, $2 # E : For determining future wh64 addresses | |
525 | stq $17, 0($5) # L : | |
526 | nop # E : | |
527 | |
528 | addq $5, 128, $4 # E : speculative target of next wh64 | |
529 | stq $17, 8($5) # L : | |
530 | stq $17, 16($5) # L : | |
531 | addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr) | |
532 | |
533 | stq $17, 24($5) # L : | |
534 | stq $17, 32($5) # L : | |
535 | cmovlt $2, $7, $4 # E : fewer than 24 quads left: use fallback (latency 2, extra mapping cycle) | |
536 | nop | |
537 | |
538 | stq $17, 40($5) # L : | |
539 | stq $17, 48($5) # L : | |
540 | subq $3, 16, $2 # E : Repeat the loop at least once more? | |
541 | nop | |
542 | |
543 | stq $17, 56($5) # L : | |
544 | addq $5, 64, $5 # E : advance to the next 64-byte block | |
545 | subq $3, 8, $3 # E : 8 quads were stored this trip | |
546 | bge $2, $do_wh64_w # U : loop while at least 8 quads remain | |
547 | |
548 | nop | |
549 | nop | |
550 | nop | |
551 | beq $3, no_quad_w # U : Might have finished already | |
552 | |
553 | .align 4 | |
554 | /* | |
555 | * Simple loop for trailing quadwords, or for small amounts | |
556 | * of data (where we can't use an unrolled loop and wh64) | |
557 | */ | |
558 | loop_w: | |
559 | stq $17,0($5) # L : store one quad of the pattern | |
560 | subq $3,1,$3 # E : Decrement number quads left | |
561 | addq $5,8,$5 # E : Inc address | |
562 | bne $3,loop_w # U : more? | |
563 | |
564 | no_quad_w: | |
565 | /* | |
566 | * Write 0..7 trailing bytes. | |
567 | */ | |
568 | nop # E : | |
569 | beq $18,end_w # U : All done? | |
570 | ldq $7,0($5) # L : fetch the final (partial) quad | |
571 | mskqh $7,$6,$2 # U : Mask final quad | |
572 | |
573 | insqh $17,$6,$4 # U : New bits | |
574 | bis $2,$4,$1 # E : Put it all together | |
575 | stq $1,0($5) # L : And back to memory | |
576 | ret $31,($26),1 # L0 : | |
577 | |
578 | within_quad_w: | |
579 | ldq_u $1,0($16) # L : fetch the quad containing the whole range | |
580 | insql $17,$16,$2 # U : New bits | |
581 | mskql $1,$16,$4 # U : Clear old | |
582 | bis $2,$4,$2 # E : New result | |
583 | |
584 | mskql $2,$6,$4 # U : keep merged bytes below the end address | |
585 | mskqh $1,$6,$2 # U : keep original bytes from the end address up | |
586 | bis $2,$4,$1 # E : combine both halves | |
587 | stq_u $1,0($16) # L : write the quad back | |
588 | |
589 | end_w: | |
590 | nop | |
591 | nop | |
592 | nop | |
593 | ret $31,($26),1 # L0 : | |
594 | |
595 | .end __memsetw | |
596 | |||
597 | memset = __memset /* export memset as an alias for the __memset entry point */ | |