diff options
Diffstat (limited to 'arch/tile/lib/memset_32.c')
-rw-r--r-- | arch/tile/lib/memset_32.c | 105 |
1 file changed, 0 insertions, 105 deletions
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
index 9a7837d11f7d..2042bfe6595f 100644
--- a/arch/tile/lib/memset_32.c
+++ b/arch/tile/lib/memset_32.c
@@ -23,11 +23,7 @@ void *memset(void *s, int c, size_t n) | |||
23 | int n32; | 23 | int n32; |
24 | uint32_t v16, v32; | 24 | uint32_t v16, v32; |
25 | uint8_t *out8 = s; | 25 | uint8_t *out8 = s; |
26 | #if !CHIP_HAS_WH64() | ||
27 | int ahead32; | ||
28 | #else | ||
29 | int to_align32; | 26 | int to_align32; |
30 | #endif | ||
31 | 27 | ||
32 | /* Experimentation shows that a trivial tight loop is a win up until | 28 | /* Experimentation shows that a trivial tight loop is a win up until |
33 | * around a size of 20, where writing a word at a time starts to win. | 29 | * around a size of 20, where writing a word at a time starts to win. |
@@ -58,21 +54,6 @@ void *memset(void *s, int c, size_t n) | |||
58 | return s; | 54 | return s; |
59 | } | 55 | } |
60 | 56 | ||
61 | #if !CHIP_HAS_WH64() | ||
62 | /* Use a spare issue slot to start prefetching the first cache | ||
63 | * line early. This instruction is free as the store can be buried | ||
64 | * in otherwise idle issue slots doing ALU ops. | ||
65 | */ | ||
66 | __insn_prefetch(out8); | ||
67 | |||
68 | /* We prefetch the end so that a short memset that spans two cache | ||
69 | * lines gets some prefetching benefit. Again we believe this is free | ||
70 | * to issue. | ||
71 | */ | ||
72 | __insn_prefetch(&out8[n - 1]); | ||
73 | #endif /* !CHIP_HAS_WH64() */ | ||
74 | |||
75 | |||
76 | /* Align 'out8'. We know n >= 3 so this won't write past the end. */ | 57 | /* Align 'out8'. We know n >= 3 so this won't write past the end. */ |
77 | while (((uintptr_t) out8 & 3) != 0) { | 58 | while (((uintptr_t) out8 & 3) != 0) { |
78 | *out8++ = c; | 59 | *out8++ = c; |
@@ -93,90 +74,6 @@ void *memset(void *s, int c, size_t n) | |||
93 | /* This must be at least 8 or the following loop doesn't work. */ | 74 | /* This must be at least 8 or the following loop doesn't work. */ |
94 | #define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4) | 75 | #define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4) |
95 | 76 | ||
96 | #if !CHIP_HAS_WH64() | ||
97 | |||
98 | ahead32 = CACHE_LINE_SIZE_IN_WORDS; | ||
99 | |||
100 | /* We already prefetched the first and last cache lines, so | ||
101 | * we only need to do more prefetching if we are storing | ||
102 | * to more than two cache lines. | ||
103 | */ | ||
104 | if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) { | ||
105 | int i; | ||
106 | |||
107 | /* Prefetch the next several cache lines. | ||
108 | * This is the setup code for the software-pipelined | ||
109 | * loop below. | ||
110 | */ | ||
111 | #define MAX_PREFETCH 5 | ||
112 | ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS; | ||
113 | if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS) | ||
114 | ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS; | ||
115 | |||
116 | for (i = CACHE_LINE_SIZE_IN_WORDS; | ||
117 | i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS) | ||
118 | __insn_prefetch(&out32[i]); | ||
119 | } | ||
120 | |||
121 | if (n32 > ahead32) { | ||
122 | while (1) { | ||
123 | int j; | ||
124 | |||
125 | /* Prefetch by reading one word several cache lines | ||
126 | * ahead. Since loads are non-blocking this will | ||
127 | * cause the full cache line to be read while we are | ||
128 | * finishing earlier cache lines. Using a store | ||
129 | * here causes microarchitectural performance | ||
130 | * problems where a victimizing store miss goes to | ||
131 | * the head of the retry FIFO and locks the pipe for | ||
132 | * a few cycles. So a few subsequent stores in this | ||
133 | * loop go into the retry FIFO, and then later | ||
134 | * stores see other stores to the same cache line | ||
135 | * are already in the retry FIFO and themselves go | ||
136 | * into the retry FIFO, filling it up and grinding | ||
137 | * to a halt waiting for the original miss to be | ||
138 | * satisfied. | ||
139 | */ | ||
140 | __insn_prefetch(&out32[ahead32]); | ||
141 | |||
142 | #if CACHE_LINE_SIZE_IN_WORDS % 4 != 0 | ||
143 | #error "Unhandled CACHE_LINE_SIZE_IN_WORDS" | ||
144 | #endif | ||
145 | |||
146 | n32 -= CACHE_LINE_SIZE_IN_WORDS; | ||
147 | |||
148 | /* Save icache space by only partially unrolling | ||
149 | * this loop. | ||
150 | */ | ||
151 | for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) { | ||
152 | *out32++ = v32; | ||
153 | *out32++ = v32; | ||
154 | *out32++ = v32; | ||
155 | *out32++ = v32; | ||
156 | } | ||
157 | |||
158 | /* To save compiled code size, reuse this loop even | ||
159 | * when we run out of prefetching to do by dropping | ||
160 | * ahead32 down. | ||
161 | */ | ||
162 | if (n32 <= ahead32) { | ||
163 | /* Not even a full cache line left, | ||
164 | * so stop now. | ||
165 | */ | ||
166 | if (n32 < CACHE_LINE_SIZE_IN_WORDS) | ||
167 | break; | ||
168 | |||
169 | /* Choose a small enough value that we don't | ||
170 | * prefetch past the end. There's no sense | ||
171 | * in touching cache lines we don't have to. | ||
172 | */ | ||
173 | ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1; | ||
174 | } | ||
175 | } | ||
176 | } | ||
177 | |||
178 | #else /* CHIP_HAS_WH64() */ | ||
179 | |||
180 | /* Determine how many words we need to emit before the 'out32' | 77 | /* Determine how many words we need to emit before the 'out32' |
181 | * pointer becomes aligned modulo the cache line size. | 78 | * pointer becomes aligned modulo the cache line size. |
182 | */ | 79 | */ |
@@ -233,8 +130,6 @@ void *memset(void *s, int c, size_t n) | |||
233 | n32 &= CACHE_LINE_SIZE_IN_WORDS - 1; | 130 | n32 &= CACHE_LINE_SIZE_IN_WORDS - 1; |
234 | } | 131 | } |
235 | 132 | ||
236 | #endif /* CHIP_HAS_WH64() */ | ||
237 | |||
238 | /* Now handle any leftover values. */ | 133 | /* Now handle any leftover values. */ |
239 | if (n32 != 0) { | 134 | if (n32 != 0) { |
240 | do { | 135 | do { |