diff options
Diffstat (limited to 'arch/tile/lib/memset_32.c')
-rw-r--r-- | arch/tile/lib/memset_32.c | 110 |
1 file changed, 1 insertion, 109 deletions
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c index 57dbb3a5bff8..2042bfe6595f 100644 --- a/arch/tile/lib/memset_32.c +++ b/arch/tile/lib/memset_32.c | |||
@@ -12,13 +12,10 @@ | |||
12 | * more details. | 12 | * more details. |
13 | */ | 13 | */ |
14 | 14 | ||
15 | #include <arch/chip.h> | ||
16 | |||
17 | #include <linux/types.h> | 15 | #include <linux/types.h> |
18 | #include <linux/string.h> | 16 | #include <linux/string.h> |
19 | #include <linux/module.h> | 17 | #include <linux/module.h> |
20 | 18 | #include <arch/chip.h> | |
21 | #undef memset | ||
22 | 19 | ||
23 | void *memset(void *s, int c, size_t n) | 20 | void *memset(void *s, int c, size_t n) |
24 | { | 21 | { |
@@ -26,11 +23,7 @@ void *memset(void *s, int c, size_t n) | |||
26 | int n32; | 23 | int n32; |
27 | uint32_t v16, v32; | 24 | uint32_t v16, v32; |
28 | uint8_t *out8 = s; | 25 | uint8_t *out8 = s; |
29 | #if !CHIP_HAS_WH64() | ||
30 | int ahead32; | ||
31 | #else | ||
32 | int to_align32; | 26 | int to_align32; |
33 | #endif | ||
34 | 27 | ||
35 | /* Experimentation shows that a trivial tight loop is a win up until | 28 | /* Experimentation shows that a trivial tight loop is a win up until |
36 | * around a size of 20, where writing a word at a time starts to win. | 29 | * around a size of 20, where writing a word at a time starts to win. |
@@ -61,21 +54,6 @@ void *memset(void *s, int c, size_t n) | |||
61 | return s; | 54 | return s; |
62 | } | 55 | } |
63 | 56 | ||
64 | #if !CHIP_HAS_WH64() | ||
65 | /* Use a spare issue slot to start prefetching the first cache | ||
66 | * line early. This instruction is free as the store can be buried | ||
67 | * in otherwise idle issue slots doing ALU ops. | ||
68 | */ | ||
69 | __insn_prefetch(out8); | ||
70 | |||
71 | /* We prefetch the end so that a short memset that spans two cache | ||
72 | * lines gets some prefetching benefit. Again we believe this is free | ||
73 | * to issue. | ||
74 | */ | ||
75 | __insn_prefetch(&out8[n - 1]); | ||
76 | #endif /* !CHIP_HAS_WH64() */ | ||
77 | |||
78 | |||
79 | /* Align 'out8'. We know n >= 3 so this won't write past the end. */ | 57 | /* Align 'out8'. We know n >= 3 so this won't write past the end. */ |
80 | while (((uintptr_t) out8 & 3) != 0) { | 58 | while (((uintptr_t) out8 & 3) != 0) { |
81 | *out8++ = c; | 59 | *out8++ = c; |
@@ -96,90 +74,6 @@ void *memset(void *s, int c, size_t n) | |||
96 | /* This must be at least 8 or the following loop doesn't work. */ | 74 | /* This must be at least 8 or the following loop doesn't work. */ |
97 | #define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4) | 75 | #define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4) |
98 | 76 | ||
99 | #if !CHIP_HAS_WH64() | ||
100 | |||
101 | ahead32 = CACHE_LINE_SIZE_IN_WORDS; | ||
102 | |||
103 | /* We already prefetched the first and last cache lines, so | ||
104 | * we only need to do more prefetching if we are storing | ||
105 | * to more than two cache lines. | ||
106 | */ | ||
107 | if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) { | ||
108 | int i; | ||
109 | |||
110 | /* Prefetch the next several cache lines. | ||
111 | * This is the setup code for the software-pipelined | ||
112 | * loop below. | ||
113 | */ | ||
114 | #define MAX_PREFETCH 5 | ||
115 | ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS; | ||
116 | if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS) | ||
117 | ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS; | ||
118 | |||
119 | for (i = CACHE_LINE_SIZE_IN_WORDS; | ||
120 | i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS) | ||
121 | __insn_prefetch(&out32[i]); | ||
122 | } | ||
123 | |||
124 | if (n32 > ahead32) { | ||
125 | while (1) { | ||
126 | int j; | ||
127 | |||
128 | /* Prefetch by reading one word several cache lines | ||
129 | * ahead. Since loads are non-blocking this will | ||
130 | * cause the full cache line to be read while we are | ||
131 | * finishing earlier cache lines. Using a store | ||
132 | * here causes microarchitectural performance | ||
133 | * problems where a victimizing store miss goes to | ||
134 | * the head of the retry FIFO and locks the pipe for | ||
135 | * a few cycles. So a few subsequent stores in this | ||
136 | * loop go into the retry FIFO, and then later | ||
137 | * stores see other stores to the same cache line | ||
138 | * are already in the retry FIFO and themselves go | ||
139 | * into the retry FIFO, filling it up and grinding | ||
140 | * to a halt waiting for the original miss to be | ||
141 | * satisfied. | ||
142 | */ | ||
143 | __insn_prefetch(&out32[ahead32]); | ||
144 | |||
145 | #if CACHE_LINE_SIZE_IN_WORDS % 4 != 0 | ||
146 | #error "Unhandled CACHE_LINE_SIZE_IN_WORDS" | ||
147 | #endif | ||
148 | |||
149 | n32 -= CACHE_LINE_SIZE_IN_WORDS; | ||
150 | |||
151 | /* Save icache space by only partially unrolling | ||
152 | * this loop. | ||
153 | */ | ||
154 | for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) { | ||
155 | *out32++ = v32; | ||
156 | *out32++ = v32; | ||
157 | *out32++ = v32; | ||
158 | *out32++ = v32; | ||
159 | } | ||
160 | |||
161 | /* To save compiled code size, reuse this loop even | ||
162 | * when we run out of prefetching to do by dropping | ||
163 | * ahead32 down. | ||
164 | */ | ||
165 | if (n32 <= ahead32) { | ||
166 | /* Not even a full cache line left, | ||
167 | * so stop now. | ||
168 | */ | ||
169 | if (n32 < CACHE_LINE_SIZE_IN_WORDS) | ||
170 | break; | ||
171 | |||
172 | /* Choose a small enough value that we don't | ||
173 | * prefetch past the end. There's no sense | ||
174 | * in touching cache lines we don't have to. | ||
175 | */ | ||
176 | ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1; | ||
177 | } | ||
178 | } | ||
179 | } | ||
180 | |||
181 | #else /* CHIP_HAS_WH64() */ | ||
182 | |||
183 | /* Determine how many words we need to emit before the 'out32' | 77 | /* Determine how many words we need to emit before the 'out32' |
184 | * pointer becomes aligned modulo the cache line size. | 78 | * pointer becomes aligned modulo the cache line size. |
185 | */ | 79 | */ |
@@ -236,8 +130,6 @@ void *memset(void *s, int c, size_t n) | |||
236 | n32 &= CACHE_LINE_SIZE_IN_WORDS - 1; | 130 | n32 &= CACHE_LINE_SIZE_IN_WORDS - 1; |
237 | } | 131 | } |
238 | 132 | ||
239 | #endif /* CHIP_HAS_WH64() */ | ||
240 | |||
241 | /* Now handle any leftover values. */ | 133 | /* Now handle any leftover values. */ |
242 | if (n32 != 0) { | 134 | if (n32 != 0) { |
243 | do { | 135 | do { |