author     Chris Metcalf <cmetcalf@tilera.com>   2013-08-01 15:52:17 -0400
committer  Chris Metcalf <cmetcalf@tilera.com>   2013-08-01 16:23:12 -0400
commit     c53c70a90fdce3e7a53a0412abf7cc2b2a645988 (patch)
tree       0b1ec6d04be95ac07563ba518047be73973d25d8 /arch/tile/lib
parent     dd78bc11fb2050b6a3990d0421feca4c68ca4335 (diff)
tile: optimize and clean up string functions
This change cleans up the string code in a number of ways:
- For memcpy(), fix a bug in the prefetch advance and increase the
  prefetch distance to 4 lines; optimize for unaligned data; do all loads
  before wh64 to make memcpy safe for forward-overlapping calls; etc.
  Performance is improved.
- Use the new copy_byte() function on tilegx to spread a single byte value
  out into a full word using the shufflebytes instruction (a portable
  sketch of this byte-replication idiom appears below).
- Clean up header include ordering to be more canonical, and remove
  spurious #undefs of function names.
Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
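
The multiply idiom being replaced and the new copy_byte() compute the same
value: multiplying a byte by 0x0101010101010101 replicates it into all eight
byte lanes of a 64-bit word. A minimal portable sketch of that equivalence
(plain C; the shufflebytes intrinsic itself exists only on tilegx):

#include <assert.h>
#include <stdint.h>

/* Portable equivalent of copy_byte(): replicate one byte into all
 * eight byte lanes of a 64-bit word via the multiply idiom that the
 * tilegx code is moving away from. */
static inline uint64_t replicate_byte(uint8_t c)
{
	return 0x0101010101010101ULL * c;
}

int main(void)
{
	assert(replicate_byte(0x5a) == 0x5a5a5a5a5a5a5a5aULL);
	return 0;
}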
Diffstat (limited to 'arch/tile/lib')
-rw-r--r--  arch/tile/lib/memchr_64.c      |   2
-rw-r--r--  arch/tile/lib/memcpy_64.c      | 261
-rw-r--r--  arch/tile/lib/memset_32.c      |   5
-rw-r--r--  arch/tile/lib/memset_64.c      |   9
-rw-r--r--  arch/tile/lib/strchr_32.c      |   2
-rw-r--r--  arch/tile/lib/strchr_64.c      |   2
-rw-r--r--  arch/tile/lib/string-endian.h  |  13
-rw-r--r--  arch/tile/lib/strlen_32.c      |   2
8 files changed, 212 insertions(+), 84 deletions(-)
diff --git a/arch/tile/lib/memchr_64.c b/arch/tile/lib/memchr_64.c
index 6f867dbf7c56..f8196b3a950e 100644
--- a/arch/tile/lib/memchr_64.c
+++ b/arch/tile/lib/memchr_64.c
@@ -36,7 +36,7 @@ void *memchr(const void *s, int c, size_t n)
 	p = (const uint64_t *)(s_int & -8);
 
 	/* Create eight copies of the byte for which we are looking. */
-	goal = 0x0101010101010101ULL * (uint8_t) c;
+	goal = copy_byte(c);
 
 	/* Read the first word, but munge it so that bytes before the array
 	 * will not match goal.
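
The hunk above only changes how goal is constructed; for context, a
replicated goal byte is typically consumed with the classic SWAR zero-byte
test, sketched below in generic C (this is the textbook idiom, not the
tile-specific compare instructions):

#include <stdint.h>

/* Nonzero iff some byte lane of v is 0x00 (classic SWAR test). */
static inline uint64_t haszero(uint64_t v)
{
	return (v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL;
}

/* XOR with the replicated goal turns matching bytes into zero,
 * so one haszero() call tests all eight bytes at once. */
static inline int word_has_byte(uint64_t word, uint64_t goal)
{
	return haszero(word ^ goal) != 0;
}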
diff --git a/arch/tile/lib/memcpy_64.c b/arch/tile/lib/memcpy_64.c
index c79b8e7c6828..46fc1600c17d 100644
--- a/arch/tile/lib/memcpy_64.c
+++ b/arch/tile/lib/memcpy_64.c
@@ -18,14 +18,17 @@
 /* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */
 
 /* Must be 8 bytes in size. */
-#define word_t uint64_t
+#define op_t uint64_t
 
-#if CHIP_L2_LINE_SIZE() != 64 && CHIP_L2_LINE_SIZE() != 128
-#error "Assumes 64 or 128 byte line size"
+/* Threshold value for when to enter the unrolled loops. */
+#define OP_T_THRES 16
+
+#if CHIP_L2_LINE_SIZE() != 64
+#error "Assumes 64 byte line size"
 #endif
 
 /* How many cache lines ahead should we prefetch? */
-#define PREFETCH_LINES_AHEAD 3
+#define PREFETCH_LINES_AHEAD 4
 
 /*
  * Provide "base versions" of load and store for the normal code path.
@@ -58,8 +61,8 @@ int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
 	const char *__restrict src1 = (const char *)srcv;
 	const char *__restrict src1_end;
 	const char *__restrict prefetch;
-	word_t *__restrict dst8; /* 8-byte pointer to destination memory. */
-	word_t final; /* Final bytes to write to trailing word, if any */
+	op_t *__restrict dst8; /* 8-byte pointer to destination memory. */
+	op_t final; /* Final bytes to write to trailing word, if any */
 	long i;
 
 	if (n < 16) {
@@ -79,104 +82,228 @@ int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
 	for (i = 0; i < PREFETCH_LINES_AHEAD; i++) {
 		__insn_prefetch(prefetch);
 		prefetch += CHIP_L2_LINE_SIZE();
-		prefetch = (prefetch > src1_end) ? prefetch : src1;
+		prefetch = (prefetch < src1_end) ? prefetch : src1;
 	}
 
 	/* Copy bytes until dst is word-aligned. */
-	for (; (uintptr_t)dst1 & (sizeof(word_t) - 1); n--)
+	for (; (uintptr_t)dst1 & (sizeof(op_t) - 1); n--)
 		ST1(dst1++, LD1(src1++));
 
 	/* 8-byte pointer to destination memory. */
-	dst8 = (word_t *)dst1;
+	dst8 = (op_t *)dst1;
 
-	if (__builtin_expect((uintptr_t)src1 & (sizeof(word_t) - 1), 0)) {
-		/*
-		 * Misaligned copy.  Copy 8 bytes at a time, but don't
-		 * bother with other fanciness.
-		 *
-		 * TODO: Consider prefetching and using wh64 as well.
-		 */
-
-		/* Create an aligned src8. */
-		const word_t *__restrict src8 =
-			(const word_t *)((uintptr_t)src1 & -sizeof(word_t));
-		word_t b;
-
-		word_t a = LD8(src8++);
-		for (; n >= sizeof(word_t); n -= sizeof(word_t)) {
-			b = LD8(src8++);
-			a = __insn_dblalign(a, b, src1);
-			ST8(dst8++, a);
-			a = b;
+	if (__builtin_expect((uintptr_t)src1 & (sizeof(op_t) - 1), 0)) {
+		/* Unaligned copy. */
+
+		op_t tmp0 = 0, tmp1 = 0, tmp2, tmp3;
+		const op_t *src8 = (const op_t *) ((uintptr_t)src1 &
+						   -sizeof(op_t));
+		const void *srci = (void *)src1;
+		int m;
+
+		m = (CHIP_L2_LINE_SIZE() << 2) -
+			(((uintptr_t)dst8) & ((CHIP_L2_LINE_SIZE() << 2) - 1));
+		m = (n < m) ? n : m;
+		m /= sizeof(op_t);
+
+		/* Copy until 'dst' is cache-line-aligned. */
+		n -= (sizeof(op_t) * m);
+
+		switch (m % 4) {
+		case 0:
+			if (__builtin_expect(!m, 0))
+				goto _M0;
+			tmp1 = LD8(src8++);
+			tmp2 = LD8(src8++);
+			goto _8B3;
+		case 2:
+			m += 2;
+			tmp3 = LD8(src8++);
+			tmp0 = LD8(src8++);
+			goto _8B1;
+		case 3:
+			m += 1;
+			tmp2 = LD8(src8++);
+			tmp3 = LD8(src8++);
+			goto _8B2;
+		case 1:
+			m--;
+			tmp0 = LD8(src8++);
+			tmp1 = LD8(src8++);
+			if (__builtin_expect(!m, 0))
+				goto _8B0;
+		}
+
+		do {
+			tmp2 = LD8(src8++);
+			tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+			ST8(dst8++, tmp0);
+_8B3:
+			tmp3 = LD8(src8++);
+			tmp1 = __insn_dblalign(tmp1, tmp2, srci);
+			ST8(dst8++, tmp1);
+_8B2:
+			tmp0 = LD8(src8++);
+			tmp2 = __insn_dblalign(tmp2, tmp3, srci);
+			ST8(dst8++, tmp2);
+_8B1:
+			tmp1 = LD8(src8++);
+			tmp3 = __insn_dblalign(tmp3, tmp0, srci);
+			ST8(dst8++, tmp3);
+			m -= 4;
+		} while (m);
+
+_8B0:
+		tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+		ST8(dst8++, tmp0);
+		src8--;
+
+_M0:
+		if (__builtin_expect(n >= CHIP_L2_LINE_SIZE(), 0)) {
+			op_t tmp4, tmp5, tmp6, tmp7, tmp8;
+
+			prefetch = ((const char *)src8) +
+				CHIP_L2_LINE_SIZE() * PREFETCH_LINES_AHEAD;
+
+			for (tmp0 = LD8(src8++); n >= CHIP_L2_LINE_SIZE();
+			     n -= CHIP_L2_LINE_SIZE()) {
+				/* Prefetch and advance to next line to
+				   prefetch, but don't go past the end. */
+				__insn_prefetch(prefetch);
+
+				/* Make sure prefetch got scheduled
+				   earlier. */
+				__asm__ ("" : : : "memory");
+
+				prefetch += CHIP_L2_LINE_SIZE();
+				prefetch = (prefetch < src1_end) ? prefetch :
+					(const char *) src8;
+
+				tmp1 = LD8(src8++);
+				tmp2 = LD8(src8++);
+				tmp3 = LD8(src8++);
+				tmp4 = LD8(src8++);
+				tmp5 = LD8(src8++);
+				tmp6 = LD8(src8++);
+				tmp7 = LD8(src8++);
+				tmp8 = LD8(src8++);
+
+				tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+				tmp1 = __insn_dblalign(tmp1, tmp2, srci);
+				tmp2 = __insn_dblalign(tmp2, tmp3, srci);
+				tmp3 = __insn_dblalign(tmp3, tmp4, srci);
+				tmp4 = __insn_dblalign(tmp4, tmp5, srci);
+				tmp5 = __insn_dblalign(tmp5, tmp6, srci);
+				tmp6 = __insn_dblalign(tmp6, tmp7, srci);
+				tmp7 = __insn_dblalign(tmp7, tmp8, srci);
+
+				__insn_wh64(dst8);
+
+				ST8(dst8++, tmp0);
+				ST8(dst8++, tmp1);
+				ST8(dst8++, tmp2);
+				ST8(dst8++, tmp3);
+				ST8(dst8++, tmp4);
+				ST8(dst8++, tmp5);
+				ST8(dst8++, tmp6);
+				ST8(dst8++, tmp7);
+
+				tmp0 = tmp8;
+			}
+			src8--;
+		}
+
+		/* Copy the rest 8-byte chunks. */
+		if (n >= sizeof(op_t)) {
+			tmp0 = LD8(src8++);
+			for (; n >= sizeof(op_t); n -= sizeof(op_t)) {
+				tmp1 = LD8(src8++);
+				tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+				ST8(dst8++, tmp0);
+				tmp0 = tmp1;
+			}
+			src8--;
 		}
 
 		if (n == 0)
 			return RETVAL;
 
-		b = ((const char *)src8 <= src1_end) ? *src8 : 0;
+		tmp0 = LD8(src8++);
+		tmp1 = ((const char *)src8 <= src1_end)
+			? LD8((op_t *)src8) : 0;
+		final = __insn_dblalign(tmp0, tmp1, srci);
 
-		/*
-		 * Final source bytes to write to trailing partial
-		 * word, if any.
-		 */
-		final = __insn_dblalign(a, b, src1);
 	} else {
 		/* Aligned copy. */
 
-		const word_t* __restrict src8 = (const word_t *)src1;
+		const op_t *__restrict src8 = (const op_t *)src1;
 
 		/* src8 and dst8 are both word-aligned. */
 		if (n >= CHIP_L2_LINE_SIZE()) {
 			/* Copy until 'dst' is cache-line-aligned. */
 			for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1);
-			     n -= sizeof(word_t))
+			     n -= sizeof(op_t))
 				ST8(dst8++, LD8(src8++));
 
 			for (; n >= CHIP_L2_LINE_SIZE(); ) {
-				__insn_wh64(dst8);
+				op_t tmp0, tmp1, tmp2, tmp3;
+				op_t tmp4, tmp5, tmp6, tmp7;
 
 				/*
 				 * Prefetch and advance to next line
-				 * to prefetch, but don't go past the end
+				 * to prefetch, but don't go past the
+				 * end.
 				 */
 				__insn_prefetch(prefetch);
+
+				/* Make sure prefetch got scheduled
+				   earlier. */
+				__asm__ ("" : : : "memory");
+
 				prefetch += CHIP_L2_LINE_SIZE();
-				prefetch = (prefetch > src1_end) ? prefetch :
+				prefetch = (prefetch < src1_end) ? prefetch :
 					(const char *)src8;
 
 				/*
-				 * Copy an entire cache line.  Manually
-				 * unrolled to avoid idiosyncracies of
-				 * compiler unrolling.
+				 * Do all the loads before wh64.  This
+				 * is necessary if [src8, src8+7] and
+				 * [dst8, dst8+7] share the same cache
+				 * line and dst8 <= src8, as can be
+				 * the case when called from memmove,
+				 * or with code tested on x86 whose
+				 * memcpy always works with forward
+				 * copies.
 				 */
-#define COPY_WORD(offset) ({ ST8(dst8+offset, LD8(src8+offset)); n -= 8; })
-				COPY_WORD(0);
-				COPY_WORD(1);
-				COPY_WORD(2);
-				COPY_WORD(3);
-				COPY_WORD(4);
-				COPY_WORD(5);
-				COPY_WORD(6);
-				COPY_WORD(7);
-#if CHIP_L2_LINE_SIZE() == 128
-				COPY_WORD(8);
-				COPY_WORD(9);
-				COPY_WORD(10);
-				COPY_WORD(11);
-				COPY_WORD(12);
-				COPY_WORD(13);
-				COPY_WORD(14);
-				COPY_WORD(15);
-#elif CHIP_L2_LINE_SIZE() != 64
-# error Fix code that assumes particular L2 cache line sizes
-#endif
+				tmp0 = LD8(src8++);
+				tmp1 = LD8(src8++);
+				tmp2 = LD8(src8++);
+				tmp3 = LD8(src8++);
+				tmp4 = LD8(src8++);
+				tmp5 = LD8(src8++);
+				tmp6 = LD8(src8++);
+				tmp7 = LD8(src8++);
+
+				/* wh64 and wait for tmp7 load completion. */
+				__asm__ ("move %0, %0; wh64 %1\n"
+					 : : "r"(tmp7), "r"(dst8));
 
-				dst8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
-				src8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
+				ST8(dst8++, tmp0);
+				ST8(dst8++, tmp1);
+				ST8(dst8++, tmp2);
+				ST8(dst8++, tmp3);
+				ST8(dst8++, tmp4);
+				ST8(dst8++, tmp5);
+				ST8(dst8++, tmp6);
+				ST8(dst8++, tmp7);
+
+				n -= CHIP_L2_LINE_SIZE();
 			}
+#if CHIP_L2_LINE_SIZE() != 64
+# error "Fix code that assumes particular L2 cache line size."
+#endif
 		}
 
-		for (; n >= sizeof(word_t); n -= sizeof(word_t))
+		for (; n >= sizeof(op_t); n -= sizeof(op_t))
 			ST8(dst8++, LD8(src8++));
 
 		if (__builtin_expect(n == 0, 1))
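
The rewritten unaligned path pipelines aligned loads through
__insn_dblalign, which funnels two adjacent aligned source words into one
aligned destination word; doing all the loads before wh64 is what makes
forward-overlapping copies safe, as the new comment explains. Below is a
portable sketch of the funnel-shift idea only (little-endian assumed,
helper name invented here; the real loop additionally unrolls four-deep,
prefetches, and uses wh64):

#include <stddef.h>
#include <stdint.h>

/* Copy nwords 8-byte words from an unaligned src to an aligned dst by
 * merging pairs of aligned source words with shifts -- a portable
 * analogue of the dblalign-based loop above. The caller must guarantee
 * that src is NOT 8-byte aligned, so shift != 0 below and the
 * (64 - shift) shift stays well defined (the aligned case is handled
 * by a separate branch, as in the kernel code). */
static void copy_from_unaligned(uint64_t *dst, const char *src,
				size_t nwords)
{
	const uint64_t *src8 =
		(const uint64_t *)((uintptr_t)src & ~(uintptr_t)7);
	unsigned shift = 8 * ((uintptr_t)src & 7);	/* 8..56 */
	uint64_t a = *src8++;	/* like the code above, reads back to the
				   start of the enclosing aligned word */

	while (nwords--) {
		uint64_t b = *src8++;
		/* Little-endian funnel shift of the two aligned words. */
		*dst++ = (a >> shift) | (b << (64 - shift));
		a = b;
	}
}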
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
index 57dbb3a5bff8..9a7837d11f7d 100644
--- a/arch/tile/lib/memset_32.c
+++ b/arch/tile/lib/memset_32.c
@@ -12,13 +12,10 @@
  * more details.
  */
 
-#include <arch/chip.h>
-
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/module.h>
-
-#undef memset
+#include <arch/chip.h>
 
 void *memset(void *s, int c, size_t n)
 {
diff --git a/arch/tile/lib/memset_64.c b/arch/tile/lib/memset_64.c
index 3873085711d5..03ef69cd73de 100644
--- a/arch/tile/lib/memset_64.c
+++ b/arch/tile/lib/memset_64.c
@@ -12,13 +12,11 @@
  * more details.
  */
 
-#include <arch/chip.h>
-
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/module.h>
-
-#undef memset
+#include <arch/chip.h>
+#include "string-endian.h"
 
 void *memset(void *s, int c, size_t n)
 {
@@ -70,8 +68,7 @@ void *memset(void *s, int c, size_t n)
 	n64 = n >> 3;
 
 	/* Tile input byte out to 64 bits. */
-	/* KLUDGE */
-	v64 = 0x0101010101010101ULL * (uint8_t)c;
+	v64 = copy_byte(c);
 
 	/* This must be at least 8 or the following loop doesn't work. */
 #define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8)
diff --git a/arch/tile/lib/strchr_32.c b/arch/tile/lib/strchr_32.c
index c94e6f7ae7b5..841fe6963019 100644
--- a/arch/tile/lib/strchr_32.c
+++ b/arch/tile/lib/strchr_32.c
@@ -16,8 +16,6 @@
 #include <linux/string.h>
 #include <linux/module.h>
 
-#undef strchr
-
 char *strchr(const char *s, int c)
 {
 	int z, g;
diff --git a/arch/tile/lib/strchr_64.c b/arch/tile/lib/strchr_64.c
index f39f9dc422b0..fe6e31c06f8d 100644
--- a/arch/tile/lib/strchr_64.c
+++ b/arch/tile/lib/strchr_64.c
@@ -26,7 +26,7 @@ char *strchr(const char *s, int c)
 	const uint64_t *p = (const uint64_t *)(s_int & -8);
 
 	/* Create eight copies of the byte for which we are looking. */
-	const uint64_t goal = 0x0101010101010101ULL * (uint8_t) c;
+	const uint64_t goal = copy_byte(c);
 
 	/* Read the first aligned word, but force bytes before the string to
 	 * match neither zero nor goal (we make sure the high bit of each
diff --git a/arch/tile/lib/string-endian.h b/arch/tile/lib/string-endian.h
index c0eed7ce69c3..2e49cbfe9371 100644
--- a/arch/tile/lib/string-endian.h
+++ b/arch/tile/lib/string-endian.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -31,3 +31,14 @@
 #define CFZ(x) __insn_clz(x)
 #define REVCZ(x) __insn_ctz(x)
 #endif
+
+/*
+ * Create eight copies of the byte in a uint64_t.  Byte Shuffle uses
+ * the bytes of srcB as the index into the dest vector to select a
+ * byte.  With all indices of zero, the first byte is copied into all
+ * the other bytes.
+ */
+static inline uint64_t copy_byte(uint8_t byte)
+{
+	return __insn_shufflebytes(byte, 0, 0);
+}
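
A usage sketch for the new helper (tilegx-only, since __insn_shufflebytes
is a compiler intrinsic there; the example value is purely illustrative):

#include <stdint.h>
#include "string-endian.h"

/* Build the eight-fold goal word consumed by memchr()/strchr() above,
 * e.g. goal_for('x') == 0x7878787878787878ULL. */
static inline uint64_t goal_for(int c)
{
	return copy_byte((uint8_t)c);
}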
diff --git a/arch/tile/lib/strlen_32.c b/arch/tile/lib/strlen_32.c
index 4974292a5534..f26f88e11e4a 100644
--- a/arch/tile/lib/strlen_32.c
+++ b/arch/tile/lib/strlen_32.c
@@ -16,8 +16,6 @@
 #include <linux/string.h>
 #include <linux/module.h>
 
-#undef strlen
-
 size_t strlen(const char *s)
 {
 	/* Get an aligned pointer. */