diff options
author | Chris Metcalf <cmetcalf@tilera.com> | 2011-05-04 14:38:26 -0400 |
---|---|---|
committer | Chris Metcalf <cmetcalf@tilera.com> | 2011-05-12 15:52:12 -0400 |
commit | 18aecc2b645bbb07851b196452a2af314222069b (patch) | |
tree | 959f765f69af01046c6e26db12b45c3390799d3e /arch/tile/lib | |
parent | be84cb43833ee40a42e08f5425d20310f16229c7 (diff) |
arch/tile: finish enabling support for TILE-Gx 64-bit chip
This support was partially present in the existing code (look for
"__tilegx__" ifdefs) but with this change you can build a working
kernel using the TILE-Gx toolchain and ARCH=tilegx.
Most of these files are new, generally adding a foo_64.c file
where previously there was just a foo_32.c file.
The ARCH=tilegx directive redirects to arch/tile, not arch/tilegx,
using the existing SRCARCH mechanism in the top-level Makefile.
Changes to existing files:
- <asm/bitops.h> and <asm/bitops_32.h> changed to factor the
include of <asm-generic/bitops/non-atomic.h> in the common header.
- <asm/compat.h> and arch/tile/kernel/compat.c changed to remove
the "const" markers I had put on compat_sys_execve() when trying
to match some recent similar changes to the non-compat execve.
It turns out the compat version wasn't "upgraded" to use const.
- <asm/opcode-tile_64.h> and <asm/opcode_constants_64.h> were
previously included accidentally, with the 32-bit contents. Now
they have the proper 64-bit contents.
Finally, I had to hack the existing hacky drivers/input/input-compat.h
to add yet another "#ifdef" for INPUT_COMPAT_TEST (same as x86_64).
Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> [drivers/input]
Diffstat (limited to 'arch/tile/lib')
-rw-r--r-- | arch/tile/lib/memchr_64.c | 71 | ||||
-rw-r--r-- | arch/tile/lib/memcpy_64.c | 220 | ||||
-rw-r--r-- | arch/tile/lib/memcpy_user_64.c | 86 | ||||
-rw-r--r-- | arch/tile/lib/memset_64.c | 145 | ||||
-rw-r--r-- | arch/tile/lib/spinlock_64.c | 104 | ||||
-rw-r--r-- | arch/tile/lib/strchr_64.c | 67 | ||||
-rw-r--r-- | arch/tile/lib/strlen_64.c | 38 | ||||
-rw-r--r-- | arch/tile/lib/usercopy_64.S | 196 |
8 files changed, 927 insertions, 0 deletions
diff --git a/arch/tile/lib/memchr_64.c b/arch/tile/lib/memchr_64.c new file mode 100644 index 00000000000..84fdc8d8e73 --- /dev/null +++ b/arch/tile/lib/memchr_64.c | |||
@@ -0,0 +1,71 @@ | |||
1 | /* | ||
2 | * Copyright 2011 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/types.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/module.h> | ||
18 | |||
/*
 * memchr - find the first occurrence of a byte in a memory area.
 * @s: start of the area to search
 * @c: byte to look for (only the low 8 bits are used)
 * @n: number of bytes to search
 *
 * The area is scanned one aligned 64-bit word at a time.  Returns a
 * pointer to the first matching byte, or NULL if the byte does not
 * occur within the first @n bytes (or if @n is zero).
 */
void *memchr(const void *s, int c, size_t n)
{
	const uint64_t *last_word_ptr;
	const uint64_t *p;
	const char *last_byte_ptr;
	uintptr_t s_int, last_int;
	uint64_t goal, before_mask, v, bits;
	char *ret;

	if (__builtin_expect(n == 0, 0)) {
		/* Don't dereference any memory if the array is empty. */
		return NULL;
	}

	/* Get an aligned pointer. */
	s_int = (uintptr_t) s;
	p = (const uint64_t *)(s_int & -8);

	/* Create eight copies of the byte for which we are looking. */
	goal = 0x0101010101010101ULL * (uint8_t) c;

	/*
	 * Read the first word, but munge it so that bytes before the array
	 * will not match goal.
	 *
	 * The shift count is reduced to the byte offset within the word
	 * explicitly: shifting a 64-bit value by 64 or more is undefined
	 * in C, so we cannot rely on the count being taken mod 64.
	 */
	before_mask = (1ULL << ((s_int & 7) << 3)) - 1;
	v = (*p | before_mask) ^ (goal & before_mask);

	/*
	 * Compute the address of the last byte.  Do the arithmetic on
	 * integers so that a huge n wraps in a defined way, and clamp to
	 * the highest address if it did wrap; otherwise the end-of-array
	 * checks below would compare against an address before 's'.
	 */
	last_int = s_int + n - 1;
	if (__builtin_expect(last_int < s_int, 0))
		last_int = (uintptr_t)-1;
	last_byte_ptr = (const char *)last_int;

	/* Compute the address of the word containing the last byte. */
	last_word_ptr = (const uint64_t *)(last_int & -8);

	while ((bits = __insn_v1cmpeq(v, goal)) == 0) {
		if (__builtin_expect(p == last_word_ptr, 0)) {
			/*
			 * We already read the last word in the array,
			 * so give up.
			 */
			return NULL;
		}
		v = *++p;
	}

	/*
	 * We found a match, but it might be in a byte past the end
	 * of the array.
	 */
	ret = ((char *)p) + (__insn_ctz(bits) >> 3);
	return (ret <= last_byte_ptr) ? ret : NULL;
}
EXPORT_SYMBOL(memchr);
diff --git a/arch/tile/lib/memcpy_64.c b/arch/tile/lib/memcpy_64.c new file mode 100644 index 00000000000..3fab9a6a2bb --- /dev/null +++ b/arch/tile/lib/memcpy_64.c | |||
@@ -0,0 +1,220 @@ | |||
1 | /* | ||
2 | * Copyright 2011 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/types.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/module.h> | ||
18 | #define __memcpy memcpy | ||
19 | /* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */ | ||
20 | |||
21 | /* Must be 8 bytes in size. */ | ||
22 | #define word_t uint64_t | ||
23 | |||
24 | #if CHIP_L2_LINE_SIZE() != 64 && CHIP_L2_LINE_SIZE() != 128 | ||
25 | #error "Assumes 64 or 128 byte line size" | ||
26 | #endif | ||
27 | |||
28 | /* How many cache lines ahead should we prefetch? */ | ||
29 | #define PREFETCH_LINES_AHEAD 3 | ||
30 | |||
31 | /* | ||
32 | * Provide "base versions" of load and store for the normal code path. | ||
33 | * The kernel provides other versions for userspace copies. | ||
34 | */ | ||
35 | #define ST(p, v) (*(p) = (v)) | ||
36 | #define LD(p) (*(p)) | ||
37 | |||
38 | #ifndef USERCOPY_FUNC | ||
39 | #define ST1 ST | ||
40 | #define ST2 ST | ||
41 | #define ST4 ST | ||
42 | #define ST8 ST | ||
43 | #define LD1 LD | ||
44 | #define LD2 LD | ||
45 | #define LD4 LD | ||
46 | #define LD8 LD | ||
47 | #define RETVAL dstv | ||
48 | void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n) | ||
49 | #else | ||
50 | /* | ||
51 | * Special kernel version will provide implementation of the LDn/STn | ||
52 | * macros to return a count of uncopied bytes due to mm fault. | ||
53 | */ | ||
54 | #define RETVAL 0 | ||
55 | int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n) | ||
56 | #endif | ||
57 | { | ||
58 | char *__restrict dst1 = (char *)dstv; | ||
59 | const char *__restrict src1 = (const char *)srcv; | ||
60 | const char *__restrict src1_end; | ||
61 | const char *__restrict prefetch; | ||
62 | word_t *__restrict dst8; /* 8-byte pointer to destination memory. */ | ||
63 | word_t final; /* Final bytes to write to trailing word, if any */ | ||
64 | long i; | ||
65 | |||
66 | if (n < 16) { | ||
67 | for (; n; n--) | ||
68 | ST1(dst1++, LD1(src1++)); | ||
69 | return RETVAL; | ||
70 | } | ||
71 | |||
72 | /* | ||
73 | * Locate the end of source memory we will copy. Don't | ||
74 | * prefetch past this. | ||
75 | */ | ||
76 | src1_end = src1 + n - 1; | ||
77 | |||
78 | /* Prefetch ahead a few cache lines, but not past the end. */ | ||
79 | prefetch = src1; | ||
80 | for (i = 0; i < PREFETCH_LINES_AHEAD; i++) { | ||
81 | __insn_prefetch(prefetch); | ||
82 | prefetch += CHIP_L2_LINE_SIZE(); | ||
83 | prefetch = (prefetch > src1_end) ? prefetch : src1; | ||
84 | } | ||
85 | |||
86 | /* Copy bytes until dst is word-aligned. */ | ||
87 | for (; (uintptr_t)dst1 & (sizeof(word_t) - 1); n--) | ||
88 | ST1(dst1++, LD1(src1++)); | ||
89 | |||
90 | /* 8-byte pointer to destination memory. */ | ||
91 | dst8 = (word_t *)dst1; | ||
92 | |||
93 | if (__builtin_expect((uintptr_t)src1 & (sizeof(word_t) - 1), 0)) { | ||
94 | /* | ||
95 | * Misaligned copy. Copy 8 bytes at a time, but don't | ||
96 | * bother with other fanciness. | ||
97 | * | ||
98 | * TODO: Consider prefetching and using wh64 as well. | ||
99 | */ | ||
100 | |||
101 | /* Create an aligned src8. */ | ||
102 | const word_t *__restrict src8 = | ||
103 | (const word_t *)((uintptr_t)src1 & -sizeof(word_t)); | ||
104 | word_t b; | ||
105 | |||
106 | word_t a = LD8(src8++); | ||
107 | for (; n >= sizeof(word_t); n -= sizeof(word_t)) { | ||
108 | b = LD8(src8++); | ||
109 | a = __insn_dblalign(a, b, src1); | ||
110 | ST8(dst8++, a); | ||
111 | a = b; | ||
112 | } | ||
113 | |||
114 | if (n == 0) | ||
115 | return RETVAL; | ||
116 | |||
117 | b = ((const char *)src8 <= src1_end) ? *src8 : 0; | ||
118 | |||
119 | /* | ||
120 | * Final source bytes to write to trailing partial | ||
121 | * word, if any. | ||
122 | */ | ||
123 | final = __insn_dblalign(a, b, src1); | ||
124 | } else { | ||
125 | /* Aligned copy. */ | ||
126 | |||
127 | const word_t* __restrict src8 = (const word_t *)src1; | ||
128 | |||
129 | /* src8 and dst8 are both word-aligned. */ | ||
130 | if (n >= CHIP_L2_LINE_SIZE()) { | ||
131 | /* Copy until 'dst' is cache-line-aligned. */ | ||
132 | for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1); | ||
133 | n -= sizeof(word_t)) | ||
134 | ST8(dst8++, LD8(src8++)); | ||
135 | |||
136 | for (; n >= CHIP_L2_LINE_SIZE(); ) { | ||
137 | __insn_wh64(dst8); | ||
138 | |||
139 | /* | ||
140 | * Prefetch and advance to next line | ||
141 | * to prefetch, but don't go past the end | ||
142 | */ | ||
143 | __insn_prefetch(prefetch); | ||
144 | prefetch += CHIP_L2_LINE_SIZE(); | ||
145 | prefetch = (prefetch > src1_end) ? prefetch : | ||
146 | (const char *)src8; | ||
147 | |||
148 | /* | ||
149 | * Copy an entire cache line. Manually | ||
150 | * unrolled to avoid idiosyncracies of | ||
151 | * compiler unrolling. | ||
152 | */ | ||
153 | #define COPY_WORD(offset) ({ ST8(dst8+offset, LD8(src8+offset)); n -= 8; }) | ||
154 | COPY_WORD(0); | ||
155 | COPY_WORD(1); | ||
156 | COPY_WORD(2); | ||
157 | COPY_WORD(3); | ||
158 | COPY_WORD(4); | ||
159 | COPY_WORD(5); | ||
160 | COPY_WORD(6); | ||
161 | COPY_WORD(7); | ||
162 | #if CHIP_L2_LINE_SIZE() == 128 | ||
163 | COPY_WORD(8); | ||
164 | COPY_WORD(9); | ||
165 | COPY_WORD(10); | ||
166 | COPY_WORD(11); | ||
167 | COPY_WORD(12); | ||
168 | COPY_WORD(13); | ||
169 | COPY_WORD(14); | ||
170 | COPY_WORD(15); | ||
171 | #elif CHIP_L2_LINE_SIZE() != 64 | ||
172 | # error Fix code that assumes particular L2 cache line sizes | ||
173 | #endif | ||
174 | |||
175 | dst8 += CHIP_L2_LINE_SIZE() / sizeof(word_t); | ||
176 | src8 += CHIP_L2_LINE_SIZE() / sizeof(word_t); | ||
177 | } | ||
178 | } | ||
179 | |||
180 | for (; n >= sizeof(word_t); n -= sizeof(word_t)) | ||
181 | ST8(dst8++, LD8(src8++)); | ||
182 | |||
183 | if (__builtin_expect(n == 0, 1)) | ||
184 | return RETVAL; | ||
185 | |||
186 | final = LD8(src8); | ||
187 | } | ||
188 | |||
189 | /* n != 0 if we get here. Write out any trailing bytes. */ | ||
190 | dst1 = (char *)dst8; | ||
191 | if (n & 4) { | ||
192 | ST4((uint32_t *)dst1, final); | ||
193 | dst1 += 4; | ||
194 | final >>= 32; | ||
195 | n &= 3; | ||
196 | } | ||
197 | if (n & 2) { | ||
198 | ST2((uint16_t *)dst1, final); | ||
199 | dst1 += 2; | ||
200 | final >>= 16; | ||
201 | n &= 1; | ||
202 | } | ||
203 | if (n) | ||
204 | ST1((uint8_t *)dst1, final); | ||
205 | |||
206 | return RETVAL; | ||
207 | } | ||
208 | |||
209 | |||
210 | #ifdef USERCOPY_FUNC | ||
211 | #undef ST1 | ||
212 | #undef ST2 | ||
213 | #undef ST4 | ||
214 | #undef ST8 | ||
215 | #undef LD1 | ||
216 | #undef LD2 | ||
217 | #undef LD4 | ||
218 | #undef LD8 | ||
219 | #undef USERCOPY_FUNC | ||
220 | #endif | ||
diff --git a/arch/tile/lib/memcpy_user_64.c b/arch/tile/lib/memcpy_user_64.c new file mode 100644 index 00000000000..4763b3aff1c --- /dev/null +++ b/arch/tile/lib/memcpy_user_64.c | |||
@@ -0,0 +1,86 @@ | |||
1 | /* | ||
2 | * Copyright 2011 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | * | ||
14 | * Do memcpy(), but trap and return "n" when a load or store faults. | ||
15 | * | ||
16 | * Note: this idiom only works when memcpy() compiles to a leaf function. | ||
17 | * If "sp" is updated during memcpy, the "jrp lr" will be incorrect. | ||
18 | * | ||
19 | * Also note that we are capturing "n" from the containing scope here. | ||
20 | */ | ||
21 | |||
/*
 * _ST(p, inst, v): store v to *p using the store instruction 'inst'.
 *
 * The store is labelled "1".  A fixup labelled "2" is emitted into
 * .coldtext.memcpy; it moves the enclosing function's "n" into r0 and
 * does "jrp lr".  The __ex_table entry pairs the address of the store
 * with the address of that fixup.
 */
22 | #define _ST(p, inst, v) \ | ||
23 | ({ \ | ||
24 | asm("1: " #inst " %0, %1;" \ | ||
25 | ".pushsection .coldtext.memcpy,\"ax\";" \ | ||
26 | "2: { move r0, %2; jrp lr };" \ | ||
27 | ".section __ex_table,\"a\";" \ | ||
28 | ".quad 1b, 2b;" \ | ||
29 | ".popsection" \ | ||
30 | : "=m" (*(p)) : "r" (v), "r" (n)); \ | ||
31 | }) | ||
32 | |||
/*
 * _LD(p, inst): load from *p using the load instruction 'inst' and
 * yield the loaded value as an unsigned long.
 *
 * The load is labelled "1".  A fixup labelled "2" is emitted into
 * .coldtext.memcpy; it moves the enclosing function's "n" into r0 and
 * does "jrp lr".  The __ex_table entry pairs the address of the load
 * with the address of that fixup.
 */
33 | #define _LD(p, inst) \ | ||
34 | ({ \ | ||
35 | unsigned long __v; \ | ||
36 | asm("1: " #inst " %0, %1;" \ | ||
37 | ".pushsection .coldtext.memcpy,\"ax\";" \ | ||
38 | "2: { move r0, %2; jrp lr };" \ | ||
39 | ".section __ex_table,\"a\";" \ | ||
40 | ".quad 1b, 2b;" \ | ||
41 | ".popsection" \ | ||
42 | : "=r" (__v) : "m" (*(p)), "r" (n)); \ | ||
43 | __v; \ | ||
44 | }) | ||
45 | |||
/*
 * Each group below defines USERCOPY_FUNC plus the STn/LDn macros and
 * then includes memcpy_64.c, which defines a function with that name
 * and #undefs these macros again at its end.
 */

/* __copy_to_user_inatomic: stores go through _ST, loads are plain LD. */
46 | #define USERCOPY_FUNC __copy_to_user_inatomic | ||
47 | #define ST1(p, v) _ST((p), st1, (v)) | ||
48 | #define ST2(p, v) _ST((p), st2, (v)) | ||
49 | #define ST4(p, v) _ST((p), st4, (v)) | ||
50 | #define ST8(p, v) _ST((p), st, (v)) | ||
51 | #define LD1 LD | ||
52 | #define LD2 LD | ||
53 | #define LD4 LD | ||
54 | #define LD8 LD | ||
55 | #include "memcpy_64.c" | ||
56 | | ||
/* __copy_from_user_inatomic: loads go through _LD, stores are plain ST. */
57 | #define USERCOPY_FUNC __copy_from_user_inatomic | ||
58 | #define ST1 ST | ||
59 | #define ST2 ST | ||
60 | #define ST4 ST | ||
61 | #define ST8 ST | ||
62 | #define LD1(p) _LD((p), ld1u) | ||
63 | #define LD2(p) _LD((p), ld2u) | ||
64 | #define LD4(p) _LD((p), ld4u) | ||
65 | #define LD8(p) _LD((p), ld) | ||
66 | #include "memcpy_64.c" | ||
67 | | ||
/* __copy_in_user_inatomic: both stores (_ST) and loads (_LD) are trapped. */
68 | #define USERCOPY_FUNC __copy_in_user_inatomic | ||
69 | #define ST1(p, v) _ST((p), st1, (v)) | ||
70 | #define ST2(p, v) _ST((p), st2, (v)) | ||
71 | #define ST4(p, v) _ST((p), st4, (v)) | ||
72 | #define ST8(p, v) _ST((p), st, (v)) | ||
73 | #define LD1(p) _LD((p), ld1u) | ||
74 | #define LD2(p) _LD((p), ld2u) | ||
75 | #define LD4(p) _LD((p), ld4u) | ||
76 | #define LD8(p) _LD((p), ld) | ||
77 | #include "memcpy_64.c" | ||
78 | |||
79 | unsigned long __copy_from_user_zeroing(void *to, const void __user *from, | ||
80 | unsigned long n) | ||
81 | { | ||
82 | unsigned long rc = __copy_from_user_inatomic(to, from, n); | ||
83 | if (unlikely(rc)) | ||
84 | memset(to + n - rc, 0, rc); | ||
85 | return rc; | ||
86 | } | ||
diff --git a/arch/tile/lib/memset_64.c b/arch/tile/lib/memset_64.c new file mode 100644 index 00000000000..3873085711d --- /dev/null +++ b/arch/tile/lib/memset_64.c | |||
@@ -0,0 +1,145 @@ | |||
1 | /* | ||
2 | * Copyright 2011 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <arch/chip.h> | ||
16 | |||
17 | #include <linux/types.h> | ||
18 | #include <linux/string.h> | ||
19 | #include <linux/module.h> | ||
20 | |||
21 | #undef memset | ||
22 | |||
/*
 * memset - fill a memory area with a byte value.
 * @s: start of the area
 * @c: fill value (only the low 8 bits are used)
 * @n: number of bytes to fill
 *
 * Short areas are filled a byte at a time.  Longer areas are trimmed
 * at both ends to an 8-byte-aligned body which is filled with 64-bit
 * stores, using wh64 on whole cache lines when at least one full line
 * is available.  Returns @s.
 */
void *memset(void *s, int c, size_t n)
{
	uint64_t *out64;
	/* Word counts are size_t so that n >> 3 cannot be truncated. */
	size_t n64, to_align64;
	uint64_t v64;
	uint8_t *out8 = s;

	/*
	 * Experimentation shows that a trivial tight loop is a win up until
	 * around a size of 20, where writing a word at a time starts to win.
	 */
#define BYTE_CUTOFF 20

#if BYTE_CUTOFF < 7
	/*
	 * This must be at least this big, or some code later
	 * on doesn't work.
	 */
#error "BYTE_CUTOFF is too small"
#endif

	if (n < BYTE_CUTOFF) {
		/*
		 * Strangely, this turns out to be the tightest way to
		 * write this loop.
		 */
		if (n != 0) {
			do {
				/*
				 * Strangely, combining these into one line
				 * performs worse.
				 */
				*out8 = c;
				out8++;
			} while (--n != 0);
		}

		return s;
	}

	/*
	 * Align 'out8'.  We know n >= BYTE_CUTOFF (itself at least 7),
	 * so this won't write past the end.
	 */
	while (((uintptr_t) out8 & 7) != 0) {
		*out8++ = c;
		--n;
	}

	/* Align 'n' by filling the trailing bytes from the end. */
	while (n & 7)
		out8[--n] = c;

	out64 = (uint64_t *) out8;
	n64 = n >> 3;

	/* Tile input byte out to 64 bits. */
	v64 = 0x0101010101010101ULL * (uint8_t)c;

	/* This must be at least 8 or the following loop doesn't work. */
#define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8)

	/*
	 * Determine how many words we need to emit before the 'out64'
	 * pointer becomes aligned modulo the cache line size.
	 */
	to_align64 = (-((uintptr_t)out64 >> 3)) &
		(CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1);

	/*
	 * Only bother aligning and using wh64 if there is at least
	 * one full cache line to process.  This check also prevents
	 * overrunning the end of the buffer with alignment words.
	 * The first comparison keeps the unsigned subtraction from
	 * wrapping when fewer than one line of words remains.
	 */
	if (n64 >= (size_t)CACHE_LINE_SIZE_IN_DOUBLEWORDS &&
	    to_align64 <= n64 - (size_t)CACHE_LINE_SIZE_IN_DOUBLEWORDS) {
		size_t lines_left;

		/* Align out64 mod the cache line size so we can use wh64. */
		n64 -= to_align64;
		for (; to_align64 != 0; to_align64--) {
			*out64 = v64;
			out64++;
		}

		/* At least one full line remains, so lines_left >= 1. */
		lines_left = n64 / (size_t)CACHE_LINE_SIZE_IN_DOUBLEWORDS;

		do {
			/*
			 * Only wh64 a few lines at a time, so we don't
			 * exceed the maximum number of victim lines.
			 */
			size_t x = ((lines_left <
				     (size_t)CHIP_MAX_OUTSTANDING_VICTIMS())
				    ? lines_left
				    : (size_t)CHIP_MAX_OUTSTANDING_VICTIMS());
			uint64_t *wh = out64;
			size_t i = x;
			size_t j;

			lines_left -= x;

			do {
				__insn_wh64(wh);
				wh += CACHE_LINE_SIZE_IN_DOUBLEWORDS;
			} while (--i);

			/* Fill the x lines just claimed, four words a pass. */
			for (j = x * (CACHE_LINE_SIZE_IN_DOUBLEWORDS / 4);
			     j != 0; j--) {
				*out64++ = v64;
				*out64++ = v64;
				*out64++ = v64;
				*out64++ = v64;
			}
		} while (lines_left != 0);

		/*
		 * We processed all full lines above, so only this many
		 * words remain to be processed.
		 */
		n64 &= CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1;
	}

	/* Now handle any leftover values. */
	if (n64 != 0) {
		do {
			*out64 = v64;
			out64++;
		} while (--n64 != 0);
	}

	return s;
}
EXPORT_SYMBOL(memset);
diff --git a/arch/tile/lib/spinlock_64.c b/arch/tile/lib/spinlock_64.c new file mode 100644 index 00000000000..d6fb9581e98 --- /dev/null +++ b/arch/tile/lib/spinlock_64.c | |||
@@ -0,0 +1,104 @@ | |||
1 | /* | ||
2 | * Copyright 2011 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/spinlock.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <asm/processor.h> | ||
18 | |||
19 | #include "spinlock_common.h" | ||
20 | |||
21 | /* | ||
22 | * Read the spinlock value without allocating in our cache and without | ||
23 | * causing an invalidation to another cpu with a copy of the cacheline. | ||
24 | * This is important when we are spinning waiting for the lock. | ||
25 | */ | ||
26 | static inline u32 arch_spin_read_noalloc(void *lock) | ||
27 | { | ||
28 | return atomic_cmpxchg((atomic_t *)lock, -1, -1); | ||
29 | } | ||
30 | |||
31 | /* | ||
32 | * Wait until the high bits (current) match my ticket. | ||
33 | * If we notice the overflow bit set on entry, we clear it. | ||
34 | */ | ||
35 | void arch_spin_lock_slow(arch_spinlock_t *lock, u32 my_ticket) | ||
36 | { | ||
37 | if (unlikely(my_ticket & __ARCH_SPIN_NEXT_OVERFLOW)) { | ||
38 | __insn_fetchand4(&lock->lock, ~__ARCH_SPIN_NEXT_OVERFLOW); | ||
39 | my_ticket &= ~__ARCH_SPIN_NEXT_OVERFLOW; | ||
40 | } | ||
41 | |||
42 | for (;;) { | ||
43 | u32 val = arch_spin_read_noalloc(lock); | ||
44 | u32 delta = my_ticket - arch_spin_current(val); | ||
45 | if (delta == 0) | ||
46 | return; | ||
47 | relax((128 / CYCLES_PER_RELAX_LOOP) * delta); | ||
48 | } | ||
49 | } | ||
50 | EXPORT_SYMBOL(arch_spin_lock_slow); | ||
51 | |||
52 | /* | ||
53 | * Check the lock to see if it is plausible, and try to get it with cmpxchg(). | ||
54 | */ | ||
55 | int arch_spin_trylock(arch_spinlock_t *lock) | ||
56 | { | ||
57 | u32 val = arch_spin_read_noalloc(lock); | ||
58 | if (unlikely(arch_spin_current(val) != arch_spin_next(val))) | ||
59 | return 0; | ||
60 | return cmpxchg(&lock->lock, val, (val + 1) & ~__ARCH_SPIN_NEXT_OVERFLOW) | ||
61 | == val; | ||
62 | } | ||
63 | EXPORT_SYMBOL(arch_spin_trylock); | ||
64 | |||
65 | void arch_spin_unlock_wait(arch_spinlock_t *lock) | ||
66 | { | ||
67 | u32 iterations = 0; | ||
68 | while (arch_spin_is_locked(lock)) | ||
69 | delay_backoff(iterations++); | ||
70 | } | ||
71 | EXPORT_SYMBOL(arch_spin_unlock_wait); | ||
72 | |||
73 | /* | ||
74 | * If the read lock fails due to a writer, we retry periodically | ||
75 | * until the value is positive and we write our incremented reader count. | ||
76 | */ | ||
77 | void __read_lock_failed(arch_rwlock_t *rw) | ||
78 | { | ||
79 | u32 val; | ||
80 | int iterations = 0; | ||
81 | do { | ||
82 | delay_backoff(iterations++); | ||
83 | val = __insn_fetchaddgez4(&rw->lock, 1); | ||
84 | } while (unlikely(arch_write_val_locked(val))); | ||
85 | } | ||
86 | EXPORT_SYMBOL(__read_lock_failed); | ||
87 | |||
88 | /* | ||
89 | * If we failed because there were readers, clear the "writer" bit | ||
90 | * so we don't block additional readers. Otherwise, there was another | ||
91 | * writer anyway, so our "fetchor" made no difference. Then wait, | ||
92 | * issuing periodic fetchor instructions, till we get the lock. | ||
93 | */ | ||
94 | void __write_lock_failed(arch_rwlock_t *rw, u32 val) | ||
95 | { | ||
96 | int iterations = 0; | ||
97 | do { | ||
98 | if (!arch_write_val_locked(val)) | ||
99 | val = __insn_fetchand4(&rw->lock, ~__WRITE_LOCK_BIT); | ||
100 | delay_backoff(iterations++); | ||
101 | val = __insn_fetchor4(&rw->lock, __WRITE_LOCK_BIT); | ||
102 | } while (val != 0); | ||
103 | } | ||
104 | EXPORT_SYMBOL(__write_lock_failed); | ||
diff --git a/arch/tile/lib/strchr_64.c b/arch/tile/lib/strchr_64.c new file mode 100644 index 00000000000..617a9273aaa --- /dev/null +++ b/arch/tile/lib/strchr_64.c | |||
@@ -0,0 +1,67 @@ | |||
1 | /* | ||
2 | * Copyright 2011 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/types.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/module.h> | ||
18 | |||
19 | #undef strchr | ||
20 | |||
/*
 * strchr - find the first occurrence of a character in a string.
 * @s: NUL-terminated string to search
 * @c: character to look for (only the low 8 bits are used)
 *
 * The string is scanned one aligned 64-bit word at a time.  Returns a
 * pointer to the first matching byte, or NULL if the terminating '\0'
 * comes first.  Searching for '\0' returns the terminator's address.
 */
char *strchr(const char *s, int c)
{
	int z, g;

	/* Get an aligned pointer. */
	const uintptr_t s_int = (uintptr_t) s;
	const uint64_t *p = (const uint64_t *)(s_int & -8);

	/* Create eight copies of the byte for which we are looking. */
	const uint64_t goal = 0x0101010101010101ULL * (uint8_t) c;

	/*
	 * Read the first aligned word, but force bytes before the string to
	 * match neither zero nor goal (we make sure the high bit of each
	 * byte is 1, and the low 7 bits are all the opposite of the goal
	 * byte).
	 *
	 * The shift count is reduced to the byte offset within the word
	 * explicitly, since shifting a 64-bit value by 64 or more is
	 * undefined in C.  The XOR mask is limited to the low 7 bits of
	 * each leading byte with an explicit constant, so the high bit
	 * stays set even when the goal byte is 0xff; otherwise those
	 * bytes would turn into 0x00 and be mistaken for a terminator.
	 */
	const uint64_t before_mask = (1ULL << ((s_int & 7) << 3)) - 1;
	uint64_t v = (*p | before_mask) ^
		(goal & before_mask & 0x7f7f7f7f7f7f7f7fULL);

	uint64_t zero_matches, goal_matches;
	while (1) {
		/* Look for a terminating '\0'. */
		zero_matches = __insn_v1cmpeqi(v, 0);

		/* Look for the goal byte. */
		goal_matches = __insn_v1cmpeq(v, goal);

		if (__builtin_expect((zero_matches | goal_matches) != 0, 0))
			break;

		v = *++p;
	}

	z = __insn_ctz(zero_matches);
	g = __insn_ctz(goal_matches);

	/*
	 * If we found c before '\0' we got a match. Note that if c == '\0'
	 * then g == z, and we correctly return the address of the '\0'
	 * rather than NULL.
	 */
	return (g <= z) ? ((char *)p) + (g >> 3) : NULL;
}
EXPORT_SYMBOL(strchr);
diff --git a/arch/tile/lib/strlen_64.c b/arch/tile/lib/strlen_64.c new file mode 100644 index 00000000000..1c92d46202a --- /dev/null +++ b/arch/tile/lib/strlen_64.c | |||
@@ -0,0 +1,38 @@ | |||
1 | /* | ||
2 | * Copyright 2011 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/types.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/module.h> | ||
18 | |||
19 | #undef strlen | ||
20 | |||
/*
 * strlen - length of a NUL-terminated string.
 * @s: the string
 *
 * The string is scanned one aligned 64-bit word at a time.  Returns
 * the number of bytes before the terminating '\0'.
 */
size_t strlen(const char *s)
{
	/* Get an aligned pointer. */
	const uintptr_t s_int = (uintptr_t) s;
	const uint64_t *p = (const uint64_t *)(s_int & -8);

	/*
	 * Read the first word, but force bytes before the string to be
	 * nonzero.  The shift count is reduced to the byte offset within
	 * the word explicitly, since shifting a 64-bit value by 64 or more
	 * is undefined in C.
	 */
	uint64_t v = *p | ((1ULL << ((s_int & 7) << 3)) - 1);

	uint64_t bits;
	while ((bits = __insn_v1cmpeqi(v, 0)) == 0)
		v = *++p;

	/* Byte index of the first zero byte within the word at p. */
	return ((const char *)p) + (__insn_ctz(bits) >> 3) - s;
}
EXPORT_SYMBOL(strlen);
diff --git a/arch/tile/lib/usercopy_64.S b/arch/tile/lib/usercopy_64.S new file mode 100644 index 00000000000..2ff44f87b78 --- /dev/null +++ b/arch/tile/lib/usercopy_64.S | |||
@@ -0,0 +1,196 @@ | |||
1 | /* | ||
2 | * Copyright 2011 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/linkage.h> | ||
16 | #include <asm/errno.h> | ||
17 | #include <asm/cache.h> | ||
18 | #include <arch/chip.h> | ||
19 | |||
20 | /* Access user memory, but use MMU to avoid propagating kernel exceptions. The two handlers below are placed in .fixup and are the addresses that the __ex_table entries of the __get_user_N and __put_user_N macros pair with their load/store bundles. */ | ||
21 | |||
22 | .pushsection .fixup,"ax" | ||
23 | |||
24 | get_user_fault: /* failed load: report -EFAULT in r1 and a zero value in r0 */ | ||
25 | { movei r1, -EFAULT; move r0, zero } | ||
26 | jrp lr | ||
27 | ENDPROC(get_user_fault) | ||
28 | |||
29 | put_user_fault: /* failed store: report -EFAULT in r0 */ | ||
30 | { movei r0, -EFAULT; jrp lr } | ||
31 | ENDPROC(put_user_fault) | ||
32 | |||
33 | .popsection | ||
34 | |||
35 | /* | ||
36 | * __get_user_N functions take a pointer in r0, and return 0 in r1 on success, with the loaded value in r0; or else -EFAULT in r1. | ||
37 | * The load and the clearing of r1 share the single bundle at label 1, and the __ex_table entry pairs that label with get_user_fault. One function is generated per access size (1, 2, 4 and 8 bytes). | ||
38 | */ | ||
39 | #define __get_user_N(bytes, LOAD) \ | ||
40 | STD_ENTRY(__get_user_##bytes); \ | ||
41 | 1: { LOAD r0, r0; move r1, zero }; \ | ||
42 | jrp lr; \ | ||
43 | STD_ENDPROC(__get_user_##bytes); \ | ||
44 | .pushsection __ex_table,"a"; \ | ||
45 | .quad 1b, get_user_fault; \ | ||
46 | .popsection | ||
47 | |||
48 | __get_user_N(1, ld1u) | ||
49 | __get_user_N(2, ld2u) | ||
50 | __get_user_N(4, ld4u) | ||
51 | __get_user_N(8, ld) | ||
52 | |||
53 | /* | ||
54 | * __put_user_N functions take a value in r0 and a pointer in r1, and return 0 in r0 on success or -EFAULT on failure. | ||
55 | * The store and the clearing of r0 share the single bundle at label 1, and the __ex_table entry pairs that label with put_user_fault. One function is generated per access size (1, 2, 4 and 8 bytes). | ||
56 | */ | ||
57 | #define __put_user_N(bytes, STORE) \ | ||
58 | STD_ENTRY(__put_user_##bytes); \ | ||
59 | 1: { STORE r1, r0; move r0, zero }; \ | ||
60 | jrp lr; \ | ||
61 | STD_ENDPROC(__put_user_##bytes); \ | ||
62 | .pushsection __ex_table,"a"; \ | ||
63 | .quad 1b, put_user_fault; \ | ||
64 | .popsection | ||
65 | |||
66 | __put_user_N(1, st1) | ||
67 | __put_user_N(2, st2) | ||
68 | __put_user_N(4, st4) | ||
69 | __put_user_N(8, st) | ||
70 | |||
71 | /* | ||
72 | * strnlen_user_asm takes the pointer in r0, and the length bound in r1. | ||
73 | * It returns the length, including the terminating NUL, or zero on exception. | ||
74 | * If length is greater than the bound, returns one plus the bound. r3 holds the start address minus one, so the closing subtraction at label 2 produces the count directly on every non-faulting path. | ||
75 | */ | ||
76 | STD_ENTRY(strnlen_user_asm) | ||
77 | { beqz r1, 2f; addi r3, r0, -1 } /* bias down to include NUL */ | ||
78 | 1: { ld1u r4, r0; addi r1, r1, -1 } /* read one user byte and use up one unit of the bound */ | ||
79 | beqz r4, 2f /* NUL found: r0 still points at it */ | ||
80 | { bnezt r1, 1b; addi r0, r0, 1 } /* advance; keep going while the bound is not used up */ | ||
81 | 2: { sub r0, r0, r3; jrp lr } | ||
82 | STD_ENDPROC(strnlen_user_asm) | ||
83 | .pushsection .fixup,"ax" | ||
84 | strnlen_user_fault: /* faulting load at label 1 ends up here: return zero */ | ||
85 | { move r0, zero; jrp lr } | ||
86 | ENDPROC(strnlen_user_fault) | ||
87 | .section __ex_table,"a" | ||
88 | .quad 1b, strnlen_user_fault | ||
89 | .popsection | ||
90 | |||
91 | /* | ||
92 | * strncpy_from_user_asm takes the kernel target pointer in r0, the userspace source pointer in r1, and the length bound (including the trailing NUL) in r2. | ||
93 | * On success, it returns the string length (not including the trailing NUL), or the bound itself if no NUL was seen within it; a faulting user load returns -EFAULT. | ||
94 | * The NUL test precedes the bound test so that a string whose NUL is the last permitted byte still reports its length rather than looking truncated. | ||
95 | */ | ||
96 | STD_ENTRY(strncpy_from_user_asm) | ||
97 | { beqz r2, 2f; move r3, r0 } /* r3 = start of target; a zero bound returns 0 */ | ||
98 | 1: { ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 } | ||
99 | { st1 r0, r4; addi r0, r0, 1 } | ||
100 | beqz r4, 3f /* copied the NUL: done, even if the bound is now used up */ | ||
101 | bnezt r2, 1b | ||
102 | 2: { sub r0, r0, r3; jrp lr } /* bound exhausted without a NUL: return bytes copied */ | ||
103 | 3: addi r0, r0, -1 /* don't count the trailing NUL */ | ||
104 | { sub r0, r0, r3; jrp lr } | ||
105 | STD_ENDPROC(strncpy_from_user_asm) | ||
106 | .pushsection .fixup,"ax" | ||
107 | strncpy_from_user_fault: | ||
108 | { movei r0, -EFAULT; jrp lr } | ||
109 | ENDPROC(strncpy_from_user_fault) | ||
110 | .section __ex_table,"a" | ||
111 | .quad 1b, strncpy_from_user_fault | ||
112 | .popsection | ||
113 | |||
114 | /* | ||
115 | * clear_user_asm takes the user target address in r0 and the | ||
116 | * number of bytes to zero in r1. | ||
117 | * It returns the number of bytes that could not be cleared (hopefully zero) in r0. | ||
118 | * Note that we don't use a separate .fixup section here since we fall | ||
119 | * through into the "fixup" code as the last straight-line bundle anyway. Each __ex_table entry below pairs a store loop (label 1) with its own return bundle (label 2), which hands back the count still left in r1. | ||
120 | */ | ||
121 | STD_ENTRY(clear_user_asm) | ||
122 | { beqz r1, 2f; or r2, r0, r1 } /* zero count: return straight away */ | ||
123 | andi r2, r2, 7 /* low three bits of address and count combined */ | ||
124 | beqzt r2, .Lclear_aligned_user_asm /* both are multiples of 8: clear 8 bytes per store */ | ||
125 | 1: { st1 r0, zero; addi r0, r0, 1; addi r1, r1, -1 } /* byte-at-a-time path */ | ||
126 | bnezt r1, 1b | ||
127 | 2: { move r0, r1; jrp lr } | ||
128 | .pushsection __ex_table,"a" | ||
129 | .quad 1b, 2b | ||
130 | .popsection | ||
131 | |||
132 | .Lclear_aligned_user_asm: | ||
133 | 1: { st r0, zero; addi r0, r0, 8; addi r1, r1, -8 } /* 8-bytes-at-a-time path */ | ||
134 | bnezt r1, 1b | ||
135 | 2: { move r0, r1; jrp lr } | ||
136 | STD_ENDPROC(clear_user_asm) | ||
137 | .pushsection __ex_table,"a" | ||
138 | .quad 1b, 2b | ||
139 | .popsection | ||
140 | |||
141 | /* | ||
142 | * flush_user_asm takes the user target address in r0 and the | ||
143 | * number of bytes to flush in r1. | ||
144 | * It returns the number of unflushable bytes (hopefully zero) in r0. The count is measured on the range after it has been widened to L2_CACHE_BYTES boundaries; the __ex_table entry pairs the loop at label 1 with the return bundle at label 2, which hands back what is left in r1. | ||
145 | */ | ||
146 | STD_ENTRY(flush_user_asm) | ||
147 | beqz r1, 2f | ||
148 | { movei r2, L2_CACHE_BYTES; add r1, r0, r1 } /* r1 = end address */ | ||
149 | { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } /* r2 = -L2_CACHE_BYTES, used as a mask (presumably a power of two - confirm) */ | ||
150 | { and r0, r0, r2; and r1, r1, r2 } /* round the start down and the end up */ | ||
151 | { sub r1, r1, r0 } /* r1 = size of the rounded range */ | ||
152 | 1: { flush r0; addi r1, r1, -CHIP_FLUSH_STRIDE() } /* NOTE(review): the loop exits only when r1 reaches exactly zero, so the stride presumably divides L2_CACHE_BYTES - confirm */ | ||
153 | { addi r0, r0, CHIP_FLUSH_STRIDE(); bnezt r1, 1b } | ||
154 | 2: { move r0, r1; jrp lr } | ||
155 | STD_ENDPROC(flush_user_asm) | ||
156 | .pushsection __ex_table,"a" | ||
157 | .quad 1b, 2b | ||
158 | .popsection | ||
159 | |||
160 | /* | ||
161 | * inv_user_asm takes the user target address in r0 and the | ||
162 | * number of bytes to invalidate in r1. | ||
163 | * It returns the number of not inv'able bytes (hopefully zero) in r0. The count is measured on the range after it has been widened to L2_CACHE_BYTES boundaries; the __ex_table entry pairs the loop at label 1 with the return bundle at label 2, which hands back what is left in r1. | ||
164 | */ | ||
165 | STD_ENTRY(inv_user_asm) | ||
166 | beqz r1, 2f | ||
167 | { movei r2, L2_CACHE_BYTES; add r1, r0, r1 } /* r1 = end address */ | ||
168 | { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } /* r2 = -L2_CACHE_BYTES, used as a mask (presumably a power of two - confirm) */ | ||
169 | { and r0, r0, r2; and r1, r1, r2 } /* round the start down and the end up */ | ||
170 | { sub r1, r1, r0 } /* r1 = size of the rounded range */ | ||
171 | 1: { inv r0; addi r1, r1, -CHIP_INV_STRIDE() } /* NOTE(review): the loop exits only when r1 reaches exactly zero, so the stride presumably divides L2_CACHE_BYTES - confirm */ | ||
172 | { addi r0, r0, CHIP_INV_STRIDE(); bnezt r1, 1b } | ||
173 | 2: { move r0, r1; jrp lr } | ||
174 | STD_ENDPROC(inv_user_asm) | ||
175 | .pushsection __ex_table,"a" | ||
176 | .quad 1b, 2b | ||
177 | .popsection | ||
178 | |||
179 | /* | ||
180 | * finv_user_asm takes the user target address in r0 and the | ||
181 | * number of bytes to flush-invalidate in r1. | ||
182 | * It returns the number of not finv'able bytes (hopefully zero) in r0. The count is measured on the range after it has been widened to L2_CACHE_BYTES boundaries; the __ex_table entry pairs the loop at label 1 with the return bundle at label 2, which hands back what is left in r1. | ||
183 | */ | ||
184 | STD_ENTRY(finv_user_asm) | ||
185 | beqz r1, 2f | ||
186 | { movei r2, L2_CACHE_BYTES; add r1, r0, r1 } /* r1 = end address */ | ||
187 | { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } /* r2 = -L2_CACHE_BYTES, used as a mask (presumably a power of two - confirm) */ | ||
188 | { and r0, r0, r2; and r1, r1, r2 } /* round the start down and the end up */ | ||
189 | { sub r1, r1, r0 } /* r1 = size of the rounded range */ | ||
190 | 1: { finv r0; addi r1, r1, -CHIP_FINV_STRIDE() } /* NOTE(review): the loop exits only when r1 reaches exactly zero, so the stride presumably divides L2_CACHE_BYTES - confirm */ | ||
191 | { addi r0, r0, CHIP_FINV_STRIDE(); bnezt r1, 1b } | ||
192 | 2: { move r0, r1; jrp lr } | ||
193 | STD_ENDPROC(finv_user_asm) | ||
194 | .pushsection __ex_table,"a" | ||
195 | .quad 1b, 2b | ||
196 | .popsection | ||