Diffstat (limited to 'arch/tile/lib')

 arch/tile/lib/Makefile          |  16
 arch/tile/lib/atomic_32.c       | 330
 arch/tile/lib/atomic_asm_32.S   | 196
 arch/tile/lib/cacheflush.c      |  23
 arch/tile/lib/checksum.c        | 102
 arch/tile/lib/cpumask.c         |  52
 arch/tile/lib/delay.c           |  34
 arch/tile/lib/exports.c         |  79
 arch/tile/lib/mb_incoherent.S   |  34
 arch/tile/lib/memchr_32.c       |  68
 arch/tile/lib/memcpy_32.S       | 628
 arch/tile/lib/memcpy_tile64.c   | 271
 arch/tile/lib/memmove_32.c      |  63
 arch/tile/lib/memset_32.c       | 275
 arch/tile/lib/spinlock_32.c     | 221
 arch/tile/lib/spinlock_common.h |  64
 arch/tile/lib/strchr_32.c       |  66
 arch/tile/lib/strlen_32.c       |  36
 arch/tile/lib/uaccess.c         |  32
 arch/tile/lib/usercopy_32.S     | 223
 20 files changed, 2813 insertions(+), 0 deletions(-)
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile
new file mode 100644
index 00000000000..438af38bc9e
--- /dev/null
+++ b/arch/tile/lib/Makefile
@@ -0,0 +1,16 @@
#
# Makefile for TILE-specific library files..
#

lib-y = cacheflush.o checksum.o cpumask.o delay.o \
	mb_incoherent.o uaccess.o \
	memcpy_$(BITS).o memchr_$(BITS).o memmove_$(BITS).o memset_$(BITS).o \
	strchr_$(BITS).o strlen_$(BITS).o

ifneq ($(CONFIG_TILEGX),y)
lib-y += atomic_32.o atomic_asm_32.o memcpy_tile64.o
endif

lib-$(CONFIG_SMP) += spinlock_$(BITS).o usercopy_$(BITS).o

obj-$(CONFIG_MODULES) += exports.o
diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c
new file mode 100644
index 00000000000..8040b42a8ee
--- /dev/null
+++ b/arch/tile/lib/atomic_32.c
@@ -0,0 +1,330 @@
1 | /* | ||
2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/cache.h> | ||
16 | #include <linux/delay.h> | ||
17 | #include <linux/uaccess.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/mm.h> | ||
20 | #include <asm/atomic.h> | ||
21 | #include <asm/futex.h> | ||
22 | #include <arch/chip.h> | ||
23 | |||
24 | /* See <asm/atomic_32.h> */ | ||
25 | #if ATOMIC_LOCKS_FOUND_VIA_TABLE() | ||
26 | |||
27 | /* | ||
28 | * A block of memory containing locks for atomic ops. Each instance of this | ||
29 | * struct will be homed on a different CPU. | ||
30 | */ | ||
31 | struct atomic_locks_on_cpu { | ||
32 | int lock[ATOMIC_HASH_L2_SIZE]; | ||
33 | } __attribute__((aligned(ATOMIC_HASH_L2_SIZE * 4))); | ||
34 | |||
35 | static DEFINE_PER_CPU(struct atomic_locks_on_cpu, atomic_lock_pool); | ||
36 | |||
37 | /* The locks we'll use until __init_atomic_per_cpu is called. */ | ||
38 | static struct atomic_locks_on_cpu __initdata initial_atomic_locks; | ||
39 | |||
40 | /* Hash into this vector to get a pointer to lock for the given atomic. */ | ||
41 | struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE] | ||
42 | __write_once = { | ||
43 | [0 ... ATOMIC_HASH_L1_SIZE-1] (&initial_atomic_locks) | ||
44 | }; | ||
45 | |||
46 | #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ | ||
47 | |||
48 | /* This page is remapped on startup to be hash-for-home. */ | ||
49 | int atomic_locks[PAGE_SIZE / sizeof(int) /* Only ATOMIC_HASH_SIZE is used */] | ||
50 | __attribute__((aligned(PAGE_SIZE), section(".bss.page_aligned"))); | ||
51 | |||
52 | #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ | ||
53 | |||
54 | static inline int *__atomic_hashed_lock(volatile void *v) | ||
55 | { | ||
56 | /* NOTE: this code must match "sys_cmpxchg" in kernel/intvec.S */ | ||
57 | #if ATOMIC_LOCKS_FOUND_VIA_TABLE() | ||
58 | unsigned long i = | ||
59 | (unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long)); | ||
60 | unsigned long n = __insn_crc32_32(0, i); | ||
61 | |||
62 | /* Grab high bits for L1 index. */ | ||
63 | unsigned long l1_index = n >> ((sizeof(n) * 8) - ATOMIC_HASH_L1_SHIFT); | ||
64 | /* Grab low bits for L2 index. */ | ||
65 | unsigned long l2_index = n & (ATOMIC_HASH_L2_SIZE - 1); | ||
66 | |||
67 | return &atomic_lock_ptr[l1_index]->lock[l2_index]; | ||
68 | #else | ||
69 | /* | ||
70 | * Use bits [3, 3 + ATOMIC_HASH_SHIFT) as the lock index. | ||
71 | * Using mm works here because atomic_locks is page aligned. | ||
72 | */ | ||
73 | unsigned long ptr = __insn_mm((unsigned long)v >> 1, | ||
74 | (unsigned long)atomic_locks, | ||
75 | 2, (ATOMIC_HASH_SHIFT + 2) - 1); | ||
76 | return (int *)ptr; | ||
77 | #endif | ||
78 | } | ||
79 | |||
80 | #ifdef CONFIG_SMP | ||
81 | /* Return whether the passed pointer is a valid atomic lock pointer. */ | ||
82 | static int is_atomic_lock(int *p) | ||
83 | { | ||
84 | #if ATOMIC_LOCKS_FOUND_VIA_TABLE() | ||
85 | int i; | ||
86 | for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) { | ||
87 | |||
88 | if (p >= &atomic_lock_ptr[i]->lock[0] && | ||
89 | p < &atomic_lock_ptr[i]->lock[ATOMIC_HASH_L2_SIZE]) { | ||
90 | return 1; | ||
91 | } | ||
92 | } | ||
93 | return 0; | ||
94 | #else | ||
95 | return p >= &atomic_locks[0] && p < &atomic_locks[ATOMIC_HASH_SIZE]; | ||
96 | #endif | ||
97 | } | ||
98 | |||
99 | void __atomic_fault_unlock(int *irqlock_word) | ||
100 | { | ||
101 | BUG_ON(!is_atomic_lock(irqlock_word)); | ||
102 | BUG_ON(*irqlock_word != 1); | ||
103 | *irqlock_word = 0; | ||
104 | } | ||
105 | |||
106 | #endif /* CONFIG_SMP */ | ||
107 | |||
108 | static inline int *__atomic_setup(volatile void *v) | ||
109 | { | ||
110 | /* Issue a load to the target to bring it into cache. */ | ||
111 | *(volatile int *)v; | ||
112 | return __atomic_hashed_lock(v); | ||
113 | } | ||
114 | |||
115 | int _atomic_xchg(atomic_t *v, int n) | ||
116 | { | ||
117 | return __atomic_xchg(&v->counter, __atomic_setup(v), n).val; | ||
118 | } | ||
119 | EXPORT_SYMBOL(_atomic_xchg); | ||
120 | |||
121 | int _atomic_xchg_add(atomic_t *v, int i) | ||
122 | { | ||
123 | return __atomic_xchg_add(&v->counter, __atomic_setup(v), i).val; | ||
124 | } | ||
125 | EXPORT_SYMBOL(_atomic_xchg_add); | ||
126 | |||
127 | int _atomic_xchg_add_unless(atomic_t *v, int a, int u) | ||
128 | { | ||
129 | /* | ||
130 | * Note: argument order is switched here since it is easier | ||
131 | * to use the first argument consistently as the "old value" | ||
132 | * in the assembly, as is done for _atomic_cmpxchg(). | ||
133 | */ | ||
134 | return __atomic_xchg_add_unless(&v->counter, __atomic_setup(v), u, a) | ||
135 | .val; | ||
136 | } | ||
137 | EXPORT_SYMBOL(_atomic_xchg_add_unless); | ||
138 | |||
139 | int _atomic_cmpxchg(atomic_t *v, int o, int n) | ||
140 | { | ||
141 | return __atomic_cmpxchg(&v->counter, __atomic_setup(v), o, n).val; | ||
142 | } | ||
143 | EXPORT_SYMBOL(_atomic_cmpxchg); | ||
144 | |||
145 | unsigned long _atomic_or(volatile unsigned long *p, unsigned long mask) | ||
146 | { | ||
147 | return __atomic_or((int *)p, __atomic_setup(p), mask).val; | ||
148 | } | ||
149 | EXPORT_SYMBOL(_atomic_or); | ||
150 | |||
151 | unsigned long _atomic_andn(volatile unsigned long *p, unsigned long mask) | ||
152 | { | ||
153 | return __atomic_andn((int *)p, __atomic_setup(p), mask).val; | ||
154 | } | ||
155 | EXPORT_SYMBOL(_atomic_andn); | ||
156 | |||
157 | unsigned long _atomic_xor(volatile unsigned long *p, unsigned long mask) | ||
158 | { | ||
159 | return __atomic_xor((int *)p, __atomic_setup(p), mask).val; | ||
160 | } | ||
161 | EXPORT_SYMBOL(_atomic_xor); | ||
162 | |||
163 | |||
164 | u64 _atomic64_xchg(atomic64_t *v, u64 n) | ||
165 | { | ||
166 | return __atomic64_xchg(&v->counter, __atomic_setup(v), n); | ||
167 | } | ||
168 | EXPORT_SYMBOL(_atomic64_xchg); | ||
169 | |||
170 | u64 _atomic64_xchg_add(atomic64_t *v, u64 i) | ||
171 | { | ||
172 | return __atomic64_xchg_add(&v->counter, __atomic_setup(v), i); | ||
173 | } | ||
174 | EXPORT_SYMBOL(_atomic64_xchg_add); | ||
175 | |||
176 | u64 _atomic64_xchg_add_unless(atomic64_t *v, u64 a, u64 u) | ||
177 | { | ||
178 | /* | ||
179 | * Note: argument order is switched here since it is easier | ||
180 | * to use the first argument consistently as the "old value" | ||
181 | * in the assembly, as is done for _atomic_cmpxchg(). | ||
182 | */ | ||
183 | return __atomic64_xchg_add_unless(&v->counter, __atomic_setup(v), | ||
184 | u, a); | ||
185 | } | ||
186 | EXPORT_SYMBOL(_atomic64_xchg_add_unless); | ||
187 | |||
188 | u64 _atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n) | ||
189 | { | ||
190 | return __atomic64_cmpxchg(&v->counter, __atomic_setup(v), o, n); | ||
191 | } | ||
192 | EXPORT_SYMBOL(_atomic64_cmpxchg); | ||
193 | |||
194 | |||
195 | static inline int *__futex_setup(int __user *v) | ||
196 | { | ||
197 | /* | ||
198 | * Issue a prefetch to the counter to bring it into cache. | ||
199 | * As for __atomic_setup, but we can't do a read into the L1 | ||
200 | * since it might fault; instead we do a prefetch into the L2. | ||
201 | */ | ||
202 | __insn_prefetch(v); | ||
203 | return __atomic_hashed_lock((int __force *)v); | ||
204 | } | ||
205 | |||
206 | struct __get_user futex_set(int __user *v, int i) | ||
207 | { | ||
208 | return __atomic_xchg((int __force *)v, __futex_setup(v), i); | ||
209 | } | ||
210 | |||
211 | struct __get_user futex_add(int __user *v, int n) | ||
212 | { | ||
213 | return __atomic_xchg_add((int __force *)v, __futex_setup(v), n); | ||
214 | } | ||
215 | |||
216 | struct __get_user futex_or(int __user *v, int n) | ||
217 | { | ||
218 | return __atomic_or((int __force *)v, __futex_setup(v), n); | ||
219 | } | ||
220 | |||
221 | struct __get_user futex_andn(int __user *v, int n) | ||
222 | { | ||
223 | return __atomic_andn((int __force *)v, __futex_setup(v), n); | ||
224 | } | ||
225 | |||
226 | struct __get_user futex_xor(int __user *v, int n) | ||
227 | { | ||
228 | return __atomic_xor((int __force *)v, __futex_setup(v), n); | ||
229 | } | ||
230 | |||
231 | struct __get_user futex_cmpxchg(int __user *v, int o, int n) | ||
232 | { | ||
233 | return __atomic_cmpxchg((int __force *)v, __futex_setup(v), o, n); | ||
234 | } | ||
235 | |||
236 | /* | ||
237 | * If any of the atomic or futex routines hit a bad address (not in | ||
238 | * the page tables at kernel PL) this routine is called. The futex | ||
239 | * routines are never used on kernel space, and the normal atomics and | ||
240 | * bitops are never used on user space. So a fault on kernel space | ||
241 | * must be fatal, but a fault on userspace is a futex fault and we | ||
242 | * need to return -EFAULT. Note that the context this routine is | ||
243 | * invoked in is the context of the "_atomic_xxx()" routines called | ||
244 | * by the functions in this file. | ||
245 | */ | ||
246 | struct __get_user __atomic_bad_address(int __user *addr) | ||
247 | { | ||
248 | if (unlikely(!access_ok(VERIFY_WRITE, addr, sizeof(int)))) | ||
249 | panic("Bad address used for kernel atomic op: %p\n", addr); | ||
250 | return (struct __get_user) { .err = -EFAULT }; | ||
251 | } | ||
252 | |||
253 | |||
254 | #if CHIP_HAS_CBOX_HOME_MAP() | ||
255 | static int __init noatomichash(char *str) | ||
256 | { | ||
257 | pr_warning("noatomichash is deprecated.\n"); | ||
258 | return 1; | ||
259 | } | ||
260 | __setup("noatomichash", noatomichash); | ||
261 | #endif | ||
262 | |||
263 | void __init __init_atomic_per_cpu(void) | ||
264 | { | ||
265 | #if ATOMIC_LOCKS_FOUND_VIA_TABLE() | ||
266 | |||
267 | unsigned int i; | ||
268 | int actual_cpu; | ||
269 | |||
270 | /* | ||
271 | * Before this is called from setup, we just have one lock for | ||
272 | * all atomic objects/operations. Here we replace the | ||
273 | * elements of atomic_lock_ptr so that they point at per_cpu | ||
274 | * integers. This seemingly over-complex approach stems from | ||
275 | * the fact that DEFINE_PER_CPU defines an entry for each cpu | ||
276 | * in the grid, not each cpu from 0..ATOMIC_HASH_SIZE-1. But | ||
277 | * for efficient hashing of atomics to their locks we want a | ||
278 | * compile time constant power of 2 for the size of this | ||
279 | * table, so we use ATOMIC_HASH_SIZE. | ||
280 | * | ||
281 | * Here we populate atomic_lock_ptr from the per cpu | ||
282 | * atomic_lock_pool, interspersing by actual cpu so that | ||
283 | * subsequent elements are homed on consecutive cpus. | ||
284 | */ | ||
285 | |||
286 | actual_cpu = cpumask_first(cpu_possible_mask); | ||
287 | |||
288 | for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) { | ||
289 | /* | ||
290 | * Preincrement to slightly bias against using cpu 0, | ||
291 | * which has plenty of stuff homed on it already. | ||
292 | */ | ||
293 | actual_cpu = cpumask_next(actual_cpu, cpu_possible_mask); | ||
294 | if (actual_cpu >= nr_cpu_ids) | ||
295 | actual_cpu = cpumask_first(cpu_possible_mask); | ||
296 | |||
297 | atomic_lock_ptr[i] = &per_cpu(atomic_lock_pool, actual_cpu); | ||
298 | } | ||
299 | |||
300 | #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ | ||
301 | |||
302 | /* Validate power-of-two and "bigger than cpus" assumption */ | ||
303 | BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1)); | ||
304 | BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids); | ||
305 | |||
306 | /* | ||
307 | * On TILEPro we prefer to use a single hash-for-home | ||
308 | * page, since this means atomic operations are less | ||
309 | * likely to encounter a TLB fault and thus should | ||
310 | * in general perform faster. You may wish to disable | ||
311 | * this in situations where few hash-for-home tiles | ||
312 | * are configured. | ||
313 | */ | ||
314 | BUG_ON((unsigned long)atomic_locks % PAGE_SIZE != 0); | ||
315 | |||
316 | /* The locks must all fit on one page. */ | ||
317 | BUG_ON(ATOMIC_HASH_SIZE * sizeof(int) > PAGE_SIZE); | ||
318 | |||
319 | /* | ||
320 | * We use the page offset of the atomic value's address as | ||
321 | * an index into atomic_locks, excluding the low 3 bits. | ||
322 | * That should not produce more indices than ATOMIC_HASH_SIZE. | ||
323 | */ | ||
324 | BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE); | ||
325 | |||
326 | #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ | ||
327 | |||
328 | /* The futex code makes this assumption, so we validate it here. */ | ||
329 | BUG_ON(sizeof(atomic_t) != sizeof(int)); | ||
330 | } | ||
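The scheme above is easiest to see in C: hash the atomic's address to one lock in a fixed table, then do a plain read-modify-write while holding that lock. The sketch below is illustrative only and is not part of this commit; the real fast path lives in atomic_asm_32.S, and example_lock()/example_unlock() are placeholders for the tns-based spinlock it implements.

/* Minimal sketch of the hashed-lock idea, assuming a spinlock primitive. */
static int example_atomic_add_return(atomic_t *v, int i)
{
	int *lock = __atomic_hashed_lock(v);	/* lock chosen by address hash */
	int new;

	example_lock(lock);			/* placeholder for tns + backoff */
	new = v->counter + i;
	v->counter = new;
	example_unlock(lock);			/* placeholder for "*lock = 0" */
	return new;
}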
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
new file mode 100644
index 00000000000..5a5514b77e7
--- /dev/null
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -0,0 +1,196 @@
1 | /* | ||
2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | * | ||
14 | * Support routines for atomic operations. Each function takes: | ||
15 | * | ||
16 | * r0: address to manipulate | ||
17 | * r1: pointer to atomic lock guarding this operation (for FUTEX_LOCK_REG) | ||
18 | * r2: new value to write, or for cmpxchg/add_unless, value to compare against | ||
19 | * r3: (cmpxchg/xchg_add_unless) new value to write or add; | ||
20 | * (atomic64 ops) high word of value to write | ||
21 | * r4/r5: (cmpxchg64/add_unless64) new value to write or add | ||
22 | * | ||
23 | * The 32-bit routines return a "struct __get_user" so that the futex code | ||
24 | * has an opportunity to return -EFAULT to the user if needed. | ||
25 | * The 64-bit routines just return a "long long" with the value, | ||
26 | * since they are only used from kernel space and don't expect to fault. | ||
27 | * Support for 16-bit ops is included in the framework but we don't provide | ||
28 | * any (x86_64 has an atomic_inc_short(), so we might want to some day). | ||
29 | * | ||
30 | * Note that the caller is advised to issue a suitable L1 or L2 | ||
31 | * prefetch on the address being manipulated to avoid extra stalls. | ||
32 | * In addition, the hot path is on two icache lines, and we start with | ||
33 | * a jump to the second line to make sure they are both in cache so | ||
34 | * that we never stall waiting on icache fill while holding the lock. | ||
35 | * (This doesn't work out with most 64-bit ops, since they consume | ||
36 | * too many bundles, so may take an extra i-cache stall.) | ||
37 | * | ||
38 | * These routines set the INTERRUPT_CRITICAL_SECTION bit, just | ||
39 | * like sys_cmpxchg(), so that NMIs like PERF_COUNT will not interrupt | ||
40 | * the code, just page faults. | ||
41 | * | ||
42 | * If the load or store faults in a way that can be directly fixed in | ||
43 | * the do_page_fault_ics() handler (e.g. a vmalloc reference) we fix it | ||
44 | * directly, return to the instruction that faulted, and retry it. | ||
45 | * | ||
46 | * If the load or store faults in a way that potentially requires us | ||
47 | * to release the atomic lock, then retry (e.g. a migrating PTE), we | ||
48 | * reset the PC in do_page_fault_ics() to the "tns" instruction so | ||
49 | * that on return we will reacquire the lock and restart the op. We | ||
50 | * are somewhat overloading the exception_table_entry notion by doing | ||
51 | * this, since those entries are not normally used for migrating PTEs. | ||
52 | * | ||
53 | * If the main page fault handler discovers a bad address, it will see | ||
54 | * the PC pointing to the "tns" instruction (due to the earlier | ||
55 | * exception_table_entry processing in do_page_fault_ics), and | ||
56 | * re-reset the PC to the fault handler, atomic_bad_address(), which | ||
57 | * effectively takes over from the atomic op and can either return a | ||
58 | * bad "struct __get_user" (for user addresses) or can just panic (for | ||
59 | * bad kernel addresses). | ||
60 | * | ||
61 | * Note that if the value we would store is the same as what we | ||
62 | * loaded, we bypass the load. Other platforms with true atomics can | ||
63 | * make the guarantee that a non-atomic __clear_bit(), for example, | ||
64 | * can safely race with an atomic test_and_set_bit(); this example is | ||
65 | * from bit_spinlock.h in slub_lock() / slub_unlock(). We can't do | ||
66 | * that on Tile since the "atomic" op is really just a | ||
67 | * read/modify/write, and can race with the non-atomic | ||
68 | * read/modify/write. However, if we can short-circuit the write when | ||
69 | * it is not needed, in the atomic case, we avoid the race. | ||
70 | */ | ||
71 | |||
72 | #include <linux/linkage.h> | ||
73 | #include <asm/atomic.h> | ||
74 | #include <asm/page.h> | ||
75 | #include <asm/processor.h> | ||
76 | |||
77 | .section .text.atomic,"ax" | ||
78 | ENTRY(__start_atomic_asm_code) | ||
79 | |||
80 | .macro atomic_op, name, bitwidth, body | ||
81 | .align 64 | ||
82 | STD_ENTRY_SECTION(__atomic\name, .text.atomic) | ||
83 | { | ||
84 | movei r24, 1 | ||
85 | j 4f /* branch to second cache line */ | ||
86 | } | ||
87 | 1: { | ||
88 | .ifc \bitwidth,16 | ||
89 | lh r22, r0 | ||
90 | .else | ||
91 | lw r22, r0 | ||
92 | addi r28, r0, 4 | ||
93 | .endif | ||
94 | } | ||
95 | .ifc \bitwidth,64 | ||
96 | lw r23, r28 | ||
97 | .endif | ||
98 | \body /* set r24, and r25 if 64-bit */ | ||
99 | { | ||
100 | seq r26, r22, r24 | ||
101 | seq r27, r23, r25 | ||
102 | } | ||
103 | .ifc \bitwidth,64 | ||
104 | bbnst r27, 2f | ||
105 | .endif | ||
106 | bbs r26, 3f /* skip write-back if it's the same value */ | ||
107 | 2: { | ||
108 | .ifc \bitwidth,16 | ||
109 | sh r0, r24 | ||
110 | .else | ||
111 | sw r0, r24 | ||
112 | .endif | ||
113 | } | ||
114 | .ifc \bitwidth,64 | ||
115 | sw r28, r25 | ||
116 | .endif | ||
117 | mf | ||
118 | 3: { | ||
119 | move r0, r22 | ||
120 | .ifc \bitwidth,64 | ||
121 | move r1, r23 | ||
122 | .else | ||
123 | move r1, zero | ||
124 | .endif | ||
125 | sw ATOMIC_LOCK_REG_NAME, zero | ||
126 | } | ||
127 | mtspr INTERRUPT_CRITICAL_SECTION, zero | ||
128 | jrp lr | ||
129 | 4: { | ||
130 | move ATOMIC_LOCK_REG_NAME, r1 | ||
131 | mtspr INTERRUPT_CRITICAL_SECTION, r24 | ||
132 | } | ||
133 | #ifndef CONFIG_SMP | ||
134 | j 1b /* no atomic locks */ | ||
135 | #else | ||
136 | { | ||
137 | tns r21, ATOMIC_LOCK_REG_NAME | ||
138 | moveli r23, 2048 /* maximum backoff time in cycles */ | ||
139 | } | ||
140 | { | ||
141 | bzt r21, 1b /* branch if lock acquired */ | ||
142 | moveli r25, 32 /* starting backoff time in cycles */ | ||
143 | } | ||
144 | 5: mtspr INTERRUPT_CRITICAL_SECTION, zero | ||
145 | mfspr r26, CYCLE_LOW /* get start point for this backoff */ | ||
146 | 6: mfspr r22, CYCLE_LOW /* test to see if we've backed off enough */ | ||
147 | sub r22, r22, r26 | ||
148 | slt r22, r22, r25 | ||
149 | bbst r22, 6b | ||
150 | { | ||
151 | mtspr INTERRUPT_CRITICAL_SECTION, r24 | ||
152 | shli r25, r25, 1 /* double the backoff; retry the tns */ | ||
153 | } | ||
154 | { | ||
155 | tns r21, ATOMIC_LOCK_REG_NAME | ||
156 | slt r26, r23, r25 /* is the proposed backoff too big? */ | ||
157 | } | ||
158 | { | ||
159 | bzt r21, 1b /* branch if lock acquired */ | ||
160 | mvnz r25, r26, r23 | ||
161 | } | ||
162 | j 5b | ||
163 | #endif | ||
164 | STD_ENDPROC(__atomic\name) | ||
165 | .ifc \bitwidth,32 | ||
166 | .pushsection __ex_table,"a" | ||
167 | .word 1b, __atomic\name | ||
168 | .word 2b, __atomic\name | ||
169 | .word __atomic\name, __atomic_bad_address | ||
170 | .popsection | ||
171 | .endif | ||
172 | .endm | ||
173 | |||
174 | atomic_op _cmpxchg, 32, "seq r26, r22, r2; { bbns r26, 3f; move r24, r3 }" | ||
175 | atomic_op _xchg, 32, "move r24, r2" | ||
176 | atomic_op _xchg_add, 32, "add r24, r22, r2" | ||
177 | atomic_op _xchg_add_unless, 32, \ | ||
178 | "sne r26, r22, r2; { bbns r26, 3f; add r24, r22, r3 }" | ||
179 | atomic_op _or, 32, "or r24, r22, r2" | ||
180 | atomic_op _andn, 32, "nor r2, r2, zero; and r24, r22, r2" | ||
181 | atomic_op _xor, 32, "xor r24, r22, r2" | ||
182 | |||
183 | atomic_op 64_cmpxchg, 64, "{ seq r26, r22, r2; seq r27, r23, r3 }; \ | ||
184 | { bbns r26, 3f; move r24, r4 }; { bbns r27, 3f; move r25, r5 }" | ||
185 | atomic_op 64_xchg, 64, "{ move r24, r2; move r25, r3 }" | ||
186 | atomic_op 64_xchg_add, 64, "{ add r24, r22, r2; add r25, r23, r3 }; \ | ||
187 | slt_u r26, r24, r22; add r25, r25, r26" | ||
188 | atomic_op 64_xchg_add_unless, 64, \ | ||
189 | "{ sne r26, r22, r2; sne r27, r23, r3 }; \ | ||
190 | { bbns r26, 3f; add r24, r22, r4 }; \ | ||
191 | { bbns r27, 3f; add r25, r23, r5 }; \ | ||
192 | slt_u r26, r24, r22; add r25, r25, r26" | ||
193 | |||
194 | jrp lr /* happy backtracer */ | ||
195 | |||
196 | ENTRY(__end_atomic_asm_code) | ||
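The header comment above describes the lock-acquire path: take the per-bucket lock with tns, and on contention back off exponentially, starting at 32 cycles and capped at 2048, before retrying. A C rendering of that loop (illustrative sketch only) may make the assembly easier to follow; try_tns() and read_cycle_count() stand in for the tns instruction and the CYCLE_LOW SPR, and the INTERRUPT_CRITICAL_SECTION toggling the real code does around the wait is omitted.

/* Hypothetical C rendering of the tns lock-acquire path with backoff. */
static void example_acquire_atomic_lock(volatile int *lock)
{
	unsigned int backoff = 32;		/* initial backoff, in cycles */

	while (try_tns(lock) != 0) {		/* old value 0 means we got it */
		unsigned long start = read_cycle_count();

		while (read_cycle_count() - start < backoff)
			;			/* spin for the backoff window */
		backoff *= 2;			/* exponential backoff ... */
		if (backoff > 2048)
			backoff = 2048;		/* ... capped at 2048 cycles */
	}
}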
diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c
new file mode 100644
index 00000000000..11b6164c209
--- /dev/null
+++ b/arch/tile/lib/cacheflush.c
@@ -0,0 +1,23 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 */

#include <asm/page.h>
#include <asm/cacheflush.h>
#include <arch/icache.h>


void __flush_icache_range(unsigned long start, unsigned long end)
{
	invalidate_icache((const void *)start, end - start, PAGE_SIZE);
}
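For context, a typical caller pattern is sketched below (illustrative only, not part of this commit): after new instructions have been written to memory, the affected range of the instruction cache must be invalidated before that code is executed.

/* Hypothetical helper showing when __flush_icache_range() is needed. */
static void example_install_code(void *dst, const void *src, size_t len)
{
	memcpy(dst, src, len);				/* write the instructions */
	__flush_icache_range((unsigned long)dst,
			     (unsigned long)dst + len);	/* then drop stale icache */
}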
diff --git a/arch/tile/lib/checksum.c b/arch/tile/lib/checksum.c
new file mode 100644
index 00000000000..e4bab5bd3f3
--- /dev/null
+++ b/arch/tile/lib/checksum.c
@@ -0,0 +1,102 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 * Support code for the main lib/checksum.c.
 */

#include <net/checksum.h>
#include <linux/module.h>

static inline unsigned int longto16(unsigned long x)
{
	unsigned long ret;
#ifdef __tilegx__
	ret = __insn_v2sadu(x, 0);
	ret = __insn_v2sadu(ret, 0);
#else
	ret = __insn_sadh_u(x, 0);
	ret = __insn_sadh_u(ret, 0);
#endif
	return ret;
}

__wsum do_csum(const unsigned char *buff, int len)
{
	int odd, count;
	unsigned long result = 0;

	if (len <= 0)
		goto out;
	odd = 1 & (unsigned long) buff;
	if (odd) {
		result = (*buff << 8);
		len--;
		buff++;
	}
	count = len >> 1;		/* nr of 16-bit words.. */
	if (count) {
		if (2 & (unsigned long) buff) {
			result += *(const unsigned short *)buff;
			count--;
			len -= 2;
			buff += 2;
		}
		count >>= 1;		/* nr of 32-bit words.. */
		if (count) {
#ifdef __tilegx__
			if (4 & (unsigned long) buff) {
				unsigned int w = *(const unsigned int *)buff;
				result = __insn_v2sadau(result, w, 0);
				count--;
				len -= 4;
				buff += 4;
			}
			count >>= 1;	/* nr of 64-bit words.. */
#endif

			/*
			 * This algorithm could wrap around for very
			 * large buffers, but those should be impossible.
			 */
			BUG_ON(count >= 65530);

			while (count) {
				unsigned long w = *(const unsigned long *)buff;
				count--;
				buff += sizeof(w);
#ifdef __tilegx__
				result = __insn_v2sadau(result, w, 0);
#else
				result = __insn_sadah_u(result, w, 0);
#endif
			}
#ifdef __tilegx__
			if (len & 4) {
				unsigned int w = *(const unsigned int *)buff;
				result = __insn_v2sadau(result, w, 0);
				buff += 4;
			}
#endif
		}
		if (len & 2) {
			result += *(const unsigned short *) buff;
			buff += 2;
		}
	}
	if (len & 1)
		result += *buff;
	result = longto16(result);
	if (odd)
		result = swab16(result);
out:
	return result;
}
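A short portable sketch (illustrative only, not part of this commit) of what longto16() above does: against a zero second operand, the sadh_u/v2sadu "sum of absolute differences" instructions simply add the 16-bit halves of the accumulator, so two applications fold the running sum down to 16 bits. The shift-and-add version below assumes a 32-bit unsigned long, as on TILEPro.

/* Hypothetical portable equivalent of longto16() for a 32-bit accumulator. */
static inline unsigned int example_fold_to_16(unsigned long x)
{
	x = (x & 0xffff) + (x >> 16);	/* add the two 16-bit halves */
	x = (x & 0xffff) + (x >> 16);	/* fold once more to absorb the carry */
	return x;
}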
diff --git a/arch/tile/lib/cpumask.c b/arch/tile/lib/cpumask.c
new file mode 100644
index 00000000000..fdc403614d1
--- /dev/null
+++ b/arch/tile/lib/cpumask.c
@@ -0,0 +1,52 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 */

#include <linux/cpumask.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/smp.h>

/*
 * Allow cropping out bits beyond the end of the array.
 * Move to "lib" directory if more clients want to use this routine.
 */
int bitmap_parselist_crop(const char *bp, unsigned long *maskp, int nmaskbits)
{
	unsigned a, b;

	bitmap_zero(maskp, nmaskbits);
	do {
		if (!isdigit(*bp))
			return -EINVAL;
		a = simple_strtoul(bp, (char **)&bp, 10);
		b = a;
		if (*bp == '-') {
			bp++;
			if (!isdigit(*bp))
				return -EINVAL;
			b = simple_strtoul(bp, (char **)&bp, 10);
		}
		if (!(a <= b))
			return -EINVAL;
		if (b >= nmaskbits)
			b = nmaskbits-1;
		while (a <= b) {
			set_bit(a, maskp);
			a++;
		}
		if (*bp == ',')
			bp++;
	} while (*bp != '\0' && *bp != '\n');
	return 0;
}
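A usage sketch (illustrative only; the values are made up) showing the cropping behaviour, which differs from the stock bitmap_parselist() that would typically reject a range running past the mask:

static void example_parse(void)
{
	DECLARE_BITMAP(mask, 8);

	/* Sets bits 0-2 and 5-7; the part of "5-100" beyond bit 7 is
	 * silently cropped rather than treated as an error.
	 */
	bitmap_parselist_crop("0-2,5-100", mask, 8);
}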
diff --git a/arch/tile/lib/delay.c b/arch/tile/lib/delay.c
new file mode 100644
index 00000000000..5801b03c13e
--- /dev/null
+++ b/arch/tile/lib/delay.c
@@ -0,0 +1,34 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 */

#include <linux/module.h>
#include <linux/delay.h>
#include <linux/thread_info.h>
#include <asm/fixmap.h>
#include <hv/hypervisor.h>

void __udelay(unsigned long usecs)
{
	hv_nanosleep(usecs * 1000);
}
EXPORT_SYMBOL(__udelay);

void __ndelay(unsigned long nsecs)
{
	hv_nanosleep(nsecs);
}
EXPORT_SYMBOL(__ndelay);

/* FIXME: should be declared in a header somewhere. */
EXPORT_SYMBOL(__delay);
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
new file mode 100644
index 00000000000..6bc7b52b4aa
--- /dev/null
+++ b/arch/tile/lib/exports.c
@@ -0,0 +1,79 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 *
 * Exports from assembler code and from libtile-cc.
 */

#include <linux/module.h>

/* arch/tile/lib/usercopy.S */
#include <linux/uaccess.h>
EXPORT_SYMBOL(__get_user_1);
EXPORT_SYMBOL(__get_user_2);
EXPORT_SYMBOL(__get_user_4);
EXPORT_SYMBOL(__get_user_8);
EXPORT_SYMBOL(__put_user_1);
EXPORT_SYMBOL(__put_user_2);
EXPORT_SYMBOL(__put_user_4);
EXPORT_SYMBOL(__put_user_8);
EXPORT_SYMBOL(strnlen_user_asm);
EXPORT_SYMBOL(strncpy_from_user_asm);
EXPORT_SYMBOL(clear_user_asm);

/* arch/tile/kernel/entry.S */
#include <linux/kernel.h>
#include <asm/processor.h>
EXPORT_SYMBOL(current_text_addr);
EXPORT_SYMBOL(dump_stack);

/* arch/tile/lib/__memcpy.S */
/* NOTE: on TILE64, these symbols appear in arch/tile/lib/memcpy_tile64.c */
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(__copy_to_user_inatomic);
EXPORT_SYMBOL(__copy_from_user_inatomic);
EXPORT_SYMBOL(__copy_from_user_zeroing);

/* hypervisor glue */
#include <hv/hypervisor.h>
EXPORT_SYMBOL(hv_dev_open);
EXPORT_SYMBOL(hv_dev_pread);
EXPORT_SYMBOL(hv_dev_pwrite);
EXPORT_SYMBOL(hv_dev_close);

/* -ltile-cc */
uint32_t __udivsi3(uint32_t dividend, uint32_t divisor);
EXPORT_SYMBOL(__udivsi3);
int32_t __divsi3(int32_t dividend, int32_t divisor);
EXPORT_SYMBOL(__divsi3);
uint64_t __udivdi3(uint64_t dividend, uint64_t divisor);
EXPORT_SYMBOL(__udivdi3);
int64_t __divdi3(int64_t dividend, int64_t divisor);
EXPORT_SYMBOL(__divdi3);
uint32_t __umodsi3(uint32_t dividend, uint32_t divisor);
EXPORT_SYMBOL(__umodsi3);
int32_t __modsi3(int32_t dividend, int32_t divisor);
EXPORT_SYMBOL(__modsi3);
uint64_t __umoddi3(uint64_t dividend, uint64_t divisor);
EXPORT_SYMBOL(__umoddi3);
int64_t __moddi3(int64_t dividend, int64_t divisor);
EXPORT_SYMBOL(__moddi3);
#ifndef __tilegx__
uint64_t __ll_mul(uint64_t n0, uint64_t n1);
EXPORT_SYMBOL(__ll_mul);
#endif
#ifndef __tilegx__
int64_t __muldi3(int64_t, int64_t);
EXPORT_SYMBOL(__muldi3);
uint64_t __lshrdi3(uint64_t, unsigned int);
EXPORT_SYMBOL(__lshrdi3);
#endif
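The libgcc-style helpers above are exported because the compiler emits calls to them behind the programmer's back. A hypothetical module snippet (illustrative only) shows how a plain 64-bit division on this 32-bit target becomes a call to __udivdi3(), which therefore has to be resolvable at module load time.

/* Hypothetical module code: the division below compiles to __udivdi3(). */
static u64 example_bytes_per_sec(u64 total_bytes, u64 elapsed_ns)
{
	return (total_bytes * 1000000000ULL) / elapsed_ns;
}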
diff --git a/arch/tile/lib/mb_incoherent.S b/arch/tile/lib/mb_incoherent.S
new file mode 100644
index 00000000000..989ad7b68d5
--- /dev/null
+++ b/arch/tile/lib/mb_incoherent.S
@@ -0,0 +1,34 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 *
 * Assembly code for invoking the HV's fence_incoherent syscall.
 */

#include <linux/linkage.h>
#include <hv/syscall_public.h>
#include <arch/abi.h>
#include <arch/chip.h>

#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS()

/*
 * Invoke the hypervisor's fence_incoherent syscall, which guarantees
 * that all victims for cachelines homed on this tile have reached memory.
 */
STD_ENTRY(__mb_incoherent)
	moveli TREG_SYSCALL_NR_NAME, HV_SYS_fence_incoherent
	swint2
	jrp lr
	STD_ENDPROC(__mb_incoherent)

#endif
diff --git a/arch/tile/lib/memchr_32.c b/arch/tile/lib/memchr_32.c
new file mode 100644
index 00000000000..6235283b485
--- /dev/null
+++ b/arch/tile/lib/memchr_32.c
@@ -0,0 +1,68 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/module.h>

void *memchr(const void *s, int c, size_t n)
{
	/* Get an aligned pointer. */
	const uintptr_t s_int = (uintptr_t) s;
	const uint32_t *p = (const uint32_t *)(s_int & -4);

	/* Create four copies of the byte for which we are looking. */
	const uint32_t goal = 0x01010101 * (uint8_t) c;

	/* Read the first word, but munge it so that bytes before the array
	 * will not match goal.
	 *
	 * Note that this shift count expression works because we know
	 * shift counts are taken mod 32.
	 */
	const uint32_t before_mask = (1 << (s_int << 3)) - 1;
	uint32_t v = (*p | before_mask) ^ (goal & before_mask);

	/* Compute the address of the last byte. */
	const char *const last_byte_ptr = (const char *)s + n - 1;

	/* Compute the address of the word containing the last byte. */
	const uint32_t *const last_word_ptr =
		(const uint32_t *)((uintptr_t) last_byte_ptr & -4);

	uint32_t bits;
	char *ret;

	if (__builtin_expect(n == 0, 0)) {
		/* Don't dereference any memory if the array is empty. */
		return NULL;
	}

	while ((bits = __insn_seqb(v, goal)) == 0) {
		if (__builtin_expect(p == last_word_ptr, 0)) {
			/* We already read the last word in the array,
			 * so give up.
			 */
			return NULL;
		}
		v = *++p;
	}

	/* We found a match, but it might be in a byte past the end
	 * of the array.
	 */
	ret = ((char *)p) + (__insn_ctz(bits) >> 3);
	return (ret <= last_byte_ptr) ? ret : NULL;
}
EXPORT_SYMBOL(memchr);
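A worked example (illustrative only) of the before_mask computation above, for this little-endian word layout and a buffer that starts two bytes into an aligned word:

/*
 * Suppose s == 0x1002 and c == 'A', so s_int & 3 == 2 and the aligned word
 * at 0x1000 contains two bytes that precede the buffer.  The shift count
 * (s_int << 3) is 0x8010, which the hardware reduces mod 32 to 16, giving
 *
 *	before_mask = (1 << 16) - 1 = 0x0000ffff
 *
 * i.e. exactly the two low-order (earlier) bytes.  After
 * v = (*p | before_mask) ^ (goal & before_mask), each of those bytes holds
 * 0xff ^ c, which can never equal c, so __insn_seqb() cannot report a
 * bogus match in front of the buffer.
 */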
diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S
new file mode 100644
index 00000000000..f92984bf60e
--- /dev/null
+++ b/arch/tile/lib/memcpy_32.S
@@ -0,0 +1,628 @@
1 | /* | ||
2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | * | ||
14 | * This file shares the implementation of the userspace memcpy and | ||
15 | * the kernel's memcpy, copy_to_user and copy_from_user. | ||
16 | */ | ||
17 | |||
18 | #include <arch/chip.h> | ||
19 | |||
20 | #if CHIP_HAS_WH64() || defined(MEMCPY_TEST_WH64) | ||
21 | #define MEMCPY_USE_WH64 | ||
22 | #endif | ||
23 | |||
24 | |||
25 | #include <linux/linkage.h> | ||
26 | |||
27 | /* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */ | ||
28 | #if !CHIP_HAS_COHERENT_LOCAL_CACHE() | ||
29 | #define memcpy __memcpy_asm | ||
30 | #define __copy_to_user_inatomic __copy_to_user_inatomic_asm | ||
31 | #define __copy_from_user_inatomic __copy_from_user_inatomic_asm | ||
32 | #define __copy_from_user_zeroing __copy_from_user_zeroing_asm | ||
33 | #endif | ||
34 | |||
35 | #define IS_MEMCPY 0 | ||
36 | #define IS_COPY_FROM_USER 1 | ||
37 | #define IS_COPY_FROM_USER_ZEROING 2 | ||
38 | #define IS_COPY_TO_USER -1 | ||
39 | |||
40 | .section .text.memcpy_common, "ax" | ||
41 | .align 64 | ||
42 | |||
43 | /* Use this to preface each bundle that can cause an exception so | ||
44 | * the kernel can clean up properly. The special cleanup code should | ||
45 | * not use these, since it knows what it is doing. | ||
46 | */ | ||
47 | #define EX \ | ||
48 | .pushsection __ex_table, "a"; \ | ||
49 | .word 9f, memcpy_common_fixup; \ | ||
50 | .popsection; \ | ||
51 | 9 | ||
52 | |||
53 | |||
54 | /* __copy_from_user_inatomic takes the kernel target address in r0, | ||
55 | * the user source in r1, and the bytes to copy in r2. | ||
56 | * It returns the number of uncopiable bytes (hopefully zero) in r0. | ||
57 | */ | ||
58 | ENTRY(__copy_from_user_inatomic) | ||
59 | .type __copy_from_user_inatomic, @function | ||
60 | FEEDBACK_ENTER_EXPLICIT(__copy_from_user_inatomic, \ | ||
61 | .text.memcpy_common, \ | ||
62 | .Lend_memcpy_common - __copy_from_user_inatomic) | ||
63 | { movei r29, IS_COPY_FROM_USER; j memcpy_common } | ||
64 | .size __copy_from_user_inatomic, . - __copy_from_user_inatomic | ||
65 | |||
66 | /* __copy_from_user_zeroing is like __copy_from_user_inatomic, but | ||
67 | * any uncopiable bytes are zeroed in the target. | ||
68 | */ | ||
69 | ENTRY(__copy_from_user_zeroing) | ||
70 | .type __copy_from_user_zeroing, @function | ||
71 | FEEDBACK_REENTER(__copy_from_user_inatomic) | ||
72 | { movei r29, IS_COPY_FROM_USER_ZEROING; j memcpy_common } | ||
73 | .size __copy_from_user_zeroing, . - __copy_from_user_zeroing | ||
74 | |||
75 | /* __copy_to_user_inatomic takes the user target address in r0, | ||
76 | * the kernel source in r1, and the bytes to copy in r2. | ||
77 | * It returns the number of uncopiable bytes (hopefully zero) in r0. | ||
78 | */ | ||
79 | ENTRY(__copy_to_user_inatomic) | ||
80 | .type __copy_to_user_inatomic, @function | ||
81 | FEEDBACK_REENTER(__copy_from_user_inatomic) | ||
82 | { movei r29, IS_COPY_TO_USER; j memcpy_common } | ||
83 | .size __copy_to_user_inatomic, . - __copy_to_user_inatomic | ||
84 | |||
85 | ENTRY(memcpy) | ||
86 | .type memcpy, @function | ||
87 | FEEDBACK_REENTER(__copy_from_user_inatomic) | ||
88 | { movei r29, IS_MEMCPY } | ||
89 | .size memcpy, . - memcpy | ||
90 | /* Fall through */ | ||
91 | |||
92 | .type memcpy_common, @function | ||
93 | memcpy_common: | ||
94 | /* On entry, r29 holds one of the IS_* macro values from above. */ | ||
95 | |||
96 | |||
97 | /* r0 is the dest, r1 is the source, r2 is the size. */ | ||
98 | |||
99 | /* Save aside original dest so we can return it at the end. */ | ||
100 | { sw sp, lr; move r23, r0; or r4, r0, r1 } | ||
101 | |||
102 | /* Check for an empty size. */ | ||
103 | { bz r2, .Ldone; andi r4, r4, 3 } | ||
104 | |||
105 | /* Save aside original values in case of a fault. */ | ||
106 | { move r24, r1; move r25, r2 } | ||
107 | move r27, lr | ||
108 | |||
109 | /* Check for an unaligned source or dest. */ | ||
110 | { bnz r4, .Lcopy_unaligned_maybe_many; addli r4, r2, -256 } | ||
111 | |||
112 | .Lcheck_aligned_copy_size: | ||
113 | /* If we are copying < 256 bytes, branch to simple case. */ | ||
114 | { blzt r4, .Lcopy_8_check; slti_u r8, r2, 8 } | ||
115 | |||
116 | /* Copying >= 256 bytes, so jump to complex prefetching loop. */ | ||
117 | { andi r6, r1, 63; j .Lcopy_many } | ||
118 | |||
119 | /* | ||
120 | * | ||
121 | * Aligned 4 byte at a time copy loop | ||
122 | * | ||
123 | */ | ||
124 | |||
125 | .Lcopy_8_loop: | ||
126 | /* Copy two words at a time to hide load latency. */ | ||
127 | EX: { lw r3, r1; addi r1, r1, 4; slti_u r8, r2, 16 } | ||
128 | EX: { lw r4, r1; addi r1, r1, 4 } | ||
129 | EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 } | ||
130 | EX: { sw r0, r4; addi r0, r0, 4; addi r2, r2, -4 } | ||
131 | .Lcopy_8_check: | ||
132 | { bzt r8, .Lcopy_8_loop; slti_u r4, r2, 4 } | ||
133 | |||
134 | /* Copy odd leftover word, if any. */ | ||
135 | { bnzt r4, .Lcheck_odd_stragglers } | ||
136 | EX: { lw r3, r1; addi r1, r1, 4 } | ||
137 | EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 } | ||
138 | |||
139 | .Lcheck_odd_stragglers: | ||
140 | { bnz r2, .Lcopy_unaligned_few } | ||
141 | |||
142 | .Ldone: | ||
143 | /* For memcpy return original dest address, else zero. */ | ||
144 | { mz r0, r29, r23; jrp lr } | ||
145 | |||
146 | |||
147 | /* | ||
148 | * | ||
149 | * Prefetching multiple cache line copy handler (for large transfers). | ||
150 | * | ||
151 | */ | ||
152 | |||
153 | /* Copy words until r1 is cache-line-aligned. */ | ||
154 | .Lalign_loop: | ||
155 | EX: { lw r3, r1; addi r1, r1, 4 } | ||
156 | { andi r6, r1, 63 } | ||
157 | EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 } | ||
158 | .Lcopy_many: | ||
159 | { bnzt r6, .Lalign_loop; addi r9, r0, 63 } | ||
160 | |||
161 | { addi r3, r1, 60; andi r9, r9, -64 } | ||
162 | |||
163 | #ifdef MEMCPY_USE_WH64 | ||
164 | /* No need to prefetch dst, we'll just do the wh64 | ||
165 | * right before we copy a line. | ||
166 | */ | ||
167 | #endif | ||
168 | |||
169 | EX: { lw r5, r3; addi r3, r3, 64; movei r4, 1 } | ||
170 | /* Intentionally stall for a few cycles to leave L2 cache alone. */ | ||
171 | { bnzt zero, .; move r27, lr } | ||
172 | EX: { lw r6, r3; addi r3, r3, 64 } | ||
173 | /* Intentionally stall for a few cycles to leave L2 cache alone. */ | ||
174 | { bnzt zero, . } | ||
175 | EX: { lw r7, r3; addi r3, r3, 64 } | ||
176 | #ifndef MEMCPY_USE_WH64 | ||
177 | /* Prefetch the dest */ | ||
178 | /* Intentionally stall for a few cycles to leave L2 cache alone. */ | ||
179 | { bnzt zero, . } | ||
180 | /* Use a real load to cause a TLB miss if necessary. We aren't using | ||
181 | * r28, so this should be fine. | ||
182 | */ | ||
183 | EX: { lw r28, r9; addi r9, r9, 64 } | ||
184 | /* Intentionally stall for a few cycles to leave L2 cache alone. */ | ||
185 | { bnzt zero, . } | ||
186 | { prefetch r9; addi r9, r9, 64 } | ||
187 | /* Intentionally stall for a few cycles to leave L2 cache alone. */ | ||
188 | { bnzt zero, . } | ||
189 | { prefetch r9; addi r9, r9, 64 } | ||
190 | #endif | ||
191 | /* Intentionally stall for a few cycles to leave L2 cache alone. */ | ||
192 | { bz zero, .Lbig_loop2 } | ||
193 | |||
194 | /* On entry to this loop: | ||
195 | * - r0 points to the start of dst line 0 | ||
196 | * - r1 points to start of src line 0 | ||
197 | * - r2 >= (256 - 60), only the first time the loop trips. | ||
198 | * - r3 contains r1 + 128 + 60 [pointer to end of source line 2] | ||
199 | * This is our prefetch address. When we get near the end | ||
200 | * rather than prefetching off the end this is changed to point | ||
201 | * to some "safe" recently loaded address. | ||
202 | * - r5 contains *(r1 + 60) [i.e. last word of source line 0] | ||
203 | * - r6 contains *(r1 + 64 + 60) [i.e. last word of source line 1] | ||
204 | * - r9 contains ((r0 + 63) & -64) | ||
205 | * [start of next dst cache line.] | ||
206 | */ | ||
207 | |||
208 | .Lbig_loop: | ||
209 | { jal .Lcopy_line2; add r15, r1, r2 } | ||
210 | |||
211 | .Lbig_loop2: | ||
212 | /* Copy line 0, first stalling until r5 is ready. */ | ||
213 | EX: { move r12, r5; lw r16, r1 } | ||
214 | { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 } | ||
215 | /* Prefetch several lines ahead. */ | ||
216 | EX: { lw r5, r3; addi r3, r3, 64 } | ||
217 | { jal .Lcopy_line } | ||
218 | |||
219 | /* Copy line 1, first stalling until r6 is ready. */ | ||
220 | EX: { move r12, r6; lw r16, r1 } | ||
221 | { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 } | ||
222 | /* Prefetch several lines ahead. */ | ||
223 | EX: { lw r6, r3; addi r3, r3, 64 } | ||
224 | { jal .Lcopy_line } | ||
225 | |||
226 | /* Copy line 2, first stalling until r7 is ready. */ | ||
227 | EX: { move r12, r7; lw r16, r1 } | ||
228 | { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 } | ||
229 | /* Prefetch several lines ahead. */ | ||
230 | EX: { lw r7, r3; addi r3, r3, 64 } | ||
231 | /* Use up a caches-busy cycle by jumping back to the top of the | ||
232 | * loop. Might as well get it out of the way now. | ||
233 | */ | ||
234 | { j .Lbig_loop } | ||
235 | |||
236 | |||
237 | /* On entry: | ||
238 | * - r0 points to the destination line. | ||
239 | * - r1 points to the source line. | ||
240 | * - r3 is the next prefetch address. | ||
241 | * - r9 holds the last address used for wh64. | ||
242 | * - r12 = WORD_15 | ||
243 | * - r16 = WORD_0. | ||
244 | * - r17 == r1 + 16. | ||
245 | * - r27 holds saved lr to restore. | ||
246 | * | ||
247 | * On exit: | ||
248 | * - r0 is incremented by 64. | ||
249 | * - r1 is incremented by 64, unless that would point to a word | ||
250 | * beyond the end of the source array, in which case it is redirected | ||
251 | * to point to an arbitrary word already in the cache. | ||
252 | * - r2 is decremented by 64. | ||
253 | * - r3 is unchanged, unless it points to a word beyond the | ||
254 | * end of the source array, in which case it is redirected | ||
255 | * to point to an arbitrary word already in the cache. | ||
256 | * Redirecting is OK since if we are that close to the end | ||
257 | * of the array we will not come back to this subroutine | ||
258 | * and use the contents of the prefetched address. | ||
259 | * - r4 is nonzero iff r2 >= 64. | ||
260 | * - r9 is incremented by 64, unless it points beyond the | ||
261 | * end of the last full destination cache line, in which | ||
262 | * case it is redirected to a "safe address" that can be | ||
263 | * clobbered (sp - 64) | ||
264 | * - lr contains the value in r27. | ||
265 | */ | ||
266 | |||
267 | /* r26 unused */ | ||
268 | |||
269 | .Lcopy_line: | ||
270 | /* TODO: when r3 goes past the end, we would like to redirect it | ||
271 | * to prefetch the last partial cache line (if any) just once, for the | ||
272 | * benefit of the final cleanup loop. But we don't want to | ||
273 | * prefetch that line more than once, or subsequent prefetches | ||
274 | * will go into the RTF. But then .Lbig_loop should unconditionally | ||
275 | * branch to top of loop to execute final prefetch, and its | ||
276 | * nop should become a conditional branch. | ||
277 | */ | ||
278 | |||
279 | /* We need two non-memory cycles here to cover the resources | ||
280 | * used by the loads initiated by the caller. | ||
281 | */ | ||
282 | { add r15, r1, r2 } | ||
283 | .Lcopy_line2: | ||
284 | { slt_u r13, r3, r15; addi r17, r1, 16 } | ||
285 | |||
286 | /* NOTE: this will stall for one cycle as L1 is busy. */ | ||
287 | |||
288 | /* Fill second L1D line. */ | ||
289 | EX: { lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */ | ||
290 | |||
291 | #ifdef MEMCPY_TEST_WH64 | ||
292 | /* Issue a fake wh64 that clobbers the destination words | ||
293 | * with random garbage, for testing. | ||
294 | */ | ||
295 | { movei r19, 64; crc32_32 r10, r2, r9 } | ||
296 | .Lwh64_test_loop: | ||
297 | EX: { sw r9, r10; addi r9, r9, 4; addi r19, r19, -4 } | ||
298 | { bnzt r19, .Lwh64_test_loop; crc32_32 r10, r10, r19 } | ||
299 | #elif CHIP_HAS_WH64() | ||
300 | /* Prepare destination line for writing. */ | ||
301 | EX: { wh64 r9; addi r9, r9, 64 } | ||
302 | #else | ||
303 | /* Prefetch dest line */ | ||
304 | { prefetch r9; addi r9, r9, 64 } | ||
305 | #endif | ||
306 | /* Load seven words that are L1D hits to cover wh64 L2 usage. */ | ||
307 | |||
308 | /* Load the three remaining words from the last L1D line, which | ||
309 | * we know has already filled the L1D. | ||
310 | */ | ||
311 | EX: { lw r4, r1; addi r1, r1, 4; addi r20, r1, 16 } /* r4 = WORD_12 */ | ||
312 | EX: { lw r8, r1; addi r1, r1, 4; slt_u r13, r20, r15 }/* r8 = WORD_13 */ | ||
313 | EX: { lw r11, r1; addi r1, r1, -52; mvz r20, r13, r1 } /* r11 = WORD_14 */ | ||
314 | |||
315 | /* Load the three remaining words from the first L1D line, first | ||
316 | * stalling until it has filled by "looking at" r16. | ||
317 | */ | ||
318 | EX: { lw r13, r1; addi r1, r1, 4; move zero, r16 } /* r13 = WORD_1 */ | ||
319 | EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_2 */ | ||
320 | EX: { lw r15, r1; addi r1, r1, 8; addi r10, r0, 60 } /* r15 = WORD_3 */ | ||
321 | |||
322 | /* Load second word from the second L1D line, first | ||
323 | * stalling until it has filled by "looking at" r17. | ||
324 | */ | ||
325 | EX: { lw r19, r1; addi r1, r1, 4; move zero, r17 } /* r19 = WORD_5 */ | ||
326 | |||
327 | /* Store last word to the destination line, potentially dirtying it | ||
328 | * for the first time, which keeps the L2 busy for two cycles. | ||
329 | */ | ||
330 | EX: { sw r10, r12 } /* store(WORD_15) */ | ||
331 | |||
332 | /* Use two L1D hits to cover the sw L2 access above. */ | ||
333 | EX: { lw r10, r1; addi r1, r1, 4 } /* r10 = WORD_6 */ | ||
334 | EX: { lw r12, r1; addi r1, r1, 4 } /* r12 = WORD_7 */ | ||
335 | |||
336 | /* Fill third L1D line. */ | ||
337 | EX: { lw r18, r1; addi r1, r1, 4 } /* r18 = WORD_8 */ | ||
338 | |||
339 | /* Store first L1D line. */ | ||
340 | EX: { sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */ | ||
341 | EX: { sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */ | ||
342 | EX: { sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */ | ||
343 | #ifdef MEMCPY_USE_WH64 | ||
344 | EX: { sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */ | ||
345 | #else | ||
346 | /* Back up the r9 to a cache line we are already storing to | ||
347 | * if it gets past the end of the dest vector. Strictly speaking, | ||
348 | * we don't need to back up to the start of a cache line, but it's free | ||
349 | * and tidy, so why not? | ||
350 | */ | ||
351 | EX: { sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */ | ||
352 | #endif | ||
353 | /* Store second L1D line. */ | ||
354 | EX: { sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */ | ||
355 | EX: { sw r0, r19; addi r0, r0, 4 } /* store(WORD_5) */ | ||
356 | EX: { sw r0, r10; addi r0, r0, 4 } /* store(WORD_6) */ | ||
357 | EX: { sw r0, r12; addi r0, r0, 4 } /* store(WORD_7) */ | ||
358 | |||
359 | EX: { lw r13, r1; addi r1, r1, 4; move zero, r18 } /* r13 = WORD_9 */ | ||
360 | EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_10 */ | ||
361 | EX: { lw r15, r1; move r1, r20 } /* r15 = WORD_11 */ | ||
362 | |||
363 | /* Store third L1D line. */ | ||
364 | EX: { sw r0, r18; addi r0, r0, 4 } /* store(WORD_8) */ | ||
365 | EX: { sw r0, r13; addi r0, r0, 4 } /* store(WORD_9) */ | ||
366 | EX: { sw r0, r14; addi r0, r0, 4 } /* store(WORD_10) */ | ||
367 | EX: { sw r0, r15; addi r0, r0, 4 } /* store(WORD_11) */ | ||
368 | |||
369 | /* Store rest of fourth L1D line. */ | ||
370 | EX: { sw r0, r4; addi r0, r0, 4 } /* store(WORD_12) */ | ||
371 | { | ||
372 | EX: sw r0, r8 /* store(WORD_13) */ | ||
373 | addi r0, r0, 4 | ||
374 | /* Will r2 be > 64 after we subtract 64 below? */ | ||
375 | shri r4, r2, 7 | ||
376 | } | ||
377 | { | ||
378 | EX: sw r0, r11 /* store(WORD_14) */ | ||
379 | addi r0, r0, 8 | ||
380 | /* Record 64 bytes successfully copied. */ | ||
381 | addi r2, r2, -64 | ||
382 | } | ||
383 | |||
384 | { jrp lr; move lr, r27 } | ||
385 | |||
386 | /* Convey to the backtrace library that the stack frame is size | ||
387 | * zero, and the real return address is on the stack rather than | ||
388 | * in 'lr'. | ||
389 | */ | ||
390 | { info 8 } | ||
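The .Lbig_loop/.Lcopy_line code above boils down to the structure sketched below in C (illustrative only, not part of this file): each iteration copies one 64-byte cache line, prefetches the source a few lines ahead, and uses wh64 ("write hint 64") to claim the destination line so it is not fetched from memory only to be fully overwritten. prefetch_line() and wh64_line() are placeholders for the prefetch and wh64 instructions.

/* Hypothetical C shape of the cache-line-at-a-time copy loop. */
static void example_copy_lines(char *dst, const char *src, size_t lines)
{
	size_t i;

	for (i = 0; i < lines; i++) {
		prefetch_line(src + 3 * 64);	/* stay a few lines ahead */
		wh64_line(dst);			/* pre-claim the dest line */
		memcpy(dst, src, 64);		/* copy one cache line */
		dst += 64;
		src += 64;
	}
}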
391 | |||
392 | .align 64 | ||
393 | .Lcopy_unaligned_maybe_many: | ||
394 | /* Skip the setup overhead if we aren't copying many bytes. */ | ||
395 | { slti_u r8, r2, 20; sub r4, zero, r0 } | ||
396 | { bnzt r8, .Lcopy_unaligned_few; andi r4, r4, 3 } | ||
397 | { bz r4, .Ldest_is_word_aligned; add r18, r1, r2 } | ||
398 | |||
399 | /* | ||
400 | * | ||
401 | * unaligned 4 byte at a time copy handler. | ||
402 | * | ||
403 | */ | ||
404 | |||
405 | /* Copy single bytes until r0 == 0 mod 4, so we can store words. */ | ||
406 | .Lalign_dest_loop: | ||
407 | EX: { lb_u r3, r1; addi r1, r1, 1; addi r4, r4, -1 } | ||
408 | EX: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 } | ||
409 | { bnzt r4, .Lalign_dest_loop; andi r3, r1, 3 } | ||
410 | |||
411 | /* If source and dest are now *both* aligned, do an aligned copy. */ | ||
412 | { bz r3, .Lcheck_aligned_copy_size; addli r4, r2, -256 } | ||
413 | |||
414 | .Ldest_is_word_aligned: | ||
415 | |||
416 | #if CHIP_HAS_DWORD_ALIGN() | ||
417 | EX: { andi r8, r0, 63; lwadd_na r6, r1, 4} | ||
418 | { slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned } | ||
419 | |||
420 | /* This copies unaligned words until either there are fewer | ||
421 | * than 4 bytes left to copy, or until the destination pointer | ||
422 | * is cache-aligned, whichever comes first. | ||
423 | * | ||
424 | * On entry: | ||
425 | * - r0 is the next store address. | ||
426 | * - r1 points 4 bytes past the load address corresponding to r0. | ||
427 | * - r2 >= 4 | ||
428 | * - r6 is the next aligned word loaded. | ||
429 | */ | ||
430 | .Lcopy_unaligned_src_words: | ||
431 | EX: { lwadd_na r7, r1, 4; slti_u r8, r2, 4 + 4 } | ||
432 | /* stall */ | ||
433 | { dword_align r6, r7, r1; slti_u r9, r2, 64 + 4 } | ||
434 | EX: { swadd r0, r6, 4; addi r2, r2, -4 } | ||
435 | { bnz r8, .Lcleanup_unaligned_words; andi r8, r0, 63 } | ||
436 | { bnzt r8, .Lcopy_unaligned_src_words; move r6, r7 } | ||
437 | |||
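Conceptually, each iteration of that loop merges two naturally aligned source words with a funnel shift, matching the explicit shift-and-OR fallback used further below when CHIP_HAS_DWORD_ALIGN() is false. A C model (a sketch; lwadd_na is assumed to fetch the aligned word containing the pointer, and the shift is nonzero on this path since the source is misaligned relative to the word-aligned destination):

    unsigned shift = ((uintptr_t)src & 3) * 8;   /* 8, 16, or 24 here */
    uint32_t next = *aligned_src++;              /* lwadd_na r7, r1, 4 */
    uint32_t merged = (prev >> shift) | (next << (32 - shift));
    *dst32++ = merged;                           /* swadd r0, r6, 4 */
    prev = next;                                 /* move r6, r7 */
    n -= 4;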
438 | /* On entry: | ||
439 | * - r0 is the next store address. | ||
440 | * - r1 points 4 bytes past the load address corresponding to r0. | ||
441 | * - r2 >= 4 (# of bytes left to store). | ||
442 | * - r6 is the next aligned src word value. | ||
443 | * - r9 = (r2 < 64U). | ||
444 | * - r18 points one byte past the end of source memory. | ||
445 | */ | ||
446 | .Ldest_is_L2_line_aligned: | ||
447 | |||
448 | { | ||
449 | /* Less than a full cache line remains. */ | ||
450 | bnz r9, .Lcleanup_unaligned_words | ||
451 | move r7, r6 | ||
452 | } | ||
453 | |||
454 | /* r2 >= 64 */ | ||
455 | |||
456 | /* Kick off two prefetches, but don't go past the end. */ | ||
457 | { addi r3, r1, 63 - 4; addi r8, r1, 64 + 63 - 4 } | ||
458 | { prefetch r3; move r3, r8; slt_u r8, r8, r18 } | ||
459 | { mvz r3, r8, r1; addi r8, r3, 64 } | ||
460 | { prefetch r3; move r3, r8; slt_u r8, r8, r18 } | ||
461 | { mvz r3, r8, r1; movei r17, 0 } | ||
462 | |||
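The mvz-based clamping here (and in the loop below) is equivalent to this C sketch: each candidate prefetch address is used only if it still lies inside the source buffer; otherwise we prefetch the current source line, which is already resident and therefore harmless.

    prefetch(cand < src_end ? cand : src);   /* mvz r3, r8, r1 */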
463 | .Lcopy_unaligned_line: | ||
464 | /* Prefetch another line. */ | ||
465 | { prefetch r3; addi r15, r1, 60; addi r3, r3, 64 } | ||
466 | /* Fire off a load of the last word we are about to copy. */ | ||
467 | EX: { lw_na r15, r15; slt_u r8, r3, r18 } | ||
468 | |||
469 | EX: { mvz r3, r8, r1; wh64 r0 } | ||
470 | |||
471 | /* This loop runs twice. | ||
472 | * | ||
473 | * On entry: | ||
474 | * - r17 is even before the first iteration, and odd before | ||
475 | * the second. It is incremented inside the loop. Encountering | ||
476 | * an even value at the end of the loop makes it stop. | ||
477 | */ | ||
478 | .Lcopy_half_an_unaligned_line: | ||
479 | EX: { | ||
480 | /* Stall until the last byte is ready. In the steady state this | ||
481 | * guarantees all words to load below will be in the L2 cache, which | ||
482 | * avoids shunting the loads to the RTF. | ||
483 | */ | ||
484 | move zero, r15 | ||
485 | lwadd_na r7, r1, 16 | ||
486 | } | ||
487 | EX: { lwadd_na r11, r1, 12 } | ||
488 | EX: { lwadd_na r14, r1, -24 } | ||
489 | EX: { lwadd_na r8, r1, 4 } | ||
490 | EX: { lwadd_na r9, r1, 4 } | ||
491 | EX: { | ||
492 | lwadd_na r10, r1, 8 | ||
493 | /* r16 = (r2 < 64), after we subtract 32 from r2 below. */ | ||
494 | slti_u r16, r2, 64 + 32 | ||
495 | } | ||
496 | EX: { lwadd_na r12, r1, 4; addi r17, r17, 1 } | ||
497 | EX: { lwadd_na r13, r1, 8; dword_align r6, r7, r1 } | ||
498 | EX: { swadd r0, r6, 4; dword_align r7, r8, r1 } | ||
499 | EX: { swadd r0, r7, 4; dword_align r8, r9, r1 } | ||
500 | EX: { swadd r0, r8, 4; dword_align r9, r10, r1 } | ||
501 | EX: { swadd r0, r9, 4; dword_align r10, r11, r1 } | ||
502 | EX: { swadd r0, r10, 4; dword_align r11, r12, r1 } | ||
503 | EX: { swadd r0, r11, 4; dword_align r12, r13, r1 } | ||
504 | EX: { swadd r0, r12, 4; dword_align r13, r14, r1 } | ||
505 | EX: { swadd r0, r13, 4; addi r2, r2, -32 } | ||
506 | { move r6, r14; bbst r17, .Lcopy_half_an_unaligned_line } | ||
507 | |||
508 | { bzt r16, .Lcopy_unaligned_line; move r7, r6 } | ||
509 | |||
510 | /* On entry: | ||
511 | * - r0 is the next store address. | ||
512 | * - r1 points 4 bytes past the load address corresponding to r0. | ||
513 | * - r2 >= 0 (# of bytes left to store). | ||
514 | * - r7 is the next aligned src word value. | ||
515 | */ | ||
516 | .Lcleanup_unaligned_words: | ||
517 | /* Handle any trailing bytes. */ | ||
518 | { bz r2, .Lcopy_unaligned_done; slti_u r8, r2, 4 } | ||
519 | { bzt r8, .Lcopy_unaligned_src_words; move r6, r7 } | ||
520 | |||
521 | /* Move r1 back to the point where it corresponds to r0. */ | ||
522 | { addi r1, r1, -4 } | ||
523 | |||
524 | #else /* !CHIP_HAS_DWORD_ALIGN() */ | ||
525 | |||
526 | /* Compute right/left shift counts and load initial source words. */ | ||
527 | { andi r5, r1, -4; andi r3, r1, 3 } | ||
528 | EX: { lw r6, r5; addi r5, r5, 4; shli r3, r3, 3 } | ||
529 | EX: { lw r7, r5; addi r5, r5, 4; sub r4, zero, r3 } | ||
530 | |||
531 | /* Load and store one word at a time, using shifts and ORs | ||
532 | * to correct for the misaligned src. | ||
533 | */ | ||
534 | .Lcopy_unaligned_src_loop: | ||
535 | { shr r6, r6, r3; shl r8, r7, r4 } | ||
536 | EX: { lw r7, r5; or r8, r8, r6; move r6, r7 } | ||
537 | EX: { sw r0, r8; addi r0, r0, 4; addi r2, r2, -4 } | ||
538 | { addi r5, r5, 4; slti_u r8, r2, 8 } | ||
539 | { bzt r8, .Lcopy_unaligned_src_loop; addi r1, r1, 4 } | ||
540 | |||
541 | { bz r2, .Lcopy_unaligned_done } | ||
542 | #endif /* !CHIP_HAS_DWORD_ALIGN() */ | ||
543 | |||
544 | /* Fall through */ | ||
545 | |||
546 | /* | ||
547 | * | ||
548 | * 1-byte-at-a-time copy handler. | ||
549 | * | ||
550 | */ | ||
551 | |||
552 | .Lcopy_unaligned_few: | ||
553 | EX: { lb_u r3, r1; addi r1, r1, 1 } | ||
554 | EX: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 } | ||
555 | { bnzt r2, .Lcopy_unaligned_few } | ||
556 | |||
557 | .Lcopy_unaligned_done: | ||
558 | |||
559 | /* For memcpy return original dest address, else zero. */ | ||
560 | { mz r0, r29, r23; jrp lr } | ||
561 | |||
562 | .Lend_memcpy_common: | ||
563 | .size memcpy_common, .Lend_memcpy_common - memcpy_common | ||
564 | |||
565 | .section .fixup,"ax" | ||
566 | memcpy_common_fixup: | ||
567 | .type memcpy_common_fixup, @function | ||
568 | |||
569 | /* Skip any bytes we already successfully copied. | ||
570 | * r2 (num remaining) is correct, but r0 (dst) and r1 (src) | ||
571 | * may not be quite right because of unrolling and prefetching. | ||
572 | * So we need to recompute their values as the address just | ||
573 | * after the last byte we are sure was successfully loaded and | ||
574 | * then stored. | ||
575 | */ | ||
576 | |||
577 | /* Determine how many bytes we successfully copied. */ | ||
578 | { sub r3, r25, r2 } | ||
579 | |||
580 | /* Add this to the original r0 and r1 to get their new values. */ | ||
581 | { add r0, r23, r3; add r1, r24, r3 } | ||
582 | |||
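Restated in C (a sketch; per the comments above, r23/r24 hold the original dest/src and r25 the original count saved at entry):

    unsigned long done = orig_n - n;   /* r3 = r25 - r2 */
    dst = orig_dst + done;             /* r0 = r23 + r3 */
    src = orig_src + done;             /* r1 = r24 + r3 */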
583 | { bzt r29, memcpy_fixup_loop } | ||
584 | { blzt r29, copy_to_user_fixup_loop } | ||
585 | |||
586 | copy_from_user_fixup_loop: | ||
587 | /* Try copying the rest one byte at a time, expecting a load fault. */ | ||
588 | .Lcfu: { lb_u r3, r1; addi r1, r1, 1 } | ||
589 | { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 } | ||
590 | { bnzt r2, copy_from_user_fixup_loop } | ||
591 | |||
592 | .Lcopy_from_user_fixup_zero_remainder: | ||
593 | { bbs r29, 2f } /* low bit set means IS_COPY_FROM_USER */ | ||
594 | /* byte-at-a-time loop faulted, so zero the rest. */ | ||
595 | { move r3, r2; bz r2, 2f /* should be impossible, but handle it. */ } | ||
596 | 1: { sb r0, zero; addi r0, r0, 1; addi r3, r3, -1 } | ||
597 | { bnzt r3, 1b } | ||
598 | 2: move lr, r27 | ||
599 | { move r0, r2; jrp lr } | ||
600 | |||
601 | copy_to_user_fixup_loop: | ||
602 | /* Try copying the rest one byte at a time, expecting a store fault. */ | ||
603 | { lb_u r3, r1; addi r1, r1, 1 } | ||
604 | .Lctu: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 } | ||
605 | { bnzt r2, copy_to_user_fixup_loop } | ||
606 | .Lcopy_to_user_fixup_done: | ||
607 | move lr, r27 | ||
608 | { move r0, r2; jrp lr } | ||
609 | |||
610 | memcpy_fixup_loop: | ||
611 | /* Try copying the rest one byte at a time. We expect a disastrous | ||
612 | * fault to happen since we are in fixup code, but let it happen. | ||
613 | */ | ||
614 | { lb_u r3, r1; addi r1, r1, 1 } | ||
615 | { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 } | ||
616 | { bnzt r2, memcpy_fixup_loop } | ||
617 | /* This should be unreachable; we should have faulted again. | ||
618 | * But be paranoid and handle it in case some interrupt changed | ||
619 | * the TLB or something. | ||
620 | */ | ||
621 | move lr, r27 | ||
622 | { move r0, r23; jrp lr } | ||
623 | |||
624 | .size memcpy_common_fixup, . - memcpy_common_fixup | ||
625 | |||
626 | .section __ex_table,"a" | ||
627 | .word .Lcfu, .Lcopy_from_user_fixup_zero_remainder | ||
628 | .word .Lctu, .Lcopy_to_user_fixup_done | ||
diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c new file mode 100644 index 00000000000..dfedea7b266 --- /dev/null +++ b/arch/tile/lib/memcpy_tile64.c | |||
@@ -0,0 +1,271 @@ | |||
1 | /* | ||
2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/string.h> | ||
16 | #include <linux/smp.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/uaccess.h> | ||
19 | #include <asm/fixmap.h> | ||
20 | #include <asm/kmap_types.h> | ||
21 | #include <asm/tlbflush.h> | ||
22 | #include <hv/hypervisor.h> | ||
23 | #include <arch/chip.h> | ||
24 | |||
25 | |||
26 | #if !CHIP_HAS_COHERENT_LOCAL_CACHE() | ||
27 | |||
28 | /* Defined in memcpy.S */ | ||
29 | extern unsigned long __memcpy_asm(void *to, const void *from, unsigned long n); | ||
30 | extern unsigned long __copy_to_user_inatomic_asm( | ||
31 | void __user *to, const void *from, unsigned long n); | ||
32 | extern unsigned long __copy_from_user_inatomic_asm( | ||
33 | void *to, const void __user *from, unsigned long n); | ||
34 | extern unsigned long __copy_from_user_zeroing_asm( | ||
35 | void *to, const void __user *from, unsigned long n); | ||
36 | |||
37 | typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long); | ||
38 | |||
39 | /* Size above which to consider TLB games for performance */ | ||
40 | #define LARGE_COPY_CUTOFF 2048 | ||
41 | |||
42 | /* Communicate to the simulator what we are trying to do. */ | ||
43 | #define sim_allow_multiple_caching(b) \ | ||
44 | __insn_mtspr(SPR_SIM_CONTROL, \ | ||
45 | SIM_CONTROL_ALLOW_MULTIPLE_CACHING | ((b) << _SIM_CONTROL_OPERATOR_BITS)) | ||
46 | |||
47 | /* | ||
48 | * Copy memory by briefly enabling incoherent cacheline-at-a-time mode. | ||
49 | * | ||
50 | * We set up our own source and destination PTEs that we fully control. | ||
51 | * This is the only way to guarantee that we don't race with another | ||
52 | * thread that is modifying the PTE; we can't afford to try the | ||
53 | * copy_{to,from}_user() technique of catching the interrupt, since | ||
54 | * we must run with interrupts disabled to avoid the risk of some | ||
55 | * other code seeing the incoherent data in our cache. (Recall that | ||
56 | * our cache is indexed by PA, so even if the other code doesn't use | ||
57 | * our KM_MEMCPY virtual addresses, its accesses will still hit in | ||
58 | * cache using the normal VAs that aren't supposed to hit in cache.) | ||
59 | */ | ||
60 | static void memcpy_multicache(void *dest, const void *source, | ||
61 | pte_t dst_pte, pte_t src_pte, int len) | ||
62 | { | ||
63 | int idx; | ||
64 | unsigned long flags, newsrc, newdst; | ||
65 | pmd_t *pmdp; | ||
66 | pte_t *ptep; | ||
67 | int cpu = get_cpu(); | ||
68 | |||
69 | /* | ||
70 | * Disable interrupts so that we don't recurse into memcpy() | ||
71 | * in an interrupt handler, nor accidentally reference | ||
72 | * the PA of the source from an interrupt routine. Also | ||
73 | * notify the simulator that we're playing games so we don't | ||
74 | * generate spurious coherency warnings. | ||
75 | */ | ||
76 | local_irq_save(flags); | ||
77 | sim_allow_multiple_caching(1); | ||
78 | |||
79 | /* Set up the new dest mapping */ | ||
80 | idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + KM_MEMCPY0; | ||
81 | newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1)); | ||
82 | pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst); | ||
83 | ptep = pte_offset_kernel(pmdp, newdst); | ||
84 | if (pte_val(*ptep) != pte_val(dst_pte)) { | ||
85 | set_pte(ptep, dst_pte); | ||
86 | local_flush_tlb_page(NULL, newdst, PAGE_SIZE); | ||
87 | } | ||
88 | |||
89 | /* Set up the new source mapping */ | ||
90 | idx += (KM_MEMCPY0 - KM_MEMCPY1); | ||
91 | src_pte = hv_pte_set_nc(src_pte); | ||
92 | src_pte = hv_pte_clear_writable(src_pte); /* be paranoid */ | ||
93 | newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1)); | ||
94 | pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc); | ||
95 | ptep = pte_offset_kernel(pmdp, newsrc); | ||
96 | *ptep = src_pte; /* set_pte() would be confused by this */ | ||
97 | local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); | ||
98 | |||
99 | /* Actually move the data. */ | ||
100 | __memcpy_asm((void *)newdst, (const void *)newsrc, len); | ||
101 | |||
102 | /* | ||
103 | * Remap the source as locally-cached and not OLOC'ed so that | ||
104 | * we can inval without also invaling the remote cpu's cache. | ||
105 | * This also avoids known errata with inv'ing cacheable oloc data. | ||
106 | */ | ||
107 | src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3); | ||
108 | src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */ | ||
109 | *ptep = src_pte; /* set_pte() would be confused by this */ | ||
110 | local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); | ||
111 | |||
112 | /* | ||
113 | * Do the actual invalidation, covering the full L2 cache line | ||
114 | * at the end since __memcpy_asm() is somewhat aggressive. | ||
115 | */ | ||
116 | __inv_buffer((void *)newsrc, len); | ||
117 | |||
118 | /* | ||
119 | * We're done: notify the simulator that all is back to normal, | ||
120 | * and re-enable interrupts and pre-emption. | ||
121 | */ | ||
122 | sim_allow_multiple_caching(0); | ||
123 | local_irq_restore(flags); | ||
124 | put_cpu(); | ||
125 | } | ||
126 | |||
127 | /* | ||
128 | * Identify large copies from remotely-cached memory, and copy them | ||
129 | * via memcpy_multicache() if they look good, otherwise fall back | ||
130 | * to the particular kind of copying passed as the memcpy_t function. | ||
131 | */ | ||
132 | static unsigned long fast_copy(void *dest, const void *source, int len, | ||
133 | memcpy_t func) | ||
134 | { | ||
135 | /* | ||
136 | * Check if it's big enough to bother with. We may end up doing a | ||
137 | * small copy via TLB manipulation if we're near a page boundary, | ||
138 | * but presumably we'll make it up when we hit the second page. | ||
139 | */ | ||
140 | while (len >= LARGE_COPY_CUTOFF) { | ||
141 | int copy_size, bytes_left_on_page; | ||
142 | pte_t *src_ptep, *dst_ptep; | ||
143 | pte_t src_pte, dst_pte; | ||
144 | struct page *src_page, *dst_page; | ||
145 | |||
146 | /* Is the source page oloc'ed to a remote cpu? */ | ||
147 | retry_source: | ||
148 | src_ptep = virt_to_pte(current->mm, (unsigned long)source); | ||
149 | if (src_ptep == NULL) | ||
150 | break; | ||
151 | src_pte = *src_ptep; | ||
152 | if (!hv_pte_get_present(src_pte) || | ||
153 | !hv_pte_get_readable(src_pte) || | ||
154 | hv_pte_get_mode(src_pte) != HV_PTE_MODE_CACHE_TILE_L3) | ||
155 | break; | ||
156 | if (get_remote_cache_cpu(src_pte) == smp_processor_id()) | ||
157 | break; | ||
158 | src_page = pfn_to_page(hv_pte_get_pfn(src_pte)); | ||
159 | get_page(src_page); | ||
160 | if (pte_val(src_pte) != pte_val(*src_ptep)) { | ||
161 | put_page(src_page); | ||
162 | goto retry_source; | ||
163 | } | ||
164 | if (pte_huge(src_pte)) { | ||
165 | /* Adjust the PTE to correspond to a small page */ | ||
166 | int pfn = hv_pte_get_pfn(src_pte); | ||
167 | pfn += (((unsigned long)source & (HPAGE_SIZE-1)) | ||
168 | >> PAGE_SHIFT); | ||
169 | src_pte = pfn_pte(pfn, src_pte); | ||
170 | src_pte = pte_mksmall(src_pte); | ||
171 | } | ||
172 | |||
173 | /* Is the destination page writable? */ | ||
174 | retry_dest: | ||
175 | dst_ptep = virt_to_pte(current->mm, (unsigned long)dest); | ||
176 | if (dst_ptep == NULL) { | ||
177 | put_page(src_page); | ||
178 | break; | ||
179 | } | ||
180 | dst_pte = *dst_ptep; | ||
181 | if (!hv_pte_get_present(dst_pte) || | ||
182 | !hv_pte_get_writable(dst_pte)) { | ||
183 | put_page(src_page); | ||
184 | break; | ||
185 | } | ||
186 | dst_page = pfn_to_page(hv_pte_get_pfn(dst_pte)); | ||
187 | if (dst_page == src_page) { | ||
188 | /* | ||
189 | * Source and dest are on the same page; this | ||
190 | * potentially exposes us to incoherence if any | ||
191 | * part of src and dest overlap on a cache line. | ||
192 | * Just give up rather than trying to be precise. | ||
193 | */ | ||
194 | put_page(src_page); | ||
195 | break; | ||
196 | } | ||
197 | get_page(dst_page); | ||
198 | if (pte_val(dst_pte) != pte_val(*dst_ptep)) { | ||
199 | put_page(dst_page); | ||
200 | goto retry_dest; | ||
201 | } | ||
202 | if (pte_huge(dst_pte)) { | ||
203 | /* Adjust the PTE to correspond to a small page */ | ||
204 | int pfn = hv_pte_get_pfn(dst_pte); | ||
205 | pfn += (((unsigned long)dest & (HPAGE_SIZE-1)) | ||
206 | >> PAGE_SHIFT); | ||
207 | dst_pte = pfn_pte(pfn, dst_pte); | ||
208 | dst_pte = pte_mksmall(dst_pte); | ||
209 | } | ||
210 | |||
211 | /* All looks good: create a cacheable PTE and copy from it */ | ||
212 | copy_size = len; | ||
213 | bytes_left_on_page = | ||
214 | PAGE_SIZE - (((int)source) & (PAGE_SIZE-1)); | ||
215 | if (copy_size > bytes_left_on_page) | ||
216 | copy_size = bytes_left_on_page; | ||
217 | bytes_left_on_page = | ||
218 | PAGE_SIZE - (((int)dest) & (PAGE_SIZE-1)); | ||
219 | if (copy_size > bytes_left_on_page) | ||
220 | copy_size = bytes_left_on_page; | ||
221 | memcpy_multicache(dest, source, dst_pte, src_pte, copy_size); | ||
222 | |||
223 | /* Release the pages */ | ||
224 | put_page(dst_page); | ||
225 | put_page(src_page); | ||
226 | |||
227 | /* Continue on the next page */ | ||
228 | dest += copy_size; | ||
229 | source += copy_size; | ||
230 | len -= copy_size; | ||
231 | } | ||
232 | |||
233 | return func(dest, source, len); | ||
234 | } | ||
235 | |||
236 | void *memcpy(void *to, const void *from, __kernel_size_t n) | ||
237 | { | ||
238 | if (n < LARGE_COPY_CUTOFF) | ||
239 | return (void *)__memcpy_asm(to, from, n); | ||
240 | else | ||
241 | return (void *)fast_copy(to, from, n, __memcpy_asm); | ||
242 | } | ||
243 | |||
244 | unsigned long __copy_to_user_inatomic(void __user *to, const void *from, | ||
245 | unsigned long n) | ||
246 | { | ||
247 | if (n < LARGE_COPY_CUTOFF) | ||
248 | return __copy_to_user_inatomic_asm(to, from, n); | ||
249 | else | ||
250 | return fast_copy(to, from, n, __copy_to_user_inatomic_asm); | ||
251 | } | ||
252 | |||
253 | unsigned long __copy_from_user_inatomic(void *to, const void __user *from, | ||
254 | unsigned long n) | ||
255 | { | ||
256 | if (n < LARGE_COPY_CUTOFF) | ||
257 | return __copy_from_user_inatomic_asm(to, from, n); | ||
258 | else | ||
259 | return fast_copy(to, from, n, __copy_from_user_inatomic_asm); | ||
260 | } | ||
261 | |||
262 | unsigned long __copy_from_user_zeroing(void *to, const void __user *from, | ||
263 | unsigned long n) | ||
264 | { | ||
265 | if (n < LARGE_COPY_CUTOFF) | ||
266 | return __copy_from_user_zeroing_asm(to, from, n); | ||
267 | else | ||
268 | return fast_copy(to, from, n, __copy_from_user_zeroing_asm); | ||
269 | } | ||
270 | |||
271 | #endif /* !CHIP_HAS_COHERENT_LOCAL_CACHE() */ | ||
diff --git a/arch/tile/lib/memmove_32.c b/arch/tile/lib/memmove_32.c new file mode 100644 index 00000000000..fd615ae6ade --- /dev/null +++ b/arch/tile/lib/memmove_32.c | |||
@@ -0,0 +1,63 @@ | |||
1 | /* | ||
2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/types.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/module.h> | ||
18 | |||
19 | void *memmove(void *dest, const void *src, size_t n) | ||
20 | { | ||
21 | if ((const char *)src >= (char *)dest + n | ||
22 | || (char *)dest >= (const char *)src + n) { | ||
23 | /* We found no overlap, so let memcpy do all the heavy | ||
24 | * lifting (prefetching, etc.) | ||
25 | */ | ||
26 | return memcpy(dest, src, n); | ||
27 | } | ||
28 | |||
29 | if (n != 0) { | ||
30 | const uint8_t *in; | ||
31 | uint8_t x; | ||
32 | uint8_t *out; | ||
33 | int stride; | ||
34 | |||
35 | if (src < dest) { | ||
36 | /* copy backwards */ | ||
37 | in = (const uint8_t *)src + n - 1; | ||
38 | out = (uint8_t *)dest + n - 1; | ||
39 | stride = -1; | ||
40 | } else { | ||
41 | /* copy forwards */ | ||
42 | in = (const uint8_t *)src; | ||
43 | out = (uint8_t *)dest; | ||
44 | stride = 1; | ||
45 | } | ||
46 | |||
47 | /* Manually software-pipeline this loop. */ | ||
48 | x = *in; | ||
49 | in += stride; | ||
50 | |||
51 | while (--n != 0) { | ||
52 | *out = x; | ||
53 | out += stride; | ||
54 | x = *in; | ||
55 | in += stride; | ||
56 | } | ||
57 | |||
58 | *out = x; | ||
59 | } | ||
60 | |||
61 | return dest; | ||
62 | } | ||
63 | EXPORT_SYMBOL(memmove); | ||
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c new file mode 100644 index 00000000000..bfde5d864df --- /dev/null +++ b/arch/tile/lib/memset_32.c | |||
@@ -0,0 +1,275 @@ | |||
1 | /* | ||
2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <arch/chip.h> | ||
16 | |||
17 | #include <linux/types.h> | ||
18 | #include <linux/string.h> | ||
19 | #include <linux/module.h> | ||
20 | |||
21 | |||
22 | void *memset(void *s, int c, size_t n) | ||
23 | { | ||
24 | uint32_t *out32; | ||
25 | int n32; | ||
26 | uint32_t v16, v32; | ||
27 | uint8_t *out8 = s; | ||
28 | #if !CHIP_HAS_WH64() | ||
29 | int ahead32; | ||
30 | #else | ||
31 | int to_align32; | ||
32 | #endif | ||
33 | |||
34 | /* Experimentation shows that a trivial tight loop is fastest up until | ||
35 | * around a size of 20, where writing a word at a time starts to win. | ||
36 | */ | ||
37 | #define BYTE_CUTOFF 20 | ||
38 | |||
39 | #if BYTE_CUTOFF < 3 | ||
40 | /* This must be at least this big, or some code later | ||
41 | * on doesn't work. | ||
42 | */ | ||
43 | #error "BYTE_CUTOFF is too small" | ||
44 | #endif | ||
45 | |||
46 | if (n < BYTE_CUTOFF) { | ||
47 | /* Strangely, this turns out to be the tightest way to | ||
48 | * write this loop. | ||
49 | */ | ||
50 | if (n != 0) { | ||
51 | do { | ||
52 | /* Strangely, combining these into one line | ||
53 | * performs worse. | ||
54 | */ | ||
55 | *out8 = c; | ||
56 | out8++; | ||
57 | } while (--n != 0); | ||
58 | } | ||
59 | |||
60 | return s; | ||
61 | } | ||
62 | |||
63 | #if !CHIP_HAS_WH64() | ||
64 | /* Use a spare issue slot to start prefetching the first cache | ||
65 | * line early. This instruction is free since the prefetch can be buried | ||
66 | * in otherwise idle issue slots doing ALU ops. | ||
67 | */ | ||
68 | __insn_prefetch(out8); | ||
69 | |||
70 | /* We prefetch the end so that a short memset that spans two cache | ||
71 | * lines gets some prefetching benefit. Again we believe this is free | ||
72 | * to issue. | ||
73 | */ | ||
74 | __insn_prefetch(&out8[n - 1]); | ||
75 | #endif /* !CHIP_HAS_WH64() */ | ||
76 | |||
77 | |||
78 | /* Align 'out8'. We know n >= 3 so this won't write past the end. */ | ||
79 | while (((uintptr_t) out8 & 3) != 0) { | ||
80 | *out8++ = c; | ||
81 | --n; | ||
82 | } | ||
83 | |||
84 | /* Align 'n'. */ | ||
85 | while (n & 3) | ||
86 | out8[--n] = c; | ||
87 | |||
88 | out32 = (uint32_t *) out8; | ||
89 | n32 = n >> 2; | ||
90 | |||
91 | /* Tile input byte out to 32 bits. */ | ||
92 | v16 = __insn_intlb(c, c); | ||
93 | v32 = __insn_intlh(v16, v16); | ||
94 | |||
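A portable C equivalent of the intlb/intlh pair (a sketch; the intrinsics interleave bytes and halfwords, and applying each to two copies of the same value replicates it):

    v16 = (uint8_t)c * 0x0101u;     /* c in both bytes of a halfword */
    v32 = v16 * 0x00010001u;        /* ...and in all four bytes */
    /* net effect: v32 == 0x01010101u * (uint8_t)c */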
95 | /* This must be at least 8 or the following loop doesn't work. */ | ||
96 | #define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4) | ||
97 | |||
98 | #if !CHIP_HAS_WH64() | ||
99 | |||
100 | ahead32 = CACHE_LINE_SIZE_IN_WORDS; | ||
101 | |||
102 | /* We already prefetched the first and last cache lines, so | ||
103 | * we only need to do more prefetching if we are storing | ||
104 | * to more than two cache lines. | ||
105 | */ | ||
106 | if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) { | ||
107 | int i; | ||
108 | |||
109 | /* Prefetch the next several cache lines. | ||
110 | * This is the setup code for the software-pipelined | ||
111 | * loop below. | ||
112 | */ | ||
113 | #define MAX_PREFETCH 5 | ||
114 | ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS; | ||
115 | if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS) | ||
116 | ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS; | ||
117 | |||
118 | for (i = CACHE_LINE_SIZE_IN_WORDS; | ||
119 | i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS) | ||
120 | __insn_prefetch(&out32[i]); | ||
121 | } | ||
122 | |||
123 | if (n32 > ahead32) { | ||
124 | while (1) { | ||
125 | int j; | ||
126 | |||
127 | /* Prefetch by reading one word several cache lines | ||
128 | * ahead. Since loads are non-blocking this will | ||
129 | * cause the full cache line to be read while we are | ||
130 | * finishing earlier cache lines. Using a store | ||
131 | * here causes microarchitectural performance | ||
132 | * problems where a victimizing store miss goes to | ||
133 | * the head of the retry FIFO and locks the pipe for | ||
134 | * a few cycles. So a few subsequent stores in this | ||
135 | * loop go into the retry FIFO, and then later | ||
136 | * stores see other stores to the same cache line | ||
137 | * are already in the retry FIFO and themselves go | ||
138 | * into the retry FIFO, filling it up and grinding | ||
139 | * to a halt waiting for the original miss to be | ||
140 | * satisfied. | ||
141 | */ | ||
142 | __insn_prefetch(&out32[ahead32]); | ||
143 | |||
144 | #if 1 | ||
145 | #if CACHE_LINE_SIZE_IN_WORDS % 4 != 0 | ||
146 | #error "Unhandled CACHE_LINE_SIZE_IN_WORDS" | ||
147 | #endif | ||
148 | |||
149 | n32 -= CACHE_LINE_SIZE_IN_WORDS; | ||
150 | |||
151 | /* Save icache space by only partially unrolling | ||
152 | * this loop. | ||
153 | */ | ||
154 | for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) { | ||
155 | *out32++ = v32; | ||
156 | *out32++ = v32; | ||
157 | *out32++ = v32; | ||
158 | *out32++ = v32; | ||
159 | } | ||
160 | #else | ||
161 | /* Unfortunately, due to a code generator flaw this | ||
162 | * allocates a separate register for each of these | ||
163 | * stores, which requires a large number of spills and | ||
164 | * makes this procedure enormously bigger | ||
165 | * (by something like 70%). | ||
166 | */ | ||
167 | *out32++ = v32; | ||
168 | *out32++ = v32; | ||
169 | *out32++ = v32; | ||
170 | *out32++ = v32; | ||
171 | *out32++ = v32; | ||
172 | *out32++ = v32; | ||
173 | *out32++ = v32; | ||
174 | *out32++ = v32; | ||
175 | *out32++ = v32; | ||
176 | *out32++ = v32; | ||
177 | *out32++ = v32; | ||
178 | *out32++ = v32; | ||
179 | *out32++ = v32; | ||
180 | *out32++ = v32; | ||
181 | *out32++ = v32; | ||
182 | n32 -= 16; | ||
183 | #endif | ||
184 | |||
185 | /* To save compiled code size, reuse this loop even | ||
186 | * when we run out of prefetching to do, simply by | ||
187 | * dropping ahead32 down. | ||
188 | */ | ||
189 | if (n32 <= ahead32) { | ||
190 | /* Not even a full cache line left, | ||
191 | * so stop now. | ||
192 | */ | ||
193 | if (n32 < CACHE_LINE_SIZE_IN_WORDS) | ||
194 | break; | ||
195 | |||
196 | /* Choose a small enough value that we don't | ||
197 | * prefetch past the end. There's no sense | ||
198 | * in touching cache lines we don't have to. | ||
199 | */ | ||
200 | ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1; | ||
201 | } | ||
202 | } | ||
203 | } | ||
204 | |||
205 | #else /* CHIP_HAS_WH64() */ | ||
206 | |||
207 | /* Determine how many words we need to emit before the 'out32' | ||
208 | * pointer becomes aligned modulo the cache line size. | ||
209 | */ | ||
210 | to_align32 = | ||
211 | (-((uintptr_t)out32 >> 2)) & (CACHE_LINE_SIZE_IN_WORDS - 1); | ||
212 | |||
213 | /* Only bother aligning and using wh64 if there is at least | ||
214 | * one full cache line to process. This check also prevents | ||
215 | * overrunning the end of the buffer with alignment words. | ||
216 | */ | ||
217 | if (to_align32 <= n32 - CACHE_LINE_SIZE_IN_WORDS) { | ||
218 | int lines_left; | ||
219 | |||
220 | /* Align out32 mod the cache line size so we can use wh64. */ | ||
221 | n32 -= to_align32; | ||
222 | for (; to_align32 != 0; to_align32--) { | ||
223 | *out32 = v32; | ||
224 | out32++; | ||
225 | } | ||
226 | |||
227 | /* Use unsigned divide to turn this into a right shift. */ | ||
228 | lines_left = (unsigned)n32 / CACHE_LINE_SIZE_IN_WORDS; | ||
229 | |||
230 | do { | ||
231 | /* Only wh64 a few lines at a time, so we don't | ||
232 | * exceed the maximum number of victim lines. | ||
233 | */ | ||
234 | int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS()) | ||
235 | ? lines_left | ||
236 | : CHIP_MAX_OUTSTANDING_VICTIMS()); | ||
237 | uint32_t *wh = out32; | ||
238 | int i = x; | ||
239 | int j; | ||
240 | |||
241 | lines_left -= x; | ||
242 | |||
243 | do { | ||
244 | __insn_wh64(wh); | ||
245 | wh += CACHE_LINE_SIZE_IN_WORDS; | ||
246 | } while (--i); | ||
247 | |||
248 | for (j = x * (CACHE_LINE_SIZE_IN_WORDS / 4); | ||
249 | j != 0; j--) { | ||
250 | *out32++ = v32; | ||
251 | *out32++ = v32; | ||
252 | *out32++ = v32; | ||
253 | *out32++ = v32; | ||
254 | } | ||
255 | } while (lines_left != 0); | ||
256 | |||
257 | /* We processed all full lines above, so only this many | ||
258 | * words remain to be processed. | ||
259 | */ | ||
260 | n32 &= CACHE_LINE_SIZE_IN_WORDS - 1; | ||
261 | } | ||
262 | |||
263 | #endif /* CHIP_HAS_WH64() */ | ||
264 | |||
265 | /* Now handle any leftover values. */ | ||
266 | if (n32 != 0) { | ||
267 | do { | ||
268 | *out32 = v32; | ||
269 | out32++; | ||
270 | } while (--n32 != 0); | ||
271 | } | ||
272 | |||
273 | return s; | ||
274 | } | ||
275 | EXPORT_SYMBOL(memset); | ||
diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c new file mode 100644 index 00000000000..485e24d62c6 --- /dev/null +++ b/arch/tile/lib/spinlock_32.c | |||
@@ -0,0 +1,221 @@ | |||
1 | /* | ||
2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/spinlock.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <asm/processor.h> | ||
18 | |||
19 | #include "spinlock_common.h" | ||
20 | |||
21 | void arch_spin_lock(arch_spinlock_t *lock) | ||
22 | { | ||
23 | int my_ticket; | ||
24 | int iterations = 0; | ||
25 | int delta; | ||
26 | |||
27 | while ((my_ticket = __insn_tns((void *)&lock->next_ticket)) & 1) | ||
28 | delay_backoff(iterations++); | ||
29 | |||
30 | /* Increment the next ticket number, implicitly releasing tns lock. */ | ||
31 | lock->next_ticket = my_ticket + TICKET_QUANTUM; | ||
32 | |||
33 | /* Wait until it's our turn. */ | ||
34 | while ((delta = my_ticket - lock->current_ticket) != 0) | ||
35 | relax((128 / CYCLES_PER_RELAX_LOOP) * delta); | ||
36 | } | ||
37 | EXPORT_SYMBOL(arch_spin_lock); | ||
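A rough C model of the ticket protocol (a sketch only; it assumes TICKET_QUANTUM is 2 so that ticket values stay even and can never be mistaken for the low marker bit that tns leaves behind, and the real code must of course use __insn_tns rather than plain loads and stores):

    for (;;) {
            int t = test_and_set(&lock->next_ticket);  /* tns: returns old value, writes 1 */
            if (!(t & 1)) {                            /* low bit clear: we own the word */
                    lock->next_ticket = t + 2;         /* take ticket t, release the word */
                    while (lock->current_ticket != t)
                            ;                          /* spin until it's our turn */
                    return;
            }
            /* someone else is mid-tns: back off and retry */
    }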
38 | |||
39 | int arch_spin_trylock(arch_spinlock_t *lock) | ||
40 | { | ||
41 | /* | ||
42 | * Grab a ticket; no need to retry if it's busy: we'll just | ||
43 | * treat that the same as "locked", since someone else | ||
44 | * will lock it momentarily anyway. | ||
45 | */ | ||
46 | int my_ticket = __insn_tns((void *)&lock->next_ticket); | ||
47 | |||
48 | if (my_ticket == lock->current_ticket) { | ||
49 | /* Not currently locked, so lock it by keeping this ticket. */ | ||
50 | lock->next_ticket = my_ticket + TICKET_QUANTUM; | ||
51 | /* Success! */ | ||
52 | return 1; | ||
53 | } | ||
54 | |||
55 | if (!(my_ticket & 1)) { | ||
56 | /* Release next_ticket. */ | ||
57 | lock->next_ticket = my_ticket; | ||
58 | } | ||
59 | |||
60 | return 0; | ||
61 | } | ||
62 | EXPORT_SYMBOL(arch_spin_trylock); | ||
63 | |||
64 | void arch_spin_unlock_wait(arch_spinlock_t *lock) | ||
65 | { | ||
66 | u32 iterations = 0; | ||
67 | while (arch_spin_is_locked(lock)) | ||
68 | delay_backoff(iterations++); | ||
69 | } | ||
70 | EXPORT_SYMBOL(arch_spin_unlock_wait); | ||
71 | |||
72 | /* | ||
73 | * The low byte is always reserved to be the marker for a "tns" operation | ||
74 | * since the low bit is set to "1" by a tns. The next seven bits are | ||
75 | * zeroes. The next byte holds the "next" writer value, i.e. the ticket | ||
76 | * available for the next task that wants to write. The third byte holds | ||
77 | * the current writer value, i.e. the writer who holds the current ticket. | ||
78 | * If current == next == 0, there are no interested writers. | ||
79 | */ | ||
80 | #define WR_NEXT_SHIFT _WR_NEXT_SHIFT | ||
81 | #define WR_CURR_SHIFT _WR_CURR_SHIFT | ||
82 | #define WR_WIDTH _WR_WIDTH | ||
83 | #define WR_MASK ((1 << WR_WIDTH) - 1) | ||
84 | |||
85 | /* | ||
86 | * The last eight bits hold the active reader count. This has to be | ||
87 | * zero before a writer can start to write. | ||
88 | */ | ||
89 | #define RD_COUNT_SHIFT _RD_COUNT_SHIFT | ||
90 | #define RD_COUNT_WIDTH _RD_COUNT_WIDTH | ||
91 | #define RD_COUNT_MASK ((1 << RD_COUNT_WIDTH) - 1) | ||
92 | |||
93 | |||
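Decoded field by field, a lock word looks like this (a sketch; the concrete shifts come from the _WR_*/_RD_* constants, which per the layout above put the fields at bits 8, 16, and 24 with 8-bit widths):

    unsigned tns_busy = val & 1;                           /* a tns is in flight */
    unsigned wr_next  = (val >> WR_NEXT_SHIFT) & WR_MASK;  /* next writer ticket */
    unsigned wr_curr  = (val >> WR_CURR_SHIFT) & WR_MASK;  /* current writer ticket */
    unsigned readers  = (val >> RD_COUNT_SHIFT) & RD_COUNT_MASK;  /* active readers */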
94 | /* Lock the word, spinning until there are no tns-ers. */ | ||
95 | static inline u32 get_rwlock(arch_rwlock_t *rwlock) | ||
96 | { | ||
97 | u32 iterations = 0; | ||
98 | for (;;) { | ||
99 | u32 val = __insn_tns((int *)&rwlock->lock); | ||
100 | if (unlikely(val & 1)) { | ||
101 | delay_backoff(iterations++); | ||
102 | continue; | ||
103 | } | ||
104 | return val; | ||
105 | } | ||
106 | } | ||
107 | |||
108 | int arch_read_trylock_slow(arch_rwlock_t *rwlock) | ||
109 | { | ||
110 | u32 val = get_rwlock(rwlock); | ||
111 | int locked = (val << RD_COUNT_WIDTH) == 0; | ||
112 | rwlock->lock = val + (locked << RD_COUNT_SHIFT); | ||
113 | return locked; | ||
114 | } | ||
115 | EXPORT_SYMBOL(arch_read_trylock_slow); | ||
116 | |||
117 | void arch_read_unlock_slow(arch_rwlock_t *rwlock) | ||
118 | { | ||
119 | u32 val = get_rwlock(rwlock); | ||
120 | rwlock->lock = val - (1 << RD_COUNT_SHIFT); | ||
121 | } | ||
122 | EXPORT_SYMBOL(arch_read_unlock_slow); | ||
123 | |||
124 | void arch_write_unlock_slow(arch_rwlock_t *rwlock, u32 val) | ||
125 | { | ||
126 | u32 eq, mask = 1 << WR_CURR_SHIFT; | ||
127 | while (unlikely(val & 1)) { | ||
128 | /* Limited backoff since we are the highest-priority task. */ | ||
129 | relax(4); | ||
130 | val = __insn_tns((int *)&rwlock->lock); | ||
131 | } | ||
132 | val = __insn_addb(val, mask); | ||
133 | eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT)); | ||
134 | val = __insn_mz(eq & mask, val); | ||
135 | rwlock->lock = val; | ||
136 | } | ||
137 | EXPORT_SYMBOL(arch_write_unlock_slow); | ||
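The SIMD byte ops above compute, in effect (a C model, sketch only): increment the current-writer byte; if it has caught up with the next-writer byte there are no waiting writers, so reset the whole word to zero (the reader count is necessarily zero here, since a writer held the lock).

    u32 curr = ((val >> WR_CURR_SHIFT) + 1) & WR_MASK;     /* addb */
    u32 next = (val >> WR_NEXT_SHIFT) & WR_MASK;
    if (curr == next)                                      /* seqb + mz */
            val = 0;
    else
            val = (val & ~(WR_MASK << WR_CURR_SHIFT)) | (curr << WR_CURR_SHIFT);
    rwlock->lock = val;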
138 | |||
139 | /* | ||
140 | * We spin until everything but the reader bits (which are in the high | ||
141 | * part of the word) are zero, i.e. no active or waiting writers, no tns. | ||
142 | * | ||
143 | * ISSUE: This approach can permanently starve readers. A reader who sees | ||
144 | * a writer could instead take a ticket lock (just like a writer would), | ||
145 | * and atomically enter read mode (with 1 reader) when it gets the ticket. | ||
146 | * This way both readers and writers will always make forward progress | ||
147 | * in a finite time. | ||
148 | */ | ||
149 | void arch_read_lock_slow(arch_rwlock_t *rwlock, u32 val) | ||
150 | { | ||
151 | u32 iterations = 0; | ||
152 | do { | ||
153 | if (!(val & 1)) | ||
154 | rwlock->lock = val; | ||
155 | delay_backoff(iterations++); | ||
156 | val = __insn_tns((int *)&rwlock->lock); | ||
157 | } while ((val << RD_COUNT_WIDTH) != 0); | ||
158 | rwlock->lock = val + (1 << RD_COUNT_SHIFT); | ||
159 | } | ||
160 | EXPORT_SYMBOL(arch_read_lock_slow); | ||
161 | |||
162 | void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val) | ||
163 | { | ||
164 | /* | ||
165 | * The trailing underscore on this variable (and curr_ below) | ||
166 | * reminds us that the high bits are garbage; we mask them out | ||
167 | * when we compare them. | ||
168 | */ | ||
169 | u32 my_ticket_; | ||
170 | |||
171 | /* Take out the next ticket; this will also stop would-be readers. */ | ||
172 | if (val & 1) | ||
173 | val = get_rwlock(rwlock); | ||
174 | rwlock->lock = __insn_addb(val, 1 << WR_NEXT_SHIFT); | ||
175 | |||
176 | /* Extract my ticket value from the original word. */ | ||
177 | my_ticket_ = val >> WR_NEXT_SHIFT; | ||
178 | |||
179 | /* | ||
180 | * Wait until the "current" field matches our ticket, and | ||
181 | * there are no remaining readers. | ||
182 | */ | ||
183 | for (;;) { | ||
184 | u32 curr_ = val >> WR_CURR_SHIFT; | ||
185 | u32 readers = val >> RD_COUNT_SHIFT; | ||
186 | u32 delta = ((my_ticket_ - curr_) & WR_MASK) + !!readers; | ||
187 | if (likely(delta == 0)) | ||
188 | break; | ||
189 | |||
190 | /* Delay based on how many lock-holders are still out there. */ | ||
191 | relax((256 / CYCLES_PER_RELAX_LOOP) * delta); | ||
192 | |||
193 | /* | ||
194 | * Get a non-tns value to check; we don't need to tns | ||
195 | * it ourselves. Since we're not tns'ing, we retry | ||
196 | * more rapidly to get a valid value. | ||
197 | */ | ||
198 | while ((val = rwlock->lock) & 1) | ||
199 | relax(4); | ||
200 | } | ||
201 | } | ||
202 | EXPORT_SYMBOL(arch_write_lock_slow); | ||
203 | |||
204 | int __tns_atomic_acquire(atomic_t *lock) | ||
205 | { | ||
206 | int ret; | ||
207 | u32 iterations = 0; | ||
208 | |||
209 | BUG_ON(__insn_mfspr(SPR_INTERRUPT_CRITICAL_SECTION)); | ||
210 | __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1); | ||
211 | |||
212 | while ((ret = __insn_tns((void *)&lock->counter)) == 1) | ||
213 | delay_backoff(iterations++); | ||
214 | return ret; | ||
215 | } | ||
216 | |||
217 | void __tns_atomic_release(atomic_t *p, int v) | ||
218 | { | ||
219 | p->counter = v; | ||
220 | __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); | ||
221 | } | ||
diff --git a/arch/tile/lib/spinlock_common.h b/arch/tile/lib/spinlock_common.h new file mode 100644 index 00000000000..c1010980913 --- /dev/null +++ b/arch/tile/lib/spinlock_common.h | |||
@@ -0,0 +1,64 @@ | |||
1 | /* | ||
2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | * This file is included into spinlock_32.c or _64.c. | ||
14 | */ | ||
15 | |||
16 | /* | ||
17 | * The mfspr in relax() is 5 or 6 cycles plus 2 for loop | ||
18 | * overhead. | ||
19 | */ | ||
20 | #ifdef __tilegx__ | ||
21 | #define CYCLES_PER_RELAX_LOOP 7 | ||
22 | #else | ||
23 | #define CYCLES_PER_RELAX_LOOP 8 | ||
24 | #endif | ||
25 | |||
26 | /* | ||
27 | * Idle the core for CYCLES_PER_RELAX_LOOP * iterations cycles. | ||
28 | */ | ||
29 | static inline void | ||
30 | relax(int iterations) | ||
31 | { | ||
32 | for (/*above*/; iterations > 0; iterations--) | ||
33 | __insn_mfspr(SPR_PASS); | ||
34 | barrier(); | ||
35 | } | ||
36 | |||
37 | /* Perform bounded exponential backoff. */ | ||
38 | static void delay_backoff(int iterations) | ||
39 | { | ||
40 | u32 exponent, loops; | ||
41 | |||
42 | /* | ||
43 | * 2^exponent is how many times we go around the loop, | ||
44 | * which takes 8 cycles per iteration. We want to start with | ||
45 | * a 16- to 31-cycle loop, so we need to go around a minimum | ||
46 | * of 2 = 2^1 times; hence we bias the original value up by 1. | ||
47 | */ | ||
48 | exponent = iterations + 1; | ||
49 | |||
50 | /* | ||
51 | * Don't allow exponent to exceed 8, so we have at most 256 loops, | ||
52 | * or 2,048 (to 4,095) cycles, as our maximum. | ||
53 | */ | ||
54 | if (exponent > 8) | ||
55 | exponent = 8; | ||
56 | |||
57 | loops = 1 << exponent; | ||
58 | |||
59 | /* Add a randomness factor so two cpus never get in lock step. */ | ||
60 | loops += __insn_crc32_32(stack_pointer, get_cycles_low()) & | ||
61 | (loops - 1); | ||
62 | |||
63 | relax(loops); | ||
64 | } | ||
diff --git a/arch/tile/lib/strchr_32.c b/arch/tile/lib/strchr_32.c new file mode 100644 index 00000000000..c94e6f7ae7b --- /dev/null +++ b/arch/tile/lib/strchr_32.c | |||
@@ -0,0 +1,66 @@ | |||
1 | /* | ||
2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/types.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/module.h> | ||
18 | |||
19 | #undef strchr | ||
20 | |||
21 | char *strchr(const char *s, int c) | ||
22 | { | ||
23 | int z, g; | ||
24 | |||
25 | /* Get an aligned pointer. */ | ||
26 | const uintptr_t s_int = (uintptr_t) s; | ||
27 | const uint32_t *p = (const uint32_t *)(s_int & -4); | ||
28 | |||
29 | /* Create four copies of the byte for which we are looking. */ | ||
30 | const uint32_t goal = 0x01010101 * (uint8_t) c; | ||
31 | |||
32 | /* Read the first aligned word, but force bytes before the string to | ||
33 | * match neither zero nor goal (we make sure the high bit of each | ||
34 | * byte is 1, and the low 7 bits are all the opposite of the goal | ||
35 | * byte). | ||
36 | * | ||
37 | * Note that this shift count expression works because we know shift | ||
38 | * counts are taken mod 32. | ||
39 | */ | ||
40 | const uint32_t before_mask = (1 << (s_int << 3)) - 1; | ||
41 | uint32_t v = (*p | before_mask) ^ (goal & __insn_shrib(before_mask, 1)); | ||
42 | |||
43 | uint32_t zero_matches, goal_matches; | ||
44 | while (1) { | ||
45 | /* Look for a terminating '\0'. */ | ||
46 | zero_matches = __insn_seqb(v, 0); | ||
47 | |||
48 | /* Look for the goal byte. */ | ||
49 | goal_matches = __insn_seqb(v, goal); | ||
50 | |||
51 | if (__builtin_expect(zero_matches | goal_matches, 0)) | ||
52 | break; | ||
53 | |||
54 | v = *++p; | ||
55 | } | ||
56 | |||
57 | z = __insn_ctz(zero_matches); | ||
58 | g = __insn_ctz(goal_matches); | ||
59 | |||
60 | /* If we found c before '\0' we got a match. Note that if c == '\0' | ||
61 | * then g == z, and we correctly return the address of the '\0' | ||
62 | * rather than NULL. | ||
63 | */ | ||
64 | return (g <= z) ? ((char *)p) + (g >> 3) : NULL; | ||
65 | } | ||
66 | EXPORT_SYMBOL(strchr); | ||
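A worked example of the masking trick (a sketch, assuming s_int & 3 == 2; the machine is little-endian, so the two bytes below the string are the low-order ones):

    uint32_t before_mask = (1u << (2 * 8)) - 1;   /* 0x0000ffff */
    /* *p | before_mask: the two low bytes are forced to 0xff */
    /* ^ (goal & shrib(before_mask, 1)): each such byte becomes
     * 0x80 | (~goal & 0x7f), which can equal neither 0 nor goal */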
diff --git a/arch/tile/lib/strlen_32.c b/arch/tile/lib/strlen_32.c new file mode 100644 index 00000000000..f26f88e11e4 --- /dev/null +++ b/arch/tile/lib/strlen_32.c | |||
@@ -0,0 +1,36 @@ | |||
1 | /* | ||
2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/types.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/module.h> | ||
18 | |||
19 | size_t strlen(const char *s) | ||
20 | { | ||
21 | /* Get an aligned pointer. */ | ||
22 | const uintptr_t s_int = (uintptr_t) s; | ||
23 | const uint32_t *p = (const uint32_t *)(s_int & -4); | ||
24 | |||
25 | /* Read the first word, but force bytes before the string to be nonzero. | ||
26 | * This expression works because we know shift counts are taken mod 32. | ||
27 | */ | ||
28 | uint32_t v = *p | ((1 << (s_int << 3)) - 1); | ||
29 | |||
30 | uint32_t bits; | ||
31 | while ((bits = __insn_seqb(v, 0)) == 0) | ||
32 | v = *++p; | ||
33 | |||
34 | return ((const char *)p) + (__insn_ctz(bits) >> 3) - s; | ||
35 | } | ||
36 | EXPORT_SYMBOL(strlen); | ||
diff --git a/arch/tile/lib/uaccess.c b/arch/tile/lib/uaccess.c new file mode 100644 index 00000000000..f8d398c9ee7 --- /dev/null +++ b/arch/tile/lib/uaccess.c | |||
@@ -0,0 +1,32 @@ | |||
1 | /* | ||
2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/uaccess.h> | ||
16 | #include <linux/module.h> | ||
17 | |||
18 | int __range_ok(unsigned long addr, unsigned long size) | ||
19 | { | ||
20 | unsigned long limit = current_thread_info()->addr_limit.seg; | ||
21 | return !((addr < limit && size <= limit - addr) || | ||
22 | is_arch_mappable_range(addr, size)); | ||
23 | } | ||
24 | EXPORT_SYMBOL(__range_ok); | ||
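Note the inverted sense: a return of zero means the range is acceptable. A caller-side sketch (the wrapper name here is hypothetical; the real access_ok-style macro lives in the uaccess headers):

    /* Hypothetical wrapper illustrating the inverted return value. */
    #define my_access_ok(addr, size) \
            (__range_ok((unsigned long)(addr), (size)) == 0)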
25 | |||
26 | #ifdef CONFIG_DEBUG_COPY_FROM_USER | ||
27 | void copy_from_user_overflow(void) | ||
28 | { | ||
29 | WARN(1, "Buffer overflow detected!\n"); | ||
30 | } | ||
31 | EXPORT_SYMBOL(copy_from_user_overflow); | ||
32 | #endif | ||
diff --git a/arch/tile/lib/usercopy_32.S b/arch/tile/lib/usercopy_32.S new file mode 100644 index 00000000000..979f76d8374 --- /dev/null +++ b/arch/tile/lib/usercopy_32.S | |||
@@ -0,0 +1,223 @@ | |||
1 | /* | ||
2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/linkage.h> | ||
16 | #include <asm/errno.h> | ||
17 | #include <asm/cache.h> | ||
18 | #include <arch/chip.h> | ||
19 | |||
20 | /* Access user memory, but use MMU to avoid propagating kernel exceptions. */ | ||
21 | |||
22 | .pushsection .fixup,"ax" | ||
23 | |||
24 | get_user_fault: | ||
25 | { move r0, zero; move r1, zero } | ||
26 | { movei r2, -EFAULT; jrp lr } | ||
27 | ENDPROC(get_user_fault) | ||
28 | |||
29 | put_user_fault: | ||
30 | { movei r0, -EFAULT; jrp lr } | ||
31 | ENDPROC(put_user_fault) | ||
32 | |||
33 | .popsection | ||
34 | |||
35 | /* | ||
36 | * __get_user_N functions take a pointer in r0, and return 0 in r2 | ||
37 | * on success, with the value in r0; or else -EFAULT in r2. | ||
38 | */ | ||
39 | #define __get_user_N(bytes, LOAD) \ | ||
40 | STD_ENTRY(__get_user_##bytes); \ | ||
41 | 1: { LOAD r0, r0; move r1, zero; move r2, zero }; \ | ||
42 | jrp lr; \ | ||
43 | STD_ENDPROC(__get_user_##bytes); \ | ||
44 | .pushsection __ex_table,"a"; \ | ||
45 | .word 1b, get_user_fault; \ | ||
46 | .popsection | ||
47 | |||
48 | __get_user_N(1, lb_u) | ||
49 | __get_user_N(2, lh_u) | ||
50 | __get_user_N(4, lw) | ||
51 | |||
52 | /* | ||
53 | * __get_user_8 takes a pointer in r0, and returns 0 in r2 | ||
54 | * on success, with the value in r0/r1; or else -EFAULT in r2. | ||
55 | */ | ||
56 | STD_ENTRY(__get_user_8); | ||
57 | 1: { lw r0, r0; addi r1, r0, 4 }; | ||
58 | 2: { lw r1, r1; move r2, zero }; | ||
59 | jrp lr; | ||
60 | STD_ENDPROC(__get_user_8); | ||
61 | .pushsection __ex_table,"a"; | ||
62 | .word 1b, get_user_fault; | ||
63 | .word 2b, get_user_fault; | ||
64 | .popsection | ||
65 | |||
66 | /* | ||
67 | * __put_user_N functions take a value in r0 and a pointer in r1, | ||
68 | * and return 0 in r0 on success or -EFAULT on failure. | ||
69 | */ | ||
70 | #define __put_user_N(bytes, STORE) \ | ||
71 | STD_ENTRY(__put_user_##bytes); \ | ||
72 | 1: { STORE r1, r0; move r0, zero }; \ | ||
73 | jrp lr; \ | ||
74 | STD_ENDPROC(__put_user_##bytes); \ | ||
75 | .pushsection __ex_table,"a"; \ | ||
76 | .word 1b, put_user_fault; \ | ||
77 | .popsection | ||
78 | |||
79 | __put_user_N(1, sb) | ||
80 | __put_user_N(2, sh) | ||
81 | __put_user_N(4, sw) | ||
82 | |||
83 | /* | ||
84 | * __put_user_8 takes a value in r0/r1 and a pointer in r2, | ||
85 | * and returns 0 in r0 on success or -EFAULT on failure. | ||
86 | */ | ||
87 | STD_ENTRY(__put_user_8) | ||
88 | 1: { sw r2, r0; addi r2, r2, 4 } | ||
89 | 2: { sw r2, r1; move r0, zero } | ||
90 | jrp lr | ||
91 | STD_ENDPROC(__put_user_8) | ||
92 | .pushsection __ex_table,"a" | ||
93 | .word 1b, put_user_fault | ||
94 | .word 2b, put_user_fault | ||
95 | .popsection | ||
96 | |||
97 | |||
98 | /* | ||
99 | * strnlen_user_asm takes the pointer in r0, and the length bound in r1. | ||
100 | * It returns the length, including the terminating NUL, or zero on exception. | ||
101 | * If length is greater than the bound, returns one plus the bound. | ||
102 | */ | ||
103 | STD_ENTRY(strnlen_user_asm) | ||
104 | { bz r1, 2f; addi r3, r0, -1 } /* bias down to include NUL */ | ||
105 | 1: { lb_u r4, r0; addi r1, r1, -1 } | ||
106 | bz r4, 2f | ||
107 | { bnzt r1, 1b; addi r0, r0, 1 } | ||
108 | 2: { sub r0, r0, r3; jrp lr } | ||
109 | STD_ENDPROC(strnlen_user_asm) | ||
110 | .pushsection .fixup,"ax" | ||
111 | strnlen_user_fault: | ||
112 | { move r0, zero; jrp lr } | ||
113 | ENDPROC(strnlen_user_fault) | ||
114 | .section __ex_table,"a" | ||
115 | .word 1b, strnlen_user_fault | ||
116 | .popsection | ||
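The contract, restated as a C model (a sketch of the no-fault behavior; on a fault, the fixup above returns 0 instead):

    static size_t strnlen_user_model(const char *s, size_t bound)
    {
            size_t i;
            for (i = 0; i < bound; i++)
                    if (s[i] == '\0')
                            return i + 1;   /* length including the NUL */
            return bound + 1;               /* no NUL within the bound */
    }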
117 | |||
118 | /* | ||
119 | * strncpy_from_user_asm takes the kernel target pointer in r0, | ||
120 | * the userspace source pointer in r1, and the length bound (including | ||
121 | * the trailing NUL) in r2. On success, it returns the string length | ||
122 | * (not including the trailing NUL), or -EFAULT on failure. | ||
123 | */ | ||
124 | STD_ENTRY(strncpy_from_user_asm) | ||
125 | { bz r2, 2f; move r3, r0 } | ||
126 | 1: { lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 } | ||
127 | { sb r0, r4; addi r0, r0, 1 } | ||
128 | bz r2, 2f | ||
129 | bnzt r4, 1b | ||
130 | addi r0, r0, -1 /* don't count the trailing NUL */ | ||
131 | 2: { sub r0, r0, r3; jrp lr } | ||
132 | STD_ENDPROC(strncpy_from_user_asm) | ||
133 | .pushsection .fixup,"ax" | ||
134 | strncpy_from_user_fault: | ||
135 | { movei r0, -EFAULT; jrp lr } | ||
136 | ENDPROC(strncpy_from_user_fault) | ||
137 | .section __ex_table,"a" | ||
138 | .word 1b, strncpy_from_user_fault | ||
139 | .popsection | ||
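Likewise, a C model of the no-fault behavior (a sketch):

    static long strncpy_from_user_model(char *dst, const char *src, size_t bound)
    {
            size_t i;
            for (i = 0; i < bound; i++) {
                    dst[i] = src[i];
                    if (src[i] == '\0')
                            return i;   /* length, not counting the NUL */
            }
            return bound;               /* out of room after copying bound bytes */
    }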
140 | |||
141 | /* | ||
142 | * clear_user_asm takes the user target address in r0 and the | ||
143 | * number of bytes to zero in r1. | ||
144 | * It returns the number of uncopiable bytes (hopefully zero) in r0. | ||
145 | * Note that we don't use a separate .fixup section here since we fall | ||
146 | * through into the "fixup" code as the last straight-line bundle anyway. | ||
147 | */ | ||
148 | STD_ENTRY(clear_user_asm) | ||
149 | { bz r1, 2f; or r2, r0, r1 } | ||
150 | andi r2, r2, 3 | ||
151 | bzt r2, .Lclear_aligned_user_asm | ||
152 | 1: { sb r0, zero; addi r0, r0, 1; addi r1, r1, -1 } | ||
153 | bnzt r1, 1b | ||
154 | 2: { move r0, r1; jrp lr } | ||
155 | .pushsection __ex_table,"a" | ||
156 | .word 1b, 2b | ||
157 | .popsection | ||
158 | |||
159 | .Lclear_aligned_user_asm: | ||
160 | 1: { sw r0, zero; addi r0, r0, 4; addi r1, r1, -4 } | ||
161 | bnzt r1, 1b | ||
162 | 2: { move r0, r1; jrp lr } | ||
163 | STD_ENDPROC(clear_user_asm) | ||
164 | .pushsection __ex_table,"a" | ||
165 | .word 1b, 2b | ||
166 | .popsection | ||
167 | |||
168 | /* | ||
169 | * flush_user_asm takes the user target address in r0 and the | ||
170 | * number of bytes to flush in r1. | ||
171 | * It returns the number of unflushable bytes (hopefully zero) in r0. | ||
172 | */ | ||
173 | STD_ENTRY(flush_user_asm) | ||
174 | bz r1, 2f | ||
175 | { movei r2, L2_CACHE_BYTES; add r1, r0, r1 } | ||
176 | { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } | ||
177 | { and r0, r0, r2; and r1, r1, r2 } | ||
178 | { sub r1, r1, r0 } | ||
179 | 1: { flush r0; addi r1, r1, -CHIP_FLUSH_STRIDE() } | ||
180 | { addi r0, r0, CHIP_FLUSH_STRIDE(); bnzt r1, 1b } | ||
181 | 2: { move r0, r1; jrp lr } | ||
182 | STD_ENDPROC(flush_user_asm) | ||
183 | .pushsection __ex_table,"a" | ||
184 | .word 1b, 2b | ||
185 | .popsection | ||
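The address arithmetic in flush_user_asm (and the identical setup in inv_user_asm and finv_user_asm below) rounds the range out to whole L2 cache lines and then walks it one stride at a time; in C (a sketch):

    unsigned long start = addr & -L2_CACHE_BYTES;
    unsigned long end = (addr + len + L2_CACHE_BYTES - 1) & -L2_CACHE_BYTES;
    unsigned long p;

    for (p = start; p < end; p += CHIP_FLUSH_STRIDE())
            flush(p);   /* one flush instruction per flushable chunk */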
186 | |||
187 | /* | ||
188 | * inv_user_asm takes the user target address in r0 and the | ||
189 | * number of bytes to invalidate in r1. | ||
190 | * It returns the number of not inv'able bytes (hopefully zero) in r0. | ||
191 | */ | ||
192 | STD_ENTRY(inv_user_asm) | ||
193 | bz r1, 2f | ||
194 | { movei r2, L2_CACHE_BYTES; add r1, r0, r1 } | ||
195 | { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } | ||
196 | { and r0, r0, r2; and r1, r1, r2 } | ||
197 | { sub r1, r1, r0 } | ||
198 | 1: { inv r0; addi r1, r1, -CHIP_INV_STRIDE() } | ||
199 | { addi r0, r0, CHIP_INV_STRIDE(); bnzt r1, 1b } | ||
200 | 2: { move r0, r1; jrp lr } | ||
201 | STD_ENDPROC(inv_user_asm) | ||
202 | .pushsection __ex_table,"a" | ||
203 | .word 1b, 2b | ||
204 | .popsection | ||
205 | |||
206 | /* | ||
207 | * finv_user_asm takes the user target address in r0 and the | ||
208 | * number of bytes to flush-invalidate in r1. | ||
209 | * It returns the number of not finv'able bytes (hopefully zero) in r0. | ||
210 | */ | ||
211 | STD_ENTRY(finv_user_asm) | ||
212 | bz r1, 2f | ||
213 | { movei r2, L2_CACHE_BYTES; add r1, r0, r1 } | ||
214 | { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } | ||
215 | { and r0, r0, r2; and r1, r1, r2 } | ||
216 | { sub r1, r1, r0 } | ||
217 | 1: { finv r0; addi r1, r1, -CHIP_FINV_STRIDE() } | ||
218 | { addi r0, r0, CHIP_FINV_STRIDE(); bnzt r1, 1b } | ||
219 | 2: { move r0, r1; jrp lr } | ||
220 | STD_ENDPROC(finv_user_asm) | ||
221 | .pushsection __ex_table,"a" | ||
222 | .word 1b, 2b | ||
223 | .popsection | ||