author		Chris Metcalf <cmetcalf@tilera.com>	2010-05-28 23:09:12 -0400
committer	Chris Metcalf <cmetcalf@tilera.com>	2010-06-04 17:11:18 -0400
commit		867e359b97c970a60626d5d76bbe2a8fadbf38fb (patch)
tree		c5ccbb7f5172e8555977119608ecb1eee3cc37e3 /arch/tile/lib
parent		5360bd776f73d0a7da571d72a09a03f237e99900 (diff)
arch/tile: core support for Tilera 32-bit chips.

This change is the core kernel support for TILEPro and TILE64 chips.
No driver support (except the console driver) is included yet.

This includes the relevant Linux headers in asm/; the low-level
"Tile architecture" headers in arch/, which are shared with the
hypervisor, etc., and are build-system agnostic; and the relevant
hypervisor headers in hv/.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Reviewed-by: Paul Mundt <lethal@linux-sh.org>
Diffstat (limited to 'arch/tile/lib')
-rw-r--r--	arch/tile/lib/Makefile			 16
-rw-r--r--	arch/tile/lib/__invalidate_icache.S	106
-rw-r--r--	arch/tile/lib/atomic_32.c		347
-rw-r--r--	arch/tile/lib/atomic_asm_32.S		197
-rw-r--r--	arch/tile/lib/checksum.c		102
-rw-r--r--	arch/tile/lib/cpumask.c			 51
-rw-r--r--	arch/tile/lib/delay.c			 34
-rw-r--r--	arch/tile/lib/exports.c			 78
-rw-r--r--	arch/tile/lib/mb_incoherent.S		 34
-rw-r--r--	arch/tile/lib/memchr_32.c		 68
-rw-r--r--	arch/tile/lib/memcpy_32.S		628
-rw-r--r--	arch/tile/lib/memcpy_tile64.c		271
-rw-r--r--	arch/tile/lib/memmove_32.c		 63
-rw-r--r--	arch/tile/lib/memset_32.c		274
-rw-r--r--	arch/tile/lib/spinlock_32.c		221
-rw-r--r--	arch/tile/lib/spinlock_common.h		 64
-rw-r--r--	arch/tile/lib/strchr_32.c		 66
-rw-r--r--	arch/tile/lib/strlen_32.c		 36
-rw-r--r--	arch/tile/lib/uaccess.c			 31
-rw-r--r--	arch/tile/lib/usercopy_32.S		223
20 files changed, 2910 insertions, 0 deletions
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile
new file mode 100644
index 000000000000..ea9c209d33fb
--- /dev/null
+++ b/arch/tile/lib/Makefile
@@ -0,0 +1,16 @@
#
# Makefile for TILE-specific library files.
#

lib-y = checksum.o cpumask.o delay.o __invalidate_icache.o \
	mb_incoherent.o uaccess.o \
	memcpy_$(BITS).o memchr_$(BITS).o memmove_$(BITS).o memset_$(BITS).o \
	strchr_$(BITS).o strlen_$(BITS).o

ifneq ($(CONFIG_TILEGX),y)
lib-y += atomic_32.o atomic_asm_32.o memcpy_tile64.o
endif

lib-$(CONFIG_SMP) += spinlock_$(BITS).o usercopy_$(BITS).o

obj-$(CONFIG_MODULES) += exports.o
diff --git a/arch/tile/lib/__invalidate_icache.S b/arch/tile/lib/__invalidate_icache.S
new file mode 100644
index 000000000000..92e705059127
--- /dev/null
+++ b/arch/tile/lib/__invalidate_icache.S
@@ -0,0 +1,106 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 *
 * A routine for synchronizing the instruction and data caches.
 * Useful for self-modifying code.
 *
 * r0 holds the buffer address
 * r1 holds the size in bytes
 */

#include <arch/chip.h>
#include <feedback.h>

#if defined(__NEWLIB__) || defined(__BME__)
#include <sys/page.h>
#else
#include <asm/page.h>
#endif

#ifdef __tilegx__
/* Share code among Tile family chips but adjust opcodes appropriately. */
#define slt cmpltu
#define bbst blbst
#define bnezt bnzt
#endif

#if defined(__tilegx__) && __SIZEOF_POINTER__ == 4
/* Force 32-bit ops so pointers wrap around appropriately. */
#define ADD_PTR addx
#define ADDI_PTR addxi
#else
#define ADD_PTR add
#define ADDI_PTR addi
#endif

	.section .text.__invalidate_icache, "ax"
	.global __invalidate_icache
	.type __invalidate_icache,@function
	.hidden __invalidate_icache
	.align 8
__invalidate_icache:
	FEEDBACK_ENTER(__invalidate_icache)
	{
	 ADD_PTR r1, r0, r1       /* end of buffer */
	 blez r1, .Lexit          /* skip out if size <= 0 */
	}
	{
	 ADDI_PTR r1, r1, -1      /* point to last byte to flush */
	 andi r0, r0, -CHIP_L1I_LINE_SIZE()  /* align to cache-line size */
	}
	{
	 andi r1, r1, -CHIP_L1I_LINE_SIZE()  /* last cache line to flush */
	 mf
	}
#if CHIP_L1I_CACHE_SIZE() > PAGE_SIZE
	{
	 moveli r4, CHIP_L1I_CACHE_SIZE() / PAGE_SIZE  /* loop counter */
	 move r2, r0              /* remember starting address */
	}
#endif
	drain
	{
	 slt r3, r0, r1           /* set up loop invariant */
#if CHIP_L1I_CACHE_SIZE() > PAGE_SIZE
	 moveli r6, PAGE_SIZE
#endif
	}
.Lentry:
	{
	 icoh r0
	 ADDI_PTR r0, r0, CHIP_L1I_LINE_SIZE()  /* advance buffer */
	}
	{
	 slt r3, r0, r1           /* check if buffer < buffer + size */
	 bbst r3, .Lentry         /* loop if buffer < buffer + size */
	}
#if CHIP_L1I_CACHE_SIZE() > PAGE_SIZE
	{
	 ADD_PTR r2, r2, r6
	 ADD_PTR r1, r1, r6
	}
	{
	 move r0, r2
	 addi r4, r4, -1
	}
	{
	 slt r3, r0, r1           /* set up loop invariant */
	 bnezt r4, .Lentry
	}
#endif
	drain
.Lexit:
	jrp lr

.Lend___invalidate_icache:
	.size __invalidate_icache, \
		.Lend___invalidate_icache - __invalidate_icache
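
[Reviewer illustration: a minimal sketch of how a C caller might use this
routine for self-modifying code. The prototype and the patch_bundle()
helper are assumptions for the example, not part of this change.]

	/* Assumed prototype: r0 = buffer address, r1 = size in bytes. */
	extern void __invalidate_icache(void *buf, unsigned long size);

	/* Hypothetical helper: patch one instruction word, then resync the
	 * I-cache so the new word is fetched rather than the stale one. */
	static void patch_bundle(unsigned long *slot, unsigned long bundle)
	{
		*slot = bundle;               /* store goes via the D-cache */
		__invalidate_icache(slot, sizeof(*slot));
	}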
diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c
new file mode 100644
index 000000000000..be1e8acd105d
--- /dev/null
+++ b/arch/tile/lib/atomic_32.c
@@ -0,0 +1,347 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 */

#include <linux/cache.h>
#include <linux/delay.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <asm/atomic.h>
#include <arch/chip.h>

/* The routines in atomic_asm.S are private, so we only declare them here. */
extern struct __get_user __atomic_cmpxchg(volatile int *p,
					  int *lock, int o, int n);
extern struct __get_user __atomic_xchg(volatile int *p, int *lock, int n);
extern struct __get_user __atomic_xchg_add(volatile int *p, int *lock, int n);
extern struct __get_user __atomic_xchg_add_unless(volatile int *p,
						  int *lock, int o, int n);
extern struct __get_user __atomic_or(volatile int *p, int *lock, int n);
extern struct __get_user __atomic_andn(volatile int *p, int *lock, int n);
extern struct __get_user __atomic_xor(volatile int *p, int *lock, int n);

extern u64 __atomic64_cmpxchg(volatile u64 *p, int *lock, u64 o, u64 n);
extern u64 __atomic64_xchg(volatile u64 *p, int *lock, u64 n);
extern u64 __atomic64_xchg_add(volatile u64 *p, int *lock, u64 n);
extern u64 __atomic64_xchg_add_unless(volatile u64 *p,
				      int *lock, u64 o, u64 n);


/* See <asm/atomic.h> */
#if ATOMIC_LOCKS_FOUND_VIA_TABLE()

/*
 * A block of memory containing locks for atomic ops. Each instance of this
 * struct will be homed on a different CPU.
 */
struct atomic_locks_on_cpu {
	int lock[ATOMIC_HASH_L2_SIZE];
} __attribute__((aligned(ATOMIC_HASH_L2_SIZE * 4)));

static DEFINE_PER_CPU(struct atomic_locks_on_cpu, atomic_lock_pool);

/* The locks we'll use until __init_atomic_per_cpu is called. */
static struct atomic_locks_on_cpu __initdata initial_atomic_locks;

/* Hash into this vector to get a pointer to lock for the given atomic. */
struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE]
	__write_once = {
	[0 ... ATOMIC_HASH_L1_SIZE-1] (&initial_atomic_locks)
};

#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */

/* This page is remapped on startup to be hash-for-home. */
int atomic_locks[PAGE_SIZE / sizeof(int) /* Only ATOMIC_HASH_SIZE is used */]
	__attribute__((aligned(PAGE_SIZE), section(".bss.page_aligned")));

#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */

static inline int *__atomic_hashed_lock(volatile void *v)
{
	/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec.S */
#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
	unsigned long i =
		(unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long));
	unsigned long n = __insn_crc32_32(0, i);

	/* Grab high bits for L1 index. */
	unsigned long l1_index = n >> ((sizeof(n) * 8) - ATOMIC_HASH_L1_SHIFT);
	/* Grab low bits for L2 index. */
	unsigned long l2_index = n & (ATOMIC_HASH_L2_SIZE - 1);

	return &atomic_lock_ptr[l1_index]->lock[l2_index];
#else
	/*
	 * Use bits [3, 3 + ATOMIC_HASH_SHIFT) as the lock index.
	 * Using mm works here because atomic_locks is page aligned.
	 */
	unsigned long ptr = __insn_mm((unsigned long)v >> 1,
				      (unsigned long)atomic_locks,
				      2, (ATOMIC_HASH_SHIFT + 2) - 1);
	return (int *)ptr;
#endif
}

#ifdef CONFIG_SMP
/* Return whether the passed pointer is a valid atomic lock pointer. */
static int is_atomic_lock(int *p)
{
#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
	int i;
	for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) {

		if (p >= &atomic_lock_ptr[i]->lock[0] &&
		    p < &atomic_lock_ptr[i]->lock[ATOMIC_HASH_L2_SIZE]) {
			return 1;
		}
	}
	return 0;
#else
	return p >= &atomic_locks[0] && p < &atomic_locks[ATOMIC_HASH_SIZE];
#endif
}

void __atomic_fault_unlock(int *irqlock_word)
{
	BUG_ON(!is_atomic_lock(irqlock_word));
	BUG_ON(*irqlock_word != 1);
	*irqlock_word = 0;
}

#endif /* CONFIG_SMP */

static inline int *__atomic_setup(volatile void *v)
{
	/* Issue a load to the target to bring it into cache. */
	*(volatile int *)v;
	return __atomic_hashed_lock(v);
}

int _atomic_xchg(atomic_t *v, int n)
{
	return __atomic_xchg(&v->counter, __atomic_setup(v), n).val;
}
EXPORT_SYMBOL(_atomic_xchg);

int _atomic_xchg_add(atomic_t *v, int i)
{
	return __atomic_xchg_add(&v->counter, __atomic_setup(v), i).val;
}
EXPORT_SYMBOL(_atomic_xchg_add);

int _atomic_xchg_add_unless(atomic_t *v, int a, int u)
{
	/*
	 * Note: argument order is switched here since it is easier
	 * to use the first argument consistently as the "old value"
	 * in the assembly, as is done for _atomic_cmpxchg().
	 */
	return __atomic_xchg_add_unless(&v->counter, __atomic_setup(v), u, a)
		.val;
}
EXPORT_SYMBOL(_atomic_xchg_add_unless);

int _atomic_cmpxchg(atomic_t *v, int o, int n)
{
	return __atomic_cmpxchg(&v->counter, __atomic_setup(v), o, n).val;
}
EXPORT_SYMBOL(_atomic_cmpxchg);

unsigned long _atomic_or(volatile unsigned long *p, unsigned long mask)
{
	return __atomic_or((int *)p, __atomic_setup(p), mask).val;
}
EXPORT_SYMBOL(_atomic_or);

unsigned long _atomic_andn(volatile unsigned long *p, unsigned long mask)
{
	return __atomic_andn((int *)p, __atomic_setup(p), mask).val;
}
EXPORT_SYMBOL(_atomic_andn);

unsigned long _atomic_xor(volatile unsigned long *p, unsigned long mask)
{
	return __atomic_xor((int *)p, __atomic_setup(p), mask).val;
}
EXPORT_SYMBOL(_atomic_xor);


u64 _atomic64_xchg(atomic64_t *v, u64 n)
{
	return __atomic64_xchg(&v->counter, __atomic_setup(v), n);
}
EXPORT_SYMBOL(_atomic64_xchg);

u64 _atomic64_xchg_add(atomic64_t *v, u64 i)
{
	return __atomic64_xchg_add(&v->counter, __atomic_setup(v), i);
}
EXPORT_SYMBOL(_atomic64_xchg_add);

u64 _atomic64_xchg_add_unless(atomic64_t *v, u64 a, u64 u)
{
	/*
	 * Note: argument order is switched here since it is easier
	 * to use the first argument consistently as the "old value"
	 * in the assembly, as is done for _atomic_cmpxchg().
	 */
	return __atomic64_xchg_add_unless(&v->counter, __atomic_setup(v),
					  u, a);
}
EXPORT_SYMBOL(_atomic64_xchg_add_unless);

u64 _atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n)
{
	return __atomic64_cmpxchg(&v->counter, __atomic_setup(v), o, n);
}
EXPORT_SYMBOL(_atomic64_cmpxchg);


static inline int *__futex_setup(__user int *v)
{
	/*
	 * Issue a prefetch to the counter to bring it into cache.
	 * As for __atomic_setup, but we can't do a read into the L1
	 * since it might fault; instead we do a prefetch into the L2.
	 */
	__insn_prefetch(v);
	return __atomic_hashed_lock(v);
}

struct __get_user futex_set(int *v, int i)
{
	return __atomic_xchg(v, __futex_setup(v), i);
}

struct __get_user futex_add(int *v, int n)
{
	return __atomic_xchg_add(v, __futex_setup(v), n);
}

struct __get_user futex_or(int *v, int n)
{
	return __atomic_or(v, __futex_setup(v), n);
}

struct __get_user futex_andn(int *v, int n)
{
	return __atomic_andn(v, __futex_setup(v), n);
}

struct __get_user futex_xor(int *v, int n)
{
	return __atomic_xor(v, __futex_setup(v), n);
}

struct __get_user futex_cmpxchg(int *v, int o, int n)
{
	return __atomic_cmpxchg(v, __futex_setup(v), o, n);
}

/*
 * If any of the atomic or futex routines hit a bad address (not in
 * the page tables at kernel PL) this routine is called. The futex
 * routines are never used on kernel space, and the normal atomics and
 * bitops are never used on user space. So a fault on kernel space
 * must be fatal, but a fault on userspace is a futex fault and we
 * need to return -EFAULT. Note that the context this routine is
 * invoked in is the context of the "_atomic_xxx()" routines called
 * by the functions in this file.
 */
struct __get_user __atomic_bad_address(int *addr)
{
	if (unlikely(!access_ok(VERIFY_WRITE, addr, sizeof(int))))
		panic("Bad address used for kernel atomic op: %p\n", addr);
	return (struct __get_user) { .err = -EFAULT };
}


#if CHIP_HAS_CBOX_HOME_MAP()
static int __init noatomichash(char *str)
{
	printk("noatomichash is deprecated.\n");
	return 1;
}
__setup("noatomichash", noatomichash);
#endif

void __init __init_atomic_per_cpu(void)
{
#if ATOMIC_LOCKS_FOUND_VIA_TABLE()

	unsigned int i;
	int actual_cpu;

	/*
	 * Before this is called from setup, we just have one lock for
	 * all atomic objects/operations. Here we replace the
	 * elements of atomic_lock_ptr so that they point at per_cpu
	 * integers. This seemingly over-complex approach stems from
	 * the fact that DEFINE_PER_CPU defines an entry for each cpu
	 * in the grid, not each cpu from 0..ATOMIC_HASH_SIZE-1. But
	 * for efficient hashing of atomics to their locks we want a
	 * compile time constant power of 2 for the size of this
	 * table, so we use ATOMIC_HASH_SIZE.
	 *
	 * Here we populate atomic_lock_ptr from the per cpu
	 * atomic_lock_pool, interspersing by actual cpu so that
	 * subsequent elements are homed on consecutive cpus.
	 */

	actual_cpu = cpumask_first(cpu_possible_mask);

	for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) {
		/*
		 * Preincrement to slightly bias against using cpu 0,
		 * which has plenty of stuff homed on it already.
		 */
		actual_cpu = cpumask_next(actual_cpu, cpu_possible_mask);
		if (actual_cpu >= nr_cpu_ids)
			actual_cpu = cpumask_first(cpu_possible_mask);

		atomic_lock_ptr[i] = &per_cpu(atomic_lock_pool, actual_cpu);
	}

#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */

	/* Validate power-of-two and "bigger than cpus" assumption */
	BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1));
	BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids);

	/*
	 * On TILEPro we prefer to use a single hash-for-home
	 * page, since this means atomic operations are less
	 * likely to encounter a TLB fault and thus should
	 * in general perform faster. You may wish to disable
	 * this in situations where few hash-for-home tiles
	 * are configured.
	 */
	BUG_ON((unsigned long)atomic_locks % PAGE_SIZE != 0);

	/* The locks must all fit on one page. */
	BUG_ON(ATOMIC_HASH_SIZE * sizeof(int) > PAGE_SIZE);

	/*
	 * We use the page offset of the atomic value's address as
	 * an index into atomic_locks, excluding the low 3 bits.
	 * That should not produce more indices than ATOMIC_HASH_SIZE.
	 */
	BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE);

#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */

	/* The futex code makes this assumption, so we validate it here. */
	BUG_ON(sizeof(atomic_t) != sizeof(int));
}
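
[Reviewer illustration: a sketch, not the arch's actual definition, of how
a generic atomic_add_return()-style helper could sit on the hashed-lock
primitives above; _atomic_xchg_add() returns the old value.]

	#include <asm/atomic.h>

	static inline int add_return_sketch(int i, atomic_t *v)
	{
		/* Old value plus the increment gives the new value. */
		return _atomic_xchg_add(v, i) + i;
	}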
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
new file mode 100644
index 000000000000..c0d058578192
--- /dev/null
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -0,0 +1,197 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 *
 * Support routines for atomic operations. Each function takes:
 *
 * r0: address to manipulate
 * r1: pointer to atomic lock guarding this operation (for FUTEX_LOCK_REG)
 * r2: new value to write, or for cmpxchg/add_unless, value to compare against
 * r3: (cmpxchg/xchg_add_unless) new value to write or add;
 *     (atomic64 ops) high word of value to write
 * r4/r5: (cmpxchg64/add_unless64) new value to write or add
 *
 * The 32-bit routines return a "struct __get_user" so that the futex code
 * has an opportunity to return -EFAULT to the user if needed.
 * The 64-bit routines just return a "long long" with the value,
 * since they are only used from kernel space and don't expect to fault.
 * Support for 16-bit ops is included in the framework but we don't provide
 * any (x86_64 has an atomic_inc_short(), so we might want to some day).
 *
 * Note that the caller is advised to issue a suitable L1 or L2
 * prefetch on the address being manipulated to avoid extra stalls.
 * In addition, the hot path is on two icache lines, and we start with
 * a jump to the second line to make sure they are both in cache so
 * that we never stall waiting on icache fill while holding the lock.
 * (This doesn't work out with most 64-bit ops, since they consume
 * too many bundles, so may take an extra i-cache stall.)
 *
 * These routines set the INTERRUPT_CRITICAL_SECTION bit, just
 * like sys_cmpxchg(), so that NMIs like PERF_COUNT will not interrupt
 * the code, just page faults.
 *
 * If the load or store faults in a way that can be directly fixed in
 * the do_page_fault_ics() handler (e.g. a vmalloc reference) we fix it
 * directly, return to the instruction that faulted, and retry it.
 *
 * If the load or store faults in a way that potentially requires us
 * to release the atomic lock and then retry (e.g. a migrating PTE), we
 * reset the PC in do_page_fault_ics() to the "tns" instruction so
 * that on return we will reacquire the lock and restart the op. We
 * are somewhat overloading the exception_table_entry notion by doing
 * this, since those entries are not normally used for migrating PTEs.
 *
 * If the main page fault handler discovers a bad address, it will see
 * the PC pointing to the "tns" instruction (due to the earlier
 * exception_table_entry processing in do_page_fault_ics), and
 * re-reset the PC to the fault handler, atomic_bad_address(), which
 * effectively takes over from the atomic op and can either return a
 * bad "struct __get_user" (for user addresses) or can just panic (for
 * bad kernel addresses).
 *
 * Note that if the value we would store is the same as what we
 * loaded, we bypass the load. Other platforms with true atomics can
 * make the guarantee that a non-atomic __clear_bit(), for example,
 * can safely race with an atomic test_and_set_bit(); this example is
 * from bit_spinlock.h in slub_lock() / slub_unlock(). We can't do
 * that on Tile since the "atomic" op is really just a
 * read/modify/write, and can race with the non-atomic
 * read/modify/write. However, if we can short-circuit the write when
 * it is not needed, in the atomic case, we avoid the race.
 */

#include <linux/linkage.h>
#include <asm/atomic.h>
#include <asm/page.h>
#include <asm/processor.h>

	.section .text.atomic,"ax"
ENTRY(__start_atomic_asm_code)

	.macro atomic_op, name, bitwidth, body
	.align 64
STD_ENTRY_SECTION(__atomic\name, .text.atomic)
	{
	 movei r24, 1
	 j 4f		/* branch to second cache line */
	}
1:	{
	 .ifc \bitwidth,16
	 lh r22, r0
	 .else
	 lw r22, r0
	 addi r23, r0, 4
	 .endif
	}
	.ifc \bitwidth,64
	 lw r23, r23
	.endif
	\body /* set r24, and r25 if 64-bit */
	{
	 seq r26, r22, r24
	 seq r27, r23, r25
	}
	.ifc \bitwidth,64
	 bbnst r27, 2f
	.endif
	bbs r26, 3f		/* skip write-back if it's the same value */
2:	{
	 .ifc \bitwidth,16
	 sh r0, r24
	 .else
	 sw r0, r24
	 addi r23, r0, 4
	 .endif
	}
	.ifc \bitwidth,64
	 sw r23, r25
	.endif
	mf
3:	{
	 move r0, r22
	 .ifc \bitwidth,64
	 move r1, r23
	 .else
	 move r1, zero
	 .endif
	 sw ATOMIC_LOCK_REG_NAME, zero
	}
	mtspr INTERRUPT_CRITICAL_SECTION, zero
	jrp lr
4:	{
	 move ATOMIC_LOCK_REG_NAME, r1
	 mtspr INTERRUPT_CRITICAL_SECTION, r24
	}
#ifndef CONFIG_SMP
	j 1b		/* no atomic locks */
#else
	{
	 tns r21, ATOMIC_LOCK_REG_NAME
	 moveli r23, 2048	/* maximum backoff time in cycles */
	}
	{
	 bzt r21, 1b		/* branch if lock acquired */
	 moveli r25, 32		/* starting backoff time in cycles */
	}
5:	mtspr INTERRUPT_CRITICAL_SECTION, zero
	mfspr r26, CYCLE_LOW	/* get start point for this backoff */
6:	mfspr r22, CYCLE_LOW	/* test to see if we've backed off enough */
	sub r22, r22, r26
	slt r22, r22, r25
	bbst r22, 6b
	{
	 mtspr INTERRUPT_CRITICAL_SECTION, r24
	 shli r25, r25, 1	/* double the backoff; retry the tns */
	}
	{
	 tns r21, ATOMIC_LOCK_REG_NAME
	 slt r26, r23, r25	/* is the proposed backoff too big? */
	}
	{
	 bzt r21, 1b		/* branch if lock acquired */
	 mvnz r25, r26, r23
	}
	j 5b
#endif
	STD_ENDPROC(__atomic\name)
	.ifc \bitwidth,32
	.pushsection __ex_table,"a"
	.word 1b, __atomic\name
	.word 2b, __atomic\name
	.word __atomic\name, __atomic_bad_address
	.popsection
	.endif
	.endm

atomic_op _cmpxchg, 32, "seq r26, r22, r2; { bbns r26, 3f; move r24, r3 }"
atomic_op _xchg, 32, "move r24, r2"
atomic_op _xchg_add, 32, "add r24, r22, r2"
atomic_op _xchg_add_unless, 32, \
	"sne r26, r22, r2; { bbns r26, 3f; add r24, r22, r3 }"
atomic_op _or, 32, "or r24, r22, r2"
atomic_op _andn, 32, "nor r2, r2, zero; and r24, r22, r2"
atomic_op _xor, 32, "xor r24, r22, r2"

atomic_op 64_cmpxchg, 64, "{ seq r26, r22, r2; seq r27, r23, r3 }; \
	{ bbns r26, 3f; move r24, r4 }; { bbns r27, 3f; move r25, r5 }"
atomic_op 64_xchg, 64, "{ move r24, r2; move r25, r3 }"
atomic_op 64_xchg_add, 64, "{ add r24, r22, r2; add r25, r23, r3 }; \
	slt_u r26, r24, r22; add r25, r25, r26"
atomic_op 64_xchg_add_unless, 64, \
	"{ sne r26, r22, r2; sne r27, r23, r3 }; \
	 { bbns r26, 3f; add r24, r22, r4 }; \
	 { bbns r27, 3f; add r25, r23, r5 }; \
	 slt_u r26, r24, r22; add r25, r25, r26"

	jrp lr		/* happy backtracer */

ENTRY(__end_atomic_asm_code)
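
[Reviewer illustration: a C-level sketch of what one atomic_op expansion
does, with faults, backoff and the INTERRUPT_CRITICAL_SECTION bit omitted;
spin_on_tns() is a hypothetical stand-in for the "tns" acquire loop.]

	struct __get_user __atomic_xchg_add_sketch(volatile int *p,
						   int *lock, int n)
	{
		struct __get_user ret = { .err = 0 };
		spin_on_tns(lock);		/* acquire the hashed lock */
		ret.val = *p;			/* "lw r22, r0" */
		if (ret.val + n != ret.val)	/* skip write-back if same */
			*p = ret.val + n;	/* "sw r0, r24", then "mf" */
		*lock = 0;			/* release the lock */
		return ret;
	}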
diff --git a/arch/tile/lib/checksum.c b/arch/tile/lib/checksum.c
new file mode 100644
index 000000000000..e4bab5bd3f31
--- /dev/null
+++ b/arch/tile/lib/checksum.c
@@ -0,0 +1,102 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 *
 * Support code for the main lib/checksum.c.
 */

#include <net/checksum.h>
#include <linux/module.h>

static inline unsigned int longto16(unsigned long x)
{
	unsigned long ret;
#ifdef __tilegx__
	ret = __insn_v2sadu(x, 0);
	ret = __insn_v2sadu(ret, 0);
#else
	ret = __insn_sadh_u(x, 0);
	ret = __insn_sadh_u(ret, 0);
#endif
	return ret;
}

__wsum do_csum(const unsigned char *buff, int len)
{
	int odd, count;
	unsigned long result = 0;

	if (len <= 0)
		goto out;
	odd = 1 & (unsigned long) buff;
	if (odd) {
		result = (*buff << 8);
		len--;
		buff++;
	}
	count = len >> 1;		/* nr of 16-bit words.. */
	if (count) {
		if (2 & (unsigned long) buff) {
			result += *(const unsigned short *)buff;
			count--;
			len -= 2;
			buff += 2;
		}
		count >>= 1;		/* nr of 32-bit words.. */
		if (count) {
#ifdef __tilegx__
			if (4 & (unsigned long) buff) {
				unsigned int w = *(const unsigned int *)buff;
				result = __insn_v2sadau(result, w, 0);
				count--;
				len -= 4;
				buff += 4;
			}
			count >>= 1;	/* nr of 64-bit words.. */
#endif

			/*
			 * This algorithm could wrap around for very
			 * large buffers, but those should be impossible.
			 */
			BUG_ON(count >= 65530);

			while (count) {
				unsigned long w = *(const unsigned long *)buff;
				count--;
				buff += sizeof(w);
#ifdef __tilegx__
				result = __insn_v2sadau(result, w, 0);
#else
				result = __insn_sadah_u(result, w, 0);
#endif
			}
#ifdef __tilegx__
			if (len & 4) {
				unsigned int w = *(const unsigned int *)buff;
				result = __insn_v2sadau(result, w, 0);
				buff += 4;
			}
#endif
		}
		if (len & 2) {
			result += *(const unsigned short *) buff;
			buff += 2;
		}
	}
	if (len & 1)
		result += *buff;
	result = longto16(result);
	if (odd)
		result = swab16(result);
out:
	return result;
}
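
[Reviewer illustration: a portable sketch of what longto16() computes on
TILEPro, where __insn_sadh_u(x, 0) sums the unsigned 16-bit halves of x;
two folds reduce a 32-bit accumulator to 16 bits with end-around carry.
The function name is illustrative only.]

	static unsigned int longto16_sketch(unsigned long x)
	{
		x = (x & 0xffff) + (x >> 16);	/* first fold; may carry */
		x = (x & 0xffff) + (x >> 16);	/* second fold absorbs it */
		return x;
	}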
diff --git a/arch/tile/lib/cpumask.c b/arch/tile/lib/cpumask.c
new file mode 100644
index 000000000000..af745b3b2559
--- /dev/null
+++ b/arch/tile/lib/cpumask.c
@@ -0,0 +1,51 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 */

#include <linux/cpumask.h>
#include <linux/ctype.h>
#include <linux/errno.h>

/*
 * Allow cropping out bits beyond the end of the array.
 * Move to "lib" directory if more clients want to use this routine.
 */
int bitmap_parselist_crop(const char *bp, unsigned long *maskp, int nmaskbits)
{
	unsigned a, b;

	bitmap_zero(maskp, nmaskbits);
	do {
		if (!isdigit(*bp))
			return -EINVAL;
		a = simple_strtoul(bp, (char **)&bp, 10);
		b = a;
		if (*bp == '-') {
			bp++;
			if (!isdigit(*bp))
				return -EINVAL;
			b = simple_strtoul(bp, (char **)&bp, 10);
		}
		if (!(a <= b))
			return -EINVAL;
		if (b >= nmaskbits)
			b = nmaskbits-1;
		while (a <= b) {
			set_bit(a, maskp);
			a++;
		}
		if (*bp == ',')
			bp++;
	} while (*bp != '\0' && *bp != '\n');
	return 0;
}
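
[Reviewer illustration: a hypothetical caller parsing a boot-argument list
such as "0-3,8" into a cpumask, with bits at or above nr_cpu_ids silently
cropped rather than rejected.]

	static int parse_cpulist(const char *str, struct cpumask *mask)
	{
		return bitmap_parselist_crop(str, cpumask_bits(mask),
					     nr_cpu_ids);
	}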
diff --git a/arch/tile/lib/delay.c b/arch/tile/lib/delay.c
new file mode 100644
index 000000000000..5801b03c13ef
--- /dev/null
+++ b/arch/tile/lib/delay.c
@@ -0,0 +1,34 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 */

#include <linux/module.h>
#include <linux/delay.h>
#include <linux/thread_info.h>
#include <asm/fixmap.h>
#include <hv/hypervisor.h>

void __udelay(unsigned long usecs)
{
	hv_nanosleep(usecs * 1000);
}
EXPORT_SYMBOL(__udelay);

void __ndelay(unsigned long nsecs)
{
	hv_nanosleep(nsecs);
}
EXPORT_SYMBOL(__ndelay);

/* FIXME: should be declared in a header somewhere. */
EXPORT_SYMBOL(__delay);
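
[Reviewer illustration: both delay flavors funnel into the hypervisor's
nanosleep; the wrapper below is a sketch, not part of this change.]

	static void settle_hardware(void)
	{
		__udelay(10);	/* hv_nanosleep(10 * 1000) */
		__ndelay(500);	/* hv_nanosleep(500) */
	}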
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
new file mode 100644
index 000000000000..af8e70e2a0ce
--- /dev/null
+++ b/arch/tile/lib/exports.c
@@ -0,0 +1,78 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 *
 * Exports from assembler code and from libtile-cc.
 */

#include <linux/module.h>

/* arch/tile/lib/usercopy.S */
#include <linux/uaccess.h>
EXPORT_SYMBOL(__get_user_1);
EXPORT_SYMBOL(__get_user_2);
EXPORT_SYMBOL(__get_user_4);
EXPORT_SYMBOL(__put_user_1);
EXPORT_SYMBOL(__put_user_2);
EXPORT_SYMBOL(__put_user_4);
EXPORT_SYMBOL(__put_user_8);
EXPORT_SYMBOL(strnlen_user_asm);
EXPORT_SYMBOL(strncpy_from_user_asm);
EXPORT_SYMBOL(clear_user_asm);

/* arch/tile/kernel/entry.S */
#include <linux/kernel.h>
#include <asm/processor.h>
EXPORT_SYMBOL(current_text_addr);
EXPORT_SYMBOL(dump_stack);

/* arch/tile/lib/__memcpy.S */
/* NOTE: on TILE64, these symbols appear in arch/tile/lib/memcpy_tile64.c */
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(__copy_to_user_inatomic);
EXPORT_SYMBOL(__copy_from_user_inatomic);
EXPORT_SYMBOL(__copy_from_user_zeroing);

/* hypervisor glue */
#include <hv/hypervisor.h>
EXPORT_SYMBOL(hv_dev_open);
EXPORT_SYMBOL(hv_dev_pread);
EXPORT_SYMBOL(hv_dev_pwrite);
EXPORT_SYMBOL(hv_dev_close);

/* -ltile-cc */
uint32_t __udivsi3(uint32_t dividend, uint32_t divisor);
EXPORT_SYMBOL(__udivsi3);
int32_t __divsi3(int32_t dividend, int32_t divisor);
EXPORT_SYMBOL(__divsi3);
uint64_t __udivdi3(uint64_t dividend, uint64_t divisor);
EXPORT_SYMBOL(__udivdi3);
int64_t __divdi3(int64_t dividend, int64_t divisor);
EXPORT_SYMBOL(__divdi3);
uint32_t __umodsi3(uint32_t dividend, uint32_t divisor);
EXPORT_SYMBOL(__umodsi3);
int32_t __modsi3(int32_t dividend, int32_t divisor);
EXPORT_SYMBOL(__modsi3);
uint64_t __umoddi3(uint64_t dividend, uint64_t divisor);
EXPORT_SYMBOL(__umoddi3);
int64_t __moddi3(int64_t dividend, int64_t divisor);
EXPORT_SYMBOL(__moddi3);
#ifndef __tilegx__
uint64_t __ll_mul(uint64_t n0, uint64_t n1);
EXPORT_SYMBOL(__ll_mul);
#endif
#ifndef __tilegx__
int64_t __muldi3(int64_t, int64_t);
EXPORT_SYMBOL(__muldi3);
uint64_t __lshrdi3(uint64_t, unsigned int);
EXPORT_SYMBOL(__lshrdi3);
#endif
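
[Reviewer illustration: modules do not call these helpers directly; plain
64-bit arithmetic in C links against them on a 32-bit target, e.g.:]

	static u64 bytes_per_sec(u64 bytes, u64 secs)
	{
		return bytes / secs;	/* compiler emits a __udivdi3 call */
	}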
diff --git a/arch/tile/lib/mb_incoherent.S b/arch/tile/lib/mb_incoherent.S
new file mode 100644
index 000000000000..989ad7b68d5a
--- /dev/null
+++ b/arch/tile/lib/mb_incoherent.S
@@ -0,0 +1,34 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 *
 * Assembly code for invoking the HV's fence_incoherent syscall.
 */

#include <linux/linkage.h>
#include <hv/syscall_public.h>
#include <arch/abi.h>
#include <arch/chip.h>

#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS()

/*
 * Invoke the hypervisor's fence_incoherent syscall, which guarantees
 * that all victims for cachelines homed on this tile have reached memory.
 */
STD_ENTRY(__mb_incoherent)
	moveli TREG_SYSCALL_NR_NAME, HV_SYS_fence_incoherent
	swint2
	jrp lr
	STD_ENDPROC(__mb_incoherent)

#endif
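
[Reviewer illustration: a hypothetical wrapper showing where
__mb_incoherent() fits — a full barrier on chips whose "mf" does not wait
for victims; the wrapper name is an assumption for the example.]

	static inline void full_barrier_sketch(void)
	{
	#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS()
		__mb_incoherent();	/* hv fence_incoherent syscall */
	#else
		__insn_mf();		/* "mf" alone suffices */
	#endif
	}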
diff --git a/arch/tile/lib/memchr_32.c b/arch/tile/lib/memchr_32.c
new file mode 100644
index 000000000000..6235283b4859
--- /dev/null
+++ b/arch/tile/lib/memchr_32.c
@@ -0,0 +1,68 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/module.h>

void *memchr(const void *s, int c, size_t n)
{
	/* Get an aligned pointer. */
	const uintptr_t s_int = (uintptr_t) s;
	const uint32_t *p = (const uint32_t *)(s_int & -4);

	/* Create four copies of the byte for which we are looking. */
	const uint32_t goal = 0x01010101 * (uint8_t) c;

	/* Read the first word, but munge it so that bytes before the array
	 * will not match goal.
	 *
	 * Note that this shift count expression works because we know
	 * shift counts are taken mod 32.
	 */
	const uint32_t before_mask = (1 << (s_int << 3)) - 1;
	uint32_t v = (*p | before_mask) ^ (goal & before_mask);

	/* Compute the address of the last byte. */
	const char *const last_byte_ptr = (const char *)s + n - 1;

	/* Compute the address of the word containing the last byte. */
	const uint32_t *const last_word_ptr =
	    (const uint32_t *)((uintptr_t) last_byte_ptr & -4);

	uint32_t bits;
	char *ret;

	if (__builtin_expect(n == 0, 0)) {
		/* Don't dereference any memory if the array is empty. */
		return NULL;
	}

	while ((bits = __insn_seqb(v, goal)) == 0) {
		if (__builtin_expect(p == last_word_ptr, 0)) {
			/* We already read the last word in the array,
			 * so give up.
			 */
			return NULL;
		}
		v = *++p;
	}

	/* We found a match, but it might be in a byte past the end
	 * of the array.
	 */
	ret = ((char *)p) + (__insn_ctz(bits) >> 3);
	return (ret <= last_byte_ptr) ? ret : NULL;
}
EXPORT_SYMBOL(memchr);
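
[Reviewer illustration: the __insn_seqb() step without the instruction —
the classic portable "byte equals" bit trick. It sets bit 7 of each
matching byte instead of bit 0, so ctz(mask) >> 3 still yields the byte
index. A sketch for comparison, not a proposed change.]

	static uint32_t byte_match_sketch(uint32_t v, uint32_t goal)
	{
		uint32_t x = v ^ goal;	/* zero byte wherever they match */
		return (x - 0x01010101) & ~x & 0x80808080;
	}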
diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S
new file mode 100644
index 000000000000..f92984bf60ec
--- /dev/null
+++ b/arch/tile/lib/memcpy_32.S
@@ -0,0 +1,628 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 *
14 * This file shares the implementation of the userspace memcpy and
15 * the kernel's memcpy, copy_to_user and copy_from_user.
16 */
17
18#include <arch/chip.h>
19
20#if CHIP_HAS_WH64() || defined(MEMCPY_TEST_WH64)
21#define MEMCPY_USE_WH64
22#endif
23
24
25#include <linux/linkage.h>
26
27/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */
28#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
29#define memcpy __memcpy_asm
30#define __copy_to_user_inatomic __copy_to_user_inatomic_asm
31#define __copy_from_user_inatomic __copy_from_user_inatomic_asm
32#define __copy_from_user_zeroing __copy_from_user_zeroing_asm
33#endif
34
35#define IS_MEMCPY 0
36#define IS_COPY_FROM_USER 1
37#define IS_COPY_FROM_USER_ZEROING 2
38#define IS_COPY_TO_USER -1
39
40 .section .text.memcpy_common, "ax"
41 .align 64
42
43/* Use this to preface each bundle that can cause an exception so
44 * the kernel can clean up properly. The special cleanup code should
45 * not use these, since it knows what it is doing.
46 */
47#define EX \
48 .pushsection __ex_table, "a"; \
49 .word 9f, memcpy_common_fixup; \
50 .popsection; \
51 9
52
53
54/* __copy_from_user_inatomic takes the kernel target address in r0,
55 * the user source in r1, and the bytes to copy in r2.
56 * It returns the number of uncopiable bytes (hopefully zero) in r0.
57 */
58ENTRY(__copy_from_user_inatomic)
59.type __copy_from_user_inatomic, @function
60 FEEDBACK_ENTER_EXPLICIT(__copy_from_user_inatomic, \
61 .text.memcpy_common, \
62 .Lend_memcpy_common - __copy_from_user_inatomic)
63 { movei r29, IS_COPY_FROM_USER; j memcpy_common }
64 .size __copy_from_user_inatomic, . - __copy_from_user_inatomic
65
66/* __copy_from_user_zeroing is like __copy_from_user_inatomic, but
67 * any uncopiable bytes are zeroed in the target.
68 */
69ENTRY(__copy_from_user_zeroing)
70.type __copy_from_user_zeroing, @function
71 FEEDBACK_REENTER(__copy_from_user_inatomic)
72 { movei r29, IS_COPY_FROM_USER_ZEROING; j memcpy_common }
73 .size __copy_from_user_zeroing, . - __copy_from_user_zeroing
74
75/* __copy_to_user_inatomic takes the user target address in r0,
76 * the kernel source in r1, and the bytes to copy in r2.
77 * It returns the number of uncopiable bytes (hopefully zero) in r0.
78 */
79ENTRY(__copy_to_user_inatomic)
80.type __copy_to_user_inatomic, @function
81 FEEDBACK_REENTER(__copy_from_user_inatomic)
82 { movei r29, IS_COPY_TO_USER; j memcpy_common }
83 .size __copy_to_user_inatomic, . - __copy_to_user_inatomic
84
85ENTRY(memcpy)
86.type memcpy, @function
87 FEEDBACK_REENTER(__copy_from_user_inatomic)
88 { movei r29, IS_MEMCPY }
89 .size memcpy, . - memcpy
90 /* Fall through */
91
92 .type memcpy_common, @function
93memcpy_common:
94 /* On entry, r29 holds one of the IS_* macro values from above. */
95
96
97 /* r0 is the dest, r1 is the source, r2 is the size. */
98
99 /* Save aside original dest so we can return it at the end. */
100 { sw sp, lr; move r23, r0; or r4, r0, r1 }
101
102 /* Check for an empty size. */
103 { bz r2, .Ldone; andi r4, r4, 3 }
104
105 /* Save aside original values in case of a fault. */
106 { move r24, r1; move r25, r2 }
107 move r27, lr
108
109 /* Check for an unaligned source or dest. */
110 { bnz r4, .Lcopy_unaligned_maybe_many; addli r4, r2, -256 }
111
112.Lcheck_aligned_copy_size:
113 /* If we are copying < 256 bytes, branch to simple case. */
114 { blzt r4, .Lcopy_8_check; slti_u r8, r2, 8 }
115
116 /* Copying >= 256 bytes, so jump to complex prefetching loop. */
117 { andi r6, r1, 63; j .Lcopy_many }
118
119/*
120 *
121 * Aligned 4 byte at a time copy loop
122 *
123 */
124
125.Lcopy_8_loop:
126 /* Copy two words at a time to hide load latency. */
127EX: { lw r3, r1; addi r1, r1, 4; slti_u r8, r2, 16 }
128EX: { lw r4, r1; addi r1, r1, 4 }
129EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
130EX: { sw r0, r4; addi r0, r0, 4; addi r2, r2, -4 }
131.Lcopy_8_check:
132 { bzt r8, .Lcopy_8_loop; slti_u r4, r2, 4 }
133
134 /* Copy odd leftover word, if any. */
135 { bnzt r4, .Lcheck_odd_stragglers }
136EX: { lw r3, r1; addi r1, r1, 4 }
137EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
138
139.Lcheck_odd_stragglers:
140 { bnz r2, .Lcopy_unaligned_few }
141
142.Ldone:
143 /* For memcpy return original dest address, else zero. */
144 { mz r0, r29, r23; jrp lr }
145
146
147/*
148 *
149 * Prefetching multiple cache line copy handler (for large transfers).
150 *
151 */
152
153 /* Copy words until r1 is cache-line-aligned. */
154.Lalign_loop:
155EX: { lw r3, r1; addi r1, r1, 4 }
156 { andi r6, r1, 63 }
157EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
158.Lcopy_many:
159 { bnzt r6, .Lalign_loop; addi r9, r0, 63 }
160
161 { addi r3, r1, 60; andi r9, r9, -64 }
162
163#ifdef MEMCPY_USE_WH64
164 /* No need to prefetch dst, we'll just do the wh64
165 * right before we copy a line.
166 */
167#endif
168
169EX: { lw r5, r3; addi r3, r3, 64; movei r4, 1 }
170 /* Intentionally stall for a few cycles to leave L2 cache alone. */
171 { bnzt zero, .; move r27, lr }
172EX: { lw r6, r3; addi r3, r3, 64 }
173 /* Intentionally stall for a few cycles to leave L2 cache alone. */
174 { bnzt zero, . }
175EX: { lw r7, r3; addi r3, r3, 64 }
176#ifndef MEMCPY_USE_WH64
177 /* Prefetch the dest */
178 /* Intentionally stall for a few cycles to leave L2 cache alone. */
179 { bnzt zero, . }
180 /* Use a real load to cause a TLB miss if necessary. We aren't using
181 * r28, so this should be fine.
182 */
183EX: { lw r28, r9; addi r9, r9, 64 }
184 /* Intentionally stall for a few cycles to leave L2 cache alone. */
185 { bnzt zero, . }
186 { prefetch r9; addi r9, r9, 64 }
187 /* Intentionally stall for a few cycles to leave L2 cache alone. */
188 { bnzt zero, . }
189 { prefetch r9; addi r9, r9, 64 }
190#endif
191 /* Intentionally stall for a few cycles to leave L2 cache alone. */
192 { bz zero, .Lbig_loop2 }
193
194 /* On entry to this loop:
195 * - r0 points to the start of dst line 0
196 * - r1 points to start of src line 0
197 * - r2 >= (256 - 60), only the first time the loop trips.
198 * - r3 contains r1 + 128 + 60 [pointer to end of source line 2]
199 * This is our prefetch address. When we get near the end
200 * rather than prefetching off the end this is changed to point
201 * to some "safe" recently loaded address.
202 * - r5 contains *(r1 + 60) [i.e. last word of source line 0]
203 * - r6 contains *(r1 + 64 + 60) [i.e. last word of source line 1]
204 * - r9 contains ((r0 + 63) & -64)
205 * [start of next dst cache line.]
206 */
207
208.Lbig_loop:
209 { jal .Lcopy_line2; add r15, r1, r2 }
210
211.Lbig_loop2:
212 /* Copy line 0, first stalling until r5 is ready. */
213EX: { move r12, r5; lw r16, r1 }
214 { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
215 /* Prefetch several lines ahead. */
216EX: { lw r5, r3; addi r3, r3, 64 }
217 { jal .Lcopy_line }
218
219 /* Copy line 1, first stalling until r6 is ready. */
220EX: { move r12, r6; lw r16, r1 }
221 { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
222 /* Prefetch several lines ahead. */
223EX: { lw r6, r3; addi r3, r3, 64 }
224 { jal .Lcopy_line }
225
226 /* Copy line 2, first stalling until r7 is ready. */
227EX: { move r12, r7; lw r16, r1 }
228 { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
229 /* Prefetch several lines ahead. */
230EX: { lw r7, r3; addi r3, r3, 64 }
231 /* Use up a caches-busy cycle by jumping back to the top of the
232 * loop. Might as well get it out of the way now.
233 */
234 { j .Lbig_loop }
235
236
237 /* On entry:
238 * - r0 points to the destination line.
239 * - r1 points to the source line.
240 * - r3 is the next prefetch address.
241 * - r9 holds the last address used for wh64.
242 * - r12 = WORD_15
243 * - r16 = WORD_0.
244 * - r17 == r1 + 16.
245 * - r27 holds saved lr to restore.
246 *
247 * On exit:
248 * - r0 is incremented by 64.
249 * - r1 is incremented by 64, unless that would point to a word
250 * beyond the end of the source array, in which case it is redirected
251 * to point to an arbitrary word already in the cache.
252 * - r2 is decremented by 64.
253 * - r3 is unchanged, unless it points to a word beyond the
254 * end of the source array, in which case it is redirected
255 * to point to an arbitrary word already in the cache.
256 * Redirecting is OK since if we are that close to the end
257 * of the array we will not come back to this subroutine
258 * and use the contents of the prefetched address.
259 * - r4 is nonzero iff r2 >= 64.
260 * - r9 is incremented by 64, unless it points beyond the
261 * end of the last full destination cache line, in which
262 * case it is redirected to a "safe address" that can be
263 * clobbered (sp - 64)
264 * - lr contains the value in r27.
265 */
266
267/* r26 unused */
268
269.Lcopy_line:
270 /* TODO: when r3 goes past the end, we would like to redirect it
271 * to prefetch the last partial cache line (if any) just once, for the
272 * benefit of the final cleanup loop. But we don't want to
273 * prefetch that line more than once, or subsequent prefetches
274 * will go into the RTF. But then .Lbig_loop should unconditionally
275 * branch to top of loop to execute final prefetch, and its
276 * nop should become a conditional branch.
277 */
278
279 /* We need two non-memory cycles here to cover the resources
280 * used by the loads initiated by the caller.
281 */
282 { add r15, r1, r2 }
283.Lcopy_line2:
284 { slt_u r13, r3, r15; addi r17, r1, 16 }
285
286 /* NOTE: this will stall for one cycle as L1 is busy. */
287
288 /* Fill second L1D line. */
289EX: { lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */
290
291#ifdef MEMCPY_TEST_WH64
292 /* Issue a fake wh64 that clobbers the destination words
293 * with random garbage, for testing.
294 */
295 { movei r19, 64; crc32_32 r10, r2, r9 }
296.Lwh64_test_loop:
297EX: { sw r9, r10; addi r9, r9, 4; addi r19, r19, -4 }
298 { bnzt r19, .Lwh64_test_loop; crc32_32 r10, r10, r19 }
299#elif CHIP_HAS_WH64()
300 /* Prepare destination line for writing. */
301EX: { wh64 r9; addi r9, r9, 64 }
302#else
303 /* Prefetch dest line */
304 { prefetch r9; addi r9, r9, 64 }
305#endif
306 /* Load seven words that are L1D hits to cover wh64 L2 usage. */
307
308 /* Load the three remaining words from the last L1D line, which
309 * we know has already filled the L1D.
310 */
311EX: { lw r4, r1; addi r1, r1, 4; addi r20, r1, 16 } /* r4 = WORD_12 */
312EX: { lw r8, r1; addi r1, r1, 4; slt_u r13, r20, r15 }/* r8 = WORD_13 */
313EX: { lw r11, r1; addi r1, r1, -52; mvz r20, r13, r1 } /* r11 = WORD_14 */
314
315 /* Load the three remaining words from the first L1D line, first
316 * stalling until it has filled by "looking at" r16.
317 */
318EX: { lw r13, r1; addi r1, r1, 4; move zero, r16 } /* r13 = WORD_1 */
319EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_2 */
320EX: { lw r15, r1; addi r1, r1, 8; addi r10, r0, 60 } /* r15 = WORD_3 */
321
322 /* Load second word from the second L1D line, first
323 * stalling until it has filled by "looking at" r17.
324 */
325EX: { lw r19, r1; addi r1, r1, 4; move zero, r17 } /* r19 = WORD_5 */
326
327 /* Store last word to the destination line, potentially dirtying it
328 * for the first time, which keeps the L2 busy for two cycles.
329 */
330EX: { sw r10, r12 } /* store(WORD_15) */
331
332 /* Use two L1D hits to cover the sw L2 access above. */
333EX: { lw r10, r1; addi r1, r1, 4 } /* r10 = WORD_6 */
334EX: { lw r12, r1; addi r1, r1, 4 } /* r12 = WORD_7 */
335
336 /* Fill third L1D line. */
337EX: { lw r18, r1; addi r1, r1, 4 } /* r18 = WORD_8 */
338
339 /* Store first L1D line. */
340EX: { sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */
341EX: { sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */
342EX: { sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */
343#ifdef MEMCPY_USE_WH64
344EX: { sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */
345#else
346 /* Back up the r9 to a cache line we are already storing to
347 * if it gets past the end of the dest vector. Strictly speaking,
348 * we don't need to back up to the start of a cache line, but it's free
349 * and tidy, so why not?
350 */
351EX: { sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */
352#endif
353 /* Store second L1D line. */
354EX: { sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */
355EX: { sw r0, r19; addi r0, r0, 4 } /* store(WORD_5) */
356EX: { sw r0, r10; addi r0, r0, 4 } /* store(WORD_6) */
357EX: { sw r0, r12; addi r0, r0, 4 } /* store(WORD_7) */
358
359EX: { lw r13, r1; addi r1, r1, 4; move zero, r18 } /* r13 = WORD_9 */
360EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_10 */
361EX: { lw r15, r1; move r1, r20 } /* r15 = WORD_11 */
362
363 /* Store third L1D line. */
364EX: { sw r0, r18; addi r0, r0, 4 } /* store(WORD_8) */
365EX: { sw r0, r13; addi r0, r0, 4 } /* store(WORD_9) */
366EX: { sw r0, r14; addi r0, r0, 4 } /* store(WORD_10) */
367EX: { sw r0, r15; addi r0, r0, 4 } /* store(WORD_11) */
368
369 /* Store rest of fourth L1D line. */
370EX: { sw r0, r4; addi r0, r0, 4 } /* store(WORD_12) */
371 {
372EX: sw r0, r8 /* store(WORD_13) */
373 addi r0, r0, 4
374 /* Will r2 be > 64 after we subtract 64 below? */
375 shri r4, r2, 7
376 }
377 {
378EX: sw r0, r11 /* store(WORD_14) */
379 addi r0, r0, 8
380 /* Record 64 bytes successfully copied. */
381 addi r2, r2, -64
382 }
383
384 { jrp lr; move lr, r27 }
385
386 /* Convey to the backtrace library that the stack frame is size
387 * zero, and the real return address is on the stack rather than
388 * in 'lr'.
389 */
390 { info 8 }
391
392 .align 64
393.Lcopy_unaligned_maybe_many:
394 /* Skip the setup overhead if we aren't copying many bytes. */
395 { slti_u r8, r2, 20; sub r4, zero, r0 }
396 { bnzt r8, .Lcopy_unaligned_few; andi r4, r4, 3 }
397 { bz r4, .Ldest_is_word_aligned; add r18, r1, r2 }
398
399/*
400 *
401 * unaligned 4 byte at a time copy handler.
402 *
403 */
404
405 /* Copy single bytes until r0 == 0 mod 4, so we can store words. */
406.Lalign_dest_loop:
407EX: { lb_u r3, r1; addi r1, r1, 1; addi r4, r4, -1 }
408EX: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
409 { bnzt r4, .Lalign_dest_loop; andi r3, r1, 3 }
410
411 /* If source and dest are now *both* aligned, do an aligned copy. */
412 { bz r3, .Lcheck_aligned_copy_size; addli r4, r2, -256 }
413
414.Ldest_is_word_aligned:
415
416#if CHIP_HAS_DWORD_ALIGN()
417EX: { andi r8, r0, 63; lwadd_na r6, r1, 4}
418 { slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned }
419
420 /* This copies unaligned words until either there are fewer
421 * than 4 bytes left to copy, or until the destination pointer
422 * is cache-aligned, whichever comes first.
423 *
424 * On entry:
425 * - r0 is the next store address.
426 * - r1 points 4 bytes past the load address corresponding to r0.
427 * - r2 >= 4
428 * - r6 is the next aligned word loaded.
429 */
430.Lcopy_unaligned_src_words:
431EX: { lwadd_na r7, r1, 4; slti_u r8, r2, 4 + 4 }
432 /* stall */
433 { dword_align r6, r7, r1; slti_u r9, r2, 64 + 4 }
434EX: { swadd r0, r6, 4; addi r2, r2, -4 }
435 { bnz r8, .Lcleanup_unaligned_words; andi r8, r0, 63 }
436 { bnzt r8, .Lcopy_unaligned_src_words; move r6, r7 }
437
438 /* On entry:
439 * - r0 is the next store address.
440 * - r1 points 4 bytes past the load address corresponding to r0.
441 * - r2 >= 4 (# of bytes left to store).
442 * - r6 is the next aligned src word value.
443 * - r9 = (r2 < 64U).
444 * - r18 points one byte past the end of source memory.
445 */
446.Ldest_is_L2_line_aligned:
447
448 {
449 /* Not a full cache line remains. */
450 bnz r9, .Lcleanup_unaligned_words
451 move r7, r6
452 }
453
454 /* r2 >= 64 */
455
456 /* Kick off two prefetches, but don't go past the end. */
457 { addi r3, r1, 63 - 4; addi r8, r1, 64 + 63 - 4 }
458 { prefetch r3; move r3, r8; slt_u r8, r8, r18 }
459 { mvz r3, r8, r1; addi r8, r3, 64 }
460 { prefetch r3; move r3, r8; slt_u r8, r8, r18 }
461 { mvz r3, r8, r1; movei r17, 0 }
462
463.Lcopy_unaligned_line:
464 /* Prefetch another line. */
465 { prefetch r3; addi r15, r1, 60; addi r3, r3, 64 }
466 /* Fire off a load of the last word we are about to copy. */
467EX: { lw_na r15, r15; slt_u r8, r3, r18 }
468
469EX: { mvz r3, r8, r1; wh64 r0 }
470
471 /* This loop runs twice.
472 *
473 * On entry:
474 * - r17 is even before the first iteration, and odd before
475 * the second. It is incremented inside the loop. Encountering
476 * an even value at the end of the loop makes it stop.
477 */
478.Lcopy_half_an_unaligned_line:
479EX: {
480 /* Stall until the last byte is ready. In the steady state this
481 * guarantees all words to load below will be in the L2 cache, which
482 * avoids shunting the loads to the RTF.
483 */
484 move zero, r15
485 lwadd_na r7, r1, 16
486 }
487EX: { lwadd_na r11, r1, 12 }
488EX: { lwadd_na r14, r1, -24 }
489EX: { lwadd_na r8, r1, 4 }
490EX: { lwadd_na r9, r1, 4 }
491EX: {
492 lwadd_na r10, r1, 8
493 /* r16 = (r2 < 64), after we subtract 32 from r2 below. */
494 slti_u r16, r2, 64 + 32
495 }
496EX: { lwadd_na r12, r1, 4; addi r17, r17, 1 }
497EX: { lwadd_na r13, r1, 8; dword_align r6, r7, r1 }
498EX: { swadd r0, r6, 4; dword_align r7, r8, r1 }
499EX: { swadd r0, r7, 4; dword_align r8, r9, r1 }
500EX: { swadd r0, r8, 4; dword_align r9, r10, r1 }
501EX: { swadd r0, r9, 4; dword_align r10, r11, r1 }
502EX: { swadd r0, r10, 4; dword_align r11, r12, r1 }
503EX: { swadd r0, r11, 4; dword_align r12, r13, r1 }
504EX: { swadd r0, r12, 4; dword_align r13, r14, r1 }
505EX: { swadd r0, r13, 4; addi r2, r2, -32 }
506 { move r6, r14; bbst r17, .Lcopy_half_an_unaligned_line }
507
508 { bzt r16, .Lcopy_unaligned_line; move r7, r6 }
509
510 /* On entry:
511 * - r0 is the next store address.
512 * - r1 points 4 bytes past the load address corresponding to r0.
513 * - r2 >= 0 (# of bytes left to store).
514 * - r7 is the next aligned src word value.
515 */
516.Lcleanup_unaligned_words:
517 /* Handle any trailing bytes. */
518 { bz r2, .Lcopy_unaligned_done; slti_u r8, r2, 4 }
519 { bzt r8, .Lcopy_unaligned_src_words; move r6, r7 }
520
521 /* Move r1 back to the point where it corresponds to r0. */
522 { addi r1, r1, -4 }
523
524#else /* !CHIP_HAS_DWORD_ALIGN() */
525
526 /* Compute right/left shift counts and load initial source words. */
527 { andi r5, r1, -4; andi r3, r1, 3 }
528EX: { lw r6, r5; addi r5, r5, 4; shli r3, r3, 3 }
529EX: { lw r7, r5; addi r5, r5, 4; sub r4, zero, r3 }
530
531 /* Load and store one word at a time, using shifts and ORs
532 * to correct for the misaligned src.
533 */
534.Lcopy_unaligned_src_loop:
535 { shr r6, r6, r3; shl r8, r7, r4 }
536EX: { lw r7, r5; or r8, r8, r6; move r6, r7 }
537EX: { sw r0, r8; addi r0, r0, 4; addi r2, r2, -4 }
538 { addi r5, r5, 4; slti_u r8, r2, 8 }
539 { bzt r8, .Lcopy_unaligned_src_loop; addi r1, r1, 4 }
540
541 { bz r2, .Lcopy_unaligned_done }
542#endif /* !CHIP_HAS_DWORD_ALIGN() */
543
544 /* Fall through */
545
546/*
547 *
548 * 1 byte at a time copy handler.
549 *
550 */
551
552.Lcopy_unaligned_few:
553EX: { lb_u r3, r1; addi r1, r1, 1 }
554EX: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
555 { bnzt r2, .Lcopy_unaligned_few }
556
557.Lcopy_unaligned_done:
558
559 /* For memcpy return original dest address, else zero. */
560 { mz r0, r29, r23; jrp lr }
561
562.Lend_memcpy_common:
563 .size memcpy_common, .Lend_memcpy_common - memcpy_common
564
565 .section .fixup,"ax"
566memcpy_common_fixup:
567 .type memcpy_common_fixup, @function
568
569 /* Skip any bytes we already successfully copied.
570 * r2 (num remaining) is correct, but r0 (dst) and r1 (src)
571 * may not be quite right because of unrolling and prefetching.
572 * So we need to recompute their values as the address just
573 * after the last byte we are sure was successfully loaded and
574 * then stored.
575 */
576
577 /* Determine how many bytes we successfully copied. */
578 { sub r3, r25, r2 }
579
580 /* Add this to the original r0 and r1 to get their new values. */
581 { add r0, r23, r3; add r1, r24, r3 }
582
583 { bzt r29, memcpy_fixup_loop }
584 { blzt r29, copy_to_user_fixup_loop }
585
586copy_from_user_fixup_loop:
587 /* Try copying the rest one byte at a time, expecting a load fault. */
588.Lcfu: { lb_u r3, r1; addi r1, r1, 1 }
589 { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
590 { bnzt r2, copy_from_user_fixup_loop }
591
592.Lcopy_from_user_fixup_zero_remainder:
593 { bbs r29, 2f } /* low bit set means IS_COPY_FROM_USER */
594 /* byte-at-a-time loop faulted, so zero the rest. */
595 { move r3, r2; bz r2, 2f /* should be impossible, but handle it. */ }
5961: { sb r0, zero; addi r0, r0, 1; addi r3, r3, -1 }
597 { bnzt r3, 1b }
5982: move lr, r27
599 { move r0, r2; jrp lr }
600
601copy_to_user_fixup_loop:
602 /* Try copying the rest one byte at a time, expecting a store fault. */
603 { lb_u r3, r1; addi r1, r1, 1 }
604.Lctu: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
605 { bnzt r2, copy_to_user_fixup_loop }
606.Lcopy_to_user_fixup_done:
607 move lr, r27
608 { move r0, r2; jrp lr }
609
610memcpy_fixup_loop:
611 /* Try copying the rest one byte at a time. We expect a disastrous
612 * fault to happen since we are in fixup code, but let it happen.
613 */
614 { lb_u r3, r1; addi r1, r1, 1 }
615 { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
616 { bnzt r2, memcpy_fixup_loop }
617 /* This should be unreachable, we should have faulted again.
618 * But be paranoid and handle it in case some interrupt changed
619 * the TLB or something.
620 */
621 move lr, r27
622 { move r0, r23; jrp lr }
623
624 .size memcpy_common_fixup, . - memcpy_common_fixup
625
626 .section __ex_table,"a"
627 .word .Lcfu, .Lcopy_from_user_fixup_zero_remainder
628 .word .Lctu, .Lcopy_to_user_fixup_done
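To make the fixup arithmetic concrete, here is a minimal C sketch of the
recomputation (hypothetical variable names; per the code above, r23/r24
hold the original dest/src, r25 the original length, and r2 the bytes
still uncopied):

	unsigned long copied = orig_len - remaining;  /* r3 = r25 - r2 */
	dst = orig_dst + copied;                      /* r0 = r23 + r3 */
	src = orig_src + copied;                      /* r1 = r24 + r3 */

The byte-at-a-time loops then retry from the first byte not yet known to
have been both loaded and stored.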
diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c
new file mode 100644
index 000000000000..4f0047342469
--- /dev/null
+++ b/arch/tile/lib/memcpy_tile64.c
@@ -0,0 +1,271 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/string.h>
16#include <linux/smp.h>
17#include <linux/module.h>
18#include <linux/uaccess.h>
19#include <asm/fixmap.h>
20#include <asm/kmap_types.h>
21#include <asm/tlbflush.h>
22#include <hv/hypervisor.h>
23#include <arch/chip.h>
24
25
26#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
27
28/* Defined in memcpy.S */
29extern unsigned long __memcpy_asm(void *to, const void *from, unsigned long n);
30extern unsigned long __copy_to_user_inatomic_asm(
31 void __user *to, const void *from, unsigned long n);
32extern unsigned long __copy_from_user_inatomic_asm(
33 void *to, const void __user *from, unsigned long n);
34extern unsigned long __copy_from_user_zeroing_asm(
35 void *to, const void __user *from, unsigned long n);
36
37typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long);
38
39/* Size above which to consider TLB games for performance */
40#define LARGE_COPY_CUTOFF 2048
41
42/* Communicate to the simulator what we are trying to do. */
43#define sim_allow_multiple_caching(b) \
44 __insn_mtspr(SPR_SIM_CONTROL, \
45 SIM_CONTROL_ALLOW_MULTIPLE_CACHING | ((b) << _SIM_CONTROL_OPERATOR_BITS))
46
47/*
48 * Copy memory by briefly enabling incoherent cacheline-at-a-time mode.
49 *
50 * We set up our own source and destination PTEs that we fully control.
51 * This is the only way to guarantee that we don't race with another
52 * thread that is modifying the PTE; we can't afford to try the
53 * copy_{to,from}_user() technique of catching the interrupt, since
54 * we must run with interrupts disabled to avoid the risk of some
55 * other code seeing the incoherent data in our cache. (Recall that
56 * our cache is indexed by PA, so even if the other code doesn't use
57 * our KM_MEMCPY virtual addresses, they'll still hit in cache using
58 * the normal VAs that aren't supposed to hit in cache.)
59 */
60static void memcpy_multicache(void *dest, const void *source,
61 pte_t dst_pte, pte_t src_pte, int len)
62{
 63	int idx;
 64	unsigned long flags, newsrc, newdst;
65 pmd_t *pmdp;
66 pte_t *ptep;
67 int cpu = get_cpu();
68
69 /*
70 * Disable interrupts so that we don't recurse into memcpy()
71 * in an interrupt handler, nor accidentally reference
72 * the PA of the source from an interrupt routine. Also
73 * notify the simulator that we're playing games so we don't
74 * generate spurious coherency warnings.
75 */
76 local_irq_save(flags);
77 sim_allow_multiple_caching(1);
78
79 /* Set up the new dest mapping */
80 idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + KM_MEMCPY0;
81 newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1));
82 pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst);
83 ptep = pte_offset_kernel(pmdp, newdst);
84 if (pte_val(*ptep) != pte_val(dst_pte)) {
85 set_pte(ptep, dst_pte);
86 local_flush_tlb_page(NULL, newdst, PAGE_SIZE);
87 }
88
89 /* Set up the new source mapping */
90 idx += (KM_MEMCPY0 - KM_MEMCPY1);
91 src_pte = hv_pte_set_nc(src_pte);
92 src_pte = hv_pte_clear_writable(src_pte); /* be paranoid */
93 newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
94 pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc);
95 ptep = pte_offset_kernel(pmdp, newsrc);
96 *ptep = src_pte; /* set_pte() would be confused by this */
97 local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
98
99 /* Actually move the data. */
100 __memcpy_asm((void *)newdst, (const void *)newsrc, len);
101
102 /*
103 * Remap the source as locally-cached and not OLOC'ed so that
104 * we can inval without also invaling the remote cpu's cache.
105 * This also avoids known errata with inv'ing cacheable oloc data.
106 */
107 src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3);
108 src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */
109 *ptep = src_pte; /* set_pte() would be confused by this */
110 local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
111
112 /*
113 * Do the actual invalidation, covering the full L2 cache line
114 * at the end since __memcpy_asm() is somewhat aggressive.
115 */
116 __inv_buffer((void *)newsrc, len);
117
118 /*
119 * We're done: notify the simulator that all is back to normal,
120 * and re-enable interrupts and pre-emption.
121 */
122 sim_allow_multiple_caching(0);
123 local_irq_restore(flags);
124 put_cpu_no_resched();
125}
126
127/*
128 * Identify large copies from remotely-cached memory, and copy them
129 * via memcpy_multicache() if they look good, otherwise fall back
130 * to the particular kind of copying passed as the memcpy_t function.
131 */
132static unsigned long fast_copy(void *dest, const void *source, int len,
133 memcpy_t func)
134{
135 /*
136 * Check if it's big enough to bother with. We may end up doing a
137 * small copy via TLB manipulation if we're near a page boundary,
138 * but presumably we'll make it up when we hit the second page.
139 */
140 while (len >= LARGE_COPY_CUTOFF) {
141 int copy_size, bytes_left_on_page;
142 pte_t *src_ptep, *dst_ptep;
143 pte_t src_pte, dst_pte;
144 struct page *src_page, *dst_page;
145
146 /* Is the source page oloc'ed to a remote cpu? */
147retry_source:
148 src_ptep = virt_to_pte(current->mm, (unsigned long)source);
149 if (src_ptep == NULL)
150 break;
151 src_pte = *src_ptep;
152 if (!hv_pte_get_present(src_pte) ||
153 !hv_pte_get_readable(src_pte) ||
154 hv_pte_get_mode(src_pte) != HV_PTE_MODE_CACHE_TILE_L3)
155 break;
156 if (get_remote_cache_cpu(src_pte) == smp_processor_id())
157 break;
158 src_page = pfn_to_page(hv_pte_get_pfn(src_pte));
159 get_page(src_page);
160 if (pte_val(src_pte) != pte_val(*src_ptep)) {
161 put_page(src_page);
162 goto retry_source;
163 }
164 if (pte_huge(src_pte)) {
165 /* Adjust the PTE to correspond to a small page */
166 int pfn = hv_pte_get_pfn(src_pte);
167 pfn += (((unsigned long)source & (HPAGE_SIZE-1))
168 >> PAGE_SHIFT);
169 src_pte = pfn_pte(pfn, src_pte);
170 src_pte = pte_mksmall(src_pte);
171 }
172
173 /* Is the destination page writable? */
174retry_dest:
175 dst_ptep = virt_to_pte(current->mm, (unsigned long)dest);
176 if (dst_ptep == NULL) {
177 put_page(src_page);
178 break;
179 }
180 dst_pte = *dst_ptep;
181 if (!hv_pte_get_present(dst_pte) ||
182 !hv_pte_get_writable(dst_pte)) {
183 put_page(src_page);
184 break;
185 }
186 dst_page = pfn_to_page(hv_pte_get_pfn(dst_pte));
187 if (dst_page == src_page) {
188 /*
189 * Source and dest are on the same page; this
190 * potentially exposes us to incoherence if any
191 * part of src and dest overlap on a cache line.
192 * Just give up rather than trying to be precise.
193 */
194 put_page(src_page);
195 break;
196 }
197 get_page(dst_page);
198 if (pte_val(dst_pte) != pte_val(*dst_ptep)) {
199 put_page(dst_page);
200 goto retry_dest;
201 }
202 if (pte_huge(dst_pte)) {
203 /* Adjust the PTE to correspond to a small page */
204 int pfn = hv_pte_get_pfn(dst_pte);
205 pfn += (((unsigned long)dest & (HPAGE_SIZE-1))
206 >> PAGE_SHIFT);
207 dst_pte = pfn_pte(pfn, dst_pte);
208 dst_pte = pte_mksmall(dst_pte);
209 }
210
211 /* All looks good: create a cachable PTE and copy from it */
212 copy_size = len;
213 bytes_left_on_page =
214 PAGE_SIZE - (((int)source) & (PAGE_SIZE-1));
215 if (copy_size > bytes_left_on_page)
216 copy_size = bytes_left_on_page;
217 bytes_left_on_page =
218 PAGE_SIZE - (((int)dest) & (PAGE_SIZE-1));
219 if (copy_size > bytes_left_on_page)
220 copy_size = bytes_left_on_page;
221 memcpy_multicache(dest, source, dst_pte, src_pte, copy_size);
222
223 /* Release the pages */
224 put_page(dst_page);
225 put_page(src_page);
226
227 /* Continue on the next page */
228 dest += copy_size;
229 source += copy_size;
230 len -= copy_size;
231 }
232
233 return func(dest, source, len);
234}
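/*
 * Worked example of the clamping above: with len == 8192, the source at
 * offset 0xf80 in its (4KB) page and the dest at offset 0x40 in its page,
 * copy_size becomes min(8192, 0x80, 0xfc0) == 128, so this pass copies
 * only up to the source's page boundary and the next iteration starts
 * there.
 */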
235
236void *memcpy(void *to, const void *from, __kernel_size_t n)
237{
238 if (n < LARGE_COPY_CUTOFF)
239 return (void *)__memcpy_asm(to, from, n);
240 else
241 return (void *)fast_copy(to, from, n, __memcpy_asm);
242}
243
244unsigned long __copy_to_user_inatomic(void __user *to, const void *from,
245 unsigned long n)
246{
247 if (n < LARGE_COPY_CUTOFF)
248 return __copy_to_user_inatomic_asm(to, from, n);
249 else
250 return fast_copy(to, from, n, __copy_to_user_inatomic_asm);
251}
252
253unsigned long __copy_from_user_inatomic(void *to, const void __user *from,
254 unsigned long n)
255{
256 if (n < LARGE_COPY_CUTOFF)
257 return __copy_from_user_inatomic_asm(to, from, n);
258 else
259 return fast_copy(to, from, n, __copy_from_user_inatomic_asm);
260}
261
262unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
263 unsigned long n)
264{
265 if (n < LARGE_COPY_CUTOFF)
266 return __copy_from_user_zeroing_asm(to, from, n);
267 else
268 return fast_copy(to, from, n, __copy_from_user_zeroing_asm);
269}
270
271#endif /* !CHIP_HAS_COHERENT_LOCAL_CACHE() */
diff --git a/arch/tile/lib/memmove_32.c b/arch/tile/lib/memmove_32.c
new file mode 100644
index 000000000000..f09d8c4523ec
--- /dev/null
+++ b/arch/tile/lib/memmove_32.c
@@ -0,0 +1,63 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/types.h>
16#include <linux/string.h>
17#include <linux/module.h>
18
19void *memmove(void *dest, const void *src, size_t n)
20{
21 if ((const char *)src >= (char *)dest + n
22 || (char *)dest >= (const char *)src + n) {
23 /* We found no overlap, so let memcpy do all the heavy
24 * lifting (prefetching, etc.)
25 */
26 return memcpy(dest, src, n);
27 }
28
29 if (n != 0) {
30 const uint8_t *in;
31 uint8_t x;
32 uint8_t *out;
33 int stride;
34
35 if (src < dest) {
36 /* copy backwards */
37 in = (const uint8_t *)src + n - 1;
38 out = (uint8_t *)dest + n - 1;
39 stride = -1;
40 } else {
41 /* copy forwards */
42 in = (const uint8_t *)src;
43 out = (uint8_t *)dest;
44 stride = 1;
45 }
46
47 /* Manually software-pipeline this loop. */
48 x = *in;
49 in += stride;
50
51 while (--n != 0) {
52 *out = x;
53 out += stride;
54 x = *in;
55 in += stride;
56 }
57
58 *out = x;
59 }
60
61 return dest;
62}
63EXPORT_SYMBOL(memmove);
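A brief, hypothetical sanity check of the overlap handling (not part of
the patch), exercising both copy directions:

	char buf[8] = "abcdef";
	memmove(buf + 1, buf, 6);  /* src < dest: copies backwards; "aabcdef" */
	memmove(buf, buf + 1, 7);  /* dest < src: copies forwards; "abcdef" */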
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
new file mode 100644
index 000000000000..8593bc82398a
--- /dev/null
+++ b/arch/tile/lib/memset_32.c
@@ -0,0 +1,274 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <arch/chip.h>
16
17#include <linux/types.h>
18#include <linux/string.h>
19#include <linux/module.h>
20
21
22void *memset(void *s, int c, size_t n)
23{
24 uint32_t *out32;
25 int n32;
26 uint32_t v16, v32;
27 uint8_t *out8 = s;
28#if !CHIP_HAS_WH64()
29 int ahead32;
30#else
31 int to_align32;
32#endif
33
34 /* Experimentation shows that a trivial tight loop is a win up until
35 * around a size of 20, where writing a word at a time starts to win.
36 */
37#define BYTE_CUTOFF 20
38
39#if BYTE_CUTOFF < 3
 40	/* This must be at least this big, or some code later
41 * on doesn't work.
42 */
43#error "BYTE_CUTOFF is too small"
44#endif
45
46 if (n < BYTE_CUTOFF) {
47 /* Strangely, this turns out to be the tightest way to
48 * write this loop.
49 */
50 if (n != 0) {
51 do {
52 /* Strangely, combining these into one line
53 * performs worse.
54 */
55 *out8 = c;
56 out8++;
57 } while (--n != 0);
58 }
59
60 return s;
61 }
62
63#if !CHIP_HAS_WH64()
64 /* Use a spare issue slot to start prefetching the first cache
65 * line early. This instruction is free as the store can be buried
66 * in otherwise idle issue slots doing ALU ops.
67 */
68 __insn_prefetch(out8);
69
70 /* We prefetch the end so that a short memset that spans two cache
71 * lines gets some prefetching benefit. Again we believe this is free
72 * to issue.
73 */
74 __insn_prefetch(&out8[n - 1]);
75#endif /* !CHIP_HAS_WH64() */
76
77
78 /* Align 'out8'. We know n >= 3 so this won't write past the end. */
79 while (((uintptr_t) out8 & 3) != 0) {
80 *out8++ = c;
81 --n;
82 }
83
84 /* Align 'n'. */
85 while (n & 3)
86 out8[--n] = c;
87
88 out32 = (uint32_t *) out8;
89 n32 = n >> 2;
90
91 /* Tile input byte out to 32 bits. */
92 v16 = __insn_intlb(c, c);
93 v32 = __insn_intlh(v16, v16);
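	/* Portably this is v32 == 0x01010101u * (uint8_t)c, the same
	 * byte replication strchr_32.c builds with a multiply; the
	 * intlb/intlh pair just interleaves the byte into all four lanes.
	 */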
94
95 /* This must be at least 8 or the following loop doesn't work. */
96#define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
97
98#if !CHIP_HAS_WH64()
99
100 ahead32 = CACHE_LINE_SIZE_IN_WORDS;
101
102 /* We already prefetched the first and last cache lines, so
103 * we only need to do more prefetching if we are storing
104 * to more than two cache lines.
105 */
106 if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
107 int i;
108
109 /* Prefetch the next several cache lines.
110 * This is the setup code for the software-pipelined
111 * loop below.
112 */
113#define MAX_PREFETCH 5
114 ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
115 if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
116 ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
117
118 for (i = CACHE_LINE_SIZE_IN_WORDS;
119 i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
120 __insn_prefetch(&out32[i]);
121 }
122
123 if (n32 > ahead32) {
124 while (1) {
125 int j;
126
127 /* Prefetch by reading one word several cache lines
128 * ahead. Since loads are non-blocking this will
129 * cause the full cache line to be read while we are
130 * finishing earlier cache lines. Using a store
131 * here causes microarchitectural performance
132 * problems where a victimizing store miss goes to
133 * the head of the retry FIFO and locks the pipe for
134 * a few cycles. So a few subsequent stores in this
135 * loop go into the retry FIFO, and then later
136 * stores see other stores to the same cache line
137 * are already in the retry FIFO and themselves go
138 * into the retry FIFO, filling it up and grinding
139 * to a halt waiting for the original miss to be
140 * satisfied.
141 */
142 __insn_prefetch(&out32[ahead32]);
143
144#if 1
145#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
146#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
147#endif
148
149 n32 -= CACHE_LINE_SIZE_IN_WORDS;
150
151 /* Save icache space by only partially unrolling
152 * this loop.
153 */
154 for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
155 *out32++ = v32;
156 *out32++ = v32;
157 *out32++ = v32;
158 *out32++ = v32;
159 }
160#else
161 /* Unfortunately, due to a code generator flaw this
162 * allocates a separate register for each of these
163 * stores, which requires a large number of spills,
164 * which makes this procedure enormously bigger
 165	 * (something like 70%).
166 */
167 *out32++ = v32;
168 *out32++ = v32;
169 *out32++ = v32;
170 *out32++ = v32;
171 *out32++ = v32;
172 *out32++ = v32;
173 *out32++ = v32;
174 *out32++ = v32;
175 *out32++ = v32;
176 *out32++ = v32;
177 *out32++ = v32;
178 *out32++ = v32;
179 *out32++ = v32;
180 *out32++ = v32;
181 *out32++ = v32;
182 n32 -= 16;
183#endif
184
185 /* To save compiled code size, reuse this loop even
186 * when we run out of prefetching to do by dropping
187 * ahead32 down.
188 */
189 if (n32 <= ahead32) {
190 /* Not even a full cache line left,
191 * so stop now.
192 */
193 if (n32 < CACHE_LINE_SIZE_IN_WORDS)
194 break;
195
196 /* Choose a small enough value that we don't
197 * prefetch past the end. There's no sense
198 * in touching cache lines we don't have to.
199 */
200 ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
201 }
202 }
203 }
204
205#else /* CHIP_HAS_WH64() */
206
207 /* Determine how many words we need to emit before the 'out32'
208 * pointer becomes aligned modulo the cache line size.
209 */
210 to_align32 =
211 (-((uintptr_t)out32 >> 2)) & (CACHE_LINE_SIZE_IN_WORDS - 1);
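	/* Worked example: with 64-byte lines (16 words) and out32 at byte
	 * offset 24 in its line, out32 >> 2 is 6 mod 16, so to_align32 is
	 * (-6) & 15 == 10 words, exactly the 40 bytes to the next boundary.
	 */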
212
213 /* Only bother aligning and using wh64 if there is at least
214 * one full cache line to process. This check also prevents
215 * overrunning the end of the buffer with alignment words.
216 */
217 if (to_align32 <= n32 - CACHE_LINE_SIZE_IN_WORDS) {
218 int lines_left;
219
220 /* Align out32 mod the cache line size so we can use wh64. */
221 n32 -= to_align32;
222 for (; to_align32 != 0; to_align32--) {
223 *out32 = v32;
224 out32++;
225 }
226
227 /* Use unsigned divide to turn this into a right shift. */
228 lines_left = (unsigned)n32 / CACHE_LINE_SIZE_IN_WORDS;
229
230 do {
231 /* Only wh64 a few lines at a time, so we don't
232 * exceed the maximum number of victim lines.
233 */
234 int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS())
235 ? lines_left
236 : CHIP_MAX_OUTSTANDING_VICTIMS());
237 uint32_t *wh = out32;
238 int i = x;
239 int j;
240
241 lines_left -= x;
242
243 do {
244 __insn_wh64(wh);
245 wh += CACHE_LINE_SIZE_IN_WORDS;
246 } while (--i);
247
248 for (j = x * (CACHE_LINE_SIZE_IN_WORDS / 4); j != 0; j--) {
249 *out32++ = v32;
250 *out32++ = v32;
251 *out32++ = v32;
252 *out32++ = v32;
253 }
254 } while (lines_left != 0);
255
256 /* We processed all full lines above, so only this many
257 * words remain to be processed.
258 */
259 n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
260 }
261
262#endif /* CHIP_HAS_WH64() */
263
264 /* Now handle any leftover values. */
265 if (n32 != 0) {
266 do {
267 *out32 = v32;
268 out32++;
269 } while (--n32 != 0);
270 }
271
272 return s;
273}
274EXPORT_SYMBOL(memset);
diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c
new file mode 100644
index 000000000000..485e24d62c6b
--- /dev/null
+++ b/arch/tile/lib/spinlock_32.c
@@ -0,0 +1,221 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/spinlock.h>
16#include <linux/module.h>
17#include <asm/processor.h>
18
19#include "spinlock_common.h"
20
21void arch_spin_lock(arch_spinlock_t *lock)
22{
23 int my_ticket;
24 int iterations = 0;
25 int delta;
26
27 while ((my_ticket = __insn_tns((void *)&lock->next_ticket)) & 1)
28 delay_backoff(iterations++);
29
30 /* Increment the next ticket number, implicitly releasing tns lock. */
31 lock->next_ticket = my_ticket + TICKET_QUANTUM;
32
33 /* Wait until it's our turn. */
34 while ((delta = my_ticket - lock->current_ticket) != 0)
35 relax((128 / CYCLES_PER_RELAX_LOOP) * delta);
36}
37EXPORT_SYMBOL(arch_spin_lock);
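/*
 * Worked example, assuming TICKET_QUANTUM == 2 (tickets stay even, so a
 * stored ticket never looks like the odd tns-in-progress marker): with
 * current_ticket == next_ticket == 4, a first locker tns's 4, stores 6,
 * sees delta == 0 and proceeds; a second locker tns's 6, stores 8, and
 * spins until the unlock path advances current_ticket to 6.
 */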
38
39int arch_spin_trylock(arch_spinlock_t *lock)
40{
41 /*
42 * Grab a ticket; no need to retry if it's busy, we'll just
43 * treat that the same as "locked", since someone else
44 * will lock it momentarily anyway.
45 */
46 int my_ticket = __insn_tns((void *)&lock->next_ticket);
47
48 if (my_ticket == lock->current_ticket) {
49 /* Not currently locked, so lock it by keeping this ticket. */
50 lock->next_ticket = my_ticket + TICKET_QUANTUM;
51 /* Success! */
52 return 1;
53 }
54
55 if (!(my_ticket & 1)) {
56 /* Release next_ticket. */
57 lock->next_ticket = my_ticket;
58 }
59
60 return 0;
61}
62EXPORT_SYMBOL(arch_spin_trylock);
63
64void arch_spin_unlock_wait(arch_spinlock_t *lock)
65{
66 u32 iterations = 0;
67 while (arch_spin_is_locked(lock))
68 delay_backoff(iterations++);
69}
70EXPORT_SYMBOL(arch_spin_unlock_wait);
71
72/*
73 * The low byte is always reserved to be the marker for a "tns" operation
74 * since the low bit is set to "1" by a tns. The next seven bits are
75 * zeroes. The next byte holds the "next" writer value, i.e. the ticket
76 * available for the next task that wants to write. The third byte holds
77 * the current writer value, i.e. the writer who holds the current ticket.
78 * If current == next == 0, there are no interested writers.
79 */
80#define WR_NEXT_SHIFT _WR_NEXT_SHIFT
81#define WR_CURR_SHIFT _WR_CURR_SHIFT
82#define WR_WIDTH _WR_WIDTH
83#define WR_MASK ((1 << WR_WIDTH) - 1)
84
85/*
86 * The last eight bits hold the active reader count. This has to be
87 * zero before a writer can start to write.
88 */
89#define RD_COUNT_SHIFT _RD_COUNT_SHIFT
90#define RD_COUNT_WIDTH _RD_COUNT_WIDTH
91#define RD_COUNT_MASK ((1 << RD_COUNT_WIDTH) - 1)
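/*
 * Pictorially, assuming the arch header sets the shifts to 8/16/24 as the
 * comments here describe:
 *
 *    31       24 23       16 15        8 7         0
 *   +-----------+-----------+-----------+-----------+
 *   | rd count  |  curr wr  |  next wr  | tns marker|
 *   +-----------+-----------+-----------+-----------+
 */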
92
93
94/* Lock the word, spinning until there are no tns-ers. */
95static inline u32 get_rwlock(arch_rwlock_t *rwlock)
96{
97 u32 iterations = 0;
98 for (;;) {
99 u32 val = __insn_tns((int *)&rwlock->lock);
100 if (unlikely(val & 1)) {
101 delay_backoff(iterations++);
102 continue;
103 }
104 return val;
105 }
106}
107
108int arch_read_trylock_slow(arch_rwlock_t *rwlock)
109{
110 u32 val = get_rwlock(rwlock);
111 int locked = (val << RD_COUNT_WIDTH) == 0;
112 rwlock->lock = val + (locked << RD_COUNT_SHIFT);
113 return locked;
114}
115EXPORT_SYMBOL(arch_read_trylock_slow);
116
117void arch_read_unlock_slow(arch_rwlock_t *rwlock)
118{
119 u32 val = get_rwlock(rwlock);
120 rwlock->lock = val - (1 << RD_COUNT_SHIFT);
121}
122EXPORT_SYMBOL(arch_read_unlock_slow);
123
124void arch_write_unlock_slow(arch_rwlock_t *rwlock, u32 val)
125{
126 u32 eq, mask = 1 << WR_CURR_SHIFT;
127 while (unlikely(val & 1)) {
128 /* Limited backoff since we are the highest-priority task. */
129 relax(4);
130 val = __insn_tns((int *)&rwlock->lock);
131 }
132 val = __insn_addb(val, mask);
133 eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT));
134 val = __insn_mz(eq & mask, val);
135 rwlock->lock = val;
136}
137EXPORT_SYMBOL(arch_write_unlock_slow);
138
139/*
140 * We spin until everything but the reader bits (which are in the high
141 * part of the word) are zero, i.e. no active or waiting writers, no tns.
142 *
143 * ISSUE: This approach can permanently starve readers. A reader who sees
144 * a writer could instead take a ticket lock (just like a writer would),
145 * and atomically enter read mode (with 1 reader) when it gets the ticket.
146 * This way both readers and writers will always make forward progress
147 * in a finite time.
148 */
149void arch_read_lock_slow(arch_rwlock_t *rwlock, u32 val)
150{
151 u32 iterations = 0;
152 do {
153 if (!(val & 1))
154 rwlock->lock = val;
155 delay_backoff(iterations++);
156 val = __insn_tns((int *)&rwlock->lock);
157 } while ((val << RD_COUNT_WIDTH) != 0);
158 rwlock->lock = val + (1 << RD_COUNT_SHIFT);
159}
160EXPORT_SYMBOL(arch_read_lock_slow);
161
162void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)
163{
164 /*
165 * The trailing underscore on this variable (and curr_ below)
166 * reminds us that the high bits are garbage; we mask them out
167 * when we compare them.
168 */
169 u32 my_ticket_;
170
171 /* Take out the next ticket; this will also stop would-be readers. */
172 if (val & 1)
173 val = get_rwlock(rwlock);
174 rwlock->lock = __insn_addb(val, 1 << WR_NEXT_SHIFT);
175
176 /* Extract my ticket value from the original word. */
177 my_ticket_ = val >> WR_NEXT_SHIFT;
178
179 /*
180 * Wait until the "current" field matches our ticket, and
181 * there are no remaining readers.
182 */
183 for (;;) {
184 u32 curr_ = val >> WR_CURR_SHIFT;
185 u32 readers = val >> RD_COUNT_SHIFT;
186 u32 delta = ((my_ticket_ - curr_) & WR_MASK) + !!readers;
187 if (likely(delta == 0))
188 break;
189
190 /* Delay based on how many lock-holders are still out there. */
191 relax((256 / CYCLES_PER_RELAX_LOOP) * delta);
192
193 /*
194 * Get a non-tns value to check; we don't need to tns
195 * it ourselves. Since we're not tns'ing, we retry
196 * more rapidly to get a valid value.
197 */
198 while ((val = rwlock->lock) & 1)
199 relax(4);
200 }
201}
202EXPORT_SYMBOL(arch_write_lock_slow);
203
204int __tns_atomic_acquire(atomic_t *lock)
205{
206 int ret;
207 u32 iterations = 0;
208
209 BUG_ON(__insn_mfspr(SPR_INTERRUPT_CRITICAL_SECTION));
210 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
211
212 while ((ret = __insn_tns((void *)&lock->counter)) == 1)
213 delay_backoff(iterations++);
214 return ret;
215}
216
217void __tns_atomic_release(atomic_t *p, int v)
218{
219 p->counter = v;
220 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
221}
diff --git a/arch/tile/lib/spinlock_common.h b/arch/tile/lib/spinlock_common.h
new file mode 100644
index 000000000000..8dffebde6630
--- /dev/null
+++ b/arch/tile/lib/spinlock_common.h
@@ -0,0 +1,64 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 * This file is included into spinlock_32.c or _64.c.
14 */
15
16/*
17 * The mfspr in __spinlock_relax() is 5 or 6 cycles plus 2 for loop
18 * overhead.
19 */
20#ifdef __tilegx__
21#define CYCLES_PER_RELAX_LOOP 7
22#else
23#define CYCLES_PER_RELAX_LOOP 8
24#endif
25
26/*
27 * Idle the core for CYCLES_PER_RELAX_LOOP * iterations cycles.
28 */
29static inline void
30relax(int iterations)
31{
32 for (/*above*/; iterations > 0; iterations--)
33 __insn_mfspr(SPR_PASS);
34 barrier();
35}
36
37/* Perform bounded exponential backoff. */
38static void delay_backoff(int iterations)
39{
40 u32 exponent, loops;
41
42 /*
43 * 2^exponent is how many times we go around the loop,
44 * which takes 8 cycles. We want to start with a 16- to 31-cycle
45 * loop, so we need to go around minimum 2 = 2^1 times, so we
46 * bias the original value up by 1.
47 */
48 exponent = iterations + 1;
49
50 /*
 51	 * Don't allow exponent to exceed 8, so we have 256 loops,
 52	 * or 2,048 (to 4,095) cycles, as our maximum.
53 */
54 if (exponent > 8)
55 exponent = 8;
56
57 loops = 1 << exponent;
58
59 /* Add a randomness factor so two cpus never get in lock step. */
60 loops += __insn_crc32_32(stack_pointer, get_cycles_low()) &
61 (loops - 1);
62
 63	relax(loops);
64}
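To make the backoff arithmetic concrete: on the first failure iterations
== 0, so exponent == 1 and loops is 2 plus a random 0 or 1, i.e. roughly
16-24 cycles of relax(). Once iterations reaches 7, exponent saturates at
8, so loops is 256 plus a random 0-255, i.e. roughly 2,048-4,088 cycles
per retry.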
diff --git a/arch/tile/lib/strchr_32.c b/arch/tile/lib/strchr_32.c
new file mode 100644
index 000000000000..c94e6f7ae7b5
--- /dev/null
+++ b/arch/tile/lib/strchr_32.c
@@ -0,0 +1,66 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/types.h>
16#include <linux/string.h>
17#include <linux/module.h>
18
19#undef strchr
20
21char *strchr(const char *s, int c)
22{
23 int z, g;
24
25 /* Get an aligned pointer. */
26 const uintptr_t s_int = (uintptr_t) s;
27 const uint32_t *p = (const uint32_t *)(s_int & -4);
28
29 /* Create four copies of the byte for which we are looking. */
30 const uint32_t goal = 0x01010101 * (uint8_t) c;
31
32 /* Read the first aligned word, but force bytes before the string to
33 * match neither zero nor goal (we make sure the high bit of each
34 * byte is 1, and the low 7 bits are all the opposite of the goal
35 * byte).
36 *
37 * Note that this shift count expression works because we know shift
38 * counts are taken mod 32.
39 */
40 const uint32_t before_mask = (1 << (s_int << 3)) - 1;
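	/* Worked example: if s_int % 4 == 2 the shift count is 16 (mod 32),
	 * so before_mask == 0xffff and only the two bytes that precede the
	 * string are forced to match neither zero nor the goal.
	 */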
41 uint32_t v = (*p | before_mask) ^ (goal & __insn_shrib(before_mask, 1));
42
43 uint32_t zero_matches, goal_matches;
44 while (1) {
45 /* Look for a terminating '\0'. */
46 zero_matches = __insn_seqb(v, 0);
47
48 /* Look for the goal byte. */
49 goal_matches = __insn_seqb(v, goal);
50
51 if (__builtin_expect(zero_matches | goal_matches, 0))
52 break;
53
54 v = *++p;
55 }
56
57 z = __insn_ctz(zero_matches);
58 g = __insn_ctz(goal_matches);
59
60 /* If we found c before '\0' we got a match. Note that if c == '\0'
61 * then g == z, and we correctly return the address of the '\0'
62 * rather than NULL.
63 */
64 return (g <= z) ? ((char *)p) + (g >> 3) : NULL;
65}
66EXPORT_SYMBOL(strchr);
diff --git a/arch/tile/lib/strlen_32.c b/arch/tile/lib/strlen_32.c
new file mode 100644
index 000000000000..f26f88e11e4a
--- /dev/null
+++ b/arch/tile/lib/strlen_32.c
@@ -0,0 +1,36 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/types.h>
16#include <linux/string.h>
17#include <linux/module.h>
18
19size_t strlen(const char *s)
20{
21 /* Get an aligned pointer. */
22 const uintptr_t s_int = (uintptr_t) s;
23 const uint32_t *p = (const uint32_t *)(s_int & -4);
24
25 /* Read the first word, but force bytes before the string to be nonzero.
26 * This expression works because we know shift counts are taken mod 32.
27 */
28 uint32_t v = *p | ((1 << (s_int << 3)) - 1);
29
30 uint32_t bits;
31 while ((bits = __insn_seqb(v, 0)) == 0)
32 v = *++p;
33
34 return ((const char *)p) + (__insn_ctz(bits) >> 3) - s;
35}
36EXPORT_SYMBOL(strlen);
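A quick worked example (little-endian byte order, as elsewhere in this
patch): for a word-aligned s == "abc", the first load gives v ==
0x00636261, __insn_seqb(v, 0) has its lowest set bit at bit 24, and
__insn_ctz(bits) >> 3 == 3, so the function returns 3 without a second
load.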
diff --git a/arch/tile/lib/uaccess.c b/arch/tile/lib/uaccess.c
new file mode 100644
index 000000000000..9ae182568b77
--- /dev/null
+++ b/arch/tile/lib/uaccess.c
@@ -0,0 +1,31 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/uaccess.h>
16#include <linux/module.h>
17
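/*
 * Note the inverted sense: this returns 0 when the range is accessible
 * (below the user segment limit, or an arch-mappable range) and nonzero
 * otherwise.
 */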
18int __range_ok(unsigned long addr, unsigned long size)
19{
20 unsigned long limit = current_thread_info()->addr_limit.seg;
21 __chk_user_ptr(addr);
22 return !((addr < limit && size <= limit - addr) ||
23 is_arch_mappable_range(addr, size));
24}
25EXPORT_SYMBOL(__range_ok);
26
27void copy_from_user_overflow(void)
28{
29 WARN(1, "Buffer overflow detected!\n");
30}
31EXPORT_SYMBOL(copy_from_user_overflow);
diff --git a/arch/tile/lib/usercopy_32.S b/arch/tile/lib/usercopy_32.S
new file mode 100644
index 000000000000..979f76d83746
--- /dev/null
+++ b/arch/tile/lib/usercopy_32.S
@@ -0,0 +1,223 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/linkage.h>
16#include <asm/errno.h>
17#include <asm/cache.h>
18#include <arch/chip.h>
19
20/* Access user memory, but use MMU to avoid propagating kernel exceptions. */
21
22 .pushsection .fixup,"ax"
23
24get_user_fault:
25 { move r0, zero; move r1, zero }
26 { movei r2, -EFAULT; jrp lr }
27 ENDPROC(get_user_fault)
28
29put_user_fault:
30 { movei r0, -EFAULT; jrp lr }
31 ENDPROC(put_user_fault)
32
33 .popsection
34
35/*
36 * __get_user_N functions take a pointer in r0, and return 0 in r2
37 * on success, with the value in r0; or else -EFAULT in r2.
38 */
39#define __get_user_N(bytes, LOAD) \
40 STD_ENTRY(__get_user_##bytes); \
411: { LOAD r0, r0; move r1, zero; move r2, zero }; \
42 jrp lr; \
43 STD_ENDPROC(__get_user_##bytes); \
44 .pushsection __ex_table,"a"; \
45 .word 1b, get_user_fault; \
46 .popsection
47
48__get_user_N(1, lb_u)
49__get_user_N(2, lh_u)
50__get_user_N(4, lw)
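/*
 * For instance, __get_user_N(1, lb_u) above expands (roughly) to:
 *
 *   STD_ENTRY(__get_user_1)
 *   1: { lb_u r0, r0; move r1, zero; move r2, zero }
 *      jrp lr
 *   STD_ENDPROC(__get_user_1)
 *
 * plus a __ex_table entry that sends a fault at 1b to get_user_fault,
 * which substitutes -EFAULT in r2.
 */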
51
52/*
53 * __get_user_8 takes a pointer in r0, and returns 0 in r2
54 * on success, with the value in r0/r1; or else -EFAULT in r2.
55 */
56 STD_ENTRY(__get_user_8);
571: { lw r0, r0; addi r1, r0, 4 };
582: { lw r1, r1; move r2, zero };
59 jrp lr;
60 STD_ENDPROC(__get_user_8);
61 .pushsection __ex_table,"a";
62 .word 1b, get_user_fault;
63 .word 2b, get_user_fault;
64 .popsection
65
66/*
67 * __put_user_N functions take a value in r0 and a pointer in r1,
68 * and return 0 in r0 on success or -EFAULT on failure.
69 */
70#define __put_user_N(bytes, STORE) \
71 STD_ENTRY(__put_user_##bytes); \
721: { STORE r1, r0; move r0, zero }; \
73 jrp lr; \
74 STD_ENDPROC(__put_user_##bytes); \
75 .pushsection __ex_table,"a"; \
76 .word 1b, put_user_fault; \
77 .popsection
78
79__put_user_N(1, sb)
80__put_user_N(2, sh)
81__put_user_N(4, sw)
82
83/*
84 * __put_user_8 takes a value in r0/r1 and a pointer in r2,
85 * and returns 0 in r0 on success or -EFAULT on failure.
86 */
87STD_ENTRY(__put_user_8)
881: { sw r2, r0; addi r2, r2, 4 }
892: { sw r2, r1; move r0, zero }
90 jrp lr
91 STD_ENDPROC(__put_user_8)
92 .pushsection __ex_table,"a"
93 .word 1b, put_user_fault
94 .word 2b, put_user_fault
95 .popsection
96
97
98/*
99 * strnlen_user_asm takes the pointer in r0, and the length bound in r1.
100 * It returns the length, including the terminating NUL, or zero on exception.
101 * If length is greater than the bound, returns one plus the bound.
102 */
103STD_ENTRY(strnlen_user_asm)
104 { bz r1, 2f; addi r3, r0, -1 } /* bias down to include NUL */
1051: { lb_u r4, r0; addi r1, r1, -1 }
106 bz r4, 2f
107 { bnzt r1, 1b; addi r0, r0, 1 }
1082: { sub r0, r0, r3; jrp lr }
109 STD_ENDPROC(strnlen_user_asm)
110 .pushsection .fixup,"ax"
111strnlen_user_fault:
112 { move r0, zero; jrp lr }
113 ENDPROC(strnlen_user_fault)
114 .section __ex_table,"a"
115 .word 1b, strnlen_user_fault
116 .popsection
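/*
 * Worked examples: for the user string "abc" with bound 8, the loop stops
 * on the NUL and returns 4; with bound 2 it exhausts the bound first and
 * returns 3 (one plus the bound), as documented above.
 */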
117
118/*
119 * strncpy_from_user_asm takes the kernel target pointer in r0,
120 * the userspace source pointer in r1, and the length bound (including
121 * the trailing NUL) in r2. On success, it returns the string length
122 * (not including the trailing NUL), or -EFAULT on failure.
123 */
124STD_ENTRY(strncpy_from_user_asm)
125 { bz r2, 2f; move r3, r0 }
1261: { lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
127 { sb r0, r4; addi r0, r0, 1 }
128 bz r2, 2f
129 bnzt r4, 1b
130 addi r0, r0, -1 /* don't count the trailing NUL */
1312: { sub r0, r0, r3; jrp lr }
132 STD_ENDPROC(strncpy_from_user_asm)
133 .pushsection .fixup,"ax"
134strncpy_from_user_fault:
135 { movei r0, -EFAULT; jrp lr }
136 ENDPROC(strncpy_from_user_fault)
137 .section __ex_table,"a"
138 .word 1b, strncpy_from_user_fault
139 .popsection
140
141/*
142 * clear_user_asm takes the user target address in r0 and the
143 * number of bytes to zero in r1.
144 * It returns the number of uncopiable bytes (hopefully zero) in r0.
145 * Note that we don't use a separate .fixup section here since we fall
146 * through into the "fixup" code as the last straight-line bundle anyway.
147 */
148STD_ENTRY(clear_user_asm)
149 { bz r1, 2f; or r2, r0, r1 }
150 andi r2, r2, 3
151 bzt r2, .Lclear_aligned_user_asm
1521: { sb r0, zero; addi r0, r0, 1; addi r1, r1, -1 }
153 bnzt r1, 1b
1542: { move r0, r1; jrp lr }
155 .pushsection __ex_table,"a"
156 .word 1b, 2b
157 .popsection
158
159.Lclear_aligned_user_asm:
1601: { sw r0, zero; addi r0, r0, 4; addi r1, r1, -4 }
161 bnzt r1, 1b
1622: { move r0, r1; jrp lr }
163 STD_ENDPROC(clear_user_asm)
164 .pushsection __ex_table,"a"
165 .word 1b, 2b
166 .popsection
167
168/*
169 * flush_user_asm takes the user target address in r0 and the
170 * number of bytes to flush in r1.
171 * It returns the number of unflushable bytes (hopefully zero) in r0.
172 */
173STD_ENTRY(flush_user_asm)
174 bz r1, 2f
175 { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
176 { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
177 { and r0, r0, r2; and r1, r1, r2 }
178 { sub r1, r1, r0 }
1791: { flush r0; addi r1, r1, -CHIP_FLUSH_STRIDE() }
180 { addi r0, r0, CHIP_FLUSH_STRIDE(); bnzt r1, 1b }
1812: { move r0, r1; jrp lr }
182 STD_ENDPROC(flush_user_asm)
183 .pushsection __ex_table,"a"
184 .word 1b, 2b
185 .popsection
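/*
 * Worked example of the rounding above, assuming L2_CACHE_BYTES == 64:
 * r0 == 0x1005 and r1 == 3 become r0 == 0x1000 and r1 == 0x40, i.e. the
 * single full line covering the byte range. inv_user_asm and
 * finv_user_asm below use the same arithmetic with their own strides.
 */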
186
187/*
188 * inv_user_asm takes the user target address in r0 and the
189 * number of bytes to invalidate in r1.
190 * It returns the number of not inv'able bytes (hopefully zero) in r0.
191 */
192STD_ENTRY(inv_user_asm)
193 bz r1, 2f
194 { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
195 { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
196 { and r0, r0, r2; and r1, r1, r2 }
197 { sub r1, r1, r0 }
1981: { inv r0; addi r1, r1, -CHIP_INV_STRIDE() }
199 { addi r0, r0, CHIP_INV_STRIDE(); bnzt r1, 1b }
2002: { move r0, r1; jrp lr }
201 STD_ENDPROC(inv_user_asm)
202 .pushsection __ex_table,"a"
203 .word 1b, 2b
204 .popsection
205
206/*
207 * finv_user_asm takes the user target address in r0 and the
208 * number of bytes to flush-invalidate in r1.
209 * It returns the number of not finv'able bytes (hopefully zero) in r0.
210 */
211STD_ENTRY(finv_user_asm)
212 bz r1, 2f
213 { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
214 { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
215 { and r0, r0, r2; and r1, r1, r2 }
216 { sub r1, r1, r0 }
2171: { finv r0; addi r1, r1, -CHIP_FINV_STRIDE() }
218 { addi r0, r0, CHIP_FINV_STRIDE(); bnzt r1, 1b }
2192: { move r0, r1; jrp lr }
220 STD_ENDPROC(finv_user_asm)
221 .pushsection __ex_table,"a"
222 .word 1b, 2b
223 .popsection