author	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit	c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree	ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/tile/lib
parent	ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent	6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c
Diffstat (limited to 'arch/tile/lib')
 arch/tile/lib/Makefile                                            |   5
 arch/tile/lib/atomic_32.c                                         |  25
 arch/tile/lib/atomic_asm_32.S                                     |   4
 arch/tile/lib/cacheflush.c                                        | 120
 arch/tile/lib/delay.c                                             |  21
 arch/tile/lib/exports.c                                           |   7
 arch/tile/lib/mb_incoherent.S                                     |  34
 arch/tile/lib/memchr_32.c                                         |  35
 arch/tile/lib/memchr_64.c                                         |  71
 arch/tile/lib/memcpy_32.S                                         | 206
 arch/tile/lib/memcpy_64.c                                         | 220
 arch/tile/lib/memcpy_tile64.c                                     |  15
 arch/tile/lib/memcpy_user_64.c                                    |  86
 arch/tile/lib/memmove.c (renamed from arch/tile/lib/memmove_32.c) |   0
 arch/tile/lib/memset_32.c                                         |   1
 arch/tile/lib/memset_64.c                                         | 145
 arch/tile/lib/spinlock_32.c                                       | 190
 arch/tile/lib/spinlock_64.c                                       | 104
 arch/tile/lib/strchr_64.c                                         |  67
 arch/tile/lib/strlen_32.c                                         |   2
 arch/tile/lib/strlen_64.c                                         |  38
 arch/tile/lib/usercopy_64.S                                       | 196
 22 files changed, 1336 insertions(+), 256 deletions(-)
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile
index 746dc81ed3c4..0c26086ecbef 100644
--- a/arch/tile/lib/Makefile
+++ b/arch/tile/lib/Makefile
@@ -2,9 +2,8 @@
 # Makefile for TILE-specific library files..
 #
 
-lib-y = cacheflush.o checksum.o cpumask.o delay.o \
-	mb_incoherent.o uaccess.o \
-	memcpy_$(BITS).o memchr_$(BITS).o memmove_$(BITS).o memset_$(BITS).o \
+lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \
+	memmove.o memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \
 	strchr_$(BITS).o strlen_$(BITS).o
 
 ifeq ($(CONFIG_TILEGX),y)
diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c
index 8040b42a8eea..46570211df52 100644
--- a/arch/tile/lib/atomic_32.c
+++ b/arch/tile/lib/atomic_32.c
@@ -46,14 +46,13 @@ struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE]
 #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
 /* This page is remapped on startup to be hash-for-home. */
-int atomic_locks[PAGE_SIZE / sizeof(int) /* Only ATOMIC_HASH_SIZE is used */]
-  __attribute__((aligned(PAGE_SIZE), section(".bss.page_aligned")));
+int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss;
 
 #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
 static inline int *__atomic_hashed_lock(volatile void *v)
 {
-	/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec.S */
+	/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */
 #if ATOMIC_LOCKS_FOUND_VIA_TABLE()
 	unsigned long i =
 		(unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long));
@@ -203,32 +202,32 @@ static inline int *__futex_setup(int __user *v)
 	return __atomic_hashed_lock((int __force *)v);
 }
 
-struct __get_user futex_set(int __user *v, int i)
+struct __get_user futex_set(u32 __user *v, int i)
 {
 	return __atomic_xchg((int __force *)v, __futex_setup(v), i);
 }
 
-struct __get_user futex_add(int __user *v, int n)
+struct __get_user futex_add(u32 __user *v, int n)
 {
 	return __atomic_xchg_add((int __force *)v, __futex_setup(v), n);
 }
 
-struct __get_user futex_or(int __user *v, int n)
+struct __get_user futex_or(u32 __user *v, int n)
 {
 	return __atomic_or((int __force *)v, __futex_setup(v), n);
 }
 
-struct __get_user futex_andn(int __user *v, int n)
+struct __get_user futex_andn(u32 __user *v, int n)
 {
 	return __atomic_andn((int __force *)v, __futex_setup(v), n);
 }
 
-struct __get_user futex_xor(int __user *v, int n)
+struct __get_user futex_xor(u32 __user *v, int n)
 {
 	return __atomic_xor((int __force *)v, __futex_setup(v), n);
 }
 
-struct __get_user futex_cmpxchg(int __user *v, int o, int n)
+struct __get_user futex_cmpxchg(u32 __user *v, int o, int n)
 {
 	return __atomic_cmpxchg((int __force *)v, __futex_setup(v), o, n);
 }
@@ -300,7 +299,7 @@ void __init __init_atomic_per_cpu(void)
 #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
 	/* Validate power-of-two and "bigger than cpus" assumption */
-	BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1));
+	BUILD_BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1));
 	BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids);
 
 	/*
@@ -314,17 +313,17 @@ void __init __init_atomic_per_cpu(void)
 	BUG_ON((unsigned long)atomic_locks % PAGE_SIZE != 0);
 
 	/* The locks must all fit on one page. */
-	BUG_ON(ATOMIC_HASH_SIZE * sizeof(int) > PAGE_SIZE);
+	BUILD_BUG_ON(ATOMIC_HASH_SIZE * sizeof(int) > PAGE_SIZE);
 
 	/*
 	 * We use the page offset of the atomic value's address as
 	 * an index into atomic_locks, excluding the low 3 bits.
 	 * That should not produce more indices than ATOMIC_HASH_SIZE.
 	 */
-	BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE);
+	BUILD_BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE);
 
 #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
 	/* The futex code makes this assumption, so we validate it here. */
-	BUG_ON(sizeof(atomic_t) != sizeof(int));
+	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));
 }
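
A note on the BUG_ON() to BUILD_BUG_ON() conversions in the hunks above: assertions whose operands are compile-time constants (ATOMIC_HASH_SIZE, PAGE_SIZE, sizeof(atomic_t)) now fail the build instead of failing at boot, while the comparison against nr_cpu_ids must remain a runtime BUG_ON() because that value is only known once the kernel is up. A minimal sketch of the distinction, standalone and not part of the patch:

	/* Compile-time: the build is rejected if the condition can be true. */
	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));

	/* Runtime: nr_cpu_ids is a boot-time variable, so check it at boot. */
	BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids);
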
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
index 5a5514b77e78..24448734f6f1 100644
--- a/arch/tile/lib/atomic_asm_32.S
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -14,7 +14,7 @@
  * Support routines for atomic operations.  Each function takes:
  *
  * r0: address to manipulate
- * r1: pointer to atomic lock guarding this operation (for FUTEX_LOCK_REG)
+ * r1: pointer to atomic lock guarding this operation (for ATOMIC_LOCK_REG)
  * r2: new value to write, or for cmpxchg/add_unless, value to compare against
  * r3: (cmpxchg/xchg_add_unless) new value to write or add;
  *     (atomic64 ops) high word of value to write
@@ -59,7 +59,7 @@
  * bad kernel addresses).
  *
  * Note that if the value we would store is the same as what we
- * loaded, we bypass the load.  Other platforms with true atomics can
+ * loaded, we bypass the store.  Other platforms with true atomics can
  * make the guarantee that a non-atomic __clear_bit(), for example,
  * can safely race with an atomic test_and_set_bit(); this example is
  * from bit_spinlock.h in slub_lock() / slub_unlock().  We can't do
diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c
index 11b6164c2097..8928aace7a64 100644
--- a/arch/tile/lib/cacheflush.c
+++ b/arch/tile/lib/cacheflush.c
@@ -15,9 +15,129 @@
15#include <asm/page.h> 15#include <asm/page.h>
16#include <asm/cacheflush.h> 16#include <asm/cacheflush.h>
17#include <arch/icache.h> 17#include <arch/icache.h>
18#include <arch/spr_def.h>
18 19
19 20
20void __flush_icache_range(unsigned long start, unsigned long end) 21void __flush_icache_range(unsigned long start, unsigned long end)
21{ 22{
22 invalidate_icache((const void *)start, end - start, PAGE_SIZE); 23 invalidate_icache((const void *)start, end - start, PAGE_SIZE);
23} 24}
25
26
27/* Force a load instruction to issue. */
28static inline void force_load(char *p)
29{
30 *(volatile char *)p;
31}
32
33/*
34 * Flush and invalidate a VA range that is homed remotely on a single
35 * core (if "!hfh") or homed via hash-for-home (if "hfh"), waiting
36 * until the memory controller holds the flushed values.
37 */
38void finv_buffer_remote(void *buffer, size_t size, int hfh)
39{
40 char *p, *base;
41 size_t step_size, load_count;
42 const unsigned long STRIPE_WIDTH = 8192;
43#ifdef __tilegx__
44 /*
45 * On TILE-Gx, we must disable the dstream prefetcher before doing
46 * a cache flush; otherwise, we could end up with data in the cache
47 * that we don't want there. Note that normally we'd do an mf
48 * after the SPR write to disabling the prefetcher, but we do one
49 * below, before any further loads, so there's no need to do it
50 * here.
51 */
52 uint_reg_t old_dstream_pf = __insn_mfspr(SPR_DSTREAM_PF);
53 __insn_mtspr(SPR_DSTREAM_PF, 0);
54#endif
55
56 /*
57 * Flush and invalidate the buffer out of the local L1/L2
58 * and request the home cache to flush and invalidate as well.
59 */
60 __finv_buffer(buffer, size);
61
62 /*
63 * Wait for the home cache to acknowledge that it has processed
64 * all the flush-and-invalidate requests. This does not mean
65 * that the flushed data has reached the memory controller yet,
66 * but it does mean the home cache is processing the flushes.
67 */
68 __insn_mf();
69
70 /*
71 * Issue a load to the last cache line, which can't complete
72 * until all the previously-issued flushes to the same memory
73 * controller have also completed. If we weren't striping
74 * memory, that one load would be sufficient, but since we may
75 * be, we also need to back up to the last load issued to
76 * another memory controller, which would be the point where
77 * we crossed an 8KB boundary (the granularity of striping
78 * across memory controllers). Keep backing up and doing this
79 * until we are before the beginning of the buffer, or have
80 * hit all the controllers.
81 *
82 * If we are flushing a hash-for-home buffer, it's even worse.
83 * Each line may be homed on a different tile, and each tile
84 * may have up to four lines that are on different
85 * controllers. So as we walk backwards, we have to touch
86 * enough cache lines to satisfy these constraints. In
87 * practice this ends up being close enough to "load from
88 * every cache line on a full memory stripe on each
89 * controller" that we simply do that, to simplify the logic.
90 *
91 * FIXME: See bug 9535 for some issues with this code.
92 */
93 if (hfh) {
94 step_size = L2_CACHE_BYTES;
95 load_count = (STRIPE_WIDTH / L2_CACHE_BYTES) *
96 (1 << CHIP_LOG_NUM_MSHIMS());
97 } else {
98 step_size = STRIPE_WIDTH;
99 load_count = (1 << CHIP_LOG_NUM_MSHIMS());
100 }
101
102 /* Load the last byte of the buffer. */
103 p = (char *)buffer + size - 1;
104 force_load(p);
105
106 /* Bump down to the end of the previous stripe or cache line. */
107 p -= step_size;
108 p = (char *)((unsigned long)p | (step_size - 1));
109
110 /* Figure out how far back we need to go. */
111 base = p - (step_size * (load_count - 2));
112 if ((long)base < (long)buffer)
113 base = buffer;
114
115 /*
116 * Fire all the loads we need. The MAF only has eight entries
117 * so we can have at most eight outstanding loads, so we
118 * unroll by that amount.
119 */
120#pragma unroll 8
121 for (; p >= base; p -= step_size)
122 force_load(p);
123
124 /*
125 * Repeat, but with inv's instead of loads, to get rid of the
126 * data we just loaded into our own cache and the old home L3.
127 * No need to unroll since inv's don't target a register.
128 */
129 p = (char *)buffer + size - 1;
130 __insn_inv(p);
131 p -= step_size;
132 p = (char *)((unsigned long)p | (step_size - 1));
133 for (; p >= base; p -= step_size)
134 __insn_inv(p);
135
136 /* Wait for the load+inv's (and thus finvs) to have completed. */
137 __insn_mf();
138
139#ifdef __tilegx__
140 /* Reenable the prefetcher. */
141 __insn_mtspr(SPR_DSTREAM_PF, old_dstream_pf);
142#endif
143}
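
To make the hfh/non-hfh bookkeeping above concrete: assuming, purely for illustration, a 64-byte L2 line and four memory controllers (CHIP_LOG_NUM_MSHIMS() == 2), a hash-for-home flush uses step_size = 64 and load_count = (8192 / 64) * 4 = 512, i.e. one load per cache line across a full stripe on every controller, while a single-homed flush only needs step_size = 8192 and load_count = 4, one load per controller stripe. The real values come from the chip headers; these numbers are only an example.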
diff --git a/arch/tile/lib/delay.c b/arch/tile/lib/delay.c
index 5801b03c13ef..cdacdd11d360 100644
--- a/arch/tile/lib/delay.c
+++ b/arch/tile/lib/delay.c
@@ -15,20 +15,31 @@
 #include <linux/module.h>
 #include <linux/delay.h>
 #include <linux/thread_info.h>
-#include <asm/fixmap.h>
-#include <hv/hypervisor.h>
+#include <asm/timex.h>
 
 void __udelay(unsigned long usecs)
 {
-	hv_nanosleep(usecs * 1000);
+	if (usecs > ULONG_MAX / 1000) {
+		WARN_ON_ONCE(usecs > ULONG_MAX / 1000);
+		usecs = ULONG_MAX / 1000;
+	}
+	__ndelay(usecs * 1000);
 }
 EXPORT_SYMBOL(__udelay);
 
 void __ndelay(unsigned long nsecs)
 {
-	hv_nanosleep(nsecs);
+	cycles_t target = get_cycles();
+	target += ns2cycles(nsecs);
+	while (get_cycles() < target)
+		cpu_relax();
 }
 EXPORT_SYMBOL(__ndelay);
 
-/* FIXME: should be declared in a header somewhere. */
+void __delay(unsigned long cycles)
+{
+	cycles_t target = get_cycles() + cycles;
+	while (get_cycles() < target)
+		cpu_relax();
+}
 EXPORT_SYMBOL(__delay);
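
The clamp added to __udelay() protects the usecs * 1000 conversion: with a 32-bit unsigned long, ULONG_MAX / 1000 is roughly 4.29 million microseconds, so a request longer than about 4.3 seconds would otherwise overflow and busy-wait for far less time than asked. Delays of that magnitude belong in msleep() or similar anyway, hence the WARN_ON_ONCE() plus clamp rather than any attempt to honor the request.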
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
index ce5dbf56578f..49284fae9d09 100644
--- a/arch/tile/lib/exports.c
+++ b/arch/tile/lib/exports.c
@@ -29,6 +29,9 @@ EXPORT_SYMBOL(__put_user_8);
 EXPORT_SYMBOL(strnlen_user_asm);
 EXPORT_SYMBOL(strncpy_from_user_asm);
 EXPORT_SYMBOL(clear_user_asm);
+EXPORT_SYMBOL(flush_user_asm);
+EXPORT_SYMBOL(inv_user_asm);
+EXPORT_SYMBOL(finv_user_asm);
 
 /* arch/tile/kernel/entry.S */
 #include <linux/kernel.h>
@@ -82,4 +85,8 @@ int64_t __muldi3(int64_t, int64_t);
 EXPORT_SYMBOL(__muldi3);
 uint64_t __lshrdi3(uint64_t, unsigned int);
 EXPORT_SYMBOL(__lshrdi3);
+uint64_t __ashrdi3(uint64_t, unsigned int);
+EXPORT_SYMBOL(__ashrdi3);
+uint64_t __ashldi3(uint64_t, unsigned int);
+EXPORT_SYMBOL(__ashldi3);
 #endif
diff --git a/arch/tile/lib/mb_incoherent.S b/arch/tile/lib/mb_incoherent.S
deleted file mode 100644
index 989ad7b68d5a..000000000000
--- a/arch/tile/lib/mb_incoherent.S
+++ /dev/null
@@ -1,34 +0,0 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 *
14 * Assembly code for invoking the HV's fence_incoherent syscall.
15 */
16
17#include <linux/linkage.h>
18#include <hv/syscall_public.h>
19#include <arch/abi.h>
20#include <arch/chip.h>
21
22#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS()
23
24/*
25 * Invoke the hypervisor's fence_incoherent syscall, which guarantees
26 * that all victims for cachelines homed on this tile have reached memory.
27 */
28STD_ENTRY(__mb_incoherent)
29 moveli TREG_SYSCALL_NR_NAME, HV_SYS_fence_incoherent
30 swint2
31 jrp lr
32 STD_ENDPROC(__mb_incoherent)
33
34#endif
diff --git a/arch/tile/lib/memchr_32.c b/arch/tile/lib/memchr_32.c
index 6235283b4859..cc3d9badf030 100644
--- a/arch/tile/lib/memchr_32.c
+++ b/arch/tile/lib/memchr_32.c
@@ -18,12 +18,24 @@
 
 void *memchr(const void *s, int c, size_t n)
 {
+	const uint32_t *last_word_ptr;
+	const uint32_t *p;
+	const char *last_byte_ptr;
+	uintptr_t s_int;
+	uint32_t goal, before_mask, v, bits;
+	char *ret;
+
+	if (__builtin_expect(n == 0, 0)) {
+		/* Don't dereference any memory if the array is empty. */
+		return NULL;
+	}
+
 	/* Get an aligned pointer. */
-	const uintptr_t s_int = (uintptr_t) s;
-	const uint32_t *p = (const uint32_t *)(s_int & -4);
+	s_int = (uintptr_t) s;
+	p = (const uint32_t *)(s_int & -4);
 
 	/* Create four copies of the byte for which we are looking. */
-	const uint32_t goal = 0x01010101 * (uint8_t) c;
+	goal = 0x01010101 * (uint8_t) c;
 
 	/* Read the first word, but munge it so that bytes before the array
 	 * will not match goal.
@@ -31,23 +43,14 @@ void *memchr(const void *s, int c, size_t n)
 	 * Note that this shift count expression works because we know
 	 * shift counts are taken mod 32.
 	 */
-	const uint32_t before_mask = (1 << (s_int << 3)) - 1;
-	uint32_t v = (*p | before_mask) ^ (goal & before_mask);
+	before_mask = (1 << (s_int << 3)) - 1;
+	v = (*p | before_mask) ^ (goal & before_mask);
 
 	/* Compute the address of the last byte. */
-	const char *const last_byte_ptr = (const char *)s + n - 1;
+	last_byte_ptr = (const char *)s + n - 1;
 
 	/* Compute the address of the word containing the last byte. */
-	const uint32_t *const last_word_ptr =
-		(const uint32_t *)((uintptr_t) last_byte_ptr & -4);
-
-	uint32_t bits;
-	char *ret;
-
-	if (__builtin_expect(n == 0, 0)) {
-		/* Don't dereference any memory if the array is empty. */
-		return NULL;
-	}
+	last_word_ptr = (const uint32_t *)((uintptr_t) last_byte_ptr & -4);
 
 	while ((bits = __insn_seqb(v, goal)) == 0) {
 		if (__builtin_expect(p == last_word_ptr, 0)) {
diff --git a/arch/tile/lib/memchr_64.c b/arch/tile/lib/memchr_64.c
new file mode 100644
index 000000000000..84fdc8d8e735
--- /dev/null
+++ b/arch/tile/lib/memchr_64.c
@@ -0,0 +1,71 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/types.h>
16#include <linux/string.h>
17#include <linux/module.h>
18
19void *memchr(const void *s, int c, size_t n)
20{
21 const uint64_t *last_word_ptr;
22 const uint64_t *p;
23 const char *last_byte_ptr;
24 uintptr_t s_int;
25 uint64_t goal, before_mask, v, bits;
26 char *ret;
27
28 if (__builtin_expect(n == 0, 0)) {
29 /* Don't dereference any memory if the array is empty. */
30 return NULL;
31 }
32
33 /* Get an aligned pointer. */
34 s_int = (uintptr_t) s;
35 p = (const uint64_t *)(s_int & -8);
36
37 /* Create eight copies of the byte for which we are looking. */
38 goal = 0x0101010101010101ULL * (uint8_t) c;
39
40 /* Read the first word, but munge it so that bytes before the array
41 * will not match goal.
42 *
43 * Note that this shift count expression works because we know
44 * shift counts are taken mod 64.
45 */
46 before_mask = (1ULL << (s_int << 3)) - 1;
47 v = (*p | before_mask) ^ (goal & before_mask);
48
49 /* Compute the address of the last byte. */
50 last_byte_ptr = (const char *)s + n - 1;
51
52 /* Compute the address of the word containing the last byte. */
53 last_word_ptr = (const uint64_t *)((uintptr_t) last_byte_ptr & -8);
54
55 while ((bits = __insn_v1cmpeq(v, goal)) == 0) {
56 if (__builtin_expect(p == last_word_ptr, 0)) {
57 /* We already read the last word in the array,
58 * so give up.
59 */
60 return NULL;
61 }
62 v = *++p;
63 }
64
65 /* We found a match, but it might be in a byte past the end
66 * of the array.
67 */
68 ret = ((char *)p) + (__insn_ctz(bits) >> 3);
69 return (ret <= last_byte_ptr) ? ret : NULL;
70}
71EXPORT_SYMBOL(memchr);
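
Both memchr implementations above rely on the same word-at-a-time idea: replicate the target byte across a word (0x01010101... * c), compare all byte lanes at once, and mask off any bytes that sit before the start of the buffer. The Tile intrinsics (__insn_seqb, __insn_v1cmpeq, __insn_ctz) do the lane compare and bit scan in hardware; a rough portable sketch of just the comparison step, using the classic "has zero byte" trick and nothing Tile-specific, looks like this:

	#include <stdint.h>

	/* Nonzero iff some byte of x equals the byte replicated through goal. */
	static inline uint64_t any_byte_matches(uint64_t x, uint64_t goal)
	{
		uint64_t t = x ^ goal;		/* matching bytes become 0x00 */
		return (t - 0x0101010101010101ULL) & ~t & 0x8080808080808080ULL;
	}

On a little-endian machine the position of the first match can then be recovered with a count-trailing-zeros divided by 8, which is what the __insn_ctz(bits) >> 3 expression in the code above does with the hardware compare result.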
diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S
index 30c3b7ebb55d..2a419a6122db 100644
--- a/arch/tile/lib/memcpy_32.S
+++ b/arch/tile/lib/memcpy_32.S
@@ -10,14 +10,16 @@
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or 10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for 11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details. 12 * more details.
13 *
14 * This file shares the implementation of the userspace memcpy and
15 * the kernel's memcpy, copy_to_user and copy_from_user.
16 */ 13 */
17 14
18#include <arch/chip.h> 15#include <arch/chip.h>
19 16
20 17
18/*
19 * This file shares the implementation of the userspace memcpy and
20 * the kernel's memcpy, copy_to_user and copy_from_user.
21 */
22
21#include <linux/linkage.h> 23#include <linux/linkage.h>
22 24
23/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */ 25/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */
@@ -53,9 +55,9 @@
53 */ 55 */
54ENTRY(__copy_from_user_inatomic) 56ENTRY(__copy_from_user_inatomic)
55.type __copy_from_user_inatomic, @function 57.type __copy_from_user_inatomic, @function
56 FEEDBACK_ENTER_EXPLICIT(__copy_from_user_inatomic, \ 58 FEEDBACK_ENTER_EXPLICIT(__copy_from_user_inatomic, \
57 .text.memcpy_common, \ 59 .text.memcpy_common, \
58 .Lend_memcpy_common - __copy_from_user_inatomic) 60 .Lend_memcpy_common - __copy_from_user_inatomic)
59 { movei r29, IS_COPY_FROM_USER; j memcpy_common } 61 { movei r29, IS_COPY_FROM_USER; j memcpy_common }
60 .size __copy_from_user_inatomic, . - __copy_from_user_inatomic 62 .size __copy_from_user_inatomic, . - __copy_from_user_inatomic
61 63
@@ -64,7 +66,7 @@ ENTRY(__copy_from_user_inatomic)
64 */ 66 */
65ENTRY(__copy_from_user_zeroing) 67ENTRY(__copy_from_user_zeroing)
66.type __copy_from_user_zeroing, @function 68.type __copy_from_user_zeroing, @function
67 FEEDBACK_REENTER(__copy_from_user_inatomic) 69 FEEDBACK_REENTER(__copy_from_user_inatomic)
68 { movei r29, IS_COPY_FROM_USER_ZEROING; j memcpy_common } 70 { movei r29, IS_COPY_FROM_USER_ZEROING; j memcpy_common }
69 .size __copy_from_user_zeroing, . - __copy_from_user_zeroing 71 .size __copy_from_user_zeroing, . - __copy_from_user_zeroing
70 72
@@ -74,13 +76,13 @@ ENTRY(__copy_from_user_zeroing)
74 */ 76 */
75ENTRY(__copy_to_user_inatomic) 77ENTRY(__copy_to_user_inatomic)
76.type __copy_to_user_inatomic, @function 78.type __copy_to_user_inatomic, @function
77 FEEDBACK_REENTER(__copy_from_user_inatomic) 79 FEEDBACK_REENTER(__copy_from_user_inatomic)
78 { movei r29, IS_COPY_TO_USER; j memcpy_common } 80 { movei r29, IS_COPY_TO_USER; j memcpy_common }
79 .size __copy_to_user_inatomic, . - __copy_to_user_inatomic 81 .size __copy_to_user_inatomic, . - __copy_to_user_inatomic
80 82
81ENTRY(memcpy) 83ENTRY(memcpy)
82.type memcpy, @function 84.type memcpy, @function
83 FEEDBACK_REENTER(__copy_from_user_inatomic) 85 FEEDBACK_REENTER(__copy_from_user_inatomic)
84 { movei r29, IS_MEMCPY } 86 { movei r29, IS_MEMCPY }
85 .size memcpy, . - memcpy 87 .size memcpy, . - memcpy
86 /* Fall through */ 88 /* Fall through */
@@ -157,35 +159,35 @@ EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
157 { addi r3, r1, 60; andi r9, r9, -64 } 159 { addi r3, r1, 60; andi r9, r9, -64 }
158 160
159#if CHIP_HAS_WH64() 161#if CHIP_HAS_WH64()
160 /* No need to prefetch dst, we'll just do the wh64 162 /* No need to prefetch dst, we'll just do the wh64
161 * right before we copy a line. 163 * right before we copy a line.
162 */ 164 */
163#endif 165#endif
164 166
165EX: { lw r5, r3; addi r3, r3, 64; movei r4, 1 } 167EX: { lw r5, r3; addi r3, r3, 64; movei r4, 1 }
166 /* Intentionally stall for a few cycles to leave L2 cache alone. */ 168 /* Intentionally stall for a few cycles to leave L2 cache alone. */
167 { bnzt zero, .; move r27, lr } 169 { bnzt zero, .; move r27, lr }
168EX: { lw r6, r3; addi r3, r3, 64 } 170EX: { lw r6, r3; addi r3, r3, 64 }
169 /* Intentionally stall for a few cycles to leave L2 cache alone. */ 171 /* Intentionally stall for a few cycles to leave L2 cache alone. */
170 { bnzt zero, . } 172 { bnzt zero, . }
171EX: { lw r7, r3; addi r3, r3, 64 } 173EX: { lw r7, r3; addi r3, r3, 64 }
172#if !CHIP_HAS_WH64() 174#if !CHIP_HAS_WH64()
173 /* Prefetch the dest */ 175 /* Prefetch the dest */
174 /* Intentionally stall for a few cycles to leave L2 cache alone. */ 176 /* Intentionally stall for a few cycles to leave L2 cache alone. */
175 { bnzt zero, . } 177 { bnzt zero, . }
176 /* Use a real load to cause a TLB miss if necessary. We aren't using 178 /* Use a real load to cause a TLB miss if necessary. We aren't using
177 * r28, so this should be fine. 179 * r28, so this should be fine.
178 */ 180 */
179EX: { lw r28, r9; addi r9, r9, 64 } 181EX: { lw r28, r9; addi r9, r9, 64 }
180 /* Intentionally stall for a few cycles to leave L2 cache alone. */ 182 /* Intentionally stall for a few cycles to leave L2 cache alone. */
181 { bnzt zero, . } 183 { bnzt zero, . }
182 { prefetch r9; addi r9, r9, 64 } 184 { prefetch r9; addi r9, r9, 64 }
183 /* Intentionally stall for a few cycles to leave L2 cache alone. */ 185 /* Intentionally stall for a few cycles to leave L2 cache alone. */
184 { bnzt zero, . } 186 { bnzt zero, . }
185 { prefetch r9; addi r9, r9, 64 } 187 { prefetch r9; addi r9, r9, 64 }
186#endif 188#endif
187 /* Intentionally stall for a few cycles to leave L2 cache alone. */ 189 /* Intentionally stall for a few cycles to leave L2 cache alone. */
188 { bz zero, .Lbig_loop2 } 190 { bz zero, .Lbig_loop2 }
189 191
190 /* On entry to this loop: 192 /* On entry to this loop:
191 * - r0 points to the start of dst line 0 193 * - r0 points to the start of dst line 0
@@ -197,7 +199,7 @@ EX: { lw r28, r9; addi r9, r9, 64 }
197 * to some "safe" recently loaded address. 199 * to some "safe" recently loaded address.
198 * - r5 contains *(r1 + 60) [i.e. last word of source line 0] 200 * - r5 contains *(r1 + 60) [i.e. last word of source line 0]
199 * - r6 contains *(r1 + 64 + 60) [i.e. last word of source line 1] 201 * - r6 contains *(r1 + 64 + 60) [i.e. last word of source line 1]
200 * - r9 contains ((r0 + 63) & -64) 202 * - r9 contains ((r0 + 63) & -64)
201 * [start of next dst cache line.] 203 * [start of next dst cache line.]
202 */ 204 */
203 205
@@ -208,137 +210,137 @@ EX: { lw r28, r9; addi r9, r9, 64 }
208 /* Copy line 0, first stalling until r5 is ready. */ 210 /* Copy line 0, first stalling until r5 is ready. */
209EX: { move r12, r5; lw r16, r1 } 211EX: { move r12, r5; lw r16, r1 }
210 { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 } 212 { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
211 /* Prefetch several lines ahead. */ 213 /* Prefetch several lines ahead. */
212EX: { lw r5, r3; addi r3, r3, 64 } 214EX: { lw r5, r3; addi r3, r3, 64 }
213 { jal .Lcopy_line } 215 { jal .Lcopy_line }
214 216
215 /* Copy line 1, first stalling until r6 is ready. */ 217 /* Copy line 1, first stalling until r6 is ready. */
216EX: { move r12, r6; lw r16, r1 } 218EX: { move r12, r6; lw r16, r1 }
217 { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 } 219 { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
218 /* Prefetch several lines ahead. */ 220 /* Prefetch several lines ahead. */
219EX: { lw r6, r3; addi r3, r3, 64 } 221EX: { lw r6, r3; addi r3, r3, 64 }
220 { jal .Lcopy_line } 222 { jal .Lcopy_line }
221 223
222 /* Copy line 2, first stalling until r7 is ready. */ 224 /* Copy line 2, first stalling until r7 is ready. */
223EX: { move r12, r7; lw r16, r1 } 225EX: { move r12, r7; lw r16, r1 }
224 { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 } 226 { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
225 /* Prefetch several lines ahead. */ 227 /* Prefetch several lines ahead. */
226EX: { lw r7, r3; addi r3, r3, 64 } 228EX: { lw r7, r3; addi r3, r3, 64 }
227 /* Use up a caches-busy cycle by jumping back to the top of the 229 /* Use up a caches-busy cycle by jumping back to the top of the
228 * loop. Might as well get it out of the way now. 230 * loop. Might as well get it out of the way now.
229 */ 231 */
230 { j .Lbig_loop } 232 { j .Lbig_loop }
231 233
232 234
233 /* On entry: 235 /* On entry:
234 * - r0 points to the destination line. 236 * - r0 points to the destination line.
235 * - r1 points to the source line. 237 * - r1 points to the source line.
236 * - r3 is the next prefetch address. 238 * - r3 is the next prefetch address.
237 * - r9 holds the last address used for wh64. 239 * - r9 holds the last address used for wh64.
238 * - r12 = WORD_15 240 * - r12 = WORD_15
239 * - r16 = WORD_0. 241 * - r16 = WORD_0.
240 * - r17 == r1 + 16. 242 * - r17 == r1 + 16.
241 * - r27 holds saved lr to restore. 243 * - r27 holds saved lr to restore.
242 * 244 *
243 * On exit: 245 * On exit:
244 * - r0 is incremented by 64. 246 * - r0 is incremented by 64.
245 * - r1 is incremented by 64, unless that would point to a word 247 * - r1 is incremented by 64, unless that would point to a word
246 * beyond the end of the source array, in which case it is redirected 248 * beyond the end of the source array, in which case it is redirected
247 * to point to an arbitrary word already in the cache. 249 * to point to an arbitrary word already in the cache.
248 * - r2 is decremented by 64. 250 * - r2 is decremented by 64.
249 * - r3 is unchanged, unless it points to a word beyond the 251 * - r3 is unchanged, unless it points to a word beyond the
250 * end of the source array, in which case it is redirected 252 * end of the source array, in which case it is redirected
251 * to point to an arbitrary word already in the cache. 253 * to point to an arbitrary word already in the cache.
252 * Redirecting is OK since if we are that close to the end 254 * Redirecting is OK since if we are that close to the end
253 * of the array we will not come back to this subroutine 255 * of the array we will not come back to this subroutine
254 * and use the contents of the prefetched address. 256 * and use the contents of the prefetched address.
255 * - r4 is nonzero iff r2 >= 64. 257 * - r4 is nonzero iff r2 >= 64.
256 * - r9 is incremented by 64, unless it points beyond the 258 * - r9 is incremented by 64, unless it points beyond the
257 * end of the last full destination cache line, in which 259 * end of the last full destination cache line, in which
258 * case it is redirected to a "safe address" that can be 260 * case it is redirected to a "safe address" that can be
259 * clobbered (sp - 64) 261 * clobbered (sp - 64)
260 * - lr contains the value in r27. 262 * - lr contains the value in r27.
261 */ 263 */
262 264
263/* r26 unused */ 265/* r26 unused */
264 266
265.Lcopy_line: 267.Lcopy_line:
266 /* TODO: when r3 goes past the end, we would like to redirect it 268 /* TODO: when r3 goes past the end, we would like to redirect it
267 * to prefetch the last partial cache line (if any) just once, for the 269 * to prefetch the last partial cache line (if any) just once, for the
268 * benefit of the final cleanup loop. But we don't want to 270 * benefit of the final cleanup loop. But we don't want to
269 * prefetch that line more than once, or subsequent prefetches 271 * prefetch that line more than once, or subsequent prefetches
270 * will go into the RTF. But then .Lbig_loop should unconditionally 272 * will go into the RTF. But then .Lbig_loop should unconditionally
271 * branch to top of loop to execute final prefetch, and its 273 * branch to top of loop to execute final prefetch, and its
272 * nop should become a conditional branch. 274 * nop should become a conditional branch.
273 */ 275 */
274 276
275 /* We need two non-memory cycles here to cover the resources 277 /* We need two non-memory cycles here to cover the resources
276 * used by the loads initiated by the caller. 278 * used by the loads initiated by the caller.
277 */ 279 */
278 { add r15, r1, r2 } 280 { add r15, r1, r2 }
279.Lcopy_line2: 281.Lcopy_line2:
280 { slt_u r13, r3, r15; addi r17, r1, 16 } 282 { slt_u r13, r3, r15; addi r17, r1, 16 }
281 283
282 /* NOTE: this will stall for one cycle as L1 is busy. */ 284 /* NOTE: this will stall for one cycle as L1 is busy. */
283 285
284 /* Fill second L1D line. */ 286 /* Fill second L1D line. */
285EX: { lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */ 287EX: { lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */
286 288
287#if CHIP_HAS_WH64() 289#if CHIP_HAS_WH64()
288 /* Prepare destination line for writing. */ 290 /* Prepare destination line for writing. */
289EX: { wh64 r9; addi r9, r9, 64 } 291EX: { wh64 r9; addi r9, r9, 64 }
290#else 292#else
291 /* Prefetch dest line */ 293 /* Prefetch dest line */
292 { prefetch r9; addi r9, r9, 64 } 294 { prefetch r9; addi r9, r9, 64 }
293#endif 295#endif
294 /* Load seven words that are L1D hits to cover wh64 L2 usage. */ 296 /* Load seven words that are L1D hits to cover wh64 L2 usage. */
295 297
296 /* Load the three remaining words from the last L1D line, which 298 /* Load the three remaining words from the last L1D line, which
297 * we know has already filled the L1D. 299 * we know has already filled the L1D.
298 */ 300 */
299EX: { lw r4, r1; addi r1, r1, 4; addi r20, r1, 16 } /* r4 = WORD_12 */ 301EX: { lw r4, r1; addi r1, r1, 4; addi r20, r1, 16 } /* r4 = WORD_12 */
300EX: { lw r8, r1; addi r1, r1, 4; slt_u r13, r20, r15 }/* r8 = WORD_13 */ 302EX: { lw r8, r1; addi r1, r1, 4; slt_u r13, r20, r15 }/* r8 = WORD_13 */
301EX: { lw r11, r1; addi r1, r1, -52; mvz r20, r13, r1 } /* r11 = WORD_14 */ 303EX: { lw r11, r1; addi r1, r1, -52; mvz r20, r13, r1 } /* r11 = WORD_14 */
302 304
303 /* Load the three remaining words from the first L1D line, first 305 /* Load the three remaining words from the first L1D line, first
304 * stalling until it has filled by "looking at" r16. 306 * stalling until it has filled by "looking at" r16.
305 */ 307 */
306EX: { lw r13, r1; addi r1, r1, 4; move zero, r16 } /* r13 = WORD_1 */ 308EX: { lw r13, r1; addi r1, r1, 4; move zero, r16 } /* r13 = WORD_1 */
307EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_2 */ 309EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_2 */
308EX: { lw r15, r1; addi r1, r1, 8; addi r10, r0, 60 } /* r15 = WORD_3 */ 310EX: { lw r15, r1; addi r1, r1, 8; addi r10, r0, 60 } /* r15 = WORD_3 */
309 311
310 /* Load second word from the second L1D line, first 312 /* Load second word from the second L1D line, first
311 * stalling until it has filled by "looking at" r17. 313 * stalling until it has filled by "looking at" r17.
312 */ 314 */
313EX: { lw r19, r1; addi r1, r1, 4; move zero, r17 } /* r19 = WORD_5 */ 315EX: { lw r19, r1; addi r1, r1, 4; move zero, r17 } /* r19 = WORD_5 */
314 316
315 /* Store last word to the destination line, potentially dirtying it 317 /* Store last word to the destination line, potentially dirtying it
316 * for the first time, which keeps the L2 busy for two cycles. 318 * for the first time, which keeps the L2 busy for two cycles.
317 */ 319 */
318EX: { sw r10, r12 } /* store(WORD_15) */ 320EX: { sw r10, r12 } /* store(WORD_15) */
319 321
320 /* Use two L1D hits to cover the sw L2 access above. */ 322 /* Use two L1D hits to cover the sw L2 access above. */
321EX: { lw r10, r1; addi r1, r1, 4 } /* r10 = WORD_6 */ 323EX: { lw r10, r1; addi r1, r1, 4 } /* r10 = WORD_6 */
322EX: { lw r12, r1; addi r1, r1, 4 } /* r12 = WORD_7 */ 324EX: { lw r12, r1; addi r1, r1, 4 } /* r12 = WORD_7 */
323 325
324 /* Fill third L1D line. */ 326 /* Fill third L1D line. */
325EX: { lw r18, r1; addi r1, r1, 4 } /* r18 = WORD_8 */ 327EX: { lw r18, r1; addi r1, r1, 4 } /* r18 = WORD_8 */
326 328
327 /* Store first L1D line. */ 329 /* Store first L1D line. */
328EX: { sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */ 330EX: { sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */
329EX: { sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */ 331EX: { sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */
330EX: { sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */ 332EX: { sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */
331#if CHIP_HAS_WH64() 333#if CHIP_HAS_WH64()
332EX: { sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */ 334EX: { sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */
333#else 335#else
334 /* Back up the r9 to a cache line we are already storing to 336 /* Back up the r9 to a cache line we are already storing to
335 * if it gets past the end of the dest vector. Strictly speaking, 337 * if it gets past the end of the dest vector. Strictly speaking,
336 * we don't need to back up to the start of a cache line, but it's free 338 * we don't need to back up to the start of a cache line, but it's free
337 * and tidy, so why not? 339 * and tidy, so why not?
338 */ 340 */
339EX: { sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */ 341EX: { sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */
340#endif 342#endif
341 /* Store second L1D line. */ 343 /* Store second L1D line. */
342EX: { sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */ 344EX: { sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */
343EX: { sw r0, r19; addi r0, r0, 4 } /* store(WORD_5) */ 345EX: { sw r0, r19; addi r0, r0, 4 } /* store(WORD_5) */
344EX: { sw r0, r10; addi r0, r0, 4 } /* store(WORD_6) */ 346EX: { sw r0, r10; addi r0, r0, 4 } /* store(WORD_6) */
@@ -348,30 +350,30 @@ EX: { lw r13, r1; addi r1, r1, 4; move zero, r18 } /* r13 = WORD_9 */
348EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_10 */ 350EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_10 */
349EX: { lw r15, r1; move r1, r20 } /* r15 = WORD_11 */ 351EX: { lw r15, r1; move r1, r20 } /* r15 = WORD_11 */
350 352
351 /* Store third L1D line. */ 353 /* Store third L1D line. */
352EX: { sw r0, r18; addi r0, r0, 4 } /* store(WORD_8) */ 354EX: { sw r0, r18; addi r0, r0, 4 } /* store(WORD_8) */
353EX: { sw r0, r13; addi r0, r0, 4 } /* store(WORD_9) */ 355EX: { sw r0, r13; addi r0, r0, 4 } /* store(WORD_9) */
354EX: { sw r0, r14; addi r0, r0, 4 } /* store(WORD_10) */ 356EX: { sw r0, r14; addi r0, r0, 4 } /* store(WORD_10) */
355EX: { sw r0, r15; addi r0, r0, 4 } /* store(WORD_11) */ 357EX: { sw r0, r15; addi r0, r0, 4 } /* store(WORD_11) */
356 358
357 /* Store rest of fourth L1D line. */ 359 /* Store rest of fourth L1D line. */
358EX: { sw r0, r4; addi r0, r0, 4 } /* store(WORD_12) */ 360EX: { sw r0, r4; addi r0, r0, 4 } /* store(WORD_12) */
359 { 361 {
360EX: sw r0, r8 /* store(WORD_13) */ 362EX: sw r0, r8 /* store(WORD_13) */
361 addi r0, r0, 4 363 addi r0, r0, 4
362 /* Will r2 be > 64 after we subtract 64 below? */ 364 /* Will r2 be > 64 after we subtract 64 below? */
363 shri r4, r2, 7 365 shri r4, r2, 7
364 } 366 }
365 { 367 {
366EX: sw r0, r11 /* store(WORD_14) */ 368EX: sw r0, r11 /* store(WORD_14) */
367 addi r0, r0, 8 369 addi r0, r0, 8
368 /* Record 64 bytes successfully copied. */ 370 /* Record 64 bytes successfully copied. */
369 addi r2, r2, -64 371 addi r2, r2, -64
370 } 372 }
371 373
372 { jrp lr; move lr, r27 } 374 { jrp lr; move lr, r27 }
373 375
374 /* Convey to the backtrace library that the stack frame is size 376 /* Convey to the backtrace library that the stack frame is size
375 * zero, and the real return address is on the stack rather than 377 * zero, and the real return address is on the stack rather than
376 * in 'lr'. 378 * in 'lr'.
377 */ 379 */
diff --git a/arch/tile/lib/memcpy_64.c b/arch/tile/lib/memcpy_64.c
new file mode 100644
index 000000000000..3fab9a6a2bbe
--- /dev/null
+++ b/arch/tile/lib/memcpy_64.c
@@ -0,0 +1,220 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/types.h>
16#include <linux/string.h>
17#include <linux/module.h>
18#define __memcpy memcpy
19/* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */
20
21/* Must be 8 bytes in size. */
22#define word_t uint64_t
23
24#if CHIP_L2_LINE_SIZE() != 64 && CHIP_L2_LINE_SIZE() != 128
25#error "Assumes 64 or 128 byte line size"
26#endif
27
28/* How many cache lines ahead should we prefetch? */
29#define PREFETCH_LINES_AHEAD 3
30
31/*
32 * Provide "base versions" of load and store for the normal code path.
33 * The kernel provides other versions for userspace copies.
34 */
35#define ST(p, v) (*(p) = (v))
36#define LD(p) (*(p))
37
38#ifndef USERCOPY_FUNC
39#define ST1 ST
40#define ST2 ST
41#define ST4 ST
42#define ST8 ST
43#define LD1 LD
44#define LD2 LD
45#define LD4 LD
46#define LD8 LD
47#define RETVAL dstv
48void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n)
49#else
50/*
51 * Special kernel version will provide implementation of the LDn/STn
52 * macros to return a count of uncopied bytes due to mm fault.
53 */
54#define RETVAL 0
55int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
56#endif
57{
58 char *__restrict dst1 = (char *)dstv;
59 const char *__restrict src1 = (const char *)srcv;
60 const char *__restrict src1_end;
61 const char *__restrict prefetch;
62 word_t *__restrict dst8; /* 8-byte pointer to destination memory. */
63 word_t final; /* Final bytes to write to trailing word, if any */
64 long i;
65
66 if (n < 16) {
67 for (; n; n--)
68 ST1(dst1++, LD1(src1++));
69 return RETVAL;
70 }
71
72 /*
73 * Locate the end of source memory we will copy. Don't
74 * prefetch past this.
75 */
76 src1_end = src1 + n - 1;
77
78 /* Prefetch ahead a few cache lines, but not past the end. */
79 prefetch = src1;
80 for (i = 0; i < PREFETCH_LINES_AHEAD; i++) {
81 __insn_prefetch(prefetch);
82 prefetch += CHIP_L2_LINE_SIZE();
83 prefetch = (prefetch > src1_end) ? prefetch : src1;
84 }
85
86 /* Copy bytes until dst is word-aligned. */
87 for (; (uintptr_t)dst1 & (sizeof(word_t) - 1); n--)
88 ST1(dst1++, LD1(src1++));
89
90 /* 8-byte pointer to destination memory. */
91 dst8 = (word_t *)dst1;
92
93 if (__builtin_expect((uintptr_t)src1 & (sizeof(word_t) - 1), 0)) {
94 /*
95 * Misaligned copy. Copy 8 bytes at a time, but don't
96 * bother with other fanciness.
97 *
98 * TODO: Consider prefetching and using wh64 as well.
99 */
100
101 /* Create an aligned src8. */
102 const word_t *__restrict src8 =
103 (const word_t *)((uintptr_t)src1 & -sizeof(word_t));
104 word_t b;
105
106 word_t a = LD8(src8++);
107 for (; n >= sizeof(word_t); n -= sizeof(word_t)) {
108 b = LD8(src8++);
109 a = __insn_dblalign(a, b, src1);
110 ST8(dst8++, a);
111 a = b;
112 }
113
114 if (n == 0)
115 return RETVAL;
116
117 b = ((const char *)src8 <= src1_end) ? *src8 : 0;
118
119 /*
120 * Final source bytes to write to trailing partial
121 * word, if any.
122 */
123 final = __insn_dblalign(a, b, src1);
124 } else {
125 /* Aligned copy. */
126
127 const word_t* __restrict src8 = (const word_t *)src1;
128
129 /* src8 and dst8 are both word-aligned. */
130 if (n >= CHIP_L2_LINE_SIZE()) {
131 /* Copy until 'dst' is cache-line-aligned. */
132 for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1);
133 n -= sizeof(word_t))
134 ST8(dst8++, LD8(src8++));
135
136 for (; n >= CHIP_L2_LINE_SIZE(); ) {
137 __insn_wh64(dst8);
138
139 /*
140 * Prefetch and advance to next line
141 * to prefetch, but don't go past the end
142 */
143 __insn_prefetch(prefetch);
144 prefetch += CHIP_L2_LINE_SIZE();
145 prefetch = (prefetch > src1_end) ? prefetch :
146 (const char *)src8;
147
148 /*
149 * Copy an entire cache line. Manually
150 * unrolled to avoid idiosyncracies of
151 * compiler unrolling.
152 */
153#define COPY_WORD(offset) ({ ST8(dst8+offset, LD8(src8+offset)); n -= 8; })
154 COPY_WORD(0);
155 COPY_WORD(1);
156 COPY_WORD(2);
157 COPY_WORD(3);
158 COPY_WORD(4);
159 COPY_WORD(5);
160 COPY_WORD(6);
161 COPY_WORD(7);
162#if CHIP_L2_LINE_SIZE() == 128
163 COPY_WORD(8);
164 COPY_WORD(9);
165 COPY_WORD(10);
166 COPY_WORD(11);
167 COPY_WORD(12);
168 COPY_WORD(13);
169 COPY_WORD(14);
170 COPY_WORD(15);
171#elif CHIP_L2_LINE_SIZE() != 64
172# error Fix code that assumes particular L2 cache line sizes
173#endif
174
175 dst8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
176 src8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
177 }
178 }
179
180 for (; n >= sizeof(word_t); n -= sizeof(word_t))
181 ST8(dst8++, LD8(src8++));
182
183 if (__builtin_expect(n == 0, 1))
184 return RETVAL;
185
186 final = LD8(src8);
187 }
188
189 /* n != 0 if we get here. Write out any trailing bytes. */
190 dst1 = (char *)dst8;
191 if (n & 4) {
192 ST4((uint32_t *)dst1, final);
193 dst1 += 4;
194 final >>= 32;
195 n &= 3;
196 }
197 if (n & 2) {
198 ST2((uint16_t *)dst1, final);
199 dst1 += 2;
200 final >>= 16;
201 n &= 1;
202 }
203 if (n)
204 ST1((uint8_t *)dst1, final);
205
206 return RETVAL;
207}
208
209
210#ifdef USERCOPY_FUNC
211#undef ST1
212#undef ST2
213#undef ST4
214#undef ST8
215#undef LD1
216#undef LD2
217#undef LD4
218#undef LD8
219#undef USERCOPY_FUNC
220#endif
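
In the misaligned path above, __insn_dblalign stitches two aligned loads into the unaligned doubleword that the destination store needs, so the inner loop still issues only aligned loads. A rough portable approximation of that funnel shift, assuming little-endian byte order and 8-byte words and making no claim to match the intrinsic's exact semantics:

	#include <stdint.h>

	/* a holds the aligned word covering 'src', b the next aligned word;
	 * return the 8 bytes starting at the (possibly unaligned) src. */
	static inline uint64_t funnel_align(uint64_t a, uint64_t b, const void *src)
	{
		unsigned shift = ((uintptr_t)src & 7) * 8;
		return shift ? (a >> shift) | (b << (64 - shift)) : a;
	}
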
diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c
index dfedea7b266b..b2fe15e01075 100644
--- a/arch/tile/lib/memcpy_tile64.c
+++ b/arch/tile/lib/memcpy_tile64.c
@@ -54,7 +54,7 @@ typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long);
  * we must run with interrupts disabled to avoid the risk of some
  * other code seeing the incoherent data in our cache.  (Recall that
  * our cache is indexed by PA, so even if the other code doesn't use
- * our KM_MEMCPY virtual addresses, they'll still hit in cache using
+ * our kmap_atomic virtual addresses, they'll still hit in cache using
  * the normal VAs that aren't supposed to hit in cache.)
  */
 static void memcpy_multicache(void *dest, const void *source,
@@ -64,6 +64,7 @@ static void memcpy_multicache(void *dest, const void *source,
 	unsigned long flags, newsrc, newdst;
 	pmd_t *pmdp;
 	pte_t *ptep;
+	int type0, type1;
 	int cpu = get_cpu();
 
 	/*
@@ -77,7 +78,8 @@ static void memcpy_multicache(void *dest, const void *source,
 	sim_allow_multiple_caching(1);
 
 	/* Set up the new dest mapping */
-	idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + KM_MEMCPY0;
+	type0 = kmap_atomic_idx_push();
+	idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + type0;
 	newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1));
 	pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst);
 	ptep = pte_offset_kernel(pmdp, newdst);
@@ -87,13 +89,14 @@ static void memcpy_multicache(void *dest, const void *source,
 	}
 
 	/* Set up the new source mapping */
-	idx += (KM_MEMCPY0 - KM_MEMCPY1);
+	type1 = kmap_atomic_idx_push();
+	idx += (type0 - type1);
 	src_pte = hv_pte_set_nc(src_pte);
 	src_pte = hv_pte_clear_writable(src_pte); /* be paranoid */
 	newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
 	pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc);
 	ptep = pte_offset_kernel(pmdp, newsrc);
-	*ptep = src_pte;   /* set_pte() would be confused by this */
+	__set_pte(ptep, src_pte); /* set_pte() would be confused by this */
 	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
 
 	/* Actually move the data. */
@@ -106,7 +109,7 @@ static void memcpy_multicache(void *dest, const void *source,
 	 */
 	src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3);
 	src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */
-	*ptep = src_pte;   /* set_pte() would be confused by this */
+	__set_pte(ptep, src_pte); /* set_pte() would be confused by this */
 	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
 
 	/*
@@ -119,6 +122,8 @@ static void memcpy_multicache(void *dest, const void *source,
 	 * We're done: notify the simulator that all is back to normal,
 	 * and re-enable interrupts and pre-emption.
 	 */
+	kmap_atomic_idx_pop();
+	kmap_atomic_idx_pop();
 	sim_allow_multiple_caching(0);
 	local_irq_restore(flags);
 	put_cpu();
diff --git a/arch/tile/lib/memcpy_user_64.c b/arch/tile/lib/memcpy_user_64.c
new file mode 100644
index 000000000000..4763b3aff1cc
--- /dev/null
+++ b/arch/tile/lib/memcpy_user_64.c
@@ -0,0 +1,86 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 *
14 * Do memcpy(), but trap and return "n" when a load or store faults.
15 *
16 * Note: this idiom only works when memcpy() compiles to a leaf function.
17 * If "sp" is updated during memcpy, the "jrp lr" will be incorrect.
18 *
19 * Also note that we are capturing "n" from the containing scope here.
20 */
21
22#define _ST(p, inst, v) \
23 ({ \
24 asm("1: " #inst " %0, %1;" \
25 ".pushsection .coldtext.memcpy,\"ax\";" \
26 "2: { move r0, %2; jrp lr };" \
27 ".section __ex_table,\"a\";" \
28 ".quad 1b, 2b;" \
29 ".popsection" \
30 : "=m" (*(p)) : "r" (v), "r" (n)); \
31 })
32
33#define _LD(p, inst) \
34 ({ \
35 unsigned long __v; \
36 asm("1: " #inst " %0, %1;" \
37 ".pushsection .coldtext.memcpy,\"ax\";" \
38 "2: { move r0, %2; jrp lr };" \
39 ".section __ex_table,\"a\";" \
40 ".quad 1b, 2b;" \
41 ".popsection" \
42 : "=r" (__v) : "m" (*(p)), "r" (n)); \
43 __v; \
44 })
45
46#define USERCOPY_FUNC __copy_to_user_inatomic
47#define ST1(p, v) _ST((p), st1, (v))
48#define ST2(p, v) _ST((p), st2, (v))
49#define ST4(p, v) _ST((p), st4, (v))
50#define ST8(p, v) _ST((p), st, (v))
51#define LD1 LD
52#define LD2 LD
53#define LD4 LD
54#define LD8 LD
55#include "memcpy_64.c"
56
57#define USERCOPY_FUNC __copy_from_user_inatomic
58#define ST1 ST
59#define ST2 ST
60#define ST4 ST
61#define ST8 ST
62#define LD1(p) _LD((p), ld1u)
63#define LD2(p) _LD((p), ld2u)
64#define LD4(p) _LD((p), ld4u)
65#define LD8(p) _LD((p), ld)
66#include "memcpy_64.c"
67
68#define USERCOPY_FUNC __copy_in_user_inatomic
69#define ST1(p, v) _ST((p), st1, (v))
70#define ST2(p, v) _ST((p), st2, (v))
71#define ST4(p, v) _ST((p), st4, (v))
72#define ST8(p, v) _ST((p), st, (v))
73#define LD1(p) _LD((p), ld1u)
74#define LD2(p) _LD((p), ld2u)
75#define LD4(p) _LD((p), ld4u)
76#define LD8(p) _LD((p), ld)
77#include "memcpy_64.c"
78
79unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
80 unsigned long n)
81{
82 unsigned long rc = __copy_from_user_inatomic(to, from, n);
83 if (unlikely(rc))
84 memset(to + n - rc, 0, rc);
85 return rc;
86}
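
The file above compiles memcpy_64.c three more times, each with LD*/ST* macros that wrap the access in an exception-table fixup, so one copy loop serves memcpy() and the __copy_to/from/in_user_inatomic() variants. A stripped-down sketch of that "one body, many instantiations" pattern, with entirely made-up file and function names for illustration:

	/* copy_body.h: shared loop, parameterized by FUNC, LD and ST. */
	int FUNC(void *dst, const void *src, size_t n)
	{
		char *d = dst;
		const char *s = src;
		while (n--)
			ST(d++, LD(s++));
		return 0;		/* a fixup variant would return bytes left */
	}

	/* A user of the header instantiates one variant per inclusion: */
	#define FUNC	plain_copy
	#define LD(p)	(*(p))
	#define ST(p, v)	(*(p) = (v))
	#include "copy_body.h"
	#undef FUNC
	#undef LD
	#undef ST
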
diff --git a/arch/tile/lib/memmove_32.c b/arch/tile/lib/memmove.c
index fd615ae6ade7..fd615ae6ade7 100644
--- a/arch/tile/lib/memmove_32.c
+++ b/arch/tile/lib/memmove.c
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
index d014c1fbcbc2..57dbb3a5bff8 100644
--- a/arch/tile/lib/memset_32.c
+++ b/arch/tile/lib/memset_32.c
@@ -18,6 +18,7 @@
 #include <linux/string.h>
 #include <linux/module.h>
 
+#undef memset
 
 void *memset(void *s, int c, size_t n)
 {
diff --git a/arch/tile/lib/memset_64.c b/arch/tile/lib/memset_64.c
new file mode 100644
index 000000000000..3873085711d5
--- /dev/null
+++ b/arch/tile/lib/memset_64.c
@@ -0,0 +1,145 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <arch/chip.h>
16
17#include <linux/types.h>
18#include <linux/string.h>
19#include <linux/module.h>
20
21#undef memset
22
23void *memset(void *s, int c, size_t n)
24{
25 uint64_t *out64;
26 int n64, to_align64;
27 uint64_t v64;
28 uint8_t *out8 = s;
29
30 /* Experimentation shows that a trivial tight loop is a win up until
31 * around a size of 20, where writing a word at a time starts to win.
32 */
33#define BYTE_CUTOFF 20
34
35#if BYTE_CUTOFF < 7
36 /* This must be at least at least this big, or some code later
37 * on doesn't work.
38 */
39#error "BYTE_CUTOFF is too small"
40#endif
41
42 if (n < BYTE_CUTOFF) {
43 /* Strangely, this turns out to be the tightest way to
44 * write this loop.
45 */
46 if (n != 0) {
47 do {
48 /* Strangely, combining these into one line
49 * performs worse.
50 */
51 *out8 = c;
52 out8++;
53 } while (--n != 0);
54 }
55
56 return s;
57 }
58
59 /* Align 'out8'. We know n >= 7 so this won't write past the end. */
60 while (((uintptr_t) out8 & 7) != 0) {
61 *out8++ = c;
62 --n;
63 }
64
65 /* Align 'n'. */
66 while (n & 7)
67 out8[--n] = c;
68
69 out64 = (uint64_t *) out8;
70 n64 = n >> 3;
71
72 /* Tile input byte out to 64 bits. */
73 /* KLUDGE */
74 v64 = 0x0101010101010101ULL * (uint8_t)c;
75
76 /* This must be at least 8 or the following loop doesn't work. */
77#define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8)
78
 79 /* Determine how many words we need to emit before the 'out64'
80 * pointer becomes aligned modulo the cache line size.
81 */
82 to_align64 = (-((uintptr_t)out64 >> 3)) &
83 (CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1);
84
85 /* Only bother aligning and using wh64 if there is at least
86 * one full cache line to process. This check also prevents
87 * overrunning the end of the buffer with alignment words.
88 */
89 if (to_align64 <= n64 - CACHE_LINE_SIZE_IN_DOUBLEWORDS) {
90 int lines_left;
91
92 /* Align out64 mod the cache line size so we can use wh64. */
93 n64 -= to_align64;
94 for (; to_align64 != 0; to_align64--) {
95 *out64 = v64;
96 out64++;
97 }
98
99 /* Use unsigned divide to turn this into a right shift. */
100 lines_left = (unsigned)n64 / CACHE_LINE_SIZE_IN_DOUBLEWORDS;
101
102 do {
103 /* Only wh64 a few lines at a time, so we don't
104 * exceed the maximum number of victim lines.
105 */
106 int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS())
107 ? lines_left
108 : CHIP_MAX_OUTSTANDING_VICTIMS());
109 uint64_t *wh = out64;
110 int i = x;
111 int j;
112
113 lines_left -= x;
114
115 do {
116 __insn_wh64(wh);
117 wh += CACHE_LINE_SIZE_IN_DOUBLEWORDS;
118 } while (--i);
119
120 for (j = x * (CACHE_LINE_SIZE_IN_DOUBLEWORDS / 4);
121 j != 0; j--) {
122 *out64++ = v64;
123 *out64++ = v64;
124 *out64++ = v64;
125 *out64++ = v64;
126 }
127 } while (lines_left != 0);
128
129 /* We processed all full lines above, so only this many
130 * words remain to be processed.
131 */
132 n64 &= CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1;
133 }
134
135 /* Now handle any leftover values. */
136 if (n64 != 0) {
137 do {
138 *out64 = v64;
139 out64++;
140 } while (--n64 != 0);
141 }
142
143 return s;
144}
145EXPORT_SYMBOL(memset);
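A portable sketch of the core idea in the new memset_64.c, assuming only the byte-replication and word-store steps matter; the tile-specific wh64 cache-line priming is omitted and the function name is illustrative.

#include <stdint.h>
#include <stddef.h>

void *memset_word_sketch(void *s, int c, size_t n)
{
	uint8_t *out8 = s;
	uint64_t *out64;
	const uint64_t v64 = 0x0101010101010101ULL * (uint8_t)c; /* byte -> word */

	/* Byte stores until the pointer is 8-byte aligned (or n runs out). */
	while (n != 0 && ((uintptr_t)out8 & 7) != 0) {
		*out8++ = (uint8_t)c;
		n--;
	}

	/* Word stores for the bulk of the buffer. */
	out64 = (uint64_t *)out8;
	while (n >= 8) {
		*out64++ = v64;
		n -= 8;
	}

	/* Trailing bytes. */
	out8 = (uint8_t *)out64;
	while (n != 0) {
		*out8++ = (uint8_t)c;
		n--;
	}
	return s;
}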
diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c
index 485e24d62c6b..cb0999fb64b4 100644
--- a/arch/tile/lib/spinlock_32.c
+++ b/arch/tile/lib/spinlock_32.c
@@ -15,6 +15,7 @@
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <asm/processor.h> 17#include <asm/processor.h>
18#include <arch/spr_def.h>
18 19
19#include "spinlock_common.h" 20#include "spinlock_common.h"
20 21
@@ -91,75 +92,75 @@ EXPORT_SYMBOL(arch_spin_unlock_wait);
91#define RD_COUNT_MASK ((1 << RD_COUNT_WIDTH) - 1) 92#define RD_COUNT_MASK ((1 << RD_COUNT_WIDTH) - 1)
92 93
93 94
94/* Lock the word, spinning until there are no tns-ers. */ 95/*
95static inline u32 get_rwlock(arch_rwlock_t *rwlock) 96 * We can get the read lock if everything but the reader bits (which
96{ 97 * are in the high part of the word) is zero, i.e. no active or
97 u32 iterations = 0; 98 * waiting writers, no tns.
98 for (;;) { 99 *
99 u32 val = __insn_tns((int *)&rwlock->lock); 100 * We guard the tns/store-back with an interrupt critical section to
100 if (unlikely(val & 1)) { 101 * preserve the semantic that the same read lock can be acquired in an
101 delay_backoff(iterations++); 102 * interrupt context.
102 continue; 103 */
103 } 104inline int arch_read_trylock(arch_rwlock_t *rwlock)
104 return val;
105 }
106}
107
108int arch_read_trylock_slow(arch_rwlock_t *rwlock)
109{
110 u32 val = get_rwlock(rwlock);
111 int locked = (val << RD_COUNT_WIDTH) == 0;
112 rwlock->lock = val + (locked << RD_COUNT_SHIFT);
113 return locked;
114}
115EXPORT_SYMBOL(arch_read_trylock_slow);
116
117void arch_read_unlock_slow(arch_rwlock_t *rwlock)
118{
119 u32 val = get_rwlock(rwlock);
120 rwlock->lock = val - (1 << RD_COUNT_SHIFT);
121}
122EXPORT_SYMBOL(arch_read_unlock_slow);
123
124void arch_write_unlock_slow(arch_rwlock_t *rwlock, u32 val)
125{ 105{
126 u32 eq, mask = 1 << WR_CURR_SHIFT; 106 u32 val;
127 while (unlikely(val & 1)) { 107 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
128 /* Limited backoff since we are the highest-priority task. */ 108 val = __insn_tns((int *)&rwlock->lock);
129 relax(4); 109 if (likely((val << _RD_COUNT_WIDTH) == 0)) {
130 val = __insn_tns((int *)&rwlock->lock); 110 val += 1 << RD_COUNT_SHIFT;
111 rwlock->lock = val;
112 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
113 BUG_ON(val == 0); /* we don't expect wraparound */
114 return 1;
131 } 115 }
132 val = __insn_addb(val, mask); 116 if ((val & 1) == 0)
133 eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT)); 117 rwlock->lock = val;
134 val = __insn_mz(eq & mask, val); 118 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
135 rwlock->lock = val; 119 return 0;
136} 120}
137EXPORT_SYMBOL(arch_write_unlock_slow); 121EXPORT_SYMBOL(arch_read_trylock);
138 122
139/* 123/*
140 * We spin until everything but the reader bits (which are in the high 124 * Spin doing arch_read_trylock() until we acquire the lock.
141 * part of the word) are zero, i.e. no active or waiting writers, no tns.
142 *
143 * ISSUE: This approach can permanently starve readers. A reader who sees 125 * ISSUE: This approach can permanently starve readers. A reader who sees
144 * a writer could instead take a ticket lock (just like a writer would), 126 * a writer could instead take a ticket lock (just like a writer would),
145 * and atomically enter read mode (with 1 reader) when it gets the ticket. 127 * and atomically enter read mode (with 1 reader) when it gets the ticket.
146 * This way both readers and writers will always make forward progress 128 * This way both readers and writers would always make forward progress
147 * in a finite time. 129 * in a finite time.
148 */ 130 */
149void arch_read_lock_slow(arch_rwlock_t *rwlock, u32 val) 131void arch_read_lock(arch_rwlock_t *rwlock)
150{ 132{
151 u32 iterations = 0; 133 u32 iterations = 0;
152 do { 134 while (unlikely(!arch_read_trylock(rwlock)))
153 if (!(val & 1))
154 rwlock->lock = val;
155 delay_backoff(iterations++); 135 delay_backoff(iterations++);
136}
137EXPORT_SYMBOL(arch_read_lock);
138
139void arch_read_unlock(arch_rwlock_t *rwlock)
140{
141 u32 val, iterations = 0;
142
143 mb(); /* guarantee anything modified under the lock is visible */
144 for (;;) {
145 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
156 val = __insn_tns((int *)&rwlock->lock); 146 val = __insn_tns((int *)&rwlock->lock);
 157 } while ((val << RD_COUNT_WIDTH) != 0); 147 if (likely((val & 1) == 0)) {
158 rwlock->lock = val + (1 << RD_COUNT_SHIFT); 148 rwlock->lock = val - (1 << _RD_COUNT_SHIFT);
149 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
150 break;
151 }
152 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
153 delay_backoff(iterations++);
154 }
159} 155}
160EXPORT_SYMBOL(arch_read_lock_slow); 156EXPORT_SYMBOL(arch_read_unlock);
161 157
162void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val) 158/*
159 * We don't need an interrupt critical section here (unlike for
160 * arch_read_lock) since we should never use a bare write lock where
161 * it could be interrupted by code that could try to re-acquire it.
162 */
163void arch_write_lock(arch_rwlock_t *rwlock)
163{ 164{
164 /* 165 /*
165 * The trailing underscore on this variable (and curr_ below) 166 * The trailing underscore on this variable (and curr_ below)
@@ -167,23 +168,36 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)
167 * when we compare them. 168 * when we compare them.
168 */ 169 */
169 u32 my_ticket_; 170 u32 my_ticket_;
171 u32 iterations = 0;
172 u32 val = __insn_tns((int *)&rwlock->lock);
170 173
171 /* Take out the next ticket; this will also stop would-be readers. */ 174 if (likely(val == 0)) {
172 if (val & 1) 175 rwlock->lock = 1 << _WR_NEXT_SHIFT;
173 val = get_rwlock(rwlock); 176 return;
174 rwlock->lock = __insn_addb(val, 1 << WR_NEXT_SHIFT); 177 }
175
176 /* Extract my ticket value from the original word. */
177 my_ticket_ = val >> WR_NEXT_SHIFT;
178 178
179 /* 179 /*
180 * Wait until the "current" field matches our ticket, and 180 * Wait until there are no readers, then bump up the next
181 * there are no remaining readers. 181 * field and capture the ticket value.
182 */ 182 */
183 for (;;) { 183 for (;;) {
184 if (!(val & 1)) {
185 if ((val >> RD_COUNT_SHIFT) == 0)
186 break;
187 rwlock->lock = val;
188 }
189 delay_backoff(iterations++);
190 val = __insn_tns((int *)&rwlock->lock);
191 }
192
193 /* Take out the next ticket and extract my ticket value. */
194 rwlock->lock = __insn_addb(val, 1 << WR_NEXT_SHIFT);
195 my_ticket_ = val >> WR_NEXT_SHIFT;
196
197 /* Wait until the "current" field matches our ticket. */
198 for (;;) {
184 u32 curr_ = val >> WR_CURR_SHIFT; 199 u32 curr_ = val >> WR_CURR_SHIFT;
185 u32 readers = val >> RD_COUNT_SHIFT; 200 u32 delta = ((my_ticket_ - curr_) & WR_MASK);
186 u32 delta = ((my_ticket_ - curr_) & WR_MASK) + !!readers;
187 if (likely(delta == 0)) 201 if (likely(delta == 0))
188 break; 202 break;
189 203
@@ -199,23 +213,47 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)
199 relax(4); 213 relax(4);
200 } 214 }
201} 215}
202EXPORT_SYMBOL(arch_write_lock_slow); 216EXPORT_SYMBOL(arch_write_lock);
203 217
204int __tns_atomic_acquire(atomic_t *lock) 218int arch_write_trylock(arch_rwlock_t *rwlock)
205{ 219{
206 int ret; 220 u32 val = __insn_tns((int *)&rwlock->lock);
207 u32 iterations = 0;
208 221
209 BUG_ON(__insn_mfspr(SPR_INTERRUPT_CRITICAL_SECTION)); 222 /*
210 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1); 223 * If a tns is in progress, or there's a waiting or active locker,
224 * or active readers, we can't take the lock, so give up.
225 */
226 if (unlikely(val != 0)) {
227 if (!(val & 1))
228 rwlock->lock = val;
229 return 0;
230 }
211 231
212 while ((ret = __insn_tns((void *)&lock->counter)) == 1) 232 /* Set the "next" field to mark it locked. */
213 delay_backoff(iterations++); 233 rwlock->lock = 1 << _WR_NEXT_SHIFT;
214 return ret; 234 return 1;
215} 235}
236EXPORT_SYMBOL(arch_write_trylock);
216 237
217void __tns_atomic_release(atomic_t *p, int v) 238void arch_write_unlock(arch_rwlock_t *rwlock)
218{ 239{
219 p->counter = v; 240 u32 val, eq, mask;
220 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); 241
242 mb(); /* guarantee anything modified under the lock is visible */
243 val = __insn_tns((int *)&rwlock->lock);
244 if (likely(val == (1 << _WR_NEXT_SHIFT))) {
245 rwlock->lock = 0;
246 return;
247 }
248 while (unlikely(val & 1)) {
249 /* Limited backoff since we are the highest-priority task. */
250 relax(4);
251 val = __insn_tns((int *)&rwlock->lock);
252 }
253 mask = 1 << WR_CURR_SHIFT;
254 val = __insn_addb(val, mask);
255 eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT));
256 val = __insn_mz(eq & mask, val);
257 rwlock->lock = val;
221} 258}
259EXPORT_SYMBOL(arch_write_unlock);
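A sketch of the rwlock word layout the new arch_read_trylock() path relies on. The field positions below are assumptions inferred from the shifts used in the code; the real definitions live in the tile spinlock headers.

#include <stdint.h>

/*
 * Assumed layout of the 32-bit rwlock word:
 *   bit 0       - tns marker: someone is busy updating the word
 *   middle bits - writer "current" and "next" ticket fields
 *   high bits   - reader count (RD_COUNT_SHIFT / RD_COUNT_WIDTH)
 */
#define RD_COUNT_SHIFT	24
#define RD_COUNT_WIDTH	8

/* A read lock may be taken only when everything below the reader count
 * is zero: no tns in progress, no active or waiting writer. */
static inline int read_lock_available(uint32_t val)
{
	return (val << RD_COUNT_WIDTH) == 0;
}

/* Taking or dropping a read lock just moves the count in the high bits. */
static inline uint32_t add_reader(uint32_t val)
{
	return val + (1u << RD_COUNT_SHIFT);
}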
diff --git a/arch/tile/lib/spinlock_64.c b/arch/tile/lib/spinlock_64.c
new file mode 100644
index 000000000000..d6fb9581e980
--- /dev/null
+++ b/arch/tile/lib/spinlock_64.c
@@ -0,0 +1,104 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/spinlock.h>
16#include <linux/module.h>
17#include <asm/processor.h>
18
19#include "spinlock_common.h"
20
21/*
22 * Read the spinlock value without allocating in our cache and without
23 * causing an invalidation to another cpu with a copy of the cacheline.
24 * This is important when we are spinning waiting for the lock.
25 */
26static inline u32 arch_spin_read_noalloc(void *lock)
27{
28 return atomic_cmpxchg((atomic_t *)lock, -1, -1);
29}
30
31/*
32 * Wait until the high bits (current) match my ticket.
33 * If we notice the overflow bit set on entry, we clear it.
34 */
35void arch_spin_lock_slow(arch_spinlock_t *lock, u32 my_ticket)
36{
37 if (unlikely(my_ticket & __ARCH_SPIN_NEXT_OVERFLOW)) {
38 __insn_fetchand4(&lock->lock, ~__ARCH_SPIN_NEXT_OVERFLOW);
39 my_ticket &= ~__ARCH_SPIN_NEXT_OVERFLOW;
40 }
41
42 for (;;) {
43 u32 val = arch_spin_read_noalloc(lock);
44 u32 delta = my_ticket - arch_spin_current(val);
45 if (delta == 0)
46 return;
47 relax((128 / CYCLES_PER_RELAX_LOOP) * delta);
48 }
49}
50EXPORT_SYMBOL(arch_spin_lock_slow);
51
52/*
53 * Check the lock to see if it is plausible, and try to get it with cmpxchg().
54 */
55int arch_spin_trylock(arch_spinlock_t *lock)
56{
57 u32 val = arch_spin_read_noalloc(lock);
58 if (unlikely(arch_spin_current(val) != arch_spin_next(val)))
59 return 0;
60 return cmpxchg(&lock->lock, val, (val + 1) & ~__ARCH_SPIN_NEXT_OVERFLOW)
61 == val;
62}
63EXPORT_SYMBOL(arch_spin_trylock);
64
65void arch_spin_unlock_wait(arch_spinlock_t *lock)
66{
67 u32 iterations = 0;
68 while (arch_spin_is_locked(lock))
69 delay_backoff(iterations++);
70}
71EXPORT_SYMBOL(arch_spin_unlock_wait);
72
73/*
74 * If the read lock fails due to a writer, we retry periodically
75 * until the value is positive and we write our incremented reader count.
76 */
77void __read_lock_failed(arch_rwlock_t *rw)
78{
79 u32 val;
80 int iterations = 0;
81 do {
82 delay_backoff(iterations++);
83 val = __insn_fetchaddgez4(&rw->lock, 1);
84 } while (unlikely(arch_write_val_locked(val)));
85}
86EXPORT_SYMBOL(__read_lock_failed);
87
88/*
89 * If we failed because there were readers, clear the "writer" bit
90 * so we don't block additional readers. Otherwise, there was another
91 * writer anyway, so our "fetchor" made no difference. Then wait,
92 * issuing periodic fetchor instructions, till we get the lock.
93 */
94void __write_lock_failed(arch_rwlock_t *rw, u32 val)
95{
96 int iterations = 0;
97 do {
98 if (!arch_write_val_locked(val))
99 val = __insn_fetchand4(&rw->lock, ~__WRITE_LOCK_BIT);
100 delay_backoff(iterations++);
101 val = __insn_fetchor4(&rw->lock, __WRITE_LOCK_BIT);
102 } while (val != 0);
103}
104EXPORT_SYMBOL(__write_lock_failed);
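A portable sketch of the ticket shape that arch_spin_trylock() above depends on, assuming 16-bit "current"/"next" halves; the real tile field widths and the __ARCH_SPIN_NEXT_OVERFLOW handling differ, and the GCC __atomic builtins stand in for the tile cmpxchg.

#include <stdint.h>

typedef struct { uint32_t lock; } ticket_lock_t;	/* illustrative type */

static inline uint32_t ticket_current(uint32_t v) { return v >> 16; }
static inline uint32_t ticket_next(uint32_t v)    { return v & 0xffffu; }

/* Try once: succeed only if "current" and "next" agree, i.e. nobody
 * holds or is waiting for the lock.  The ticket we take is then served
 * immediately, and bumping "next" makes later arrivals queue behind us. */
static int ticket_trylock(ticket_lock_t *l)
{
	uint32_t v = __atomic_load_n(&l->lock, __ATOMIC_RELAXED);

	if (ticket_current(v) != ticket_next(v))
		return 0;
	return __atomic_compare_exchange_n(&l->lock, &v, v + 1, 0,
					   __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}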
diff --git a/arch/tile/lib/strchr_64.c b/arch/tile/lib/strchr_64.c
new file mode 100644
index 000000000000..617a9273aaa8
--- /dev/null
+++ b/arch/tile/lib/strchr_64.c
@@ -0,0 +1,67 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/types.h>
16#include <linux/string.h>
17#include <linux/module.h>
18
19#undef strchr
20
21char *strchr(const char *s, int c)
22{
23 int z, g;
24
25 /* Get an aligned pointer. */
26 const uintptr_t s_int = (uintptr_t) s;
27 const uint64_t *p = (const uint64_t *)(s_int & -8);
28
29 /* Create eight copies of the byte for which we are looking. */
30 const uint64_t goal = 0x0101010101010101ULL * (uint8_t) c;
31
32 /* Read the first aligned word, but force bytes before the string to
33 * match neither zero nor goal (we make sure the high bit of each
34 * byte is 1, and the low 7 bits are all the opposite of the goal
35 * byte).
36 *
37 * Note that this shift count expression works because we know shift
38 * counts are taken mod 64.
39 */
40 const uint64_t before_mask = (1ULL << (s_int << 3)) - 1;
41 uint64_t v = (*p | before_mask) ^
42 (goal & __insn_v1shrsi(before_mask, 1));
43
44 uint64_t zero_matches, goal_matches;
45 while (1) {
46 /* Look for a terminating '\0'. */
47 zero_matches = __insn_v1cmpeqi(v, 0);
48
49 /* Look for the goal byte. */
50 goal_matches = __insn_v1cmpeq(v, goal);
51
52 if (__builtin_expect((zero_matches | goal_matches) != 0, 0))
53 break;
54
55 v = *++p;
56 }
57
58 z = __insn_ctz(zero_matches);
59 g = __insn_ctz(goal_matches);
60
61 /* If we found c before '\0' we got a match. Note that if c == '\0'
62 * then g == z, and we correctly return the address of the '\0'
63 * rather than NULL.
64 */
65 return (g <= z) ? ((char *)p) + (g >> 3) : NULL;
66}
67EXPORT_SYMBOL(strchr);
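The v1cmpeqi/v1cmpeq instructions used above compare all eight bytes of a word in one step; a portable stand-in is the classic SWAR zero-byte test, sketched here (constant and function names are illustrative).

#include <stdint.h>

#define ONES	0x0101010101010101ULL
#define HIGHS	0x8080808080808080ULL

/* Nonzero iff some byte of w is zero. */
static inline uint64_t has_zero_byte(uint64_t w)
{
	return (w - ONES) & ~w & HIGHS;
}

/* Nonzero iff some byte of w equals c: XOR turns matching bytes into
 * zero bytes, then the zero-byte test finds them. */
static inline uint64_t has_byte(uint64_t w, unsigned char c)
{
	return has_zero_byte(w ^ (ONES * c));
}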
diff --git a/arch/tile/lib/strlen_32.c b/arch/tile/lib/strlen_32.c
index f26f88e11e4a..4974292a5534 100644
--- a/arch/tile/lib/strlen_32.c
+++ b/arch/tile/lib/strlen_32.c
@@ -16,6 +16,8 @@
16#include <linux/string.h> 16#include <linux/string.h>
17#include <linux/module.h> 17#include <linux/module.h>
18 18
19#undef strlen
20
19size_t strlen(const char *s) 21size_t strlen(const char *s)
20{ 22{
21 /* Get an aligned pointer. */ 23 /* Get an aligned pointer. */
diff --git a/arch/tile/lib/strlen_64.c b/arch/tile/lib/strlen_64.c
new file mode 100644
index 000000000000..1c92d46202a8
--- /dev/null
+++ b/arch/tile/lib/strlen_64.c
@@ -0,0 +1,38 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/types.h>
16#include <linux/string.h>
17#include <linux/module.h>
18
19#undef strlen
20
21size_t strlen(const char *s)
22{
23 /* Get an aligned pointer. */
24 const uintptr_t s_int = (uintptr_t) s;
25 const uint64_t *p = (const uint64_t *)(s_int & -8);
26
27 /* Read the first word, but force bytes before the string to be nonzero.
28 * This expression works because we know shift counts are taken mod 64.
29 */
30 uint64_t v = *p | ((1ULL << (s_int << 3)) - 1);
31
32 uint64_t bits;
33 while ((bits = __insn_v1cmpeqi(v, 0)) == 0)
34 v = *++p;
35
36 return ((const char *)p) + (__insn_ctz(bits) >> 3) - s;
37}
38EXPORT_SYMBOL(strlen);
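The subtle step shared by strlen_64.c and strchr_64.c is masking the first aligned word so that bytes sitting before the start of the string cannot look like a terminator. A portable little-endian sketch follows; the kernel code shifts by (s_int << 3) and relies on tile's mod-64 shift counts, while the sketch computes the count explicitly, and the function name is illustrative.

#include <stdint.h>

static inline uint64_t first_word_masked(const char *s)
{
	uintptr_t s_int = (uintptr_t)s;
	/* Reading the aligned word may touch up to 7 bytes before s, but
	 * never crosses out of that word, mirroring the kernel routine. */
	const uint64_t *p = (const uint64_t *)(s_int & ~(uintptr_t)7);
	/* 8 bits per byte that precedes the string; skip is 0..56, so the
	 * shift below is always well defined. */
	unsigned skip = (unsigned)(s_int & 7) * 8;
	uint64_t before_mask = (1ULL << skip) - 1;

	/* Forcing those low bytes to 0xff means they can never match '\0'. */
	return *p | before_mask;
}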
diff --git a/arch/tile/lib/usercopy_64.S b/arch/tile/lib/usercopy_64.S
new file mode 100644
index 000000000000..2ff44f87b78e
--- /dev/null
+++ b/arch/tile/lib/usercopy_64.S
@@ -0,0 +1,196 @@
1/*
2 * Copyright 2011 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/linkage.h>
16#include <asm/errno.h>
17#include <asm/cache.h>
18#include <arch/chip.h>
19
20/* Access user memory, but use MMU to avoid propagating kernel exceptions. */
21
22 .pushsection .fixup,"ax"
23
24get_user_fault:
25 { movei r1, -EFAULT; move r0, zero }
26 jrp lr
27 ENDPROC(get_user_fault)
28
29put_user_fault:
30 { movei r0, -EFAULT; jrp lr }
31 ENDPROC(put_user_fault)
32
33 .popsection
34
35/*
36 * __get_user_N functions take a pointer in r0, and return 0 in r1
37 * on success, with the value in r0; or else -EFAULT in r1.
38 */
39#define __get_user_N(bytes, LOAD) \
40 STD_ENTRY(__get_user_##bytes); \
411: { LOAD r0, r0; move r1, zero }; \
42 jrp lr; \
43 STD_ENDPROC(__get_user_##bytes); \
44 .pushsection __ex_table,"a"; \
45 .quad 1b, get_user_fault; \
46 .popsection
47
48__get_user_N(1, ld1u)
49__get_user_N(2, ld2u)
50__get_user_N(4, ld4u)
51__get_user_N(8, ld)
52
53/*
54 * __put_user_N functions take a value in r0 and a pointer in r1,
55 * and return 0 in r0 on success or -EFAULT on failure.
56 */
57#define __put_user_N(bytes, STORE) \
58 STD_ENTRY(__put_user_##bytes); \
591: { STORE r1, r0; move r0, zero }; \
60 jrp lr; \
61 STD_ENDPROC(__put_user_##bytes); \
62 .pushsection __ex_table,"a"; \
63 .quad 1b, put_user_fault; \
64 .popsection
65
66__put_user_N(1, st1)
67__put_user_N(2, st2)
68__put_user_N(4, st4)
69__put_user_N(8, st)
70
71/*
72 * strnlen_user_asm takes the pointer in r0, and the length bound in r1.
73 * It returns the length, including the terminating NUL, or zero on exception.
74 * If length is greater than the bound, returns one plus the bound.
75 */
76STD_ENTRY(strnlen_user_asm)
77 { beqz r1, 2f; addi r3, r0, -1 } /* bias down to include NUL */
781: { ld1u r4, r0; addi r1, r1, -1 }
79 beqz r4, 2f
80 { bnezt r1, 1b; addi r0, r0, 1 }
812: { sub r0, r0, r3; jrp lr }
82 STD_ENDPROC(strnlen_user_asm)
83 .pushsection .fixup,"ax"
84strnlen_user_fault:
85 { move r0, zero; jrp lr }
86 ENDPROC(strnlen_user_fault)
87 .section __ex_table,"a"
88 .quad 1b, strnlen_user_fault
89 .popsection
90
91/*
92 * strncpy_from_user_asm takes the kernel target pointer in r0,
93 * the userspace source pointer in r1, and the length bound (including
94 * the trailing NUL) in r2. On success, it returns the string length
95 * (not including the trailing NUL), or -EFAULT on failure.
96 */
97STD_ENTRY(strncpy_from_user_asm)
98 { beqz r2, 2f; move r3, r0 }
991: { ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
100 { st1 r0, r4; addi r0, r0, 1 }
101 beqz r2, 2f
102 bnezt r4, 1b
103 addi r0, r0, -1 /* don't count the trailing NUL */
1042: { sub r0, r0, r3; jrp lr }
105 STD_ENDPROC(strncpy_from_user_asm)
106 .pushsection .fixup,"ax"
107strncpy_from_user_fault:
108 { movei r0, -EFAULT; jrp lr }
109 ENDPROC(strncpy_from_user_fault)
110 .section __ex_table,"a"
111 .quad 1b, strncpy_from_user_fault
112 .popsection
113
114/*
115 * clear_user_asm takes the user target address in r0 and the
116 * number of bytes to zero in r1.
117 * It returns the number of uncopiable bytes (hopefully zero) in r0.
118 * Note that we don't use a separate .fixup section here since we fall
119 * through into the "fixup" code as the last straight-line bundle anyway.
120 */
121STD_ENTRY(clear_user_asm)
122 { beqz r1, 2f; or r2, r0, r1 }
123 andi r2, r2, 7
124 beqzt r2, .Lclear_aligned_user_asm
1251: { st1 r0, zero; addi r0, r0, 1; addi r1, r1, -1 }
126 bnezt r1, 1b
1272: { move r0, r1; jrp lr }
128 .pushsection __ex_table,"a"
129 .quad 1b, 2b
130 .popsection
131
132.Lclear_aligned_user_asm:
1331: { st r0, zero; addi r0, r0, 8; addi r1, r1, -8 }
134 bnezt r1, 1b
1352: { move r0, r1; jrp lr }
136 STD_ENDPROC(clear_user_asm)
137 .pushsection __ex_table,"a"
138 .quad 1b, 2b
139 .popsection
140
141/*
142 * flush_user_asm takes the user target address in r0 and the
143 * number of bytes to flush in r1.
144 * It returns the number of unflushable bytes (hopefully zero) in r0.
145 */
146STD_ENTRY(flush_user_asm)
147 beqz r1, 2f
148 { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
149 { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
150 { and r0, r0, r2; and r1, r1, r2 }
151 { sub r1, r1, r0 }
1521: { flush r0; addi r1, r1, -CHIP_FLUSH_STRIDE() }
153 { addi r0, r0, CHIP_FLUSH_STRIDE(); bnezt r1, 1b }
1542: { move r0, r1; jrp lr }
155 STD_ENDPROC(flush_user_asm)
156 .pushsection __ex_table,"a"
157 .quad 1b, 2b
158 .popsection
159
160/*
161 * inv_user_asm takes the user target address in r0 and the
162 * number of bytes to invalidate in r1.
163 * It returns the number of not inv'able bytes (hopefully zero) in r0.
164 */
165STD_ENTRY(inv_user_asm)
166 beqz r1, 2f
167 { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
168 { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
169 { and r0, r0, r2; and r1, r1, r2 }
170 { sub r1, r1, r0 }
1711: { inv r0; addi r1, r1, -CHIP_INV_STRIDE() }
172 { addi r0, r0, CHIP_INV_STRIDE(); bnezt r1, 1b }
1732: { move r0, r1; jrp lr }
174 STD_ENDPROC(inv_user_asm)
175 .pushsection __ex_table,"a"
176 .quad 1b, 2b
177 .popsection
178
179/*
180 * finv_user_asm takes the user target address in r0 and the
181 * number of bytes to flush-invalidate in r1.
182 * It returns the number of not finv'able bytes (hopefully zero) in r0.
183 */
184STD_ENTRY(finv_user_asm)
185 beqz r1, 2f
186 { movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
187 { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
188 { and r0, r0, r2; and r1, r1, r2 }
189 { sub r1, r1, r0 }
1901: { finv r0; addi r1, r1, -CHIP_FINV_STRIDE() }
191 { addi r0, r0, CHIP_FINV_STRIDE(); bnezt r1, 1b }
1922: { move r0, r1; jrp lr }
193 STD_ENDPROC(finv_user_asm)
194 .pushsection __ex_table,"a"
195 .quad 1b, 2b
196 .popsection
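The ".quad 1b, 2b" and ".quad 1b, <fault label>" directives above populate the kernel's exception table for these user accesses. A conceptual C sketch of that lookup follows; the structure and function names are illustrative, not the tile or generic kernel definitions.

#include <stdint.h>
#include <stddef.h>

struct ex_table_entry {
	uint64_t insn;	/* address of an instruction that may fault on user memory */
	uint64_t fixup;	/* where to resume if it does fault */
};

/* On a fault while touching user memory, the trap handler searches the
 * table for the faulting PC; if an entry exists, execution resumes at
 * the fixup (which typically returns -EFAULT or a residual byte count)
 * instead of treating the fault as a kernel crash. */
static uint64_t find_fixup(const struct ex_table_entry *tbl, size_t n,
			   uint64_t fault_pc)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (tbl[i].insn == fault_pc)
			return tbl[i].fixup;
	return 0;	/* no fixup: genuine kernel bug */
}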