| author | Glenn Elliott <gelliott@cs.unc.edu> | 2012-03-04 19:47:13 -0500 |
|---|---|---|
| committer | Glenn Elliott <gelliott@cs.unc.edu> | 2012-03-04 19:47:13 -0500 |
| commit | c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch) | |
| tree | ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/tile/lib | |
| parent | ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff) | |
| parent | 6a00f206debf8a5c8899055726ad127dbeeed098 (diff) | |
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts:
litmus/sched_cedf.c
Diffstat (limited to 'arch/tile/lib')
-rw-r--r-- | arch/tile/lib/Makefile | 5
-rw-r--r-- | arch/tile/lib/atomic_32.c | 25
-rw-r--r-- | arch/tile/lib/atomic_asm_32.S | 4
-rw-r--r-- | arch/tile/lib/cacheflush.c | 120
-rw-r--r-- | arch/tile/lib/delay.c | 21
-rw-r--r-- | arch/tile/lib/exports.c | 7
-rw-r--r-- | arch/tile/lib/mb_incoherent.S | 34
-rw-r--r-- | arch/tile/lib/memchr_32.c | 35
-rw-r--r-- | arch/tile/lib/memchr_64.c | 71
-rw-r--r-- | arch/tile/lib/memcpy_32.S | 206
-rw-r--r-- | arch/tile/lib/memcpy_64.c | 220
-rw-r--r-- | arch/tile/lib/memcpy_tile64.c | 15
-rw-r--r-- | arch/tile/lib/memcpy_user_64.c | 86
-rw-r--r-- | arch/tile/lib/memmove.c (renamed from arch/tile/lib/memmove_32.c) | 0
-rw-r--r-- | arch/tile/lib/memset_32.c | 1
-rw-r--r-- | arch/tile/lib/memset_64.c | 145
-rw-r--r-- | arch/tile/lib/spinlock_32.c | 190
-rw-r--r-- | arch/tile/lib/spinlock_64.c | 104
-rw-r--r-- | arch/tile/lib/strchr_64.c | 67
-rw-r--r-- | arch/tile/lib/strlen_32.c | 2
-rw-r--r-- | arch/tile/lib/strlen_64.c | 38
-rw-r--r-- | arch/tile/lib/usercopy_64.S | 196
22 files changed, 1336 insertions, 256 deletions
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile
index 746dc81ed3c4..0c26086ecbef 100644
--- a/arch/tile/lib/Makefile
+++ b/arch/tile/lib/Makefile
@@ -2,9 +2,8 @@ | |||
2 | # Makefile for TILE-specific library files.. | 2 | # Makefile for TILE-specific library files.. |
3 | # | 3 | # |
4 | 4 | ||
5 | lib-y = cacheflush.o checksum.o cpumask.o delay.o \ | 5 | lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \ |
6 | mb_incoherent.o uaccess.o \ | 6 | memmove.o memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \ |
7 | memcpy_$(BITS).o memchr_$(BITS).o memmove_$(BITS).o memset_$(BITS).o \ | ||
8 | strchr_$(BITS).o strlen_$(BITS).o | 7 | strchr_$(BITS).o strlen_$(BITS).o |
9 | 8 | ||
10 | ifeq ($(CONFIG_TILEGX),y) | 9 | ifeq ($(CONFIG_TILEGX),y) |
diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c
index 8040b42a8eea..46570211df52 100644
--- a/arch/tile/lib/atomic_32.c
+++ b/arch/tile/lib/atomic_32.c
@@ -46,14 +46,13 @@ struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE] | |||
46 | #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ | 46 | #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ |
47 | 47 | ||
48 | /* This page is remapped on startup to be hash-for-home. */ | 48 | /* This page is remapped on startup to be hash-for-home. */ |
49 | int atomic_locks[PAGE_SIZE / sizeof(int) /* Only ATOMIC_HASH_SIZE is used */] | 49 | int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss; |
50 | __attribute__((aligned(PAGE_SIZE), section(".bss.page_aligned"))); | ||
51 | 50 | ||
52 | #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ | 51 | #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ |
53 | 52 | ||
54 | static inline int *__atomic_hashed_lock(volatile void *v) | 53 | static inline int *__atomic_hashed_lock(volatile void *v) |
55 | { | 54 | { |
56 | /* NOTE: this code must match "sys_cmpxchg" in kernel/intvec.S */ | 55 | /* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */ |
57 | #if ATOMIC_LOCKS_FOUND_VIA_TABLE() | 56 | #if ATOMIC_LOCKS_FOUND_VIA_TABLE() |
58 | unsigned long i = | 57 | unsigned long i = |
59 | (unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long)); | 58 | (unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long)); |
@@ -203,32 +202,32 @@ static inline int *__futex_setup(int __user *v) | |||
203 | return __atomic_hashed_lock((int __force *)v); | 202 | return __atomic_hashed_lock((int __force *)v); |
204 | } | 203 | } |
205 | 204 | ||
206 | struct __get_user futex_set(int __user *v, int i) | 205 | struct __get_user futex_set(u32 __user *v, int i) |
207 | { | 206 | { |
208 | return __atomic_xchg((int __force *)v, __futex_setup(v), i); | 207 | return __atomic_xchg((int __force *)v, __futex_setup(v), i); |
209 | } | 208 | } |
210 | 209 | ||
211 | struct __get_user futex_add(int __user *v, int n) | 210 | struct __get_user futex_add(u32 __user *v, int n) |
212 | { | 211 | { |
213 | return __atomic_xchg_add((int __force *)v, __futex_setup(v), n); | 212 | return __atomic_xchg_add((int __force *)v, __futex_setup(v), n); |
214 | } | 213 | } |
215 | 214 | ||
216 | struct __get_user futex_or(int __user *v, int n) | 215 | struct __get_user futex_or(u32 __user *v, int n) |
217 | { | 216 | { |
218 | return __atomic_or((int __force *)v, __futex_setup(v), n); | 217 | return __atomic_or((int __force *)v, __futex_setup(v), n); |
219 | } | 218 | } |
220 | 219 | ||
221 | struct __get_user futex_andn(int __user *v, int n) | 220 | struct __get_user futex_andn(u32 __user *v, int n) |
222 | { | 221 | { |
223 | return __atomic_andn((int __force *)v, __futex_setup(v), n); | 222 | return __atomic_andn((int __force *)v, __futex_setup(v), n); |
224 | } | 223 | } |
225 | 224 | ||
226 | struct __get_user futex_xor(int __user *v, int n) | 225 | struct __get_user futex_xor(u32 __user *v, int n) |
227 | { | 226 | { |
228 | return __atomic_xor((int __force *)v, __futex_setup(v), n); | 227 | return __atomic_xor((int __force *)v, __futex_setup(v), n); |
229 | } | 228 | } |
230 | 229 | ||
231 | struct __get_user futex_cmpxchg(int __user *v, int o, int n) | 230 | struct __get_user futex_cmpxchg(u32 __user *v, int o, int n) |
232 | { | 231 | { |
233 | return __atomic_cmpxchg((int __force *)v, __futex_setup(v), o, n); | 232 | return __atomic_cmpxchg((int __force *)v, __futex_setup(v), o, n); |
234 | } | 233 | } |
@@ -300,7 +299,7 @@ void __init __init_atomic_per_cpu(void) | |||
300 | #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ | 299 | #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ |
301 | 300 | ||
302 | /* Validate power-of-two and "bigger than cpus" assumption */ | 301 | /* Validate power-of-two and "bigger than cpus" assumption */ |
303 | BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1)); | 302 | BUILD_BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1)); |
304 | BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids); | 303 | BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids); |
305 | 304 | ||
306 | /* | 305 | /* |
@@ -314,17 +313,17 @@ void __init __init_atomic_per_cpu(void) | |||
314 | BUG_ON((unsigned long)atomic_locks % PAGE_SIZE != 0); | 313 | BUG_ON((unsigned long)atomic_locks % PAGE_SIZE != 0); |
315 | 314 | ||
316 | /* The locks must all fit on one page. */ | 315 | /* The locks must all fit on one page. */ |
317 | BUG_ON(ATOMIC_HASH_SIZE * sizeof(int) > PAGE_SIZE); | 316 | BUILD_BUG_ON(ATOMIC_HASH_SIZE * sizeof(int) > PAGE_SIZE); |
318 | 317 | ||
319 | /* | 318 | /* |
320 | * We use the page offset of the atomic value's address as | 319 | * We use the page offset of the atomic value's address as |
321 | * an index into atomic_locks, excluding the low 3 bits. | 320 | * an index into atomic_locks, excluding the low 3 bits. |
322 | * That should not produce more indices than ATOMIC_HASH_SIZE. | 321 | * That should not produce more indices than ATOMIC_HASH_SIZE. |
323 | */ | 322 | */ |
324 | BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE); | 323 | BUILD_BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE); |
325 | 324 | ||
326 | #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ | 325 | #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ |
327 | 326 | ||
328 | /* The futex code makes this assumption, so we validate it here. */ | 327 | /* The futex code makes this assumption, so we validate it here. */ |
329 | BUG_ON(sizeof(atomic_t) != sizeof(int)); | 328 | BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int)); |
330 | } | 329 | } |
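The atomic_32.c hunks above convert several sanity checks whose operands are compile-time constants from runtime BUG_ON() to BUILD_BUG_ON(). As a rough illustration of why such checks can be rejected by the compiler instead of at boot, here is a minimal compile-time-assertion sketch in plain C; the macro name is hypothetical and the kernel's real BUILD_BUG_ON is implemented differently:

```c
/*
 * Minimal sketch of a BUILD_BUG_ON()-style check (illustrative only;
 * not the kernel's actual macro). If the condition is a nonzero
 * constant, the array size becomes negative and compilation fails.
 */
#define COMPILE_TIME_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))

static inline void example_sanity_checks(void)
{
	/* Analogous to the checks in __init_atomic_per_cpu() above: */
	COMPILE_TIME_BUG_ON(sizeof(int) != 4);    /* e.g. sizeof(atomic_t) != sizeof(int) */
	COMPILE_TIME_BUG_ON(4096 & (4096 - 1));   /* e.g. ATOMIC_HASH_SIZE power-of-two test */
}
```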
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
index 5a5514b77e78..24448734f6f1 100644
--- a/arch/tile/lib/atomic_asm_32.S
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -14,7 +14,7 @@ | |||
14 | * Support routines for atomic operations. Each function takes: | 14 | * Support routines for atomic operations. Each function takes: |
15 | * | 15 | * |
16 | * r0: address to manipulate | 16 | * r0: address to manipulate |
17 | * r1: pointer to atomic lock guarding this operation (for FUTEX_LOCK_REG) | 17 | * r1: pointer to atomic lock guarding this operation (for ATOMIC_LOCK_REG) |
18 | * r2: new value to write, or for cmpxchg/add_unless, value to compare against | 18 | * r2: new value to write, or for cmpxchg/add_unless, value to compare against |
19 | * r3: (cmpxchg/xchg_add_unless) new value to write or add; | 19 | * r3: (cmpxchg/xchg_add_unless) new value to write or add; |
20 | * (atomic64 ops) high word of value to write | 20 | * (atomic64 ops) high word of value to write |
@@ -59,7 +59,7 @@ | |||
59 | * bad kernel addresses). | 59 | * bad kernel addresses). |
60 | * | 60 | * |
61 | * Note that if the value we would store is the same as what we | 61 | * Note that if the value we would store is the same as what we |
62 | * loaded, we bypass the load. Other platforms with true atomics can | 62 | * loaded, we bypass the store. Other platforms with true atomics can |
63 | * make the guarantee that a non-atomic __clear_bit(), for example, | 63 | * make the guarantee that a non-atomic __clear_bit(), for example, |
64 | * can safely race with an atomic test_and_set_bit(); this example is | 64 | * can safely race with an atomic test_and_set_bit(); this example is |
65 | * from bit_spinlock.h in slub_lock() / slub_unlock(). We can't do | 65 | * from bit_spinlock.h in slub_lock() / slub_unlock(). We can't do |
diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c
index 11b6164c2097..8928aace7a64 100644
--- a/arch/tile/lib/cacheflush.c
+++ b/arch/tile/lib/cacheflush.c
@@ -15,9 +15,129 @@ | |||
15 | #include <asm/page.h> | 15 | #include <asm/page.h> |
16 | #include <asm/cacheflush.h> | 16 | #include <asm/cacheflush.h> |
17 | #include <arch/icache.h> | 17 | #include <arch/icache.h> |
18 | #include <arch/spr_def.h> | ||
18 | 19 | ||
19 | 20 | ||
20 | void __flush_icache_range(unsigned long start, unsigned long end) | 21 | void __flush_icache_range(unsigned long start, unsigned long end) |
21 | { | 22 | { |
22 | invalidate_icache((const void *)start, end - start, PAGE_SIZE); | 23 | invalidate_icache((const void *)start, end - start, PAGE_SIZE); |
23 | } | 24 | } |
25 | |||
26 | |||
27 | /* Force a load instruction to issue. */ | ||
28 | static inline void force_load(char *p) | ||
29 | { | ||
30 | *(volatile char *)p; | ||
31 | } | ||
32 | |||
33 | /* | ||
34 | * Flush and invalidate a VA range that is homed remotely on a single | ||
35 | * core (if "!hfh") or homed via hash-for-home (if "hfh"), waiting | ||
36 | * until the memory controller holds the flushed values. | ||
37 | */ | ||
38 | void finv_buffer_remote(void *buffer, size_t size, int hfh) | ||
39 | { | ||
40 | char *p, *base; | ||
41 | size_t step_size, load_count; | ||
42 | const unsigned long STRIPE_WIDTH = 8192; | ||
43 | #ifdef __tilegx__ | ||
44 | /* | ||
45 | * On TILE-Gx, we must disable the dstream prefetcher before doing | ||
46 | * a cache flush; otherwise, we could end up with data in the cache | ||
47 | * that we don't want there. Note that normally we'd do an mf | ||
48 | * after the SPR write to disabling the prefetcher, but we do one | ||
49 | * below, before any further loads, so there's no need to do it | ||
50 | * here. | ||
51 | */ | ||
52 | uint_reg_t old_dstream_pf = __insn_mfspr(SPR_DSTREAM_PF); | ||
53 | __insn_mtspr(SPR_DSTREAM_PF, 0); | ||
54 | #endif | ||
55 | |||
56 | /* | ||
57 | * Flush and invalidate the buffer out of the local L1/L2 | ||
58 | * and request the home cache to flush and invalidate as well. | ||
59 | */ | ||
60 | __finv_buffer(buffer, size); | ||
61 | |||
62 | /* | ||
63 | * Wait for the home cache to acknowledge that it has processed | ||
64 | * all the flush-and-invalidate requests. This does not mean | ||
65 | * that the flushed data has reached the memory controller yet, | ||
66 | * but it does mean the home cache is processing the flushes. | ||
67 | */ | ||
68 | __insn_mf(); | ||
69 | |||
70 | /* | ||
71 | * Issue a load to the last cache line, which can't complete | ||
72 | * until all the previously-issued flushes to the same memory | ||
73 | * controller have also completed. If we weren't striping | ||
74 | * memory, that one load would be sufficient, but since we may | ||
75 | * be, we also need to back up to the last load issued to | ||
76 | * another memory controller, which would be the point where | ||
77 | * we crossed an 8KB boundary (the granularity of striping | ||
78 | * across memory controllers). Keep backing up and doing this | ||
79 | * until we are before the beginning of the buffer, or have | ||
80 | * hit all the controllers. | ||
81 | * | ||
82 | * If we are flushing a hash-for-home buffer, it's even worse. | ||
83 | * Each line may be homed on a different tile, and each tile | ||
84 | * may have up to four lines that are on different | ||
85 | * controllers. So as we walk backwards, we have to touch | ||
86 | * enough cache lines to satisfy these constraints. In | ||
87 | * practice this ends up being close enough to "load from | ||
88 | * every cache line on a full memory stripe on each | ||
89 | * controller" that we simply do that, to simplify the logic. | ||
90 | * | ||
91 | * FIXME: See bug 9535 for some issues with this code. | ||
92 | */ | ||
93 | if (hfh) { | ||
94 | step_size = L2_CACHE_BYTES; | ||
95 | load_count = (STRIPE_WIDTH / L2_CACHE_BYTES) * | ||
96 | (1 << CHIP_LOG_NUM_MSHIMS()); | ||
97 | } else { | ||
98 | step_size = STRIPE_WIDTH; | ||
99 | load_count = (1 << CHIP_LOG_NUM_MSHIMS()); | ||
100 | } | ||
101 | |||
102 | /* Load the last byte of the buffer. */ | ||
103 | p = (char *)buffer + size - 1; | ||
104 | force_load(p); | ||
105 | |||
106 | /* Bump down to the end of the previous stripe or cache line. */ | ||
107 | p -= step_size; | ||
108 | p = (char *)((unsigned long)p | (step_size - 1)); | ||
109 | |||
110 | /* Figure out how far back we need to go. */ | ||
111 | base = p - (step_size * (load_count - 2)); | ||
112 | if ((long)base < (long)buffer) | ||
113 | base = buffer; | ||
114 | |||
115 | /* | ||
116 | * Fire all the loads we need. The MAF only has eight entries | ||
117 | * so we can have at most eight outstanding loads, so we | ||
118 | * unroll by that amount. | ||
119 | */ | ||
120 | #pragma unroll 8 | ||
121 | for (; p >= base; p -= step_size) | ||
122 | force_load(p); | ||
123 | |||
124 | /* | ||
125 | * Repeat, but with inv's instead of loads, to get rid of the | ||
126 | * data we just loaded into our own cache and the old home L3. | ||
127 | * No need to unroll since inv's don't target a register. | ||
128 | */ | ||
129 | p = (char *)buffer + size - 1; | ||
130 | __insn_inv(p); | ||
131 | p -= step_size; | ||
132 | p = (char *)((unsigned long)p | (step_size - 1)); | ||
133 | for (; p >= base; p -= step_size) | ||
134 | __insn_inv(p); | ||
135 | |||
136 | /* Wait for the load+inv's (and thus finvs) to have completed. */ | ||
137 | __insn_mf(); | ||
138 | |||
139 | #ifdef __tilegx__ | ||
140 | /* Reenable the prefetcher. */ | ||
141 | __insn_mtspr(SPR_DSTREAM_PF, old_dstream_pf); | ||
142 | #endif | ||
143 | } | ||
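The finv_buffer_remote() routine added above flushes and invalidates a remotely homed (or hash-for-home) buffer and then issues loads spread across the memory-controller stripes so that the final memory fence proves the data has reached the controllers. A hypothetical caller, sketched only to show the intended calling pattern (the DMA scenario and helper name are not from the patch):

```c
#include <stddef.h>

/* Declared by the hunk above. */
void finv_buffer_remote(void *buffer, size_t size, int hfh);

/*
 * Hypothetical example: push a hash-for-home buffer out to memory
 * before handing it to a non-coherent consumer such as a DMA engine.
 */
static void publish_buffer(void *buf, size_t len)
{
	/* Third argument: nonzero for hash-for-home homing,
	 * zero for a buffer homed on a single remote core. */
	finv_buffer_remote(buf, len, 1);

	/* The flushed contents are now held by the memory controllers,
	 * so a reader that bypasses the caches will observe them. */
}
```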
diff --git a/arch/tile/lib/delay.c b/arch/tile/lib/delay.c
index 5801b03c13ef..cdacdd11d360 100644
--- a/arch/tile/lib/delay.c
+++ b/arch/tile/lib/delay.c
@@ -15,20 +15,31 @@ | |||
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/delay.h> | 16 | #include <linux/delay.h> |
17 | #include <linux/thread_info.h> | 17 | #include <linux/thread_info.h> |
18 | #include <asm/fixmap.h> | 18 | #include <asm/timex.h> |
19 | #include <hv/hypervisor.h> | ||
20 | 19 | ||
21 | void __udelay(unsigned long usecs) | 20 | void __udelay(unsigned long usecs) |
22 | { | 21 | { |
23 | hv_nanosleep(usecs * 1000); | 22 | if (usecs > ULONG_MAX / 1000) { |
23 | WARN_ON_ONCE(usecs > ULONG_MAX / 1000); | ||
24 | usecs = ULONG_MAX / 1000; | ||
25 | } | ||
26 | __ndelay(usecs * 1000); | ||
24 | } | 27 | } |
25 | EXPORT_SYMBOL(__udelay); | 28 | EXPORT_SYMBOL(__udelay); |
26 | 29 | ||
27 | void __ndelay(unsigned long nsecs) | 30 | void __ndelay(unsigned long nsecs) |
28 | { | 31 | { |
29 | hv_nanosleep(nsecs); | 32 | cycles_t target = get_cycles(); |
33 | target += ns2cycles(nsecs); | ||
34 | while (get_cycles() < target) | ||
35 | cpu_relax(); | ||
30 | } | 36 | } |
31 | EXPORT_SYMBOL(__ndelay); | 37 | EXPORT_SYMBOL(__ndelay); |
32 | 38 | ||
33 | /* FIXME: should be declared in a header somewhere. */ | 39 | void __delay(unsigned long cycles) |
40 | { | ||
41 | cycles_t target = get_cycles() + cycles; | ||
42 | while (get_cycles() < target) | ||
43 | cpu_relax(); | ||
44 | } | ||
34 | EXPORT_SYMBOL(__delay); | 45 | EXPORT_SYMBOL(__delay); |
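The rewritten delay.c above replaces the hypervisor nanosleep calls with busy-waiting on the cycle counter, and __udelay() now clamps its argument so the usecs * 1000 multiplication cannot overflow. The same pattern, reduced to a self-contained C sketch with hypothetical stand-ins for get_cycles(), ns2cycles() and cpu_relax():

```c
#include <limits.h>

/*
 * Hypothetical stand-ins for the kernel helpers: on real hardware these
 * would read the free-running cycle counter and scale nanoseconds by
 * the CPU clock rate.
 */
static unsigned long long counter;
static unsigned long long read_cycles(void) { return ++counter; }
static unsigned long long ns_to_cycles(unsigned long ns) { return ns; /* 1 GHz assumed */ }
static void relax(void) { /* pause/yield hint */ }

static void spin_ndelay(unsigned long nsecs)
{
	unsigned long long target = read_cycles() + ns_to_cycles(nsecs);

	while (read_cycles() < target)
		relax();
}

static void spin_udelay(unsigned long usecs)
{
	/* Clamp first so usecs * 1000 cannot wrap, as __udelay() now does. */
	if (usecs > ULONG_MAX / 1000)
		usecs = ULONG_MAX / 1000;
	spin_ndelay(usecs * 1000);
}
```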
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
index ce5dbf56578f..49284fae9d09 100644
--- a/arch/tile/lib/exports.c
+++ b/arch/tile/lib/exports.c
@@ -29,6 +29,9 @@ EXPORT_SYMBOL(__put_user_8); | |||
29 | EXPORT_SYMBOL(strnlen_user_asm); | 29 | EXPORT_SYMBOL(strnlen_user_asm); |
30 | EXPORT_SYMBOL(strncpy_from_user_asm); | 30 | EXPORT_SYMBOL(strncpy_from_user_asm); |
31 | EXPORT_SYMBOL(clear_user_asm); | 31 | EXPORT_SYMBOL(clear_user_asm); |
32 | EXPORT_SYMBOL(flush_user_asm); | ||
33 | EXPORT_SYMBOL(inv_user_asm); | ||
34 | EXPORT_SYMBOL(finv_user_asm); | ||
32 | 35 | ||
33 | /* arch/tile/kernel/entry.S */ | 36 | /* arch/tile/kernel/entry.S */ |
34 | #include <linux/kernel.h> | 37 | #include <linux/kernel.h> |
@@ -82,4 +85,8 @@ int64_t __muldi3(int64_t, int64_t); | |||
82 | EXPORT_SYMBOL(__muldi3); | 85 | EXPORT_SYMBOL(__muldi3); |
83 | uint64_t __lshrdi3(uint64_t, unsigned int); | 86 | uint64_t __lshrdi3(uint64_t, unsigned int); |
84 | EXPORT_SYMBOL(__lshrdi3); | 87 | EXPORT_SYMBOL(__lshrdi3); |
88 | uint64_t __ashrdi3(uint64_t, unsigned int); | ||
89 | EXPORT_SYMBOL(__ashrdi3); | ||
90 | uint64_t __ashldi3(uint64_t, unsigned int); | ||
91 | EXPORT_SYMBOL(__ashldi3); | ||
85 | #endif | 92 | #endif |
diff --git a/arch/tile/lib/mb_incoherent.S b/arch/tile/lib/mb_incoherent.S
deleted file mode 100644
index 989ad7b68d5a..000000000000
--- a/arch/tile/lib/mb_incoherent.S
+++ /dev/null
@@ -1,34 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | * | ||
14 | * Assembly code for invoking the HV's fence_incoherent syscall. | ||
15 | */ | ||
16 | |||
17 | #include <linux/linkage.h> | ||
18 | #include <hv/syscall_public.h> | ||
19 | #include <arch/abi.h> | ||
20 | #include <arch/chip.h> | ||
21 | |||
22 | #if !CHIP_HAS_MF_WAITS_FOR_VICTIMS() | ||
23 | |||
24 | /* | ||
25 | * Invoke the hypervisor's fence_incoherent syscall, which guarantees | ||
26 | * that all victims for cachelines homed on this tile have reached memory. | ||
27 | */ | ||
28 | STD_ENTRY(__mb_incoherent) | ||
29 | moveli TREG_SYSCALL_NR_NAME, HV_SYS_fence_incoherent | ||
30 | swint2 | ||
31 | jrp lr | ||
32 | STD_ENDPROC(__mb_incoherent) | ||
33 | |||
34 | #endif | ||
diff --git a/arch/tile/lib/memchr_32.c b/arch/tile/lib/memchr_32.c
index 6235283b4859..cc3d9badf030 100644
--- a/arch/tile/lib/memchr_32.c
+++ b/arch/tile/lib/memchr_32.c
@@ -18,12 +18,24 @@ | |||
18 | 18 | ||
19 | void *memchr(const void *s, int c, size_t n) | 19 | void *memchr(const void *s, int c, size_t n) |
20 | { | 20 | { |
21 | const uint32_t *last_word_ptr; | ||
22 | const uint32_t *p; | ||
23 | const char *last_byte_ptr; | ||
24 | uintptr_t s_int; | ||
25 | uint32_t goal, before_mask, v, bits; | ||
26 | char *ret; | ||
27 | |||
28 | if (__builtin_expect(n == 0, 0)) { | ||
29 | /* Don't dereference any memory if the array is empty. */ | ||
30 | return NULL; | ||
31 | } | ||
32 | |||
21 | /* Get an aligned pointer. */ | 33 | /* Get an aligned pointer. */ |
22 | const uintptr_t s_int = (uintptr_t) s; | 34 | s_int = (uintptr_t) s; |
23 | const uint32_t *p = (const uint32_t *)(s_int & -4); | 35 | p = (const uint32_t *)(s_int & -4); |
24 | 36 | ||
25 | /* Create four copies of the byte for which we are looking. */ | 37 | /* Create four copies of the byte for which we are looking. */ |
26 | const uint32_t goal = 0x01010101 * (uint8_t) c; | 38 | goal = 0x01010101 * (uint8_t) c; |
27 | 39 | ||
28 | /* Read the first word, but munge it so that bytes before the array | 40 | /* Read the first word, but munge it so that bytes before the array |
29 | * will not match goal. | 41 | * will not match goal. |
@@ -31,23 +43,14 @@ void *memchr(const void *s, int c, size_t n) | |||
31 | * Note that this shift count expression works because we know | 43 | * Note that this shift count expression works because we know |
32 | * shift counts are taken mod 32. | 44 | * shift counts are taken mod 32. |
33 | */ | 45 | */ |
34 | const uint32_t before_mask = (1 << (s_int << 3)) - 1; | 46 | before_mask = (1 << (s_int << 3)) - 1; |
35 | uint32_t v = (*p | before_mask) ^ (goal & before_mask); | 47 | v = (*p | before_mask) ^ (goal & before_mask); |
36 | 48 | ||
37 | /* Compute the address of the last byte. */ | 49 | /* Compute the address of the last byte. */ |
38 | const char *const last_byte_ptr = (const char *)s + n - 1; | 50 | last_byte_ptr = (const char *)s + n - 1; |
39 | 51 | ||
40 | /* Compute the address of the word containing the last byte. */ | 52 | /* Compute the address of the word containing the last byte. */ |
41 | const uint32_t *const last_word_ptr = | 53 | last_word_ptr = (const uint32_t *)((uintptr_t) last_byte_ptr & -4); |
42 | (const uint32_t *)((uintptr_t) last_byte_ptr & -4); | ||
43 | |||
44 | uint32_t bits; | ||
45 | char *ret; | ||
46 | |||
47 | if (__builtin_expect(n == 0, 0)) { | ||
48 | /* Don't dereference any memory if the array is empty. */ | ||
49 | return NULL; | ||
50 | } | ||
51 | 54 | ||
52 | while ((bits = __insn_seqb(v, goal)) == 0) { | 55 | while ((bits = __insn_seqb(v, goal)) == 0) { |
53 | if (__builtin_expect(p == last_word_ptr, 0)) { | 56 | if (__builtin_expect(p == last_word_ptr, 0)) { |
diff --git a/arch/tile/lib/memchr_64.c b/arch/tile/lib/memchr_64.c
new file mode 100644
index 000000000000..84fdc8d8e735
--- /dev/null
+++ b/arch/tile/lib/memchr_64.c
@@ -0,0 +1,71 @@ | |||
1 | /* | ||
2 | * Copyright 2011 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/types.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/module.h> | ||
18 | |||
19 | void *memchr(const void *s, int c, size_t n) | ||
20 | { | ||
21 | const uint64_t *last_word_ptr; | ||
22 | const uint64_t *p; | ||
23 | const char *last_byte_ptr; | ||
24 | uintptr_t s_int; | ||
25 | uint64_t goal, before_mask, v, bits; | ||
26 | char *ret; | ||
27 | |||
28 | if (__builtin_expect(n == 0, 0)) { | ||
29 | /* Don't dereference any memory if the array is empty. */ | ||
30 | return NULL; | ||
31 | } | ||
32 | |||
33 | /* Get an aligned pointer. */ | ||
34 | s_int = (uintptr_t) s; | ||
35 | p = (const uint64_t *)(s_int & -8); | ||
36 | |||
37 | /* Create eight copies of the byte for which we are looking. */ | ||
38 | goal = 0x0101010101010101ULL * (uint8_t) c; | ||
39 | |||
40 | /* Read the first word, but munge it so that bytes before the array | ||
41 | * will not match goal. | ||
42 | * | ||
43 | * Note that this shift count expression works because we know | ||
44 | * shift counts are taken mod 64. | ||
45 | */ | ||
46 | before_mask = (1ULL << (s_int << 3)) - 1; | ||
47 | v = (*p | before_mask) ^ (goal & before_mask); | ||
48 | |||
49 | /* Compute the address of the last byte. */ | ||
50 | last_byte_ptr = (const char *)s + n - 1; | ||
51 | |||
52 | /* Compute the address of the word containing the last byte. */ | ||
53 | last_word_ptr = (const uint64_t *)((uintptr_t) last_byte_ptr & -8); | ||
54 | |||
55 | while ((bits = __insn_v1cmpeq(v, goal)) == 0) { | ||
56 | if (__builtin_expect(p == last_word_ptr, 0)) { | ||
57 | /* We already read the last word in the array, | ||
58 | * so give up. | ||
59 | */ | ||
60 | return NULL; | ||
61 | } | ||
62 | v = *++p; | ||
63 | } | ||
64 | |||
65 | /* We found a match, but it might be in a byte past the end | ||
66 | * of the array. | ||
67 | */ | ||
68 | ret = ((char *)p) + (__insn_ctz(bits) >> 3); | ||
69 | return (ret <= last_byte_ptr) ? ret : NULL; | ||
70 | } | ||
71 | EXPORT_SYMBOL(memchr); | ||
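Both memchr implementations above use the same word-at-a-time idea: replicate the target byte across a word, compare a whole aligned word per iteration (seqb/v1cmpeq), then locate the first matching byte with a count-trailing-zeros. A portable sketch of that core step, assuming a little-endian machine and GCC-style builtins, and omitting the head/tail handling the real code performs:

```c
#include <stdint.h>
#include <stddef.h>

/* Replicate the searched byte into every byte lane of a 64-bit word. */
static uint64_t replicate(unsigned char c)
{
	return 0x0101010101010101ULL * c;
}

/*
 * Return a pointer to the first byte in the aligned word 'w' (loaded
 * from 'word_base') that equals the replicated goal, or NULL if none.
 */
static const char *match_in_word(uint64_t w, uint64_t goal, const char *word_base)
{
	uint64_t x = w ^ goal;   /* matching bytes become 0x00 */
	/* Classic has-zero-byte trick; false positives can only appear
	 * above the first true match, so ctz still finds the right byte. */
	uint64_t hits = (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;

	if (hits == 0)
		return NULL;
	return word_base + (__builtin_ctzll(hits) >> 3);   /* little-endian byte index */
}
```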
diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S
index 30c3b7ebb55d..2a419a6122db 100644
--- a/arch/tile/lib/memcpy_32.S
+++ b/arch/tile/lib/memcpy_32.S
@@ -10,14 +10,16 @@ | |||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | 10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or |
11 | * NON INFRINGEMENT. See the GNU General Public License for | 11 | * NON INFRINGEMENT. See the GNU General Public License for |
12 | * more details. | 12 | * more details. |
13 | * | ||
14 | * This file shares the implementation of the userspace memcpy and | ||
15 | * the kernel's memcpy, copy_to_user and copy_from_user. | ||
16 | */ | 13 | */ |
17 | 14 | ||
18 | #include <arch/chip.h> | 15 | #include <arch/chip.h> |
19 | 16 | ||
20 | 17 | ||
18 | /* | ||
19 | * This file shares the implementation of the userspace memcpy and | ||
20 | * the kernel's memcpy, copy_to_user and copy_from_user. | ||
21 | */ | ||
22 | |||
21 | #include <linux/linkage.h> | 23 | #include <linux/linkage.h> |
22 | 24 | ||
23 | /* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */ | 25 | /* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */ |
@@ -53,9 +55,9 @@ | |||
53 | */ | 55 | */ |
54 | ENTRY(__copy_from_user_inatomic) | 56 | ENTRY(__copy_from_user_inatomic) |
55 | .type __copy_from_user_inatomic, @function | 57 | .type __copy_from_user_inatomic, @function |
56 | FEEDBACK_ENTER_EXPLICIT(__copy_from_user_inatomic, \ | 58 | FEEDBACK_ENTER_EXPLICIT(__copy_from_user_inatomic, \ |
57 | .text.memcpy_common, \ | 59 | .text.memcpy_common, \ |
58 | .Lend_memcpy_common - __copy_from_user_inatomic) | 60 | .Lend_memcpy_common - __copy_from_user_inatomic) |
59 | { movei r29, IS_COPY_FROM_USER; j memcpy_common } | 61 | { movei r29, IS_COPY_FROM_USER; j memcpy_common } |
60 | .size __copy_from_user_inatomic, . - __copy_from_user_inatomic | 62 | .size __copy_from_user_inatomic, . - __copy_from_user_inatomic |
61 | 63 | ||
@@ -64,7 +66,7 @@ ENTRY(__copy_from_user_inatomic) | |||
64 | */ | 66 | */ |
65 | ENTRY(__copy_from_user_zeroing) | 67 | ENTRY(__copy_from_user_zeroing) |
66 | .type __copy_from_user_zeroing, @function | 68 | .type __copy_from_user_zeroing, @function |
67 | FEEDBACK_REENTER(__copy_from_user_inatomic) | 69 | FEEDBACK_REENTER(__copy_from_user_inatomic) |
68 | { movei r29, IS_COPY_FROM_USER_ZEROING; j memcpy_common } | 70 | { movei r29, IS_COPY_FROM_USER_ZEROING; j memcpy_common } |
69 | .size __copy_from_user_zeroing, . - __copy_from_user_zeroing | 71 | .size __copy_from_user_zeroing, . - __copy_from_user_zeroing |
70 | 72 | ||
@@ -74,13 +76,13 @@ ENTRY(__copy_from_user_zeroing) | |||
74 | */ | 76 | */ |
75 | ENTRY(__copy_to_user_inatomic) | 77 | ENTRY(__copy_to_user_inatomic) |
76 | .type __copy_to_user_inatomic, @function | 78 | .type __copy_to_user_inatomic, @function |
77 | FEEDBACK_REENTER(__copy_from_user_inatomic) | 79 | FEEDBACK_REENTER(__copy_from_user_inatomic) |
78 | { movei r29, IS_COPY_TO_USER; j memcpy_common } | 80 | { movei r29, IS_COPY_TO_USER; j memcpy_common } |
79 | .size __copy_to_user_inatomic, . - __copy_to_user_inatomic | 81 | .size __copy_to_user_inatomic, . - __copy_to_user_inatomic |
80 | 82 | ||
81 | ENTRY(memcpy) | 83 | ENTRY(memcpy) |
82 | .type memcpy, @function | 84 | .type memcpy, @function |
83 | FEEDBACK_REENTER(__copy_from_user_inatomic) | 85 | FEEDBACK_REENTER(__copy_from_user_inatomic) |
84 | { movei r29, IS_MEMCPY } | 86 | { movei r29, IS_MEMCPY } |
85 | .size memcpy, . - memcpy | 87 | .size memcpy, . - memcpy |
86 | /* Fall through */ | 88 | /* Fall through */ |
@@ -157,35 +159,35 @@ EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 } | |||
157 | { addi r3, r1, 60; andi r9, r9, -64 } | 159 | { addi r3, r1, 60; andi r9, r9, -64 } |
158 | 160 | ||
159 | #if CHIP_HAS_WH64() | 161 | #if CHIP_HAS_WH64() |
160 | /* No need to prefetch dst, we'll just do the wh64 | 162 | /* No need to prefetch dst, we'll just do the wh64 |
161 | * right before we copy a line. | 163 | * right before we copy a line. |
162 | */ | 164 | */ |
163 | #endif | 165 | #endif |
164 | 166 | ||
165 | EX: { lw r5, r3; addi r3, r3, 64; movei r4, 1 } | 167 | EX: { lw r5, r3; addi r3, r3, 64; movei r4, 1 } |
166 | /* Intentionally stall for a few cycles to leave L2 cache alone. */ | 168 | /* Intentionally stall for a few cycles to leave L2 cache alone. */ |
167 | { bnzt zero, .; move r27, lr } | 169 | { bnzt zero, .; move r27, lr } |
168 | EX: { lw r6, r3; addi r3, r3, 64 } | 170 | EX: { lw r6, r3; addi r3, r3, 64 } |
169 | /* Intentionally stall for a few cycles to leave L2 cache alone. */ | 171 | /* Intentionally stall for a few cycles to leave L2 cache alone. */ |
170 | { bnzt zero, . } | 172 | { bnzt zero, . } |
171 | EX: { lw r7, r3; addi r3, r3, 64 } | 173 | EX: { lw r7, r3; addi r3, r3, 64 } |
172 | #if !CHIP_HAS_WH64() | 174 | #if !CHIP_HAS_WH64() |
173 | /* Prefetch the dest */ | 175 | /* Prefetch the dest */ |
174 | /* Intentionally stall for a few cycles to leave L2 cache alone. */ | 176 | /* Intentionally stall for a few cycles to leave L2 cache alone. */ |
175 | { bnzt zero, . } | 177 | { bnzt zero, . } |
176 | /* Use a real load to cause a TLB miss if necessary. We aren't using | 178 | /* Use a real load to cause a TLB miss if necessary. We aren't using |
177 | * r28, so this should be fine. | 179 | * r28, so this should be fine. |
178 | */ | 180 | */ |
179 | EX: { lw r28, r9; addi r9, r9, 64 } | 181 | EX: { lw r28, r9; addi r9, r9, 64 } |
180 | /* Intentionally stall for a few cycles to leave L2 cache alone. */ | 182 | /* Intentionally stall for a few cycles to leave L2 cache alone. */ |
181 | { bnzt zero, . } | 183 | { bnzt zero, . } |
182 | { prefetch r9; addi r9, r9, 64 } | 184 | { prefetch r9; addi r9, r9, 64 } |
183 | /* Intentionally stall for a few cycles to leave L2 cache alone. */ | 185 | /* Intentionally stall for a few cycles to leave L2 cache alone. */ |
184 | { bnzt zero, . } | 186 | { bnzt zero, . } |
185 | { prefetch r9; addi r9, r9, 64 } | 187 | { prefetch r9; addi r9, r9, 64 } |
186 | #endif | 188 | #endif |
187 | /* Intentionally stall for a few cycles to leave L2 cache alone. */ | 189 | /* Intentionally stall for a few cycles to leave L2 cache alone. */ |
188 | { bz zero, .Lbig_loop2 } | 190 | { bz zero, .Lbig_loop2 } |
189 | 191 | ||
190 | /* On entry to this loop: | 192 | /* On entry to this loop: |
191 | * - r0 points to the start of dst line 0 | 193 | * - r0 points to the start of dst line 0 |
@@ -197,7 +199,7 @@ EX: { lw r28, r9; addi r9, r9, 64 } | |||
197 | * to some "safe" recently loaded address. | 199 | * to some "safe" recently loaded address. |
198 | * - r5 contains *(r1 + 60) [i.e. last word of source line 0] | 200 | * - r5 contains *(r1 + 60) [i.e. last word of source line 0] |
199 | * - r6 contains *(r1 + 64 + 60) [i.e. last word of source line 1] | 201 | * - r6 contains *(r1 + 64 + 60) [i.e. last word of source line 1] |
200 | * - r9 contains ((r0 + 63) & -64) | 202 | * - r9 contains ((r0 + 63) & -64) |
201 | * [start of next dst cache line.] | 203 | * [start of next dst cache line.] |
202 | */ | 204 | */ |
203 | 205 | ||
@@ -208,137 +210,137 @@ EX: { lw r28, r9; addi r9, r9, 64 } | |||
208 | /* Copy line 0, first stalling until r5 is ready. */ | 210 | /* Copy line 0, first stalling until r5 is ready. */ |
209 | EX: { move r12, r5; lw r16, r1 } | 211 | EX: { move r12, r5; lw r16, r1 } |
210 | { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 } | 212 | { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 } |
211 | /* Prefetch several lines ahead. */ | 213 | /* Prefetch several lines ahead. */ |
212 | EX: { lw r5, r3; addi r3, r3, 64 } | 214 | EX: { lw r5, r3; addi r3, r3, 64 } |
213 | { jal .Lcopy_line } | 215 | { jal .Lcopy_line } |
214 | 216 | ||
215 | /* Copy line 1, first stalling until r6 is ready. */ | 217 | /* Copy line 1, first stalling until r6 is ready. */ |
216 | EX: { move r12, r6; lw r16, r1 } | 218 | EX: { move r12, r6; lw r16, r1 } |
217 | { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 } | 219 | { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 } |
218 | /* Prefetch several lines ahead. */ | 220 | /* Prefetch several lines ahead. */ |
219 | EX: { lw r6, r3; addi r3, r3, 64 } | 221 | EX: { lw r6, r3; addi r3, r3, 64 } |
220 | { jal .Lcopy_line } | 222 | { jal .Lcopy_line } |
221 | 223 | ||
222 | /* Copy line 2, first stalling until r7 is ready. */ | 224 | /* Copy line 2, first stalling until r7 is ready. */ |
223 | EX: { move r12, r7; lw r16, r1 } | 225 | EX: { move r12, r7; lw r16, r1 } |
224 | { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 } | 226 | { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 } |
225 | /* Prefetch several lines ahead. */ | 227 | /* Prefetch several lines ahead. */ |
226 | EX: { lw r7, r3; addi r3, r3, 64 } | 228 | EX: { lw r7, r3; addi r3, r3, 64 } |
227 | /* Use up a caches-busy cycle by jumping back to the top of the | 229 | /* Use up a caches-busy cycle by jumping back to the top of the |
228 | * loop. Might as well get it out of the way now. | 230 | * loop. Might as well get it out of the way now. |
229 | */ | 231 | */ |
230 | { j .Lbig_loop } | 232 | { j .Lbig_loop } |
231 | 233 | ||
232 | 234 | ||
233 | /* On entry: | 235 | /* On entry: |
234 | * - r0 points to the destination line. | 236 | * - r0 points to the destination line. |
235 | * - r1 points to the source line. | 237 | * - r1 points to the source line. |
236 | * - r3 is the next prefetch address. | 238 | * - r3 is the next prefetch address. |
237 | * - r9 holds the last address used for wh64. | 239 | * - r9 holds the last address used for wh64. |
238 | * - r12 = WORD_15 | 240 | * - r12 = WORD_15 |
239 | * - r16 = WORD_0. | 241 | * - r16 = WORD_0. |
240 | * - r17 == r1 + 16. | 242 | * - r17 == r1 + 16. |
241 | * - r27 holds saved lr to restore. | 243 | * - r27 holds saved lr to restore. |
242 | * | 244 | * |
243 | * On exit: | 245 | * On exit: |
244 | * - r0 is incremented by 64. | 246 | * - r0 is incremented by 64. |
245 | * - r1 is incremented by 64, unless that would point to a word | 247 | * - r1 is incremented by 64, unless that would point to a word |
246 | * beyond the end of the source array, in which case it is redirected | 248 | * beyond the end of the source array, in which case it is redirected |
247 | * to point to an arbitrary word already in the cache. | 249 | * to point to an arbitrary word already in the cache. |
248 | * - r2 is decremented by 64. | 250 | * - r2 is decremented by 64. |
249 | * - r3 is unchanged, unless it points to a word beyond the | 251 | * - r3 is unchanged, unless it points to a word beyond the |
250 | * end of the source array, in which case it is redirected | 252 | * end of the source array, in which case it is redirected |
251 | * to point to an arbitrary word already in the cache. | 253 | * to point to an arbitrary word already in the cache. |
252 | * Redirecting is OK since if we are that close to the end | 254 | * Redirecting is OK since if we are that close to the end |
253 | * of the array we will not come back to this subroutine | 255 | * of the array we will not come back to this subroutine |
254 | * and use the contents of the prefetched address. | 256 | * and use the contents of the prefetched address. |
255 | * - r4 is nonzero iff r2 >= 64. | 257 | * - r4 is nonzero iff r2 >= 64. |
256 | * - r9 is incremented by 64, unless it points beyond the | 258 | * - r9 is incremented by 64, unless it points beyond the |
257 | * end of the last full destination cache line, in which | 259 | * end of the last full destination cache line, in which |
258 | * case it is redirected to a "safe address" that can be | 260 | * case it is redirected to a "safe address" that can be |
259 | * clobbered (sp - 64) | 261 | * clobbered (sp - 64) |
260 | * - lr contains the value in r27. | 262 | * - lr contains the value in r27. |
261 | */ | 263 | */ |
262 | 264 | ||
263 | /* r26 unused */ | 265 | /* r26 unused */ |
264 | 266 | ||
265 | .Lcopy_line: | 267 | .Lcopy_line: |
266 | /* TODO: when r3 goes past the end, we would like to redirect it | 268 | /* TODO: when r3 goes past the end, we would like to redirect it |
267 | * to prefetch the last partial cache line (if any) just once, for the | 269 | * to prefetch the last partial cache line (if any) just once, for the |
268 | * benefit of the final cleanup loop. But we don't want to | 270 | * benefit of the final cleanup loop. But we don't want to |
269 | * prefetch that line more than once, or subsequent prefetches | 271 | * prefetch that line more than once, or subsequent prefetches |
270 | * will go into the RTF. But then .Lbig_loop should unconditionally | 272 | * will go into the RTF. But then .Lbig_loop should unconditionally |
271 | * branch to top of loop to execute final prefetch, and its | 273 | * branch to top of loop to execute final prefetch, and its |
272 | * nop should become a conditional branch. | 274 | * nop should become a conditional branch. |
273 | */ | 275 | */ |
274 | 276 | ||
275 | /* We need two non-memory cycles here to cover the resources | 277 | /* We need two non-memory cycles here to cover the resources |
276 | * used by the loads initiated by the caller. | 278 | * used by the loads initiated by the caller. |
277 | */ | 279 | */ |
278 | { add r15, r1, r2 } | 280 | { add r15, r1, r2 } |
279 | .Lcopy_line2: | 281 | .Lcopy_line2: |
280 | { slt_u r13, r3, r15; addi r17, r1, 16 } | 282 | { slt_u r13, r3, r15; addi r17, r1, 16 } |
281 | 283 | ||
282 | /* NOTE: this will stall for one cycle as L1 is busy. */ | 284 | /* NOTE: this will stall for one cycle as L1 is busy. */ |
283 | 285 | ||
284 | /* Fill second L1D line. */ | 286 | /* Fill second L1D line. */ |
285 | EX: { lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */ | 287 | EX: { lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */ |
286 | 288 | ||
287 | #if CHIP_HAS_WH64() | 289 | #if CHIP_HAS_WH64() |
288 | /* Prepare destination line for writing. */ | 290 | /* Prepare destination line for writing. */ |
289 | EX: { wh64 r9; addi r9, r9, 64 } | 291 | EX: { wh64 r9; addi r9, r9, 64 } |
290 | #else | 292 | #else |
291 | /* Prefetch dest line */ | 293 | /* Prefetch dest line */ |
292 | { prefetch r9; addi r9, r9, 64 } | 294 | { prefetch r9; addi r9, r9, 64 } |
293 | #endif | 295 | #endif |
294 | /* Load seven words that are L1D hits to cover wh64 L2 usage. */ | 296 | /* Load seven words that are L1D hits to cover wh64 L2 usage. */ |
295 | 297 | ||
296 | /* Load the three remaining words from the last L1D line, which | 298 | /* Load the three remaining words from the last L1D line, which |
297 | * we know has already filled the L1D. | 299 | * we know has already filled the L1D. |
298 | */ | 300 | */ |
299 | EX: { lw r4, r1; addi r1, r1, 4; addi r20, r1, 16 } /* r4 = WORD_12 */ | 301 | EX: { lw r4, r1; addi r1, r1, 4; addi r20, r1, 16 } /* r4 = WORD_12 */ |
300 | EX: { lw r8, r1; addi r1, r1, 4; slt_u r13, r20, r15 }/* r8 = WORD_13 */ | 302 | EX: { lw r8, r1; addi r1, r1, 4; slt_u r13, r20, r15 }/* r8 = WORD_13 */ |
301 | EX: { lw r11, r1; addi r1, r1, -52; mvz r20, r13, r1 } /* r11 = WORD_14 */ | 303 | EX: { lw r11, r1; addi r1, r1, -52; mvz r20, r13, r1 } /* r11 = WORD_14 */ |
302 | 304 | ||
303 | /* Load the three remaining words from the first L1D line, first | 305 | /* Load the three remaining words from the first L1D line, first |
304 | * stalling until it has filled by "looking at" r16. | 306 | * stalling until it has filled by "looking at" r16. |
305 | */ | 307 | */ |
306 | EX: { lw r13, r1; addi r1, r1, 4; move zero, r16 } /* r13 = WORD_1 */ | 308 | EX: { lw r13, r1; addi r1, r1, 4; move zero, r16 } /* r13 = WORD_1 */ |
307 | EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_2 */ | 309 | EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_2 */ |
308 | EX: { lw r15, r1; addi r1, r1, 8; addi r10, r0, 60 } /* r15 = WORD_3 */ | 310 | EX: { lw r15, r1; addi r1, r1, 8; addi r10, r0, 60 } /* r15 = WORD_3 */ |
309 | 311 | ||
310 | /* Load second word from the second L1D line, first | 312 | /* Load second word from the second L1D line, first |
311 | * stalling until it has filled by "looking at" r17. | 313 | * stalling until it has filled by "looking at" r17. |
312 | */ | 314 | */ |
313 | EX: { lw r19, r1; addi r1, r1, 4; move zero, r17 } /* r19 = WORD_5 */ | 315 | EX: { lw r19, r1; addi r1, r1, 4; move zero, r17 } /* r19 = WORD_5 */ |
314 | 316 | ||
315 | /* Store last word to the destination line, potentially dirtying it | 317 | /* Store last word to the destination line, potentially dirtying it |
316 | * for the first time, which keeps the L2 busy for two cycles. | 318 | * for the first time, which keeps the L2 busy for two cycles. |
317 | */ | 319 | */ |
318 | EX: { sw r10, r12 } /* store(WORD_15) */ | 320 | EX: { sw r10, r12 } /* store(WORD_15) */ |
319 | 321 | ||
320 | /* Use two L1D hits to cover the sw L2 access above. */ | 322 | /* Use two L1D hits to cover the sw L2 access above. */ |
321 | EX: { lw r10, r1; addi r1, r1, 4 } /* r10 = WORD_6 */ | 323 | EX: { lw r10, r1; addi r1, r1, 4 } /* r10 = WORD_6 */ |
322 | EX: { lw r12, r1; addi r1, r1, 4 } /* r12 = WORD_7 */ | 324 | EX: { lw r12, r1; addi r1, r1, 4 } /* r12 = WORD_7 */ |
323 | 325 | ||
324 | /* Fill third L1D line. */ | 326 | /* Fill third L1D line. */ |
325 | EX: { lw r18, r1; addi r1, r1, 4 } /* r18 = WORD_8 */ | 327 | EX: { lw r18, r1; addi r1, r1, 4 } /* r18 = WORD_8 */ |
326 | 328 | ||
327 | /* Store first L1D line. */ | 329 | /* Store first L1D line. */ |
328 | EX: { sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */ | 330 | EX: { sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */ |
329 | EX: { sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */ | 331 | EX: { sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */ |
330 | EX: { sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */ | 332 | EX: { sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */ |
331 | #if CHIP_HAS_WH64() | 333 | #if CHIP_HAS_WH64() |
332 | EX: { sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */ | 334 | EX: { sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */ |
333 | #else | 335 | #else |
334 | /* Back up the r9 to a cache line we are already storing to | 336 | /* Back up the r9 to a cache line we are already storing to |
335 | * if it gets past the end of the dest vector. Strictly speaking, | 337 | * if it gets past the end of the dest vector. Strictly speaking, |
336 | * we don't need to back up to the start of a cache line, but it's free | 338 | * we don't need to back up to the start of a cache line, but it's free |
337 | * and tidy, so why not? | 339 | * and tidy, so why not? |
338 | */ | 340 | */ |
339 | EX: { sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */ | 341 | EX: { sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */ |
340 | #endif | 342 | #endif |
341 | /* Store second L1D line. */ | 343 | /* Store second L1D line. */ |
342 | EX: { sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */ | 344 | EX: { sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */ |
343 | EX: { sw r0, r19; addi r0, r0, 4 } /* store(WORD_5) */ | 345 | EX: { sw r0, r19; addi r0, r0, 4 } /* store(WORD_5) */ |
344 | EX: { sw r0, r10; addi r0, r0, 4 } /* store(WORD_6) */ | 346 | EX: { sw r0, r10; addi r0, r0, 4 } /* store(WORD_6) */ |
@@ -348,30 +350,30 @@ EX: { lw r13, r1; addi r1, r1, 4; move zero, r18 } /* r13 = WORD_9 */ | |||
348 | EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_10 */ | 350 | EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_10 */ |
349 | EX: { lw r15, r1; move r1, r20 } /* r15 = WORD_11 */ | 351 | EX: { lw r15, r1; move r1, r20 } /* r15 = WORD_11 */ |
350 | 352 | ||
351 | /* Store third L1D line. */ | 353 | /* Store third L1D line. */ |
352 | EX: { sw r0, r18; addi r0, r0, 4 } /* store(WORD_8) */ | 354 | EX: { sw r0, r18; addi r0, r0, 4 } /* store(WORD_8) */ |
353 | EX: { sw r0, r13; addi r0, r0, 4 } /* store(WORD_9) */ | 355 | EX: { sw r0, r13; addi r0, r0, 4 } /* store(WORD_9) */ |
354 | EX: { sw r0, r14; addi r0, r0, 4 } /* store(WORD_10) */ | 356 | EX: { sw r0, r14; addi r0, r0, 4 } /* store(WORD_10) */ |
355 | EX: { sw r0, r15; addi r0, r0, 4 } /* store(WORD_11) */ | 357 | EX: { sw r0, r15; addi r0, r0, 4 } /* store(WORD_11) */ |
356 | 358 | ||
357 | /* Store rest of fourth L1D line. */ | 359 | /* Store rest of fourth L1D line. */ |
358 | EX: { sw r0, r4; addi r0, r0, 4 } /* store(WORD_12) */ | 360 | EX: { sw r0, r4; addi r0, r0, 4 } /* store(WORD_12) */ |
359 | { | 361 | { |
360 | EX: sw r0, r8 /* store(WORD_13) */ | 362 | EX: sw r0, r8 /* store(WORD_13) */ |
361 | addi r0, r0, 4 | 363 | addi r0, r0, 4 |
362 | /* Will r2 be > 64 after we subtract 64 below? */ | 364 | /* Will r2 be > 64 after we subtract 64 below? */ |
363 | shri r4, r2, 7 | 365 | shri r4, r2, 7 |
364 | } | 366 | } |
365 | { | 367 | { |
366 | EX: sw r0, r11 /* store(WORD_14) */ | 368 | EX: sw r0, r11 /* store(WORD_14) */ |
367 | addi r0, r0, 8 | 369 | addi r0, r0, 8 |
368 | /* Record 64 bytes successfully copied. */ | 370 | /* Record 64 bytes successfully copied. */ |
369 | addi r2, r2, -64 | 371 | addi r2, r2, -64 |
370 | } | 372 | } |
371 | 373 | ||
372 | { jrp lr; move lr, r27 } | 374 | { jrp lr; move lr, r27 } |
373 | 375 | ||
374 | /* Convey to the backtrace library that the stack frame is size | 376 | /* Convey to the backtrace library that the stack frame is size |
375 | * zero, and the real return address is on the stack rather than | 377 | * zero, and the real return address is on the stack rather than |
376 | * in 'lr'. | 378 | * in 'lr'. |
377 | */ | 379 | */ |
diff --git a/arch/tile/lib/memcpy_64.c b/arch/tile/lib/memcpy_64.c
new file mode 100644
index 000000000000..3fab9a6a2bbe
--- /dev/null
+++ b/arch/tile/lib/memcpy_64.c
@@ -0,0 +1,220 @@ | |||
1 | /* | ||
2 | * Copyright 2011 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/types.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/module.h> | ||
18 | #define __memcpy memcpy | ||
19 | /* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */ | ||
20 | |||
21 | /* Must be 8 bytes in size. */ | ||
22 | #define word_t uint64_t | ||
23 | |||
24 | #if CHIP_L2_LINE_SIZE() != 64 && CHIP_L2_LINE_SIZE() != 128 | ||
25 | #error "Assumes 64 or 128 byte line size" | ||
26 | #endif | ||
27 | |||
28 | /* How many cache lines ahead should we prefetch? */ | ||
29 | #define PREFETCH_LINES_AHEAD 3 | ||
30 | |||
31 | /* | ||
32 | * Provide "base versions" of load and store for the normal code path. | ||
33 | * The kernel provides other versions for userspace copies. | ||
34 | */ | ||
35 | #define ST(p, v) (*(p) = (v)) | ||
36 | #define LD(p) (*(p)) | ||
37 | |||
38 | #ifndef USERCOPY_FUNC | ||
39 | #define ST1 ST | ||
40 | #define ST2 ST | ||
41 | #define ST4 ST | ||
42 | #define ST8 ST | ||
43 | #define LD1 LD | ||
44 | #define LD2 LD | ||
45 | #define LD4 LD | ||
46 | #define LD8 LD | ||
47 | #define RETVAL dstv | ||
48 | void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n) | ||
49 | #else | ||
50 | /* | ||
51 | * Special kernel version will provide implementation of the LDn/STn | ||
52 | * macros to return a count of uncopied bytes due to mm fault. | ||
53 | */ | ||
54 | #define RETVAL 0 | ||
55 | int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n) | ||
56 | #endif | ||
57 | { | ||
58 | char *__restrict dst1 = (char *)dstv; | ||
59 | const char *__restrict src1 = (const char *)srcv; | ||
60 | const char *__restrict src1_end; | ||
61 | const char *__restrict prefetch; | ||
62 | word_t *__restrict dst8; /* 8-byte pointer to destination memory. */ | ||
63 | word_t final; /* Final bytes to write to trailing word, if any */ | ||
64 | long i; | ||
65 | |||
66 | if (n < 16) { | ||
67 | for (; n; n--) | ||
68 | ST1(dst1++, LD1(src1++)); | ||
69 | return RETVAL; | ||
70 | } | ||
71 | |||
72 | /* | ||
73 | * Locate the end of source memory we will copy. Don't | ||
74 | * prefetch past this. | ||
75 | */ | ||
76 | src1_end = src1 + n - 1; | ||
77 | |||
78 | /* Prefetch ahead a few cache lines, but not past the end. */ | ||
79 | prefetch = src1; | ||
80 | for (i = 0; i < PREFETCH_LINES_AHEAD; i++) { | ||
81 | __insn_prefetch(prefetch); | ||
82 | prefetch += CHIP_L2_LINE_SIZE(); | ||
83 | prefetch = (prefetch > src1_end) ? prefetch : src1; | ||
84 | } | ||
85 | |||
86 | /* Copy bytes until dst is word-aligned. */ | ||
87 | for (; (uintptr_t)dst1 & (sizeof(word_t) - 1); n--) | ||
88 | ST1(dst1++, LD1(src1++)); | ||
89 | |||
90 | /* 8-byte pointer to destination memory. */ | ||
91 | dst8 = (word_t *)dst1; | ||
92 | |||
93 | if (__builtin_expect((uintptr_t)src1 & (sizeof(word_t) - 1), 0)) { | ||
94 | /* | ||
95 | * Misaligned copy. Copy 8 bytes at a time, but don't | ||
96 | * bother with other fanciness. | ||
97 | * | ||
98 | * TODO: Consider prefetching and using wh64 as well. | ||
99 | */ | ||
100 | |||
101 | /* Create an aligned src8. */ | ||
102 | const word_t *__restrict src8 = | ||
103 | (const word_t *)((uintptr_t)src1 & -sizeof(word_t)); | ||
104 | word_t b; | ||
105 | |||
106 | word_t a = LD8(src8++); | ||
107 | for (; n >= sizeof(word_t); n -= sizeof(word_t)) { | ||
108 | b = LD8(src8++); | ||
109 | a = __insn_dblalign(a, b, src1); | ||
110 | ST8(dst8++, a); | ||
111 | a = b; | ||
112 | } | ||
113 | |||
114 | if (n == 0) | ||
115 | return RETVAL; | ||
116 | |||
117 | b = ((const char *)src8 <= src1_end) ? *src8 : 0; | ||
118 | |||
119 | /* | ||
120 | * Final source bytes to write to trailing partial | ||
121 | * word, if any. | ||
122 | */ | ||
123 | final = __insn_dblalign(a, b, src1); | ||
124 | } else { | ||
125 | /* Aligned copy. */ | ||
126 | |||
127 | const word_t* __restrict src8 = (const word_t *)src1; | ||
128 | |||
129 | /* src8 and dst8 are both word-aligned. */ | ||
130 | if (n >= CHIP_L2_LINE_SIZE()) { | ||
131 | /* Copy until 'dst' is cache-line-aligned. */ | ||
132 | for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1); | ||
133 | n -= sizeof(word_t)) | ||
134 | ST8(dst8++, LD8(src8++)); | ||
135 | |||
136 | for (; n >= CHIP_L2_LINE_SIZE(); ) { | ||
137 | __insn_wh64(dst8); | ||
138 | |||
139 | /* | ||
140 | * Prefetch and advance to next line | ||
141 | * to prefetch, but don't go past the end | ||
142 | */ | ||
143 | __insn_prefetch(prefetch); | ||
144 | prefetch += CHIP_L2_LINE_SIZE(); | ||
145 | prefetch = (prefetch > src1_end) ? prefetch : | ||
146 | (const char *)src8; | ||
147 | |||
148 | /* | ||
149 | * Copy an entire cache line. Manually | ||
150 | * unrolled to avoid idiosyncracies of | ||
151 | * compiler unrolling. | ||
152 | */ | ||
153 | #define COPY_WORD(offset) ({ ST8(dst8+offset, LD8(src8+offset)); n -= 8; }) | ||
154 | COPY_WORD(0); | ||
155 | COPY_WORD(1); | ||
156 | COPY_WORD(2); | ||
157 | COPY_WORD(3); | ||
158 | COPY_WORD(4); | ||
159 | COPY_WORD(5); | ||
160 | COPY_WORD(6); | ||
161 | COPY_WORD(7); | ||
162 | #if CHIP_L2_LINE_SIZE() == 128 | ||
163 | COPY_WORD(8); | ||
164 | COPY_WORD(9); | ||
165 | COPY_WORD(10); | ||
166 | COPY_WORD(11); | ||
167 | COPY_WORD(12); | ||
168 | COPY_WORD(13); | ||
169 | COPY_WORD(14); | ||
170 | COPY_WORD(15); | ||
171 | #elif CHIP_L2_LINE_SIZE() != 64 | ||
172 | # error Fix code that assumes particular L2 cache line sizes | ||
173 | #endif | ||
174 | |||
175 | dst8 += CHIP_L2_LINE_SIZE() / sizeof(word_t); | ||
176 | src8 += CHIP_L2_LINE_SIZE() / sizeof(word_t); | ||
177 | } | ||
178 | } | ||
179 | |||
180 | for (; n >= sizeof(word_t); n -= sizeof(word_t)) | ||
181 | ST8(dst8++, LD8(src8++)); | ||
182 | |||
183 | if (__builtin_expect(n == 0, 1)) | ||
184 | return RETVAL; | ||
185 | |||
186 | final = LD8(src8); | ||
187 | } | ||
188 | |||
189 | /* n != 0 if we get here. Write out any trailing bytes. */ | ||
190 | dst1 = (char *)dst8; | ||
191 | if (n & 4) { | ||
192 | ST4((uint32_t *)dst1, final); | ||
193 | dst1 += 4; | ||
194 | final >>= 32; | ||
195 | n &= 3; | ||
196 | } | ||
197 | if (n & 2) { | ||
198 | ST2((uint16_t *)dst1, final); | ||
199 | dst1 += 2; | ||
200 | final >>= 16; | ||
201 | n &= 1; | ||
202 | } | ||
203 | if (n) | ||
204 | ST1((uint8_t *)dst1, final); | ||
205 | |||
206 | return RETVAL; | ||
207 | } | ||
208 | |||
209 | |||
210 | #ifdef USERCOPY_FUNC | ||
211 | #undef ST1 | ||
212 | #undef ST2 | ||
213 | #undef ST4 | ||
214 | #undef ST8 | ||
215 | #undef LD1 | ||
216 | #undef LD2 | ||
217 | #undef LD4 | ||
218 | #undef LD8 | ||
219 | #undef USERCOPY_FUNC | ||
220 | #endif | ||
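In the misaligned branch of memcpy_64.c above, the source is read as aligned 8-byte words and each consecutive pair is spliced with __insn_dblalign so the bytes land word-aligned in the destination. A rough portable sketch of that splice, assuming little-endian layout (the instruction semantics are paraphrased here, and the real loop also prefetches):

```c
#include <stdint.h>

/*
 * Combine two consecutive aligned source words into the 8 bytes that
 * start at the (misaligned) address 'src'. Little-endian assumed.
 */
static uint64_t splice_words(uint64_t lo, uint64_t hi, const void *src)
{
	unsigned shift = ((uintptr_t)src & 7) * 8;   /* misalignment in bits */

	if (shift == 0)
		return lo;                           /* already aligned */
	return (lo >> shift) | (hi << (64 - shift));
}
```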
diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c
index dfedea7b266b..b2fe15e01075 100644
--- a/arch/tile/lib/memcpy_tile64.c
+++ b/arch/tile/lib/memcpy_tile64.c
@@ -54,7 +54,7 @@ typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long); | |||
54 | * we must run with interrupts disabled to avoid the risk of some | 54 | * we must run with interrupts disabled to avoid the risk of some |
55 | * other code seeing the incoherent data in our cache. (Recall that | 55 | * other code seeing the incoherent data in our cache. (Recall that |
56 | * our cache is indexed by PA, so even if the other code doesn't use | 56 | * our cache is indexed by PA, so even if the other code doesn't use |
57 | * our KM_MEMCPY virtual addresses, they'll still hit in cache using | 57 | * our kmap_atomic virtual addresses, they'll still hit in cache using |
58 | * the normal VAs that aren't supposed to hit in cache.) | 58 | * the normal VAs that aren't supposed to hit in cache.) |
59 | */ | 59 | */ |
60 | static void memcpy_multicache(void *dest, const void *source, | 60 | static void memcpy_multicache(void *dest, const void *source, |
@@ -64,6 +64,7 @@ static void memcpy_multicache(void *dest, const void *source, | |||
64 | unsigned long flags, newsrc, newdst; | 64 | unsigned long flags, newsrc, newdst; |
65 | pmd_t *pmdp; | 65 | pmd_t *pmdp; |
66 | pte_t *ptep; | 66 | pte_t *ptep; |
67 | int type0, type1; | ||
67 | int cpu = get_cpu(); | 68 | int cpu = get_cpu(); |
68 | 69 | ||
69 | /* | 70 | /* |
@@ -77,7 +78,8 @@ static void memcpy_multicache(void *dest, const void *source, | |||
77 | sim_allow_multiple_caching(1); | 78 | sim_allow_multiple_caching(1); |
78 | 79 | ||
79 | /* Set up the new dest mapping */ | 80 | /* Set up the new dest mapping */ |
80 | idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + KM_MEMCPY0; | 81 | type0 = kmap_atomic_idx_push(); |
82 | idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + type0; | ||
81 | newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1)); | 83 | newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1)); |
82 | pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst); | 84 | pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst); |
83 | ptep = pte_offset_kernel(pmdp, newdst); | 85 | ptep = pte_offset_kernel(pmdp, newdst); |
@@ -87,13 +89,14 @@ static void memcpy_multicache(void *dest, const void *source, | |||
87 | } | 89 | } |
88 | 90 | ||
89 | /* Set up the new source mapping */ | 91 | /* Set up the new source mapping */ |
90 | idx += (KM_MEMCPY0 - KM_MEMCPY1); | 92 | type1 = kmap_atomic_idx_push(); |
93 | idx += (type0 - type1); | ||
91 | src_pte = hv_pte_set_nc(src_pte); | 94 | src_pte = hv_pte_set_nc(src_pte); |
92 | src_pte = hv_pte_clear_writable(src_pte); /* be paranoid */ | 95 | src_pte = hv_pte_clear_writable(src_pte); /* be paranoid */ |
93 | newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1)); | 96 | newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1)); |
94 | pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc); | 97 | pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc); |
95 | ptep = pte_offset_kernel(pmdp, newsrc); | 98 | ptep = pte_offset_kernel(pmdp, newsrc); |
96 | *ptep = src_pte; /* set_pte() would be confused by this */ | 99 | __set_pte(ptep, src_pte); /* set_pte() would be confused by this */ |
97 | local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); | 100 | local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); |
98 | 101 | ||
99 | /* Actually move the data. */ | 102 | /* Actually move the data. */ |
@@ -106,7 +109,7 @@ static void memcpy_multicache(void *dest, const void *source, | |||
106 | */ | 109 | */ |
107 | src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3); | 110 | src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3); |
108 | src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */ | 111 | src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */ |
109 | *ptep = src_pte; /* set_pte() would be confused by this */ | 112 | __set_pte(ptep, src_pte); /* set_pte() would be confused by this */ |
110 | local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); | 113 | local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); |
111 | 114 | ||
112 | /* | 115 | /* |
@@ -119,6 +122,8 @@ static void memcpy_multicache(void *dest, const void *source, | |||
119 | * We're done: notify the simulator that all is back to normal, | 122 | * We're done: notify the simulator that all is back to normal, |
120 | * and re-enable interrupts and pre-emption. | 123 | * and re-enable interrupts and pre-emption. |
121 | */ | 124 | */ |
125 | kmap_atomic_idx_pop(); | ||
126 | kmap_atomic_idx_pop(); | ||
122 | sim_allow_multiple_caching(0); | 127 | sim_allow_multiple_caching(0); |
123 | local_irq_restore(flags); | 128 | local_irq_restore(flags); |
124 | put_cpu(); | 129 | put_cpu(); |
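The change above replaces the fixed KM_MEMCPY0/KM_MEMCPY1 fixmap slots with dynamically allocated kmap_atomic indices; note that the two kmap_atomic_idx_push() calls are undone, within the same interrupts-off region, by the two kmap_atomic_idx_pop() calls. A toy user-space sketch of the stack discipline this relies on (the depth counter and slot numbering here are illustrative, not the kernel's actual per-CPU bookkeeping):

	#include <assert.h>
	#include <stdio.h>

	#define KM_TYPE_NR 8		/* illustrative: max nesting depth per CPU */

	static int km_depth;		/* would be a per-CPU counter in the kernel */

	/* Reserve the next temporary-mapping slot; must be paired with a pop. */
	static int kmap_idx_push(void)
	{
		assert(km_depth < KM_TYPE_NR);
		return km_depth++;
	}

	static void kmap_idx_pop(void)
	{
		assert(km_depth > 0);
		km_depth--;
	}

	int main(void)
	{
		int type0 = kmap_idx_push();	/* slot for the destination mapping */
		int type1 = kmap_idx_push();	/* slot for the source mapping */

		printf("dest slot %d, source slot %d\n", type0, type1);

		/* Pops must mirror the pushes before leaving the atomic region. */
		kmap_idx_pop();
		kmap_idx_pop();
		return 0;
	}
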
diff --git a/arch/tile/lib/memcpy_user_64.c b/arch/tile/lib/memcpy_user_64.c new file mode 100644 index 000000000000..4763b3aff1cc --- /dev/null +++ b/arch/tile/lib/memcpy_user_64.c | |||
@@ -0,0 +1,86 @@ | |||
1 | /* | ||
2 | * Copyright 2011 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | * | ||
14 | * Do memcpy(), but trap and return "n" when a load or store faults. | ||
15 | * | ||
16 | * Note: this idiom only works when memcpy() compiles to a leaf function. | ||
17 | * If "sp" is updated during memcpy, the "jrp lr" will be incorrect. | ||
18 | * | ||
19 | * Also note that we are capturing "n" from the containing scope here. | ||
20 | */ | ||
21 | |||
22 | #define _ST(p, inst, v) \ | ||
23 | ({ \ | ||
24 | asm("1: " #inst " %0, %1;" \ | ||
25 | ".pushsection .coldtext.memcpy,\"ax\";" \ | ||
26 | "2: { move r0, %2; jrp lr };" \ | ||
27 | ".section __ex_table,\"a\";" \ | ||
28 | ".quad 1b, 2b;" \ | ||
29 | ".popsection" \ | ||
30 | : "=m" (*(p)) : "r" (v), "r" (n)); \ | ||
31 | }) | ||
32 | |||
33 | #define _LD(p, inst) \ | ||
34 | ({ \ | ||
35 | unsigned long __v; \ | ||
36 | asm("1: " #inst " %0, %1;" \ | ||
37 | ".pushsection .coldtext.memcpy,\"ax\";" \ | ||
38 | "2: { move r0, %2; jrp lr };" \ | ||
39 | ".section __ex_table,\"a\";" \ | ||
40 | ".quad 1b, 2b;" \ | ||
41 | ".popsection" \ | ||
42 | : "=r" (__v) : "m" (*(p)), "r" (n)); \ | ||
43 | __v; \ | ||
44 | }) | ||
45 | |||
46 | #define USERCOPY_FUNC __copy_to_user_inatomic | ||
47 | #define ST1(p, v) _ST((p), st1, (v)) | ||
48 | #define ST2(p, v) _ST((p), st2, (v)) | ||
49 | #define ST4(p, v) _ST((p), st4, (v)) | ||
50 | #define ST8(p, v) _ST((p), st, (v)) | ||
51 | #define LD1 LD | ||
52 | #define LD2 LD | ||
53 | #define LD4 LD | ||
54 | #define LD8 LD | ||
55 | #include "memcpy_64.c" | ||
56 | |||
57 | #define USERCOPY_FUNC __copy_from_user_inatomic | ||
58 | #define ST1 ST | ||
59 | #define ST2 ST | ||
60 | #define ST4 ST | ||
61 | #define ST8 ST | ||
62 | #define LD1(p) _LD((p), ld1u) | ||
63 | #define LD2(p) _LD((p), ld2u) | ||
64 | #define LD4(p) _LD((p), ld4u) | ||
65 | #define LD8(p) _LD((p), ld) | ||
66 | #include "memcpy_64.c" | ||
67 | |||
68 | #define USERCOPY_FUNC __copy_in_user_inatomic | ||
69 | #define ST1(p, v) _ST((p), st1, (v)) | ||
70 | #define ST2(p, v) _ST((p), st2, (v)) | ||
71 | #define ST4(p, v) _ST((p), st4, (v)) | ||
72 | #define ST8(p, v) _ST((p), st, (v)) | ||
73 | #define LD1(p) _LD((p), ld1u) | ||
74 | #define LD2(p) _LD((p), ld2u) | ||
75 | #define LD4(p) _LD((p), ld4u) | ||
76 | #define LD8(p) _LD((p), ld) | ||
77 | #include "memcpy_64.c" | ||
78 | |||
79 | unsigned long __copy_from_user_zeroing(void *to, const void __user *from, | ||
80 | unsigned long n) | ||
81 | { | ||
82 | unsigned long rc = __copy_from_user_inatomic(to, from, n); | ||
83 | if (unlikely(rc)) | ||
84 | memset(to + n - rc, 0, rc); | ||
85 | return rc; | ||
86 | } | ||
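__copy_from_user_zeroing() above zeroes exactly the bytes the underlying copy could not read: if rc bytes remain uncopied out of n, the region [to + n - rc, to + n) is cleared so the caller never sees stale kernel memory. A standalone sketch of that arithmetic, with a stub standing in for the real fault-trapping copy (the stub and its forced failure point are hypothetical):

	#include <stddef.h>
	#include <string.h>
	#include <stdio.h>

	/* Stub: copy up to n bytes but "fault" after ok bytes, returning the
	 * number of bytes NOT copied (the kernel's return convention). */
	static size_t copy_inatomic_stub(void *to, const void *from, size_t n, size_t ok)
	{
		size_t done = n < ok ? n : ok;
		memcpy(to, from, done);
		return n - done;
	}

	static size_t copy_from_user_zeroing_stub(void *to, const void *from,
						  size_t n, size_t ok)
	{
		size_t rc = copy_inatomic_stub(to, from, n, ok);
		if (rc)			/* zero the tail the copy never reached */
			memset((char *)to + n - rc, 0, rc);
		return rc;
	}

	int main(void)
	{
		char src[8] = "ABCDEFG", dst[8];
		size_t rc = copy_from_user_zeroing_stub(dst, src, sizeof(dst), 3);
		printf("uncopied=%zu, dst starts \"%.3s\", rest zeroed\n", rc, dst);
		return 0;
	}
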
diff --git a/arch/tile/lib/memmove_32.c b/arch/tile/lib/memmove.c index fd615ae6ade7..fd615ae6ade7 100644 --- a/arch/tile/lib/memmove_32.c +++ b/arch/tile/lib/memmove.c | |||
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c index d014c1fbcbc2..57dbb3a5bff8 100644 --- a/arch/tile/lib/memset_32.c +++ b/arch/tile/lib/memset_32.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/string.h> | 18 | #include <linux/string.h> |
19 | #include <linux/module.h> | 19 | #include <linux/module.h> |
20 | 20 | ||
21 | #undef memset | ||
21 | 22 | ||
22 | void *memset(void *s, int c, size_t n) | 23 | void *memset(void *s, int c, size_t n) |
23 | { | 24 | { |
diff --git a/arch/tile/lib/memset_64.c b/arch/tile/lib/memset_64.c new file mode 100644 index 000000000000..3873085711d5 --- /dev/null +++ b/arch/tile/lib/memset_64.c | |||
@@ -0,0 +1,145 @@ | |||
1 | /* | ||
2 | * Copyright 2011 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <arch/chip.h> | ||
16 | |||
17 | #include <linux/types.h> | ||
18 | #include <linux/string.h> | ||
19 | #include <linux/module.h> | ||
20 | |||
21 | #undef memset | ||
22 | |||
23 | void *memset(void *s, int c, size_t n) | ||
24 | { | ||
25 | uint64_t *out64; | ||
26 | int n64, to_align64; | ||
27 | uint64_t v64; | ||
28 | uint8_t *out8 = s; | ||
29 | |||
30 | /* Experimentation shows that a trivial tight loop is a win up until | ||
31 | * around a size of 20, where writing a word at a time starts to win. | ||
32 | */ | ||
33 | #define BYTE_CUTOFF 20 | ||
34 | |||
35 | #if BYTE_CUTOFF < 7 | ||
36 | /* This must be at least this big, or some code later | ||
37 | * on doesn't work. | ||
38 | */ | ||
39 | #error "BYTE_CUTOFF is too small" | ||
40 | #endif | ||
41 | |||
42 | if (n < BYTE_CUTOFF) { | ||
43 | /* Strangely, this turns out to be the tightest way to | ||
44 | * write this loop. | ||
45 | */ | ||
46 | if (n != 0) { | ||
47 | do { | ||
48 | /* Strangely, combining these into one line | ||
49 | * performs worse. | ||
50 | */ | ||
51 | *out8 = c; | ||
52 | out8++; | ||
53 | } while (--n != 0); | ||
54 | } | ||
55 | |||
56 | return s; | ||
57 | } | ||
58 | |||
59 | /* Align 'out8'. We know n >= 7 so this won't write past the end. */ | ||
60 | while (((uintptr_t) out8 & 7) != 0) { | ||
61 | *out8++ = c; | ||
62 | --n; | ||
63 | } | ||
64 | |||
65 | /* Align 'n'. */ | ||
66 | while (n & 7) | ||
67 | out8[--n] = c; | ||
68 | |||
69 | out64 = (uint64_t *) out8; | ||
70 | n64 = n >> 3; | ||
71 | |||
72 | /* Tile input byte out to 64 bits. */ | ||
73 | /* KLUDGE */ | ||
74 | v64 = 0x0101010101010101ULL * (uint8_t)c; | ||
75 | |||
76 | /* This must be at least 8 or the following loop doesn't work. */ | ||
77 | #define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8) | ||
78 | |||
79 | /* Determine how many words we need to emit before the 'out64' | ||
80 | * pointer becomes aligned modulo the cache line size. | ||
81 | */ | ||
82 | to_align64 = (-((uintptr_t)out64 >> 3)) & | ||
83 | (CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1); | ||
84 | |||
85 | /* Only bother aligning and using wh64 if there is at least | ||
86 | * one full cache line to process. This check also prevents | ||
87 | * overrunning the end of the buffer with alignment words. | ||
88 | */ | ||
89 | if (to_align64 <= n64 - CACHE_LINE_SIZE_IN_DOUBLEWORDS) { | ||
90 | int lines_left; | ||
91 | |||
92 | /* Align out64 mod the cache line size so we can use wh64. */ | ||
93 | n64 -= to_align64; | ||
94 | for (; to_align64 != 0; to_align64--) { | ||
95 | *out64 = v64; | ||
96 | out64++; | ||
97 | } | ||
98 | |||
99 | /* Use unsigned divide to turn this into a right shift. */ | ||
100 | lines_left = (unsigned)n64 / CACHE_LINE_SIZE_IN_DOUBLEWORDS; | ||
101 | |||
102 | do { | ||
103 | /* Only wh64 a few lines at a time, so we don't | ||
104 | * exceed the maximum number of victim lines. | ||
105 | */ | ||
106 | int x = ((lines_left < CHIP_MAX_OUTSTANDING_VICTIMS()) | ||
107 | ? lines_left | ||
108 | : CHIP_MAX_OUTSTANDING_VICTIMS()); | ||
109 | uint64_t *wh = out64; | ||
110 | int i = x; | ||
111 | int j; | ||
112 | |||
113 | lines_left -= x; | ||
114 | |||
115 | do { | ||
116 | __insn_wh64(wh); | ||
117 | wh += CACHE_LINE_SIZE_IN_DOUBLEWORDS; | ||
118 | } while (--i); | ||
119 | |||
120 | for (j = x * (CACHE_LINE_SIZE_IN_DOUBLEWORDS / 4); | ||
121 | j != 0; j--) { | ||
122 | *out64++ = v64; | ||
123 | *out64++ = v64; | ||
124 | *out64++ = v64; | ||
125 | *out64++ = v64; | ||
126 | } | ||
127 | } while (lines_left != 0); | ||
128 | |||
129 | /* We processed all full lines above, so only this many | ||
130 | * words remain to be processed. | ||
131 | */ | ||
132 | n64 &= CACHE_LINE_SIZE_IN_DOUBLEWORDS - 1; | ||
133 | } | ||
134 | |||
135 | /* Now handle any leftover values. */ | ||
136 | if (n64 != 0) { | ||
137 | do { | ||
138 | *out64 = v64; | ||
139 | out64++; | ||
140 | } while (--n64 != 0); | ||
141 | } | ||
142 | |||
143 | return s; | ||
144 | } | ||
145 | EXPORT_SYMBOL(memset); | ||
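The heart of the routine above is the byte broadcast, v64 = 0x0101010101010101ULL * (uint8_t)c, followed by aligned 8-byte stores; the wh64 priming and victim throttling are TILE-specific optimizations on top of that. A portable sketch of just the broadcast-and-store structure (no wh64; the head loop also rechecks n, whereas the code above can assume n >= 7 past the byte cutoff):

	#include <stdint.h>
	#include <stddef.h>

	void *memset_words(void *s, int c, size_t n)
	{
		uint8_t *out8 = s;
		uint64_t v64 = 0x0101010101010101ULL * (uint8_t)c; /* byte broadcast */

		/* Head: byte stores until the pointer is 8-byte aligned. */
		while (n && ((uintptr_t)out8 & 7)) {
			*out8++ = (uint8_t)c;
			n--;
		}

		/* Body: one aligned 8-byte store per iteration. */
		for (; n >= 8; n -= 8) {
			*(uint64_t *)out8 = v64;
			out8 += 8;
		}

		/* Tail: the remaining 0..7 bytes. */
		while (n--)
			*out8++ = (uint8_t)c;

		return s;
	}
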
diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c index 485e24d62c6b..cb0999fb64b4 100644 --- a/arch/tile/lib/spinlock_32.c +++ b/arch/tile/lib/spinlock_32.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/spinlock.h> | 15 | #include <linux/spinlock.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <asm/processor.h> | 17 | #include <asm/processor.h> |
18 | #include <arch/spr_def.h> | ||
18 | 19 | ||
19 | #include "spinlock_common.h" | 20 | #include "spinlock_common.h" |
20 | 21 | ||
@@ -91,75 +92,75 @@ EXPORT_SYMBOL(arch_spin_unlock_wait); | |||
91 | #define RD_COUNT_MASK ((1 << RD_COUNT_WIDTH) - 1) | 92 | #define RD_COUNT_MASK ((1 << RD_COUNT_WIDTH) - 1) |
92 | 93 | ||
93 | 94 | ||
94 | /* Lock the word, spinning until there are no tns-ers. */ | 95 | /* |
95 | static inline u32 get_rwlock(arch_rwlock_t *rwlock) | 96 | * We can get the read lock if everything but the reader bits (which |
96 | { | 97 | * are in the high part of the word) is zero, i.e. no active or |
97 | u32 iterations = 0; | 98 | * waiting writers, no tns. |
98 | for (;;) { | 99 | * |
99 | u32 val = __insn_tns((int *)&rwlock->lock); | 100 | * We guard the tns/store-back with an interrupt critical section to |
100 | if (unlikely(val & 1)) { | 101 | * preserve the semantic that the same read lock can be acquired in an |
101 | delay_backoff(iterations++); | 102 | * interrupt context. |
102 | continue; | 103 | */ |
103 | } | 104 | inline int arch_read_trylock(arch_rwlock_t *rwlock) |
104 | return val; | ||
105 | } | ||
106 | } | ||
107 | |||
108 | int arch_read_trylock_slow(arch_rwlock_t *rwlock) | ||
109 | { | ||
110 | u32 val = get_rwlock(rwlock); | ||
111 | int locked = (val << RD_COUNT_WIDTH) == 0; | ||
112 | rwlock->lock = val + (locked << RD_COUNT_SHIFT); | ||
113 | return locked; | ||
114 | } | ||
115 | EXPORT_SYMBOL(arch_read_trylock_slow); | ||
116 | |||
117 | void arch_read_unlock_slow(arch_rwlock_t *rwlock) | ||
118 | { | ||
119 | u32 val = get_rwlock(rwlock); | ||
120 | rwlock->lock = val - (1 << RD_COUNT_SHIFT); | ||
121 | } | ||
122 | EXPORT_SYMBOL(arch_read_unlock_slow); | ||
123 | |||
124 | void arch_write_unlock_slow(arch_rwlock_t *rwlock, u32 val) | ||
125 | { | 105 | { |
126 | u32 eq, mask = 1 << WR_CURR_SHIFT; | 106 | u32 val; |
127 | while (unlikely(val & 1)) { | 107 | __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1); |
128 | /* Limited backoff since we are the highest-priority task. */ | 108 | val = __insn_tns((int *)&rwlock->lock); |
129 | relax(4); | 109 | if (likely((val << _RD_COUNT_WIDTH) == 0)) { |
130 | val = __insn_tns((int *)&rwlock->lock); | 110 | val += 1 << RD_COUNT_SHIFT; |
111 | rwlock->lock = val; | ||
112 | __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); | ||
113 | BUG_ON(val == 0); /* we don't expect wraparound */ | ||
114 | return 1; | ||
131 | } | 115 | } |
132 | val = __insn_addb(val, mask); | 116 | if ((val & 1) == 0) |
133 | eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT)); | 117 | rwlock->lock = val; |
134 | val = __insn_mz(eq & mask, val); | 118 | __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); |
135 | rwlock->lock = val; | 119 | return 0; |
136 | } | 120 | } |
137 | EXPORT_SYMBOL(arch_write_unlock_slow); | 121 | EXPORT_SYMBOL(arch_read_trylock); |
138 | 122 | ||
139 | /* | 123 | /* |
140 | * We spin until everything but the reader bits (which are in the high | 124 | * Spin doing arch_read_trylock() until we acquire the lock. |
141 | * part of the word) are zero, i.e. no active or waiting writers, no tns. | ||
142 | * | ||
143 | * ISSUE: This approach can permanently starve readers. A reader who sees | 125 | * ISSUE: This approach can permanently starve readers. A reader who sees |
144 | * a writer could instead take a ticket lock (just like a writer would), | 126 | * a writer could instead take a ticket lock (just like a writer would), |
145 | * and atomically enter read mode (with 1 reader) when it gets the ticket. | 127 | * and atomically enter read mode (with 1 reader) when it gets the ticket. |
146 | * This way both readers and writers will always make forward progress | 128 | * This way both readers and writers would always make forward progress |
147 | * in a finite time. | 129 | * in a finite time. |
148 | */ | 130 | */ |
149 | void arch_read_lock_slow(arch_rwlock_t *rwlock, u32 val) | 131 | void arch_read_lock(arch_rwlock_t *rwlock) |
150 | { | 132 | { |
151 | u32 iterations = 0; | 133 | u32 iterations = 0; |
152 | do { | 134 | while (unlikely(!arch_read_trylock(rwlock))) |
153 | if (!(val & 1)) | ||
154 | rwlock->lock = val; | ||
155 | delay_backoff(iterations++); | 135 | delay_backoff(iterations++); |
136 | } | ||
137 | EXPORT_SYMBOL(arch_read_lock); | ||
138 | |||
139 | void arch_read_unlock(arch_rwlock_t *rwlock) | ||
140 | { | ||
141 | u32 val, iterations = 0; | ||
142 | |||
143 | mb(); /* guarantee anything modified under the lock is visible */ | ||
144 | for (;;) { | ||
145 | __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1); | ||
156 | val = __insn_tns((int *)&rwlock->lock); | 146 | val = __insn_tns((int *)&rwlock->lock); |
157 | } while ((val << RD_COUNT_WIDTH) != 0); | 147 | if (likely((val & 1) == 0)) { |
158 | rwlock->lock = val + (1 << RD_COUNT_SHIFT); | 148 | rwlock->lock = val - (1 << _RD_COUNT_SHIFT); |
149 | __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); | ||
150 | break; | ||
151 | } | ||
152 | __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); | ||
153 | delay_backoff(iterations++); | ||
154 | } | ||
159 | } | 155 | } |
160 | EXPORT_SYMBOL(arch_read_lock_slow); | 156 | EXPORT_SYMBOL(arch_read_unlock); |
161 | 157 | ||
162 | void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val) | 158 | /* |
159 | * We don't need an interrupt critical section here (unlike for | ||
160 | * arch_read_lock) since we should never use a bare write lock where | ||
161 | * it could be interrupted by code that could try to re-acquire it. | ||
162 | */ | ||
163 | void arch_write_lock(arch_rwlock_t *rwlock) | ||
163 | { | 164 | { |
164 | /* | 165 | /* |
165 | * The trailing underscore on this variable (and curr_ below) | 166 | * The trailing underscore on this variable (and curr_ below) |
@@ -167,23 +168,36 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val) | |||
167 | * when we compare them. | 168 | * when we compare them. |
168 | */ | 169 | */ |
169 | u32 my_ticket_; | 170 | u32 my_ticket_; |
171 | u32 iterations = 0; | ||
172 | u32 val = __insn_tns((int *)&rwlock->lock); | ||
170 | 173 | ||
171 | /* Take out the next ticket; this will also stop would-be readers. */ | 174 | if (likely(val == 0)) { |
172 | if (val & 1) | 175 | rwlock->lock = 1 << _WR_NEXT_SHIFT; |
173 | val = get_rwlock(rwlock); | 176 | return; |
174 | rwlock->lock = __insn_addb(val, 1 << WR_NEXT_SHIFT); | 177 | } |
175 | |||
176 | /* Extract my ticket value from the original word. */ | ||
177 | my_ticket_ = val >> WR_NEXT_SHIFT; | ||
178 | 178 | ||
179 | /* | 179 | /* |
180 | * Wait until the "current" field matches our ticket, and | 180 | * Wait until there are no readers, then bump up the next |
181 | * there are no remaining readers. | 181 | * field and capture the ticket value. |
182 | */ | 182 | */ |
183 | for (;;) { | 183 | for (;;) { |
184 | if (!(val & 1)) { | ||
185 | if ((val >> RD_COUNT_SHIFT) == 0) | ||
186 | break; | ||
187 | rwlock->lock = val; | ||
188 | } | ||
189 | delay_backoff(iterations++); | ||
190 | val = __insn_tns((int *)&rwlock->lock); | ||
191 | } | ||
192 | |||
193 | /* Take out the next ticket and extract my ticket value. */ | ||
194 | rwlock->lock = __insn_addb(val, 1 << WR_NEXT_SHIFT); | ||
195 | my_ticket_ = val >> WR_NEXT_SHIFT; | ||
196 | |||
197 | /* Wait until the "current" field matches our ticket. */ | ||
198 | for (;;) { | ||
184 | u32 curr_ = val >> WR_CURR_SHIFT; | 199 | u32 curr_ = val >> WR_CURR_SHIFT; |
185 | u32 readers = val >> RD_COUNT_SHIFT; | 200 | u32 delta = ((my_ticket_ - curr_) & WR_MASK); |
186 | u32 delta = ((my_ticket_ - curr_) & WR_MASK) + !!readers; | ||
187 | if (likely(delta == 0)) | 201 | if (likely(delta == 0)) |
188 | break; | 202 | break; |
189 | 203 | ||
@@ -199,23 +213,47 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val) | |||
199 | relax(4); | 213 | relax(4); |
200 | } | 214 | } |
201 | } | 215 | } |
202 | EXPORT_SYMBOL(arch_write_lock_slow); | 216 | EXPORT_SYMBOL(arch_write_lock); |
203 | 217 | ||
204 | int __tns_atomic_acquire(atomic_t *lock) | 218 | int arch_write_trylock(arch_rwlock_t *rwlock) |
205 | { | 219 | { |
206 | int ret; | 220 | u32 val = __insn_tns((int *)&rwlock->lock); |
207 | u32 iterations = 0; | ||
208 | 221 | ||
209 | BUG_ON(__insn_mfspr(SPR_INTERRUPT_CRITICAL_SECTION)); | 222 | /* |
210 | __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1); | 223 | * If a tns is in progress, or there's a waiting or active locker, |
224 | * or active readers, we can't take the lock, so give up. | ||
225 | */ | ||
226 | if (unlikely(val != 0)) { | ||
227 | if (!(val & 1)) | ||
228 | rwlock->lock = val; | ||
229 | return 0; | ||
230 | } | ||
211 | 231 | ||
212 | while ((ret = __insn_tns((void *)&lock->counter)) == 1) | 232 | /* Set the "next" field to mark it locked. */ |
213 | delay_backoff(iterations++); | 233 | rwlock->lock = 1 << _WR_NEXT_SHIFT; |
214 | return ret; | 234 | return 1; |
215 | } | 235 | } |
236 | EXPORT_SYMBOL(arch_write_trylock); | ||
216 | 237 | ||
217 | void __tns_atomic_release(atomic_t *p, int v) | 238 | void arch_write_unlock(arch_rwlock_t *rwlock) |
218 | { | 239 | { |
219 | p->counter = v; | 240 | u32 val, eq, mask; |
220 | __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); | 241 | |
242 | mb(); /* guarantee anything modified under the lock is visible */ | ||
243 | val = __insn_tns((int *)&rwlock->lock); | ||
244 | if (likely(val == (1 << _WR_NEXT_SHIFT))) { | ||
245 | rwlock->lock = 0; | ||
246 | return; | ||
247 | } | ||
248 | while (unlikely(val & 1)) { | ||
249 | /* Limited backoff since we are the highest-priority task. */ | ||
250 | relax(4); | ||
251 | val = __insn_tns((int *)&rwlock->lock); | ||
252 | } | ||
253 | mask = 1 << WR_CURR_SHIFT; | ||
254 | val = __insn_addb(val, mask); | ||
255 | eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT)); | ||
256 | val = __insn_mz(eq & mask, val); | ||
257 | rwlock->lock = val; | ||
221 | } | 258 | } |
259 | EXPORT_SYMBOL(arch_write_unlock); | ||
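The 32-bit rwlock above keeps everything in a single word: bit 0 marks a tns in flight, the writer "next" and "current" ticket fields sit in the middle, and the reader count lives in the high bits above RD_COUNT_SHIFT, so "everything but the reader bits is zero" is one shifted compare. A small sketch of that packing with illustrative field positions (the shift values below are assumptions for the example; the real constants live in asm/spinlock_32.h, which is not part of this diff):

	#include <stdint.h>
	#include <stdio.h>

	/* Illustrative layout: bit 0 = tns busy, bits 8..15 = writer "next"
	 * ticket, bits 16..23 = writer "current" ticket, bits 24..31 = readers. */
	#define WR_NEXT_SHIFT	8
	#define WR_CURR_SHIFT	16
	#define RD_COUNT_SHIFT	24
	#define RD_COUNT_WIDTH	(32 - RD_COUNT_SHIFT)
	#define FIELD_MASK	0xffu

	static unsigned readers(uint32_t lock)  { return lock >> RD_COUNT_SHIFT; }
	static unsigned wr_next(uint32_t lock)  { return (lock >> WR_NEXT_SHIFT) & FIELD_MASK; }
	static unsigned wr_curr(uint32_t lock)  { return (lock >> WR_CURR_SHIFT) & FIELD_MASK; }
	static unsigned tns_busy(uint32_t lock) { return lock & 1u; }

	/* A read lock is available when everything but the reader bits is zero,
	 * i.e. no tns in flight and no waiting or active writers. */
	static int read_lock_available(uint32_t lock)
	{
		return (uint32_t)(lock << RD_COUNT_WIDTH) == 0;
	}

	int main(void)
	{
		uint32_t lock = 2u << RD_COUNT_SHIFT;	/* two readers, no writers */
		printf("readers=%u next=%u curr=%u busy=%u can_read=%d\n",
		       readers(lock), wr_next(lock), wr_curr(lock),
		       tns_busy(lock), read_lock_available(lock));
		return 0;
	}
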
diff --git a/arch/tile/lib/spinlock_64.c b/arch/tile/lib/spinlock_64.c new file mode 100644 index 000000000000..d6fb9581e980 --- /dev/null +++ b/arch/tile/lib/spinlock_64.c | |||
@@ -0,0 +1,104 @@ | |||
1 | /* | ||
2 | * Copyright 2011 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/spinlock.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <asm/processor.h> | ||
18 | |||
19 | #include "spinlock_common.h" | ||
20 | |||
21 | /* | ||
22 | * Read the spinlock value without allocating in our cache and without | ||
23 | * causing an invalidation to another cpu with a copy of the cacheline. | ||
24 | * This is important when we are spinning waiting for the lock. | ||
25 | */ | ||
26 | static inline u32 arch_spin_read_noalloc(void *lock) | ||
27 | { | ||
28 | return atomic_cmpxchg((atomic_t *)lock, -1, -1); | ||
29 | } | ||
30 | |||
31 | /* | ||
32 | * Wait until the high bits (current) match my ticket. | ||
33 | * If we notice the overflow bit set on entry, we clear it. | ||
34 | */ | ||
35 | void arch_spin_lock_slow(arch_spinlock_t *lock, u32 my_ticket) | ||
36 | { | ||
37 | if (unlikely(my_ticket & __ARCH_SPIN_NEXT_OVERFLOW)) { | ||
38 | __insn_fetchand4(&lock->lock, ~__ARCH_SPIN_NEXT_OVERFLOW); | ||
39 | my_ticket &= ~__ARCH_SPIN_NEXT_OVERFLOW; | ||
40 | } | ||
41 | |||
42 | for (;;) { | ||
43 | u32 val = arch_spin_read_noalloc(lock); | ||
44 | u32 delta = my_ticket - arch_spin_current(val); | ||
45 | if (delta == 0) | ||
46 | return; | ||
47 | relax((128 / CYCLES_PER_RELAX_LOOP) * delta); | ||
48 | } | ||
49 | } | ||
50 | EXPORT_SYMBOL(arch_spin_lock_slow); | ||
51 | |||
52 | /* | ||
53 | * Check the lock to see if it is plausible, and try to get it with cmpxchg(). | ||
54 | */ | ||
55 | int arch_spin_trylock(arch_spinlock_t *lock) | ||
56 | { | ||
57 | u32 val = arch_spin_read_noalloc(lock); | ||
58 | if (unlikely(arch_spin_current(val) != arch_spin_next(val))) | ||
59 | return 0; | ||
60 | return cmpxchg(&lock->lock, val, (val + 1) & ~__ARCH_SPIN_NEXT_OVERFLOW) | ||
61 | == val; | ||
62 | } | ||
63 | EXPORT_SYMBOL(arch_spin_trylock); | ||
64 | |||
65 | void arch_spin_unlock_wait(arch_spinlock_t *lock) | ||
66 | { | ||
67 | u32 iterations = 0; | ||
68 | while (arch_spin_is_locked(lock)) | ||
69 | delay_backoff(iterations++); | ||
70 | } | ||
71 | EXPORT_SYMBOL(arch_spin_unlock_wait); | ||
72 | |||
73 | /* | ||
74 | * If the read lock fails due to a writer, we retry periodically | ||
75 | * until the value is positive and we write our incremented reader count. | ||
76 | */ | ||
77 | void __read_lock_failed(arch_rwlock_t *rw) | ||
78 | { | ||
79 | u32 val; | ||
80 | int iterations = 0; | ||
81 | do { | ||
82 | delay_backoff(iterations++); | ||
83 | val = __insn_fetchaddgez4(&rw->lock, 1); | ||
84 | } while (unlikely(arch_write_val_locked(val))); | ||
85 | } | ||
86 | EXPORT_SYMBOL(__read_lock_failed); | ||
87 | |||
88 | /* | ||
89 | * If we failed because there were readers, clear the "writer" bit | ||
90 | * so we don't block additional readers. Otherwise, there was another | ||
91 | * writer anyway, so our "fetchor" made no difference. Then wait, | ||
92 | * issuing periodic fetchor instructions, till we get the lock. | ||
93 | */ | ||
94 | void __write_lock_failed(arch_rwlock_t *rw, u32 val) | ||
95 | { | ||
96 | int iterations = 0; | ||
97 | do { | ||
98 | if (!arch_write_val_locked(val)) | ||
99 | val = __insn_fetchand4(&rw->lock, ~__WRITE_LOCK_BIT); | ||
100 | delay_backoff(iterations++); | ||
101 | val = __insn_fetchor4(&rw->lock, __WRITE_LOCK_BIT); | ||
102 | } while (val != 0); | ||
103 | } | ||
104 | EXPORT_SYMBOL(__write_lock_failed); | ||
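arch_spin_lock_slow() above is the slow path of a ticket lock: a locker takes the next ticket, then spins until the "current" field reaches that value, backing off in proportion to how far away its turn is. A self-contained illustration of the same idea using C11 atomics rather than the TILE cmpxchg/fetchadd intrinsics (the struct and field names are ours, and the fields are kept separate instead of packed into one word):

	#include <stdatomic.h>

	/* Ticket lock: "next" is the ticket dispenser, "current" is now serving. */
	struct ticket_lock {
		atomic_uint next;
		atomic_uint current;
	};
	#define TICKET_LOCK_INIT { 0, 0 }

	static void ticket_lock(struct ticket_lock *l)
	{
		unsigned my_ticket = atomic_fetch_add(&l->next, 1);

		/* Spin until it is our turn; a real lock would back off here,
		 * as the TILE code does with relax(). */
		while (atomic_load_explicit(&l->current,
					    memory_order_acquire) != my_ticket)
			;
	}

	static void ticket_unlock(struct ticket_lock *l)
	{
		unsigned cur = atomic_load_explicit(&l->current,
						    memory_order_relaxed);
		atomic_store_explicit(&l->current, cur + 1, memory_order_release);
	}

A lock would be declared as "struct ticket_lock lock = TICKET_LOCK_INIT;" and each critical section bracketed by ticket_lock()/ticket_unlock().
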
diff --git a/arch/tile/lib/strchr_64.c b/arch/tile/lib/strchr_64.c new file mode 100644 index 000000000000..617a9273aaa8 --- /dev/null +++ b/arch/tile/lib/strchr_64.c | |||
@@ -0,0 +1,67 @@ | |||
1 | /* | ||
2 | * Copyright 2011 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/types.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/module.h> | ||
18 | |||
19 | #undef strchr | ||
20 | |||
21 | char *strchr(const char *s, int c) | ||
22 | { | ||
23 | int z, g; | ||
24 | |||
25 | /* Get an aligned pointer. */ | ||
26 | const uintptr_t s_int = (uintptr_t) s; | ||
27 | const uint64_t *p = (const uint64_t *)(s_int & -8); | ||
28 | |||
29 | /* Create eight copies of the byte for which we are looking. */ | ||
30 | const uint64_t goal = 0x0101010101010101ULL * (uint8_t) c; | ||
31 | |||
32 | /* Read the first aligned word, but force bytes before the string to | ||
33 | * match neither zero nor goal (we make sure the high bit of each | ||
34 | * byte is 1, and the low 7 bits are all the opposite of the goal | ||
35 | * byte). | ||
36 | * | ||
37 | * Note that this shift count expression works because we know shift | ||
38 | * counts are taken mod 64. | ||
39 | */ | ||
40 | const uint64_t before_mask = (1ULL << (s_int << 3)) - 1; | ||
41 | uint64_t v = (*p | before_mask) ^ | ||
42 | (goal & __insn_v1shrsi(before_mask, 1)); | ||
43 | |||
44 | uint64_t zero_matches, goal_matches; | ||
45 | while (1) { | ||
46 | /* Look for a terminating '\0'. */ | ||
47 | zero_matches = __insn_v1cmpeqi(v, 0); | ||
48 | |||
49 | /* Look for the goal byte. */ | ||
50 | goal_matches = __insn_v1cmpeq(v, goal); | ||
51 | |||
52 | if (__builtin_expect((zero_matches | goal_matches) != 0, 0)) | ||
53 | break; | ||
54 | |||
55 | v = *++p; | ||
56 | } | ||
57 | |||
58 | z = __insn_ctz(zero_matches); | ||
59 | g = __insn_ctz(goal_matches); | ||
60 | |||
61 | /* If we found c before '\0' we got a match. Note that if c == '\0' | ||
62 | * then g == z, and we correctly return the address of the '\0' | ||
63 | * rather than NULL. | ||
64 | */ | ||
65 | return (g <= z) ? ((char *)p) + (g >> 3) : NULL; | ||
66 | } | ||
67 | EXPORT_SYMBOL(strchr); | ||
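strchr() above works a word at a time: broadcast the goal byte, then use the per-byte compares (v1cmpeqi for the NUL, v1cmpeq for the goal) on one 64-bit load, and finally ctz to turn the first hit into a byte offset. Portable C has no per-byte compare, but the classic "haszero" bit trick gives the same effect; a little-endian sketch under the same assumption the code above makes, namely that the aligned 8-byte words holding the string may be read in full (helper names are ours):

	#include <stdint.h>
	#include <string.h>

	#define ONES	0x0101010101010101ULL
	#define HIGHS	0x8080808080808080ULL

	/* Nonzero iff some byte of v is zero; the lowest set flag always
	 * marks the first (lowest-addressed) zero byte. */
	static uint64_t haszero(uint64_t v)
	{
		return (v - ONES) & ~v & HIGHS;
	}

	char *strchr_swar(const char *s, int c)
	{
		const uint64_t goal = ONES * (uint8_t)c;	/* eight copies of c */
		const char *p = s;
		uint64_t v, zeros, goals;

		/* Byte loop until 8-byte aligned (the kernel instead masks off
		 * the bytes that precede the string). */
		for (; (uintptr_t)p & 7; p++) {
			if (*p == (char)c)
				return (char *)p;
			if (*p == '\0')
				return NULL;
		}

		for (;;) {
			memcpy(&v, p, 8);		/* aligned 8-byte load */
			zeros = haszero(v);		/* NUL bytes */
			goals = haszero(v ^ goal);	/* bytes equal to c */
			if (zeros | goals)
				break;
			p += 8;
		}

		{
			int z = zeros ? __builtin_ctzll(zeros) : 64;
			int g = goals ? __builtin_ctzll(goals) : 64;
			/* c found at or before the NUL => match; this also
			 * returns the NUL's address when c == '\0'. */
			return (g <= z) ? (char *)p + (g >> 3) : NULL;
		}
	}
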
diff --git a/arch/tile/lib/strlen_32.c b/arch/tile/lib/strlen_32.c index f26f88e11e4a..4974292a5534 100644 --- a/arch/tile/lib/strlen_32.c +++ b/arch/tile/lib/strlen_32.c | |||
@@ -16,6 +16,8 @@ | |||
16 | #include <linux/string.h> | 16 | #include <linux/string.h> |
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | 18 | ||
19 | #undef strlen | ||
20 | |||
19 | size_t strlen(const char *s) | 21 | size_t strlen(const char *s) |
20 | { | 22 | { |
21 | /* Get an aligned pointer. */ | 23 | /* Get an aligned pointer. */ |
diff --git a/arch/tile/lib/strlen_64.c b/arch/tile/lib/strlen_64.c new file mode 100644 index 000000000000..1c92d46202a8 --- /dev/null +++ b/arch/tile/lib/strlen_64.c | |||
@@ -0,0 +1,38 @@ | |||
1 | /* | ||
2 | * Copyright 2011 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/types.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <linux/module.h> | ||
18 | |||
19 | #undef strlen | ||
20 | |||
21 | size_t strlen(const char *s) | ||
22 | { | ||
23 | /* Get an aligned pointer. */ | ||
24 | const uintptr_t s_int = (uintptr_t) s; | ||
25 | const uint64_t *p = (const uint64_t *)(s_int & -8); | ||
26 | |||
27 | /* Read the first word, but force bytes before the string to be nonzero. | ||
28 | * This expression works because we know shift counts are taken mod 64. | ||
29 | */ | ||
30 | uint64_t v = *p | ((1ULL << (s_int << 3)) - 1); | ||
31 | |||
32 | uint64_t bits; | ||
33 | while ((bits = __insn_v1cmpeqi(v, 0)) == 0) | ||
34 | v = *++p; | ||
35 | |||
36 | return ((const char *)p) + (__insn_ctz(bits) >> 3) - s; | ||
37 | } | ||
38 | EXPORT_SYMBOL(strlen); | ||
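strlen() above is the same word-at-a-time scan without the goal byte: force the pre-string bytes nonzero, test eight bytes per load for a NUL, and convert the first hit with ctz. A portable little-endian sketch using the same "haszero" trick as the strchr example, under the same assumption that the aligned words containing the string may be read:

	#include <stdint.h>
	#include <stddef.h>
	#include <string.h>

	size_t strlen_swar(const char *s)
	{
		const uint64_t ones = 0x0101010101010101ULL;
		const uint64_t highs = 0x8080808080808080ULL;
		const char *p = s;
		uint64_t v, zeros;

		for (; (uintptr_t)p & 7; p++)	/* byte loop until aligned */
			if (*p == '\0')
				return (size_t)(p - s);

		for (;;) {
			memcpy(&v, p, 8);		 /* aligned 8-byte load */
			zeros = (v - ones) & ~v & highs; /* flags NUL bytes */
			if (zeros)
				break;
			p += 8;
		}

		/* The lowest flagged bit sits inside the first NUL byte. */
		return (size_t)(p - s) + (__builtin_ctzll(zeros) >> 3);
	}
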
diff --git a/arch/tile/lib/usercopy_64.S b/arch/tile/lib/usercopy_64.S new file mode 100644 index 000000000000..2ff44f87b78e --- /dev/null +++ b/arch/tile/lib/usercopy_64.S | |||
@@ -0,0 +1,196 @@ | |||
1 | /* | ||
2 | * Copyright 2011 Tilera Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation, version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | |||
15 | #include <linux/linkage.h> | ||
16 | #include <asm/errno.h> | ||
17 | #include <asm/cache.h> | ||
18 | #include <arch/chip.h> | ||
19 | |||
20 | /* Access user memory, but use MMU to avoid propagating kernel exceptions. */ | ||
21 | |||
22 | .pushsection .fixup,"ax" | ||
23 | |||
24 | get_user_fault: | ||
25 | { movei r1, -EFAULT; move r0, zero } | ||
26 | jrp lr | ||
27 | ENDPROC(get_user_fault) | ||
28 | |||
29 | put_user_fault: | ||
30 | { movei r0, -EFAULT; jrp lr } | ||
31 | ENDPROC(put_user_fault) | ||
32 | |||
33 | .popsection | ||
34 | |||
35 | /* | ||
36 | * __get_user_N functions take a pointer in r0, and return 0 in r1 | ||
37 | * on success, with the value in r0; or else -EFAULT in r1. | ||
38 | */ | ||
39 | #define __get_user_N(bytes, LOAD) \ | ||
40 | STD_ENTRY(__get_user_##bytes); \ | ||
41 | 1: { LOAD r0, r0; move r1, zero }; \ | ||
42 | jrp lr; \ | ||
43 | STD_ENDPROC(__get_user_##bytes); \ | ||
44 | .pushsection __ex_table,"a"; \ | ||
45 | .quad 1b, get_user_fault; \ | ||
46 | .popsection | ||
47 | |||
48 | __get_user_N(1, ld1u) | ||
49 | __get_user_N(2, ld2u) | ||
50 | __get_user_N(4, ld4u) | ||
51 | __get_user_N(8, ld) | ||
52 | |||
53 | /* | ||
54 | * __put_user_N functions take a value in r0 and a pointer in r1, | ||
55 | * and return 0 in r0 on success or -EFAULT on failure. | ||
56 | */ | ||
57 | #define __put_user_N(bytes, STORE) \ | ||
58 | STD_ENTRY(__put_user_##bytes); \ | ||
59 | 1: { STORE r1, r0; move r0, zero }; \ | ||
60 | jrp lr; \ | ||
61 | STD_ENDPROC(__put_user_##bytes); \ | ||
62 | .pushsection __ex_table,"a"; \ | ||
63 | .quad 1b, put_user_fault; \ | ||
64 | .popsection | ||
65 | |||
66 | __put_user_N(1, st1) | ||
67 | __put_user_N(2, st2) | ||
68 | __put_user_N(4, st4) | ||
69 | __put_user_N(8, st) | ||
70 | |||
71 | /* | ||
72 | * strnlen_user_asm takes the pointer in r0, and the length bound in r1. | ||
73 | * It returns the length, including the terminating NUL, or zero on exception. | ||
74 | * If length is greater than the bound, returns one plus the bound. | ||
75 | */ | ||
76 | STD_ENTRY(strnlen_user_asm) | ||
77 | { beqz r1, 2f; addi r3, r0, -1 } /* bias down to include NUL */ | ||
78 | 1: { ld1u r4, r0; addi r1, r1, -1 } | ||
79 | beqz r4, 2f | ||
80 | { bnezt r1, 1b; addi r0, r0, 1 } | ||
81 | 2: { sub r0, r0, r3; jrp lr } | ||
82 | STD_ENDPROC(strnlen_user_asm) | ||
83 | .pushsection .fixup,"ax" | ||
84 | strnlen_user_fault: | ||
85 | { move r0, zero; jrp lr } | ||
86 | ENDPROC(strnlen_user_fault) | ||
87 | .section __ex_table,"a" | ||
88 | .quad 1b, strnlen_user_fault | ||
89 | .popsection | ||
90 | |||
91 | /* | ||
92 | * strncpy_from_user_asm takes the kernel target pointer in r0, | ||
93 | * the userspace source pointer in r1, and the length bound (including | ||
94 | * the trailing NUL) in r2. On success, it returns the string length | ||
95 | * (not including the trailing NUL), or -EFAULT on failure. | ||
96 | */ | ||
97 | STD_ENTRY(strncpy_from_user_asm) | ||
98 | { beqz r2, 2f; move r3, r0 } | ||
99 | 1: { ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 } | ||
100 | { st1 r0, r4; addi r0, r0, 1 } | ||
101 | beqz r2, 2f | ||
102 | bnezt r4, 1b | ||
103 | addi r0, r0, -1 /* don't count the trailing NUL */ | ||
104 | 2: { sub r0, r0, r3; jrp lr } | ||
105 | STD_ENDPROC(strncpy_from_user_asm) | ||
106 | .pushsection .fixup,"ax" | ||
107 | strncpy_from_user_fault: | ||
108 | { movei r0, -EFAULT; jrp lr } | ||
109 | ENDPROC(strncpy_from_user_fault) | ||
110 | .section __ex_table,"a" | ||
111 | .quad 1b, strncpy_from_user_fault | ||
112 | .popsection | ||
113 | |||
114 | /* | ||
115 | * clear_user_asm takes the user target address in r0 and the | ||
116 | * number of bytes to zero in r1. | ||
117 | * It returns the number of uncopiable bytes (hopefully zero) in r0. | ||
118 | * Note that we don't use a separate .fixup section here since we fall | ||
119 | * through into the "fixup" code as the last straight-line bundle anyway. | ||
120 | */ | ||
121 | STD_ENTRY(clear_user_asm) | ||
122 | { beqz r1, 2f; or r2, r0, r1 } | ||
123 | andi r2, r2, 7 | ||
124 | beqzt r2, .Lclear_aligned_user_asm | ||
125 | 1: { st1 r0, zero; addi r0, r0, 1; addi r1, r1, -1 } | ||
126 | bnezt r1, 1b | ||
127 | 2: { move r0, r1; jrp lr } | ||
128 | .pushsection __ex_table,"a" | ||
129 | .quad 1b, 2b | ||
130 | .popsection | ||
131 | |||
132 | .Lclear_aligned_user_asm: | ||
133 | 1: { st r0, zero; addi r0, r0, 8; addi r1, r1, -8 } | ||
134 | bnezt r1, 1b | ||
135 | 2: { move r0, r1; jrp lr } | ||
136 | STD_ENDPROC(clear_user_asm) | ||
137 | .pushsection __ex_table,"a" | ||
138 | .quad 1b, 2b | ||
139 | .popsection | ||
140 | |||
141 | /* | ||
142 | * flush_user_asm takes the user target address in r0 and the | ||
143 | * number of bytes to flush in r1. | ||
144 | * It returns the number of unflushable bytes (hopefully zero) in r0. | ||
145 | */ | ||
146 | STD_ENTRY(flush_user_asm) | ||
147 | beqz r1, 2f | ||
148 | { movei r2, L2_CACHE_BYTES; add r1, r0, r1 } | ||
149 | { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } | ||
150 | { and r0, r0, r2; and r1, r1, r2 } | ||
151 | { sub r1, r1, r0 } | ||
152 | 1: { flush r0; addi r1, r1, -CHIP_FLUSH_STRIDE() } | ||
153 | { addi r0, r0, CHIP_FLUSH_STRIDE(); bnezt r1, 1b } | ||
154 | 2: { move r0, r1; jrp lr } | ||
155 | STD_ENDPROC(flush_user_asm) | ||
156 | .pushsection __ex_table,"a" | ||
157 | .quad 1b, 2b | ||
158 | .popsection | ||
159 | |||
160 | /* | ||
161 | * inv_user_asm takes the user target address in r0 and the | ||
162 | * number of bytes to invalidate in r1. | ||
163 | * It returns the number of not inv'able bytes (hopefully zero) in r0. | ||
164 | */ | ||
165 | STD_ENTRY(inv_user_asm) | ||
166 | beqz r1, 2f | ||
167 | { movei r2, L2_CACHE_BYTES; add r1, r0, r1 } | ||
168 | { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } | ||
169 | { and r0, r0, r2; and r1, r1, r2 } | ||
170 | { sub r1, r1, r0 } | ||
171 | 1: { inv r0; addi r1, r1, -CHIP_INV_STRIDE() } | ||
172 | { addi r0, r0, CHIP_INV_STRIDE(); bnezt r1, 1b } | ||
173 | 2: { move r0, r1; jrp lr } | ||
174 | STD_ENDPROC(inv_user_asm) | ||
175 | .pushsection __ex_table,"a" | ||
176 | .quad 1b, 2b | ||
177 | .popsection | ||
178 | |||
179 | /* | ||
180 | * finv_user_asm takes the user target address in r0 and the | ||
181 | * number of bytes to flush-invalidate in r1. | ||
182 | * It returns the number of not finv'able bytes (hopefully zero) in r0. | ||
183 | */ | ||
184 | STD_ENTRY(finv_user_asm) | ||
185 | beqz r1, 2f | ||
186 | { movei r2, L2_CACHE_BYTES; add r1, r0, r1 } | ||
187 | { sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 } | ||
188 | { and r0, r0, r2; and r1, r1, r2 } | ||
189 | { sub r1, r1, r0 } | ||
190 | 1: { finv r0; addi r1, r1, -CHIP_FINV_STRIDE() } | ||
191 | { addi r0, r0, CHIP_FINV_STRIDE(); bnezt r1, 1b } | ||
192 | 2: { move r0, r1; jrp lr } | ||
193 | STD_ENDPROC(finv_user_asm) | ||
194 | .pushsection __ex_table,"a" | ||
195 | .quad 1b, 2b | ||
196 | .popsection | ||
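Every potentially faulting user-space access above is paired with a fixup address through a two-word __ex_table record (".quad 1b, 2b"): on a fault, the trap handler looks up the faulting PC and, if it finds a match, resumes at the fixup (which typically returns an error or the residual byte count) instead of treating it as a kernel bug. A toy user-space model of that lookup; the struct layout, addresses, and linear search are purely illustrative, not the kernel's real exception-table machinery:

	#include <stdint.h>
	#include <stddef.h>
	#include <stdio.h>

	/* One record per faulting instruction: where it is, and where to
	 * continue if it takes a fault. */
	struct ex_entry {
		uintptr_t fault_pc;
		uintptr_t fixup_pc;
	};

	static const struct ex_entry ex_table[] = {
		{ 0x1000, 0x2000 },	/* illustrative addresses only */
		{ 0x1008, 0x2010 },
	};

	/* Conceptually what the trap handler does: if the faulting PC is
	 * covered, return the fixup PC to resume at; otherwise give up. */
	static uintptr_t fixup_for(uintptr_t pc)
	{
		for (size_t i = 0; i < sizeof(ex_table) / sizeof(ex_table[0]); i++)
			if (ex_table[i].fault_pc == pc)
				return ex_table[i].fixup_pc;
		return 0;	/* no fixup: treat as a genuine kernel fault */
	}

	int main(void)
	{
		printf("fault at 0x1008 resumes at 0x%lx\n",
		       (unsigned long)fixup_for(0x1008));
		return 0;
	}
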