path: root/arch/tile/lib
author    Chris Metcalf <cmetcalf@tilera.com>  2013-08-15 16:23:24 -0400
committer Chris Metcalf <cmetcalf@tilera.com>  2013-09-03 14:53:29 -0400
commit    d7c9661115fd23b4dabb710b3080dd9919dfa891 (patch)
tree      5eaeb8c4aab296f39d6aa896ec9408419ec17441 /arch/tile/lib
parent    d6a0aa314c06743b702931cb468f400b7615c5c9 (diff)
tile: remove support for TILE64
This chip is no longer under active development (it was superseded by the TILEPro64 in 2008), and in any case the existing compiler and toolchain in the community do not support it. It's unlikely that the kernel still works with TILE64 at this point, as the configuration has not been tested in years. The support is also awkward, as it requires maintaining a significant number of ifdefs. So, just remove it altogether.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
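For context, the ifdefs in question are chip-feature guards like the sketch below (copied from the memset_32.c hunk further down): TILE64 lacks the wh64 "prepare cache line for write" instruction, so guarded fallback code had to prefetch destination lines by hand. With TILE64 gone, the _32 files are only ever built for TILEPro, where CHIP_HAS_WH64() is always true, so both the guard and its fallback branch can simply be dropped.

    #if !CHIP_HAS_WH64()
    	/* TILE64 fallback: prefetch the first and last destination cache lines. */
    	__insn_prefetch(out8);
    	__insn_prefetch(&out8[n - 1]);
    #endif /* !CHIP_HAS_WH64() */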
Diffstat (limited to 'arch/tile/lib')
-rw-r--r--  arch/tile/lib/Makefile         |   2
-rw-r--r--  arch/tile/lib/atomic_32.c      |  90
-rw-r--r--  arch/tile/lib/memcpy_32.S      |  61
-rw-r--r--  arch/tile/lib/memcpy_tile64.c  | 280
-rw-r--r--  arch/tile/lib/memset_32.c      | 105
5 files changed, 1 insertions, 537 deletions
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile
index 9adfd76fbdd8..c4211cbb2021 100644
--- a/arch/tile/lib/Makefile
+++ b/arch/tile/lib/Makefile
@@ -7,7 +7,7 @@ lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \
 	 strchr_$(BITS).o strlen_$(BITS).o strnlen_$(BITS).o
 
 lib-$(CONFIG_TILEGX) += memcpy_user_64.o
-lib-$(CONFIG_TILEPRO) += atomic_32.o atomic_asm_32.o memcpy_tile64.o
+lib-$(CONFIG_TILEPRO) += atomic_32.o atomic_asm_32.o
 lib-$(CONFIG_SMP) += spinlock_$(BITS).o usercopy_$(BITS).o
 
 obj-$(CONFIG_MODULES) += exports.o
diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c
index 42eacb1f737a..5d91d1860640 100644
--- a/arch/tile/lib/atomic_32.c
+++ b/arch/tile/lib/atomic_32.c
@@ -20,50 +20,12 @@
 #include <linux/atomic.h>
 #include <arch/chip.h>
 
-/* See <asm/atomic_32.h> */
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-
-/*
- * A block of memory containing locks for atomic ops. Each instance of this
- * struct will be homed on a different CPU.
- */
-struct atomic_locks_on_cpu {
-	int lock[ATOMIC_HASH_L2_SIZE];
-} __attribute__((aligned(ATOMIC_HASH_L2_SIZE * 4)));
-
-static DEFINE_PER_CPU(struct atomic_locks_on_cpu, atomic_lock_pool);
-
-/* The locks we'll use until __init_atomic_per_cpu is called. */
-static struct atomic_locks_on_cpu __initdata initial_atomic_locks;
-
-/* Hash into this vector to get a pointer to lock for the given atomic. */
-struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE]
-	__write_once = {
-	[0 ... ATOMIC_HASH_L1_SIZE-1] (&initial_atomic_locks)
-};
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 /* This page is remapped on startup to be hash-for-home. */
 int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss;
 
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 int *__atomic_hashed_lock(volatile void *v)
 {
 	/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-	unsigned long i =
-		(unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long));
-	unsigned long n = __insn_crc32_32(0, i);
-
-	/* Grab high bits for L1 index. */
-	unsigned long l1_index = n >> ((sizeof(n) * 8) - ATOMIC_HASH_L1_SHIFT);
-	/* Grab low bits for L2 index. */
-	unsigned long l2_index = n & (ATOMIC_HASH_L2_SIZE - 1);
-
-	return &atomic_lock_ptr[l1_index]->lock[l2_index];
-#else
 	/*
 	 * Use bits [3, 3 + ATOMIC_HASH_SHIFT) as the lock index.
 	 * Using mm works here because atomic_locks is page aligned.
@@ -72,26 +34,13 @@ int *__atomic_hashed_lock(volatile void *v)
 		(unsigned long)atomic_locks,
 		2, (ATOMIC_HASH_SHIFT + 2) - 1);
 	return (int *)ptr;
-#endif
 }
 
 #ifdef CONFIG_SMP
 /* Return whether the passed pointer is a valid atomic lock pointer. */
 static int is_atomic_lock(int *p)
 {
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-	int i;
-	for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) {
-
-		if (p >= &atomic_lock_ptr[i]->lock[0] &&
-		    p < &atomic_lock_ptr[i]->lock[ATOMIC_HASH_L2_SIZE]) {
-			return 1;
-		}
-	}
-	return 0;
-#else
 	return p >= &atomic_locks[0] && p < &atomic_locks[ATOMIC_HASH_SIZE];
-#endif
 }
 
 void __atomic_fault_unlock(int *irqlock_word)
@@ -210,43 +159,6 @@ struct __get_user __atomic_bad_address(int __user *addr)
 
 void __init __init_atomic_per_cpu(void)
 {
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-
-	unsigned int i;
-	int actual_cpu;
-
-	/*
-	 * Before this is called from setup, we just have one lock for
-	 * all atomic objects/operations. Here we replace the
-	 * elements of atomic_lock_ptr so that they point at per_cpu
-	 * integers. This seemingly over-complex approach stems from
-	 * the fact that DEFINE_PER_CPU defines an entry for each cpu
-	 * in the grid, not each cpu from 0..ATOMIC_HASH_SIZE-1. But
-	 * for efficient hashing of atomics to their locks we want a
-	 * compile time constant power of 2 for the size of this
-	 * table, so we use ATOMIC_HASH_SIZE.
-	 *
-	 * Here we populate atomic_lock_ptr from the per cpu
-	 * atomic_lock_pool, interspersing by actual cpu so that
-	 * subsequent elements are homed on consecutive cpus.
-	 */
-
-	actual_cpu = cpumask_first(cpu_possible_mask);
-
-	for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) {
-		/*
-		 * Preincrement to slightly bias against using cpu 0,
-		 * which has plenty of stuff homed on it already.
-		 */
-		actual_cpu = cpumask_next(actual_cpu, cpu_possible_mask);
-		if (actual_cpu >= nr_cpu_ids)
-			actual_cpu = cpumask_first(cpu_possible_mask);
-
-		atomic_lock_ptr[i] = &per_cpu(atomic_lock_pool, actual_cpu);
-	}
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 	/* Validate power-of-two and "bigger than cpus" assumption */
 	BUILD_BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1));
 	BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids);
@@ -270,6 +182,4 @@ void __init __init_atomic_per_cpu(void)
 	 * That should not produce more indices than ATOMIC_HASH_SIZE.
 	 */
 	BUILD_BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE);
-
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 }
diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S
index 8ba7626cfeb1..a2771ae5da53 100644
--- a/arch/tile/lib/memcpy_32.S
+++ b/arch/tile/lib/memcpy_32.S
@@ -22,14 +22,6 @@
 
 #include <linux/linkage.h>
 
-/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-#define memcpy __memcpy_asm
-#define __copy_to_user_inatomic __copy_to_user_inatomic_asm
-#define __copy_from_user_inatomic __copy_from_user_inatomic_asm
-#define __copy_from_user_zeroing __copy_from_user_zeroing_asm
-#endif
-
 #define IS_MEMCPY 0
 #define IS_COPY_FROM_USER 1
 #define IS_COPY_FROM_USER_ZEROING 2
@@ -159,12 +151,9 @@ EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
 
 	{ addi r3, r1, 60; andi r9, r9, -64 }
 
-#if CHIP_HAS_WH64()
 	/* No need to prefetch dst, we'll just do the wh64
 	 * right before we copy a line.
 	 */
-#endif
-
 EX:	{ lw r5, r3; addi r3, r3, 64; movei r4, 1 }
 	/* Intentionally stall for a few cycles to leave L2 cache alone. */
 	{ bnzt zero, .; move r27, lr }
@@ -172,21 +161,6 @@ EX: { lw r6, r3; addi r3, r3, 64 }
 	/* Intentionally stall for a few cycles to leave L2 cache alone. */
 	{ bnzt zero, . }
 EX:	{ lw r7, r3; addi r3, r3, 64 }
-#if !CHIP_HAS_WH64()
-	/* Prefetch the dest */
-	/* Intentionally stall for a few cycles to leave L2 cache alone. */
-	{ bnzt zero, . }
-	/* Use a real load to cause a TLB miss if necessary. We aren't using
-	 * r28, so this should be fine.
-	 */
-EX:	{ lw r28, r9; addi r9, r9, 64 }
-	/* Intentionally stall for a few cycles to leave L2 cache alone. */
-	{ bnzt zero, . }
-	{ prefetch r9; addi r9, r9, 64 }
-	/* Intentionally stall for a few cycles to leave L2 cache alone. */
-	{ bnzt zero, . }
-	{ prefetch r9; addi r9, r9, 64 }
-#endif
 	/* Intentionally stall for a few cycles to leave L2 cache alone. */
 	{ bz zero, .Lbig_loop2 }
 
@@ -287,13 +261,8 @@ EX: { lw r7, r3; addi r3, r3, 64 }
 	/* Fill second L1D line. */
 EX:	{ lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */
 
-#if CHIP_HAS_WH64()
 	/* Prepare destination line for writing. */
 EX:	{ wh64 r9; addi r9, r9, 64 }
-#else
-	/* Prefetch dest line */
-	{ prefetch r9; addi r9, r9, 64 }
-#endif
 	/* Load seven words that are L1D hits to cover wh64 L2 usage. */
 
 	/* Load the three remaining words from the last L1D line, which
@@ -331,16 +300,7 @@ EX: { lw r18, r1; addi r1, r1, 4 } /* r18 = WORD_8 */
 EX:	{ sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */
 EX:	{ sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */
 EX:	{ sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */
-#if CHIP_HAS_WH64()
 EX:	{ sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */
-#else
-	/* Back up the r9 to a cache line we are already storing to
-	 * if it gets past the end of the dest vector. Strictly speaking,
-	 * we don't need to back up to the start of a cache line, but it's free
-	 * and tidy, so why not?
-	 */
-EX:	{ sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */
-#endif
 	/* Store second L1D line. */
 EX:	{ sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */
 EX:	{ sw r0, r19; addi r0, r0, 4 } /* store(WORD_5) */
@@ -404,7 +364,6 @@ EX: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
 
 .Ldest_is_word_aligned:
 
-#if CHIP_HAS_DWORD_ALIGN()
 EX:	{ andi r8, r0, 63; lwadd_na r6, r1, 4}
 	{ slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned }
 
@@ -512,26 +471,6 @@ EX: { swadd r0, r13, 4; addi r2, r2, -32 }
 	/* Move r1 back to the point where it corresponds to r0. */
 	{ addi r1, r1, -4 }
 
-#else /* !CHIP_HAS_DWORD_ALIGN() */
-
-	/* Compute right/left shift counts and load initial source words. */
-	{ andi r5, r1, -4; andi r3, r1, 3 }
-EX:	{ lw r6, r5; addi r5, r5, 4; shli r3, r3, 3 }
-EX:	{ lw r7, r5; addi r5, r5, 4; sub r4, zero, r3 }
-
-	/* Load and store one word at a time, using shifts and ORs
-	 * to correct for the misaligned src.
-	 */
-.Lcopy_unaligned_src_loop:
-	{ shr r6, r6, r3; shl r8, r7, r4 }
-EX:	{ lw r7, r5; or r8, r8, r6; move r6, r7 }
-EX:	{ sw r0, r8; addi r0, r0, 4; addi r2, r2, -4 }
-	{ addi r5, r5, 4; slti_u r8, r2, 8 }
-	{ bzt r8, .Lcopy_unaligned_src_loop; addi r1, r1, 4 }
-
-	{ bz r2, .Lcopy_unaligned_done }
-#endif /* !CHIP_HAS_DWORD_ALIGN() */
-
 	/* Fall through */
 
 /*
diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c
deleted file mode 100644
index 0290c222847b..000000000000
--- a/arch/tile/lib/memcpy_tile64.c
+++ /dev/null
@@ -1,280 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation, version 2.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for
- * more details.
- */
-
-#include <linux/string.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <asm/fixmap.h>
-#include <asm/kmap_types.h>
-#include <asm/tlbflush.h>
-#include <hv/hypervisor.h>
-#include <arch/chip.h>
-
-
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-
-/* Defined in memcpy.S */
-extern unsigned long __memcpy_asm(void *to, const void *from, unsigned long n);
-extern unsigned long __copy_to_user_inatomic_asm(
-	void __user *to, const void *from, unsigned long n);
-extern unsigned long __copy_from_user_inatomic_asm(
-	void *to, const void __user *from, unsigned long n);
-extern unsigned long __copy_from_user_zeroing_asm(
-	void *to, const void __user *from, unsigned long n);
-
-typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long);
-
-/* Size above which to consider TLB games for performance */
-#define LARGE_COPY_CUTOFF 2048
-
-/* Communicate to the simulator what we are trying to do. */
-#define sim_allow_multiple_caching(b) \
-	__insn_mtspr(SPR_SIM_CONTROL, \
-	 SIM_CONTROL_ALLOW_MULTIPLE_CACHING | ((b) << _SIM_CONTROL_OPERATOR_BITS))
-
-/*
- * Copy memory by briefly enabling incoherent cacheline-at-a-time mode.
- *
- * We set up our own source and destination PTEs that we fully control.
- * This is the only way to guarantee that we don't race with another
- * thread that is modifying the PTE; we can't afford to try the
- * copy_{to,from}_user() technique of catching the interrupt, since
- * we must run with interrupts disabled to avoid the risk of some
- * other code seeing the incoherent data in our cache. (Recall that
- * our cache is indexed by PA, so even if the other code doesn't use
- * our kmap_atomic virtual addresses, they'll still hit in cache using
- * the normal VAs that aren't supposed to hit in cache.)
- */
-static void memcpy_multicache(void *dest, const void *source,
-			      pte_t dst_pte, pte_t src_pte, int len)
-{
-	int idx;
-	unsigned long flags, newsrc, newdst;
-	pmd_t *pmdp;
-	pte_t *ptep;
-	int type0, type1;
-	int cpu = smp_processor_id();
-
-	/*
-	 * Disable interrupts so that we don't recurse into memcpy()
-	 * in an interrupt handler, nor accidentally reference
-	 * the PA of the source from an interrupt routine. Also
-	 * notify the simulator that we're playing games so we don't
-	 * generate spurious coherency warnings.
-	 */
-	local_irq_save(flags);
-	sim_allow_multiple_caching(1);
-
-	/* Set up the new dest mapping */
-	type0 = kmap_atomic_idx_push();
-	idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + type0;
-	newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1));
-	pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst);
-	ptep = pte_offset_kernel(pmdp, newdst);
-	if (pte_val(*ptep) != pte_val(dst_pte)) {
-		set_pte(ptep, dst_pte);
-		local_flush_tlb_page(NULL, newdst, PAGE_SIZE);
-	}
-
-	/* Set up the new source mapping */
-	type1 = kmap_atomic_idx_push();
-	idx += (type0 - type1);
-	src_pte = hv_pte_set_nc(src_pte);
-	src_pte = hv_pte_clear_writable(src_pte); /* be paranoid */
-	newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
-	pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc);
-	ptep = pte_offset_kernel(pmdp, newsrc);
-	__set_pte(ptep, src_pte); /* set_pte() would be confused by this */
-	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
-
-	/* Actually move the data. */
-	__memcpy_asm((void *)newdst, (const void *)newsrc, len);
-
-	/*
-	 * Remap the source as locally-cached and not OLOC'ed so that
-	 * we can inval without also invaling the remote cpu's cache.
-	 * This also avoids known errata with inv'ing cacheable oloc data.
-	 */
-	src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3);
-	src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */
-	__set_pte(ptep, src_pte); /* set_pte() would be confused by this */
-	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
-
-	/*
-	 * Do the actual invalidation, covering the full L2 cache line
-	 * at the end since __memcpy_asm() is somewhat aggressive.
-	 */
-	__inv_buffer((void *)newsrc, len);
-
-	/*
-	 * We're done: notify the simulator that all is back to normal,
-	 * and re-enable interrupts and pre-emption.
-	 */
-	kmap_atomic_idx_pop();
-	kmap_atomic_idx_pop();
-	sim_allow_multiple_caching(0);
-	local_irq_restore(flags);
-}
-
-/*
- * Identify large copies from remotely-cached memory, and copy them
- * via memcpy_multicache() if they look good, otherwise fall back
- * to the particular kind of copying passed as the memcpy_t function.
- */
-static unsigned long fast_copy(void *dest, const void *source, int len,
-			       memcpy_t func)
-{
-	int cpu = get_cpu();
-	unsigned long retval;
-
-	/*
-	 * Check if it's big enough to bother with. We may end up doing a
-	 * small copy via TLB manipulation if we're near a page boundary,
-	 * but presumably we'll make it up when we hit the second page.
-	 */
-	while (len >= LARGE_COPY_CUTOFF) {
-		int copy_size, bytes_left_on_page;
-		pte_t *src_ptep, *dst_ptep;
-		pte_t src_pte, dst_pte;
-		struct page *src_page, *dst_page;
-
-		/* Is the source page oloc'ed to a remote cpu? */
-retry_source:
-		src_ptep = virt_to_pte(current->mm, (unsigned long)source);
-		if (src_ptep == NULL)
-			break;
-		src_pte = *src_ptep;
-		if (!hv_pte_get_present(src_pte) ||
-		    !hv_pte_get_readable(src_pte) ||
-		    hv_pte_get_mode(src_pte) != HV_PTE_MODE_CACHE_TILE_L3)
-			break;
-		if (get_remote_cache_cpu(src_pte) == cpu)
-			break;
-		src_page = pfn_to_page(pte_pfn(src_pte));
-		get_page(src_page);
-		if (pte_val(src_pte) != pte_val(*src_ptep)) {
-			put_page(src_page);
-			goto retry_source;
-		}
-		if (pte_huge(src_pte)) {
-			/* Adjust the PTE to correspond to a small page */
-			int pfn = pte_pfn(src_pte);
-			pfn += (((unsigned long)source & (HPAGE_SIZE-1))
-				>> PAGE_SHIFT);
-			src_pte = pfn_pte(pfn, src_pte);
-			src_pte = pte_mksmall(src_pte);
-		}
-
-		/* Is the destination page writable? */
-retry_dest:
-		dst_ptep = virt_to_pte(current->mm, (unsigned long)dest);
-		if (dst_ptep == NULL) {
-			put_page(src_page);
-			break;
-		}
-		dst_pte = *dst_ptep;
-		if (!hv_pte_get_present(dst_pte) ||
-		    !hv_pte_get_writable(dst_pte)) {
-			put_page(src_page);
-			break;
-		}
-		dst_page = pfn_to_page(pte_pfn(dst_pte));
-		if (dst_page == src_page) {
-			/*
-			 * Source and dest are on the same page; this
-			 * potentially exposes us to incoherence if any
-			 * part of src and dest overlap on a cache line.
-			 * Just give up rather than trying to be precise.
-			 */
-			put_page(src_page);
-			break;
-		}
-		get_page(dst_page);
-		if (pte_val(dst_pte) != pte_val(*dst_ptep)) {
-			put_page(dst_page);
-			goto retry_dest;
-		}
-		if (pte_huge(dst_pte)) {
-			/* Adjust the PTE to correspond to a small page */
-			int pfn = pte_pfn(dst_pte);
-			pfn += (((unsigned long)dest & (HPAGE_SIZE-1))
-				>> PAGE_SHIFT);
-			dst_pte = pfn_pte(pfn, dst_pte);
-			dst_pte = pte_mksmall(dst_pte);
-		}
-
-		/* All looks good: create a cachable PTE and copy from it */
-		copy_size = len;
-		bytes_left_on_page =
-			PAGE_SIZE - (((int)source) & (PAGE_SIZE-1));
-		if (copy_size > bytes_left_on_page)
-			copy_size = bytes_left_on_page;
-		bytes_left_on_page =
-			PAGE_SIZE - (((int)dest) & (PAGE_SIZE-1));
-		if (copy_size > bytes_left_on_page)
-			copy_size = bytes_left_on_page;
-		memcpy_multicache(dest, source, dst_pte, src_pte, copy_size);
-
-		/* Release the pages */
-		put_page(dst_page);
-		put_page(src_page);
-
-		/* Continue on the next page */
-		dest += copy_size;
-		source += copy_size;
-		len -= copy_size;
-	}
-
-	retval = func(dest, source, len);
-	put_cpu();
-	return retval;
-}
-
-void *memcpy(void *to, const void *from, __kernel_size_t n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return (void *)__memcpy_asm(to, from, n);
-	else
-		return (void *)fast_copy(to, from, n, __memcpy_asm);
-}
-
-unsigned long __copy_to_user_inatomic(void __user *to, const void *from,
-				      unsigned long n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return __copy_to_user_inatomic_asm(to, from, n);
-	else
-		return fast_copy(to, from, n, __copy_to_user_inatomic_asm);
-}
-
-unsigned long __copy_from_user_inatomic(void *to, const void __user *from,
-					unsigned long n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return __copy_from_user_inatomic_asm(to, from, n);
-	else
-		return fast_copy(to, from, n, __copy_from_user_inatomic_asm);
-}
-
-unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
-				       unsigned long n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return __copy_from_user_zeroing_asm(to, from, n);
-	else
-		return fast_copy(to, from, n, __copy_from_user_zeroing_asm);
-}
-
-#endif /* !CHIP_HAS_COHERENT_LOCAL_CACHE() */
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
index 9a7837d11f7d..2042bfe6595f 100644
--- a/arch/tile/lib/memset_32.c
+++ b/arch/tile/lib/memset_32.c
@@ -23,11 +23,7 @@ void *memset(void *s, int c, size_t n)
 	int n32;
 	uint32_t v16, v32;
 	uint8_t *out8 = s;
-#if !CHIP_HAS_WH64()
-	int ahead32;
-#else
 	int to_align32;
-#endif
 
 	/* Experimentation shows that a trivial tight loop is a win up until
 	 * around a size of 20, where writing a word at a time starts to win.
@@ -58,21 +54,6 @@ void *memset(void *s, int c, size_t n)
 		return s;
 	}
 
-#if !CHIP_HAS_WH64()
-	/* Use a spare issue slot to start prefetching the first cache
-	 * line early. This instruction is free as the store can be buried
-	 * in otherwise idle issue slots doing ALU ops.
-	 */
-	__insn_prefetch(out8);
-
-	/* We prefetch the end so that a short memset that spans two cache
-	 * lines gets some prefetching benefit. Again we believe this is free
-	 * to issue.
-	 */
-	__insn_prefetch(&out8[n - 1]);
-#endif /* !CHIP_HAS_WH64() */
-
-
 	/* Align 'out8'. We know n >= 3 so this won't write past the end. */
 	while (((uintptr_t) out8 & 3) != 0) {
 		*out8++ = c;
@@ -93,90 +74,6 @@ void *memset(void *s, int c, size_t n)
 	/* This must be at least 8 or the following loop doesn't work. */
 #define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
 
-#if !CHIP_HAS_WH64()
-
-	ahead32 = CACHE_LINE_SIZE_IN_WORDS;
-
-	/* We already prefetched the first and last cache lines, so
-	 * we only need to do more prefetching if we are storing
-	 * to more than two cache lines.
-	 */
-	if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
-		int i;
-
-		/* Prefetch the next several cache lines.
-		 * This is the setup code for the software-pipelined
-		 * loop below.
-		 */
-#define MAX_PREFETCH 5
-		ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
-		if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
-			ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
-
-		for (i = CACHE_LINE_SIZE_IN_WORDS;
-		     i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
-			__insn_prefetch(&out32[i]);
-	}
-
-	if (n32 > ahead32) {
-		while (1) {
-			int j;
-
-			/* Prefetch by reading one word several cache lines
-			 * ahead. Since loads are non-blocking this will
-			 * cause the full cache line to be read while we are
-			 * finishing earlier cache lines. Using a store
-			 * here causes microarchitectural performance
-			 * problems where a victimizing store miss goes to
-			 * the head of the retry FIFO and locks the pipe for
-			 * a few cycles. So a few subsequent stores in this
-			 * loop go into the retry FIFO, and then later
-			 * stores see other stores to the same cache line
-			 * are already in the retry FIFO and themselves go
-			 * into the retry FIFO, filling it up and grinding
-			 * to a halt waiting for the original miss to be
-			 * satisfied.
-			 */
-			__insn_prefetch(&out32[ahead32]);
-
-#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
-#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
-#endif
-
-			n32 -= CACHE_LINE_SIZE_IN_WORDS;
-
-			/* Save icache space by only partially unrolling
-			 * this loop.
-			 */
-			for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
-				*out32++ = v32;
-				*out32++ = v32;
-				*out32++ = v32;
-				*out32++ = v32;
-			}
-
-			/* To save compiled code size, reuse this loop even
-			 * when we run out of prefetching to do by dropping
-			 * ahead32 down.
-			 */
-			if (n32 <= ahead32) {
-				/* Not even a full cache line left,
-				 * so stop now.
-				 */
-				if (n32 < CACHE_LINE_SIZE_IN_WORDS)
-					break;
-
-				/* Choose a small enough value that we don't
-				 * prefetch past the end. There's no sense
-				 * in touching cache lines we don't have to.
-				 */
-				ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
-			}
-		}
-	}
-
-#else /* CHIP_HAS_WH64() */
-
 	/* Determine how many words we need to emit before the 'out32'
 	 * pointer becomes aligned modulo the cache line size.
 	 */
@@ -233,8 +130,6 @@ void *memset(void *s, int c, size_t n)
 		n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
 	}
 
-#endif /* CHIP_HAS_WH64() */
-
 	/* Now handle any leftover values. */
 	if (n32 != 0) {
 		do {