author     Chris Metcalf <cmetcalf@tilera.com>   2010-05-28 23:09:12 -0400
committer  Chris Metcalf <cmetcalf@tilera.com>   2010-06-04 17:11:18 -0400
commit     867e359b97c970a60626d5d76bbe2a8fadbf38fb (patch)
tree       c5ccbb7f5172e8555977119608ecb1eee3cc37e3 /arch/tile/mm
parent     5360bd776f73d0a7da571d72a09a03f237e99900 (diff)
arch/tile: core support for Tilera 32-bit chips.
This change is the core kernel support for TILEPro and TILE64 chips. No driver support (except the console driver) is included yet. This includes the relevant Linux headers in asm/; the low-level "Tile architecture" headers in arch/, which are shared with the hypervisor, etc., and are build-system agnostic; and the relevant hypervisor headers in hv/.
Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Reviewed-by: Paul Mundt <lethal@linux-sh.org>
Diffstat (limited to 'arch/tile/mm')
-rw-r--r--  arch/tile/mm/Makefile      |    9
-rw-r--r--  arch/tile/mm/elf.c         |  164
-rw-r--r--  arch/tile/mm/extable.c     |   30
-rw-r--r--  arch/tile/mm/fault.c       |  905
-rw-r--r--  arch/tile/mm/highmem.c     |  328
-rw-r--r--  arch/tile/mm/homecache.c   |  445
-rw-r--r--  arch/tile/mm/hugetlbpage.c |  343
-rw-r--r--  arch/tile/mm/init.c        | 1082
-rw-r--r--  arch/tile/mm/migrate.h     |   50
-rw-r--r--  arch/tile/mm/migrate_32.S  |  211
-rw-r--r--  arch/tile/mm/mmap.c        |   75
-rw-r--r--  arch/tile/mm/pgtable.c     |  566
12 files changed, 4208 insertions(+), 0 deletions(-)
diff --git a/arch/tile/mm/Makefile b/arch/tile/mm/Makefile
new file mode 100644
index 000000000000..e252aeddc17d
--- /dev/null
+++ b/arch/tile/mm/Makefile
@@ -0,0 +1,9 @@
1#
2# Makefile for the linux tile-specific parts of the memory manager.
3#
4
5obj-y := init.o pgtable.o fault.o extable.o elf.o \
6 mmap.o homecache.o migrate_$(BITS).o
7
8obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
9obj-$(CONFIG_HIGHMEM) += highmem.o
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
new file mode 100644
index 000000000000..818c9bef060c
--- /dev/null
+++ b/arch/tile/mm/elf.c
@@ -0,0 +1,164 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/binfmts.h>
18#include <linux/compat.h>
19#include <linux/mman.h>
20#include <linux/elf.h>
21#include <asm/pgtable.h>
22#include <asm/pgalloc.h>
23
24/* Notify a running simulator, if any, that an exec just occurred. */
25static void sim_notify_exec(const char *binary_name)
26{
27 unsigned char c;
28 do {
29 c = *binary_name++;
30 __insn_mtspr(SPR_SIM_CONTROL,
31 (SIM_CONTROL_OS_EXEC
32 | (c << _SIM_CONTROL_OPERATOR_BITS)));
33
34 } while (c);
35}
36
37static int notify_exec(void)
38{
39 int retval = 0; /* failure */
40 struct vm_area_struct *vma = current->mm->mmap;
41 while (vma) {
42 if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file)
43 break;
44 vma = vma->vm_next;
45 }
46 if (vma) {
47 char *buf = (char *) __get_free_page(GFP_KERNEL);
48 if (buf) {
49 char *path = d_path(&vma->vm_file->f_path,
50 buf, PAGE_SIZE);
51 if (!IS_ERR(path)) {
52 sim_notify_exec(path);
53 retval = 1;
54 }
55 free_page((unsigned long)buf);
56 }
57 }
58 return retval;
59}
60
61/* Notify a running simulator, if any, that we loaded an interpreter. */
62static void sim_notify_interp(unsigned long load_addr)
63{
64 size_t i;
65 for (i = 0; i < sizeof(load_addr); i++) {
66 unsigned char c = load_addr >> (i * 8);
67 __insn_mtspr(SPR_SIM_CONTROL,
68 (SIM_CONTROL_OS_INTERP
69 | (c << _SIM_CONTROL_OPERATOR_BITS)));
70 }
71}
72
73
74/* Kernel address of page used to map read-only kernel data into userspace. */
75static void *vdso_page;
76
77/* One-entry array used for install_special_mapping. */
78static struct page *vdso_pages[1];
79
80int __init vdso_setup(void)
81{
82 extern char __rt_sigreturn[], __rt_sigreturn_end[];
83 vdso_page = (void *)get_zeroed_page(GFP_ATOMIC);
84 memcpy(vdso_page, __rt_sigreturn, __rt_sigreturn_end - __rt_sigreturn);
85 vdso_pages[0] = virt_to_page(vdso_page);
86 return 0;
87}
88device_initcall(vdso_setup);
89
90const char *arch_vma_name(struct vm_area_struct *vma)
91{
92 if (vma->vm_private_data == vdso_pages)
93 return "[vdso]";
94#ifndef __tilegx__
95 if (vma->vm_start == MEM_USER_INTRPT)
96 return "[intrpt]";
97#endif
98 return NULL;
99}
100
101int arch_setup_additional_pages(struct linux_binprm *bprm,
102 int executable_stack)
103{
104 struct mm_struct *mm = current->mm;
105 unsigned long vdso_base;
106 int retval = 0;
107
108 /*
109 * Notify the simulator that an exec just occurred.
110 * If we can't find the filename of the mapping, just use
111 * whatever was passed as the linux_binprm filename.
112 */
113 if (!notify_exec())
114 sim_notify_exec(bprm->filename);
115
116 down_write(&mm->mmap_sem);
117
118 /*
119 * MAYWRITE to allow gdb to COW and set breakpoints
120 *
121 * Make sure the vDSO gets into every core dump. Dumping its
122 * contents makes post-mortem crash analysis fully interpretable
123 * later without having to match up the same kernel and hardware
124 * config to see what the PC values meant.
125 */
126 vdso_base = VDSO_BASE;
127 retval = install_special_mapping(mm, vdso_base, PAGE_SIZE,
128 VM_READ|VM_EXEC|
129 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
130 VM_ALWAYSDUMP,
131 vdso_pages);
132
133#ifndef __tilegx__
134 /*
135 * Set up a user-interrupt mapping here; the user can't
136 * create one themselves since it is above TASK_SIZE.
137 * We make it unwritable by default, so the model for adding
138 * interrupt vectors always involves an mprotect.
139 */
140 if (!retval) {
141 unsigned long addr = MEM_USER_INTRPT;
142 addr = mmap_region(NULL, addr, INTRPT_SIZE,
143 MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
144 VM_READ|VM_EXEC|
145 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
146 if (addr > (unsigned long) -PAGE_SIZE)
147 retval = (int) addr;
148 }
149#endif
150
151 up_write(&mm->mmap_sem);
152
153 return retval;
154}
155
156
157void elf_plat_init(struct pt_regs *regs, unsigned long load_addr)
158{
159 /* Zero all registers. */
160 memset(regs, 0, sizeof(*regs));
161
162 /* Report the interpreter's load address. */
163 sim_notify_interp(load_addr);
164}
diff --git a/arch/tile/mm/extable.c b/arch/tile/mm/extable.c
new file mode 100644
index 000000000000..4fb0acb9d154
--- /dev/null
+++ b/arch/tile/mm/extable.c
@@ -0,0 +1,30 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/module.h>
16#include <linux/spinlock.h>
17#include <linux/uaccess.h>
18
19int fixup_exception(struct pt_regs *regs)
20{
21 const struct exception_table_entry *fixup;
22
23 fixup = search_exception_tables(regs->pc);
24 if (fixup) {
25 regs->pc = fixup->fixup;
26 return 1;
27 }
28
29 return 0;
30}
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
new file mode 100644
index 000000000000..9b6b92f07def
--- /dev/null
+++ b/arch/tile/mm/fault.c
@@ -0,0 +1,905 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 *
14 * From i386 code copyright (C) 1995 Linus Torvalds
15 */
16
17#include <linux/signal.h>
18#include <linux/sched.h>
19#include <linux/kernel.h>
20#include <linux/errno.h>
21#include <linux/string.h>
22#include <linux/types.h>
23#include <linux/ptrace.h>
24#include <linux/mman.h>
25#include <linux/mm.h>
26#include <linux/smp.h>
27#include <linux/smp_lock.h>
28#include <linux/interrupt.h>
29#include <linux/init.h>
30#include <linux/tty.h>
31#include <linux/vt_kern.h> /* For unblank_screen() */
32#include <linux/highmem.h>
33#include <linux/module.h>
34#include <linux/kprobes.h>
35#include <linux/hugetlb.h>
36#include <linux/syscalls.h>
37#include <linux/uaccess.h>
38
39#include <asm/system.h>
40#include <asm/pgalloc.h>
41#include <asm/sections.h>
42
43#include <arch/interrupts.h>
44
45/*
46 * Unlock any spinlocks which will prevent us from getting the
47 * message out
48 */
49void bust_spinlocks(int yes)
50{
51 int loglevel_save = console_loglevel;
52
53 if (yes) {
54 oops_in_progress = 1;
55 return;
56 }
57 oops_in_progress = 0;
58 /*
59 * OK, the message is on the console. Now we call printk()
60 * without oops_in_progress set so that printk will give klogd
61 * a poke. Hold onto your hats...
62 */
63 console_loglevel = 15; /* NMI oopser may have shut the console up */
64 printk(" ");
65 console_loglevel = loglevel_save;
66}
67
68static noinline void force_sig_info_fault(int si_signo, int si_code,
69 unsigned long address, int fault_num, struct task_struct *tsk)
70{
71 siginfo_t info;
72
73 if (unlikely(tsk->pid < 2)) {
74 panic("Signal %d (code %d) at %#lx sent to %s!",
75 si_signo, si_code & 0xffff, address,
76 tsk->pid ? "init" : "the idle task");
77 }
78
79 info.si_signo = si_signo;
80 info.si_errno = 0;
81 info.si_code = si_code;
82 info.si_addr = (void __user *)address;
83 info.si_trapno = fault_num;
84 force_sig_info(si_signo, &info, tsk);
85}
86
87#ifndef __tilegx__
88/*
89 * Synthesize the fault a PL0 process would get by doing a word-load of
90 * an unaligned address or a high kernel address. Called indirectly
91 * from sys_cmpxchg() in kernel/intvec.S.
92 */
93int _sys_cmpxchg_badaddr(unsigned long address, struct pt_regs *regs)
94{
95 if (address >= PAGE_OFFSET)
96 force_sig_info_fault(SIGSEGV, SEGV_MAPERR, address,
97 INT_DTLB_MISS, current);
98 else
99 force_sig_info_fault(SIGBUS, BUS_ADRALN, address,
100 INT_UNALIGN_DATA, current);
101
102 /*
103 * Adjust pc to point at the actual instruction, which syscalls
104 * normally don't do, but which is appropriate here since we are
105 * claiming that a syscall swint1 caused a page fault or bus error.
106 */
107 regs->pc -= 8;
108
109 /*
110 * Mark this as a caller-save interrupt, like a normal page fault,
111 * so that when we go through the signal handler path we will
112 * properly restore r0, r1, and r2 for the signal handler arguments.
113 */
114 regs->flags |= PT_FLAGS_CALLER_SAVES;
115
116 return 0;
117}
118#endif
119
120static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
121{
122 unsigned index = pgd_index(address);
123 pgd_t *pgd_k;
124 pud_t *pud, *pud_k;
125 pmd_t *pmd, *pmd_k;
126
127 pgd += index;
128 pgd_k = init_mm.pgd + index;
129
130 if (!pgd_present(*pgd_k))
131 return NULL;
132
133 pud = pud_offset(pgd, address);
134 pud_k = pud_offset(pgd_k, address);
135 if (!pud_present(*pud_k))
136 return NULL;
137
138 pmd = pmd_offset(pud, address);
139 pmd_k = pmd_offset(pud_k, address);
140 if (!pmd_present(*pmd_k))
141 return NULL;
142 if (!pmd_present(*pmd)) {
143 set_pmd(pmd, *pmd_k);
144 arch_flush_lazy_mmu_mode();
145 } else
146 BUG_ON(pmd_ptfn(*pmd) != pmd_ptfn(*pmd_k));
147 return pmd_k;
148}
149
150/*
151 * Handle a fault on the vmalloc or module mapping area
152 */
153static inline int vmalloc_fault(pgd_t *pgd, unsigned long address)
154{
155 pmd_t *pmd_k;
156 pte_t *pte_k;
157
158 /* Make sure we are in vmalloc area */
159 if (!(address >= VMALLOC_START && address < VMALLOC_END))
160 return -1;
161
162 /*
163 * Synchronize this task's top level page-table
164 * with the 'reference' page table.
165 */
166 pmd_k = vmalloc_sync_one(pgd, address);
167 if (!pmd_k)
168 return -1;
169 if (pmd_huge(*pmd_k))
170 return 0; /* support TILE huge_vmap() API */
171 pte_k = pte_offset_kernel(pmd_k, address);
172 if (!pte_present(*pte_k))
173 return -1;
174 return 0;
175}
176
177/* Wait until this PTE has completed migration. */
178static void wait_for_migration(pte_t *pte)
179{
180 if (pte_migrating(*pte)) {
181 /*
182 * Wait until the migrator fixes up this pte.
183 * We scale the loop count by the clock rate so we'll wait for
184 * a few seconds here.
185 */
186 int retries = 0;
187 int bound = get_clock_rate();
188 while (pte_migrating(*pte)) {
189 barrier();
190 if (++retries > bound)
191 panic("Hit migrating PTE (%#llx) and"
192 " page PFN %#lx still migrating",
193 pte->val, pte_pfn(*pte));
194 }
195 }
196}
197
198/*
199 * It's not generally safe to use "current" to get the page table pointer,
200 * since we might be running an oprofile interrupt in the middle of a
201 * task switch.
202 */
203static pgd_t *get_current_pgd(void)
204{
205 HV_Context ctx = hv_inquire_context();
206 unsigned long pgd_pfn = ctx.page_table >> PAGE_SHIFT;
207 struct page *pgd_page = pfn_to_page(pgd_pfn);
208 BUG_ON(PageHighMem(pgd_page)); /* oops, HIGHPTE? */
209 return (pgd_t *) __va(ctx.page_table);
210}
211
212/*
213 * We can receive a page fault from a migrating PTE at any time.
214 * Handle it by just waiting until the fault resolves.
215 *
216 * It's also possible to get a migrating kernel PTE that resolves
217 * itself during the downcall from hypervisor to Linux. We just check
218 * here to see if the PTE seems valid, and if so we retry it.
219 *
220 * NOTE! We MUST NOT take any locks for this case. We may be in an
221 * interrupt or a critical region, and must do as little as possible.
222 * Similarly, we can't use atomic ops here, since we may be handling a
223 * fault caused by an atomic op access.
224 */
225static int handle_migrating_pte(pgd_t *pgd, int fault_num,
226 unsigned long address,
227 int is_kernel_mode, int write)
228{
229 pud_t *pud;
230 pmd_t *pmd;
231 pte_t *pte;
232 pte_t pteval;
233
234 if (pgd_addr_invalid(address))
235 return 0;
236
237 pgd += pgd_index(address);
238 pud = pud_offset(pgd, address);
239 if (!pud || !pud_present(*pud))
240 return 0;
241 pmd = pmd_offset(pud, address);
242 if (!pmd || !pmd_present(*pmd))
243 return 0;
244 pte = pmd_huge_page(*pmd) ? ((pte_t *)pmd) :
245 pte_offset_kernel(pmd, address);
246 pteval = *pte;
247 if (pte_migrating(pteval)) {
248 wait_for_migration(pte);
249 return 1;
250 }
251
252 if (!is_kernel_mode || !pte_present(pteval))
253 return 0;
254 if (fault_num == INT_ITLB_MISS) {
255 if (pte_exec(pteval))
256 return 1;
257 } else if (write) {
258 if (pte_write(pteval))
259 return 1;
260 } else {
261 if (pte_read(pteval))
262 return 1;
263 }
264
265 return 0;
266}
267
268/*
269 * This routine is responsible for faulting in user pages.
270 * It passes the work off to one of the appropriate routines.
271 * It returns true if the fault was successfully handled.
272 */
273static int handle_page_fault(struct pt_regs *regs,
274 int fault_num,
275 int is_page_fault,
276 unsigned long address,
277 int write)
278{
279 struct task_struct *tsk;
280 struct mm_struct *mm;
281 struct vm_area_struct *vma;
282 unsigned long stack_offset;
283 int fault;
284 int si_code;
285 int is_kernel_mode;
286 pgd_t *pgd;
287
288 /* on TILE, protection faults are always writes */
289 if (!is_page_fault)
290 write = 1;
291
292 is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);
293
294 tsk = validate_current();
295
296 /*
297 * Check to see if we might be overwriting the stack, and bail
298 * out if so. The page fault code is a relatively likely
299 * place to get trapped in an infinite regress, and once we
300 * overwrite the whole stack, it becomes very hard to recover.
301 */
302 stack_offset = stack_pointer & (THREAD_SIZE-1);
303 if (stack_offset < THREAD_SIZE / 8) {
304 printk(KERN_ALERT "Potential stack overrun: sp %#lx\n",
305 stack_pointer);
306 show_regs(regs);
307 printk(KERN_ALERT "Killing current process %d/%s\n",
308 tsk->pid, tsk->comm);
309 do_group_exit(SIGKILL);
310 }
311
312 /*
313 * Early on, we need to check for migrating PTE entries;
314 * see homecache.c. If we find a migrating PTE, we wait until
315 * the backing page claims to be done migrating, then we proceed.
316 * For kernel PTEs, we rewrite the PTE and return and retry.
317 * Otherwise, we treat the fault like a normal "no PTE" fault,
318 * rather than trying to patch up the existing PTE.
319 */
320 pgd = get_current_pgd();
321 if (handle_migrating_pte(pgd, fault_num, address,
322 is_kernel_mode, write))
323 return 1;
324
325 si_code = SEGV_MAPERR;
326
327 /*
328 * We fault-in kernel-space virtual memory on-demand. The
329 * 'reference' page table is init_mm.pgd.
330 *
331 * NOTE! We MUST NOT take any locks for this case. We may
332 * be in an interrupt or a critical region, and should
333 * only copy the information from the master page table,
334 * nothing more.
335 *
336 * This verifies that the fault happens in kernel space
337 * and that the fault was not a protection fault.
338 */
339 if (unlikely(address >= TASK_SIZE &&
340 !is_arch_mappable_range(address, 0))) {
341 if (is_kernel_mode && is_page_fault &&
342 vmalloc_fault(pgd, address) >= 0)
343 return 1;
344 /*
345 * Don't take the mm semaphore here. If we fixup a prefetch
346 * fault we could otherwise deadlock.
347 */
348 mm = NULL; /* happy compiler */
349 vma = NULL;
350 goto bad_area_nosemaphore;
351 }
352
353 /*
354 * If we're trying to touch user-space addresses, we must
355 * be either at PL0, or else with interrupts enabled in the
356 * kernel, so either way we can re-enable interrupts here.
357 */
358 local_irq_enable();
359
360 mm = tsk->mm;
361
362 /*
363 * If we're in an interrupt, have no user context or are running in an
364 * atomic region then we must not take the fault.
365 */
366 if (in_atomic() || !mm) {
367 vma = NULL; /* happy compiler */
368 goto bad_area_nosemaphore;
369 }
370
371 /*
372 * When running in the kernel we expect faults to occur only to
373 * addresses in user space. All other faults represent errors in the
374 * kernel and should generate an OOPS. Unfortunately, in the case of an
375 * erroneous fault occurring in a code path which already holds mmap_sem
376 * we will deadlock attempting to validate the fault against the
377 * address space. Luckily the kernel only validly references user
378 * space from well defined areas of code, which are listed in the
379 * exceptions table.
380 *
381 * As the vast majority of faults will be valid we will only perform
382 * the source reference check when there is a possibility of a deadlock.
383 * Attempt to lock the address space, if we cannot we then validate the
384 * source. If this is invalid we can skip the address space check,
385 * thus avoiding the deadlock.
386 */
387 if (!down_read_trylock(&mm->mmap_sem)) {
388 if (is_kernel_mode &&
389 !search_exception_tables(regs->pc)) {
390 vma = NULL; /* happy compiler */
391 goto bad_area_nosemaphore;
392 }
393 down_read(&mm->mmap_sem);
394 }
395
396 vma = find_vma(mm, address);
397 if (!vma)
398 goto bad_area;
399 if (vma->vm_start <= address)
400 goto good_area;
401 if (!(vma->vm_flags & VM_GROWSDOWN))
402 goto bad_area;
403 if (regs->sp < PAGE_OFFSET) {
404 /*
405 * accessing the stack below sp is always a bug.
406 */
407 if (address < regs->sp)
408 goto bad_area;
409 }
410 if (expand_stack(vma, address))
411 goto bad_area;
412
413/*
414 * Ok, we have a good vm_area for this memory access, so
415 * we can handle it..
416 */
417good_area:
418 si_code = SEGV_ACCERR;
419 if (fault_num == INT_ITLB_MISS) {
420 if (!(vma->vm_flags & VM_EXEC))
421 goto bad_area;
422 } else if (write) {
423#ifdef TEST_VERIFY_AREA
424 if (!is_page_fault && regs->cs == KERNEL_CS)
425 printk("WP fault at "REGFMT"\n", regs->eip);
426#endif
427 if (!(vma->vm_flags & VM_WRITE))
428 goto bad_area;
429 } else {
430 if (!is_page_fault || !(vma->vm_flags & VM_READ))
431 goto bad_area;
432 }
433
434 survive:
435 /*
436 * If for any reason at all we couldn't handle the fault,
437 * make sure we exit gracefully rather than endlessly redo
438 * the fault.
439 */
440 fault = handle_mm_fault(mm, vma, address, write);
441 if (unlikely(fault & VM_FAULT_ERROR)) {
442 if (fault & VM_FAULT_OOM)
443 goto out_of_memory;
444 else if (fault & VM_FAULT_SIGBUS)
445 goto do_sigbus;
446 BUG();
447 }
448 if (fault & VM_FAULT_MAJOR)
449 tsk->maj_flt++;
450 else
451 tsk->min_flt++;
452
453 /*
454 * If this was an asynchronous fault,
455 * restart the appropriate engine.
456 */
457 switch (fault_num) {
458#if CHIP_HAS_TILE_DMA()
459 case INT_DMATLB_MISS:
460 case INT_DMATLB_MISS_DWNCL:
461 case INT_DMATLB_ACCESS:
462 case INT_DMATLB_ACCESS_DWNCL:
463 __insn_mtspr(SPR_DMA_CTR, SPR_DMA_CTR__REQUEST_MASK);
464 break;
465#endif
466#if CHIP_HAS_SN_PROC()
467 case INT_SNITLB_MISS:
468 case INT_SNITLB_MISS_DWNCL:
469 __insn_mtspr(SPR_SNCTL,
470 __insn_mfspr(SPR_SNCTL) &
471 ~SPR_SNCTL__FRZPROC_MASK);
472 break;
473#endif
474 }
475
476 up_read(&mm->mmap_sem);
477 return 1;
478
479/*
480 * Something tried to access memory that isn't in our memory map..
481 * Fix it, but check if it's kernel or user first..
482 */
483bad_area:
484 up_read(&mm->mmap_sem);
485
486bad_area_nosemaphore:
487 /* User mode accesses just cause a SIGSEGV */
488 if (!is_kernel_mode) {
489 /*
490 * It's possible to have interrupts off here.
491 */
492 local_irq_enable();
493
494 force_sig_info_fault(SIGSEGV, si_code, address,
495 fault_num, tsk);
496 return 0;
497 }
498
499no_context:
500 /* Are we prepared to handle this kernel fault? */
501 if (fixup_exception(regs))
502 return 0;
503
504/*
505 * Oops. The kernel tried to access some bad page. We'll have to
506 * terminate things with extreme prejudice.
507 */
508
509 bust_spinlocks(1);
510
511 /* FIXME: no lookup_address() yet */
512#ifdef SUPPORT_LOOKUP_ADDRESS
513 if (fault_num == INT_ITLB_MISS) {
514 pte_t *pte = lookup_address(address);
515
516 if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
517 printk(KERN_CRIT "kernel tried to execute"
518 " non-executable page - exploit attempt?"
519 " (uid: %d)\n", current->uid);
520 }
521#endif
522 if (address < PAGE_SIZE)
523 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference\n");
524 else
525 printk(KERN_ALERT "Unable to handle kernel paging request\n");
526 printk(" at virtual address "REGFMT", pc "REGFMT"\n",
527 address, regs->pc);
528
529 show_regs(regs);
530
531 if (unlikely(tsk->pid < 2)) {
532 panic("Kernel page fault running %s!",
533 tsk->pid ? "init" : "the idle task");
534 }
535
536 /*
537 * More FIXME: we should probably copy the i386 here and
538 * implement a generic die() routine. Not today.
539 */
540#ifdef SUPPORT_DIE
541 die("Oops", regs);
542#endif
543 bust_spinlocks(0);
544
545 do_group_exit(SIGKILL);
546
547/*
548 * We ran out of memory, or some other thing happened to us that made
549 * us unable to handle the page fault gracefully.
550 */
551out_of_memory:
552 up_read(&mm->mmap_sem);
553 if (is_global_init(tsk)) {
554 yield();
555 down_read(&mm->mmap_sem);
556 goto survive;
557 }
558 printk("VM: killing process %s\n", tsk->comm);
559 if (!is_kernel_mode)
560 do_group_exit(SIGKILL);
561 goto no_context;
562
563do_sigbus:
564 up_read(&mm->mmap_sem);
565
566 /* Kernel mode? Handle exceptions or die */
567 if (is_kernel_mode)
568 goto no_context;
569
570 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, fault_num, tsk);
571 return 0;
572}
573
574#ifndef __tilegx__
575
576extern char sys_cmpxchg[], __sys_cmpxchg_end[];
577extern char __sys_cmpxchg_grab_lock[];
578extern char __start_atomic_asm_code[], __end_atomic_asm_code[];
579
580/*
581 * We return this structure in registers to avoid having to write
582 * additional save/restore code in the intvec.S caller.
583 */
584struct intvec_state {
585 void *handler;
586 unsigned long vecnum;
587 unsigned long fault_num;
588 unsigned long info;
589 unsigned long retval;
590};
591
592/* We must release ICS before panicking or we won't get anywhere. */
593#define ics_panic(fmt, ...) do { \
594 __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); \
595 panic(fmt, __VA_ARGS__); \
596} while (0)
597
598void do_page_fault(struct pt_regs *regs, int fault_num,
599 unsigned long address, unsigned long write);
600
601/*
602 * When we take an ITLB or DTLB fault or access violation in the
603 * supervisor while the critical section bit is set, the hypervisor is
604 * reluctant to write new values into the EX_CONTEXT_1_x registers,
605 * since that might indicate we have not yet squirreled the SPR
606 * contents away and thus cannot yet safely take a recursive interrupt.
607 * Accordingly, the hypervisor passes us the PC via SYSTEM_SAVE_1_2.
608 */
609struct intvec_state do_page_fault_ics(struct pt_regs *regs, int fault_num,
610 unsigned long address,
611 unsigned long info)
612{
613 unsigned long pc = info & ~1;
614 int write = info & 1;
615 pgd_t *pgd = get_current_pgd();
616
617 /* Retval is 1 at first since we will handle the fault fully. */
618 struct intvec_state state = {
619 do_page_fault, fault_num, address, write, 1
620 };
621
622 /* Validate that we are plausibly in the right routine. */
623 if ((pc & 0x7) != 0 || pc < PAGE_OFFSET ||
624 (fault_num != INT_DTLB_MISS &&
625 fault_num != INT_DTLB_ACCESS)) {
626 unsigned long old_pc = regs->pc;
627 regs->pc = pc;
628 ics_panic("Bad ICS page fault args:"
629 " old PC %#lx, fault %d/%d at %#lx\n",
630 old_pc, fault_num, write, address);
631 }
632
633 /* We might be faulting on a vmalloc page, so check that first. */
634 if (fault_num != INT_DTLB_ACCESS && vmalloc_fault(pgd, address) >= 0)
635 return state;
636
637 /*
638 * If we faulted with ICS set in sys_cmpxchg, we are providing
639 * a user syscall service that should generate a signal on
640 * fault. We didn't set up a kernel stack on initial entry to
641 * sys_cmpxchg, but instead had one set up by the fault, which
642 * (because sys_cmpxchg never releases ICS) came to us via the
643 * SYSTEM_SAVE_1_2 mechanism, and thus EX_CONTEXT_1_[01] are
644 * still referencing the original user code. We release the
645 * atomic lock and rewrite pt_regs so that it appears that we
646 * came from user-space directly, and after we finish the
647 * fault we'll go back to user space and re-issue the swint.
648 * This way the backtrace information is correct if we need to
649 * emit a stack dump at any point while handling this.
650 *
651 * Must match register use in sys_cmpxchg().
652 */
653 if (pc >= (unsigned long) sys_cmpxchg &&
654 pc < (unsigned long) __sys_cmpxchg_end) {
655#ifdef CONFIG_SMP
656 /* Don't unlock before we could have locked. */
657 if (pc >= (unsigned long)__sys_cmpxchg_grab_lock) {
658 int *lock_ptr = (int *)(regs->regs[ATOMIC_LOCK_REG]);
659 __atomic_fault_unlock(lock_ptr);
660 }
661#endif
662 regs->sp = regs->regs[27];
663 }
664
665 /*
666 * We can also fault in the atomic assembly, in which
667 * case we use the exception table to do the first-level fixup.
668 * We may re-fixup again in the real fault handler if it
669 * turns out the faulting address is just bad, and not,
670 * for example, migrating.
671 */
672 else if (pc >= (unsigned long) __start_atomic_asm_code &&
673 pc < (unsigned long) __end_atomic_asm_code) {
674 const struct exception_table_entry *fixup;
675#ifdef CONFIG_SMP
676 /* Unlock the atomic lock. */
677 int *lock_ptr = (int *)(regs->regs[ATOMIC_LOCK_REG]);
678 __atomic_fault_unlock(lock_ptr);
679#endif
680 fixup = search_exception_tables(pc);
681 if (!fixup)
682 ics_panic("ICS atomic fault not in table:"
683 " PC %#lx, fault %d", pc, fault_num);
684 regs->pc = fixup->fixup;
685 regs->ex1 = PL_ICS_EX1(KERNEL_PL, 0);
686 }
687
688 /*
689 * NOTE: the one other type of access that might bring us here
690 * are the memory ops in __tns_atomic_acquire/__tns_atomic_release,
691 * but we don't have to check specially for them since we can
692 * always safely return to the address of the fault and retry,
693 * since no separate atomic locks are involved.
694 */
695
696 /*
697 * Now that we have released the atomic lock (if necessary),
698 * it's safe to spin if the PTE that caused the fault was migrating.
699 */
700 if (fault_num == INT_DTLB_ACCESS)
701 write = 1;
702 if (handle_migrating_pte(pgd, fault_num, address, 1, write))
703 return state;
704
705 /* Return zero so that we continue on with normal fault handling. */
706 state.retval = 0;
707 return state;
708}
709
710#endif /* !__tilegx__ */
711
712/*
713 * This routine handles page faults. It determines the address, and the
714 * problem, and then passes it to handle_page_fault() for normal DTLB and
715 * ITLB issues, and for DMA or SN processor faults when we are in user
716 * space. For the latter, if we're in kernel mode, we just save the
717 * interrupt away appropriately and return immediately. We can't do
718 * page faults for user code while in kernel mode.
719 */
720void do_page_fault(struct pt_regs *regs, int fault_num,
721 unsigned long address, unsigned long write)
722{
723 int is_page_fault;
724
725 /* This case should have been handled by do_page_fault_ics(). */
726 BUG_ON(write & ~1);
727
728#if CHIP_HAS_TILE_DMA()
729 /*
730 * If it's a DMA fault, suspend the transfer while we're
731 * handling the miss; we'll restart after it's handled. If we
732 * don't suspend, it's possible that this process could swap
733 * out and back in, and restart the engine since the DMA is
734 * still 'running'.
735 */
736 if (fault_num == INT_DMATLB_MISS ||
737 fault_num == INT_DMATLB_ACCESS ||
738 fault_num == INT_DMATLB_MISS_DWNCL ||
739 fault_num == INT_DMATLB_ACCESS_DWNCL) {
740 __insn_mtspr(SPR_DMA_CTR, SPR_DMA_CTR__SUSPEND_MASK);
741 while (__insn_mfspr(SPR_DMA_USER_STATUS) &
742 SPR_DMA_STATUS__BUSY_MASK)
743 ;
744 }
745#endif
746
747 /* Validate fault num and decide if this is a first-time page fault. */
748 switch (fault_num) {
749 case INT_ITLB_MISS:
750 case INT_DTLB_MISS:
751#if CHIP_HAS_TILE_DMA()
752 case INT_DMATLB_MISS:
753 case INT_DMATLB_MISS_DWNCL:
754#endif
755#if CHIP_HAS_SN_PROC()
756 case INT_SNITLB_MISS:
757 case INT_SNITLB_MISS_DWNCL:
758#endif
759 is_page_fault = 1;
760 break;
761
762 case INT_DTLB_ACCESS:
763#if CHIP_HAS_TILE_DMA()
764 case INT_DMATLB_ACCESS:
765 case INT_DMATLB_ACCESS_DWNCL:
766#endif
767 is_page_fault = 0;
768 break;
769
770 default:
771 panic("Bad fault number %d in do_page_fault", fault_num);
772 }
773
774 if (EX1_PL(regs->ex1) != USER_PL) {
775 struct async_tlb *async;
776 switch (fault_num) {
777#if CHIP_HAS_TILE_DMA()
778 case INT_DMATLB_MISS:
779 case INT_DMATLB_ACCESS:
780 case INT_DMATLB_MISS_DWNCL:
781 case INT_DMATLB_ACCESS_DWNCL:
782 async = &current->thread.dma_async_tlb;
783 break;
784#endif
785#if CHIP_HAS_SN_PROC()
786 case INT_SNITLB_MISS:
787 case INT_SNITLB_MISS_DWNCL:
788 async = &current->thread.sn_async_tlb;
789 break;
790#endif
791 default:
792 async = NULL;
793 }
794 if (async) {
795
796 /*
797 * No vmalloc check required, so we can allow
798 * interrupts immediately at this point.
799 */
800 local_irq_enable();
801
802 set_thread_flag(TIF_ASYNC_TLB);
803 if (async->fault_num != 0) {
804 panic("Second async fault %d;"
805 " old fault was %d (%#lx/%ld)",
806 fault_num, async->fault_num,
807 address, write);
808 }
809 BUG_ON(fault_num == 0);
810 async->fault_num = fault_num;
811 async->is_fault = is_page_fault;
812 async->is_write = write;
813 async->address = address;
814 return;
815 }
816 }
817
818 handle_page_fault(regs, fault_num, is_page_fault, address, write);
819}
820
821
822#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
823/*
824 * Check an async_tlb structure to see if a deferred fault is waiting,
825 * and if so pass it to the page-fault code.
826 */
827static void handle_async_page_fault(struct pt_regs *regs,
828 struct async_tlb *async)
829{
830 if (async->fault_num) {
831 /*
832 * Clear async->fault_num before calling the page-fault
833 * handler so that if we re-interrupt before returning
834 * from the function we have somewhere to put the
835 * information from the new interrupt.
836 */
837 int fault_num = async->fault_num;
838 async->fault_num = 0;
839 handle_page_fault(regs, fault_num, async->is_fault,
840 async->address, async->is_write);
841 }
842}
843#endif /* CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC() */
844
845
846/*
847 * This routine effectively re-issues asynchronous page faults
848 * when we are returning to user space.
849 */
850void do_async_page_fault(struct pt_regs *regs)
851{
852 /*
853 * Clear thread flag early. If we re-interrupt while processing
854 * code here, we will reset it and recall this routine before
855 * returning to user space.
856 */
857 clear_thread_flag(TIF_ASYNC_TLB);
858
859#if CHIP_HAS_TILE_DMA()
860 handle_async_page_fault(regs, &current->thread.dma_async_tlb);
861#endif
862#if CHIP_HAS_SN_PROC()
863 handle_async_page_fault(regs, &current->thread.sn_async_tlb);
864#endif
865}
866
867void vmalloc_sync_all(void)
868{
869#ifdef __tilegx__
870 /* Currently all L1 kernel pmd's are static and shared. */
871 BUG_ON(pgd_index(VMALLOC_END) != pgd_index(VMALLOC_START));
872#else
873 /*
874 * Note that races in the updates of insync and start aren't
875 * problematic: insync can only get set bits added, and updates to
876 * start are only improving performance (without affecting correctness
877 * if undone).
878 */
879 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
880 static unsigned long start = PAGE_OFFSET;
881 unsigned long address;
882
883 BUILD_BUG_ON(PAGE_OFFSET & ~PGDIR_MASK);
884 for (address = start; address >= PAGE_OFFSET; address += PGDIR_SIZE) {
885 if (!test_bit(pgd_index(address), insync)) {
886 unsigned long flags;
887 struct list_head *pos;
888
889 spin_lock_irqsave(&pgd_lock, flags);
890 list_for_each(pos, &pgd_list)
891 if (!vmalloc_sync_one(list_to_pgd(pos),
892 address)) {
893 /* Must be at first entry in list. */
894 BUG_ON(pos != pgd_list.next);
895 break;
896 }
897 spin_unlock_irqrestore(&pgd_lock, flags);
898 if (pos != pgd_list.next)
899 set_bit(pgd_index(address), insync);
900 }
901 if (address == start && test_bit(pgd_index(address), insync))
902 start = address + PGDIR_SIZE;
903 }
904#endif
905}
diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c
new file mode 100644
index 000000000000..1fcecc5b9e03
--- /dev/null
+++ b/arch/tile/mm/highmem.c
@@ -0,0 +1,328 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/highmem.h>
16#include <linux/module.h>
17#include <linux/pagemap.h>
18#include <asm/homecache.h>
19
20#define kmap_get_pte(vaddr) \
21 pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), (vaddr)),\
22 (vaddr)), (vaddr))
23
24
25void *kmap(struct page *page)
26{
27 void *kva;
28 unsigned long flags;
29 pte_t *ptep;
30
31 might_sleep();
32 if (!PageHighMem(page))
33 return page_address(page);
34 kva = kmap_high(page);
35
36 /*
37 * Rewrite the PTE under the lock. This ensures that the page
38 * is not currently migrating.
39 */
40 ptep = kmap_get_pte((unsigned long)kva);
41 flags = homecache_kpte_lock();
42 set_pte_at(&init_mm, kva, ptep, mk_pte(page, page_to_kpgprot(page)));
43 homecache_kpte_unlock(flags);
44
45 return kva;
46}
47EXPORT_SYMBOL(kmap);
48
49void kunmap(struct page *page)
50{
51 if (in_interrupt())
52 BUG();
53 if (!PageHighMem(page))
54 return;
55 kunmap_high(page);
56}
57EXPORT_SYMBOL(kunmap);
58
59static void debug_kmap_atomic_prot(enum km_type type)
60{
61#ifdef CONFIG_DEBUG_HIGHMEM
62 static unsigned warn_count = 10;
63
64 if (unlikely(warn_count == 0))
65 return;
66
67 if (unlikely(in_interrupt())) {
68 if (in_irq()) {
69 if (type != KM_IRQ0 && type != KM_IRQ1 &&
70 type != KM_BIO_SRC_IRQ &&
71 /* type != KM_BIO_DST_IRQ && */
72 type != KM_BOUNCE_READ) {
73 WARN_ON(1);
74 warn_count--;
75 }
76 } else if (!irqs_disabled()) { /* softirq */
77 if (type != KM_IRQ0 && type != KM_IRQ1 &&
78 type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
79 type != KM_SKB_SUNRPC_DATA &&
80 type != KM_SKB_DATA_SOFTIRQ &&
81 type != KM_BOUNCE_READ) {
82 WARN_ON(1);
83 warn_count--;
84 }
85 }
86 }
87
88 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
89 type == KM_BIO_SRC_IRQ /* || type == KM_BIO_DST_IRQ */) {
90 if (!irqs_disabled()) {
91 WARN_ON(1);
92 warn_count--;
93 }
94 } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
95 if (irq_count() == 0 && !irqs_disabled()) {
96 WARN_ON(1);
97 warn_count--;
98 }
99 }
100#endif
101}
102
103/*
104 * Describe a single atomic mapping of a page on a given cpu at a
105 * given address, and allow it to be linked into a list.
106 */
107struct atomic_mapped_page {
108 struct list_head list;
109 struct page *page;
110 int cpu;
111 unsigned long va;
112};
113
114static spinlock_t amp_lock = __SPIN_LOCK_UNLOCKED(&amp_lock);
115static struct list_head amp_list = LIST_HEAD_INIT(amp_list);
116
117/*
118 * Combining this structure with a per-cpu declaration lets us give
119 * each cpu an atomic_mapped_page structure per type.
120 */
121struct kmap_amps {
122 struct atomic_mapped_page per_type[KM_TYPE_NR];
123};
124DEFINE_PER_CPU(struct kmap_amps, amps);
125
126/*
127 * Add a page and va, on this cpu, to the list of kmap_atomic pages,
128 * and write the new pte to memory. Writing the new PTE under the
129 * lock guarantees that it is either on the list before migration starts
130 * (if we won the race), or set_pte() sets the migrating bit in the PTE
131 * (if we lost the race). And doing it under the lock guarantees
132 * that when kmap_atomic_fix_one_kpte() comes along, it finds a valid
133 * PTE in memory, iff the mapping is still on the amp_list.
134 *
135 * Finally, doing it under the lock lets us safely examine the page
136 * to see if it is immutable or not, for the generic kmap_atomic() case.
137 * If we examine it earlier we are exposed to a race where it looks
138 * writable earlier, but becomes immutable before we write the PTE.
139 */
140static void kmap_atomic_register(struct page *page, enum km_type type,
141 unsigned long va, pte_t *ptep, pte_t pteval)
142{
143 unsigned long flags;
144 struct atomic_mapped_page *amp;
145
146 flags = homecache_kpte_lock();
147 spin_lock(&amp_lock);
148
149 /* With interrupts disabled, now fill in the per-cpu info. */
150 amp = &__get_cpu_var(amps).per_type[type];
151 amp->page = page;
152 amp->cpu = smp_processor_id();
153 amp->va = va;
154
155 /* For generic kmap_atomic(), choose the PTE writability now. */
156 if (!pte_read(pteval))
157 pteval = mk_pte(page, page_to_kpgprot(page));
158
159 list_add(&amp->list, &amp_list);
160 set_pte(ptep, pteval);
161 arch_flush_lazy_mmu_mode();
162
163 spin_unlock(&amp_lock);
164 homecache_kpte_unlock(flags);
165}
166
167/*
168 * Remove a page and va, on this cpu, from the list of kmap_atomic pages.
169 * Linear-time search, but we count on the lists being short.
170 * We don't need to adjust the PTE under the lock (as opposed to the
171 * kmap_atomic_register() case), since we're just unconditionally
172 * zeroing the PTE after it's off the list.
173 */
174static void kmap_atomic_unregister(struct page *page, unsigned long va)
175{
176 unsigned long flags;
177 struct atomic_mapped_page *amp;
178 int cpu = smp_processor_id();
179 spin_lock_irqsave(&amp_lock, flags);
180 list_for_each_entry(amp, &amp_list, list) {
181 if (amp->page == page && amp->cpu == cpu && amp->va == va)
182 break;
183 }
184 BUG_ON(&amp->list == &amp_list);
185 list_del(&amp->list);
186 spin_unlock_irqrestore(&amp_lock, flags);
187}
188
189/* Helper routine for kmap_atomic_fix_kpte(), below. */
190static void kmap_atomic_fix_one_kpte(struct atomic_mapped_page *amp,
191 int finished)
192{
193 pte_t *ptep = kmap_get_pte(amp->va);
194 if (!finished) {
195 set_pte(ptep, pte_mkmigrate(*ptep));
196 flush_remote(0, 0, NULL, amp->va, PAGE_SIZE, PAGE_SIZE,
197 cpumask_of(amp->cpu), NULL, 0);
198 } else {
199 /*
200 * Rewrite a default kernel PTE for this page.
201 * We rely on the fact that set_pte() writes the
202 * present+migrating bits last.
203 */
204 pte_t pte = mk_pte(amp->page, page_to_kpgprot(amp->page));
205 set_pte(ptep, pte);
206 }
207}
208
209/*
210 * This routine is a helper function for homecache_fix_kpte(); see
211 * its comments for more information on the "finished" argument here.
212 *
213 * Note that we hold the lock while doing the remote flushes, which
214 * will stall any unrelated cpus trying to do kmap_atomic operations.
215 * We could just update the PTEs under the lock, and save away copies
216 * of the structs (or just the va+cpu), then flush them after we
217 * release the lock, but it seems easier just to do it all under the lock.
218 */
219void kmap_atomic_fix_kpte(struct page *page, int finished)
220{
221 struct atomic_mapped_page *amp;
222 unsigned long flags;
223 spin_lock_irqsave(&amp_lock, flags);
224 list_for_each_entry(amp, &amp_list, list) {
225 if (amp->page == page)
226 kmap_atomic_fix_one_kpte(amp, finished);
227 }
228 spin_unlock_irqrestore(&amp_lock, flags);
229}
230
231/*
232 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap
233 * because the kmap code must perform a global TLB invalidation when
234 * the kmap pool wraps.
235 *
236 * Note that they may be slower than on x86 (etc.) because unlike on
237 * those platforms, we do have to take a global lock to map and unmap
238 * pages on Tile (see above).
239 *
240 * When holding an atomic kmap it is not legal to sleep, so atomic
241 * kmaps are appropriate for short, tight code paths only.
242 */
243void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
244{
245 enum fixed_addresses idx;
246 unsigned long vaddr;
247 pte_t *pte;
248
249 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
250 pagefault_disable();
251
252 /* Avoid icache flushes by disallowing atomic executable mappings. */
253 BUG_ON(pte_exec(prot));
254
255 if (!PageHighMem(page))
256 return page_address(page);
257
258 debug_kmap_atomic_prot(type);
259
260 idx = type + KM_TYPE_NR*smp_processor_id();
261 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
262 pte = kmap_get_pte(vaddr);
263 BUG_ON(!pte_none(*pte));
264
265 /* Register that this page is mapped atomically on this cpu. */
266 kmap_atomic_register(page, type, vaddr, pte, mk_pte(page, prot));
267
268 return (void *)vaddr;
269}
270EXPORT_SYMBOL(kmap_atomic_prot);
271
272void *kmap_atomic(struct page *page, enum km_type type)
273{
274 /* PAGE_NONE is a magic value that tells us to check immutability. */
275 return kmap_atomic_prot(page, type, PAGE_NONE);
276}
277EXPORT_SYMBOL(kmap_atomic);
278
279void kunmap_atomic(void *kvaddr, enum km_type type)
280{
281 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
282 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
283
284 /*
285 * Force other mappings to Oops if they try to access this pte without
286 * first remapping it. Keeping stale mappings around is a bad idea.
287 */
288 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) {
289 pte_t *pte = kmap_get_pte(vaddr);
290 pte_t pteval = *pte;
291 BUG_ON(!pte_present(pteval) && !pte_migrating(pteval));
292 kmap_atomic_unregister(pte_page(pteval), vaddr);
293 kpte_clear_flush(pte, vaddr);
294 } else {
295 /* Must be a lowmem page */
296 BUG_ON(vaddr < PAGE_OFFSET);
297 BUG_ON(vaddr >= (unsigned long)high_memory);
298 }
299
300 arch_flush_lazy_mmu_mode();
301 pagefault_enable();
302}
303EXPORT_SYMBOL(kunmap_atomic);
304
305/*
306 * This API is supposed to allow us to map memory without a "struct page".
307 * Currently we don't support this, though this may change in the future.
308 */
309void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
310{
311 return kmap_atomic(pfn_to_page(pfn), type);
312}
313void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
314{
315 return kmap_atomic_prot(pfn_to_page(pfn), type, prot);
316}
317
318struct page *kmap_atomic_to_page(void *ptr)
319{
320 pte_t *pte;
321 unsigned long vaddr = (unsigned long)ptr;
322
323 if (vaddr < FIXADDR_START)
324 return virt_to_page(ptr);
325
326 pte = kmap_get_pte(vaddr);
327 return pte_page(*pte);
328}
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
new file mode 100644
index 000000000000..52feb77133ce
--- /dev/null
+++ b/arch/tile/mm/homecache.c
@@ -0,0 +1,445 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 *
14 * This code maintains the "home" for each page in the system.
15 */
16
17#include <linux/kernel.h>
18#include <linux/mm.h>
19#include <linux/spinlock.h>
20#include <linux/list.h>
21#include <linux/bootmem.h>
22#include <linux/rmap.h>
23#include <linux/pagemap.h>
24#include <linux/mutex.h>
25#include <linux/interrupt.h>
26#include <linux/sysctl.h>
27#include <linux/pagevec.h>
28#include <linux/ptrace.h>
29#include <linux/timex.h>
30#include <linux/cache.h>
31#include <linux/smp.h>
32
33#include <asm/page.h>
34#include <asm/sections.h>
35#include <asm/tlbflush.h>
36#include <asm/pgalloc.h>
37#include <asm/homecache.h>
38
39#include "migrate.h"
40
41
42#if CHIP_HAS_COHERENT_LOCAL_CACHE()
43
44/*
45 * The noallocl2 option suppresses all use of the L2 cache to cache
46 * locally from a remote home. There's no point in using it if we
47 * don't have coherent local caching, though.
48 */
49int __write_once noallocl2;
50static int __init set_noallocl2(char *str)
51{
52 noallocl2 = 1;
53 return 0;
54}
55early_param("noallocl2", set_noallocl2);
56
57#else
58
59#define noallocl2 0
60
61#endif
62
63
64
65/* Provide no-op versions of these routines to keep flush_remote() cleaner. */
66#define mark_caches_evicted_start() 0
67#define mark_caches_evicted_finish(mask, timestamp) do {} while (0)
68
69
70
71
72/*
73 * Update the irq_stat for cpus that we are going to interrupt
74 * with TLB or cache flushes. Also handle removing dataplane cpus
75 * from the TLB flush set, and setting dataplane_tlb_state instead.
76 */
77static void hv_flush_update(const struct cpumask *cache_cpumask,
78 struct cpumask *tlb_cpumask,
79 unsigned long tlb_va, unsigned long tlb_length,
80 HV_Remote_ASID *asids, int asidcount)
81{
82 struct cpumask mask;
83 int i, cpu;
84
85 cpumask_clear(&mask);
86 if (cache_cpumask)
87 cpumask_or(&mask, &mask, cache_cpumask);
88 if (tlb_cpumask && tlb_length) {
89 cpumask_or(&mask, &mask, tlb_cpumask);
90 }
91
92 for (i = 0; i < asidcount; ++i)
93 cpumask_set_cpu(asids[i].y * smp_width + asids[i].x, &mask);
94
95 /*
96 * Don't bother to update atomically; losing a count
97 * here is not that critical.
98 */
99 for_each_cpu(cpu, &mask)
100 ++per_cpu(irq_stat, cpu).irq_hv_flush_count;
101}
102
103/*
104 * This wrapper function around hv_flush_remote() does several things:
105 *
106 * - Provides a return value error-checking panic path, since
107 * there's never any good reason for hv_flush_remote() to fail.
108 * - Accepts a 32-bit PFN rather than a 64-bit PA, which generally
109 * is the type that Linux wants to pass around anyway.
110 * - Centralizes the mark_caches_evicted() handling.
111 * - Canonicalizes that lengths of zero make cpumasks NULL.
112 * - Handles deferring TLB flushes for dataplane tiles.
113 * - Tracks remote interrupts in the per-cpu irq_cpustat_t.
114 *
115 * Note that we have to wait until the cache flush completes before
116 * updating the per-cpu last_cache_flush word, since otherwise another
117 * concurrent flush can race, conclude the flush has already
118 * completed, and start to use the page while it's still dirty
119 * remotely (running concurrently with the actual evict, presumably).
120 */
121void flush_remote(unsigned long cache_pfn, unsigned long cache_control,
122 const struct cpumask *cache_cpumask_orig,
123 HV_VirtAddr tlb_va, unsigned long tlb_length,
124 unsigned long tlb_pgsize,
125 const struct cpumask *tlb_cpumask_orig,
126 HV_Remote_ASID *asids, int asidcount)
127{
128 int rc;
129 int timestamp = 0; /* happy compiler */
130 struct cpumask cache_cpumask_copy, tlb_cpumask_copy;
131 struct cpumask *cache_cpumask, *tlb_cpumask;
132 HV_PhysAddr cache_pa;
133 char cache_buf[NR_CPUS*5], tlb_buf[NR_CPUS*5];
134
135 mb(); /* provided just to simplify "magic hypervisor" mode */
136
137 /*
138 * Canonicalize and copy the cpumasks.
139 */
140 if (cache_cpumask_orig && cache_control) {
141 cpumask_copy(&cache_cpumask_copy, cache_cpumask_orig);
142 cache_cpumask = &cache_cpumask_copy;
143 } else {
144 cpumask_clear(&cache_cpumask_copy);
145 cache_cpumask = NULL;
146 }
147 if (cache_cpumask == NULL)
148 cache_control = 0;
149 if (tlb_cpumask_orig && tlb_length) {
150 cpumask_copy(&tlb_cpumask_copy, tlb_cpumask_orig);
151 tlb_cpumask = &tlb_cpumask_copy;
152 } else {
153 cpumask_clear(&tlb_cpumask_copy);
154 tlb_cpumask = NULL;
155 }
156
157 hv_flush_update(cache_cpumask, tlb_cpumask, tlb_va, tlb_length,
158 asids, asidcount);
159 cache_pa = (HV_PhysAddr)cache_pfn << PAGE_SHIFT;
160 if (cache_control & HV_FLUSH_EVICT_L2)
161 timestamp = mark_caches_evicted_start();
162 rc = hv_flush_remote(cache_pa, cache_control,
163 cpumask_bits(cache_cpumask),
164 tlb_va, tlb_length, tlb_pgsize,
165 cpumask_bits(tlb_cpumask),
166 asids, asidcount);
167 if (cache_control & HV_FLUSH_EVICT_L2)
168 mark_caches_evicted_finish(cache_cpumask, timestamp);
169 if (rc == 0)
170 return;
171 cpumask_scnprintf(cache_buf, sizeof(cache_buf), &cache_cpumask_copy);
172 cpumask_scnprintf(tlb_buf, sizeof(tlb_buf), &tlb_cpumask_copy);
173
174 printk("hv_flush_remote(%#llx, %#lx, %p [%s],"
175 " %#lx, %#lx, %#lx, %p [%s], %p, %d) = %d\n",
176 cache_pa, cache_control, cache_cpumask, cache_buf,
177 (unsigned long)tlb_va, tlb_length, tlb_pgsize,
178 tlb_cpumask, tlb_buf,
179 asids, asidcount, rc);
180 if (asidcount > 0) {
181 int i;
182 printk(" asids:");
183 for (i = 0; i < asidcount; ++i)
184 printk(" %d,%d,%d",
185 asids[i].x, asids[i].y, asids[i].asid);
186 printk("\n");
187 }
188 panic("Unsafe to continue.");
189}
190
191void homecache_evict(const struct cpumask *mask)
192{
193 flush_remote(0, HV_FLUSH_EVICT_L2, mask, 0, 0, 0, NULL, NULL, 0);
194}
195
196/* Return a mask of the cpus whose caches currently own these pages. */
197static void homecache_mask(struct page *page, int pages,
198 struct cpumask *home_mask)
199{
200 int i;
201 cpumask_clear(home_mask);
202 for (i = 0; i < pages; ++i) {
203 int home = page_home(&page[i]);
204 if (home == PAGE_HOME_IMMUTABLE ||
205 home == PAGE_HOME_INCOHERENT) {
206 cpumask_copy(home_mask, cpu_possible_mask);
207 return;
208 }
209#if CHIP_HAS_CBOX_HOME_MAP()
210 if (home == PAGE_HOME_HASH) {
211 cpumask_or(home_mask, home_mask, &hash_for_home_map);
212 continue;
213 }
214#endif
215 if (home == PAGE_HOME_UNCACHED)
216 continue;
217 BUG_ON(home < 0 || home >= NR_CPUS);
218 cpumask_set_cpu(home, home_mask);
219 }
220}
221
222/*
223 * Return the passed length, or HV_FLUSH_EVICT_L2 if the length is
224 * large enough that we believe we should just evict the whole L2 cache.
225 */
226static unsigned long cache_flush_length(unsigned long length)
227{
228 return (length >= CHIP_L2_CACHE_SIZE()) ? HV_FLUSH_EVICT_L2 : length;
229}
230
231/* On the simulator, confirm lines have been evicted everywhere. */
232static void validate_lines_evicted(unsigned long pfn, size_t length)
233{
234 sim_syscall(SIM_SYSCALL_VALIDATE_LINES_EVICTED,
235 (HV_PhysAddr)pfn << PAGE_SHIFT, length);
236}
237
238/* Flush a page out of whatever cache(s) it is in. */
239void homecache_flush_cache(struct page *page, int order)
240{
241 int pages = 1 << order;
242 int length = cache_flush_length(pages * PAGE_SIZE);
243 unsigned long pfn = page_to_pfn(page);
244 struct cpumask home_mask;
245
246 homecache_mask(page, pages, &home_mask);
247 flush_remote(pfn, length, &home_mask, 0, 0, 0, NULL, NULL, 0);
248 validate_lines_evicted(pfn, pages * PAGE_SIZE);
249}
250
251
252/* Report the home corresponding to a given PTE. */
253static int pte_to_home(pte_t pte)
254{
255 if (hv_pte_get_nc(pte))
256 return PAGE_HOME_IMMUTABLE;
257 switch (hv_pte_get_mode(pte)) {
258 case HV_PTE_MODE_CACHE_TILE_L3:
259 return get_remote_cache_cpu(pte);
260 case HV_PTE_MODE_CACHE_NO_L3:
261 return PAGE_HOME_INCOHERENT;
262 case HV_PTE_MODE_UNCACHED:
263 return PAGE_HOME_UNCACHED;
264#if CHIP_HAS_CBOX_HOME_MAP()
265 case HV_PTE_MODE_CACHE_HASH_L3:
266 return PAGE_HOME_HASH;
267#endif
268 }
269 panic("Bad PTE %#llx\n", pte.val);
270}
271
272/* Update the home of a PTE if necessary (can also be used for a pgprot_t). */
273pte_t pte_set_home(pte_t pte, int home)
274{
275 /* Check for non-linear file mapping "PTEs" and pass them through. */
276 if (pte_file(pte))
277 return pte;
278
279#if CHIP_HAS_MMIO()
280 /* Check for MMIO mappings and pass them through. */
281 if (hv_pte_get_mode(pte) == HV_PTE_MODE_MMIO)
282 return pte;
283#endif
284
285
286 /*
287 * Only immutable pages get NC mappings. If we have a
288 * non-coherent PTE, but the underlying page is not
289 * immutable, it's likely the result of a forced
290 * caching setting running up against ptrace setting
291 * the page to be writable underneath. In this case,
292 * just keep the PTE coherent.
293 */
294 if (hv_pte_get_nc(pte) && home != PAGE_HOME_IMMUTABLE) {
295 pte = hv_pte_clear_nc(pte);
296 printk("non-immutable page incoherently referenced: %#llx\n",
297 pte.val);
298 }
299
300 switch (home) {
301
302 case PAGE_HOME_UNCACHED:
303 pte = hv_pte_set_mode(pte, HV_PTE_MODE_UNCACHED);
304 break;
305
306 case PAGE_HOME_INCOHERENT:
307 pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3);
308 break;
309
310 case PAGE_HOME_IMMUTABLE:
311 /*
312 * We could home this page anywhere, since it's immutable,
313 * but by default just home it to follow "hash_default".
314 */
315 BUG_ON(hv_pte_get_writable(pte));
316 if (pte_get_forcecache(pte)) {
317 /* Upgrade "force any cpu" to "No L3" for immutable. */
318 if (hv_pte_get_mode(pte) == HV_PTE_MODE_CACHE_TILE_L3
319 && pte_get_anyhome(pte)) {
320 pte = hv_pte_set_mode(pte,
321 HV_PTE_MODE_CACHE_NO_L3);
322 }
323 } else
324#if CHIP_HAS_CBOX_HOME_MAP()
325 if (hash_default)
326 pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_HASH_L3);
327 else
328#endif
329 pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3);
330 pte = hv_pte_set_nc(pte);
331 break;
332
333#if CHIP_HAS_CBOX_HOME_MAP()
334 case PAGE_HOME_HASH:
335 pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_HASH_L3);
336 break;
337#endif
338
339 default:
340 BUG_ON(home < 0 || home >= NR_CPUS ||
341 !cpu_is_valid_lotar(home));
342 pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_TILE_L3);
343 pte = set_remote_cache_cpu(pte, home);
344 break;
345 }
346
347#if CHIP_HAS_NC_AND_NOALLOC_BITS()
348 if (noallocl2)
349 pte = hv_pte_set_no_alloc_l2(pte);
350
351 /* Simplify "no local and no l3" to "uncached" */
352 if (hv_pte_get_no_alloc_l2(pte) && hv_pte_get_no_alloc_l1(pte) &&
353 hv_pte_get_mode(pte) == HV_PTE_MODE_CACHE_NO_L3) {
354 pte = hv_pte_set_mode(pte, HV_PTE_MODE_UNCACHED);
355 }
356#endif
357
358 /* Checking this case here gives a better panic than from the hv. */
359 BUG_ON(hv_pte_get_mode(pte) == 0);
360
361 return pte;
362}
363
364/*
365 * The routines in this section are the "static" versions of the normal
366 * dynamic homecaching routines; they just set the home cache
367 * of a kernel page once, and require a full-chip cache/TLB flush,
368 * so they're not suitable for anything but infrequent use.
369 */
370
371#if CHIP_HAS_CBOX_HOME_MAP()
372static inline int initial_page_home(void) { return PAGE_HOME_HASH; }
373#else
374static inline int initial_page_home(void) { return 0; }
375#endif
376
377int page_home(struct page *page)
378{
379 if (PageHighMem(page)) {
380 return initial_page_home();
381 } else {
382 unsigned long kva = (unsigned long)page_address(page);
383 return pte_to_home(*virt_to_pte(NULL, kva));
384 }
385}
386
387void homecache_change_page_home(struct page *page, int order, int home)
388{
389 int i, pages = (1 << order);
390 unsigned long kva;
391
392 BUG_ON(PageHighMem(page));
393 BUG_ON(page_count(page) > 1);
394 BUG_ON(page_mapcount(page) != 0);
395 kva = (unsigned long) page_address(page);
396 flush_remote(0, HV_FLUSH_EVICT_L2, &cpu_cacheable_map,
397 kva, pages * PAGE_SIZE, PAGE_SIZE, cpu_online_mask,
398 NULL, 0);
399
400 for (i = 0; i < pages; ++i, kva += PAGE_SIZE) {
401 pte_t *ptep = virt_to_pte(NULL, kva);
402 pte_t pteval = *ptep;
403 BUG_ON(!pte_present(pteval) || pte_huge(pteval));
404 *ptep = pte_set_home(pteval, home);
405 }
406}
407
408struct page *homecache_alloc_pages(gfp_t gfp_mask,
409 unsigned int order, int home)
410{
411 struct page *page;
412 BUG_ON(gfp_mask & __GFP_HIGHMEM); /* must be lowmem */
413 page = alloc_pages(gfp_mask, order);
414 if (page)
415 homecache_change_page_home(page, order, home);
416 return page;
417}
418
419struct page *homecache_alloc_pages_node(int nid, gfp_t gfp_mask,
420 unsigned int order, int home)
421{
422 struct page *page;
423 BUG_ON(gfp_mask & __GFP_HIGHMEM); /* must be lowmem */
424 page = alloc_pages_node(nid, gfp_mask, order);
425 if (page)
426 homecache_change_page_home(page, order, home);
427 return page;
428}
429
430void homecache_free_pages(unsigned long addr, unsigned int order)
431{
432 struct page *page;
433
434 if (addr == 0)
435 return;
436
437 VM_BUG_ON(!virt_addr_valid((void *)addr));
438 page = virt_to_page((void *)addr);
439 if (put_page_testzero(page)) {
440 int pages = (1 << order);
441 homecache_change_page_home(page, order, initial_page_home());
442 while (pages--)
443 __free_page(page++);
444 }
445}
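
The home-aware allocator wrappers above are meant to be used in matched
pairs: the caller picks a home when allocating, and homecache_free_pages()
resets the pages to the default home before handing them back to the buddy
allocator.  A minimal usage sketch (the order and the use of the buffer are
made up for illustration):

	/* Allocate four lowmem pages homed on the current cpu. */
	struct page *page = homecache_alloc_pages(GFP_KERNEL, 2,
						  smp_processor_id());
	if (page) {
		void *buf = page_address(page);
		memset(buf, 0, 4 * PAGE_SIZE);	/* ... use the buffer ... */
		homecache_free_pages((unsigned long)buf, 2);
	}
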
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
new file mode 100644
index 000000000000..c38570f8f0d0
--- /dev/null
+++ b/arch/tile/mm/hugetlbpage.c
@@ -0,0 +1,343 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 *
14 * TILE Huge TLB Page Support for Kernel.
15 * Taken from i386 hugetlb implementation:
16 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
17 */
18
19#include <linux/init.h>
20#include <linux/fs.h>
21#include <linux/mm.h>
22#include <linux/hugetlb.h>
23#include <linux/pagemap.h>
24#include <linux/smp_lock.h>
25#include <linux/slab.h>
26#include <linux/err.h>
27#include <linux/sysctl.h>
28#include <linux/mman.h>
29#include <asm/tlb.h>
30#include <asm/tlbflush.h>
31
32pte_t *huge_pte_alloc(struct mm_struct *mm,
33 unsigned long addr, unsigned long sz)
34{
35 pgd_t *pgd;
36 pud_t *pud;
37 pte_t *pte = NULL;
38
39 /* We do not yet support multiple huge page sizes. */
40 BUG_ON(sz != PMD_SIZE);
41
42 pgd = pgd_offset(mm, addr);
43 pud = pud_alloc(mm, pgd, addr);
44 if (pud)
45 pte = (pte_t *) pmd_alloc(mm, pud, addr);
46 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
47
48 return pte;
49}
50
51pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
52{
53 pgd_t *pgd;
54 pud_t *pud;
55 pmd_t *pmd = NULL;
56
57 pgd = pgd_offset(mm, addr);
58 if (pgd_present(*pgd)) {
59 pud = pud_offset(pgd, addr);
60 if (pud_present(*pud))
61 pmd = pmd_offset(pud, addr);
62 }
63 return (pte_t *) pmd;
64}
65
66#ifdef HUGETLB_TEST
67struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
68 int write)
69{
70 /* This test-only path mirrors the old i386 version. */
71 unsigned long vpfn = address >> PAGE_SHIFT;
72 pte_t *pte;
73 struct page *page;
74 struct vm_area_struct *vma;
75
76 vma = find_vma(mm, address);
77 if (!vma || !is_vm_hugetlb_page(vma))
78 return ERR_PTR(-EINVAL);
79
80 pte = huge_pte_offset(mm, address);
81
82 /* hugetlb should be locked, and hence, prefaulted */
83 WARN_ON(!pte || pte_none(*pte));
84
85 page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
86
87 WARN_ON(!PageHead(page));
88
89 return page;
90}
91
92int pmd_huge(pmd_t pmd)
93{
94 return 0;
95}
96
97int pud_huge(pud_t pud)
98{
99 return 0;
100}
101
102struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
103 pmd_t *pmd, int write)
104{
105 return NULL;
106}
107
108#else
109
110struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
111 int write)
112{
113 return ERR_PTR(-EINVAL);
114}
115
116int pmd_huge(pmd_t pmd)
117{
118 return !!(pmd_val(pmd) & _PAGE_HUGE_PAGE);
119}
120
121int pud_huge(pud_t pud)
122{
123 return !!(pud_val(pud) & _PAGE_HUGE_PAGE);
124}
125
126struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
127 pmd_t *pmd, int write)
128{
129 struct page *page;
130
131 page = pte_page(*(pte_t *)pmd);
132 if (page)
133 page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
134 return page;
135}
136
137struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
138 pud_t *pud, int write)
139{
140 struct page *page;
141
142 page = pte_page(*(pte_t *)pud);
143 if (page)
144 page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
145 return page;
146}
147
148int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
149{
150 return 0;
151}
152
153#endif
154
155#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
156static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
157 unsigned long addr, unsigned long len,
158 unsigned long pgoff, unsigned long flags)
159{
160 struct hstate *h = hstate_file(file);
161 struct mm_struct *mm = current->mm;
162 struct vm_area_struct *vma;
163 unsigned long start_addr;
164
165 if (len > mm->cached_hole_size) {
166 start_addr = mm->free_area_cache;
167 } else {
168 start_addr = TASK_UNMAPPED_BASE;
169 mm->cached_hole_size = 0;
170 }
171
172full_search:
173 addr = ALIGN(start_addr, huge_page_size(h));
174
175 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
176 /* At this point: (!vma || addr < vma->vm_end). */
177 if (TASK_SIZE - len < addr) {
178 /*
179 * Start a new search - just in case we missed
180 * some holes.
181 */
182 if (start_addr != TASK_UNMAPPED_BASE) {
183 start_addr = TASK_UNMAPPED_BASE;
184 mm->cached_hole_size = 0;
185 goto full_search;
186 }
187 return -ENOMEM;
188 }
189 if (!vma || addr + len <= vma->vm_start) {
190 mm->free_area_cache = addr + len;
191 return addr;
192 }
193 if (addr + mm->cached_hole_size < vma->vm_start)
194 mm->cached_hole_size = vma->vm_start - addr;
195 addr = ALIGN(vma->vm_end, huge_page_size(h));
196 }
197}
198
199static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
200 unsigned long addr0, unsigned long len,
201 unsigned long pgoff, unsigned long flags)
202{
203 struct hstate *h = hstate_file(file);
204 struct mm_struct *mm = current->mm;
205 struct vm_area_struct *vma, *prev_vma;
206 unsigned long base = mm->mmap_base, addr = addr0;
207 unsigned long largest_hole = mm->cached_hole_size;
208 int first_time = 1;
209
210 /* don't allow allocations above current base */
211 if (mm->free_area_cache > base)
212 mm->free_area_cache = base;
213
214 if (len <= largest_hole) {
215 largest_hole = 0;
216 mm->free_area_cache = base;
217 }
218try_again:
219 /* make sure it can fit in the remaining address space */
220 if (mm->free_area_cache < len)
221 goto fail;
222
223 /* either no address requested or can't fit in requested address hole */
224 addr = (mm->free_area_cache - len) & huge_page_mask(h);
225 do {
226 /*
227 * Lookup failure means no vma is above this address,
228 * i.e. return with success:
229 */
230 vma = find_vma_prev(mm, addr, &prev_vma);
231 if (!vma) {
232 return addr;
233 break;
234 }
235
236 /*
237 * new region fits between prev_vma->vm_end and
238 * vma->vm_start, use it:
239 */
240 if (addr + len <= vma->vm_start &&
241 (!prev_vma || (addr >= prev_vma->vm_end))) {
242 /* remember the address as a hint for next time */
243 mm->cached_hole_size = largest_hole;
244 mm->free_area_cache = addr;
245 return addr;
246 } else {
247 /* pull free_area_cache down to the first hole */
248 if (mm->free_area_cache == vma->vm_end) {
249 mm->free_area_cache = vma->vm_start;
250 mm->cached_hole_size = largest_hole;
251 }
252 }
253
254 /* remember the largest hole we saw so far */
255 if (addr + largest_hole < vma->vm_start)
256 largest_hole = vma->vm_start - addr;
257
258 /* try just below the current vma->vm_start */
259 addr = (vma->vm_start - len) & huge_page_mask(h);
260
261 } while (len <= vma->vm_start);
262
263fail:
264 /*
265 * if hint left us with no space for the requested
266 * mapping then try again:
267 */
268 if (first_time) {
269 mm->free_area_cache = base;
270 largest_hole = 0;
271 first_time = 0;
272 goto try_again;
273 }
274 /*
275 * A failed mmap() very likely causes application failure,
276 * so fall back to the bottom-up function here. This scenario
277 * can happen with large stack limits and large mmap()
278 * allocations.
279 */
280 mm->free_area_cache = TASK_UNMAPPED_BASE;
281 mm->cached_hole_size = ~0UL;
282 addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
283 len, pgoff, flags);
284
285 /*
286 * Restore the topdown base:
287 */
288 mm->free_area_cache = base;
289 mm->cached_hole_size = ~0UL;
290
291 return addr;
292}
293
294unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
295 unsigned long len, unsigned long pgoff, unsigned long flags)
296{
297 struct hstate *h = hstate_file(file);
298 struct mm_struct *mm = current->mm;
299 struct vm_area_struct *vma;
300
301 if (len & ~huge_page_mask(h))
302 return -EINVAL;
303 if (len > TASK_SIZE)
304 return -ENOMEM;
305
306 if (flags & MAP_FIXED) {
307 if (prepare_hugepage_range(file, addr, len))
308 return -EINVAL;
309 return addr;
310 }
311
312 if (addr) {
313 addr = ALIGN(addr, huge_page_size(h));
314 vma = find_vma(mm, addr);
315 if (TASK_SIZE - len >= addr &&
316 (!vma || addr + len <= vma->vm_start))
317 return addr;
318 }
319 if (current->mm->get_unmapped_area == arch_get_unmapped_area)
320 return hugetlb_get_unmapped_area_bottomup(file, addr, len,
321 pgoff, flags);
322 else
323 return hugetlb_get_unmapped_area_topdown(file, addr, len,
324 pgoff, flags);
325}
326
327static __init int setup_hugepagesz(char *opt)
328{
329 unsigned long ps = memparse(opt, &opt);
330 if (ps == PMD_SIZE) {
331 hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
332 } else if (ps == PUD_SIZE) {
333 hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
334 } else {
335 printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
336 ps >> 20);
337 return 0;
338 }
339 return 1;
340}
341__setup("hugepagesz=", setup_hugepagesz);
342
343#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
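
Note that setup_hugepagesz() above only accepts sizes the page tables can
actually express (PMD_SIZE, or PUD_SIZE where a real PUD level exists), so
the boot-line size must match the kernel configuration.  With the 4 KB base
pages and 16 MB huge pages described in init.c below, an illustrative boot
argument would be:

	hugepagesz=16M hugepages=32

Any other size takes the KERN_ERR path above and is ignored.
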
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
new file mode 100644
index 000000000000..125ac53b60fc
--- /dev/null
+++ b/arch/tile/mm/init.c
@@ -0,0 +1,1082 @@
1/*
2 * Copyright (C) 1995 Linus Torvalds
3 * Copyright 2010 Tilera Corporation. All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation, version 2.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
12 * NON INFRINGEMENT. See the GNU General Public License for
13 * more details.
14 */
15
16#include <linux/module.h>
17#include <linux/signal.h>
18#include <linux/sched.h>
19#include <linux/kernel.h>
20#include <linux/errno.h>
21#include <linux/string.h>
22#include <linux/types.h>
23#include <linux/ptrace.h>
24#include <linux/mman.h>
25#include <linux/mm.h>
26#include <linux/hugetlb.h>
27#include <linux/swap.h>
28#include <linux/smp.h>
29#include <linux/init.h>
30#include <linux/highmem.h>
31#include <linux/pagemap.h>
32#include <linux/poison.h>
33#include <linux/bootmem.h>
34#include <linux/slab.h>
35#include <linux/proc_fs.h>
36#include <linux/efi.h>
37#include <linux/memory_hotplug.h>
38#include <linux/uaccess.h>
39#include <asm/mmu_context.h>
40#include <asm/processor.h>
41#include <asm/system.h>
42#include <asm/pgtable.h>
43#include <asm/pgalloc.h>
44#include <asm/dma.h>
45#include <asm/fixmap.h>
46#include <asm/tlb.h>
47#include <asm/tlbflush.h>
48#include <asm/sections.h>
49#include <asm/setup.h>
50#include <asm/homecache.h>
51#include <hv/hypervisor.h>
52#include <arch/chip.h>
53
54#include "migrate.h"
55
56/*
57 * We could set FORCE_MAX_ZONEORDER to "(HPAGE_SHIFT - PAGE_SHIFT + 1)"
58 * in the Tile Kconfig, but this generates configure warnings.
59 * Do it here and force people to get it right to compile this file.
60 * The problem is that with 4KB small pages and 16MB huge pages,
61 * the default value doesn't allow us to group enough small pages
62 * together to make up a huge page.
63 */
64#if CONFIG_FORCE_MAX_ZONEORDER < HPAGE_SHIFT - PAGE_SHIFT + 1
65# error "Change FORCE_MAX_ZONEORDER in arch/tile/Kconfig to match page size"
66#endif
67
68#define clear_pgd(pmdptr) (*(pmdptr) = hv_pte(0))
69
70unsigned long VMALLOC_RESERVE = CONFIG_VMALLOC_RESERVE;
71
72DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
73
74/* Create an L2 page table */
75static pte_t * __init alloc_pte(void)
76{
77 return __alloc_bootmem(L2_KERNEL_PGTABLE_SIZE, HV_PAGE_TABLE_ALIGN, 0);
78}
79
80/*
81 * L2 page tables per controller. We allocate these all at once from
82 * the bootmem allocator and store them here. This saves on kernel L2
83 * page table memory, compared to allocating a full 64K page per L2
84 * page table, and also means that in cases where we use huge pages,
85 * we are guaranteed to later be able to shatter those huge pages and
86 * switch to using these page tables instead, without requiring
87 * further allocation. Each l2_ptes[] entry points to the first page
88 * table for the first hugepage-size piece of memory on the
89 * controller; other page tables are just indexed directly, i.e. the
90 * L2 page tables are contiguous in memory for each controller.
91 */
92static pte_t *l2_ptes[MAX_NUMNODES];
93static int num_l2_ptes[MAX_NUMNODES];
94
95static void init_prealloc_ptes(int node, int pages)
96{
97 BUG_ON(pages & (HV_L2_ENTRIES-1));
98 if (pages) {
99 num_l2_ptes[node] = pages;
100 l2_ptes[node] = __alloc_bootmem(pages * sizeof(pte_t),
101 HV_PAGE_TABLE_ALIGN, 0);
102 }
103}
104
105pte_t *get_prealloc_pte(unsigned long pfn)
106{
107 int node = pfn_to_nid(pfn);
108 pfn &= ~(-1UL << (NR_PA_HIGHBIT_SHIFT - PAGE_SHIFT));
109 BUG_ON(node >= MAX_NUMNODES);
110 BUG_ON(pfn >= num_l2_ptes[node]);
111 return &l2_ptes[node][pfn];
112}
113
114/*
115 * What caching do we expect pages from the heap to have when
116 * they are allocated during bootup? (Once we've installed the
117 * "real" swapper_pg_dir.)
118 */
119static int initial_heap_home(void)
120{
121#if CHIP_HAS_CBOX_HOME_MAP()
122 if (hash_default)
123 return PAGE_HOME_HASH;
124#endif
125 return smp_processor_id();
126}
127
128/*
129 * Place a pointer to an L2 page table in a middle page
130 * directory entry.
131 */
132static void __init assign_pte(pmd_t *pmd, pte_t *page_table)
133{
134 phys_addr_t pa = __pa(page_table);
135 unsigned long l2_ptfn = pa >> HV_LOG2_PAGE_TABLE_ALIGN;
136 pte_t pteval = hv_pte_set_ptfn(__pgprot(_PAGE_TABLE), l2_ptfn);
137 BUG_ON((pa & (HV_PAGE_TABLE_ALIGN-1)) != 0);
138 pteval = pte_set_home(pteval, initial_heap_home());
139 *(pte_t *)pmd = pteval;
140 if (page_table != (pte_t *)pmd_page_vaddr(*pmd))
141 BUG();
142}
143
144#ifdef __tilegx__
145
146#if HV_L1_SIZE != HV_L2_SIZE
147# error Rework assumption that L1 and L2 page tables are same size.
148#endif
149
150/* Since pmd_t arrays and pte_t arrays are the same size, just use casts. */
151static inline pmd_t *alloc_pmd(void)
152{
153 return (pmd_t *)alloc_pte();
154}
155
156static inline void assign_pmd(pud_t *pud, pmd_t *pmd)
157{
158 assign_pte((pmd_t *)pud, (pte_t *)pmd);
159}
160
161#endif /* __tilegx__ */
162
163/* Replace the given pmd with a full PTE table. */
164void __init shatter_pmd(pmd_t *pmd)
165{
166 pte_t *pte = get_prealloc_pte(pte_pfn(*(pte_t *)pmd));
167 assign_pte(pmd, pte);
168}
169
170#ifdef CONFIG_HIGHMEM
171/*
172 * This function initializes a certain range of kernel virtual memory
173 * with new bootmem page tables, everywhere page tables are missing in
174 * the given range.
175 */
176
177/*
178 * NOTE: The pagetables are allocated contiguous on the physical space
179 * so we can cache the place of the first one and move around without
180 * checking the pgd every time.
181 */
182static void __init page_table_range_init(unsigned long start,
183 unsigned long end, pgd_t *pgd_base)
184{
185 pgd_t *pgd;
186 int pgd_idx;
187 unsigned long vaddr;
188
189 vaddr = start;
190 pgd_idx = pgd_index(vaddr);
191 pgd = pgd_base + pgd_idx;
192
193 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
194 pmd_t *pmd = pmd_offset(pud_offset(pgd, vaddr), vaddr);
195 if (pmd_none(*pmd))
196 assign_pte(pmd, alloc_pte());
197 vaddr += PMD_SIZE;
198 }
199}
200#endif /* CONFIG_HIGHMEM */
201
202
203#if CHIP_HAS_CBOX_HOME_MAP()
204
205static int __initdata ktext_hash = 1; /* .text pages */
206static int __initdata kdata_hash = 1; /* .data and .bss pages */
207int __write_once hash_default = 1; /* kernel allocator pages */
208EXPORT_SYMBOL(hash_default);
209int __write_once kstack_hash = 1; /* if no homecaching, use h4h */
210#endif /* CHIP_HAS_CBOX_HOME_MAP */
211
212/*
213 * CPUs to use for striping the pages of kernel data. If hash-for-home
214 * is available, this is only relevant if kcache_hash sets up the
215 * .data and .bss to be page-homed, and we don't want the default mode
216 * of using the full set of kernel cpus for the striping.
217 */
218static __initdata struct cpumask kdata_mask;
219static __initdata int kdata_arg_seen;
220
221int __write_once kdata_huge; /* if no homecaching, small pages */
222
223
224/* Combine a generic pgprot_t with cache home to get a cache-aware pgprot. */
225static pgprot_t __init construct_pgprot(pgprot_t prot, int home)
226{
227 prot = pte_set_home(prot, home);
228#if CHIP_HAS_CBOX_HOME_MAP()
229 if (home == PAGE_HOME_IMMUTABLE) {
230 if (ktext_hash)
231 prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_HASH_L3);
232 else
233 prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_NO_L3);
234 }
235#endif
236 return prot;
237}
238
239/*
240 * For a given kernel data VA, how should it be cached?
241 * We return the complete pgprot_t with caching bits set.
242 */
243static pgprot_t __init init_pgprot(ulong address)
244{
245 int cpu;
246 unsigned long page;
247 enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
248
249#if CHIP_HAS_CBOX_HOME_MAP()
250 /* For kdata=huge, everything is just hash-for-home. */
251 if (kdata_huge)
252 return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
253#endif
254
255 /* We map the aliased pages of permanent text inaccessible. */
256 if (address < (ulong) _sinittext - CODE_DELTA)
257 return PAGE_NONE;
258
259 /*
260 * We map read-only data non-coherent for performance. We could
261 * use neighborhood caching on TILE64, but it's not clear it's a win.
262 */
263 if ((address >= (ulong) __start_rodata &&
264 address < (ulong) __end_rodata) ||
265 address == (ulong) empty_zero_page) {
266 return construct_pgprot(PAGE_KERNEL_RO, PAGE_HOME_IMMUTABLE);
267 }
268
269 /* As a performance optimization, keep the boot init stack here. */
270 if (address >= (ulong)&init_thread_union &&
271 address < (ulong)&init_thread_union + THREAD_SIZE)
272 return construct_pgprot(PAGE_KERNEL, smp_processor_id());
273
274#ifndef __tilegx__
275#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
276 /* Force the atomic_locks[] array page to be hash-for-home. */
277 if (address == (ulong) atomic_locks)
278 return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
279#endif
280#endif
281
282 /*
283 * Everything else that isn't data or bss is heap, so mark it
284 * with the initial heap home (hash-for-home, or this cpu). This
285 * includes any addresses after the loaded image; any address before
286 * _einittext (since we already captured the case of text before
287 * _sinittext); and any init-data pages.
288 *
289 * All the LOWMEM pages that we mark this way will get their
290 * struct page homecache properly marked later, in set_page_homes().
291 * The HIGHMEM pages we leave with a default zero for their
292 * homes, but with a zero free_time we don't have to actually
293 * do a flush action the first time we use them, either.
294 */
295 if (address >= (ulong) _end || address < (ulong) _sdata ||
296 (address >= (ulong) _sinitdata &&
297 address < (ulong) _einitdata))
298 return construct_pgprot(PAGE_KERNEL, initial_heap_home());
299
300#if CHIP_HAS_CBOX_HOME_MAP()
301 /* Use hash-for-home if requested for data/bss. */
302 if (kdata_hash)
303 return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
304#endif
305
306 /*
307 * Otherwise we just hand out consecutive cpus. To avoid
308 * requiring this function to hold state, we just walk forward from
309 * _sdata by PAGE_SIZE, skipping the readonly and init data, to reach
310 * the requested address, while walking cpu home around kdata_mask.
311 * This is typically no more than a dozen or so iterations.
312 */
313 BUG_ON(_einitdata != __bss_start);
314 for (page = (ulong)_sdata, cpu = NR_CPUS; ; ) {
315 cpu = cpumask_next(cpu, &kdata_mask);
316 if (cpu == NR_CPUS)
317 cpu = cpumask_first(&kdata_mask);
318 if (page >= address)
319 break;
320 page += PAGE_SIZE;
321 if (page == (ulong)__start_rodata)
322 page = (ulong)__end_rodata;
323 if (page == (ulong)&init_thread_union)
324 page += THREAD_SIZE;
325 if (page == (ulong)_sinitdata)
326 page = (ulong)_einitdata;
327 if (page == (ulong)empty_zero_page)
328 page += PAGE_SIZE;
329#ifndef __tilegx__
330#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
331 if (page == (ulong)atomic_locks)
332 page += PAGE_SIZE;
333#endif
334#endif
335
336 }
337 return construct_pgprot(PAGE_KERNEL, cpu);
338}
339
340/*
341 * This function sets up how we cache the kernel text. If we have
342 * hash-for-home support, normally that is used instead (see the
343 * kcache_hash boot flag for more information). But if we end up
344 * using a page-based caching technique, this option sets up the
345 * details of that. In addition, the "ktext=nocache" option may
346 * always be used to disable local caching of text pages, if desired.
347 */
348
349static int __initdata ktext_arg_seen;
350static int __initdata ktext_small;
351static int __initdata ktext_local;
352static int __initdata ktext_all;
353static int __initdata ktext_nondataplane;
354static int __initdata ktext_nocache;
355static struct cpumask __initdata ktext_mask;
356
357static int __init setup_ktext(char *str)
358{
359 if (str == NULL)
360 return -EINVAL;
361
362 /* If you have a leading "nocache", turn off ktext caching */
363 if (strncmp(str, "nocache", 7) == 0) {
364 ktext_nocache = 1;
365 printk("ktext: disabling local caching of kernel text\n");
366 str += 7;
367 if (*str == ',')
368 ++str;
369 if (*str == '\0')
370 return 0;
371 }
372
373 ktext_arg_seen = 1;
374
375 /* Default setting on Tile64: use a huge page */
376 if (strcmp(str, "huge") == 0)
377 printk("ktext: using one huge locally cached page\n");
378
379 /* Pay TLB cost but get no cache benefit: cache small pages locally */
380 else if (strcmp(str, "local") == 0) {
381 ktext_small = 1;
382 ktext_local = 1;
383 printk("ktext: using small pages with local caching\n");
384 }
385
386 /* Neighborhood cache ktext pages on all cpus. */
387 else if (strcmp(str, "all") == 0) {
388 ktext_small = 1;
389 ktext_all = 1;
390 printk("ktext: using maximal caching neighborhood\n");
391 }
392
393
394 /* Neighborhood ktext pages on specified mask */
395 else if (cpulist_parse(str, &ktext_mask) == 0) {
396 char buf[NR_CPUS * 5];
397 cpulist_scnprintf(buf, sizeof(buf), &ktext_mask);
398 if (cpumask_weight(&ktext_mask) > 1) {
399 ktext_small = 1;
400 printk("ktext: using caching neighborhood %s "
401 "with small pages\n", buf);
402 } else {
403 printk("ktext: caching on cpu %s with one huge page\n",
404 buf);
405 }
406 }
407
408 else if (*str)
409 return -EINVAL;
410
411 return 0;
412}
413
414early_param("ktext", setup_ktext);
415
416
417static inline pgprot_t ktext_set_nocache(pgprot_t prot)
418{
419 if (!ktext_nocache)
420 prot = hv_pte_set_nc(prot);
421#if CHIP_HAS_NC_AND_NOALLOC_BITS()
422 else
423 prot = hv_pte_set_no_alloc_l2(prot);
424#endif
425 return prot;
426}
427
428#ifndef __tilegx__
429static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va)
430{
431 return pmd_offset(pud_offset(&pgtables[pgd_index(va)], va), va);
432}
433#else
434static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va)
435{
436 pud_t *pud = pud_offset(&pgtables[pgd_index(va)], va);
437 if (pud_none(*pud))
438 assign_pmd(pud, alloc_pmd());
439 return pmd_offset(pud, va);
440}
441#endif
442
443/* Temporary page table we use for staging. */
444static pgd_t pgtables[PTRS_PER_PGD]
445 __attribute__((section(".init.page")));
446
447/*
448 * This maps the physical memory to kernel virtual address space, a total
449 * of max_low_pfn pages, by creating page tables starting from address
450 * PAGE_OFFSET.
451 *
452 * This routine transitions us from using a set of compiled-in large
453 * pages to using some more precise caching, including removing access
454 * to code pages mapped at PAGE_OFFSET (executed only at MEM_SV_START),
455 * marking read-only data as locally cacheable, striping the remaining
456 * .data and .bss across all the available tiles, and removing access
457 * to pages above the top of RAM (thus ensuring a page fault from a bad
458 * virtual address rather than a hypervisor shoot down for accessing
459 * memory outside the assigned limits).
460 */
461static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
462{
463 unsigned long address, pfn;
464 pmd_t *pmd;
465 pte_t *pte;
466 int pte_ofs;
467 const struct cpumask *my_cpu_mask = cpumask_of(smp_processor_id());
468 struct cpumask kstripe_mask;
469 int rc, i;
470
471#if CHIP_HAS_CBOX_HOME_MAP()
472 if (ktext_arg_seen && ktext_hash) {
473 printk("warning: \"ktext\" boot argument ignored"
474 " if \"kcache_hash\" sets up text hash-for-home\n");
475 ktext_small = 0;
476 }
477
478 if (kdata_arg_seen && kdata_hash) {
479 printk("warning: \"kdata\" boot argument ignored"
480 " if \"kcache_hash\" sets up data hash-for-home\n");
481 }
482
483 if (kdata_huge && !hash_default) {
484 printk("warning: disabling \"kdata=huge\"; requires"
485 " kcache_hash=all or =allbutstack\n");
486 kdata_huge = 0;
487 }
488#endif
489
490 /*
491 * Set up a mask for cpus to use for kernel striping.
492 * This is normally all cpus, but minus dataplane cpus if any.
493 * If the dataplane covers the whole chip, we stripe over
494 * the whole chip too.
495 */
496 cpumask_copy(&kstripe_mask, cpu_possible_mask);
497 if (!kdata_arg_seen)
498 kdata_mask = kstripe_mask;
499
500 /* Allocate and fill in L2 page tables */
501 for (i = 0; i < MAX_NUMNODES; ++i) {
502#ifdef CONFIG_HIGHMEM
503 unsigned long end_pfn = node_lowmem_end_pfn[i];
504#else
505 unsigned long end_pfn = node_end_pfn[i];
506#endif
507 unsigned long end_huge_pfn = 0;
508
509 /* Pre-shatter the last huge page to allow per-cpu pages. */
510 if (kdata_huge)
511 end_huge_pfn = end_pfn - (HPAGE_SIZE >> PAGE_SHIFT);
512
513 pfn = node_start_pfn[i];
514
515 /* Allocate enough memory to hold L2 page tables for node. */
516 init_prealloc_ptes(i, end_pfn - pfn);
517
518 address = (unsigned long) pfn_to_kaddr(pfn);
519 while (pfn < end_pfn) {
520 BUG_ON(address & (HPAGE_SIZE-1));
521 pmd = get_pmd(pgtables, address);
522 pte = get_prealloc_pte(pfn);
523 if (pfn < end_huge_pfn) {
524 pgprot_t prot = init_pgprot(address);
525 *(pte_t *)pmd = pte_mkhuge(pfn_pte(pfn, prot));
526 for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE;
527 pfn++, pte_ofs++, address += PAGE_SIZE)
528 pte[pte_ofs] = pfn_pte(pfn, prot);
529 } else {
530 if (kdata_huge)
531 printk(KERN_DEBUG "pre-shattered huge"
532 " page at %#lx\n", address);
533 for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE;
534 pfn++, pte_ofs++, address += PAGE_SIZE) {
535 pgprot_t prot = init_pgprot(address);
536 pte[pte_ofs] = pfn_pte(pfn, prot);
537 }
538 assign_pte(pmd, pte);
539 }
540 }
541 }
542
543 /*
544 * Set or check ktext_map now that we have cpu_possible_mask
545 * and kstripe_mask to work with.
546 */
547 if (ktext_all)
548 cpumask_copy(&ktext_mask, cpu_possible_mask);
549 else if (ktext_nondataplane)
550 ktext_mask = kstripe_mask;
551 else if (!cpumask_empty(&ktext_mask)) {
552 /* Sanity-check any mask that was requested */
553 struct cpumask bad;
554 cpumask_andnot(&bad, &ktext_mask, cpu_possible_mask);
555 cpumask_and(&ktext_mask, &ktext_mask, cpu_possible_mask);
556 if (!cpumask_empty(&bad)) {
557 char buf[NR_CPUS * 5];
558 cpulist_scnprintf(buf, sizeof(buf), &bad);
559 printk("ktext: not using unavailable cpus %s\n", buf);
560 }
561 if (cpumask_empty(&ktext_mask)) {
562 printk("ktext: no valid cpus; caching on %d.\n",
563 smp_processor_id());
564 cpumask_copy(&ktext_mask,
565 cpumask_of(smp_processor_id()));
566 }
567 }
568
569 address = MEM_SV_INTRPT;
570 pmd = get_pmd(pgtables, address);
571 if (ktext_small) {
572 /* Allocate an L2 PTE for the kernel text */
573 int cpu = 0;
574 pgprot_t prot = construct_pgprot(PAGE_KERNEL_EXEC,
575 PAGE_HOME_IMMUTABLE);
576
577 if (ktext_local) {
578 if (ktext_nocache)
579 prot = hv_pte_set_mode(prot,
580 HV_PTE_MODE_UNCACHED);
581 else
582 prot = hv_pte_set_mode(prot,
583 HV_PTE_MODE_CACHE_NO_L3);
584 } else {
585 prot = hv_pte_set_mode(prot,
586 HV_PTE_MODE_CACHE_TILE_L3);
587 cpu = cpumask_first(&ktext_mask);
588
589 prot = ktext_set_nocache(prot);
590 }
591
592 BUG_ON(address != (unsigned long)_stext);
593 pfn = 0; /* code starts at PA 0 */
594 pte = alloc_pte();
595 for (pte_ofs = 0; address < (unsigned long)_einittext;
596 pfn++, pte_ofs++, address += PAGE_SIZE) {
597 if (!ktext_local) {
598 prot = set_remote_cache_cpu(prot, cpu);
599 cpu = cpumask_next(cpu, &ktext_mask);
600 if (cpu == NR_CPUS)
601 cpu = cpumask_first(&ktext_mask);
602 }
603 pte[pte_ofs] = pfn_pte(pfn, prot);
604 }
605 assign_pte(pmd, pte);
606 } else {
607 pte_t pteval = pfn_pte(0, PAGE_KERNEL_EXEC);
608 pteval = pte_mkhuge(pteval);
609#if CHIP_HAS_CBOX_HOME_MAP()
610 if (ktext_hash) {
611 pteval = hv_pte_set_mode(pteval,
612 HV_PTE_MODE_CACHE_HASH_L3);
613 pteval = ktext_set_nocache(pteval);
614 } else
615#endif /* CHIP_HAS_CBOX_HOME_MAP() */
616 if (cpumask_weight(&ktext_mask) == 1) {
617 pteval = set_remote_cache_cpu(pteval,
618 cpumask_first(&ktext_mask));
619 pteval = hv_pte_set_mode(pteval,
620 HV_PTE_MODE_CACHE_TILE_L3);
621 pteval = ktext_set_nocache(pteval);
622 } else if (ktext_nocache)
623 pteval = hv_pte_set_mode(pteval,
624 HV_PTE_MODE_UNCACHED);
625 else
626 pteval = hv_pte_set_mode(pteval,
627 HV_PTE_MODE_CACHE_NO_L3);
628 *(pte_t *)pmd = pteval;
629 }
630
631 /* Set swapper_pgprot here so it is flushed to memory right away. */
632 swapper_pgprot = init_pgprot((unsigned long)swapper_pg_dir);
633
634 /*
635 * Since we may be changing the caching of the stack and page
636 * table itself, we invoke an assembly helper to do the
637 * following steps:
638 *
639 * - flush the cache so we start with an empty slate
640 * - install pgtables[] as the real page table
641 * - flush the TLB so the new page table takes effect
642 */
643 rc = flush_and_install_context(__pa(pgtables),
644 init_pgprot((unsigned long)pgtables),
645 __get_cpu_var(current_asid),
646 cpumask_bits(my_cpu_mask));
647 BUG_ON(rc != 0);
648
649 /* Copy the page table back to the normal swapper_pg_dir. */
650 memcpy(pgd_base, pgtables, sizeof(pgtables));
651 __install_page_table(pgd_base, __get_cpu_var(current_asid),
652 swapper_pgprot);
653}
654
655/*
656 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
657 * is valid. The argument is a physical page number.
658 *
659 * On Tile, the only valid things for which we can just hand out unchecked
660 * PTEs are the kernel code and data. Anything else might change its
661 * homing with time, and we wouldn't know to adjust the /dev/mem PTEs.
662 * Note that init_thread_union is released to heap soon after boot,
663 * so we include it in the init data.
664 *
665 * For TILE-Gx, we might want to consider allowing access to PA
666 * regions corresponding to PCI space, etc.
667 */
668int devmem_is_allowed(unsigned long pagenr)
669{
670 return pagenr < kaddr_to_pfn(_end) &&
671 !(pagenr >= kaddr_to_pfn(&init_thread_union) ||
672 pagenr < kaddr_to_pfn(_einitdata)) &&
673 !(pagenr >= kaddr_to_pfn(_sinittext) ||
674 pagenr <= kaddr_to_pfn(_einittext-1));
675}
676
677#ifdef CONFIG_HIGHMEM
678static void __init permanent_kmaps_init(pgd_t *pgd_base)
679{
680 pgd_t *pgd;
681 pud_t *pud;
682 pmd_t *pmd;
683 pte_t *pte;
684 unsigned long vaddr;
685
686 vaddr = PKMAP_BASE;
687 page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
688
689 pgd = swapper_pg_dir + pgd_index(vaddr);
690 pud = pud_offset(pgd, vaddr);
691 pmd = pmd_offset(pud, vaddr);
692 pte = pte_offset_kernel(pmd, vaddr);
693 pkmap_page_table = pte;
694}
695#endif /* CONFIG_HIGHMEM */
696
697
698static void __init init_free_pfn_range(unsigned long start, unsigned long end)
699{
700 unsigned long pfn;
701 struct page *page = pfn_to_page(start);
702
703 for (pfn = start; pfn < end; ) {
704 /* Optimize by freeing pages in large batches */
705 int order = __ffs(pfn);
706 int count, i;
707 struct page *p;
708
709 if (order >= MAX_ORDER)
710 order = MAX_ORDER-1;
711 count = 1 << order;
712 while (pfn + count > end) {
713 count >>= 1;
714 --order;
715 }
716 for (p = page, i = 0; i < count; ++i, ++p) {
717 __ClearPageReserved(p);
718 /*
719 * Hacky direct set to avoid unnecessary
720 * lock take/release for EVERY page here.
721 */
722 p->_count.counter = 0;
723 p->_mapcount.counter = -1;
724 }
725 init_page_count(page);
726 __free_pages(page, order);
727 totalram_pages += count;
728
729 page += count;
730 pfn += count;
731 }
732}
733
734static void __init set_non_bootmem_pages_init(void)
735{
736 struct zone *z;
737 for_each_zone(z) {
738 unsigned long start, end;
739 int nid = z->zone_pgdat->node_id;
740
741 start = z->zone_start_pfn;
742 if (start == 0)
743 continue; /* bootmem */
744 end = start + z->spanned_pages;
745 if (zone_idx(z) == ZONE_NORMAL) {
746 BUG_ON(start != node_start_pfn[nid]);
747 start = node_free_pfn[nid];
748 }
749#ifdef CONFIG_HIGHMEM
750 if (zone_idx(z) == ZONE_HIGHMEM)
751 totalhigh_pages += z->spanned_pages;
752#endif
753 if (kdata_huge) {
754 unsigned long percpu_pfn = node_percpu_pfn[nid];
755 if (start < percpu_pfn && end > percpu_pfn)
756 end = percpu_pfn;
757 }
758#ifdef CONFIG_PCI
759 if (start <= pci_reserve_start_pfn &&
760 end > pci_reserve_start_pfn) {
761 if (end > pci_reserve_end_pfn)
762 init_free_pfn_range(pci_reserve_end_pfn, end);
763 end = pci_reserve_start_pfn;
764 }
765#endif
766 init_free_pfn_range(start, end);
767 }
768}
769
770/*
771 * paging_init() sets up the page tables - note that all of lowmem is
772 * already mapped by head.S.
773 */
774void __init paging_init(void)
775{
776#ifdef CONFIG_HIGHMEM
777 unsigned long vaddr, end;
778#endif
779#ifdef __tilegx__
780 pud_t *pud;
781#endif
782 pgd_t *pgd_base = swapper_pg_dir;
783
784 kernel_physical_mapping_init(pgd_base);
785
786#ifdef CONFIG_HIGHMEM
787 /*
788 * Fixed mappings, only the page table structure has to be
789 * created - mappings will be set by set_fixmap():
790 */
791 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
792 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
793 page_table_range_init(vaddr, end, pgd_base);
794 permanent_kmaps_init(pgd_base);
795#endif
796
797#ifdef __tilegx__
798 /*
799 * Since GX allocates just one pmd_t array worth of vmalloc space,
800 * we go ahead and allocate it statically here, then share it
801 * globally. As a result we don't have to worry about any task
802 * changing init_mm once we get up and running, and there's no
803 * need for e.g. vmalloc_sync_all().
804 */
805 BUILD_BUG_ON(pgd_index(VMALLOC_START) != pgd_index(VMALLOC_END));
806 pud = pud_offset(pgd_base + pgd_index(VMALLOC_START), VMALLOC_START);
807 assign_pmd(pud, alloc_pmd());
808#endif
809}
810
811
812/*
813 * Walk the kernel page tables and derive the page_home() from
814 * the PTEs, so that set_pte() can properly validate the caching
815 * of all PTEs it sees.
816 */
817void __init set_page_homes(void)
818{
819}
820
821static void __init set_max_mapnr_init(void)
822{
823#ifdef CONFIG_FLATMEM
824 max_mapnr = max_low_pfn;
825#endif
826}
827
828void __init mem_init(void)
829{
830 int codesize, datasize, initsize;
831 int i;
832#ifndef __tilegx__
833 void *last;
834#endif
835
836#ifdef CONFIG_FLATMEM
837 if (!mem_map)
838 BUG();
839#endif
840
841#ifdef CONFIG_HIGHMEM
842 /* check that fixmap and pkmap do not overlap */
843 if (PKMAP_ADDR(LAST_PKMAP-1) >= FIXADDR_START) {
844 printk(KERN_ERR "fixmap and kmap areas overlap"
845 " - this will crash\n");
846 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
847 PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP-1),
848 FIXADDR_START);
849 BUG();
850 }
851#endif
852
853 set_max_mapnr_init();
854
855 /* this will put all bootmem onto the freelists */
856 totalram_pages += free_all_bootmem();
857
858 /* count all remaining LOWMEM and give all HIGHMEM to page allocator */
859 set_non_bootmem_pages_init();
860
861 codesize = (unsigned long)&_etext - (unsigned long)&_text;
862 datasize = (unsigned long)&_end - (unsigned long)&_sdata;
863 initsize = (unsigned long)&_einittext - (unsigned long)&_sinittext;
864 initsize += (unsigned long)&_einitdata - (unsigned long)&_sinitdata;
865
866 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk data, %dk init, %ldk highmem)\n",
867 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
868 num_physpages << (PAGE_SHIFT-10),
869 codesize >> 10,
870 datasize >> 10,
871 initsize >> 10,
872 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
873 );
874
875 /*
876 * In debug mode, dump some interesting memory mappings.
877 */
878#ifdef CONFIG_HIGHMEM
879 printk(KERN_DEBUG " KMAP %#lx - %#lx\n",
880 FIXADDR_START, FIXADDR_TOP + PAGE_SIZE - 1);
881 printk(KERN_DEBUG " PKMAP %#lx - %#lx\n",
882 PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP) - 1);
883#endif
884#ifdef CONFIG_HUGEVMAP
885 printk(KERN_DEBUG " HUGEMAP %#lx - %#lx\n",
886 HUGE_VMAP_BASE, HUGE_VMAP_END - 1);
887#endif
888 printk(KERN_DEBUG " VMALLOC %#lx - %#lx\n",
889 _VMALLOC_START, _VMALLOC_END - 1);
890#ifdef __tilegx__
891 for (i = MAX_NUMNODES-1; i >= 0; --i) {
892 struct pglist_data *node = &node_data[i];
893 if (node->node_present_pages) {
894 unsigned long start = (unsigned long)
895 pfn_to_kaddr(node->node_start_pfn);
896 unsigned long end = start +
897 (node->node_present_pages << PAGE_SHIFT);
898 printk(KERN_DEBUG " MEM%d %#lx - %#lx\n",
899 i, start, end - 1);
900 }
901 }
902#else
903 last = high_memory;
904 for (i = MAX_NUMNODES-1; i >= 0; --i) {
905 if ((unsigned long)vbase_map[i] != -1UL) {
906 printk(KERN_DEBUG " LOWMEM%d %#lx - %#lx\n",
907 i, (unsigned long) (vbase_map[i]),
908 (unsigned long) (last-1));
909 last = vbase_map[i];
910 }
911 }
912#endif
913
914#ifndef __tilegx__
915 /*
916 * Convert from using one lock for all atomic operations to
917 * one per cpu.
918 */
919 __init_atomic_per_cpu();
920#endif
921}
922
923/*
924 * This is for the non-NUMA, single-node SMP system case.
925 * As on x86, for now we always add hot-plugged memory to
926 * the highmem zone.
927 */
928#ifndef CONFIG_NEED_MULTIPLE_NODES
929int arch_add_memory(u64 start, u64 size)
930{
931 struct pglist_data *pgdata = &contig_page_data;
932 struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
933 unsigned long start_pfn = start >> PAGE_SHIFT;
934 unsigned long nr_pages = size >> PAGE_SHIFT;
935
936 return __add_pages(zone, start_pfn, nr_pages);
937}
938
939int remove_memory(u64 start, u64 size)
940{
941 return -EINVAL;
942}
943#endif
944
945struct kmem_cache *pgd_cache;
946
947void __init pgtable_cache_init(void)
948{
949 pgd_cache = kmem_cache_create("pgd",
950 PTRS_PER_PGD*sizeof(pgd_t),
951 PTRS_PER_PGD*sizeof(pgd_t),
952 0,
953 NULL);
954 if (!pgd_cache)
955 panic("pgtable_cache_init(): Cannot create pgd cache");
956}
957
958#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
959/*
960 * The __w1data area holds data that is only written during initialization,
961 * and is read-only and thus freely cacheable thereafter. Fix the page
962 * table entries that cover that region accordingly.
963 */
964static void mark_w1data_ro(void)
965{
966 /* Loop over page table entries */
967 unsigned long addr = (unsigned long)__w1data_begin;
968 BUG_ON((addr & (PAGE_SIZE-1)) != 0);
969 for (; addr <= (unsigned long)__w1data_end - 1; addr += PAGE_SIZE) {
970 unsigned long pfn = kaddr_to_pfn((void *)addr);
971 struct page *page = pfn_to_page(pfn);
972 pte_t *ptep = virt_to_pte(NULL, addr);
973 BUG_ON(pte_huge(*ptep)); /* not relevant for kdata_huge */
974 set_pte_at(&init_mm, addr, ptep, pfn_pte(pfn, PAGE_KERNEL_RO));
975 }
976}
977#endif
978
979#ifdef CONFIG_DEBUG_PAGEALLOC
980static long __write_once initfree;
981#else
982static long __write_once initfree = 1;
983#endif
984
985/* Select whether to free (1) or mark unusable (0) the __init pages. */
986static int __init set_initfree(char *str)
987{
988 strict_strtol(str, 0, &initfree);
989 printk("initfree: %s free init pages\n", initfree ? "will" : "won't");
990 return 1;
991}
992__setup("initfree=", set_initfree);
993
994static void free_init_pages(char *what, unsigned long begin, unsigned long end)
995{
996 unsigned long addr = (unsigned long) begin;
997
998 if (kdata_huge && !initfree) {
999 printk("Warning: ignoring initfree=0:"
1000 " incompatible with kdata=huge\n");
1001 initfree = 1;
1002 }
1003 end = (end + PAGE_SIZE - 1) & PAGE_MASK;
1004 local_flush_tlb_pages(NULL, begin, PAGE_SIZE, end - begin);
1005 for (addr = begin; addr < end; addr += PAGE_SIZE) {
1006 /*
1007 * Note we just reset the home here directly in the
1008 * page table. We know this is safe because our caller
1009 * just flushed the caches on all the other cpus,
1010 * and they won't be touching any of these pages.
1011 */
1012 int pfn = kaddr_to_pfn((void *)addr);
1013 struct page *page = pfn_to_page(pfn);
1014 pte_t *ptep = virt_to_pte(NULL, addr);
1015 if (!initfree) {
1016 /*
1017 * If debugging page accesses then do not free
1018 * this memory but mark them not present - any
1019 * buggy init-section access will create a
1020 * kernel page fault:
1021 */
1022 pte_clear(&init_mm, addr, ptep);
1023 continue;
1024 }
1025 __ClearPageReserved(page);
1026 init_page_count(page);
1027 if (pte_huge(*ptep))
1028 BUG_ON(!kdata_huge);
1029 else
1030 set_pte_at(&init_mm, addr, ptep,
1031 pfn_pte(pfn, PAGE_KERNEL));
1032 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
1033 free_page(addr);
1034 totalram_pages++;
1035 }
1036 printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
1037}
1038
1039void free_initmem(void)
1040{
1041 const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET;
1042
1043 /*
1044 * Evict the dirty initdata on the boot cpu, evict the w1data
1045 * wherever it's homed, and evict all the init code everywhere.
1046 * We are guaranteed that no one will touch the init pages any
1047 * more, and although other cpus may be touching the w1data,
1048 * we only actually change the caching on tile64, which won't
1049 * be keeping local copies in the other tiles' caches anyway.
1050 */
1051 homecache_evict(&cpu_cacheable_map);
1052
1053 /* Free the data pages that we won't use again after init. */
1054 free_init_pages("unused kernel data",
1055 (unsigned long)_sinitdata,
1056 (unsigned long)_einitdata);
1057
1058 /*
1059 * Free the pages mapped from 0xc0000000 that correspond to code
1060 * pages from 0xfd000000 that we won't use again after init.
1061 */
1062 free_init_pages("unused kernel text",
1063 (unsigned long)_sinittext - text_delta,
1064 (unsigned long)_einittext - text_delta);
1065
1066#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
1067 /*
1068 * Upgrade the .w1data section to globally cached.
1069 * We don't do this on tilepro, since the cache architecture
1070 * pretty much makes it irrelevant, and in any case we end
1071 * up having racing issues with other tiles that may touch
1072 * the data after we flush the cache but before we update
1073 * the PTEs and flush the TLBs, causing sharer shootdowns
1074 * later. Even though this is to clean data, it seems like
1075 * an unnecessary complication.
1076 */
1077 mark_w1data_ro();
1078#endif
1079
1080 /* Do a global TLB flush so everyone sees the changes. */
1081 flush_tlb_all();
1082}
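
init.c also gives the administrator two boot-time knobs parsed in this file:
"ktext=" (how kernel text is cached) and "initfree=" (whether the __init
sections are freed).  An illustrative command-line fragment combining them:

	ktext=nocache,1-3 initfree=0

which disables local caching of kernel text, neighborhood-caches it on cpus
1-3 with small pages, and leaves the init pages unmapped rather than freed so
that stray accesses fault; as free_init_pages() warns, initfree=0 is forced
back to 1 when kdata=huge is in effect.
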
diff --git a/arch/tile/mm/migrate.h b/arch/tile/mm/migrate.h
new file mode 100644
index 000000000000..cd45a0837fa6
--- /dev/null
+++ b/arch/tile/mm/migrate.h
@@ -0,0 +1,50 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 *
14 * Structure definitions for migration, exposed here for use by
15 * arch/tile/kernel/asm-offsets.c.
16 */
17
18#ifndef MM_MIGRATE_H
19#define MM_MIGRATE_H
20
21#include <linux/cpumask.h>
22#include <hv/hypervisor.h>
23
24/*
25 * This function is used as a helper when setting up the initial
26 * page table (swapper_pg_dir).
27 */
28extern int flush_and_install_context(HV_PhysAddr page_table, HV_PTE access,
29 HV_ASID asid,
30 const unsigned long *cpumask);
31
32/*
33 * This function supports migration as a "helper" as follows:
34 *
35 * - Set the stack PTE itself to "migrating".
36 * - Do a global TLB flush for (va,length) and the specified ASIDs.
37 * - Do a cache-evict on all necessary cpus.
38 * - Write the new stack PTE.
39 *
40 * Note that any non-NULL pointers must not point to the page that
41 * is handled by the stack_pte itself.
42 */
43extern int homecache_migrate_stack_and_flush(pte_t stack_pte, unsigned long va,
44 size_t length, pte_t *stack_ptep,
45 const struct cpumask *cache_cpumask,
46 const struct cpumask *tlb_cpumask,
47 HV_Remote_ASID *asids,
48 int asidcount);
49
50#endif /* MM_MIGRATE_H */
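
For concreteness, the caller of flush_and_install_context() in this patch is
kernel_physical_mapping_init() in init.c, which invokes it roughly as:

	rc = flush_and_install_context(__pa(pgtables),
				       init_pgprot((unsigned long)pgtables),
				       __get_cpu_var(current_asid),
				       cpumask_bits(cpumask_of(smp_processor_id())));
	BUG_ON(rc != 0);

i.e. the physical address of the new page table, a PTE describing how the
hypervisor should access it, the current ASID, and a one-cpu mask.
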
diff --git a/arch/tile/mm/migrate_32.S b/arch/tile/mm/migrate_32.S
new file mode 100644
index 000000000000..f738765cd1e6
--- /dev/null
+++ b/arch/tile/mm/migrate_32.S
@@ -0,0 +1,211 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 *
14 * This routine is a helper for migrating the home of a set of pages to
15 * a new cpu. See the documentation in homecache.c for more information.
16 */
17
18#include <linux/linkage.h>
19#include <linux/threads.h>
20#include <asm/page.h>
21#include <asm/types.h>
22#include <asm/asm-offsets.h>
23#include <hv/hypervisor.h>
24
25 .text
26
27/*
28 * First, some definitions that apply to all the code in the file.
29 */
30
31/* Locals (caller-save) */
32#define r_tmp r10
33#define r_save_sp r11
34
35/* What we save where in the stack frame; must include all callee-saves. */
36#define FRAME_SP 4
37#define FRAME_R30 8
38#define FRAME_R31 12
39#define FRAME_R32 16
40#define FRAME_R33 20
41#define FRAME_R34 24
42#define FRAME_R35 28
43#define FRAME_SIZE 32
44
45
46
47
48/*
49 * On entry:
50 *
51 * r0 low word of the new context PA to install (moved to r_context_lo)
52 * r1 high word of the new context PA to install (moved to r_context_hi)
53 * r2 low word of PTE to use for context access (moved to r_access_lo)
54 * r3 high word of PTE to use for context access (moved to r_access_lo)
55 * r4 ASID to use for new context (moved to r_asid)
56 * r5 pointer to cpumask with just this cpu set in it (r_my_cpumask)
57 */
58
59/* Arguments (caller-save) */
60#define r_context_lo_in r0
61#define r_context_hi_in r1
62#define r_access_lo_in r2
63#define r_access_hi_in r3
64#define r_asid_in r4
65#define r_my_cpumask r5
66
67/* Locals (callee-save); must not be more than FRAME_xxx above. */
68#define r_save_ics r30
69#define r_context_lo r31
70#define r_context_hi r32
71#define r_access_lo r33
72#define r_access_hi r34
73#define r_asid r35
74
75STD_ENTRY(flush_and_install_context)
76 /*
77 * Create a stack frame; we can't touch it once we flush the
78 * cache until we install the new page table and flush the TLB.
79 */
80 {
81 move r_save_sp, sp
82 sw sp, lr
83 addi sp, sp, -FRAME_SIZE
84 }
85 addi r_tmp, sp, FRAME_SP
86 {
87 sw r_tmp, r_save_sp
88 addi r_tmp, sp, FRAME_R30
89 }
90 {
91 sw r_tmp, r30
92 addi r_tmp, sp, FRAME_R31
93 }
94 {
95 sw r_tmp, r31
96 addi r_tmp, sp, FRAME_R32
97 }
98 {
99 sw r_tmp, r32
100 addi r_tmp, sp, FRAME_R33
101 }
102 {
103 sw r_tmp, r33
104 addi r_tmp, sp, FRAME_R34
105 }
106 {
107 sw r_tmp, r34
108 addi r_tmp, sp, FRAME_R35
109 }
110 sw r_tmp, r35
111
112 /* Move some arguments to callee-save registers. */
113 {
114 move r_context_lo, r_context_lo_in
115 move r_context_hi, r_context_hi_in
116 }
117 {
118 move r_access_lo, r_access_lo_in
119 move r_access_hi, r_access_hi_in
120 }
121 move r_asid, r_asid_in
122
123 /* Disable interrupts, since we can't use our stack. */
124 {
125 mfspr r_save_ics, INTERRUPT_CRITICAL_SECTION
126 movei r_tmp, 1
127 }
128 mtspr INTERRUPT_CRITICAL_SECTION, r_tmp
129
130 /* First, flush our L2 cache. */
131 {
132 move r0, zero /* cache_pa */
133 move r1, zero
134 }
135 {
136 auli r2, zero, ha16(HV_FLUSH_EVICT_L2) /* cache_control */
137 move r3, r_my_cpumask /* cache_cpumask */
138 }
139 {
140 move r4, zero /* tlb_va */
141 move r5, zero /* tlb_length */
142 }
143 {
144 move r6, zero /* tlb_pgsize */
145 move r7, zero /* tlb_cpumask */
146 }
147 {
148 move r8, zero /* asids */
149 move r9, zero /* asidcount */
150 }
151 jal hv_flush_remote
152 bnz r0, .Ldone
153
154 /* Now install the new page table. */
155 {
156 move r0, r_context_lo
157 move r1, r_context_hi
158 }
159 {
160 move r2, r_access_lo
161 move r3, r_access_hi
162 }
163 {
164 move r4, r_asid
165 movei r5, HV_CTX_DIRECTIO
166 }
167 jal hv_install_context
168 bnz r0, .Ldone
169
170 /* Finally, flush the TLB. */
171 {
172 movei r0, 0 /* preserve_global */
173 jal hv_flush_all
174 }
175
176.Ldone:
177 /* Reset interrupts back how they were before. */
178 mtspr INTERRUPT_CRITICAL_SECTION, r_save_ics
179
180 /* Restore the callee-saved registers and return. */
181 addli lr, sp, FRAME_SIZE
182 {
183 lw lr, lr
184 addli r_tmp, sp, FRAME_R30
185 }
186 {
187 lw r30, r_tmp
188 addli r_tmp, sp, FRAME_R31
189 }
190 {
191 lw r31, r_tmp
192 addli r_tmp, sp, FRAME_R32
193 }
194 {
195 lw r32, r_tmp
196 addli r_tmp, sp, FRAME_R33
197 }
198 {
199 lw r33, r_tmp
200 addli r_tmp, sp, FRAME_R34
201 }
202 {
203 lw r34, r_tmp
204 addli r_tmp, sp, FRAME_R35
205 }
206 {
207 lw r35, r_tmp
208 addi sp, sp, FRAME_SIZE
209 }
210 jrp lr
211 STD_ENDPROC(flush_and_install_context)
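
For readers matching up the register assignments above with the hypervisor
ABI: the first call is just a full L2 evict on this cpu with no TLB work
requested, roughly equivalent (a sketch based on the register comments, not a
verbatim prototype) to

	hv_flush_remote(0, HV_FLUSH_EVICT_L2, my_cpumask,
			0, 0, 0, NULL, NULL, 0);

after which hv_install_context() installs the new page table and
hv_flush_all(0) flushes the TLB without preserving global entries.
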
diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c
new file mode 100644
index 000000000000..f96f4cec602a
--- /dev/null
+++ b/arch/tile/mm/mmap.c
@@ -0,0 +1,75 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 *
14 * Taken from the i386 architecture and simplified.
15 */
16
17#include <linux/mm.h>
18#include <linux/random.h>
19#include <linux/limits.h>
20#include <linux/sched.h>
21#include <linux/mman.h>
22#include <linux/compat.h>
23
24/*
25 * Top of mmap area (just below the process stack).
26 *
27 * Leave at least a ~128 MB hole.
28 */
29#define MIN_GAP (128*1024*1024)
30#define MAX_GAP (TASK_SIZE/6*5)
31
32static inline unsigned long mmap_base(struct mm_struct *mm)
33{
34 unsigned long gap = rlimit(RLIMIT_STACK);
35 unsigned long random_factor = 0;
36
37 if (current->flags & PF_RANDOMIZE)
38 random_factor = get_random_int() % (1024*1024);
39
40 if (gap < MIN_GAP)
41 gap = MIN_GAP;
42 else if (gap > MAX_GAP)
43 gap = MAX_GAP;
44
45 return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
46}
47
48/*
49 * This function, called very early during the creation of a new
50 * process VM image, sets up which VM layout function to use:
51 */
52void arch_pick_mmap_layout(struct mm_struct *mm)
53{
54#if !defined(__tilegx__)
55 int is_32bit = 1;
56#elif defined(CONFIG_COMPAT)
57 int is_32bit = is_compat_task();
58#else
59 int is_32bit = 0;
60#endif
61
62 /*
63 * Use standard layout if the expected stack growth is unlimited
64 * or we are running native 64 bits.
65 */
66 if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) {
67 mm->mmap_base = TASK_UNMAPPED_BASE;
68 mm->get_unmapped_area = arch_get_unmapped_area;
69 mm->unmap_area = arch_unmap_area;
70 } else {
71 mm->mmap_base = mmap_base(mm);
72 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
73 mm->unmap_area = arch_unmap_area_topdown;
74 }
75}
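
As a worked example of mmap_base(): with the common 8 MB stack rlimit the
gap is raised to MIN_GAP (128 MB), so with PF_RANDOMIZE set the mmap area
tops out at roughly

	PAGE_ALIGN(TASK_SIZE - 128 MB - random_factor)

where random_factor is 0..1 MB; only a stack rlimit above 128 MB (capped at
5/6 of TASK_SIZE) pushes the base lower.
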
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
new file mode 100644
index 000000000000..289e729bbd76
--- /dev/null
+++ b/arch/tile/mm/pgtable.c
@@ -0,0 +1,566 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/sched.h>
16#include <linux/kernel.h>
17#include <linux/errno.h>
18#include <linux/mm.h>
19#include <linux/swap.h>
20#include <linux/smp.h>
21#include <linux/highmem.h>
22#include <linux/slab.h>
23#include <linux/pagemap.h>
24#include <linux/spinlock.h>
25#include <linux/cpumask.h>
26#include <linux/module.h>
27#include <linux/io.h>
28#include <linux/vmalloc.h>
29#include <linux/smp.h>
30
31#include <asm/system.h>
32#include <asm/pgtable.h>
33#include <asm/pgalloc.h>
34#include <asm/fixmap.h>
35#include <asm/tlb.h>
36#include <asm/tlbflush.h>
37#include <asm/homecache.h>
38
39#define K(x) ((x) << (PAGE_SHIFT-10))
40
41/*
42 * The normal show_free_areas() is too verbose on Tile, with dozens
43 * of processors and often four NUMA zones each with high and lowmem.
44 */
45void show_mem(void)
46{
47 struct zone *zone;
48
49 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu"
50 " free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu"
51 " pagecache:%lu swap:%lu\n",
52 (global_page_state(NR_ACTIVE_ANON) +
53 global_page_state(NR_ACTIVE_FILE)),
54 (global_page_state(NR_INACTIVE_ANON) +
55 global_page_state(NR_INACTIVE_FILE)),
56 global_page_state(NR_FILE_DIRTY),
57 global_page_state(NR_WRITEBACK),
58 global_page_state(NR_UNSTABLE_NFS),
59 global_page_state(NR_FREE_PAGES),
60 (global_page_state(NR_SLAB_RECLAIMABLE) +
61 global_page_state(NR_SLAB_UNRECLAIMABLE)),
62 global_page_state(NR_FILE_MAPPED),
63 global_page_state(NR_PAGETABLE),
64 global_page_state(NR_BOUNCE),
65 global_page_state(NR_FILE_PAGES),
66 nr_swap_pages);
67
68 for_each_zone(zone) {
69 unsigned long flags, order, total = 0, largest_order = -1;
70
71 if (!populated_zone(zone))
72 continue;
73
74 printk("Node %d %7s: ", zone_to_nid(zone), zone->name);
75 spin_lock_irqsave(&zone->lock, flags);
76 for (order = 0; order < MAX_ORDER; order++) {
77 int nr = zone->free_area[order].nr_free;
78 total += nr << order;
79 if (nr)
80 largest_order = order;
81 }
82 spin_unlock_irqrestore(&zone->lock, flags);
83		printk("%lukB (largest %lukB)\n", K(total),
84		       largest_order == (unsigned long)-1 ? 0 : K(1UL) << largest_order);
85 }
86}
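The per-zone loop in show_mem() above folds the buddy free lists into a single total: each free block of a given order contributes 2^order pages, and the largest non-empty order is remembered for the summary line. A stand-alone sketch of that accounting (MAX_ORDER and the sample counts are made up):

/* buddy_sum_demo.c - sketch of the free-page accounting in show_mem() above */
#include <stdio.h>

#define PAGE_SHIFT   12
#define MAX_ORDER    11
#define K(x)         ((x) << (PAGE_SHIFT - 10))   /* pages -> kB */

int main(void)
{
	/* made-up nr_free counts per order, as zone->free_area[] would hold */
	unsigned long nr_free[MAX_ORDER] = { 17, 9, 4, 2, 1, 0, 0, 1, 0, 0, 3 };
	unsigned long order, total = 0, largest_order = (unsigned long)-1;

	for (order = 0; order < MAX_ORDER; order++) {
		total += nr_free[order] << order;   /* 2^order pages per block */
		if (nr_free[order])
			largest_order = order;
	}

	printf("%lukB free (largest block %lukB)\n", K(total),
	       largest_order == (unsigned long)-1 ? 0 : K(1UL) << largest_order);
	return 0;
}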
87
88/*
89 * Associate a virtual page frame with a given physical page frame
90 * and protection flags for that frame.
91 */
92static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
93{
94 pgd_t *pgd;
95 pud_t *pud;
96 pmd_t *pmd;
97 pte_t *pte;
98
99 pgd = swapper_pg_dir + pgd_index(vaddr);
100 if (pgd_none(*pgd)) {
101 BUG();
102 return;
103 }
104 pud = pud_offset(pgd, vaddr);
105 if (pud_none(*pud)) {
106 BUG();
107 return;
108 }
109 pmd = pmd_offset(pud, vaddr);
110 if (pmd_none(*pmd)) {
111 BUG();
112 return;
113 }
114 pte = pte_offset_kernel(pmd, vaddr);
115 /* <pfn,flags> stored as-is, to permit clearing entries */
116 set_pte(pte, pfn_pte(pfn, flags));
117
118 /*
119 * It's enough to flush this one mapping.
120 * This appears conservative since it is only called
121 * from __set_fixmap.
122 */
123 local_flush_tlb_page(NULL, vaddr, PAGE_SIZE);
124}
125
126/*
127 * Associate a huge virtual page frame with a given physical page frame
128 * and protection flags for that frame. pfn is for the base of the page,
129 * vaddr is what the page gets mapped to - both must be properly aligned.
130 * The pmd must already be instantiated.
131 */
132void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
133{
134 pgd_t *pgd;
135 pud_t *pud;
136 pmd_t *pmd;
137
138 if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
139 printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
140 return; /* BUG(); */
141 }
142 if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
143 printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
144 return; /* BUG(); */
145 }
146 pgd = swapper_pg_dir + pgd_index(vaddr);
147 if (pgd_none(*pgd)) {
148 printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
149 return; /* BUG(); */
150 }
151 pud = pud_offset(pgd, vaddr);
152 pmd = pmd_offset(pud, vaddr);
153 set_pmd(pmd, ptfn_pmd(HV_PFN_TO_PTFN(pfn), flags));
154 /*
155 * It's enough to flush this one mapping.
156	 * We flush the TLB for both the small and huge page sizes to be sure.
157 */
158 local_flush_tlb_page(NULL, vaddr, HPAGE_SIZE);
159 local_flush_tlb_pages(NULL, vaddr, PAGE_SIZE, HPAGE_SIZE);
160}
161
162void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
163{
164 unsigned long address = __fix_to_virt(idx);
165
166 if (idx >= __end_of_fixed_addresses) {
167 BUG();
168 return;
169 }
170 set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
171}
172
173#if defined(CONFIG_HIGHPTE)
174pte_t *_pte_offset_map(pmd_t *dir, unsigned long address, enum km_type type)
175{
176	pte_t *pte = kmap_atomic(pmd_page(*dir), type) +
177		((pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN) & ~PAGE_MASK);
178 return &pte[pte_index(address)];
179}
180#endif
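The expression in _pte_offset_map() above recovers the byte offset of the L2 page table within its page: the ptfn shifted by the table-alignment log2 is a physical byte address, and masking with ~PAGE_MASK keeps only the within-page bits (which is why the mask needs its own parentheses). A quick sketch with invented constants:

/* ptfn_offset_demo.c - sketch of the within-page offset math used above */
#include <stdio.h>

#define PAGE_SHIFT                12
#define PAGE_SIZE                 (1UL << PAGE_SHIFT)
#define PAGE_MASK                 (~(PAGE_SIZE - 1))
#define HV_LOG2_PAGE_TABLE_ALIGN  11   /* stand-in: 2KB-aligned page tables */

int main(void)
{
	unsigned long ptfn = 0x12345;  /* made-up "page table frame number" */
	unsigned long phys = ptfn << HV_LOG2_PAGE_TABLE_ALIGN;
	unsigned long offset = phys & ~PAGE_MASK;   /* offset within its page */

	printf("phys %#lx -> page offset %#lx\n", phys, offset);
	/* A page-aligned kmap address plus this offset points at the table. */
	return 0;
}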
181
182/*
183 * List of all pgd's needed so it can invalidate entries in both cached
184 * and uncached pgd's. This is essentially codepath-based locking
185 * against pageattr.c; it is the unique case in which a valid change
186 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
187 * vmalloc faults work because attached pagetables are never freed.
188 * The locking scheme was chosen on the basis of manfred's
189 * recommendations and having no core impact whatsoever.
190 * -- wli
191 */
192DEFINE_SPINLOCK(pgd_lock);
193LIST_HEAD(pgd_list);
194
195static inline void pgd_list_add(pgd_t *pgd)
196{
197 list_add(pgd_to_list(pgd), &pgd_list);
198}
199
200static inline void pgd_list_del(pgd_t *pgd)
201{
202 list_del(pgd_to_list(pgd));
203}
204
205#define KERNEL_PGD_INDEX_START pgd_index(PAGE_OFFSET)
206#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_INDEX_START)
207
208static void pgd_ctor(pgd_t *pgd)
209{
210 unsigned long flags;
211
212 memset(pgd, 0, KERNEL_PGD_INDEX_START*sizeof(pgd_t));
213 spin_lock_irqsave(&pgd_lock, flags);
214
215#ifndef __tilegx__
216 /*
217 * Check that the user interrupt vector has no L2.
218 * It never should for the swapper, and new page tables
219 * should always start with an empty user interrupt vector.
220 */
221 BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
222#endif
223
224 clone_pgd_range(pgd + KERNEL_PGD_INDEX_START,
225 swapper_pg_dir + KERNEL_PGD_INDEX_START,
226 KERNEL_PGD_PTRS);
227
228 pgd_list_add(pgd);
229 spin_unlock_irqrestore(&pgd_lock, flags);
230}
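pgd_ctor() above copies only the kernel half of the boot page directory into each new pgd, so every address space shares the same kernel mappings while the user half starts out empty. A toy model of that split-and-copy (the sizes and index are arbitrary stand-ins):

/* pgd_clone_demo.c - toy model of the kernel-entry copy in pgd_ctor() */
#include <stdio.h>
#include <string.h>

#define PTRS_PER_PGD            1024
#define KERNEL_PGD_INDEX_START  768    /* stand-in for pgd_index(PAGE_OFFSET) */
#define KERNEL_PGD_PTRS         (PTRS_PER_PGD - KERNEL_PGD_INDEX_START)

static unsigned long swapper_pgd[PTRS_PER_PGD];  /* stand-in swapper_pg_dir */

static void demo_pgd_ctor(unsigned long *pgd)
{
	/* user half starts out empty ... */
	memset(pgd, 0, KERNEL_PGD_INDEX_START * sizeof(*pgd));
	/* ... kernel half is shared with the boot page directory */
	memcpy(pgd + KERNEL_PGD_INDEX_START,
	       swapper_pgd + KERNEL_PGD_INDEX_START,
	       KERNEL_PGD_PTRS * sizeof(*pgd));
}

int main(void)
{
	unsigned long new_pgd[PTRS_PER_PGD];

	swapper_pgd[800] = 0xabcd;      /* pretend kernel mapping */
	demo_pgd_ctor(new_pgd);
	printf("entry 800 cloned: %#lx, entry 0 empty: %#lx\n",
	       new_pgd[800], new_pgd[0]);
	return 0;
}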
231
232static void pgd_dtor(pgd_t *pgd)
233{
234 unsigned long flags; /* can be called from interrupt context */
235
236 spin_lock_irqsave(&pgd_lock, flags);
237 pgd_list_del(pgd);
238 spin_unlock_irqrestore(&pgd_lock, flags);
239}
240
241pgd_t *pgd_alloc(struct mm_struct *mm)
242{
243 pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
244 if (pgd)
245 pgd_ctor(pgd);
246 return pgd;
247}
248
249void pgd_free(struct mm_struct *mm, pgd_t *pgd)
250{
251 pgd_dtor(pgd);
252 kmem_cache_free(pgd_cache, pgd);
253}
254
255
256#define L2_USER_PGTABLE_PAGES (1 << L2_USER_PGTABLE_ORDER)
257
258struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
259{
260 int flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP;
261 struct page *p;
262
263#ifdef CONFIG_HIGHPTE
264 flags |= __GFP_HIGHMEM;
265#endif
266
267 p = alloc_pages(flags, L2_USER_PGTABLE_ORDER);
268 if (p == NULL)
269 return NULL;
270
271 pgtable_page_ctor(p);
272 return p;
273}
274
275/*
276 * Free page immediately (used in __pte_alloc if we raced with another
277 * process). We have to undo whatever pte_alloc_one() did before
278 * returning the pages to the allocator.
279 */
280void pte_free(struct mm_struct *mm, struct page *p)
281{
282 pgtable_page_dtor(p);
283 __free_pages(p, L2_USER_PGTABLE_ORDER);
284}
285
286void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
287 unsigned long address)
288{
289 int i;
290
291 pgtable_page_dtor(pte);
292 tlb->need_flush = 1;
293 if (tlb_fast_mode(tlb)) {
294 struct page *pte_pages[L2_USER_PGTABLE_PAGES];
295 for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i)
296 pte_pages[i] = pte + i;
297 free_pages_and_swap_cache(pte_pages, L2_USER_PGTABLE_PAGES);
298 return;
299 }
300 for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i) {
301 tlb->pages[tlb->nr++] = pte + i;
302 if (tlb->nr >= FREE_PTE_NR)
303 tlb_flush_mmu(tlb, 0, 0);
304 }
305}
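The slow path of __pte_free_tlb() above is a batch-and-flush pattern: pages are queued into the gather structure and released in bulk once the queue fills. A minimal stand-alone version of the same shape (the queue size and the "flush" action are placeholders for FREE_PTE_NR and tlb_flush_mmu()):

/* batch_free_demo.c - sketch of the queue-then-flush pattern used above */
#include <stdio.h>

#define BATCH_SIZE 8   /* stand-in for FREE_PTE_NR */

struct gather {
	int nr;
	int pages[BATCH_SIZE];
};

static void flush(struct gather *g)
{
	printf("flushing %d queued pages\n", g->nr);  /* free them here */
	g->nr = 0;
}

static void queue_page(struct gather *g, int page)
{
	g->pages[g->nr++] = page;
	if (g->nr >= BATCH_SIZE)
		flush(g);
}

int main(void)
{
	struct gather g = { 0 };
	int i;

	for (i = 0; i < 20; i++)
		queue_page(&g, i);
	if (g.nr)
		flush(&g);   /* drain the remainder */
	return 0;
}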
306
307#ifndef __tilegx__
308
309/*
310 * FIXME: needs to be atomic vs hypervisor writes. For now we make the
311 * window of vulnerability a bit smaller by doing an unlocked 8-bit update.
312 */
313int ptep_test_and_clear_young(struct vm_area_struct *vma,
314 unsigned long addr, pte_t *ptep)
315{
316#if HV_PTE_INDEX_ACCESSED < 8 || HV_PTE_INDEX_ACCESSED >= 16
317# error Code assumes HV_PTE "accessed" bit in second byte
318#endif
319 u8 *tmp = (u8 *)ptep;
320 u8 second_byte = tmp[1];
321 if (!(second_byte & (1 << (HV_PTE_INDEX_ACCESSED - 8))))
322 return 0;
323 tmp[1] = second_byte & ~(1 << (HV_PTE_INDEX_ACCESSED - 8));
324 return 1;
325}
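ptep_test_and_clear_young() above relies on the "accessed" bit living in byte 1 of the 64-bit PTE (on a little-endian layout, byte 1 holds bits 8-15), so a plain 8-bit store clears it without rewriting the rest of the entry. A user-space illustration of that sub-word update; the bit index and PTE value are invented:

/* byte_update_demo.c - sketch of the unlocked 8-bit "accessed" clear above */
#include <stdio.h>
#include <stdint.h>

#define ACCESSED_BIT 10   /* made-up bit index, in byte 1 on little-endian */

int main(void)
{
	uint64_t pte = 0xdeadbeef00000000ULL | (1ULL << ACCESSED_BIT);
	uint8_t *bytes = (uint8_t *)&pte;       /* assumes little-endian */
	uint8_t second = bytes[1];

	if (second & (1 << (ACCESSED_BIT - 8))) {
		/* clear only the accessed bit, touching a single byte */
		bytes[1] = second & ~(1 << (ACCESSED_BIT - 8));
		printf("was accessed; now %#llx\n", (unsigned long long)pte);
	} else {
		printf("not accessed\n");
	}
	return 0;
}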
326
327/*
328 * This implementation is atomic vs hypervisor writes, since the hypervisor
329 * always writes the low word (where "accessed" and "dirty" are) and this
330 * routine only writes the high word.
331 */
332void ptep_set_wrprotect(struct mm_struct *mm,
333 unsigned long addr, pte_t *ptep)
334{
335#if HV_PTE_INDEX_WRITABLE < 32
336# error Code assumes HV_PTE "writable" bit in high word
337#endif
338 u32 *tmp = (u32 *)ptep;
339 tmp[1] = tmp[1] & ~(1 << (HV_PTE_INDEX_WRITABLE - 32));
340}
341
342#endif
343
344pte_t *virt_to_pte(struct mm_struct* mm, unsigned long addr)
345{
346 pgd_t *pgd;
347 pud_t *pud;
348 pmd_t *pmd;
349
350 if (pgd_addr_invalid(addr))
351 return NULL;
352
353 pgd = mm ? pgd_offset(mm, addr) : swapper_pg_dir + pgd_index(addr);
354 pud = pud_offset(pgd, addr);
355 if (!pud_present(*pud))
356 return NULL;
357 pmd = pmd_offset(pud, addr);
358 if (pmd_huge_page(*pmd))
359 return (pte_t *)pmd;
360 if (!pmd_present(*pmd))
361 return NULL;
362 return pte_offset_kernel(pmd, addr);
363}
364
365pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu)
366{
367 unsigned int width = smp_width;
368 int x = cpu % width;
369 int y = cpu / width;
370 BUG_ON(y >= smp_height);
371 BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
372 BUG_ON(cpu < 0 || cpu >= NR_CPUS);
373 BUG_ON(!cpu_is_valid_lotar(cpu));
374 return hv_pte_set_lotar(prot, HV_XY_TO_LOTAR(x, y));
375}
376
377int get_remote_cache_cpu(pgprot_t prot)
378{
379 HV_LOTAR lotar = hv_pte_get_lotar(prot);
380 int x = HV_LOTAR_X(lotar);
381 int y = HV_LOTAR_Y(lotar);
382 BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
383 return x + y * smp_width;
384}
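set_remote_cache_cpu() and get_remote_cache_cpu() above translate between a linear CPU number and (x, y) coordinates on the tile mesh: x = cpu % width, y = cpu / width, and back again with cpu = x + y * width. A tiny round-trip check (the mesh dimensions are invented):

/* lotar_demo.c - round-trip of the cpu <-> (x, y) mapping used above */
#include <stdio.h>
#include <assert.h>

#define SMP_WIDTH  8   /* made-up mesh dimensions */
#define SMP_HEIGHT 8

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < SMP_WIDTH * SMP_HEIGHT; cpu++) {
		int x = cpu % SMP_WIDTH;
		int y = cpu / SMP_WIDTH;

		assert(y < SMP_HEIGHT);
		assert(x + y * SMP_WIDTH == cpu);   /* inverse mapping */
	}
	printf("all %d cpus round-trip through (x, y)\n", cpu);
	return 0;
}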
385
386void set_pte_order(pte_t *ptep, pte_t pte, int order)
387{
388 unsigned long pfn = pte_pfn(pte);
389 struct page *page = pfn_to_page(pfn);
390
391 /* Update the home of a PTE if necessary */
392 pte = pte_set_home(pte, page_home(page));
393
394#ifdef __tilegx__
395 *ptep = pte;
396#else
397 /*
398 * When setting a PTE, write the high bits first, then write
399 * the low bits. This sets the "present" bit only after the
400 * other bits are in place. If a particular PTE update
401 * involves transitioning from one valid PTE to another, it
402 * may be necessary to call set_pte_order() more than once,
403 * transitioning via a suitable intermediate state.
404 * Note that this sequence also means that if we are transitioning
405 * from any migrating PTE to a non-migrating one, we will not
406 * see a half-updated PTE with the migrating bit off.
407 */
408#if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
409# error Must write the present and migrating bits last
410#endif
411 ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
412 barrier();
413 ((u32 *)ptep)[0] = (u32)(pte_val(pte));
414#endif
415}
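The 32-bit path of set_pte_order() above writes the two halves of the PTE in a fixed order: first the high word, then a barrier, then the low word that carries the "present" bit, so an observer never sees a present PTE with stale upper bits. A user-space sketch of the same idiom; the bit layout is invented, and a real cross-CPU observer would also need an SMP memory barrier rather than just the compiler barrier shown:

/* ordered_pte_demo.c - sketch of the high-then-low PTE store order above */
#include <stdio.h>
#include <stdint.h>

#define PRESENT_BIT 0   /* made-up: "present" lives in the low word */
#define barrier()   __asm__ __volatile__("" ::: "memory")

static void set_pte_demo(volatile uint32_t pte[2], uint64_t val)
{
	/* write the word without the present bit first ... */
	pte[1] = (uint32_t)(val >> 32);
	barrier();
	/* ... then the word that carries "present", making the PTE visible */
	pte[0] = (uint32_t)val;
}

int main(void)
{
	volatile uint32_t pte[2] = { 0, 0 };

	set_pte_demo(pte, (0x12345678ULL << 32) | (1u << PRESENT_BIT) | 0xabc0);
	printf("pte = %#x:%#x\n", (unsigned)pte[1], (unsigned)pte[0]);
	return 0;
}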
416
417/* Can this mm load a PTE with cached_priority set? */
418static inline int mm_is_priority_cached(struct mm_struct *mm)
419{
420 return mm->context.priority_cached;
421}
422
423/*
424 * Add a priority mapping to an mm_context and
425 * notify the hypervisor if this is the first one.
426 */
427void start_mm_caching(struct mm_struct *mm)
428{
429 if (!mm_is_priority_cached(mm)) {
430 mm->context.priority_cached = -1U;
431 hv_set_caching(-1U);
432 }
433}
434
435/*
436 * Validate and return the priority_cached flag. We know if it's zero
437 * that we don't need to scan, since we immediately set it non-zero
438 * when we first consider a MAP_CACHE_PRIORITY mapping.
439 *
440 * We only _try_ to acquire mmap_sem; if we can't get it, then since
441 * we're in an interrupt context (servicing switch_mm) we don't worry
442 * about it and simply leave the "priority_cached" field set.
443 * Presumably a later call will have more luck and clear the value then;
444 * until that happens the cache just stays marked for priority.
445 */
446static unsigned int update_priority_cached(struct mm_struct *mm)
447{
448 if (mm->context.priority_cached && down_write_trylock(&mm->mmap_sem)) {
449 struct vm_area_struct *vm;
450 for (vm = mm->mmap; vm; vm = vm->vm_next) {
451 if (hv_pte_get_cached_priority(vm->vm_page_prot))
452 break;
453 }
454 if (vm == NULL)
455 mm->context.priority_cached = 0;
456 up_write(&mm->mmap_sem);
457 }
458 return mm->context.priority_cached;
459}
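update_priority_cached() above is an eager-set, lazy-clear flag: it is set unconditionally whenever a priority mapping might exist, and only cleared when a scan performed under a successfully try-acquired lock proves none remains. A minimal pthread version of that shape (the names and the "scan" are placeholders):

/* lazy_flag_demo.c - sketch of the try-lock-then-revalidate pattern above */
#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;
static int priority_flag;        /* set eagerly, cleared lazily */
static int nr_priority_mappings; /* stand-in for the vma scan result */

static int update_flag(void)
{
	if (priority_flag && pthread_mutex_trylock(&map_lock) == 0) {
		if (nr_priority_mappings == 0)   /* the "scan" */
			priority_flag = 0;
		pthread_mutex_unlock(&map_lock);
	}
	/* if the trylock failed we just keep the (possibly stale) flag */
	return priority_flag;
}

int main(void)
{
	priority_flag = 1;
	nr_priority_mappings = 0;
	printf("flag after revalidation: %d\n", update_flag());
	return 0;
}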
460
461/* Set caching correctly for an mm that we are switching to. */
462void check_mm_caching(struct mm_struct *prev, struct mm_struct *next)
463{
464 if (!mm_is_priority_cached(next)) {
465 /*
466 * If the new mm doesn't use priority caching, just see if we
467 * need the hv_set_caching(), or can assume it's already zero.
468 */
469 if (mm_is_priority_cached(prev))
470 hv_set_caching(0);
471 } else {
472 hv_set_caching(update_priority_cached(next));
473 }
474}
475
476#if CHIP_HAS_MMIO()
477
478/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
479void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
480 pgprot_t home)
481{
482 void *addr;
483 struct vm_struct *area;
484 unsigned long offset, last_addr;
485 pgprot_t pgprot;
486
487 /* Don't allow wraparound or zero size */
488 last_addr = phys_addr + size - 1;
489 if (!size || last_addr < phys_addr)
490 return NULL;
491
492 /* Create a read/write, MMIO VA mapping homed at the requested shim. */
493 pgprot = PAGE_KERNEL;
494 pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
495 pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
496
497 /*
498 * Mappings have to be page-aligned
499 */
500 offset = phys_addr & ~PAGE_MASK;
501 phys_addr &= PAGE_MASK;
502 size = PAGE_ALIGN(last_addr+1) - phys_addr;
503
504 /*
505 * Ok, go for it..
506 */
507 area = get_vm_area(size, VM_IOREMAP /* | other flags? */);
508 if (!area)
509 return NULL;
510 area->phys_addr = phys_addr;
511 addr = area->addr;
512 if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
513 phys_addr, pgprot)) {
514 remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
515 return NULL;
516 }
517 return (__force void __iomem *) (offset + (char *)addr);
518}
519EXPORT_SYMBOL(ioremap_prot);
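ioremap_prot() above maps a page-aligned superset of the requested physical range and then returns a pointer offset back into it. The alignment arithmetic on its own, with an invented MMIO address:

/* ioremap_align_demo.c - sketch of the range-alignment math used above */
#include <stdio.h>

#define PAGE_SIZE      4096UL
#define PAGE_MASK      (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x)  (((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	unsigned long phys_addr = 0x1f000a30UL;   /* made-up MMIO address */
	unsigned long size = 0x220;
	unsigned long last_addr = phys_addr + size - 1;

	unsigned long offset = phys_addr & ~PAGE_MASK;  /* offset in first page */
	unsigned long base = phys_addr & PAGE_MASK;     /* page-aligned start */
	unsigned long map_size = PAGE_ALIGN(last_addr + 1) - base;

	printf("map %#lx bytes at %#lx, return mapping + %#lx\n",
	       map_size, base, offset);
	return 0;
}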
520
521/* Map a PCI MMIO bus address into VA space. */
522void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
523{
524 panic("ioremap for PCI MMIO is not supported");
525}
526EXPORT_SYMBOL(ioremap);
527
528/* Unmap an MMIO VA mapping. */
529void iounmap(volatile void __iomem *addr_in)
530{
531 volatile void __iomem *addr = (volatile void __iomem *)
532 (PAGE_MASK & (unsigned long __force)addr_in);
533#if 1
534 vunmap((void * __force)addr);
535#else
536 /* x86 uses this complicated flow instead of vunmap(). Is
537 * there any particular reason we should do the same? */
538 struct vm_struct *p, *o;
539
540 /* Use the vm area unlocked, assuming the caller
541 ensures there isn't another iounmap for the same address
542 in parallel. Reuse of the virtual address is prevented by
543 leaving it in the global lists until we're done with it.
544 cpa takes care of the direct mappings. */
545 read_lock(&vmlist_lock);
546 for (p = vmlist; p; p = p->next) {
547 if (p->addr == addr)
548 break;
549 }
550 read_unlock(&vmlist_lock);
551
552 if (!p) {
553 printk("iounmap: bad address %p\n", addr);
554 dump_stack();
555 return;
556 }
557
558 /* Finally remove it */
559 o = remove_vm_area((void *)addr);
560 BUG_ON(p != o || o == NULL);
561 kfree(p);
562#endif
563}
564EXPORT_SYMBOL(iounmap);
565
566#endif /* CHIP_HAS_MMIO() */