aboutsummaryrefslogtreecommitdiffstats
path: root/arch/tile/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/tile/mm')
-rw-r--r--arch/tile/mm/Makefile9
-rw-r--r--arch/tile/mm/elf.c164
-rw-r--r--arch/tile/mm/extable.c30
-rw-r--r--arch/tile/mm/fault.c867
-rw-r--r--arch/tile/mm/highmem.c328
-rw-r--r--arch/tile/mm/homecache.c433
-rw-r--r--arch/tile/mm/hugetlbpage.c343
-rw-r--r--arch/tile/mm/init.c1085
-rw-r--r--arch/tile/mm/migrate.h50
-rw-r--r--arch/tile/mm/migrate_32.S211
-rw-r--r--arch/tile/mm/mmap.c75
-rw-r--r--arch/tile/mm/pgtable.c530
12 files changed, 4125 insertions, 0 deletions
diff --git a/arch/tile/mm/Makefile b/arch/tile/mm/Makefile
new file mode 100644
index 000000000000..e252aeddc17d
--- /dev/null
+++ b/arch/tile/mm/Makefile
@@ -0,0 +1,9 @@
1#
2# Makefile for the linux tile-specific parts of the memory manager.
3#
4
# Objects listed unconditionally; the migrate object name is built from $(BITS).
5obj-y := init.o pgtable.o fault.o extable.o elf.o \
6 mmap.o homecache.o migrate_$(BITS).o
7
# Objects whose list is chosen by the value of the named CONFIG_ symbol.
8obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
9obj-$(CONFIG_HIGHMEM) += highmem.o
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
new file mode 100644
index 000000000000..55e58e93bfc5
--- /dev/null
+++ b/arch/tile/mm/elf.c
@@ -0,0 +1,164 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/binfmts.h>
18#include <linux/compat.h>
19#include <linux/mman.h>
20#include <linux/elf.h>
21#include <asm/pgtable.h>
22#include <asm/pgalloc.h>
23#include <asm/sections.h>
24
25/* Notify a running simulator, if any, that an exec just occurred. */
26static void sim_notify_exec(const char *binary_name)
27{
28 unsigned char c;
29 do {
30 c = *binary_name++;
31 __insn_mtspr(SPR_SIM_CONTROL,
32 (SIM_CONTROL_OS_EXEC
33 | (c << _SIM_CONTROL_OPERATOR_BITS)));
34
35 } while (c);
36}
37
38static int notify_exec(void)
39{
40 int retval = 0; /* failure */
41 struct vm_area_struct *vma = current->mm->mmap;
42 while (vma) {
43 if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file)
44 break;
45 vma = vma->vm_next;
46 }
47 if (vma) {
48 char *buf = (char *) __get_free_page(GFP_KERNEL);
49 if (buf) {
50 char *path = d_path(&vma->vm_file->f_path,
51 buf, PAGE_SIZE);
52 if (!IS_ERR(path)) {
53 sim_notify_exec(path);
54 retval = 1;
55 }
56 free_page((unsigned long)buf);
57 }
58 }
59 return retval;
60}
61
62/* Notify a running simulator, if any, that we loaded an interpreter. */
63static void sim_notify_interp(unsigned long load_addr)
64{
65 size_t i;
66 for (i = 0; i < sizeof(load_addr); i++) {
67 unsigned char c = load_addr >> (i * 8);
68 __insn_mtspr(SPR_SIM_CONTROL,
69 (SIM_CONTROL_OS_INTERP
70 | (c << _SIM_CONTROL_OPERATOR_BITS)));
71 }
72}
73
74
75/* Kernel address of page used to map read-only kernel data into userspace. */
76static void *vdso_page;
77
78/* One-entry array used for install_special_mapping. */
79static struct page *vdso_pages[1];
80
81static int __init vdso_setup(void)
82{
83 vdso_page = (void *)get_zeroed_page(GFP_ATOMIC);
84 memcpy(vdso_page, __rt_sigreturn, __rt_sigreturn_end - __rt_sigreturn);
85 vdso_pages[0] = virt_to_page(vdso_page);
86 return 0;
87}
88device_initcall(vdso_setup);
89
90const char *arch_vma_name(struct vm_area_struct *vma)
91{
92 if (vma->vm_private_data == vdso_pages)
93 return "[vdso]";
94#ifndef __tilegx__
95 if (vma->vm_start == MEM_USER_INTRPT)
96 return "[intrpt]";
97#endif
98 return NULL;
99}
100
101int arch_setup_additional_pages(struct linux_binprm *bprm,
102 int executable_stack)
103{
104 struct mm_struct *mm = current->mm;
105 unsigned long vdso_base;
106 int retval = 0;
107
108 /*
109 * Notify the simulator that an exec just occurred.
110 * If we can't find the filename of the mapping, just use
111 * whatever was passed as the linux_binprm filename.
112 */
113 if (!notify_exec())
114 sim_notify_exec(bprm->filename);
115
116 down_write(&mm->mmap_sem);
117
118 /*
119 * MAYWRITE to allow gdb to COW and set breakpoints
120 *
121 * Make sure the vDSO gets into every core dump. Dumping its
122 * contents makes post-mortem fully interpretable later
123 * without matching up the same kernel and hardware config to
124 * see what PC values meant.
125 */
126 vdso_base = VDSO_BASE;
127 retval = install_special_mapping(mm, vdso_base, PAGE_SIZE,
128 VM_READ|VM_EXEC|
129 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
130 VM_ALWAYSDUMP,
131 vdso_pages);
132
133#ifndef __tilegx__
134 /*
135 * Set up a user-interrupt mapping here; the user can't
136 * create one themselves since it is above TASK_SIZE.
137 * We make it unwritable by default, so the model for adding
138 * interrupt vectors always involves an mprotect.
139 */
140 if (!retval) {
141 unsigned long addr = MEM_USER_INTRPT;
142 addr = mmap_region(NULL, addr, INTRPT_SIZE,
143 MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
144 VM_READ|VM_EXEC|
145 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
146 if (addr > (unsigned long) -PAGE_SIZE)
147 retval = (int) addr;
148 }
149#endif
150
151 up_write(&mm->mmap_sem);
152
153 return retval;
154}
155
156
/*
 * Clear the entire pt_regs structure pointed to by regs, then pass
 * load_addr on to sim_notify_interp().
 */
157void elf_plat_init(struct pt_regs *regs, unsigned long load_addr)
158{
159 /* Zero all registers. */
160 memset(regs, 0, sizeof(*regs));
161
162 /* Report the interpreter's load address. */
163 sim_notify_interp(load_addr);
164}
diff --git a/arch/tile/mm/extable.c b/arch/tile/mm/extable.c
new file mode 100644
index 000000000000..4fb0acb9d154
--- /dev/null
+++ b/arch/tile/mm/extable.c
@@ -0,0 +1,30 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/module.h>
16#include <linux/spinlock.h>
17#include <linux/uaccess.h>
18
19int fixup_exception(struct pt_regs *regs)
20{
21 const struct exception_table_entry *fixup;
22
23 fixup = search_exception_tables(regs->pc);
24 if (fixup) {
25 regs->pc = fixup->fixup;
26 return 1;
27 }
28
29 return 0;
30}
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
new file mode 100644
index 000000000000..0011f06b4fe2
--- /dev/null
+++ b/arch/tile/mm/fault.c
@@ -0,0 +1,867 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 *
14 * From i386 code copyright (C) 1995 Linus Torvalds
15 */
16
17#include <linux/signal.h>
18#include <linux/sched.h>
19#include <linux/kernel.h>
20#include <linux/errno.h>
21#include <linux/string.h>
22#include <linux/types.h>
23#include <linux/ptrace.h>
24#include <linux/mman.h>
25#include <linux/mm.h>
26#include <linux/smp.h>
27#include <linux/smp_lock.h>
28#include <linux/interrupt.h>
29#include <linux/init.h>
30#include <linux/tty.h>
31#include <linux/vt_kern.h> /* For unblank_screen() */
32#include <linux/highmem.h>
33#include <linux/module.h>
34#include <linux/kprobes.h>
35#include <linux/hugetlb.h>
36#include <linux/syscalls.h>
37#include <linux/uaccess.h>
38
39#include <asm/system.h>
40#include <asm/pgalloc.h>
41#include <asm/sections.h>
42#include <asm/traps.h>
43#include <asm/syscalls.h>
44
45#include <arch/interrupts.h>
46
47static noinline void force_sig_info_fault(int si_signo, int si_code,
48 unsigned long address, int fault_num, struct task_struct *tsk)
49{
50 siginfo_t info;
51
52 if (unlikely(tsk->pid < 2)) {
53 panic("Signal %d (code %d) at %#lx sent to %s!",
54 si_signo, si_code & 0xffff, address,
55 tsk->pid ? "init" : "the idle task");
56 }
57
58 info.si_signo = si_signo;
59 info.si_errno = 0;
60 info.si_code = si_code;
61 info.si_addr = (void __user *)address;
62 info.si_trapno = fault_num;
63 force_sig_info(si_signo, &info, tsk);
64}
65
66#ifndef __tilegx__
67/*
68 * Synthesize the fault a PL0 process would get by doing a word-load of
69 * an unaligned address or a high kernel address. Called indirectly
70 * from sys_cmpxchg() in kernel/intvec.S.
71 */
72int _sys_cmpxchg_badaddr(unsigned long address, struct pt_regs *regs)
73{
74 if (address >= PAGE_OFFSET)
75 force_sig_info_fault(SIGSEGV, SEGV_MAPERR, address,
76 INT_DTLB_MISS, current);
77 else
78 force_sig_info_fault(SIGBUS, BUS_ADRALN, address,
79 INT_UNALIGN_DATA, current);
80
81 /*
82 * Adjust pc to point at the actual instruction, which is unusual
83 * for syscalls normally, but is appropriate when we are claiming
84 * that a syscall swint1 caused a page fault or bus error.
85 */
86 regs->pc -= 8;
87
88 /*
89 * Mark this as a caller-save interrupt, like a normal page fault,
90 * so that when we go through the signal handler path we will
91 * properly restore r0, r1, and r2 for the signal handler arguments.
92 */
93 regs->flags |= PT_FLAGS_CALLER_SAVES;
94
95 return 0;
96}
97#endif
98
99static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
100{
101 unsigned index = pgd_index(address);
102 pgd_t *pgd_k;
103 pud_t *pud, *pud_k;
104 pmd_t *pmd, *pmd_k;
105
106 pgd += index;
107 pgd_k = init_mm.pgd + index;
108
109 if (!pgd_present(*pgd_k))
110 return NULL;
111
112 pud = pud_offset(pgd, address);
113 pud_k = pud_offset(pgd_k, address);
114 if (!pud_present(*pud_k))
115 return NULL;
116
117 pmd = pmd_offset(pud, address);
118 pmd_k = pmd_offset(pud_k, address);
119 if (!pmd_present(*pmd_k))
120 return NULL;
121 if (!pmd_present(*pmd)) {
122 set_pmd(pmd, *pmd_k);
123 arch_flush_lazy_mmu_mode();
124 } else
125 BUG_ON(pmd_ptfn(*pmd) != pmd_ptfn(*pmd_k));
126 return pmd_k;
127}
128
129/*
130 * Handle a fault on the vmalloc or module mapping area
131 */
132static inline int vmalloc_fault(pgd_t *pgd, unsigned long address)
133{
134 pmd_t *pmd_k;
135 pte_t *pte_k;
136
137 /* Make sure we are in vmalloc area */
138 if (!(address >= VMALLOC_START && address < VMALLOC_END))
139 return -1;
140
141 /*
142 * Synchronize this task's top level page-table
143 * with the 'reference' page table.
144 */
145 pmd_k = vmalloc_sync_one(pgd, address);
146 if (!pmd_k)
147 return -1;
148 if (pmd_huge(*pmd_k))
149 return 0; /* support TILE huge_vmap() API */
150 pte_k = pte_offset_kernel(pmd_k, address);
151 if (!pte_present(*pte_k))
152 return -1;
153 return 0;
154}
155
156/* Wait until this PTE has completed migration. */
157static void wait_for_migration(pte_t *pte)
158{
159 if (pte_migrating(*pte)) {
160 /*
161 * Wait until the migrater fixes up this pte.
162 * We scale the loop count by the clock rate so we'll wait for
163 * a few seconds here.
164 */
165 int retries = 0;
166 int bound = get_clock_rate();
167 while (pte_migrating(*pte)) {
168 barrier();
169 if (++retries > bound)
170 panic("Hit migrating PTE (%#llx) and"
171 " page PFN %#lx still migrating",
172 pte->val, pte_pfn(*pte));
173 }
174 }
175}
176
177/*
178 * It's not generally safe to use "current" to get the page table pointer,
179 * since we might be running an oprofile interrupt in the middle of a
180 * task switch.
181 */
182static pgd_t *get_current_pgd(void)
183{
184 HV_Context ctx = hv_inquire_context();
185 unsigned long pgd_pfn = ctx.page_table >> PAGE_SHIFT;
186 struct page *pgd_page = pfn_to_page(pgd_pfn);
187 BUG_ON(PageHighMem(pgd_page)); /* oops, HIGHPTE? */
188 return (pgd_t *) __va(ctx.page_table);
189}
190
191/*
192 * We can receive a page fault from a migrating PTE at any time.
193 * Handle it by just waiting until the fault resolves.
194 *
195 * It's also possible to get a migrating kernel PTE that resolves
196 * itself during the downcall from hypervisor to Linux. We just check
197 * here to see if the PTE seems valid, and if so we retry it.
198 *
199 * NOTE! We MUST NOT take any locks for this case. We may be in an
200 * interrupt or a critical region, and must do as little as possible.
201 * Similarly, we can't use atomic ops here, since we may be handling a
202 * fault caused by an atomic op access.
203 */
204static int handle_migrating_pte(pgd_t *pgd, int fault_num,
205 unsigned long address,
206 int is_kernel_mode, int write)
207{
208 pud_t *pud;
209 pmd_t *pmd;
210 pte_t *pte;
211 pte_t pteval;
212
213 if (pgd_addr_invalid(address))
214 return 0;
215
216 pgd += pgd_index(address);
217 pud = pud_offset(pgd, address);
218 if (!pud || !pud_present(*pud))
219 return 0;
220 pmd = pmd_offset(pud, address);
221 if (!pmd || !pmd_present(*pmd))
222 return 0;
223 pte = pmd_huge_page(*pmd) ? ((pte_t *)pmd) :
224 pte_offset_kernel(pmd, address);
225 pteval = *pte;
226 if (pte_migrating(pteval)) {
227 wait_for_migration(pte);
228 return 1;
229 }
230
231 if (!is_kernel_mode || !pte_present(pteval))
232 return 0;
233 if (fault_num == INT_ITLB_MISS) {
234 if (pte_exec(pteval))
235 return 1;
236 } else if (write) {
237 if (pte_write(pteval))
238 return 1;
239 } else {
240 if (pte_read(pteval))
241 return 1;
242 }
243
244 return 0;
245}
246
247/*
248 * This routine is responsible for faulting in user pages.
249 * It passes the work off to one of the appropriate routines.
250 * It returns true if the fault was successfully handled.
251 */
252static int handle_page_fault(struct pt_regs *regs,
253 int fault_num,
254 int is_page_fault,
255 unsigned long address,
256 int write)
257{
258 struct task_struct *tsk;
259 struct mm_struct *mm;
260 struct vm_area_struct *vma;
261 unsigned long stack_offset;
262 int fault;
263 int si_code;
264 int is_kernel_mode;
265 pgd_t *pgd;
266
267 /* on TILE, protection faults are always writes */
268 if (!is_page_fault)
269 write = 1;
270
271 is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);
272
273 tsk = validate_current();
274
275 /*
276 * Check to see if we might be overwriting the stack, and bail
277 * out if so. The page fault code is a relatively likely
278 * place to get trapped in an infinite regress, and once we
279 * overwrite the whole stack, it becomes very hard to recover.
280 */
281 stack_offset = stack_pointer & (THREAD_SIZE-1);
282 if (stack_offset < THREAD_SIZE / 8) {
283 pr_alert("Potential stack overrun: sp %#lx\n",
284 stack_pointer);
285 show_regs(regs);
286 pr_alert("Killing current process %d/%s\n",
287 tsk->pid, tsk->comm);
288 do_group_exit(SIGKILL);
289 }
290
291 /*
292 * Early on, we need to check for migrating PTE entries;
293 * see homecache.c. If we find a migrating PTE, we wait until
294 * the backing page claims to be done migrating, then we procede.
295 * For kernel PTEs, we rewrite the PTE and return and retry.
296 * Otherwise, we treat the fault like a normal "no PTE" fault,
297 * rather than trying to patch up the existing PTE.
298 */
299 pgd = get_current_pgd();
300 if (handle_migrating_pte(pgd, fault_num, address,
301 is_kernel_mode, write))
302 return 1;
303
304 si_code = SEGV_MAPERR;
305
306 /*
307 * We fault-in kernel-space virtual memory on-demand. The
308 * 'reference' page table is init_mm.pgd.
309 *
310 * NOTE! We MUST NOT take any locks for this case. We may
311 * be in an interrupt or a critical region, and should
312 * only copy the information from the master page table,
313 * nothing more.
314 *
315 * This verifies that the fault happens in kernel space
316 * and that the fault was not a protection fault.
317 */
318 if (unlikely(address >= TASK_SIZE &&
319 !is_arch_mappable_range(address, 0))) {
320 if (is_kernel_mode && is_page_fault &&
321 vmalloc_fault(pgd, address) >= 0)
322 return 1;
323 /*
324 * Don't take the mm semaphore here. If we fixup a prefetch
325 * fault we could otherwise deadlock.
326 */
327 mm = NULL; /* happy compiler */
328 vma = NULL;
329 goto bad_area_nosemaphore;
330 }
331
332 /*
333 * If we're trying to touch user-space addresses, we must
334 * be either at PL0, or else with interrupts enabled in the
335 * kernel, so either way we can re-enable interrupts here.
336 */
337 local_irq_enable();
338
339 mm = tsk->mm;
340
341 /*
342 * If we're in an interrupt, have no user context or are running in an
343 * atomic region then we must not take the fault.
344 */
345 if (in_atomic() || !mm) {
346 vma = NULL; /* happy compiler */
347 goto bad_area_nosemaphore;
348 }
349
350 /*
351 * When running in the kernel we expect faults to occur only to
352 * addresses in user space. All other faults represent errors in the
353 * kernel and should generate an OOPS. Unfortunately, in the case of an
354 * erroneous fault occurring in a code path which already holds mmap_sem
355 * we will deadlock attempting to validate the fault against the
356 * address space. Luckily the kernel only validly references user
357 * space from well defined areas of code, which are listed in the
358 * exceptions table.
359 *
360 * As the vast majority of faults will be valid we will only perform
361 * the source reference check when there is a possibility of a deadlock.
362 * Attempt to lock the address space, if we cannot we then validate the
363 * source. If this is invalid we can skip the address space check,
364 * thus avoiding the deadlock.
365 */
366 if (!down_read_trylock(&mm->mmap_sem)) {
367 if (is_kernel_mode &&
368 !search_exception_tables(regs->pc)) {
369 vma = NULL; /* happy compiler */
370 goto bad_area_nosemaphore;
371 }
372 down_read(&mm->mmap_sem);
373 }
374
375 vma = find_vma(mm, address);
376 if (!vma)
377 goto bad_area;
378 if (vma->vm_start <= address)
379 goto good_area;
380 if (!(vma->vm_flags & VM_GROWSDOWN))
381 goto bad_area;
382 if (regs->sp < PAGE_OFFSET) {
383 /*
384 * accessing the stack below sp is always a bug.
385 */
386 if (address < regs->sp)
387 goto bad_area;
388 }
389 if (expand_stack(vma, address))
390 goto bad_area;
391
392/*
393 * Ok, we have a good vm_area for this memory access, so
394 * we can handle it..
395 */
396good_area:
397 si_code = SEGV_ACCERR;
398 if (fault_num == INT_ITLB_MISS) {
399 if (!(vma->vm_flags & VM_EXEC))
400 goto bad_area;
401 } else if (write) {
402#ifdef TEST_VERIFY_AREA
403 if (!is_page_fault && regs->cs == KERNEL_CS)
404 pr_err("WP fault at "REGFMT"\n", regs->eip);
405#endif
406 if (!(vma->vm_flags & VM_WRITE))
407 goto bad_area;
408 } else {
409 if (!is_page_fault || !(vma->vm_flags & VM_READ))
410 goto bad_area;
411 }
412
413 survive:
414 /*
415 * If for any reason at all we couldn't handle the fault,
416 * make sure we exit gracefully rather than endlessly redo
417 * the fault.
418 */
419 fault = handle_mm_fault(mm, vma, address, write);
420 if (unlikely(fault & VM_FAULT_ERROR)) {
421 if (fault & VM_FAULT_OOM)
422 goto out_of_memory;
423 else if (fault & VM_FAULT_SIGBUS)
424 goto do_sigbus;
425 BUG();
426 }
427 if (fault & VM_FAULT_MAJOR)
428 tsk->maj_flt++;
429 else
430 tsk->min_flt++;
431
432#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
433 /*
434 * If this was an asynchronous fault,
435 * restart the appropriate engine.
436 */
437 switch (fault_num) {
438#if CHIP_HAS_TILE_DMA()
439 case INT_DMATLB_MISS:
440 case INT_DMATLB_MISS_DWNCL:
441 case INT_DMATLB_ACCESS:
442 case INT_DMATLB_ACCESS_DWNCL:
443 __insn_mtspr(SPR_DMA_CTR, SPR_DMA_CTR__REQUEST_MASK);
444 break;
445#endif
446#if CHIP_HAS_SN_PROC()
447 case INT_SNITLB_MISS:
448 case INT_SNITLB_MISS_DWNCL:
449 __insn_mtspr(SPR_SNCTL,
450 __insn_mfspr(SPR_SNCTL) &
451 ~SPR_SNCTL__FRZPROC_MASK);
452 break;
453#endif
454 }
455#endif
456
457 up_read(&mm->mmap_sem);
458 return 1;
459
460/*
461 * Something tried to access memory that isn't in our memory map..
462 * Fix it, but check if it's kernel or user first..
463 */
464bad_area:
465 up_read(&mm->mmap_sem);
466
467bad_area_nosemaphore:
468 /* User mode accesses just cause a SIGSEGV */
469 if (!is_kernel_mode) {
470 /*
471 * It's possible to have interrupts off here.
472 */
473 local_irq_enable();
474
475 force_sig_info_fault(SIGSEGV, si_code, address,
476 fault_num, tsk);
477 return 0;
478 }
479
480no_context:
481 /* Are we prepared to handle this kernel fault? */
482 if (fixup_exception(regs))
483 return 0;
484
485/*
486 * Oops. The kernel tried to access some bad page. We'll have to
487 * terminate things with extreme prejudice.
488 */
489
490 bust_spinlocks(1);
491
492 /* FIXME: no lookup_address() yet */
493#ifdef SUPPORT_LOOKUP_ADDRESS
494 if (fault_num == INT_ITLB_MISS) {
495 pte_t *pte = lookup_address(address);
496
497 if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
498 pr_crit("kernel tried to execute"
499 " non-executable page - exploit attempt?"
500 " (uid: %d)\n", current->uid);
501 }
502#endif
503 if (address < PAGE_SIZE)
504 pr_alert("Unable to handle kernel NULL pointer dereference\n");
505 else
506 pr_alert("Unable to handle kernel paging request\n");
507 pr_alert(" at virtual address "REGFMT", pc "REGFMT"\n",
508 address, regs->pc);
509
510 show_regs(regs);
511
512 if (unlikely(tsk->pid < 2)) {
513 panic("Kernel page fault running %s!",
514 tsk->pid ? "init" : "the idle task");
515 }
516
517 /*
518 * More FIXME: we should probably copy the i386 here and
519 * implement a generic die() routine. Not today.
520 */
521#ifdef SUPPORT_DIE
522 die("Oops", regs);
523#endif
524 bust_spinlocks(1);
525
526 do_group_exit(SIGKILL);
527
528/*
529 * We ran out of memory, or some other thing happened to us that made
530 * us unable to handle the page fault gracefully.
531 */
532out_of_memory:
533 up_read(&mm->mmap_sem);
534 if (is_global_init(tsk)) {
535 yield();
536 down_read(&mm->mmap_sem);
537 goto survive;
538 }
539 pr_alert("VM: killing process %s\n", tsk->comm);
540 if (!is_kernel_mode)
541 do_group_exit(SIGKILL);
542 goto no_context;
543
544do_sigbus:
545 up_read(&mm->mmap_sem);
546
547 /* Kernel mode? Handle exceptions or die */
548 if (is_kernel_mode)
549 goto no_context;
550
551 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, fault_num, tsk);
552 return 0;
553}
554
555#ifndef __tilegx__
556
/*
 * We must release ICS before panicking or we won't get anywhere.
 *
 * Clears SPR_INTERRUPT_CRITICAL_SECTION and then calls panic() with the
 * given format and arguments.  "##" lets the macro also be used with a
 * format string that takes no arguments.
 */
#define ics_panic(fmt, ...) do { \
	__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); \
	panic(fmt, ##__VA_ARGS__); \
} while (0)
562
563/*
564 * When we take an ITLB or DTLB fault or access violation in the
565 * supervisor while the critical section bit is set, the hypervisor is
566 * reluctant to write new values into the EX_CONTEXT_1_x registers,
567 * since that might indicate we have not yet squirreled the SPR
568 * contents away and can thus safely take a recursive interrupt.
569 * Accordingly, the hypervisor passes us the PC via SYSTEM_SAVE_1_2.
570 */
/*
 * First-level handler for a fault taken with the critical section set.
 *
 * regs:      register state for the fault; pc, ex1 and sp may be
 *            rewritten below
 * fault_num: interrupt number; only INT_DTLB_MISS and INT_DTLB_ACCESS
 *            are accepted, anything else panics
 * address:   faulting virtual address
 * info:      bit 0 is the write flag, the remaining bits are the PC
 *
 * Returns an intvec_state whose retval is 1 if the fault was resolved
 * here (vmalloc sync or a migrating PTE), or 0 if normal fault
 * handling should continue.
 */
571struct intvec_state do_page_fault_ics(struct pt_regs *regs, int fault_num,
572 unsigned long address,
573 unsigned long info)
574{
 /* Split info into the PC (low bit cleared) and the write flag (low bit). */
575 unsigned long pc = info & ~1;
576 int write = info & 1;
577 pgd_t *pgd = get_current_pgd();
578
579 /* Retval is 1 at first since we will handle the fault fully. */
580 struct intvec_state state = {
581 do_page_fault, fault_num, address, write, 1
582 };
583
584 /* Validate that we are plausibly in the right routine. */
 /* The PC must be 8-byte aligned and at or above PAGE_OFFSET. */
585 if ((pc & 0x7) != 0 || pc < PAGE_OFFSET ||
586 (fault_num != INT_DTLB_MISS &&
587 fault_num != INT_DTLB_ACCESS)) {
588 unsigned long old_pc = regs->pc;
589 regs->pc = pc;
590 ics_panic("Bad ICS page fault args:"
591 " old PC %#lx, fault %d/%d at %#lx\n",
592 old_pc, fault_num, write, address);
593 }
594
595 /* We might be faulting on a vmalloc page, so check that first. */
596 if (fault_num != INT_DTLB_ACCESS && vmalloc_fault(pgd, address) >= 0)
597 return state;
598
599 /*
600 * If we faulted with ICS set in sys_cmpxchg, we are providing
601 * a user syscall service that should generate a signal on
602 * fault. We didn't set up a kernel stack on initial entry to
603 * sys_cmpxchg, but instead had one set up by the fault, which
604 * (because sys_cmpxchg never releases ICS) came to us via the
605 * SYSTEM_SAVE_1_2 mechanism, and thus EX_CONTEXT_1_[01] are
606 * still referencing the original user code. We release the
607 * atomic lock and rewrite pt_regs so that it appears that we
608 * came from user-space directly, and after we finish the
609 * fault we'll go back to user space and re-issue the swint.
610 * This way the backtrace information is correct if we need to
611 * emit a stack dump at any point while handling this.
612 *
613 * Must match register use in sys_cmpxchg().
614 */
615 if (pc >= (unsigned long) sys_cmpxchg &&
616 pc < (unsigned long) __sys_cmpxchg_end) {
617#ifdef CONFIG_SMP
618 /* Don't unlock before we could have locked. */
619 if (pc >= (unsigned long)__sys_cmpxchg_grab_lock) {
620 int *lock_ptr = (int *)(regs->regs[ATOMIC_LOCK_REG]);
621 __atomic_fault_unlock(lock_ptr);
622 }
623#endif
 /* NOTE(review): r27 presumably holds the user sp in sys_cmpxchg - confirm against the assembly. */
624 regs->sp = regs->regs[27];
625 }
626
627 /*
628 * We can also fault in the atomic assembly, in which
629 * case we use the exception table to do the first-level fixup.
630 * We may re-fixup again in the real fault handler if it
631 * turns out the faulting address is just bad, and not,
632 * for example, migrating.
633 */
634 else if (pc >= (unsigned long) __start_atomic_asm_code &&
635 pc < (unsigned long) __end_atomic_asm_code) {
636 const struct exception_table_entry *fixup;
637#ifdef CONFIG_SMP
638 /* Unlock the atomic lock. */
639 int *lock_ptr = (int *)(regs->regs[ATOMIC_LOCK_REG]);
640 __atomic_fault_unlock(lock_ptr);
641#endif
642 fixup = search_exception_tables(pc);
643 if (!fixup)
644 ics_panic("ICS atomic fault not in table:"
645 " PC %#lx, fault %d", pc, fault_num);
 /* Resume at the fixup address with a kernel-PL ex1 value. */
646 regs->pc = fixup->fixup;
647 regs->ex1 = PL_ICS_EX1(KERNEL_PL, 0);
648 }
649
650 /*
651 * NOTE: the one other type of access that might bring us here
652 * are the memory ops in __tns_atomic_acquire/__tns_atomic_release,
653 * but we don't have to check specially for them since we can
654 * always safely return to the address of the fault and retry,
655 * since no separate atomic locks are involved.
656 */
657
658 /*
659 * Now that we have released the atomic lock (if necessary),
660 * it's safe to spin if the PTE that caused the fault was migrating.
661 */
 /* An access (protection) fault is treated as a write for the PTE check. */
662 if (fault_num == INT_DTLB_ACCESS)
663 write = 1;
664 if (handle_migrating_pte(pgd, fault_num, address, 1, write))
665 return state;
666
667 /* Return zero so that we continue on with normal fault handling. */
668 state.retval = 0;
669 return state;
670}
671
672#endif /* !__tilegx__ */
673
674/*
675 * This routine handles page faults. It determines the address, and the
676 * problem, and then passes it handle_page_fault() for normal DTLB and
677 * ITLB issues, and for DMA or SN processor faults when we are in user
678 * space. For the latter, if we're in kernel mode, we just save the
679 * interrupt away appropriately and return immediately. We can't do
680 * page faults for user code while in kernel mode.
681 */
/*
 * regs:      register state at the time of the fault
 * fault_num: interrupt number; values not listed in the switch below panic
 * address:   faulting virtual address
 * write:     must be 0 or 1 (any other bit set is a BUG)
 *
 * A DMA or SN fault that arrives while not at USER_PL is not handled
 * here: it is recorded in the matching async_tlb slot of the current
 * thread and TIF_ASYNC_TLB is set.  Everything else is passed to
 * handle_page_fault().
 */
682void do_page_fault(struct pt_regs *regs, int fault_num,
683 unsigned long address, unsigned long write)
684{
685 int is_page_fault;
686
687 /* This case should have been handled by do_page_fault_ics(). */
688 BUG_ON(write & ~1);
689
690#if CHIP_HAS_TILE_DMA()
691 /*
692 * If it's a DMA fault, suspend the transfer while we're
693 * handling the miss; we'll restart after it's handled. If we
694 * don't suspend, it's possible that this process could swap
695 * out and back in, and restart the engine since the DMA is
696 * still 'running'.
697 */
698 if (fault_num == INT_DMATLB_MISS ||
699 fault_num == INT_DMATLB_ACCESS ||
700 fault_num == INT_DMATLB_MISS_DWNCL ||
701 fault_num == INT_DMATLB_ACCESS_DWNCL) {
702 __insn_mtspr(SPR_DMA_CTR, SPR_DMA_CTR__SUSPEND_MASK);
 /* Busy-wait until the DMA status no longer reports busy. */
703 while (__insn_mfspr(SPR_DMA_USER_STATUS) &
704 SPR_DMA_STATUS__BUSY_MASK)
705 ;
706 }
707#endif
708
709 /* Validate fault num and decide if this is a first-time page fault. */
710 switch (fault_num) {
711 case INT_ITLB_MISS:
712 case INT_DTLB_MISS:
713#if CHIP_HAS_TILE_DMA()
714 case INT_DMATLB_MISS:
715 case INT_DMATLB_MISS_DWNCL:
716#endif
717#if CHIP_HAS_SN_PROC()
718 case INT_SNITLB_MISS:
719 case INT_SNITLB_MISS_DWNCL:
720#endif
721 is_page_fault = 1;
722 break;
723
724 case INT_DTLB_ACCESS:
725#if CHIP_HAS_TILE_DMA()
726 case INT_DMATLB_ACCESS:
727 case INT_DMATLB_ACCESS_DWNCL:
728#endif
729 is_page_fault = 0;
730 break;
731
732 default:
733 panic("Bad fault number %d in do_page_fault", fault_num);
734 }
735
 /* Not at user PL: DMA and SN faults are deferred rather than handled now. */
736 if (EX1_PL(regs->ex1) != USER_PL) {
737 struct async_tlb *async;
738 switch (fault_num) {
739#if CHIP_HAS_TILE_DMA()
740 case INT_DMATLB_MISS:
741 case INT_DMATLB_ACCESS:
742 case INT_DMATLB_MISS_DWNCL:
743 case INT_DMATLB_ACCESS_DWNCL:
744 async = &current->thread.dma_async_tlb;
745 break;
746#endif
747#if CHIP_HAS_SN_PROC()
748 case INT_SNITLB_MISS:
749 case INT_SNITLB_MISS_DWNCL:
750 async = &current->thread.sn_async_tlb;
751 break;
752#endif
753 default:
754 async = NULL;
755 }
756 if (async) {
757
758 /*
759 * No vmalloc check required, so we can allow
760 * interrupts immediately at this point.
761 */
762 local_irq_enable();
763
764 set_thread_flag(TIF_ASYNC_TLB);
 /* Each slot holds a single pending fault; a second one is fatal. */
765 if (async->fault_num != 0) {
766 panic("Second async fault %d;"
767 " old fault was %d (%#lx/%ld)",
768 fault_num, async->fault_num,
769 address, write);
770 }
 /* fault_num == 0 is what marks a slot as empty, so it can't be stored. */
771 BUG_ON(fault_num == 0);
772 async->fault_num = fault_num;
773 async->is_fault = is_page_fault;
774 async->is_write = write;
775 async->address = address;
776 return;
777 }
778 }
779
780 handle_page_fault(regs, fault_num, is_page_fault, address, write);
781}
782
783
784#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
785/*
786 * Check an async_tlb structure to see if a deferred fault is waiting,
787 * and if so pass it to the page-fault code.
788 */
789static void handle_async_page_fault(struct pt_regs *regs,
790 struct async_tlb *async)
791{
792 if (async->fault_num) {
793 /*
794 * Clear async->fault_num before calling the page-fault
795 * handler so that if we re-interrupt before returning
796 * from the function we have somewhere to put the
797 * information from the new interrupt.
798 */
799 int fault_num = async->fault_num;
800 async->fault_num = 0;
801 handle_page_fault(regs, fault_num, async->is_fault,
802 async->address, async->is_write);
803 }
804}
805#endif /* CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC() */
806
807
808/*
809 * This routine effectively re-issues asynchronous page faults
810 * when we are returning to user space.
811 */
/*
 * Clears TIF_ASYNC_TLB, then gives each async_tlb slot that exists on
 * this chip (DMA, then SN) to handle_async_page_fault().
 */
812void do_async_page_fault(struct pt_regs *regs)
813{
814 /*
815 * Clear thread flag early. If we re-interrupt while processing
816 * code here, we will reset it and recall this routine before
817 * returning to user space.
818 */
819 clear_thread_flag(TIF_ASYNC_TLB);
820
821#if CHIP_HAS_TILE_DMA()
822 handle_async_page_fault(regs, &current->thread.dma_async_tlb);
823#endif
824#if CHIP_HAS_SN_PROC()
825 handle_async_page_fault(regs, &current->thread.sn_async_tlb);
826#endif
827}
828
/*
 * On tilegx this only asserts that VMALLOC_START and VMALLOC_END share
 * one pgd index.  Otherwise it steps through addresses from "start" in
 * PGDIR_SIZE increments and, for each pgd index not yet recorded in
 * "insync", calls vmalloc_sync_one() on the entries of pgd_list while
 * holding pgd_lock.
 */
829void vmalloc_sync_all(void)
830{
831#ifdef __tilegx__
832 /* Currently all L1 kernel pmd's are static and shared. */
833 BUG_ON(pgd_index(VMALLOC_END) != pgd_index(VMALLOC_START));
834#else
835 /*
836 * Note that races in the updates of insync and start aren't
837 * problematic: insync can only get set bits added, and updates to
838 * start are only improving performance (without affecting correctness
839 * if undone).
840 */
841 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
842 static unsigned long start = PAGE_OFFSET;
843 unsigned long address;
844
845 BUILD_BUG_ON(PAGE_OFFSET & ~PGDIR_MASK);
 /*
  * NOTE(review): the loop appears to end when address wraps past the
  * top of the address space and drops below PAGE_OFFSET - confirm.
  */
846 for (address = start; address >= PAGE_OFFSET; address += PGDIR_SIZE) {
847 if (!test_bit(pgd_index(address), insync)) {
848 unsigned long flags;
849 struct list_head *pos;
850
851 spin_lock_irqsave(&pgd_lock, flags);
852 list_for_each(pos, &pgd_list)
853 if (!vmalloc_sync_one(list_to_pgd(pos),
854 address)) {
855 /* Must be at first entry in list. */
856 BUG_ON(pos != pgd_list.next);
857 break;
858 }
859 spin_unlock_irqrestore(&pgd_lock, flags);
 /* pos is still the first entry only if the walk stopped there (or the list is empty). */
860 if (pos != pgd_list.next)
861 set_bit(pgd_index(address), insync);
862 }
 /* Advance start past a leading, already-synced slot so later calls skip it. */
863 if (address == start && test_bit(pgd_index(address), insync))
864 start = address + PGDIR_SIZE;
865 }
866#endif
867}
diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c
new file mode 100644
index 000000000000..ff1cdff5114d
--- /dev/null
+++ b/arch/tile/mm/highmem.c
@@ -0,0 +1,328 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/highmem.h>
16#include <linux/module.h>
17#include <linux/pagemap.h>
18#include <asm/homecache.h>
19
20#define kmap_get_pte(vaddr) \
21 pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), (vaddr)),\
22 (vaddr)), (vaddr))
23
24
25void *kmap(struct page *page)
26{
27 void *kva;
28 unsigned long flags;
29 pte_t *ptep;
30
31 might_sleep();
32 if (!PageHighMem(page))
33 return page_address(page);
34 kva = kmap_high(page);
35
36 /*
37 * Rewrite the PTE under the lock. This ensures that the page
38 * is not currently migrating.
39 */
40 ptep = kmap_get_pte((unsigned long)kva);
41 flags = homecache_kpte_lock();
42 set_pte_at(&init_mm, kva, ptep, mk_pte(page, page_to_kpgprot(page)));
43 homecache_kpte_unlock(flags);
44
45 return kva;
46}
47EXPORT_SYMBOL(kmap);
48
/*
 * Undo a kmap().  Calling this from interrupt context is a bug; lowmem
 * pages need no work since they were never mapped through kmap_high().
 */
void kunmap(struct page *page)
{
	BUG_ON(in_interrupt());
	if (PageHighMem(page))
		kunmap_high(page);
}
EXPORT_SYMBOL(kunmap);
58
59static void debug_kmap_atomic_prot(enum km_type type)
60{
61#ifdef CONFIG_DEBUG_HIGHMEM
62 static unsigned warn_count = 10;
63
64 if (unlikely(warn_count == 0))
65 return;
66
67 if (unlikely(in_interrupt())) {
68 if (in_irq()) {
69 if (type != KM_IRQ0 && type != KM_IRQ1 &&
70 type != KM_BIO_SRC_IRQ &&
71 /* type != KM_BIO_DST_IRQ && */
72 type != KM_BOUNCE_READ) {
73 WARN_ON(1);
74 warn_count--;
75 }
76 } else if (!irqs_disabled()) { /* softirq */
77 if (type != KM_IRQ0 && type != KM_IRQ1 &&
78 type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
79 type != KM_SKB_SUNRPC_DATA &&
80 type != KM_SKB_DATA_SOFTIRQ &&
81 type != KM_BOUNCE_READ) {
82 WARN_ON(1);
83 warn_count--;
84 }
85 }
86 }
87
88 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
89 type == KM_BIO_SRC_IRQ /* || type == KM_BIO_DST_IRQ */) {
90 if (!irqs_disabled()) {
91 WARN_ON(1);
92 warn_count--;
93 }
94 } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
95 if (irq_count() == 0 && !irqs_disabled()) {
96 WARN_ON(1);
97 warn_count--;
98 }
99 }
100#endif
101}
102
103/*
104 * Describe a single atomic mapping of a page on a given cpu at a
105 * given address, and allow it to be linked into a list.
106 */
107struct atomic_mapped_page {
108 struct list_head list;
109 struct page *page;
110 int cpu;
111 unsigned long va;
112};
113
/*
 * amp_list holds one entry per live kmap_atomic mapping, across all
 * cpus; amp_lock protects the list.
 */
static DEFINE_SPINLOCK(amp_lock);
static LIST_HEAD(amp_list);
116
117/*
118 * Combining this structure with a per-cpu declaration lets us give
119 * each cpu an atomic_mapped_page structure per type.
120 */
121struct kmap_amps {
122 struct atomic_mapped_page per_type[KM_TYPE_NR];
123};
124static DEFINE_PER_CPU(struct kmap_amps, amps);
125
126/*
127 * Add a page and va, on this cpu, to the list of kmap_atomic pages,
128 * and write the new pte to memory. Writing the new PTE under the
129 * lock guarantees that it is either on the list before migration starts
130 * (if we won the race), or set_pte() sets the migrating bit in the PTE
131 * (if we lost the race). And doing it under the lock guarantees
132 * that when kmap_atomic_fix_one_pte() comes along, it finds a valid
133 * PTE in memory, iff the mapping is still on the amp_list.
134 *
135 * Finally, doing it under the lock lets us safely examine the page
136 * to see if it is immutable or not, for the generic kmap_atomic() case.
137 * If we examine it earlier we are exposed to a race where it looks
138 * writable earlier, but becomes immutable before we write the PTE.
139 */
140static void kmap_atomic_register(struct page *page, enum km_type type,
141 unsigned long va, pte_t *ptep, pte_t pteval)
142{
143 unsigned long flags;
144 struct atomic_mapped_page *amp;
145
146 flags = homecache_kpte_lock();
147 spin_lock(&amp_lock);
148
149 /* With interrupts disabled, now fill in the per-cpu info. */
150 amp = &__get_cpu_var(amps).per_type[type];
151 amp->page = page;
152 amp->cpu = smp_processor_id();
153 amp->va = va;
154
155 /* For generic kmap_atomic(), choose the PTE writability now. */
156 if (!pte_read(pteval))
157 pteval = mk_pte(page, page_to_kpgprot(page));
158
159 list_add(&amp->list, &amp_list);
160 set_pte(ptep, pteval);
161 arch_flush_lazy_mmu_mode();
162
163 spin_unlock(&amp_lock);
164 homecache_kpte_unlock(flags);
165}
166
167/*
168 * Remove a page and va, on this cpu, from the list of kmap_atomic pages.
169 * Linear-time search, but we count on the lists being short.
170 * We don't need to adjust the PTE under the lock (as opposed to the
171 * kmap_atomic_register() case), since we're just unconditionally
172 * zeroing the PTE after it's off the list.
173 */
174static void kmap_atomic_unregister(struct page *page, unsigned long va)
175{
176 unsigned long flags;
177 struct atomic_mapped_page *amp;
178 int cpu = smp_processor_id();
179 spin_lock_irqsave(&amp_lock, flags);
180 list_for_each_entry(amp, &amp_list, list) {
181 if (amp->page == page && amp->cpu == cpu && amp->va == va)
182 break;
183 }
184 BUG_ON(&amp->list == &amp_list);
185 list_del(&amp->list);
186 spin_unlock_irqrestore(&amp_lock, flags);
187}
188
189/* Helper routine for kmap_atomic_fix_kpte(), below. */
190static void kmap_atomic_fix_one_kpte(struct atomic_mapped_page *amp,
191 int finished)
192{
193 pte_t *ptep = kmap_get_pte(amp->va);
194 if (!finished) {
195 set_pte(ptep, pte_mkmigrate(*ptep));
196 flush_remote(0, 0, NULL, amp->va, PAGE_SIZE, PAGE_SIZE,
197 cpumask_of(amp->cpu), NULL, 0);
198 } else {
199 /*
200 * Rewrite a default kernel PTE for this page.
201 * We rely on the fact that set_pte() writes the
202 * present+migrating bits last.
203 */
204 pte_t pte = mk_pte(amp->page, page_to_kpgprot(amp->page));
205 set_pte(ptep, pte);
206 }
207}
208
209/*
210 * This routine is a helper function for homecache_fix_kpte(); see
211 * its comments for more information on the "finished" argument here.
212 *
213 * Note that we hold the lock while doing the remote flushes, which
214 * will stall any unrelated cpus trying to do kmap_atomic operations.
215 * We could just update the PTEs under the lock, and save away copies
216 * of the structs (or just the va+cpu), then flush them after we
217 * release the lock, but it seems easier just to do it all under the lock.
218 */
219void kmap_atomic_fix_kpte(struct page *page, int finished)
220{
221 struct atomic_mapped_page *amp;
222 unsigned long flags;
223 spin_lock_irqsave(&amp_lock, flags);
224 list_for_each_entry(amp, &amp_list, list) {
225 if (amp->page == page)
226 kmap_atomic_fix_one_kpte(amp, finished);
227 }
228 spin_unlock_irqrestore(&amp_lock, flags);
229}
230
231/*
232 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap
233 * because the kmap code must perform a global TLB invalidation when
234 * the kmap pool wraps.
235 *
236 * Note that they may be slower than on x86 (etc.) because unlike on
237 * those platforms, we do have to take a global lock to map and unmap
238 * pages on Tile (see above).
239 *
240 * When holding an atomic kmap it is not legal to sleep, so atomic
241 * kmaps are appropriate for short, tight code paths only.
242 */
243void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
244{
245 enum fixed_addresses idx;
246 unsigned long vaddr;
247 pte_t *pte;
248
249 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
250 pagefault_disable();
251
252 /* Avoid icache flushes by disallowing atomic executable mappings. */
253 BUG_ON(pte_exec(prot));
254
255 if (!PageHighMem(page))
256 return page_address(page);
257
258 debug_kmap_atomic_prot(type);
259
260 idx = type + KM_TYPE_NR*smp_processor_id();
261 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
262 pte = kmap_get_pte(vaddr);
263 BUG_ON(!pte_none(*pte));
264
265 /* Register that this page is mapped atomically on this cpu. */
266 kmap_atomic_register(page, type, vaddr, pte, mk_pte(page, prot));
267
268 return (void *)vaddr;
269}
270EXPORT_SYMBOL(kmap_atomic_prot);
271
272void *kmap_atomic(struct page *page, enum km_type type)
273{
274 /* PAGE_NONE is a magic value that tells us to check immutability. */
275 return kmap_atomic_prot(page, type, PAGE_NONE);
276}
277EXPORT_SYMBOL(kmap_atomic);
278
279void kunmap_atomic(void *kvaddr, enum km_type type)
280{
281 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
282 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
283
284 /*
285 * Force other mappings to Oops if they try to access this pte without
286 * first remapping it. Keeping stale mappings around is a bad idea.
287 */
288 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) {
289 pte_t *pte = kmap_get_pte(vaddr);
290 pte_t pteval = *pte;
291 BUG_ON(!pte_present(pteval) && !pte_migrating(pteval));
292 kmap_atomic_unregister(pte_page(pteval), vaddr);
293 kpte_clear_flush(pte, vaddr);
294 } else {
295 /* Must be a lowmem page */
296 BUG_ON(vaddr < PAGE_OFFSET);
297 BUG_ON(vaddr >= (unsigned long)high_memory);
298 }
299
300 arch_flush_lazy_mmu_mode();
301 pagefault_enable();
302}
303EXPORT_SYMBOL(kunmap_atomic);
304
305/*
306 * This API is supposed to allow us to map memory without a "struct page".
307 * Currently we don't support this, though this may change in the future.
308 */
309void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
310{
311 return kmap_atomic(pfn_to_page(pfn), type);
312}
313void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
314{
315 return kmap_atomic_prot(pfn_to_page(pfn), type, prot);
316}
317
318struct page *kmap_atomic_to_page(void *ptr)
319{
320 pte_t *pte;
321 unsigned long vaddr = (unsigned long)ptr;
322
323 if (vaddr < FIXADDR_START)
324 return virt_to_page(ptr);
325
326 pte = kmap_get_pte(vaddr);
327 return pte_page(*pte);
328}
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
new file mode 100644
index 000000000000..97c478e7be27
--- /dev/null
+++ b/arch/tile/mm/homecache.c
@@ -0,0 +1,433 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 *
14 * This code maintains the "home" for each page in the system.
15 */
16
17#include <linux/kernel.h>
18#include <linux/mm.h>
19#include <linux/spinlock.h>
20#include <linux/list.h>
21#include <linux/bootmem.h>
22#include <linux/rmap.h>
23#include <linux/pagemap.h>
24#include <linux/mutex.h>
25#include <linux/interrupt.h>
26#include <linux/sysctl.h>
27#include <linux/pagevec.h>
28#include <linux/ptrace.h>
29#include <linux/timex.h>
30#include <linux/cache.h>
31#include <linux/smp.h>
32
33#include <asm/page.h>
34#include <asm/sections.h>
35#include <asm/tlbflush.h>
36#include <asm/pgalloc.h>
37#include <asm/homecache.h>
38
39#include "migrate.h"
40
41
42#if CHIP_HAS_COHERENT_LOCAL_CACHE()
43
44/*
45 * The noallocl2 option suppresses all use of the L2 cache to cache
46 * locally from a remote home. There's no point in using it if we
47 * don't have coherent local caching, though.
48 */
49static int __write_once noallocl2;
50static int __init set_noallocl2(char *str)
51{
52 noallocl2 = 1;
53 return 0;
54}
55early_param("noallocl2", set_noallocl2);
56
57#else
58
59#define noallocl2 0
60
61#endif
62
63/* Provide no-op versions of these routines to keep flush_remote() cleaner. */
64#define mark_caches_evicted_start() 0
65#define mark_caches_evicted_finish(mask, timestamp) do {} while (0)
66
67
68/*
69 * Update the irq_stat for cpus that we are going to interrupt
70 * with TLB or cache flushes. Also handle removing dataplane cpus
71 * from the TLB flush set, and setting dataplane_tlb_state instead.
72 */
73static void hv_flush_update(const struct cpumask *cache_cpumask,
74 struct cpumask *tlb_cpumask,
75 unsigned long tlb_va, unsigned long tlb_length,
76 HV_Remote_ASID *asids, int asidcount)
77{
78 struct cpumask mask;
79 int i, cpu;
80
81 cpumask_clear(&mask);
82 if (cache_cpumask)
83 cpumask_or(&mask, &mask, cache_cpumask);
84 if (tlb_cpumask && tlb_length) {
85 cpumask_or(&mask, &mask, tlb_cpumask);
86 }
87
88 for (i = 0; i < asidcount; ++i)
89 cpumask_set_cpu(asids[i].y * smp_width + asids[i].x, &mask);
90
91 /*
92 * Don't bother to update atomically; losing a count
93 * here is not that critical.
94 */
95 for_each_cpu(cpu, &mask)
96 ++per_cpu(irq_stat, cpu).irq_hv_flush_count;
97}
98
99/*
100 * This wrapper function around hv_flush_remote() does several things:
101 *
102 * - Provides a return value error-checking panic path, since
103 * there's never any good reason for hv_flush_remote() to fail.
104 * - Accepts a 32-bit PFN rather than a 64-bit PA, which generally
105 * is the type that Linux wants to pass around anyway.
106 * - Centralizes the mark_caches_evicted() handling.
107 * - Canonicalizes that lengths of zero make cpumasks NULL.
108 * - Handles deferring TLB flushes for dataplane tiles.
109 * - Tracks remote interrupts in the per-cpu irq_cpustat_t.
110 *
111 * Note that we have to wait until the cache flush completes before
112 * updating the per-cpu last_cache_flush word, since otherwise another
113 * concurrent flush can race, conclude the flush has already
114 * completed, and start to use the page while it's still dirty
115 * remotely (running concurrently with the actual evict, presumably).
116 */
117void flush_remote(unsigned long cache_pfn, unsigned long cache_control,
118 const struct cpumask *cache_cpumask_orig,
119 HV_VirtAddr tlb_va, unsigned long tlb_length,
120 unsigned long tlb_pgsize,
121 const struct cpumask *tlb_cpumask_orig,
122 HV_Remote_ASID *asids, int asidcount)
123{
124 int rc;
125 int timestamp = 0; /* happy compiler */
126 struct cpumask cache_cpumask_copy, tlb_cpumask_copy;
127 struct cpumask *cache_cpumask, *tlb_cpumask;
128 HV_PhysAddr cache_pa;
129 char cache_buf[NR_CPUS*5], tlb_buf[NR_CPUS*5];
130
131 mb(); /* provided just to simplify "magic hypervisor" mode */
132
133 /*
134 * Canonicalize and copy the cpumasks.
135 */
136 if (cache_cpumask_orig && cache_control) {
137 cpumask_copy(&cache_cpumask_copy, cache_cpumask_orig);
138 cache_cpumask = &cache_cpumask_copy;
139 } else {
140 cpumask_clear(&cache_cpumask_copy);
141 cache_cpumask = NULL;
142 }
143 if (cache_cpumask == NULL)
144 cache_control = 0;
145 if (tlb_cpumask_orig && tlb_length) {
146 cpumask_copy(&tlb_cpumask_copy, tlb_cpumask_orig);
147 tlb_cpumask = &tlb_cpumask_copy;
148 } else {
149 cpumask_clear(&tlb_cpumask_copy);
150 tlb_cpumask = NULL;
151 }
152
153 hv_flush_update(cache_cpumask, tlb_cpumask, tlb_va, tlb_length,
154 asids, asidcount);
155 cache_pa = (HV_PhysAddr)cache_pfn << PAGE_SHIFT;
156 if (cache_control & HV_FLUSH_EVICT_L2)
157 timestamp = mark_caches_evicted_start();
158 rc = hv_flush_remote(cache_pa, cache_control,
159 cpumask_bits(cache_cpumask),
160 tlb_va, tlb_length, tlb_pgsize,
161 cpumask_bits(tlb_cpumask),
162 asids, asidcount);
163 if (cache_control & HV_FLUSH_EVICT_L2)
164 mark_caches_evicted_finish(cache_cpumask, timestamp);
165 if (rc == 0)
166 return;
167 cpumask_scnprintf(cache_buf, sizeof(cache_buf), &cache_cpumask_copy);
168 cpumask_scnprintf(tlb_buf, sizeof(tlb_buf), &tlb_cpumask_copy);
169
170 pr_err("hv_flush_remote(%#llx, %#lx, %p [%s],"
171 " %#lx, %#lx, %#lx, %p [%s], %p, %d) = %d\n",
172 cache_pa, cache_control, cache_cpumask, cache_buf,
173 (unsigned long)tlb_va, tlb_length, tlb_pgsize,
174 tlb_cpumask, tlb_buf,
175 asids, asidcount, rc);
176 panic("Unsafe to continue.");
177}
178
179void homecache_evict(const struct cpumask *mask)
180{
181 flush_remote(0, HV_FLUSH_EVICT_L2, mask, 0, 0, 0, NULL, NULL, 0);
182}
183
184/* Return a mask of the cpus whose caches currently own these pages. */
185static void homecache_mask(struct page *page, int pages,
186 struct cpumask *home_mask)
187{
188 int i;
189 cpumask_clear(home_mask);
190 for (i = 0; i < pages; ++i) {
191 int home = page_home(&page[i]);
192 if (home == PAGE_HOME_IMMUTABLE ||
193 home == PAGE_HOME_INCOHERENT) {
194 cpumask_copy(home_mask, cpu_possible_mask);
195 return;
196 }
197#if CHIP_HAS_CBOX_HOME_MAP()
198 if (home == PAGE_HOME_HASH) {
199 cpumask_or(home_mask, home_mask, &hash_for_home_map);
200 continue;
201 }
202#endif
203 if (home == PAGE_HOME_UNCACHED)
204 continue;
205 BUG_ON(home < 0 || home >= NR_CPUS);
206 cpumask_set_cpu(home, home_mask);
207 }
208}
209
210/*
211 * Return the passed length, or zero if it's long enough that we
212 * believe we should evict the whole L2 cache.
213 */
214static unsigned long cache_flush_length(unsigned long length)
215{
216 return (length >= CHIP_L2_CACHE_SIZE()) ? HV_FLUSH_EVICT_L2 : length;
217}
218
219/* On the simulator, confirm lines have been evicted everywhere. */
220static void validate_lines_evicted(unsigned long pfn, size_t length)
221{
222 sim_syscall(SIM_SYSCALL_VALIDATE_LINES_EVICTED,
223 (HV_PhysAddr)pfn << PAGE_SHIFT, length);
224}
225
226/* Flush a page out of whatever cache(s) it is in. */
227void homecache_flush_cache(struct page *page, int order)
228{
229 int pages = 1 << order;
230 int length = cache_flush_length(pages * PAGE_SIZE);
231 unsigned long pfn = page_to_pfn(page);
232 struct cpumask home_mask;
233
234 homecache_mask(page, pages, &home_mask);
235 flush_remote(pfn, length, &home_mask, 0, 0, 0, NULL, NULL, 0);
236 validate_lines_evicted(pfn, pages * PAGE_SIZE);
237}
238
239
240/* Report the home corresponding to a given PTE. */
241static int pte_to_home(pte_t pte)
242{
243 if (hv_pte_get_nc(pte))
244 return PAGE_HOME_IMMUTABLE;
245 switch (hv_pte_get_mode(pte)) {
246 case HV_PTE_MODE_CACHE_TILE_L3:
247 return get_remote_cache_cpu(pte);
248 case HV_PTE_MODE_CACHE_NO_L3:
249 return PAGE_HOME_INCOHERENT;
250 case HV_PTE_MODE_UNCACHED:
251 return PAGE_HOME_UNCACHED;
252#if CHIP_HAS_CBOX_HOME_MAP()
253 case HV_PTE_MODE_CACHE_HASH_L3:
254 return PAGE_HOME_HASH;
255#endif
256 }
257 panic("Bad PTE %#llx\n", pte.val);
258}
259
260/* Update the home of a PTE if necessary (can also be used for a pgprot_t). */
261pte_t pte_set_home(pte_t pte, int home)
262{
263 /* Check for non-linear file mapping "PTEs" and pass them through. */
264 if (pte_file(pte))
265 return pte;
266
267#if CHIP_HAS_MMIO()
268 /* Check for MMIO mappings and pass them through. */
269 if (hv_pte_get_mode(pte) == HV_PTE_MODE_MMIO)
270 return pte;
271#endif
272
273
274 /*
275 * Only immutable pages get NC mappings. If we have a
276 * non-coherent PTE, but the underlying page is not
277 * immutable, it's likely the result of a forced
278 * caching setting running up against ptrace setting
279 * the page to be writable underneath. In this case,
280 * just keep the PTE coherent.
281 */
282 if (hv_pte_get_nc(pte) && home != PAGE_HOME_IMMUTABLE) {
283 pte = hv_pte_clear_nc(pte);
284 pr_err("non-immutable page incoherently referenced: %#llx\n",
285 pte.val);
286 }
287
288 switch (home) {
289
290 case PAGE_HOME_UNCACHED:
291 pte = hv_pte_set_mode(pte, HV_PTE_MODE_UNCACHED);
292 break;
293
294 case PAGE_HOME_INCOHERENT:
295 pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3);
296 break;
297
298 case PAGE_HOME_IMMUTABLE:
299 /*
300 * We could home this page anywhere, since it's immutable,
301 * but by default just home it to follow "hash_default".
302 */
303 BUG_ON(hv_pte_get_writable(pte));
304 if (pte_get_forcecache(pte)) {
305 /* Upgrade "force any cpu" to "No L3" for immutable. */
306 if (hv_pte_get_mode(pte) == HV_PTE_MODE_CACHE_TILE_L3
307 && pte_get_anyhome(pte)) {
308 pte = hv_pte_set_mode(pte,
309 HV_PTE_MODE_CACHE_NO_L3);
310 }
311 } else
312#if CHIP_HAS_CBOX_HOME_MAP()
313 if (hash_default)
314 pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_HASH_L3);
315 else
316#endif
317 pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3);
318 pte = hv_pte_set_nc(pte);
319 break;
320
321#if CHIP_HAS_CBOX_HOME_MAP()
322 case PAGE_HOME_HASH:
323 pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_HASH_L3);
324 break;
325#endif
326
327 default:
328 BUG_ON(home < 0 || home >= NR_CPUS ||
329 !cpu_is_valid_lotar(home));
330 pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_TILE_L3);
331 pte = set_remote_cache_cpu(pte, home);
332 break;
333 }
334
335#if CHIP_HAS_NC_AND_NOALLOC_BITS()
336 if (noallocl2)
337 pte = hv_pte_set_no_alloc_l2(pte);
338
339 /* Simplify "no local and no l3" to "uncached" */
340 if (hv_pte_get_no_alloc_l2(pte) && hv_pte_get_no_alloc_l1(pte) &&
341 hv_pte_get_mode(pte) == HV_PTE_MODE_CACHE_NO_L3) {
342 pte = hv_pte_set_mode(pte, HV_PTE_MODE_UNCACHED);
343 }
344#endif
345
346 /* Checking this case here gives a better panic than from the hv. */
347 BUG_ON(hv_pte_get_mode(pte) == 0);
348
349 return pte;
350}
351
352/*
353 * The routines in this section are the "static" versions of the normal
354 * dynamic homecaching routines; they just set the home cache
355 * of a kernel page once, and require a full-chip cache/TLB flush,
356 * so they're not suitable for anything but infrequent use.
357 */
358
359#if CHIP_HAS_CBOX_HOME_MAP()
360static inline int initial_page_home(void) { return PAGE_HOME_HASH; }
361#else
362static inline int initial_page_home(void) { return 0; }
363#endif
364
365int page_home(struct page *page)
366{
367 if (PageHighMem(page)) {
368 return initial_page_home();
369 } else {
370 unsigned long kva = (unsigned long)page_address(page);
371 return pte_to_home(*virt_to_pte(NULL, kva));
372 }
373}
374
375void homecache_change_page_home(struct page *page, int order, int home)
376{
377 int i, pages = (1 << order);
378 unsigned long kva;
379
380 BUG_ON(PageHighMem(page));
381 BUG_ON(page_count(page) > 1);
382 BUG_ON(page_mapcount(page) != 0);
383 kva = (unsigned long) page_address(page);
384 flush_remote(0, HV_FLUSH_EVICT_L2, &cpu_cacheable_map,
385 kva, pages * PAGE_SIZE, PAGE_SIZE, cpu_online_mask,
386 NULL, 0);
387
388 for (i = 0; i < pages; ++i, kva += PAGE_SIZE) {
389 pte_t *ptep = virt_to_pte(NULL, kva);
390 pte_t pteval = *ptep;
391 BUG_ON(!pte_present(pteval) || pte_huge(pteval));
392 *ptep = pte_set_home(pteval, home);
393 }
394}
395
396struct page *homecache_alloc_pages(gfp_t gfp_mask,
397 unsigned int order, int home)
398{
399 struct page *page;
400 BUG_ON(gfp_mask & __GFP_HIGHMEM); /* must be lowmem */
401 page = alloc_pages(gfp_mask, order);
402 if (page)
403 homecache_change_page_home(page, order, home);
404 return page;
405}
406
407struct page *homecache_alloc_pages_node(int nid, gfp_t gfp_mask,
408 unsigned int order, int home)
409{
410 struct page *page;
411 BUG_ON(gfp_mask & __GFP_HIGHMEM); /* must be lowmem */
412 page = alloc_pages_node(nid, gfp_mask, order);
413 if (page)
414 homecache_change_page_home(page, order, home);
415 return page;
416}
417
/*
 * Release pages obtained from homecache_alloc_pages*().
 *
 * @addr:  kernel virtual address of the first page (0 is a no-op)
 * @order: allocation order that was used for the pages
 *
 * When the last reference is dropped, the pages are re-homed to the
 * initial home and handed back to the page allocator.
 */
void homecache_free_pages(unsigned long addr, unsigned int order)
{
	struct page *page;

	if (addr == 0)
		return;

	VM_BUG_ON(!virt_addr_valid((void *)addr));
	page = virt_to_page((void *)addr);

	/* Somebody else still holds a reference: nothing more to do. */
	if (!put_page_testzero(page))
		return;

	homecache_change_page_home(page, order, initial_page_home());

	/*
	 * The reference count is zero at this point.  __free_page() and
	 * __free_pages() both drop a reference themselves before
	 * freeing, so calling them on a zero-count page (as a per-page
	 * __free_page() loop would) underflows the count and never
	 * frees anything.  Re-arm the count on the head page and free
	 * the whole allocation as a single unit of the original order.
	 */
	init_page_count(page);
	__free_pages(page, order);
}
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
new file mode 100644
index 000000000000..24688b697a8d
--- /dev/null
+++ b/arch/tile/mm/hugetlbpage.c
@@ -0,0 +1,343 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 *
14 * TILE Huge TLB Page Support for Kernel.
15 * Taken from i386 hugetlb implementation:
16 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
17 */
18
19#include <linux/init.h>
20#include <linux/fs.h>
21#include <linux/mm.h>
22#include <linux/hugetlb.h>
23#include <linux/pagemap.h>
24#include <linux/smp_lock.h>
25#include <linux/slab.h>
26#include <linux/err.h>
27#include <linux/sysctl.h>
28#include <linux/mman.h>
29#include <asm/tlb.h>
30#include <asm/tlbflush.h>
31
32pte_t *huge_pte_alloc(struct mm_struct *mm,
33 unsigned long addr, unsigned long sz)
34{
35 pgd_t *pgd;
36 pud_t *pud;
37 pte_t *pte = NULL;
38
39 /* We do not yet support multiple huge page sizes. */
40 BUG_ON(sz != PMD_SIZE);
41
42 pgd = pgd_offset(mm, addr);
43 pud = pud_alloc(mm, pgd, addr);
44 if (pud)
45 pte = (pte_t *) pmd_alloc(mm, pud, addr);
46 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
47
48 return pte;
49}
50
51pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
52{
53 pgd_t *pgd;
54 pud_t *pud;
55 pmd_t *pmd = NULL;
56
57 pgd = pgd_offset(mm, addr);
58 if (pgd_present(*pgd)) {
59 pud = pud_offset(pgd, addr);
60 if (pud_present(*pud))
61 pmd = pmd_offset(pud, addr);
62 }
63 return (pte_t *) pmd;
64}
65
66#ifdef HUGETLB_TEST
67struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
68 int write)
69{
70 unsigned long start = address;
71 int length = 1;
72 int nr;
73 struct page *page;
74 struct vm_area_struct *vma;
75
76 vma = find_vma(mm, addr);
77 if (!vma || !is_vm_hugetlb_page(vma))
78 return ERR_PTR(-EINVAL);
79
80 pte = huge_pte_offset(mm, address);
81
82 /* hugetlb should be locked, and hence, prefaulted */
83 WARN_ON(!pte || pte_none(*pte));
84
85 page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
86
87 WARN_ON(!PageHead(page));
88
89 return page;
90}
91
92int pmd_huge(pmd_t pmd)
93{
94 return 0;
95}
96
97int pud_huge(pud_t pud)
98{
99 return 0;
100}
101
102struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
103 pmd_t *pmd, int write)
104{
105 return NULL;
106}
107
108#else
109
110struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
111 int write)
112{
113 return ERR_PTR(-EINVAL);
114}
115
116int pmd_huge(pmd_t pmd)
117{
118 return !!(pmd_val(pmd) & _PAGE_HUGE_PAGE);
119}
120
121int pud_huge(pud_t pud)
122{
123 return !!(pud_val(pud) & _PAGE_HUGE_PAGE);
124}
125
126struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
127 pmd_t *pmd, int write)
128{
129 struct page *page;
130
131 page = pte_page(*(pte_t *)pmd);
132 if (page)
133 page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
134 return page;
135}
136
137struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
138 pud_t *pud, int write)
139{
140 struct page *page;
141
142 page = pte_page(*(pte_t *)pud);
143 if (page)
144 page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
145 return page;
146}
147
148int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
149{
150 return 0;
151}
152
153#endif
154
155#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
156static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
157 unsigned long addr, unsigned long len,
158 unsigned long pgoff, unsigned long flags)
159{
160 struct hstate *h = hstate_file(file);
161 struct mm_struct *mm = current->mm;
162 struct vm_area_struct *vma;
163 unsigned long start_addr;
164
165 if (len > mm->cached_hole_size) {
166 start_addr = mm->free_area_cache;
167 } else {
168 start_addr = TASK_UNMAPPED_BASE;
169 mm->cached_hole_size = 0;
170 }
171
172full_search:
173 addr = ALIGN(start_addr, huge_page_size(h));
174
175 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
176 /* At this point: (!vma || addr < vma->vm_end). */
177 if (TASK_SIZE - len < addr) {
178 /*
179 * Start a new search - just in case we missed
180 * some holes.
181 */
182 if (start_addr != TASK_UNMAPPED_BASE) {
183 start_addr = TASK_UNMAPPED_BASE;
184 mm->cached_hole_size = 0;
185 goto full_search;
186 }
187 return -ENOMEM;
188 }
189 if (!vma || addr + len <= vma->vm_start) {
190 mm->free_area_cache = addr + len;
191 return addr;
192 }
193 if (addr + mm->cached_hole_size < vma->vm_start)
194 mm->cached_hole_size = vma->vm_start - addr;
195 addr = ALIGN(vma->vm_end, huge_page_size(h));
196 }
197}
198
199static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
200 unsigned long addr0, unsigned long len,
201 unsigned long pgoff, unsigned long flags)
202{
203 struct hstate *h = hstate_file(file);
204 struct mm_struct *mm = current->mm;
205 struct vm_area_struct *vma, *prev_vma;
206 unsigned long base = mm->mmap_base, addr = addr0;
207 unsigned long largest_hole = mm->cached_hole_size;
208 int first_time = 1;
209
210 /* don't allow allocations above current base */
211 if (mm->free_area_cache > base)
212 mm->free_area_cache = base;
213
214 if (len <= largest_hole) {
215 largest_hole = 0;
216 mm->free_area_cache = base;
217 }
218try_again:
219 /* make sure it can fit in the remaining address space */
220 if (mm->free_area_cache < len)
221 goto fail;
222
223 /* either no address requested or cant fit in requested address hole */
224 addr = (mm->free_area_cache - len) & huge_page_mask(h);
225 do {
226 /*
227 * Lookup failure means no vma is above this address,
228 * i.e. return with success:
229 */
230 vma = find_vma_prev(mm, addr, &prev_vma);
231 if (!vma) {
232 return addr;
233 break;
234 }
235
236 /*
237 * new region fits between prev_vma->vm_end and
238 * vma->vm_start, use it:
239 */
240 if (addr + len <= vma->vm_start &&
241 (!prev_vma || (addr >= prev_vma->vm_end))) {
242 /* remember the address as a hint for next time */
243 mm->cached_hole_size = largest_hole;
244 mm->free_area_cache = addr;
245 return addr;
246 } else {
247 /* pull free_area_cache down to the first hole */
248 if (mm->free_area_cache == vma->vm_end) {
249 mm->free_area_cache = vma->vm_start;
250 mm->cached_hole_size = largest_hole;
251 }
252 }
253
254 /* remember the largest hole we saw so far */
255 if (addr + largest_hole < vma->vm_start)
256 largest_hole = vma->vm_start - addr;
257
258 /* try just below the current vma->vm_start */
259 addr = (vma->vm_start - len) & huge_page_mask(h);
260
261 } while (len <= vma->vm_start);
262
263fail:
264 /*
265 * if hint left us with no space for the requested
266 * mapping then try again:
267 */
268 if (first_time) {
269 mm->free_area_cache = base;
270 largest_hole = 0;
271 first_time = 0;
272 goto try_again;
273 }
274 /*
275 * A failed mmap() very likely causes application failure,
276 * so fall back to the bottom-up function here. This scenario
277 * can happen with large stack limits and large mmap()
278 * allocations.
279 */
280 mm->free_area_cache = TASK_UNMAPPED_BASE;
281 mm->cached_hole_size = ~0UL;
282 addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
283 len, pgoff, flags);
284
285 /*
286 * Restore the topdown base:
287 */
288 mm->free_area_cache = base;
289 mm->cached_hole_size = ~0UL;
290
291 return addr;
292}
293
294unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
295 unsigned long len, unsigned long pgoff, unsigned long flags)
296{
297 struct hstate *h = hstate_file(file);
298 struct mm_struct *mm = current->mm;
299 struct vm_area_struct *vma;
300
301 if (len & ~huge_page_mask(h))
302 return -EINVAL;
303 if (len > TASK_SIZE)
304 return -ENOMEM;
305
306 if (flags & MAP_FIXED) {
307 if (prepare_hugepage_range(file, addr, len))
308 return -EINVAL;
309 return addr;
310 }
311
312 if (addr) {
313 addr = ALIGN(addr, huge_page_size(h));
314 vma = find_vma(mm, addr);
315 if (TASK_SIZE - len >= addr &&
316 (!vma || addr + len <= vma->vm_start))
317 return addr;
318 }
319 if (current->mm->get_unmapped_area == arch_get_unmapped_area)
320 return hugetlb_get_unmapped_area_bottomup(file, addr, len,
321 pgoff, flags);
322 else
323 return hugetlb_get_unmapped_area_topdown(file, addr, len,
324 pgoff, flags);
325}
326
327static __init int setup_hugepagesz(char *opt)
328{
329 unsigned long ps = memparse(opt, &opt);
330 if (ps == PMD_SIZE) {
331 hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
332 } else if (ps == PUD_SIZE) {
333 hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
334 } else {
335 pr_err("hugepagesz: Unsupported page size %lu M\n",
336 ps >> 20);
337 return 0;
338 }
339 return 1;
340}
341__setup("hugepagesz=", setup_hugepagesz);
342
343#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
new file mode 100644
index 000000000000..d89c9eacd162
--- /dev/null
+++ b/arch/tile/mm/init.c
@@ -0,0 +1,1085 @@
1/*
2 * Copyright (C) 1995 Linus Torvalds
3 * Copyright 2010 Tilera Corporation. All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation, version 2.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
12 * NON INFRINGEMENT. See the GNU General Public License for
13 * more details.
14 */
15
16#include <linux/module.h>
17#include <linux/signal.h>
18#include <linux/sched.h>
19#include <linux/kernel.h>
20#include <linux/errno.h>
21#include <linux/string.h>
22#include <linux/types.h>
23#include <linux/ptrace.h>
24#include <linux/mman.h>
25#include <linux/mm.h>
26#include <linux/hugetlb.h>
27#include <linux/swap.h>
28#include <linux/smp.h>
29#include <linux/init.h>
30#include <linux/highmem.h>
31#include <linux/pagemap.h>
32#include <linux/poison.h>
33#include <linux/bootmem.h>
34#include <linux/slab.h>
35#include <linux/proc_fs.h>
36#include <linux/efi.h>
37#include <linux/memory_hotplug.h>
38#include <linux/uaccess.h>
39#include <asm/mmu_context.h>
40#include <asm/processor.h>
41#include <asm/system.h>
42#include <asm/pgtable.h>
43#include <asm/pgalloc.h>
44#include <asm/dma.h>
45#include <asm/fixmap.h>
46#include <asm/tlb.h>
47#include <asm/tlbflush.h>
48#include <asm/sections.h>
49#include <asm/setup.h>
50#include <asm/homecache.h>
51#include <hv/hypervisor.h>
52#include <arch/chip.h>
53
54#include "migrate.h"
55
56/*
57 * We could set FORCE_MAX_ZONEORDER to "(HPAGE_SHIFT - PAGE_SHIFT + 1)"
58 * in the Tile Kconfig, but this generates configure warnings.
59 * Do it here and force people to get it right to compile this file.
60 * The problem is that with 4KB small pages and 16MB huge pages,
61 * the default value doesn't allow us to group enough small pages
62 * together to make up a huge page.
63 */
64#if CONFIG_FORCE_MAX_ZONEORDER < HPAGE_SHIFT - PAGE_SHIFT + 1
65# error "Change FORCE_MAX_ZONEORDER in arch/tile/Kconfig to match page size"
66#endif
67
68#define clear_pgd(pmdptr) (*(pmdptr) = hv_pte(0))
69
70#ifndef __tilegx__
71unsigned long VMALLOC_RESERVE = CONFIG_VMALLOC_RESERVE;
72#endif
73
74DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
75
76/* Create an L2 page table */
77static pte_t * __init alloc_pte(void)
78{
79 return __alloc_bootmem(L2_KERNEL_PGTABLE_SIZE, HV_PAGE_TABLE_ALIGN, 0);
80}
81
82/*
83 * L2 page tables per controller. We allocate these all at once from
84 * the bootmem allocator and store them here. This saves on kernel L2
85 * page table memory, compared to allocating a full 64K page per L2
86 * page table, and also means that in cases where we use huge pages,
87 * we are guaranteed to later be able to shatter those huge pages and
88 * switch to using these page tables instead, without requiring
89 * further allocation. Each l2_ptes[] entry points to the first page
90 * table for the first hugepage-size piece of memory on the
91 * controller; other page tables are just indexed directly, i.e. the
92 * L2 page tables are contiguous in memory for each controller.
93 */
94static pte_t *l2_ptes[MAX_NUMNODES];
95static int num_l2_ptes[MAX_NUMNODES];
96
97static void init_prealloc_ptes(int node, int pages)
98{
99 BUG_ON(pages & (HV_L2_ENTRIES-1));
100 if (pages) {
101 num_l2_ptes[node] = pages;
102 l2_ptes[node] = __alloc_bootmem(pages * sizeof(pte_t),
103 HV_PAGE_TABLE_ALIGN, 0);
104 }
105}
106
107pte_t *get_prealloc_pte(unsigned long pfn)
108{
109 int node = pfn_to_nid(pfn);
110 pfn &= ~(-1UL << (NR_PA_HIGHBIT_SHIFT - PAGE_SHIFT));
111 BUG_ON(node >= MAX_NUMNODES);
112 BUG_ON(pfn >= num_l2_ptes[node]);
113 return &l2_ptes[node][pfn];
114}
115
116/*
117 * What caching do we expect pages from the heap to have when
118 * they are allocated during bootup? (Once we've installed the
119 * "real" swapper_pg_dir.)
120 */
121static int initial_heap_home(void)
122{
123#if CHIP_HAS_CBOX_HOME_MAP()
124 if (hash_default)
125 return PAGE_HOME_HASH;
126#endif
127 return smp_processor_id();
128}
129
130/*
131 * Place a pointer to an L2 page table in a middle page
132 * directory entry.
133 */
134static void __init assign_pte(pmd_t *pmd, pte_t *page_table)
135{
136 phys_addr_t pa = __pa(page_table);
137 unsigned long l2_ptfn = pa >> HV_LOG2_PAGE_TABLE_ALIGN;
138 pte_t pteval = hv_pte_set_ptfn(__pgprot(_PAGE_TABLE), l2_ptfn);
139 BUG_ON((pa & (HV_PAGE_TABLE_ALIGN-1)) != 0);
140 pteval = pte_set_home(pteval, initial_heap_home());
141 *(pte_t *)pmd = pteval;
142 if (page_table != (pte_t *)pmd_page_vaddr(*pmd))
143 BUG();
144}
145
146#ifdef __tilegx__
147
148#if HV_L1_SIZE != HV_L2_SIZE
149# error Rework assumption that L1 and L2 page tables are same size.
150#endif
151
152/* Since pmd_t arrays and pte_t arrays are the same size, just use casts. */
153static inline pmd_t *alloc_pmd(void)
154{
155 return (pmd_t *)alloc_pte();
156}
157
158static inline void assign_pmd(pud_t *pud, pmd_t *pmd)
159{
160 assign_pte((pmd_t *)pud, (pte_t *)pmd);
161}
162
163#endif /* __tilegx__ */
164
165/* Replace the given pmd with a full PTE table. */
166void __init shatter_pmd(pmd_t *pmd)
167{
168 pte_t *pte = get_prealloc_pte(pte_pfn(*(pte_t *)pmd));
169 assign_pte(pmd, pte);
170}
171
172#ifdef CONFIG_HIGHMEM
173/*
174 * This function initializes a certain range of kernel virtual memory
175 * with new bootmem page tables, everywhere page tables are missing in
176 * the given range.
177 */
178
179/*
180 * NOTE: The pagetables are allocated contiguous on the physical space
181 * so we can cache the place of the first one and move around without
182 * checking the pgd every time.
183 */
184static void __init page_table_range_init(unsigned long start,
185 unsigned long end, pgd_t *pgd_base)
186{
187 pgd_t *pgd;
188 int pgd_idx;
189 unsigned long vaddr;
190
191 vaddr = start;
192 pgd_idx = pgd_index(vaddr);
193 pgd = pgd_base + pgd_idx;
194
195 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
196 pmd_t *pmd = pmd_offset(pud_offset(pgd, vaddr), vaddr);
197 if (pmd_none(*pmd))
198 assign_pte(pmd, alloc_pte());
199 vaddr += PMD_SIZE;
200 }
201}
202#endif /* CONFIG_HIGHMEM */
203
204
205#if CHIP_HAS_CBOX_HOME_MAP()
206
207static int __initdata ktext_hash = 1; /* .text pages */
208static int __initdata kdata_hash = 1; /* .data and .bss pages */
209int __write_once hash_default = 1; /* kernel allocator pages */
210EXPORT_SYMBOL(hash_default);
211int __write_once kstack_hash = 1; /* if no homecaching, use h4h */
212#endif /* CHIP_HAS_CBOX_HOME_MAP */
213
214/*
215 * CPUs to use for striping the pages of kernel data. If hash-for-home
216 * is available, this is only relevant if kcache_hash sets up the
217 * .data and .bss to be page-homed, and we don't want the default mode
218 * of using the full set of kernel cpus for the striping.
219 */
220static __initdata struct cpumask kdata_mask;
221static __initdata int kdata_arg_seen;
222
223int __write_once kdata_huge; /* if no homecaching, small pages */
224
225
226/* Combine a generic pgprot_t with cache home to get a cache-aware pgprot. */
227static pgprot_t __init construct_pgprot(pgprot_t prot, int home)
228{
229 prot = pte_set_home(prot, home);
230#if CHIP_HAS_CBOX_HOME_MAP()
231 if (home == PAGE_HOME_IMMUTABLE) {
232 if (ktext_hash)
233 prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_HASH_L3);
234 else
235 prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_NO_L3);
236 }
237#endif
238 return prot;
239}
240
241/*
242 * For a given kernel data VA, how should it be cached?
243 * We return the complete pgprot_t with caching bits set.
244 */
245static pgprot_t __init init_pgprot(ulong address)
246{
247 int cpu;
248 unsigned long page;
249 enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
250
251#if CHIP_HAS_CBOX_HOME_MAP()
252 /* For kdata=huge, everything is just hash-for-home. */
253 if (kdata_huge)
254 return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
255#endif
256
257 /* We map the aliased pages of permanent text inaccessible. */
258 if (address < (ulong) _sinittext - CODE_DELTA)
259 return PAGE_NONE;
260
261 /*
262 * We map read-only data non-coherent for performance. We could
263 * use neighborhood caching on TILE64, but it's not clear it's a win.
264 */
265 if ((address >= (ulong) __start_rodata &&
266 address < (ulong) __end_rodata) ||
267 address == (ulong) empty_zero_page) {
268 return construct_pgprot(PAGE_KERNEL_RO, PAGE_HOME_IMMUTABLE);
269 }
270
271 /* As a performance optimization, keep the boot init stack here. */
272 if (address >= (ulong)&init_thread_union &&
273 address < (ulong)&init_thread_union + THREAD_SIZE)
274 return construct_pgprot(PAGE_KERNEL, smp_processor_id());
275
276#ifndef __tilegx__
277#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
278 /* Force the atomic_locks[] array page to be hash-for-home. */
279 if (address == (ulong) atomic_locks)
280 return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
281#endif
282#endif
283
284 /*
285 * Everything else that isn't data or bss is heap, so mark it
286 * with the initial heap home (hash-for-home, or this cpu). This
287 * includes any addresses after the loaded image and any address before
288 * _einitdata, since we already captured the case of text before
289 * _sinittext, and __pa(einittext) is approximately __pa(sinitdata).
290 *
291 * All the LOWMEM pages that we mark this way will get their
292 * struct page homecache properly marked later, in set_page_homes().
293 * The HIGHMEM pages we leave with a default zero for their
294 * homes, but with a zero free_time we don't have to actually
295 * do a flush action the first time we use them, either.
296 */
297 if (address >= (ulong) _end || address < (ulong) _einitdata)
298 return construct_pgprot(PAGE_KERNEL, initial_heap_home());
299
300#if CHIP_HAS_CBOX_HOME_MAP()
301 /* Use hash-for-home if requested for data/bss. */
302 if (kdata_hash)
303 return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
304#endif
305
306 /*
307 * Make the w1data homed like heap to start with, to avoid
308 * making it part of the page-striped data area when we're just
309 * going to convert it to read-only soon anyway.
310 */
311 if (address >= (ulong)__w1data_begin && address < (ulong)__w1data_end)
312 return construct_pgprot(PAGE_KERNEL, initial_heap_home());
313
314 /*
315 * Otherwise we just hand out consecutive cpus. To avoid
316 * requiring this function to hold state, we just walk forward from
317 * _sdata by PAGE_SIZE, skipping the readonly and init data, to reach
318 * the requested address, while walking cpu home around kdata_mask.
319 * This is typically no more than a dozen or so iterations.
320 */
321 page = (((ulong)__w1data_end) + PAGE_SIZE - 1) & PAGE_MASK;
322 BUG_ON(address < page || address >= (ulong)_end);
323 cpu = cpumask_first(&kdata_mask);
324 for (; page < address; page += PAGE_SIZE) {
325 if (page >= (ulong)&init_thread_union &&
326 page < (ulong)&init_thread_union + THREAD_SIZE)
327 continue;
328 if (page == (ulong)empty_zero_page)
329 continue;
330#ifndef __tilegx__
331#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
332 if (page == (ulong)atomic_locks)
333 continue;
334#endif
335#endif
336 cpu = cpumask_next(cpu, &kdata_mask);
337 if (cpu == NR_CPUS)
338 cpu = cpumask_first(&kdata_mask);
339 }
340 return construct_pgprot(PAGE_KERNEL, cpu);
341}
342
343/*
344 * This function sets up how we cache the kernel text. If we have
345 * hash-for-home support, normally that is used instead (see the
346 * kcache_hash boot flag for more information). But if we end up
347 * using a page-based caching technique, this option sets up the
348 * details of that. In addition, the "ktext=nocache" option may
349 * always be used to disable local caching of text pages, if desired.
350 */
351
352static int __initdata ktext_arg_seen;
353static int __initdata ktext_small;
354static int __initdata ktext_local;
355static int __initdata ktext_all;
356static int __initdata ktext_nondataplane;
357static int __initdata ktext_nocache;
358static struct cpumask __initdata ktext_mask;
359
360static int __init setup_ktext(char *str)
361{
362 if (str == NULL)
363 return -EINVAL;
364
365 /* If you have a leading "nocache", turn off ktext caching */
366 if (strncmp(str, "nocache", 7) == 0) {
367 ktext_nocache = 1;
368 pr_info("ktext: disabling local caching of kernel text\n");
369 str += 7;
370 if (*str == ',')
371 ++str;
372 if (*str == '\0')
373 return 0;
374 }
375
376 ktext_arg_seen = 1;
377
378 /* Default setting on Tile64: use a huge page */
379 if (strcmp(str, "huge") == 0)
380 pr_info("ktext: using one huge locally cached page\n");
381
382 /* Pay TLB cost but get no cache benefit: cache small pages locally */
383 else if (strcmp(str, "local") == 0) {
384 ktext_small = 1;
385 ktext_local = 1;
386 pr_info("ktext: using small pages with local caching\n");
387 }
388
389 /* Neighborhood cache ktext pages on all cpus. */
390 else if (strcmp(str, "all") == 0) {
391 ktext_small = 1;
392 ktext_all = 1;
393 pr_info("ktext: using maximal caching neighborhood\n");
394 }
395
396
397 /* Neighborhood ktext pages on specified mask */
398 else if (cpulist_parse(str, &ktext_mask) == 0) {
399 char buf[NR_CPUS * 5];
400 cpulist_scnprintf(buf, sizeof(buf), &ktext_mask);
401 if (cpumask_weight(&ktext_mask) > 1) {
402 ktext_small = 1;
403 pr_info("ktext: using caching neighborhood %s "
404 "with small pages\n", buf);
405 } else {
406 pr_info("ktext: caching on cpu %s with one huge page\n",
407 buf);
408 }
409 }
410
411 else if (*str)
412 return -EINVAL;
413
414 return 0;
415}
416
417early_param("ktext", setup_ktext);
418
419
420static inline pgprot_t ktext_set_nocache(pgprot_t prot)
421{
422 if (!ktext_nocache)
423 prot = hv_pte_set_nc(prot);
424#if CHIP_HAS_NC_AND_NOALLOC_BITS()
425 else
426 prot = hv_pte_set_no_alloc_l2(prot);
427#endif
428 return prot;
429}
430
431#ifndef __tilegx__
432static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va)
433{
434 return pmd_offset(pud_offset(&pgtables[pgd_index(va)], va), va);
435}
436#else
437static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va)
438{
439 pud_t *pud = pud_offset(&pgtables[pgd_index(va)], va);
440 if (pud_none(*pud))
441 assign_pmd(pud, alloc_pmd());
442 return pmd_offset(pud, va);
443}
444#endif
445
446/* Temporary page table we use for staging. */
447static pgd_t pgtables[PTRS_PER_PGD]
448 __attribute__((section(".init.page")));
449
450/*
451 * This maps the physical memory to kernel virtual address space, a total
452 * of max_low_pfn pages, by creating page tables starting from address
453 * PAGE_OFFSET.
454 *
455 * This routine transitions us from using a set of compiled-in large
456 * pages to using some more precise caching, including removing access
457 * to code pages mapped at PAGE_OFFSET (executed only at MEM_SV_START)
458 * marking read-only data as locally cacheable, striping the remaining
459 * .data and .bss across all the available tiles, and removing access
460 * to pages above the top of RAM (thus ensuring a page fault from a bad
461 * virtual address rather than a hypervisor shoot down for accessing
462 * memory outside the assigned limits).
463 */
/*
 * Build the kernel mappings in the staging table pgtables[], switch to
 * it via flush_and_install_context(), then copy it into pgd_base and
 * install that as the page table.
 */
464static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
465{
466 unsigned long address, pfn;
467 pmd_t *pmd;
468 pte_t *pte;
469 int pte_ofs;
470 const struct cpumask *my_cpu_mask = cpumask_of(smp_processor_id());
471 struct cpumask kstripe_mask;
472 int rc, i;
473
474#if CHIP_HAS_CBOX_HOME_MAP()
 /* Reconcile boot arguments that conflict with the hash-for-home flags. */
475 if (ktext_arg_seen && ktext_hash) {
476 pr_warning("warning: \"ktext\" boot argument ignored"
477 " if \"kcache_hash\" sets up text hash-for-home\n");
478 ktext_small = 0;
479 }
480
481 if (kdata_arg_seen && kdata_hash) {
482 pr_warning("warning: \"kdata\" boot argument ignored"
483 " if \"kcache_hash\" sets up data hash-for-home\n");
484 }
485
486 if (kdata_huge && !hash_default) {
487 pr_warning("warning: disabling \"kdata=huge\"; requires"
488 " kcache_hash=all or =allbutstack\n");
489 kdata_huge = 0;
490 }
491#endif
492
493 /*
494 * Set up a mask for cpus to use for kernel striping.
495 * This is normally all cpus, but minus dataplane cpus if any.
496 * If the dataplane covers the whole chip, we stripe over
497 * the whole chip too.
498 */
499 cpumask_copy(&kstripe_mask, cpu_possible_mask);
500 if (!kdata_arg_seen)
501 kdata_mask = kstripe_mask;
502
503 /* Allocate and fill in L2 page tables */
504 for (i = 0; i < MAX_NUMNODES; ++i) {
505#ifdef CONFIG_HIGHMEM
506 unsigned long end_pfn = node_lowmem_end_pfn[i];
507#else
508 unsigned long end_pfn = node_end_pfn[i];
509#endif
 /*
  * Stays 0 unless kdata_huge is set, in which case every pfn
  * below it is mapped with huge pages.
  */
510 unsigned long end_huge_pfn = 0;
511
512 /* Pre-shatter the last huge page to allow per-cpu pages. */
513 if (kdata_huge)
514 end_huge_pfn = end_pfn - (HPAGE_SIZE >> PAGE_SHIFT);
515
516 pfn = node_start_pfn[i];
517
518 /* Allocate enough memory to hold L2 page tables for node. */
519 init_prealloc_ptes(i, end_pfn - pfn);
520
521 address = (unsigned long) pfn_to_kaddr(pfn);
 /* Each iteration covers PTRS_PER_PTE small pages (one pmd entry). */
522 while (pfn < end_pfn) {
523 BUG_ON(address & (HPAGE_SIZE-1));
524 pmd = get_pmd(pgtables, address);
525 pte = get_prealloc_pte(pfn);
526 if (pfn < end_huge_pfn) {
 /*
  * Huge mapping: the pmd itself holds a huge PTE, but the
  * preallocated L2 entries are filled in too, with the same
  * pgprot as the huge PTE.
  */
527 pgprot_t prot = init_pgprot(address);
528 *(pte_t *)pmd = pte_mkhuge(pfn_pte(pfn, prot));
529 for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE;
530 pfn++, pte_ofs++, address += PAGE_SIZE)
531 pte[pte_ofs] = pfn_pte(pfn, prot);
532 } else {
 /*
  * Small mapping: every page gets its own init_pgprot(),
  * then the L2 table is hooked into the pmd.
  */
533 if (kdata_huge)
534 printk(KERN_DEBUG "pre-shattered huge"
535 " page at %#lx\n", address);
536 for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE;
537 pfn++, pte_ofs++, address += PAGE_SIZE) {
538 pgprot_t prot = init_pgprot(address);
539 pte[pte_ofs] = pfn_pte(pfn, prot);
540 }
541 assign_pte(pmd, pte);
542 }
543 }
544 }
545
546 /*
547 * Set or check ktext_mask now that we have cpu_possible_mask
548 * and kstripe_mask to work with.
549 */
550 if (ktext_all)
551 cpumask_copy(&ktext_mask, cpu_possible_mask);
552 else if (ktext_nondataplane)
553 ktext_mask = kstripe_mask;
554 else if (!cpumask_empty(&ktext_mask)) {
555 /* Sanity-check any mask that was requested */
556 struct cpumask bad;
557 cpumask_andnot(&bad, &ktext_mask, cpu_possible_mask);
558 cpumask_and(&ktext_mask, &ktext_mask, cpu_possible_mask);
559 if (!cpumask_empty(&bad)) {
560 char buf[NR_CPUS * 5];
561 cpulist_scnprintf(buf, sizeof(buf), &bad);
562 pr_info("ktext: not using unavailable cpus %s\n", buf);
563 }
564 if (cpumask_empty(&ktext_mask)) {
565 pr_warning("ktext: no valid cpus; caching on %d.\n",
566 smp_processor_id());
567 cpumask_copy(&ktext_mask,
568 cpumask_of(smp_processor_id()));
569 }
570 }
571
 /* Now map the kernel text, which starts at MEM_SV_INTRPT. */
572 address = MEM_SV_INTRPT;
573 pmd = get_pmd(pgtables, address);
574 if (ktext_small) {
575 /* Allocate an L2 PTE for the kernel text */
576 int cpu = 0;
577 pgprot_t prot = construct_pgprot(PAGE_KERNEL_EXEC,
578 PAGE_HOME_IMMUTABLE);
579
580 if (ktext_local) {
581 if (ktext_nocache)
582 prot = hv_pte_set_mode(prot,
583 HV_PTE_MODE_UNCACHED);
584 else
585 prot = hv_pte_set_mode(prot,
586 HV_PTE_MODE_CACHE_NO_L3);
587 } else {
588 prot = hv_pte_set_mode(prot,
589 HV_PTE_MODE_CACHE_TILE_L3);
590 cpu = cpumask_first(&ktext_mask);
591
592 prot = ktext_set_nocache(prot);
593 }
594
595 BUG_ON(address != (unsigned long)_stext);
596 pfn = 0; /* code starts at PA 0 */
597 pte = alloc_pte();
 /*
  * One PTE per text page up to _einittext; unless caching is
  * local, successive pages rotate through the cpus in ktext_mask.
  */
598 for (pte_ofs = 0; address < (unsigned long)_einittext;
599 pfn++, pte_ofs++, address += PAGE_SIZE) {
600 if (!ktext_local) {
601 prot = set_remote_cache_cpu(prot, cpu);
602 cpu = cpumask_next(cpu, &ktext_mask);
603 if (cpu == NR_CPUS)
604 cpu = cpumask_first(&ktext_mask);
605 }
606 pte[pte_ofs] = pfn_pte(pfn, prot);
607 }
608 assign_pte(pmd, pte);
609 } else {
 /* Single huge PTE for the text, written directly into the pmd. */
610 pte_t pteval = pfn_pte(0, PAGE_KERNEL_EXEC);
611 pteval = pte_mkhuge(pteval);
612#if CHIP_HAS_CBOX_HOME_MAP()
613 if (ktext_hash) {
614 pteval = hv_pte_set_mode(pteval,
615 HV_PTE_MODE_CACHE_HASH_L3);
616 pteval = ktext_set_nocache(pteval);
617 } else
618#endif /* CHIP_HAS_CBOX_HOME_MAP() */
619 if (cpumask_weight(&ktext_mask) == 1) {
620 pteval = set_remote_cache_cpu(pteval,
621 cpumask_first(&ktext_mask));
622 pteval = hv_pte_set_mode(pteval,
623 HV_PTE_MODE_CACHE_TILE_L3);
624 pteval = ktext_set_nocache(pteval);
625 } else if (ktext_nocache)
626 pteval = hv_pte_set_mode(pteval,
627 HV_PTE_MODE_UNCACHED);
628 else
629 pteval = hv_pte_set_mode(pteval,
630 HV_PTE_MODE_CACHE_NO_L3);
631 *(pte_t *)pmd = pteval;
632 }
633
634 /* Set swapper_pgprot here so it is flushed to memory right away. */
635 swapper_pgprot = init_pgprot((unsigned long)swapper_pg_dir);
636
637 /*
638 * Since we may be changing the caching of the stack and page
639 * table itself, we invoke an assembly helper to do the
640 * following steps:
641 *
642 * - flush the cache so we start with an empty slate
643 * - install pgtables[] as the real page table
644 * - flush the TLB so the new page table takes effect
645 */
646 rc = flush_and_install_context(__pa(pgtables),
647 init_pgprot((unsigned long)pgtables),
648 __get_cpu_var(current_asid),
649 cpumask_bits(my_cpu_mask));
650 BUG_ON(rc != 0);
651
652 /* Copy the page table back to the normal swapper_pg_dir. */
653 memcpy(pgd_base, pgtables, sizeof(pgtables));
654 __install_page_table(pgd_base, __get_cpu_var(current_asid),
655 swapper_pgprot);
656}
657
658/*
659 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
660 * is valid. The argument is a physical page number.
661 *
662 * On Tile, the only valid things for which we can just hand out unchecked
663 * PTEs are the kernel code and data. Anything else might change its
664 * homing with time, and we wouldn't know to adjust the /dev/mem PTEs.
665 * Note that init_thread_union is released to heap soon after boot,
666 * so we include it in the init data.
667 *
668 * For TILE-Gx, we might want to consider allowing access to PA
669 * regions corresponding to PCI space, etc.
670 */
671int devmem_is_allowed(unsigned long pagenr)
672{
673 return pagenr < kaddr_to_pfn(_end) &&
674 !(pagenr >= kaddr_to_pfn(&init_thread_union) ||
675 pagenr < kaddr_to_pfn(_einitdata)) &&
676 !(pagenr >= kaddr_to_pfn(_sinittext) ||
677 pagenr <= kaddr_to_pfn(_einittext-1));
678}
679
680#ifdef CONFIG_HIGHMEM
681static void __init permanent_kmaps_init(pgd_t *pgd_base)
682{
683 pgd_t *pgd;
684 pud_t *pud;
685 pmd_t *pmd;
686 pte_t *pte;
687 unsigned long vaddr;
688
689 vaddr = PKMAP_BASE;
690 page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
691
692 pgd = swapper_pg_dir + pgd_index(vaddr);
693 pud = pud_offset(pgd, vaddr);
694 pmd = pmd_offset(pud, vaddr);
695 pte = pte_offset_kernel(pmd, vaddr);
696 pkmap_page_table = pte;
697}
698#endif /* CONFIG_HIGHMEM */
699
700
701static void __init init_free_pfn_range(unsigned long start, unsigned long end)
702{
703 unsigned long pfn;
704 struct page *page = pfn_to_page(start);
705
706 for (pfn = start; pfn < end; ) {
707 /* Optimize by freeing pages in large batches */
708 int order = __ffs(pfn);
709 int count, i;
710 struct page *p;
711
712 if (order >= MAX_ORDER)
713 order = MAX_ORDER-1;
714 count = 1 << order;
715 while (pfn + count > end) {
716 count >>= 1;
717 --order;
718 }
719 for (p = page, i = 0; i < count; ++i, ++p) {
720 __ClearPageReserved(p);
721 /*
722 * Hacky direct set to avoid unnecessary
723 * lock take/release for EVERY page here.
724 */
725 p->_count.counter = 0;
726 p->_mapcount.counter = -1;
727 }
728 init_page_count(page);
729 __free_pages(page, order);
730 totalram_pages += count;
731
732 page += count;
733 pfn += count;
734 }
735}
736
737static void __init set_non_bootmem_pages_init(void)
738{
739 struct zone *z;
740 for_each_zone(z) {
741 unsigned long start, end;
742 int nid = z->zone_pgdat->node_id;
743 int idx = zone_idx(z);
744
745 start = z->zone_start_pfn;
746 if (start == 0)
747 continue; /* bootmem */
748 end = start + z->spanned_pages;
749 if (idx == ZONE_NORMAL) {
750 BUG_ON(start != node_start_pfn[nid]);
751 start = node_free_pfn[nid];
752 }
753#ifdef CONFIG_HIGHMEM
754 if (idx == ZONE_HIGHMEM)
755 totalhigh_pages += z->spanned_pages;
756#endif
757 if (kdata_huge) {
758 unsigned long percpu_pfn = node_percpu_pfn[nid];
759 if (start < percpu_pfn && end > percpu_pfn)
760 end = percpu_pfn;
761 }
762#ifdef CONFIG_PCI
763 if (start <= pci_reserve_start_pfn &&
764 end > pci_reserve_start_pfn) {
765 if (end > pci_reserve_end_pfn)
766 init_free_pfn_range(pci_reserve_end_pfn, end);
767 end = pci_reserve_start_pfn;
768 }
769#endif
770 init_free_pfn_range(start, end);
771 }
772}
773
774/*
775 * paging_init() sets up the page tables - note that all of lowmem is
776 * already mapped by head.S.
777 */
778void __init paging_init(void)
779{
780#ifdef CONFIG_HIGHMEM
781 unsigned long vaddr, end;
782#endif
783#ifdef __tilegx__
784 pud_t *pud;
785#endif
786 pgd_t *pgd_base = swapper_pg_dir;
787
788 kernel_physical_mapping_init(pgd_base);
789
790#ifdef CONFIG_HIGHMEM
791 /*
792 * Fixed mappings, only the page table structure has to be
793 * created - mappings will be set by set_fixmap():
794 */
795 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
796 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
797 page_table_range_init(vaddr, end, pgd_base);
798 permanent_kmaps_init(pgd_base);
799#endif
800
801#ifdef __tilegx__
802 /*
803 * Since GX allocates just one pmd_t array worth of vmalloc space,
804 * we go ahead and allocate it statically here, then share it
805 * globally. As a result we don't have to worry about any task
806 * changing init_mm once we get up and running, and there's no
807 * need for e.g. vmalloc_sync_all().
808 */
809 BUILD_BUG_ON(pgd_index(VMALLOC_START) != pgd_index(VMALLOC_END));
810 pud = pud_offset(pgd_base + pgd_index(VMALLOC_START), VMALLOC_START);
811 assign_pmd(pud, alloc_pmd());
812#endif
813}
814
815
816/*
817 * Walk the kernel page tables and derive the page_home() from
818 * the PTEs, so that set_pte() can properly validate the caching
819 * of all PTEs it sees.
820 */
821void __init set_page_homes(void)
822{
 /* Currently empty: no page-table walk is performed here. */
823}
824
825static void __init set_max_mapnr_init(void)
826{
827#ifdef CONFIG_FLATMEM
828 max_mapnr = max_low_pfn;
829#endif
830}
831
/*
 * Release bootmem and the remaining non-bootmem pages to the page
 * allocator, log a memory summary and the kernel virtual layout, and
 * (when not tilegx) call __init_atomic_per_cpu().
 */
832void __init mem_init(void)
833{
834 int codesize, datasize, initsize;
835 int i;
836#ifndef __tilegx__
837 void *last;
838#endif
839
840#ifdef CONFIG_FLATMEM
841 if (!mem_map)
842 BUG();
843#endif
844
845#ifdef CONFIG_HIGHMEM
846 /* check that fixmap and pkmap do not overlap */
847 if (PKMAP_ADDR(LAST_PKMAP-1) >= FIXADDR_START) {
848 pr_err("fixmap and kmap areas overlap"
849 " - this will crash\n");
850 pr_err("pkstart: %lxh pkend: %lxh fixstart %lxh\n",
851 PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP-1),
852 FIXADDR_START);
853 BUG();
854 }
855#endif
856
857 set_max_mapnr_init();
858
859 /* this will put all bootmem onto the freelists */
860 totalram_pages += free_all_bootmem();
861
862 /* count all remaining LOWMEM and give all HIGHMEM to page allocator */
863 set_non_bootmem_pages_init();
864
 /* Section sizes in bytes, taken from the linker symbols. */
865 codesize = (unsigned long)&_etext - (unsigned long)&_text;
866 datasize = (unsigned long)&_end - (unsigned long)&_sdata;
867 initsize = (unsigned long)&_einittext - (unsigned long)&_sinittext;
868 initsize += (unsigned long)&_einitdata - (unsigned long)&_sinitdata;
869
 /* Page counts are shifted by (PAGE_SHIFT-10), byte counts by 10: all in KB. */
870 pr_info("Memory: %luk/%luk available (%dk kernel code, %dk data, %dk init, %ldk highmem)\n",
871 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
872 num_physpages << (PAGE_SHIFT-10),
873 codesize >> 10,
874 datasize >> 10,
875 initsize >> 10,
876 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
877 );
878
879 /*
880 * In debug mode, dump some interesting memory mappings.
881 */
882#ifdef CONFIG_HIGHMEM
883 printk(KERN_DEBUG " KMAP %#lx - %#lx\n",
884 FIXADDR_START, FIXADDR_TOP + PAGE_SIZE - 1);
885 printk(KERN_DEBUG " PKMAP %#lx - %#lx\n",
886 PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP) - 1);
887#endif
888#ifdef CONFIG_HUGEVMAP
889 printk(KERN_DEBUG " HUGEMAP %#lx - %#lx\n",
890 HUGE_VMAP_BASE, HUGE_VMAP_END - 1);
891#endif
892 printk(KERN_DEBUG " VMALLOC %#lx - %#lx\n",
893 _VMALLOC_START, _VMALLOC_END - 1);
894#ifdef __tilegx__
 /* One line per node that has pages present, highest node first. */
895 for (i = MAX_NUMNODES-1; i >= 0; --i) {
896 struct pglist_data *node = &node_data[i];
897 if (node->node_present_pages) {
898 unsigned long start = (unsigned long)
899 pfn_to_kaddr(node->node_start_pfn);
900 unsigned long end = start +
901 (node->node_present_pages << PAGE_SHIFT);
902 printk(KERN_DEBUG " MEM%d %#lx - %#lx\n",
903 i, start, end - 1);
904 }
905 }
906#else
 /*
  * Walk nodes from highest to lowest; each node's range ends just
  * below the previously printed node's base (high_memory for the
  * first one printed).  Entries equal to -1UL are skipped.
  */
907 last = high_memory;
908 for (i = MAX_NUMNODES-1; i >= 0; --i) {
909 if ((unsigned long)vbase_map[i] != -1UL) {
910 printk(KERN_DEBUG " LOWMEM%d %#lx - %#lx\n",
911 i, (unsigned long) (vbase_map[i]),
912 (unsigned long) (last-1));
913 last = vbase_map[i];
914 }
915 }
916#endif
917
918#ifndef __tilegx__
919 /*
920 * Convert from using one lock for all atomic operations to
921 * one per cpu.
922 */
923 __init_atomic_per_cpu();
924#endif
925}
926
927/*
928 * this is for the non-NUMA, single node SMP system case.
929 * Specifically, in the case of x86, we will always add
930 * memory to the highmem for now.
931 */
932#ifndef CONFIG_NEED_MULTIPLE_NODES
933int arch_add_memory(u64 start, u64 size)
934{
935 struct pglist_data *pgdata = &contig_page_data;
936 struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
937 unsigned long start_pfn = start >> PAGE_SHIFT;
938 unsigned long nr_pages = size >> PAGE_SHIFT;
939
940 return __add_pages(zone, start_pfn, nr_pages);
941}
942
943int remove_memory(u64 start, u64 size)
944{
 /* Nothing is removed here: always returns -EINVAL. */
945 return -EINVAL;
946}
947#endif
948
949struct kmem_cache *pgd_cache;
950
951void __init pgtable_cache_init(void)
952{
953 pgd_cache = kmem_cache_create("pgd",
954 PTRS_PER_PGD*sizeof(pgd_t),
955 PTRS_PER_PGD*sizeof(pgd_t),
956 0,
957 NULL);
958 if (!pgd_cache)
959 panic("pgtable_cache_init(): Cannot create pgd cache");
960}
961
962#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
963/*
964 * The __w1data area holds data that is only written during initialization,
965 * and is read-only and thus freely cacheable thereafter. Fix the page
966 * table entries that cover that region accordingly.
967 */
968static void mark_w1data_ro(void)
969{
970 /* Loop over page table entries */
971 unsigned long addr = (unsigned long)__w1data_begin;
972 BUG_ON((addr & (PAGE_SIZE-1)) != 0);
973 for (; addr <= (unsigned long)__w1data_end - 1; addr += PAGE_SIZE) {
974 unsigned long pfn = kaddr_to_pfn((void *)addr);
975 pte_t *ptep = virt_to_pte(NULL, addr);
976 BUG_ON(pte_huge(*ptep)); /* not relevant for kdata_huge */
977 set_pte_at(&init_mm, addr, ptep, pfn_pte(pfn, PAGE_KERNEL_RO));
978 }
979}
980#endif
981
982#ifdef CONFIG_DEBUG_PAGEALLOC
983static long __write_once initfree;
984#else
985static long __write_once initfree = 1;
986#endif
987
988/* Select whether to free (1) or mark unusable (0) the __init pages. */
989static int __init set_initfree(char *str)
990{
991 strict_strtol(str, 0, &initfree);
992 pr_info("initfree: %s free init pages\n", initfree ? "will" : "won't");
993 return 1;
994}
995__setup("initfree=", set_initfree);
996
997static void free_init_pages(char *what, unsigned long begin, unsigned long end)
998{
999 unsigned long addr = (unsigned long) begin;
1000
1001 if (kdata_huge && !initfree) {
1002 pr_warning("Warning: ignoring initfree=0:"
1003 " incompatible with kdata=huge\n");
1004 initfree = 1;
1005 }
1006 end = (end + PAGE_SIZE - 1) & PAGE_MASK;
1007 local_flush_tlb_pages(NULL, begin, PAGE_SIZE, end - begin);
1008 for (addr = begin; addr < end; addr += PAGE_SIZE) {
1009 /*
1010 * Note we just reset the home here directly in the
1011 * page table. We know this is safe because our caller
1012 * just flushed the caches on all the other cpus,
1013 * and they won't be touching any of these pages.
1014 */
1015 int pfn = kaddr_to_pfn((void *)addr);
1016 struct page *page = pfn_to_page(pfn);
1017 pte_t *ptep = virt_to_pte(NULL, addr);
1018 if (!initfree) {
1019 /*
1020 * If debugging page accesses then do not free
1021 * this memory but mark them not present - any
1022 * buggy init-section access will create a
1023 * kernel page fault:
1024 */
1025 pte_clear(&init_mm, addr, ptep);
1026 continue;
1027 }
1028 __ClearPageReserved(page);
1029 init_page_count(page);
1030 if (pte_huge(*ptep))
1031 BUG_ON(!kdata_huge);
1032 else
1033 set_pte_at(&init_mm, addr, ptep,
1034 pfn_pte(pfn, PAGE_KERNEL));
1035 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
1036 free_page(addr);
1037 totalram_pages++;
1038 }
1039 pr_info("Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
1040}
1041
1042void free_initmem(void)
1043{
1044 const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET;
1045
1046 /*
1047 * Evict the dirty initdata on the boot cpu, evict the w1data
1048 * wherever it's homed, and evict all the init code everywhere.
1049 * We are guaranteed that no one will touch the init pages any
1050 * more, and although other cpus may be touching the w1data,
1051 * we only actually change the caching on tile64, which won't
1052 * be keeping local copies in the other tiles' caches anyway.
1053 */
1054 homecache_evict(&cpu_cacheable_map);
1055
1056 /* Free the data pages that we won't use again after init. */
1057 free_init_pages("unused kernel data",
1058 (unsigned long)_sinitdata,
1059 (unsigned long)_einitdata);
1060
1061 /*
1062 * Free the pages mapped from 0xc0000000 that correspond to code
1063 * pages from 0xfd000000 that we won't use again after init.
1064 */
1065 free_init_pages("unused kernel text",
1066 (unsigned long)_sinittext - text_delta,
1067 (unsigned long)_einittext - text_delta);
1068
1069#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
1070 /*
1071 * Upgrade the .w1data section to globally cached.
1072 * We don't do this on tilepro, since the cache architecture
1073 * pretty much makes it irrelevant, and in any case we end
1074 * up having racing issues with other tiles that may touch
1075 * the data after we flush the cache but before we update
1076 * the PTEs and flush the TLBs, causing sharer shootdowns
1077 * later. Even though this is to clean data, it seems like
1078 * an unnecessary complication.
1079 */
1080 mark_w1data_ro();
1081#endif
1082
1083 /* Do a global TLB flush so everyone sees the changes. */
1084 flush_tlb_all();
1085}
diff --git a/arch/tile/mm/migrate.h b/arch/tile/mm/migrate.h
new file mode 100644
index 000000000000..cd45a0837fa6
--- /dev/null
+++ b/arch/tile/mm/migrate.h
@@ -0,0 +1,50 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 *
14 * Structure definitions for migration, exposed here for use by
15 * arch/tile/kernel/asm-offsets.c.
16 */
17
18#ifndef MM_MIGRATE_H
19#define MM_MIGRATE_H
20
21#include <linux/cpumask.h>
22#include <hv/hypervisor.h>
23
24/*
25 * This function is used as a helper when setting up the initial
26 * page table (swapper_pg_dir).
27 */
28extern int flush_and_install_context(HV_PhysAddr page_table, HV_PTE access,
29 HV_ASID asid,
30 const unsigned long *cpumask);
31
32/*
33 * This function supports migration as a "helper" as follows:
34 *
35 * - Set the stack PTE itself to "migrating".
36 * - Do a global TLB flush for (va,length) and the specified ASIDs.
37 * - Do a cache-evict on all necessary cpus.
38 * - Write the new stack PTE.
39 *
40 * Note that any non-NULL pointers must not point to the page that
41 * is handled by the stack_pte itself.
42 */
43extern int homecache_migrate_stack_and_flush(pte_t stack_pte, unsigned long va,
44 size_t length, pte_t *stack_ptep,
45 const struct cpumask *cache_cpumask,
46 const struct cpumask *tlb_cpumask,
47 HV_Remote_ASID *asids,
48 int asidcount);
49
50#endif /* MM_MIGRATE_H */
diff --git a/arch/tile/mm/migrate_32.S b/arch/tile/mm/migrate_32.S
new file mode 100644
index 000000000000..f738765cd1e6
--- /dev/null
+++ b/arch/tile/mm/migrate_32.S
@@ -0,0 +1,211 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 *
14 * This routine is a helper for migrating the home of a set of pages to
15 * a new cpu. See the documentation in homecache.c for more information.
16 */
17
18#include <linux/linkage.h>
19#include <linux/threads.h>
20#include <asm/page.h>
21#include <asm/types.h>
22#include <asm/asm-offsets.h>
23#include <hv/hypervisor.h>
24
25 .text
26
27/*
28 * First, some definitions that apply to all the code in the file.
29 */
30
31/* Locals (caller-save) */
32#define r_tmp r10
33#define r_save_sp r11
34
35/* What we save where in the stack frame; must include all callee-saves. */
36#define FRAME_SP 4
37#define FRAME_R30 8
38#define FRAME_R31 12
39#define FRAME_R32 16
40#define FRAME_R33 20
41#define FRAME_R34 24
42#define FRAME_R35 28
43#define FRAME_SIZE 32
44
45
46
47
48/*
49 * On entry:
50 *
51 * r0 low word of the new context PA to install (moved to r_context_lo)
52 * r1 high word of the new context PA to install (moved to r_context_hi)
53 * r2 low word of PTE to use for context access (moved to r_access_lo)
54 * r3 high word of PTE to use for context access (moved to r_access_hi)
55 * r4 ASID to use for new context (moved to r_asid)
56 * r5 pointer to cpumask with just this cpu set in it (r_my_cpumask)
57 */
58
59/* Arguments (caller-save) */
60#define r_context_lo_in r0
61#define r_context_hi_in r1
62#define r_access_lo_in r2
63#define r_access_hi_in r3
64#define r_asid_in r4
65#define r_my_cpumask r5
66
67/* Locals (callee-save); must not be more than FRAME_xxx above. */
68#define r_save_ics r30
69#define r_context_lo r31
70#define r_context_hi r32
71#define r_access_lo r33
72#define r_access_hi r34
73#define r_asid r35
74
/*
 * flush_and_install_context: with interrupts held off through
 * INTERRUPT_CRITICAL_SECTION, call hv_flush_remote() to evict this
 * cpu's L2, hv_install_context() to install the new page table, and
 * hv_flush_all() to flush the TLB.
 *
 * A non-zero r0 after hv_flush_remote or hv_install_context skips the
 * remaining steps.  r0 is left as set by the last call made
 * (presumably the hypervisor status code - confirm against the HV API).
 */
75STD_ENTRY(flush_and_install_context)
76 /*
77 * Create a stack frame; we can't touch it once we flush the
78 * cache until we install the new page table and flush the TLB.
79 */
80 {
81 move r_save_sp, sp
82 sw sp, lr /* return address is stored at the incoming sp */
83 addi sp, sp, -FRAME_SIZE
84 }
/*
 * NOTE(review): the bundle above appears to rely on "sw sp, lr" using
 * the value of sp from before the addi in the same bundle; the
 * epilogue reloads lr from sp + FRAME_SIZE.  Confirm against the ISA.
 */
85 addi r_tmp, sp, FRAME_SP
86 {
87 sw r_tmp, r_save_sp /* incoming sp goes into the FRAME_SP slot */
88 addi r_tmp, sp, FRAME_R30
89 }
90 {
91 sw r_tmp, r30
92 addi r_tmp, sp, FRAME_R31
93 }
94 {
95 sw r_tmp, r31
96 addi r_tmp, sp, FRAME_R32
97 }
98 {
99 sw r_tmp, r32
100 addi r_tmp, sp, FRAME_R33
101 }
102 {
103 sw r_tmp, r33
104 addi r_tmp, sp, FRAME_R34
105 }
106 {
107 sw r_tmp, r34
108 addi r_tmp, sp, FRAME_R35
109 }
110 sw r_tmp, r35
111
112 /* Move some arguments to callee-save registers. */
113 {
114 move r_context_lo, r_context_lo_in
115 move r_context_hi, r_context_hi_in
116 }
117 {
118 move r_access_lo, r_access_lo_in
119 move r_access_hi, r_access_hi_in
120 }
121 move r_asid, r_asid_in
122
123 /* Disable interrupts, since we can't use our stack. */
124 {
125 mfspr r_save_ics, INTERRUPT_CRITICAL_SECTION /* remember old state */
126 movei r_tmp, 1
127 }
128 mtspr INTERRUPT_CRITICAL_SECTION, r_tmp
129
130 /* First, flush our L2 cache. */
131 {
132 move r0, zero /* cache_pa */
133 move r1, zero
134 }
135 {
136 auli r2, zero, ha16(HV_FLUSH_EVICT_L2) /* cache_control */
137 move r3, r_my_cpumask /* cache_cpumask */
138 }
139 {
140 move r4, zero /* tlb_va */
141 move r5, zero /* tlb_length */
142 }
143 {
144 move r6, zero /* tlb_pgsize */
145 move r7, zero /* tlb_cpumask */
146 }
147 {
148 move r8, zero /* asids */
149 move r9, zero /* asidcount */
150 }
151 jal hv_flush_remote
152 bnz r0, .Ldone /* non-zero r0: skip the install and TLB flush */
153
154 /* Now install the new page table. */
155 {
156 move r0, r_context_lo
157 move r1, r_context_hi
158 }
159 {
160 move r2, r_access_lo
161 move r3, r_access_hi
162 }
163 {
164 move r4, r_asid
165 movei r5, HV_CTX_DIRECTIO
166 }
167 jal hv_install_context
168 bnz r0, .Ldone /* non-zero r0: skip the TLB flush */
169
170 /* Finally, flush the TLB. */
171 {
172 movei r0, 0 /* preserve_global */
173 jal hv_flush_all
174 }
175
176.Ldone:
177 /* Reset interrupts back how they were before. */
178 mtspr INTERRUPT_CRITICAL_SECTION, r_save_ics
179
180 /* Restore the callee-saved registers and return. */
181 addli lr, sp, FRAME_SIZE /* address where the prologue stored lr */
182 {
183 lw lr, lr
184 addli r_tmp, sp, FRAME_R30
185 }
186 {
187 lw r30, r_tmp
188 addli r_tmp, sp, FRAME_R31
189 }
190 {
191 lw r31, r_tmp
192 addli r_tmp, sp, FRAME_R32
193 }
194 {
195 lw r32, r_tmp
196 addli r_tmp, sp, FRAME_R33
197 }
198 {
199 lw r33, r_tmp
200 addli r_tmp, sp, FRAME_R34
201 }
202 {
203 lw r34, r_tmp
204 addli r_tmp, sp, FRAME_R35
205 }
206 {
207 lw r35, r_tmp
208 addi sp, sp, FRAME_SIZE /* sp is restored by arithmetic; the FRAME_SP slot is not read back */
209 }
210 jrp lr
211 STD_ENDPROC(flush_and_install_context)
diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c
new file mode 100644
index 000000000000..f96f4cec602a
--- /dev/null
+++ b/arch/tile/mm/mmap.c
@@ -0,0 +1,75 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 *
14 * Taken from the i386 architecture and simplified.
15 */
16
17#include <linux/mm.h>
18#include <linux/random.h>
19#include <linux/limits.h>
20#include <linux/sched.h>
21#include <linux/mman.h>
22#include <linux/compat.h>
23
24/*
25 * Top of mmap area (just below the process stack).
26 *
27 * Leave a hole of at least ~128 MB.
28 */
29#define MIN_GAP (128*1024*1024)
30#define MAX_GAP (TASK_SIZE/6*5)
31
32static inline unsigned long mmap_base(struct mm_struct *mm)
33{
34 unsigned long gap = rlimit(RLIMIT_STACK);
35 unsigned long random_factor = 0;
36
37 if (current->flags & PF_RANDOMIZE)
38 random_factor = get_random_int() % (1024*1024);
39
40 if (gap < MIN_GAP)
41 gap = MIN_GAP;
42 else if (gap > MAX_GAP)
43 gap = MAX_GAP;
44
45 return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
46}
47
48/*
49 * This function, called very early during the creation of a new
50 * process VM image, sets up which VM layout function to use:
51 */
52void arch_pick_mmap_layout(struct mm_struct *mm)
53{
54#if !defined(__tilegx__)
55 int is_32bit = 1;
56#elif defined(CONFIG_COMPAT)
57 int is_32bit = is_compat_task();
58#else
59 int is_32bit = 0;
60#endif
61
62 /*
63 * Use standard layout if the expected stack growth is unlimited
64 * or we are running native 64 bits.
65 */
66 if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) {
67 mm->mmap_base = TASK_UNMAPPED_BASE;
68 mm->get_unmapped_area = arch_get_unmapped_area;
69 mm->unmap_area = arch_unmap_area;
70 } else {
71 mm->mmap_base = mmap_base(mm);
72 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
73 mm->unmap_area = arch_unmap_area_topdown;
74 }
75}
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
new file mode 100644
index 000000000000..28c23140c947
--- /dev/null
+++ b/arch/tile/mm/pgtable.c
@@ -0,0 +1,530 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/sched.h>
16#include <linux/kernel.h>
17#include <linux/errno.h>
18#include <linux/mm.h>
19#include <linux/swap.h>
20#include <linux/smp.h>
21#include <linux/highmem.h>
22#include <linux/slab.h>
23#include <linux/pagemap.h>
24#include <linux/spinlock.h>
25#include <linux/cpumask.h>
26#include <linux/module.h>
27#include <linux/io.h>
28#include <linux/vmalloc.h>
29#include <linux/smp.h>
30
31#include <asm/system.h>
32#include <asm/pgtable.h>
33#include <asm/pgalloc.h>
34#include <asm/fixmap.h>
35#include <asm/tlb.h>
36#include <asm/tlbflush.h>
37#include <asm/homecache.h>
38
39#define K(x) ((x) << (PAGE_SHIFT-10))
40
41/*
42 * The normal show_free_areas() is too verbose on Tile, with dozens
43 * of processors and often four NUMA zones each with high and lowmem.
44 */
45void show_mem(void)
46{
47 struct zone *zone;
48
49 pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu"
50 " free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu"
51 " pagecache:%lu swap:%lu\n",
52 (global_page_state(NR_ACTIVE_ANON) +
53 global_page_state(NR_ACTIVE_FILE)),
54 (global_page_state(NR_INACTIVE_ANON) +
55 global_page_state(NR_INACTIVE_FILE)),
56 global_page_state(NR_FILE_DIRTY),
57 global_page_state(NR_WRITEBACK),
58 global_page_state(NR_UNSTABLE_NFS),
59 global_page_state(NR_FREE_PAGES),
60 (global_page_state(NR_SLAB_RECLAIMABLE) +
61 global_page_state(NR_SLAB_UNRECLAIMABLE)),
62 global_page_state(NR_FILE_MAPPED),
63 global_page_state(NR_PAGETABLE),
64 global_page_state(NR_BOUNCE),
65 global_page_state(NR_FILE_PAGES),
66 nr_swap_pages);
67
68 for_each_zone(zone) {
69 unsigned long flags, order, total = 0, largest_order = -1;
70
71 if (!populated_zone(zone))
72 continue;
73
74 spin_lock_irqsave(&zone->lock, flags);
75 for (order = 0; order < MAX_ORDER; order++) {
76 int nr = zone->free_area[order].nr_free;
77 total += nr << order;
78 if (nr)
79 largest_order = order;
80 }
81 spin_unlock_irqrestore(&zone->lock, flags);
82 pr_err("Node %d %7s: %lukB (largest %luKb)\n",
83 zone_to_nid(zone), zone->name,
84 K(total), largest_order ? K(1UL) << largest_order : 0);
85 }
86}
87
88/*
89 * Associate a virtual page frame with a given physical page frame
90 * and protection flags for that frame.
91 */
92static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
93{
94 pgd_t *pgd;
95 pud_t *pud;
96 pmd_t *pmd;
97 pte_t *pte;
98
99 pgd = swapper_pg_dir + pgd_index(vaddr);
100 if (pgd_none(*pgd)) {
101 BUG();
102 return;
103 }
104 pud = pud_offset(pgd, vaddr);
105 if (pud_none(*pud)) {
106 BUG();
107 return;
108 }
109 pmd = pmd_offset(pud, vaddr);
110 if (pmd_none(*pmd)) {
111 BUG();
112 return;
113 }
114 pte = pte_offset_kernel(pmd, vaddr);
115 /* <pfn,flags> stored as-is, to permit clearing entries */
116 set_pte(pte, pfn_pte(pfn, flags));
117
118 /*
119 * It's enough to flush this one mapping.
120 * This appears conservative since it is only called
121 * from __set_fixmap.
122 */
123 local_flush_tlb_page(NULL, vaddr, PAGE_SIZE);
124}
125
126void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
127{
128 unsigned long address = __fix_to_virt(idx);
129
130 if (idx >= __end_of_fixed_addresses) {
131 BUG();
132 return;
133 }
134 set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
135}
136
137#if defined(CONFIG_HIGHPTE)
138pte_t *_pte_offset_map(pmd_t *dir, unsigned long address, enum km_type type)
139{
140 pte_t *pte = kmap_atomic(pmd_page(*dir), type) +
141 (pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN) & ~PAGE_MASK;
142 return &pte[pte_index(address)];
143}
144#endif
145
146/*
147 * List of all pgd's needed so it can invalidate entries in both cached
148 * and uncached pgd's. This is essentially codepath-based locking
149 * against pageattr.c; it is the unique case in which a valid change
150 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
151 * vmalloc faults work because attached pagetables are never freed.
152 * The locking scheme was chosen on the basis of manfred's
153 * recommendations and having no core impact whatsoever.
154 * -- wli
155 */
156DEFINE_SPINLOCK(pgd_lock);
157LIST_HEAD(pgd_list);
158
159static inline void pgd_list_add(pgd_t *pgd)
160{
161 list_add(pgd_to_list(pgd), &pgd_list);
162}
163
164static inline void pgd_list_del(pgd_t *pgd)
165{
166 list_del(pgd_to_list(pgd));
167}
168
169#define KERNEL_PGD_INDEX_START pgd_index(PAGE_OFFSET)
170#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_INDEX_START)
171
172static void pgd_ctor(pgd_t *pgd)
173{
174 unsigned long flags;
175
176 memset(pgd, 0, KERNEL_PGD_INDEX_START*sizeof(pgd_t));
177 spin_lock_irqsave(&pgd_lock, flags);
178
179#ifndef __tilegx__
180 /*
181 * Check that the user interrupt vector has no L2.
182 * It never should for the swapper, and new page tables
183 * should always start with an empty user interrupt vector.
184 */
185 BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
186#endif
187
188 clone_pgd_range(pgd + KERNEL_PGD_INDEX_START,
189 swapper_pg_dir + KERNEL_PGD_INDEX_START,
190 KERNEL_PGD_PTRS);
191
192 pgd_list_add(pgd);
193 spin_unlock_irqrestore(&pgd_lock, flags);
194}
195
196static void pgd_dtor(pgd_t *pgd)
197{
198 unsigned long flags; /* can be called from interrupt context */
199
200 spin_lock_irqsave(&pgd_lock, flags);
201 pgd_list_del(pgd);
202 spin_unlock_irqrestore(&pgd_lock, flags);
203}
204
205pgd_t *pgd_alloc(struct mm_struct *mm)
206{
207 pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
208 if (pgd)
209 pgd_ctor(pgd);
210 return pgd;
211}
212
213void pgd_free(struct mm_struct *mm, pgd_t *pgd)
214{
215 pgd_dtor(pgd);
216 kmem_cache_free(pgd_cache, pgd);
217}
218
219
220#define L2_USER_PGTABLE_PAGES (1 << L2_USER_PGTABLE_ORDER)
221
222struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
223{
224 gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP;
225 struct page *p;
226
227#ifdef CONFIG_HIGHPTE
228 flags |= __GFP_HIGHMEM;
229#endif
230
231 p = alloc_pages(flags, L2_USER_PGTABLE_ORDER);
232 if (p == NULL)
233 return NULL;
234
235 pgtable_page_ctor(p);
236 return p;
237}
238
239/*
240 * Free page immediately (used in __pte_alloc if we raced with another
241 * process). We have to correct whatever pte_alloc_one() did before
242 * returning the pages to the allocator.
243 */
244void pte_free(struct mm_struct *mm, struct page *p)
245{
246 pgtable_page_dtor(p);
247 __free_pages(p, L2_USER_PGTABLE_ORDER);
248}
249
250void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
251 unsigned long address)
252{
253 int i;
254
255 pgtable_page_dtor(pte);
256 tlb->need_flush = 1;
257 if (tlb_fast_mode(tlb)) {
258 struct page *pte_pages[L2_USER_PGTABLE_PAGES];
259 for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i)
260 pte_pages[i] = pte + i;
261 free_pages_and_swap_cache(pte_pages, L2_USER_PGTABLE_PAGES);
262 return;
263 }
264 for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i) {
265 tlb->pages[tlb->nr++] = pte + i;
266 if (tlb->nr >= FREE_PTE_NR)
267 tlb_flush_mmu(tlb, 0, 0);
268 }
269}
270
271#ifndef __tilegx__
272
273/*
274 * FIXME: needs to be atomic vs hypervisor writes. For now we make the
275 * window of vulnerability a bit smaller by doing an unlocked 8-bit update.
276 */
277int ptep_test_and_clear_young(struct vm_area_struct *vma,
278 unsigned long addr, pte_t *ptep)
279{
280#if HV_PTE_INDEX_ACCESSED < 8 || HV_PTE_INDEX_ACCESSED >= 16
281# error Code assumes HV_PTE "accessed" bit in second byte
282#endif
283 u8 *tmp = (u8 *)ptep;
284 u8 second_byte = tmp[1];
285 if (!(second_byte & (1 << (HV_PTE_INDEX_ACCESSED - 8))))
286 return 0;
287 tmp[1] = second_byte & ~(1 << (HV_PTE_INDEX_ACCESSED - 8));
288 return 1;
289}
290
291/*
292 * This implementation is atomic vs hypervisor writes, since the hypervisor
293 * always writes the low word (where "accessed" and "dirty" are) and this
294 * routine only writes the high word.
295 */
296void ptep_set_wrprotect(struct mm_struct *mm,
297 unsigned long addr, pte_t *ptep)
298{
299#if HV_PTE_INDEX_WRITABLE < 32
300# error Code assumes HV_PTE "writable" bit in high word
301#endif
302 u32 *tmp = (u32 *)ptep;
303 tmp[1] = tmp[1] & ~(1 << (HV_PTE_INDEX_WRITABLE - 32));
304}
305
306#endif
307
308pte_t *virt_to_pte(struct mm_struct* mm, unsigned long addr)
309{
310 pgd_t *pgd;
311 pud_t *pud;
312 pmd_t *pmd;
313
314 if (pgd_addr_invalid(addr))
315 return NULL;
316
317 pgd = mm ? pgd_offset(mm, addr) : swapper_pg_dir + pgd_index(addr);
318 pud = pud_offset(pgd, addr);
319 if (!pud_present(*pud))
320 return NULL;
321 pmd = pmd_offset(pud, addr);
322 if (pmd_huge_page(*pmd))
323 return (pte_t *)pmd;
324 if (!pmd_present(*pmd))
325 return NULL;
326 return pte_offset_kernel(pmd, addr);
327}
328
329pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu)
330{
331 unsigned int width = smp_width;
332 int x = cpu % width;
333 int y = cpu / width;
334 BUG_ON(y >= smp_height);
335 BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
336 BUG_ON(cpu < 0 || cpu >= NR_CPUS);
337 BUG_ON(!cpu_is_valid_lotar(cpu));
338 return hv_pte_set_lotar(prot, HV_XY_TO_LOTAR(x, y));
339}
340
341int get_remote_cache_cpu(pgprot_t prot)
342{
343 HV_LOTAR lotar = hv_pte_get_lotar(prot);
344 int x = HV_LOTAR_X(lotar);
345 int y = HV_LOTAR_Y(lotar);
346 BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
347 return x + y * smp_width;
348}
349
350void set_pte_order(pte_t *ptep, pte_t pte, int order)
351{
352 unsigned long pfn = pte_pfn(pte);
353 struct page *page = pfn_to_page(pfn);
354
355 /* Update the home of a PTE if necessary */
356 pte = pte_set_home(pte, page_home(page));
357
358#ifdef __tilegx__
359 *ptep = pte;
360#else
361 /*
362 * When setting a PTE, write the high bits first, then write
363 * the low bits. This sets the "present" bit only after the
364 * other bits are in place. If a particular PTE update
365 * involves transitioning from one valid PTE to another, it
366 * may be necessary to call set_pte_order() more than once,
367 * transitioning via a suitable intermediate state.
368 * Note that this sequence also means that if we are transitioning
369 * from any migrating PTE to a non-migrating one, we will not
370 * see a half-updated PTE with the migrating bit off.
371 */
372#if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
373# error Must write the present and migrating bits last
374#endif
375 ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
376 barrier();
377 ((u32 *)ptep)[0] = (u32)(pte_val(pte));
378#endif
379}
380
381/* Can this mm load a PTE with cached_priority set? */
382static inline int mm_is_priority_cached(struct mm_struct *mm)
383{
384 return mm->context.priority_cached;
385}
386
387/*
388 * Add a priority mapping to an mm_context and
389 * notify the hypervisor if this is the first one.
390 */
391void start_mm_caching(struct mm_struct *mm)
392{
393 if (!mm_is_priority_cached(mm)) {
394 mm->context.priority_cached = -1U;
395 hv_set_caching(-1U);
396 }
397}
398
399/*
400 * Validate and return the priority_cached flag. We know if it's zero
401 * that we don't need to scan, since we immediately set it non-zero
402 * when we first consider a MAP_CACHE_PRIORITY mapping.
403 *
404 * We only _try_ to acquire the mmap_sem semaphore; if we can't acquire it,
405 * since we're in an interrupt context (servicing switch_mm) we don't
406 * worry about it and don't unset the "priority_cached" field.
407 * Presumably we'll come back later and have more luck and clear
408 * the value then; for now we'll just keep the cache marked for priority.
409 */
410static unsigned int update_priority_cached(struct mm_struct *mm)
411{
412 if (mm->context.priority_cached && down_write_trylock(&mm->mmap_sem)) {
413 struct vm_area_struct *vm;
414 for (vm = mm->mmap; vm; vm = vm->vm_next) {
415 if (hv_pte_get_cached_priority(vm->vm_page_prot))
416 break;
417 }
418 if (vm == NULL)
419 mm->context.priority_cached = 0;
420 up_write(&mm->mmap_sem);
421 }
422 return mm->context.priority_cached;
423}
424
/*
 * Bring the hypervisor caching mode in line with the mm being
 * switched to.
 */
void check_mm_caching(struct mm_struct *prev, struct mm_struct *next)
{
	if (mm_is_priority_cached(next)) {
		hv_set_caching(update_priority_cached(next));
		return;
	}

	/*
	 * The new mm does not use priority caching.  The hypervisor only
	 * needs to be told if the previous mm did; otherwise the mode
	 * can be assumed to be zero already.
	 */
	if (mm_is_priority_cached(prev))
		hv_set_caching(0);
}
439
440#if CHIP_HAS_MMIO()
441
442/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
443void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
444 pgprot_t home)
445{
446 void *addr;
447 struct vm_struct *area;
448 unsigned long offset, last_addr;
449 pgprot_t pgprot;
450
451 /* Don't allow wraparound or zero size */
452 last_addr = phys_addr + size - 1;
453 if (!size || last_addr < phys_addr)
454 return NULL;
455
456 /* Create a read/write, MMIO VA mapping homed at the requested shim. */
457 pgprot = PAGE_KERNEL;
458 pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
459 pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
460
461 /*
462 * Mappings have to be page-aligned
463 */
464 offset = phys_addr & ~PAGE_MASK;
465 phys_addr &= PAGE_MASK;
466 size = PAGE_ALIGN(last_addr+1) - phys_addr;
467
468 /*
469 * Ok, go for it..
470 */
471 area = get_vm_area(size, VM_IOREMAP /* | other flags? */);
472 if (!area)
473 return NULL;
474 area->phys_addr = phys_addr;
475 addr = area->addr;
476 if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
477 phys_addr, pgprot)) {
478 remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
479 return NULL;
480 }
481 return (__force void __iomem *) (offset + (char *)addr);
482}
483EXPORT_SYMBOL(ioremap_prot);
484
485/* Map a PCI MMIO bus address into VA space. */
486void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
487{
488 panic("ioremap for PCI MMIO is not supported");
489}
490EXPORT_SYMBOL(ioremap);
491
492/* Unmap an MMIO VA mapping. */
493void iounmap(volatile void __iomem *addr_in)
494{
495 volatile void __iomem *addr = (volatile void __iomem *)
496 (PAGE_MASK & (unsigned long __force)addr_in);
497#if 1
498 vunmap((void * __force)addr);
499#else
500 /* x86 uses this complicated flow instead of vunmap(). Is
501 * there any particular reason we should do the same? */
502 struct vm_struct *p, *o;
503
504 /* Use the vm area unlocked, assuming the caller
505 ensures there isn't another iounmap for the same address
506 in parallel. Reuse of the virtual address is prevented by
507 leaving it in the global lists until we're done with it.
508 cpa takes care of the direct mappings. */
509 read_lock(&vmlist_lock);
510 for (p = vmlist; p; p = p->next) {
511 if (p->addr == addr)
512 break;
513 }
514 read_unlock(&vmlist_lock);
515
516 if (!p) {
517 pr_err("iounmap: bad address %p\n", addr);
518 dump_stack();
519 return;
520 }
521
522 /* Finally remove it */
523 o = remove_vm_area((void *)addr);
524 BUG_ON(p != o || o == NULL);
525 kfree(p);
526#endif
527}
528EXPORT_SYMBOL(iounmap);
529
530#endif /* CHIP_HAS_MMIO() */