Diffstat (limited to 'arch/tile/mm')

 -rw-r--r--  arch/tile/mm/Makefile      |    9
 -rw-r--r--  arch/tile/mm/elf.c         |  164
 -rw-r--r--  arch/tile/mm/extable.c     |   30
 -rw-r--r--  arch/tile/mm/fault.c       |  905
 -rw-r--r--  arch/tile/mm/highmem.c     |  328
 -rw-r--r--  arch/tile/mm/homecache.c   |  445
 -rw-r--r--  arch/tile/mm/hugetlbpage.c |  343
 -rw-r--r--  arch/tile/mm/init.c        | 1082
 -rw-r--r--  arch/tile/mm/migrate.h     |   50
 -rw-r--r--  arch/tile/mm/migrate_32.S  |  211
 -rw-r--r--  arch/tile/mm/mmap.c        |   75
 -rw-r--r--  arch/tile/mm/pgtable.c     |  566

 12 files changed, 4208 insertions(+), 0 deletions(-)
diff --git a/arch/tile/mm/Makefile b/arch/tile/mm/Makefile
new file mode 100644
index 000000000000..e252aeddc17d
--- /dev/null
+++ b/arch/tile/mm/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for the linux tile-specific parts of the memory manager.
+#
+
+obj-y := init.o pgtable.o fault.o extable.o elf.o \
+        mmap.o homecache.o migrate_$(BITS).o
+
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_HIGHMEM) += highmem.o
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
new file mode 100644
index 000000000000..818c9bef060c
--- /dev/null
+++ b/arch/tile/mm/elf.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/binfmts.h>
+#include <linux/compat.h>
+#include <linux/mman.h>
+#include <linux/elf.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+
+/* Notify a running simulator, if any, that an exec just occurred. */
+static void sim_notify_exec(const char *binary_name)
+{
+        unsigned char c;
+        do {
+                c = *binary_name++;
+                __insn_mtspr(SPR_SIM_CONTROL,
+                             (SIM_CONTROL_OS_EXEC
+                              | (c << _SIM_CONTROL_OPERATOR_BITS)));
+
+        } while (c);
+}
+
+static int notify_exec(void)
+{
+        int retval = 0;  /* failure */
+        struct vm_area_struct *vma = current->mm->mmap;
+        while (vma) {
+                if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file)
+                        break;
+                vma = vma->vm_next;
+        }
+        if (vma) {
+                char *buf = (char *) __get_free_page(GFP_KERNEL);
+                if (buf) {
+                        char *path = d_path(&vma->vm_file->f_path,
+                                            buf, PAGE_SIZE);
+                        if (!IS_ERR(path)) {
+                                sim_notify_exec(path);
+                                retval = 1;
+                        }
+                        free_page((unsigned long)buf);
+                }
+        }
+        return retval;
+}
+
+/* Notify a running simulator, if any, that we loaded an interpreter. */
+static void sim_notify_interp(unsigned long load_addr)
+{
+        size_t i;
+        for (i = 0; i < sizeof(load_addr); i++) {
+                unsigned char c = load_addr >> (i * 8);
+                __insn_mtspr(SPR_SIM_CONTROL,
+                             (SIM_CONTROL_OS_INTERP
+                              | (c << _SIM_CONTROL_OPERATOR_BITS)));
+        }
+}
+
+
+/* Kernel address of page used to map read-only kernel data into userspace. */
+static void *vdso_page;
+
+/* One-entry array used for install_special_mapping. */
+static struct page *vdso_pages[1];
+
+int __init vdso_setup(void)
+{
+        extern char __rt_sigreturn[], __rt_sigreturn_end[];
+        vdso_page = (void *)get_zeroed_page(GFP_ATOMIC);
+        memcpy(vdso_page, __rt_sigreturn, __rt_sigreturn_end - __rt_sigreturn);
+        vdso_pages[0] = virt_to_page(vdso_page);
+        return 0;
+}
+device_initcall(vdso_setup);
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+        if (vma->vm_private_data == vdso_pages)
+                return "[vdso]";
+#ifndef __tilegx__
+        if (vma->vm_start == MEM_USER_INTRPT)
+                return "[intrpt]";
+#endif
+        return NULL;
+}
+
+int arch_setup_additional_pages(struct linux_binprm *bprm,
+                                int executable_stack)
+{
+        struct mm_struct *mm = current->mm;
+        unsigned long vdso_base;
+        int retval = 0;
+
+        /*
+         * Notify the simulator that an exec just occurred.
+         * If we can't find the filename of the mapping, just use
+         * whatever was passed as the linux_binprm filename.
+         */
+        if (!notify_exec())
+                sim_notify_exec(bprm->filename);
+
+        down_write(&mm->mmap_sem);
+
+        /*
+         * MAYWRITE to allow gdb to COW and set breakpoints
+         *
+         * Make sure the vDSO gets into every core dump. Dumping its
+         * contents makes post-mortem fully interpretable later
+         * without matching up the same kernel and hardware config to
+         * see what PC values meant.
+         */
+        vdso_base = VDSO_BASE;
+        retval = install_special_mapping(mm, vdso_base, PAGE_SIZE,
+                                         VM_READ|VM_EXEC|
+                                         VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
+                                         VM_ALWAYSDUMP,
+                                         vdso_pages);
+
+#ifndef __tilegx__
+        /*
+         * Set up a user-interrupt mapping here; the user can't
+         * create one themselves since it is above TASK_SIZE.
+         * We make it unwritable by default, so the model for adding
+         * interrupt vectors always involves an mprotect.
+         */
+        if (!retval) {
+                unsigned long addr = MEM_USER_INTRPT;
+                addr = mmap_region(NULL, addr, INTRPT_SIZE,
+                                   MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
+                                   VM_READ|VM_EXEC|
+                                   VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
+                if (addr > (unsigned long) -PAGE_SIZE)
+                        retval = (int) addr;
+        }
+#endif
+
+        up_write(&mm->mmap_sem);
+
+        return retval;
+}
+
+
+void elf_plat_init(struct pt_regs *regs, unsigned long load_addr)
+{
+        /* Zero all registers. */
+        memset(regs, 0, sizeof(*regs));
+
+        /* Report the interpreter's load address. */
+        sim_notify_interp(load_addr);
+}
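
Both notification helpers above drive the same simulator protocol: one SPR write per payload byte, with the byte shifted into the operator field of the SIM_CONTROL word. For reference, a sketch of the receiving side of that encoding, assuming a hypothetical next_sim_control_write() byte source and the same field layout as the code above (this is an illustration, not part of the commit):

        /* Sketch only: reassemble the value streamed by sim_notify_interp().
         * Bytes arrive least-significant first, one per SPR write; the
         * payload is recovered by undoing the shift used above. */
        unsigned long sim_read_interp_addr(void)
        {
                unsigned long addr = 0;
                size_t i;
                for (i = 0; i < sizeof(addr); i++) {
                        unsigned long word = next_sim_control_write(); /* hypothetical */
                        unsigned char c = word >> _SIM_CONTROL_OPERATOR_BITS;
                        addr |= (unsigned long)c << (i * 8);
                }
                return addr;
        }
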
diff --git a/arch/tile/mm/extable.c b/arch/tile/mm/extable.c
new file mode 100644
index 000000000000..4fb0acb9d154
--- /dev/null
+++ b/arch/tile/mm/extable.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+
+int fixup_exception(struct pt_regs *regs)
+{
+        const struct exception_table_entry *fixup;
+
+        fixup = search_exception_tables(regs->pc);
+        if (fixup) {
+                regs->pc = fixup->fixup;
+                return 1;
+        }
+
+        return 0;
+}
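
fixup_exception() is the whole architecture hook; everything else lives in the exception table data. Each potentially-faulting kernel access records the pair (faulting PC, recovery PC), and the handler above rewrites regs->pc to the recovery address instead of oopsing. A sketch of the entry shape this code consumes, inferred from the insn/fixup fields used above (the actual declaration lives in the tile uaccess headers, not in this file):

        /* Sketch: the shape of one __ex_table entry as consumed by
         * fixup_exception().  search_exception_tables() binary-searches
         * the sorted table for an entry whose insn matches regs->pc. */
        struct exception_table_entry {
                unsigned long insn;     /* address of the faulting instruction */
                unsigned long fixup;    /* address to jump to on a fault */
        };
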
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
new file mode 100644
index 000000000000..9b6b92f07def
--- /dev/null
+++ b/arch/tile/mm/fault.c
@@ -0,0 +1,905 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ *
+ * From i386 code copyright (C) 1995 Linus Torvalds
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/tty.h>
+#include <linux/vt_kern.h>              /* For unblank_screen() */
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include <linux/hugetlb.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+
+#include <asm/system.h>
+#include <asm/pgalloc.h>
+#include <asm/sections.h>
+
+#include <arch/interrupts.h>
+
+/*
+ * Unlock any spinlocks which will prevent us from getting the
+ * message out
+ */
+void bust_spinlocks(int yes)
+{
+        int loglevel_save = console_loglevel;
+
+        if (yes) {
+                oops_in_progress = 1;
+                return;
+        }
+        oops_in_progress = 0;
+        /*
+         * OK, the message is on the console.  Now we call printk()
+         * without oops_in_progress set so that printk will give klogd
+         * a poke.  Hold onto your hats...
+         */
+        console_loglevel = 15;  /* NMI oopser may have shut the console up */
+        printk(" ");
+        console_loglevel = loglevel_save;
+}
+
+static noinline void force_sig_info_fault(int si_signo, int si_code,
+        unsigned long address, int fault_num, struct task_struct *tsk)
+{
+        siginfo_t info;
+
+        if (unlikely(tsk->pid < 2)) {
+                panic("Signal %d (code %d) at %#lx sent to %s!",
+                      si_signo, si_code & 0xffff, address,
+                      tsk->pid ? "init" : "the idle task");
+        }
+
+        info.si_signo = si_signo;
+        info.si_errno = 0;
+        info.si_code = si_code;
+        info.si_addr = (void __user *)address;
+        info.si_trapno = fault_num;
+        force_sig_info(si_signo, &info, tsk);
+}
+
+#ifndef __tilegx__
+/*
+ * Synthesize the fault a PL0 process would get by doing a word-load of
+ * an unaligned address or a high kernel address.  Called indirectly
+ * from sys_cmpxchg() in kernel/intvec.S.
+ */
+int _sys_cmpxchg_badaddr(unsigned long address, struct pt_regs *regs)
+{
+        if (address >= PAGE_OFFSET)
+                force_sig_info_fault(SIGSEGV, SEGV_MAPERR, address,
+                                     INT_DTLB_MISS, current);
+        else
+                force_sig_info_fault(SIGBUS, BUS_ADRALN, address,
+                                     INT_UNALIGN_DATA, current);
+
+        /*
+         * Adjust pc to point at the actual instruction, which is unusual
+         * for syscalls normally, but is appropriate when we are claiming
+         * that a syscall swint1 caused a page fault or bus error.
+         */
+        regs->pc -= 8;
+
+        /*
+         * Mark this as a caller-save interrupt, like a normal page fault,
+         * so that when we go through the signal handler path we will
+         * properly restore r0, r1, and r2 for the signal handler arguments.
+         */
+        regs->flags |= PT_FLAGS_CALLER_SAVES;
+
+        return 0;
+}
+#endif
+
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+{
+        unsigned index = pgd_index(address);
+        pgd_t *pgd_k;
+        pud_t *pud, *pud_k;
+        pmd_t *pmd, *pmd_k;
+
+        pgd += index;
+        pgd_k = init_mm.pgd + index;
+
+        if (!pgd_present(*pgd_k))
+                return NULL;
+
+        pud = pud_offset(pgd, address);
+        pud_k = pud_offset(pgd_k, address);
+        if (!pud_present(*pud_k))
+                return NULL;
+
+        pmd = pmd_offset(pud, address);
+        pmd_k = pmd_offset(pud_k, address);
+        if (!pmd_present(*pmd_k))
+                return NULL;
+        if (!pmd_present(*pmd)) {
+                set_pmd(pmd, *pmd_k);
+                arch_flush_lazy_mmu_mode();
+        } else
+                BUG_ON(pmd_ptfn(*pmd) != pmd_ptfn(*pmd_k));
+        return pmd_k;
+}
+
+/*
+ * Handle a fault on the vmalloc or module mapping area
+ */
+static inline int vmalloc_fault(pgd_t *pgd, unsigned long address)
+{
+        pmd_t *pmd_k;
+        pte_t *pte_k;
+
+        /* Make sure we are in vmalloc area */
+        if (!(address >= VMALLOC_START && address < VMALLOC_END))
+                return -1;
+
+        /*
+         * Synchronize this task's top level page-table
+         * with the 'reference' page table.
+         */
+        pmd_k = vmalloc_sync_one(pgd, address);
+        if (!pmd_k)
+                return -1;
+        if (pmd_huge(*pmd_k))
+                return 0;   /* support TILE huge_vmap() API */
+        pte_k = pte_offset_kernel(pmd_k, address);
+        if (!pte_present(*pte_k))
+                return -1;
+        return 0;
+}
+
+/* Wait until this PTE has completed migration. */
+static void wait_for_migration(pte_t *pte)
+{
+        if (pte_migrating(*pte)) {
+                /*
+                 * Wait until the migrater fixes up this pte.
+                 * We scale the loop count by the clock rate so we'll wait for
+                 * a few seconds here.
+                 */
+                int retries = 0;
+                int bound = get_clock_rate();
+                while (pte_migrating(*pte)) {
+                        barrier();
+                        if (++retries > bound)
+                                panic("Hit migrating PTE (%#llx) and"
+                                      " page PFN %#lx still migrating",
+                                      pte->val, pte_pfn(*pte));
+                }
+        }
+}
+
+/*
+ * It's not generally safe to use "current" to get the page table pointer,
+ * since we might be running an oprofile interrupt in the middle of a
+ * task switch.
+ */
+static pgd_t *get_current_pgd(void)
+{
+        HV_Context ctx = hv_inquire_context();
+        unsigned long pgd_pfn = ctx.page_table >> PAGE_SHIFT;
+        struct page *pgd_page = pfn_to_page(pgd_pfn);
+        BUG_ON(PageHighMem(pgd_page));   /* oops, HIGHPTE? */
+        return (pgd_t *) __va(ctx.page_table);
+}
+
+/*
+ * We can receive a page fault from a migrating PTE at any time.
+ * Handle it by just waiting until the fault resolves.
+ *
+ * It's also possible to get a migrating kernel PTE that resolves
+ * itself during the downcall from hypervisor to Linux.  We just check
+ * here to see if the PTE seems valid, and if so we retry it.
+ *
+ * NOTE! We MUST NOT take any locks for this case.  We may be in an
+ * interrupt or a critical region, and must do as little as possible.
+ * Similarly, we can't use atomic ops here, since we may be handling a
+ * fault caused by an atomic op access.
+ */
+static int handle_migrating_pte(pgd_t *pgd, int fault_num,
+                                unsigned long address,
+                                int is_kernel_mode, int write)
+{
+        pud_t *pud;
+        pmd_t *pmd;
+        pte_t *pte;
+        pte_t pteval;
+
+        if (pgd_addr_invalid(address))
+                return 0;
+
+        pgd += pgd_index(address);
+        pud = pud_offset(pgd, address);
+        if (!pud || !pud_present(*pud))
+                return 0;
+        pmd = pmd_offset(pud, address);
+        if (!pmd || !pmd_present(*pmd))
+                return 0;
+        pte = pmd_huge_page(*pmd) ? ((pte_t *)pmd) :
+                pte_offset_kernel(pmd, address);
+        pteval = *pte;
+        if (pte_migrating(pteval)) {
+                wait_for_migration(pte);
+                return 1;
+        }
+
+        if (!is_kernel_mode || !pte_present(pteval))
+                return 0;
+        if (fault_num == INT_ITLB_MISS) {
+                if (pte_exec(pteval))
+                        return 1;
+        } else if (write) {
+                if (pte_write(pteval))
+                        return 1;
+        } else {
+                if (pte_read(pteval))
+                        return 1;
+        }
+
+        return 0;
+}
+
+/*
+ * This routine is responsible for faulting in user pages.
+ * It passes the work off to one of the appropriate routines.
+ * It returns true if the fault was successfully handled.
+ */
+static int handle_page_fault(struct pt_regs *regs,
+                             int fault_num,
+                             int is_page_fault,
+                             unsigned long address,
+                             int write)
+{
+        struct task_struct *tsk;
+        struct mm_struct *mm;
+        struct vm_area_struct *vma;
+        unsigned long stack_offset;
+        int fault;
+        int si_code;
+        int is_kernel_mode;
+        pgd_t *pgd;
+
+        /* on TILE, protection faults are always writes */
+        if (!is_page_fault)
+                write = 1;
+
+        is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);
+
+        tsk = validate_current();
+
+        /*
+         * Check to see if we might be overwriting the stack, and bail
+         * out if so.  The page fault code is a relatively likely
+         * place to get trapped in an infinite regress, and once we
+         * overwrite the whole stack, it becomes very hard to recover.
+         */
+        stack_offset = stack_pointer & (THREAD_SIZE-1);
+        if (stack_offset < THREAD_SIZE / 8) {
+                printk(KERN_ALERT "Potential stack overrun: sp %#lx\n",
+                       stack_pointer);
+                show_regs(regs);
+                printk(KERN_ALERT "Killing current process %d/%s\n",
+                       tsk->pid, tsk->comm);
+                do_group_exit(SIGKILL);
+        }
+
+        /*
+         * Early on, we need to check for migrating PTE entries;
+         * see homecache.c.  If we find a migrating PTE, we wait until
+         * the backing page claims to be done migrating, then we proceed.
+         * For kernel PTEs, we rewrite the PTE and return and retry.
+         * Otherwise, we treat the fault like a normal "no PTE" fault,
+         * rather than trying to patch up the existing PTE.
+         */
+        pgd = get_current_pgd();
+        if (handle_migrating_pte(pgd, fault_num, address,
+                                 is_kernel_mode, write))
+                return 1;
+
+        si_code = SEGV_MAPERR;
+
+        /*
+         * We fault-in kernel-space virtual memory on-demand. The
+         * 'reference' page table is init_mm.pgd.
+         *
+         * NOTE! We MUST NOT take any locks for this case. We may
+         * be in an interrupt or a critical region, and should
+         * only copy the information from the master page table,
+         * nothing more.
+         *
+         * This verifies that the fault happens in kernel space
+         * and that the fault was not a protection fault.
+         */
+        if (unlikely(address >= TASK_SIZE &&
+                     !is_arch_mappable_range(address, 0))) {
+                if (is_kernel_mode && is_page_fault &&
+                    vmalloc_fault(pgd, address) >= 0)
+                        return 1;
+                /*
+                 * Don't take the mm semaphore here. If we fixup a prefetch
+                 * fault we could otherwise deadlock.
+                 */
+                mm = NULL;  /* happy compiler */
+                vma = NULL;
+                goto bad_area_nosemaphore;
+        }
+
+        /*
+         * If we're trying to touch user-space addresses, we must
+         * be either at PL0, or else with interrupts enabled in the
+         * kernel, so either way we can re-enable interrupts here.
+         */
+        local_irq_enable();
+
+        mm = tsk->mm;
+
+        /*
+         * If we're in an interrupt, have no user context or are running in an
+         * atomic region then we must not take the fault.
+         */
+        if (in_atomic() || !mm) {
+                vma = NULL;  /* happy compiler */
+                goto bad_area_nosemaphore;
+        }
+
+        /*
+         * When running in the kernel we expect faults to occur only to
+         * addresses in user space.  All other faults represent errors in the
+         * kernel and should generate an OOPS.  Unfortunately, in the case of an
+         * erroneous fault occurring in a code path which already holds mmap_sem
+         * we will deadlock attempting to validate the fault against the
+         * address space.  Luckily the kernel only validly references user
+         * space from well defined areas of code, which are listed in the
+         * exceptions table.
+         *
+         * As the vast majority of faults will be valid we will only perform
+         * the source reference check when there is a possibility of a deadlock.
+         * Attempt to lock the address space, if we cannot we then validate the
+         * source.  If this is invalid we can skip the address space check,
+         * thus avoiding the deadlock.
+         */
+        if (!down_read_trylock(&mm->mmap_sem)) {
+                if (is_kernel_mode &&
+                    !search_exception_tables(regs->pc)) {
+                        vma = NULL;  /* happy compiler */
+                        goto bad_area_nosemaphore;
+                }
+                down_read(&mm->mmap_sem);
+        }
+
+        vma = find_vma(mm, address);
+        if (!vma)
+                goto bad_area;
+        if (vma->vm_start <= address)
+                goto good_area;
+        if (!(vma->vm_flags & VM_GROWSDOWN))
+                goto bad_area;
+        if (regs->sp < PAGE_OFFSET) {
+                /*
+                 * accessing the stack below sp is always a bug.
+                 */
+                if (address < regs->sp)
+                        goto bad_area;
+        }
+        if (expand_stack(vma, address))
+                goto bad_area;
+
+/*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+good_area:
+        si_code = SEGV_ACCERR;
+        if (fault_num == INT_ITLB_MISS) {
+                if (!(vma->vm_flags & VM_EXEC))
+                        goto bad_area;
+        } else if (write) {
+#ifdef TEST_VERIFY_AREA
+                if (!is_page_fault && regs->cs == KERNEL_CS)
+                        printk("WP fault at "REGFMT"\n", regs->eip);
+#endif
+                if (!(vma->vm_flags & VM_WRITE))
+                        goto bad_area;
+        } else {
+                if (!is_page_fault || !(vma->vm_flags & VM_READ))
+                        goto bad_area;
+        }
+
+ survive:
+        /*
+         * If for any reason at all we couldn't handle the fault,
+         * make sure we exit gracefully rather than endlessly redo
+         * the fault.
+         */
+        fault = handle_mm_fault(mm, vma, address, write);
+        if (unlikely(fault & VM_FAULT_ERROR)) {
+                if (fault & VM_FAULT_OOM)
+                        goto out_of_memory;
+                else if (fault & VM_FAULT_SIGBUS)
+                        goto do_sigbus;
+                BUG();
+        }
+        if (fault & VM_FAULT_MAJOR)
+                tsk->maj_flt++;
+        else
+                tsk->min_flt++;
+
+        /*
+         * If this was an asynchronous fault,
+         * restart the appropriate engine.
+         */
+        switch (fault_num) {
+#if CHIP_HAS_TILE_DMA()
+        case INT_DMATLB_MISS:
+        case INT_DMATLB_MISS_DWNCL:
+        case INT_DMATLB_ACCESS:
+        case INT_DMATLB_ACCESS_DWNCL:
+                __insn_mtspr(SPR_DMA_CTR, SPR_DMA_CTR__REQUEST_MASK);
+                break;
+#endif
+#if CHIP_HAS_SN_PROC()
+        case INT_SNITLB_MISS:
+        case INT_SNITLB_MISS_DWNCL:
+                __insn_mtspr(SPR_SNCTL,
+                             __insn_mfspr(SPR_SNCTL) &
+                             ~SPR_SNCTL__FRZPROC_MASK);
+                break;
+#endif
+        }
+
+        up_read(&mm->mmap_sem);
+        return 1;
+
+/*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+bad_area:
+        up_read(&mm->mmap_sem);
+
+bad_area_nosemaphore:
+        /* User mode accesses just cause a SIGSEGV */
+        if (!is_kernel_mode) {
+                /*
+                 * It's possible to have interrupts off here.
+                 */
+                local_irq_enable();
+
+                force_sig_info_fault(SIGSEGV, si_code, address,
+                                     fault_num, tsk);
+                return 0;
+        }
+
+no_context:
+        /* Are we prepared to handle this kernel fault?  */
+        if (fixup_exception(regs))
+                return 0;
+
+        /*
+         * Oops. The kernel tried to access some bad page. We'll have to
+         * terminate things with extreme prejudice.
+         */
+
+        bust_spinlocks(1);
+
+        /* FIXME: no lookup_address() yet */
+#ifdef SUPPORT_LOOKUP_ADDRESS
+        if (fault_num == INT_ITLB_MISS) {
+                pte_t *pte = lookup_address(address);
+
+                if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
+                        printk(KERN_CRIT "kernel tried to execute"
+                               " non-executable page - exploit attempt?"
+                               " (uid: %d)\n", current->uid);
+        }
+#endif
+        if (address < PAGE_SIZE)
+                printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference\n");
+        else
+                printk(KERN_ALERT "Unable to handle kernel paging request\n");
+        printk(" at virtual address "REGFMT", pc "REGFMT"\n",
+               address, regs->pc);
+
+        show_regs(regs);
+
+        if (unlikely(tsk->pid < 2)) {
+                panic("Kernel page fault running %s!",
+                      tsk->pid ? "init" : "the idle task");
+        }
+
+        /*
+         * More FIXME: we should probably copy the i386 here and
+         * implement a generic die() routine.  Not today.
+         */
+#ifdef SUPPORT_DIE
+        die("Oops", regs);
+#endif
+        bust_spinlocks(1);
+
+        do_group_exit(SIGKILL);
+
+/*
+ * We ran out of memory, or some other thing happened to us that made
+ * us unable to handle the page fault gracefully.
+ */
+out_of_memory:
+        up_read(&mm->mmap_sem);
+        if (is_global_init(tsk)) {
+                yield();
+                down_read(&mm->mmap_sem);
+                goto survive;
+        }
+        printk("VM: killing process %s\n", tsk->comm);
+        if (!is_kernel_mode)
+                do_group_exit(SIGKILL);
+        goto no_context;
+
+do_sigbus:
+        up_read(&mm->mmap_sem);
+
+        /* Kernel mode? Handle exceptions or die */
+        if (is_kernel_mode)
+                goto no_context;
+
+        force_sig_info_fault(SIGBUS, BUS_ADRERR, address, fault_num, tsk);
+        return 0;
+}
+
+#ifndef __tilegx__
+
+extern char sys_cmpxchg[], __sys_cmpxchg_end[];
+extern char __sys_cmpxchg_grab_lock[];
+extern char __start_atomic_asm_code[], __end_atomic_asm_code[];
+
+/*
+ * We return this structure in registers to avoid having to write
+ * additional save/restore code in the intvec.S caller.
+ */
+struct intvec_state {
+        void *handler;
+        unsigned long vecnum;
+        unsigned long fault_num;
+        unsigned long info;
+        unsigned long retval;
+};
+
+/* We must release ICS before panicking or we won't get anywhere. */
+#define ics_panic(fmt, ...) do { \
+        __insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0); \
+        panic(fmt, __VA_ARGS__); \
+} while (0)
+
+void do_page_fault(struct pt_regs *regs, int fault_num,
+                   unsigned long address, unsigned long write);
+
+/*
+ * When we take an ITLB or DTLB fault or access violation in the
+ * supervisor while the critical section bit is set, the hypervisor is
+ * reluctant to write new values into the EX_CONTEXT_1_x registers,
+ * since that might indicate we have not yet squirreled the SPR
+ * contents away and can thus safely take a recursive interrupt.
+ * Accordingly, the hypervisor passes us the PC via SYSTEM_SAVE_1_2.
+ */
+struct intvec_state do_page_fault_ics(struct pt_regs *regs, int fault_num,
+                                      unsigned long address,
+                                      unsigned long info)
+{
+        unsigned long pc = info & ~1;
+        int write = info & 1;
+        pgd_t *pgd = get_current_pgd();
+
+        /* Retval is 1 at first since we will handle the fault fully. */
+        struct intvec_state state = {
+                do_page_fault, fault_num, address, write, 1
+        };
+
+        /* Validate that we are plausibly in the right routine. */
+        if ((pc & 0x7) != 0 || pc < PAGE_OFFSET ||
+            (fault_num != INT_DTLB_MISS &&
+             fault_num != INT_DTLB_ACCESS)) {
+                unsigned long old_pc = regs->pc;
+                regs->pc = pc;
+                ics_panic("Bad ICS page fault args:"
+                          " old PC %#lx, fault %d/%d at %#lx\n",
+                          old_pc, fault_num, write, address);
+        }
+
+        /* We might be faulting on a vmalloc page, so check that first. */
+        if (fault_num != INT_DTLB_ACCESS && vmalloc_fault(pgd, address) >= 0)
+                return state;
+
+        /*
+         * If we faulted with ICS set in sys_cmpxchg, we are providing
+         * a user syscall service that should generate a signal on
+         * fault.  We didn't set up a kernel stack on initial entry to
+         * sys_cmpxchg, but instead had one set up by the fault, which
+         * (because sys_cmpxchg never releases ICS) came to us via the
+         * SYSTEM_SAVE_1_2 mechanism, and thus EX_CONTEXT_1_[01] are
+         * still referencing the original user code.  We release the
+         * atomic lock and rewrite pt_regs so that it appears that we
+         * came from user-space directly, and after we finish the
+         * fault we'll go back to user space and re-issue the swint.
+         * This way the backtrace information is correct if we need to
+         * emit a stack dump at any point while handling this.
+         *
+         * Must match register use in sys_cmpxchg().
+         */
+        if (pc >= (unsigned long) sys_cmpxchg &&
+            pc < (unsigned long) __sys_cmpxchg_end) {
+#ifdef CONFIG_SMP
+                /* Don't unlock before we could have locked. */
+                if (pc >= (unsigned long)__sys_cmpxchg_grab_lock) {
+                        int *lock_ptr = (int *)(regs->regs[ATOMIC_LOCK_REG]);
+                        __atomic_fault_unlock(lock_ptr);
+                }
+#endif
+                regs->sp = regs->regs[27];
+        }
+
+        /*
+         * We can also fault in the atomic assembly, in which
+         * case we use the exception table to do the first-level fixup.
+         * We may re-fixup again in the real fault handler if it
+         * turns out the faulting address is just bad, and not,
+         * for example, migrating.
+         */
+        else if (pc >= (unsigned long) __start_atomic_asm_code &&
+                 pc < (unsigned long) __end_atomic_asm_code) {
+                const struct exception_table_entry *fixup;
+#ifdef CONFIG_SMP
+                /* Unlock the atomic lock. */
+                int *lock_ptr = (int *)(regs->regs[ATOMIC_LOCK_REG]);
+                __atomic_fault_unlock(lock_ptr);
+#endif
+                fixup = search_exception_tables(pc);
+                if (!fixup)
+                        ics_panic("ICS atomic fault not in table:"
+                                  " PC %#lx, fault %d", pc, fault_num);
+                regs->pc = fixup->fixup;
+                regs->ex1 = PL_ICS_EX1(KERNEL_PL, 0);
+        }
+
+        /*
+         * NOTE: the one other type of access that might bring us here
+         * are the memory ops in __tns_atomic_acquire/__tns_atomic_release,
+         * but we don't have to check specially for them since we can
+         * always safely return to the address of the fault and retry,
+         * since no separate atomic locks are involved.
+         */
+
+        /*
+         * Now that we have released the atomic lock (if necessary),
+         * it's safe to spin if the PTE that caused the fault was migrating.
+         */
+        if (fault_num == INT_DTLB_ACCESS)
+                write = 1;
+        if (handle_migrating_pte(pgd, fault_num, address, 1, write))
+                return state;
+
+        /* Return zero so that we continue on with normal fault handling. */
+        state.retval = 0;
+        return state;
+}
+
+#endif /* !__tilegx__ */
+
+/*
+ * This routine handles page faults.  It determines the address, and the
+ * problem, and then passes it handle_page_fault() for normal DTLB and
+ * ITLB issues, and for DMA or SN processor faults when we are in user
+ * space.  For the latter, if we're in kernel mode, we just save the
+ * interrupt away appropriately and return immediately.  We can't do
+ * page faults for user code while in kernel mode.
+ */
+void do_page_fault(struct pt_regs *regs, int fault_num,
+                   unsigned long address, unsigned long write)
+{
+        int is_page_fault;
+
+        /* This case should have been handled by do_page_fault_ics(). */
+        BUG_ON(write & ~1);
+
+#if CHIP_HAS_TILE_DMA()
+        /*
+         * If it's a DMA fault, suspend the transfer while we're
+         * handling the miss; we'll restart after it's handled.  If we
+         * don't suspend, it's possible that this process could swap
+         * out and back in, and restart the engine since the DMA is
+         * still 'running'.
+         */
+        if (fault_num == INT_DMATLB_MISS ||
+            fault_num == INT_DMATLB_ACCESS ||
+            fault_num == INT_DMATLB_MISS_DWNCL ||
+            fault_num == INT_DMATLB_ACCESS_DWNCL) {
+                __insn_mtspr(SPR_DMA_CTR, SPR_DMA_CTR__SUSPEND_MASK);
+                while (__insn_mfspr(SPR_DMA_USER_STATUS) &
+                       SPR_DMA_STATUS__BUSY_MASK)
+                        ;
+        }
+#endif
+
+        /* Validate fault num and decide if this is a first-time page fault. */
+        switch (fault_num) {
+        case INT_ITLB_MISS:
+        case INT_DTLB_MISS:
+#if CHIP_HAS_TILE_DMA()
+        case INT_DMATLB_MISS:
+        case INT_DMATLB_MISS_DWNCL:
+#endif
+#if CHIP_HAS_SN_PROC()
+        case INT_SNITLB_MISS:
+        case INT_SNITLB_MISS_DWNCL:
+#endif
+                is_page_fault = 1;
+                break;
+
+        case INT_DTLB_ACCESS:
+#if CHIP_HAS_TILE_DMA()
+        case INT_DMATLB_ACCESS:
+        case INT_DMATLB_ACCESS_DWNCL:
+#endif
+                is_page_fault = 0;
+                break;
+
+        default:
+                panic("Bad fault number %d in do_page_fault", fault_num);
+        }
+
+        if (EX1_PL(regs->ex1) != USER_PL) {
+                struct async_tlb *async;
+                switch (fault_num) {
+#if CHIP_HAS_TILE_DMA()
+                case INT_DMATLB_MISS:
+                case INT_DMATLB_ACCESS:
+                case INT_DMATLB_MISS_DWNCL:
+                case INT_DMATLB_ACCESS_DWNCL:
+                        async = &current->thread.dma_async_tlb;
+                        break;
+#endif
+#if CHIP_HAS_SN_PROC()
+                case INT_SNITLB_MISS:
+                case INT_SNITLB_MISS_DWNCL:
+                        async = &current->thread.sn_async_tlb;
+                        break;
+#endif
+                default:
+                        async = NULL;
+                }
+                if (async) {
+
+                        /*
+                         * No vmalloc check required, so we can allow
+                         * interrupts immediately at this point.
+                         */
+                        local_irq_enable();
+
+                        set_thread_flag(TIF_ASYNC_TLB);
+                        if (async->fault_num != 0) {
+                                panic("Second async fault %d;"
+                                      " old fault was %d (%#lx/%ld)",
+                                      fault_num, async->fault_num,
+                                      address, write);
+                        }
+                        BUG_ON(fault_num == 0);
+                        async->fault_num = fault_num;
+                        async->is_fault = is_page_fault;
+                        async->is_write = write;
+                        async->address = address;
+                        return;
+                }
+        }
+
+        handle_page_fault(regs, fault_num, is_page_fault, address, write);
+}
+
+
+#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
+/*
+ * Check an async_tlb structure to see if a deferred fault is waiting,
+ * and if so pass it to the page-fault code.
+ */
+static void handle_async_page_fault(struct pt_regs *regs,
+                                    struct async_tlb *async)
+{
+        if (async->fault_num) {
+                /*
+                 * Clear async->fault_num before calling the page-fault
+                 * handler so that if we re-interrupt before returning
+                 * from the function we have somewhere to put the
+                 * information from the new interrupt.
+                 */
+                int fault_num = async->fault_num;
+                async->fault_num = 0;
+                handle_page_fault(regs, fault_num, async->is_fault,
+                                  async->address, async->is_write);
+        }
+}
+#endif /* CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC() */
+
+
+/*
+ * This routine effectively re-issues asynchronous page faults
+ * when we are returning to user space.
+ */
+void do_async_page_fault(struct pt_regs *regs)
+{
+        /*
+         * Clear thread flag early.  If we re-interrupt while processing
+         * code here, we will reset it and recall this routine before
+         * returning to user space.
+         */
+        clear_thread_flag(TIF_ASYNC_TLB);
+
+#if CHIP_HAS_TILE_DMA()
+        handle_async_page_fault(regs, &current->thread.dma_async_tlb);
+#endif
+#if CHIP_HAS_SN_PROC()
+        handle_async_page_fault(regs, &current->thread.sn_async_tlb);
+#endif
+}
+
+void vmalloc_sync_all(void)
+{
+#ifdef __tilegx__
+        /* Currently all L1 kernel pmd's are static and shared. */
+        BUG_ON(pgd_index(VMALLOC_END) != pgd_index(VMALLOC_START));
+#else
+        /*
+         * Note that races in the updates of insync and start aren't
+         * problematic: insync can only get set bits added, and updates to
+         * start are only improving performance (without affecting correctness
+         * if undone).
+         */
+        static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+        static unsigned long start = PAGE_OFFSET;
+        unsigned long address;
+
+        BUILD_BUG_ON(PAGE_OFFSET & ~PGDIR_MASK);
+        for (address = start; address >= PAGE_OFFSET; address += PGDIR_SIZE) {
+                if (!test_bit(pgd_index(address), insync)) {
+                        unsigned long flags;
+                        struct list_head *pos;
+
+                        spin_lock_irqsave(&pgd_lock, flags);
+                        list_for_each(pos, &pgd_list)
+                                if (!vmalloc_sync_one(list_to_pgd(pos),
+                                                      address)) {
+                                        /* Must be at first entry in list. */
+                                        BUG_ON(pos != pgd_list.next);
+                                        break;
+                                }
+                        spin_unlock_irqrestore(&pgd_lock, flags);
+                        if (pos != pgd_list.next)
+                                set_bit(pgd_index(address), insync);
+                }
+                if (address == start && test_bit(pgd_index(address), insync))
+                        start = address + PGDIR_SIZE;
+        }
+#endif
+}
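
The vmalloc path above is the classic lazy-sync design: kernel mappings are installed only in the init_mm "reference" page table, and each task's page table acquires the shared pmd the first time that task faults on the address. A condensed sketch of the repair step, with a hypothetical pmd_for() helper standing in for the pgd/pud/pmd walk that vmalloc_sync_one() performs (illustration only, not part of the commit):

        /* Sketch only: the essence of vmalloc_fault()/vmalloc_sync_one(). */
        static int lazy_vmalloc_sync(pgd_t *task_pgd, unsigned long addr)
        {
                pmd_t *pmd = pmd_for(task_pgd, addr);       /* hypothetical helper */
                pmd_t *pmd_k = pmd_for(init_mm.pgd, addr);  /* reference table */

                if (!pmd_present(*pmd_k))
                        return -1;              /* truly unmapped: a real fault */
                if (!pmd_present(*pmd))
                        set_pmd(pmd, *pmd_k);   /* copy kernel pmd; access retries */
                return 0;
        }
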
diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c
new file mode 100644
index 000000000000..1fcecc5b9e03
--- /dev/null
+++ b/arch/tile/mm/highmem.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <asm/homecache.h>
+
+#define kmap_get_pte(vaddr) \
+        pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), (vaddr)),\
+                (vaddr)), (vaddr))
+
+
+void *kmap(struct page *page)
+{
+        void *kva;
+        unsigned long flags;
+        pte_t *ptep;
+
+        might_sleep();
+        if (!PageHighMem(page))
+                return page_address(page);
+        kva = kmap_high(page);
+
+        /*
+         * Rewrite the PTE under the lock.  This ensures that the page
+         * is not currently migrating.
+         */
+        ptep = kmap_get_pte((unsigned long)kva);
+        flags = homecache_kpte_lock();
+        set_pte_at(&init_mm, kva, ptep, mk_pte(page, page_to_kpgprot(page)));
+        homecache_kpte_unlock(flags);
+
+        return kva;
+}
+EXPORT_SYMBOL(kmap);
+
+void kunmap(struct page *page)
+{
+        if (in_interrupt())
+                BUG();
+        if (!PageHighMem(page))
+                return;
+        kunmap_high(page);
+}
+EXPORT_SYMBOL(kunmap);
+
+static void debug_kmap_atomic_prot(enum km_type type)
+{
+#ifdef CONFIG_DEBUG_HIGHMEM
+        static unsigned warn_count = 10;
+
+        if (unlikely(warn_count == 0))
+                return;
+
+        if (unlikely(in_interrupt())) {
+                if (in_irq()) {
+                        if (type != KM_IRQ0 && type != KM_IRQ1 &&
+                            type != KM_BIO_SRC_IRQ &&
+                            /* type != KM_BIO_DST_IRQ && */
+                            type != KM_BOUNCE_READ) {
+                                WARN_ON(1);
+                                warn_count--;
+                        }
+                } else if (!irqs_disabled()) {  /* softirq */
+                        if (type != KM_IRQ0 && type != KM_IRQ1 &&
+                            type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
+                            type != KM_SKB_SUNRPC_DATA &&
+                            type != KM_SKB_DATA_SOFTIRQ &&
+                            type != KM_BOUNCE_READ) {
+                                WARN_ON(1);
+                                warn_count--;
+                        }
+                }
+        }
+
+        if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
+            type == KM_BIO_SRC_IRQ /* || type == KM_BIO_DST_IRQ */) {
+                if (!irqs_disabled()) {
+                        WARN_ON(1);
+                        warn_count--;
+                }
+        } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
+                if (irq_count() == 0 && !irqs_disabled()) {
+                        WARN_ON(1);
+                        warn_count--;
+                }
+        }
+#endif
+}
+
+/*
+ * Describe a single atomic mapping of a page on a given cpu at a
+ * given address, and allow it to be linked into a list.
+ */
+struct atomic_mapped_page {
+        struct list_head list;
+        struct page *page;
+        int cpu;
+        unsigned long va;
+};
+
+static spinlock_t amp_lock = __SPIN_LOCK_UNLOCKED(&amp_lock);
+static struct list_head amp_list = LIST_HEAD_INIT(amp_list);
+
+/*
+ * Combining this structure with a per-cpu declaration lets us give
+ * each cpu an atomic_mapped_page structure per type.
+ */
+struct kmap_amps {
+        struct atomic_mapped_page per_type[KM_TYPE_NR];
+};
+DEFINE_PER_CPU(struct kmap_amps, amps);
+
+/*
+ * Add a page and va, on this cpu, to the list of kmap_atomic pages,
+ * and write the new pte to memory.  Writing the new PTE under the
+ * lock guarantees that it is either on the list before migration starts
+ * (if we won the race), or set_pte() sets the migrating bit in the PTE
+ * (if we lost the race).  And doing it under the lock guarantees
+ * that when kmap_atomic_fix_one_pte() comes along, it finds a valid
+ * PTE in memory, iff the mapping is still on the amp_list.
+ *
+ * Finally, doing it under the lock lets us safely examine the page
+ * to see if it is immutable or not, for the generic kmap_atomic() case.
+ * If we examine it earlier we are exposed to a race where it looks
+ * writable earlier, but becomes immutable before we write the PTE.
+ */
+static void kmap_atomic_register(struct page *page, enum km_type type,
+                                 unsigned long va, pte_t *ptep, pte_t pteval)
+{
+        unsigned long flags;
+        struct atomic_mapped_page *amp;
+
+        flags = homecache_kpte_lock();
+        spin_lock(&amp_lock);
+
+        /* With interrupts disabled, now fill in the per-cpu info. */
+        amp = &__get_cpu_var(amps).per_type[type];
+        amp->page = page;
+        amp->cpu = smp_processor_id();
+        amp->va = va;
+
+        /* For generic kmap_atomic(), choose the PTE writability now. */
+        if (!pte_read(pteval))
+                pteval = mk_pte(page, page_to_kpgprot(page));
+
+        list_add(&amp->list, &amp_list);
+        set_pte(ptep, pteval);
+        arch_flush_lazy_mmu_mode();
+
+        spin_unlock(&amp_lock);
+        homecache_kpte_unlock(flags);
+}
+
+/*
+ * Remove a page and va, on this cpu, from the list of kmap_atomic pages.
+ * Linear-time search, but we count on the lists being short.
+ * We don't need to adjust the PTE under the lock (as opposed to the
+ * kmap_atomic_register() case), since we're just unconditionally
+ * zeroing the PTE after it's off the list.
+ */
+static void kmap_atomic_unregister(struct page *page, unsigned long va)
+{
+        unsigned long flags;
+        struct atomic_mapped_page *amp;
+        int cpu = smp_processor_id();
+        spin_lock_irqsave(&amp_lock, flags);
+        list_for_each_entry(amp, &amp_list, list) {
+                if (amp->page == page && amp->cpu == cpu && amp->va == va)
+                        break;
+        }
+        BUG_ON(&amp->list == &amp_list);
+        list_del(&amp->list);
+        spin_unlock_irqrestore(&amp_lock, flags);
+}
+
+/* Helper routine for kmap_atomic_fix_kpte(), below. */
+static void kmap_atomic_fix_one_kpte(struct atomic_mapped_page *amp,
+                                     int finished)
+{
+        pte_t *ptep = kmap_get_pte(amp->va);
+        if (!finished) {
+                set_pte(ptep, pte_mkmigrate(*ptep));
+                flush_remote(0, 0, NULL, amp->va, PAGE_SIZE, PAGE_SIZE,
+                             cpumask_of(amp->cpu), NULL, 0);
+        } else {
+                /*
+                 * Rewrite a default kernel PTE for this page.
+                 * We rely on the fact that set_pte() writes the
+                 * present+migrating bits last.
+                 */
+                pte_t pte = mk_pte(amp->page, page_to_kpgprot(amp->page));
+                set_pte(ptep, pte);
+        }
+}
+
+/*
+ * This routine is a helper function for homecache_fix_kpte(); see
+ * its comments for more information on the "finished" argument here.
+ *
+ * Note that we hold the lock while doing the remote flushes, which
+ * will stall any unrelated cpus trying to do kmap_atomic operations.
+ * We could just update the PTEs under the lock, and save away copies
+ * of the structs (or just the va+cpu), then flush them after we
+ * release the lock, but it seems easier just to do it all under the lock.
+ */
+void kmap_atomic_fix_kpte(struct page *page, int finished)
+{
+        struct atomic_mapped_page *amp;
+        unsigned long flags;
+        spin_lock_irqsave(&amp_lock, flags);
+        list_for_each_entry(amp, &amp_list, list) {
+                if (amp->page == page)
+                        kmap_atomic_fix_one_kpte(amp, finished);
+        }
+        spin_unlock_irqrestore(&amp_lock, flags);
+}
+
+/*
+ * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap
+ * because the kmap code must perform a global TLB invalidation when
+ * the kmap pool wraps.
+ *
+ * Note that they may be slower than on x86 (etc.) because unlike on
+ * those platforms, we do have to take a global lock to map and unmap
+ * pages on Tile (see above).
+ *
+ * When holding an atomic kmap it is not legal to sleep, so atomic
+ * kmaps are appropriate for short, tight code paths only.
+ */
+void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
+{
+        enum fixed_addresses idx;
+        unsigned long vaddr;
+        pte_t *pte;
+
+        /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+        pagefault_disable();
+
+        /* Avoid icache flushes by disallowing atomic executable mappings. */
+        BUG_ON(pte_exec(prot));
+
+        if (!PageHighMem(page))
+                return page_address(page);
+
+        debug_kmap_atomic_prot(type);
+
+        idx = type + KM_TYPE_NR*smp_processor_id();
+        vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+        pte = kmap_get_pte(vaddr);
+        BUG_ON(!pte_none(*pte));
+
+        /* Register that this page is mapped atomically on this cpu. */
+        kmap_atomic_register(page, type, vaddr, pte, mk_pte(page, prot));
+
+        return (void *)vaddr;
+}
+EXPORT_SYMBOL(kmap_atomic_prot);
+
+void *kmap_atomic(struct page *page, enum km_type type)
+{
+        /* PAGE_NONE is a magic value that tells us to check immutability. */
+        return kmap_atomic_prot(page, type, PAGE_NONE);
+}
+EXPORT_SYMBOL(kmap_atomic);
+
+void kunmap_atomic(void *kvaddr, enum km_type type)
+{
+        unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
+        enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
+
+        /*
+         * Force other mappings to Oops if they try to access this pte without
+         * first remapping it.  Keeping stale mappings around is a bad idea.
+         */
+        if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) {
+                pte_t *pte = kmap_get_pte(vaddr);
+                pte_t pteval = *pte;
+                BUG_ON(!pte_present(pteval) && !pte_migrating(pteval));
+                kmap_atomic_unregister(pte_page(pteval), vaddr);
+                kpte_clear_flush(pte, vaddr);
+        } else {
+                /* Must be a lowmem page */
+                BUG_ON(vaddr < PAGE_OFFSET);
+                BUG_ON(vaddr >= (unsigned long)high_memory);
+        }
+
+        arch_flush_lazy_mmu_mode();
+        pagefault_enable();
+}
+EXPORT_SYMBOL(kunmap_atomic);
+
+/*
+ * This API is supposed to allow us to map memory without a "struct page".
+ * Currently we don't support this, though this may change in the future.
+ */
+void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
+{
+        return kmap_atomic(pfn_to_page(pfn), type);
+}
+void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
+{
+        return kmap_atomic_prot(pfn_to_page(pfn), type, prot);
+}
+
+struct page *kmap_atomic_to_page(void *ptr)
+{
+        pte_t *pte;
+        unsigned long vaddr = (unsigned long)ptr;
+
+        if (vaddr < FIXADDR_START)
+                return virt_to_page(ptr);
+
+        pte = kmap_get_pte(vaddr);
+        return pte_page(*pte);
+}
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c new file mode 100644 index 000000000000..52feb77133ce --- /dev/null +++ b/arch/tile/mm/homecache.c | |||
| @@ -0,0 +1,445 @@ | |||
| 1 | /* | ||
| 2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of the GNU General Public License | ||
| 6 | * as published by the Free Software Foundation, version 2. | ||
| 7 | * | ||
| 8 | * This program is distributed in the hope that it will be useful, but | ||
| 9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
| 11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
| 12 | * more details. | ||
| 13 | * | ||
| 14 | * This code maintains the "home" for each page in the system. | ||
| 15 | */ | ||
| 16 | |||
| 17 | #include <linux/kernel.h> | ||
| 18 | #include <linux/mm.h> | ||
| 19 | #include <linux/spinlock.h> | ||
| 20 | #include <linux/list.h> | ||
| 21 | #include <linux/bootmem.h> | ||
| 22 | #include <linux/rmap.h> | ||
| 23 | #include <linux/pagemap.h> | ||
| 24 | #include <linux/mutex.h> | ||
| 25 | #include <linux/interrupt.h> | ||
| 26 | #include <linux/sysctl.h> | ||
| 27 | #include <linux/pagevec.h> | ||
| 28 | #include <linux/ptrace.h> | ||
| 29 | #include <linux/timex.h> | ||
| 30 | #include <linux/cache.h> | ||
| 31 | #include <linux/smp.h> | ||
| 32 | |||
| 33 | #include <asm/page.h> | ||
| 34 | #include <asm/sections.h> | ||
| 35 | #include <asm/tlbflush.h> | ||
| 36 | #include <asm/pgalloc.h> | ||
| 37 | #include <asm/homecache.h> | ||
| 38 | |||
| 39 | #include "migrate.h" | ||
| 40 | |||
| 41 | |||
| 42 | #if CHIP_HAS_COHERENT_LOCAL_CACHE() | ||
| 43 | |||
| 44 | /* | ||
| 45 | * The noallocl2 option suppresses all use of the L2 cache to cache | ||
| 46 | * locally from a remote home. There's no point in using it if we | ||
| 47 | * don't have coherent local caching, though. | ||
| 48 | */ | ||
| 49 | int __write_once noallocl2; | ||
| 50 | static int __init set_noallocl2(char *str) | ||
| 51 | { | ||
| 52 | noallocl2 = 1; | ||
| 53 | return 0; | ||
| 54 | } | ||
| 55 | early_param("noallocl2", set_noallocl2); | ||
| 56 | |||
| 57 | #else | ||
| 58 | |||
| 59 | #define noallocl2 0 | ||
| 60 | |||
| 61 | #endif | ||
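[Editorial usage note, a sketch: passing "noallocl2" on the kernel command line sets the flag above via early_param(); pte_set_home() later in this file then applies the no-alloc-L2 bit to every PTE it constructs, on chips with CHIP_HAS_NC_AND_NOALLOC_BITS().]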
| 62 | |||
| 63 | |||
| 64 | |||
| 65 | /* Provide no-op versions of these routines to keep flush_remote() cleaner. */ | ||
| 66 | #define mark_caches_evicted_start() 0 | ||
| 67 | #define mark_caches_evicted_finish(mask, timestamp) do {} while (0) | ||
| 68 | |||
| 69 | |||
| 70 | |||
| 71 | |||
| 72 | /* | ||
| 73 | * Update the irq_stat for cpus that we are going to interrupt | ||
| 74 | * with TLB or cache flushes. Also handle removing dataplane cpus | ||
| 75 | * from the TLB flush set, and setting dataplane_tlb_state instead. | ||
| 76 | */ | ||
| 77 | static void hv_flush_update(const struct cpumask *cache_cpumask, | ||
| 78 | struct cpumask *tlb_cpumask, | ||
| 79 | unsigned long tlb_va, unsigned long tlb_length, | ||
| 80 | HV_Remote_ASID *asids, int asidcount) | ||
| 81 | { | ||
| 82 | struct cpumask mask; | ||
| 83 | int i, cpu; | ||
| 84 | |||
| 85 | cpumask_clear(&mask); | ||
| 86 | if (cache_cpumask) | ||
| 87 | cpumask_or(&mask, &mask, cache_cpumask); | ||
| 88 | if (tlb_cpumask && tlb_length) | ||
| 89 | cpumask_or(&mask, &mask, tlb_cpumask); | ||
| 91 | |||
| 92 | for (i = 0; i < asidcount; ++i) | ||
| 93 | cpumask_set_cpu(asids[i].y * smp_width + asids[i].x, &mask); | ||
| 94 | |||
| 95 | /* | ||
| 96 | * Don't bother to update atomically; losing a count | ||
| 97 | * here is not that critical. | ||
| 98 | */ | ||
| 99 | for_each_cpu(cpu, &mask) | ||
| 100 | ++per_cpu(irq_stat, cpu).irq_hv_flush_count; | ||
| 101 | } | ||
| 102 | |||
| 103 | /* | ||
| 104 | * This wrapper function around hv_flush_remote() does several things: | ||
| 105 | * | ||
| 106 | * - Provides a return value error-checking panic path, since | ||
| 107 | * there's never any good reason for hv_flush_remote() to fail. | ||
| 108 | * - Accepts a 32-bit PFN rather than a 64-bit PA, which generally | ||
| 109 | * is the type that Linux wants to pass around anyway. | ||
| 110 | * - Centralizes the mark_caches_evicted() handling. | ||
| 111 | * - Canonicalizes zero lengths by making the corresponding cpumasks NULL. | ||
| 112 | * - Handles deferring TLB flushes for dataplane tiles. | ||
| 113 | * - Tracks remote interrupts in the per-cpu irq_cpustat_t. | ||
| 114 | * | ||
| 115 | * Note that we have to wait until the cache flush completes before | ||
| 116 | * updating the per-cpu last_cache_flush word, since otherwise another | ||
| 117 | * concurrent flush can race, conclude the flush has already | ||
| 118 | * completed, and start to use the page while it's still dirty | ||
| 119 | * remotely (running concurrently with the actual evict, presumably). | ||
| 120 | */ | ||
| 121 | void flush_remote(unsigned long cache_pfn, unsigned long cache_control, | ||
| 122 | const struct cpumask *cache_cpumask_orig, | ||
| 123 | HV_VirtAddr tlb_va, unsigned long tlb_length, | ||
| 124 | unsigned long tlb_pgsize, | ||
| 125 | const struct cpumask *tlb_cpumask_orig, | ||
| 126 | HV_Remote_ASID *asids, int asidcount) | ||
| 127 | { | ||
| 128 | int rc; | ||
| 129 | int timestamp = 0; /* happy compiler */ | ||
| 130 | struct cpumask cache_cpumask_copy, tlb_cpumask_copy; | ||
| 131 | struct cpumask *cache_cpumask, *tlb_cpumask; | ||
| 132 | HV_PhysAddr cache_pa; | ||
| 133 | char cache_buf[NR_CPUS*5], tlb_buf[NR_CPUS*5]; | ||
| 134 | |||
| 135 | mb(); /* provided just to simplify "magic hypervisor" mode */ | ||
| 136 | |||
| 137 | /* | ||
| 138 | * Canonicalize and copy the cpumasks. | ||
| 139 | */ | ||
| 140 | if (cache_cpumask_orig && cache_control) { | ||
| 141 | cpumask_copy(&cache_cpumask_copy, cache_cpumask_orig); | ||
| 142 | cache_cpumask = &cache_cpumask_copy; | ||
| 143 | } else { | ||
| 144 | cpumask_clear(&cache_cpumask_copy); | ||
| 145 | cache_cpumask = NULL; | ||
| 146 | } | ||
| 147 | if (cache_cpumask == NULL) | ||
| 148 | cache_control = 0; | ||
| 149 | if (tlb_cpumask_orig && tlb_length) { | ||
| 150 | cpumask_copy(&tlb_cpumask_copy, tlb_cpumask_orig); | ||
| 151 | tlb_cpumask = &tlb_cpumask_copy; | ||
| 152 | } else { | ||
| 153 | cpumask_clear(&tlb_cpumask_copy); | ||
| 154 | tlb_cpumask = NULL; | ||
| 155 | } | ||
| 156 | |||
| 157 | hv_flush_update(cache_cpumask, tlb_cpumask, tlb_va, tlb_length, | ||
| 158 | asids, asidcount); | ||
| 159 | cache_pa = (HV_PhysAddr)cache_pfn << PAGE_SHIFT; | ||
| 160 | if (cache_control & HV_FLUSH_EVICT_L2) | ||
| 161 | timestamp = mark_caches_evicted_start(); | ||
| 162 | rc = hv_flush_remote(cache_pa, cache_control, | ||
| 163 | cpumask_bits(cache_cpumask), | ||
| 164 | tlb_va, tlb_length, tlb_pgsize, | ||
| 165 | cpumask_bits(tlb_cpumask), | ||
| 166 | asids, asidcount); | ||
| 167 | if (cache_control & HV_FLUSH_EVICT_L2) | ||
| 168 | mark_caches_evicted_finish(cache_cpumask, timestamp); | ||
| 169 | if (rc == 0) | ||
| 170 | return; | ||
| 171 | cpumask_scnprintf(cache_buf, sizeof(cache_buf), &cache_cpumask_copy); | ||
| 172 | cpumask_scnprintf(tlb_buf, sizeof(tlb_buf), &tlb_cpumask_copy); | ||
| 173 | |||
| 174 | printk("hv_flush_remote(%#llx, %#lx, %p [%s]," | ||
| 175 | " %#lx, %#lx, %#lx, %p [%s], %p, %d) = %d\n", | ||
| 176 | cache_pa, cache_control, cache_cpumask, cache_buf, | ||
| 177 | (unsigned long)tlb_va, tlb_length, tlb_pgsize, | ||
| 178 | tlb_cpumask, tlb_buf, | ||
| 179 | asids, asidcount, rc); | ||
| 180 | if (asidcount > 0) { | ||
| 181 | int i; | ||
| 182 | printk(" asids:"); | ||
| 183 | for (i = 0; i < asidcount; ++i) | ||
| 184 | printk(" %d,%d,%d", | ||
| 185 | asids[i].x, asids[i].y, asids[i].asid); | ||
| 186 | printk("\n"); | ||
| 187 | } | ||
| 188 | panic("Unsafe to continue."); | ||
| 189 | } | ||
| 190 | |||
| 191 | void homecache_evict(const struct cpumask *mask) | ||
| 192 | { | ||
| 193 | flush_remote(0, HV_FLUSH_EVICT_L2, mask, 0, 0, 0, NULL, NULL, 0); | ||
| 194 | } | ||
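[Editorial sketch: homecache_evict() is the simplest client of flush_remote(), doing cache work only, whole-L2 eviction, and no TLB flush. A slightly fuller hypothetical helper, assuming only the signature above, flushes a single page's lines from two specific tiles:]

	/* Sketch: flush one page's cache lines from two specific tiles. */
	static void flush_page_on_two(struct page *page, int cpu_a, int cpu_b)
	{
		struct cpumask mask;

		cpumask_clear(&mask);
		cpumask_set_cpu(cpu_a, &mask);
		cpumask_set_cpu(cpu_b, &mask);
		flush_remote(page_to_pfn(page), PAGE_SIZE, &mask,
			     0, 0, 0, NULL, NULL, 0);
	}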
| 195 | |||
| 196 | /* Fill home_mask with the cpus whose caches currently own these pages. */ | ||
| 197 | static void homecache_mask(struct page *page, int pages, | ||
| 198 | struct cpumask *home_mask) | ||
| 199 | { | ||
| 200 | int i; | ||
| 201 | cpumask_clear(home_mask); | ||
| 202 | for (i = 0; i < pages; ++i) { | ||
| 203 | int home = page_home(&page[i]); | ||
| 204 | if (home == PAGE_HOME_IMMUTABLE || | ||
| 205 | home == PAGE_HOME_INCOHERENT) { | ||
| 206 | cpumask_copy(home_mask, cpu_possible_mask); | ||
| 207 | return; | ||
| 208 | } | ||
| 209 | #if CHIP_HAS_CBOX_HOME_MAP() | ||
| 210 | if (home == PAGE_HOME_HASH) { | ||
| 211 | cpumask_or(home_mask, home_mask, &hash_for_home_map); | ||
| 212 | continue; | ||
| 213 | } | ||
| 214 | #endif | ||
| 215 | if (home == PAGE_HOME_UNCACHED) | ||
| 216 | continue; | ||
| 217 | BUG_ON(home < 0 || home >= NR_CPUS); | ||
| 218 | cpumask_set_cpu(home, home_mask); | ||
| 219 | } | ||
| 220 | } | ||
| 221 | |||
| 222 | /* | ||
| 223 | * Return the passed length, or HV_FLUSH_EVICT_L2 if it's long | ||
| 224 | * enough that we believe we should evict the whole L2 cache. | ||
| 225 | */ | ||
| 226 | static unsigned long cache_flush_length(unsigned long length) | ||
| 227 | { | ||
| 228 | return (length >= CHIP_L2_CACHE_SIZE()) ? HV_FLUSH_EVICT_L2 : length; | ||
| 229 | } | ||
| 230 | |||
| 231 | /* On the simulator, confirm lines have been evicted everywhere. */ | ||
| 232 | static void validate_lines_evicted(unsigned long pfn, size_t length) | ||
| 233 | { | ||
| 234 | sim_syscall(SIM_SYSCALL_VALIDATE_LINES_EVICTED, | ||
| 235 | (HV_PhysAddr)pfn << PAGE_SHIFT, length); | ||
| 236 | } | ||
| 237 | |||
| 238 | /* Flush a page out of whatever cache(s) it is in. */ | ||
| 239 | void homecache_flush_cache(struct page *page, int order) | ||
| 240 | { | ||
| 241 | int pages = 1 << order; | ||
| 242 | int length = cache_flush_length(pages * PAGE_SIZE); | ||
| 243 | unsigned long pfn = page_to_pfn(page); | ||
| 244 | struct cpumask home_mask; | ||
| 245 | |||
| 246 | homecache_mask(page, pages, &home_mask); | ||
| 247 | flush_remote(pfn, length, &home_mask, 0, 0, 0, NULL, NULL, 0); | ||
| 248 | validate_lines_evicted(pfn, pages * PAGE_SIZE); | ||
| 249 | } | ||
| 250 | |||
| 251 | |||
| 252 | /* Report the home corresponding to a given PTE. */ | ||
| 253 | static int pte_to_home(pte_t pte) | ||
| 254 | { | ||
| 255 | if (hv_pte_get_nc(pte)) | ||
| 256 | return PAGE_HOME_IMMUTABLE; | ||
| 257 | switch (hv_pte_get_mode(pte)) { | ||
| 258 | case HV_PTE_MODE_CACHE_TILE_L3: | ||
| 259 | return get_remote_cache_cpu(pte); | ||
| 260 | case HV_PTE_MODE_CACHE_NO_L3: | ||
| 261 | return PAGE_HOME_INCOHERENT; | ||
| 262 | case HV_PTE_MODE_UNCACHED: | ||
| 263 | return PAGE_HOME_UNCACHED; | ||
| 264 | #if CHIP_HAS_CBOX_HOME_MAP() | ||
| 265 | case HV_PTE_MODE_CACHE_HASH_L3: | ||
| 266 | return PAGE_HOME_HASH; | ||
| 267 | #endif | ||
| 268 | } | ||
| 269 | panic("Bad PTE %#llx\n", pte.val); | ||
| 270 | } | ||
| 271 | |||
| 272 | /* Update the home of a PTE if necessary (can also be used for a pgprot_t). */ | ||
| 273 | pte_t pte_set_home(pte_t pte, int home) | ||
| 274 | { | ||
| 275 | /* Check for non-linear file mapping "PTEs" and pass them through. */ | ||
| 276 | if (pte_file(pte)) | ||
| 277 | return pte; | ||
| 278 | |||
| 279 | #if CHIP_HAS_MMIO() | ||
| 280 | /* Check for MMIO mappings and pass them through. */ | ||
| 281 | if (hv_pte_get_mode(pte) == HV_PTE_MODE_MMIO) | ||
| 282 | return pte; | ||
| 283 | #endif | ||
| 284 | |||
| 285 | |||
| 286 | /* | ||
| 287 | * Only immutable pages get NC mappings. If we have a | ||
| 288 | * non-coherent PTE, but the underlying page is not | ||
| 289 | * immutable, it's likely the result of a forced | ||
| 290 | * caching setting running up against ptrace setting | ||
| 291 | * the page to be writable underneath. In this case, | ||
| 292 | * just keep the PTE coherent. | ||
| 293 | */ | ||
| 294 | if (hv_pte_get_nc(pte) && home != PAGE_HOME_IMMUTABLE) { | ||
| 295 | pte = hv_pte_clear_nc(pte); | ||
| 296 | printk("non-immutable page incoherently referenced: %#llx\n", | ||
| 297 | pte.val); | ||
| 298 | } | ||
| 299 | |||
| 300 | switch (home) { | ||
| 301 | |||
| 302 | case PAGE_HOME_UNCACHED: | ||
| 303 | pte = hv_pte_set_mode(pte, HV_PTE_MODE_UNCACHED); | ||
| 304 | break; | ||
| 305 | |||
| 306 | case PAGE_HOME_INCOHERENT: | ||
| 307 | pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3); | ||
| 308 | break; | ||
| 309 | |||
| 310 | case PAGE_HOME_IMMUTABLE: | ||
| 311 | /* | ||
| 312 | * We could home this page anywhere, since it's immutable, | ||
| 313 | * but by default just home it to follow "hash_default". | ||
| 314 | */ | ||
| 315 | BUG_ON(hv_pte_get_writable(pte)); | ||
| 316 | if (pte_get_forcecache(pte)) { | ||
| 317 | /* Upgrade "force any cpu" to "No L3" for immutable. */ | ||
| 318 | if (hv_pte_get_mode(pte) == HV_PTE_MODE_CACHE_TILE_L3 | ||
| 319 | && pte_get_anyhome(pte)) { | ||
| 320 | pte = hv_pte_set_mode(pte, | ||
| 321 | HV_PTE_MODE_CACHE_NO_L3); | ||
| 322 | } | ||
| 323 | } else | ||
| 324 | #if CHIP_HAS_CBOX_HOME_MAP() | ||
| 325 | if (hash_default) | ||
| 326 | pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_HASH_L3); | ||
| 327 | else | ||
| 328 | #endif | ||
| 329 | pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3); | ||
| 330 | pte = hv_pte_set_nc(pte); | ||
| 331 | break; | ||
| 332 | |||
| 333 | #if CHIP_HAS_CBOX_HOME_MAP() | ||
| 334 | case PAGE_HOME_HASH: | ||
| 335 | pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_HASH_L3); | ||
| 336 | break; | ||
| 337 | #endif | ||
| 338 | |||
| 339 | default: | ||
| 340 | BUG_ON(home < 0 || home >= NR_CPUS || | ||
| 341 | !cpu_is_valid_lotar(home)); | ||
| 342 | pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_TILE_L3); | ||
| 343 | pte = set_remote_cache_cpu(pte, home); | ||
| 344 | break; | ||
| 345 | } | ||
| 346 | |||
| 347 | #if CHIP_HAS_NC_AND_NOALLOC_BITS() | ||
| 348 | if (noallocl2) | ||
| 349 | pte = hv_pte_set_no_alloc_l2(pte); | ||
| 350 | |||
| 351 | /* Simplify "no local and no l3" to "uncached" */ | ||
| 352 | if (hv_pte_get_no_alloc_l2(pte) && hv_pte_get_no_alloc_l1(pte) && | ||
| 353 | hv_pte_get_mode(pte) == HV_PTE_MODE_CACHE_NO_L3) { | ||
| 354 | pte = hv_pte_set_mode(pte, HV_PTE_MODE_UNCACHED); | ||
| 355 | } | ||
| 356 | #endif | ||
| 357 | |||
| 358 | /* Checking this case here gives a better panic than from the hv. */ | ||
| 359 | BUG_ON(hv_pte_get_mode(pte) == 0); | ||
| 360 | |||
| 361 | return pte; | ||
| 362 | } | ||
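[Editorial sketch: since pte_t and pgprot_t share the same HV_PTE representation on TILE, the routine above also accepts a bare pgprot_t, as its comment notes. prot_homed_on() is a hypothetical helper:]

	/* Sketch: build a kernel pgprot whose pages are homed on a given tile. */
	static pgprot_t prot_homed_on(int cpu)
	{
		return pte_set_home(PAGE_KERNEL, cpu);
	}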
| 363 | |||
| 364 | /* | ||
| 365 | * The routines in this section are the "static" versions of the normal | ||
| 366 | * dynamic homecaching routines; they just set the home cache | ||
| 367 | * of a kernel page once, and require a full-chip cache/TLB flush, | ||
| 368 | * so they're not suitable for anything but infrequent use. | ||
| 369 | */ | ||
| 370 | |||
| 371 | #if CHIP_HAS_CBOX_HOME_MAP() | ||
| 372 | static inline int initial_page_home(void) { return PAGE_HOME_HASH; } | ||
| 373 | #else | ||
| 374 | static inline int initial_page_home(void) { return 0; } | ||
| 375 | #endif | ||
| 376 | |||
| 377 | int page_home(struct page *page) | ||
| 378 | { | ||
| 379 | if (PageHighMem(page)) { | ||
| 380 | return initial_page_home(); | ||
| 381 | } else { | ||
| 382 | unsigned long kva = (unsigned long)page_address(page); | ||
| 383 | return pte_to_home(*virt_to_pte(NULL, kva)); | ||
| 384 | } | ||
| 385 | } | ||
| 386 | |||
| 387 | void homecache_change_page_home(struct page *page, int order, int home) | ||
| 388 | { | ||
| 389 | int i, pages = (1 << order); | ||
| 390 | unsigned long kva; | ||
| 391 | |||
| 392 | BUG_ON(PageHighMem(page)); | ||
| 393 | BUG_ON(page_count(page) > 1); | ||
| 394 | BUG_ON(page_mapcount(page) != 0); | ||
| 395 | kva = (unsigned long) page_address(page); | ||
| 396 | flush_remote(0, HV_FLUSH_EVICT_L2, &cpu_cacheable_map, | ||
| 397 | kva, pages * PAGE_SIZE, PAGE_SIZE, cpu_online_mask, | ||
| 398 | NULL, 0); | ||
| 399 | |||
| 400 | for (i = 0; i < pages; ++i, kva += PAGE_SIZE) { | ||
| 401 | pte_t *ptep = virt_to_pte(NULL, kva); | ||
| 402 | pte_t pteval = *ptep; | ||
| 403 | BUG_ON(!pte_present(pteval) || pte_huge(pteval)); | ||
| 404 | *ptep = pte_set_home(pteval, home); | ||
| 405 | } | ||
| 406 | } | ||
| 407 | |||
| 408 | struct page *homecache_alloc_pages(gfp_t gfp_mask, | ||
| 409 | unsigned int order, int home) | ||
| 410 | { | ||
| 411 | struct page *page; | ||
| 412 | BUG_ON(gfp_mask & __GFP_HIGHMEM); /* must be lowmem */ | ||
| 413 | page = alloc_pages(gfp_mask, order); | ||
| 414 | if (page) | ||
| 415 | homecache_change_page_home(page, order, home); | ||
| 416 | return page; | ||
| 417 | } | ||
| 418 | |||
| 419 | struct page *homecache_alloc_pages_node(int nid, gfp_t gfp_mask, | ||
| 420 | unsigned int order, int home) | ||
| 421 | { | ||
| 422 | struct page *page; | ||
| 423 | BUG_ON(gfp_mask & __GFP_HIGHMEM); /* must be lowmem */ | ||
| 424 | page = alloc_pages_node(nid, gfp_mask, order); | ||
| 425 | if (page) | ||
| 426 | homecache_change_page_home(page, order, home); | ||
| 427 | return page; | ||
| 428 | } | ||
| 429 | |||
| 430 | void homecache_free_pages(unsigned long addr, unsigned int order) | ||
| 431 | { | ||
| 432 | struct page *page; | ||
| 433 | |||
| 434 | if (addr == 0) | ||
| 435 | return; | ||
| 436 | |||
| 437 | VM_BUG_ON(!virt_addr_valid((void *)addr)); | ||
| 438 | page = virt_to_page((void *)addr); | ||
| 439 | if (put_page_testzero(page)) { | ||
| 440 | int pages = (1 << order); | ||
| 441 | homecache_change_page_home(page, order, initial_page_home()); | ||
| 442 | while (pages--) | ||
| 443 | __free_page(page++); | ||
| 444 | } | ||
| 445 | } | ||
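[Editorial sketch: putting the static-homing allocator and the free routine together, a round trip looks like this; the memset() in the middle stands in for a hypothetical use of the buffer.]

	/* Sketch: allocate a lowmem page homed on 'cpu', use it, free it. */
	static int demo_homed_page(int cpu)
	{
		struct page *page = homecache_alloc_pages(GFP_KERNEL, 0, cpu);
		if (!page)
			return -ENOMEM;
		memset(page_address(page), 0, PAGE_SIZE);
		homecache_free_pages((unsigned long)page_address(page), 0);
		return 0;
	}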
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c new file mode 100644 index 000000000000..c38570f8f0d0 --- /dev/null +++ b/arch/tile/mm/hugetlbpage.c | |||
| @@ -0,0 +1,343 @@ | |||
| 1 | /* | ||
| 2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of the GNU General Public License | ||
| 6 | * as published by the Free Software Foundation, version 2. | ||
| 7 | * | ||
| 8 | * This program is distributed in the hope that it will be useful, but | ||
| 9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
| 11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
| 12 | * more details. | ||
| 13 | * | ||
| 14 | * TILE Huge TLB Page Support for Kernel. | ||
| 15 | * Taken from i386 hugetlb implementation: | ||
| 16 | * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> | ||
| 17 | */ | ||
| 18 | |||
| 19 | #include <linux/init.h> | ||
| 20 | #include <linux/fs.h> | ||
| 21 | #include <linux/mm.h> | ||
| 22 | #include <linux/hugetlb.h> | ||
| 23 | #include <linux/pagemap.h> | ||
| 24 | #include <linux/smp_lock.h> | ||
| 25 | #include <linux/slab.h> | ||
| 26 | #include <linux/err.h> | ||
| 27 | #include <linux/sysctl.h> | ||
| 28 | #include <linux/mman.h> | ||
| 29 | #include <asm/tlb.h> | ||
| 30 | #include <asm/tlbflush.h> | ||
| 31 | |||
| 32 | pte_t *huge_pte_alloc(struct mm_struct *mm, | ||
| 33 | unsigned long addr, unsigned long sz) | ||
| 34 | { | ||
| 35 | pgd_t *pgd; | ||
| 36 | pud_t *pud; | ||
| 37 | pte_t *pte = NULL; | ||
| 38 | |||
| 39 | /* We do not yet support multiple huge page sizes. */ | ||
| 40 | BUG_ON(sz != PMD_SIZE); | ||
| 41 | |||
| 42 | pgd = pgd_offset(mm, addr); | ||
| 43 | pud = pud_alloc(mm, pgd, addr); | ||
| 44 | if (pud) | ||
| 45 | pte = (pte_t *) pmd_alloc(mm, pud, addr); | ||
| 46 | BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); | ||
| 47 | |||
| 48 | return pte; | ||
| 49 | } | ||
| 50 | |||
| 51 | pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | ||
| 52 | { | ||
| 53 | pgd_t *pgd; | ||
| 54 | pud_t *pud; | ||
| 55 | pmd_t *pmd = NULL; | ||
| 56 | |||
| 57 | pgd = pgd_offset(mm, addr); | ||
| 58 | if (pgd_present(*pgd)) { | ||
| 59 | pud = pud_offset(pgd, addr); | ||
| 60 | if (pud_present(*pud)) | ||
| 61 | pmd = pmd_offset(pud, addr); | ||
| 62 | } | ||
| 63 | return (pte_t *) pmd; | ||
| 64 | } | ||
| 65 | |||
| 66 | #ifdef HUGETLB_TEST | ||
| 67 | struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, | ||
| 68 | int write) | ||
| 69 | { | ||
| 70 | unsigned long vpfn = address >> PAGE_SHIFT; | ||
| 71 | pte_t *pte; | ||
| 72 | struct page *page; | ||
| 73 | struct vm_area_struct *vma; | ||
| 74 | |||
| 75 | /* Only hugetlb VMAs are handled here. */ | ||
| 76 | vma = find_vma(mm, address); | ||
| 77 | if (!vma || !is_vm_hugetlb_page(vma)) | ||
| 78 | return ERR_PTR(-EINVAL); | ||
| 79 | |||
| 80 | pte = huge_pte_offset(mm, address); | ||
| 81 | |||
| 82 | /* hugetlb should be locked, and hence, prefaulted */ | ||
| 83 | WARN_ON(!pte || pte_none(*pte)); | ||
| 84 | |||
| 85 | page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; | ||
| 86 | |||
| 87 | WARN_ON(!PageHead(page)); | ||
| 88 | |||
| 89 | return page; | ||
| 90 | } | ||
| 91 | |||
| 92 | int pmd_huge(pmd_t pmd) | ||
| 93 | { | ||
| 94 | return 0; | ||
| 95 | } | ||
| 96 | |||
| 97 | int pud_huge(pud_t pud) | ||
| 98 | { | ||
| 99 | return 0; | ||
| 100 | } | ||
| 101 | |||
| 102 | struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, | ||
| 103 | pmd_t *pmd, int write) | ||
| 104 | { | ||
| 105 | return NULL; | ||
| 106 | } | ||
| 107 | |||
| 108 | #else | ||
| 109 | |||
| 110 | struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, | ||
| 111 | int write) | ||
| 112 | { | ||
| 113 | return ERR_PTR(-EINVAL); | ||
| 114 | } | ||
| 115 | |||
| 116 | int pmd_huge(pmd_t pmd) | ||
| 117 | { | ||
| 118 | return !!(pmd_val(pmd) & _PAGE_HUGE_PAGE); | ||
| 119 | } | ||
| 120 | |||
| 121 | int pud_huge(pud_t pud) | ||
| 122 | { | ||
| 123 | return !!(pud_val(pud) & _PAGE_HUGE_PAGE); | ||
| 124 | } | ||
| 125 | |||
| 126 | struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, | ||
| 127 | pmd_t *pmd, int write) | ||
| 128 | { | ||
| 129 | struct page *page; | ||
| 130 | |||
| 131 | page = pte_page(*(pte_t *)pmd); | ||
| 132 | if (page) | ||
| 133 | page += ((address & ~PMD_MASK) >> PAGE_SHIFT); | ||
| 134 | return page; | ||
| 135 | } | ||
| 136 | |||
| 137 | struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, | ||
| 138 | pud_t *pud, int write) | ||
| 139 | { | ||
| 140 | struct page *page; | ||
| 141 | |||
| 142 | page = pte_page(*(pte_t *)pud); | ||
| 143 | if (page) | ||
| 144 | page += ((address & ~PUD_MASK) >> PAGE_SHIFT); | ||
| 145 | return page; | ||
| 146 | } | ||
| 147 | |||
| 148 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | ||
| 149 | { | ||
| 150 | return 0; | ||
| 151 | } | ||
| 152 | |||
| 153 | #endif | ||
| 154 | |||
| 155 | #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA | ||
| 156 | static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, | ||
| 157 | unsigned long addr, unsigned long len, | ||
| 158 | unsigned long pgoff, unsigned long flags) | ||
| 159 | { | ||
| 160 | struct hstate *h = hstate_file(file); | ||
| 161 | struct mm_struct *mm = current->mm; | ||
| 162 | struct vm_area_struct *vma; | ||
| 163 | unsigned long start_addr; | ||
| 164 | |||
| 165 | if (len > mm->cached_hole_size) { | ||
| 166 | start_addr = mm->free_area_cache; | ||
| 167 | } else { | ||
| 168 | start_addr = TASK_UNMAPPED_BASE; | ||
| 169 | mm->cached_hole_size = 0; | ||
| 170 | } | ||
| 171 | |||
| 172 | full_search: | ||
| 173 | addr = ALIGN(start_addr, huge_page_size(h)); | ||
| 174 | |||
| 175 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | ||
| 176 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
| 177 | if (TASK_SIZE - len < addr) { | ||
| 178 | /* | ||
| 179 | * Start a new search - just in case we missed | ||
| 180 | * some holes. | ||
| 181 | */ | ||
| 182 | if (start_addr != TASK_UNMAPPED_BASE) { | ||
| 183 | start_addr = TASK_UNMAPPED_BASE; | ||
| 184 | mm->cached_hole_size = 0; | ||
| 185 | goto full_search; | ||
| 186 | } | ||
| 187 | return -ENOMEM; | ||
| 188 | } | ||
| 189 | if (!vma || addr + len <= vma->vm_start) { | ||
| 190 | mm->free_area_cache = addr + len; | ||
| 191 | return addr; | ||
| 192 | } | ||
| 193 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
| 194 | mm->cached_hole_size = vma->vm_start - addr; | ||
| 195 | addr = ALIGN(vma->vm_end, huge_page_size(h)); | ||
| 196 | } | ||
| 197 | } | ||
| 198 | |||
| 199 | static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, | ||
| 200 | unsigned long addr0, unsigned long len, | ||
| 201 | unsigned long pgoff, unsigned long flags) | ||
| 202 | { | ||
| 203 | struct hstate *h = hstate_file(file); | ||
| 204 | struct mm_struct *mm = current->mm; | ||
| 205 | struct vm_area_struct *vma, *prev_vma; | ||
| 206 | unsigned long base = mm->mmap_base, addr = addr0; | ||
| 207 | unsigned long largest_hole = mm->cached_hole_size; | ||
| 208 | int first_time = 1; | ||
| 209 | |||
| 210 | /* don't allow allocations above current base */ | ||
| 211 | if (mm->free_area_cache > base) | ||
| 212 | mm->free_area_cache = base; | ||
| 213 | |||
| 214 | if (len <= largest_hole) { | ||
| 215 | largest_hole = 0; | ||
| 216 | mm->free_area_cache = base; | ||
| 217 | } | ||
| 218 | try_again: | ||
| 219 | /* make sure it can fit in the remaining address space */ | ||
| 220 | if (mm->free_area_cache < len) | ||
| 221 | goto fail; | ||
| 222 | |||
| 223 | /* either no address requested or can't fit in requested address hole */ | ||
| 224 | addr = (mm->free_area_cache - len) & huge_page_mask(h); | ||
| 225 | do { | ||
| 226 | /* | ||
| 227 | * Lookup failure means no vma is above this address, | ||
| 228 | * i.e. return with success: | ||
| 229 | */ | ||
| 230 | vma = find_vma_prev(mm, addr, &prev_vma); | ||
| 231 | if (!vma) | ||
| 232 | return addr; | ||
| 235 | |||
| 236 | /* | ||
| 237 | * new region fits between prev_vma->vm_end and | ||
| 238 | * vma->vm_start, use it: | ||
| 239 | */ | ||
| 240 | if (addr + len <= vma->vm_start && | ||
| 241 | (!prev_vma || (addr >= prev_vma->vm_end))) { | ||
| 242 | /* remember the address as a hint for next time */ | ||
| 243 | mm->cached_hole_size = largest_hole; | ||
| 244 | mm->free_area_cache = addr; | ||
| 245 | return addr; | ||
| 246 | } else { | ||
| 247 | /* pull free_area_cache down to the first hole */ | ||
| 248 | if (mm->free_area_cache == vma->vm_end) { | ||
| 249 | mm->free_area_cache = vma->vm_start; | ||
| 250 | mm->cached_hole_size = largest_hole; | ||
| 251 | } | ||
| 252 | } | ||
| 253 | |||
| 254 | /* remember the largest hole we saw so far */ | ||
| 255 | if (addr + largest_hole < vma->vm_start) | ||
| 256 | largest_hole = vma->vm_start - addr; | ||
| 257 | |||
| 258 | /* try just below the current vma->vm_start */ | ||
| 259 | addr = (vma->vm_start - len) & huge_page_mask(h); | ||
| 260 | |||
| 261 | } while (len <= vma->vm_start); | ||
| 262 | |||
| 263 | fail: | ||
| 264 | /* | ||
| 265 | * if hint left us with no space for the requested | ||
| 266 | * mapping then try again: | ||
| 267 | */ | ||
| 268 | if (first_time) { | ||
| 269 | mm->free_area_cache = base; | ||
| 270 | largest_hole = 0; | ||
| 271 | first_time = 0; | ||
| 272 | goto try_again; | ||
| 273 | } | ||
| 274 | /* | ||
| 275 | * A failed mmap() very likely causes application failure, | ||
| 276 | * so fall back to the bottom-up function here. This scenario | ||
| 277 | * can happen with large stack limits and large mmap() | ||
| 278 | * allocations. | ||
| 279 | */ | ||
| 280 | mm->free_area_cache = TASK_UNMAPPED_BASE; | ||
| 281 | mm->cached_hole_size = ~0UL; | ||
| 282 | addr = hugetlb_get_unmapped_area_bottomup(file, addr0, | ||
| 283 | len, pgoff, flags); | ||
| 284 | |||
| 285 | /* | ||
| 286 | * Restore the topdown base: | ||
| 287 | */ | ||
| 288 | mm->free_area_cache = base; | ||
| 289 | mm->cached_hole_size = ~0UL; | ||
| 290 | |||
| 291 | return addr; | ||
| 292 | } | ||
| 293 | |||
| 294 | unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, | ||
| 295 | unsigned long len, unsigned long pgoff, unsigned long flags) | ||
| 296 | { | ||
| 297 | struct hstate *h = hstate_file(file); | ||
| 298 | struct mm_struct *mm = current->mm; | ||
| 299 | struct vm_area_struct *vma; | ||
| 300 | |||
| 301 | if (len & ~huge_page_mask(h)) | ||
| 302 | return -EINVAL; | ||
| 303 | if (len > TASK_SIZE) | ||
| 304 | return -ENOMEM; | ||
| 305 | |||
| 306 | if (flags & MAP_FIXED) { | ||
| 307 | if (prepare_hugepage_range(file, addr, len)) | ||
| 308 | return -EINVAL; | ||
| 309 | return addr; | ||
| 310 | } | ||
| 311 | |||
| 312 | if (addr) { | ||
| 313 | addr = ALIGN(addr, huge_page_size(h)); | ||
| 314 | vma = find_vma(mm, addr); | ||
| 315 | if (TASK_SIZE - len >= addr && | ||
| 316 | (!vma || addr + len <= vma->vm_start)) | ||
| 317 | return addr; | ||
| 318 | } | ||
| 319 | if (current->mm->get_unmapped_area == arch_get_unmapped_area) | ||
| 320 | return hugetlb_get_unmapped_area_bottomup(file, addr, len, | ||
| 321 | pgoff, flags); | ||
| 322 | else | ||
| 323 | return hugetlb_get_unmapped_area_topdown(file, addr, len, | ||
| 324 | pgoff, flags); | ||
| 325 | } | ||
| 326 | |||
| 327 | static __init int setup_hugepagesz(char *opt) | ||
| 328 | { | ||
| 329 | unsigned long ps = memparse(opt, &opt); | ||
| 330 | if (ps == PMD_SIZE) { | ||
| 331 | hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); | ||
| 332 | } else if (ps == PUD_SIZE) { | ||
| 333 | hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); | ||
| 334 | } else { | ||
| 335 | printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", | ||
| 336 | ps >> 20); | ||
| 337 | return 0; | ||
| 338 | } | ||
| 339 | return 1; | ||
| 340 | } | ||
| 341 | __setup("hugepagesz=", setup_hugepagesz); | ||
| 342 | |||
| 343 | #endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ | ||
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c new file mode 100644 index 000000000000..125ac53b60fc --- /dev/null +++ b/arch/tile/mm/init.c | |||
| @@ -0,0 +1,1082 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 1995 Linus Torvalds | ||
| 3 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU General Public License | ||
| 7 | * as published by the Free Software Foundation, version 2. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, but | ||
| 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
| 12 | * NON INFRINGEMENT. See the GNU General Public License for | ||
| 13 | * more details. | ||
| 14 | */ | ||
| 15 | |||
| 16 | #include <linux/module.h> | ||
| 17 | #include <linux/signal.h> | ||
| 18 | #include <linux/sched.h> | ||
| 19 | #include <linux/kernel.h> | ||
| 20 | #include <linux/errno.h> | ||
| 21 | #include <linux/string.h> | ||
| 22 | #include <linux/types.h> | ||
| 23 | #include <linux/ptrace.h> | ||
| 24 | #include <linux/mman.h> | ||
| 25 | #include <linux/mm.h> | ||
| 26 | #include <linux/hugetlb.h> | ||
| 27 | #include <linux/swap.h> | ||
| 28 | #include <linux/smp.h> | ||
| 29 | #include <linux/init.h> | ||
| 30 | #include <linux/highmem.h> | ||
| 31 | #include <linux/pagemap.h> | ||
| 32 | #include <linux/poison.h> | ||
| 33 | #include <linux/bootmem.h> | ||
| 34 | #include <linux/slab.h> | ||
| 35 | #include <linux/proc_fs.h> | ||
| 36 | #include <linux/efi.h> | ||
| 37 | #include <linux/memory_hotplug.h> | ||
| 38 | #include <linux/uaccess.h> | ||
| 39 | #include <asm/mmu_context.h> | ||
| 40 | #include <asm/processor.h> | ||
| 41 | #include <asm/system.h> | ||
| 42 | #include <asm/pgtable.h> | ||
| 43 | #include <asm/pgalloc.h> | ||
| 44 | #include <asm/dma.h> | ||
| 45 | #include <asm/fixmap.h> | ||
| 46 | #include <asm/tlb.h> | ||
| 47 | #include <asm/tlbflush.h> | ||
| 48 | #include <asm/sections.h> | ||
| 49 | #include <asm/setup.h> | ||
| 50 | #include <asm/homecache.h> | ||
| 51 | #include <hv/hypervisor.h> | ||
| 52 | #include <arch/chip.h> | ||
| 53 | |||
| 54 | #include "migrate.h" | ||
| 55 | |||
| 56 | /* | ||
| 57 | * We could set FORCE_MAX_ZONEORDER to "(HPAGE_SHIFT - PAGE_SHIFT + 1)" | ||
| 58 | * in the Tile Kconfig, but this generates configure warnings. | ||
| 59 | * Do it here and force people to get it right to compile this file. | ||
| 60 | * The problem is that with 4KB small pages and 16MB huge pages, | ||
| 61 | * the default value doesn't allow us to group enough small pages | ||
| 62 | * together to make up a huge page. | ||
| 63 | */ | ||
| 64 | #if CONFIG_FORCE_MAX_ZONEORDER < HPAGE_SHIFT - PAGE_SHIFT + 1 | ||
| 65 | # error "Change FORCE_MAX_ZONEORDER in arch/tile/Kconfig to match page size" | ||
| 66 | #endif | ||
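[Editorial worked example of the check above, for the common configuration of 4KB small pages (PAGE_SHIFT = 12) and 16MB huge pages (HPAGE_SHIFT = 24):

	HPAGE_SHIFT - PAGE_SHIFT + 1 = 24 - 12 + 1 = 13

so FORCE_MAX_ZONEORDER must be at least 13, which permits order-12 allocations of 2^12 * 4KB = 16MB, exactly enough small pages grouped together to make up one huge page.]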
| 67 | |||
| 68 | #define clear_pgd(pmdptr) (*(pmdptr) = hv_pte(0)) | ||
| 69 | |||
| 70 | unsigned long VMALLOC_RESERVE = CONFIG_VMALLOC_RESERVE; | ||
| 71 | |||
| 72 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | ||
| 73 | |||
| 74 | /* Create an L2 page table */ | ||
| 75 | static pte_t * __init alloc_pte(void) | ||
| 76 | { | ||
| 77 | return __alloc_bootmem(L2_KERNEL_PGTABLE_SIZE, HV_PAGE_TABLE_ALIGN, 0); | ||
| 78 | } | ||
| 79 | |||
| 80 | /* | ||
| 81 | * L2 page tables per controller. We allocate these all at once from | ||
| 82 | * the bootmem allocator and store them here. This saves on kernel L2 | ||
| 83 | * page table memory, compared to allocating a full 64K page per L2 | ||
| 84 | * page table, and also means that in cases where we use huge pages, | ||
| 85 | * we are guaranteed to later be able to shatter those huge pages and | ||
| 86 | * switch to using these page tables instead, without requiring | ||
| 87 | * further allocation. Each l2_ptes[] entry points to the first page | ||
| 88 | * table for the first hugepage-size piece of memory on the | ||
| 89 | * controller; other page tables are just indexed directly, i.e. the | ||
| 90 | * L2 page tables are contiguous in memory for each controller. | ||
| 91 | */ | ||
| 92 | static pte_t *l2_ptes[MAX_NUMNODES]; | ||
| 93 | static int num_l2_ptes[MAX_NUMNODES]; | ||
| 94 | |||
| 95 | static void init_prealloc_ptes(int node, int pages) | ||
| 96 | { | ||
| 97 | BUG_ON(pages & (HV_L2_ENTRIES-1)); | ||
| 98 | if (pages) { | ||
| 99 | num_l2_ptes[node] = pages; | ||
| 100 | l2_ptes[node] = __alloc_bootmem(pages * sizeof(pte_t), | ||
| 101 | HV_PAGE_TABLE_ALIGN, 0); | ||
| 102 | } | ||
| 103 | } | ||
| 104 | |||
| 105 | pte_t *get_prealloc_pte(unsigned long pfn) | ||
| 106 | { | ||
| 107 | int node = pfn_to_nid(pfn); | ||
| 108 | pfn &= ~(-1UL << (NR_PA_HIGHBIT_SHIFT - PAGE_SHIFT)); | ||
| 109 | BUG_ON(node >= MAX_NUMNODES); | ||
| 110 | BUG_ON(pfn >= num_l2_ptes[node]); | ||
| 111 | return &l2_ptes[node][pfn]; | ||
| 112 | } | ||
| 113 | |||
| 114 | /* | ||
| 115 | * What caching do we expect pages from the heap to have when | ||
| 116 | * they are allocated during bootup? (Once we've installed the | ||
| 117 | * "real" swapper_pg_dir.) | ||
| 118 | */ | ||
| 119 | static int initial_heap_home(void) | ||
| 120 | { | ||
| 121 | #if CHIP_HAS_CBOX_HOME_MAP() | ||
| 122 | if (hash_default) | ||
| 123 | return PAGE_HOME_HASH; | ||
| 124 | #endif | ||
| 125 | return smp_processor_id(); | ||
| 126 | } | ||
| 127 | |||
| 128 | /* | ||
| 129 | * Place a pointer to an L2 page table in a middle page | ||
| 130 | * directory entry. | ||
| 131 | */ | ||
| 132 | static void __init assign_pte(pmd_t *pmd, pte_t *page_table) | ||
| 133 | { | ||
| 134 | phys_addr_t pa = __pa(page_table); | ||
| 135 | unsigned long l2_ptfn = pa >> HV_LOG2_PAGE_TABLE_ALIGN; | ||
| 136 | pte_t pteval = hv_pte_set_ptfn(__pgprot(_PAGE_TABLE), l2_ptfn); | ||
| 137 | BUG_ON((pa & (HV_PAGE_TABLE_ALIGN-1)) != 0); | ||
| 138 | pteval = pte_set_home(pteval, initial_heap_home()); | ||
| 139 | *(pte_t *)pmd = pteval; | ||
| 140 | if (page_table != (pte_t *)pmd_page_vaddr(*pmd)) | ||
| 141 | BUG(); | ||
| 142 | } | ||
| 143 | |||
| 144 | #ifdef __tilegx__ | ||
| 145 | |||
| 146 | #if HV_L1_SIZE != HV_L2_SIZE | ||
| 147 | # error Rework assumption that L1 and L2 page tables are same size. | ||
| 148 | #endif | ||
| 149 | |||
| 150 | /* Since pmd_t arrays and pte_t arrays are the same size, just use casts. */ | ||
| 151 | static inline pmd_t *alloc_pmd(void) | ||
| 152 | { | ||
| 153 | return (pmd_t *)alloc_pte(); | ||
| 154 | } | ||
| 155 | |||
| 156 | static inline void assign_pmd(pud_t *pud, pmd_t *pmd) | ||
| 157 | { | ||
| 158 | assign_pte((pmd_t *)pud, (pte_t *)pmd); | ||
| 159 | } | ||
| 160 | |||
| 161 | #endif /* __tilegx__ */ | ||
| 162 | |||
| 163 | /* Replace the given pmd with a full PTE table. */ | ||
| 164 | void __init shatter_pmd(pmd_t *pmd) | ||
| 165 | { | ||
| 166 | pte_t *pte = get_prealloc_pte(pte_pfn(*(pte_t *)pmd)); | ||
| 167 | assign_pte(pmd, pte); | ||
| 168 | } | ||
| 169 | |||
| 170 | #ifdef CONFIG_HIGHMEM | ||
| 171 | /* | ||
| 172 | * This function initializes a certain range of kernel virtual memory | ||
| 173 | * with new bootmem page tables, everywhere page tables are missing in | ||
| 174 | * the given range. | ||
| 175 | */ | ||
| 176 | |||
| 177 | /* | ||
| 178 | * NOTE: The pagetables are allocated contiguously in physical memory, | ||
| 179 | * so we can cache the location of the first one and move around without | ||
| 180 | * checking the pgd every time. | ||
| 181 | */ | ||
| 182 | static void __init page_table_range_init(unsigned long start, | ||
| 183 | unsigned long end, pgd_t *pgd_base) | ||
| 184 | { | ||
| 185 | pgd_t *pgd; | ||
| 186 | int pgd_idx; | ||
| 187 | unsigned long vaddr; | ||
| 188 | |||
| 189 | vaddr = start; | ||
| 190 | pgd_idx = pgd_index(vaddr); | ||
| 191 | pgd = pgd_base + pgd_idx; | ||
| 192 | |||
| 193 | for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { | ||
| 194 | pmd_t *pmd = pmd_offset(pud_offset(pgd, vaddr), vaddr); | ||
| 195 | if (pmd_none(*pmd)) | ||
| 196 | assign_pte(pmd, alloc_pte()); | ||
| 197 | vaddr += PMD_SIZE; | ||
| 198 | } | ||
| 199 | } | ||
| 200 | #endif /* CONFIG_HIGHMEM */ | ||
| 201 | |||
| 202 | |||
| 203 | #if CHIP_HAS_CBOX_HOME_MAP() | ||
| 204 | |||
| 205 | static int __initdata ktext_hash = 1; /* .text pages */ | ||
| 206 | static int __initdata kdata_hash = 1; /* .data and .bss pages */ | ||
| 207 | int __write_once hash_default = 1; /* kernel allocator pages */ | ||
| 208 | EXPORT_SYMBOL(hash_default); | ||
| 209 | int __write_once kstack_hash = 1; /* if no homecaching, use hash-for-home */ | ||
| 210 | #endif /* CHIP_HAS_CBOX_HOME_MAP */ | ||
| 211 | |||
| 212 | /* | ||
| 213 | * CPUs to use for striping the pages of kernel data. If hash-for-home | ||
| 214 | * is available, this is only relevant if kcache_hash sets up the | ||
| 215 | * .data and .bss to be page-homed, and we don't want the default mode | ||
| 216 | * of using the full set of kernel cpus for the striping. | ||
| 217 | */ | ||
| 218 | static __initdata struct cpumask kdata_mask; | ||
| 219 | static __initdata int kdata_arg_seen; | ||
| 220 | |||
| 221 | int __write_once kdata_huge; /* if no homecaching, small pages */ | ||
| 222 | |||
| 223 | |||
| 224 | /* Combine a generic pgprot_t with cache home to get a cache-aware pgprot. */ | ||
| 225 | static pgprot_t __init construct_pgprot(pgprot_t prot, int home) | ||
| 226 | { | ||
| 227 | prot = pte_set_home(prot, home); | ||
| 228 | #if CHIP_HAS_CBOX_HOME_MAP() | ||
| 229 | if (home == PAGE_HOME_IMMUTABLE) { | ||
| 230 | if (ktext_hash) | ||
| 231 | prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_HASH_L3); | ||
| 232 | else | ||
| 233 | prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_NO_L3); | ||
| 234 | } | ||
| 235 | #endif | ||
| 236 | return prot; | ||
| 237 | } | ||
| 238 | |||
| 239 | /* | ||
| 240 | * For a given kernel data VA, how should it be cached? | ||
| 241 | * We return the complete pgprot_t with caching bits set. | ||
| 242 | */ | ||
| 243 | static pgprot_t __init init_pgprot(ulong address) | ||
| 244 | { | ||
| 245 | int cpu; | ||
| 246 | unsigned long page; | ||
| 247 | enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET }; | ||
| 248 | |||
| 249 | #if CHIP_HAS_CBOX_HOME_MAP() | ||
| 250 | /* For kdata=huge, everything is just hash-for-home. */ | ||
| 251 | if (kdata_huge) | ||
| 252 | return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH); | ||
| 253 | #endif | ||
| 254 | |||
| 255 | /* We map the aliased pages of permanent text inaccessible. */ | ||
| 256 | if (address < (ulong) _sinittext - CODE_DELTA) | ||
| 257 | return PAGE_NONE; | ||
| 258 | |||
| 259 | /* | ||
| 260 | * We map read-only data non-coherent for performance. We could | ||
| 261 | * use neighborhood caching on TILE64, but it's not clear it's a win. | ||
| 262 | */ | ||
| 263 | if ((address >= (ulong) __start_rodata && | ||
| 264 | address < (ulong) __end_rodata) || | ||
| 265 | address == (ulong) empty_zero_page) { | ||
| 266 | return construct_pgprot(PAGE_KERNEL_RO, PAGE_HOME_IMMUTABLE); | ||
| 267 | } | ||
| 268 | |||
| 269 | /* As a performance optimization, keep the boot init stack here. */ | ||
| 270 | if (address >= (ulong)&init_thread_union && | ||
| 271 | address < (ulong)&init_thread_union + THREAD_SIZE) | ||
| 272 | return construct_pgprot(PAGE_KERNEL, smp_processor_id()); | ||
| 273 | |||
| 274 | #ifndef __tilegx__ | ||
| 275 | #if !ATOMIC_LOCKS_FOUND_VIA_TABLE() | ||
| 276 | /* Force the atomic_locks[] array page to be hash-for-home. */ | ||
| 277 | if (address == (ulong) atomic_locks) | ||
| 278 | return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH); | ||
| 279 | #endif | ||
| 280 | #endif | ||
| 281 | |||
| 282 | /* | ||
| 283 | * Everything else that isn't data or bss is heap, so mark it | ||
| 284 | * with the initial heap home (hash-for-home, or this cpu). This | ||
| 285 | * includes any addresses after the loaded image; any address before | ||
| 286 | * _einittext (since we already captured the case of text before | ||
| 287 | * _sinittext); and any init-data pages. | ||
| 288 | * | ||
| 289 | * All the LOWMEM pages that we mark this way will get their | ||
| 290 | * struct page homecache properly marked later, in set_page_homes(). | ||
| 291 | * The HIGHMEM pages are left with a default home of zero, but | ||
| 292 | * since their free_time is also zero we don't have to do an | ||
| 293 | * actual flush action the first time we use them, either. | ||
| 294 | */ | ||
| 295 | if (address >= (ulong) _end || address < (ulong) _sdata || | ||
| 296 | (address >= (ulong) _sinitdata && | ||
| 297 | address < (ulong) _einitdata)) | ||
| 298 | return construct_pgprot(PAGE_KERNEL, initial_heap_home()); | ||
| 299 | |||
| 300 | #if CHIP_HAS_CBOX_HOME_MAP() | ||
| 301 | /* Use hash-for-home if requested for data/bss. */ | ||
| 302 | if (kdata_hash) | ||
| 303 | return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH); | ||
| 304 | #endif | ||
| 305 | |||
| 306 | /* | ||
| 307 | * Otherwise we just hand out consecutive cpus. To avoid | ||
| 308 | * requiring this function to hold state, we just walk forward from | ||
| 309 | * _sdata by PAGE_SIZE, skipping the readonly and init data, to reach | ||
| 310 | * the requested address, while walking cpu home around kdata_mask. | ||
| 311 | * This is typically no more than a dozen or so iterations. | ||
| 312 | */ | ||
| 313 | BUG_ON(_einitdata != __bss_start); | ||
| 314 | for (page = (ulong)_sdata, cpu = NR_CPUS; ; ) { | ||
| 315 | cpu = cpumask_next(cpu, &kdata_mask); | ||
| 316 | if (cpu == NR_CPUS) | ||
| 317 | cpu = cpumask_first(&kdata_mask); | ||
| 318 | if (page >= address) | ||
| 319 | break; | ||
| 320 | page += PAGE_SIZE; | ||
| 321 | if (page == (ulong)__start_rodata) | ||
| 322 | page = (ulong)__end_rodata; | ||
| 323 | if (page == (ulong)&init_thread_union) | ||
| 324 | page += THREAD_SIZE; | ||
| 325 | if (page == (ulong)_sinitdata) | ||
| 326 | page = (ulong)_einitdata; | ||
| 327 | if (page == (ulong)empty_zero_page) | ||
| 328 | page += PAGE_SIZE; | ||
| 329 | #ifndef __tilegx__ | ||
| 330 | #if !ATOMIC_LOCKS_FOUND_VIA_TABLE() | ||
| 331 | if (page == (ulong)atomic_locks) | ||
| 332 | page += PAGE_SIZE; | ||
| 333 | #endif | ||
| 334 | #endif | ||
| 335 | |||
| 336 | } | ||
| 337 | return construct_pgprot(PAGE_KERNEL, cpu); | ||
| 338 | } | ||
| 339 | |||
| 340 | /* | ||
| 341 | * This function sets up how we cache the kernel text. If we have | ||
| 342 | * hash-for-home support, normally that is used instead (see the | ||
| 343 | * kcache_hash boot flag for more information). But if we end up | ||
| 344 | * using a page-based caching technique, this option sets up the | ||
| 345 | * details of that. In addition, the "ktext=nocache" option may | ||
| 346 | * always be used to disable local caching of text pages, if desired. | ||
| 347 | */ | ||
| 348 | |||
| 349 | static int __initdata ktext_arg_seen; | ||
| 350 | static int __initdata ktext_small; | ||
| 351 | static int __initdata ktext_local; | ||
| 352 | static int __initdata ktext_all; | ||
| 353 | static int __initdata ktext_nondataplane; | ||
| 354 | static int __initdata ktext_nocache; | ||
| 355 | static struct cpumask __initdata ktext_mask; | ||
| 356 | |||
| 357 | static int __init setup_ktext(char *str) | ||
| 358 | { | ||
| 359 | if (str == NULL) | ||
| 360 | return -EINVAL; | ||
| 361 | |||
| 362 | /* If you have a leading "nocache", turn off ktext caching */ | ||
| 363 | if (strncmp(str, "nocache", 7) == 0) { | ||
| 364 | ktext_nocache = 1; | ||
| 365 | printk("ktext: disabling local caching of kernel text\n"); | ||
| 366 | str += 7; | ||
| 367 | if (*str == ',') | ||
| 368 | ++str; | ||
| 369 | if (*str == '\0') | ||
| 370 | return 0; | ||
| 371 | } | ||
| 372 | |||
| 373 | ktext_arg_seen = 1; | ||
| 374 | |||
| 375 | /* Default setting on Tile64: use a huge page */ | ||
| 376 | if (strcmp(str, "huge") == 0) | ||
| 377 | printk("ktext: using one huge locally cached page\n"); | ||
| 378 | |||
| 379 | /* Pay TLB cost but get no cache benefit: cache small pages locally */ | ||
| 380 | else if (strcmp(str, "local") == 0) { | ||
| 381 | ktext_small = 1; | ||
| 382 | ktext_local = 1; | ||
| 383 | printk("ktext: using small pages with local caching\n"); | ||
| 384 | } | ||
| 385 | |||
| 386 | /* Neighborhood cache ktext pages on all cpus. */ | ||
| 387 | else if (strcmp(str, "all") == 0) { | ||
| 388 | ktext_small = 1; | ||
| 389 | ktext_all = 1; | ||
| 390 | printk("ktext: using maximal caching neighborhood\n"); | ||
| 391 | } | ||
| 392 | |||
| 393 | |||
| 394 | /* Neighborhood ktext pages on specified mask */ | ||
| 395 | else if (cpulist_parse(str, &ktext_mask) == 0) { | ||
| 396 | char buf[NR_CPUS * 5]; | ||
| 397 | cpulist_scnprintf(buf, sizeof(buf), &ktext_mask); | ||
| 398 | if (cpumask_weight(&ktext_mask) > 1) { | ||
| 399 | ktext_small = 1; | ||
| 400 | printk("ktext: using caching neighborhood %s " | ||
| 401 | "with small pages\n", buf); | ||
| 402 | } else { | ||
| 403 | printk("ktext: caching on cpu %s with one huge page\n", | ||
| 404 | buf); | ||
| 405 | } | ||
| 406 | } | ||
| 407 | |||
| 408 | else if (*str) | ||
| 409 | return -EINVAL; | ||
| 410 | |||
| 411 | return 0; | ||
| 412 | } | ||
| 413 | |||
| 414 | early_param("ktext", setup_ktext); | ||
| 415 | |||
| 416 | |||
| 417 | static inline pgprot_t ktext_set_nocache(pgprot_t prot) | ||
| 418 | { | ||
| 419 | if (!ktext_nocache) | ||
| 420 | prot = hv_pte_set_nc(prot); | ||
| 421 | #if CHIP_HAS_NC_AND_NOALLOC_BITS() | ||
| 422 | else | ||
| 423 | prot = hv_pte_set_no_alloc_l2(prot); | ||
| 424 | #endif | ||
| 425 | return prot; | ||
| 426 | } | ||
| 427 | |||
| 428 | #ifndef __tilegx__ | ||
| 429 | static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va) | ||
| 430 | { | ||
| 431 | return pmd_offset(pud_offset(&pgtables[pgd_index(va)], va), va); | ||
| 432 | } | ||
| 433 | #else | ||
| 434 | static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va) | ||
| 435 | { | ||
| 436 | pud_t *pud = pud_offset(&pgtables[pgd_index(va)], va); | ||
| 437 | if (pud_none(*pud)) | ||
| 438 | assign_pmd(pud, alloc_pmd()); | ||
| 439 | return pmd_offset(pud, va); | ||
| 440 | } | ||
| 441 | #endif | ||
| 442 | |||
| 443 | /* Temporary page table we use for staging. */ | ||
| 444 | static pgd_t pgtables[PTRS_PER_PGD] | ||
| 445 | __attribute__((section(".init.page"))); | ||
| 446 | |||
| 447 | /* | ||
| 448 | * This maps the physical memory to kernel virtual address space, a total | ||
| 449 | * of max_low_pfn pages, by creating page tables starting from address | ||
| 450 | * PAGE_OFFSET. | ||
| 451 | * | ||
| 452 | * This routine transitions us from using a set of compiled-in large | ||
| 453 | * pages to using some more precise caching, including removing access | ||
| 454 | * to code pages mapped at PAGE_OFFSET (executed only at MEM_SV_INTRPT), | ||
| 455 | * marking read-only data as locally cacheable, striping the remaining | ||
| 456 | * .data and .bss across all the available tiles, and removing access | ||
| 457 | * to pages above the top of RAM (thus ensuring a page fault from a bad | ||
| 458 | * virtual address rather than a hypervisor shoot down for accessing | ||
| 459 | * memory outside the assigned limits). | ||
| 460 | */ | ||
| 461 | static void __init kernel_physical_mapping_init(pgd_t *pgd_base) | ||
| 462 | { | ||
| 463 | unsigned long address, pfn; | ||
| 464 | pmd_t *pmd; | ||
| 465 | pte_t *pte; | ||
| 466 | int pte_ofs; | ||
| 467 | const struct cpumask *my_cpu_mask = cpumask_of(smp_processor_id()); | ||
| 468 | struct cpumask kstripe_mask; | ||
| 469 | int rc, i; | ||
| 470 | |||
| 471 | #if CHIP_HAS_CBOX_HOME_MAP() | ||
| 472 | if (ktext_arg_seen && ktext_hash) { | ||
| 473 | printk("warning: \"ktext\" boot argument ignored" | ||
| 474 | " if \"kcache_hash\" sets up text hash-for-home\n"); | ||
| 475 | ktext_small = 0; | ||
| 476 | } | ||
| 477 | |||
| 478 | if (kdata_arg_seen && kdata_hash) { | ||
| 479 | printk("warning: \"kdata\" boot argument ignored" | ||
| 480 | " if \"kcache_hash\" sets up data hash-for-home\n"); | ||
| 481 | } | ||
| 482 | |||
| 483 | if (kdata_huge && !hash_default) { | ||
| 484 | printk("warning: disabling \"kdata=huge\"; requires" | ||
| 485 | " kcache_hash=all or =allbutstack\n"); | ||
| 486 | kdata_huge = 0; | ||
| 487 | } | ||
| 488 | #endif | ||
| 489 | |||
| 490 | /* | ||
| 491 | * Set up a mask for cpus to use for kernel striping. | ||
| 492 | * This is normally all cpus, but minus dataplane cpus if any. | ||
| 493 | * If the dataplane covers the whole chip, we stripe over | ||
| 494 | * the whole chip too. | ||
| 495 | */ | ||
| 496 | cpumask_copy(&kstripe_mask, cpu_possible_mask); | ||
| 497 | if (!kdata_arg_seen) | ||
| 498 | kdata_mask = kstripe_mask; | ||
| 499 | |||
| 500 | /* Allocate and fill in L2 page tables */ | ||
| 501 | for (i = 0; i < MAX_NUMNODES; ++i) { | ||
| 502 | #ifdef CONFIG_HIGHMEM | ||
| 503 | unsigned long end_pfn = node_lowmem_end_pfn[i]; | ||
| 504 | #else | ||
| 505 | unsigned long end_pfn = node_end_pfn[i]; | ||
| 506 | #endif | ||
| 507 | unsigned long end_huge_pfn = 0; | ||
| 508 | |||
| 509 | /* Pre-shatter the last huge page to allow per-cpu pages. */ | ||
| 510 | if (kdata_huge) | ||
| 511 | end_huge_pfn = end_pfn - (HPAGE_SIZE >> PAGE_SHIFT); | ||
| 512 | |||
| 513 | pfn = node_start_pfn[i]; | ||
| 514 | |||
| 515 | /* Allocate enough memory to hold L2 page tables for node. */ | ||
| 516 | init_prealloc_ptes(i, end_pfn - pfn); | ||
| 517 | |||
| 518 | address = (unsigned long) pfn_to_kaddr(pfn); | ||
| 519 | while (pfn < end_pfn) { | ||
| 520 | BUG_ON(address & (HPAGE_SIZE-1)); | ||
| 521 | pmd = get_pmd(pgtables, address); | ||
| 522 | pte = get_prealloc_pte(pfn); | ||
| 523 | if (pfn < end_huge_pfn) { | ||
| 524 | pgprot_t prot = init_pgprot(address); | ||
| 525 | *(pte_t *)pmd = pte_mkhuge(pfn_pte(pfn, prot)); | ||
| 526 | for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE; | ||
| 527 | pfn++, pte_ofs++, address += PAGE_SIZE) | ||
| 528 | pte[pte_ofs] = pfn_pte(pfn, prot); | ||
| 529 | } else { | ||
| 530 | if (kdata_huge) | ||
| 531 | printk(KERN_DEBUG "pre-shattered huge" | ||
| 532 | " page at %#lx\n", address); | ||
| 533 | for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE; | ||
| 534 | pfn++, pte_ofs++, address += PAGE_SIZE) { | ||
| 535 | pgprot_t prot = init_pgprot(address); | ||
| 536 | pte[pte_ofs] = pfn_pte(pfn, prot); | ||
| 537 | } | ||
| 538 | assign_pte(pmd, pte); | ||
| 539 | } | ||
| 540 | } | ||
| 541 | } | ||
| 542 | |||
| 543 | /* | ||
| 544 | * Set or check ktext_map now that we have cpu_possible_mask | ||
| 545 | * and kstripe_mask to work with. | ||
| 546 | */ | ||
| 547 | if (ktext_all) | ||
| 548 | cpumask_copy(&ktext_mask, cpu_possible_mask); | ||
| 549 | else if (ktext_nondataplane) | ||
| 550 | ktext_mask = kstripe_mask; | ||
| 551 | else if (!cpumask_empty(&ktext_mask)) { | ||
| 552 | /* Sanity-check any mask that was requested */ | ||
| 553 | struct cpumask bad; | ||
| 554 | cpumask_andnot(&bad, &ktext_mask, cpu_possible_mask); | ||
| 555 | cpumask_and(&ktext_mask, &ktext_mask, cpu_possible_mask); | ||
| 556 | if (!cpumask_empty(&bad)) { | ||
| 557 | char buf[NR_CPUS * 5]; | ||
| 558 | cpulist_scnprintf(buf, sizeof(buf), &bad); | ||
| 559 | printk("ktext: not using unavailable cpus %s\n", buf); | ||
| 560 | } | ||
| 561 | if (cpumask_empty(&ktext_mask)) { | ||
| 562 | printk("ktext: no valid cpus; caching on %d.\n", | ||
| 563 | smp_processor_id()); | ||
| 564 | cpumask_copy(&ktext_mask, | ||
| 565 | cpumask_of(smp_processor_id())); | ||
| 566 | } | ||
| 567 | } | ||
| 568 | |||
| 569 | address = MEM_SV_INTRPT; | ||
| 570 | pmd = get_pmd(pgtables, address); | ||
| 571 | if (ktext_small) { | ||
| 572 | /* Allocate an L2 PTE for the kernel text */ | ||
| 573 | int cpu = 0; | ||
| 574 | pgprot_t prot = construct_pgprot(PAGE_KERNEL_EXEC, | ||
| 575 | PAGE_HOME_IMMUTABLE); | ||
| 576 | |||
| 577 | if (ktext_local) { | ||
| 578 | if (ktext_nocache) | ||
| 579 | prot = hv_pte_set_mode(prot, | ||
| 580 | HV_PTE_MODE_UNCACHED); | ||
| 581 | else | ||
| 582 | prot = hv_pte_set_mode(prot, | ||
| 583 | HV_PTE_MODE_CACHE_NO_L3); | ||
| 584 | } else { | ||
| 585 | prot = hv_pte_set_mode(prot, | ||
| 586 | HV_PTE_MODE_CACHE_TILE_L3); | ||
| 587 | cpu = cpumask_first(&ktext_mask); | ||
| 588 | |||
| 589 | prot = ktext_set_nocache(prot); | ||
| 590 | } | ||
| 591 | |||
| 592 | BUG_ON(address != (unsigned long)_stext); | ||
| 593 | pfn = 0; /* code starts at PA 0 */ | ||
| 594 | pte = alloc_pte(); | ||
| 595 | for (pte_ofs = 0; address < (unsigned long)_einittext; | ||
| 596 | pfn++, pte_ofs++, address += PAGE_SIZE) { | ||
| 597 | if (!ktext_local) { | ||
| 598 | prot = set_remote_cache_cpu(prot, cpu); | ||
| 599 | cpu = cpumask_next(cpu, &ktext_mask); | ||
| 600 | if (cpu == NR_CPUS) | ||
| 601 | cpu = cpumask_first(&ktext_mask); | ||
| 602 | } | ||
| 603 | pte[pte_ofs] = pfn_pte(pfn, prot); | ||
| 604 | } | ||
| 605 | assign_pte(pmd, pte); | ||
| 606 | } else { | ||
| 607 | pte_t pteval = pfn_pte(0, PAGE_KERNEL_EXEC); | ||
| 608 | pteval = pte_mkhuge(pteval); | ||
| 609 | #if CHIP_HAS_CBOX_HOME_MAP() | ||
| 610 | if (ktext_hash) { | ||
| 611 | pteval = hv_pte_set_mode(pteval, | ||
| 612 | HV_PTE_MODE_CACHE_HASH_L3); | ||
| 613 | pteval = ktext_set_nocache(pteval); | ||
| 614 | } else | ||
| 615 | #endif /* CHIP_HAS_CBOX_HOME_MAP() */ | ||
| 616 | if (cpumask_weight(&ktext_mask) == 1) { | ||
| 617 | pteval = set_remote_cache_cpu(pteval, | ||
| 618 | cpumask_first(&ktext_mask)); | ||
| 619 | pteval = hv_pte_set_mode(pteval, | ||
| 620 | HV_PTE_MODE_CACHE_TILE_L3); | ||
| 621 | pteval = ktext_set_nocache(pteval); | ||
| 622 | } else if (ktext_nocache) | ||
| 623 | pteval = hv_pte_set_mode(pteval, | ||
| 624 | HV_PTE_MODE_UNCACHED); | ||
| 625 | else | ||
| 626 | pteval = hv_pte_set_mode(pteval, | ||
| 627 | HV_PTE_MODE_CACHE_NO_L3); | ||
| 628 | *(pte_t *)pmd = pteval; | ||
| 629 | } | ||
| 630 | |||
| 631 | /* Set swapper_pgprot here so it is flushed to memory right away. */ | ||
| 632 | swapper_pgprot = init_pgprot((unsigned long)swapper_pg_dir); | ||
| 633 | |||
| 634 | /* | ||
| 635 | * Since we may be changing the caching of the stack and page | ||
| 636 | * table itself, we invoke an assembly helper to do the | ||
| 637 | * following steps: | ||
| 638 | * | ||
| 639 | * - flush the cache so we start with an empty slate | ||
| 640 | * - install pgtables[] as the real page table | ||
| 641 | * - flush the TLB so the new page table takes effect | ||
| 642 | */ | ||
| 643 | rc = flush_and_install_context(__pa(pgtables), | ||
| 644 | init_pgprot((unsigned long)pgtables), | ||
| 645 | __get_cpu_var(current_asid), | ||
| 646 | cpumask_bits(my_cpu_mask)); | ||
| 647 | BUG_ON(rc != 0); | ||
| 648 | |||
| 649 | /* Copy the page table back to the normal swapper_pg_dir. */ | ||
| 650 | memcpy(pgd_base, pgtables, sizeof(pgtables)); | ||
| 651 | __install_page_table(pgd_base, __get_cpu_var(current_asid), | ||
| 652 | swapper_pgprot); | ||
| 653 | } | ||
| 654 | |||
| 655 | /* | ||
| 656 | * devmem_is_allowed() checks to see if /dev/mem access to a certain address | ||
| 657 | * is valid. The argument is a physical page number. | ||
| 658 | * | ||
| 659 | * On Tile, the only valid things for which we can just hand out unchecked | ||
| 660 | * PTEs are the kernel code and data. Anything else might change its | ||
| 661 | * homing with time, and we wouldn't know to adjust the /dev/mem PTEs. | ||
| 662 | * Note that init_thread_union is released to heap soon after boot, | ||
| 663 | * so we include it in the init data. | ||
| 664 | * | ||
| 665 | * For TILE-Gx, we might want to consider allowing access to PA | ||
| 666 | * regions corresponding to PCI space, etc. | ||
| 667 | */ | ||
| 668 | int devmem_is_allowed(unsigned long pagenr) | ||
| 669 | { | ||
| 670 | return pagenr < kaddr_to_pfn(_end) && | ||
| 671 | !(pagenr >= kaddr_to_pfn(&init_thread_union) || | ||
| 672 | pagenr < kaddr_to_pfn(_einitdata)) && | ||
| 673 | !(pagenr >= kaddr_to_pfn(_sinittext) || | ||
| 674 | pagenr <= kaddr_to_pfn(_einittext-1)); | ||
| 675 | } | ||
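The double negations above are easiest to read through De Morgan's law: !(p >= hi || p < lo) is exactly "lo <= p < hi". A purely illustrative helper (not part of this commit) expressing that shape:

	/* Illustrative only: equivalent to !(p >= hi || p < lo). */
	static inline int pfn_in_range(unsigned long p,
				       unsigned long lo, unsigned long hi)
	{
		return p >= lo && p < hi;	/* half-open range [lo, hi) */
	}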
| 676 | |||
| 677 | #ifdef CONFIG_HIGHMEM | ||
| 678 | static void __init permanent_kmaps_init(pgd_t *pgd_base) | ||
| 679 | { | ||
| 680 | pgd_t *pgd; | ||
| 681 | pud_t *pud; | ||
| 682 | pmd_t *pmd; | ||
| 683 | pte_t *pte; | ||
| 684 | unsigned long vaddr; | ||
| 685 | |||
| 686 | vaddr = PKMAP_BASE; | ||
| 687 | page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); | ||
| 688 | |||
| 689 | pgd = swapper_pg_dir + pgd_index(vaddr); | ||
| 690 | pud = pud_offset(pgd, vaddr); | ||
| 691 | pmd = pmd_offset(pud, vaddr); | ||
| 692 | pte = pte_offset_kernel(pmd, vaddr); | ||
| 693 | pkmap_page_table = pte; | ||
| 694 | } | ||
| 695 | #endif /* CONFIG_HIGHMEM */ | ||
| 696 | |||
| 697 | |||
| 698 | static void __init init_free_pfn_range(unsigned long start, unsigned long end) | ||
| 699 | { | ||
| 700 | unsigned long pfn; | ||
| 701 | struct page *page = pfn_to_page(start); | ||
| 702 | |||
| 703 | for (pfn = start; pfn < end; ) { | ||
| 704 | /* Optimize by freeing pages in large batches */ | ||
| 705 | int order = __ffs(pfn); | ||
| 706 | int count, i; | ||
| 707 | struct page *p; | ||
| 708 | |||
| 709 | if (order >= MAX_ORDER) | ||
| 710 | order = MAX_ORDER-1; | ||
| 711 | count = 1 << order; | ||
| 712 | while (pfn + count > end) { | ||
| 713 | count >>= 1; | ||
| 714 | --order; | ||
| 715 | } | ||
| 716 | for (p = page, i = 0; i < count; ++i, ++p) { | ||
| 717 | __ClearPageReserved(p); | ||
| 718 | /* | ||
| 719 | * Hacky direct set to avoid unnecessary | ||
| 720 | * lock take/release for EVERY page here. | ||
| 721 | */ | ||
| 722 | p->_count.counter = 0; | ||
| 723 | p->_mapcount.counter = -1; | ||
| 724 | } | ||
| 725 | init_page_count(page); | ||
| 726 | __free_pages(page, order); | ||
| 727 | totalram_pages += count; | ||
| 728 | |||
| 729 | page += count; | ||
| 730 | pfn += count; | ||
| 731 | } | ||
| 732 | } | ||
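The batching above always frees the largest naturally aligned power-of-two block that fits: __ffs(pfn) gives the alignment of the current pfn, capped at MAX_ORDER-1 and halved until the block no longer runs past `end`. A standalone user-space sketch of just that order selection (the pfn values are hypothetical, and MAX_ORDER is configuration-dependent):

	#include <stdio.h>

	#define MAX_ORDER 11	/* assumed value, for illustration only */

	/* Same order selection as init_free_pfn_range(), in isolation. */
	static int batch_order(unsigned long pfn, unsigned long end)
	{
		int order = __builtin_ffsl(pfn) - 1;	/* __ffs() for pfn != 0 */
		unsigned long count;

		if (order >= MAX_ORDER)
			order = MAX_ORDER - 1;
		count = 1UL << order;
		while (pfn + count > end) {
			count >>= 1;
			--order;
		}
		return order;
	}

	int main(void)
	{
		unsigned long pfn = 0x12340, end = 0x12385;

		while (pfn < end) {
			int order = batch_order(pfn, end);
			printf("free pfn %#lx, order %d\n", pfn, order);
			pfn += 1UL << order;
		}
		return 0;	/* orders printed: 6, then 2, then 0 */
	}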
| 733 | |||
| 734 | static void __init set_non_bootmem_pages_init(void) | ||
| 735 | { | ||
| 736 | struct zone *z; | ||
| 737 | for_each_zone(z) { | ||
| 738 | unsigned long start, end; | ||
| 739 | int nid = z->zone_pgdat->node_id; | ||
| 740 | |||
| 741 | start = z->zone_start_pfn; | ||
| 742 | if (start == 0) | ||
| 743 | continue; /* bootmem */ | ||
| 744 | end = start + z->spanned_pages; | ||
| 745 | if (zone_idx(z) == ZONE_NORMAL) { | ||
| 746 | BUG_ON(start != node_start_pfn[nid]); | ||
| 747 | start = node_free_pfn[nid]; | ||
| 748 | } | ||
| 749 | #ifdef CONFIG_HIGHMEM | ||
| 750 | if (zone_idx(z) == ZONE_HIGHMEM) | ||
| 751 | totalhigh_pages += z->spanned_pages; | ||
| 752 | #endif | ||
| 753 | if (kdata_huge) { | ||
| 754 | unsigned long percpu_pfn = node_percpu_pfn[nid]; | ||
| 755 | if (start < percpu_pfn && end > percpu_pfn) | ||
| 756 | end = percpu_pfn; | ||
| 757 | } | ||
| 758 | #ifdef CONFIG_PCI | ||
| 759 | if (start <= pci_reserve_start_pfn && | ||
| 760 | end > pci_reserve_start_pfn) { | ||
| 761 | if (end > pci_reserve_end_pfn) | ||
| 762 | init_free_pfn_range(pci_reserve_end_pfn, end); | ||
| 763 | end = pci_reserve_start_pfn; | ||
| 764 | } | ||
| 765 | #endif | ||
| 766 | init_free_pfn_range(start, end); | ||
| 767 | } | ||
| 768 | } | ||
| 769 | |||
| 770 | /* | ||
| 771 | * paging_init() sets up the page tables - note that all of lowmem is | ||
| 772 | * already mapped by head.S. | ||
| 773 | */ | ||
| 774 | void __init paging_init(void) | ||
| 775 | { | ||
| 776 | #ifdef CONFIG_HIGHMEM | ||
| 777 | unsigned long vaddr, end; | ||
| 778 | #endif | ||
| 779 | #ifdef __tilegx__ | ||
| 780 | pud_t *pud; | ||
| 781 | #endif | ||
| 782 | pgd_t *pgd_base = swapper_pg_dir; | ||
| 783 | |||
| 784 | kernel_physical_mapping_init(pgd_base); | ||
| 785 | |||
| 786 | #ifdef CONFIG_HIGHMEM | ||
| 787 | /* | ||
| 788 | * Fixed mappings, only the page table structure has to be | ||
| 789 | * created - mappings will be set by set_fixmap(): | ||
| 790 | */ | ||
| 791 | vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; | ||
| 792 | end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; | ||
| 793 | page_table_range_init(vaddr, end, pgd_base); | ||
| 794 | permanent_kmaps_init(pgd_base); | ||
| 795 | #endif | ||
| 796 | |||
| 797 | #ifdef __tilegx__ | ||
| 798 | /* | ||
| 799 | * Since GX allocates just one pmd_t array worth of vmalloc space, | ||
| 800 | * we go ahead and allocate it statically here, then share it | ||
| 801 | * globally. As a result we don't have to worry about any task | ||
| 802 | * changing init_mm once we get up and running, and there's no | ||
| 803 | * need for e.g. vmalloc_sync_all(). | ||
| 804 | */ | ||
| 805 | BUILD_BUG_ON(pgd_index(VMALLOC_START) != pgd_index(VMALLOC_END)); | ||
| 806 | pud = pud_offset(pgd_base + pgd_index(VMALLOC_START), VMALLOC_START); | ||
| 807 | assign_pmd(pud, alloc_pmd()); | ||
| 808 | #endif | ||
| 809 | } | ||
| 810 | |||
| 811 | |||
| 812 | /* | ||
| 813 | * Walk the kernel page tables and derive the page_home() from | ||
| 814 | * the PTEs, so that set_pte() can properly validate the caching | ||
| 815 | * of all PTEs it sees. | ||
| 816 | */ | ||
| 817 | void __init set_page_homes(void) | ||
| 818 | { | ||
| 819 | } | ||
| 820 | |||
| 821 | static void __init set_max_mapnr_init(void) | ||
| 822 | { | ||
| 823 | #ifdef CONFIG_FLATMEM | ||
| 824 | max_mapnr = max_low_pfn; | ||
| 825 | #endif | ||
| 826 | } | ||
| 827 | |||
| 828 | void __init mem_init(void) | ||
| 829 | { | ||
| 830 | int codesize, datasize, initsize; | ||
| 831 | int i; | ||
| 832 | #ifndef __tilegx__ | ||
| 833 | void *last; | ||
| 834 | #endif | ||
| 835 | |||
| 836 | #ifdef CONFIG_FLATMEM | ||
| 837 | if (!mem_map) | ||
| 838 | BUG(); | ||
| 839 | #endif | ||
| 840 | |||
| 841 | #ifdef CONFIG_HIGHMEM | ||
| 842 | /* check that fixmap and pkmap do not overlap */ | ||
| 843 | if (PKMAP_ADDR(LAST_PKMAP-1) >= FIXADDR_START) { | ||
| 844 | printk(KERN_ERR "fixmap and kmap areas overlap" | ||
| 845 | " - this will crash\n"); | ||
| 846 | printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", | ||
| 847 | PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP-1), | ||
| 848 | FIXADDR_START); | ||
| 849 | BUG(); | ||
| 850 | } | ||
| 851 | #endif | ||
| 852 | |||
| 853 | set_max_mapnr_init(); | ||
| 854 | |||
| 855 | /* this will put all bootmem onto the freelists */ | ||
| 856 | totalram_pages += free_all_bootmem(); | ||
| 857 | |||
| 858 | /* count all remaining LOWMEM and give all HIGHMEM to page allocator */ | ||
| 859 | set_non_bootmem_pages_init(); | ||
| 860 | |||
| 861 | codesize = (unsigned long)&_etext - (unsigned long)&_text; | ||
| 862 | datasize = (unsigned long)&_end - (unsigned long)&_sdata; | ||
| 863 | initsize = (unsigned long)&_einittext - (unsigned long)&_sinittext; | ||
| 864 | initsize += (unsigned long)&_einitdata - (unsigned long)&_sinitdata; | ||
| 865 | |||
| 866 | printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk data, %dk init, %ldk highmem)\n", | ||
| 867 | (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), | ||
| 868 | num_physpages << (PAGE_SHIFT-10), | ||
| 869 | codesize >> 10, | ||
| 870 | datasize >> 10, | ||
| 871 | initsize >> 10, | ||
| 872 | (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) | ||
| 873 | ); | ||
| 874 | |||
| 875 | /* | ||
| 876 | * In debug mode, dump some interesting memory mappings. | ||
| 877 | */ | ||
| 878 | #ifdef CONFIG_HIGHMEM | ||
| 879 | printk(KERN_DEBUG " KMAP %#lx - %#lx\n", | ||
| 880 | FIXADDR_START, FIXADDR_TOP + PAGE_SIZE - 1); | ||
| 881 | printk(KERN_DEBUG " PKMAP %#lx - %#lx\n", | ||
| 882 | PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP) - 1); | ||
| 883 | #endif | ||
| 884 | #ifdef CONFIG_HUGEVMAP | ||
| 885 | printk(KERN_DEBUG " HUGEMAP %#lx - %#lx\n", | ||
| 886 | HUGE_VMAP_BASE, HUGE_VMAP_END - 1); | ||
| 887 | #endif | ||
| 888 | printk(KERN_DEBUG " VMALLOC %#lx - %#lx\n", | ||
| 889 | _VMALLOC_START, _VMALLOC_END - 1); | ||
| 890 | #ifdef __tilegx__ | ||
| 891 | for (i = MAX_NUMNODES-1; i >= 0; --i) { | ||
| 892 | struct pglist_data *node = &node_data[i]; | ||
| 893 | if (node->node_present_pages) { | ||
| 894 | unsigned long start = (unsigned long) | ||
| 895 | pfn_to_kaddr(node->node_start_pfn); | ||
| 896 | unsigned long end = start + | ||
| 897 | (node->node_present_pages << PAGE_SHIFT); | ||
| 898 | printk(KERN_DEBUG " MEM%d %#lx - %#lx\n", | ||
| 899 | i, start, end - 1); | ||
| 900 | } | ||
| 901 | } | ||
| 902 | #else | ||
| 903 | last = high_memory; | ||
| 904 | for (i = MAX_NUMNODES-1; i >= 0; --i) { | ||
| 905 | if ((unsigned long)vbase_map[i] != -1UL) { | ||
| 906 | printk(KERN_DEBUG " LOWMEM%d %#lx - %#lx\n", | ||
| 907 | i, (unsigned long) (vbase_map[i]), | ||
| 908 | (unsigned long) (last-1)); | ||
| 909 | last = vbase_map[i]; | ||
| 910 | } | ||
| 911 | } | ||
| 912 | #endif | ||
| 913 | |||
| 914 | #ifndef __tilegx__ | ||
| 915 | /* | ||
| 916 | * Convert from using one lock for all atomic operations to | ||
| 917 | * one per cpu. | ||
| 918 | */ | ||
| 919 | __init_atomic_per_cpu(); | ||
| 920 | #endif | ||
| 921 | } | ||
| 922 | |||
| 923 | /* | ||
| 924 | * This is for the non-NUMA, single-node SMP system case. | ||
| 925 | * As on x86, we always add new memory to | ||
| 926 | * highmem for now. | ||
| 927 | */ | ||
| 928 | #ifndef CONFIG_NEED_MULTIPLE_NODES | ||
| 929 | int arch_add_memory(u64 start, u64 size) | ||
| 930 | { | ||
| 931 | struct pglist_data *pgdata = &contig_page_data; | ||
| 932 | struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; | ||
| 933 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
| 934 | unsigned long nr_pages = size >> PAGE_SHIFT; | ||
| 935 | |||
| 936 | return __add_pages(zone, start_pfn, nr_pages); | ||
| 937 | } | ||
| 938 | |||
| 939 | int remove_memory(u64 start, u64 size) | ||
| 940 | { | ||
| 941 | return -EINVAL; | ||
| 942 | } | ||
| 943 | #endif | ||
| 944 | |||
| 945 | struct kmem_cache *pgd_cache; | ||
| 946 | |||
| 947 | void __init pgtable_cache_init(void) | ||
| 948 | { | ||
| 949 | pgd_cache = kmem_cache_create("pgd", | ||
| 950 | PTRS_PER_PGD*sizeof(pgd_t), | ||
| 951 | PTRS_PER_PGD*sizeof(pgd_t), | ||
| 952 | 0, | ||
| 953 | NULL); | ||
| 954 | if (!pgd_cache) | ||
| 955 | panic("pgtable_cache_init(): Cannot create pgd cache"); | ||
| 956 | } | ||
| 957 | |||
| 958 | #if !CHIP_HAS_COHERENT_LOCAL_CACHE() | ||
| 959 | /* | ||
| 960 | * The __w1data area holds data that is only written during initialization, | ||
| 961 | * and is read-only and thus freely cacheable thereafter. Fix the page | ||
| 962 | * table entries that cover that region accordingly. | ||
| 963 | */ | ||
| 964 | static void mark_w1data_ro(void) | ||
| 965 | { | ||
| 966 | /* Loop over page table entries */ | ||
| 967 | unsigned long addr = (unsigned long)__w1data_begin; | ||
| 968 | BUG_ON((addr & (PAGE_SIZE-1)) != 0); | ||
| 969 | for (; addr <= (unsigned long)__w1data_end - 1; addr += PAGE_SIZE) { | ||
| 970 | unsigned long pfn = kaddr_to_pfn((void *)addr); | ||
| 971 | struct page *page = pfn_to_page(pfn); | ||
| 972 | pte_t *ptep = virt_to_pte(NULL, addr); | ||
| 973 | BUG_ON(pte_huge(*ptep)); /* not relevant for kdata_huge */ | ||
| 974 | set_pte_at(&init_mm, addr, ptep, pfn_pte(pfn, PAGE_KERNEL_RO)); | ||
| 975 | } | ||
| 976 | } | ||
| 977 | #endif | ||
| 978 | |||
| 979 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
| 980 | static long __write_once initfree; | ||
| 981 | #else | ||
| 982 | static long __write_once initfree = 1; | ||
| 983 | #endif | ||
| 984 | |||
| 985 | /* Select whether to free (1) or mark unusable (0) the __init pages. */ | ||
| 986 | static int __init set_initfree(char *str) | ||
| 987 | { | ||
| 988 | strict_strtol(str, 0, &initfree); | ||
| 989 | printk("initfree: %s free init pages\n", initfree ? "will" : "won't"); | ||
| 990 | return 1; | ||
| 991 | } | ||
| 992 | __setup("initfree=", set_initfree); | ||
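For example, a debugging boot can append

	initfree=0

to the kernel command line, leaving the init pages mapped-but-not-present so that any stray late access takes a kernel page fault instead of silently reading reused memory. As free_init_pages() below notes, this is overridden (with a warning) when kdata=huge is in effect.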
| 993 | |||
| 994 | static void free_init_pages(char *what, unsigned long begin, unsigned long end) | ||
| 995 | { | ||
| 996 | unsigned long addr = (unsigned long) begin; | ||
| 997 | |||
| 998 | if (kdata_huge && !initfree) { | ||
| 999 | printk("Warning: ignoring initfree=0:" | ||
| 1000 | " incompatible with kdata=huge\n"); | ||
| 1001 | initfree = 1; | ||
| 1002 | } | ||
| 1003 | end = (end + PAGE_SIZE - 1) & PAGE_MASK; | ||
| 1004 | local_flush_tlb_pages(NULL, begin, PAGE_SIZE, end - begin); | ||
| 1005 | for (addr = begin; addr < end; addr += PAGE_SIZE) { | ||
| 1006 | /* | ||
| 1007 | * Note we just reset the home here directly in the | ||
| 1008 | * page table. We know this is safe because our caller | ||
| 1009 | * just flushed the caches on all the other cpus, | ||
| 1010 | * and they won't be touching any of these pages. | ||
| 1011 | */ | ||
| 1012 | int pfn = kaddr_to_pfn((void *)addr); | ||
| 1013 | struct page *page = pfn_to_page(pfn); | ||
| 1014 | pte_t *ptep = virt_to_pte(NULL, addr); | ||
| 1015 | if (!initfree) { | ||
| 1016 | /* | ||
| 1017 | * If debugging page accesses then do not free | ||
| 1018 | * this memory but mark them not present - any | ||
| 1019 | * buggy init-section access will create a | ||
| 1020 | * kernel page fault: | ||
| 1021 | */ | ||
| 1022 | pte_clear(&init_mm, addr, ptep); | ||
| 1023 | continue; | ||
| 1024 | } | ||
| 1025 | __ClearPageReserved(page); | ||
| 1026 | init_page_count(page); | ||
| 1027 | if (pte_huge(*ptep)) | ||
| 1028 | BUG_ON(!kdata_huge); | ||
| 1029 | else | ||
| 1030 | set_pte_at(&init_mm, addr, ptep, | ||
| 1031 | pfn_pte(pfn, PAGE_KERNEL)); | ||
| 1032 | memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); | ||
| 1033 | free_page(addr); | ||
| 1034 | totalram_pages++; | ||
| 1035 | } | ||
| 1036 | printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); | ||
| 1037 | } | ||
| 1038 | |||
| 1039 | void free_initmem(void) | ||
| 1040 | { | ||
| 1041 | const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET; | ||
| 1042 | |||
| 1043 | /* | ||
| 1044 | * Evict the dirty initdata on the boot cpu, evict the w1data | ||
| 1045 | * wherever it's homed, and evict all the init code everywhere. | ||
| 1046 | * We are guaranteed that no one will touch the init pages any | ||
| 1047 | * more, and although other cpus may be touching the w1data, | ||
| 1048 | * we only actually change the caching on tile64, which won't | ||
| 1049 | * be keeping local copies in the other tiles' caches anyway. | ||
| 1050 | */ | ||
| 1051 | homecache_evict(&cpu_cacheable_map); | ||
| 1052 | |||
| 1053 | /* Free the data pages that we won't use again after init. */ | ||
| 1054 | free_init_pages("unused kernel data", | ||
| 1055 | (unsigned long)_sinitdata, | ||
| 1056 | (unsigned long)_einitdata); | ||
| 1057 | |||
| 1058 | /* | ||
| 1059 | * Free the pages mapped from 0xc0000000 that correspond to code | ||
| 1060 | * pages from 0xfd000000 that we won't use again after init. | ||
| 1061 | */ | ||
| 1062 | free_init_pages("unused kernel text", | ||
| 1063 | (unsigned long)_sinittext - text_delta, | ||
| 1064 | (unsigned long)_einittext - text_delta); | ||
| 1065 | |||
| 1066 | #if !CHIP_HAS_COHERENT_LOCAL_CACHE() | ||
| 1067 | /* | ||
| 1068 | * Upgrade the .w1data section to globally cached. | ||
| 1069 | * We don't do this on tilepro, since the cache architecture | ||
| 1070 | * pretty much makes it irrelevant, and in any case we end | ||
| 1071 | * up having racing issues with other tiles that may touch | ||
| 1072 | * the data after we flush the cache but before we update | ||
| 1073 | * the PTEs and flush the TLBs, causing sharer shootdowns | ||
| 1074 | * later. Even though only clean data is involved, it seems like | ||
| 1075 | * an unnecessary complication. | ||
| 1076 | */ | ||
| 1077 | mark_w1data_ro(); | ||
| 1078 | #endif | ||
| 1079 | |||
| 1080 | /* Do a global TLB flush so everyone sees the changes. */ | ||
| 1081 | flush_tlb_all(); | ||
| 1082 | } | ||
diff --git a/arch/tile/mm/migrate.h b/arch/tile/mm/migrate.h new file mode 100644 index 000000000000..cd45a0837fa6 --- /dev/null +++ b/arch/tile/mm/migrate.h | |||
| @@ -0,0 +1,50 @@ | |||
| 1 | /* | ||
| 2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of the GNU General Public License | ||
| 6 | * as published by the Free Software Foundation, version 2. | ||
| 7 | * | ||
| 8 | * This program is distributed in the hope that it will be useful, but | ||
| 9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
| 11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
| 12 | * more details. | ||
| 13 | * | ||
| 14 | * Structure definitions for migration, exposed here for use by | ||
| 15 | * arch/tile/kernel/asm-offsets.c. | ||
| 16 | */ | ||
| 17 | |||
| 18 | #ifndef MM_MIGRATE_H | ||
| 19 | #define MM_MIGRATE_H | ||
| 20 | |||
| 21 | #include <linux/cpumask.h> | ||
| 22 | #include <hv/hypervisor.h> | ||
| 23 | |||
| 24 | /* | ||
| 25 | * This function is used as a helper when setting up the initial | ||
| 26 | * page table (swapper_pg_dir). | ||
| 27 | */ | ||
| 28 | extern int flush_and_install_context(HV_PhysAddr page_table, HV_PTE access, | ||
| 29 | HV_ASID asid, | ||
| 30 | const unsigned long *cpumask); | ||
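For context, the caller of this helper in this series is kernel_physical_mapping_init() in init.c above, which invokes it as:

	rc = flush_and_install_context(__pa(pgtables),
				       init_pgprot((unsigned long)pgtables),
				       __get_cpu_var(current_asid),
				       cpumask_bits(my_cpu_mask));
	BUG_ON(rc != 0);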
| 31 | |||
| 32 | /* | ||
| 33 | * This function supports migration as a "helper" as follows: | ||
| 34 | * | ||
| 35 | * - Set the stack PTE itself to "migrating". | ||
| 36 | * - Do a global TLB flush for (va,length) and the specified ASIDs. | ||
| 37 | * - Do a cache-evict on all necessary cpus. | ||
| 38 | * - Write the new stack PTE. | ||
| 39 | * | ||
| 40 | * Note that any non-NULL pointers must not point to the page that | ||
| 41 | * is handled by the stack_pte itself. | ||
| 42 | */ | ||
| 43 | extern int homecache_migrate_stack_and_flush(pte_t stack_pte, unsigned long va, | ||
| 44 | size_t length, pte_t *stack_ptep, | ||
| 45 | const struct cpumask *cache_cpumask, | ||
| 46 | const struct cpumask *tlb_cpumask, | ||
| 47 | HV_Remote_ASID *asids, | ||
| 48 | int asidcount); | ||
| 49 | |||
| 50 | #endif /* MM_MIGRATE_H */ | ||
diff --git a/arch/tile/mm/migrate_32.S b/arch/tile/mm/migrate_32.S new file mode 100644 index 000000000000..f738765cd1e6 --- /dev/null +++ b/arch/tile/mm/migrate_32.S | |||
| @@ -0,0 +1,211 @@ | |||
| 1 | /* | ||
| 2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of the GNU General Public License | ||
| 6 | * as published by the Free Software Foundation, version 2. | ||
| 7 | * | ||
| 8 | * This program is distributed in the hope that it will be useful, but | ||
| 9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
| 11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
| 12 | * more details. | ||
| 13 | * | ||
| 14 | * This routine is a helper for migrating the home of a set of pages to | ||
| 15 | * a new cpu. See the documentation in homecache.c for more information. | ||
| 16 | */ | ||
| 17 | |||
| 18 | #include <linux/linkage.h> | ||
| 19 | #include <linux/threads.h> | ||
| 20 | #include <asm/page.h> | ||
| 21 | #include <asm/types.h> | ||
| 22 | #include <asm/asm-offsets.h> | ||
| 23 | #include <hv/hypervisor.h> | ||
| 24 | |||
| 25 | .text | ||
| 26 | |||
| 27 | /* | ||
| 28 | * First, some definitions that apply to all the code in the file. | ||
| 29 | */ | ||
| 30 | |||
| 31 | /* Locals (caller-save) */ | ||
| 32 | #define r_tmp r10 | ||
| 33 | #define r_save_sp r11 | ||
| 34 | |||
| 35 | /* What we save where in the stack frame; must include all callee-saves. */ | ||
| 36 | #define FRAME_SP 4 | ||
| 37 | #define FRAME_R30 8 | ||
| 38 | #define FRAME_R31 12 | ||
| 39 | #define FRAME_R32 16 | ||
| 40 | #define FRAME_R33 20 | ||
| 41 | #define FRAME_R34 24 | ||
| 42 | #define FRAME_R35 28 | ||
| 43 | #define FRAME_SIZE 32 | ||
| 44 | |||
| 45 | |||
| 46 | |||
| 47 | |||
| 48 | /* | ||
| 49 | * On entry: | ||
| 50 | * | ||
| 51 | * r0 low word of the new context PA to install (moved to r_context_lo) | ||
| 52 | * r1 high word of the new context PA to install (moved to r_context_hi) | ||
| 53 | * r2 low word of PTE to use for context access (moved to r_access_lo) | ||
| 54 | * r3 high word of PTE to use for context access (moved to r_access_hi) | ||
| 55 | * r4 ASID to use for new context (moved to r_asid) | ||
| 56 | * r5 pointer to cpumask with just this cpu set in it (r_my_cpumask) | ||
| 57 | */ | ||
| 58 | |||
| 59 | /* Arguments (caller-save) */ | ||
| 60 | #define r_context_lo_in r0 | ||
| 61 | #define r_context_hi_in r1 | ||
| 62 | #define r_access_lo_in r2 | ||
| 63 | #define r_access_hi_in r3 | ||
| 64 | #define r_asid_in r4 | ||
| 65 | #define r_my_cpumask r5 | ||
| 66 | |||
| 67 | /* Locals (callee-save); must not be more than FRAME_xxx above. */ | ||
| 68 | #define r_save_ics r30 | ||
| 69 | #define r_context_lo r31 | ||
| 70 | #define r_context_hi r32 | ||
| 71 | #define r_access_lo r33 | ||
| 72 | #define r_access_hi r34 | ||
| 73 | #define r_asid r35 | ||
| 74 | |||
| 75 | STD_ENTRY(flush_and_install_context) | ||
| 76 | /* | ||
| 77 | * Create a stack frame; we can't touch it once we flush the | ||
| 78 | * cache until we install the new page table and flush the TLB. | ||
| 79 | */ | ||
| 80 | { | ||
| 81 | move r_save_sp, sp | ||
| 82 | sw sp, lr | ||
| 83 | addi sp, sp, -FRAME_SIZE | ||
| 84 | } | ||
| 85 | addi r_tmp, sp, FRAME_SP | ||
| 86 | { | ||
| 87 | sw r_tmp, r_save_sp | ||
| 88 | addi r_tmp, sp, FRAME_R30 | ||
| 89 | } | ||
| 90 | { | ||
| 91 | sw r_tmp, r30 | ||
| 92 | addi r_tmp, sp, FRAME_R31 | ||
| 93 | } | ||
| 94 | { | ||
| 95 | sw r_tmp, r31 | ||
| 96 | addi r_tmp, sp, FRAME_R32 | ||
| 97 | } | ||
| 98 | { | ||
| 99 | sw r_tmp, r32 | ||
| 100 | addi r_tmp, sp, FRAME_R33 | ||
| 101 | } | ||
| 102 | { | ||
| 103 | sw r_tmp, r33 | ||
| 104 | addi r_tmp, sp, FRAME_R34 | ||
| 105 | } | ||
| 106 | { | ||
| 107 | sw r_tmp, r34 | ||
| 108 | addi r_tmp, sp, FRAME_R35 | ||
| 109 | } | ||
| 110 | sw r_tmp, r35 | ||
| 111 | |||
| 112 | /* Move some arguments to callee-save registers. */ | ||
| 113 | { | ||
| 114 | move r_context_lo, r_context_lo_in | ||
| 115 | move r_context_hi, r_context_hi_in | ||
| 116 | } | ||
| 117 | { | ||
| 118 | move r_access_lo, r_access_lo_in | ||
| 119 | move r_access_hi, r_access_hi_in | ||
| 120 | } | ||
| 121 | move r_asid, r_asid_in | ||
| 122 | |||
| 123 | /* Disable interrupts, since we can't use our stack. */ | ||
| 124 | { | ||
| 125 | mfspr r_save_ics, INTERRUPT_CRITICAL_SECTION | ||
| 126 | movei r_tmp, 1 | ||
| 127 | } | ||
| 128 | mtspr INTERRUPT_CRITICAL_SECTION, r_tmp | ||
| 129 | |||
| 130 | /* First, flush our L2 cache. */ | ||
| 131 | { | ||
| 132 | move r0, zero /* cache_pa */ | ||
| 133 | move r1, zero | ||
| 134 | } | ||
| 135 | { | ||
| 136 | auli r2, zero, ha16(HV_FLUSH_EVICT_L2) /* cache_control */ | ||
| 137 | move r3, r_my_cpumask /* cache_cpumask */ | ||
| 138 | } | ||
| 139 | { | ||
| 140 | move r4, zero /* tlb_va */ | ||
| 141 | move r5, zero /* tlb_length */ | ||
| 142 | } | ||
| 143 | { | ||
| 144 | move r6, zero /* tlb_pgsize */ | ||
| 145 | move r7, zero /* tlb_cpumask */ | ||
| 146 | } | ||
| 147 | { | ||
| 148 | move r8, zero /* asids */ | ||
| 149 | move r9, zero /* asidcount */ | ||
| 150 | } | ||
| 151 | jal hv_flush_remote | ||
| 152 | bnz r0, .Ldone | ||
| 153 | |||
| 154 | /* Now install the new page table. */ | ||
| 155 | { | ||
| 156 | move r0, r_context_lo | ||
| 157 | move r1, r_context_hi | ||
| 158 | } | ||
| 159 | { | ||
| 160 | move r2, r_access_lo | ||
| 161 | move r3, r_access_hi | ||
| 162 | } | ||
| 163 | { | ||
| 164 | move r4, r_asid | ||
| 165 | movei r5, HV_CTX_DIRECTIO | ||
| 166 | } | ||
| 167 | jal hv_install_context | ||
| 168 | bnz r0, .Ldone | ||
| 169 | |||
| 170 | /* Finally, flush the TLB. */ | ||
| 171 | { | ||
| 172 | movei r0, 0 /* preserve_global */ | ||
| 173 | jal hv_flush_all | ||
| 174 | } | ||
| 175 | |||
| 176 | .Ldone: | ||
| 177 | /* Reset interrupts back how they were before. */ | ||
| 178 | mtspr INTERRUPT_CRITICAL_SECTION, r_save_ics | ||
| 179 | |||
| 180 | /* Restore the callee-saved registers and return. */ | ||
| 181 | addli lr, sp, FRAME_SIZE | ||
| 182 | { | ||
| 183 | lw lr, lr | ||
| 184 | addli r_tmp, sp, FRAME_R30 | ||
| 185 | } | ||
| 186 | { | ||
| 187 | lw r30, r_tmp | ||
| 188 | addli r_tmp, sp, FRAME_R31 | ||
| 189 | } | ||
| 190 | { | ||
| 191 | lw r31, r_tmp | ||
| 192 | addli r_tmp, sp, FRAME_R32 | ||
| 193 | } | ||
| 194 | { | ||
| 195 | lw r32, r_tmp | ||
| 196 | addli r_tmp, sp, FRAME_R33 | ||
| 197 | } | ||
| 198 | { | ||
| 199 | lw r33, r_tmp | ||
| 200 | addli r_tmp, sp, FRAME_R34 | ||
| 201 | } | ||
| 202 | { | ||
| 203 | lw r34, r_tmp | ||
| 204 | addli r_tmp, sp, FRAME_R35 | ||
| 205 | } | ||
| 206 | { | ||
| 207 | lw r35, r_tmp | ||
| 208 | addi sp, sp, FRAME_SIZE | ||
| 209 | } | ||
| 210 | jrp lr | ||
| 211 | STD_ENDPROC(flush_and_install_context) | ||
diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c new file mode 100644 index 000000000000..f96f4cec602a --- /dev/null +++ b/arch/tile/mm/mmap.c | |||
| @@ -0,0 +1,75 @@ | |||
| 1 | /* | ||
| 2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of the GNU General Public License | ||
| 6 | * as published by the Free Software Foundation, version 2. | ||
| 7 | * | ||
| 8 | * This program is distributed in the hope that it will be useful, but | ||
| 9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
| 11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
| 12 | * more details. | ||
| 13 | * | ||
| 14 | * Taken from the i386 architecture and simplified. | ||
| 15 | */ | ||
| 16 | |||
| 17 | #include <linux/mm.h> | ||
| 18 | #include <linux/random.h> | ||
| 19 | #include <linux/limits.h> | ||
| 20 | #include <linux/sched.h> | ||
| 21 | #include <linux/mman.h> | ||
| 22 | #include <linux/compat.h> | ||
| 23 | |||
| 24 | /* | ||
| 25 | * Top of mmap area (just below the process stack). | ||
| 26 | * | ||
| 27 | * Leave at least a ~128 MB hole. | ||
| 28 | */ | ||
| 29 | #define MIN_GAP (128*1024*1024) | ||
| 30 | #define MAX_GAP (TASK_SIZE/6*5) | ||
| 31 | |||
| 32 | static inline unsigned long mmap_base(struct mm_struct *mm) | ||
| 33 | { | ||
| 34 | unsigned long gap = rlimit(RLIMIT_STACK); | ||
| 35 | unsigned long random_factor = 0; | ||
| 36 | |||
| 37 | if (current->flags & PF_RANDOMIZE) | ||
| 38 | random_factor = get_random_int() % (1024*1024); | ||
| 39 | |||
| 40 | if (gap < MIN_GAP) | ||
| 41 | gap = MIN_GAP; | ||
| 42 | else if (gap > MAX_GAP) | ||
| 43 | gap = MAX_GAP; | ||
| 44 | |||
| 45 | return PAGE_ALIGN(TASK_SIZE - gap - random_factor); | ||
| 46 | } | ||
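To make the clamping concrete, here is a user-space sketch of the same arithmetic (the constants are illustrative stand-ins, not the kernel's real TASK_SIZE or page size):

	#include <stdio.h>

	#define PAGE_SIZE	4096UL			/* assumed */
	#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
	#define TASK_SIZE	0xfffff000UL		/* assumed */
	#define MIN_GAP		(128UL*1024*1024)
	#define MAX_GAP		(TASK_SIZE/6*5)

	static unsigned long mmap_base(unsigned long stack_rlimit,
				       unsigned long random_factor)
	{
		unsigned long gap = stack_rlimit;

		if (gap < MIN_GAP)
			gap = MIN_GAP;	/* small rlimits still leave ~128 MB */
		else if (gap > MAX_GAP)
			gap = MAX_GAP;	/* huge rlimits can't eat all of VA space */
		return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
	}

	int main(void)
	{
		/* An 8 MB stack rlimit is clamped up to the 128 MB minimum gap. */
		printf("%#lx\n", mmap_base(8UL*1024*1024, 0));
		return 0;
	}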
| 47 | |||
| 48 | /* | ||
| 49 | * This function, called very early during the creation of a new | ||
| 50 | * process VM image, sets up which VM layout function to use: | ||
| 51 | */ | ||
| 52 | void arch_pick_mmap_layout(struct mm_struct *mm) | ||
| 53 | { | ||
| 54 | #if !defined(__tilegx__) | ||
| 55 | int is_32bit = 1; | ||
| 56 | #elif defined(CONFIG_COMPAT) | ||
| 57 | int is_32bit = is_compat_task(); | ||
| 58 | #else | ||
| 59 | int is_32bit = 0; | ||
| 60 | #endif | ||
| 61 | |||
| 62 | /* | ||
| 63 | * Use standard layout if the expected stack growth is unlimited | ||
| 64 | * or we are running native 64-bit. | ||
| 65 | */ | ||
| 66 | if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) { | ||
| 67 | mm->mmap_base = TASK_UNMAPPED_BASE; | ||
| 68 | mm->get_unmapped_area = arch_get_unmapped_area; | ||
| 69 | mm->unmap_area = arch_unmap_area; | ||
| 70 | } else { | ||
| 71 | mm->mmap_base = mmap_base(mm); | ||
| 72 | mm->get_unmapped_area = arch_get_unmapped_area_topdown; | ||
| 73 | mm->unmap_area = arch_unmap_area_topdown; | ||
| 74 | } | ||
| 75 | } | ||
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c new file mode 100644 index 000000000000..289e729bbd76 --- /dev/null +++ b/arch/tile/mm/pgtable.c | |||
| @@ -0,0 +1,566 @@ | |||
| 1 | /* | ||
| 2 | * Copyright 2010 Tilera Corporation. All Rights Reserved. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of the GNU General Public License | ||
| 6 | * as published by the Free Software Foundation, version 2. | ||
| 7 | * | ||
| 8 | * This program is distributed in the hope that it will be useful, but | ||
| 9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 10 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
| 11 | * NON INFRINGEMENT. See the GNU General Public License for | ||
| 12 | * more details. | ||
| 13 | */ | ||
| 14 | |||
| 15 | #include <linux/sched.h> | ||
| 16 | #include <linux/kernel.h> | ||
| 17 | #include <linux/errno.h> | ||
| 18 | #include <linux/mm.h> | ||
| 19 | #include <linux/swap.h> | ||
| 20 | #include <linux/smp.h> | ||
| 21 | #include <linux/highmem.h> | ||
| 22 | #include <linux/slab.h> | ||
| 23 | #include <linux/pagemap.h> | ||
| 24 | #include <linux/spinlock.h> | ||
| 25 | #include <linux/cpumask.h> | ||
| 26 | #include <linux/module.h> | ||
| 27 | #include <linux/io.h> | ||
| 28 | #include <linux/vmalloc.h> | ||
| 29 | #include <linux/smp.h> | ||
| 30 | |||
| 31 | #include <asm/system.h> | ||
| 32 | #include <asm/pgtable.h> | ||
| 33 | #include <asm/pgalloc.h> | ||
| 34 | #include <asm/fixmap.h> | ||
| 35 | #include <asm/tlb.h> | ||
| 36 | #include <asm/tlbflush.h> | ||
| 37 | #include <asm/homecache.h> | ||
| 38 | |||
| 39 | #define K(x) ((x) << (PAGE_SHIFT-10)) | ||
| 40 | |||
| 41 | /* | ||
| 42 | * The normal show_free_areas() is too verbose on Tile, with dozens | ||
| 43 | * of processors and often four NUMA zones each with high and lowmem. | ||
| 44 | */ | ||
| 45 | void show_mem(void) | ||
| 46 | { | ||
| 47 | struct zone *zone; | ||
| 48 | |||
| 49 | printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu" | ||
| 50 | " free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu" | ||
| 51 | " pagecache:%lu swap:%lu\n", | ||
| 52 | (global_page_state(NR_ACTIVE_ANON) + | ||
| 53 | global_page_state(NR_ACTIVE_FILE)), | ||
| 54 | (global_page_state(NR_INACTIVE_ANON) + | ||
| 55 | global_page_state(NR_INACTIVE_FILE)), | ||
| 56 | global_page_state(NR_FILE_DIRTY), | ||
| 57 | global_page_state(NR_WRITEBACK), | ||
| 58 | global_page_state(NR_UNSTABLE_NFS), | ||
| 59 | global_page_state(NR_FREE_PAGES), | ||
| 60 | (global_page_state(NR_SLAB_RECLAIMABLE) + | ||
| 61 | global_page_state(NR_SLAB_UNRECLAIMABLE)), | ||
| 62 | global_page_state(NR_FILE_MAPPED), | ||
| 63 | global_page_state(NR_PAGETABLE), | ||
| 64 | global_page_state(NR_BOUNCE), | ||
| 65 | global_page_state(NR_FILE_PAGES), | ||
| 66 | nr_swap_pages); | ||
| 67 | |||
| 68 | for_each_zone(zone) { | ||
| 69 | unsigned long flags, order, total = 0, largest_order = -1; | ||
| 70 | |||
| 71 | if (!populated_zone(zone)) | ||
| 72 | continue; | ||
| 73 | |||
| 74 | printk("Node %d %7s: ", zone_to_nid(zone), zone->name); | ||
| 75 | spin_lock_irqsave(&zone->lock, flags); | ||
| 76 | for (order = 0; order < MAX_ORDER; order++) { | ||
| 77 | int nr = zone->free_area[order].nr_free; | ||
| 78 | total += nr << order; | ||
| 79 | if (nr) | ||
| 80 | largest_order = order; | ||
| 81 | } | ||
| 82 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 83 | printk("%lukB (largest %luKb)\n", | ||
| 84 | K(total), largest_order ? K(1UL) << largest_order : 0); | ||
| 85 | } | ||
| 86 | } | ||
| 87 | |||
| 88 | /* | ||
| 89 | * Associate a virtual page frame with a given physical page frame | ||
| 90 | * and protection flags for that frame. | ||
| 91 | */ | ||
| 92 | static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) | ||
| 93 | { | ||
| 94 | pgd_t *pgd; | ||
| 95 | pud_t *pud; | ||
| 96 | pmd_t *pmd; | ||
| 97 | pte_t *pte; | ||
| 98 | |||
| 99 | pgd = swapper_pg_dir + pgd_index(vaddr); | ||
| 100 | if (pgd_none(*pgd)) { | ||
| 101 | BUG(); | ||
| 102 | return; | ||
| 103 | } | ||
| 104 | pud = pud_offset(pgd, vaddr); | ||
| 105 | if (pud_none(*pud)) { | ||
| 106 | BUG(); | ||
| 107 | return; | ||
| 108 | } | ||
| 109 | pmd = pmd_offset(pud, vaddr); | ||
| 110 | if (pmd_none(*pmd)) { | ||
| 111 | BUG(); | ||
| 112 | return; | ||
| 113 | } | ||
| 114 | pte = pte_offset_kernel(pmd, vaddr); | ||
| 115 | /* <pfn,flags> stored as-is, to permit clearing entries */ | ||
| 116 | set_pte(pte, pfn_pte(pfn, flags)); | ||
| 117 | |||
| 118 | /* | ||
| 119 | * It's enough to flush this one mapping. | ||
| 120 | * This appears conservative since it is only called | ||
| 121 | * from __set_fixmap. | ||
| 122 | */ | ||
| 123 | local_flush_tlb_page(NULL, vaddr, PAGE_SIZE); | ||
| 124 | } | ||
| 125 | |||
| 126 | /* | ||
| 127 | * Associate a huge virtual page frame with a given physical page frame | ||
| 128 | * and protection flags for that frame. pfn is for the base of the page, | ||
| 129 | * vaddr is what the page gets mapped to - both must be properly aligned. | ||
| 130 | * The pmd must already be instantiated. | ||
| 131 | */ | ||
| 132 | void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) | ||
| 133 | { | ||
| 134 | pgd_t *pgd; | ||
| 135 | pud_t *pud; | ||
| 136 | pmd_t *pmd; | ||
| 137 | |||
| 138 | if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ | ||
| 139 | printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n"); | ||
| 140 | return; /* BUG(); */ | ||
| 141 | } | ||
| 142 | if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ | ||
| 143 | printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n"); | ||
| 144 | return; /* BUG(); */ | ||
| 145 | } | ||
| 146 | pgd = swapper_pg_dir + pgd_index(vaddr); | ||
| 147 | if (pgd_none(*pgd)) { | ||
| 148 | printk(KERN_WARNING "set_pmd_pfn: pgd_none\n"); | ||
| 149 | return; /* BUG(); */ | ||
| 150 | } | ||
| 151 | pud = pud_offset(pgd, vaddr); | ||
| 152 | pmd = pmd_offset(pud, vaddr); | ||
| 153 | set_pmd(pmd, ptfn_pmd(HV_PFN_TO_PTFN(pfn), flags)); | ||
| 154 | /* | ||
| 155 | * It's enough to flush this one mapping. | ||
| 156 | * We flush both small and huge TLB entries to be sure. | ||
| 157 | */ | ||
| 158 | local_flush_tlb_page(NULL, vaddr, HPAGE_SIZE); | ||
| 159 | local_flush_tlb_pages(NULL, vaddr, PAGE_SIZE, HPAGE_SIZE); | ||
| 160 | } | ||
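A hypothetical call site, for illustration only (the address and protections are made up; both vaddr and pfn must be huge-page-aligned and the covering pmd must already exist):

	/* Map the huge page starting at PA 0 at an assumed aligned VA. */
	set_pmd_pfn(0xe0000000UL, 0, PAGE_KERNEL);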
| 161 | |||
| 162 | void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags) | ||
| 163 | { | ||
| 164 | unsigned long address = __fix_to_virt(idx); | ||
| 165 | |||
| 166 | if (idx >= __end_of_fixed_addresses) { | ||
| 167 | BUG(); | ||
| 168 | return; | ||
| 169 | } | ||
| 170 | set_pte_pfn(address, phys >> PAGE_SHIFT, flags); | ||
| 171 | } | ||
| 172 | |||
| 173 | #if defined(CONFIG_HIGHPTE) | ||
| 174 | pte_t *_pte_offset_map(pmd_t *dir, unsigned long address, enum km_type type) | ||
| 175 | { | ||
| 176 | pte_t *pte = kmap_atomic(pmd_page(*dir), type) + | ||
| 177 | (pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN) & ~PAGE_MASK; | ||
| 178 | return &pte[pte_index(address)]; | ||
| 179 | } | ||
| 180 | #endif | ||
| 181 | |||
| 182 | /* | ||
| 183 | * List of all pgd's needed so it can invalidate entries in both cached | ||
| 184 | * and uncached pgd's. This is essentially codepath-based locking | ||
| 185 | * against pageattr.c; it is the unique case in which a valid change | ||
| 186 | * of kernel pagetables can't be lazily synchronized by vmalloc faults. | ||
| 187 | * vmalloc faults work because attached pagetables are never freed. | ||
| 188 | * The locking scheme was chosen on the basis of manfred's | ||
| 189 | * recommendations and having no core impact whatsoever. | ||
| 190 | * -- wli | ||
| 191 | */ | ||
| 192 | DEFINE_SPINLOCK(pgd_lock); | ||
| 193 | LIST_HEAD(pgd_list); | ||
| 194 | |||
| 195 | static inline void pgd_list_add(pgd_t *pgd) | ||
| 196 | { | ||
| 197 | list_add(pgd_to_list(pgd), &pgd_list); | ||
| 198 | } | ||
| 199 | |||
| 200 | static inline void pgd_list_del(pgd_t *pgd) | ||
| 201 | { | ||
| 202 | list_del(pgd_to_list(pgd)); | ||
| 203 | } | ||
| 204 | |||
| 205 | #define KERNEL_PGD_INDEX_START pgd_index(PAGE_OFFSET) | ||
| 206 | #define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_INDEX_START) | ||
| 207 | |||
| 208 | static void pgd_ctor(pgd_t *pgd) | ||
| 209 | { | ||
| 210 | unsigned long flags; | ||
| 211 | |||
| 212 | memset(pgd, 0, KERNEL_PGD_INDEX_START*sizeof(pgd_t)); | ||
| 213 | spin_lock_irqsave(&pgd_lock, flags); | ||
| 214 | |||
| 215 | #ifndef __tilegx__ | ||
| 216 | /* | ||
| 217 | * Check that the user interrupt vector has no L2. | ||
| 218 | * It never should for the swapper, and new page tables | ||
| 219 | * should always start with an empty user interrupt vector. | ||
| 220 | */ | ||
| 221 | BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0); | ||
| 222 | #endif | ||
| 223 | |||
| 224 | clone_pgd_range(pgd + KERNEL_PGD_INDEX_START, | ||
| 225 | swapper_pg_dir + KERNEL_PGD_INDEX_START, | ||
| 226 | KERNEL_PGD_PTRS); | ||
| 227 | |||
| 228 | pgd_list_add(pgd); | ||
| 229 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
| 230 | } | ||
| 231 | |||
| 232 | static void pgd_dtor(pgd_t *pgd) | ||
| 233 | { | ||
| 234 | unsigned long flags; /* can be called from interrupt context */ | ||
| 235 | |||
| 236 | spin_lock_irqsave(&pgd_lock, flags); | ||
| 237 | pgd_list_del(pgd); | ||
| 238 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
| 239 | } | ||
| 240 | |||
| 241 | pgd_t *pgd_alloc(struct mm_struct *mm) | ||
| 242 | { | ||
| 243 | pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); | ||
| 244 | if (pgd) | ||
| 245 | pgd_ctor(pgd); | ||
| 246 | return pgd; | ||
| 247 | } | ||
| 248 | |||
| 249 | void pgd_free(struct mm_struct *mm, pgd_t *pgd) | ||
| 250 | { | ||
| 251 | pgd_dtor(pgd); | ||
| 252 | kmem_cache_free(pgd_cache, pgd); | ||
| 253 | } | ||
| 254 | |||
| 255 | |||
| 256 | #define L2_USER_PGTABLE_PAGES (1 << L2_USER_PGTABLE_ORDER) | ||
| 257 | |||
| 258 | struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) | ||
| 259 | { | ||
| 260 | int flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP; | ||
| 261 | struct page *p; | ||
| 262 | |||
| 263 | #ifdef CONFIG_HIGHPTE | ||
| 264 | flags |= __GFP_HIGHMEM; | ||
| 265 | #endif | ||
| 266 | |||
| 267 | p = alloc_pages(flags, L2_USER_PGTABLE_ORDER); | ||
| 268 | if (p == NULL) | ||
| 269 | return NULL; | ||
| 270 | |||
| 271 | pgtable_page_ctor(p); | ||
| 272 | return p; | ||
| 273 | } | ||
| 274 | |||
| 275 | /* | ||
| 276 | * Free page immediately (used in __pte_alloc if we raced with another | ||
| 277 | * process). We have to correct whatever pte_alloc_one() did before | ||
| 278 | * returning the pages to the allocator. | ||
| 279 | */ | ||
| 280 | void pte_free(struct mm_struct *mm, struct page *p) | ||
| 281 | { | ||
| 282 | pgtable_page_dtor(p); | ||
| 283 | __free_pages(p, L2_USER_PGTABLE_ORDER); | ||
| 284 | } | ||
| 285 | |||
| 286 | void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, | ||
| 287 | unsigned long address) | ||
| 288 | { | ||
| 289 | int i; | ||
| 290 | |||
| 291 | pgtable_page_dtor(pte); | ||
| 292 | tlb->need_flush = 1; | ||
| 293 | if (tlb_fast_mode(tlb)) { | ||
| 294 | struct page *pte_pages[L2_USER_PGTABLE_PAGES]; | ||
| 295 | for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i) | ||
| 296 | pte_pages[i] = pte + i; | ||
| 297 | free_pages_and_swap_cache(pte_pages, L2_USER_PGTABLE_PAGES); | ||
| 298 | return; | ||
| 299 | } | ||
| 300 | for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i) { | ||
| 301 | tlb->pages[tlb->nr++] = pte + i; | ||
| 302 | if (tlb->nr >= FREE_PTE_NR) | ||
| 303 | tlb_flush_mmu(tlb, 0, 0); | ||
| 304 | } | ||
| 305 | } | ||
| 306 | |||
| 307 | #ifndef __tilegx__ | ||
| 308 | |||
| 309 | /* | ||
| 310 | * FIXME: needs to be atomic vs hypervisor writes. For now we make the | ||
| 311 | * window of vulnerability a bit smaller by doing an unlocked 8-bit update. | ||
| 312 | */ | ||
| 313 | int ptep_test_and_clear_young(struct vm_area_struct *vma, | ||
| 314 | unsigned long addr, pte_t *ptep) | ||
| 315 | { | ||
| 316 | #if HV_PTE_INDEX_ACCESSED < 8 || HV_PTE_INDEX_ACCESSED >= 16 | ||
| 317 | # error Code assumes HV_PTE "accessed" bit in second byte | ||
| 318 | #endif | ||
| 319 | u8 *tmp = (u8 *)ptep; | ||
| 320 | u8 second_byte = tmp[1]; | ||
| 321 | if (!(second_byte & (1 << (HV_PTE_INDEX_ACCESSED - 8)))) | ||
| 322 | return 0; | ||
| 323 | tmp[1] = second_byte & ~(1 << (HV_PTE_INDEX_ACCESSED - 8)); | ||
| 324 | return 1; | ||
| 325 | } | ||
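The unlocked 8-bit update works because the #error above pins the "accessed" bit into the second byte of the (little-endian) PTE word. A user-space sketch of the same byte/bit arithmetic, with an assumed bit index:

	#include <assert.h>
	#include <stdint.h>

	#define HV_PTE_INDEX_ACCESSED 11  /* assumed; real value is in <hv/hypervisor.h> */

	int main(void)
	{
		uint64_t pte = 1ULL << HV_PTE_INDEX_ACCESSED;
		uint8_t *bytes = (uint8_t *)&pte;	/* little-endian layout */

		/* Word bit 11 is bit (11 - 8) == 3 of byte 1. */
		assert(bytes[1] & (1 << (HV_PTE_INDEX_ACCESSED - 8)));
		bytes[1] &= ~(1 << (HV_PTE_INDEX_ACCESSED - 8));
		assert(pte == 0);
		return 0;
	}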
| 326 | |||
| 327 | /* | ||
| 328 | * This implementation is atomic vs hypervisor writes, since the hypervisor | ||
| 329 | * always writes the low word (where "accessed" and "dirty" are) and this | ||
| 330 | * routine only writes the high word. | ||
| 331 | */ | ||
| 332 | void ptep_set_wrprotect(struct mm_struct *mm, | ||
| 333 | unsigned long addr, pte_t *ptep) | ||
| 334 | { | ||
| 335 | #if HV_PTE_INDEX_WRITABLE < 32 | ||
| 336 | # error Code assumes HV_PTE "writable" bit in high word | ||
| 337 | #endif | ||
| 338 | u32 *tmp = (u32 *)ptep; | ||
| 339 | tmp[1] = tmp[1] & ~(1 << (HV_PTE_INDEX_WRITABLE - 32)); | ||
| 340 | } | ||
| 341 | |||
| 342 | #endif | ||
| 343 | |||
| 344 | pte_t *virt_to_pte(struct mm_struct* mm, unsigned long addr) | ||
| 345 | { | ||
| 346 | pgd_t *pgd; | ||
| 347 | pud_t *pud; | ||
| 348 | pmd_t *pmd; | ||
| 349 | |||
| 350 | if (pgd_addr_invalid(addr)) | ||
| 351 | return NULL; | ||
| 352 | |||
| 353 | pgd = mm ? pgd_offset(mm, addr) : swapper_pg_dir + pgd_index(addr); | ||
| 354 | pud = pud_offset(pgd, addr); | ||
| 355 | if (!pud_present(*pud)) | ||
| 356 | return NULL; | ||
| 357 | pmd = pmd_offset(pud, addr); | ||
| 358 | if (pmd_huge_page(*pmd)) | ||
| 359 | return (pte_t *)pmd; | ||
| 360 | if (!pmd_present(*pmd)) | ||
| 361 | return NULL; | ||
| 362 | return pte_offset_kernel(pmd, addr); | ||
| 363 | } | ||
| 364 | |||
| 365 | pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu) | ||
| 366 | { | ||
| 367 | unsigned int width = smp_width; | ||
| 368 | int x = cpu % width; | ||
| 369 | int y = cpu / width; | ||
| 370 | BUG_ON(y >= smp_height); | ||
| 371 | BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3); | ||
| 372 | BUG_ON(cpu < 0 || cpu >= NR_CPUS); | ||
| 373 | BUG_ON(!cpu_is_valid_lotar(cpu)); | ||
| 374 | return hv_pte_set_lotar(prot, HV_XY_TO_LOTAR(x, y)); | ||
| 375 | } | ||
| 376 | |||
| 377 | int get_remote_cache_cpu(pgprot_t prot) | ||
| 378 | { | ||
| 379 | HV_LOTAR lotar = hv_pte_get_lotar(prot); | ||
| 380 | int x = HV_LOTAR_X(lotar); | ||
| 381 | int y = HV_LOTAR_Y(lotar); | ||
| 382 | BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3); | ||
| 383 | return x + y * smp_width; | ||
| 384 | } | ||
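The LOTAR encoding is just a row-major (x, y) split of the linear cpu number, so set_remote_cache_cpu() and get_remote_cache_cpu() are exact inverses. A trivial standalone check, with an assumed 8x8 mesh:

	#include <assert.h>

	int main(void)
	{
		const int width = 8;	/* assumed smp_width */

		for (int cpu = 0; cpu < width * width; cpu++) {
			int x = cpu % width, y = cpu / width;
			assert(x + y * width == cpu);	/* round-trip is exact */
		}
		return 0;
	}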
| 385 | |||
| 386 | void set_pte_order(pte_t *ptep, pte_t pte, int order) | ||
| 387 | { | ||
| 388 | unsigned long pfn = pte_pfn(pte); | ||
| 389 | struct page *page = pfn_to_page(pfn); | ||
| 390 | |||
| 391 | /* Update the home of a PTE if necessary */ | ||
| 392 | pte = pte_set_home(pte, page_home(page)); | ||
| 393 | |||
| 394 | #ifdef __tilegx__ | ||
| 395 | *ptep = pte; | ||
| 396 | #else | ||
| 397 | /* | ||
| 398 | * When setting a PTE, write the high bits first, then write | ||
| 399 | * the low bits. This sets the "present" bit only after the | ||
| 400 | * other bits are in place. If a particular PTE update | ||
| 401 | * involves transitioning from one valid PTE to another, it | ||
| 402 | * may be necessary to call set_pte_order() more than once, | ||
| 403 | * transitioning via a suitable intermediate state. | ||
| 404 | * Note that this sequence also means that if we are transitioning | ||
| 405 | * from any migrating PTE to a non-migrating one, we will not | ||
| 406 | * see a half-updated PTE with the migrating bit off. | ||
| 407 | */ | ||
| 408 | #if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32 | ||
| 409 | # error Must write the present and migrating bits last | ||
| 410 | #endif | ||
| 411 | ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32); | ||
| 412 | barrier(); | ||
| 413 | ((u32 *)ptep)[0] = (u32)(pte_val(pte)); | ||
| 414 | #endif | ||
| 415 | } | ||
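A user-space sketch of the same two-halves store ordering (little-endian assumed; the bit placement is illustrative, and __sync_synchronize() stands in for the kernel's barrier()):

	#include <stdint.h>
	#include <stdio.h>

	/* Mirror of the 32-bit store sequence above: high word, barrier, low word. */
	static void set_pte_2x32(volatile uint32_t *ptep, uint64_t pte)
	{
		ptep[1] = (uint32_t)(pte >> 32);	/* high word first */
		__sync_synchronize();			/* keep the stores ordered */
		ptep[0] = (uint32_t)pte;		/* "present" bit lands last */
	}

	int main(void)
	{
		uint64_t pte = 0;

		set_pte_2x32((volatile uint32_t *)&pte, (0x12345678ULL << 32) | 0x1);
		printf("%#llx\n", (unsigned long long)pte);
		return 0;
	}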
| 416 | |||
| 417 | /* Can this mm load a PTE with cached_priority set? */ | ||
| 418 | static inline int mm_is_priority_cached(struct mm_struct *mm) | ||
| 419 | { | ||
| 420 | return mm->context.priority_cached; | ||
| 421 | } | ||
| 422 | |||
| 423 | /* | ||
| 424 | * Add a priority mapping to an mm_context and | ||
| 425 | * notify the hypervisor if this is the first one. | ||
| 426 | */ | ||
| 427 | void start_mm_caching(struct mm_struct *mm) | ||
| 428 | { | ||
| 429 | if (!mm_is_priority_cached(mm)) { | ||
| 430 | mm->context.priority_cached = -1U; | ||
| 431 | hv_set_caching(-1U); | ||
| 432 | } | ||
| 433 | } | ||
| 434 | |||
| 435 | /* | ||
| 436 | * Validate and return the priority_cached flag. We know if it's zero | ||
| 437 | * that we don't need to scan, since we immediately set it non-zero | ||
| 438 | * when we first consider a MAP_CACHE_PRIORITY mapping. | ||
| 439 | * | ||
| 440 | * We only _try_ to acquire the mmap_sem semaphore; since we may be | ||
| 441 | * called from interrupt context (servicing switch_mm), we can't | ||
| 442 | * block on it, and on failure we don't unset "priority_cached". | ||
| 443 | * Presumably we'll come back later, have more luck, and clear | ||
| 444 | * the value then; for now we just keep the cache marked for priority. | ||
| 445 | */ | ||
| 446 | static unsigned int update_priority_cached(struct mm_struct *mm) | ||
| 447 | { | ||
| 448 | if (mm->context.priority_cached && down_write_trylock(&mm->mmap_sem)) { | ||
| 449 | struct vm_area_struct *vm; | ||
| 450 | for (vm = mm->mmap; vm; vm = vm->vm_next) { | ||
| 451 | if (hv_pte_get_cached_priority(vm->vm_page_prot)) | ||
| 452 | break; | ||
| 453 | } | ||
| 454 | if (vm == NULL) | ||
| 455 | mm->context.priority_cached = 0; | ||
| 456 | up_write(&mm->mmap_sem); | ||
| 457 | } | ||
| 458 | return mm->context.priority_cached; | ||
| 459 | } | ||
| 460 | |||
| 461 | /* Set caching correctly for an mm that we are switching to. */ | ||
| 462 | void check_mm_caching(struct mm_struct *prev, struct mm_struct *next) | ||
| 463 | { | ||
| 464 | if (!mm_is_priority_cached(next)) { | ||
| 465 | /* | ||
| 466 | * If the new mm doesn't use priority caching, just see if we | ||
| 467 | * need the hv_set_caching(), or can assume it's already zero. | ||
| 468 | */ | ||
| 469 | if (mm_is_priority_cached(prev)) | ||
| 470 | hv_set_caching(0); | ||
| 471 | } else { | ||
| 472 | hv_set_caching(update_priority_cached(next)); | ||
| 473 | } | ||
| 474 | } | ||
| 475 | |||
| 476 | #if CHIP_HAS_MMIO() | ||
| 477 | |||
| 478 | /* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */ | ||
| 479 | void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, | ||
| 480 | pgprot_t home) | ||
| 481 | { | ||
| 482 | void *addr; | ||
| 483 | struct vm_struct *area; | ||
| 484 | unsigned long offset, last_addr; | ||
| 485 | pgprot_t pgprot; | ||
| 486 | |||
| 487 | /* Don't allow wraparound or zero size */ | ||
| 488 | last_addr = phys_addr + size - 1; | ||
| 489 | if (!size || last_addr < phys_addr) | ||
| 490 | return NULL; | ||
| 491 | |||
| 492 | /* Create a read/write, MMIO VA mapping homed at the requested shim. */ | ||
| 493 | pgprot = PAGE_KERNEL; | ||
| 494 | pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO); | ||
| 495 | pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home)); | ||
| 496 | |||
| 497 | /* | ||
| 498 | * Mappings have to be page-aligned | ||
| 499 | */ | ||
| 500 | offset = phys_addr & ~PAGE_MASK; | ||
| 501 | phys_addr &= PAGE_MASK; | ||
| 502 | size = PAGE_ALIGN(last_addr+1) - phys_addr; | ||
| 503 | |||
| 504 | /* | ||
| 505 | * Ok, go for it.. | ||
| 506 | */ | ||
| 507 | area = get_vm_area(size, VM_IOREMAP /* | other flags? */); | ||
| 508 | if (!area) | ||
| 509 | return NULL; | ||
| 510 | area->phys_addr = phys_addr; | ||
| 511 | addr = area->addr; | ||
| 512 | if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, | ||
| 513 | phys_addr, pgprot)) { | ||
| 514 | remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); | ||
| 515 | return NULL; | ||
| 516 | } | ||
| 517 | return (__force void __iomem *) (offset + (char *)addr); | ||
| 518 | } | ||
| 519 | EXPORT_SYMBOL(ioremap_prot); | ||
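The alignment bookkeeping above splits a byte-granular request into a page-granular mapping plus an offset back into it. A standalone sketch of just that arithmetic (4 KB pages and the physical address are assumptions for illustration):

	#include <stdio.h>

	#define PAGE_SIZE	4096UL
	#define PAGE_MASK	(~(PAGE_SIZE - 1))
	#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & PAGE_MASK)

	int main(void)
	{
		unsigned long phys_addr = 0x12345678UL, size = 0x100;
		unsigned long last_addr = phys_addr + size - 1;
		unsigned long offset = phys_addr & ~PAGE_MASK;

		phys_addr &= PAGE_MASK;
		size = PAGE_ALIGN(last_addr + 1) - phys_addr;
		/* Maps 0x1000 bytes at 0x12345000; caller gets base + 0x678. */
		printf("map %#lx bytes at %#lx, return base + %#lx\n",
		       size, phys_addr, offset);
		return 0;
	}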
| 520 | |||
| 521 | /* Map a PCI MMIO bus address into VA space. */ | ||
| 522 | void __iomem *ioremap(resource_size_t phys_addr, unsigned long size) | ||
| 523 | { | ||
| 524 | panic("ioremap for PCI MMIO is not supported"); | ||
| 525 | } | ||
| 526 | EXPORT_SYMBOL(ioremap); | ||
| 527 | |||
| 528 | /* Unmap an MMIO VA mapping. */ | ||
| 529 | void iounmap(volatile void __iomem *addr_in) | ||
| 530 | { | ||
| 531 | volatile void __iomem *addr = (volatile void __iomem *) | ||
| 532 | (PAGE_MASK & (unsigned long __force)addr_in); | ||
| 533 | #if 1 | ||
| 534 | vunmap((void * __force)addr); | ||
| 535 | #else | ||
| 536 | /* x86 uses this complicated flow instead of vunmap(). Is | ||
| 537 | * there any particular reason we should do the same? */ | ||
| 538 | struct vm_struct *p, *o; | ||
| 539 | |||
| 540 | /* Use the vm area unlocked, assuming the caller | ||
| 541 | ensures there isn't another iounmap for the same address | ||
| 542 | in parallel. Reuse of the virtual address is prevented by | ||
| 543 | leaving it in the global lists until we're done with it. | ||
| 544 | cpa takes care of the direct mappings. */ | ||
| 545 | read_lock(&vmlist_lock); | ||
| 546 | for (p = vmlist; p; p = p->next) { | ||
| 547 | if (p->addr == addr) | ||
| 548 | break; | ||
| 549 | } | ||
| 550 | read_unlock(&vmlist_lock); | ||
| 551 | |||
| 552 | if (!p) { | ||
| 553 | printk("iounmap: bad address %p\n", addr); | ||
| 554 | dump_stack(); | ||
| 555 | return; | ||
| 556 | } | ||
| 557 | |||
| 558 | /* Finally remove it */ | ||
| 559 | o = remove_vm_area((void *)addr); | ||
| 560 | BUG_ON(p != o || o == NULL); | ||
| 561 | kfree(p); | ||
| 562 | #endif | ||
| 563 | } | ||
| 564 | EXPORT_SYMBOL(iounmap); | ||
| 565 | |||
| 566 | #endif /* CHIP_HAS_MMIO() */ | ||
