diff options
Diffstat (limited to 'arch/ppc64/mm')
-rw-r--r-- | arch/ppc64/mm/Makefile | 11 | ||||
-rw-r--r-- | arch/ppc64/mm/fault.c | 312 | ||||
-rw-r--r-- | arch/ppc64/mm/hash_low.S | 287 | ||||
-rw-r--r-- | arch/ppc64/mm/hash_native.c | 423 | ||||
-rw-r--r-- | arch/ppc64/mm/hash_utils.c | 439 | ||||
-rw-r--r-- | arch/ppc64/mm/hugetlbpage.c | 904 | ||||
-rw-r--r-- | arch/ppc64/mm/imalloc.c | 312 | ||||
-rw-r--r-- | arch/ppc64/mm/init.c | 927 | ||||
-rw-r--r-- | arch/ppc64/mm/mmap.c | 86 | ||||
-rw-r--r-- | arch/ppc64/mm/numa.c | 734 | ||||
-rw-r--r-- | arch/ppc64/mm/slb.c | 159 | ||||
-rw-r--r-- | arch/ppc64/mm/slb_low.S | 154 | ||||
-rw-r--r-- | arch/ppc64/mm/stab.c | 239 | ||||
-rw-r--r-- | arch/ppc64/mm/tlb.c | 180 |
14 files changed, 5167 insertions, 0 deletions
diff --git a/arch/ppc64/mm/Makefile b/arch/ppc64/mm/Makefile new file mode 100644 index 00000000000..ac522d57b2a --- /dev/null +++ b/arch/ppc64/mm/Makefile | |||
@@ -0,0 +1,11 @@ | |||
1 | # | ||
2 | # Makefile for the linux ppc-specific parts of the memory manager. | ||
3 | # | ||
4 | |||
5 | EXTRA_CFLAGS += -mno-minimal-toc | ||
6 | |||
7 | obj-y := fault.o init.o imalloc.o hash_utils.o hash_low.o tlb.o \ | ||
8 | slb_low.o slb.o stab.o mmap.o | ||
9 | obj-$(CONFIG_DISCONTIGMEM) += numa.o | ||
10 | obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o | ||
11 | obj-$(CONFIG_PPC_MULTIPLATFORM) += hash_native.o | ||
diff --git a/arch/ppc64/mm/fault.c b/arch/ppc64/mm/fault.c new file mode 100644 index 00000000000..20b0f37e8bf --- /dev/null +++ b/arch/ppc64/mm/fault.c | |||
@@ -0,0 +1,312 @@ | |||
1 | /* | ||
2 | * arch/ppc64/mm/fault.c | ||
3 | * | ||
4 | * PowerPC version | ||
5 | * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) | ||
6 | * | ||
7 | * Derived from "arch/i386/mm/fault.c" | ||
8 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | ||
9 | * | ||
10 | * Modified by Cort Dougan and Paul Mackerras. | ||
11 | * | ||
12 | * Modified for PPC64 by Dave Engebretsen (engebret@ibm.com) | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or | ||
15 | * modify it under the terms of the GNU General Public License | ||
16 | * as published by the Free Software Foundation; either version | ||
17 | * 2 of the License, or (at your option) any later version. | ||
18 | */ | ||
19 | |||
20 | #include <linux/config.h> | ||
21 | #include <linux/signal.h> | ||
22 | #include <linux/sched.h> | ||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/errno.h> | ||
25 | #include <linux/string.h> | ||
26 | #include <linux/types.h> | ||
27 | #include <linux/mman.h> | ||
28 | #include <linux/mm.h> | ||
29 | #include <linux/interrupt.h> | ||
30 | #include <linux/smp_lock.h> | ||
31 | #include <linux/module.h> | ||
32 | |||
33 | #include <asm/page.h> | ||
34 | #include <asm/pgtable.h> | ||
35 | #include <asm/mmu.h> | ||
36 | #include <asm/mmu_context.h> | ||
37 | #include <asm/system.h> | ||
38 | #include <asm/uaccess.h> | ||
39 | #include <asm/kdebug.h> | ||
40 | |||
41 | /* | ||
42 | * Check whether the instruction at regs->nip is a store using | ||
43 | * an update addressing form which will update r1. | ||
44 | */ | ||
45 | static int store_updates_sp(struct pt_regs *regs) | ||
46 | { | ||
47 | unsigned int inst; | ||
48 | |||
49 | if (get_user(inst, (unsigned int __user *)regs->nip)) | ||
50 | return 0; | ||
51 | /* check for 1 in the rA field */ | ||
52 | if (((inst >> 16) & 0x1f) != 1) | ||
53 | return 0; | ||
54 | /* check major opcode */ | ||
55 | switch (inst >> 26) { | ||
56 | case 37: /* stwu */ | ||
57 | case 39: /* stbu */ | ||
58 | case 45: /* sthu */ | ||
59 | case 53: /* stfsu */ | ||
60 | case 55: /* stfdu */ | ||
61 | return 1; | ||
62 | case 62: /* std or stdu */ | ||
63 | return (inst & 3) == 1; | ||
64 | case 31: | ||
65 | /* check minor opcode */ | ||
66 | switch ((inst >> 1) & 0x3ff) { | ||
67 | case 181: /* stdux */ | ||
68 | case 183: /* stwux */ | ||
69 | case 247: /* stbux */ | ||
70 | case 439: /* sthux */ | ||
71 | case 695: /* stfsux */ | ||
72 | case 759: /* stfdux */ | ||
73 | return 1; | ||
74 | } | ||
75 | } | ||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | /* | ||
80 | * The error_code parameter is | ||
81 | * - DSISR for a non-SLB data access fault, | ||
82 | * - SRR1 & 0x08000000 for a non-SLB instruction access fault | ||
83 | * - 0 any SLB fault. | ||
84 | * The return value is 0 if the fault was handled, or the signal | ||
85 | * number if this is a kernel fault that can't be handled here. | ||
86 | */ | ||
87 | int do_page_fault(struct pt_regs *regs, unsigned long address, | ||
88 | unsigned long error_code) | ||
89 | { | ||
90 | struct vm_area_struct * vma; | ||
91 | struct mm_struct *mm = current->mm; | ||
92 | siginfo_t info; | ||
93 | unsigned long code = SEGV_MAPERR; | ||
94 | unsigned long is_write = error_code & DSISR_ISSTORE; | ||
95 | unsigned long trap = TRAP(regs); | ||
96 | unsigned long is_exec = trap == 0x400; | ||
97 | |||
98 | BUG_ON((trap == 0x380) || (trap == 0x480)); | ||
99 | |||
100 | if (notify_die(DIE_PAGE_FAULT, "page_fault", regs, error_code, | ||
101 | 11, SIGSEGV) == NOTIFY_STOP) | ||
102 | return 0; | ||
103 | |||
104 | if (trap == 0x300) { | ||
105 | if (debugger_fault_handler(regs)) | ||
106 | return 0; | ||
107 | } | ||
108 | |||
109 | /* On a kernel SLB miss we can only check for a valid exception entry */ | ||
110 | if (!user_mode(regs) && (address >= TASK_SIZE)) | ||
111 | return SIGSEGV; | ||
112 | |||
113 | if (error_code & DSISR_DABRMATCH) { | ||
114 | if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code, | ||
115 | 11, SIGSEGV) == NOTIFY_STOP) | ||
116 | return 0; | ||
117 | if (debugger_dabr_match(regs)) | ||
118 | return 0; | ||
119 | } | ||
120 | |||
121 | if (in_atomic() || mm == NULL) { | ||
122 | if (!user_mode(regs)) | ||
123 | return SIGSEGV; | ||
124 | /* in_atomic() in user mode is really bad, | ||
125 | as is current->mm == NULL. */ | ||
126 | printk(KERN_EMERG "Page fault in user mode with" | ||
127 | "in_atomic() = %d mm = %p\n", in_atomic(), mm); | ||
128 | printk(KERN_EMERG "NIP = %lx MSR = %lx\n", | ||
129 | regs->nip, regs->msr); | ||
130 | die("Weird page fault", regs, SIGSEGV); | ||
131 | } | ||
132 | |||
133 | /* When running in the kernel we expect faults to occur only to | ||
134 | * addresses in user space. All other faults represent errors in the | ||
135 | * kernel and should generate an OOPS. Unfortunatly, in the case of an | ||
136 | * erroneous fault occuring in a code path which already holds mmap_sem | ||
137 | * we will deadlock attempting to validate the fault against the | ||
138 | * address space. Luckily the kernel only validly references user | ||
139 | * space from well defined areas of code, which are listed in the | ||
140 | * exceptions table. | ||
141 | * | ||
142 | * As the vast majority of faults will be valid we will only perform | ||
143 | * the source reference check when there is a possibilty of a deadlock. | ||
144 | * Attempt to lock the address space, if we cannot we then validate the | ||
145 | * source. If this is invalid we can skip the address space check, | ||
146 | * thus avoiding the deadlock. | ||
147 | */ | ||
148 | if (!down_read_trylock(&mm->mmap_sem)) { | ||
149 | if (!user_mode(regs) && !search_exception_tables(regs->nip)) | ||
150 | goto bad_area_nosemaphore; | ||
151 | |||
152 | down_read(&mm->mmap_sem); | ||
153 | } | ||
154 | |||
155 | vma = find_vma(mm, address); | ||
156 | if (!vma) | ||
157 | goto bad_area; | ||
158 | |||
159 | if (vma->vm_start <= address) { | ||
160 | goto good_area; | ||
161 | } | ||
162 | if (!(vma->vm_flags & VM_GROWSDOWN)) | ||
163 | goto bad_area; | ||
164 | |||
165 | /* | ||
166 | * N.B. The POWER/Open ABI allows programs to access up to | ||
167 | * 288 bytes below the stack pointer. | ||
168 | * The kernel signal delivery code writes up to about 1.5kB | ||
169 | * below the stack pointer (r1) before decrementing it. | ||
170 | * The exec code can write slightly over 640kB to the stack | ||
171 | * before setting the user r1. Thus we allow the stack to | ||
172 | * expand to 1MB without further checks. | ||
173 | */ | ||
174 | if (address + 0x100000 < vma->vm_end) { | ||
175 | /* get user regs even if this fault is in kernel mode */ | ||
176 | struct pt_regs *uregs = current->thread.regs; | ||
177 | if (uregs == NULL) | ||
178 | goto bad_area; | ||
179 | |||
180 | /* | ||
181 | * A user-mode access to an address a long way below | ||
182 | * the stack pointer is only valid if the instruction | ||
183 | * is one which would update the stack pointer to the | ||
184 | * address accessed if the instruction completed, | ||
185 | * i.e. either stwu rs,n(r1) or stwux rs,r1,rb | ||
186 | * (or the byte, halfword, float or double forms). | ||
187 | * | ||
188 | * If we don't check this then any write to the area | ||
189 | * between the last mapped region and the stack will | ||
190 | * expand the stack rather than segfaulting. | ||
191 | */ | ||
192 | if (address + 2048 < uregs->gpr[1] | ||
193 | && (!user_mode(regs) || !store_updates_sp(regs))) | ||
194 | goto bad_area; | ||
195 | } | ||
196 | |||
197 | if (expand_stack(vma, address)) | ||
198 | goto bad_area; | ||
199 | |||
200 | good_area: | ||
201 | code = SEGV_ACCERR; | ||
202 | |||
203 | if (is_exec) { | ||
204 | /* protection fault */ | ||
205 | if (error_code & DSISR_PROTFAULT) | ||
206 | goto bad_area; | ||
207 | if (!(vma->vm_flags & VM_EXEC)) | ||
208 | goto bad_area; | ||
209 | /* a write */ | ||
210 | } else if (is_write) { | ||
211 | if (!(vma->vm_flags & VM_WRITE)) | ||
212 | goto bad_area; | ||
213 | /* a read */ | ||
214 | } else { | ||
215 | if (!(vma->vm_flags & VM_READ)) | ||
216 | goto bad_area; | ||
217 | } | ||
218 | |||
219 | survive: | ||
220 | /* | ||
221 | * If for any reason at all we couldn't handle the fault, | ||
222 | * make sure we exit gracefully rather than endlessly redo | ||
223 | * the fault. | ||
224 | */ | ||
225 | switch (handle_mm_fault(mm, vma, address, is_write)) { | ||
226 | |||
227 | case VM_FAULT_MINOR: | ||
228 | current->min_flt++; | ||
229 | break; | ||
230 | case VM_FAULT_MAJOR: | ||
231 | current->maj_flt++; | ||
232 | break; | ||
233 | case VM_FAULT_SIGBUS: | ||
234 | goto do_sigbus; | ||
235 | case VM_FAULT_OOM: | ||
236 | goto out_of_memory; | ||
237 | default: | ||
238 | BUG(); | ||
239 | } | ||
240 | |||
241 | up_read(&mm->mmap_sem); | ||
242 | return 0; | ||
243 | |||
244 | bad_area: | ||
245 | up_read(&mm->mmap_sem); | ||
246 | |||
247 | bad_area_nosemaphore: | ||
248 | /* User mode accesses cause a SIGSEGV */ | ||
249 | if (user_mode(regs)) { | ||
250 | info.si_signo = SIGSEGV; | ||
251 | info.si_errno = 0; | ||
252 | info.si_code = code; | ||
253 | info.si_addr = (void __user *) address; | ||
254 | force_sig_info(SIGSEGV, &info, current); | ||
255 | return 0; | ||
256 | } | ||
257 | |||
258 | if (trap == 0x400 && (error_code & DSISR_PROTFAULT) | ||
259 | && printk_ratelimit()) | ||
260 | printk(KERN_CRIT "kernel tried to execute NX-protected" | ||
261 | " page (%lx) - exploit attempt? (uid: %d)\n", | ||
262 | address, current->uid); | ||
263 | |||
264 | return SIGSEGV; | ||
265 | |||
266 | /* | ||
267 | * We ran out of memory, or some other thing happened to us that made | ||
268 | * us unable to handle the page fault gracefully. | ||
269 | */ | ||
270 | out_of_memory: | ||
271 | up_read(&mm->mmap_sem); | ||
272 | if (current->pid == 1) { | ||
273 | yield(); | ||
274 | down_read(&mm->mmap_sem); | ||
275 | goto survive; | ||
276 | } | ||
277 | printk("VM: killing process %s\n", current->comm); | ||
278 | if (user_mode(regs)) | ||
279 | do_exit(SIGKILL); | ||
280 | return SIGKILL; | ||
281 | |||
282 | do_sigbus: | ||
283 | up_read(&mm->mmap_sem); | ||
284 | if (user_mode(regs)) { | ||
285 | info.si_signo = SIGBUS; | ||
286 | info.si_errno = 0; | ||
287 | info.si_code = BUS_ADRERR; | ||
288 | info.si_addr = (void __user *)address; | ||
289 | force_sig_info(SIGBUS, &info, current); | ||
290 | return 0; | ||
291 | } | ||
292 | return SIGBUS; | ||
293 | } | ||
294 | |||
295 | /* | ||
296 | * bad_page_fault is called when we have a bad access from the kernel. | ||
297 | * It is called from do_page_fault above and from some of the procedures | ||
298 | * in traps.c. | ||
299 | */ | ||
300 | void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) | ||
301 | { | ||
302 | const struct exception_table_entry *entry; | ||
303 | |||
304 | /* Are we prepared to handle this fault? */ | ||
305 | if ((entry = search_exception_tables(regs->nip)) != NULL) { | ||
306 | regs->nip = entry->fixup; | ||
307 | return; | ||
308 | } | ||
309 | |||
310 | /* kernel has accessed a bad area */ | ||
311 | die("Kernel access of bad area", regs, sig); | ||
312 | } | ||
diff --git a/arch/ppc64/mm/hash_low.S b/arch/ppc64/mm/hash_low.S new file mode 100644 index 00000000000..8c0156a3700 --- /dev/null +++ b/arch/ppc64/mm/hash_low.S | |||
@@ -0,0 +1,287 @@ | |||
1 | /* | ||
2 | * ppc64 MMU hashtable management routines | ||
3 | * | ||
4 | * (c) Copyright IBM Corp. 2003 | ||
5 | * | ||
6 | * Maintained by: Benjamin Herrenschmidt | ||
7 | * <benh@kernel.crashing.org> | ||
8 | * | ||
9 | * This file is covered by the GNU Public Licence v2 as | ||
10 | * described in the kernel's COPYING file. | ||
11 | */ | ||
12 | |||
13 | #include <asm/processor.h> | ||
14 | #include <asm/pgtable.h> | ||
15 | #include <asm/mmu.h> | ||
16 | #include <asm/page.h> | ||
17 | #include <asm/types.h> | ||
18 | #include <asm/ppc_asm.h> | ||
19 | #include <asm/offsets.h> | ||
20 | #include <asm/cputable.h> | ||
21 | |||
22 | .text | ||
23 | |||
24 | /* | ||
25 | * Stackframe: | ||
26 | * | ||
27 | * +-> Back chain (SP + 256) | ||
28 | * | General register save area (SP + 112) | ||
29 | * | Parameter save area (SP + 48) | ||
30 | * | TOC save area (SP + 40) | ||
31 | * | link editor doubleword (SP + 32) | ||
32 | * | compiler doubleword (SP + 24) | ||
33 | * | LR save area (SP + 16) | ||
34 | * | CR save area (SP + 8) | ||
35 | * SP ---> +-- Back chain (SP + 0) | ||
36 | */ | ||
37 | #define STACKFRAMESIZE 256 | ||
38 | |||
39 | /* Save parameters offsets */ | ||
40 | #define STK_PARM(i) (STACKFRAMESIZE + 48 + ((i)-3)*8) | ||
41 | |||
42 | /* Save non-volatile offsets */ | ||
43 | #define STK_REG(i) (112 + ((i)-14)*8) | ||
44 | |||
45 | /* | ||
46 | * __hash_page(unsigned long ea, unsigned long access, unsigned long vsid, | ||
47 | * pte_t *ptep, unsigned long trap, int local) | ||
48 | * | ||
49 | * Adds a page to the hash table. This is the non-LPAR version for now. Returns in r3: 0 on success, 1 if the Linux PTE does not grant the requested access, -1 if the HPTE insertion failed critically. | ||
50 | */ | ||
51 | |||
52 | _GLOBAL(__hash_page) | ||
53 | mflr r0 | ||
54 | std r0,16(r1) | ||
55 | stdu r1,-STACKFRAMESIZE(r1) | ||
56 | /* Save all params that we need after a function call (ptep in r6, "local" in r8) */ | ||
57 | std r6,STK_PARM(r6)(r1) | ||
58 | std r8,STK_PARM(r8)(r1) | ||
59 | |||
60 | /* Add _PAGE_PRESENT to access */ | ||
61 | ori r4,r4,_PAGE_PRESENT | ||
62 | |||
63 | /* Save non-volatile registers. | ||
64 | * r31 will hold "old PTE" | ||
65 | * r30 is "new PTE" | ||
66 | * r29 is "va" | ||
67 | * r28 is a hash value | ||
68 | * r27 is hashtab mask (maybe dynamic patched instead ?) | ||
69 | */ | ||
70 | std r27,STK_REG(r27)(r1) | ||
71 | std r28,STK_REG(r28)(r1) | ||
72 | std r29,STK_REG(r29)(r1) | ||
73 | std r30,STK_REG(r30)(r1) | ||
74 | std r31,STK_REG(r31)(r1) | ||
75 | |||
76 | /* Step 1: | ||
77 | * | ||
78 | * Check permissions, atomically mark the linux PTE busy | ||
79 | * and hashed. | ||
80 | */ | ||
81 | 1: | ||
82 | ldarx r31,0,r6 | ||
83 | /* Check access rights (access & ~(pte_val(*ptep))) */ | ||
84 | andc. r0,r4,r31 | ||
85 | bne- htab_wrong_access | ||
86 | /* Check if PTE is busy; if so, spin by reloading it */ | ||
87 | andi. r0,r31,_PAGE_BUSY | ||
88 | bne- 1b | ||
89 | /* Prepare new PTE value (turn access RW into DIRTY, then | ||
90 | * add BUSY,HASHPTE and ACCESSED) | ||
91 | */ | ||
92 | rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */ | ||
93 | or r30,r30,r31 | ||
94 | ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE | ||
95 | /* Write the linux PTE atomically (setting busy) */ | ||
96 | stdcx. r30,0,r6 | ||
97 | bne- 1b | ||
98 | isync | ||
99 | |||
100 | /* Step 2: | ||
101 | * | ||
102 | * Insert/Update the HPTE in the hash table. At this point, | ||
103 | * r4 (access) is re-useable, we use it for the new HPTE flags | ||
104 | */ | ||
105 | |||
106 | /* Calc va and put it in r29 */ | ||
107 | rldicr r29,r5,28,63-28 | ||
108 | rldicl r3,r3,0,36 | ||
109 | or r29,r3,r29 | ||
110 | |||
111 | /* Calculate hash value for primary slot and store it in r28 */ | ||
112 | rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */ | ||
113 | rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */ | ||
114 | xor r28,r5,r0 | ||
115 | |||
116 | /* Convert linux PTE bits into HW equivalents */ | ||
117 | andi. r3,r30,0x1fe /* Get basic set of flags */ | ||
118 | xori r3,r3,HW_NO_EXEC /* _PAGE_EXEC -> NOEXEC */ | ||
119 | rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ | ||
120 | rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */ | ||
121 | and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY -> r0 bit 30 */ | ||
122 | andc r0,r30,r0 /* r0 = pte & ~r0 */ | ||
123 | rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */ | ||
124 | |||
125 | /* We eventually do the icache sync here (maybe inline that | ||
126 | * code rather than call a C function...) | ||
127 | */ | ||
128 | BEGIN_FTR_SECTION | ||
129 | BEGIN_FTR_SECTION | ||
130 | mr r4,r30 | ||
131 | mr r5,r7 | ||
132 | bl .hash_page_do_lazy_icache | ||
133 | END_FTR_SECTION_IFSET(CPU_FTR_NOEXECUTE) | ||
134 | END_FTR_SECTION_IFCLR(CPU_FTR_COHERENT_ICACHE) | ||
135 | |||
136 | /* At this point, r3 contains new PP bits, save them in | ||
137 | * place of "access" in the param area (sic) | ||
138 | */ | ||
139 | std r3,STK_PARM(r4)(r1) | ||
140 | |||
141 | /* Get htab_hash_mask */ | ||
142 | ld r4,htab_hash_mask@got(2) | ||
143 | ld r27,0(r4) /* htab_hash_mask -> r27 */ | ||
144 | |||
145 | /* Check if we may already be in the hashtable, in this case, we | ||
146 | * go to out-of-line code to try to modify the HPTE | ||
147 | */ | ||
148 | andi. r0,r31,_PAGE_HASHPTE | ||
149 | bne htab_modify_pte | ||
150 | |||
151 | htab_insert_pte: | ||
152 | /* Clear hpte bits in new pte (we also clear BUSY btw) and | ||
153 | * add _PAGE_HASHPTE | ||
154 | */ | ||
155 | lis r0,_PAGE_HPTEFLAGS@h | ||
156 | ori r0,r0,_PAGE_HPTEFLAGS@l | ||
157 | andc r30,r30,r0 | ||
158 | ori r30,r30,_PAGE_HASHPTE | ||
159 | |||
160 | /* page number in r5 */ | ||
161 | rldicl r5,r31,64-PTE_SHIFT,PTE_SHIFT | ||
162 | |||
163 | /* Calculate primary group hash */ | ||
164 | and r0,r28,r27 | ||
165 | rldicr r3,r0,3,63-3 /* r3 = (hash & mask) << 3 */ | ||
166 | |||
167 | /* Call ppc_md.hpte_insert */ | ||
168 | ld r7,STK_PARM(r4)(r1) /* Retrieve new pp bits */ | ||
169 | mr r4,r29 /* Retrieve va */ | ||
170 | li r6,0 /* primary slot */ | ||
171 | li r8,0 /* not bolted and not large */ | ||
172 | li r9,0 | ||
173 | _GLOBAL(htab_call_hpte_insert1) | ||
174 | bl . /* Will be patched by htab_finish_init() */ | ||
175 | cmpdi 0,r3,0 | ||
176 | bge htab_pte_insert_ok /* Insertion successful */ | ||
177 | cmpdi 0,r3,-2 /* Critical failure */ | ||
178 | beq- htab_pte_insert_failure | ||
179 | |||
180 | /* Now try secondary slot */ | ||
181 | |||
182 | /* page number in r5 */ | ||
183 | rldicl r5,r31,64-PTE_SHIFT,PTE_SHIFT | ||
184 | |||
185 | /* Calculate secondary group hash */ | ||
186 | andc r0,r27,r28 | ||
187 | rldicr r3,r0,3,63-3 /* r3 = (~hash & mask) << 3 */ | ||
188 | |||
189 | /* Call ppc_md.hpte_insert */ | ||
190 | ld r7,STK_PARM(r4)(r1) /* Retrieve new pp bits */ | ||
191 | mr r4,r29 /* Retrieve va */ | ||
192 | li r6,1 /* secondary slot */ | ||
193 | li r8,0 /* not bolted and not large */ | ||
194 | li r9,0 | ||
195 | _GLOBAL(htab_call_hpte_insert2) | ||
196 | bl . /* Will be patched by htab_finish_init() */ | ||
197 | cmpdi 0,r3,0 | ||
198 | bge+ htab_pte_insert_ok /* Insertion successful */ | ||
199 | cmpdi 0,r3,-2 /* Critical failure */ | ||
200 | beq- htab_pte_insert_failure | ||
201 | |||
202 | /* Both are full, we need to evict something */ | ||
203 | mftb r0 | ||
204 | /* Pick a random group based on TB (low bit set: primary hash, clear: inverted hash) */ | ||
205 | andi. r0,r0,1 | ||
206 | mr r5,r28 | ||
207 | bne 2f | ||
208 | not r5,r5 | ||
209 | 2: and r0,r5,r27 | ||
210 | rldicr r3,r0,3,63-3 /* r3 = (hash & mask) << 3 */ | ||
211 | /* Call ppc_md.hpte_remove */ | ||
212 | _GLOBAL(htab_call_hpte_remove) | ||
213 | bl . /* Will be patched by htab_finish_init() */ | ||
214 | |||
215 | /* Try all again */ | ||
216 | b htab_insert_pte | ||
217 | |||
218 | htab_pte_insert_ok: | ||
219 | /* Insert slot number & secondary bit in PTE */ | ||
220 | rldimi r30,r3,12,63-15 | ||
221 | |||
222 | /* Write out the PTE with a normal write | ||
223 | * (adding an eieio may still be a good idea ?) | ||
224 | */ | ||
225 | htab_write_out_pte: | ||
226 | ld r6,STK_PARM(r6)(r1) | ||
227 | std r30,0(r6) | ||
228 | li r3, 0 | ||
229 | bail: | ||
230 | ld r27,STK_REG(r27)(r1) | ||
231 | ld r28,STK_REG(r28)(r1) | ||
232 | ld r29,STK_REG(r29)(r1) | ||
233 | ld r30,STK_REG(r30)(r1) | ||
234 | ld r31,STK_REG(r31)(r1) | ||
235 | addi r1,r1,STACKFRAMESIZE | ||
236 | ld r0,16(r1) | ||
237 | mtlr r0 | ||
238 | blr | ||
239 | |||
240 | htab_modify_pte: | ||
241 | /* Keep PP bits in r4 and slot idx from the PTE around in r3 */ | ||
242 | mr r4,r3 | ||
243 | rlwinm r3,r31,32-12,29,31 | ||
244 | |||
245 | /* Secondary group ? if yes, get an inverted hash value */ | ||
246 | mr r5,r28 | ||
247 | andi. r0,r31,_PAGE_SECONDARY | ||
248 | beq 1f | ||
249 | not r5,r5 | ||
250 | 1: | ||
251 | /* Calculate proper slot value for ppc_md.hpte_updatepp */ | ||
252 | and r0,r5,r27 | ||
253 | rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */ | ||
254 | add r3,r0,r3 /* add slot idx */ | ||
255 | |||
256 | /* Call ppc_md.hpte_updatepp */ | ||
257 | mr r5,r29 /* va */ | ||
258 | li r6,0 /* large is 0 */ | ||
259 | ld r7,STK_PARM(r8)(r1) /* get "local" param */ | ||
260 | _GLOBAL(htab_call_hpte_updatepp) | ||
261 | bl . /* Will be patched by htab_finish_init() */ | ||
262 | |||
263 | /* If the update failed (typically because the HPTE wasn't really | ||
264 | * there), we try an insertion. | ||
265 | */ | ||
266 | cmpdi 0,r3,-1 | ||
267 | beq- htab_insert_pte | ||
268 | |||
269 | /* Clear the BUSY bit and Write out the PTE */ | ||
270 | li r0,_PAGE_BUSY | ||
271 | andc r30,r30,r0 | ||
272 | b htab_write_out_pte | ||
273 | |||
274 | htab_wrong_access: | ||
275 | /* Bail out clearing reservation; return 1 in r3 */ | ||
276 | stdcx. r31,0,r6 | ||
277 | li r3,1 | ||
278 | b bail | ||
279 | |||
280 | htab_pte_insert_failure: | ||
281 | /* Bail out restoring old PTE; return -1 in r3 */ | ||
282 | ld r6,STK_PARM(r6)(r1) | ||
283 | std r31,0(r6) | ||
284 | li r3,-1 | ||
285 | b bail | ||
286 | |||
287 | |||
diff --git a/arch/ppc64/mm/hash_native.c b/arch/ppc64/mm/hash_native.c new file mode 100644 index 00000000000..144657e0c3d --- /dev/null +++ b/arch/ppc64/mm/hash_native.c | |||
@@ -0,0 +1,423 @@ | |||
1 | /* | ||
2 | * native hashtable management. | ||
3 | * | ||
4 | * SMP scalability work: | ||
5 | * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | */ | ||
12 | #include <linux/spinlock.h> | ||
13 | #include <linux/bitops.h> | ||
14 | #include <linux/threads.h> | ||
15 | #include <linux/smp.h> | ||
16 | |||
17 | #include <asm/abs_addr.h> | ||
18 | #include <asm/machdep.h> | ||
19 | #include <asm/mmu.h> | ||
20 | #include <asm/mmu_context.h> | ||
21 | #include <asm/pgtable.h> | ||
22 | #include <asm/tlbflush.h> | ||
23 | #include <asm/tlb.h> | ||
24 | #include <asm/cputable.h> | ||
25 | |||
26 | #define HPTE_LOCK_BIT 3 | ||
27 | |||
28 | static DEFINE_SPINLOCK(native_tlbie_lock); | ||
29 | |||
30 | static inline void native_lock_hpte(HPTE *hptep) | ||
31 | { | ||
32 | unsigned long *word = &hptep->dw0.dword0; | ||
33 | |||
34 | while (1) { | ||
35 | if (!test_and_set_bit(HPTE_LOCK_BIT, word)) | ||
36 | break; | ||
37 | while(test_bit(HPTE_LOCK_BIT, word)) | ||
38 | cpu_relax(); | ||
39 | } | ||
40 | } | ||
41 | |||
42 | static inline void native_unlock_hpte(HPTE *hptep) | ||
43 | { | ||
44 | unsigned long *word = &hptep->dw0.dword0; | ||
45 | |||
46 | asm volatile("lwsync":::"memory"); | ||
47 | clear_bit(HPTE_LOCK_BIT, word); | ||
48 | } | ||
49 | |||
50 | long native_hpte_insert(unsigned long hpte_group, unsigned long va, | ||
51 | unsigned long prpn, int secondary, | ||
52 | unsigned long hpteflags, int bolted, int large) | ||
53 | { | ||
54 | unsigned long arpn = physRpn_to_absRpn(prpn); | ||
55 | HPTE *hptep = htab_address + hpte_group; | ||
56 | Hpte_dword0 dw0; | ||
57 | HPTE lhpte; | ||
58 | int i; | ||
59 | |||
60 | for (i = 0; i < HPTES_PER_GROUP; i++) { | ||
61 | dw0 = hptep->dw0.dw0; | ||
62 | |||
63 | if (!dw0.v) { | ||
64 | /* retry with lock held */ | ||
65 | native_lock_hpte(hptep); | ||
66 | dw0 = hptep->dw0.dw0; | ||
67 | if (!dw0.v) | ||
68 | break; | ||
69 | native_unlock_hpte(hptep); | ||
70 | } | ||
71 | |||
72 | hptep++; | ||
73 | } | ||
74 | |||
75 | if (i == HPTES_PER_GROUP) | ||
76 | return -1; | ||
77 | |||
78 | lhpte.dw1.dword1 = 0; | ||
79 | lhpte.dw1.dw1.rpn = arpn; | ||
80 | lhpte.dw1.flags.flags = hpteflags; | ||
81 | |||
82 | lhpte.dw0.dword0 = 0; | ||
83 | lhpte.dw0.dw0.avpn = va >> 23; | ||
84 | lhpte.dw0.dw0.h = secondary; | ||
85 | lhpte.dw0.dw0.bolted = bolted; | ||
86 | lhpte.dw0.dw0.v = 1; | ||
87 | |||
88 | if (large) { | ||
89 | lhpte.dw0.dw0.l = 1; | ||
90 | lhpte.dw0.dw0.avpn &= ~0x1UL; | ||
91 | } | ||
92 | |||
93 | hptep->dw1.dword1 = lhpte.dw1.dword1; | ||
94 | |||
95 | /* Guarantee the second dword is visible before the valid bit */ | ||
96 | __asm__ __volatile__ ("eieio" : : : "memory"); | ||
97 | |||
98 | /* | ||
99 | * Now set the first dword including the valid bit | ||
100 | * NOTE: this also unlocks the hpte | ||
101 | */ | ||
102 | hptep->dw0.dword0 = lhpte.dw0.dword0; | ||
103 | |||
104 | __asm__ __volatile__ ("ptesync" : : : "memory"); | ||
105 | |||
106 | return i | (secondary << 3); | ||
107 | } | ||
108 | |||
109 | static long native_hpte_remove(unsigned long hpte_group) | ||
110 | { | ||
111 | HPTE *hptep; | ||
112 | Hpte_dword0 dw0; | ||
113 | int i; | ||
114 | int slot_offset; | ||
115 | |||
116 | /* pick a random entry to start at */ | ||
117 | slot_offset = mftb() & 0x7; | ||
118 | |||
119 | for (i = 0; i < HPTES_PER_GROUP; i++) { | ||
120 | hptep = htab_address + hpte_group + slot_offset; | ||
121 | dw0 = hptep->dw0.dw0; | ||
122 | |||
123 | if (dw0.v && !dw0.bolted) { | ||
124 | /* retry with lock held */ | ||
125 | native_lock_hpte(hptep); | ||
126 | dw0 = hptep->dw0.dw0; | ||
127 | if (dw0.v && !dw0.bolted) | ||
128 | break; | ||
129 | native_unlock_hpte(hptep); | ||
130 | } | ||
131 | |||
132 | slot_offset++; | ||
133 | slot_offset &= 0x7; | ||
134 | } | ||
135 | |||
136 | if (i == HPTES_PER_GROUP) | ||
137 | return -1; | ||
138 | |||
139 | /* Invalidate the hpte. NOTE: this also unlocks it */ | ||
140 | hptep->dw0.dword0 = 0; | ||
141 | |||
142 | return i; | ||
143 | } | ||
144 | |||
145 | static inline void set_pp_bit(unsigned long pp, HPTE *addr) | ||
146 | { | ||
147 | unsigned long old; | ||
148 | unsigned long *p = &addr->dw1.dword1; | ||
149 | |||
150 | __asm__ __volatile__( | ||
151 | "1: ldarx %0,0,%3\n\ | ||
152 | rldimi %0,%2,0,61\n\ | ||
153 | stdcx. %0,0,%3\n\ | ||
154 | bne 1b" | ||
155 | : "=&r" (old), "=m" (*p) | ||
156 | : "r" (pp), "r" (p), "m" (*p) | ||
157 | : "cc"); | ||
158 | } | ||
159 | |||
160 | /* | ||
161 | * Only works on small pages. Yes its ugly to have to check each slot in | ||
162 | * the group but we only use this during bootup. | ||
163 | */ | ||
164 | static long native_hpte_find(unsigned long vpn) | ||
165 | { | ||
166 | HPTE *hptep; | ||
167 | unsigned long hash; | ||
168 | unsigned long i, j; | ||
169 | long slot; | ||
170 | Hpte_dword0 dw0; | ||
171 | |||
172 | hash = hpt_hash(vpn, 0); | ||
173 | |||
174 | for (j = 0; j < 2; j++) { | ||
175 | slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; | ||
176 | for (i = 0; i < HPTES_PER_GROUP; i++) { | ||
177 | hptep = htab_address + slot; | ||
178 | dw0 = hptep->dw0.dw0; | ||
179 | |||
180 | if ((dw0.avpn == (vpn >> 11)) && dw0.v && | ||
181 | (dw0.h == j)) { | ||
182 | /* HPTE matches */ | ||
183 | if (j) | ||
184 | slot = -slot; | ||
185 | return slot; | ||
186 | } | ||
187 | ++slot; | ||
188 | } | ||
189 | hash = ~hash; | ||
190 | } | ||
191 | |||
192 | return -1; | ||
193 | } | ||
194 | |||
195 | static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, | ||
196 | unsigned long va, int large, int local) | ||
197 | { | ||
198 | HPTE *hptep = htab_address + slot; | ||
199 | Hpte_dword0 dw0; | ||
200 | unsigned long avpn = va >> 23; | ||
201 | int ret = 0; | ||
202 | |||
203 | if (large) | ||
204 | avpn &= ~0x1UL; | ||
205 | |||
206 | native_lock_hpte(hptep); | ||
207 | |||
208 | dw0 = hptep->dw0.dw0; | ||
209 | |||
210 | /* Even if we miss, we need to invalidate the TLB */ | ||
211 | if ((dw0.avpn != avpn) || !dw0.v) { | ||
212 | native_unlock_hpte(hptep); | ||
213 | ret = -1; | ||
214 | } else { | ||
215 | set_pp_bit(newpp, hptep); | ||
216 | native_unlock_hpte(hptep); | ||
217 | } | ||
218 | |||
219 | /* Ensure it is out of the tlb too */ | ||
220 | if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) { | ||
221 | tlbiel(va); | ||
222 | } else { | ||
223 | int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); | ||
224 | |||
225 | if (lock_tlbie) | ||
226 | spin_lock(&native_tlbie_lock); | ||
227 | tlbie(va, large); | ||
228 | if (lock_tlbie) | ||
229 | spin_unlock(&native_tlbie_lock); | ||
230 | } | ||
231 | |||
232 | return ret; | ||
233 | } | ||
234 | |||
235 | /* | ||
236 | * Update the page protection bits. Intended to be used to create | ||
237 | * guard pages for kernel data structures on pages which are bolted | ||
238 | * in the HPT. Assumes pages being operated on will not be stolen. | ||
239 | * Does not work on large pages. | ||
240 | * | ||
241 | * No need to lock here because we should be the only user. | ||
242 | */ | ||
243 | static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea) | ||
244 | { | ||
245 | unsigned long vsid, va, vpn, flags = 0; | ||
246 | long slot; | ||
247 | HPTE *hptep; | ||
248 | int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); | ||
249 | |||
250 | vsid = get_kernel_vsid(ea); | ||
251 | va = (vsid << 28) | (ea & 0x0fffffff); | ||
252 | vpn = va >> PAGE_SHIFT; | ||
253 | |||
254 | slot = native_hpte_find(vpn); | ||
255 | if (slot == -1) | ||
256 | panic("could not find page to bolt\n"); | ||
257 | hptep = htab_address + slot; | ||
258 | |||
259 | set_pp_bit(newpp, hptep); | ||
260 | |||
261 | /* Ensure it is out of the tlb too */ | ||
262 | if (lock_tlbie) | ||
263 | spin_lock_irqsave(&native_tlbie_lock, flags); | ||
264 | tlbie(va, 0); | ||
265 | if (lock_tlbie) | ||
266 | spin_unlock_irqrestore(&native_tlbie_lock, flags); | ||
267 | } | ||
268 | |||
269 | static void native_hpte_invalidate(unsigned long slot, unsigned long va, | ||
270 | int large, int local) | ||
271 | { | ||
272 | HPTE *hptep = htab_address + slot; | ||
273 | Hpte_dword0 dw0; | ||
274 | unsigned long avpn = va >> 23; | ||
275 | unsigned long flags; | ||
276 | int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); | ||
277 | |||
278 | if (large) | ||
279 | avpn &= ~0x1UL; | ||
280 | |||
281 | local_irq_save(flags); | ||
282 | native_lock_hpte(hptep); | ||
283 | |||
284 | dw0 = hptep->dw0.dw0; | ||
285 | |||
286 | /* Even if we miss, we need to invalidate the TLB */ | ||
287 | if ((dw0.avpn != avpn) || !dw0.v) { | ||
288 | native_unlock_hpte(hptep); | ||
289 | } else { | ||
290 | /* Invalidate the hpte. NOTE: this also unlocks it */ | ||
291 | hptep->dw0.dword0 = 0; | ||
292 | } | ||
293 | |||
294 | /* Invalidate the tlb */ | ||
295 | if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) { | ||
296 | tlbiel(va); | ||
297 | } else { | ||
298 | if (lock_tlbie) | ||
299 | spin_lock(&native_tlbie_lock); | ||
300 | tlbie(va, large); | ||
301 | if (lock_tlbie) | ||
302 | spin_unlock(&native_tlbie_lock); | ||
303 | } | ||
304 | local_irq_restore(flags); | ||
305 | } | ||
306 | |||
/*
 * Flush a batch of translations.  For each of the first @number entries
 * of this CPU's ppc64_tlb_batch the matching HPTE is invalidated, then
 * the TLB entries for all of them are flushed in a single pass.
 *
 * @context: context used to derive the VSID for user addresses
 * @number:  number of batch entries to process
 * @local:   non-zero permits the tlbiel path when the CPU has
 *           CPU_FTR_TLBIEL
 *
 * Interrupts are disabled for the whole operation.
 */
static void native_flush_hash_range(unsigned long context,
				    unsigned long number, int local)
{
	unsigned long vsid, vpn, va, hash, secondary, slot, flags, avpn;
	int i, j;
	HPTE *hptep;
	Hpte_dword0 dw0;
	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);

	/* XXX fix for large ptes */
	/* large is fixed at 0, so only the small-page paths below are taken */
	unsigned long large = 0;

	local_irq_save(flags);

	/* Pass 1: invalidate the HPTEs, remembering each VA in batch->vaddr[] */
	j = 0;
	for (i = 0; i < number; i++) {
		if ((batch->addr[i] >= USER_START) &&
		    (batch->addr[i] <= USER_END))
			vsid = get_vsid(context, batch->addr[i]);
		else
			vsid = get_kernel_vsid(batch->addr[i]);

		va = (vsid << 28) | (batch->addr[i] & 0x0fffffff);
		batch->vaddr[j] = va;
		if (large)
			vpn = va >> HPAGE_SHIFT;
		else
			vpn = va >> PAGE_SHIFT;
		hash = hpt_hash(vpn, large);
		/* Pick the hash and group slot from the bits kept in the Linux PTE */
		secondary = (pte_val(batch->pte[i]) & _PAGE_SECONDARY) >> 15;
		if (secondary)
			hash = ~hash;
		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
		slot += (pte_val(batch->pte[i]) & _PAGE_GROUP_IX) >> 12;

		hptep = htab_address + slot;

		avpn = va >> 23;
		if (large)
			avpn &= ~0x1UL;

		native_lock_hpte(hptep);

		dw0 = hptep->dw0.dw0;

		/* Even if we miss, we need to invalidate the TLB */
		if ((dw0.avpn != avpn) || !dw0.v) {
			native_unlock_hpte(hptep);
		} else {
			/* Invalidate the hpte. NOTE: this also unlocks it */
			hptep->dw0.dword0 = 0;
		}

		j++;
	}

	/* Pass 2: flush the TLB for every VA collected above */
	if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
		/* tlbiel path: the flush sequence is bracketed by ptesync */
		asm volatile("ptesync":::"memory");

		for (i = 0; i < j; i++)
			__tlbiel(batch->vaddr[i]);

		asm volatile("ptesync":::"memory");
	} else {
		/* tlbie path: taken under native_tlbie_lock unless the CPU
		 * advertises CPU_FTR_LOCKLESS_TLBIE */
		int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);

		if (lock_tlbie)
			spin_lock(&native_tlbie_lock);

		asm volatile("ptesync":::"memory");

		for (i = 0; i < j; i++)
			__tlbie(batch->vaddr[i], 0);

		asm volatile("eieio; tlbsync; ptesync":::"memory");

		if (lock_tlbie)
			spin_unlock(&native_tlbie_lock);
	}

	local_irq_restore(flags);
}
389 | |||
#ifdef CONFIG_PPC_PSERIES
/*
 * Disable TLB batching on nighthawk.
 *
 * Returns 0 when the device-tree root "model" property is exactly
 * "IBM,9076-N81", and 1 otherwise (including when the root node or the
 * property is missing).
 */
static inline int tlb_batching_enabled(void)
{
	struct device_node *root = of_find_node_by_path("/");
	const char *model;
	int enabled;

	if (!root)
		return 1;

	model = get_property(root, "model", NULL);
	/* Compare before dropping the node reference taken above */
	enabled = !(model && strcmp(model, "IBM,9076-N81") == 0);
	of_node_put(root);

	return enabled;
}
#else
/* Without pSeries support, batching is always enabled. */
static inline int tlb_batching_enabled(void)
{
	return 1;
}
#endif
412 | |||
413 | void hpte_init_native(void) | ||
414 | { | ||
415 | ppc_md.hpte_invalidate = native_hpte_invalidate; | ||
416 | ppc_md.hpte_updatepp = native_hpte_updatepp; | ||
417 | ppc_md.hpte_updateboltedpp = native_hpte_updateboltedpp; | ||
418 | ppc_md.hpte_insert = native_hpte_insert; | ||
419 | ppc_md.hpte_remove = native_hpte_remove; | ||
420 | if (tlb_batching_enabled()) | ||
421 | ppc_md.flush_hash_range = native_flush_hash_range; | ||
422 | htab_finish_init(); | ||
423 | } | ||
diff --git a/arch/ppc64/mm/hash_utils.c b/arch/ppc64/mm/hash_utils.c new file mode 100644 index 00000000000..e48be12f518 --- /dev/null +++ b/arch/ppc64/mm/hash_utils.c | |||
@@ -0,0 +1,439 @@ | |||
1 | /* | ||
2 | * PowerPC64 port by Mike Corrigan and Dave Engebretsen | ||
3 | * {mikejc|engebret}@us.ibm.com | ||
4 | * | ||
5 | * Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com> | ||
6 | * | ||
7 | * SMP scalability work: | ||
8 | * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM | ||
9 | * | ||
10 | * Module name: htab.c | ||
11 | * | ||
12 | * Description: | ||
13 | * PowerPC Hashed Page Table functions | ||
14 | * | ||
15 | * This program is free software; you can redistribute it and/or | ||
16 | * modify it under the terms of the GNU General Public License | ||
17 | * as published by the Free Software Foundation; either version | ||
18 | * 2 of the License, or (at your option) any later version. | ||
19 | */ | ||
20 | |||
21 | #undef DEBUG | ||
22 | |||
23 | #include <linux/config.h> | ||
24 | #include <linux/spinlock.h> | ||
25 | #include <linux/errno.h> | ||
26 | #include <linux/sched.h> | ||
27 | #include <linux/proc_fs.h> | ||
28 | #include <linux/stat.h> | ||
29 | #include <linux/sysctl.h> | ||
30 | #include <linux/ctype.h> | ||
31 | #include <linux/cache.h> | ||
32 | #include <linux/init.h> | ||
33 | #include <linux/signal.h> | ||
34 | |||
35 | #include <asm/ppcdebug.h> | ||
36 | #include <asm/processor.h> | ||
37 | #include <asm/pgtable.h> | ||
38 | #include <asm/mmu.h> | ||
39 | #include <asm/mmu_context.h> | ||
40 | #include <asm/page.h> | ||
41 | #include <asm/types.h> | ||
42 | #include <asm/system.h> | ||
43 | #include <asm/uaccess.h> | ||
44 | #include <asm/machdep.h> | ||
45 | #include <asm/lmb.h> | ||
46 | #include <asm/abs_addr.h> | ||
47 | #include <asm/tlbflush.h> | ||
48 | #include <asm/io.h> | ||
49 | #include <asm/eeh.h> | ||
50 | #include <asm/tlb.h> | ||
51 | #include <asm/cacheflush.h> | ||
52 | #include <asm/cputable.h> | ||
53 | #include <asm/abs_addr.h> | ||
54 | #include <asm/sections.h> | ||
55 | |||
56 | #ifdef DEBUG | ||
57 | #define DBG(fmt...) udbg_printf(fmt) | ||
58 | #else | ||
59 | #define DBG(fmt...) | ||
60 | #endif | ||
61 | |||
62 | /* | ||
63 | * Note: pte --> Linux PTE | ||
64 | * HPTE --> PowerPC Hashed Page Table Entry | ||
65 | * | ||
66 | * Execution context: | ||
67 | * htab_initialize is called with the MMU off (of course), but | ||
68 | * the kernel has been copied down to zero so it can directly | ||
69 | * reference global data. At this point it is very difficult | ||
70 | * to print debug info. | ||
71 | * | ||
72 | */ | ||
73 | |||
74 | #ifdef CONFIG_U3_DART | ||
75 | extern unsigned long dart_tablebase; | ||
76 | #endif /* CONFIG_U3_DART */ | ||
77 | |||
78 | HPTE *htab_address; | ||
79 | unsigned long htab_hash_mask; | ||
80 | |||
81 | extern unsigned long _SDR1; | ||
82 | |||
83 | #define KB (1024) | ||
84 | #define MB (1024*KB) | ||
85 | |||
/*
 * Spin forever.  The counter is volatile so the loop cannot be
 * optimised away; it starts non-zero and only ever has bit 0 OR-ed in,
 * so the condition never becomes false.
 */
static inline void loop_forever(void)
{
	volatile unsigned long x = 1;

	while (x)
		x |= 1;
}
92 | |||
93 | #ifdef CONFIG_PPC_MULTIPLATFORM | ||
94 | static inline void create_pte_mapping(unsigned long start, unsigned long end, | ||
95 | unsigned long mode, int large) | ||
96 | { | ||
97 | unsigned long addr; | ||
98 | unsigned int step; | ||
99 | unsigned long tmp_mode; | ||
100 | |||
101 | if (large) | ||
102 | step = 16*MB; | ||
103 | else | ||
104 | step = 4*KB; | ||
105 | |||
106 | for (addr = start; addr < end; addr += step) { | ||
107 | unsigned long vpn, hash, hpteg; | ||
108 | unsigned long vsid = get_kernel_vsid(addr); | ||
109 | unsigned long va = (vsid << 28) | (addr & 0xfffffff); | ||
110 | int ret; | ||
111 | |||
112 | if (large) | ||
113 | vpn = va >> HPAGE_SHIFT; | ||
114 | else | ||
115 | vpn = va >> PAGE_SHIFT; | ||
116 | |||
117 | |||
118 | tmp_mode = mode; | ||
119 | |||
120 | /* Make non-kernel text non-executable */ | ||
121 | if (!in_kernel_text(addr)) | ||
122 | tmp_mode = mode | HW_NO_EXEC; | ||
123 | |||
124 | hash = hpt_hash(vpn, large); | ||
125 | |||
126 | hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); | ||
127 | |||
128 | #ifdef CONFIG_PPC_PSERIES | ||
129 | if (systemcfg->platform & PLATFORM_LPAR) | ||
130 | ret = pSeries_lpar_hpte_insert(hpteg, va, | ||
131 | virt_to_abs(addr) >> PAGE_SHIFT, | ||
132 | 0, tmp_mode, 1, large); | ||
133 | else | ||
134 | #endif /* CONFIG_PPC_PSERIES */ | ||
135 | ret = native_hpte_insert(hpteg, va, | ||
136 | virt_to_abs(addr) >> PAGE_SHIFT, | ||
137 | 0, tmp_mode, 1, large); | ||
138 | |||
139 | if (ret == -1) { | ||
140 | ppc64_terminate_msg(0x20, "create_pte_mapping"); | ||
141 | loop_forever(); | ||
142 | } | ||
143 | } | ||
144 | } | ||
145 | |||
/*
 * Size and (when not running under a hypervisor) allocate the hashed
 * page table, compute _SDR1 and htab_hash_mask, then create the bolted
 * linear mapping for every lmb memory region and, if present, for the
 * TCE allocation area.
 */
void __init htab_initialize(void)
{
	unsigned long table, htab_size_bytes;
	unsigned long pteg_count;
	unsigned long mode_rw;
	int i, use_largepages = 0;
	unsigned long base = 0, size = 0;
	extern unsigned long tce_alloc_start, tce_alloc_end;

	DBG(" -> htab_initialize()\n");

	/*
	 * Calculate the required size of the htab.  We want the number of
	 * PTEGs to equal one half the number of real pages.
	 * The size itself comes from ppc64_pft_size (log2 of the byte size);
	 * the shift by 7 presumably reflects a 128-byte PTEG -- confirm.
	 */
	htab_size_bytes = 1UL << ppc64_pft_size;
	pteg_count = htab_size_bytes >> 7;

	/* For debug, make the HTAB 1/8 as big as it normally would be. */
	ifppcdebug(PPCDBG_HTABSIZE) {
		pteg_count >>= 3;
		htab_size_bytes = pteg_count << 7;
	}

	htab_hash_mask = pteg_count - 1;

	if (systemcfg->platform & PLATFORM_LPAR) {
		/* Using a hypervisor which owns the htab */
		htab_address = NULL;
		_SDR1 = 0;
	} else {
		/* Find storage for the HPT.  Must be contiguous in
		 * the absolute address space.  The table is aligned to
		 * its own size.
		 */
		table = lmb_alloc(htab_size_bytes, htab_size_bytes);

		DBG("Hash table allocated at %lx, size: %lx\n", table,
		    htab_size_bytes);

		if ( !table ) {
			ppc64_terminate_msg(0x20, "hpt space");
			loop_forever();
		}
		htab_address = abs_to_virt(table);

		/* htab absolute addr + encoded htabsize */
		_SDR1 = table + __ilog2(pteg_count) - 11;

		/* Initialize the HPT with no entries */
		memset((void *)table, 0, htab_size_bytes);
	}

	mode_rw = _PAGE_ACCESSED | _PAGE_COHERENT | PP_RWXX;

	/* Use 16MB pages for the linear mapping when the CPU supports them.
	 * (On U3 based machines the DART area is additionally left unmapped
	 * in the loop below, to avoid cache paradoxes as it's remapped non
	 * cacheable later on.)
	 */
	if (cpu_has_feature(CPU_FTR_16M_PAGE))
		use_largepages = 1;

	/* create the bolted linear mapping in the hash table */
	for (i=0; i < lmb.memory.cnt; i++) {
		base = lmb.memory.region[i].physbase + KERNELBASE;
		size = lmb.memory.region[i].size;

		DBG("creating mapping for region: %lx : %lx\n", base, size);

#ifdef CONFIG_U3_DART
		/* Do not map the DART space. Fortunately, it will be aligned
		 * in such a way that it will not cross two lmb regions and will
		 * fit within a single 16Mb page.
		 * The DART space is assumed to be a full 16Mb region even if we
		 * only use 2Mb of that space. We will use more of it later for
		 * AGP GART. We have to use a full 16Mb large page.
		 */
		DBG("DART base: %lx\n", dart_tablebase);

		if (dart_tablebase != 0 && dart_tablebase >= base
		    && dart_tablebase < (base + size)) {
			/* Map the part of the region below the DART ... */
			if (base != dart_tablebase)
				create_pte_mapping(base, dart_tablebase, mode_rw,
						   use_largepages);
			/* ... and the part above its 16MB window */
			if ((base + size) > (dart_tablebase + 16*MB))
				create_pte_mapping(dart_tablebase + 16*MB, base + size,
						   mode_rw, use_largepages);
			continue;
		}
#endif /* CONFIG_U3_DART */
		create_pte_mapping(base, base + size, mode_rw, use_largepages);
	}

	/*
	 * If we have a memory_limit and we've allocated TCEs then we need to
	 * explicitly map the TCE area at the top of RAM. We also cope with the
	 * case that the TCEs start below memory_limit.
	 * tce_alloc_start/end are 16MB aligned so the mapping should work
	 * for either 4K or 16MB pages.
	 *
	 * base and size still hold the values of the last lmb region
	 * processed by the loop above.
	 */
	if (tce_alloc_start) {
		tce_alloc_start += KERNELBASE;
		tce_alloc_end += KERNELBASE;

		if (base + size >= tce_alloc_start)
			tce_alloc_start = base + size + 1;

		create_pte_mapping(tce_alloc_start, tce_alloc_end,
				   mode_rw, use_largepages);
	}

	DBG(" <- htab_initialize()\n");
}
258 | #undef KB | ||
259 | #undef MB | ||
260 | #endif /* CONFIG_PPC_MULTIPLATFORM */ | ||
261 | |||
262 | /* | ||
263 | * Called by asm hashtable.S for doing lazy icache flush | ||
264 | */ | ||
265 | unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) | ||
266 | { | ||
267 | struct page *page; | ||
268 | |||
269 | if (!pfn_valid(pte_pfn(pte))) | ||
270 | return pp; | ||
271 | |||
272 | page = pte_page(pte); | ||
273 | |||
274 | /* page is dirty */ | ||
275 | if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { | ||
276 | if (trap == 0x400) { | ||
277 | __flush_dcache_icache(page_address(page)); | ||
278 | set_bit(PG_arch_1, &page->flags); | ||
279 | } else | ||
280 | pp |= HW_NO_EXEC; | ||
281 | } | ||
282 | return pp; | ||
283 | } | ||
284 | |||
285 | /* Result code is: | ||
286 | * 0 - handled | ||
287 | * 1 - normal page fault | ||
288 | * -1 - critical hash insertion error | ||
289 | */ | ||
290 | int hash_page(unsigned long ea, unsigned long access, unsigned long trap) | ||
291 | { | ||
292 | void *pgdir; | ||
293 | unsigned long vsid; | ||
294 | struct mm_struct *mm; | ||
295 | pte_t *ptep; | ||
296 | int ret; | ||
297 | int user_region = 0; | ||
298 | int local = 0; | ||
299 | cpumask_t tmp; | ||
300 | |||
301 | switch (REGION_ID(ea)) { | ||
302 | case USER_REGION_ID: | ||
303 | user_region = 1; | ||
304 | mm = current->mm; | ||
305 | if ((ea > USER_END) || (! mm)) | ||
306 | return 1; | ||
307 | |||
308 | vsid = get_vsid(mm->context.id, ea); | ||
309 | break; | ||
310 | case IO_REGION_ID: | ||
311 | if (ea > IMALLOC_END) | ||
312 | return 1; | ||
313 | mm = &ioremap_mm; | ||
314 | vsid = get_kernel_vsid(ea); | ||
315 | break; | ||
316 | case VMALLOC_REGION_ID: | ||
317 | if (ea > VMALLOC_END) | ||
318 | return 1; | ||
319 | mm = &init_mm; | ||
320 | vsid = get_kernel_vsid(ea); | ||
321 | break; | ||
322 | #if 0 | ||
323 | case KERNEL_REGION_ID: | ||
324 | /* | ||
325 | * Should never get here - entire 0xC0... region is bolted. | ||
326 | * Send the problem up to do_page_fault | ||
327 | */ | ||
328 | #endif | ||
329 | default: | ||
330 | /* Not a valid range | ||
331 | * Send the problem up to do_page_fault | ||
332 | */ | ||
333 | return 1; | ||
334 | break; | ||
335 | } | ||
336 | |||
337 | pgdir = mm->pgd; | ||
338 | |||
339 | if (pgdir == NULL) | ||
340 | return 1; | ||
341 | |||
342 | tmp = cpumask_of_cpu(smp_processor_id()); | ||
343 | if (user_region && cpus_equal(mm->cpu_vm_mask, tmp)) | ||
344 | local = 1; | ||
345 | |||
346 | /* Is this a huge page ? */ | ||
347 | if (unlikely(in_hugepage_area(mm->context, ea))) | ||
348 | ret = hash_huge_page(mm, access, ea, vsid, local); | ||
349 | else { | ||
350 | ptep = find_linux_pte(pgdir, ea); | ||
351 | if (ptep == NULL) | ||
352 | return 1; | ||
353 | ret = __hash_page(ea, access, vsid, ptep, trap, local); | ||
354 | } | ||
355 | |||
356 | return ret; | ||
357 | } | ||
358 | |||
359 | void flush_hash_page(unsigned long context, unsigned long ea, pte_t pte, | ||
360 | int local) | ||
361 | { | ||
362 | unsigned long vsid, vpn, va, hash, secondary, slot; | ||
363 | unsigned long huge = pte_huge(pte); | ||
364 | |||
365 | if ((ea >= USER_START) && (ea <= USER_END)) | ||
366 | vsid = get_vsid(context, ea); | ||
367 | else | ||
368 | vsid = get_kernel_vsid(ea); | ||
369 | |||
370 | va = (vsid << 28) | (ea & 0x0fffffff); | ||
371 | if (huge) | ||
372 | vpn = va >> HPAGE_SHIFT; | ||
373 | else | ||
374 | vpn = va >> PAGE_SHIFT; | ||
375 | hash = hpt_hash(vpn, huge); | ||
376 | secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15; | ||
377 | if (secondary) | ||
378 | hash = ~hash; | ||
379 | slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; | ||
380 | slot += (pte_val(pte) & _PAGE_GROUP_IX) >> 12; | ||
381 | |||
382 | ppc_md.hpte_invalidate(slot, va, huge, local); | ||
383 | } | ||
384 | |||
385 | void flush_hash_range(unsigned long context, unsigned long number, int local) | ||
386 | { | ||
387 | if (ppc_md.flush_hash_range) { | ||
388 | ppc_md.flush_hash_range(context, number, local); | ||
389 | } else { | ||
390 | int i; | ||
391 | struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); | ||
392 | |||
393 | for (i = 0; i < number; i++) | ||
394 | flush_hash_page(context, batch->addr[i], batch->pte[i], | ||
395 | local); | ||
396 | } | ||
397 | } | ||
398 | |||
/*
 * Overwrite the instruction at @insn_addr with 0x48000001 OR-ed with
 * the masked displacement to the target (a relative branch-and-link,
 * going by the function name), then flush that word from the icache.
 * The target address is the first unsigned long stored at @func.
 */
static inline void make_bl(unsigned int *insn_addr, void *func)
{
	unsigned long site = (unsigned long)insn_addr;
	unsigned long target = *((unsigned long *)func);
	int disp = target - site;

	*insn_addr = (unsigned int)(0x48000001 | (disp & 0x03fffffc));
	flush_icache_range(site, 4 + site);
}
408 | |||
409 | /* | ||
410 | * low_hash_fault is called when we the low level hash code failed | ||
411 | * to instert a PTE due to an hypervisor error | ||
412 | */ | ||
413 | void low_hash_fault(struct pt_regs *regs, unsigned long address) | ||
414 | { | ||
415 | if (user_mode(regs)) { | ||
416 | siginfo_t info; | ||
417 | |||
418 | info.si_signo = SIGBUS; | ||
419 | info.si_errno = 0; | ||
420 | info.si_code = BUS_ADRERR; | ||
421 | info.si_addr = (void __user *)address; | ||
422 | force_sig_info(SIGBUS, &info, current); | ||
423 | return; | ||
424 | } | ||
425 | bad_page_fault(regs, address, SIGBUS); | ||
426 | } | ||
427 | |||
428 | void __init htab_finish_init(void) | ||
429 | { | ||
430 | extern unsigned int *htab_call_hpte_insert1; | ||
431 | extern unsigned int *htab_call_hpte_insert2; | ||
432 | extern unsigned int *htab_call_hpte_remove; | ||
433 | extern unsigned int *htab_call_hpte_updatepp; | ||
434 | |||
435 | make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert); | ||
436 | make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert); | ||
437 | make_bl(htab_call_hpte_remove, ppc_md.hpte_remove); | ||
438 | make_bl(htab_call_hpte_updatepp, ppc_md.hpte_updatepp); | ||
439 | } | ||
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c new file mode 100644 index 00000000000..c62ddaff072 --- /dev/null +++ b/arch/ppc64/mm/hugetlbpage.c | |||
@@ -0,0 +1,904 @@ | |||
1 | /* | ||
2 | * PPC64 (POWER4) Huge TLB Page Support for Kernel. | ||
3 | * | ||
4 | * Copyright (C) 2003 David Gibson, IBM Corporation. | ||
5 | * | ||
6 | * Based on the IA-32 version: | ||
7 | * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> | ||
8 | */ | ||
9 | |||
10 | #include <linux/init.h> | ||
11 | #include <linux/fs.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/hugetlb.h> | ||
14 | #include <linux/pagemap.h> | ||
15 | #include <linux/smp_lock.h> | ||
16 | #include <linux/slab.h> | ||
17 | #include <linux/err.h> | ||
18 | #include <linux/sysctl.h> | ||
19 | #include <asm/mman.h> | ||
20 | #include <asm/pgalloc.h> | ||
21 | #include <asm/tlb.h> | ||
22 | #include <asm/tlbflush.h> | ||
23 | #include <asm/mmu_context.h> | ||
24 | #include <asm/machdep.h> | ||
25 | #include <asm/cputable.h> | ||
26 | #include <asm/tlb.h> | ||
27 | |||
28 | #include <linux/sysctl.h> | ||
29 | |||
30 | #define HUGEPGDIR_SHIFT (HPAGE_SHIFT + PAGE_SHIFT - 3) | ||
31 | #define HUGEPGDIR_SIZE (1UL << HUGEPGDIR_SHIFT) | ||
32 | #define HUGEPGDIR_MASK (~(HUGEPGDIR_SIZE-1)) | ||
33 | |||
34 | #define HUGEPTE_INDEX_SIZE 9 | ||
35 | #define HUGEPGD_INDEX_SIZE 10 | ||
36 | |||
37 | #define PTRS_PER_HUGEPTE (1 << HUGEPTE_INDEX_SIZE) | ||
38 | #define PTRS_PER_HUGEPGD (1 << HUGEPGD_INDEX_SIZE) | ||
39 | |||
40 | static inline int hugepgd_index(unsigned long addr) | ||
41 | { | ||
42 | return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT; | ||
43 | } | ||
44 | |||
45 | static pgd_t *hugepgd_offset(struct mm_struct *mm, unsigned long addr) | ||
46 | { | ||
47 | int index; | ||
48 | |||
49 | if (! mm->context.huge_pgdir) | ||
50 | return NULL; | ||
51 | |||
52 | |||
53 | index = hugepgd_index(addr); | ||
54 | BUG_ON(index >= PTRS_PER_HUGEPGD); | ||
55 | return mm->context.huge_pgdir + index; | ||
56 | } | ||
57 | |||
58 | static inline pte_t *hugepte_offset(pgd_t *dir, unsigned long addr) | ||
59 | { | ||
60 | int index; | ||
61 | |||
62 | if (pgd_none(*dir)) | ||
63 | return NULL; | ||
64 | |||
65 | index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE; | ||
66 | return (pte_t *)pgd_page(*dir) + index; | ||
67 | } | ||
68 | |||
69 | static pgd_t *hugepgd_alloc(struct mm_struct *mm, unsigned long addr) | ||
70 | { | ||
71 | BUG_ON(! in_hugepage_area(mm->context, addr)); | ||
72 | |||
73 | if (! mm->context.huge_pgdir) { | ||
74 | pgd_t *new; | ||
75 | spin_unlock(&mm->page_table_lock); | ||
76 | /* Don't use pgd_alloc(), because we want __GFP_REPEAT */ | ||
77 | new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT); | ||
78 | BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE)); | ||
79 | spin_lock(&mm->page_table_lock); | ||
80 | |||
81 | /* | ||
82 | * Because we dropped the lock, we should re-check the | ||
83 | * entry, as somebody else could have populated it.. | ||
84 | */ | ||
85 | if (mm->context.huge_pgdir) | ||
86 | pgd_free(new); | ||
87 | else | ||
88 | mm->context.huge_pgdir = new; | ||
89 | } | ||
90 | return hugepgd_offset(mm, addr); | ||
91 | } | ||
92 | |||
93 | static pte_t *hugepte_alloc(struct mm_struct *mm, pgd_t *dir, | ||
94 | unsigned long addr) | ||
95 | { | ||
96 | if (! pgd_present(*dir)) { | ||
97 | pte_t *new; | ||
98 | |||
99 | spin_unlock(&mm->page_table_lock); | ||
100 | new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT); | ||
101 | BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE)); | ||
102 | spin_lock(&mm->page_table_lock); | ||
103 | /* | ||
104 | * Because we dropped the lock, we should re-check the | ||
105 | * entry, as somebody else could have populated it.. | ||
106 | */ | ||
107 | if (pgd_present(*dir)) { | ||
108 | if (new) | ||
109 | kmem_cache_free(zero_cache, new); | ||
110 | } else { | ||
111 | struct page *ptepage; | ||
112 | |||
113 | if (! new) | ||
114 | return NULL; | ||
115 | ptepage = virt_to_page(new); | ||
116 | ptepage->mapping = (void *) mm; | ||
117 | ptepage->index = addr & HUGEPGDIR_MASK; | ||
118 | pgd_populate(mm, dir, new); | ||
119 | } | ||
120 | } | ||
121 | |||
122 | return hugepte_offset(dir, addr); | ||
123 | } | ||
124 | |||
125 | static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | ||
126 | { | ||
127 | pgd_t *pgd; | ||
128 | |||
129 | BUG_ON(! in_hugepage_area(mm->context, addr)); | ||
130 | |||
131 | pgd = hugepgd_offset(mm, addr); | ||
132 | if (! pgd) | ||
133 | return NULL; | ||
134 | |||
135 | return hugepte_offset(pgd, addr); | ||
136 | } | ||
137 | |||
138 | static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) | ||
139 | { | ||
140 | pgd_t *pgd; | ||
141 | |||
142 | BUG_ON(! in_hugepage_area(mm->context, addr)); | ||
143 | |||
144 | pgd = hugepgd_alloc(mm, addr); | ||
145 | if (! pgd) | ||
146 | return NULL; | ||
147 | |||
148 | return hugepte_alloc(mm, pgd, addr); | ||
149 | } | ||
150 | |||
151 | static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, | ||
152 | unsigned long addr, struct page *page, | ||
153 | pte_t *ptep, int write_access) | ||
154 | { | ||
155 | pte_t entry; | ||
156 | |||
157 | add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); | ||
158 | if (write_access) { | ||
159 | entry = | ||
160 | pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); | ||
161 | } else { | ||
162 | entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); | ||
163 | } | ||
164 | entry = pte_mkyoung(entry); | ||
165 | entry = pte_mkhuge(entry); | ||
166 | |||
167 | set_pte_at(mm, addr, ptep, entry); | ||
168 | } | ||
169 | |||
170 | /* | ||
171 | * This function checks for proper alignment of input addr and len parameters. | ||
172 | */ | ||
173 | int is_aligned_hugepage_range(unsigned long addr, unsigned long len) | ||
174 | { | ||
175 | if (len & ~HPAGE_MASK) | ||
176 | return -EINVAL; | ||
177 | if (addr & ~HPAGE_MASK) | ||
178 | return -EINVAL; | ||
179 | if (! (within_hugepage_low_range(addr, len) | ||
180 | || within_hugepage_high_range(addr, len)) ) | ||
181 | return -EINVAL; | ||
182 | return 0; | ||
183 | } | ||
184 | |||
185 | static void flush_segments(void *parm) | ||
186 | { | ||
187 | u16 segs = (unsigned long) parm; | ||
188 | unsigned long i; | ||
189 | |||
190 | asm volatile("isync" : : : "memory"); | ||
191 | |||
192 | for (i = 0; i < 16; i++) { | ||
193 | if (! (segs & (1U << i))) | ||
194 | continue; | ||
195 | asm volatile("slbie %0" : : "r" (i << SID_SHIFT)); | ||
196 | } | ||
197 | |||
198 | asm volatile("isync" : : : "memory"); | ||
199 | } | ||
200 | |||
/*
 * Get low segment @seg (0..15) of @mm ready for hugepage use.
 *
 * Returns -EBUSY if any VMA overlaps the segment.  Otherwise frees any
 * normal PTE pages still attached to the segment's address range and
 * returns 0.  Takes and releases mm->page_table_lock.
 */
static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
{
	unsigned long start = seg << SID_SHIFT;
	unsigned long end = (seg+1) << SID_SHIFT;
	struct vm_area_struct *vma;
	unsigned long addr;
	struct mmu_gather *tlb;

	BUG_ON(seg >= 16);

	/* Check no VMAs are in the region */
	vma = find_vma(mm, start);
	if (vma && (vma->vm_start < end))
		return -EBUSY;

	/* Clean up any leftover PTE pages in the region */
	spin_lock(&mm->page_table_lock);
	tlb = tlb_gather_mmu(mm, 0);
	/* Walk the segment one PMD entry (one PTE page) at a time */
	for (addr = start; addr < end; addr += PMD_SIZE) {
		pgd_t *pgd = pgd_offset(mm, addr);
		pmd_t *pmd;
		struct page *page;
		pte_t *pte;
		int i;

		if (pgd_none(*pgd))
			continue;
		pmd = pmd_offset(pgd, addr);
		if (!pmd || pmd_none(*pmd))
			continue;
		/* A corrupt entry is reported and cleared, not freed */
		if (pmd_bad(*pmd)) {
			pmd_ERROR(*pmd);
			pmd_clear(pmd);
			continue;
		}
		pte = (pte_t *)pmd_page_kernel(*pmd);
		/* No VMAs, so there should be no PTEs, check just in case. */
		for (i = 0; i < PTRS_PER_PTE; i++) {
			BUG_ON(!pte_none(*pte));
			pte++;
		}
		/* Detach the PTE page, fix the accounting, and hand the
		 * page to the mmu_gather for freeing */
		page = pmd_page(*pmd);
		pmd_clear(pmd);
		mm->nr_ptes--;
		dec_page_state(nr_page_table_pages);
		pte_free_tlb(tlb, page);
	}
	tlb_finish_mmu(tlb, start, end);
	spin_unlock(&mm->page_table_lock);

	return 0;
}
253 | |||
254 | static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs) | ||
255 | { | ||
256 | unsigned long i; | ||
257 | |||
258 | newsegs &= ~(mm->context.htlb_segs); | ||
259 | if (! newsegs) | ||
260 | return 0; /* The segments we want are already open */ | ||
261 | |||
262 | for (i = 0; i < 16; i++) | ||
263 | if ((1 << i) & newsegs) | ||
264 | if (prepare_low_seg_for_htlb(mm, i) != 0) | ||
265 | return -EBUSY; | ||
266 | |||
267 | mm->context.htlb_segs |= newsegs; | ||
268 | |||
269 | /* update the paca copy of the context struct */ | ||
270 | get_paca()->context = mm->context; | ||
271 | |||
272 | /* the context change must make it to memory before the flush, | ||
273 | * so that further SLB misses do the right thing. */ | ||
274 | mb(); | ||
275 | on_each_cpu(flush_segments, (void *)(unsigned long)newsegs, 0, 1); | ||
276 | |||
277 | return 0; | ||
278 | } | ||
279 | |||
280 | int prepare_hugepage_range(unsigned long addr, unsigned long len) | ||
281 | { | ||
282 | if (within_hugepage_high_range(addr, len)) | ||
283 | return 0; | ||
284 | else if ((addr < 0x100000000UL) && ((addr+len) < 0x100000000UL)) { | ||
285 | int err; | ||
286 | /* Yes, we need both tests, in case addr+len overflows | ||
287 | * 64-bit arithmetic */ | ||
288 | err = open_low_hpage_segs(current->mm, | ||
289 | LOW_ESID_MASK(addr, len)); | ||
290 | if (err) | ||
291 | printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)" | ||
292 | " failed (segs: 0x%04hx)\n", addr, len, | ||
293 | LOW_ESID_MASK(addr, len)); | ||
294 | return err; | ||
295 | } | ||
296 | |||
297 | return -EINVAL; | ||
298 | } | ||
299 | |||
300 | int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | ||
301 | struct vm_area_struct *vma) | ||
302 | { | ||
303 | pte_t *src_pte, *dst_pte, entry; | ||
304 | struct page *ptepage; | ||
305 | unsigned long addr = vma->vm_start; | ||
306 | unsigned long end = vma->vm_end; | ||
307 | int err = -ENOMEM; | ||
308 | |||
309 | while (addr < end) { | ||
310 | dst_pte = huge_pte_alloc(dst, addr); | ||
311 | if (!dst_pte) | ||
312 | goto out; | ||
313 | |||
314 | src_pte = huge_pte_offset(src, addr); | ||
315 | entry = *src_pte; | ||
316 | |||
317 | ptepage = pte_page(entry); | ||
318 | get_page(ptepage); | ||
319 | add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); | ||
320 | set_pte_at(dst, addr, dst_pte, entry); | ||
321 | |||
322 | addr += HPAGE_SIZE; | ||
323 | } | ||
324 | |||
325 | err = 0; | ||
326 | out: | ||
327 | return err; | ||
328 | } | ||
329 | |||
330 | int | ||
331 | follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
332 | struct page **pages, struct vm_area_struct **vmas, | ||
333 | unsigned long *position, int *length, int i) | ||
334 | { | ||
335 | unsigned long vpfn, vaddr = *position; | ||
336 | int remainder = *length; | ||
337 | |||
338 | WARN_ON(!is_vm_hugetlb_page(vma)); | ||
339 | |||
340 | vpfn = vaddr/PAGE_SIZE; | ||
341 | while (vaddr < vma->vm_end && remainder) { | ||
342 | if (pages) { | ||
343 | pte_t *pte; | ||
344 | struct page *page; | ||
345 | |||
346 | pte = huge_pte_offset(mm, vaddr); | ||
347 | |||
348 | /* hugetlb should be locked, and hence, prefaulted */ | ||
349 | WARN_ON(!pte || pte_none(*pte)); | ||
350 | |||
351 | page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; | ||
352 | |||
353 | WARN_ON(!PageCompound(page)); | ||
354 | |||
355 | get_page(page); | ||
356 | pages[i] = page; | ||
357 | } | ||
358 | |||
359 | if (vmas) | ||
360 | vmas[i] = vma; | ||
361 | |||
362 | vaddr += PAGE_SIZE; | ||
363 | ++vpfn; | ||
364 | --remainder; | ||
365 | ++i; | ||
366 | } | ||
367 | |||
368 | *length = remainder; | ||
369 | *position = vaddr; | ||
370 | |||
371 | return i; | ||
372 | } | ||
373 | |||
374 | struct page * | ||
375 | follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) | ||
376 | { | ||
377 | pte_t *ptep; | ||
378 | struct page *page; | ||
379 | |||
380 | if (! in_hugepage_area(mm->context, address)) | ||
381 | return ERR_PTR(-EINVAL); | ||
382 | |||
383 | ptep = huge_pte_offset(mm, address); | ||
384 | page = pte_page(*ptep); | ||
385 | if (page) | ||
386 | page += (address % HPAGE_SIZE) / PAGE_SIZE; | ||
387 | |||
388 | return page; | ||
389 | } | ||
390 | |||
/* Always reports that @pmd does not map a huge page: returns 0. */
int pmd_huge(pmd_t pmd)
{
	return 0;
}
395 | |||
/*
 * Not expected to be called: pmd_huge() in this file always returns 0.
 * Hits BUG() unconditionally; the return only satisfies the prototype.
 */
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}
403 | |||
404 | void unmap_hugepage_range(struct vm_area_struct *vma, | ||
405 | unsigned long start, unsigned long end) | ||
406 | { | ||
407 | struct mm_struct *mm = vma->vm_mm; | ||
408 | unsigned long addr; | ||
409 | pte_t *ptep; | ||
410 | struct page *page; | ||
411 | |||
412 | WARN_ON(!is_vm_hugetlb_page(vma)); | ||
413 | BUG_ON((start % HPAGE_SIZE) != 0); | ||
414 | BUG_ON((end % HPAGE_SIZE) != 0); | ||
415 | |||
416 | for (addr = start; addr < end; addr += HPAGE_SIZE) { | ||
417 | pte_t pte; | ||
418 | |||
419 | ptep = huge_pte_offset(mm, addr); | ||
420 | if (!ptep || pte_none(*ptep)) | ||
421 | continue; | ||
422 | |||
423 | pte = *ptep; | ||
424 | page = pte_page(pte); | ||
425 | pte_clear(mm, addr, ptep); | ||
426 | |||
427 | put_page(page); | ||
428 | } | ||
429 | add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT)); | ||
430 | flush_tlb_pending(); | ||
431 | } | ||
432 | |||
/*
 * Intentionally a no-op.
 *
 * Because the huge pgtables are only 2 level, they can take at most around
 * 4M, much less than one hugepage which the process is presumably entitled
 * to use.  So we don't bother freeing up the pagetables on unmap, and wait
 * until destroy_context() to clean up the lot.
 */
void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
			   unsigned long start, unsigned long end)
{
}
442 | |||
443 | int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) | ||
444 | { | ||
445 | struct mm_struct *mm = current->mm; | ||
446 | unsigned long addr; | ||
447 | int ret = 0; | ||
448 | |||
449 | WARN_ON(!is_vm_hugetlb_page(vma)); | ||
450 | BUG_ON((vma->vm_start % HPAGE_SIZE) != 0); | ||
451 | BUG_ON((vma->vm_end % HPAGE_SIZE) != 0); | ||
452 | |||
453 | spin_lock(&mm->page_table_lock); | ||
454 | for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { | ||
455 | unsigned long idx; | ||
456 | pte_t *pte = huge_pte_alloc(mm, addr); | ||
457 | struct page *page; | ||
458 | |||
459 | if (!pte) { | ||
460 | ret = -ENOMEM; | ||
461 | goto out; | ||
462 | } | ||
463 | if (! pte_none(*pte)) | ||
464 | continue; | ||
465 | |||
466 | idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) | ||
467 | + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | ||
468 | page = find_get_page(mapping, idx); | ||
469 | if (!page) { | ||
470 | /* charge the fs quota first */ | ||
471 | if (hugetlb_get_quota(mapping)) { | ||
472 | ret = -ENOMEM; | ||
473 | goto out; | ||
474 | } | ||
475 | page = alloc_huge_page(); | ||
476 | if (!page) { | ||
477 | hugetlb_put_quota(mapping); | ||
478 | ret = -ENOMEM; | ||
479 | goto out; | ||
480 | } | ||
481 | ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); | ||
482 | if (! ret) { | ||
483 | unlock_page(page); | ||
484 | } else { | ||
485 | hugetlb_put_quota(mapping); | ||
486 | free_huge_page(page); | ||
487 | goto out; | ||
488 | } | ||
489 | } | ||
490 | set_huge_pte(mm, vma, addr, page, pte, vma->vm_flags & VM_WRITE); | ||
491 | } | ||
492 | out: | ||
493 | spin_unlock(&mm->page_table_lock); | ||
494 | return ret; | ||
495 | } | ||
496 | |||
497 | /* Because we have an exclusive hugepage region which lies within the | ||
498 | * normal user address space, we have to take special measures to make | ||
499 | * non-huge mmap()s evade the hugepage reserved regions. */ | ||
500 | unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, | ||
501 | unsigned long len, unsigned long pgoff, | ||
502 | unsigned long flags) | ||
503 | { | ||
504 | struct mm_struct *mm = current->mm; | ||
505 | struct vm_area_struct *vma; | ||
506 | unsigned long start_addr; | ||
507 | |||
508 | if (len > TASK_SIZE) | ||
509 | return -ENOMEM; | ||
510 | |||
511 | if (addr) { | ||
512 | addr = PAGE_ALIGN(addr); | ||
513 | vma = find_vma(mm, addr); | ||
514 | if (((TASK_SIZE - len) >= addr) | ||
515 | && (!vma || (addr+len) <= vma->vm_start) | ||
516 | && !is_hugepage_only_range(mm, addr,len)) | ||
517 | return addr; | ||
518 | } | ||
519 | start_addr = addr = mm->free_area_cache; | ||
520 | |||
521 | full_search: | ||
522 | vma = find_vma(mm, addr); | ||
523 | while (TASK_SIZE - len >= addr) { | ||
524 | BUG_ON(vma && (addr >= vma->vm_end)); | ||
525 | |||
526 | if (touches_hugepage_low_range(mm, addr, len)) { | ||
527 | addr = ALIGN(addr+1, 1<<SID_SHIFT); | ||
528 | vma = find_vma(mm, addr); | ||
529 | continue; | ||
530 | } | ||
531 | if (touches_hugepage_high_range(addr, len)) { | ||
532 | addr = TASK_HPAGE_END; | ||
533 | vma = find_vma(mm, addr); | ||
534 | continue; | ||
535 | } | ||
536 | if (!vma || addr + len <= vma->vm_start) { | ||
537 | /* | ||
538 | * Remember the place where we stopped the search: | ||
539 | */ | ||
540 | mm->free_area_cache = addr + len; | ||
541 | return addr; | ||
542 | } | ||
543 | addr = vma->vm_end; | ||
544 | vma = vma->vm_next; | ||
545 | } | ||
546 | |||
547 | /* Make sure we didn't miss any holes */ | ||
548 | if (start_addr != TASK_UNMAPPED_BASE) { | ||
549 | start_addr = addr = TASK_UNMAPPED_BASE; | ||
550 | goto full_search; | ||
551 | } | ||
552 | return -ENOMEM; | ||
553 | } | ||
554 | |||
555 | /* | ||
556 | * This mmap-allocator allocates new areas top-down from below the | ||
557 | * stack's low limit (the base): | ||
558 | * | ||
559 | * Because we have an exclusive hugepage region which lies within the | ||
560 | * normal user address space, we have to take special measures to make | ||
561 | * non-huge mmap()s evade the hugepage reserved regions. | ||
562 | */ | ||
563 | unsigned long | ||
564 | arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | ||
565 | const unsigned long len, const unsigned long pgoff, | ||
566 | const unsigned long flags) | ||
567 | { | ||
568 | struct vm_area_struct *vma, *prev_vma; | ||
569 | struct mm_struct *mm = current->mm; | ||
570 | unsigned long base = mm->mmap_base, addr = addr0; | ||
571 | int first_time = 1; | ||
572 | |||
573 | /* requested length too big for entire address space */ | ||
574 | if (len > TASK_SIZE) | ||
575 | return -ENOMEM; | ||
576 | |||
577 | /* dont allow allocations above current base */ | ||
578 | if (mm->free_area_cache > base) | ||
579 | mm->free_area_cache = base; | ||
580 | |||
581 | /* requesting a specific address */ | ||
582 | if (addr) { | ||
583 | addr = PAGE_ALIGN(addr); | ||
584 | vma = find_vma(mm, addr); | ||
585 | if (TASK_SIZE - len >= addr && | ||
586 | (!vma || addr + len <= vma->vm_start) | ||
587 | && !is_hugepage_only_range(mm, addr,len)) | ||
588 | return addr; | ||
589 | } | ||
590 | |||
591 | try_again: | ||
592 | /* make sure it can fit in the remaining address space */ | ||
593 | if (mm->free_area_cache < len) | ||
594 | goto fail; | ||
595 | |||
596 | /* either no address requested or cant fit in requested address hole */ | ||
597 | addr = (mm->free_area_cache - len) & PAGE_MASK; | ||
598 | do { | ||
599 | hugepage_recheck: | ||
600 | if (touches_hugepage_low_range(mm, addr, len)) { | ||
601 | addr = (addr & ((~0) << SID_SHIFT)) - len; | ||
602 | goto hugepage_recheck; | ||
603 | } else if (touches_hugepage_high_range(addr, len)) { | ||
604 | addr = TASK_HPAGE_BASE - len; | ||
605 | } | ||
606 | |||
607 | /* | ||
608 | * Lookup failure means no vma is above this address, | ||
609 | * i.e. return with success: | ||
610 | */ | ||
611 | if (!(vma = find_vma_prev(mm, addr, &prev_vma))) | ||
612 | return addr; | ||
613 | |||
614 | /* | ||
615 | * new region fits between prev_vma->vm_end and | ||
616 | * vma->vm_start, use it: | ||
617 | */ | ||
618 | if (addr+len <= vma->vm_start && | ||
619 | (!prev_vma || (addr >= prev_vma->vm_end))) | ||
620 | /* remember the address as a hint for next time */ | ||
621 | return (mm->free_area_cache = addr); | ||
622 | else | ||
623 | /* pull free_area_cache down to the first hole */ | ||
624 | if (mm->free_area_cache == vma->vm_end) | ||
625 | mm->free_area_cache = vma->vm_start; | ||
626 | |||
627 | /* try just below the current vma->vm_start */ | ||
628 | addr = vma->vm_start-len; | ||
629 | } while (len <= vma->vm_start); | ||
630 | |||
631 | fail: | ||
632 | /* | ||
633 | * if hint left us with no space for the requested | ||
634 | * mapping then try again: | ||
635 | */ | ||
636 | if (first_time) { | ||
637 | mm->free_area_cache = base; | ||
638 | first_time = 0; | ||
639 | goto try_again; | ||
640 | } | ||
641 | /* | ||
642 | * A failed mmap() very likely causes application failure, | ||
643 | * so fall back to the bottom-up function here. This scenario | ||
644 | * can happen with large stack limits and large mmap() | ||
645 | * allocations. | ||
646 | */ | ||
647 | mm->free_area_cache = TASK_UNMAPPED_BASE; | ||
648 | addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); | ||
649 | /* | ||
650 | * Restore the topdown base: | ||
651 | */ | ||
652 | mm->free_area_cache = base; | ||
653 | |||
654 | return addr; | ||
655 | } | ||
656 | |||
657 | static unsigned long htlb_get_low_area(unsigned long len, u16 segmask) | ||
658 | { | ||
659 | unsigned long addr = 0; | ||
660 | struct vm_area_struct *vma; | ||
661 | |||
662 | vma = find_vma(current->mm, addr); | ||
663 | while (addr + len <= 0x100000000UL) { | ||
664 | BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */ | ||
665 | |||
666 | if (! __within_hugepage_low_range(addr, len, segmask)) { | ||
667 | addr = ALIGN(addr+1, 1<<SID_SHIFT); | ||
668 | vma = find_vma(current->mm, addr); | ||
669 | continue; | ||
670 | } | ||
671 | |||
672 | if (!vma || (addr + len) <= vma->vm_start) | ||
673 | return addr; | ||
674 | addr = ALIGN(vma->vm_end, HPAGE_SIZE); | ||
675 | /* Depending on segmask this might not be a confirmed | ||
676 | * hugepage region, so the ALIGN could have skipped | ||
677 | * some VMAs */ | ||
678 | vma = find_vma(current->mm, addr); | ||
679 | } | ||
680 | |||
681 | return -ENOMEM; | ||
682 | } | ||
683 | |||
684 | static unsigned long htlb_get_high_area(unsigned long len) | ||
685 | { | ||
686 | unsigned long addr = TASK_HPAGE_BASE; | ||
687 | struct vm_area_struct *vma; | ||
688 | |||
689 | vma = find_vma(current->mm, addr); | ||
690 | for (vma = find_vma(current->mm, addr); | ||
691 | addr + len <= TASK_HPAGE_END; | ||
692 | vma = vma->vm_next) { | ||
693 | BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */ | ||
694 | BUG_ON(! within_hugepage_high_range(addr, len)); | ||
695 | |||
696 | if (!vma || (addr + len) <= vma->vm_start) | ||
697 | return addr; | ||
698 | addr = ALIGN(vma->vm_end, HPAGE_SIZE); | ||
699 | /* Because we're in a hugepage region, this alignment | ||
700 | * should not skip us over any VMAs */ | ||
701 | } | ||
702 | |||
703 | return -ENOMEM; | ||
704 | } | ||
705 | |||
706 | unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, | ||
707 | unsigned long len, unsigned long pgoff, | ||
708 | unsigned long flags) | ||
709 | { | ||
710 | if (len & ~HPAGE_MASK) | ||
711 | return -EINVAL; | ||
712 | |||
713 | if (!cpu_has_feature(CPU_FTR_16M_PAGE)) | ||
714 | return -EINVAL; | ||
715 | |||
716 | if (test_thread_flag(TIF_32BIT)) { | ||
717 | int lastshift = 0; | ||
718 | u16 segmask, cursegs = current->mm->context.htlb_segs; | ||
719 | |||
720 | /* First see if we can do the mapping in the existing | ||
721 | * low hpage segments */ | ||
722 | addr = htlb_get_low_area(len, cursegs); | ||
723 | if (addr != -ENOMEM) | ||
724 | return addr; | ||
725 | |||
726 | for (segmask = LOW_ESID_MASK(0x100000000UL-len, len); | ||
727 | ! lastshift; segmask >>=1) { | ||
728 | if (segmask & 1) | ||
729 | lastshift = 1; | ||
730 | |||
731 | addr = htlb_get_low_area(len, cursegs | segmask); | ||
732 | if ((addr != -ENOMEM) | ||
733 | && open_low_hpage_segs(current->mm, segmask) == 0) | ||
734 | return addr; | ||
735 | } | ||
736 | printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open" | ||
737 | " enough segments\n"); | ||
738 | return -ENOMEM; | ||
739 | } else { | ||
740 | return htlb_get_high_area(len); | ||
741 | } | ||
742 | } | ||
743 | |||
744 | void hugetlb_mm_free_pgd(struct mm_struct *mm) | ||
745 | { | ||
746 | int i; | ||
747 | pgd_t *pgdir; | ||
748 | |||
749 | spin_lock(&mm->page_table_lock); | ||
750 | |||
751 | pgdir = mm->context.huge_pgdir; | ||
752 | if (! pgdir) | ||
753 | goto out; | ||
754 | |||
755 | mm->context.huge_pgdir = NULL; | ||
756 | |||
757 | /* cleanup any hugepte pages leftover */ | ||
758 | for (i = 0; i < PTRS_PER_HUGEPGD; i++) { | ||
759 | pgd_t *pgd = pgdir + i; | ||
760 | |||
761 | if (! pgd_none(*pgd)) { | ||
762 | pte_t *pte = (pte_t *)pgd_page(*pgd); | ||
763 | struct page *ptepage = virt_to_page(pte); | ||
764 | |||
765 | ptepage->mapping = NULL; | ||
766 | |||
767 | BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE)); | ||
768 | kmem_cache_free(zero_cache, pte); | ||
769 | } | ||
770 | pgd_clear(pgd); | ||
771 | } | ||
772 | |||
773 | BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE)); | ||
774 | kmem_cache_free(zero_cache, pgdir); | ||
775 | |||
776 | out: | ||
777 | spin_unlock(&mm->page_table_lock); | ||
778 | } | ||
779 | |||
/*
 * hash_huge_page - install or refresh the hardware hash table entry (HPTE)
 * for the huge page that maps effective address ea in mm.
 *
 * access is a mask of pte bits the faulting access needs; vsid is combined
 * with the low 28 bits of ea to form the virtual address; local is passed
 * through to ppc_md.hpte_updatepp().
 *
 * Returns 0 when an HPTE was updated or inserted, and 1 when there is no
 * usable huge pte or the pte lacks one of the requested access bits.
 * The whole operation runs under mm->page_table_lock.
 *
 * NOTE(review): the literal 1 passed to hpt_hash(), hpte_updatepp() and
 * hpte_insert() presumably selects large-page handling - confirm against
 * those helpers.
 */
780 | int hash_huge_page(struct mm_struct *mm, unsigned long access, | ||
781 | unsigned long ea, unsigned long vsid, int local) | ||
782 | { | ||
783 | pte_t *ptep; | ||
784 | unsigned long va, vpn; | ||
785 | pte_t old_pte, new_pte; | ||
786 | unsigned long hpteflags, prpn; | ||
787 | long slot; | ||
/* Pessimistic default: every early exit through "out" reports failure */
788 | int err = 1; | ||
789 | |||
790 | spin_lock(&mm->page_table_lock); | ||
791 | |||
792 | ptep = huge_pte_offset(mm, ea); | ||
793 | |||
794 | /* Search the Linux page table for a match with va */ | ||
795 | va = (vsid << 28) | (ea & 0x0fffffff); | ||
796 | vpn = va >> HPAGE_SHIFT; | ||
797 | |||
798 | /* | ||
799 | * If no pte found or not present, send the problem up to | ||
800 | * do_page_fault | ||
801 | */ | ||
802 | if (unlikely(!ptep || pte_none(*ptep))) | ||
803 | goto out; | ||
804 | |||
805 | /* BUG_ON(pte_bad(*ptep)); */ | ||
806 | |||
807 | /* | ||
808 | * Check the user's access rights to the page. If access should be | ||
809 | * prevented then send the problem up to do_page_fault. | ||
810 | */ | ||
811 | if (unlikely(access & ~pte_val(*ptep))) | ||
812 | goto out; | ||
813 | /* | ||
814 | * At this point, we have a pte (old_pte) which can be used to build | ||
815 | * or update an HPTE. There are 2 cases: | ||
816 | * | ||
817 | * 1. There is a valid (present) pte with no associated HPTE (this is | ||
818 | * the most common case) | ||
819 | * 2. There is a valid (present) pte with an associated HPTE. The | ||
820 | * current values of the pp bits in the HPTE prevent access | ||
821 | * because we are doing software DIRTY bit management and the | ||
822 | * page is currently not DIRTY. | ||
823 | */ | ||
824 | |||
825 | |||
826 | old_pte = *ptep; | ||
827 | new_pte = old_pte; | ||
828 | |||
/* Bit 1 is always set; bit 0 is set only when the pte lacks _PAGE_RW */
829 | hpteflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW)); | ||
830 | /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ | ||
831 | hpteflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC); | ||
832 | |||
833 | /* Check if pte already has an hpte (case 2) */ | ||
834 | if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) { | ||
835 | /* There MIGHT be an HPTE for this pte */ | ||
/* Note: this inner "slot" shadows the function-scope "long slot" */
836 | unsigned long hash, slot; | ||
837 | |||
838 | hash = hpt_hash(vpn, 1); | ||
839 | if (pte_val(old_pte) & _PAGE_SECONDARY) | ||
840 | hash = ~hash; | ||
841 | slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; | ||
842 | slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12; | ||
843 | |||
/*
 * An update result of -1 clears the HPTE flags in the local copy, so the
 * insert path below runs for this pte as well.
 */
844 | if (ppc_md.hpte_updatepp(slot, hpteflags, va, 1, local) == -1) | ||
845 | pte_val(old_pte) &= ~_PAGE_HPTEFLAGS; | ||
846 | } | ||
847 | |||
848 | if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) { | ||
849 | unsigned long hash = hpt_hash(vpn, 1); | ||
850 | unsigned long hpte_group; | ||
851 | |||
852 | prpn = pte_pfn(old_pte); | ||
853 | |||
/* Re-entered after an entry has been evicted to make room (see below) */
854 | repeat: | ||
855 | hpte_group = ((hash & htab_hash_mask) * | ||
856 | HPTES_PER_GROUP) & ~0x7UL; | ||
857 | |||
858 | /* Update the linux pte with the HPTE slot */ | ||
859 | pte_val(new_pte) &= ~_PAGE_HPTEFLAGS; | ||
860 | pte_val(new_pte) |= _PAGE_HASHPTE; | ||
861 | |||
862 | /* Add in WIMG bits */ | ||
863 | /* XXX We should store these in the pte */ | ||
864 | hpteflags |= _PAGE_COHERENT; | ||
865 | |||
866 | slot = ppc_md.hpte_insert(hpte_group, va, prpn, 0, | ||
867 | hpteflags, 0, 1); | ||
868 | |||
869 | /* Primary is full, try the secondary */ | ||
870 | if (unlikely(slot == -1)) { | ||
871 | pte_val(new_pte) |= _PAGE_SECONDARY; | ||
872 | hpte_group = ((~hash & htab_hash_mask) * | ||
873 | HPTES_PER_GROUP) & ~0x7UL; | ||
874 | slot = ppc_md.hpte_insert(hpte_group, va, prpn, | ||
875 | 1, hpteflags, 0, 1); | ||
/*
 * Both groups are full: use the low timebase bit to choose between the
 * primary and the secondary group, evict an entry from it, and retry.
 */
876 | if (slot == -1) { | ||
877 | if (mftb() & 0x1) | ||
878 | hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; | ||
879 | |||
880 | ppc_md.hpte_remove(hpte_group); | ||
881 | goto repeat; | ||
882 | } | ||
883 | } | ||
884 | |||
885 | if (unlikely(slot == -2)) | ||
886 | panic("hash_huge_page: pte_insert failed\n"); | ||
887 | |||
/* Record the slot index within the group in the linux pte */
888 | pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX; | ||
889 | |||
890 | /* | ||
891 | * No need to use ldarx/stdcx here because all who | ||
892 | * might be updating the pte will hold the | ||
893 | * page_table_lock | ||
894 | */ | ||
895 | *ptep = new_pte; | ||
896 | } | ||
897 | |||
898 | err = 0; | ||
899 | |||
900 | out: | ||
901 | spin_unlock(&mm->page_table_lock); | ||
902 | |||
903 | return err; | ||
904 | } | ||
diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c new file mode 100644 index 00000000000..9d92b0d9cde --- /dev/null +++ b/arch/ppc64/mm/imalloc.c | |||
@@ -0,0 +1,312 @@ | |||
1 | /* | ||
2 | * c 2001 PPC 64 Team, IBM Corp | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | */ | ||
9 | |||
10 | #include <linux/slab.h> | ||
11 | #include <linux/vmalloc.h> | ||
12 | |||
13 | #include <asm/uaccess.h> | ||
14 | #include <asm/pgalloc.h> | ||
15 | #include <asm/pgtable.h> | ||
16 | #include <asm/semaphore.h> | ||
17 | |||
/* Held by im_get_free_area(), im_get_area() and im_free() around list use */
18 | static DECLARE_MUTEX(imlist_sem); | ||
/*
 * Head of the singly linked list of imalloc regions.  New entries are
 * linked in by address (see __add_new_im_area()).
 */
19 | struct vm_struct * imlist = NULL; | ||
20 | |||
21 | static int get_free_im_addr(unsigned long size, unsigned long *im_addr) | ||
22 | { | ||
23 | unsigned long addr; | ||
24 | struct vm_struct **p, *tmp; | ||
25 | |||
26 | addr = IMALLOC_START; | ||
27 | for (p = &imlist; (tmp = *p) ; p = &tmp->next) { | ||
28 | if (size + addr < (unsigned long) tmp->addr) | ||
29 | break; | ||
30 | if ((unsigned long)tmp->addr >= IMALLOC_START) | ||
31 | addr = tmp->size + (unsigned long) tmp->addr; | ||
32 | if (addr > IMALLOC_END-size) | ||
33 | return 1; | ||
34 | } | ||
35 | *im_addr = addr; | ||
36 | |||
37 | return 0; | ||
38 | } | ||
39 | |||
40 | /* Return whether the region described by v_addr and size is a subset | ||
41 | * of the region described by parent | ||
42 | */ | ||
43 | static inline int im_region_is_subset(unsigned long v_addr, unsigned long size, | ||
44 | struct vm_struct *parent) | ||
45 | { | ||
46 | return (int) (v_addr >= (unsigned long) parent->addr && | ||
47 | v_addr < (unsigned long) parent->addr + parent->size && | ||
48 | size < parent->size); | ||
49 | } | ||
50 | |||
51 | /* Return whether the region described by v_addr and size is a superset | ||
52 | * of the region described by child | ||
53 | */ | ||
54 | static int im_region_is_superset(unsigned long v_addr, unsigned long size, | ||
55 | struct vm_struct *child) | ||
56 | { | ||
57 | struct vm_struct parent; | ||
58 | |||
59 | parent.addr = (void *) v_addr; | ||
60 | parent.size = size; | ||
61 | |||
62 | return im_region_is_subset((unsigned long) child->addr, child->size, | ||
63 | &parent); | ||
64 | } | ||
65 | |||
66 | /* Return whether the region described by v_addr and size overlaps | ||
67 | * the region described by vm. Overlapping regions meet the | ||
68 | * following conditions: | ||
69 | * 1) The regions share some part of the address space | ||
70 | * 2) The regions aren't identical | ||
71 | * 3) Neither region is a subset of the other | ||
72 | */ | ||
73 | static int im_region_overlaps(unsigned long v_addr, unsigned long size, | ||
74 | struct vm_struct *vm) | ||
75 | { | ||
76 | if (im_region_is_superset(v_addr, size, vm)) | ||
77 | return 0; | ||
78 | |||
79 | return (v_addr + size > (unsigned long) vm->addr + vm->size && | ||
80 | v_addr < (unsigned long) vm->addr + vm->size) || | ||
81 | (v_addr < (unsigned long) vm->addr && | ||
82 | v_addr + size > (unsigned long) vm->addr); | ||
83 | } | ||
84 | |||
85 | /* Determine imalloc status of region described by v_addr and size. | ||
86 | * Can return one of the following: | ||
87 | * IM_REGION_UNUSED - Entire region is unallocated in imalloc space. | ||
88 | * IM_REGION_SUBSET - Region is a subset of a region that is already | ||
89 | * allocated in imalloc space. | ||
90 | * vm will be assigned to a ptr to the parent region. | ||
91 | * IM_REGION_EXISTS - Exact region already allocated in imalloc space. | ||
92 | * vm will be assigned to a ptr to the existing imlist | ||
93 | * member. | ||
94 | * IM_REGION_OVERLAPS - Region overlaps an allocated region in imalloc space. | ||
95 | * IM_REGION_SUPERSET - Region is a superset of a region that is already | ||
96 | * allocated in imalloc space. | ||
97 | */ | ||
98 | static int im_region_status(unsigned long v_addr, unsigned long size, | ||
99 | struct vm_struct **vm) | ||
100 | { | ||
101 | struct vm_struct *tmp; | ||
102 | |||
103 | for (tmp = imlist; tmp; tmp = tmp->next) | ||
104 | if (v_addr < (unsigned long) tmp->addr + tmp->size) | ||
105 | break; | ||
106 | |||
107 | if (tmp) { | ||
108 | if (im_region_overlaps(v_addr, size, tmp)) | ||
109 | return IM_REGION_OVERLAP; | ||
110 | |||
111 | *vm = tmp; | ||
112 | if (im_region_is_subset(v_addr, size, tmp)) { | ||
113 | /* Return with tmp pointing to superset */ | ||
114 | return IM_REGION_SUBSET; | ||
115 | } | ||
116 | if (im_region_is_superset(v_addr, size, tmp)) { | ||
117 | /* Return with tmp pointing to first subset */ | ||
118 | return IM_REGION_SUPERSET; | ||
119 | } | ||
120 | else if (v_addr == (unsigned long) tmp->addr && | ||
121 | size == tmp->size) { | ||
122 | /* Return with tmp pointing to exact region */ | ||
123 | return IM_REGION_EXISTS; | ||
124 | } | ||
125 | } | ||
126 | |||
127 | *vm = NULL; | ||
128 | return IM_REGION_UNUSED; | ||
129 | } | ||
130 | |||
131 | static struct vm_struct * split_im_region(unsigned long v_addr, | ||
132 | unsigned long size, struct vm_struct *parent) | ||
133 | { | ||
134 | struct vm_struct *vm1 = NULL; | ||
135 | struct vm_struct *vm2 = NULL; | ||
136 | struct vm_struct *new_vm = NULL; | ||
137 | |||
138 | vm1 = (struct vm_struct *) kmalloc(sizeof(*vm1), GFP_KERNEL); | ||
139 | if (vm1 == NULL) { | ||
140 | printk(KERN_ERR "%s() out of memory\n", __FUNCTION__); | ||
141 | return NULL; | ||
142 | } | ||
143 | |||
144 | if (v_addr == (unsigned long) parent->addr) { | ||
145 | /* Use existing parent vm_struct to represent child, allocate | ||
146 | * new one for the remainder of parent range | ||
147 | */ | ||
148 | vm1->size = parent->size - size; | ||
149 | vm1->addr = (void *) (v_addr + size); | ||
150 | vm1->next = parent->next; | ||
151 | |||
152 | parent->size = size; | ||
153 | parent->next = vm1; | ||
154 | new_vm = parent; | ||
155 | } else if (v_addr + size == (unsigned long) parent->addr + | ||
156 | parent->size) { | ||
157 | /* Allocate new vm_struct to represent child, use existing | ||
158 | * parent one for remainder of parent range | ||
159 | */ | ||
160 | vm1->size = size; | ||
161 | vm1->addr = (void *) v_addr; | ||
162 | vm1->next = parent->next; | ||
163 | new_vm = vm1; | ||
164 | |||
165 | parent->size -= size; | ||
166 | parent->next = vm1; | ||
167 | } else { | ||
168 | /* Allocate two new vm_structs for the new child and | ||
169 | * uppermost remainder, and use existing parent one for the | ||
170 | * lower remainder of parent range | ||
171 | */ | ||
172 | vm2 = (struct vm_struct *) kmalloc(sizeof(*vm2), GFP_KERNEL); | ||
173 | if (vm2 == NULL) { | ||
174 | printk(KERN_ERR "%s() out of memory\n", __FUNCTION__); | ||
175 | kfree(vm1); | ||
176 | return NULL; | ||
177 | } | ||
178 | |||
179 | vm1->size = size; | ||
180 | vm1->addr = (void *) v_addr; | ||
181 | vm1->next = vm2; | ||
182 | new_vm = vm1; | ||
183 | |||
184 | vm2->size = ((unsigned long) parent->addr + parent->size) - | ||
185 | (v_addr + size); | ||
186 | vm2->addr = (void *) v_addr + size; | ||
187 | vm2->next = parent->next; | ||
188 | |||
189 | parent->size = v_addr - (unsigned long) parent->addr; | ||
190 | parent->next = vm1; | ||
191 | } | ||
192 | |||
193 | return new_vm; | ||
194 | } | ||
195 | |||
196 | static struct vm_struct * __add_new_im_area(unsigned long req_addr, | ||
197 | unsigned long size) | ||
198 | { | ||
199 | struct vm_struct **p, *tmp, *area; | ||
200 | |||
201 | for (p = &imlist; (tmp = *p) ; p = &tmp->next) { | ||
202 | if (req_addr + size <= (unsigned long)tmp->addr) | ||
203 | break; | ||
204 | } | ||
205 | |||
206 | area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL); | ||
207 | if (!area) | ||
208 | return NULL; | ||
209 | area->flags = 0; | ||
210 | area->addr = (void *)req_addr; | ||
211 | area->size = size; | ||
212 | area->next = *p; | ||
213 | *p = area; | ||
214 | |||
215 | return area; | ||
216 | } | ||
217 | |||
218 | static struct vm_struct * __im_get_area(unsigned long req_addr, | ||
219 | unsigned long size, | ||
220 | int criteria) | ||
221 | { | ||
222 | struct vm_struct *tmp; | ||
223 | int status; | ||
224 | |||
225 | status = im_region_status(req_addr, size, &tmp); | ||
226 | if ((criteria & status) == 0) { | ||
227 | return NULL; | ||
228 | } | ||
229 | |||
230 | switch (status) { | ||
231 | case IM_REGION_UNUSED: | ||
232 | tmp = __add_new_im_area(req_addr, size); | ||
233 | break; | ||
234 | case IM_REGION_SUBSET: | ||
235 | tmp = split_im_region(req_addr, size, tmp); | ||
236 | break; | ||
237 | case IM_REGION_EXISTS: | ||
238 | /* Return requested region */ | ||
239 | break; | ||
240 | case IM_REGION_SUPERSET: | ||
241 | /* Return first existing subset of requested region */ | ||
242 | break; | ||
243 | default: | ||
244 | printk(KERN_ERR "%s() unexpected imalloc region status\n", | ||
245 | __FUNCTION__); | ||
246 | tmp = NULL; | ||
247 | } | ||
248 | |||
249 | return tmp; | ||
250 | } | ||
251 | |||
252 | struct vm_struct * im_get_free_area(unsigned long size) | ||
253 | { | ||
254 | struct vm_struct *area; | ||
255 | unsigned long addr; | ||
256 | |||
257 | down(&imlist_sem); | ||
258 | if (get_free_im_addr(size, &addr)) { | ||
259 | printk(KERN_ERR "%s() cannot obtain addr for size 0x%lx\n", | ||
260 | __FUNCTION__, size); | ||
261 | area = NULL; | ||
262 | goto next_im_done; | ||
263 | } | ||
264 | |||
265 | area = __im_get_area(addr, size, IM_REGION_UNUSED); | ||
266 | if (area == NULL) { | ||
267 | printk(KERN_ERR | ||
268 | "%s() cannot obtain area for addr 0x%lx size 0x%lx\n", | ||
269 | __FUNCTION__, addr, size); | ||
270 | } | ||
271 | next_im_done: | ||
272 | up(&imlist_sem); | ||
273 | return area; | ||
274 | } | ||
275 | |||
276 | struct vm_struct * im_get_area(unsigned long v_addr, unsigned long size, | ||
277 | int criteria) | ||
278 | { | ||
279 | struct vm_struct *area; | ||
280 | |||
281 | down(&imlist_sem); | ||
282 | area = __im_get_area(v_addr, size, criteria); | ||
283 | up(&imlist_sem); | ||
284 | return area; | ||
285 | } | ||
286 | |||
287 | unsigned long im_free(void * addr) | ||
288 | { | ||
289 | struct vm_struct **p, *tmp; | ||
290 | unsigned long ret_size = 0; | ||
291 | |||
292 | if (!addr) | ||
293 | return ret_size; | ||
294 | if ((PAGE_SIZE-1) & (unsigned long) addr) { | ||
295 | printk(KERN_ERR "Trying to %s bad address (%p)\n", __FUNCTION__, addr); | ||
296 | return ret_size; | ||
297 | } | ||
298 | down(&imlist_sem); | ||
299 | for (p = &imlist ; (tmp = *p) ; p = &tmp->next) { | ||
300 | if (tmp->addr == addr) { | ||
301 | ret_size = tmp->size; | ||
302 | *p = tmp->next; | ||
303 | kfree(tmp); | ||
304 | up(&imlist_sem); | ||
305 | return ret_size; | ||
306 | } | ||
307 | } | ||
308 | up(&imlist_sem); | ||
309 | printk(KERN_ERR "Trying to %s nonexistent area (%p)\n", __FUNCTION__, | ||
310 | addr); | ||
311 | return ret_size; | ||
312 | } | ||
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c new file mode 100644 index 00000000000..23813d03e1c --- /dev/null +++ b/arch/ppc64/mm/init.c | |||
@@ -0,0 +1,927 @@ | |||
1 | /* | ||
2 | * PowerPC version | ||
3 | * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) | ||
4 | * | ||
5 | * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) | ||
6 | * and Cort Dougan (PReP) (cort@cs.nmt.edu) | ||
7 | * Copyright (C) 1996 Paul Mackerras | ||
8 | * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk). | ||
9 | * | ||
10 | * Derived from "arch/i386/mm/init.c" | ||
11 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | ||
12 | * | ||
13 | * Dave Engebretsen <engebret@us.ibm.com> | ||
14 | * Rework for PPC64 port. | ||
15 | * | ||
16 | * This program is free software; you can redistribute it and/or | ||
17 | * modify it under the terms of the GNU General Public License | ||
18 | * as published by the Free Software Foundation; either version | ||
19 | * 2 of the License, or (at your option) any later version. | ||
20 | * | ||
21 | */ | ||
22 | |||
23 | #include <linux/config.h> | ||
24 | #include <linux/signal.h> | ||
25 | #include <linux/sched.h> | ||
26 | #include <linux/kernel.h> | ||
27 | #include <linux/errno.h> | ||
28 | #include <linux/string.h> | ||
29 | #include <linux/types.h> | ||
30 | #include <linux/mman.h> | ||
31 | #include <linux/mm.h> | ||
32 | #include <linux/swap.h> | ||
33 | #include <linux/stddef.h> | ||
34 | #include <linux/vmalloc.h> | ||
35 | #include <linux/init.h> | ||
36 | #include <linux/delay.h> | ||
37 | #include <linux/bootmem.h> | ||
38 | #include <linux/highmem.h> | ||
39 | #include <linux/idr.h> | ||
40 | #include <linux/nodemask.h> | ||
41 | #include <linux/module.h> | ||
42 | |||
43 | #include <asm/pgalloc.h> | ||
44 | #include <asm/page.h> | ||
45 | #include <asm/abs_addr.h> | ||
46 | #include <asm/prom.h> | ||
47 | #include <asm/lmb.h> | ||
48 | #include <asm/rtas.h> | ||
49 | #include <asm/io.h> | ||
50 | #include <asm/mmu_context.h> | ||
51 | #include <asm/pgtable.h> | ||
52 | #include <asm/mmu.h> | ||
53 | #include <asm/uaccess.h> | ||
54 | #include <asm/smp.h> | ||
55 | #include <asm/machdep.h> | ||
56 | #include <asm/tlb.h> | ||
57 | #include <asm/eeh.h> | ||
58 | #include <asm/processor.h> | ||
59 | #include <asm/mmzone.h> | ||
60 | #include <asm/cputable.h> | ||
61 | #include <asm/ppcdebug.h> | ||
62 | #include <asm/sections.h> | ||
63 | #include <asm/system.h> | ||
64 | #include <asm/iommu.h> | ||
65 | #include <asm/abs_addr.h> | ||
66 | #include <asm/vdso.h> | ||
67 | |||
68 | int mem_init_done; | ||
69 | unsigned long ioremap_bot = IMALLOC_BASE; | ||
70 | static unsigned long phbs_io_bot = PHBS_IO_BASE; | ||
71 | |||
72 | extern pgd_t swapper_pg_dir[]; | ||
73 | extern struct task_struct *current_set[NR_CPUS]; | ||
74 | |||
75 | extern pgd_t ioremap_dir[]; | ||
76 | pgd_t * ioremap_pgd = (pgd_t *)&ioremap_dir; | ||
77 | |||
78 | unsigned long klimit = (unsigned long)_end; | ||
79 | |||
80 | unsigned long _SDR1=0; | ||
81 | unsigned long _ASR=0; | ||
82 | |||
83 | /* max amount of RAM to use */ | ||
84 | unsigned long __max_memory; | ||
85 | |||
86 | /* info on what we think the IO hole is */ | ||
87 | unsigned long io_hole_start; | ||
88 | unsigned long io_hole_size; | ||
89 | |||
90 | void show_mem(void) | ||
91 | { | ||
92 | unsigned long total = 0, reserved = 0; | ||
93 | unsigned long shared = 0, cached = 0; | ||
94 | struct page *page; | ||
95 | pg_data_t *pgdat; | ||
96 | unsigned long i; | ||
97 | |||
98 | printk("Mem-info:\n"); | ||
99 | show_free_areas(); | ||
100 | printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); | ||
101 | for_each_pgdat(pgdat) { | ||
102 | for (i = 0; i < pgdat->node_spanned_pages; i++) { | ||
103 | page = pgdat->node_mem_map + i; | ||
104 | total++; | ||
105 | if (PageReserved(page)) | ||
106 | reserved++; | ||
107 | else if (PageSwapCache(page)) | ||
108 | cached++; | ||
109 | else if (page_count(page)) | ||
110 | shared += page_count(page) - 1; | ||
111 | } | ||
112 | } | ||
113 | printk("%ld pages of RAM\n", total); | ||
114 | printk("%ld reserved pages\n", reserved); | ||
115 | printk("%ld pages shared\n", shared); | ||
116 | printk("%ld pages swap cached\n", cached); | ||
117 | } | ||
118 | |||
119 | #ifdef CONFIG_PPC_ISERIES | ||
120 | |||
121 | void __iomem *ioremap(unsigned long addr, unsigned long size) | ||
122 | { | ||
123 | return (void __iomem *)addr; | ||
124 | } | ||
125 | |||
126 | extern void __iomem *__ioremap(unsigned long addr, unsigned long size, | ||
127 | unsigned long flags) | ||
128 | { | ||
129 | return (void __iomem *)addr; | ||
130 | } | ||
131 | |||
132 | void iounmap(volatile void __iomem *addr) | ||
133 | { | ||
134 | return; | ||
135 | } | ||
136 | |||
137 | #else | ||
138 | |||
139 | /* | ||
140 | * map_io_page currently only called by __ioremap | ||
141 | * map_io_page adds an entry to the ioremap page table | ||
142 | * and adds an entry to the HPT, possibly bolting it | ||
143 | */ | ||
144 | static void map_io_page(unsigned long ea, unsigned long pa, int flags) | ||
145 | { | ||
146 | pgd_t *pgdp; | ||
147 | pmd_t *pmdp; | ||
148 | pte_t *ptep; | ||
149 | unsigned long vsid; | ||
150 | |||
151 | if (mem_init_done) { | ||
152 | spin_lock(&ioremap_mm.page_table_lock); | ||
153 | pgdp = pgd_offset_i(ea); | ||
154 | pmdp = pmd_alloc(&ioremap_mm, pgdp, ea); | ||
155 | ptep = pte_alloc_kernel(&ioremap_mm, pmdp, ea); | ||
156 | |||
157 | pa = abs_to_phys(pa); | ||
158 | set_pte_at(&ioremap_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); | ||
159 | spin_unlock(&ioremap_mm.page_table_lock); | ||
160 | } else { | ||
161 | unsigned long va, vpn, hash, hpteg; | ||
162 | |||
163 | /* | ||
164 | * If the mm subsystem is not fully up, we cannot create a | ||
165 | * linux page table entry for this mapping. Simply bolt an | ||
166 | * entry in the hardware page table. | ||
167 | */ | ||
168 | vsid = get_kernel_vsid(ea); | ||
169 | va = (vsid << 28) | (ea & 0xFFFFFFF); | ||
170 | vpn = va >> PAGE_SHIFT; | ||
171 | |||
172 | hash = hpt_hash(vpn, 0); | ||
173 | |||
174 | hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); | ||
175 | |||
176 | /* Panic if a pte grpup is full */ | ||
177 | if (ppc_md.hpte_insert(hpteg, va, pa >> PAGE_SHIFT, 0, | ||
178 | _PAGE_NO_CACHE|_PAGE_GUARDED|PP_RWXX, | ||
179 | 1, 0) == -1) { | ||
180 | panic("map_io_page: could not insert mapping"); | ||
181 | } | ||
182 | } | ||
183 | } | ||
184 | |||
185 | |||
186 | static void __iomem * __ioremap_com(unsigned long addr, unsigned long pa, | ||
187 | unsigned long ea, unsigned long size, | ||
188 | unsigned long flags) | ||
189 | { | ||
190 | unsigned long i; | ||
191 | |||
192 | if ((flags & _PAGE_PRESENT) == 0) | ||
193 | flags |= pgprot_val(PAGE_KERNEL); | ||
194 | if (flags & (_PAGE_NO_CACHE | _PAGE_WRITETHRU)) | ||
195 | flags |= _PAGE_GUARDED; | ||
196 | |||
197 | for (i = 0; i < size; i += PAGE_SIZE) { | ||
198 | map_io_page(ea+i, pa+i, flags); | ||
199 | } | ||
200 | |||
201 | return (void __iomem *) (ea + (addr & ~PAGE_MASK)); | ||
202 | } | ||
203 | |||
204 | |||
205 | void __iomem * | ||
206 | ioremap(unsigned long addr, unsigned long size) | ||
207 | { | ||
208 | return __ioremap(addr, size, _PAGE_NO_CACHE); | ||
209 | } | ||
210 | |||
211 | void __iomem * | ||
212 | __ioremap(unsigned long addr, unsigned long size, unsigned long flags) | ||
213 | { | ||
214 | unsigned long pa, ea; | ||
215 | |||
216 | /* | ||
217 | * Choose an address to map it to. | ||
218 | * Once the imalloc system is running, we use it. | ||
219 | * Before that, we map using addresses going | ||
220 | * up from ioremap_bot. imalloc will use | ||
221 | * the addresses from ioremap_bot through | ||
222 | * IMALLOC_END (0xE000001fffffffff) | ||
223 | * | ||
224 | */ | ||
225 | pa = addr & PAGE_MASK; | ||
226 | size = PAGE_ALIGN(addr + size) - pa; | ||
227 | |||
228 | if (size == 0) | ||
229 | return NULL; | ||
230 | |||
231 | if (mem_init_done) { | ||
232 | struct vm_struct *area; | ||
233 | area = im_get_free_area(size); | ||
234 | if (area == NULL) | ||
235 | return NULL; | ||
236 | ea = (unsigned long)(area->addr); | ||
237 | } else { | ||
238 | ea = ioremap_bot; | ||
239 | ioremap_bot += size; | ||
240 | } | ||
241 | |||
242 | return __ioremap_com(addr, pa, ea, size, flags); | ||
243 | } | ||
244 | |||
245 | #define IS_PAGE_ALIGNED(_val) ((_val) == ((_val) & PAGE_MASK)) | ||
246 | |||
247 | int __ioremap_explicit(unsigned long pa, unsigned long ea, | ||
248 | unsigned long size, unsigned long flags) | ||
249 | { | ||
250 | struct vm_struct *area; | ||
251 | |||
252 | /* For now, require page-aligned values for pa, ea, and size */ | ||
253 | if (!IS_PAGE_ALIGNED(pa) || !IS_PAGE_ALIGNED(ea) || | ||
254 | !IS_PAGE_ALIGNED(size)) { | ||
255 | printk(KERN_ERR "unaligned value in %s\n", __FUNCTION__); | ||
256 | return 1; | ||
257 | } | ||
258 | |||
259 | if (!mem_init_done) { | ||
260 | /* Two things to consider in this case: | ||
261 | * 1) No records will be kept (imalloc, etc) that the region | ||
262 | * has been remapped | ||
263 | * 2) It won't be easy to iounmap() the region later (because | ||
264 | * of 1) | ||
265 | */ | ||
266 | ; | ||
267 | } else { | ||
268 | area = im_get_area(ea, size, | ||
269 | IM_REGION_UNUSED|IM_REGION_SUBSET|IM_REGION_EXISTS); | ||
270 | if (area == NULL) { | ||
271 | /* Expected when PHB-dlpar is in play */ | ||
272 | return 1; | ||
273 | } | ||
274 | if (ea != (unsigned long) area->addr) { | ||
275 | printk(KERN_ERR "unexpected addr return from im_get_area\n"); | ||
276 | return 1; | ||
277 | } | ||
278 | } | ||
279 | |||
280 | if (__ioremap_com(pa, pa, ea, size, flags) != (void *) ea) { | ||
281 | printk(KERN_ERR "__ioremap_com() returned unexpected addr\n"); | ||
282 | return 1; | ||
283 | } | ||
284 | |||
285 | return 0; | ||
286 | } | ||
287 | |||
288 | static void unmap_im_area_pte(pmd_t *pmd, unsigned long address, | ||
289 | unsigned long size) | ||
290 | { | ||
291 | unsigned long base, end; | ||
292 | pte_t *pte; | ||
293 | |||
294 | if (pmd_none(*pmd)) | ||
295 | return; | ||
296 | if (pmd_bad(*pmd)) { | ||
297 | pmd_ERROR(*pmd); | ||
298 | pmd_clear(pmd); | ||
299 | return; | ||
300 | } | ||
301 | |||
302 | pte = pte_offset_kernel(pmd, address); | ||
303 | base = address & PMD_MASK; | ||
304 | address &= ~PMD_MASK; | ||
305 | end = address + size; | ||
306 | if (end > PMD_SIZE) | ||
307 | end = PMD_SIZE; | ||
308 | |||
309 | do { | ||
310 | pte_t page; | ||
311 | page = ptep_get_and_clear(&ioremap_mm, base + address, pte); | ||
312 | address += PAGE_SIZE; | ||
313 | pte++; | ||
314 | if (pte_none(page)) | ||
315 | continue; | ||
316 | if (pte_present(page)) | ||
317 | continue; | ||
318 | printk(KERN_CRIT "Whee.. Swapped out page in kernel page table\n"); | ||
319 | } while (address < end); | ||
320 | } | ||
321 | |||
322 | static void unmap_im_area_pmd(pgd_t *dir, unsigned long address, | ||
323 | unsigned long size) | ||
324 | { | ||
325 | unsigned long base, end; | ||
326 | pmd_t *pmd; | ||
327 | |||
328 | if (pgd_none(*dir)) | ||
329 | return; | ||
330 | if (pgd_bad(*dir)) { | ||
331 | pgd_ERROR(*dir); | ||
332 | pgd_clear(dir); | ||
333 | return; | ||
334 | } | ||
335 | |||
336 | pmd = pmd_offset(dir, address); | ||
337 | base = address & PGDIR_MASK; | ||
338 | address &= ~PGDIR_MASK; | ||
339 | end = address + size; | ||
340 | if (end > PGDIR_SIZE) | ||
341 | end = PGDIR_SIZE; | ||
342 | |||
343 | do { | ||
344 | unmap_im_area_pte(pmd, base + address, end - address); | ||
345 | address = (address + PMD_SIZE) & PMD_MASK; | ||
346 | pmd++; | ||
347 | } while (address < end); | ||
348 | } | ||
349 | |||
350 | /* | ||
351 | * Unmap an IO region and remove it from imalloc'd list. | ||
352 | * Access to IO memory should be serialized by driver. | ||
353 | * This code is modeled after vmalloc code - unmap_vm_area() | ||
354 | * | ||
355 | * XXX what about calls before mem_init_done (ie python_countermeasures()) | ||
356 | */ | ||
357 | void iounmap(volatile void __iomem *token) | ||
358 | { | ||
359 | unsigned long address, start, end, size; | ||
360 | struct mm_struct *mm; | ||
361 | pgd_t *dir; | ||
362 | void *addr; | ||
363 | |||
364 | if (!mem_init_done) { | ||
365 | return; | ||
366 | } | ||
367 | |||
368 | addr = (void *) ((unsigned long __force) token & PAGE_MASK); | ||
369 | |||
370 | if ((size = im_free(addr)) == 0) { | ||
371 | return; | ||
372 | } | ||
373 | |||
374 | address = (unsigned long)addr; | ||
375 | start = address; | ||
376 | end = address + size; | ||
377 | |||
378 | mm = &ioremap_mm; | ||
379 | spin_lock(&mm->page_table_lock); | ||
380 | |||
381 | dir = pgd_offset_i(address); | ||
382 | flush_cache_vunmap(address, end); | ||
383 | do { | ||
384 | unmap_im_area_pmd(dir, address, end - address); | ||
385 | address = (address + PGDIR_SIZE) & PGDIR_MASK; | ||
386 | dir++; | ||
387 | } while (address && (address < end)); | ||
388 | flush_tlb_kernel_range(start, end); | ||
389 | |||
390 | spin_unlock(&mm->page_table_lock); | ||
391 | return; | ||
392 | } | ||
393 | |||
394 | static int iounmap_subset_regions(unsigned long addr, unsigned long size) | ||
395 | { | ||
396 | struct vm_struct *area; | ||
397 | |||
398 | /* Check whether subsets of this region exist */ | ||
399 | area = im_get_area(addr, size, IM_REGION_SUPERSET); | ||
400 | if (area == NULL) | ||
401 | return 1; | ||
402 | |||
403 | while (area) { | ||
404 | iounmap((void __iomem *) area->addr); | ||
405 | area = im_get_area(addr, size, | ||
406 | IM_REGION_SUPERSET); | ||
407 | } | ||
408 | |||
409 | return 0; | ||
410 | } | ||
411 | |||
412 | int iounmap_explicit(volatile void __iomem *start, unsigned long size) | ||
413 | { | ||
414 | struct vm_struct *area; | ||
415 | unsigned long addr; | ||
416 | int rc; | ||
417 | |||
418 | addr = (unsigned long __force) start & PAGE_MASK; | ||
419 | |||
420 | /* Verify that the region either exists or is a subset of an existing | ||
421 | * region. In the latter case, split the parent region to create | ||
422 | * the exact region | ||
423 | */ | ||
424 | area = im_get_area(addr, size, | ||
425 | IM_REGION_EXISTS | IM_REGION_SUBSET); | ||
426 | if (area == NULL) { | ||
427 | /* Determine whether subset regions exist. If so, unmap */ | ||
428 | rc = iounmap_subset_regions(addr, size); | ||
429 | if (rc) { | ||
430 | printk(KERN_ERR | ||
431 | "%s() cannot unmap nonexistent range 0x%lx\n", | ||
432 | __FUNCTION__, addr); | ||
433 | return 1; | ||
434 | } | ||
435 | } else { | ||
436 | iounmap((void __iomem *) area->addr); | ||
437 | } | ||
438 | /* | ||
439 | * FIXME! This can't be right: | ||
440 | iounmap(area->addr); | ||
441 | * Maybe it should be "iounmap(area);" | ||
442 | */ | ||
443 | return 0; | ||
444 | } | ||
445 | |||
446 | #endif | ||
447 | |||
448 | EXPORT_SYMBOL(ioremap); | ||
449 | EXPORT_SYMBOL(__ioremap); | ||
450 | EXPORT_SYMBOL(iounmap); | ||
451 | |||
452 | void free_initmem(void) | ||
453 | { | ||
454 | unsigned long addr; | ||
455 | |||
456 | addr = (unsigned long)__init_begin; | ||
457 | for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) { | ||
458 | ClearPageReserved(virt_to_page(addr)); | ||
459 | set_page_count(virt_to_page(addr), 1); | ||
460 | free_page(addr); | ||
461 | totalram_pages++; | ||
462 | } | ||
463 | printk ("Freeing unused kernel memory: %luk freed\n", | ||
464 | ((unsigned long)__init_end - (unsigned long)__init_begin) >> 10); | ||
465 | } | ||
466 | |||
467 | #ifdef CONFIG_BLK_DEV_INITRD | ||
468 | void free_initrd_mem(unsigned long start, unsigned long end) | ||
469 | { | ||
470 | if (start < end) | ||
471 | printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); | ||
472 | for (; start < end; start += PAGE_SIZE) { | ||
473 | ClearPageReserved(virt_to_page(start)); | ||
474 | set_page_count(virt_to_page(start), 1); | ||
475 | free_page(start); | ||
476 | totalram_pages++; | ||
477 | } | ||
478 | } | ||
479 | #endif | ||
480 | |||
481 | static DEFINE_SPINLOCK(mmu_context_lock); | ||
482 | static DEFINE_IDR(mmu_context_idr); | ||
483 | |||
484 | int init_new_context(struct task_struct *tsk, struct mm_struct *mm) | ||
485 | { | ||
486 | int index; | ||
487 | int err; | ||
488 | |||
489 | #ifdef CONFIG_HUGETLB_PAGE | ||
490 | /* We leave htlb_segs as it was, but for a fork, we need to | ||
491 | * clear the huge_pgdir. */ | ||
492 | mm->context.huge_pgdir = NULL; | ||
493 | #endif | ||
494 | |||
495 | again: | ||
496 | if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL)) | ||
497 | return -ENOMEM; | ||
498 | |||
499 | spin_lock(&mmu_context_lock); | ||
500 | err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index); | ||
501 | spin_unlock(&mmu_context_lock); | ||
502 | |||
503 | if (err == -EAGAIN) | ||
504 | goto again; | ||
505 | else if (err) | ||
506 | return err; | ||
507 | |||
508 | if (index > MAX_CONTEXT) { | ||
509 | idr_remove(&mmu_context_idr, index); | ||
510 | return -ENOMEM; | ||
511 | } | ||
512 | |||
513 | mm->context.id = index; | ||
514 | |||
515 | return 0; | ||
516 | } | ||
517 | |||
518 | void destroy_context(struct mm_struct *mm) | ||
519 | { | ||
520 | spin_lock(&mmu_context_lock); | ||
521 | idr_remove(&mmu_context_idr, mm->context.id); | ||
522 | spin_unlock(&mmu_context_lock); | ||
523 | |||
524 | mm->context.id = NO_CONTEXT; | ||
525 | |||
526 | hugetlb_mm_free_pgd(mm); | ||
527 | } | ||
528 | |||
529 | /* | ||
530 | * Do very early mm setup. | ||
531 | */ | ||
532 | void __init mm_init_ppc64(void) | ||
533 | { | ||
534 | #ifndef CONFIG_PPC_ISERIES | ||
535 | unsigned long i; | ||
536 | #endif | ||
537 | |||
538 | ppc64_boot_msg(0x100, "MM Init"); | ||
539 | |||
540 | /* This is the story of the IO hole... please, keep seated, | ||
541 | * unfortunately, we are out of oxygen masks at the moment. | ||
542 | * So we need some rough way to tell where your big IO hole | ||
543 | * is. On pmac, it's between 2G and 4G, on POWER3, it's around | ||
544 | * that area as well, on POWER4 we don't have one, etc... | ||
545 | * We need that as a "hint" when sizing the TCE table on POWER3 | ||
546 | * So far, the simplest way that seem work well enough for us it | ||
547 | * to just assume that the first discontinuity in our physical | ||
548 | * RAM layout is the IO hole. That may not be correct in the future | ||
549 | * (and isn't on iSeries but then we don't care ;) | ||
550 | */ | ||
551 | |||
552 | #ifndef CONFIG_PPC_ISERIES | ||
553 | for (i = 1; i < lmb.memory.cnt; i++) { | ||
554 | unsigned long base, prevbase, prevsize; | ||
555 | |||
556 | prevbase = lmb.memory.region[i-1].physbase; | ||
557 | prevsize = lmb.memory.region[i-1].size; | ||
558 | base = lmb.memory.region[i].physbase; | ||
559 | if (base > (prevbase + prevsize)) { | ||
560 | io_hole_start = prevbase + prevsize; | ||
561 | io_hole_size = base - (prevbase + prevsize); | ||
562 | break; | ||
563 | } | ||
564 | } | ||
565 | #endif /* CONFIG_PPC_ISERIES */ | ||
566 | if (io_hole_start) | ||
567 | printk("IO Hole assumed to be %lx -> %lx\n", | ||
568 | io_hole_start, io_hole_start + io_hole_size - 1); | ||
569 | |||
570 | ppc64_boot_msg(0x100, "MM Init Done"); | ||
571 | } | ||
572 | |||
573 | /* | ||
574 | * This is called by /dev/mem to know if a given address has to | ||
575 | * be mapped non-cacheable or not | ||
576 | */ | ||
577 | int page_is_ram(unsigned long pfn) | ||
578 | { | ||
579 | int i; | ||
580 | unsigned long paddr = (pfn << PAGE_SHIFT); | ||
581 | |||
582 | for (i=0; i < lmb.memory.cnt; i++) { | ||
583 | unsigned long base; | ||
584 | |||
585 | #ifdef CONFIG_MSCHUNKS | ||
586 | base = lmb.memory.region[i].physbase; | ||
587 | #else | ||
588 | base = lmb.memory.region[i].base; | ||
589 | #endif | ||
590 | if ((paddr >= base) && | ||
591 | (paddr < (base + lmb.memory.region[i].size))) { | ||
592 | return 1; | ||
593 | } | ||
594 | } | ||
595 | |||
596 | return 0; | ||
597 | } | ||
598 | EXPORT_SYMBOL(page_is_ram); | ||
599 | |||
600 | /* | ||
601 | * Initialize the bootmem system and give it all the memory we | ||
602 | * have available. | ||
603 | */ | ||
604 | #ifndef CONFIG_DISCONTIGMEM | ||
605 | void __init do_init_bootmem(void) | ||
606 | { | ||
607 | unsigned long i; | ||
608 | unsigned long start, bootmap_pages; | ||
609 | unsigned long total_pages = lmb_end_of_DRAM() >> PAGE_SHIFT; | ||
610 | int boot_mapsize; | ||
611 | |||
612 | /* | ||
613 | * Find an area to use for the bootmem bitmap. Calculate the size of | ||
614 | * bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE. | ||
615 | * Add 1 additional page in case the address isn't page-aligned. | ||
616 | */ | ||
617 | bootmap_pages = bootmem_bootmap_pages(total_pages); | ||
618 | |||
619 | start = abs_to_phys(lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE)); | ||
620 | BUG_ON(!start); | ||
621 | |||
622 | boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages); | ||
623 | |||
624 | max_pfn = max_low_pfn; | ||
625 | |||
626 | /* add all physical memory to the bootmem map. Also find the first */ | ||
627 | for (i=0; i < lmb.memory.cnt; i++) { | ||
628 | unsigned long physbase, size; | ||
629 | |||
630 | physbase = lmb.memory.region[i].physbase; | ||
631 | size = lmb.memory.region[i].size; | ||
632 | free_bootmem(physbase, size); | ||
633 | } | ||
634 | |||
635 | /* reserve the sections we're already using */ | ||
636 | for (i=0; i < lmb.reserved.cnt; i++) { | ||
637 | unsigned long physbase = lmb.reserved.region[i].physbase; | ||
638 | unsigned long size = lmb.reserved.region[i].size; | ||
639 | |||
640 | reserve_bootmem(physbase, size); | ||
641 | } | ||
642 | } | ||
643 | |||
644 | /* | ||
645 | * paging_init() sets up the page tables - in fact we've already done this. | ||
646 | */ | ||
647 | void __init paging_init(void) | ||
648 | { | ||
649 | unsigned long zones_size[MAX_NR_ZONES]; | ||
650 | unsigned long zholes_size[MAX_NR_ZONES]; | ||
651 | unsigned long total_ram = lmb_phys_mem_size(); | ||
652 | unsigned long top_of_ram = lmb_end_of_DRAM(); | ||
653 | |||
654 | printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", | ||
655 | top_of_ram, total_ram); | ||
656 | printk(KERN_INFO "Memory hole size: %ldMB\n", | ||
657 | (top_of_ram - total_ram) >> 20); | ||
658 | /* | ||
659 | * All pages are DMA-able so we put them all in the DMA zone. | ||
660 | */ | ||
661 | memset(zones_size, 0, sizeof(zones_size)); | ||
662 | memset(zholes_size, 0, sizeof(zholes_size)); | ||
663 | |||
664 | zones_size[ZONE_DMA] = top_of_ram >> PAGE_SHIFT; | ||
665 | zholes_size[ZONE_DMA] = (top_of_ram - total_ram) >> PAGE_SHIFT; | ||
666 | |||
667 | free_area_init_node(0, &contig_page_data, zones_size, | ||
668 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size); | ||
669 | } | ||
670 | #endif /* CONFIG_DISCONTIGMEM */ | ||
671 | |||
672 | static struct kcore_list kcore_vmem; | ||
673 | |||
674 | static int __init setup_kcore(void) | ||
675 | { | ||
676 | int i; | ||
677 | |||
678 | for (i=0; i < lmb.memory.cnt; i++) { | ||
679 | unsigned long physbase, size; | ||
680 | struct kcore_list *kcore_mem; | ||
681 | |||
682 | physbase = lmb.memory.region[i].physbase; | ||
683 | size = lmb.memory.region[i].size; | ||
684 | |||
685 | /* GFP_ATOMIC to avoid might_sleep warnings during boot */ | ||
686 | kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC); | ||
687 | if (!kcore_mem) | ||
688 | panic("mem_init: kmalloc failed\n"); | ||
689 | |||
690 | kclist_add(kcore_mem, __va(physbase), size); | ||
691 | } | ||
692 | |||
693 | kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); | ||
694 | |||
695 | return 0; | ||
696 | } | ||
697 | module_init(setup_kcore); | ||
698 | |||
699 | void __init mem_init(void) | ||
700 | { | ||
701 | #ifdef CONFIG_DISCONTIGMEM | ||
702 | int nid; | ||
703 | #endif | ||
704 | pg_data_t *pgdat; | ||
705 | unsigned long i; | ||
706 | struct page *page; | ||
707 | unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize; | ||
708 | |||
709 | num_physpages = max_low_pfn; /* RAM is assumed contiguous */ | ||
710 | high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); | ||
711 | |||
712 | #ifdef CONFIG_DISCONTIGMEM | ||
713 | for_each_online_node(nid) { | ||
714 | if (NODE_DATA(nid)->node_spanned_pages != 0) { | ||
715 | printk("freeing bootmem node %x\n", nid); | ||
716 | totalram_pages += | ||
717 | free_all_bootmem_node(NODE_DATA(nid)); | ||
718 | } | ||
719 | } | ||
720 | #else | ||
721 | max_mapnr = num_physpages; | ||
722 | totalram_pages += free_all_bootmem(); | ||
723 | #endif | ||
724 | |||
725 | for_each_pgdat(pgdat) { | ||
726 | for (i = 0; i < pgdat->node_spanned_pages; i++) { | ||
727 | page = pgdat->node_mem_map + i; | ||
728 | if (PageReserved(page)) | ||
729 | reservedpages++; | ||
730 | } | ||
731 | } | ||
732 | |||
733 | codesize = (unsigned long)&_etext - (unsigned long)&_stext; | ||
734 | initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin; | ||
735 | datasize = (unsigned long)&_edata - (unsigned long)&__init_end; | ||
736 | bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start; | ||
737 | |||
738 | printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, " | ||
739 | "%luk reserved, %luk data, %luk bss, %luk init)\n", | ||
740 | (unsigned long)nr_free_pages() << (PAGE_SHIFT-10), | ||
741 | num_physpages << (PAGE_SHIFT-10), | ||
742 | codesize >> 10, | ||
743 | reservedpages << (PAGE_SHIFT-10), | ||
744 | datasize >> 10, | ||
745 | bsssize >> 10, | ||
746 | initsize >> 10); | ||
747 | |||
748 | mem_init_done = 1; | ||
749 | |||
750 | #ifdef CONFIG_PPC_ISERIES | ||
751 | iommu_vio_init(); | ||
752 | #endif | ||
753 | /* Initialize the vDSO */ | ||
754 | vdso_init(); | ||
755 | } | ||
756 | |||
757 | /* | ||
758 | * This is called when a page has been modified by the kernel. | ||
759 | * It just marks the page as not i-cache clean. We do the i-cache | ||
760 | * flush later when the page is given to a user process, if necessary. | ||
761 | */ | ||
762 | void flush_dcache_page(struct page *page) | ||
763 | { | ||
764 | if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) | ||
765 | return; | ||
766 | /* avoid an atomic op if possible */ | ||
767 | if (test_bit(PG_arch_1, &page->flags)) | ||
768 | clear_bit(PG_arch_1, &page->flags); | ||
769 | } | ||
770 | EXPORT_SYMBOL(flush_dcache_page); | ||
771 | |||
772 | void clear_user_page(void *page, unsigned long vaddr, struct page *pg) | ||
773 | { | ||
774 | clear_page(page); | ||
775 | |||
776 | if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) | ||
777 | return; | ||
778 | /* | ||
779 | * We shouldnt have to do this, but some versions of glibc | ||
780 | * require it (ld.so assumes zero filled pages are icache clean) | ||
781 | * - Anton | ||
782 | */ | ||
783 | |||
784 | /* avoid an atomic op if possible */ | ||
785 | if (test_bit(PG_arch_1, &pg->flags)) | ||
786 | clear_bit(PG_arch_1, &pg->flags); | ||
787 | } | ||
788 | EXPORT_SYMBOL(clear_user_page); | ||
789 | |||
790 | void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, | ||
791 | struct page *pg) | ||
792 | { | ||
793 | copy_page(vto, vfrom); | ||
794 | |||
795 | /* | ||
796 | * We should be able to use the following optimisation, however | ||
797 | * there are two problems. | ||
798 | * Firstly a bug in some versions of binutils meant PLT sections | ||
799 | * were not marked executable. | ||
800 | * Secondly the first word in the GOT section is blrl, used | ||
801 | * to establish the GOT address. Until recently the GOT was | ||
802 | * not marked executable. | ||
803 | * - Anton | ||
804 | */ | ||
805 | #if 0 | ||
806 | if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0)) | ||
807 | return; | ||
808 | #endif | ||
809 | |||
810 | if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) | ||
811 | return; | ||
812 | |||
813 | /* avoid an atomic op if possible */ | ||
814 | if (test_bit(PG_arch_1, &pg->flags)) | ||
815 | clear_bit(PG_arch_1, &pg->flags); | ||
816 | } | ||
817 | |||
818 | void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, | ||
819 | unsigned long addr, int len) | ||
820 | { | ||
821 | unsigned long maddr; | ||
822 | |||
823 | maddr = (unsigned long)page_address(page) + (addr & ~PAGE_MASK); | ||
824 | flush_icache_range(maddr, maddr + len); | ||
825 | } | ||
826 | EXPORT_SYMBOL(flush_icache_user_range); | ||
827 | |||
828 | /* | ||
829 | * This is called at the end of handling a user page fault, when the | ||
830 | * fault has been handled by updating a PTE in the linux page tables. | ||
831 | * We use it to preload an HPTE into the hash table corresponding to | ||
832 | * the updated linux PTE. | ||
833 | * | ||
834 | * This must always be called with the mm->page_table_lock held | ||
835 | */ | ||
836 | void update_mmu_cache(struct vm_area_struct *vma, unsigned long ea, | ||
837 | pte_t pte) | ||
838 | { | ||
839 | unsigned long vsid; | ||
840 | void *pgdir; | ||
841 | pte_t *ptep; | ||
842 | int local = 0; | ||
843 | cpumask_t tmp; | ||
844 | unsigned long flags; | ||
845 | |||
846 | /* handle i-cache coherency */ | ||
847 | if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) && | ||
848 | !cpu_has_feature(CPU_FTR_NOEXECUTE)) { | ||
849 | unsigned long pfn = pte_pfn(pte); | ||
850 | if (pfn_valid(pfn)) { | ||
851 | struct page *page = pfn_to_page(pfn); | ||
852 | if (!PageReserved(page) | ||
853 | && !test_bit(PG_arch_1, &page->flags)) { | ||
854 | __flush_dcache_icache(page_address(page)); | ||
855 | set_bit(PG_arch_1, &page->flags); | ||
856 | } | ||
857 | } | ||
858 | } | ||
859 | |||
860 | /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ | ||
861 | if (!pte_young(pte)) | ||
862 | return; | ||
863 | |||
864 | pgdir = vma->vm_mm->pgd; | ||
865 | if (pgdir == NULL) | ||
866 | return; | ||
867 | |||
868 | ptep = find_linux_pte(pgdir, ea); | ||
869 | if (!ptep) | ||
870 | return; | ||
871 | |||
872 | vsid = get_vsid(vma->vm_mm->context.id, ea); | ||
873 | |||
874 | local_irq_save(flags); | ||
875 | tmp = cpumask_of_cpu(smp_processor_id()); | ||
876 | if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp)) | ||
877 | local = 1; | ||
878 | |||
879 | __hash_page(ea, pte_val(pte) & (_PAGE_USER|_PAGE_RW), vsid, ptep, | ||
880 | 0x300, local); | ||
881 | local_irq_restore(flags); | ||
882 | } | ||
883 | |||
884 | void __iomem * reserve_phb_iospace(unsigned long size) | ||
885 | { | ||
886 | void __iomem *virt_addr; | ||
887 | |||
888 | if (phbs_io_bot >= IMALLOC_BASE) | ||
889 | panic("reserve_phb_iospace(): phb io space overflow\n"); | ||
890 | |||
891 | virt_addr = (void __iomem *) phbs_io_bot; | ||
892 | phbs_io_bot += size; | ||
893 | |||
894 | return virt_addr; | ||
895 | } | ||
896 | |||
897 | kmem_cache_t *zero_cache; | ||
898 | |||
899 | static void zero_ctor(void *pte, kmem_cache_t *cache, unsigned long flags) | ||
900 | { | ||
901 | memset(pte, 0, PAGE_SIZE); | ||
902 | } | ||
903 | |||
904 | void pgtable_cache_init(void) | ||
905 | { | ||
906 | zero_cache = kmem_cache_create("zero", | ||
907 | PAGE_SIZE, | ||
908 | 0, | ||
909 | SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, | ||
910 | zero_ctor, | ||
911 | NULL); | ||
912 | if (!zero_cache) | ||
913 | panic("pgtable_cache_init(): could not create zero_cache!\n"); | ||
914 | } | ||
915 | |||
916 | pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, | ||
917 | unsigned long size, pgprot_t vma_prot) | ||
918 | { | ||
919 | if (ppc_md.phys_mem_access_prot) | ||
920 | return ppc_md.phys_mem_access_prot(file, addr, size, vma_prot); | ||
921 | |||
922 | if (!page_is_ram(addr >> PAGE_SHIFT)) | ||
923 | vma_prot = __pgprot(pgprot_val(vma_prot) | ||
924 | | _PAGE_GUARDED | _PAGE_NO_CACHE); | ||
925 | return vma_prot; | ||
926 | } | ||
927 | EXPORT_SYMBOL(phys_mem_access_prot); | ||
diff --git a/arch/ppc64/mm/mmap.c b/arch/ppc64/mm/mmap.c new file mode 100644 index 00000000000..fe65f522aff --- /dev/null +++ b/arch/ppc64/mm/mmap.c | |||
@@ -0,0 +1,86 @@ | |||
1 | /* | ||
2 | * linux/arch/ppc64/mm/mmap.c | ||
3 | * | ||
4 | * flexible mmap layout support | ||
5 | * | ||
6 | * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. | ||
7 | * All Rights Reserved. | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2 of the License, or | ||
12 | * (at your option) any later version. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
17 | * GNU General Public License for more details. | ||
18 | * | ||
19 | * You should have received a copy of the GNU General Public License | ||
20 | * along with this program; if not, write to the Free Software | ||
21 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
22 | * | ||
23 | * | ||
24 | * Started by Ingo Molnar <mingo@elte.hu> | ||
25 | */ | ||
26 | |||
27 | #include <linux/personality.h> | ||
28 | #include <linux/mm.h> | ||
29 | |||
30 | /* | ||
31 | * Top of mmap area (just below the process stack). | ||
32 | * | ||
33 | * Leave a hole of at least ~128 MB. | ||
34 | */ | ||
35 | #define MIN_GAP (128*1024*1024) | ||
36 | #define MAX_GAP (TASK_SIZE/6*5) | ||
37 | |||
38 | static inline unsigned long mmap_base(void) | ||
39 | { | ||
40 | unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; | ||
41 | |||
42 | if (gap < MIN_GAP) | ||
43 | gap = MIN_GAP; | ||
44 | else if (gap > MAX_GAP) | ||
45 | gap = MAX_GAP; | ||
46 | |||
47 | return TASK_SIZE - (gap & PAGE_MASK); | ||
48 | } | ||
49 | |||
50 | static inline int mmap_is_legacy(void) | ||
51 | { | ||
52 | /* | ||
53 | * Force standard allocation for 64 bit programs. | ||
54 | */ | ||
55 | if (!test_thread_flag(TIF_32BIT)) | ||
56 | return 1; | ||
57 | |||
58 | if (current->personality & ADDR_COMPAT_LAYOUT) | ||
59 | return 1; | ||
60 | |||
61 | if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) | ||
62 | return 1; | ||
63 | |||
64 | return sysctl_legacy_va_layout; | ||
65 | } | ||
66 | |||
67 | /* | ||
68 | * This function, called very early during the creation of a new | ||
69 | * process VM image, sets up which VM layout function to use: | ||
70 | */ | ||
71 | void arch_pick_mmap_layout(struct mm_struct *mm) | ||
72 | { | ||
73 | /* | ||
74 | * Fall back to the standard layout if the personality | ||
75 | * bit is set, or if the expected stack growth is unlimited: | ||
76 | */ | ||
77 | if (mmap_is_legacy()) { | ||
78 | mm->mmap_base = TASK_UNMAPPED_BASE; | ||
79 | mm->get_unmapped_area = arch_get_unmapped_area; | ||
80 | mm->unmap_area = arch_unmap_area; | ||
81 | } else { | ||
82 | mm->mmap_base = mmap_base(); | ||
83 | mm->get_unmapped_area = arch_get_unmapped_area_topdown; | ||
84 | mm->unmap_area = arch_unmap_area_topdown; | ||
85 | } | ||
86 | } | ||
diff --git a/arch/ppc64/mm/numa.c b/arch/ppc64/mm/numa.c new file mode 100644 index 00000000000..ea862ec643d --- /dev/null +++ b/arch/ppc64/mm/numa.c | |||
@@ -0,0 +1,734 @@ | |||
1 | /* | ||
2 | * pSeries NUMA support | ||
3 | * | ||
4 | * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | */ | ||
11 | #include <linux/threads.h> | ||
12 | #include <linux/bootmem.h> | ||
13 | #include <linux/init.h> | ||
14 | #include <linux/mm.h> | ||
15 | #include <linux/mmzone.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/nodemask.h> | ||
18 | #include <linux/cpu.h> | ||
19 | #include <linux/notifier.h> | ||
20 | #include <asm/lmb.h> | ||
21 | #include <asm/machdep.h> | ||
22 | #include <asm/abs_addr.h> | ||
23 | |||
24 | static int numa_enabled = 1; | ||
25 | |||
static int numa_debug;
/*
 * Debug printk, emitted only when numa_debug is non-zero.
 *
 * Wrapped in do { } while (0) so that "dbg(...);" is a single statement
 * and can be used safely in an unbraced if/else.
 */
#define dbg(args...) do { if (numa_debug) printk(KERN_INFO args); } while (0)
28 | |||
29 | #ifdef DEBUG_NUMA | ||
30 | #define ARRAY_INITIALISER -1 | ||
31 | #else | ||
32 | #define ARRAY_INITIALISER 0 | ||
33 | #endif | ||
34 | |||
35 | int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] = | ||
36 | ARRAY_INITIALISER}; | ||
37 | char *numa_memory_lookup_table; | ||
38 | cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES]; | ||
39 | int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0}; | ||
40 | |||
41 | struct pglist_data *node_data[MAX_NUMNODES]; | ||
42 | bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES]; | ||
43 | static int min_common_depth; | ||
44 | |||
45 | /* | ||
46 | * We need somewhere to store start/span for each node until we have | ||
47 | * allocated the real node_data structures. | ||
48 | */ | ||
49 | static struct { | ||
50 | unsigned long node_start_pfn; | ||
51 | unsigned long node_end_pfn; | ||
52 | unsigned long node_present_pages; | ||
53 | } init_node_data[MAX_NUMNODES] __initdata; | ||
54 | |||
55 | EXPORT_SYMBOL(node_data); | ||
56 | EXPORT_SYMBOL(numa_cpu_lookup_table); | ||
57 | EXPORT_SYMBOL(numa_memory_lookup_table); | ||
58 | EXPORT_SYMBOL(numa_cpumask_lookup_table); | ||
59 | EXPORT_SYMBOL(nr_cpus_in_node); | ||
60 | |||
61 | static inline void map_cpu_to_node(int cpu, int node) | ||
62 | { | ||
63 | numa_cpu_lookup_table[cpu] = node; | ||
64 | if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) { | ||
65 | cpu_set(cpu, numa_cpumask_lookup_table[node]); | ||
66 | nr_cpus_in_node[node]++; | ||
67 | } | ||
68 | } | ||
69 | |||
70 | #ifdef CONFIG_HOTPLUG_CPU | ||
71 | static void unmap_cpu_from_node(unsigned long cpu) | ||
72 | { | ||
73 | int node = numa_cpu_lookup_table[cpu]; | ||
74 | |||
75 | dbg("removing cpu %lu from node %d\n", cpu, node); | ||
76 | |||
77 | if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) { | ||
78 | cpu_clear(cpu, numa_cpumask_lookup_table[node]); | ||
79 | nr_cpus_in_node[node]--; | ||
80 | } else { | ||
81 | printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n", | ||
82 | cpu, node); | ||
83 | } | ||
84 | } | ||
85 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
86 | |||
87 | static struct device_node * __devinit find_cpu_node(unsigned int cpu) | ||
88 | { | ||
89 | unsigned int hw_cpuid = get_hard_smp_processor_id(cpu); | ||
90 | struct device_node *cpu_node = NULL; | ||
91 | unsigned int *interrupt_server, *reg; | ||
92 | int len; | ||
93 | |||
94 | while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) { | ||
95 | /* Try interrupt server first */ | ||
96 | interrupt_server = (unsigned int *)get_property(cpu_node, | ||
97 | "ibm,ppc-interrupt-server#s", &len); | ||
98 | |||
99 | len = len / sizeof(u32); | ||
100 | |||
101 | if (interrupt_server && (len > 0)) { | ||
102 | while (len--) { | ||
103 | if (interrupt_server[len] == hw_cpuid) | ||
104 | return cpu_node; | ||
105 | } | ||
106 | } else { | ||
107 | reg = (unsigned int *)get_property(cpu_node, | ||
108 | "reg", &len); | ||
109 | if (reg && (len > 0) && (reg[0] == hw_cpuid)) | ||
110 | return cpu_node; | ||
111 | } | ||
112 | } | ||
113 | |||
114 | return NULL; | ||
115 | } | ||
116 | |||
117 | /* must hold reference to node during call */ | ||
118 | static int *of_get_associativity(struct device_node *dev) | ||
119 | { | ||
120 | return (unsigned int *)get_property(dev, "ibm,associativity", NULL); | ||
121 | } | ||
122 | |||
123 | static int of_node_numa_domain(struct device_node *device) | ||
124 | { | ||
125 | int numa_domain; | ||
126 | unsigned int *tmp; | ||
127 | |||
128 | if (min_common_depth == -1) | ||
129 | return 0; | ||
130 | |||
131 | tmp = of_get_associativity(device); | ||
132 | if (tmp && (tmp[0] >= min_common_depth)) { | ||
133 | numa_domain = tmp[min_common_depth]; | ||
134 | } else { | ||
135 | dbg("WARNING: no NUMA information for %s\n", | ||
136 | device->full_name); | ||
137 | numa_domain = 0; | ||
138 | } | ||
139 | return numa_domain; | ||
140 | } | ||
141 | |||
142 | /* | ||
143 | * In theory, the "ibm,associativity" property may contain multiple | ||
144 | * associativity lists because a resource may be multiply connected | ||
145 | * into the machine. This resource then has different associativity | ||
146 | * characteristics relative to its multiple connections. We ignore | ||
147 | * this for now. We also assume that all cpu and memory sets have | ||
148 | * their distances represented at a common level. This won't be | ||
149 | * true for hierarchical NUMA. | ||
150 | * | ||
151 | * In any case the ibm,associativity-reference-points should give | ||
152 | * the correct depth for a normal NUMA system. | ||
153 | * | ||
154 | * - Dave Hansen <haveblue@us.ibm.com> | ||
155 | */ | ||
156 | static int __init find_min_common_depth(void) | ||
157 | { | ||
158 | int depth; | ||
159 | unsigned int *ref_points; | ||
160 | struct device_node *rtas_root; | ||
161 | unsigned int len; | ||
162 | |||
163 | rtas_root = of_find_node_by_path("/rtas"); | ||
164 | |||
165 | if (!rtas_root) | ||
166 | return -1; | ||
167 | |||
168 | /* | ||
169 | * this property is 2 32-bit integers, each representing a level of | ||
170 | * depth in the associativity nodes. The first is for an SMP | ||
171 | * configuration (should be all 0's) and the second is for a normal | ||
172 | * NUMA configuration. | ||
173 | */ | ||
174 | ref_points = (unsigned int *)get_property(rtas_root, | ||
175 | "ibm,associativity-reference-points", &len); | ||
176 | |||
177 | if ((len >= 1) && ref_points) { | ||
178 | depth = ref_points[1]; | ||
179 | } else { | ||
180 | dbg("WARNING: could not find NUMA " | ||
181 | "associativity reference point\n"); | ||
182 | depth = -1; | ||
183 | } | ||
184 | of_node_put(rtas_root); | ||
185 | |||
186 | return depth; | ||
187 | } | ||
188 | |||
189 | static int __init get_mem_addr_cells(void) | ||
190 | { | ||
191 | struct device_node *memory = NULL; | ||
192 | int rc; | ||
193 | |||
194 | memory = of_find_node_by_type(memory, "memory"); | ||
195 | if (!memory) | ||
196 | return 0; /* it won't matter */ | ||
197 | |||
198 | rc = prom_n_addr_cells(memory); | ||
199 | return rc; | ||
200 | } | ||
201 | |||
202 | static int __init get_mem_size_cells(void) | ||
203 | { | ||
204 | struct device_node *memory = NULL; | ||
205 | int rc; | ||
206 | |||
207 | memory = of_find_node_by_type(memory, "memory"); | ||
208 | if (!memory) | ||
209 | return 0; /* it won't matter */ | ||
210 | rc = prom_n_size_cells(memory); | ||
211 | return rc; | ||
212 | } | ||
213 | |||
/*
 * Assemble 'n' consecutive 32-bit cells, most significant first, into
 * one unsigned long and advance *buf past the cells consumed.
 */
static unsigned long read_n_cells(int n, unsigned int **buf)
{
	unsigned int *cell = *buf;
	unsigned long value = 0;

	for (; n != 0; n--)
		value = (value << 32) | *cell++;

	*buf = cell;
	return value;
}
224 | |||
225 | /* | ||
226 | * Figure out to which domain a cpu belongs and stick it there. | ||
227 | * Return the id of the domain used. | ||
228 | */ | ||
229 | static int numa_setup_cpu(unsigned long lcpu) | ||
230 | { | ||
231 | int numa_domain = 0; | ||
232 | struct device_node *cpu = find_cpu_node(lcpu); | ||
233 | |||
234 | if (!cpu) { | ||
235 | WARN_ON(1); | ||
236 | goto out; | ||
237 | } | ||
238 | |||
239 | numa_domain = of_node_numa_domain(cpu); | ||
240 | |||
241 | if (numa_domain >= num_online_nodes()) { | ||
242 | /* | ||
243 | * POWER4 LPAR uses 0xffff as invalid node, | ||
244 | * dont warn in this case. | ||
245 | */ | ||
246 | if (numa_domain != 0xffff) | ||
247 | printk(KERN_ERR "WARNING: cpu %ld " | ||
248 | "maps to invalid NUMA node %d\n", | ||
249 | lcpu, numa_domain); | ||
250 | numa_domain = 0; | ||
251 | } | ||
252 | out: | ||
253 | node_set_online(numa_domain); | ||
254 | |||
255 | map_cpu_to_node(lcpu, numa_domain); | ||
256 | |||
257 | of_node_put(cpu); | ||
258 | |||
259 | return numa_domain; | ||
260 | } | ||
261 | |||
262 | static int cpu_numa_callback(struct notifier_block *nfb, | ||
263 | unsigned long action, | ||
264 | void *hcpu) | ||
265 | { | ||
266 | unsigned long lcpu = (unsigned long)hcpu; | ||
267 | int ret = NOTIFY_DONE; | ||
268 | |||
269 | switch (action) { | ||
270 | case CPU_UP_PREPARE: | ||
271 | if (min_common_depth == -1 || !numa_enabled) | ||
272 | map_cpu_to_node(lcpu, 0); | ||
273 | else | ||
274 | numa_setup_cpu(lcpu); | ||
275 | ret = NOTIFY_OK; | ||
276 | break; | ||
277 | #ifdef CONFIG_HOTPLUG_CPU | ||
278 | case CPU_DEAD: | ||
279 | case CPU_UP_CANCELED: | ||
280 | unmap_cpu_from_node(lcpu); | ||
281 | break; | ||
282 | ret = NOTIFY_OK; | ||
283 | #endif | ||
284 | } | ||
285 | return ret; | ||
286 | } | ||
287 | |||
288 | /* | ||
289 | * Check and possibly modify a memory region to enforce the memory limit. | ||
290 | * | ||
291 | * Returns the size the region should have to enforce the memory limit. | ||
292 | * This will either be the original value of size, a truncated value, | ||
293 | * or zero. If the returned value of size is 0 the region should be | ||
294 | * discarded as it lies wholly above the memory limit. | ||
295 | */ | ||
296 | static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size) | ||
297 | { | ||
298 | /* | ||
299 | * We use lmb_end_of_DRAM() in here instead of memory_limit because | ||
300 | * we've already adjusted it for the limit and it takes care of | ||
301 | * having memory holes below the limit. | ||
302 | */ | ||
303 | extern unsigned long memory_limit; | ||
304 | |||
305 | if (! memory_limit) | ||
306 | return size; | ||
307 | |||
308 | if (start + size <= lmb_end_of_DRAM()) | ||
309 | return size; | ||
310 | |||
311 | if (start >= lmb_end_of_DRAM()) | ||
312 | return 0; | ||
313 | |||
314 | return lmb_end_of_DRAM() - start; | ||
315 | } | ||
316 | |||
/*
 * Build the boot-time NUMA description from the device tree.
 *
 * Allocates and initialises numa_memory_lookup_table, determines
 * min_common_depth, maps the boot cpu to its node, then walks all
 * "memory" nodes to fill init_node_data[] and the lookup table and to
 * mark nodes online.
 *
 * Returns 0 on success, -1 if NUMA was disabled by the user, or the
 * negative min_common_depth if no associativity depth was found.
 */
static int __init parse_numa_properties(void)
{
	struct device_node *cpu = NULL;
	struct device_node *memory = NULL;
	int addr_cells, size_cells;
	int max_domain = 0;
	/* one lookup table entry per MEMORY_INCREMENT of address space */
	long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	numa_memory_lookup_table =
		(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
	memset(numa_memory_lookup_table, 0, entries * sizeof(char));

	/* every entry is then set to ARRAY_INITIALISER */
	for (i = 0; i < entries ; i++)
		numa_memory_lookup_table[i] = ARRAY_INITIALISER;

	min_common_depth = find_min_common_depth();

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
	if (min_common_depth < 0)
		return min_common_depth;

	max_domain = numa_setup_cpu(boot_cpuid);

	/*
	 * Even though we connect cpus to numa domains later in SMP init,
	 * we need to know the maximum node id now. This is because each
	 * node id must have NODE_DATA etc backing it.
	 * As a result of hotplug we could still have cpus appear later on
	 * with larger node ids. In that case we force the cpu into node 0.
	 */
	for_each_cpu(i) {
		int numa_domain;

		cpu = find_cpu_node(i);

		if (cpu) {
			numa_domain = of_node_numa_domain(cpu);
			of_node_put(cpu);

			if (numa_domain < MAX_NUMNODES &&
			    max_domain < numa_domain)
				max_domain = numa_domain;
		}
	}

	addr_cells = get_mem_addr_cells();
	size_cells = get_mem_size_cells();
	memory = NULL;
	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start;
		unsigned long size;
		int numa_domain;
		int ranges;
		unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* number of (start, size) pairs to read from "reg" */
		ranges = memory->n_addrs;
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(addr_cells, &memcell_buf);
		size = read_n_cells(size_cells, &memcell_buf);

		/* widen the range to MEMORY_INCREMENT granularity */
		start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
		size = _ALIGN_UP(size, MEMORY_INCREMENT);

		numa_domain = of_node_numa_domain(memory);

		/* out-of-range domains fall back to node 0; 0xffff is silent */
		if (numa_domain >= MAX_NUMNODES) {
			if (numa_domain != 0xffff)
				printk(KERN_ERR "WARNING: memory at %lx maps "
				       "to invalid NUMA node %d\n", start,
				       numa_domain);
			numa_domain = 0;
		}

		if (max_domain < numa_domain)
			max_domain = numa_domain;

		/* a size of 0 means the range is skipped entirely */
		if (! (size = numa_enforce_memory_limit(start, size))) {
			if (--ranges)
				goto new_range;
			else
				continue;
		}

		/*
		 * Initialize new node struct, or add to an existing one.
		 */
		if (init_node_data[numa_domain].node_end_pfn) {
			if ((start / PAGE_SIZE) <
			    init_node_data[numa_domain].node_start_pfn)
				init_node_data[numa_domain].node_start_pfn =
					start / PAGE_SIZE;
			if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) >
			    init_node_data[numa_domain].node_end_pfn)
				init_node_data[numa_domain].node_end_pfn =
					(start / PAGE_SIZE) +
					(size / PAGE_SIZE);

			init_node_data[numa_domain].node_present_pages +=
				size / PAGE_SIZE;
		} else {
			node_set_online(numa_domain);

			init_node_data[numa_domain].node_start_pfn =
				start / PAGE_SIZE;
			init_node_data[numa_domain].node_end_pfn =
				init_node_data[numa_domain].node_start_pfn +
				size / PAGE_SIZE;
			init_node_data[numa_domain].node_present_pages =
				size / PAGE_SIZE;
		}

		/* record the owning domain for each increment of the range */
		for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
			numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
				numa_domain;

		if (--ranges)
			goto new_range;
	}

	/* mark every node id up to the largest one seen as online */
	for (i = 0; i <= max_domain; i++)
		node_set_online(i);

	return 0;
}
453 | |||
454 | static void __init setup_nonnuma(void) | ||
455 | { | ||
456 | unsigned long top_of_ram = lmb_end_of_DRAM(); | ||
457 | unsigned long total_ram = lmb_phys_mem_size(); | ||
458 | unsigned long i; | ||
459 | |||
460 | printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", | ||
461 | top_of_ram, total_ram); | ||
462 | printk(KERN_INFO "Memory hole size: %ldMB\n", | ||
463 | (top_of_ram - total_ram) >> 20); | ||
464 | |||
465 | if (!numa_memory_lookup_table) { | ||
466 | long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT; | ||
467 | numa_memory_lookup_table = | ||
468 | (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1)); | ||
469 | memset(numa_memory_lookup_table, 0, entries * sizeof(char)); | ||
470 | for (i = 0; i < entries ; i++) | ||
471 | numa_memory_lookup_table[i] = ARRAY_INITIALISER; | ||
472 | } | ||
473 | |||
474 | map_cpu_to_node(boot_cpuid, 0); | ||
475 | |||
476 | node_set_online(0); | ||
477 | |||
478 | init_node_data[0].node_start_pfn = 0; | ||
479 | init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE; | ||
480 | init_node_data[0].node_present_pages = total_ram / PAGE_SIZE; | ||
481 | |||
482 | for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT) | ||
483 | numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0; | ||
484 | } | ||
485 | |||
486 | static void __init dump_numa_topology(void) | ||
487 | { | ||
488 | unsigned int node; | ||
489 | unsigned int count; | ||
490 | |||
491 | if (min_common_depth == -1 || !numa_enabled) | ||
492 | return; | ||
493 | |||
494 | for_each_online_node(node) { | ||
495 | unsigned long i; | ||
496 | |||
497 | printk(KERN_INFO "Node %d Memory:", node); | ||
498 | |||
499 | count = 0; | ||
500 | |||
501 | for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) { | ||
502 | if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) { | ||
503 | if (count == 0) | ||
504 | printk(" 0x%lx", i); | ||
505 | ++count; | ||
506 | } else { | ||
507 | if (count > 0) | ||
508 | printk("-0x%lx", i); | ||
509 | count = 0; | ||
510 | } | ||
511 | } | ||
512 | |||
513 | if (count > 0) | ||
514 | printk("-0x%lx", i); | ||
515 | printk("\n"); | ||
516 | } | ||
517 | return; | ||
518 | } | ||
519 | |||
520 | /* | ||
521 | * Allocate some memory, satisfying the lmb or bootmem allocator where | ||
522 | * required. nid is the preferred node and end is the physical address of | ||
523 | * the highest address in the node. | ||
524 | * | ||
525 | * Returns the physical address of the memory. | ||
526 | */ | ||
static unsigned long careful_allocation(int nid, unsigned long size,
				       unsigned long align, unsigned long end)
{
	unsigned long paddr;
	int owner;

	paddr = lmb_alloc_base(size, align, end);
	if (!paddr) {
		/* retry over all memory */
		paddr = lmb_alloc_base(size, align, lmb_end_of_DRAM());
		if (!paddr)
			panic("numa.c: cannot allocate %lu bytes on node %d",
			      size, nid);
	}

	owner = pa_to_nid(paddr);
	if (owner >= nid)
		return paddr;

	/*
	 * The memory came from a previously allocated node, so we must
	 * retry with the bootmem allocator of that node.
	 */
	paddr = (unsigned long)__alloc_bootmem_node(NODE_DATA(owner),
						    size, align, 0);
	if (!paddr)
		panic("numa.c: cannot allocate %lu bytes on node %d",
		      size, owner);

	paddr = virt_to_abs(paddr);

	dbg("alloc_bootmem %lx %lx\n", paddr, size);

	return paddr;
}
560 | |||
561 | void __init do_init_bootmem(void) | ||
562 | { | ||
563 | int nid; | ||
564 | int addr_cells, size_cells; | ||
565 | struct device_node *memory = NULL; | ||
566 | static struct notifier_block ppc64_numa_nb = { | ||
567 | .notifier_call = cpu_numa_callback, | ||
568 | .priority = 1 /* Must run before sched domains notifier. */ | ||
569 | }; | ||
570 | |||
571 | min_low_pfn = 0; | ||
572 | max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT; | ||
573 | max_pfn = max_low_pfn; | ||
574 | |||
575 | if (parse_numa_properties()) | ||
576 | setup_nonnuma(); | ||
577 | else | ||
578 | dump_numa_topology(); | ||
579 | |||
580 | register_cpu_notifier(&ppc64_numa_nb); | ||
581 | |||
582 | for_each_online_node(nid) { | ||
583 | unsigned long start_paddr, end_paddr; | ||
584 | int i; | ||
585 | unsigned long bootmem_paddr; | ||
586 | unsigned long bootmap_pages; | ||
587 | |||
588 | start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE; | ||
589 | end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE; | ||
590 | |||
591 | /* Allocate the node structure node local if possible */ | ||
592 | NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid, | ||
593 | sizeof(struct pglist_data), | ||
594 | SMP_CACHE_BYTES, end_paddr); | ||
595 | NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid)); | ||
596 | memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); | ||
597 | |||
598 | dbg("node %d\n", nid); | ||
599 | dbg("NODE_DATA() = %p\n", NODE_DATA(nid)); | ||
600 | |||
601 | NODE_DATA(nid)->bdata = &plat_node_bdata[nid]; | ||
602 | NODE_DATA(nid)->node_start_pfn = | ||
603 | init_node_data[nid].node_start_pfn; | ||
604 | NODE_DATA(nid)->node_spanned_pages = | ||
605 | end_paddr - start_paddr; | ||
606 | |||
607 | if (NODE_DATA(nid)->node_spanned_pages == 0) | ||
608 | continue; | ||
609 | |||
610 | dbg("start_paddr = %lx\n", start_paddr); | ||
611 | dbg("end_paddr = %lx\n", end_paddr); | ||
612 | |||
613 | bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT); | ||
614 | |||
615 | bootmem_paddr = careful_allocation(nid, | ||
616 | bootmap_pages << PAGE_SHIFT, | ||
617 | PAGE_SIZE, end_paddr); | ||
618 | memset(abs_to_virt(bootmem_paddr), 0, | ||
619 | bootmap_pages << PAGE_SHIFT); | ||
620 | dbg("bootmap_paddr = %lx\n", bootmem_paddr); | ||
621 | |||
622 | init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT, | ||
623 | start_paddr >> PAGE_SHIFT, | ||
624 | end_paddr >> PAGE_SHIFT); | ||
625 | |||
626 | /* | ||
627 | * We need to do another scan of all memory sections to | ||
628 | * associate memory with the correct node. | ||
629 | */ | ||
630 | addr_cells = get_mem_addr_cells(); | ||
631 | size_cells = get_mem_size_cells(); | ||
632 | memory = NULL; | ||
633 | while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { | ||
634 | unsigned long mem_start, mem_size; | ||
635 | int numa_domain, ranges; | ||
636 | unsigned int *memcell_buf; | ||
637 | unsigned int len; | ||
638 | |||
639 | memcell_buf = (unsigned int *)get_property(memory, "reg", &len); | ||
640 | if (!memcell_buf || len <= 0) | ||
641 | continue; | ||
642 | |||
643 | ranges = memory->n_addrs; /* ranges in cell */ | ||
644 | new_range: | ||
645 | mem_start = read_n_cells(addr_cells, &memcell_buf); | ||
646 | mem_size = read_n_cells(size_cells, &memcell_buf); | ||
647 | numa_domain = numa_enabled ? of_node_numa_domain(memory) : 0; | ||
648 | |||
649 | if (numa_domain != nid) | ||
650 | continue; | ||
651 | |||
652 | mem_size = numa_enforce_memory_limit(mem_start, mem_size); | ||
653 | if (mem_size) { | ||
654 | dbg("free_bootmem %lx %lx\n", mem_start, mem_size); | ||
655 | free_bootmem_node(NODE_DATA(nid), mem_start, mem_size); | ||
656 | } | ||
657 | |||
658 | if (--ranges) /* process all ranges in cell */ | ||
659 | goto new_range; | ||
660 | } | ||
661 | |||
662 | /* | ||
663 | * Mark reserved regions on this node | ||
664 | */ | ||
665 | for (i = 0; i < lmb.reserved.cnt; i++) { | ||
666 | unsigned long physbase = lmb.reserved.region[i].physbase; | ||
667 | unsigned long size = lmb.reserved.region[i].size; | ||
668 | |||
669 | if (pa_to_nid(physbase) != nid && | ||
670 | pa_to_nid(physbase+size-1) != nid) | ||
671 | continue; | ||
672 | |||
673 | if (physbase < end_paddr && | ||
674 | (physbase+size) > start_paddr) { | ||
675 | /* overlaps */ | ||
676 | if (physbase < start_paddr) { | ||
677 | size -= start_paddr - physbase; | ||
678 | physbase = start_paddr; | ||
679 | } | ||
680 | |||
681 | if (size > end_paddr - physbase) | ||
682 | size = end_paddr - physbase; | ||
683 | |||
684 | dbg("reserve_bootmem %lx %lx\n", physbase, | ||
685 | size); | ||
686 | reserve_bootmem_node(NODE_DATA(nid), physbase, | ||
687 | size); | ||
688 | } | ||
689 | } | ||
690 | } | ||
691 | } | ||
692 | |||
693 | void __init paging_init(void) | ||
694 | { | ||
695 | unsigned long zones_size[MAX_NR_ZONES]; | ||
696 | unsigned long zholes_size[MAX_NR_ZONES]; | ||
697 | int nid; | ||
698 | |||
699 | memset(zones_size, 0, sizeof(zones_size)); | ||
700 | memset(zholes_size, 0, sizeof(zholes_size)); | ||
701 | |||
702 | for_each_online_node(nid) { | ||
703 | unsigned long start_pfn; | ||
704 | unsigned long end_pfn; | ||
705 | |||
706 | start_pfn = init_node_data[nid].node_start_pfn; | ||
707 | end_pfn = init_node_data[nid].node_end_pfn; | ||
708 | |||
709 | zones_size[ZONE_DMA] = end_pfn - start_pfn; | ||
710 | zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - | ||
711 | init_node_data[nid].node_present_pages; | ||
712 | |||
713 | dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid, | ||
714 | zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]); | ||
715 | |||
716 | free_area_init_node(nid, NODE_DATA(nid), zones_size, | ||
717 | start_pfn, zholes_size); | ||
718 | } | ||
719 | } | ||
720 | |||
721 | static int __init early_numa(char *p) | ||
722 | { | ||
723 | if (!p) | ||
724 | return 0; | ||
725 | |||
726 | if (strstr(p, "off")) | ||
727 | numa_enabled = 0; | ||
728 | |||
729 | if (strstr(p, "debug")) | ||
730 | numa_debug = 1; | ||
731 | |||
732 | return 0; | ||
733 | } | ||
734 | early_param("numa", early_numa); | ||
diff --git a/arch/ppc64/mm/slb.c b/arch/ppc64/mm/slb.c new file mode 100644 index 00000000000..6a20773f695 --- /dev/null +++ b/arch/ppc64/mm/slb.c | |||
@@ -0,0 +1,159 @@ | |||
1 | /* | ||
2 | * PowerPC64 SLB support. | ||
3 | * | ||
4 | * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM | ||
5 | * Based on earlier code written by: | ||
6 | * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com | ||
7 | * Copyright (c) 2001 Dave Engebretsen | ||
8 | * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM | ||
9 | * | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public License | ||
13 | * as published by the Free Software Foundation; either version | ||
14 | * 2 of the License, or (at your option) any later version. | ||
15 | */ | ||
16 | |||
17 | #include <linux/config.h> | ||
18 | #include <asm/pgtable.h> | ||
19 | #include <asm/mmu.h> | ||
20 | #include <asm/mmu_context.h> | ||
21 | #include <asm/paca.h> | ||
22 | #include <asm/cputable.h> | ||
23 | |||
24 | extern void slb_allocate(unsigned long ea); | ||
25 | |||
26 | static inline unsigned long mk_esid_data(unsigned long ea, unsigned long slot) | ||
27 | { | ||
28 | return (ea & ESID_MASK) | SLB_ESID_V | slot; | ||
29 | } | ||
30 | |||
31 | static inline unsigned long mk_vsid_data(unsigned long ea, unsigned long flags) | ||
32 | { | ||
33 | return (get_kernel_vsid(ea) << SLB_VSID_SHIFT) | flags; | ||
34 | } | ||
35 | |||
36 | static inline void create_slbe(unsigned long ea, unsigned long vsid, | ||
37 | unsigned long flags, unsigned long entry) | ||
38 | { | ||
39 | asm volatile("slbmte %0,%1" : | ||
40 | : "r" (mk_vsid_data(ea, flags)), | ||
41 | "r" (mk_esid_data(ea, entry)) | ||
42 | : "memory" ); | ||
43 | } | ||
44 | |||
/*
 * Invalidate the SLB with slbia and immediately re-insert the entries
 * for the first VMALLOC segment (slot 1) and the kernel stack (slot 2).
 * Warns if called with interrupts enabled.
 */
static void slb_flush_and_rebolt(void)
{
	/* If you change this make sure you change SLB_NUM_BOLTED
	 * appropriately too. */
	unsigned long ksp_flags = SLB_VSID_KERNEL;
	unsigned long ksp_esid_data;

	WARN_ON(!irqs_disabled());

	/* the stack entry gets the L bit when the cpu has CPU_FTR_16M_PAGE */
	if (cpu_has_feature(CPU_FTR_16M_PAGE))
		ksp_flags |= SLB_VSID_L;

	ksp_esid_data = mk_esid_data(get_paca()->kstack, 2);
	/*
	 * If the stack lies in the KERNELBASE segment, clear the valid bit
	 * so the slot 2 entry is written as invalid.
	 * NOTE(review): presumably because that segment is already covered
	 * by the bolted kernel entry - confirm.
	 */
	if ((ksp_esid_data & ESID_MASK) == KERNELBASE)
		ksp_esid_data &= ~SLB_ESID_V;

	/* We need to do this all in asm, so we're sure we don't touch
	 * the stack between the slbia and rebolting it. */
	asm volatile("isync\n"
		     "slbia\n"
		     /* Slot 1 - first VMALLOC segment */
		     "slbmte	%0,%1\n"
		     /* Slot 2 - kernel stack */
		     "slbmte	%2,%3\n"
		     "isync"
		     :: "r"(mk_vsid_data(VMALLOCBASE, SLB_VSID_KERNEL)),
		        "r"(mk_esid_data(VMALLOCBASE, 1)),
		        "r"(mk_vsid_data(ksp_esid_data, ksp_flags)),
		        "r"(ksp_esid_data)
		     : "memory");
}
76 | |||
/*
 * Flush all user entries from the segment table of the current processor.
 *
 * The entries recorded in the paca's slb_cache are invalidated one by
 * one; if the recorded count exceeds SLB_CACHE_ENTRIES the whole SLB is
 * flushed and rebolted instead.  Afterwards the cache is reset, the
 * paca context is set from 'mm', and SLB entries for the task's pc,
 * stack and mmap base are preloaded where those are user addresses in
 * distinct segments.
 */
void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
{
	unsigned long offset = get_paca()->slb_cache_ptr;
	unsigned long esid_data = 0;
	unsigned long pc = KSTK_EIP(tsk);
	unsigned long stack = KSTK_ESP(tsk);
	unsigned long unmapped_base;

	if (offset <= SLB_CACHE_ENTRIES) {
		int i;
		asm volatile("isync" : : : "memory");
		/* invalidate each cached ESID individually */
		for (i = 0; i < offset; i++) {
			esid_data = (unsigned long)get_paca()->slb_cache[i]
				<< SID_SHIFT;
			asm volatile("slbie %0" : : "r" (esid_data));
		}
		asm volatile("isync" : : : "memory");
	} else {
		slb_flush_and_rebolt();
	}

	/* Workaround POWER5 < DD2.1 issue */
	/* issues one extra slbie of the last esid_data (0 after a full flush) */
	if (offset == 1 || offset > SLB_CACHE_ENTRIES)
		asm volatile("slbie %0" : : "r" (esid_data));

	get_paca()->slb_cache_ptr = 0;
	get_paca()->context = mm->context;

	/*
	 * preload some userspace segments into the SLB.
	 */
	if (test_tsk_thread_flag(tsk, TIF_32BIT))
		unmapped_base = TASK_UNMAPPED_BASE_USER32;
	else
		unmapped_base = TASK_UNMAPPED_BASE_USER64;

	/* addresses at or above KERNELBASE are never preloaded here */
	if (pc >= KERNELBASE)
		return;
	slb_allocate(pc);

	/* stack shares the pc's segment: nothing more is preloaded */
	if (GET_ESID(pc) == GET_ESID(stack))
		return;

	if (stack >= KERNELBASE)
		return;
	slb_allocate(stack);

	/* mmap base shares a segment that was already loaded */
	if ((GET_ESID(pc) == GET_ESID(unmapped_base))
	    || (GET_ESID(stack) == GET_ESID(unmapped_base)))
		return;

	if (unmapped_base >= KERNELBASE)
		return;
	slb_allocate(unmapped_base);
}
133 | |||
134 | void slb_initialize(void) | ||
135 | { | ||
136 | /* On iSeries the bolted entries have already been set up by | ||
137 | * the hypervisor from the lparMap data in head.S */ | ||
138 | #ifndef CONFIG_PPC_ISERIES | ||
139 | unsigned long flags = SLB_VSID_KERNEL; | ||
140 | |||
141 | /* Invalidate the entire SLB (even slot 0) & all the ERATS */ | ||
142 | if (cpu_has_feature(CPU_FTR_16M_PAGE)) | ||
143 | flags |= SLB_VSID_L; | ||
144 | |||
145 | asm volatile("isync":::"memory"); | ||
146 | asm volatile("slbmte %0,%0"::"r" (0) : "memory"); | ||
147 | asm volatile("isync; slbia; isync":::"memory"); | ||
148 | create_slbe(KERNELBASE, get_kernel_vsid(KERNELBASE), flags, 0); | ||
149 | create_slbe(VMALLOCBASE, get_kernel_vsid(KERNELBASE), | ||
150 | SLB_VSID_KERNEL, 1); | ||
151 | /* We don't bolt the stack for the time being - we're in boot, | ||
152 | * so the stack is in the bolted segment. By the time it goes | ||
153 | * elsewhere, we'll call _switch() which will bolt in the new | ||
154 | * one. */ | ||
155 | asm volatile("isync":::"memory"); | ||
156 | #endif | ||
157 | |||
158 | get_paca()->stab_rr = SLB_NUM_BOLTED; | ||
159 | } | ||
diff --git a/arch/ppc64/mm/slb_low.S b/arch/ppc64/mm/slb_low.S new file mode 100644 index 00000000000..8379d678f70 --- /dev/null +++ b/arch/ppc64/mm/slb_low.S | |||
@@ -0,0 +1,154 @@ | |||
1 | /* | ||
2 | * arch/ppc64/mm/slb_low.S | ||
3 | * | ||
4 | * Low-level SLB routines | ||
5 | * | ||
6 | * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM | ||
7 | * | ||
8 | * Based on earlier C version: | ||
9 | * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com | ||
10 | * Copyright (c) 2001 Dave Engebretsen | ||
11 | * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM | ||
12 | * | ||
13 | * This program is free software; you can redistribute it and/or | ||
14 | * modify it under the terms of the GNU General Public License | ||
15 | * as published by the Free Software Foundation; either version | ||
16 | * 2 of the License, or (at your option) any later version. | ||
17 | */ | ||
18 | |||
19 | #include <linux/config.h> | ||
20 | #include <asm/processor.h> | ||
21 | #include <asm/page.h> | ||
22 | #include <asm/mmu.h> | ||
23 | #include <asm/ppc_asm.h> | ||
24 | #include <asm/offsets.h> | ||
25 | #include <asm/cputable.h> | ||
26 | |||
27 | /* void slb_allocate(unsigned long ea); | ||
28 | * | ||
29 | * Create an SLB entry for the given EA (user or kernel). | ||
30 | * r3 = faulting address, r13 = PACA | ||
31 | * r3, r9, r10, r11, cr0 and cr7 are clobbered by this function | ||
32 | * No other registers are examined or changed. | ||
33 | */ | ||
34 | _GLOBAL(slb_allocate) | ||
35 | /* | ||
36 | * First find a slot, round robin. Previously we tried to find | ||
37 | * a free slot first but that took too long. Unfortunately we | ||
38 | * don't have any LRU information to help us choose a slot. | ||
39 | */ | ||
40 | #ifdef CONFIG_PPC_ISERIES | ||
41 | /* | ||
42 | * On iSeries, the "bolted" stack segment can be cast out on | ||
43 | * shared processor switch so we need to check for a miss on | ||
44 | * it and restore it to the right slot. | ||
45 | */ | ||
46 | ld r9,PACAKSAVE(r13) /* r9 = PACAKSAVE field of the PACA */ | ||
47 | clrrdi r9,r9,28 /* clear the low 28 bits of r9 */ | ||
48 | clrrdi r11,r3,28 /* r11 = ea with the low 28 bits cleared */ | ||
49 | li r10,SLB_NUM_BOLTED-1 /* Stack goes in last bolted slot */ | ||
50 | cmpld r9,r11 | ||
51 | beq 3f /* miss is on the stack segment: skip the round robin */ | ||
52 | #endif /* CONFIG_PPC_ISERIES */ | ||
53 | |||
54 | ld r10,PACASTABRR(r13) /* r10 = round-robin slot counter from the PACA */ | ||
55 | addi r10,r10,1 /* advance to the next slot */ | ||
56 | /* use a cpu feature mask if we ever change our slb size */ | ||
57 | cmpldi r10,SLB_NUM_ENTRIES | ||
58 | |||
59 | blt+ 4f /* still below SLB_NUM_ENTRIES: use it as is */ | ||
60 | li r10,SLB_NUM_BOLTED /* otherwise wrap back to SLB_NUM_BOLTED */ | ||
61 | |||
62 | 4: | ||
63 | std r10,PACASTABRR(r13) /* store the updated counter back */ | ||
64 | 3: | ||
65 | /* r3 = faulting address, r10 = entry */ | ||
66 | |||
67 | srdi r9,r3,60 /* get region */ | ||
68 | srdi r3,r3,28 /* get esid */ | ||
69 | cmpldi cr7,r9,0xc /* cmp KERNELBASE for later use */ | ||
70 | |||
71 | rldimi r10,r3,28,0 /* r10= ESID<<28 | entry */ | ||
72 | oris r10,r10,SLB_ESID_V@h /* r10 |= SLB_ESID_V */ | ||
73 | |||
74 | /* r3 = esid, r10 = esid_data, cr7 = <>KERNELBASE */ | ||
75 | |||
76 | blt cr7,0f /* user or kernel? */ | ||
77 | |||
78 | /* kernel address: proto-VSID = ESID */ | ||
79 | /* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but | ||
80 | * this code will generate the protoVSID 0xfffffffff for the | ||
81 | * top segment. That's ok, the scramble below will translate | ||
82 | * it to VSID 0, which is reserved as a bad VSID - one which | ||
83 | * will never have any pages in it. */ | ||
84 | li r11,SLB_VSID_KERNEL | ||
85 | BEGIN_FTR_SECTION | ||
86 | bne cr7,9f /* region != 0xc: keep plain SLB_VSID_KERNEL */ | ||
87 | li r11,(SLB_VSID_KERNEL|SLB_VSID_L) | ||
88 | END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE) | ||
89 | b 9f | ||
90 | |||
91 | 0: /* user address: proto-VSID = context<<15 | ESID */ | ||
92 | li r11,SLB_VSID_USER | ||
93 | |||
94 | srdi. r9,r3,13 /* any ESID bits above the low 13 set? */ | ||
95 | bne- 8f /* invalid ea bits set */ | ||
96 | |||
97 | #ifdef CONFIG_HUGETLB_PAGE | ||
98 | BEGIN_FTR_SECTION | ||
99 | /* check against the hugepage ranges */ | ||
100 | cmpldi r3,(TASK_HPAGE_END>>SID_SHIFT) | ||
101 | bge 6f /* >= TASK_HPAGE_END */ | ||
102 | cmpldi r3,(TASK_HPAGE_BASE>>SID_SHIFT) | ||
103 | bge 5f /* TASK_HPAGE_BASE..TASK_HPAGE_END */ | ||
104 | cmpldi r3,16 | ||
105 | bge 6f /* 4GB..TASK_HPAGE_BASE */ | ||
106 | |||
107 | lhz r9,PACAHTLBSEGS(r13) /* halfword from the PACA, one bit per ESID below 16 */ | ||
108 | srd r9,r9,r3 /* move the bit for this ESID down to bit 0 */ | ||
109 | andi. r9,r9,1 | ||
110 | beq 6f /* bit clear: leave the flags as SLB_VSID_USER */ | ||
111 | |||
112 | 5: /* this is a hugepage user address */ | ||
113 | li r11,(SLB_VSID_USER|SLB_VSID_L) | ||
114 | END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE) | ||
115 | #endif /* CONFIG_HUGETLB_PAGE */ | ||
116 | |||
117 | 6: ld r9,PACACONTEXTID(r13) /* r9 = context id from the PACA */ | ||
118 | rldimi r3,r9,USER_ESID_BITS,0 /* r3 = context<<USER_ESID_BITS | ESID */ | ||
119 | |||
120 | 9: /* r3 = protovsid, r11 = flags, r10 = esid_data, cr7 = <>KERNELBASE */ | ||
121 | ASM_VSID_SCRAMBLE(r3,r9) | ||
122 | |||
123 | rldimi r11,r3,SLB_VSID_SHIFT,16 /* combine VSID and flags */ | ||
124 | |||
125 | /* | ||
126 | * No need for an isync before or after this slbmte. The exception | ||
127 | * we enter with and the rfid we exit with are context synchronizing. | ||
128 | */ | ||
129 | slbmte r11,r10 | ||
130 | |||
131 | bgelr cr7 /* we're done for kernel addresses */ | ||
132 | |||
133 | /* Update the slb cache */ | ||
134 | lhz r3,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */ | ||
135 | cmpldi r3,SLB_CACHE_ENTRIES | ||
136 | bge 1f /* no room left in the cache */ | ||
137 | |||
138 | /* still room in the slb cache */ | ||
139 | sldi r11,r3,1 /* r11 = offset * sizeof(u16) */ | ||
140 | rldicl r10,r10,36,28 /* get low 16 bits of the ESID */ | ||
141 | add r11,r11,r13 /* r11 = (u16 *)paca + offset */ | ||
142 | sth r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */ | ||
143 | addi r3,r3,1 /* offset++ */ | ||
144 | b 2f | ||
145 | 1: /* offset >= SLB_CACHE_ENTRIES */ | ||
146 | li r3,SLB_CACHE_ENTRIES+1 /* record an out-of-range offset */ | ||
147 | 2: | ||
148 | sth r3,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */ | ||
149 | blr | ||
150 | |||
151 | 8: /* invalid EA */ | ||
152 | li r3,0 /* BAD_VSID */ | ||
153 | li r11,SLB_VSID_USER /* flags don't much matter */ | ||
154 | b 9b | ||
diff --git a/arch/ppc64/mm/stab.c b/arch/ppc64/mm/stab.c new file mode 100644 index 00000000000..31491131d5e --- /dev/null +++ b/arch/ppc64/mm/stab.c | |||
@@ -0,0 +1,239 @@ | |||
1 | /* | ||
2 | * PowerPC64 Segment Translation Support. | ||
3 | * | ||
4 | * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com | ||
5 | * Copyright (c) 2001 Dave Engebretsen | ||
6 | * | ||
7 | * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * as published by the Free Software Foundation; either version | ||
12 | * 2 of the License, or (at your option) any later version. | ||
13 | */ | ||
14 | |||
15 | #include <linux/config.h> | ||
16 | #include <asm/pgtable.h> | ||
17 | #include <asm/mmu.h> | ||
18 | #include <asm/mmu_context.h> | ||
19 | #include <asm/paca.h> | ||
20 | #include <asm/cputable.h> | ||
21 | |||
22 | /* Both the segment table and SLB code use the following cache */ | ||
23 | #define NR_STAB_CACHE_ENTRIES 8 | ||
24 | DEFINE_PER_CPU(long, stab_cache_ptr); | ||
25 | DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]); | ||
26 | |||
27 | /* | ||
28 | * Create a segment table entry for the given esid/vsid pair. | ||
29 | */ | ||
30 | static int make_ste(unsigned long stab, unsigned long esid, unsigned long vsid) | ||
31 | { | ||
32 | unsigned long esid_data, vsid_data; | ||
33 | unsigned long entry, group, old_esid, castout_entry, i; | ||
34 | unsigned int global_entry; | ||
35 | struct stab_entry *ste, *castout_ste; | ||
36 | unsigned long kernel_segment = (esid << SID_SHIFT) >= KERNELBASE; | ||
37 | |||
38 | vsid_data = vsid << STE_VSID_SHIFT; | ||
39 | esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V; | ||
40 | if (! kernel_segment) | ||
41 | esid_data |= STE_ESID_KS; | ||
42 | |||
43 | /* Search the primary group first. */ | ||
44 | global_entry = (esid & 0x1f) << 3; | ||
45 | ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7)); | ||
46 | |||
47 | /* Find an empty entry, if one exists. */ | ||
48 | for (group = 0; group < 2; group++) { | ||
49 | for (entry = 0; entry < 8; entry++, ste++) { | ||
50 | if (!(ste->esid_data & STE_ESID_V)) { | ||
51 | ste->vsid_data = vsid_data; | ||
52 | asm volatile("eieio":::"memory"); | ||
53 | ste->esid_data = esid_data; | ||
54 | return (global_entry | entry); | ||
55 | } | ||
56 | } | ||
57 | /* Now search the secondary group. */ | ||
58 | global_entry = ((~esid) & 0x1f) << 3; | ||
59 | ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7)); | ||
60 | } | ||
61 | |||
62 | /* | ||
63 | * Could not find empty entry, pick one with a round robin selection. | ||
64 | * Search all entries in the two groups. | ||
65 | */ | ||
66 | castout_entry = get_paca()->stab_rr; | ||
67 | for (i = 0; i < 16; i++) { | ||
68 | if (castout_entry < 8) { | ||
69 | global_entry = (esid & 0x1f) << 3; | ||
70 | ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7)); | ||
71 | castout_ste = ste + castout_entry; | ||
72 | } else { | ||
73 | global_entry = ((~esid) & 0x1f) << 3; | ||
74 | ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7)); | ||
75 | castout_ste = ste + (castout_entry - 8); | ||
76 | } | ||
77 | |||
78 | /* Dont cast out the first kernel segment */ | ||
79 | if ((castout_ste->esid_data & ESID_MASK) != KERNELBASE) | ||
80 | break; | ||
81 | |||
82 | castout_entry = (castout_entry + 1) & 0xf; | ||
83 | } | ||
84 | |||
85 | get_paca()->stab_rr = (castout_entry + 1) & 0xf; | ||
86 | |||
87 | /* Modify the old entry to the new value. */ | ||
88 | |||
89 | /* Force previous translations to complete. DRENG */ | ||
90 | asm volatile("isync" : : : "memory"); | ||
91 | |||
92 | old_esid = castout_ste->esid_data >> SID_SHIFT; | ||
93 | castout_ste->esid_data = 0; /* Invalidate old entry */ | ||
94 | |||
95 | asm volatile("sync" : : : "memory"); /* Order update */ | ||
96 | |||
97 | castout_ste->vsid_data = vsid_data; | ||
98 | asm volatile("eieio" : : : "memory"); /* Order update */ | ||
99 | castout_ste->esid_data = esid_data; | ||
100 | |||
101 | asm volatile("slbie %0" : : "r" (old_esid << SID_SHIFT)); | ||
102 | /* Ensure completion of slbie */ | ||
103 | asm volatile("sync" : : : "memory"); | ||
104 | |||
105 | return (global_entry | (castout_entry & 0x7)); | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * Allocate a segment table entry for the given ea and mm | ||
110 | */ | ||
111 | static int __ste_allocate(unsigned long ea, struct mm_struct *mm) | ||
112 | { | ||
113 | unsigned long vsid; | ||
114 | unsigned char stab_entry; | ||
115 | unsigned long offset; | ||
116 | |||
117 | /* Kernel or user address? */ | ||
118 | if (ea >= KERNELBASE) { | ||
119 | vsid = get_kernel_vsid(ea); | ||
120 | } else { | ||
121 | if ((ea >= TASK_SIZE_USER64) || (! mm)) | ||
122 | return 1; | ||
123 | |||
124 | vsid = get_vsid(mm->context.id, ea); | ||
125 | } | ||
126 | |||
127 | stab_entry = make_ste(get_paca()->stab_addr, GET_ESID(ea), vsid); | ||
128 | |||
129 | if (ea < KERNELBASE) { | ||
130 | offset = __get_cpu_var(stab_cache_ptr); | ||
131 | if (offset < NR_STAB_CACHE_ENTRIES) | ||
132 | __get_cpu_var(stab_cache[offset++]) = stab_entry; | ||
133 | else | ||
134 | offset = NR_STAB_CACHE_ENTRIES+1; | ||
135 | __get_cpu_var(stab_cache_ptr) = offset; | ||
136 | |||
137 | /* Order update */ | ||
138 | asm volatile("sync":::"memory"); | ||
139 | } | ||
140 | |||
141 | return 0; | ||
142 | } | ||
143 | |||
144 | int ste_allocate(unsigned long ea) | ||
145 | { | ||
146 | return __ste_allocate(ea, current->mm); | ||
147 | } | ||
148 | |||
149 | /* | ||
150 | * Do the segment table work for a context switch: flush all user | ||
151 | * entries from the table, then preload some probably useful entries | ||
152 | * for the new task | ||
153 | */ | ||
154 | void switch_stab(struct task_struct *tsk, struct mm_struct *mm) | ||
155 | { | ||
156 | struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr; | ||
157 | struct stab_entry *ste; | ||
158 | unsigned long offset = __get_cpu_var(stab_cache_ptr); | ||
159 | unsigned long pc = KSTK_EIP(tsk); | ||
160 | unsigned long stack = KSTK_ESP(tsk); | ||
161 | unsigned long unmapped_base; | ||
162 | |||
163 | /* Force previous translations to complete. DRENG */ | ||
164 | asm volatile("isync" : : : "memory"); | ||
165 | |||
166 | if (offset <= NR_STAB_CACHE_ENTRIES) { | ||
167 | int i; | ||
168 | |||
169 | for (i = 0; i < offset; i++) { | ||
170 | ste = stab + __get_cpu_var(stab_cache[i]); | ||
171 | ste->esid_data = 0; /* invalidate entry */ | ||
172 | } | ||
173 | } else { | ||
174 | unsigned long entry; | ||
175 | |||
176 | /* Invalidate all entries. */ | ||
177 | ste = stab; | ||
178 | |||
179 | /* Never flush the first entry. */ | ||
180 | ste += 1; | ||
181 | for (entry = 1; | ||
182 | entry < (PAGE_SIZE / sizeof(struct stab_entry)); | ||
183 | entry++, ste++) { | ||
184 | unsigned long ea; | ||
185 | ea = ste->esid_data & ESID_MASK; | ||
186 | if (ea < KERNELBASE) { | ||
187 | ste->esid_data = 0; | ||
188 | } | ||
189 | } | ||
190 | } | ||
191 | |||
192 | asm volatile("sync; slbia; sync":::"memory"); | ||
193 | |||
194 | __get_cpu_var(stab_cache_ptr) = 0; | ||
195 | |||
196 | /* Now preload some entries for the new task */ | ||
197 | if (test_tsk_thread_flag(tsk, TIF_32BIT)) | ||
198 | unmapped_base = TASK_UNMAPPED_BASE_USER32; | ||
199 | else | ||
200 | unmapped_base = TASK_UNMAPPED_BASE_USER64; | ||
201 | |||
202 | __ste_allocate(pc, mm); | ||
203 | |||
204 | if (GET_ESID(pc) == GET_ESID(stack)) | ||
205 | return; | ||
206 | |||
207 | __ste_allocate(stack, mm); | ||
208 | |||
209 | if ((GET_ESID(pc) == GET_ESID(unmapped_base)) | ||
210 | || (GET_ESID(stack) == GET_ESID(unmapped_base))) | ||
211 | return; | ||
212 | |||
213 | __ste_allocate(unmapped_base, mm); | ||
214 | |||
215 | /* Order update */ | ||
216 | asm volatile("sync" : : : "memory"); | ||
217 | } | ||
218 | |||
219 | extern void slb_initialize(void); | ||
220 | |||
221 | /* | ||
222 | * Build an entry for the base kernel segment and put it into | ||
223 | * the segment table or SLB. All other segment table or SLB | ||
224 | * entries are faulted in. | ||
225 | */ | ||
226 | void stab_initialize(unsigned long stab) | ||
227 | { | ||
228 | unsigned long vsid = get_kernel_vsid(KERNELBASE); | ||
229 | |||
230 | if (cpu_has_feature(CPU_FTR_SLB)) { | ||
231 | slb_initialize(); | ||
232 | } else { | ||
233 | asm volatile("isync; slbia; isync":::"memory"); | ||
234 | make_ste(stab, GET_ESID(KERNELBASE), vsid); | ||
235 | |||
236 | /* Order update */ | ||
237 | asm volatile("sync":::"memory"); | ||
238 | } | ||
239 | } | ||
diff --git a/arch/ppc64/mm/tlb.c b/arch/ppc64/mm/tlb.c new file mode 100644 index 00000000000..26f0172c452 --- /dev/null +++ b/arch/ppc64/mm/tlb.c | |||
@@ -0,0 +1,180 @@ | |||
1 | /* | ||
2 | * This file contains the routines for flushing entries from the | ||
3 | * TLB and MMU hash table. | ||
4 | * | ||
5 | * Derived from arch/ppc64/mm/init.c: | ||
6 | * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) | ||
7 | * | ||
8 | * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) | ||
9 | * and Cort Dougan (PReP) (cort@cs.nmt.edu) | ||
10 | * Copyright (C) 1996 Paul Mackerras | ||
11 | * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk). | ||
12 | * | ||
13 | * Derived from "arch/i386/mm/init.c" | ||
14 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | ||
15 | * | ||
16 | * Dave Engebretsen <engebret@us.ibm.com> | ||
17 | * Rework for PPC64 port. | ||
18 | * | ||
19 | * This program is free software; you can redistribute it and/or | ||
20 | * modify it under the terms of the GNU General Public License | ||
21 | * as published by the Free Software Foundation; either version | ||
22 | * 2 of the License, or (at your option) any later version. | ||
23 | */ | ||
24 | #include <linux/config.h> | ||
25 | #include <linux/kernel.h> | ||
26 | #include <linux/mm.h> | ||
27 | #include <linux/init.h> | ||
28 | #include <linux/percpu.h> | ||
29 | #include <linux/hardirq.h> | ||
30 | #include <asm/pgalloc.h> | ||
31 | #include <asm/tlbflush.h> | ||
32 | #include <asm/tlb.h> | ||
33 | #include <linux/highmem.h> | ||
34 | |||
35 | DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); | ||
36 | |||
37 | /* This is declared as we are using the more or less generic | ||
38 | * include/asm-ppc64/tlb.h file -- tgall | ||
39 | */ | ||
40 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | ||
41 | DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); | ||
42 | unsigned long pte_freelist_forced_free; | ||
43 | |||
44 | void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage) | ||
45 | { | ||
46 | /* This is safe as we are holding page_table_lock */ | ||
47 | cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id()); | ||
48 | struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); | ||
49 | |||
50 | if (atomic_read(&tlb->mm->mm_users) < 2 || | ||
51 | cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) { | ||
52 | pte_free(ptepage); | ||
53 | return; | ||
54 | } | ||
55 | |||
56 | if (*batchp == NULL) { | ||
57 | *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC); | ||
58 | if (*batchp == NULL) { | ||
59 | pte_free_now(ptepage); | ||
60 | return; | ||
61 | } | ||
62 | (*batchp)->index = 0; | ||
63 | } | ||
64 | (*batchp)->pages[(*batchp)->index++] = ptepage; | ||
65 | if ((*batchp)->index == PTE_FREELIST_SIZE) { | ||
66 | pte_free_submit(*batchp); | ||
67 | *batchp = NULL; | ||
68 | } | ||
69 | } | ||
70 | |||
71 | /* | ||
72 | * Update the MMU hash table to correspond with a change to | ||
73 | * a Linux PTE. If wrprot is true, it is permissible to | ||
74 | * change the existing HPTE to read-only rather than removing it | ||
75 | * (if we remove it we should clear the _PTE_HPTEFLAGS bits). | ||
76 | */ | ||
77 | void hpte_update(struct mm_struct *mm, unsigned long addr, | ||
78 | unsigned long pte, int wrprot) | ||
79 | { | ||
80 | int i; | ||
81 | unsigned long context = 0; | ||
82 | struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); | ||
83 | |||
84 | if (REGION_ID(addr) == USER_REGION_ID) | ||
85 | context = mm->context.id; | ||
86 | i = batch->index; | ||
87 | |||
88 | /* | ||
89 | * This can happen when we are in the middle of a TLB batch and | ||
90 | * we encounter memory pressure (eg copy_page_range when it tries | ||
91 | * to allocate a new pte). If we have to reclaim memory and end | ||
92 | * up scanning and resetting referenced bits then our batch context | ||
93 | * will change mid stream. | ||
94 | */ | ||
95 | if (unlikely(i != 0 && context != batch->context)) { | ||
96 | flush_tlb_pending(); | ||
97 | i = 0; | ||
98 | } | ||
99 | |||
100 | if (i == 0) { | ||
101 | batch->context = context; | ||
102 | batch->mm = mm; | ||
103 | } | ||
104 | batch->pte[i] = __pte(pte); | ||
105 | batch->addr[i] = addr; | ||
106 | batch->index = ++i; | ||
107 | if (i >= PPC64_TLB_BATCH_NR) | ||
108 | flush_tlb_pending(); | ||
109 | } | ||
110 | |||
111 | void __flush_tlb_pending(struct ppc64_tlb_batch *batch) | ||
112 | { | ||
113 | int i; | ||
114 | int cpu; | ||
115 | cpumask_t tmp; | ||
116 | int local = 0; | ||
117 | |||
118 | BUG_ON(in_interrupt()); | ||
119 | |||
120 | cpu = get_cpu(); | ||
121 | i = batch->index; | ||
122 | tmp = cpumask_of_cpu(cpu); | ||
123 | if (cpus_equal(batch->mm->cpu_vm_mask, tmp)) | ||
124 | local = 1; | ||
125 | |||
126 | if (i == 1) | ||
127 | flush_hash_page(batch->context, batch->addr[0], batch->pte[0], | ||
128 | local); | ||
129 | else | ||
130 | flush_hash_range(batch->context, i, local); | ||
131 | batch->index = 0; | ||
132 | put_cpu(); | ||
133 | } | ||
134 | |||
135 | #ifdef CONFIG_SMP | ||
136 | static void pte_free_smp_sync(void *arg) | ||
137 | { | ||
138 | /* Do nothing, just ensure we sync with all CPUs */ | ||
139 | } | ||
140 | #endif | ||
141 | |||
142 | /* This is only called when we are critically out of memory | ||
143 | * (and fail to get a page in pte_free_tlb). | ||
144 | */ | ||
145 | void pte_free_now(struct page *ptepage) | ||
146 | { | ||
147 | pte_freelist_forced_free++; | ||
148 | |||
149 | smp_call_function(pte_free_smp_sync, NULL, 0, 1); | ||
150 | |||
151 | pte_free(ptepage); | ||
152 | } | ||
153 | |||
154 | static void pte_free_rcu_callback(struct rcu_head *head) | ||
155 | { | ||
156 | struct pte_freelist_batch *batch = | ||
157 | container_of(head, struct pte_freelist_batch, rcu); | ||
158 | unsigned int i; | ||
159 | |||
160 | for (i = 0; i < batch->index; i++) | ||
161 | pte_free(batch->pages[i]); | ||
162 | free_page((unsigned long)batch); | ||
163 | } | ||
164 | |||
165 | void pte_free_submit(struct pte_freelist_batch *batch) | ||
166 | { | ||
167 | INIT_RCU_HEAD(&batch->rcu); | ||
168 | call_rcu(&batch->rcu, pte_free_rcu_callback); | ||
169 | } | ||
170 | |||
171 | void pte_free_finish(void) | ||
172 | { | ||
173 | /* This is safe as we are holding page_table_lock */ | ||
174 | struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); | ||
175 | |||
176 | if (*batchp == NULL) | ||
177 | return; | ||
178 | pte_free_submit(*batchp); | ||
179 | *batchp = NULL; | ||
180 | } | ||