aboutsummaryrefslogtreecommitdiffstats
path: root/arch/ppc64/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/ppc64/mm')
-rw-r--r--arch/ppc64/mm/Makefile11
-rw-r--r--arch/ppc64/mm/fault.c333
-rw-r--r--arch/ppc64/mm/hash_low.S288
-rw-r--r--arch/ppc64/mm/hash_native.c446
-rw-r--r--arch/ppc64/mm/hash_utils.c438
-rw-r--r--arch/ppc64/mm/hugetlbpage.c745
-rw-r--r--arch/ppc64/mm/imalloc.c317
-rw-r--r--arch/ppc64/mm/init.c870
-rw-r--r--arch/ppc64/mm/mmap.c86
-rw-r--r--arch/ppc64/mm/numa.c779
-rw-r--r--arch/ppc64/mm/slb.c158
-rw-r--r--arch/ppc64/mm/slb_low.S151
-rw-r--r--arch/ppc64/mm/stab.c279
-rw-r--r--arch/ppc64/mm/tlb.c196
14 files changed, 0 insertions, 5097 deletions
diff --git a/arch/ppc64/mm/Makefile b/arch/ppc64/mm/Makefile
deleted file mode 100644
index 3695d00d347..00000000000
--- a/arch/ppc64/mm/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
1#
2# Makefile for the Linux ppc64-specific parts of the memory manager.
3#
4
5EXTRA_CFLAGS += -mno-minimal-toc
6
7obj-y := fault.o init.o imalloc.o hash_utils.o hash_low.o tlb.o \
8 slb_low.o slb.o stab.o mmap.o
9obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
10obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
11obj-$(CONFIG_PPC_MULTIPLATFORM) += hash_native.o
diff --git a/arch/ppc64/mm/fault.c b/arch/ppc64/mm/fault.c
deleted file mode 100644
index be3f25cf3e9..00000000000
--- a/arch/ppc64/mm/fault.c
+++ /dev/null
@@ -1,333 +0,0 @@
1/*
2 * arch/ppc/mm/fault.c
3 *
4 * PowerPC version
5 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
6 *
7 * Derived from "arch/i386/mm/fault.c"
8 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
9 *
10 * Modified by Cort Dougan and Paul Mackerras.
11 *
12 * Modified for PPC64 by Dave Engebretsen (engebret@ibm.com)
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 */
19
20#include <linux/config.h>
21#include <linux/signal.h>
22#include <linux/sched.h>
23#include <linux/kernel.h>
24#include <linux/errno.h>
25#include <linux/string.h>
26#include <linux/types.h>
27#include <linux/mman.h>
28#include <linux/mm.h>
29#include <linux/interrupt.h>
30#include <linux/smp_lock.h>
31#include <linux/module.h>
32#include <linux/kprobes.h>
33
34#include <asm/page.h>
35#include <asm/pgtable.h>
36#include <asm/mmu.h>
37#include <asm/mmu_context.h>
38#include <asm/system.h>
39#include <asm/uaccess.h>
40#include <asm/kdebug.h>
41#include <asm/siginfo.h>
42
43/*
44 * Check whether the instruction at regs->nip is a store using
45 * an update addressing form which will update r1.
46 */
47static int store_updates_sp(struct pt_regs *regs)
48{
49 unsigned int inst;
50
51 if (get_user(inst, (unsigned int __user *)regs->nip))
52 return 0;
53 /* check for 1 in the rA field */
54 if (((inst >> 16) & 0x1f) != 1)
55 return 0;
56 /* check major opcode */
57 switch (inst >> 26) {
58 case 37: /* stwu */
59 case 39: /* stbu */
60 case 45: /* sthu */
61 case 53: /* stfsu */
62 case 55: /* stfdu */
63 return 1;
64 case 62: /* std or stdu */
65 return (inst & 3) == 1;
66 case 31:
67 /* check minor opcode */
68 switch ((inst >> 1) & 0x3ff) {
69 case 181: /* stdux */
70 case 183: /* stwux */
71 case 247: /* stbux */
72 case 439: /* sthux */
73 case 695: /* stfsux */
74 case 759: /* stfdux */
75 return 1;
76 }
77 }
78 return 0;
79}
80
81static void do_dabr(struct pt_regs *regs, unsigned long error_code)
82{
83 siginfo_t info;
84
85 if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
86 11, SIGSEGV) == NOTIFY_STOP)
87 return;
88
89 if (debugger_dabr_match(regs))
90 return;
91
92 /* Clear the DABR */
93 set_dabr(0);
94
95 /* Deliver the signal to userspace */
96 info.si_signo = SIGTRAP;
97 info.si_errno = 0;
98 info.si_code = TRAP_HWBKPT;
99 info.si_addr = (void __user *)regs->nip;
100 force_sig_info(SIGTRAP, &info, current);
101}
102
103/*
104 * The error_code parameter is
105 * - DSISR for a non-SLB data access fault,
106 * - SRR1 & 0x08000000 for a non-SLB instruction access fault
107 * - 0 any SLB fault.
108 * The return value is 0 if the fault was handled, or the signal
109 * number if this is a kernel fault that can't be handled here.
110 */
111int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
112 unsigned long error_code)
113{
114 struct vm_area_struct * vma;
115 struct mm_struct *mm = current->mm;
116 siginfo_t info;
117 unsigned long code = SEGV_MAPERR;
118 unsigned long is_write = error_code & DSISR_ISSTORE;
119 unsigned long trap = TRAP(regs);
120 unsigned long is_exec = trap == 0x400;
121
122 BUG_ON((trap == 0x380) || (trap == 0x480));
123
124 if (notify_die(DIE_PAGE_FAULT, "page_fault", regs, error_code,
125 11, SIGSEGV) == NOTIFY_STOP)
126 return 0;
127
128 if (trap == 0x300) {
129 if (debugger_fault_handler(regs))
130 return 0;
131 }
132
133 /* On a kernel SLB miss we can only check for a valid exception entry */
134 if (!user_mode(regs) && (address >= TASK_SIZE))
135 return SIGSEGV;
136
137 if (error_code & DSISR_DABRMATCH) {
138 do_dabr(regs, error_code);
139 return 0;
140 }
141
142 if (in_atomic() || mm == NULL) {
143 if (!user_mode(regs))
144 return SIGSEGV;
145 /* in_atomic() in user mode is really bad,
146 as is current->mm == NULL. */
147 printk(KERN_EMERG "Page fault in user mode with"
148 "in_atomic() = %d mm = %p\n", in_atomic(), mm);
149 printk(KERN_EMERG "NIP = %lx MSR = %lx\n",
150 regs->nip, regs->msr);
151 die("Weird page fault", regs, SIGSEGV);
152 }
153
154 /* When running in the kernel we expect faults to occur only to
155 * addresses in user space. All other faults represent errors in the
156 * kernel and should generate an OOPS. Unfortunatly, in the case of an
157 * erroneous fault occuring in a code path which already holds mmap_sem
158 * we will deadlock attempting to validate the fault against the
159 * address space. Luckily the kernel only validly references user
160 * space from well defined areas of code, which are listed in the
161 * exceptions table.
162 *
163 * As the vast majority of faults will be valid we will only perform
164 * the source reference check when there is a possibilty of a deadlock.
165 * Attempt to lock the address space, if we cannot we then validate the
166 * source. If this is invalid we can skip the address space check,
167 * thus avoiding the deadlock.
168 */
169 if (!down_read_trylock(&mm->mmap_sem)) {
170 if (!user_mode(regs) && !search_exception_tables(regs->nip))
171 goto bad_area_nosemaphore;
172
173 down_read(&mm->mmap_sem);
174 }
175
176 vma = find_vma(mm, address);
177 if (!vma)
178 goto bad_area;
179
180 if (vma->vm_start <= address) {
181 goto good_area;
182 }
183 if (!(vma->vm_flags & VM_GROWSDOWN))
184 goto bad_area;
185
186 /*
187 * N.B. The POWER/Open ABI allows programs to access up to
188 * 288 bytes below the stack pointer.
189 * The kernel signal delivery code writes up to about 1.5kB
190 * below the stack pointer (r1) before decrementing it.
191 * The exec code can write slightly over 640kB to the stack
192 * before setting the user r1. Thus we allow the stack to
193 * expand to 1MB without further checks.
194 */
195 if (address + 0x100000 < vma->vm_end) {
196 /* get user regs even if this fault is in kernel mode */
197 struct pt_regs *uregs = current->thread.regs;
198 if (uregs == NULL)
199 goto bad_area;
200
201 /*
202 * A user-mode access to an address a long way below
203 * the stack pointer is only valid if the instruction
204 * is one which would update the stack pointer to the
205 * address accessed if the instruction completed,
206 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
207 * (or the byte, halfword, float or double forms).
208 *
209 * If we don't check this then any write to the area
210 * between the last mapped region and the stack will
211 * expand the stack rather than segfaulting.
212 */
213 if (address + 2048 < uregs->gpr[1]
214 && (!user_mode(regs) || !store_updates_sp(regs)))
215 goto bad_area;
216 }
217
218 if (expand_stack(vma, address))
219 goto bad_area;
220
221good_area:
222 code = SEGV_ACCERR;
223
224 if (is_exec) {
225 /* protection fault */
226 if (error_code & DSISR_PROTFAULT)
227 goto bad_area;
228 if (!(vma->vm_flags & VM_EXEC))
229 goto bad_area;
230 /* a write */
231 } else if (is_write) {
232 if (!(vma->vm_flags & VM_WRITE))
233 goto bad_area;
234 /* a read */
235 } else {
236 if (!(vma->vm_flags & VM_READ))
237 goto bad_area;
238 }
239
240 survive:
241 /*
242 * If for any reason at all we couldn't handle the fault,
243 * make sure we exit gracefully rather than endlessly redo
244 * the fault.
245 */
246 switch (handle_mm_fault(mm, vma, address, is_write)) {
247
248 case VM_FAULT_MINOR:
249 current->min_flt++;
250 break;
251 case VM_FAULT_MAJOR:
252 current->maj_flt++;
253 break;
254 case VM_FAULT_SIGBUS:
255 goto do_sigbus;
256 case VM_FAULT_OOM:
257 goto out_of_memory;
258 default:
259 BUG();
260 }
261
262 up_read(&mm->mmap_sem);
263 return 0;
264
265bad_area:
266 up_read(&mm->mmap_sem);
267
268bad_area_nosemaphore:
269 /* User mode accesses cause a SIGSEGV */
270 if (user_mode(regs)) {
271 info.si_signo = SIGSEGV;
272 info.si_errno = 0;
273 info.si_code = code;
274 info.si_addr = (void __user *) address;
275 force_sig_info(SIGSEGV, &info, current);
276 return 0;
277 }
278
279 if (trap == 0x400 && (error_code & DSISR_PROTFAULT)
280 && printk_ratelimit())
281 printk(KERN_CRIT "kernel tried to execute NX-protected"
282 " page (%lx) - exploit attempt? (uid: %d)\n",
283 address, current->uid);
284
285 return SIGSEGV;
286
287/*
288 * We ran out of memory, or some other thing happened to us that made
289 * us unable to handle the page fault gracefully.
290 */
291out_of_memory:
292 up_read(&mm->mmap_sem);
293 if (current->pid == 1) {
294 yield();
295 down_read(&mm->mmap_sem);
296 goto survive;
297 }
298 printk("VM: killing process %s\n", current->comm);
299 if (user_mode(regs))
300 do_exit(SIGKILL);
301 return SIGKILL;
302
303do_sigbus:
304 up_read(&mm->mmap_sem);
305 if (user_mode(regs)) {
306 info.si_signo = SIGBUS;
307 info.si_errno = 0;
308 info.si_code = BUS_ADRERR;
309 info.si_addr = (void __user *)address;
310 force_sig_info(SIGBUS, &info, current);
311 return 0;
312 }
313 return SIGBUS;
314}
315
316/*
317 * bad_page_fault is called when we have a bad access from the kernel.
318 * It is called from do_page_fault above and from some of the procedures
319 * in traps.c.
320 */
321void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
322{
323 const struct exception_table_entry *entry;
324
325 /* Are we prepared to handle this fault? */
326 if ((entry = search_exception_tables(regs->nip)) != NULL) {
327 regs->nip = entry->fixup;
328 return;
329 }
330
331 /* kernel has accessed a bad area */
332 die("Kernel access of bad area", regs, sig);
333}
diff --git a/arch/ppc64/mm/hash_low.S b/arch/ppc64/mm/hash_low.S
deleted file mode 100644
index ee5a5d36bfa..00000000000
--- a/arch/ppc64/mm/hash_low.S
+++ /dev/null
@@ -1,288 +0,0 @@
1/*
2 * ppc64 MMU hashtable management routines
3 *
4 * (c) Copyright IBM Corp. 2003
5 *
6 * Maintained by: Benjamin Herrenschmidt
7 * <benh@kernel.crashing.org>
8 *
9 * This file is covered by the GNU Public Licence v2 as
10 * described in the kernel's COPYING file.
11 */
12
13#include <asm/processor.h>
14#include <asm/pgtable.h>
15#include <asm/mmu.h>
16#include <asm/page.h>
17#include <asm/types.h>
18#include <asm/ppc_asm.h>
19#include <asm/asm-offsets.h>
20#include <asm/cputable.h>
21
22 .text
23
24/*
25 * Stackframe:
26 *
27 * +-> Back chain (SP + 256)
28 * | General register save area (SP + 112)
29 * | Parameter save area (SP + 48)
30 * | TOC save area (SP + 40)
31 * | link editor doubleword (SP + 32)
32 * | compiler doubleword (SP + 24)
33 * | LR save area (SP + 16)
34 * | CR save area (SP + 8)
35 * SP ---> +-- Back chain (SP + 0)
36 */
37#define STACKFRAMESIZE 256
38
39/* Save parameters offsets (in the caller's frame, above our own frame) */
40#define STK_PARM(i) (STACKFRAMESIZE + 48 + ((i)-3)*8)
41
42/* Save non-volatile offsets (in our own frame) */
43#define STK_REG(i) (112 + ((i)-14)*8)
44
45/*
46 * __hash_page(unsigned long ea, unsigned long access, unsigned long vsid,
47 * pte_t *ptep, unsigned long trap, int local)
48 * Returns in r3: 0 on success or if the PTE was busy, 1 on wrong access, -1 on critical insert failure.
49 * Adds a page to the hash table. This is the non-LPAR version for now
50 */
51
52_GLOBAL(__hash_page)
53 mflr r0
54 std r0,16(r1)
55 stdu r1,-STACKFRAMESIZE(r1)
56 /* Save all params that we need after a function call */
57 std r6,STK_PARM(r6)(r1)
58 std r8,STK_PARM(r8)(r1)
59
60 /* Add _PAGE_PRESENT to access */
61 ori r4,r4,_PAGE_PRESENT
62
63 /* Save non-volatile registers.
64 * r31 will hold "old PTE"
65 * r30 is "new PTE"
66 * r29 is "va"
67 * r28 is a hash value
68 * r27 is hashtab mask (maybe dynamic patched instead ?)
69 */
70 std r27,STK_REG(r27)(r1)
71 std r28,STK_REG(r28)(r1)
72 std r29,STK_REG(r29)(r1)
73 std r30,STK_REG(r30)(r1)
74 std r31,STK_REG(r31)(r1)
75
76 /* Step 1:
77 *
78 * Check permissions, atomically mark the linux PTE busy
79 * and hashed.
80 */
811:
82 ldarx r31,0,r6
83 /* Check access rights (access & ~(pte_val(*ptep))) */
84 andc. r0,r4,r31
85 bne- htab_wrong_access
86 /* Check if PTE is busy */
87 andi. r0,r31,_PAGE_BUSY
88 /* If so, just bail out and refault if needed. Someone else
89 * is changing this PTE anyway and might hash it.
90 */
91 bne- bail_ok
92 /* Prepare new PTE value (turn access RW into DIRTY, then
93 * add BUSY,HASHPTE and ACCESSED)
94 */
95 rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */
96 or r30,r30,r31
97 ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
98 /* Write the linux PTE atomically (setting busy) */
99 stdcx. r30,0,r6
100 bne- 1b
101 isync
102
103 /* Step 2:
104 *
105 * Insert/Update the HPTE in the hash table. At this point,
106 * r4 (access) is re-useable, we use it for the new HPTE flags
107 */
108
109 /* Calc va and put it in r29 */
110 rldicr r29,r5,28,63-28
111 rldicl r3,r3,0,36
112 or r29,r3,r29
113
114 /* Calculate hash value for primary slot and store it in r28 */
115 rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */
116 rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */
117 xor r28,r5,r0
118
119 /* Convert linux PTE bits into HW equivalents */
120 andi. r3,r30,0x1fe /* Get basic set of flags */
121 xori r3,r3,HW_NO_EXEC /* _PAGE_EXEC -> NOEXEC */
122 rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */
123 rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */
124 and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY -> r0 bit 30 */
125 andc r0,r30,r0 /* r0 = pte & ~r0 */
126 rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */
127
128 /* We eventually do the icache sync here (maybe inline that
129 * code rather than call a C function...)
130 */
131BEGIN_FTR_SECTION
132 mr r4,r30
133 mr r5,r7
134 bl .hash_page_do_lazy_icache
135END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
136
137 /* At this point, r3 contains new PP bits, save them in
138 * place of "access" in the param area (sic)
139 */
140 std r3,STK_PARM(r4)(r1)
141
142 /* Get htab_hash_mask */
143 ld r4,htab_hash_mask@got(2)
144 ld r27,0(r4) /* htab_hash_mask -> r27 */
145
146 /* Check if we may already be in the hashtable, in this case, we
147 * go to out-of-line code to try to modify the HPTE
148 */
149 andi. r0,r31,_PAGE_HASHPTE
150 bne htab_modify_pte
151
152htab_insert_pte:
153 /* Clear hpte bits in new pte (we also clear BUSY btw) and
154 * add _PAGE_HASHPTE
155 */
156 lis r0,_PAGE_HPTEFLAGS@h
157 ori r0,r0,_PAGE_HPTEFLAGS@l
158 andc r30,r30,r0
159 ori r30,r30,_PAGE_HASHPTE
160
161 /* page number in r5 */
162 rldicl r5,r31,64-PTE_SHIFT,PTE_SHIFT
163
164 /* Calculate primary group hash */
165 and r0,r28,r27
166 rldicr r3,r0,3,63-3 /* r3 = (hash & mask) << 3 */
167
168 /* Call ppc_md.hpte_insert */
169 ld r7,STK_PARM(r4)(r1) /* Retrieve new pp bits */
170 mr r4,r29 /* Retrieve va */
171 li r6,0 /* no vflags */
172_GLOBAL(htab_call_hpte_insert1)
173 bl . /* Will be patched by htab_finish_init() */
174 cmpdi 0,r3,0
175 bge htab_pte_insert_ok /* Insertion successful */
176 cmpdi 0,r3,-2 /* Critical failure */
177 beq- htab_pte_insert_failure
178
179 /* Now try secondary slot */
180
181 /* page number in r5 */
182 rldicl r5,r31,64-PTE_SHIFT,PTE_SHIFT
183
184 /* Calculate secondary group hash */
185 andc r0,r27,r28
186 rldicr r3,r0,3,63-3 /* r3 = (~hash & mask) << 3 */
187
188 /* Call ppc_md.hpte_insert */
189 ld r7,STK_PARM(r4)(r1) /* Retrieve new pp bits */
190 mr r4,r29 /* Retrieve va */
191 li r6,HPTE_V_SECONDARY@l /* secondary slot */
192_GLOBAL(htab_call_hpte_insert2)
193 bl . /* Will be patched by htab_finish_init() */
194 cmpdi 0,r3,0
195 bge+ htab_pte_insert_ok /* Insertion successful */
196 cmpdi 0,r3,-2 /* Critical failure */
197 beq- htab_pte_insert_failure
198
199 /* Both are full, we need to evict something */
200 mftb r0
201 /* Pick a random group based on TB */
202 andi. r0,r0,1
203 mr r5,r28
204 bne 2f
205 not r5,r5
2062: and r0,r5,r27
207 rldicr r3,r0,3,63-3 /* r3 = (hash & mask) << 3 */
208 /* Call ppc_md.hpte_remove */
209_GLOBAL(htab_call_hpte_remove)
210 bl . /* Will be patched by htab_finish_init() */
211
212 /* Try all again */
213 b htab_insert_pte
214
215bail_ok:
216 li r3,0
217 b bail
218
219htab_pte_insert_ok:
220 /* Insert slot number & secondary bit in PTE */
221 rldimi r30,r3,12,63-15
222
223 /* Write out the PTE with a normal write
224 * (adding an eieio here may still be a good idea ?)
225 */
226htab_write_out_pte:
227 ld r6,STK_PARM(r6)(r1)
228 std r30,0(r6)
229 li r3, 0
230bail:
231 ld r27,STK_REG(r27)(r1)
232 ld r28,STK_REG(r28)(r1)
233 ld r29,STK_REG(r29)(r1)
234 ld r30,STK_REG(r30)(r1)
235 ld r31,STK_REG(r31)(r1)
236 addi r1,r1,STACKFRAMESIZE
237 ld r0,16(r1)
238 mtlr r0
239 blr
240
241htab_modify_pte:
242 /* Keep PP bits in r4 and slot idx from the PTE around in r3 */
243 mr r4,r3
244 rlwinm r3,r31,32-12,29,31
245
246 /* Secondary group ? if yes, get an inverted hash value */
247 mr r5,r28
248 andi. r0,r31,_PAGE_SECONDARY
249 beq 1f
250 not r5,r5
2511:
252 /* Calculate proper slot value for ppc_md.hpte_updatepp */
253 and r0,r5,r27
254 rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */
255 add r3,r0,r3 /* add slot idx */
256
257 /* Call ppc_md.hpte_updatepp */
258 mr r5,r29 /* va */
259 li r6,0 /* large is 0 */
260 ld r7,STK_PARM(r8)(r1) /* get "local" param */
261_GLOBAL(htab_call_hpte_updatepp)
262 bl . /* Will be patched by htab_finish_init() */
263
264 /* if we failed because typically the HPTE wasn't really here
265 * we try an insertion.
266 */
267 cmpdi 0,r3,-1
268 beq- htab_insert_pte
269
270 /* Clear the BUSY bit and Write out the PTE */
271 li r0,_PAGE_BUSY
272 andc r30,r30,r0
273 b htab_write_out_pte
274
275htab_wrong_access:
276 /* Bail out clearing reservation */
277 stdcx. r31,0,r6
278 li r3,1
279 b bail
280
281htab_pte_insert_failure:
282 /* Bail out restoring old PTE */
283 ld r6,STK_PARM(r6)(r1)
284 std r31,0(r6)
285 li r3,-1
286 b bail
287
288
diff --git a/arch/ppc64/mm/hash_native.c b/arch/ppc64/mm/hash_native.c
deleted file mode 100644
index 174d14576c2..00000000000
--- a/arch/ppc64/mm/hash_native.c
+++ /dev/null
@@ -1,446 +0,0 @@
1/*
2 * native hashtable management.
3 *
4 * SMP scalability work:
5 * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12#include <linux/spinlock.h>
13#include <linux/bitops.h>
14#include <linux/threads.h>
15#include <linux/smp.h>
16
17#include <asm/abs_addr.h>
18#include <asm/machdep.h>
19#include <asm/mmu.h>
20#include <asm/mmu_context.h>
21#include <asm/pgtable.h>
22#include <asm/tlbflush.h>
23#include <asm/tlb.h>
24#include <asm/cputable.h>
25
/* Bit number within hpte_t.v used as the per-HPTE lock by native_lock_hpte() */
26#define HPTE_LOCK_BIT 3
27
/*
 * Taken around tlbie on CPUs without CPU_FTR_LOCKLESS_TLBIE; as noted in
 * native_hpte_clear(), some hardware deadlocks on concurrent tlbie.
 */
28static DEFINE_SPINLOCK(native_tlbie_lock);
29
30static inline void native_lock_hpte(hpte_t *hptep)
31{
32 unsigned long *word = &hptep->v;
33
34 while (1) {
35 if (!test_and_set_bit(HPTE_LOCK_BIT, word))
36 break;
37 while(test_bit(HPTE_LOCK_BIT, word))
38 cpu_relax();
39 }
40}
41
42static inline void native_unlock_hpte(hpte_t *hptep)
43{
44 unsigned long *word = &hptep->v;
45
46 asm volatile("lwsync":::"memory");
47 clear_bit(HPTE_LOCK_BIT, word);
48}
49
50long native_hpte_insert(unsigned long hpte_group, unsigned long va,
51 unsigned long prpn, unsigned long vflags,
52 unsigned long rflags)
53{
54 hpte_t *hptep = htab_address + hpte_group;
55 unsigned long hpte_v, hpte_r;
56 int i;
57
58 for (i = 0; i < HPTES_PER_GROUP; i++) {
59 if (! (hptep->v & HPTE_V_VALID)) {
60 /* retry with lock held */
61 native_lock_hpte(hptep);
62 if (! (hptep->v & HPTE_V_VALID))
63 break;
64 native_unlock_hpte(hptep);
65 }
66
67 hptep++;
68 }
69
70 if (i == HPTES_PER_GROUP)
71 return -1;
72
73 hpte_v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID;
74 if (vflags & HPTE_V_LARGE)
75 va &= ~(1UL << HPTE_V_AVPN_SHIFT);
76 hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags;
77
78 hptep->r = hpte_r;
79 /* Guarantee the second dword is visible before the valid bit */
80 __asm__ __volatile__ ("eieio" : : : "memory");
81 /*
82 * Now set the first dword including the valid bit
83 * NOTE: this also unlocks the hpte
84 */
85 hptep->v = hpte_v;
86
87 __asm__ __volatile__ ("ptesync" : : : "memory");
88
89 return i | (!!(vflags & HPTE_V_SECONDARY) << 3);
90}
91
92static long native_hpte_remove(unsigned long hpte_group)
93{
94 hpte_t *hptep;
95 int i;
96 int slot_offset;
97 unsigned long hpte_v;
98
99 /* pick a random entry to start at */
100 slot_offset = mftb() & 0x7;
101
102 for (i = 0; i < HPTES_PER_GROUP; i++) {
103 hptep = htab_address + hpte_group + slot_offset;
104 hpte_v = hptep->v;
105
106 if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) {
107 /* retry with lock held */
108 native_lock_hpte(hptep);
109 hpte_v = hptep->v;
110 if ((hpte_v & HPTE_V_VALID)
111 && !(hpte_v & HPTE_V_BOLTED))
112 break;
113 native_unlock_hpte(hptep);
114 }
115
116 slot_offset++;
117 slot_offset &= 0x7;
118 }
119
120 if (i == HPTES_PER_GROUP)
121 return -1;
122
123 /* Invalidate the hpte. NOTE: this also unlocks it */
124 hptep->v = 0;
125
126 return i;
127}
128
129static inline void set_pp_bit(unsigned long pp, hpte_t *addr)
130{
131 unsigned long old;
132 unsigned long *p = &addr->r;
133
134 __asm__ __volatile__(
135 "1: ldarx %0,0,%3\n\
136 rldimi %0,%2,0,61\n\
137 stdcx. %0,0,%3\n\
138 bne 1b"
139 : "=&r" (old), "=m" (*p)
140 : "r" (pp), "r" (p), "m" (*p)
141 : "cc");
142}
143
144/*
145 * Only works on small pages. Yes its ugly to have to check each slot in
146 * the group but we only use this during bootup.
147 */
148static long native_hpte_find(unsigned long vpn)
149{
150 hpte_t *hptep;
151 unsigned long hash;
152 unsigned long i, j;
153 long slot;
154 unsigned long hpte_v;
155
156 hash = hpt_hash(vpn, 0);
157
158 for (j = 0; j < 2; j++) {
159 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
160 for (i = 0; i < HPTES_PER_GROUP; i++) {
161 hptep = htab_address + slot;
162 hpte_v = hptep->v;
163
164 if ((HPTE_V_AVPN_VAL(hpte_v) == (vpn >> 11))
165 && (hpte_v & HPTE_V_VALID)
166 && ( !!(hpte_v & HPTE_V_SECONDARY) == j)) {
167 /* HPTE matches */
168 if (j)
169 slot = -slot;
170 return slot;
171 }
172 ++slot;
173 }
174 hash = ~hash;
175 }
176
177 return -1;
178}
179
180static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
181 unsigned long va, int large, int local)
182{
183 hpte_t *hptep = htab_address + slot;
184 unsigned long hpte_v;
185 unsigned long avpn = va >> 23;
186 int ret = 0;
187
188 if (large)
189 avpn &= ~1;
190
191 native_lock_hpte(hptep);
192
193 hpte_v = hptep->v;
194
195 /* Even if we miss, we need to invalidate the TLB */
196 if ((HPTE_V_AVPN_VAL(hpte_v) != avpn)
197 || !(hpte_v & HPTE_V_VALID)) {
198 native_unlock_hpte(hptep);
199 ret = -1;
200 } else {
201 set_pp_bit(newpp, hptep);
202 native_unlock_hpte(hptep);
203 }
204
205 /* Ensure it is out of the tlb too */
206 if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
207 tlbiel(va);
208 } else {
209 int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
210
211 if (lock_tlbie)
212 spin_lock(&native_tlbie_lock);
213 tlbie(va, large);
214 if (lock_tlbie)
215 spin_unlock(&native_tlbie_lock);
216 }
217
218 return ret;
219}
220
221/*
222 * Update the page protection bits. Intended to be used to create
223 * guard pages for kernel data structures on pages which are bolted
224 * in the HPT. Assumes pages being operated on will not be stolen.
225 * Does not work on large pages.
226 *
227 * No need to lock here because we should be the only user.
228 */
229static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea)
230{
231 unsigned long vsid, va, vpn, flags = 0;
232 long slot;
233 hpte_t *hptep;
234 int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
235
236 vsid = get_kernel_vsid(ea);
237 va = (vsid << 28) | (ea & 0x0fffffff);
238 vpn = va >> PAGE_SHIFT;
239
240 slot = native_hpte_find(vpn);
241 if (slot == -1)
242 panic("could not find page to bolt\n");
243 hptep = htab_address + slot;
244
245 set_pp_bit(newpp, hptep);
246
247 /* Ensure it is out of the tlb too */
248 if (lock_tlbie)
249 spin_lock_irqsave(&native_tlbie_lock, flags);
250 tlbie(va, 0);
251 if (lock_tlbie)
252 spin_unlock_irqrestore(&native_tlbie_lock, flags);
253}
254
255static void native_hpte_invalidate(unsigned long slot, unsigned long va,
256 int large, int local)
257{
258 hpte_t *hptep = htab_address + slot;
259 unsigned long hpte_v;
260 unsigned long avpn = va >> 23;
261 unsigned long flags;
262 int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
263
264 if (large)
265 avpn &= ~1;
266
267 local_irq_save(flags);
268 native_lock_hpte(hptep);
269
270 hpte_v = hptep->v;
271
272 /* Even if we miss, we need to invalidate the TLB */
273 if ((HPTE_V_AVPN_VAL(hpte_v) != avpn)
274 || !(hpte_v & HPTE_V_VALID)) {
275 native_unlock_hpte(hptep);
276 } else {
277 /* Invalidate the hpte. NOTE: this also unlocks it */
278 hptep->v = 0;
279 }
280
281 /* Invalidate the tlb */
282 if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
283 tlbiel(va);
284 } else {
285 if (lock_tlbie)
286 spin_lock(&native_tlbie_lock);
287 tlbie(va, large);
288 if (lock_tlbie)
289 spin_unlock(&native_tlbie_lock);
290 }
291 local_irq_restore(flags);
292}
293
294/*
295 * clear all mappings on kexec. All cpus are in real mode (or they will
296 * be when they isi), and we are the only one left. We rely on our kernel
297 * mapping being 0xC0's and the hardware ignoring those two real bits.
298 *
299 * TODO: add batching support when enabled. remember, no dynamic memory here,
300 * athough there is the control page available...
301 */
302static void native_hpte_clear(void)
303{
304 unsigned long slot, slots, flags;
305 hpte_t *hptep = htab_address;
306 unsigned long hpte_v;
307 unsigned long pteg_count;
308
309 pteg_count = htab_hash_mask + 1;
310
311 local_irq_save(flags);
312
313 /* we take the tlbie lock and hold it. Some hardware will
314 * deadlock if we try to tlbie from two processors at once.
315 */
316 spin_lock(&native_tlbie_lock);
317
318 slots = pteg_count * HPTES_PER_GROUP;
319
320 for (slot = 0; slot < slots; slot++, hptep++) {
321 /*
322 * we could lock the pte here, but we are the only cpu
323 * running, right? and for crash dump, we probably
324 * don't want to wait for a maybe bad cpu.
325 */
326 hpte_v = hptep->v;
327
328 if (hpte_v & HPTE_V_VALID) {
329 hptep->v = 0;
330 tlbie(slot2va(hpte_v, slot), hpte_v & HPTE_V_LARGE);
331 }
332 }
333
334 spin_unlock(&native_tlbie_lock);
335 local_irq_restore(flags);
336}
337
338static void native_flush_hash_range(unsigned long number, int local)
339{
340 unsigned long va, vpn, hash, secondary, slot, flags, avpn;
341 int i, j;
342 hpte_t *hptep;
343 unsigned long hpte_v;
344 struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
345 unsigned long large = batch->large;
346
347 local_irq_save(flags);
348
349 j = 0;
350 for (i = 0; i < number; i++) {
351 va = batch->vaddr[j];
352 if (large)
353 vpn = va >> HPAGE_SHIFT;
354 else
355 vpn = va >> PAGE_SHIFT;
356 hash = hpt_hash(vpn, large);
357 secondary = (pte_val(batch->pte[i]) & _PAGE_SECONDARY) >> 15;
358 if (secondary)
359 hash = ~hash;
360 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
361 slot += (pte_val(batch->pte[i]) & _PAGE_GROUP_IX) >> 12;
362
363 hptep = htab_address + slot;
364
365 avpn = va >> 23;
366 if (large)
367 avpn &= ~0x1UL;
368
369 native_lock_hpte(hptep);
370
371 hpte_v = hptep->v;
372
373 /* Even if we miss, we need to invalidate the TLB */
374 if ((HPTE_V_AVPN_VAL(hpte_v) != avpn)
375 || !(hpte_v & HPTE_V_VALID)) {
376 native_unlock_hpte(hptep);
377 } else {
378 /* Invalidate the hpte. NOTE: this also unlocks it */
379 hptep->v = 0;
380 }
381
382 j++;
383 }
384
385 if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
386 asm volatile("ptesync":::"memory");
387
388 for (i = 0; i < j; i++)
389 __tlbiel(batch->vaddr[i]);
390
391 asm volatile("ptesync":::"memory");
392 } else {
393 int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
394
395 if (lock_tlbie)
396 spin_lock(&native_tlbie_lock);
397
398 asm volatile("ptesync":::"memory");
399
400 for (i = 0; i < j; i++)
401 __tlbie(batch->vaddr[i], large);
402
403 asm volatile("eieio; tlbsync; ptesync":::"memory");
404
405 if (lock_tlbie)
406 spin_unlock(&native_tlbie_lock);
407 }
408
409 local_irq_restore(flags);
410}
411
#ifdef CONFIG_PPC_PSERIES
/*
 * TLB batching is disabled on nighthawk, identified by a device-tree
 * root "model" property equal to "IBM,9076-N81".
 *
 * Returns 0 on that model and 1 otherwise, including when the root node
 * or its model property is missing.
 */
static inline int tlb_batching_enabled(void)
{
	struct device_node *root = of_find_node_by_path("/");
	const char *model;
	int is_nighthawk;

	if (!root)
		return 1;

	model = get_property(root, "model", NULL);
	is_nighthawk = model && strcmp(model, "IBM,9076-N81") == 0;
	of_node_put(root);

	return is_nighthawk ? 0 : 1;
}
#else
/* Without CONFIG_PPC_PSERIES batching is always enabled. */
static inline int tlb_batching_enabled(void)
{
	return 1;
}
#endif
434
435void hpte_init_native(void)
436{
437 ppc_md.hpte_invalidate = native_hpte_invalidate;
438 ppc_md.hpte_updatepp = native_hpte_updatepp;
439 ppc_md.hpte_updateboltedpp = native_hpte_updateboltedpp;
440 ppc_md.hpte_insert = native_hpte_insert;
441 ppc_md.hpte_remove = native_hpte_remove;
442 ppc_md.hpte_clear_all = native_hpte_clear;
443 if (tlb_batching_enabled())
444 ppc_md.flush_hash_range = native_flush_hash_range;
445 htab_finish_init();
446}
diff --git a/arch/ppc64/mm/hash_utils.c b/arch/ppc64/mm/hash_utils.c
deleted file mode 100644
index 83507438d6a..00000000000
--- a/arch/ppc64/mm/hash_utils.c
+++ /dev/null
@@ -1,438 +0,0 @@
1/*
2 * PowerPC64 port by Mike Corrigan and Dave Engebretsen
3 * {mikejc|engebret}@us.ibm.com
4 *
5 * Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com>
6 *
7 * SMP scalability work:
8 * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
9 *
10 * Module name: htab.c
11 *
12 * Description:
13 * PowerPC Hashed Page Table functions
14 *
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version
18 * 2 of the License, or (at your option) any later version.
19 */
20
21#undef DEBUG
22
23#include <linux/config.h>
24#include <linux/spinlock.h>
25#include <linux/errno.h>
26#include <linux/sched.h>
27#include <linux/proc_fs.h>
28#include <linux/stat.h>
29#include <linux/sysctl.h>
30#include <linux/ctype.h>
31#include <linux/cache.h>
32#include <linux/init.h>
33#include <linux/signal.h>
34
35#include <asm/ppcdebug.h>
36#include <asm/processor.h>
37#include <asm/pgtable.h>
38#include <asm/mmu.h>
39#include <asm/mmu_context.h>
40#include <asm/page.h>
41#include <asm/types.h>
42#include <asm/system.h>
43#include <asm/uaccess.h>
44#include <asm/machdep.h>
45#include <asm/lmb.h>
46#include <asm/abs_addr.h>
47#include <asm/tlbflush.h>
48#include <asm/io.h>
49#include <asm/eeh.h>
50#include <asm/tlb.h>
51#include <asm/cacheflush.h>
52#include <asm/cputable.h>
53#include <asm/abs_addr.h>
54#include <asm/sections.h>
55
56#ifdef DEBUG
57#define DBG(fmt...) udbg_printf(fmt)
58#else
59#define DBG(fmt...)
60#endif
61
62/*
63 * Note: pte --> Linux PTE
64 * HPTE --> PowerPC Hashed Page Table Entry
65 *
66 * Execution context:
67 * htab_initialize is called with the MMU off (of course), but
68 * the kernel has been copied down to zero so it can directly
69 * reference global data. At this point it is very difficult
70 * to print debug info.
71 *
72 */
73
74#ifdef CONFIG_U3_DART
75extern unsigned long dart_tablebase;
76#endif /* CONFIG_U3_DART */
77
78hpte_t *htab_address;
79unsigned long htab_hash_mask;
80
81extern unsigned long _SDR1;
82
83#define KB (1024)
84#define MB (1024*KB)
85
/*
 * Spin without ever returning.  The loop variable is volatile, so every
 * iteration performs a real load and store.
 */
static inline void loop_forever(void)
{
	volatile unsigned long spin = 1;

	while (spin)
		spin |= 1;
}
92
93static inline void create_pte_mapping(unsigned long start, unsigned long end,
94 unsigned long mode, int large)
95{
96 unsigned long addr;
97 unsigned int step;
98 unsigned long tmp_mode;
99 unsigned long vflags;
100
101 if (large) {
102 step = 16*MB;
103 vflags = HPTE_V_BOLTED | HPTE_V_LARGE;
104 } else {
105 step = 4*KB;
106 vflags = HPTE_V_BOLTED;
107 }
108
109 for (addr = start; addr < end; addr += step) {
110 unsigned long vpn, hash, hpteg;
111 unsigned long vsid = get_kernel_vsid(addr);
112 unsigned long va = (vsid << 28) | (addr & 0xfffffff);
113 int ret = -1;
114
115 if (large)
116 vpn = va >> HPAGE_SHIFT;
117 else
118 vpn = va >> PAGE_SHIFT;
119
120
121 tmp_mode = mode;
122
123 /* Make non-kernel text non-executable */
124 if (!in_kernel_text(addr))
125 tmp_mode = mode | HW_NO_EXEC;
126
127 hash = hpt_hash(vpn, large);
128
129 hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
130
131#ifdef CONFIG_PPC_ISERIES
132 if (systemcfg->platform & PLATFORM_ISERIES_LPAR)
133 ret = iSeries_hpte_bolt_or_insert(hpteg, va,
134 virt_to_abs(addr) >> PAGE_SHIFT,
135 vflags, tmp_mode);
136 else
137#endif
138#ifdef CONFIG_PPC_PSERIES
139 if (systemcfg->platform & PLATFORM_LPAR)
140 ret = pSeries_lpar_hpte_insert(hpteg, va,
141 virt_to_abs(addr) >> PAGE_SHIFT,
142 vflags, tmp_mode);
143 else
144#endif
145#ifdef CONFIG_PPC_MULTIPLATFORM
146 ret = native_hpte_insert(hpteg, va,
147 virt_to_abs(addr) >> PAGE_SHIFT,
148 vflags, tmp_mode);
149#endif
150
151 if (ret == -1) {
152 ppc64_terminate_msg(0x20, "create_pte_mapping");
153 loop_forever();
154 }
155 }
156}
157
158void __init htab_initialize(void)
159{
160 unsigned long table, htab_size_bytes;
161 unsigned long pteg_count;
162 unsigned long mode_rw;
163 int i, use_largepages = 0;
164 unsigned long base = 0, size = 0;
165 extern unsigned long tce_alloc_start, tce_alloc_end;
166
167 DBG(" -> htab_initialize()\n");
168
169 /*
170 * Calculate the required size of the htab. We want the number of
171 * PTEGs to equal one half the number of real pages.
172 */
173 htab_size_bytes = 1UL << ppc64_pft_size;
174 pteg_count = htab_size_bytes >> 7;
175
176 /* For debug, make the HTAB 1/8 as big as it normally would be. */
177 ifppcdebug(PPCDBG_HTABSIZE) {
178 pteg_count >>= 3;
179 htab_size_bytes = pteg_count << 7;
180 }
181
182 htab_hash_mask = pteg_count - 1;
183
184 if (systemcfg->platform & PLATFORM_LPAR) {
185 /* Using a hypervisor which owns the htab */
186 htab_address = NULL;
187 _SDR1 = 0;
188 } else {
189 /* Find storage for the HPT. Must be contiguous in
190 * the absolute address space.
191 */
192 table = lmb_alloc(htab_size_bytes, htab_size_bytes);
193
194 DBG("Hash table allocated at %lx, size: %lx\n", table,
195 htab_size_bytes);
196
197 if ( !table ) {
198 ppc64_terminate_msg(0x20, "hpt space");
199 loop_forever();
200 }
201 htab_address = abs_to_virt(table);
202
203 /* htab absolute addr + encoded htabsize */
204 _SDR1 = table + __ilog2(pteg_count) - 11;
205
206 /* Initialize the HPT with no entries */
207 memset((void *)table, 0, htab_size_bytes);
208 }
209
210 mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX;
211
212 /* On U3 based machines, we need to reserve the DART area and
213 * _NOT_ map it to avoid cache paradoxes as it's remapped non
214 * cacheable later on
215 */
216 if (cpu_has_feature(CPU_FTR_16M_PAGE))
217 use_largepages = 1;
218
219 /* create bolted the linear mapping in the hash table */
220 for (i=0; i < lmb.memory.cnt; i++) {
221 base = lmb.memory.region[i].base + KERNELBASE;
222 size = lmb.memory.region[i].size;
223
224 DBG("creating mapping for region: %lx : %lx\n", base, size);
225
226#ifdef CONFIG_U3_DART
227 /* Do not map the DART space. Fortunately, it will be aligned
228 * in such a way that it will not cross two lmb regions and will
229 * fit within a single 16Mb page.
230 * The DART space is assumed to be a full 16Mb region even if we
231 * only use 2Mb of that space. We will use more of it later for
232 * AGP GART. We have to use a full 16Mb large page.
233 */
234 DBG("DART base: %lx\n", dart_tablebase);
235
236 if (dart_tablebase != 0 && dart_tablebase >= base
237 && dart_tablebase < (base + size)) {
238 if (base != dart_tablebase)
239 create_pte_mapping(base, dart_tablebase, mode_rw,
240 use_largepages);
241 if ((base + size) > (dart_tablebase + 16*MB))
242 create_pte_mapping(dart_tablebase + 16*MB, base + size,
243 mode_rw, use_largepages);
244 continue;
245 }
246#endif /* CONFIG_U3_DART */
247 create_pte_mapping(base, base + size, mode_rw, use_largepages);
248 }
249
250 /*
251 * If we have a memory_limit and we've allocated TCEs then we need to
252 * explicitly map the TCE area at the top of RAM. We also cope with the
253 * case that the TCEs start below memory_limit.
254 * tce_alloc_start/end are 16MB aligned so the mapping should work
255 * for either 4K or 16MB pages.
256 */
257 if (tce_alloc_start) {
258 tce_alloc_start += KERNELBASE;
259 tce_alloc_end += KERNELBASE;
260
261 if (base + size >= tce_alloc_start)
262 tce_alloc_start = base + size + 1;
263
264 create_pte_mapping(tce_alloc_start, tce_alloc_end,
265 mode_rw, use_largepages);
266 }
267
268 DBG(" <- htab_initialize()\n");
269}
270#undef KB
271#undef MB
272
273/*
274 * Called by asm hashtable.S for doing lazy icache flush
275 */
276unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
277{
278 struct page *page;
279
280 if (!pfn_valid(pte_pfn(pte)))
281 return pp;
282
283 page = pte_page(pte);
284
285 /* page is dirty */
286 if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
287 if (trap == 0x400) {
288 __flush_dcache_icache(page_address(page));
289 set_bit(PG_arch_1, &page->flags);
290 } else
291 pp |= HW_NO_EXEC;
292 }
293 return pp;
294}
295
296/* Result code is:
297 * 0 - handled
298 * 1 - normal page fault
299 * -1 - critical hash insertion error
300 */
301int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
302{
303 void *pgdir;
304 unsigned long vsid;
305 struct mm_struct *mm;
306 pte_t *ptep;
307 int ret;
308 int user_region = 0;
309 int local = 0;
310 cpumask_t tmp;
311
312 if ((ea & ~REGION_MASK) >= PGTABLE_RANGE)
313 return 1;
314
315 switch (REGION_ID(ea)) {
316 case USER_REGION_ID:
317 user_region = 1;
318 mm = current->mm;
319 if (! mm)
320 return 1;
321
322 vsid = get_vsid(mm->context.id, ea);
323 break;
324 case VMALLOC_REGION_ID:
325 mm = &init_mm;
326 vsid = get_kernel_vsid(ea);
327 break;
328#if 0
329 case KERNEL_REGION_ID:
330 /*
331 * Should never get here - entire 0xC0... region is bolted.
332 * Send the problem up to do_page_fault
333 */
334#endif
335 default:
336 /* Not a valid range
337 * Send the problem up to do_page_fault
338 */
339 return 1;
340 break;
341 }
342
343 pgdir = mm->pgd;
344
345 if (pgdir == NULL)
346 return 1;
347
348 tmp = cpumask_of_cpu(smp_processor_id());
349 if (user_region && cpus_equal(mm->cpu_vm_mask, tmp))
350 local = 1;
351
352 /* Is this a huge page ? */
353 if (unlikely(in_hugepage_area(mm->context, ea)))
354 ret = hash_huge_page(mm, access, ea, vsid, local);
355 else {
356 ptep = find_linux_pte(pgdir, ea);
357 if (ptep == NULL)
358 return 1;
359 ret = __hash_page(ea, access, vsid, ptep, trap, local);
360 }
361
362 return ret;
363}
364
365void flush_hash_page(unsigned long va, pte_t pte, int local)
366{
367 unsigned long vpn, hash, secondary, slot;
368 unsigned long huge = pte_huge(pte);
369
370 if (huge)
371 vpn = va >> HPAGE_SHIFT;
372 else
373 vpn = va >> PAGE_SHIFT;
374 hash = hpt_hash(vpn, huge);
375 secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15;
376 if (secondary)
377 hash = ~hash;
378 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
379 slot += (pte_val(pte) & _PAGE_GROUP_IX) >> 12;
380
381 ppc_md.hpte_invalidate(slot, va, huge, local);
382}
383
384void flush_hash_range(unsigned long number, int local)
385{
386 if (ppc_md.flush_hash_range) {
387 ppc_md.flush_hash_range(number, local);
388 } else {
389 int i;
390 struct ppc64_tlb_batch *batch =
391 &__get_cpu_var(ppc64_tlb_batch);
392
393 for (i = 0; i < number; i++)
394 flush_hash_page(batch->vaddr[i], batch->pte[i], local);
395 }
396}
397
/*
 * Overwrite the instruction at insn_addr with 0x48000001 combined with
 * the relative offset to the target (a PowerPC "bl"), then flush the
 * icache for those four bytes.
 *
 * func is read as a pointer to an unsigned long; the value stored there
 * is used as the branch target.
 */
static inline void make_bl(unsigned int *insn_addr, void *func)
{
	unsigned long site = (unsigned long)insn_addr;
	unsigned long target = *((unsigned long *)func);
	int offset = target - site;

	*insn_addr = (unsigned int)(0x48000001 | (offset & 0x03fffffc));
	flush_icache_range(site, 4 + site);
}
407
408/*
409 * low_hash_fault is called when we the low level hash code failed
410 * to instert a PTE due to an hypervisor error
411 */
412void low_hash_fault(struct pt_regs *regs, unsigned long address)
413{
414 if (user_mode(regs)) {
415 siginfo_t info;
416
417 info.si_signo = SIGBUS;
418 info.si_errno = 0;
419 info.si_code = BUS_ADRERR;
420 info.si_addr = (void __user *)address;
421 force_sig_info(SIGBUS, &info, current);
422 return;
423 }
424 bad_page_fault(regs, address, SIGBUS);
425}
426
427void __init htab_finish_init(void)
428{
429 extern unsigned int *htab_call_hpte_insert1;
430 extern unsigned int *htab_call_hpte_insert2;
431 extern unsigned int *htab_call_hpte_remove;
432 extern unsigned int *htab_call_hpte_updatepp;
433
434 make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert);
435 make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert);
436 make_bl(htab_call_hpte_remove, ppc_md.hpte_remove);
437 make_bl(htab_call_hpte_updatepp, ppc_md.hpte_updatepp);
438}
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
deleted file mode 100644
index 0ea0994ed97..00000000000
--- a/arch/ppc64/mm/hugetlbpage.c
+++ /dev/null
@@ -1,745 +0,0 @@
1/*
2 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
3 *
4 * Copyright (C) 2003 David Gibson, IBM Corporation.
5 *
6 * Based on the IA-32 version:
7 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
8 */
9
10#include <linux/init.h>
11#include <linux/fs.h>
12#include <linux/mm.h>
13#include <linux/hugetlb.h>
14#include <linux/pagemap.h>
15#include <linux/smp_lock.h>
16#include <linux/slab.h>
17#include <linux/err.h>
18#include <linux/sysctl.h>
19#include <asm/mman.h>
20#include <asm/pgalloc.h>
21#include <asm/tlb.h>
22#include <asm/tlbflush.h>
23#include <asm/mmu_context.h>
24#include <asm/machdep.h>
25#include <asm/cputable.h>
26#include <asm/tlb.h>
27
28#include <linux/sysctl.h>
29
30#define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT)
31#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
32
33/* Modelled after find_linux_pte() */
34pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
35{
36 pgd_t *pg;
37 pud_t *pu;
38 pmd_t *pm;
39 pte_t *pt;
40
41 BUG_ON(! in_hugepage_area(mm->context, addr));
42
43 addr &= HPAGE_MASK;
44
45 pg = pgd_offset(mm, addr);
46 if (!pgd_none(*pg)) {
47 pu = pud_offset(pg, addr);
48 if (!pud_none(*pu)) {
49 pm = pmd_offset(pu, addr);
50 pt = (pte_t *)pm;
51 BUG_ON(!pmd_none(*pm)
52 && !(pte_present(*pt) && pte_huge(*pt)));
53 return pt;
54 }
55 }
56
57 return NULL;
58}
59
60pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
61{
62 pgd_t *pg;
63 pud_t *pu;
64 pmd_t *pm;
65 pte_t *pt;
66
67 BUG_ON(! in_hugepage_area(mm->context, addr));
68
69 addr &= HPAGE_MASK;
70
71 pg = pgd_offset(mm, addr);
72 pu = pud_alloc(mm, pg, addr);
73
74 if (pu) {
75 pm = pmd_alloc(mm, pu, addr);
76 if (pm) {
77 pt = (pte_t *)pm;
78 BUG_ON(!pmd_none(*pm)
79 && !(pte_present(*pt) && pte_huge(*pt)));
80 return pt;
81 }
82 }
83
84 return NULL;
85}
86
87#define HUGEPTE_BATCH_SIZE (HPAGE_SIZE / PMD_SIZE)
88
89void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
90 pte_t *ptep, pte_t pte)
91{
92 int i;
93
94 if (pte_present(*ptep)) {
95 pte_clear(mm, addr, ptep);
96 flush_tlb_pending();
97 }
98
99 for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
100 *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
101 ptep++;
102 }
103}
104
105pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
106 pte_t *ptep)
107{
108 unsigned long old = pte_update(ptep, ~0UL);
109 int i;
110
111 if (old & _PAGE_HASHPTE)
112 hpte_update(mm, addr, old, 0);
113
114 for (i = 1; i < HUGEPTE_BATCH_SIZE; i++)
115 ptep[i] = __pte(0);
116
117 return __pte(old);
118}
119
120/*
121 * This function checks for proper alignment of input addr and len parameters.
122 */
123int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
124{
125 if (len & ~HPAGE_MASK)
126 return -EINVAL;
127 if (addr & ~HPAGE_MASK)
128 return -EINVAL;
129 if (! (within_hugepage_low_range(addr, len)
130 || within_hugepage_high_range(addr, len)) )
131 return -EINVAL;
132 return 0;
133}
134
135static void flush_low_segments(void *parm)
136{
137 u16 areas = (unsigned long) parm;
138 unsigned long i;
139
140 asm volatile("isync" : : : "memory");
141
142 BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS);
143
144 for (i = 0; i < NUM_LOW_AREAS; i++) {
145 if (! (areas & (1U << i)))
146 continue;
147 asm volatile("slbie %0"
148 : : "r" ((i << SID_SHIFT) | SLBIE_C));
149 }
150
151 asm volatile("isync" : : : "memory");
152}
153
154static void flush_high_segments(void *parm)
155{
156 u16 areas = (unsigned long) parm;
157 unsigned long i, j;
158
159 asm volatile("isync" : : : "memory");
160
161 BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS);
162
163 for (i = 0; i < NUM_HIGH_AREAS; i++) {
164 if (! (areas & (1U << i)))
165 continue;
166 for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
167 asm volatile("slbie %0"
168 :: "r" (((i << HTLB_AREA_SHIFT)
169 + (j << SID_SHIFT)) | SLBIE_C));
170 }
171
172 asm volatile("isync" : : : "memory");
173}
174
175static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
176{
177 unsigned long start = area << SID_SHIFT;
178 unsigned long end = (area+1) << SID_SHIFT;
179 struct vm_area_struct *vma;
180
181 BUG_ON(area >= NUM_LOW_AREAS);
182
183 /* Check no VMAs are in the region */
184 vma = find_vma(mm, start);
185 if (vma && (vma->vm_start < end))
186 return -EBUSY;
187
188 return 0;
189}
190
191static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
192{
193 unsigned long start = area << HTLB_AREA_SHIFT;
194 unsigned long end = (area+1) << HTLB_AREA_SHIFT;
195 struct vm_area_struct *vma;
196
197 BUG_ON(area >= NUM_HIGH_AREAS);
198
199 /* Check no VMAs are in the region */
200 vma = find_vma(mm, start);
201 if (vma && (vma->vm_start < end))
202 return -EBUSY;
203
204 return 0;
205}
206
207static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
208{
209 unsigned long i;
210
211 BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
212 BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
213
214 newareas &= ~(mm->context.low_htlb_areas);
215 if (! newareas)
216 return 0; /* The segments we want are already open */
217
218 for (i = 0; i < NUM_LOW_AREAS; i++)
219 if ((1 << i) & newareas)
220 if (prepare_low_area_for_htlb(mm, i) != 0)
221 return -EBUSY;
222
223 mm->context.low_htlb_areas |= newareas;
224
225 /* update the paca copy of the context struct */
226 get_paca()->context = mm->context;
227
228 /* the context change must make it to memory before the flush,
229 * so that further SLB misses do the right thing. */
230 mb();
231 on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1);
232
233 return 0;
234}
235
236static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
237{
238 unsigned long i;
239
240 BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
241 BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
242 != NUM_HIGH_AREAS);
243
244 newareas &= ~(mm->context.high_htlb_areas);
245 if (! newareas)
246 return 0; /* The areas we want are already open */
247
248 for (i = 0; i < NUM_HIGH_AREAS; i++)
249 if ((1 << i) & newareas)
250 if (prepare_high_area_for_htlb(mm, i) != 0)
251 return -EBUSY;
252
253 mm->context.high_htlb_areas |= newareas;
254
255 /* update the paca copy of the context struct */
256 get_paca()->context = mm->context;
257
258 /* the context change must make it to memory before the flush,
259 * so that further SLB misses do the right thing. */
260 mb();
261 on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1);
262
263 return 0;
264}
265
266int prepare_hugepage_range(unsigned long addr, unsigned long len)
267{
268 int err;
269
270 if ( (addr+len) < addr )
271 return -EINVAL;
272
273 if ((addr + len) < 0x100000000UL)
274 err = open_low_hpage_areas(current->mm,
275 LOW_ESID_MASK(addr, len));
276 else
277 err = open_high_hpage_areas(current->mm,
278 HTLB_AREA_MASK(addr, len));
279 if (err) {
280 printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
281 " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
282 addr, len,
283 LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
284 return err;
285 }
286
287 return 0;
288}
289
290struct page *
291follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
292{
293 pte_t *ptep;
294 struct page *page;
295
296 if (! in_hugepage_area(mm->context, address))
297 return ERR_PTR(-EINVAL);
298
299 ptep = huge_pte_offset(mm, address);
300 page = pte_page(*ptep);
301 if (page)
302 page += (address % HPAGE_SIZE) / PAGE_SIZE;
303
304 return page;
305}
306
307int pmd_huge(pmd_t pmd)
308{
309 return 0;
310}
311
312struct page *
313follow_huge_pmd(struct mm_struct *mm, unsigned long address,
314 pmd_t *pmd, int write)
315{
316 BUG();
317 return NULL;
318}
319
320/* Because we have an exclusive hugepage region which lies within the
321 * normal user address space, we have to take special measures to make
322 * non-huge mmap()s evade the hugepage reserved regions. */
323unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
324 unsigned long len, unsigned long pgoff,
325 unsigned long flags)
326{
327 struct mm_struct *mm = current->mm;
328 struct vm_area_struct *vma;
329 unsigned long start_addr;
330
331 if (len > TASK_SIZE)
332 return -ENOMEM;
333
334 if (addr) {
335 addr = PAGE_ALIGN(addr);
336 vma = find_vma(mm, addr);
337 if (((TASK_SIZE - len) >= addr)
338 && (!vma || (addr+len) <= vma->vm_start)
339 && !is_hugepage_only_range(mm, addr,len))
340 return addr;
341 }
342 if (len > mm->cached_hole_size) {
343 start_addr = addr = mm->free_area_cache;
344 } else {
345 start_addr = addr = TASK_UNMAPPED_BASE;
346 mm->cached_hole_size = 0;
347 }
348
349full_search:
350 vma = find_vma(mm, addr);
351 while (TASK_SIZE - len >= addr) {
352 BUG_ON(vma && (addr >= vma->vm_end));
353
354 if (touches_hugepage_low_range(mm, addr, len)) {
355 addr = ALIGN(addr+1, 1<<SID_SHIFT);
356 vma = find_vma(mm, addr);
357 continue;
358 }
359 if (touches_hugepage_high_range(mm, addr, len)) {
360 addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
361 vma = find_vma(mm, addr);
362 continue;
363 }
364 if (!vma || addr + len <= vma->vm_start) {
365 /*
366 * Remember the place where we stopped the search:
367 */
368 mm->free_area_cache = addr + len;
369 return addr;
370 }
371 if (addr + mm->cached_hole_size < vma->vm_start)
372 mm->cached_hole_size = vma->vm_start - addr;
373 addr = vma->vm_end;
374 vma = vma->vm_next;
375 }
376
377 /* Make sure we didn't miss any holes */
378 if (start_addr != TASK_UNMAPPED_BASE) {
379 start_addr = addr = TASK_UNMAPPED_BASE;
380 mm->cached_hole_size = 0;
381 goto full_search;
382 }
383 return -ENOMEM;
384}
385
386/*
387 * This mmap-allocator allocates new areas top-down from below the
388 * stack's low limit (the base):
389 *
390 * Because we have an exclusive hugepage region which lies within the
391 * normal user address space, we have to take special measures to make
392 * non-huge mmap()s evade the hugepage reserved regions.
393 */
394unsigned long
395arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
396 const unsigned long len, const unsigned long pgoff,
397 const unsigned long flags)
398{
399 struct vm_area_struct *vma, *prev_vma;
400 struct mm_struct *mm = current->mm;
401 unsigned long base = mm->mmap_base, addr = addr0;
402 unsigned long largest_hole = mm->cached_hole_size;
403 int first_time = 1;
404
405 /* requested length too big for entire address space */
406 if (len > TASK_SIZE)
407 return -ENOMEM;
408
409 /* dont allow allocations above current base */
410 if (mm->free_area_cache > base)
411 mm->free_area_cache = base;
412
413 /* requesting a specific address */
414 if (addr) {
415 addr = PAGE_ALIGN(addr);
416 vma = find_vma(mm, addr);
417 if (TASK_SIZE - len >= addr &&
418 (!vma || addr + len <= vma->vm_start)
419 && !is_hugepage_only_range(mm, addr,len))
420 return addr;
421 }
422
423 if (len <= largest_hole) {
424 largest_hole = 0;
425 mm->free_area_cache = base;
426 }
427try_again:
428 /* make sure it can fit in the remaining address space */
429 if (mm->free_area_cache < len)
430 goto fail;
431
432 /* either no address requested or cant fit in requested address hole */
433 addr = (mm->free_area_cache - len) & PAGE_MASK;
434 do {
435hugepage_recheck:
436 if (touches_hugepage_low_range(mm, addr, len)) {
437 addr = (addr & ((~0) << SID_SHIFT)) - len;
438 goto hugepage_recheck;
439 } else if (touches_hugepage_high_range(mm, addr, len)) {
440 addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
441 goto hugepage_recheck;
442 }
443
444 /*
445 * Lookup failure means no vma is above this address,
446 * i.e. return with success:
447 */
448 if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
449 return addr;
450
451 /*
452 * new region fits between prev_vma->vm_end and
453 * vma->vm_start, use it:
454 */
455 if (addr+len <= vma->vm_start &&
456 (!prev_vma || (addr >= prev_vma->vm_end))) {
457 /* remember the address as a hint for next time */
458 mm->cached_hole_size = largest_hole;
459 return (mm->free_area_cache = addr);
460 } else {
461 /* pull free_area_cache down to the first hole */
462 if (mm->free_area_cache == vma->vm_end) {
463 mm->free_area_cache = vma->vm_start;
464 mm->cached_hole_size = largest_hole;
465 }
466 }
467
468 /* remember the largest hole we saw so far */
469 if (addr + largest_hole < vma->vm_start)
470 largest_hole = vma->vm_start - addr;
471
472 /* try just below the current vma->vm_start */
473 addr = vma->vm_start-len;
474 } while (len <= vma->vm_start);
475
476fail:
477 /*
478 * if hint left us with no space for the requested
479 * mapping then try again:
480 */
481 if (first_time) {
482 mm->free_area_cache = base;
483 largest_hole = 0;
484 first_time = 0;
485 goto try_again;
486 }
487 /*
488 * A failed mmap() very likely causes application failure,
489 * so fall back to the bottom-up function here. This scenario
490 * can happen with large stack limits and large mmap()
491 * allocations.
492 */
493 mm->free_area_cache = TASK_UNMAPPED_BASE;
494 mm->cached_hole_size = ~0UL;
495 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
496 /*
497 * Restore the topdown base:
498 */
499 mm->free_area_cache = base;
500 mm->cached_hole_size = ~0UL;
501
502 return addr;
503}
504
505static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
506{
507 unsigned long addr = 0;
508 struct vm_area_struct *vma;
509
510 vma = find_vma(current->mm, addr);
511 while (addr + len <= 0x100000000UL) {
512 BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
513
514 if (! __within_hugepage_low_range(addr, len, segmask)) {
515 addr = ALIGN(addr+1, 1<<SID_SHIFT);
516 vma = find_vma(current->mm, addr);
517 continue;
518 }
519
520 if (!vma || (addr + len) <= vma->vm_start)
521 return addr;
522 addr = ALIGN(vma->vm_end, HPAGE_SIZE);
523 /* Depending on segmask this might not be a confirmed
524 * hugepage region, so the ALIGN could have skipped
525 * some VMAs */
526 vma = find_vma(current->mm, addr);
527 }
528
529 return -ENOMEM;
530}
531
532static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
533{
534 unsigned long addr = 0x100000000UL;
535 struct vm_area_struct *vma;
536
537 vma = find_vma(current->mm, addr);
538 while (addr + len <= TASK_SIZE_USER64) {
539 BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
540
541 if (! __within_hugepage_high_range(addr, len, areamask)) {
542 addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
543 vma = find_vma(current->mm, addr);
544 continue;
545 }
546
547 if (!vma || (addr + len) <= vma->vm_start)
548 return addr;
549 addr = ALIGN(vma->vm_end, HPAGE_SIZE);
550 /* Depending on segmask this might not be a confirmed
551 * hugepage region, so the ALIGN could have skipped
552 * some VMAs */
553 vma = find_vma(current->mm, addr);
554 }
555
556 return -ENOMEM;
557}
558
559unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
560 unsigned long len, unsigned long pgoff,
561 unsigned long flags)
562{
563 int lastshift;
564 u16 areamask, curareas;
565
566 if (len & ~HPAGE_MASK)
567 return -EINVAL;
568
569 if (!cpu_has_feature(CPU_FTR_16M_PAGE))
570 return -EINVAL;
571
572 if (test_thread_flag(TIF_32BIT)) {
573 curareas = current->mm->context.low_htlb_areas;
574
575 /* First see if we can do the mapping in the existing
576 * low areas */
577 addr = htlb_get_low_area(len, curareas);
578 if (addr != -ENOMEM)
579 return addr;
580
581 lastshift = 0;
582 for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
583 ! lastshift; areamask >>=1) {
584 if (areamask & 1)
585 lastshift = 1;
586
587 addr = htlb_get_low_area(len, curareas | areamask);
588 if ((addr != -ENOMEM)
589 && open_low_hpage_areas(current->mm, areamask) == 0)
590 return addr;
591 }
592 } else {
593 curareas = current->mm->context.high_htlb_areas;
594
595 /* First see if we can do the mapping in the existing
596 * high areas */
597 addr = htlb_get_high_area(len, curareas);
598 if (addr != -ENOMEM)
599 return addr;
600
601 lastshift = 0;
602 for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
603 ! lastshift; areamask >>=1) {
604 if (areamask & 1)
605 lastshift = 1;
606
607 addr = htlb_get_high_area(len, curareas | areamask);
608 if ((addr != -ENOMEM)
609 && open_high_hpage_areas(current->mm, areamask) == 0)
610 return addr;
611 }
612 }
613 printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
614 " enough areas\n");
615 return -ENOMEM;
616}
617
618int hash_huge_page(struct mm_struct *mm, unsigned long access,
619 unsigned long ea, unsigned long vsid, int local)
620{
621 pte_t *ptep;
622 unsigned long va, vpn;
623 pte_t old_pte, new_pte;
624 unsigned long rflags, prpn;
625 long slot;
626 int err = 1;
627
628 spin_lock(&mm->page_table_lock);
629
630 ptep = huge_pte_offset(mm, ea);
631
632 /* Search the Linux page table for a match with va */
633 va = (vsid << 28) | (ea & 0x0fffffff);
634 vpn = va >> HPAGE_SHIFT;
635
636 /*
637 * If no pte found or not present, send the problem up to
638 * do_page_fault
639 */
640 if (unlikely(!ptep || pte_none(*ptep)))
641 goto out;
642
643/* BUG_ON(pte_bad(*ptep)); */
644
645 /*
646 * Check the user's access rights to the page. If access should be
647 * prevented then send the problem up to do_page_fault.
648 */
649 if (unlikely(access & ~pte_val(*ptep)))
650 goto out;
651 /*
652 * At this point, we have a pte (old_pte) which can be used to build
653 * or update an HPTE. There are 2 cases:
654 *
655 * 1. There is a valid (present) pte with no associated HPTE (this is
656 * the most common case)
657 * 2. There is a valid (present) pte with an associated HPTE. The
658 * current values of the pp bits in the HPTE prevent access
659 * because we are doing software DIRTY bit management and the
660 * page is currently not DIRTY.
661 */
662
663
664 old_pte = *ptep;
665 new_pte = old_pte;
666
667 rflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
668 /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
669 rflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC);
670
671 /* Check if pte already has an hpte (case 2) */
672 if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
673 /* There MIGHT be an HPTE for this pte */
674 unsigned long hash, slot;
675
676 hash = hpt_hash(vpn, 1);
677 if (pte_val(old_pte) & _PAGE_SECONDARY)
678 hash = ~hash;
679 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
680 slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
681
682 if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1)
683 pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
684 }
685
686 if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
687 unsigned long hash = hpt_hash(vpn, 1);
688 unsigned long hpte_group;
689
690 prpn = pte_pfn(old_pte);
691
692repeat:
693 hpte_group = ((hash & htab_hash_mask) *
694 HPTES_PER_GROUP) & ~0x7UL;
695
696 /* Update the linux pte with the HPTE slot */
697 pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
698 pte_val(new_pte) |= _PAGE_HASHPTE;
699
700 /* Add in WIMG bits */
701 /* XXX We should store these in the pte */
702 rflags |= _PAGE_COHERENT;
703
704 slot = ppc_md.hpte_insert(hpte_group, va, prpn,
705 HPTE_V_LARGE, rflags);
706
707 /* Primary is full, try the secondary */
708 if (unlikely(slot == -1)) {
709 pte_val(new_pte) |= _PAGE_SECONDARY;
710 hpte_group = ((~hash & htab_hash_mask) *
711 HPTES_PER_GROUP) & ~0x7UL;
712 slot = ppc_md.hpte_insert(hpte_group, va, prpn,
713 HPTE_V_LARGE |
714 HPTE_V_SECONDARY,
715 rflags);
716 if (slot == -1) {
717 if (mftb() & 0x1)
718 hpte_group = ((hash & htab_hash_mask) *
719 HPTES_PER_GROUP)&~0x7UL;
720
721 ppc_md.hpte_remove(hpte_group);
722 goto repeat;
723 }
724 }
725
726 if (unlikely(slot == -2))
727 panic("hash_huge_page: pte_insert failed\n");
728
729 pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX;
730
731 /*
732 * No need to use ldarx/stdcx here because all who
733 * might be updating the pte will hold the
734 * page_table_lock
735 */
736 *ptep = new_pte;
737 }
738
739 err = 0;
740
741 out:
742 spin_unlock(&mm->page_table_lock);
743
744 return err;
745}
diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c
deleted file mode 100644
index c65b87b9275..00000000000
--- a/arch/ppc64/mm/imalloc.c
+++ /dev/null
@@ -1,317 +0,0 @@
1/*
2 * c 2001 PPC 64 Team, IBM Corp
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9
10#include <linux/slab.h>
11#include <linux/vmalloc.h>
12
13#include <asm/uaccess.h>
14#include <asm/pgalloc.h>
15#include <asm/pgtable.h>
16#include <asm/semaphore.h>
17#include <asm/imalloc.h>
18#include <asm/cacheflush.h>
19
20static DECLARE_MUTEX(imlist_sem);
21struct vm_struct * imlist = NULL;
22
23static int get_free_im_addr(unsigned long size, unsigned long *im_addr)
24{
25 unsigned long addr;
26 struct vm_struct **p, *tmp;
27
28 addr = ioremap_bot;
29 for (p = &imlist; (tmp = *p) ; p = &tmp->next) {
30 if (size + addr < (unsigned long) tmp->addr)
31 break;
32 if ((unsigned long)tmp->addr >= ioremap_bot)
33 addr = tmp->size + (unsigned long) tmp->addr;
34 if (addr >= IMALLOC_END-size)
35 return 1;
36 }
37 *im_addr = addr;
38
39 return 0;
40}
41
42/* Return whether the region described by v_addr and size is a subset
43 * of the region described by parent
44 */
45static inline int im_region_is_subset(unsigned long v_addr, unsigned long size,
46 struct vm_struct *parent)
47{
48 return (int) (v_addr >= (unsigned long) parent->addr &&
49 v_addr < (unsigned long) parent->addr + parent->size &&
50 size < parent->size);
51}
52
53/* Return whether the region described by v_addr and size is a superset
54 * of the region described by child
55 */
56static int im_region_is_superset(unsigned long v_addr, unsigned long size,
57 struct vm_struct *child)
58{
59 struct vm_struct parent;
60
61 parent.addr = (void *) v_addr;
62 parent.size = size;
63
64 return im_region_is_subset((unsigned long) child->addr, child->size,
65 &parent);
66}
67
68/* Return whether the region described by v_addr and size overlaps
69 * the region described by vm. Overlapping regions meet the
70 * following conditions:
71 * 1) The regions share some part of the address space
72 * 2) The regions aren't identical
73 * 3) Neither region is a subset of the other
74 */
75static int im_region_overlaps(unsigned long v_addr, unsigned long size,
76 struct vm_struct *vm)
77{
78 if (im_region_is_superset(v_addr, size, vm))
79 return 0;
80
81 return (v_addr + size > (unsigned long) vm->addr + vm->size &&
82 v_addr < (unsigned long) vm->addr + vm->size) ||
83 (v_addr < (unsigned long) vm->addr &&
84 v_addr + size > (unsigned long) vm->addr);
85}
86
87/* Determine imalloc status of region described by v_addr and size.
88 * Can return one of the following:
89 * IM_REGION_UNUSED - Entire region is unallocated in imalloc space.
90 * IM_REGION_SUBSET - Region is a subset of a region that is already
91 * allocated in imalloc space.
92 * vm will be assigned to a ptr to the parent region.
93 * IM_REGION_EXISTS - Exact region already allocated in imalloc space.
94 * vm will be assigned to a ptr to the existing imlist
95 * member.
96 * IM_REGION_OVERLAPS - Region overlaps an allocated region in imalloc space.
97 * IM_REGION_SUPERSET - Region is a superset of a region that is already
98 * allocated in imalloc space.
99 */
100static int im_region_status(unsigned long v_addr, unsigned long size,
101 struct vm_struct **vm)
102{
103 struct vm_struct *tmp;
104
105 for (tmp = imlist; tmp; tmp = tmp->next)
106 if (v_addr < (unsigned long) tmp->addr + tmp->size)
107 break;
108
109 if (tmp) {
110 if (im_region_overlaps(v_addr, size, tmp))
111 return IM_REGION_OVERLAP;
112
113 *vm = tmp;
114 if (im_region_is_subset(v_addr, size, tmp)) {
115 /* Return with tmp pointing to superset */
116 return IM_REGION_SUBSET;
117 }
118 if (im_region_is_superset(v_addr, size, tmp)) {
119 /* Return with tmp pointing to first subset */
120 return IM_REGION_SUPERSET;
121 }
122 else if (v_addr == (unsigned long) tmp->addr &&
123 size == tmp->size) {
124 /* Return with tmp pointing to exact region */
125 return IM_REGION_EXISTS;
126 }
127 }
128
129 *vm = NULL;
130 return IM_REGION_UNUSED;
131}
132
133static struct vm_struct * split_im_region(unsigned long v_addr,
134 unsigned long size, struct vm_struct *parent)
135{
136 struct vm_struct *vm1 = NULL;
137 struct vm_struct *vm2 = NULL;
138 struct vm_struct *new_vm = NULL;
139
140 vm1 = (struct vm_struct *) kmalloc(sizeof(*vm1), GFP_KERNEL);
141 if (vm1 == NULL) {
142 printk(KERN_ERR "%s() out of memory\n", __FUNCTION__);
143 return NULL;
144 }
145
146 if (v_addr == (unsigned long) parent->addr) {
147 /* Use existing parent vm_struct to represent child, allocate
148 * new one for the remainder of parent range
149 */
150 vm1->size = parent->size - size;
151 vm1->addr = (void *) (v_addr + size);
152 vm1->next = parent->next;
153
154 parent->size = size;
155 parent->next = vm1;
156 new_vm = parent;
157 } else if (v_addr + size == (unsigned long) parent->addr +
158 parent->size) {
159 /* Allocate new vm_struct to represent child, use existing
160 * parent one for remainder of parent range
161 */
162 vm1->size = size;
163 vm1->addr = (void *) v_addr;
164 vm1->next = parent->next;
165 new_vm = vm1;
166
167 parent->size -= size;
168 parent->next = vm1;
169 } else {
170 /* Allocate two new vm_structs for the new child and
171 * uppermost remainder, and use existing parent one for the
172 * lower remainder of parent range
173 */
174 vm2 = (struct vm_struct *) kmalloc(sizeof(*vm2), GFP_KERNEL);
175 if (vm2 == NULL) {
176 printk(KERN_ERR "%s() out of memory\n", __FUNCTION__);
177 kfree(vm1);
178 return NULL;
179 }
180
181 vm1->size = size;
182 vm1->addr = (void *) v_addr;
183 vm1->next = vm2;
184 new_vm = vm1;
185
186 vm2->size = ((unsigned long) parent->addr + parent->size) -
187 (v_addr + size);
188 vm2->addr = (void *) v_addr + size;
189 vm2->next = parent->next;
190
191 parent->size = v_addr - (unsigned long) parent->addr;
192 parent->next = vm1;
193 }
194
195 return new_vm;
196}
197
198static struct vm_struct * __add_new_im_area(unsigned long req_addr,
199 unsigned long size)
200{
201 struct vm_struct **p, *tmp, *area;
202
203 for (p = &imlist; (tmp = *p) ; p = &tmp->next) {
204 if (req_addr + size <= (unsigned long)tmp->addr)
205 break;
206 }
207
208 area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL);
209 if (!area)
210 return NULL;
211 area->flags = 0;
212 area->addr = (void *)req_addr;
213 area->size = size;
214 area->next = *p;
215 *p = area;
216
217 return area;
218}
219
220static struct vm_struct * __im_get_area(unsigned long req_addr,
221 unsigned long size,
222 int criteria)
223{
224 struct vm_struct *tmp;
225 int status;
226
227 status = im_region_status(req_addr, size, &tmp);
228 if ((criteria & status) == 0) {
229 return NULL;
230 }
231
232 switch (status) {
233 case IM_REGION_UNUSED:
234 tmp = __add_new_im_area(req_addr, size);
235 break;
236 case IM_REGION_SUBSET:
237 tmp = split_im_region(req_addr, size, tmp);
238 break;
239 case IM_REGION_EXISTS:
240 /* Return requested region */
241 break;
242 case IM_REGION_SUPERSET:
243 /* Return first existing subset of requested region */
244 break;
245 default:
246 printk(KERN_ERR "%s() unexpected imalloc region status\n",
247 __FUNCTION__);
248 tmp = NULL;
249 }
250
251 return tmp;
252}
253
254struct vm_struct * im_get_free_area(unsigned long size)
255{
256 struct vm_struct *area;
257 unsigned long addr;
258
259 down(&imlist_sem);
260 if (get_free_im_addr(size, &addr)) {
261 printk(KERN_ERR "%s() cannot obtain addr for size 0x%lx\n",
262 __FUNCTION__, size);
263 area = NULL;
264 goto next_im_done;
265 }
266
267 area = __im_get_area(addr, size, IM_REGION_UNUSED);
268 if (area == NULL) {
269 printk(KERN_ERR
270 "%s() cannot obtain area for addr 0x%lx size 0x%lx\n",
271 __FUNCTION__, addr, size);
272 }
273next_im_done:
274 up(&imlist_sem);
275 return area;
276}
277
278struct vm_struct * im_get_area(unsigned long v_addr, unsigned long size,
279 int criteria)
280{
281 struct vm_struct *area;
282
283 down(&imlist_sem);
284 area = __im_get_area(v_addr, size, criteria);
285 up(&imlist_sem);
286 return area;
287}
288
289void im_free(void * addr)
290{
291 struct vm_struct **p, *tmp;
292
293 if (!addr)
294 return;
295 if ((unsigned long) addr & ~PAGE_MASK) {
296 printk(KERN_ERR "Trying to %s bad address (%p)\n", __FUNCTION__, addr);
297 return;
298 }
299 down(&imlist_sem);
300 for (p = &imlist ; (tmp = *p) ; p = &tmp->next) {
301 if (tmp->addr == addr) {
302 *p = tmp->next;
303
304 /* XXX: do we need the lock? */
305 spin_lock(&init_mm.page_table_lock);
306 unmap_vm_area(tmp);
307 spin_unlock(&init_mm.page_table_lock);
308
309 kfree(tmp);
310 up(&imlist_sem);
311 return;
312 }
313 }
314 up(&imlist_sem);
315 printk(KERN_ERR "Trying to %s nonexistent area (%p)\n", __FUNCTION__,
316 addr);
317}
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
deleted file mode 100644
index c2157c9c3ac..00000000000
--- a/arch/ppc64/mm/init.c
+++ /dev/null
@@ -1,870 +0,0 @@
1/*
2 * PowerPC version
3 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
4 *
5 * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
6 * and Cort Dougan (PReP) (cort@cs.nmt.edu)
7 * Copyright (C) 1996 Paul Mackerras
8 * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
9 *
10 * Derived from "arch/i386/mm/init.c"
11 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
12 *
13 * Dave Engebretsen <engebret@us.ibm.com>
14 * Rework for PPC64 port.
15 *
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version
19 * 2 of the License, or (at your option) any later version.
20 *
21 */
22
23#include <linux/config.h>
24#include <linux/signal.h>
25#include <linux/sched.h>
26#include <linux/kernel.h>
27#include <linux/errno.h>
28#include <linux/string.h>
29#include <linux/types.h>
30#include <linux/mman.h>
31#include <linux/mm.h>
32#include <linux/swap.h>
33#include <linux/stddef.h>
34#include <linux/vmalloc.h>
35#include <linux/init.h>
36#include <linux/delay.h>
37#include <linux/bootmem.h>
38#include <linux/highmem.h>
39#include <linux/idr.h>
40#include <linux/nodemask.h>
41#include <linux/module.h>
42
43#include <asm/pgalloc.h>
44#include <asm/page.h>
45#include <asm/prom.h>
46#include <asm/lmb.h>
47#include <asm/rtas.h>
48#include <asm/io.h>
49#include <asm/mmu_context.h>
50#include <asm/pgtable.h>
51#include <asm/mmu.h>
52#include <asm/uaccess.h>
53#include <asm/smp.h>
54#include <asm/machdep.h>
55#include <asm/tlb.h>
56#include <asm/eeh.h>
57#include <asm/processor.h>
58#include <asm/mmzone.h>
59#include <asm/cputable.h>
60#include <asm/ppcdebug.h>
61#include <asm/sections.h>
62#include <asm/system.h>
63#include <asm/iommu.h>
64#include <asm/abs_addr.h>
65#include <asm/vdso.h>
66#include <asm/imalloc.h>
67
68#if PGTABLE_RANGE > USER_VSID_RANGE
69#warning Limited user VSID range means pagetable space is wasted
70#endif
71
72#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
73#warning TASK_SIZE is smaller than it needs to be.
74#endif
75
76int mem_init_done;
77unsigned long ioremap_bot = IMALLOC_BASE;
78static unsigned long phbs_io_bot = PHBS_IO_BASE;
79
80extern pgd_t swapper_pg_dir[];
81extern struct task_struct *current_set[NR_CPUS];
82
83unsigned long klimit = (unsigned long)_end;
84
85unsigned long _SDR1=0;
86unsigned long _ASR=0;
87
88/* max amount of RAM to use */
89unsigned long __max_memory;
90
91/* info on what we think the IO hole is */
92unsigned long io_hole_start;
93unsigned long io_hole_size;
94
95void show_mem(void)
96{
97 unsigned long total = 0, reserved = 0;
98 unsigned long shared = 0, cached = 0;
99 struct page *page;
100 pg_data_t *pgdat;
101 unsigned long i;
102
103 printk("Mem-info:\n");
104 show_free_areas();
105 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
106 for_each_pgdat(pgdat) {
107 for (i = 0; i < pgdat->node_spanned_pages; i++) {
108 page = pgdat_page_nr(pgdat, i);
109 total++;
110 if (PageReserved(page))
111 reserved++;
112 else if (PageSwapCache(page))
113 cached++;
114 else if (page_count(page))
115 shared += page_count(page) - 1;
116 }
117 }
118 printk("%ld pages of RAM\n", total);
119 printk("%ld reserved pages\n", reserved);
120 printk("%ld pages shared\n", shared);
121 printk("%ld pages swap cached\n", cached);
122}
123
124#ifdef CONFIG_PPC_ISERIES
125
126void __iomem *ioremap(unsigned long addr, unsigned long size)
127{
128 return (void __iomem *)addr;
129}
130
131extern void __iomem *__ioremap(unsigned long addr, unsigned long size,
132 unsigned long flags)
133{
134 return (void __iomem *)addr;
135}
136
137void iounmap(volatile void __iomem *addr)
138{
139 return;
140}
141
142#else
143
144/*
145 * map_io_page currently only called by __ioremap
146 * map_io_page adds an entry to the ioremap page table
147 * and adds an entry to the HPT, possibly bolting it
148 */
149static int map_io_page(unsigned long ea, unsigned long pa, int flags)
150{
151 pgd_t *pgdp;
152 pud_t *pudp;
153 pmd_t *pmdp;
154 pte_t *ptep;
155 unsigned long vsid;
156
157 if (mem_init_done) {
158 spin_lock(&init_mm.page_table_lock);
159 pgdp = pgd_offset_k(ea);
160 pudp = pud_alloc(&init_mm, pgdp, ea);
161 if (!pudp)
162 return -ENOMEM;
163 pmdp = pmd_alloc(&init_mm, pudp, ea);
164 if (!pmdp)
165 return -ENOMEM;
166 ptep = pte_alloc_kernel(&init_mm, pmdp, ea);
167 if (!ptep)
168 return -ENOMEM;
169 set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
170 __pgprot(flags)));
171 spin_unlock(&init_mm.page_table_lock);
172 } else {
173 unsigned long va, vpn, hash, hpteg;
174
175 /*
176 * If the mm subsystem is not fully up, we cannot create a
177 * linux page table entry for this mapping. Simply bolt an
178 * entry in the hardware page table.
179 */
180 vsid = get_kernel_vsid(ea);
181 va = (vsid << 28) | (ea & 0xFFFFFFF);
182 vpn = va >> PAGE_SHIFT;
183
184 hash = hpt_hash(vpn, 0);
185
186 hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
187
188 /* Panic if a pte grpup is full */
189 if (ppc_md.hpte_insert(hpteg, va, pa >> PAGE_SHIFT,
190 HPTE_V_BOLTED,
191 _PAGE_NO_CACHE|_PAGE_GUARDED|PP_RWXX)
192 == -1) {
193 panic("map_io_page: could not insert mapping");
194 }
195 }
196 return 0;
197}
198
199
200static void __iomem * __ioremap_com(unsigned long addr, unsigned long pa,
201 unsigned long ea, unsigned long size,
202 unsigned long flags)
203{
204 unsigned long i;
205
206 if ((flags & _PAGE_PRESENT) == 0)
207 flags |= pgprot_val(PAGE_KERNEL);
208
209 for (i = 0; i < size; i += PAGE_SIZE)
210 if (map_io_page(ea+i, pa+i, flags))
211 return NULL;
212
213 return (void __iomem *) (ea + (addr & ~PAGE_MASK));
214}
215
216
217void __iomem *
218ioremap(unsigned long addr, unsigned long size)
219{
220 return __ioremap(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED);
221}
222
223void __iomem * __ioremap(unsigned long addr, unsigned long size,
224 unsigned long flags)
225{
226 unsigned long pa, ea;
227 void __iomem *ret;
228
229 /*
230 * Choose an address to map it to.
231 * Once the imalloc system is running, we use it.
232 * Before that, we map using addresses going
233 * up from ioremap_bot. imalloc will use
234 * the addresses from ioremap_bot through
235 * IMALLOC_END
236 *
237 */
238 pa = addr & PAGE_MASK;
239 size = PAGE_ALIGN(addr + size) - pa;
240
241 if (size == 0)
242 return NULL;
243
244 if (mem_init_done) {
245 struct vm_struct *area;
246 area = im_get_free_area(size);
247 if (area == NULL)
248 return NULL;
249 ea = (unsigned long)(area->addr);
250 ret = __ioremap_com(addr, pa, ea, size, flags);
251 if (!ret)
252 im_free(area->addr);
253 } else {
254 ea = ioremap_bot;
255 ret = __ioremap_com(addr, pa, ea, size, flags);
256 if (ret)
257 ioremap_bot += size;
258 }
259 return ret;
260}
261
262#define IS_PAGE_ALIGNED(_val) ((_val) == ((_val) & PAGE_MASK))
263
264int __ioremap_explicit(unsigned long pa, unsigned long ea,
265 unsigned long size, unsigned long flags)
266{
267 struct vm_struct *area;
268 void __iomem *ret;
269
270 /* For now, require page-aligned values for pa, ea, and size */
271 if (!IS_PAGE_ALIGNED(pa) || !IS_PAGE_ALIGNED(ea) ||
272 !IS_PAGE_ALIGNED(size)) {
273 printk(KERN_ERR "unaligned value in %s\n", __FUNCTION__);
274 return 1;
275 }
276
277 if (!mem_init_done) {
278 /* Two things to consider in this case:
279 * 1) No records will be kept (imalloc, etc) that the region
280 * has been remapped
281 * 2) It won't be easy to iounmap() the region later (because
282 * of 1)
283 */
284 ;
285 } else {
286 area = im_get_area(ea, size,
287 IM_REGION_UNUSED|IM_REGION_SUBSET|IM_REGION_EXISTS);
288 if (area == NULL) {
289 /* Expected when PHB-dlpar is in play */
290 return 1;
291 }
292 if (ea != (unsigned long) area->addr) {
293 printk(KERN_ERR "unexpected addr return from "
294 "im_get_area\n");
295 return 1;
296 }
297 }
298
299 ret = __ioremap_com(pa, pa, ea, size, flags);
300 if (ret == NULL) {
301 printk(KERN_ERR "ioremap_explicit() allocation failure !\n");
302 return 1;
303 }
304 if (ret != (void *) ea) {
305 printk(KERN_ERR "__ioremap_com() returned unexpected addr\n");
306 return 1;
307 }
308
309 return 0;
310}
311
312/*
313 * Unmap an IO region and remove it from imalloc'd list.
314 * Access to IO memory should be serialized by driver.
315 * This code is modeled after vmalloc code - unmap_vm_area()
316 *
317 * XXX what about calls before mem_init_done (ie python_countermeasures())
318 */
319void iounmap(volatile void __iomem *token)
320{
321 void *addr;
322
323 if (!mem_init_done)
324 return;
325
326 addr = (void *) ((unsigned long __force) token & PAGE_MASK);
327
328 im_free(addr);
329}
330
331static int iounmap_subset_regions(unsigned long addr, unsigned long size)
332{
333 struct vm_struct *area;
334
335 /* Check whether subsets of this region exist */
336 area = im_get_area(addr, size, IM_REGION_SUPERSET);
337 if (area == NULL)
338 return 1;
339
340 while (area) {
341 iounmap((void __iomem *) area->addr);
342 area = im_get_area(addr, size,
343 IM_REGION_SUPERSET);
344 }
345
346 return 0;
347}
348
349int iounmap_explicit(volatile void __iomem *start, unsigned long size)
350{
351 struct vm_struct *area;
352 unsigned long addr;
353 int rc;
354
355 addr = (unsigned long __force) start & PAGE_MASK;
356
357 /* Verify that the region either exists or is a subset of an existing
358 * region. In the latter case, split the parent region to create
359 * the exact region
360 */
361 area = im_get_area(addr, size,
362 IM_REGION_EXISTS | IM_REGION_SUBSET);
363 if (area == NULL) {
364 /* Determine whether subset regions exist. If so, unmap */
365 rc = iounmap_subset_regions(addr, size);
366 if (rc) {
367 printk(KERN_ERR
368 "%s() cannot unmap nonexistent range 0x%lx\n",
369 __FUNCTION__, addr);
370 return 1;
371 }
372 } else {
373 iounmap((void __iomem *) area->addr);
374 }
375 /*
376 * FIXME! This can't be right:
377 iounmap(area->addr);
378 * Maybe it should be "iounmap(area);"
379 */
380 return 0;
381}
382
383#endif
384
385EXPORT_SYMBOL(ioremap);
386EXPORT_SYMBOL(__ioremap);
387EXPORT_SYMBOL(iounmap);
388
389void free_initmem(void)
390{
391 unsigned long addr;
392
393 addr = (unsigned long)__init_begin;
394 for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
395 memset((void *)addr, 0xcc, PAGE_SIZE);
396 ClearPageReserved(virt_to_page(addr));
397 set_page_count(virt_to_page(addr), 1);
398 free_page(addr);
399 totalram_pages++;
400 }
401 printk ("Freeing unused kernel memory: %luk freed\n",
402 ((unsigned long)__init_end - (unsigned long)__init_begin) >> 10);
403}
404
405#ifdef CONFIG_BLK_DEV_INITRD
406void free_initrd_mem(unsigned long start, unsigned long end)
407{
408 if (start < end)
409 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
410 for (; start < end; start += PAGE_SIZE) {
411 ClearPageReserved(virt_to_page(start));
412 set_page_count(virt_to_page(start), 1);
413 free_page(start);
414 totalram_pages++;
415 }
416}
417#endif
418
419static DEFINE_SPINLOCK(mmu_context_lock);
420static DEFINE_IDR(mmu_context_idr);
421
422int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
423{
424 int index;
425 int err;
426
427again:
428 if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
429 return -ENOMEM;
430
431 spin_lock(&mmu_context_lock);
432 err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index);
433 spin_unlock(&mmu_context_lock);
434
435 if (err == -EAGAIN)
436 goto again;
437 else if (err)
438 return err;
439
440 if (index > MAX_CONTEXT) {
441 idr_remove(&mmu_context_idr, index);
442 return -ENOMEM;
443 }
444
445 mm->context.id = index;
446
447 return 0;
448}
449
450void destroy_context(struct mm_struct *mm)
451{
452 spin_lock(&mmu_context_lock);
453 idr_remove(&mmu_context_idr, mm->context.id);
454 spin_unlock(&mmu_context_lock);
455
456 mm->context.id = NO_CONTEXT;
457}
458
459/*
460 * Do very early mm setup.
461 */
462void __init mm_init_ppc64(void)
463{
464#ifndef CONFIG_PPC_ISERIES
465 unsigned long i;
466#endif
467
468 ppc64_boot_msg(0x100, "MM Init");
469
470 /* This is the story of the IO hole... please, keep seated,
471 * unfortunately, we are out of oxygen masks at the moment.
472 * So we need some rough way to tell where your big IO hole
473 * is. On pmac, it's between 2G and 4G, on POWER3, it's around
474 * that area as well, on POWER4 we don't have one, etc...
475 * We need that as a "hint" when sizing the TCE table on POWER3
476 * So far, the simplest way that seem work well enough for us it
477 * to just assume that the first discontinuity in our physical
478 * RAM layout is the IO hole. That may not be correct in the future
479 * (and isn't on iSeries but then we don't care ;)
480 */
481
482#ifndef CONFIG_PPC_ISERIES
483 for (i = 1; i < lmb.memory.cnt; i++) {
484 unsigned long base, prevbase, prevsize;
485
486 prevbase = lmb.memory.region[i-1].base;
487 prevsize = lmb.memory.region[i-1].size;
488 base = lmb.memory.region[i].base;
489 if (base > (prevbase + prevsize)) {
490 io_hole_start = prevbase + prevsize;
491 io_hole_size = base - (prevbase + prevsize);
492 break;
493 }
494 }
495#endif /* CONFIG_PPC_ISERIES */
496 if (io_hole_start)
497 printk("IO Hole assumed to be %lx -> %lx\n",
498 io_hole_start, io_hole_start + io_hole_size - 1);
499
500 ppc64_boot_msg(0x100, "MM Init Done");
501}
502
503/*
504 * This is called by /dev/mem to know if a given address has to
505 * be mapped non-cacheable or not
506 */
507int page_is_ram(unsigned long pfn)
508{
509 int i;
510 unsigned long paddr = (pfn << PAGE_SHIFT);
511
512 for (i=0; i < lmb.memory.cnt; i++) {
513 unsigned long base;
514
515 base = lmb.memory.region[i].base;
516
517 if ((paddr >= base) &&
518 (paddr < (base + lmb.memory.region[i].size))) {
519 return 1;
520 }
521 }
522
523 return 0;
524}
525EXPORT_SYMBOL(page_is_ram);
526
527/*
528 * Initialize the bootmem system and give it all the memory we
529 * have available.
530 */
531#ifndef CONFIG_NEED_MULTIPLE_NODES
532void __init do_init_bootmem(void)
533{
534 unsigned long i;
535 unsigned long start, bootmap_pages;
536 unsigned long total_pages = lmb_end_of_DRAM() >> PAGE_SHIFT;
537 int boot_mapsize;
538
539 /*
540 * Find an area to use for the bootmem bitmap. Calculate the size of
541 * bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE.
542 * Add 1 additional page in case the address isn't page-aligned.
543 */
544 bootmap_pages = bootmem_bootmap_pages(total_pages);
545
546 start = lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
547 BUG_ON(!start);
548
549 boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages);
550
551 max_pfn = max_low_pfn;
552
553 /* Add all physical memory to the bootmem map, mark each area
554 * present.
555 */
556 for (i=0; i < lmb.memory.cnt; i++)
557 free_bootmem(lmb.memory.region[i].base,
558 lmb_size_bytes(&lmb.memory, i));
559
560 /* reserve the sections we're already using */
561 for (i=0; i < lmb.reserved.cnt; i++)
562 reserve_bootmem(lmb.reserved.region[i].base,
563 lmb_size_bytes(&lmb.reserved, i));
564
565 for (i=0; i < lmb.memory.cnt; i++)
566 memory_present(0, lmb_start_pfn(&lmb.memory, i),
567 lmb_end_pfn(&lmb.memory, i));
568}
569
570/*
571 * paging_init() sets up the page tables - in fact we've already done this.
572 */
573void __init paging_init(void)
574{
575 unsigned long zones_size[MAX_NR_ZONES];
576 unsigned long zholes_size[MAX_NR_ZONES];
577 unsigned long total_ram = lmb_phys_mem_size();
578 unsigned long top_of_ram = lmb_end_of_DRAM();
579
580 printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
581 top_of_ram, total_ram);
582 printk(KERN_INFO "Memory hole size: %ldMB\n",
583 (top_of_ram - total_ram) >> 20);
584 /*
585 * All pages are DMA-able so we put them all in the DMA zone.
586 */
587 memset(zones_size, 0, sizeof(zones_size));
588 memset(zholes_size, 0, sizeof(zholes_size));
589
590 zones_size[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
591 zholes_size[ZONE_DMA] = (top_of_ram - total_ram) >> PAGE_SHIFT;
592
593 free_area_init_node(0, NODE_DATA(0), zones_size,
594 __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size);
595}
596#endif /* ! CONFIG_NEED_MULTIPLE_NODES */
597
598static struct kcore_list kcore_vmem;
599
600static int __init setup_kcore(void)
601{
602 int i;
603
604 for (i=0; i < lmb.memory.cnt; i++) {
605 unsigned long base, size;
606 struct kcore_list *kcore_mem;
607
608 base = lmb.memory.region[i].base;
609 size = lmb.memory.region[i].size;
610
611 /* GFP_ATOMIC to avoid might_sleep warnings during boot */
612 kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC);
613 if (!kcore_mem)
614 panic("mem_init: kmalloc failed\n");
615
616 kclist_add(kcore_mem, __va(base), size);
617 }
618
619 kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
620
621 return 0;
622}
623module_init(setup_kcore);
624
625void __init mem_init(void)
626{
627#ifdef CONFIG_NEED_MULTIPLE_NODES
628 int nid;
629#endif
630 pg_data_t *pgdat;
631 unsigned long i;
632 struct page *page;
633 unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
634
635 num_physpages = max_low_pfn; /* RAM is assumed contiguous */
636 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
637
638#ifdef CONFIG_NEED_MULTIPLE_NODES
639 for_each_online_node(nid) {
640 if (NODE_DATA(nid)->node_spanned_pages != 0) {
641 printk("freeing bootmem node %x\n", nid);
642 totalram_pages +=
643 free_all_bootmem_node(NODE_DATA(nid));
644 }
645 }
646#else
647 max_mapnr = num_physpages;
648 totalram_pages += free_all_bootmem();
649#endif
650
651 for_each_pgdat(pgdat) {
652 for (i = 0; i < pgdat->node_spanned_pages; i++) {
653 page = pgdat_page_nr(pgdat, i);
654 if (PageReserved(page))
655 reservedpages++;
656 }
657 }
658
659 codesize = (unsigned long)&_etext - (unsigned long)&_stext;
660 initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin;
661 datasize = (unsigned long)&_edata - (unsigned long)&__init_end;
662 bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start;
663
664 printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, "
665 "%luk reserved, %luk data, %luk bss, %luk init)\n",
666 (unsigned long)nr_free_pages() << (PAGE_SHIFT-10),
667 num_physpages << (PAGE_SHIFT-10),
668 codesize >> 10,
669 reservedpages << (PAGE_SHIFT-10),
670 datasize >> 10,
671 bsssize >> 10,
672 initsize >> 10);
673
674 mem_init_done = 1;
675
676 /* Initialize the vDSO */
677 vdso_init();
678}
679
680/*
681 * This is called when a page has been modified by the kernel.
682 * It just marks the page as not i-cache clean. We do the i-cache
683 * flush later when the page is given to a user process, if necessary.
684 */
685void flush_dcache_page(struct page *page)
686{
687 if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
688 return;
689 /* avoid an atomic op if possible */
690 if (test_bit(PG_arch_1, &page->flags))
691 clear_bit(PG_arch_1, &page->flags);
692}
693EXPORT_SYMBOL(flush_dcache_page);
694
695void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
696{
697 clear_page(page);
698
699 if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
700 return;
701 /*
702 * We shouldnt have to do this, but some versions of glibc
703 * require it (ld.so assumes zero filled pages are icache clean)
704 * - Anton
705 */
706
707 /* avoid an atomic op if possible */
708 if (test_bit(PG_arch_1, &pg->flags))
709 clear_bit(PG_arch_1, &pg->flags);
710}
711EXPORT_SYMBOL(clear_user_page);
712
713void copy_user_page(void *vto, void *vfrom, unsigned long vaddr,
714 struct page *pg)
715{
716 copy_page(vto, vfrom);
717
718 /*
719 * We should be able to use the following optimisation, however
720 * there are two problems.
721 * Firstly a bug in some versions of binutils meant PLT sections
722 * were not marked executable.
723 * Secondly the first word in the GOT section is blrl, used
724 * to establish the GOT address. Until recently the GOT was
725 * not marked executable.
726 * - Anton
727 */
728#if 0
729 if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0))
730 return;
731#endif
732
733 if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
734 return;
735
736 /* avoid an atomic op if possible */
737 if (test_bit(PG_arch_1, &pg->flags))
738 clear_bit(PG_arch_1, &pg->flags);
739}
740
741void flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
742 unsigned long addr, int len)
743{
744 unsigned long maddr;
745
746 maddr = (unsigned long)page_address(page) + (addr & ~PAGE_MASK);
747 flush_icache_range(maddr, maddr + len);
748}
749EXPORT_SYMBOL(flush_icache_user_range);
750
751/*
752 * This is called at the end of handling a user page fault, when the
753 * fault has been handled by updating a PTE in the linux page tables.
754 * We use it to preload an HPTE into the hash table corresponding to
755 * the updated linux PTE.
756 *
757 * This must always be called with the mm->page_table_lock held
758 */
759void update_mmu_cache(struct vm_area_struct *vma, unsigned long ea,
760 pte_t pte)
761{
762 unsigned long vsid;
763 void *pgdir;
764 pte_t *ptep;
765 int local = 0;
766 cpumask_t tmp;
767 unsigned long flags;
768
769 /* handle i-cache coherency */
770 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) &&
771 !cpu_has_feature(CPU_FTR_NOEXECUTE)) {
772 unsigned long pfn = pte_pfn(pte);
773 if (pfn_valid(pfn)) {
774 struct page *page = pfn_to_page(pfn);
775 if (!PageReserved(page)
776 && !test_bit(PG_arch_1, &page->flags)) {
777 __flush_dcache_icache(page_address(page));
778 set_bit(PG_arch_1, &page->flags);
779 }
780 }
781 }
782
783 /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
784 if (!pte_young(pte))
785 return;
786
787 pgdir = vma->vm_mm->pgd;
788 if (pgdir == NULL)
789 return;
790
791 ptep = find_linux_pte(pgdir, ea);
792 if (!ptep)
793 return;
794
795 vsid = get_vsid(vma->vm_mm->context.id, ea);
796
797 local_irq_save(flags);
798 tmp = cpumask_of_cpu(smp_processor_id());
799 if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp))
800 local = 1;
801
802 __hash_page(ea, pte_val(pte) & (_PAGE_USER|_PAGE_RW), vsid, ptep,
803 0x300, local);
804 local_irq_restore(flags);
805}
806
807void __iomem * reserve_phb_iospace(unsigned long size)
808{
809 void __iomem *virt_addr;
810
811 if (phbs_io_bot >= IMALLOC_BASE)
812 panic("reserve_phb_iospace(): phb io space overflow\n");
813
814 virt_addr = (void __iomem *) phbs_io_bot;
815 phbs_io_bot += size;
816
817 return virt_addr;
818}
819
820static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
821{
822 memset(addr, 0, kmem_cache_size(cache));
823}
824
825static const int pgtable_cache_size[2] = {
826 PTE_TABLE_SIZE, PMD_TABLE_SIZE
827};
828static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
829 "pgd_pte_cache", "pud_pmd_cache",
830};
831
832kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
833
834void pgtable_cache_init(void)
835{
836 int i;
837
838 BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]);
839 BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]);
840 BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]);
841 BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]);
842
843 for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
844 int size = pgtable_cache_size[i];
845 const char *name = pgtable_cache_name[i];
846
847 pgtable_cache[i] = kmem_cache_create(name,
848 size, size,
849 SLAB_HWCACHE_ALIGN
850 | SLAB_MUST_HWCACHE_ALIGN,
851 zero_ctor,
852 NULL);
853 if (! pgtable_cache[i])
854 panic("pgtable_cache_init(): could not create %s!\n",
855 name);
856 }
857}
858
859pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
860 unsigned long size, pgprot_t vma_prot)
861{
862 if (ppc_md.phys_mem_access_prot)
863 return ppc_md.phys_mem_access_prot(file, addr, size, vma_prot);
864
865 if (!page_is_ram(addr >> PAGE_SHIFT))
866 vma_prot = __pgprot(pgprot_val(vma_prot)
867 | _PAGE_GUARDED | _PAGE_NO_CACHE);
868 return vma_prot;
869}
870EXPORT_SYMBOL(phys_mem_access_prot);
diff --git a/arch/ppc64/mm/mmap.c b/arch/ppc64/mm/mmap.c
deleted file mode 100644
index fe65f522aff..00000000000
--- a/arch/ppc64/mm/mmap.c
+++ /dev/null
@@ -1,86 +0,0 @@
1/*
2 * linux/arch/ppc64/mm/mmap.c
3 *
4 * flexible mmap layout support
5 *
6 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
7 * All Rights Reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 *
23 *
24 * Started by Ingo Molnar <mingo@elte.hu>
25 */
26
27#include <linux/personality.h>
28#include <linux/mm.h>
29
30/*
31 * Top of mmap area (just below the process stack).
32 *
33 * Leave an at least ~128 MB hole.
34 */
35#define MIN_GAP (128*1024*1024)
36#define MAX_GAP (TASK_SIZE/6*5)
37
38static inline unsigned long mmap_base(void)
39{
40 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
41
42 if (gap < MIN_GAP)
43 gap = MIN_GAP;
44 else if (gap > MAX_GAP)
45 gap = MAX_GAP;
46
47 return TASK_SIZE - (gap & PAGE_MASK);
48}
49
50static inline int mmap_is_legacy(void)
51{
52 /*
53 * Force standard allocation for 64 bit programs.
54 */
55 if (!test_thread_flag(TIF_32BIT))
56 return 1;
57
58 if (current->personality & ADDR_COMPAT_LAYOUT)
59 return 1;
60
61 if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
62 return 1;
63
64 return sysctl_legacy_va_layout;
65}
66
67/*
68 * This function, called very early during the creation of a new
69 * process VM image, sets up which VM layout function to use:
70 */
71void arch_pick_mmap_layout(struct mm_struct *mm)
72{
73 /*
74 * Fall back to the standard layout if the personality
75 * bit is set, or if the expected stack growth is unlimited:
76 */
77 if (mmap_is_legacy()) {
78 mm->mmap_base = TASK_UNMAPPED_BASE;
79 mm->get_unmapped_area = arch_get_unmapped_area;
80 mm->unmap_area = arch_unmap_area;
81 } else {
82 mm->mmap_base = mmap_base();
83 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
84 mm->unmap_area = arch_unmap_area_topdown;
85 }
86}
diff --git a/arch/ppc64/mm/numa.c b/arch/ppc64/mm/numa.c
deleted file mode 100644
index cb864b8f275..00000000000
--- a/arch/ppc64/mm/numa.c
+++ /dev/null
@@ -1,779 +0,0 @@
1/*
2 * pSeries NUMA support
3 *
4 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/threads.h>
12#include <linux/bootmem.h>
13#include <linux/init.h>
14#include <linux/mm.h>
15#include <linux/mmzone.h>
16#include <linux/module.h>
17#include <linux/nodemask.h>
18#include <linux/cpu.h>
19#include <linux/notifier.h>
20#include <asm/lmb.h>
21#include <asm/machdep.h>
22#include <asm/abs_addr.h>
23
24static int numa_enabled = 1;
25
26static int numa_debug;
27#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
28
29#ifdef DEBUG_NUMA
30#define ARRAY_INITIALISER -1
31#else
32#define ARRAY_INITIALISER 0
33#endif
34
35int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] =
36 ARRAY_INITIALISER};
37char *numa_memory_lookup_table;
38cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
39int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0};
40
41struct pglist_data *node_data[MAX_NUMNODES];
42bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
43static int min_common_depth;
44
45/*
46 * We need somewhere to store start/span for each node until we have
47 * allocated the real node_data structures.
48 */
49static struct {
50 unsigned long node_start_pfn;
51 unsigned long node_end_pfn;
52 unsigned long node_present_pages;
53} init_node_data[MAX_NUMNODES] __initdata;
54
55EXPORT_SYMBOL(node_data);
56EXPORT_SYMBOL(numa_cpu_lookup_table);
57EXPORT_SYMBOL(numa_memory_lookup_table);
58EXPORT_SYMBOL(numa_cpumask_lookup_table);
59EXPORT_SYMBOL(nr_cpus_in_node);
60
61static inline void map_cpu_to_node(int cpu, int node)
62{
63 numa_cpu_lookup_table[cpu] = node;
64 if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) {
65 cpu_set(cpu, numa_cpumask_lookup_table[node]);
66 nr_cpus_in_node[node]++;
67 }
68}
69
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Drop @cpu from the mask and count of the node it is currently mapped to.
 * The cpu -> node table entry itself is left untouched.  A cpu that is not
 * in its node's mask is reported and otherwise ignored.
 */
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (!cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
		return;
	}

	cpu_clear(cpu, numa_cpumask_lookup_table[node]);
	nr_cpus_in_node[node]--;
}
#endif /* CONFIG_HOTPLUG_CPU */
86
87static struct device_node * __devinit find_cpu_node(unsigned int cpu)
88{
89 unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
90 struct device_node *cpu_node = NULL;
91 unsigned int *interrupt_server, *reg;
92 int len;
93
94 while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
95 /* Try interrupt server first */
96 interrupt_server = (unsigned int *)get_property(cpu_node,
97 "ibm,ppc-interrupt-server#s", &len);
98
99 len = len / sizeof(u32);
100
101 if (interrupt_server && (len > 0)) {
102 while (len--) {
103 if (interrupt_server[len] == hw_cpuid)
104 return cpu_node;
105 }
106 } else {
107 reg = (unsigned int *)get_property(cpu_node,
108 "reg", &len);
109 if (reg && (len > 0) && (reg[0] == hw_cpuid))
110 return cpu_node;
111 }
112 }
113
114 return NULL;
115}
116
117/* must hold reference to node during call */
118static int *of_get_associativity(struct device_node *dev)
119{
120 return (unsigned int *)get_property(dev, "ibm,associativity", NULL);
121}
122
123static int of_node_numa_domain(struct device_node *device)
124{
125 int numa_domain;
126 unsigned int *tmp;
127
128 if (min_common_depth == -1)
129 return 0;
130
131 tmp = of_get_associativity(device);
132 if (tmp && (tmp[0] >= min_common_depth)) {
133 numa_domain = tmp[min_common_depth];
134 } else {
135 dbg("WARNING: no NUMA information for %s\n",
136 device->full_name);
137 numa_domain = 0;
138 }
139 return numa_domain;
140}
141
142/*
143 * In theory, the "ibm,associativity" property may contain multiple
144 * associativity lists because a resource may be multiply connected
145 * into the machine. This resource then has different associativity
146 * characteristics relative to its multiple connections. We ignore
147 * this for now. We also assume that all cpu and memory sets have
148 * their distances represented at a common level. This won't be
149 * true for hierarchical NUMA.
150 *
151 * In any case the ibm,associativity-reference-points should give
152 * the correct depth for a normal NUMA system.
153 *
154 * - Dave Hansen <haveblue@us.ibm.com>
155 */
156static int __init find_min_common_depth(void)
157{
158 int depth;
159 unsigned int *ref_points;
160 struct device_node *rtas_root;
161 unsigned int len;
162
163 rtas_root = of_find_node_by_path("/rtas");
164
165 if (!rtas_root)
166 return -1;
167
168 /*
169 * this property is 2 32-bit integers, each representing a level of
170 * depth in the associativity nodes. The first is for an SMP
171 * configuration (should be all 0's) and the second is for a normal
172 * NUMA configuration.
173 */
174 ref_points = (unsigned int *)get_property(rtas_root,
175 "ibm,associativity-reference-points", &len);
176
177 if ((len >= 1) && ref_points) {
178 depth = ref_points[1];
179 } else {
180 dbg("WARNING: could not find NUMA "
181 "associativity reference point\n");
182 depth = -1;
183 }
184 of_node_put(rtas_root);
185
186 return depth;
187}
188
189static int __init get_mem_addr_cells(void)
190{
191 struct device_node *memory = NULL;
192 int rc;
193
194 memory = of_find_node_by_type(memory, "memory");
195 if (!memory)
196 return 0; /* it won't matter */
197
198 rc = prom_n_addr_cells(memory);
199 return rc;
200}
201
202static int __init get_mem_size_cells(void)
203{
204 struct device_node *memory = NULL;
205 int rc;
206
207 memory = of_find_node_by_type(memory, "memory");
208 if (!memory)
209 return 0; /* it won't matter */
210 rc = prom_n_size_cells(memory);
211 return rc;
212}
213
/*
 * Assemble @n consecutive 32-bit cells, most significant first, into one
 * unsigned long and advance *@buf past the cells consumed.
 * With @n == 0 nothing is read and 0 is returned.
 */
static unsigned long read_n_cells(int n, unsigned int **buf)
{
	unsigned int *cursor = *buf;
	unsigned long value = 0;

	for (; n != 0; n--)
		value = (value << 32) | *cursor++;

	*buf = cursor;
	return value;
}
224
225/*
226 * Figure out to which domain a cpu belongs and stick it there.
227 * Return the id of the domain used.
228 */
229static int numa_setup_cpu(unsigned long lcpu)
230{
231 int numa_domain = 0;
232 struct device_node *cpu = find_cpu_node(lcpu);
233
234 if (!cpu) {
235 WARN_ON(1);
236 goto out;
237 }
238
239 numa_domain = of_node_numa_domain(cpu);
240
241 if (numa_domain >= num_online_nodes()) {
242 /*
243 * POWER4 LPAR uses 0xffff as invalid node,
244 * dont warn in this case.
245 */
246 if (numa_domain != 0xffff)
247 printk(KERN_ERR "WARNING: cpu %ld "
248 "maps to invalid NUMA node %d\n",
249 lcpu, numa_domain);
250 numa_domain = 0;
251 }
252out:
253 node_set_online(numa_domain);
254
255 map_cpu_to_node(lcpu, numa_domain);
256
257 of_node_put(cpu);
258
259 return numa_domain;
260}
261
262static int cpu_numa_callback(struct notifier_block *nfb,
263 unsigned long action,
264 void *hcpu)
265{
266 unsigned long lcpu = (unsigned long)hcpu;
267 int ret = NOTIFY_DONE;
268
269 switch (action) {
270 case CPU_UP_PREPARE:
271 if (min_common_depth == -1 || !numa_enabled)
272 map_cpu_to_node(lcpu, 0);
273 else
274 numa_setup_cpu(lcpu);
275 ret = NOTIFY_OK;
276 break;
277#ifdef CONFIG_HOTPLUG_CPU
278 case CPU_DEAD:
279 case CPU_UP_CANCELED:
280 unmap_cpu_from_node(lcpu);
281 break;
282 ret = NOTIFY_OK;
283#endif
284 }
285 return ret;
286}
287
288/*
289 * Check and possibly modify a memory region to enforce the memory limit.
290 *
291 * Returns the size the region should have to enforce the memory limit.
292 * This will either be the original value of size, a truncated value,
293 * or zero. If the returned value of size is 0 the region should be
294 * discarded as it lies wholly above the memory limit.
295 */
296static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size)
297{
298 /*
299 * We use lmb_end_of_DRAM() in here instead of memory_limit because
300 * we've already adjusted it for the limit and it takes care of
301 * having memory holes below the limit.
302 */
303 extern unsigned long memory_limit;
304
305 if (! memory_limit)
306 return size;
307
308 if (start + size <= lmb_end_of_DRAM())
309 return size;
310
311 if (start >= lmb_end_of_DRAM())
312 return 0;
313
314 return lmb_end_of_DRAM() - start;
315}
316
317static int __init parse_numa_properties(void)
318{
319 struct device_node *cpu = NULL;
320 struct device_node *memory = NULL;
321 int addr_cells, size_cells;
322 int max_domain = 0;
323 long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
324 unsigned long i;
325
326 if (numa_enabled == 0) {
327 printk(KERN_WARNING "NUMA disabled by user\n");
328 return -1;
329 }
330
331 numa_memory_lookup_table =
332 (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
333 memset(numa_memory_lookup_table, 0, entries * sizeof(char));
334
335 for (i = 0; i < entries ; i++)
336 numa_memory_lookup_table[i] = ARRAY_INITIALISER;
337
338 min_common_depth = find_min_common_depth();
339
340 dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
341 if (min_common_depth < 0)
342 return min_common_depth;
343
344 max_domain = numa_setup_cpu(boot_cpuid);
345
346 /*
347 * Even though we connect cpus to numa domains later in SMP init,
348 * we need to know the maximum node id now. This is because each
349 * node id must have NODE_DATA etc backing it.
350 * As a result of hotplug we could still have cpus appear later on
351 * with larger node ids. In that case we force the cpu into node 0.
352 */
353 for_each_cpu(i) {
354 int numa_domain;
355
356 cpu = find_cpu_node(i);
357
358 if (cpu) {
359 numa_domain = of_node_numa_domain(cpu);
360 of_node_put(cpu);
361
362 if (numa_domain < MAX_NUMNODES &&
363 max_domain < numa_domain)
364 max_domain = numa_domain;
365 }
366 }
367
368 addr_cells = get_mem_addr_cells();
369 size_cells = get_mem_size_cells();
370 memory = NULL;
371 while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
372 unsigned long start;
373 unsigned long size;
374 int numa_domain;
375 int ranges;
376 unsigned int *memcell_buf;
377 unsigned int len;
378
379 memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
380 if (!memcell_buf || len <= 0)
381 continue;
382
383 ranges = memory->n_addrs;
384new_range:
385 /* these are order-sensitive, and modify the buffer pointer */
386 start = read_n_cells(addr_cells, &memcell_buf);
387 size = read_n_cells(size_cells, &memcell_buf);
388
389 start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
390 size = _ALIGN_UP(size, MEMORY_INCREMENT);
391
392 numa_domain = of_node_numa_domain(memory);
393
394 if (numa_domain >= MAX_NUMNODES) {
395 if (numa_domain != 0xffff)
396 printk(KERN_ERR "WARNING: memory at %lx maps "
397 "to invalid NUMA node %d\n", start,
398 numa_domain);
399 numa_domain = 0;
400 }
401
402 if (max_domain < numa_domain)
403 max_domain = numa_domain;
404
405 if (! (size = numa_enforce_memory_limit(start, size))) {
406 if (--ranges)
407 goto new_range;
408 else
409 continue;
410 }
411
412 /*
413 * Initialize new node struct, or add to an existing one.
414 */
415 if (init_node_data[numa_domain].node_end_pfn) {
416 if ((start / PAGE_SIZE) <
417 init_node_data[numa_domain].node_start_pfn)
418 init_node_data[numa_domain].node_start_pfn =
419 start / PAGE_SIZE;
420 if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) >
421 init_node_data[numa_domain].node_end_pfn)
422 init_node_data[numa_domain].node_end_pfn =
423 (start / PAGE_SIZE) +
424 (size / PAGE_SIZE);
425
426 init_node_data[numa_domain].node_present_pages +=
427 size / PAGE_SIZE;
428 } else {
429 node_set_online(numa_domain);
430
431 init_node_data[numa_domain].node_start_pfn =
432 start / PAGE_SIZE;
433 init_node_data[numa_domain].node_end_pfn =
434 init_node_data[numa_domain].node_start_pfn +
435 size / PAGE_SIZE;
436 init_node_data[numa_domain].node_present_pages =
437 size / PAGE_SIZE;
438 }
439
440 for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
441 numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
442 numa_domain;
443
444 if (--ranges)
445 goto new_range;
446 }
447
448 for (i = 0; i <= max_domain; i++)
449 node_set_online(i);
450
451 return 0;
452}
453
454static void __init setup_nonnuma(void)
455{
456 unsigned long top_of_ram = lmb_end_of_DRAM();
457 unsigned long total_ram = lmb_phys_mem_size();
458 unsigned long i;
459
460 printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
461 top_of_ram, total_ram);
462 printk(KERN_INFO "Memory hole size: %ldMB\n",
463 (top_of_ram - total_ram) >> 20);
464
465 if (!numa_memory_lookup_table) {
466 long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
467 numa_memory_lookup_table =
468 (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
469 memset(numa_memory_lookup_table, 0, entries * sizeof(char));
470 for (i = 0; i < entries ; i++)
471 numa_memory_lookup_table[i] = ARRAY_INITIALISER;
472 }
473
474 map_cpu_to_node(boot_cpuid, 0);
475
476 node_set_online(0);
477
478 init_node_data[0].node_start_pfn = 0;
479 init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE;
480 init_node_data[0].node_present_pages = total_ram / PAGE_SIZE;
481
482 for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT)
483 numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
484}
485
486static void __init dump_numa_topology(void)
487{
488 unsigned int node;
489 unsigned int count;
490
491 if (min_common_depth == -1 || !numa_enabled)
492 return;
493
494 for_each_online_node(node) {
495 unsigned long i;
496
497 printk(KERN_INFO "Node %d Memory:", node);
498
499 count = 0;
500
501 for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) {
502 if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) {
503 if (count == 0)
504 printk(" 0x%lx", i);
505 ++count;
506 } else {
507 if (count > 0)
508 printk("-0x%lx", i);
509 count = 0;
510 }
511 }
512
513 if (count > 0)
514 printk("-0x%lx", i);
515 printk("\n");
516 }
517 return;
518}
519
520/*
521 * Allocate some memory, satisfying the lmb or bootmem allocator where
522 * required. nid is the preferred node and end is the physical address of
523 * the highest address in the node.
524 *
525 * Returns the physical address of the memory.
526 */
/*
 * Allocate @size bytes aligned to @align, preferably below @end (the top
 * of node @nid), falling back to anywhere in DRAM.  Panics when nothing
 * can be allocated.
 *
 * When the LMB allocator hands back memory that belongs to a node with a
 * lower id than @nid, that memory is given up in favour of an allocation
 * from that node's bootmem allocator.
 *
 * Returns the physical (absolute) address of the memory.
 */
static unsigned long careful_allocation(int nid, unsigned long size,
				       unsigned long align, unsigned long end)
{
	unsigned long paddr;
	int owner;

	paddr = lmb_alloc_base(size, align, end);

	/* retry over all memory */
	if (!paddr)
		paddr = lmb_alloc_base(size, align, lmb_end_of_DRAM());

	if (!paddr)
		panic("numa.c: cannot allocate %lu bytes on node %d",
		      size, nid);

	owner = pa_to_nid(paddr);
	if (owner >= nid)
		return paddr;

	/*
	 * If the memory came from a previously allocated node, we must
	 * retry with the bootmem allocator.
	 */
	nid = owner;
	paddr = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid),
						    size, align, 0);
	if (!paddr)
		panic("numa.c: cannot allocate %lu bytes on node %d",
		      size, nid);

	paddr = virt_to_abs(paddr);

	dbg("alloc_bootmem %lx %lx\n", paddr, size);

	return paddr;
}
560
561void __init do_init_bootmem(void)
562{
563 int nid;
564 int addr_cells, size_cells;
565 struct device_node *memory = NULL;
566 static struct notifier_block ppc64_numa_nb = {
567 .notifier_call = cpu_numa_callback,
568 .priority = 1 /* Must run before sched domains notifier. */
569 };
570
571 min_low_pfn = 0;
572 max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
573 max_pfn = max_low_pfn;
574
575 if (parse_numa_properties())
576 setup_nonnuma();
577 else
578 dump_numa_topology();
579
580 register_cpu_notifier(&ppc64_numa_nb);
581
582 for_each_online_node(nid) {
583 unsigned long start_paddr, end_paddr;
584 int i;
585 unsigned long bootmem_paddr;
586 unsigned long bootmap_pages;
587
588 start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE;
589 end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE;
590
591 /* Allocate the node structure node local if possible */
592 NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid,
593 sizeof(struct pglist_data),
594 SMP_CACHE_BYTES, end_paddr);
595 NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid));
596 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
597
598 dbg("node %d\n", nid);
599 dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
600
601 NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
602 NODE_DATA(nid)->node_start_pfn =
603 init_node_data[nid].node_start_pfn;
604 NODE_DATA(nid)->node_spanned_pages =
605 end_paddr - start_paddr;
606
607 if (NODE_DATA(nid)->node_spanned_pages == 0)
608 continue;
609
610 dbg("start_paddr = %lx\n", start_paddr);
611 dbg("end_paddr = %lx\n", end_paddr);
612
613 bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT);
614
615 bootmem_paddr = careful_allocation(nid,
616 bootmap_pages << PAGE_SHIFT,
617 PAGE_SIZE, end_paddr);
618 memset(abs_to_virt(bootmem_paddr), 0,
619 bootmap_pages << PAGE_SHIFT);
620 dbg("bootmap_paddr = %lx\n", bootmem_paddr);
621
622 init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
623 start_paddr >> PAGE_SHIFT,
624 end_paddr >> PAGE_SHIFT);
625
626 /*
627 * We need to do another scan of all memory sections to
628 * associate memory with the correct node.
629 */
630 addr_cells = get_mem_addr_cells();
631 size_cells = get_mem_size_cells();
632 memory = NULL;
633 while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
634 unsigned long mem_start, mem_size;
635 int numa_domain, ranges;
636 unsigned int *memcell_buf;
637 unsigned int len;
638
639 memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
640 if (!memcell_buf || len <= 0)
641 continue;
642
643 ranges = memory->n_addrs; /* ranges in cell */
644new_range:
645 mem_start = read_n_cells(addr_cells, &memcell_buf);
646 mem_size = read_n_cells(size_cells, &memcell_buf);
647 if (numa_enabled) {
648 numa_domain = of_node_numa_domain(memory);
649 if (numa_domain >= MAX_NUMNODES)
650 numa_domain = 0;
651 } else
652 numa_domain = 0;
653
654 if (numa_domain != nid)
655 continue;
656
657 mem_size = numa_enforce_memory_limit(mem_start, mem_size);
658 if (mem_size) {
659 dbg("free_bootmem %lx %lx\n", mem_start, mem_size);
660 free_bootmem_node(NODE_DATA(nid), mem_start, mem_size);
661 }
662
663 if (--ranges) /* process all ranges in cell */
664 goto new_range;
665 }
666
667 /*
668 * Mark reserved regions on this node
669 */
670 for (i = 0; i < lmb.reserved.cnt; i++) {
671 unsigned long physbase = lmb.reserved.region[i].base;
672 unsigned long size = lmb.reserved.region[i].size;
673
674 if (pa_to_nid(physbase) != nid &&
675 pa_to_nid(physbase+size-1) != nid)
676 continue;
677
678 if (physbase < end_paddr &&
679 (physbase+size) > start_paddr) {
680 /* overlaps */
681 if (physbase < start_paddr) {
682 size -= start_paddr - physbase;
683 physbase = start_paddr;
684 }
685
686 if (size > end_paddr - physbase)
687 size = end_paddr - physbase;
688
689 dbg("reserve_bootmem %lx %lx\n", physbase,
690 size);
691 reserve_bootmem_node(NODE_DATA(nid), physbase,
692 size);
693 }
694 }
695 /*
696 * This loop may look famaliar, but we have to do it again
697 * after marking our reserved memory to mark memory present
698 * for sparsemem.
699 */
700 addr_cells = get_mem_addr_cells();
701 size_cells = get_mem_size_cells();
702 memory = NULL;
703 while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
704 unsigned long mem_start, mem_size;
705 int numa_domain, ranges;
706 unsigned int *memcell_buf;
707 unsigned int len;
708
709 memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
710 if (!memcell_buf || len <= 0)
711 continue;
712
713 ranges = memory->n_addrs; /* ranges in cell */
714new_range2:
715 mem_start = read_n_cells(addr_cells, &memcell_buf);
716 mem_size = read_n_cells(size_cells, &memcell_buf);
717 if (numa_enabled) {
718 numa_domain = of_node_numa_domain(memory);
719 if (numa_domain >= MAX_NUMNODES)
720 numa_domain = 0;
721 } else
722 numa_domain = 0;
723
724 if (numa_domain != nid)
725 continue;
726
727 mem_size = numa_enforce_memory_limit(mem_start, mem_size);
728 memory_present(numa_domain, mem_start >> PAGE_SHIFT,
729 (mem_start + mem_size) >> PAGE_SHIFT);
730
731 if (--ranges) /* process all ranges in cell */
732 goto new_range2;
733 }
734
735 }
736}
737
738void __init paging_init(void)
739{
740 unsigned long zones_size[MAX_NR_ZONES];
741 unsigned long zholes_size[MAX_NR_ZONES];
742 int nid;
743
744 memset(zones_size, 0, sizeof(zones_size));
745 memset(zholes_size, 0, sizeof(zholes_size));
746
747 for_each_online_node(nid) {
748 unsigned long start_pfn;
749 unsigned long end_pfn;
750
751 start_pfn = init_node_data[nid].node_start_pfn;
752 end_pfn = init_node_data[nid].node_end_pfn;
753
754 zones_size[ZONE_DMA] = end_pfn - start_pfn;
755 zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
756 init_node_data[nid].node_present_pages;
757
758 dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
759 zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);
760
761 free_area_init_node(nid, NODE_DATA(nid), zones_size,
762 start_pfn, zholes_size);
763 }
764}
765
766static int __init early_numa(char *p)
767{
768 if (!p)
769 return 0;
770
771 if (strstr(p, "off"))
772 numa_enabled = 0;
773
774 if (strstr(p, "debug"))
775 numa_debug = 1;
776
777 return 0;
778}
779early_param("numa", early_numa);
diff --git a/arch/ppc64/mm/slb.c b/arch/ppc64/mm/slb.c
deleted file mode 100644
index 0473953f6a3..00000000000
--- a/arch/ppc64/mm/slb.c
+++ /dev/null
@@ -1,158 +0,0 @@
1/*
2 * PowerPC64 SLB support.
3 *
4 * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
5 * Based on earlier code written by:
6 * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
7 * Copyright (c) 2001 Dave Engebretsen
8 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
9 *
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version.
15 */
16
17#include <linux/config.h>
18#include <asm/pgtable.h>
19#include <asm/mmu.h>
20#include <asm/mmu_context.h>
21#include <asm/paca.h>
22#include <asm/cputable.h>
23
24extern void slb_allocate(unsigned long ea);
25
26static inline unsigned long mk_esid_data(unsigned long ea, unsigned long slot)
27{
28 return (ea & ESID_MASK) | SLB_ESID_V | slot;
29}
30
31static inline unsigned long mk_vsid_data(unsigned long ea, unsigned long flags)
32{
33 return (get_kernel_vsid(ea) << SLB_VSID_SHIFT) | flags;
34}
35
36static inline void create_slbe(unsigned long ea, unsigned long flags,
37 unsigned long entry)
38{
39 asm volatile("slbmte %0,%1" :
40 : "r" (mk_vsid_data(ea, flags)),
41 "r" (mk_esid_data(ea, entry))
42 : "memory" );
43}
44
45static void slb_flush_and_rebolt(void)
46{
47 /* If you change this make sure you change SLB_NUM_BOLTED
48 * appropriately too. */
49 unsigned long ksp_flags = SLB_VSID_KERNEL;
50 unsigned long ksp_esid_data;
51
52 WARN_ON(!irqs_disabled());
53
54 if (cpu_has_feature(CPU_FTR_16M_PAGE))
55 ksp_flags |= SLB_VSID_L;
56
57 ksp_esid_data = mk_esid_data(get_paca()->kstack, 2);
58 if ((ksp_esid_data & ESID_MASK) == KERNELBASE)
59 ksp_esid_data &= ~SLB_ESID_V;
60
61 /* We need to do this all in asm, so we're sure we don't touch
62 * the stack between the slbia and rebolting it. */
63 asm volatile("isync\n"
64 "slbia\n"
65 /* Slot 1 - first VMALLOC segment */
66 "slbmte %0,%1\n"
67 /* Slot 2 - kernel stack */
68 "slbmte %2,%3\n"
69 "isync"
70 :: "r"(mk_vsid_data(VMALLOCBASE, SLB_VSID_KERNEL)),
71 "r"(mk_esid_data(VMALLOCBASE, 1)),
72 "r"(mk_vsid_data(ksp_esid_data, ksp_flags)),
73 "r"(ksp_esid_data)
74 : "memory");
75}
76
77/* Flush all user entries from the segment table of the current processor. */
78void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
79{
80 unsigned long offset = get_paca()->slb_cache_ptr;
81 unsigned long esid_data = 0;
82 unsigned long pc = KSTK_EIP(tsk);
83 unsigned long stack = KSTK_ESP(tsk);
84 unsigned long unmapped_base;
85
86 if (offset <= SLB_CACHE_ENTRIES) {
87 int i;
88 asm volatile("isync" : : : "memory");
89 for (i = 0; i < offset; i++) {
90 esid_data = ((unsigned long)get_paca()->slb_cache[i]
91 << SID_SHIFT) | SLBIE_C;
92 asm volatile("slbie %0" : : "r" (esid_data));
93 }
94 asm volatile("isync" : : : "memory");
95 } else {
96 slb_flush_and_rebolt();
97 }
98
99 /* Workaround POWER5 < DD2.1 issue */
100 if (offset == 1 || offset > SLB_CACHE_ENTRIES)
101 asm volatile("slbie %0" : : "r" (esid_data));
102
103 get_paca()->slb_cache_ptr = 0;
104 get_paca()->context = mm->context;
105
106 /*
107 * preload some userspace segments into the SLB.
108 */
109 if (test_tsk_thread_flag(tsk, TIF_32BIT))
110 unmapped_base = TASK_UNMAPPED_BASE_USER32;
111 else
112 unmapped_base = TASK_UNMAPPED_BASE_USER64;
113
114 if (pc >= KERNELBASE)
115 return;
116 slb_allocate(pc);
117
118 if (GET_ESID(pc) == GET_ESID(stack))
119 return;
120
121 if (stack >= KERNELBASE)
122 return;
123 slb_allocate(stack);
124
125 if ((GET_ESID(pc) == GET_ESID(unmapped_base))
126 || (GET_ESID(stack) == GET_ESID(unmapped_base)))
127 return;
128
129 if (unmapped_base >= KERNELBASE)
130 return;
131 slb_allocate(unmapped_base);
132}
133
134void slb_initialize(void)
135{
136 /* On iSeries the bolted entries have already been set up by
137 * the hypervisor from the lparMap data in head.S */
138#ifndef CONFIG_PPC_ISERIES
139 unsigned long flags = SLB_VSID_KERNEL;
140
141 /* Invalidate the entire SLB (even slot 0) & all the ERATS */
142 if (cpu_has_feature(CPU_FTR_16M_PAGE))
143 flags |= SLB_VSID_L;
144
145 asm volatile("isync":::"memory");
146 asm volatile("slbmte %0,%0"::"r" (0) : "memory");
147 asm volatile("isync; slbia; isync":::"memory");
148 create_slbe(KERNELBASE, flags, 0);
149 create_slbe(VMALLOCBASE, SLB_VSID_KERNEL, 1);
150 /* We don't bolt the stack for the time being - we're in boot,
151 * so the stack is in the bolted segment. By the time it goes
152 * elsewhere, we'll call _switch() which will bolt in the new
153 * one. */
154 asm volatile("isync":::"memory");
155#endif
156
157 get_paca()->stab_rr = SLB_NUM_BOLTED;
158}
diff --git a/arch/ppc64/mm/slb_low.S b/arch/ppc64/mm/slb_low.S
deleted file mode 100644
index a3a03da503b..00000000000
--- a/arch/ppc64/mm/slb_low.S
+++ /dev/null
@@ -1,151 +0,0 @@
1/*
2 * arch/ppc64/mm/slb_low.S
3 *
4 * Low-level SLB routines
5 *
6 * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
7 *
8 * Based on earlier C version:
9 * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
10 * Copyright (c) 2001 Dave Engebretsen
11 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
17 */
18
19#include <linux/config.h>
20#include <asm/processor.h>
21#include <asm/page.h>
22#include <asm/mmu.h>
23#include <asm/ppc_asm.h>
24#include <asm/asm-offsets.h>
25#include <asm/cputable.h>
26
27/* void slb_allocate(unsigned long ea);
28 *
29 * Create an SLB entry for the given EA (user or kernel).
30 * r3 = faulting address, r13 = PACA
31 * r9, r10, r11 are clobbered by this function
32 * No other registers are examined or changed.
33 */
34_GLOBAL(slb_allocate)
35 /*
36 * First find a slot, round robin. Previously we tried to find
37 * a free slot first but that took too long. Unfortunately we
38 * don't have any LRU information to help us choose a slot.
39 */
40#ifdef CONFIG_PPC_ISERIES
41 /*
42 * On iSeries, the "bolted" stack segment can be cast out on
43 * shared processor switch so we need to check for a miss on
44 * it and restore it to the right slot.
45 */
46 ld r9,PACAKSAVE(r13) /* r9 = PACAKSAVE field of the PACA */
47 clrrdi r9,r9,28 /* clear the low 28 bits: segment part only */
48 clrrdi r11,r3,28 /* same for the faulting address */
49 li r10,SLB_NUM_BOLTED-1 /* Stack goes in last bolted slot */
50 cmpld r9,r11
51 beq 3f /* same segment: use that slot, skip round robin */
52#endif /* CONFIG_PPC_ISERIES */
53
54 ld r10,PACASTABRR(r13) /* r10 = round-robin index kept in the PACA */
55 addi r10,r10,1
56 /* use a cpu feature mask if we ever change our slb size */
57 cmpldi r10,SLB_NUM_ENTRIES
58
59 blt+ 4f /* below SLB_NUM_ENTRIES: keep the incremented index */
60 li r10,SLB_NUM_BOLTED /* otherwise wrap around to SLB_NUM_BOLTED */
61
624:
63 std r10,PACASTABRR(r13) /* remember the slot chosen this time */
643:
65 /* r3 = faulting address, r10 = entry */
66
67 srdi r9,r3,60 /* get region */
68 srdi r3,r3,28 /* get esid */
69 cmpldi cr7,r9,0xc /* cmp KERNELBASE for later use */
70
71 rldimi r10,r3,28,0 /* r10= ESID<<28 | entry */
72 oris r10,r10,SLB_ESID_V@h /* r10 |= SLB_ESID_V */
73
74 /* r3 = esid, r10 = esid_data, cr7 = <>KERNELBASE */
75
76 blt cr7,0f /* user or kernel? */
77
78 /* kernel address: proto-VSID = ESID */
79 /* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but
80 * this code will generate the protoVSID 0xfffffffff for the
81 * top segment. That's ok, the scramble below will translate
82 * it to VSID 0, which is reserved as a bad VSID - one which
83 * will never have any pages in it. */
84 li r11,SLB_VSID_KERNEL
85BEGIN_FTR_SECTION
86 bne cr7,9f /* region != 0xc: keep plain SLB_VSID_KERNEL */
87 li r11,(SLB_VSID_KERNEL|SLB_VSID_L) /* region 0xc: add SLB_VSID_L */
88END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
89 b 9f
90
910: /* user address: proto-VSID = context<<15 | ESID */
92 srdi. r9,r3,USER_ESID_BITS
93 bne- 8f /* invalid ea bits set */
94
95#ifdef CONFIG_HUGETLB_PAGE
96BEGIN_FTR_SECTION
97 lhz r9,PACAHIGHHTLBAREAS(r13) /* r9 = 16-bit PACAHIGHHTLBAREAS field */
98 srdi r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT) /* r11 = esid >> (HTLB_AREA_SHIFT-SID_SHIFT) */
99 srd r9,r9,r11 /* shift the high-areas field right by r11 */
100 lhz r11,PACALOWHTLBAREAS(r13) /* r11 = 16-bit PACALOWHTLBAREAS field */
101 srd r11,r11,r3 /* shift the low-areas field right by the esid */
102 or r9,r9,r11 /* combined; consumed by the rldimi below */
103END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
104#endif /* CONFIG_HUGETLB_PAGE */
105
106 li r11,SLB_VSID_USER
107
108#ifdef CONFIG_HUGETLB_PAGE
109BEGIN_FTR_SECTION
110 rldimi r11,r9,8,55 /* shift masked bit into SLB_VSID_L */
111END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
112#endif /* CONFIG_HUGETLB_PAGE */
113
114 ld r9,PACACONTEXTID(r13) /* r9 = PACACONTEXTID field of the PACA */
115 rldimi r3,r9,USER_ESID_BITS,0 /* r3 = context<<USER_ESID_BITS | ESID */
116
1179: /* r3 = protovsid, r11 = flags, r10 = esid_data, cr7 = <>KERNELBASE */
118 ASM_VSID_SCRAMBLE(r3,r9)
119
120 rldimi r11,r3,SLB_VSID_SHIFT,16 /* combine VSID and flags */
121
122 /*
123 * No need for an isync before or after this slbmte. The exception
124 * we enter with and the rfid we exit with are context synchronizing.
125 */
126 slbmte r11,r10
127
128 bgelr cr7 /* we're done for kernel addresses */
129
130 /* Update the slb cache */
131 lhz r3,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */
132 cmpldi r3,SLB_CACHE_ENTRIES
133 bge 1f
134
135 /* still room in the slb cache */
136 sldi r11,r3,1 /* r11 = offset * sizeof(u16) */
137 rldicl r10,r10,36,28 /* get low 16 bits of the ESID */
138 add r11,r11,r13 /* r11 = (u16 *)paca + offset */
139 sth r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */
140 addi r3,r3,1 /* offset++ */
141 b 2f
1421: /* offset >= SLB_CACHE_ENTRIES */
143 li r3,SLB_CACHE_ENTRIES+1
1442:
145 sth r3,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */
146 blr
147
1488: /* invalid EA */
149 li r3,0 /* BAD_VSID */
150 li r11,SLB_VSID_USER /* flags don't much matter */
151 b 9b
diff --git a/arch/ppc64/mm/stab.c b/arch/ppc64/mm/stab.c
deleted file mode 100644
index 1b83f002bf2..00000000000
--- a/arch/ppc64/mm/stab.c
+++ /dev/null
@@ -1,279 +0,0 @@
1/*
2 * PowerPC64 Segment Translation Support.
3 *
4 * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
5 * Copyright (c) 2001 Dave Engebretsen
6 *
7 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 */
14
15#include <linux/config.h>
16#include <asm/pgtable.h>
17#include <asm/mmu.h>
18#include <asm/mmu_context.h>
19#include <asm/paca.h>
20#include <asm/cputable.h>
21#include <asm/lmb.h>
22#include <asm/abs_addr.h>
23
/* One segment table entry: two unsigned long words, as written by make_ste(). */
24struct stab_entry {
25 unsigned long esid_data; /* esid << SID_SHIFT plus STE_ESID_* flag bits; 0 marks the entry invalid */
26 unsigned long vsid_data; /* vsid << STE_VSID_SHIFT */
27};
28
29/* Both the segment table and SLB code use the following cache */
/*
 * stab_cache[] records the table indices returned by make_ste() for user
 * segments created on this CPU.  stab_cache_ptr is the number of recorded
 * entries; __ste_allocate() sets it to NR_STAB_CACHE_ENTRIES+1 once the
 * cache is full, and switch_stab() treats any value above
 * NR_STAB_CACHE_ENTRIES as "scan the whole table instead".
 */
30#define NR_STAB_CACHE_ENTRIES 8
31DEFINE_PER_CPU(long, stab_cache_ptr);
32DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]);
33
34/*
35 * Create a segment table entry for the given esid/vsid pair.
36 */
/*
 * make_ste - install a segment table entry mapping esid to vsid.
 * @stab: address of the segment table; group addresses are formed by
 *        OR-ing a group offset into it
 * @esid: effective segment ID; its low 5 bits select the primary group,
 *        the low 5 bits of its complement select the secondary group
 * @vsid: virtual segment ID to store in the entry
 *
 * The first invalid slot of the primary group, then of the secondary
 * group, is used (8 slots per group).  When both groups are full, one of
 * the 16 candidate slots is cast out round-robin using the paca's
 * stab_rr cursor.
 *
 * Returns the index of the entry within the table:
 * (group number << 3) | slot within the group.
 */
37static int make_ste(unsigned long stab, unsigned long esid, unsigned long vsid)
38{
39 unsigned long esid_data, vsid_data;
40 unsigned long entry, group, old_esid, castout_entry, i;
41 unsigned int global_entry;
42 struct stab_entry *ste, *castout_ste;
43 unsigned long kernel_segment = (esid << SID_SHIFT) >= KERNELBASE;
44
45 vsid_data = vsid << STE_VSID_SHIFT;
46 esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V;
 /* Only non-kernel segments additionally get STE_ESID_KS. */
47 if (! kernel_segment)
48 esid_data |= STE_ESID_KS;
49
50 /* Search the primary group first. */
51 global_entry = (esid & 0x1f) << 3;
52 ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
53
54 /* Find an empty entry, if one exists. */
55 for (group = 0; group < 2; group++) {
56 for (entry = 0; entry < 8; entry++, ste++) {
57 if (!(ste->esid_data & STE_ESID_V)) {
 /*
  * Store the VSID word first; the ESID word, which carries
  * STE_ESID_V, is stored last with an eieio between the two.
  */
58 ste->vsid_data = vsid_data;
59 asm volatile("eieio":::"memory");
60 ste->esid_data = esid_data;
61 return (global_entry | entry);
62 }
63 }
64 /* Now search the secondary group. */
 /*
  * This also runs after the secondary pass; both values are
  * recomputed in the castout loop below before they are used.
  */
65 global_entry = ((~esid) & 0x1f) << 3;
66 ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
67 }
68
69 /*
70 * Could not find empty entry, pick one with a round robin selection.
71 * Search all entries in the two groups.
72 */
73 castout_entry = get_paca()->stab_rr;
 /* Candidates 0-7 are slots of the primary group, 8-15 of the secondary. */
74 for (i = 0; i < 16; i++) {
75 if (castout_entry < 8) {
76 global_entry = (esid & 0x1f) << 3;
77 ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
78 castout_ste = ste + castout_entry;
79 } else {
80 global_entry = ((~esid) & 0x1f) << 3;
81 ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
82 castout_ste = ste + (castout_entry - 8);
83 }
84
85 /* Don't cast out the first kernel segment */
86 if ((castout_ste->esid_data & ESID_MASK) != KERNELBASE)
87 break;
88
89 castout_entry = (castout_entry + 1) & 0xf;
90 }
91
 /* The next castout starts at the candidate after the victim chosen here. */
92 get_paca()->stab_rr = (castout_entry + 1) & 0xf;
93
94 /* Modify the old entry to the new value. */
95
96 /* Force previous translations to complete. DRENG */
97 asm volatile("isync" : : : "memory");
98
 /* Keep the victim's ESID so the slbie below can name the old segment. */
99 old_esid = castout_ste->esid_data >> SID_SHIFT;
100 castout_ste->esid_data = 0; /* Invalidate old entry */
101
102 asm volatile("sync" : : : "memory"); /* Order update */
103
104 castout_ste->vsid_data = vsid_data;
105 asm volatile("eieio" : : : "memory"); /* Order update */
106 castout_ste->esid_data = esid_data;
107
108 asm volatile("slbie %0" : : "r" (old_esid << SID_SHIFT));
109 /* Ensure completion of slbie */
110 asm volatile("sync" : : : "memory");
111
 /* castout_entry & 0x7 is the slot within the chosen group. */
112 return (global_entry | (castout_entry & 0x7));
113}
114
115/*
116 * Allocate a segment table entry for the given ea and mm
117 */
118static int __ste_allocate(unsigned long ea, struct mm_struct *mm)
119{
120 unsigned long vsid;
121 unsigned char stab_entry;
122 unsigned long offset;
123
124 /* Kernel or user address? */
125 if (ea >= KERNELBASE) {
126 vsid = get_kernel_vsid(ea);
127 } else {
128 if ((ea >= TASK_SIZE_USER64) || (! mm))
129 return 1;
130
131 vsid = get_vsid(mm->context.id, ea);
132 }
133
134 stab_entry = make_ste(get_paca()->stab_addr, GET_ESID(ea), vsid);
135
136 if (ea < KERNELBASE) {
137 offset = __get_cpu_var(stab_cache_ptr);
138 if (offset < NR_STAB_CACHE_ENTRIES)
139 __get_cpu_var(stab_cache[offset++]) = stab_entry;
140 else
141 offset = NR_STAB_CACHE_ENTRIES+1;
142 __get_cpu_var(stab_cache_ptr) = offset;
143
144 /* Order update */
145 asm volatile("sync":::"memory");
146 }
147
148 return 0;
149}
150
151int ste_allocate(unsigned long ea)
152{
153 return __ste_allocate(ea, current->mm);
154}
155
156/*
157 * Do the segment table work for a context switch: flush all user
158 * entries from the table, then preload some probably useful entries
159 * for the new task
160 */
/*
 * switch_stab - drop user segment entries and preload a few for a new task.
 * @tsk: task whose KSTK_EIP()/KSTK_ESP() values and TIF_32BIT flag choose
 *       the segments to preload
 * @mm:  address space passed to __ste_allocate() for the preloaded entries
 *
 * User entries are invalidated either one by one from this CPU's
 * stab_cache, or, if the cache overflowed, by scanning every entry of the
 * table except the first.  Up to three entries are then created: for the
 * pc, the stack pointer and the task's unmapped base, skipping any that
 * share a segment with an earlier one.
 */
161void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
162{
163 struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
164 struct stab_entry *ste;
165 unsigned long offset = __get_cpu_var(stab_cache_ptr);
166 unsigned long pc = KSTK_EIP(tsk);
167 unsigned long stack = KSTK_ESP(tsk);
168 unsigned long unmapped_base;
169
170 /* Force previous translations to complete. DRENG */
171 asm volatile("isync" : : : "memory");
172
 /* The cache is complete: it lists every user entry made since the last switch. */
173 if (offset <= NR_STAB_CACHE_ENTRIES) {
174 int i;
175
176 for (i = 0; i < offset; i++) {
177 ste = stab + __get_cpu_var(stab_cache[i]);
178 ste->esid_data = 0; /* invalidate entry */
179 }
180 } else {
181 unsigned long entry;
182
183 /* Invalidate all entries. */
184 ste = stab;
185
186 /* Never flush the first entry. */
187 ste += 1;
188 for (entry = 1;
189 entry < (PAGE_SIZE / sizeof(struct stab_entry));
190 entry++, ste++) {
191 unsigned long ea;
192 ea = ste->esid_data & ESID_MASK;
 /* Only entries whose ESID lies below KERNELBASE are cleared. */
193 if (ea < KERNELBASE) {
194 ste->esid_data = 0;
195 }
196 }
197 }
198
199 asm volatile("sync; slbia; sync":::"memory");
200
 /* Start a fresh cache for the incoming task. */
201 __get_cpu_var(stab_cache_ptr) = 0;
202
203 /* Now preload some entries for the new task */
204 if (test_tsk_thread_flag(tsk, TIF_32BIT))
205 unmapped_base = TASK_UNMAPPED_BASE_USER32;
206 else
207 unmapped_base = TASK_UNMAPPED_BASE_USER64;
208
209 __ste_allocate(pc, mm);
210
 /*
  * The early returns below skip the final sync of this function;
  * __ste_allocate() issues its own sync after adding a user entry.
  */
211 if (GET_ESID(pc) == GET_ESID(stack))
212 return;
213
214 __ste_allocate(stack, mm);
215
216 if ((GET_ESID(pc) == GET_ESID(unmapped_base))
217 || (GET_ESID(stack) == GET_ESID(unmapped_base)))
218 return;
219
220 __ste_allocate(unmapped_base, mm);
221
222 /* Order update */
223 asm volatile("sync" : : : "memory");
224}
225
/* Called by stab_initialize() when the CPU has CPU_FTR_SLB; not defined in this file. */
226extern void slb_initialize(void);
227
228/*
229 * Allocate segment tables for secondary CPUs. These must all go in
230 * the first (bolted) segment, so that do_stab_bolted won't get a
231 * recursive segment miss on the segment table itself.
232 */
233void stabs_alloc(void)
234{
235 int cpu;
236
237 if (cpu_has_feature(CPU_FTR_SLB))
238 return;
239
240 for_each_cpu(cpu) {
241 unsigned long newstab;
242
243 if (cpu == 0)
244 continue; /* stab for CPU 0 is statically allocated */
245
246 newstab = lmb_alloc_base(PAGE_SIZE, PAGE_SIZE, 1<<SID_SHIFT);
247 if (! newstab)
248 panic("Unable to allocate segment table for CPU %d.\n",
249 cpu);
250
251 newstab += KERNELBASE;
252
253 memset((void *)newstab, 0, PAGE_SIZE);
254
255 paca[cpu].stab_addr = newstab;
256 paca[cpu].stab_real = virt_to_abs(newstab);
257 printk(KERN_DEBUG "Segment table for CPU %d at 0x%lx virtual, 0x%lx absolute\n", cpu, paca[cpu].stab_addr, paca[cpu].stab_real);
258 }
259}
260
261/*
262 * Build an entry for the base kernel segment and put it into
263 * the segment table or SLB. All other segment table or SLB
264 * entries are faulted in.
265 */
266void stab_initialize(unsigned long stab)
267{
268 unsigned long vsid = get_kernel_vsid(KERNELBASE);
269
270 if (cpu_has_feature(CPU_FTR_SLB)) {
271 slb_initialize();
272 } else {
273 asm volatile("isync; slbia; isync":::"memory");
274 make_ste(stab, GET_ESID(KERNELBASE), vsid);
275
276 /* Order update */
277 asm volatile("sync":::"memory");
278 }
279}
diff --git a/arch/ppc64/mm/tlb.c b/arch/ppc64/mm/tlb.c
deleted file mode 100644
index 09ab81a10f4..00000000000
--- a/arch/ppc64/mm/tlb.c
+++ /dev/null
@@ -1,196 +0,0 @@
1/*
2 * This file contains the routines for flushing entries from the
3 * TLB and MMU hash table.
4 *
5 * Derived from arch/ppc64/mm/init.c:
6 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
7 *
8 * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
9 * and Cort Dougan (PReP) (cort@cs.nmt.edu)
10 * Copyright (C) 1996 Paul Mackerras
11 * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
12 *
13 * Derived from "arch/i386/mm/init.c"
14 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
15 *
16 * Dave Engebretsen <engebret@us.ibm.com>
17 * Rework for PPC64 port.
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 */
24#include <linux/config.h>
25#include <linux/kernel.h>
26#include <linux/mm.h>
27#include <linux/init.h>
28#include <linux/percpu.h>
29#include <linux/hardirq.h>
30#include <asm/pgalloc.h>
31#include <asm/tlbflush.h>
32#include <asm/tlb.h>
33#include <linux/highmem.h>
34
35DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
36
37/* This is declared as we are using the more or less generic
38 * include/asm-ppc64/tlb.h file -- tgall
39 */
40DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
41DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
42unsigned long pte_freelist_forced_free;
43
44struct pte_freelist_batch
45{
46 struct rcu_head rcu;
47 unsigned int index;
48 pgtable_free_t tables[0];
49};
50
51DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
52unsigned long pte_freelist_forced_free;
53
54#define PTE_FREELIST_SIZE \
55 ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
56 / sizeof(pgtable_free_t))
57
58#ifdef CONFIG_SMP
/*
 * Empty handler that pgtable_free_now() passes to smp_call_function();
 * arg is not used.
 */
59static void pte_free_smp_sync(void *arg)
60{
61 /* Do nothing, just ensure we sync with all CPUs */
62}
63#endif
64
65/* This is only called when we are critically out of memory
66 * (and fail to get a page in pte_free_tlb).
67 */
68static void pgtable_free_now(pgtable_free_t pgf)
69{
70 pte_freelist_forced_free++;
71
72 smp_call_function(pte_free_smp_sync, NULL, 0, 1);
73
74 pgtable_free(pgf);
75}
76
77static void pte_free_rcu_callback(struct rcu_head *head)
78{
79 struct pte_freelist_batch *batch =
80 container_of(head, struct pte_freelist_batch, rcu);
81 unsigned int i;
82
83 for (i = 0; i < batch->index; i++)
84 pgtable_free(batch->tables[i]);
85
86 free_page((unsigned long)batch);
87}
88
89static void pte_free_submit(struct pte_freelist_batch *batch)
90{
91 INIT_RCU_HEAD(&batch->rcu);
92 call_rcu(&batch->rcu, pte_free_rcu_callback);
93}
94
95void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
96{
97 /* This is safe as we are holding page_table_lock */
98 cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
99 struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
100
101 if (atomic_read(&tlb->mm->mm_users) < 2 ||
102 cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
103 pgtable_free(pgf);
104 return;
105 }
106
107 if (*batchp == NULL) {
108 *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
109 if (*batchp == NULL) {
110 pgtable_free_now(pgf);
111 return;
112 }
113 (*batchp)->index = 0;
114 }
115 (*batchp)->tables[(*batchp)->index++] = pgf;
116 if ((*batchp)->index == PTE_FREELIST_SIZE) {
117 pte_free_submit(*batchp);
118 *batchp = NULL;
119 }
120}
121
122/*
123 * Update the MMU hash table to correspond with a change to
124 * a Linux PTE. If wrprot is true, it is permissible to
125 * change the existing HPTE to read-only rather than removing it
126 * (if we remove it we should clear the _PTE_HPTEFLAGS bits).
127 */
128void hpte_update(struct mm_struct *mm, unsigned long addr,
129 unsigned long pte, int wrprot)
130{
131 struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
132 unsigned long vsid;
133 int i;
134
135 i = batch->index;
136
137 /*
138 * This can happen when we are in the middle of a TLB batch and
139 * we encounter memory pressure (eg copy_page_range when it tries
140 * to allocate a new pte). If we have to reclaim memory and end
141 * up scanning and resetting referenced bits then our batch context
142 * will change mid stream.
143 */
144 if (i != 0 && (mm != batch->mm || batch->large != pte_huge(pte))) {
145 flush_tlb_pending();
146 i = 0;
147 }
148 if (i == 0) {
149 batch->mm = mm;
150 batch->large = pte_huge(pte);
151 }
152 if (addr < KERNELBASE) {
153 vsid = get_vsid(mm->context.id, addr);
154 WARN_ON(vsid == 0);
155 } else
156 vsid = get_kernel_vsid(addr);
157 batch->vaddr[i] = (vsid << 28 ) | (addr & 0x0fffffff);
158 batch->pte[i] = __pte(pte);
159 batch->index = ++i;
160 if (i >= PPC64_TLB_BATCH_NR)
161 flush_tlb_pending();
162}
163
164void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
165{
166 int i;
167 int cpu;
168 cpumask_t tmp;
169 int local = 0;
170
171 BUG_ON(in_interrupt());
172
173 cpu = get_cpu();
174 i = batch->index;
175 tmp = cpumask_of_cpu(cpu);
176 if (cpus_equal(batch->mm->cpu_vm_mask, tmp))
177 local = 1;
178
179 if (i == 1)
180 flush_hash_page(batch->vaddr[0], batch->pte[0], local);
181 else
182 flush_hash_range(i, local);
183 batch->index = 0;
184 put_cpu();
185}
186
187void pte_free_finish(void)
188{
189 /* This is safe as we are holding page_table_lock */
190 struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
191
192 if (*batchp == NULL)
193 return;
194 pte_free_submit(*batchp);
195 *batchp = NULL;
196}