author    Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
committer Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
commit    1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree      0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/ppc64/mm
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'arch/ppc64/mm')
-rw-r--r--  arch/ppc64/mm/Makefile        11
-rw-r--r--  arch/ppc64/mm/fault.c        312
-rw-r--r--  arch/ppc64/mm/hash_low.S     287
-rw-r--r--  arch/ppc64/mm/hash_native.c  423
-rw-r--r--  arch/ppc64/mm/hash_utils.c   439
-rw-r--r--  arch/ppc64/mm/hugetlbpage.c  904
-rw-r--r--  arch/ppc64/mm/imalloc.c      312
-rw-r--r--  arch/ppc64/mm/init.c         927
-rw-r--r--  arch/ppc64/mm/mmap.c          86
-rw-r--r--  arch/ppc64/mm/numa.c         734
-rw-r--r--  arch/ppc64/mm/slb.c          159
-rw-r--r--  arch/ppc64/mm/slb_low.S      154
-rw-r--r--  arch/ppc64/mm/stab.c         239
-rw-r--r--  arch/ppc64/mm/tlb.c          180
14 files changed, 5167 insertions, 0 deletions
diff --git a/arch/ppc64/mm/Makefile b/arch/ppc64/mm/Makefile
new file mode 100644
index 000000000000..ac522d57b2a7
--- /dev/null
+++ b/arch/ppc64/mm/Makefile
@@ -0,0 +1,11 @@
1#
2# Makefile for the linux ppc-specific parts of the memory manager.
3#
4
5EXTRA_CFLAGS += -mno-minimal-toc
6
7obj-y := fault.o init.o imalloc.o hash_utils.o hash_low.o tlb.o \
8 slb_low.o slb.o stab.o mmap.o
9obj-$(CONFIG_DISCONTIGMEM) += numa.o
10obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
11obj-$(CONFIG_PPC_MULTIPLATFORM) += hash_native.o
diff --git a/arch/ppc64/mm/fault.c b/arch/ppc64/mm/fault.c
new file mode 100644
index 000000000000..20b0f37e8bf8
--- /dev/null
+++ b/arch/ppc64/mm/fault.c
@@ -0,0 +1,312 @@
1/*
2 * arch/ppc/mm/fault.c
3 *
4 * PowerPC version
5 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
6 *
7 * Derived from "arch/i386/mm/fault.c"
8 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
9 *
10 * Modified by Cort Dougan and Paul Mackerras.
11 *
12 * Modified for PPC64 by Dave Engebretsen (engebret@ibm.com)
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 */
19
20#include <linux/config.h>
21#include <linux/signal.h>
22#include <linux/sched.h>
23#include <linux/kernel.h>
24#include <linux/errno.h>
25#include <linux/string.h>
26#include <linux/types.h>
27#include <linux/mman.h>
28#include <linux/mm.h>
29#include <linux/interrupt.h>
30#include <linux/smp_lock.h>
31#include <linux/module.h>
32
33#include <asm/page.h>
34#include <asm/pgtable.h>
35#include <asm/mmu.h>
36#include <asm/mmu_context.h>
37#include <asm/system.h>
38#include <asm/uaccess.h>
39#include <asm/kdebug.h>
40
41/*
42 * Check whether the instruction at regs->nip is a store using
43 * an update addressing form which will update r1.
44 */
45static int store_updates_sp(struct pt_regs *regs)
46{
47 unsigned int inst;
48
49 if (get_user(inst, (unsigned int __user *)regs->nip))
50 return 0;
51 /* check for 1 in the rA field */
52 if (((inst >> 16) & 0x1f) != 1)
53 return 0;
54 /* check major opcode */
55 switch (inst >> 26) {
56 case 37: /* stwu */
57 case 39: /* stbu */
58 case 45: /* sthu */
59 case 53: /* stfsu */
60 case 55: /* stfdu */
61 return 1;
62 case 62: /* std or stdu */
63 return (inst & 3) == 1;
64 case 31:
65 /* check minor opcode */
66 switch ((inst >> 1) & 0x3ff) {
67 case 181: /* stdux */
68 case 183: /* stwux */
69 case 247: /* stbux */
70 case 439: /* sthux */
71 case 695: /* stfsux */
72 case 759: /* stfdux */
73 return 1;
74 }
75 }
76 return 0;
77}
78
79/*
80 * The error_code parameter is
81 * - DSISR for a non-SLB data access fault,
82 * - SRR1 & 0x08000000 for a non-SLB instruction access fault
83 * - 0 for any SLB fault.
84 * The return value is 0 if the fault was handled, or the signal
85 * number if this is a kernel fault that can't be handled here.
86 */
87int do_page_fault(struct pt_regs *regs, unsigned long address,
88 unsigned long error_code)
89{
90 struct vm_area_struct * vma;
91 struct mm_struct *mm = current->mm;
92 siginfo_t info;
93 unsigned long code = SEGV_MAPERR;
94 unsigned long is_write = error_code & DSISR_ISSTORE;
95 unsigned long trap = TRAP(regs);
96 unsigned long is_exec = trap == 0x400;
97
98 BUG_ON((trap == 0x380) || (trap == 0x480));
99
100 if (notify_die(DIE_PAGE_FAULT, "page_fault", regs, error_code,
101 11, SIGSEGV) == NOTIFY_STOP)
102 return 0;
103
104 if (trap == 0x300) {
105 if (debugger_fault_handler(regs))
106 return 0;
107 }
108
109 /* On a kernel SLB miss we can only check for a valid exception entry */
110 if (!user_mode(regs) && (address >= TASK_SIZE))
111 return SIGSEGV;
112
113 if (error_code & DSISR_DABRMATCH) {
114 if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
115 11, SIGSEGV) == NOTIFY_STOP)
116 return 0;
117 if (debugger_dabr_match(regs))
118 return 0;
119 }
120
121 if (in_atomic() || mm == NULL) {
122 if (!user_mode(regs))
123 return SIGSEGV;
124 /* in_atomic() in user mode is really bad,
125 as is current->mm == NULL. */
126 printk(KERN_EMERG "Page fault in user mode with "
127 "in_atomic() = %d mm = %p\n", in_atomic(), mm);
128 printk(KERN_EMERG "NIP = %lx MSR = %lx\n",
129 regs->nip, regs->msr);
130 die("Weird page fault", regs, SIGSEGV);
131 }
132
133 /* When running in the kernel we expect faults to occur only to
134 * addresses in user space. All other faults represent errors in the
135 * kernel and should generate an OOPS. Unfortunately, in the case of an
136 * erroneous fault occurring in a code path which already holds mmap_sem
137 * we will deadlock attempting to validate the fault against the
138 * address space. Luckily the kernel only validly references user
139 * space from well defined areas of code, which are listed in the
140 * exceptions table.
141 *
142 * As the vast majority of faults will be valid we will only perform
143 * the source reference check when there is a possibility of a deadlock.
144 * Attempt to lock the address space, if we cannot we then validate the
145 * source. If this is invalid we can skip the address space check,
146 * thus avoiding the deadlock.
147 */
148 if (!down_read_trylock(&mm->mmap_sem)) {
149 if (!user_mode(regs) && !search_exception_tables(regs->nip))
150 goto bad_area_nosemaphore;
151
152 down_read(&mm->mmap_sem);
153 }
154
155 vma = find_vma(mm, address);
156 if (!vma)
157 goto bad_area;
158
159 if (vma->vm_start <= address) {
160 goto good_area;
161 }
162 if (!(vma->vm_flags & VM_GROWSDOWN))
163 goto bad_area;
164
165 /*
166 * N.B. The POWER/Open ABI allows programs to access up to
167 * 288 bytes below the stack pointer.
168 * The kernel signal delivery code writes up to about 1.5kB
169 * below the stack pointer (r1) before decrementing it.
170 * The exec code can write slightly over 640kB to the stack
171 * before setting the user r1. Thus we allow the stack to
172 * expand to 1MB without further checks.
173 */
174 if (address + 0x100000 < vma->vm_end) {
175 /* get user regs even if this fault is in kernel mode */
176 struct pt_regs *uregs = current->thread.regs;
177 if (uregs == NULL)
178 goto bad_area;
179
180 /*
181 * A user-mode access to an address a long way below
182 * the stack pointer is only valid if the instruction
183 * is one which would update the stack pointer to the
184 * address accessed if the instruction completed,
185 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
186 * (or the byte, halfword, float or double forms).
187 *
188 * If we don't check this then any write to the area
189 * between the last mapped region and the stack will
190 * expand the stack rather than segfaulting.
191 */
192 if (address + 2048 < uregs->gpr[1]
193 && (!user_mode(regs) || !store_updates_sp(regs)))
194 goto bad_area;
195 }
196
197 if (expand_stack(vma, address))
198 goto bad_area;
199
200good_area:
201 code = SEGV_ACCERR;
202
203 if (is_exec) {
204 /* protection fault */
205 if (error_code & DSISR_PROTFAULT)
206 goto bad_area;
207 if (!(vma->vm_flags & VM_EXEC))
208 goto bad_area;
209 /* a write */
210 } else if (is_write) {
211 if (!(vma->vm_flags & VM_WRITE))
212 goto bad_area;
213 /* a read */
214 } else {
215 if (!(vma->vm_flags & VM_READ))
216 goto bad_area;
217 }
218
219 survive:
220 /*
221 * If for any reason at all we couldn't handle the fault,
222 * make sure we exit gracefully rather than endlessly redo
223 * the fault.
224 */
225 switch (handle_mm_fault(mm, vma, address, is_write)) {
226
227 case VM_FAULT_MINOR:
228 current->min_flt++;
229 break;
230 case VM_FAULT_MAJOR:
231 current->maj_flt++;
232 break;
233 case VM_FAULT_SIGBUS:
234 goto do_sigbus;
235 case VM_FAULT_OOM:
236 goto out_of_memory;
237 default:
238 BUG();
239 }
240
241 up_read(&mm->mmap_sem);
242 return 0;
243
244bad_area:
245 up_read(&mm->mmap_sem);
246
247bad_area_nosemaphore:
248 /* User mode accesses cause a SIGSEGV */
249 if (user_mode(regs)) {
250 info.si_signo = SIGSEGV;
251 info.si_errno = 0;
252 info.si_code = code;
253 info.si_addr = (void __user *) address;
254 force_sig_info(SIGSEGV, &info, current);
255 return 0;
256 }
257
258 if (trap == 0x400 && (error_code & DSISR_PROTFAULT)
259 && printk_ratelimit())
260 printk(KERN_CRIT "kernel tried to execute NX-protected"
261 " page (%lx) - exploit attempt? (uid: %d)\n",
262 address, current->uid);
263
264 return SIGSEGV;
265
266/*
267 * We ran out of memory, or some other thing happened to us that made
268 * us unable to handle the page fault gracefully.
269 */
270out_of_memory:
271 up_read(&mm->mmap_sem);
272 if (current->pid == 1) {
273 yield();
274 down_read(&mm->mmap_sem);
275 goto survive;
276 }
277 printk("VM: killing process %s\n", current->comm);
278 if (user_mode(regs))
279 do_exit(SIGKILL);
280 return SIGKILL;
281
282do_sigbus:
283 up_read(&mm->mmap_sem);
284 if (user_mode(regs)) {
285 info.si_signo = SIGBUS;
286 info.si_errno = 0;
287 info.si_code = BUS_ADRERR;
288 info.si_addr = (void __user *)address;
289 force_sig_info(SIGBUS, &info, current);
290 return 0;
291 }
292 return SIGBUS;
293}
294
295/*
296 * bad_page_fault is called when we have a bad access from the kernel.
297 * It is called from do_page_fault above and from some of the procedures
298 * in traps.c.
299 */
300void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
301{
302 const struct exception_table_entry *entry;
303
304 /* Are we prepared to handle this fault? */
305 if ((entry = search_exception_tables(regs->nip)) != NULL) {
306 regs->nip = entry->fixup;
307 return;
308 }
309
310 /* kernel has accessed a bad area */
311 die("Kernel access of bad area", regs, sig);
312}
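
The heuristic above only lets a fault far below the stack pointer expand the stack when the faulting instruction is a store-with-update through r1, which store_updates_sp() detects by decoding the rA field and opcode of the word at regs->nip. As a minimal standalone sketch (illustrative only, not part of this commit; it covers just the D-form subset of the table), the same decode can be run on two ordinary PPC32 instruction words: the classic prologue store "stwu r1,-16(r1)" (0x9421fff0) passes, while a plain load through r1 does not.

/* Illustrative sketch, not part of the commit: re-run the
 * store_updates_sp() D-form decode on sample instruction encodings. */
#include <stdio.h>

static int updates_sp(unsigned int inst)
{
	/* rA field (bits 16-20 counting from the LSB) must be r1 */
	if (((inst >> 16) & 0x1f) != 1)
		return 0;
	switch (inst >> 26) {	/* major opcode */
	case 37:		/* stwu */
	case 39:		/* stbu */
	case 45:		/* sthu */
	case 53:		/* stfsu */
	case 55:		/* stfdu */
		return 1;
	}
	return 0;
}

int main(void)
{
	unsigned int stwu_r1 = 0x9421fff0;	/* stwu r1,-16(r1) */
	unsigned int lwz_r0  = 0x80010010;	/* lwz r0,16(r1) - a load */

	printf("stwu r1,-16(r1): %d\n", updates_sp(stwu_r1));	/* prints 1 */
	printf("lwz r0,16(r1):   %d\n", updates_sp(lwz_r0));	/* prints 0 */
	return 0;
}
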
diff --git a/arch/ppc64/mm/hash_low.S b/arch/ppc64/mm/hash_low.S
new file mode 100644
index 000000000000..8c0156a37001
--- /dev/null
+++ b/arch/ppc64/mm/hash_low.S
@@ -0,0 +1,287 @@
1/*
2 * ppc64 MMU hashtable management routines
3 *
4 * (c) Copyright IBM Corp. 2003
5 *
6 * Maintained by: Benjamin Herrenschmidt
7 * <benh@kernel.crashing.org>
8 *
9 * This file is covered by the GNU Public Licence v2 as
10 * described in the kernel's COPYING file.
11 */
12
13#include <asm/processor.h>
14#include <asm/pgtable.h>
15#include <asm/mmu.h>
16#include <asm/page.h>
17#include <asm/types.h>
18#include <asm/ppc_asm.h>
19#include <asm/offsets.h>
20#include <asm/cputable.h>
21
22 .text
23
24/*
25 * Stackframe:
26 *
27 * +-> Back chain (SP + 256)
28 * | General register save area (SP + 112)
29 * | Parameter save area (SP + 48)
30 * | TOC save area (SP + 40)
31 * | link editor doubleword (SP + 32)
32 * | compiler doubleword (SP + 24)
33 * | LR save area (SP + 16)
34 * | CR save area (SP + 8)
35 * SP ---> +-- Back chain (SP + 0)
36 */
37#define STACKFRAMESIZE 256
38
39/* Save parameters offsets */
40#define STK_PARM(i) (STACKFRAMESIZE + 48 + ((i)-3)*8)
41
42/* Save non-volatile offsets */
43#define STK_REG(i) (112 + ((i)-14)*8)
44
45/*
46 * _hash_page(unsigned long ea, unsigned long access, unsigned long vsid,
47 * pte_t *ptep, unsigned long trap, int local)
48 *
49 * Adds a page to the hash table. This is the non-LPAR version for now
50 */
51
52_GLOBAL(__hash_page)
53 mflr r0
54 std r0,16(r1)
55 stdu r1,-STACKFRAMESIZE(r1)
56 /* Save all params that we need after a function call */
57 std r6,STK_PARM(r6)(r1)
58 std r8,STK_PARM(r8)(r1)
59
60 /* Add _PAGE_PRESENT to access */
61 ori r4,r4,_PAGE_PRESENT
62
63 /* Save non-volatile registers.
64 * r31 will hold "old PTE"
65 * r30 is "new PTE"
66 * r29 is "va"
67 * r28 is a hash value
68 * r27 is hashtab mask (maybe dynamic patched instead ?)
69 */
70 std r27,STK_REG(r27)(r1)
71 std r28,STK_REG(r28)(r1)
72 std r29,STK_REG(r29)(r1)
73 std r30,STK_REG(r30)(r1)
74 std r31,STK_REG(r31)(r1)
75
76 /* Step 1:
77 *
78 * Check permissions, atomically mark the linux PTE busy
79 * and hashed.
80 */
811:
82 ldarx r31,0,r6
83 /* Check access rights (access & ~(pte_val(*ptep))) */
84 andc. r0,r4,r31
85 bne- htab_wrong_access
86 /* Check if PTE is busy */
87 andi. r0,r31,_PAGE_BUSY
88 bne- 1b
89 /* Prepare new PTE value (turn access RW into DIRTY, then
90 * add BUSY,HASHPTE and ACCESSED)
91 */
92 rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */
93 or r30,r30,r31
94 ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
95 /* Write the linux PTE atomically (setting busy) */
96 stdcx. r30,0,r6
97 bne- 1b
98 isync
99
100 /* Step 2:
101 *
102 * Insert/Update the HPTE in the hash table. At this point,
103 * r4 (access) is re-usable, we use it for the new HPTE flags
104 */
105
106 /* Calc va and put it in r29 */
107 rldicr r29,r5,28,63-28
108 rldicl r3,r3,0,36
109 or r29,r3,r29
110
111 /* Calculate hash value for primary slot and store it in r28 */
112 rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */
113 rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */
114 xor r28,r5,r0
115
116 /* Convert linux PTE bits into HW equivalents */
117 andi. r3,r30,0x1fe /* Get basic set of flags */
118 xori r3,r3,HW_NO_EXEC /* _PAGE_EXEC -> NOEXEC */
119 rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */
120 rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */
121 and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY -> r0 bit 30 */
122 andc r0,r30,r0 /* r0 = pte & ~r0 */
123 rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */
124
125 /* We eventually do the icache sync here (maybe inline that
126 * code rather than call a C function...)
127 */
128BEGIN_FTR_SECTION
129BEGIN_FTR_SECTION
130 mr r4,r30
131 mr r5,r7
132 bl .hash_page_do_lazy_icache
133END_FTR_SECTION_IFSET(CPU_FTR_NOEXECUTE)
134END_FTR_SECTION_IFCLR(CPU_FTR_COHERENT_ICACHE)
135
136 /* At this point, r3 contains new PP bits, save them in
137 * place of "access" in the param area (sic)
138 */
139 std r3,STK_PARM(r4)(r1)
140
141 /* Get htab_hash_mask */
142 ld r4,htab_hash_mask@got(2)
143 ld r27,0(r4) /* htab_hash_mask -> r27 */
144
145 /* Check if we may already be in the hashtable, in this case, we
146 * go to out-of-line code to try to modify the HPTE
147 */
148 andi. r0,r31,_PAGE_HASHPTE
149 bne htab_modify_pte
150
151htab_insert_pte:
152 /* Clear hpte bits in new pte (we also clear BUSY btw) and
153 * add _PAGE_HASHPTE
154 */
155 lis r0,_PAGE_HPTEFLAGS@h
156 ori r0,r0,_PAGE_HPTEFLAGS@l
157 andc r30,r30,r0
158 ori r30,r30,_PAGE_HASHPTE
159
160 /* page number in r5 */
161 rldicl r5,r31,64-PTE_SHIFT,PTE_SHIFT
162
163 /* Calculate primary group hash */
164 and r0,r28,r27
165 rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
166
167 /* Call ppc_md.hpte_insert */
168 ld r7,STK_PARM(r4)(r1) /* Retrieve new pp bits */
169 mr r4,r29 /* Retrieve va */
170 li r6,0 /* primary slot */
171 li r8,0 /* not bolted and not large */
172 li r9,0
173_GLOBAL(htab_call_hpte_insert1)
174 bl . /* Will be patched by htab_finish_init() */
175 cmpdi 0,r3,0
176 bge htab_pte_insert_ok /* Insertion successful */
177 cmpdi 0,r3,-2 /* Critical failure */
178 beq- htab_pte_insert_failure
179
180 /* Now try secondary slot */
181
182 /* page number in r5 */
183 rldicl r5,r31,64-PTE_SHIFT,PTE_SHIFT
184
185 /* Calculate secondary group hash */
186 andc r0,r27,r28
187 rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */
188
189 /* Call ppc_md.hpte_insert */
190 ld r7,STK_PARM(r4)(r1) /* Retrieve new pp bits */
191 mr r4,r29 /* Retrieve va */
192 li r6,1 /* secondary slot */
193 li r8,0 /* not bolted and not large */
194 li r9,0
195_GLOBAL(htab_call_hpte_insert2)
196 bl . /* Will be patched by htab_finish_init() */
197 cmpdi 0,r3,0
198 bge+ htab_pte_insert_ok /* Insertion successful */
199 cmpdi 0,r3,-2 /* Critical failure */
200 beq- htab_pte_insert_failure
201
202 /* Both are full, we need to evict something */
203 mftb r0
204 /* Pick a random group based on TB */
205 andi. r0,r0,1
206 mr r5,r28
207 bne 2f
208 not r5,r5
2092: and r0,r5,r27
210 rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */
211 /* Call ppc_md.hpte_remove */
212_GLOBAL(htab_call_hpte_remove)
213 bl . /* Will be patched by htab_finish_init() */
214
215 /* Try all again */
216 b htab_insert_pte
217
218htab_pte_insert_ok:
219 /* Insert slot number & secondary bit in PTE */
220 rldimi r30,r3,12,63-15
221
222 /* Write out the PTE with a normal write
223 * (maybe adding an eieio here would still be good?)
224 */
225htab_write_out_pte:
226 ld r6,STK_PARM(r6)(r1)
227 std r30,0(r6)
228 li r3, 0
229bail:
230 ld r27,STK_REG(r27)(r1)
231 ld r28,STK_REG(r28)(r1)
232 ld r29,STK_REG(r29)(r1)
233 ld r30,STK_REG(r30)(r1)
234 ld r31,STK_REG(r31)(r1)
235 addi r1,r1,STACKFRAMESIZE
236 ld r0,16(r1)
237 mtlr r0
238 blr
239
240htab_modify_pte:
241 /* Keep PP bits in r4 and slot idx from the PTE around in r3 */
242 mr r4,r3
243 rlwinm r3,r31,32-12,29,31
244
245 /* Secondary group? If yes, get an inverted hash value */
246 mr r5,r28
247 andi. r0,r31,_PAGE_SECONDARY
248 beq 1f
249 not r5,r5
2501:
251 /* Calculate proper slot value for ppc_md.hpte_updatepp */
252 and r0,r5,r27
253 rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */
254 add r3,r0,r3 /* add slot idx */
255
256 /* Call ppc_md.hpte_updatepp */
257 mr r5,r29 /* va */
258 li r6,0 /* large is 0 */
259 ld r7,STK_PARM(r8)(r1) /* get "local" param */
260_GLOBAL(htab_call_hpte_updatepp)
261 bl . /* Will be patched by htab_finish_init() */
262
263 /* If we failed, it's typically because the HPTE wasn't really
264 * here; try an insertion instead.
265 */
266 cmpdi 0,r3,-1
267 beq- htab_insert_pte
268
269 /* Clear the BUSY bit and Write out the PTE */
270 li r0,_PAGE_BUSY
271 andc r30,r30,r0
272 b htab_write_out_pte
273
274htab_wrong_access:
275 /* Bail out clearing reservation */
276 stdcx. r31,0,r6
277 li r3,1
278 b bail
279
280htab_pte_insert_failure:
281 /* Bail out restoring old PTE */
282 ld r6,STK_PARM(r6)(r1)
283 std r31,0(r6)
284 li r3,-1
285 b bail
286
287
diff --git a/arch/ppc64/mm/hash_native.c b/arch/ppc64/mm/hash_native.c
new file mode 100644
index 000000000000..144657e0c3d5
--- /dev/null
+++ b/arch/ppc64/mm/hash_native.c
@@ -0,0 +1,423 @@
1/*
2 * native hashtable management.
3 *
4 * SMP scalability work:
5 * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12#include <linux/spinlock.h>
13#include <linux/bitops.h>
14#include <linux/threads.h>
15#include <linux/smp.h>
16
17#include <asm/abs_addr.h>
18#include <asm/machdep.h>
19#include <asm/mmu.h>
20#include <asm/mmu_context.h>
21#include <asm/pgtable.h>
22#include <asm/tlbflush.h>
23#include <asm/tlb.h>
24#include <asm/cputable.h>
25
26#define HPTE_LOCK_BIT 3
27
28static DEFINE_SPINLOCK(native_tlbie_lock);
29
30static inline void native_lock_hpte(HPTE *hptep)
31{
32 unsigned long *word = &hptep->dw0.dword0;
33
34 while (1) {
35 if (!test_and_set_bit(HPTE_LOCK_BIT, word))
36 break;
37 while(test_bit(HPTE_LOCK_BIT, word))
38 cpu_relax();
39 }
40}
41
42static inline void native_unlock_hpte(HPTE *hptep)
43{
44 unsigned long *word = &hptep->dw0.dword0;
45
46 asm volatile("lwsync":::"memory");
47 clear_bit(HPTE_LOCK_BIT, word);
48}
49
50long native_hpte_insert(unsigned long hpte_group, unsigned long va,
51 unsigned long prpn, int secondary,
52 unsigned long hpteflags, int bolted, int large)
53{
54 unsigned long arpn = physRpn_to_absRpn(prpn);
55 HPTE *hptep = htab_address + hpte_group;
56 Hpte_dword0 dw0;
57 HPTE lhpte;
58 int i;
59
60 for (i = 0; i < HPTES_PER_GROUP; i++) {
61 dw0 = hptep->dw0.dw0;
62
63 if (!dw0.v) {
64 /* retry with lock held */
65 native_lock_hpte(hptep);
66 dw0 = hptep->dw0.dw0;
67 if (!dw0.v)
68 break;
69 native_unlock_hpte(hptep);
70 }
71
72 hptep++;
73 }
74
75 if (i == HPTES_PER_GROUP)
76 return -1;
77
78 lhpte.dw1.dword1 = 0;
79 lhpte.dw1.dw1.rpn = arpn;
80 lhpte.dw1.flags.flags = hpteflags;
81
82 lhpte.dw0.dword0 = 0;
83 lhpte.dw0.dw0.avpn = va >> 23;
84 lhpte.dw0.dw0.h = secondary;
85 lhpte.dw0.dw0.bolted = bolted;
86 lhpte.dw0.dw0.v = 1;
87
88 if (large) {
89 lhpte.dw0.dw0.l = 1;
90 lhpte.dw0.dw0.avpn &= ~0x1UL;
91 }
92
93 hptep->dw1.dword1 = lhpte.dw1.dword1;
94
95 /* Guarantee the second dword is visible before the valid bit */
96 __asm__ __volatile__ ("eieio" : : : "memory");
97
98 /*
99 * Now set the first dword including the valid bit
100 * NOTE: this also unlocks the hpte
101 */
102 hptep->dw0.dword0 = lhpte.dw0.dword0;
103
104 __asm__ __volatile__ ("ptesync" : : : "memory");
105
106 return i | (secondary << 3);
107}
108
109static long native_hpte_remove(unsigned long hpte_group)
110{
111 HPTE *hptep;
112 Hpte_dword0 dw0;
113 int i;
114 int slot_offset;
115
116 /* pick a random entry to start at */
117 slot_offset = mftb() & 0x7;
118
119 for (i = 0; i < HPTES_PER_GROUP; i++) {
120 hptep = htab_address + hpte_group + slot_offset;
121 dw0 = hptep->dw0.dw0;
122
123 if (dw0.v && !dw0.bolted) {
124 /* retry with lock held */
125 native_lock_hpte(hptep);
126 dw0 = hptep->dw0.dw0;
127 if (dw0.v && !dw0.bolted)
128 break;
129 native_unlock_hpte(hptep);
130 }
131
132 slot_offset++;
133 slot_offset &= 0x7;
134 }
135
136 if (i == HPTES_PER_GROUP)
137 return -1;
138
139 /* Invalidate the hpte. NOTE: this also unlocks it */
140 hptep->dw0.dword0 = 0;
141
142 return i;
143}
144
145static inline void set_pp_bit(unsigned long pp, HPTE *addr)
146{
147 unsigned long old;
148 unsigned long *p = &addr->dw1.dword1;
149
150 __asm__ __volatile__(
151 "1: ldarx %0,0,%3\n\
152 rldimi %0,%2,0,61\n\
153 stdcx. %0,0,%3\n\
154 bne 1b"
155 : "=&r" (old), "=m" (*p)
156 : "r" (pp), "r" (p), "m" (*p)
157 : "cc");
158}
159
160/*
161 * Only works on small pages. Yes, it's ugly to have to check each slot in
162 * the group, but we only use this during bootup.
163 */
164static long native_hpte_find(unsigned long vpn)
165{
166 HPTE *hptep;
167 unsigned long hash;
168 unsigned long i, j;
169 long slot;
170 Hpte_dword0 dw0;
171
172 hash = hpt_hash(vpn, 0);
173
174 for (j = 0; j < 2; j++) {
175 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
176 for (i = 0; i < HPTES_PER_GROUP; i++) {
177 hptep = htab_address + slot;
178 dw0 = hptep->dw0.dw0;
179
180 if ((dw0.avpn == (vpn >> 11)) && dw0.v &&
181 (dw0.h == j)) {
182 /* HPTE matches */
183 if (j)
184 slot = -slot;
185 return slot;
186 }
187 ++slot;
188 }
189 hash = ~hash;
190 }
191
192 return -1;
193}
194
195static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
196 unsigned long va, int large, int local)
197{
198 HPTE *hptep = htab_address + slot;
199 Hpte_dword0 dw0;
200 unsigned long avpn = va >> 23;
201 int ret = 0;
202
203 if (large)
204 avpn &= ~0x1UL;
205
206 native_lock_hpte(hptep);
207
208 dw0 = hptep->dw0.dw0;
209
210 /* Even if we miss, we need to invalidate the TLB */
211 if ((dw0.avpn != avpn) || !dw0.v) {
212 native_unlock_hpte(hptep);
213 ret = -1;
214 } else {
215 set_pp_bit(newpp, hptep);
216 native_unlock_hpte(hptep);
217 }
218
219 /* Ensure it is out of the tlb too */
220 if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
221 tlbiel(va);
222 } else {
223 int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
224
225 if (lock_tlbie)
226 spin_lock(&native_tlbie_lock);
227 tlbie(va, large);
228 if (lock_tlbie)
229 spin_unlock(&native_tlbie_lock);
230 }
231
232 return ret;
233}
234
235/*
236 * Update the page protection bits. Intended to be used to create
237 * guard pages for kernel data structures on pages which are bolted
238 * in the HPT. Assumes pages being operated on will not be stolen.
239 * Does not work on large pages.
240 *
241 * No need to lock here because we should be the only user.
242 */
243static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea)
244{
245 unsigned long vsid, va, vpn, flags = 0;
246 long slot;
247 HPTE *hptep;
248 int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
249
250 vsid = get_kernel_vsid(ea);
251 va = (vsid << 28) | (ea & 0x0fffffff);
252 vpn = va >> PAGE_SHIFT;
253
254 slot = native_hpte_find(vpn);
255 if (slot == -1)
256 panic("could not find page to bolt\n");
257 hptep = htab_address + slot;
258
259 set_pp_bit(newpp, hptep);
260
261 /* Ensure it is out of the tlb too */
262 if (lock_tlbie)
263 spin_lock_irqsave(&native_tlbie_lock, flags);
264 tlbie(va, 0);
265 if (lock_tlbie)
266 spin_unlock_irqrestore(&native_tlbie_lock, flags);
267}
268
269static void native_hpte_invalidate(unsigned long slot, unsigned long va,
270 int large, int local)
271{
272 HPTE *hptep = htab_address + slot;
273 Hpte_dword0 dw0;
274 unsigned long avpn = va >> 23;
275 unsigned long flags;
276 int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
277
278 if (large)
279 avpn &= ~0x1UL;
280
281 local_irq_save(flags);
282 native_lock_hpte(hptep);
283
284 dw0 = hptep->dw0.dw0;
285
286 /* Even if we miss, we need to invalidate the TLB */
287 if ((dw0.avpn != avpn) || !dw0.v) {
288 native_unlock_hpte(hptep);
289 } else {
290 /* Invalidate the hpte. NOTE: this also unlocks it */
291 hptep->dw0.dword0 = 0;
292 }
293
294 /* Invalidate the tlb */
295 if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
296 tlbiel(va);
297 } else {
298 if (lock_tlbie)
299 spin_lock(&native_tlbie_lock);
300 tlbie(va, large);
301 if (lock_tlbie)
302 spin_unlock(&native_tlbie_lock);
303 }
304 local_irq_restore(flags);
305}
306
307static void native_flush_hash_range(unsigned long context,
308 unsigned long number, int local)
309{
310 unsigned long vsid, vpn, va, hash, secondary, slot, flags, avpn;
311 int i, j;
312 HPTE *hptep;
313 Hpte_dword0 dw0;
314 struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
315
316 /* XXX fix for large ptes */
317 unsigned long large = 0;
318
319 local_irq_save(flags);
320
321 j = 0;
322 for (i = 0; i < number; i++) {
323 if ((batch->addr[i] >= USER_START) &&
324 (batch->addr[i] <= USER_END))
325 vsid = get_vsid(context, batch->addr[i]);
326 else
327 vsid = get_kernel_vsid(batch->addr[i]);
328
329 va = (vsid << 28) | (batch->addr[i] & 0x0fffffff);
330 batch->vaddr[j] = va;
331 if (large)
332 vpn = va >> HPAGE_SHIFT;
333 else
334 vpn = va >> PAGE_SHIFT;
335 hash = hpt_hash(vpn, large);
336 secondary = (pte_val(batch->pte[i]) & _PAGE_SECONDARY) >> 15;
337 if (secondary)
338 hash = ~hash;
339 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
340 slot += (pte_val(batch->pte[i]) & _PAGE_GROUP_IX) >> 12;
341
342 hptep = htab_address + slot;
343
344 avpn = va >> 23;
345 if (large)
346 avpn &= ~0x1UL;
347
348 native_lock_hpte(hptep);
349
350 dw0 = hptep->dw0.dw0;
351
352 /* Even if we miss, we need to invalidate the TLB */
353 if ((dw0.avpn != avpn) || !dw0.v) {
354 native_unlock_hpte(hptep);
355 } else {
356 /* Invalidate the hpte. NOTE: this also unlocks it */
357 hptep->dw0.dword0 = 0;
358 }
359
360 j++;
361 }
362
363 if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
364 asm volatile("ptesync":::"memory");
365
366 for (i = 0; i < j; i++)
367 __tlbiel(batch->vaddr[i]);
368
369 asm volatile("ptesync":::"memory");
370 } else {
371 int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
372
373 if (lock_tlbie)
374 spin_lock(&native_tlbie_lock);
375
376 asm volatile("ptesync":::"memory");
377
378 for (i = 0; i < j; i++)
379 __tlbie(batch->vaddr[i], 0);
380
381 asm volatile("eieio; tlbsync; ptesync":::"memory");
382
383 if (lock_tlbie)
384 spin_unlock(&native_tlbie_lock);
385 }
386
387 local_irq_restore(flags);
388}
389
390#ifdef CONFIG_PPC_PSERIES
391/* Disable TLB batching on nighthawk */
392static inline int tlb_batching_enabled(void)
393{
394 struct device_node *root = of_find_node_by_path("/");
395 int enabled = 1;
396
397 if (root) {
398 const char *model = get_property(root, "model", NULL);
399 if (model && !strcmp(model, "IBM,9076-N81"))
400 enabled = 0;
401 of_node_put(root);
402 }
403
404 return enabled;
405}
406#else
407static inline int tlb_batching_enabled(void)
408{
409 return 1;
410}
411#endif
412
413void hpte_init_native(void)
414{
415 ppc_md.hpte_invalidate = native_hpte_invalidate;
416 ppc_md.hpte_updatepp = native_hpte_updatepp;
417 ppc_md.hpte_updateboltedpp = native_hpte_updateboltedpp;
418 ppc_md.hpte_insert = native_hpte_insert;
419 ppc_md.hpte_remove = native_hpte_remove;
420 if (tlb_batching_enabled())
421 ppc_md.flush_hash_range = native_flush_hash_range;
422 htab_finish_init();
423}
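
native_lock_hpte() above serialises updates to an HPTE by spinning on a lock bit inside the entry's first doubleword: test-and-set to acquire, read-spin while someone else holds it, then a barrier plus clear_bit to release. Below is a minimal user-space sketch of the same pattern (illustrative only, not part of this commit; the kernel's bitops and lwsync are replaced by generic C11 atomics).

/* Illustrative sketch, not part of the commit: the HPTE lock-bit
 * protocol expressed with C11 atomics instead of kernel bitops. */
#include <stdatomic.h>

#define HPTE_LOCK_BIT 3

static void lock_hpte(atomic_ulong *word)
{
	unsigned long mask = 1UL << HPTE_LOCK_BIT;

	for (;;) {
		/* try to grab the bit; fetch_or returns the old value */
		if (!(atomic_fetch_or(word, mask) & mask))
			break;
		/* contended: spin reading until the holder clears it */
		while (atomic_load(word) & mask)
			;	/* the kernel calls cpu_relax() here */
	}
}

static void unlock_hpte(atomic_ulong *word)
{
	/* release ordering stands in for the kernel's lwsync + clear_bit */
	atomic_fetch_and_explicit(word, ~(1UL << HPTE_LOCK_BIT),
				  memory_order_release);
}
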
diff --git a/arch/ppc64/mm/hash_utils.c b/arch/ppc64/mm/hash_utils.c
new file mode 100644
index 000000000000..e48be12f518c
--- /dev/null
+++ b/arch/ppc64/mm/hash_utils.c
@@ -0,0 +1,439 @@
1/*
2 * PowerPC64 port by Mike Corrigan and Dave Engebretsen
3 * {mikejc|engebret}@us.ibm.com
4 *
5 * Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com>
6 *
7 * SMP scalability work:
8 * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
9 *
10 * Module name: htab.c
11 *
12 * Description:
13 * PowerPC Hashed Page Table functions
14 *
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version
18 * 2 of the License, or (at your option) any later version.
19 */
20
21#undef DEBUG
22
23#include <linux/config.h>
24#include <linux/spinlock.h>
25#include <linux/errno.h>
26#include <linux/sched.h>
27#include <linux/proc_fs.h>
28#include <linux/stat.h>
29#include <linux/sysctl.h>
30#include <linux/ctype.h>
31#include <linux/cache.h>
32#include <linux/init.h>
33#include <linux/signal.h>
34
35#include <asm/ppcdebug.h>
36#include <asm/processor.h>
37#include <asm/pgtable.h>
38#include <asm/mmu.h>
39#include <asm/mmu_context.h>
40#include <asm/page.h>
41#include <asm/types.h>
42#include <asm/system.h>
43#include <asm/uaccess.h>
44#include <asm/machdep.h>
45#include <asm/lmb.h>
46#include <asm/abs_addr.h>
47#include <asm/tlbflush.h>
48#include <asm/io.h>
49#include <asm/eeh.h>
50#include <asm/tlb.h>
51#include <asm/cacheflush.h>
52#include <asm/cputable.h>
53#include <asm/abs_addr.h>
54#include <asm/sections.h>
55
56#ifdef DEBUG
57#define DBG(fmt...) udbg_printf(fmt)
58#else
59#define DBG(fmt...)
60#endif
61
62/*
63 * Note: pte --> Linux PTE
64 * HPTE --> PowerPC Hashed Page Table Entry
65 *
66 * Execution context:
67 * htab_initialize is called with the MMU off (of course), but
68 * the kernel has been copied down to zero so it can directly
69 * reference global data. At this point it is very difficult
70 * to print debug info.
71 *
72 */
73
74#ifdef CONFIG_U3_DART
75extern unsigned long dart_tablebase;
76#endif /* CONFIG_U3_DART */
77
78HPTE *htab_address;
79unsigned long htab_hash_mask;
80
81extern unsigned long _SDR1;
82
83#define KB (1024)
84#define MB (1024*KB)
85
86static inline void loop_forever(void)
87{
88 volatile unsigned long x = 1;
89 for(;x;x|=1)
90 ;
91}
92
93#ifdef CONFIG_PPC_MULTIPLATFORM
94static inline void create_pte_mapping(unsigned long start, unsigned long end,
95 unsigned long mode, int large)
96{
97 unsigned long addr;
98 unsigned int step;
99 unsigned long tmp_mode;
100
101 if (large)
102 step = 16*MB;
103 else
104 step = 4*KB;
105
106 for (addr = start; addr < end; addr += step) {
107 unsigned long vpn, hash, hpteg;
108 unsigned long vsid = get_kernel_vsid(addr);
109 unsigned long va = (vsid << 28) | (addr & 0xfffffff);
110 int ret;
111
112 if (large)
113 vpn = va >> HPAGE_SHIFT;
114 else
115 vpn = va >> PAGE_SHIFT;
116
117
118 tmp_mode = mode;
119
120 /* Make non-kernel text non-executable */
121 if (!in_kernel_text(addr))
122 tmp_mode = mode | HW_NO_EXEC;
123
124 hash = hpt_hash(vpn, large);
125
126 hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
127
128#ifdef CONFIG_PPC_PSERIES
129 if (systemcfg->platform & PLATFORM_LPAR)
130 ret = pSeries_lpar_hpte_insert(hpteg, va,
131 virt_to_abs(addr) >> PAGE_SHIFT,
132 0, tmp_mode, 1, large);
133 else
134#endif /* CONFIG_PPC_PSERIES */
135 ret = native_hpte_insert(hpteg, va,
136 virt_to_abs(addr) >> PAGE_SHIFT,
137 0, tmp_mode, 1, large);
138
139 if (ret == -1) {
140 ppc64_terminate_msg(0x20, "create_pte_mapping");
141 loop_forever();
142 }
143 }
144}
145
146void __init htab_initialize(void)
147{
148 unsigned long table, htab_size_bytes;
149 unsigned long pteg_count;
150 unsigned long mode_rw;
151 int i, use_largepages = 0;
152 unsigned long base = 0, size = 0;
153 extern unsigned long tce_alloc_start, tce_alloc_end;
154
155 DBG(" -> htab_initialize()\n");
156
157 /*
158 * Calculate the required size of the htab. We want the number of
159 * PTEGs to equal one half the number of real pages.
160 */
161 htab_size_bytes = 1UL << ppc64_pft_size;
162 pteg_count = htab_size_bytes >> 7;
163
164 /* For debug, make the HTAB 1/8 as big as it normally would be. */
165 ifppcdebug(PPCDBG_HTABSIZE) {
166 pteg_count >>= 3;
167 htab_size_bytes = pteg_count << 7;
168 }
169
170 htab_hash_mask = pteg_count - 1;
171
172 if (systemcfg->platform & PLATFORM_LPAR) {
173 /* Using a hypervisor which owns the htab */
174 htab_address = NULL;
175 _SDR1 = 0;
176 } else {
177 /* Find storage for the HPT. Must be contiguous in
178 * the absolute address space.
179 */
180 table = lmb_alloc(htab_size_bytes, htab_size_bytes);
181
182 DBG("Hash table allocated at %lx, size: %lx\n", table,
183 htab_size_bytes);
184
185 if ( !table ) {
186 ppc64_terminate_msg(0x20, "hpt space");
187 loop_forever();
188 }
189 htab_address = abs_to_virt(table);
190
191 /* htab absolute addr + encoded htabsize */
192 _SDR1 = table + __ilog2(pteg_count) - 11;
193
194 /* Initialize the HPT with no entries */
195 memset((void *)table, 0, htab_size_bytes);
196 }
197
198 mode_rw = _PAGE_ACCESSED | _PAGE_COHERENT | PP_RWXX;
199
200 /* On U3 based machines, we need to reserve the DART area and
201 * _NOT_ map it to avoid cache paradoxes as it's remapped non
202 * cacheable later on
203 */
204 if (cpu_has_feature(CPU_FTR_16M_PAGE))
205 use_largepages = 1;
206
207 /* Create the bolted linear mapping in the hash table */
208 for (i=0; i < lmb.memory.cnt; i++) {
209 base = lmb.memory.region[i].physbase + KERNELBASE;
210 size = lmb.memory.region[i].size;
211
212 DBG("creating mapping for region: %lx : %lx\n", base, size);
213
214#ifdef CONFIG_U3_DART
215 /* Do not map the DART space. Fortunately, it will be aligned
216 * in such a way that it will not cross two lmb regions and will
217 * fit within a single 16Mb page.
218 * The DART space is assumed to be a full 16Mb region even if we
219 * only use 2Mb of that space. We will use more of it later for
220 * AGP GART. We have to use a full 16Mb large page.
221 */
222 DBG("DART base: %lx\n", dart_tablebase);
223
224 if (dart_tablebase != 0 && dart_tablebase >= base
225 && dart_tablebase < (base + size)) {
226 if (base != dart_tablebase)
227 create_pte_mapping(base, dart_tablebase, mode_rw,
228 use_largepages);
229 if ((base + size) > (dart_tablebase + 16*MB))
230 create_pte_mapping(dart_tablebase + 16*MB, base + size,
231 mode_rw, use_largepages);
232 continue;
233 }
234#endif /* CONFIG_U3_DART */
235 create_pte_mapping(base, base + size, mode_rw, use_largepages);
236 }
237
238 /*
239 * If we have a memory_limit and we've allocated TCEs then we need to
240 * explicitly map the TCE area at the top of RAM. We also cope with the
241 * case that the TCEs start below memory_limit.
242 * tce_alloc_start/end are 16MB aligned so the mapping should work
243 * for either 4K or 16MB pages.
244 */
245 if (tce_alloc_start) {
246 tce_alloc_start += KERNELBASE;
247 tce_alloc_end += KERNELBASE;
248
249 if (base + size >= tce_alloc_start)
250 tce_alloc_start = base + size + 1;
251
252 create_pte_mapping(tce_alloc_start, tce_alloc_end,
253 mode_rw, use_largepages);
254 }
255
256 DBG(" <- htab_initialize()\n");
257}
258#undef KB
259#undef MB
260#endif /* CONFIG_PPC_MULTIPLATFORM */
261
262/*
263 * Called by asm hashtable.S for doing lazy icache flush
264 */
265unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
266{
267 struct page *page;
268
269 if (!pfn_valid(pte_pfn(pte)))
270 return pp;
271
272 page = pte_page(pte);
273
274 /* page is dirty */
275 if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
276 if (trap == 0x400) {
277 __flush_dcache_icache(page_address(page));
278 set_bit(PG_arch_1, &page->flags);
279 } else
280 pp |= HW_NO_EXEC;
281 }
282 return pp;
283}
284
285/* Result code is:
286 * 0 - handled
287 * 1 - normal page fault
288 * -1 - critical hash insertion error
289 */
290int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
291{
292 void *pgdir;
293 unsigned long vsid;
294 struct mm_struct *mm;
295 pte_t *ptep;
296 int ret;
297 int user_region = 0;
298 int local = 0;
299 cpumask_t tmp;
300
301 switch (REGION_ID(ea)) {
302 case USER_REGION_ID:
303 user_region = 1;
304 mm = current->mm;
305 if ((ea > USER_END) || (! mm))
306 return 1;
307
308 vsid = get_vsid(mm->context.id, ea);
309 break;
310 case IO_REGION_ID:
311 if (ea > IMALLOC_END)
312 return 1;
313 mm = &ioremap_mm;
314 vsid = get_kernel_vsid(ea);
315 break;
316 case VMALLOC_REGION_ID:
317 if (ea > VMALLOC_END)
318 return 1;
319 mm = &init_mm;
320 vsid = get_kernel_vsid(ea);
321 break;
322#if 0
323 case KERNEL_REGION_ID:
324 /*
325 * Should never get here - entire 0xC0... region is bolted.
326 * Send the problem up to do_page_fault
327 */
328#endif
329 default:
330 /* Not a valid range
331 * Send the problem up to do_page_fault
332 */
333 return 1;
334 break;
335 }
336
337 pgdir = mm->pgd;
338
339 if (pgdir == NULL)
340 return 1;
341
342 tmp = cpumask_of_cpu(smp_processor_id());
343 if (user_region && cpus_equal(mm->cpu_vm_mask, tmp))
344 local = 1;
345
346 /* Is this a huge page ? */
347 if (unlikely(in_hugepage_area(mm->context, ea)))
348 ret = hash_huge_page(mm, access, ea, vsid, local);
349 else {
350 ptep = find_linux_pte(pgdir, ea);
351 if (ptep == NULL)
352 return 1;
353 ret = __hash_page(ea, access, vsid, ptep, trap, local);
354 }
355
356 return ret;
357}
358
359void flush_hash_page(unsigned long context, unsigned long ea, pte_t pte,
360 int local)
361{
362 unsigned long vsid, vpn, va, hash, secondary, slot;
363 unsigned long huge = pte_huge(pte);
364
365 if ((ea >= USER_START) && (ea <= USER_END))
366 vsid = get_vsid(context, ea);
367 else
368 vsid = get_kernel_vsid(ea);
369
370 va = (vsid << 28) | (ea & 0x0fffffff);
371 if (huge)
372 vpn = va >> HPAGE_SHIFT;
373 else
374 vpn = va >> PAGE_SHIFT;
375 hash = hpt_hash(vpn, huge);
376 secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15;
377 if (secondary)
378 hash = ~hash;
379 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
380 slot += (pte_val(pte) & _PAGE_GROUP_IX) >> 12;
381
382 ppc_md.hpte_invalidate(slot, va, huge, local);
383}
384
385void flush_hash_range(unsigned long context, unsigned long number, int local)
386{
387 if (ppc_md.flush_hash_range) {
388 ppc_md.flush_hash_range(context, number, local);
389 } else {
390 int i;
391 struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
392
393 for (i = 0; i < number; i++)
394 flush_hash_page(context, batch->addr[i], batch->pte[i],
395 local);
396 }
397}
398
399static inline void make_bl(unsigned int *insn_addr, void *func)
400{
401 unsigned long funcp = *((unsigned long *)func);
402 int offset = funcp - (unsigned long)insn_addr;
403
404 *insn_addr = (unsigned int)(0x48000001 | (offset & 0x03fffffc));
405 flush_icache_range((unsigned long)insn_addr, 4+
406 (unsigned long)insn_addr);
407}
408
409/*
410 * low_hash_fault is called when the low-level hash code failed
411 * to insert a PTE due to a hypervisor error
412 */
413void low_hash_fault(struct pt_regs *regs, unsigned long address)
414{
415 if (user_mode(regs)) {
416 siginfo_t info;
417
418 info.si_signo = SIGBUS;
419 info.si_errno = 0;
420 info.si_code = BUS_ADRERR;
421 info.si_addr = (void __user *)address;
422 force_sig_info(SIGBUS, &info, current);
423 return;
424 }
425 bad_page_fault(regs, address, SIGBUS);
426}
427
428void __init htab_finish_init(void)
429{
430 extern unsigned int *htab_call_hpte_insert1;
431 extern unsigned int *htab_call_hpte_insert2;
432 extern unsigned int *htab_call_hpte_remove;
433 extern unsigned int *htab_call_hpte_updatepp;
434
435 make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert);
436 make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert);
437 make_bl(htab_call_hpte_remove, ppc_md.hpte_remove);
438 make_bl(htab_call_hpte_updatepp, ppc_md.hpte_updatepp);
439}
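
flush_hash_page() above and the assembler in hash_low.S share the same slot arithmetic: hash the (vsid, page) pair, invert the hash for the secondary group, and multiply by the group size of 8 (the "<< 3" in hash_low.S) to reach the start of the PTE group. Here is a minimal sketch of that arithmetic for the 4K-page case (illustrative only, not part of this commit; the vsid, effective address and htab_hash_mask values are made up for the example).

/* Illustrative sketch, not part of the commit: the primary/secondary
 * hash-group arithmetic for 4K pages, following the hash_low.S comments
 * (hash = (vsid & 0x7fffffffff) ^ ((ea >> 12) & 0xffff)). */
#include <stdio.h>

#define HPTES_PER_GROUP 8	/* matches the "<< 3" in hash_low.S */

static unsigned long hpt_hash_4k(unsigned long vsid, unsigned long ea)
{
	return (vsid & 0x0000007fffffffffUL) ^ ((ea >> 12) & 0xffffUL);
}

int main(void)
{
	unsigned long htab_hash_mask = 0xfffff;	/* example: 2^20 PTE groups */
	unsigned long vsid = 0x123456789UL;	/* example VSID */
	unsigned long ea   = 0x10002000UL;	/* example effective address */

	unsigned long hash = hpt_hash_4k(vsid, ea);
	unsigned long primary   = (hash  & htab_hash_mask) * HPTES_PER_GROUP;
	unsigned long secondary = (~hash & htab_hash_mask) * HPTES_PER_GROUP;

	printf("hash                    = %#lx\n", hash);
	printf("primary group at slot   = %#lx\n", primary);
	printf("secondary group at slot = %#lx\n", secondary);
	return 0;
}
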
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
new file mode 100644
index 000000000000..c62ddaff0720
--- /dev/null
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -0,0 +1,904 @@
1/*
2 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
3 *
4 * Copyright (C) 2003 David Gibson, IBM Corporation.
5 *
6 * Based on the IA-32 version:
7 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
8 */
9
10#include <linux/init.h>
11#include <linux/fs.h>
12#include <linux/mm.h>
13#include <linux/hugetlb.h>
14#include <linux/pagemap.h>
15#include <linux/smp_lock.h>
16#include <linux/slab.h>
17#include <linux/err.h>
18#include <linux/sysctl.h>
19#include <asm/mman.h>
20#include <asm/pgalloc.h>
21#include <asm/tlb.h>
22#include <asm/tlbflush.h>
23#include <asm/mmu_context.h>
24#include <asm/machdep.h>
25#include <asm/cputable.h>
26#include <asm/tlb.h>
27
28#include <linux/sysctl.h>
29
30#define HUGEPGDIR_SHIFT (HPAGE_SHIFT + PAGE_SHIFT - 3)
31#define HUGEPGDIR_SIZE (1UL << HUGEPGDIR_SHIFT)
32#define HUGEPGDIR_MASK (~(HUGEPGDIR_SIZE-1))
33
34#define HUGEPTE_INDEX_SIZE 9
35#define HUGEPGD_INDEX_SIZE 10
36
37#define PTRS_PER_HUGEPTE (1 << HUGEPTE_INDEX_SIZE)
38#define PTRS_PER_HUGEPGD (1 << HUGEPGD_INDEX_SIZE)
39
40static inline int hugepgd_index(unsigned long addr)
41{
42 return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT;
43}
44
45static pgd_t *hugepgd_offset(struct mm_struct *mm, unsigned long addr)
46{
47 int index;
48
49 if (! mm->context.huge_pgdir)
50 return NULL;
51
52
53 index = hugepgd_index(addr);
54 BUG_ON(index >= PTRS_PER_HUGEPGD);
55 return mm->context.huge_pgdir + index;
56}
57
58static inline pte_t *hugepte_offset(pgd_t *dir, unsigned long addr)
59{
60 int index;
61
62 if (pgd_none(*dir))
63 return NULL;
64
65 index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE;
66 return (pte_t *)pgd_page(*dir) + index;
67}
68
69static pgd_t *hugepgd_alloc(struct mm_struct *mm, unsigned long addr)
70{
71 BUG_ON(! in_hugepage_area(mm->context, addr));
72
73 if (! mm->context.huge_pgdir) {
74 pgd_t *new;
75 spin_unlock(&mm->page_table_lock);
76 /* Don't use pgd_alloc(), because we want __GFP_REPEAT */
77 new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
78 BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
79 spin_lock(&mm->page_table_lock);
80
81 /*
82 * Because we dropped the lock, we should re-check the
83 * entry, as somebody else could have populated it..
84 */
85 if (mm->context.huge_pgdir)
86 pgd_free(new);
87 else
88 mm->context.huge_pgdir = new;
89 }
90 return hugepgd_offset(mm, addr);
91}
92
93static pte_t *hugepte_alloc(struct mm_struct *mm, pgd_t *dir,
94 unsigned long addr)
95{
96 if (! pgd_present(*dir)) {
97 pte_t *new;
98
99 spin_unlock(&mm->page_table_lock);
100 new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
101 BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
102 spin_lock(&mm->page_table_lock);
103 /*
104 * Because we dropped the lock, we should re-check the
105 * entry, as somebody else could have populated it..
106 */
107 if (pgd_present(*dir)) {
108 if (new)
109 kmem_cache_free(zero_cache, new);
110 } else {
111 struct page *ptepage;
112
113 if (! new)
114 return NULL;
115 ptepage = virt_to_page(new);
116 ptepage->mapping = (void *) mm;
117 ptepage->index = addr & HUGEPGDIR_MASK;
118 pgd_populate(mm, dir, new);
119 }
120 }
121
122 return hugepte_offset(dir, addr);
123}
124
125static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
126{
127 pgd_t *pgd;
128
129 BUG_ON(! in_hugepage_area(mm->context, addr));
130
131 pgd = hugepgd_offset(mm, addr);
132 if (! pgd)
133 return NULL;
134
135 return hugepte_offset(pgd, addr);
136}
137
138static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
139{
140 pgd_t *pgd;
141
142 BUG_ON(! in_hugepage_area(mm->context, addr));
143
144 pgd = hugepgd_alloc(mm, addr);
145 if (! pgd)
146 return NULL;
147
148 return hugepte_alloc(mm, pgd, addr);
149}
150
151static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
152 unsigned long addr, struct page *page,
153 pte_t *ptep, int write_access)
154{
155 pte_t entry;
156
157 add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
158 if (write_access) {
159 entry =
160 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
161 } else {
162 entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
163 }
164 entry = pte_mkyoung(entry);
165 entry = pte_mkhuge(entry);
166
167 set_pte_at(mm, addr, ptep, entry);
168}
169
170/*
171 * This function checks for proper alignment of input addr and len parameters.
172 */
173int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
174{
175 if (len & ~HPAGE_MASK)
176 return -EINVAL;
177 if (addr & ~HPAGE_MASK)
178 return -EINVAL;
179 if (! (within_hugepage_low_range(addr, len)
180 || within_hugepage_high_range(addr, len)) )
181 return -EINVAL;
182 return 0;
183}
184
185static void flush_segments(void *parm)
186{
187 u16 segs = (unsigned long) parm;
188 unsigned long i;
189
190 asm volatile("isync" : : : "memory");
191
192 for (i = 0; i < 16; i++) {
193 if (! (segs & (1U << i)))
194 continue;
195 asm volatile("slbie %0" : : "r" (i << SID_SHIFT));
196 }
197
198 asm volatile("isync" : : : "memory");
199}
200
201static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
202{
203 unsigned long start = seg << SID_SHIFT;
204 unsigned long end = (seg+1) << SID_SHIFT;
205 struct vm_area_struct *vma;
206 unsigned long addr;
207 struct mmu_gather *tlb;
208
209 BUG_ON(seg >= 16);
210
211 /* Check no VMAs are in the region */
212 vma = find_vma(mm, start);
213 if (vma && (vma->vm_start < end))
214 return -EBUSY;
215
216 /* Clean up any leftover PTE pages in the region */
217 spin_lock(&mm->page_table_lock);
218 tlb = tlb_gather_mmu(mm, 0);
219 for (addr = start; addr < end; addr += PMD_SIZE) {
220 pgd_t *pgd = pgd_offset(mm, addr);
221 pmd_t *pmd;
222 struct page *page;
223 pte_t *pte;
224 int i;
225
226 if (pgd_none(*pgd))
227 continue;
228 pmd = pmd_offset(pgd, addr);
229 if (!pmd || pmd_none(*pmd))
230 continue;
231 if (pmd_bad(*pmd)) {
232 pmd_ERROR(*pmd);
233 pmd_clear(pmd);
234 continue;
235 }
236 pte = (pte_t *)pmd_page_kernel(*pmd);
237 /* No VMAs, so there should be no PTEs, check just in case. */
238 for (i = 0; i < PTRS_PER_PTE; i++) {
239 BUG_ON(!pte_none(*pte));
240 pte++;
241 }
242 page = pmd_page(*pmd);
243 pmd_clear(pmd);
244 mm->nr_ptes--;
245 dec_page_state(nr_page_table_pages);
246 pte_free_tlb(tlb, page);
247 }
248 tlb_finish_mmu(tlb, start, end);
249 spin_unlock(&mm->page_table_lock);
250
251 return 0;
252}
253
254static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
255{
256 unsigned long i;
257
258 newsegs &= ~(mm->context.htlb_segs);
259 if (! newsegs)
260 return 0; /* The segments we want are already open */
261
262 for (i = 0; i < 16; i++)
263 if ((1 << i) & newsegs)
264 if (prepare_low_seg_for_htlb(mm, i) != 0)
265 return -EBUSY;
266
267 mm->context.htlb_segs |= newsegs;
268
269 /* update the paca copy of the context struct */
270 get_paca()->context = mm->context;
271
272 /* the context change must make it to memory before the flush,
273 * so that further SLB misses do the right thing. */
274 mb();
275 on_each_cpu(flush_segments, (void *)(unsigned long)newsegs, 0, 1);
276
277 return 0;
278}
279
280int prepare_hugepage_range(unsigned long addr, unsigned long len)
281{
282 if (within_hugepage_high_range(addr, len))
283 return 0;
284 else if ((addr < 0x100000000UL) && ((addr+len) < 0x100000000UL)) {
285 int err;
286 /* Yes, we need both tests, in case addr+len overflows
287 * 64-bit arithmetic */
288 err = open_low_hpage_segs(current->mm,
289 LOW_ESID_MASK(addr, len));
290 if (err)
291 printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
292 " failed (segs: 0x%04hx)\n", addr, len,
293 LOW_ESID_MASK(addr, len));
294 return err;
295 }
296
297 return -EINVAL;
298}
299
300int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
301 struct vm_area_struct *vma)
302{
303 pte_t *src_pte, *dst_pte, entry;
304 struct page *ptepage;
305 unsigned long addr = vma->vm_start;
306 unsigned long end = vma->vm_end;
307 int err = -ENOMEM;
308
309 while (addr < end) {
310 dst_pte = huge_pte_alloc(dst, addr);
311 if (!dst_pte)
312 goto out;
313
314 src_pte = huge_pte_offset(src, addr);
315 entry = *src_pte;
316
317 ptepage = pte_page(entry);
318 get_page(ptepage);
319 add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
320 set_pte_at(dst, addr, dst_pte, entry);
321
322 addr += HPAGE_SIZE;
323 }
324
325 err = 0;
326 out:
327 return err;
328}
329
330int
331follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
332 struct page **pages, struct vm_area_struct **vmas,
333 unsigned long *position, int *length, int i)
334{
335 unsigned long vpfn, vaddr = *position;
336 int remainder = *length;
337
338 WARN_ON(!is_vm_hugetlb_page(vma));
339
340 vpfn = vaddr/PAGE_SIZE;
341 while (vaddr < vma->vm_end && remainder) {
342 if (pages) {
343 pte_t *pte;
344 struct page *page;
345
346 pte = huge_pte_offset(mm, vaddr);
347
348 /* hugetlb should be locked, and hence, prefaulted */
349 WARN_ON(!pte || pte_none(*pte));
350
351 page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
352
353 WARN_ON(!PageCompound(page));
354
355 get_page(page);
356 pages[i] = page;
357 }
358
359 if (vmas)
360 vmas[i] = vma;
361
362 vaddr += PAGE_SIZE;
363 ++vpfn;
364 --remainder;
365 ++i;
366 }
367
368 *length = remainder;
369 *position = vaddr;
370
371 return i;
372}
373
374struct page *
375follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
376{
377 pte_t *ptep;
378 struct page *page;
379
380 if (! in_hugepage_area(mm->context, address))
381 return ERR_PTR(-EINVAL);
382
383 ptep = huge_pte_offset(mm, address);
384 page = pte_page(*ptep);
385 if (page)
386 page += (address % HPAGE_SIZE) / PAGE_SIZE;
387
388 return page;
389}
390
391int pmd_huge(pmd_t pmd)
392{
393 return 0;
394}
395
396struct page *
397follow_huge_pmd(struct mm_struct *mm, unsigned long address,
398 pmd_t *pmd, int write)
399{
400 BUG();
401 return NULL;
402}
403
404void unmap_hugepage_range(struct vm_area_struct *vma,
405 unsigned long start, unsigned long end)
406{
407 struct mm_struct *mm = vma->vm_mm;
408 unsigned long addr;
409 pte_t *ptep;
410 struct page *page;
411
412 WARN_ON(!is_vm_hugetlb_page(vma));
413 BUG_ON((start % HPAGE_SIZE) != 0);
414 BUG_ON((end % HPAGE_SIZE) != 0);
415
416 for (addr = start; addr < end; addr += HPAGE_SIZE) {
417 pte_t pte;
418
419 ptep = huge_pte_offset(mm, addr);
420 if (!ptep || pte_none(*ptep))
421 continue;
422
423 pte = *ptep;
424 page = pte_page(pte);
425 pte_clear(mm, addr, ptep);
426
427 put_page(page);
428 }
429 add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
430 flush_tlb_pending();
431}
432
433void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
434 unsigned long start, unsigned long end)
435{
436 /* Because the huge pgtables are only 2 level, they can take
437 * at most around 4M, much less than one hugepage which the
438 * process is presumably entitled to use. So we don't bother
439 * freeing up the pagetables on unmap, and wait until
440 * destroy_context() to clean up the lot. */
441}
442
443int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
444{
445 struct mm_struct *mm = current->mm;
446 unsigned long addr;
447 int ret = 0;
448
449 WARN_ON(!is_vm_hugetlb_page(vma));
450 BUG_ON((vma->vm_start % HPAGE_SIZE) != 0);
451 BUG_ON((vma->vm_end % HPAGE_SIZE) != 0);
452
453 spin_lock(&mm->page_table_lock);
454 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
455 unsigned long idx;
456 pte_t *pte = huge_pte_alloc(mm, addr);
457 struct page *page;
458
459 if (!pte) {
460 ret = -ENOMEM;
461 goto out;
462 }
463 if (! pte_none(*pte))
464 continue;
465
466 idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
467 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
468 page = find_get_page(mapping, idx);
469 if (!page) {
470 /* charge the fs quota first */
471 if (hugetlb_get_quota(mapping)) {
472 ret = -ENOMEM;
473 goto out;
474 }
475 page = alloc_huge_page();
476 if (!page) {
477 hugetlb_put_quota(mapping);
478 ret = -ENOMEM;
479 goto out;
480 }
481 ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
482 if (! ret) {
483 unlock_page(page);
484 } else {
485 hugetlb_put_quota(mapping);
486 free_huge_page(page);
487 goto out;
488 }
489 }
490 set_huge_pte(mm, vma, addr, page, pte, vma->vm_flags & VM_WRITE);
491 }
492out:
493 spin_unlock(&mm->page_table_lock);
494 return ret;
495}
496
497/* Because we have an exclusive hugepage region which lies within the
498 * normal user address space, we have to take special measures to make
499 * non-huge mmap()s evade the hugepage reserved regions. */
500unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
501 unsigned long len, unsigned long pgoff,
502 unsigned long flags)
503{
504 struct mm_struct *mm = current->mm;
505 struct vm_area_struct *vma;
506 unsigned long start_addr;
507
508 if (len > TASK_SIZE)
509 return -ENOMEM;
510
511 if (addr) {
512 addr = PAGE_ALIGN(addr);
513 vma = find_vma(mm, addr);
514 if (((TASK_SIZE - len) >= addr)
515 && (!vma || (addr+len) <= vma->vm_start)
516 && !is_hugepage_only_range(mm, addr,len))
517 return addr;
518 }
519 start_addr = addr = mm->free_area_cache;
520
521full_search:
522 vma = find_vma(mm, addr);
523 while (TASK_SIZE - len >= addr) {
524 BUG_ON(vma && (addr >= vma->vm_end));
525
526 if (touches_hugepage_low_range(mm, addr, len)) {
527 addr = ALIGN(addr+1, 1<<SID_SHIFT);
528 vma = find_vma(mm, addr);
529 continue;
530 }
531 if (touches_hugepage_high_range(addr, len)) {
532 addr = TASK_HPAGE_END;
533 vma = find_vma(mm, addr);
534 continue;
535 }
536 if (!vma || addr + len <= vma->vm_start) {
537 /*
538 * Remember the place where we stopped the search:
539 */
540 mm->free_area_cache = addr + len;
541 return addr;
542 }
543 addr = vma->vm_end;
544 vma = vma->vm_next;
545 }
546
547 /* Make sure we didn't miss any holes */
548 if (start_addr != TASK_UNMAPPED_BASE) {
549 start_addr = addr = TASK_UNMAPPED_BASE;
550 goto full_search;
551 }
552 return -ENOMEM;
553}
554
555/*
556 * This mmap-allocator allocates new areas top-down from below the
557 * stack's low limit (the base):
558 *
559 * Because we have an exclusive hugepage region which lies within the
560 * normal user address space, we have to take special measures to make
561 * non-huge mmap()s evade the hugepage reserved regions.
562 */
563unsigned long
564arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
565 const unsigned long len, const unsigned long pgoff,
566 const unsigned long flags)
567{
568 struct vm_area_struct *vma, *prev_vma;
569 struct mm_struct *mm = current->mm;
570 unsigned long base = mm->mmap_base, addr = addr0;
571 int first_time = 1;
572
573 /* requested length too big for entire address space */
574 if (len > TASK_SIZE)
575 return -ENOMEM;
576
577 /* don't allow allocations above current base */
578 if (mm->free_area_cache > base)
579 mm->free_area_cache = base;
580
581 /* requesting a specific address */
582 if (addr) {
583 addr = PAGE_ALIGN(addr);
584 vma = find_vma(mm, addr);
585 if (TASK_SIZE - len >= addr &&
586 (!vma || addr + len <= vma->vm_start)
587 && !is_hugepage_only_range(mm, addr,len))
588 return addr;
589 }
590
591try_again:
592 /* make sure it can fit in the remaining address space */
593 if (mm->free_area_cache < len)
594 goto fail;
595
596 /* either no address requested or can't fit in requested address hole */
597 addr = (mm->free_area_cache - len) & PAGE_MASK;
598 do {
599hugepage_recheck:
600 if (touches_hugepage_low_range(mm, addr, len)) {
601 addr = (addr & ((~0) << SID_SHIFT)) - len;
602 goto hugepage_recheck;
603 } else if (touches_hugepage_high_range(addr, len)) {
604 addr = TASK_HPAGE_BASE - len;
605 }
606
607 /*
608 * Lookup failure means no vma is above this address,
609 * i.e. return with success:
610 */
611 if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
612 return addr;
613
614 /*
615 * new region fits between prev_vma->vm_end and
616 * vma->vm_start, use it:
617 */
618 if (addr+len <= vma->vm_start &&
619 (!prev_vma || (addr >= prev_vma->vm_end)))
620 /* remember the address as a hint for next time */
621 return (mm->free_area_cache = addr);
622 else
623 /* pull free_area_cache down to the first hole */
624 if (mm->free_area_cache == vma->vm_end)
625 mm->free_area_cache = vma->vm_start;
626
627 /* try just below the current vma->vm_start */
628 addr = vma->vm_start-len;
629 } while (len <= vma->vm_start);
630
631fail:
632 /*
633 * if hint left us with no space for the requested
634 * mapping then try again:
635 */
636 if (first_time) {
637 mm->free_area_cache = base;
638 first_time = 0;
639 goto try_again;
640 }
641 /*
642 * A failed mmap() very likely causes application failure,
643 * so fall back to the bottom-up function here. This scenario
644 * can happen with large stack limits and large mmap()
645 * allocations.
646 */
647 mm->free_area_cache = TASK_UNMAPPED_BASE;
648 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
649 /*
650 * Restore the topdown base:
651 */
652 mm->free_area_cache = base;
653
654 return addr;
655}
656
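/* Scan the low 4GB for a gap of length len that lies entirely within
 * the low hugepage segments selected by segmask; returns the start
 * address, or -ENOMEM if no suitable gap exists. */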
657static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
658{
659 unsigned long addr = 0;
660 struct vm_area_struct *vma;
661
662 vma = find_vma(current->mm, addr);
663 while (addr + len <= 0x100000000UL) {
664 BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
665
666 if (! __within_hugepage_low_range(addr, len, segmask)) {
667 addr = ALIGN(addr+1, 1<<SID_SHIFT);
668 vma = find_vma(current->mm, addr);
669 continue;
670 }
671
672 if (!vma || (addr + len) <= vma->vm_start)
673 return addr;
674 addr = ALIGN(vma->vm_end, HPAGE_SIZE);
675 /* Depending on segmask this might not be a confirmed
676 * hugepage region, so the ALIGN could have skipped
677 * some VMAs */
678 vma = find_vma(current->mm, addr);
679 }
680
681 return -ENOMEM;
682}
683
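/* As above, but for the dedicated high hugepage range between
 * TASK_HPAGE_BASE and TASK_HPAGE_END. */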
684static unsigned long htlb_get_high_area(unsigned long len)
685{
686 unsigned long addr = TASK_HPAGE_BASE;
687 struct vm_area_struct *vma;
688
689 vma = find_vma(current->mm, addr);
690 for (vma = find_vma(current->mm, addr);
691 addr + len <= TASK_HPAGE_END;
692 vma = vma->vm_next) {
693 BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
694 BUG_ON(! within_hugepage_high_range(addr, len));
695
696 if (!vma || (addr + len) <= vma->vm_start)
697 return addr;
698 addr = ALIGN(vma->vm_end, HPAGE_SIZE);
699 /* Because we're in a hugepage region, this alignment
700 * should not skip us over any VMAs */
701 }
702
703 return -ENOMEM;
704}
705
706unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
707 unsigned long len, unsigned long pgoff,
708 unsigned long flags)
709{
710 if (len & ~HPAGE_MASK)
711 return -EINVAL;
712
713 if (!cpu_has_feature(CPU_FTR_16M_PAGE))
714 return -EINVAL;
715
716 if (test_thread_flag(TIF_32BIT)) {
717 int lastshift = 0;
718 u16 segmask, cursegs = current->mm->context.htlb_segs;
719
720 /* First see if we can do the mapping in the existing
721 * low hpage segments */
722 addr = htlb_get_low_area(len, cursegs);
723 if (addr != -ENOMEM)
724 return addr;
725
726 for (segmask = LOW_ESID_MASK(0x100000000UL-len, len);
727 ! lastshift; segmask >>=1) {
728 if (segmask & 1)
729 lastshift = 1;
730
731 addr = htlb_get_low_area(len, cursegs | segmask);
732 if ((addr != -ENOMEM)
733 && open_low_hpage_segs(current->mm, segmask) == 0)
734 return addr;
735 }
736 printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
737 " enough segments\n");
738 return -ENOMEM;
739 } else {
740 return htlb_get_high_area(len);
741 }
742}
743
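/* Tear down the per-mm hugepage page directory, returning any leftover
 * (and by now zero-filled) hugepte pages and the pgdir itself to the
 * zero_cache slab. */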
744void hugetlb_mm_free_pgd(struct mm_struct *mm)
745{
746 int i;
747 pgd_t *pgdir;
748
749 spin_lock(&mm->page_table_lock);
750
751 pgdir = mm->context.huge_pgdir;
752 if (! pgdir)
753 goto out;
754
755 mm->context.huge_pgdir = NULL;
756
757 /* cleanup any hugepte pages leftover */
758 for (i = 0; i < PTRS_PER_HUGEPGD; i++) {
759 pgd_t *pgd = pgdir + i;
760
761 if (! pgd_none(*pgd)) {
762 pte_t *pte = (pte_t *)pgd_page(*pgd);
763 struct page *ptepage = virt_to_page(pte);
764
765 ptepage->mapping = NULL;
766
767 BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE));
768 kmem_cache_free(zero_cache, pte);
769 }
770 pgd_clear(pgd);
771 }
772
773 BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE));
774 kmem_cache_free(zero_cache, pgdir);
775
776 out:
777 spin_unlock(&mm->page_table_lock);
778}
779
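/* Build or update the hash table entry for a hugepage fault at ea.
 * Returns 0 on success, or 1 to pass the fault up to do_page_fault(). */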
780int hash_huge_page(struct mm_struct *mm, unsigned long access,
781 unsigned long ea, unsigned long vsid, int local)
782{
783 pte_t *ptep;
784 unsigned long va, vpn;
785 pte_t old_pte, new_pte;
786 unsigned long hpteflags, prpn;
787 long slot;
788 int err = 1;
789
790 spin_lock(&mm->page_table_lock);
791
792 ptep = huge_pte_offset(mm, ea);
793
794 /* Search the Linux page table for a match with va */
795 va = (vsid << 28) | (ea & 0x0fffffff);
796 vpn = va >> HPAGE_SHIFT;
797
798 /*
799 * If no pte found or not present, send the problem up to
800 * do_page_fault
801 */
802 if (unlikely(!ptep || pte_none(*ptep)))
803 goto out;
804
805/* BUG_ON(pte_bad(*ptep)); */
806
807 /*
808 * Check the user's access rights to the page. If access should be
809 * prevented then send the problem up to do_page_fault.
810 */
811 if (unlikely(access & ~pte_val(*ptep)))
812 goto out;
813 /*
814 * At this point, we have a pte (old_pte) which can be used to build
815 * or update an HPTE. There are 2 cases:
816 *
817 * 1. There is a valid (present) pte with no associated HPTE (this is
818 * the most common case)
819 * 2. There is a valid (present) pte with an associated HPTE. The
820 * current values of the pp bits in the HPTE prevent access
821 * because we are doing software DIRTY bit management and the
822 * page is currently not DIRTY.
823 */
824
825
826 old_pte = *ptep;
827 new_pte = old_pte;
828
829 hpteflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
830 /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
831 hpteflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC);
832
833 /* Check if pte already has an hpte (case 2) */
834 if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
835 /* There MIGHT be an HPTE for this pte */
836 unsigned long hash, slot;
837
838 hash = hpt_hash(vpn, 1);
839 if (pte_val(old_pte) & _PAGE_SECONDARY)
840 hash = ~hash;
841 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
842 slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
843
844 if (ppc_md.hpte_updatepp(slot, hpteflags, va, 1, local) == -1)
845 pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
846 }
847
848 if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
849 unsigned long hash = hpt_hash(vpn, 1);
850 unsigned long hpte_group;
851
852 prpn = pte_pfn(old_pte);
853
854repeat:
855 hpte_group = ((hash & htab_hash_mask) *
856 HPTES_PER_GROUP) & ~0x7UL;
857
858 /* Update the linux pte with the HPTE slot */
859 pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
860 pte_val(new_pte) |= _PAGE_HASHPTE;
861
862 /* Add in WIMG bits */
863 /* XXX We should store these in the pte */
864 hpteflags |= _PAGE_COHERENT;
865
866 slot = ppc_md.hpte_insert(hpte_group, va, prpn, 0,
867 hpteflags, 0, 1);
868
869 /* Primary is full, try the secondary */
870 if (unlikely(slot == -1)) {
871 pte_val(new_pte) |= _PAGE_SECONDARY;
872 hpte_group = ((~hash & htab_hash_mask) *
873 HPTES_PER_GROUP) & ~0x7UL;
874 slot = ppc_md.hpte_insert(hpte_group, va, prpn,
875 1, hpteflags, 0, 1);
876 if (slot == -1) {
877 if (mftb() & 0x1)
878 hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
879
880 ppc_md.hpte_remove(hpte_group);
881 goto repeat;
882 }
883 }
884
885 if (unlikely(slot == -2))
886 panic("hash_huge_page: pte_insert failed\n");
887
888 pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX;
889
890 /*
891 * No need to use ldarx/stdcx here because all who
892 * might be updating the pte will hold the
893 * page_table_lock
894 */
895 *ptep = new_pte;
896 }
897
898 err = 0;
899
900 out:
901 spin_unlock(&mm->page_table_lock);
902
903 return err;
904}
diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c
new file mode 100644
index 000000000000..9d92b0d9cde5
--- /dev/null
+++ b/arch/ppc64/mm/imalloc.c
@@ -0,0 +1,312 @@
1/*
2 * c 2001 PPC 64 Team, IBM Corp
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9
10#include <linux/slab.h>
11#include <linux/vmalloc.h>
12
13#include <asm/uaccess.h>
14#include <asm/pgalloc.h>
15#include <asm/pgtable.h>
16#include <asm/semaphore.h>
17
18static DECLARE_MUTEX(imlist_sem);
19struct vm_struct * imlist = NULL;
20
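/* First-fit scan of the address-ordered imlist for a free range of the
 * given size; returns 0 and stores the start address in *im_addr, or
 * non-zero if imalloc space is exhausted. */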
21static int get_free_im_addr(unsigned long size, unsigned long *im_addr)
22{
23 unsigned long addr;
24 struct vm_struct **p, *tmp;
25
26 addr = IMALLOC_START;
27 for (p = &imlist; (tmp = *p) ; p = &tmp->next) {
28 if (size + addr < (unsigned long) tmp->addr)
29 break;
30 if ((unsigned long)tmp->addr >= IMALLOC_START)
31 addr = tmp->size + (unsigned long) tmp->addr;
32 if (addr > IMALLOC_END-size)
33 return 1;
34 }
35 *im_addr = addr;
36
37 return 0;
38}
39
40/* Return whether the region described by v_addr and size is a subset
41 * of the region described by parent
42 */
43static inline int im_region_is_subset(unsigned long v_addr, unsigned long size,
44 struct vm_struct *parent)
45{
46 return (int) (v_addr >= (unsigned long) parent->addr &&
47 v_addr < (unsigned long) parent->addr + parent->size &&
48 size < parent->size);
49}
50
51/* Return whether the region described by v_addr and size is a superset
52 * of the region described by child
53 */
54static int im_region_is_superset(unsigned long v_addr, unsigned long size,
55 struct vm_struct *child)
56{
57 struct vm_struct parent;
58
59 parent.addr = (void *) v_addr;
60 parent.size = size;
61
62 return im_region_is_subset((unsigned long) child->addr, child->size,
63 &parent);
64}
65
66/* Return whether the region described by v_addr and size overlaps
67 * the region described by vm. Overlapping regions meet the
68 * following conditions:
69 * 1) The regions share some part of the address space
70 * 2) The regions aren't identical
71 * 3) Neither region is a subset of the other
72 */
73static int im_region_overlaps(unsigned long v_addr, unsigned long size,
74 struct vm_struct *vm)
75{
76 if (im_region_is_superset(v_addr, size, vm))
77 return 0;
78
79 return (v_addr + size > (unsigned long) vm->addr + vm->size &&
80 v_addr < (unsigned long) vm->addr + vm->size) ||
81 (v_addr < (unsigned long) vm->addr &&
82 v_addr + size > (unsigned long) vm->addr);
83}
84
85/* Determine imalloc status of region described by v_addr and size.
86 * Can return one of the following:
87 * IM_REGION_UNUSED - Entire region is unallocated in imalloc space.
88 * IM_REGION_SUBSET - Region is a subset of a region that is already
89 * allocated in imalloc space.
90 * vm will be assigned to a ptr to the parent region.
91 * IM_REGION_EXISTS - Exact region already allocated in imalloc space.
92 * vm will be assigned to a ptr to the existing imlist
93 * member.
 94 * IM_REGION_OVERLAP - Region overlaps an allocated region in imalloc space.
95 * IM_REGION_SUPERSET - Region is a superset of a region that is already
96 * allocated in imalloc space.
97 */
98static int im_region_status(unsigned long v_addr, unsigned long size,
99 struct vm_struct **vm)
100{
101 struct vm_struct *tmp;
102
103 for (tmp = imlist; tmp; tmp = tmp->next)
104 if (v_addr < (unsigned long) tmp->addr + tmp->size)
105 break;
106
107 if (tmp) {
108 if (im_region_overlaps(v_addr, size, tmp))
109 return IM_REGION_OVERLAP;
110
111 *vm = tmp;
112 if (im_region_is_subset(v_addr, size, tmp)) {
113 /* Return with tmp pointing to superset */
114 return IM_REGION_SUBSET;
115 }
116 if (im_region_is_superset(v_addr, size, tmp)) {
117 /* Return with tmp pointing to first subset */
118 return IM_REGION_SUPERSET;
119 }
120 else if (v_addr == (unsigned long) tmp->addr &&
121 size == tmp->size) {
122 /* Return with tmp pointing to exact region */
123 return IM_REGION_EXISTS;
124 }
125 }
126
127 *vm = NULL;
128 return IM_REGION_UNUSED;
129}
130
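/* Carve the child region [v_addr, v_addr + size) out of parent,
 * allocating one or two new vm_structs for the pieces that remain.
 * Returns the vm_struct describing the child, or NULL on allocation
 * failure. */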
131static struct vm_struct * split_im_region(unsigned long v_addr,
132 unsigned long size, struct vm_struct *parent)
133{
134 struct vm_struct *vm1 = NULL;
135 struct vm_struct *vm2 = NULL;
136 struct vm_struct *new_vm = NULL;
137
138 vm1 = (struct vm_struct *) kmalloc(sizeof(*vm1), GFP_KERNEL);
139 if (vm1 == NULL) {
140 printk(KERN_ERR "%s() out of memory\n", __FUNCTION__);
141 return NULL;
142 }
143
144 if (v_addr == (unsigned long) parent->addr) {
145 /* Use existing parent vm_struct to represent child, allocate
146 * new one for the remainder of parent range
147 */
148 vm1->size = parent->size - size;
149 vm1->addr = (void *) (v_addr + size);
150 vm1->next = parent->next;
151
152 parent->size = size;
153 parent->next = vm1;
154 new_vm = parent;
155 } else if (v_addr + size == (unsigned long) parent->addr +
156 parent->size) {
157 /* Allocate new vm_struct to represent child, use existing
158 * parent one for remainder of parent range
159 */
160 vm1->size = size;
161 vm1->addr = (void *) v_addr;
162 vm1->next = parent->next;
163 new_vm = vm1;
164
165 parent->size -= size;
166 parent->next = vm1;
167 } else {
168 /* Allocate two new vm_structs for the new child and
169 * uppermost remainder, and use existing parent one for the
170 * lower remainder of parent range
171 */
172 vm2 = (struct vm_struct *) kmalloc(sizeof(*vm2), GFP_KERNEL);
173 if (vm2 == NULL) {
174 printk(KERN_ERR "%s() out of memory\n", __FUNCTION__);
175 kfree(vm1);
176 return NULL;
177 }
178
179 vm1->size = size;
180 vm1->addr = (void *) v_addr;
181 vm1->next = vm2;
182 new_vm = vm1;
183
184 vm2->size = ((unsigned long) parent->addr + parent->size) -
185 (v_addr + size);
186 vm2->addr = (void *) v_addr + size;
187 vm2->next = parent->next;
188
189 parent->size = v_addr - (unsigned long) parent->addr;
190 parent->next = vm1;
191 }
192
193 return new_vm;
194}
195
196static struct vm_struct * __add_new_im_area(unsigned long req_addr,
197 unsigned long size)
198{
199 struct vm_struct **p, *tmp, *area;
200
201 for (p = &imlist; (tmp = *p) ; p = &tmp->next) {
202 if (req_addr + size <= (unsigned long)tmp->addr)
203 break;
204 }
205
206 area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL);
207 if (!area)
208 return NULL;
209 area->flags = 0;
210 area->addr = (void *)req_addr;
211 area->size = size;
212 area->next = *p;
213 *p = area;
214
215 return area;
216}
217
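/* Look up (creating or splitting regions as needed) an imalloc region
 * for req_addr/size, but only if its current status matches one of the
 * accepted criteria bits. Called with imlist_sem held. */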
218static struct vm_struct * __im_get_area(unsigned long req_addr,
219 unsigned long size,
220 int criteria)
221{
222 struct vm_struct *tmp;
223 int status;
224
225 status = im_region_status(req_addr, size, &tmp);
226 if ((criteria & status) == 0) {
227 return NULL;
228 }
229
230 switch (status) {
231 case IM_REGION_UNUSED:
232 tmp = __add_new_im_area(req_addr, size);
233 break;
234 case IM_REGION_SUBSET:
235 tmp = split_im_region(req_addr, size, tmp);
236 break;
237 case IM_REGION_EXISTS:
238 /* Return requested region */
239 break;
240 case IM_REGION_SUPERSET:
241 /* Return first existing subset of requested region */
242 break;
243 default:
244 printk(KERN_ERR "%s() unexpected imalloc region status\n",
245 __FUNCTION__);
246 tmp = NULL;
247 }
248
249 return tmp;
250}
251
252struct vm_struct * im_get_free_area(unsigned long size)
253{
254 struct vm_struct *area;
255 unsigned long addr;
256
257 down(&imlist_sem);
258 if (get_free_im_addr(size, &addr)) {
259 printk(KERN_ERR "%s() cannot obtain addr for size 0x%lx\n",
260 __FUNCTION__, size);
261 area = NULL;
262 goto next_im_done;
263 }
264
265 area = __im_get_area(addr, size, IM_REGION_UNUSED);
266 if (area == NULL) {
267 printk(KERN_ERR
268 "%s() cannot obtain area for addr 0x%lx size 0x%lx\n",
269 __FUNCTION__, addr, size);
270 }
271next_im_done:
272 up(&imlist_sem);
273 return area;
274}
275
276struct vm_struct * im_get_area(unsigned long v_addr, unsigned long size,
277 int criteria)
278{
279 struct vm_struct *area;
280
281 down(&imlist_sem);
282 area = __im_get_area(v_addr, size, criteria);
283 up(&imlist_sem);
284 return area;
285}
286
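/* Remove the imalloc region starting at addr from imlist and return its
 * size, or 0 if addr is unaligned or no such region exists. */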
287unsigned long im_free(void * addr)
288{
289 struct vm_struct **p, *tmp;
290 unsigned long ret_size = 0;
291
292 if (!addr)
293 return ret_size;
294 if ((PAGE_SIZE-1) & (unsigned long) addr) {
295 printk(KERN_ERR "Trying to %s bad address (%p)\n", __FUNCTION__, addr);
296 return ret_size;
297 }
298 down(&imlist_sem);
299 for (p = &imlist ; (tmp = *p) ; p = &tmp->next) {
300 if (tmp->addr == addr) {
301 ret_size = tmp->size;
302 *p = tmp->next;
303 kfree(tmp);
304 up(&imlist_sem);
305 return ret_size;
306 }
307 }
308 up(&imlist_sem);
309 printk(KERN_ERR "Trying to %s nonexistent area (%p)\n", __FUNCTION__,
310 addr);
311 return ret_size;
312}
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
new file mode 100644
index 000000000000..23813d03e1c4
--- /dev/null
+++ b/arch/ppc64/mm/init.c
@@ -0,0 +1,927 @@
1/*
2 * PowerPC version
3 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
4 *
5 * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
6 * and Cort Dougan (PReP) (cort@cs.nmt.edu)
7 * Copyright (C) 1996 Paul Mackerras
8 * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
9 *
10 * Derived from "arch/i386/mm/init.c"
11 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
12 *
13 * Dave Engebretsen <engebret@us.ibm.com>
14 * Rework for PPC64 port.
15 *
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version
19 * 2 of the License, or (at your option) any later version.
20 *
21 */
22
23#include <linux/config.h>
24#include <linux/signal.h>
25#include <linux/sched.h>
26#include <linux/kernel.h>
27#include <linux/errno.h>
28#include <linux/string.h>
29#include <linux/types.h>
30#include <linux/mman.h>
31#include <linux/mm.h>
32#include <linux/swap.h>
33#include <linux/stddef.h>
34#include <linux/vmalloc.h>
35#include <linux/init.h>
36#include <linux/delay.h>
37#include <linux/bootmem.h>
38#include <linux/highmem.h>
39#include <linux/idr.h>
40#include <linux/nodemask.h>
41#include <linux/module.h>
42
43#include <asm/pgalloc.h>
44#include <asm/page.h>
45#include <asm/abs_addr.h>
46#include <asm/prom.h>
47#include <asm/lmb.h>
48#include <asm/rtas.h>
49#include <asm/io.h>
50#include <asm/mmu_context.h>
51#include <asm/pgtable.h>
52#include <asm/mmu.h>
53#include <asm/uaccess.h>
54#include <asm/smp.h>
55#include <asm/machdep.h>
56#include <asm/tlb.h>
57#include <asm/eeh.h>
58#include <asm/processor.h>
59#include <asm/mmzone.h>
60#include <asm/cputable.h>
61#include <asm/ppcdebug.h>
62#include <asm/sections.h>
63#include <asm/system.h>
64#include <asm/iommu.h>
65#include <asm/abs_addr.h>
66#include <asm/vdso.h>
67
68int mem_init_done;
69unsigned long ioremap_bot = IMALLOC_BASE;
70static unsigned long phbs_io_bot = PHBS_IO_BASE;
71
72extern pgd_t swapper_pg_dir[];
73extern struct task_struct *current_set[NR_CPUS];
74
75extern pgd_t ioremap_dir[];
76pgd_t * ioremap_pgd = (pgd_t *)&ioremap_dir;
77
78unsigned long klimit = (unsigned long)_end;
79
80unsigned long _SDR1=0;
81unsigned long _ASR=0;
82
83/* max amount of RAM to use */
84unsigned long __max_memory;
85
86/* info on what we think the IO hole is */
87unsigned long io_hole_start;
88unsigned long io_hole_size;
89
90void show_mem(void)
91{
92 unsigned long total = 0, reserved = 0;
93 unsigned long shared = 0, cached = 0;
94 struct page *page;
95 pg_data_t *pgdat;
96 unsigned long i;
97
98 printk("Mem-info:\n");
99 show_free_areas();
100 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
101 for_each_pgdat(pgdat) {
102 for (i = 0; i < pgdat->node_spanned_pages; i++) {
103 page = pgdat->node_mem_map + i;
104 total++;
105 if (PageReserved(page))
106 reserved++;
107 else if (PageSwapCache(page))
108 cached++;
109 else if (page_count(page))
110 shared += page_count(page) - 1;
111 }
112 }
113 printk("%ld pages of RAM\n", total);
114 printk("%ld reserved pages\n", reserved);
115 printk("%ld pages shared\n", shared);
116 printk("%ld pages swap cached\n", cached);
117}
118
119#ifdef CONFIG_PPC_ISERIES
120
121void __iomem *ioremap(unsigned long addr, unsigned long size)
122{
123 return (void __iomem *)addr;
124}
125
126extern void __iomem *__ioremap(unsigned long addr, unsigned long size,
127 unsigned long flags)
128{
129 return (void __iomem *)addr;
130}
131
132void iounmap(volatile void __iomem *addr)
133{
134 return;
135}
136
137#else
138
139/*
140 * map_io_page currently only called by __ioremap
141 * map_io_page adds an entry to the ioremap page table
142 * and adds an entry to the HPT, possibly bolting it
143 */
144static void map_io_page(unsigned long ea, unsigned long pa, int flags)
145{
146 pgd_t *pgdp;
147 pmd_t *pmdp;
148 pte_t *ptep;
149 unsigned long vsid;
150
151 if (mem_init_done) {
152 spin_lock(&ioremap_mm.page_table_lock);
153 pgdp = pgd_offset_i(ea);
154 pmdp = pmd_alloc(&ioremap_mm, pgdp, ea);
155 ptep = pte_alloc_kernel(&ioremap_mm, pmdp, ea);
156
157 pa = abs_to_phys(pa);
158 set_pte_at(&ioremap_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags)));
159 spin_unlock(&ioremap_mm.page_table_lock);
160 } else {
161 unsigned long va, vpn, hash, hpteg;
162
163 /*
164 * If the mm subsystem is not fully up, we cannot create a
165 * linux page table entry for this mapping. Simply bolt an
166 * entry in the hardware page table.
167 */
168 vsid = get_kernel_vsid(ea);
169 va = (vsid << 28) | (ea & 0xFFFFFFF);
170 vpn = va >> PAGE_SHIFT;
171
172 hash = hpt_hash(vpn, 0);
173
174 hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
175
176 /* Panic if a pte group is full */
177 if (ppc_md.hpte_insert(hpteg, va, pa >> PAGE_SHIFT, 0,
178 _PAGE_NO_CACHE|_PAGE_GUARDED|PP_RWXX,
179 1, 0) == -1) {
180 panic("map_io_page: could not insert mapping");
181 }
182 }
183}
184
185
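/* Map size bytes at physical address pa to the effective address range
 * starting at ea, one page at a time, and return the mapped address with
 * the sub-page offset of the original request applied. */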
186static void __iomem * __ioremap_com(unsigned long addr, unsigned long pa,
187 unsigned long ea, unsigned long size,
188 unsigned long flags)
189{
190 unsigned long i;
191
192 if ((flags & _PAGE_PRESENT) == 0)
193 flags |= pgprot_val(PAGE_KERNEL);
194 if (flags & (_PAGE_NO_CACHE | _PAGE_WRITETHRU))
195 flags |= _PAGE_GUARDED;
196
197 for (i = 0; i < size; i += PAGE_SIZE) {
198 map_io_page(ea+i, pa+i, flags);
199 }
200
201 return (void __iomem *) (ea + (addr & ~PAGE_MASK));
202}
203
204
205void __iomem *
206ioremap(unsigned long addr, unsigned long size)
207{
208 return __ioremap(addr, size, _PAGE_NO_CACHE);
209}
210
211void __iomem *
212__ioremap(unsigned long addr, unsigned long size, unsigned long flags)
213{
214 unsigned long pa, ea;
215
216 /*
217 * Choose an address to map it to.
218 * Once the imalloc system is running, we use it.
219 * Before that, we map using addresses going
220 * up from ioremap_bot. imalloc will use
221 * the addresses from ioremap_bot through
222 * IMALLOC_END (0xE000001fffffffff)
223 *
224 */
225 pa = addr & PAGE_MASK;
226 size = PAGE_ALIGN(addr + size) - pa;
227
228 if (size == 0)
229 return NULL;
230
231 if (mem_init_done) {
232 struct vm_struct *area;
233 area = im_get_free_area(size);
234 if (area == NULL)
235 return NULL;
236 ea = (unsigned long)(area->addr);
237 } else {
238 ea = ioremap_bot;
239 ioremap_bot += size;
240 }
241
242 return __ioremap_com(addr, pa, ea, size, flags);
243}
244
245#define IS_PAGE_ALIGNED(_val) ((_val) == ((_val) & PAGE_MASK))
246
247int __ioremap_explicit(unsigned long pa, unsigned long ea,
248 unsigned long size, unsigned long flags)
249{
250 struct vm_struct *area;
251
252 /* For now, require page-aligned values for pa, ea, and size */
253 if (!IS_PAGE_ALIGNED(pa) || !IS_PAGE_ALIGNED(ea) ||
254 !IS_PAGE_ALIGNED(size)) {
255 printk(KERN_ERR "unaligned value in %s\n", __FUNCTION__);
256 return 1;
257 }
258
259 if (!mem_init_done) {
260 /* Two things to consider in this case:
261 * 1) No records will be kept (imalloc, etc) that the region
262 * has been remapped
263 * 2) It won't be easy to iounmap() the region later (because
264 * of 1)
265 */
266 ;
267 } else {
268 area = im_get_area(ea, size,
269 IM_REGION_UNUSED|IM_REGION_SUBSET|IM_REGION_EXISTS);
270 if (area == NULL) {
271 /* Expected when PHB-dlpar is in play */
272 return 1;
273 }
274 if (ea != (unsigned long) area->addr) {
275 printk(KERN_ERR "unexpected addr return from im_get_area\n");
276 return 1;
277 }
278 }
279
280 if (__ioremap_com(pa, pa, ea, size, flags) != (void *) ea) {
281 printk(KERN_ERR "__ioremap_com() returned unexpected addr\n");
282 return 1;
283 }
284
285 return 0;
286}
287
288static void unmap_im_area_pte(pmd_t *pmd, unsigned long address,
289 unsigned long size)
290{
291 unsigned long base, end;
292 pte_t *pte;
293
294 if (pmd_none(*pmd))
295 return;
296 if (pmd_bad(*pmd)) {
297 pmd_ERROR(*pmd);
298 pmd_clear(pmd);
299 return;
300 }
301
302 pte = pte_offset_kernel(pmd, address);
303 base = address & PMD_MASK;
304 address &= ~PMD_MASK;
305 end = address + size;
306 if (end > PMD_SIZE)
307 end = PMD_SIZE;
308
309 do {
310 pte_t page;
311 page = ptep_get_and_clear(&ioremap_mm, base + address, pte);
312 address += PAGE_SIZE;
313 pte++;
314 if (pte_none(page))
315 continue;
316 if (pte_present(page))
317 continue;
318 printk(KERN_CRIT "Whee.. Swapped out page in kernel page table\n");
319 } while (address < end);
320}
321
322static void unmap_im_area_pmd(pgd_t *dir, unsigned long address,
323 unsigned long size)
324{
325 unsigned long base, end;
326 pmd_t *pmd;
327
328 if (pgd_none(*dir))
329 return;
330 if (pgd_bad(*dir)) {
331 pgd_ERROR(*dir);
332 pgd_clear(dir);
333 return;
334 }
335
336 pmd = pmd_offset(dir, address);
337 base = address & PGDIR_MASK;
338 address &= ~PGDIR_MASK;
339 end = address + size;
340 if (end > PGDIR_SIZE)
341 end = PGDIR_SIZE;
342
343 do {
344 unmap_im_area_pte(pmd, base + address, end - address);
345 address = (address + PMD_SIZE) & PMD_MASK;
346 pmd++;
347 } while (address < end);
348}
349
350/*
351 * Unmap an IO region and remove it from imalloc'd list.
352 * Access to IO memory should be serialized by driver.
353 * This code is modeled after vmalloc code - unmap_vm_area()
354 *
355 * XXX what about calls before mem_init_done (ie python_countermeasures())
356 */
357void iounmap(volatile void __iomem *token)
358{
359 unsigned long address, start, end, size;
360 struct mm_struct *mm;
361 pgd_t *dir;
362 void *addr;
363
364 if (!mem_init_done) {
365 return;
366 }
367
368 addr = (void *) ((unsigned long __force) token & PAGE_MASK);
369
370 if ((size = im_free(addr)) == 0) {
371 return;
372 }
373
374 address = (unsigned long)addr;
375 start = address;
376 end = address + size;
377
378 mm = &ioremap_mm;
379 spin_lock(&mm->page_table_lock);
380
381 dir = pgd_offset_i(address);
382 flush_cache_vunmap(address, end);
383 do {
384 unmap_im_area_pmd(dir, address, end - address);
385 address = (address + PGDIR_SIZE) & PGDIR_MASK;
386 dir++;
387 } while (address && (address < end));
388 flush_tlb_kernel_range(start, end);
389
390 spin_unlock(&mm->page_table_lock);
391 return;
392}
393
394static int iounmap_subset_regions(unsigned long addr, unsigned long size)
395{
396 struct vm_struct *area;
397
398 /* Check whether subsets of this region exist */
399 area = im_get_area(addr, size, IM_REGION_SUPERSET);
400 if (area == NULL)
401 return 1;
402
403 while (area) {
404 iounmap((void __iomem *) area->addr);
405 area = im_get_area(addr, size,
406 IM_REGION_SUPERSET);
407 }
408
409 return 0;
410}
411
412int iounmap_explicit(volatile void __iomem *start, unsigned long size)
413{
414 struct vm_struct *area;
415 unsigned long addr;
416 int rc;
417
418 addr = (unsigned long __force) start & PAGE_MASK;
419
420 /* Verify that the region either exists or is a subset of an existing
421 * region. In the latter case, split the parent region to create
422 * the exact region
423 */
424 area = im_get_area(addr, size,
425 IM_REGION_EXISTS | IM_REGION_SUBSET);
426 if (area == NULL) {
427 /* Determine whether subset regions exist. If so, unmap */
428 rc = iounmap_subset_regions(addr, size);
429 if (rc) {
430 printk(KERN_ERR
431 "%s() cannot unmap nonexistent range 0x%lx\n",
432 __FUNCTION__, addr);
433 return 1;
434 }
435 } else {
436 iounmap((void __iomem *) area->addr);
437 }
438 /*
439 * FIXME! This can't be right:
440 iounmap(area->addr);
441 * Maybe it should be "iounmap(area);"
442 */
443 return 0;
444}
445
446#endif
447
448EXPORT_SYMBOL(ioremap);
449EXPORT_SYMBOL(__ioremap);
450EXPORT_SYMBOL(iounmap);
451
452void free_initmem(void)
453{
454 unsigned long addr;
455
456 addr = (unsigned long)__init_begin;
457 for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
458 ClearPageReserved(virt_to_page(addr));
459 set_page_count(virt_to_page(addr), 1);
460 free_page(addr);
461 totalram_pages++;
462 }
463 printk ("Freeing unused kernel memory: %luk freed\n",
464 ((unsigned long)__init_end - (unsigned long)__init_begin) >> 10);
465}
466
467#ifdef CONFIG_BLK_DEV_INITRD
468void free_initrd_mem(unsigned long start, unsigned long end)
469{
470 if (start < end)
471 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
472 for (; start < end; start += PAGE_SIZE) {
473 ClearPageReserved(virt_to_page(start));
474 set_page_count(virt_to_page(start), 1);
475 free_page(start);
476 totalram_pages++;
477 }
478}
479#endif
480
481static DEFINE_SPINLOCK(mmu_context_lock);
482static DEFINE_IDR(mmu_context_idr);
483
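/* Allocate an MMU context id for a new mm from the IDR; ids above
 * MAX_CONTEXT are released again and -ENOMEM returned. */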
484int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
485{
486 int index;
487 int err;
488
489#ifdef CONFIG_HUGETLB_PAGE
490 /* We leave htlb_segs as it was, but for a fork, we need to
491 * clear the huge_pgdir. */
492 mm->context.huge_pgdir = NULL;
493#endif
494
495again:
496 if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
497 return -ENOMEM;
498
499 spin_lock(&mmu_context_lock);
500 err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index);
501 spin_unlock(&mmu_context_lock);
502
503 if (err == -EAGAIN)
504 goto again;
505 else if (err)
506 return err;
507
508 if (index > MAX_CONTEXT) {
509 idr_remove(&mmu_context_idr, index);
510 return -ENOMEM;
511 }
512
513 mm->context.id = index;
514
515 return 0;
516}
517
518void destroy_context(struct mm_struct *mm)
519{
520 spin_lock(&mmu_context_lock);
521 idr_remove(&mmu_context_idr, mm->context.id);
522 spin_unlock(&mmu_context_lock);
523
524 mm->context.id = NO_CONTEXT;
525
526 hugetlb_mm_free_pgd(mm);
527}
528
529/*
530 * Do very early mm setup.
531 */
532void __init mm_init_ppc64(void)
533{
534#ifndef CONFIG_PPC_ISERIES
535 unsigned long i;
536#endif
537
538 ppc64_boot_msg(0x100, "MM Init");
539
540 /* This is the story of the IO hole... please, keep seated,
541 * unfortunately, we are out of oxygen masks at the moment.
542 * So we need some rough way to tell where your big IO hole
543 * is. On pmac, it's between 2G and 4G, on POWER3, it's around
544 * that area as well, on POWER4 we don't have one, etc...
545 * We need that as a "hint" when sizing the TCE table on POWER3.
546 * So far, the simplest way that seems to work well enough for us is
547 * to just assume that the first discontinuity in our physical
548 * RAM layout is the IO hole. That may not be correct in the future
549 * (and isn't on iSeries but then we don't care ;)
550 */
551
552#ifndef CONFIG_PPC_ISERIES
553 for (i = 1; i < lmb.memory.cnt; i++) {
554 unsigned long base, prevbase, prevsize;
555
556 prevbase = lmb.memory.region[i-1].physbase;
557 prevsize = lmb.memory.region[i-1].size;
558 base = lmb.memory.region[i].physbase;
559 if (base > (prevbase + prevsize)) {
560 io_hole_start = prevbase + prevsize;
561 io_hole_size = base - (prevbase + prevsize);
562 break;
563 }
564 }
565#endif /* CONFIG_PPC_ISERIES */
566 if (io_hole_start)
567 printk("IO Hole assumed to be %lx -> %lx\n",
568 io_hole_start, io_hole_start + io_hole_size - 1);
569
570 ppc64_boot_msg(0x100, "MM Init Done");
571}
572
573/*
574 * This is called by /dev/mem to know if a given address has to
575 * be mapped non-cacheable or not
576 */
577int page_is_ram(unsigned long pfn)
578{
579 int i;
580 unsigned long paddr = (pfn << PAGE_SHIFT);
581
582 for (i=0; i < lmb.memory.cnt; i++) {
583 unsigned long base;
584
585#ifdef CONFIG_MSCHUNKS
586 base = lmb.memory.region[i].physbase;
587#else
588 base = lmb.memory.region[i].base;
589#endif
590 if ((paddr >= base) &&
591 (paddr < (base + lmb.memory.region[i].size))) {
592 return 1;
593 }
594 }
595
596 return 0;
597}
598EXPORT_SYMBOL(page_is_ram);
599
600/*
601 * Initialize the bootmem system and give it all the memory we
602 * have available.
603 */
604#ifndef CONFIG_DISCONTIGMEM
605void __init do_init_bootmem(void)
606{
607 unsigned long i;
608 unsigned long start, bootmap_pages;
609 unsigned long total_pages = lmb_end_of_DRAM() >> PAGE_SHIFT;
610 int boot_mapsize;
611
612 /*
613 * Find an area to use for the bootmem bitmap. Calculate the size of
614 * bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE.
615 * Add 1 additional page in case the address isn't page-aligned.
616 */
617 bootmap_pages = bootmem_bootmap_pages(total_pages);
618
619 start = abs_to_phys(lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE));
620 BUG_ON(!start);
621
622 boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages);
623
624 max_pfn = max_low_pfn;
625
626 /* add all physical memory to the bootmem map */
627 for (i=0; i < lmb.memory.cnt; i++) {
628 unsigned long physbase, size;
629
630 physbase = lmb.memory.region[i].physbase;
631 size = lmb.memory.region[i].size;
632 free_bootmem(physbase, size);
633 }
634
635 /* reserve the sections we're already using */
636 for (i=0; i < lmb.reserved.cnt; i++) {
637 unsigned long physbase = lmb.reserved.region[i].physbase;
638 unsigned long size = lmb.reserved.region[i].size;
639
640 reserve_bootmem(physbase, size);
641 }
642}
643
644/*
645 * paging_init() sets up the page tables - in fact we've already done this.
646 */
647void __init paging_init(void)
648{
649 unsigned long zones_size[MAX_NR_ZONES];
650 unsigned long zholes_size[MAX_NR_ZONES];
651 unsigned long total_ram = lmb_phys_mem_size();
652 unsigned long top_of_ram = lmb_end_of_DRAM();
653
654 printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
655 top_of_ram, total_ram);
656 printk(KERN_INFO "Memory hole size: %ldMB\n",
657 (top_of_ram - total_ram) >> 20);
658 /*
659 * All pages are DMA-able so we put them all in the DMA zone.
660 */
661 memset(zones_size, 0, sizeof(zones_size));
662 memset(zholes_size, 0, sizeof(zholes_size));
663
664 zones_size[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
665 zholes_size[ZONE_DMA] = (top_of_ram - total_ram) >> PAGE_SHIFT;
666
667 free_area_init_node(0, &contig_page_data, zones_size,
668 __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size);
669}
670#endif /* CONFIG_DISCONTIGMEM */
671
672static struct kcore_list kcore_vmem;
673
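/* Register each lmb memory region, plus the vmalloc area, on the
 * /proc/kcore list. */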
674static int __init setup_kcore(void)
675{
676 int i;
677
678 for (i=0; i < lmb.memory.cnt; i++) {
679 unsigned long physbase, size;
680 struct kcore_list *kcore_mem;
681
682 physbase = lmb.memory.region[i].physbase;
683 size = lmb.memory.region[i].size;
684
685 /* GFP_ATOMIC to avoid might_sleep warnings during boot */
686 kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC);
687 if (!kcore_mem)
688 panic("mem_init: kmalloc failed\n");
689
690 kclist_add(kcore_mem, __va(physbase), size);
691 }
692
693 kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
694
695 return 0;
696}
697module_init(setup_kcore);
698
699void __init mem_init(void)
700{
701#ifdef CONFIG_DISCONTIGMEM
702 int nid;
703#endif
704 pg_data_t *pgdat;
705 unsigned long i;
706 struct page *page;
707 unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
708
709 num_physpages = max_low_pfn; /* RAM is assumed contiguous */
710 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
711
712#ifdef CONFIG_DISCONTIGMEM
713 for_each_online_node(nid) {
714 if (NODE_DATA(nid)->node_spanned_pages != 0) {
715 printk("freeing bootmem node %x\n", nid);
716 totalram_pages +=
717 free_all_bootmem_node(NODE_DATA(nid));
718 }
719 }
720#else
721 max_mapnr = num_physpages;
722 totalram_pages += free_all_bootmem();
723#endif
724
725 for_each_pgdat(pgdat) {
726 for (i = 0; i < pgdat->node_spanned_pages; i++) {
727 page = pgdat->node_mem_map + i;
728 if (PageReserved(page))
729 reservedpages++;
730 }
731 }
732
733 codesize = (unsigned long)&_etext - (unsigned long)&_stext;
734 initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin;
735 datasize = (unsigned long)&_edata - (unsigned long)&__init_end;
736 bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start;
737
738 printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, "
739 "%luk reserved, %luk data, %luk bss, %luk init)\n",
740 (unsigned long)nr_free_pages() << (PAGE_SHIFT-10),
741 num_physpages << (PAGE_SHIFT-10),
742 codesize >> 10,
743 reservedpages << (PAGE_SHIFT-10),
744 datasize >> 10,
745 bsssize >> 10,
746 initsize >> 10);
747
748 mem_init_done = 1;
749
750#ifdef CONFIG_PPC_ISERIES
751 iommu_vio_init();
752#endif
753 /* Initialize the vDSO */
754 vdso_init();
755}
756
757/*
758 * This is called when a page has been modified by the kernel.
759 * It just marks the page as not i-cache clean. We do the i-cache
760 * flush later when the page is given to a user process, if necessary.
761 */
762void flush_dcache_page(struct page *page)
763{
764 if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
765 return;
766 /* avoid an atomic op if possible */
767 if (test_bit(PG_arch_1, &page->flags))
768 clear_bit(PG_arch_1, &page->flags);
769}
770EXPORT_SYMBOL(flush_dcache_page);
771
772void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
773{
774 clear_page(page);
775
776 if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
777 return;
778 /*
779 * We shouldn't have to do this, but some versions of glibc
780 * require it (ld.so assumes zero filled pages are icache clean)
781 * - Anton
782 */
783
784 /* avoid an atomic op if possible */
785 if (test_bit(PG_arch_1, &pg->flags))
786 clear_bit(PG_arch_1, &pg->flags);
787}
788EXPORT_SYMBOL(clear_user_page);
789
790void copy_user_page(void *vto, void *vfrom, unsigned long vaddr,
791 struct page *pg)
792{
793 copy_page(vto, vfrom);
794
795 /*
796 * We should be able to use the following optimisation, however
797 * there are two problems.
798 * Firstly a bug in some versions of binutils meant PLT sections
799 * were not marked executable.
800 * Secondly the first word in the GOT section is blrl, used
801 * to establish the GOT address. Until recently the GOT was
802 * not marked executable.
803 * - Anton
804 */
805#if 0
806 if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0))
807 return;
808#endif
809
810 if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
811 return;
812
813 /* avoid an atomic op if possible */
814 if (test_bit(PG_arch_1, &pg->flags))
815 clear_bit(PG_arch_1, &pg->flags);
816}
817
818void flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
819 unsigned long addr, int len)
820{
821 unsigned long maddr;
822
823 maddr = (unsigned long)page_address(page) + (addr & ~PAGE_MASK);
824 flush_icache_range(maddr, maddr + len);
825}
826EXPORT_SYMBOL(flush_icache_user_range);
827
828/*
829 * This is called at the end of handling a user page fault, when the
830 * fault has been handled by updating a PTE in the linux page tables.
831 * We use it to preload an HPTE into the hash table corresponding to
832 * the updated linux PTE.
833 *
834 * This must always be called with the mm->page_table_lock held
835 */
836void update_mmu_cache(struct vm_area_struct *vma, unsigned long ea,
837 pte_t pte)
838{
839 unsigned long vsid;
840 void *pgdir;
841 pte_t *ptep;
842 int local = 0;
843 cpumask_t tmp;
844 unsigned long flags;
845
846 /* handle i-cache coherency */
847 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) &&
848 !cpu_has_feature(CPU_FTR_NOEXECUTE)) {
849 unsigned long pfn = pte_pfn(pte);
850 if (pfn_valid(pfn)) {
851 struct page *page = pfn_to_page(pfn);
852 if (!PageReserved(page)
853 && !test_bit(PG_arch_1, &page->flags)) {
854 __flush_dcache_icache(page_address(page));
855 set_bit(PG_arch_1, &page->flags);
856 }
857 }
858 }
859
860 /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
861 if (!pte_young(pte))
862 return;
863
864 pgdir = vma->vm_mm->pgd;
865 if (pgdir == NULL)
866 return;
867
868 ptep = find_linux_pte(pgdir, ea);
869 if (!ptep)
870 return;
871
872 vsid = get_vsid(vma->vm_mm->context.id, ea);
873
874 local_irq_save(flags);
875 tmp = cpumask_of_cpu(smp_processor_id());
876 if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp))
877 local = 1;
878
879 __hash_page(ea, pte_val(pte) & (_PAGE_USER|_PAGE_RW), vsid, ptep,
880 0x300, local);
881 local_irq_restore(flags);
882}
883
884void __iomem * reserve_phb_iospace(unsigned long size)
885{
886 void __iomem *virt_addr;
887
888 if (phbs_io_bot >= IMALLOC_BASE)
889 panic("reserve_phb_iospace(): phb io space overflow\n");
890
891 virt_addr = (void __iomem *) phbs_io_bot;
892 phbs_io_bot += size;
893
894 return virt_addr;
895}
896
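/* Slab of page-sized objects used for page table pages; zero_ctor
 * zero-fills each object, and objects must be zeroed again before they
 * are freed back to the cache. */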
897kmem_cache_t *zero_cache;
898
899static void zero_ctor(void *pte, kmem_cache_t *cache, unsigned long flags)
900{
901 memset(pte, 0, PAGE_SIZE);
902}
903
904void pgtable_cache_init(void)
905{
906 zero_cache = kmem_cache_create("zero",
907 PAGE_SIZE,
908 0,
909 SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
910 zero_ctor,
911 NULL);
912 if (!zero_cache)
913 panic("pgtable_cache_init(): could not create zero_cache!\n");
914}
915
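/* Choose the page protection for a physical memory mapping: defer to
 * the platform hook if there is one, otherwise make non-RAM addresses
 * guarded and non-cacheable. */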
916pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
917 unsigned long size, pgprot_t vma_prot)
918{
919 if (ppc_md.phys_mem_access_prot)
920 return ppc_md.phys_mem_access_prot(file, addr, size, vma_prot);
921
922 if (!page_is_ram(addr >> PAGE_SHIFT))
923 vma_prot = __pgprot(pgprot_val(vma_prot)
924 | _PAGE_GUARDED | _PAGE_NO_CACHE);
925 return vma_prot;
926}
927EXPORT_SYMBOL(phys_mem_access_prot);
diff --git a/arch/ppc64/mm/mmap.c b/arch/ppc64/mm/mmap.c
new file mode 100644
index 000000000000..fe65f522aff3
--- /dev/null
+++ b/arch/ppc64/mm/mmap.c
@@ -0,0 +1,86 @@
1/*
2 * linux/arch/ppc64/mm/mmap.c
3 *
4 * flexible mmap layout support
5 *
6 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
7 * All Rights Reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 *
23 *
24 * Started by Ingo Molnar <mingo@elte.hu>
25 */
26
27#include <linux/personality.h>
28#include <linux/mm.h>
29
30/*
31 * Top of mmap area (just below the process stack).
32 *
 33 * Leave at least a ~128 MB hole.
34 */
35#define MIN_GAP (128*1024*1024)
36#define MAX_GAP (TASK_SIZE/6*5)
37
38static inline unsigned long mmap_base(void)
39{
40 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
41
42 if (gap < MIN_GAP)
43 gap = MIN_GAP;
44 else if (gap > MAX_GAP)
45 gap = MAX_GAP;
46
47 return TASK_SIZE - (gap & PAGE_MASK);
48}
49
50static inline int mmap_is_legacy(void)
51{
52 /*
53 * Force standard allocation for 64 bit programs.
54 */
55 if (!test_thread_flag(TIF_32BIT))
56 return 1;
57
58 if (current->personality & ADDR_COMPAT_LAYOUT)
59 return 1;
60
61 if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
62 return 1;
63
64 return sysctl_legacy_va_layout;
65}
66
67/*
68 * This function, called very early during the creation of a new
69 * process VM image, sets up which VM layout function to use:
70 */
71void arch_pick_mmap_layout(struct mm_struct *mm)
72{
73 /*
74 * Fall back to the standard layout if the personality
75 * bit is set, or if the expected stack growth is unlimited:
76 */
77 if (mmap_is_legacy()) {
78 mm->mmap_base = TASK_UNMAPPED_BASE;
79 mm->get_unmapped_area = arch_get_unmapped_area;
80 mm->unmap_area = arch_unmap_area;
81 } else {
82 mm->mmap_base = mmap_base();
83 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
84 mm->unmap_area = arch_unmap_area_topdown;
85 }
86}
diff --git a/arch/ppc64/mm/numa.c b/arch/ppc64/mm/numa.c
new file mode 100644
index 000000000000..ea862ec643d3
--- /dev/null
+++ b/arch/ppc64/mm/numa.c
@@ -0,0 +1,734 @@
1/*
2 * pSeries NUMA support
3 *
4 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/threads.h>
12#include <linux/bootmem.h>
13#include <linux/init.h>
14#include <linux/mm.h>
15#include <linux/mmzone.h>
16#include <linux/module.h>
17#include <linux/nodemask.h>
18#include <linux/cpu.h>
19#include <linux/notifier.h>
20#include <asm/lmb.h>
21#include <asm/machdep.h>
22#include <asm/abs_addr.h>
23
24static int numa_enabled = 1;
25
26static int numa_debug;
27#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
28
29#ifdef DEBUG_NUMA
30#define ARRAY_INITIALISER -1
31#else
32#define ARRAY_INITIALISER 0
33#endif
34
35int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] =
36 ARRAY_INITIALISER};
37char *numa_memory_lookup_table;
38cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
39int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0};
40
41struct pglist_data *node_data[MAX_NUMNODES];
42bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
43static int min_common_depth;
44
45/*
46 * We need somewhere to store start/span for each node until we have
47 * allocated the real node_data structures.
48 */
49static struct {
50 unsigned long node_start_pfn;
51 unsigned long node_end_pfn;
52 unsigned long node_present_pages;
53} init_node_data[MAX_NUMNODES] __initdata;
54
55EXPORT_SYMBOL(node_data);
56EXPORT_SYMBOL(numa_cpu_lookup_table);
57EXPORT_SYMBOL(numa_memory_lookup_table);
58EXPORT_SYMBOL(numa_cpumask_lookup_table);
59EXPORT_SYMBOL(nr_cpus_in_node);
60
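/* Record that cpu belongs to node, updating the node's cpumask and cpu
 * count if it was not already present. */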
61static inline void map_cpu_to_node(int cpu, int node)
62{
63 numa_cpu_lookup_table[cpu] = node;
64 if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) {
65 cpu_set(cpu, numa_cpumask_lookup_table[node]);
66 nr_cpus_in_node[node]++;
67 }
68}
69
70#ifdef CONFIG_HOTPLUG_CPU
71static void unmap_cpu_from_node(unsigned long cpu)
72{
73 int node = numa_cpu_lookup_table[cpu];
74
75 dbg("removing cpu %lu from node %d\n", cpu, node);
76
77 if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
78 cpu_clear(cpu, numa_cpumask_lookup_table[node]);
79 nr_cpus_in_node[node]--;
80 } else {
81 printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
82 cpu, node);
83 }
84}
85#endif /* CONFIG_HOTPLUG_CPU */
86
87static struct device_node * __devinit find_cpu_node(unsigned int cpu)
88{
89 unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
90 struct device_node *cpu_node = NULL;
91 unsigned int *interrupt_server, *reg;
92 int len;
93
94 while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
95 /* Try interrupt server first */
96 interrupt_server = (unsigned int *)get_property(cpu_node,
97 "ibm,ppc-interrupt-server#s", &len);
98
99 len = len / sizeof(u32);
100
101 if (interrupt_server && (len > 0)) {
102 while (len--) {
103 if (interrupt_server[len] == hw_cpuid)
104 return cpu_node;
105 }
106 } else {
107 reg = (unsigned int *)get_property(cpu_node,
108 "reg", &len);
109 if (reg && (len > 0) && (reg[0] == hw_cpuid))
110 return cpu_node;
111 }
112 }
113
114 return NULL;
115}
116
117/* must hold reference to node during call */
118static int *of_get_associativity(struct device_node *dev)
119{
120 return (unsigned int *)get_property(dev, "ibm,associativity", NULL);
121}
122
123static int of_node_numa_domain(struct device_node *device)
124{
125 int numa_domain;
126 unsigned int *tmp;
127
128 if (min_common_depth == -1)
129 return 0;
130
131 tmp = of_get_associativity(device);
132 if (tmp && (tmp[0] >= min_common_depth)) {
133 numa_domain = tmp[min_common_depth];
134 } else {
135 dbg("WARNING: no NUMA information for %s\n",
136 device->full_name);
137 numa_domain = 0;
138 }
139 return numa_domain;
140}
141
142/*
143 * In theory, the "ibm,associativity" property may contain multiple
144 * associativity lists because a resource may be multiply connected
145 * into the machine. This resource then has different associativity
146 * characteristics relative to its multiple connections. We ignore
147 * this for now. We also assume that all cpu and memory sets have
148 * their distances represented at a common level. This won't be
149 * true for hierarchical NUMA.
150 *
151 * In any case the ibm,associativity-reference-points should give
152 * the correct depth for a normal NUMA system.
153 *
154 * - Dave Hansen <haveblue@us.ibm.com>
155 */
156static int __init find_min_common_depth(void)
157{
158 int depth;
159 unsigned int *ref_points;
160 struct device_node *rtas_root;
161 unsigned int len;
162
163 rtas_root = of_find_node_by_path("/rtas");
164
165 if (!rtas_root)
166 return -1;
167
168 /*
169 * this property is 2 32-bit integers, each representing a level of
170 * depth in the associativity nodes. The first is for an SMP
171 * configuration (should be all 0's) and the second is for a normal
172 * NUMA configuration.
173 */
174 ref_points = (unsigned int *)get_property(rtas_root,
175 "ibm,associativity-reference-points", &len);
176
177 if ((len >= 1) && ref_points) {
178 depth = ref_points[1];
179 } else {
180 dbg("WARNING: could not find NUMA "
181 "associativity reference point\n");
182 depth = -1;
183 }
184 of_node_put(rtas_root);
185
186 return depth;
187}
188
189static int __init get_mem_addr_cells(void)
190{
191 struct device_node *memory = NULL;
192 int rc;
193
194 memory = of_find_node_by_type(memory, "memory");
195 if (!memory)
196 return 0; /* it won't matter */
197
198 rc = prom_n_addr_cells(memory);
199 return rc;
200}
201
202static int __init get_mem_size_cells(void)
203{
204 struct device_node *memory = NULL;
205 int rc;
206
207 memory = of_find_node_by_type(memory, "memory");
208 if (!memory)
209 return 0; /* it won't matter */
210 rc = prom_n_size_cells(memory);
211 return rc;
212}
213
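/* Assemble an n-cell value from a device tree property buffer,
 * advancing the caller's cursor past the cells consumed. */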
214static unsigned long read_n_cells(int n, unsigned int **buf)
215{
216 unsigned long result = 0;
217
218 while (n--) {
219 result = (result << 32) | **buf;
220 (*buf)++;
221 }
222 return result;
223}
224
225/*
226 * Figure out to which domain a cpu belongs and stick it there.
227 * Return the id of the domain used.
228 */
229static int numa_setup_cpu(unsigned long lcpu)
230{
231 int numa_domain = 0;
232 struct device_node *cpu = find_cpu_node(lcpu);
233
234 if (!cpu) {
235 WARN_ON(1);
236 goto out;
237 }
238
239 numa_domain = of_node_numa_domain(cpu);
240
241 if (numa_domain >= num_online_nodes()) {
242 /*
243 * POWER4 LPAR uses 0xffff as invalid node,
244 * don't warn in this case.
245 */
246 if (numa_domain != 0xffff)
247 printk(KERN_ERR "WARNING: cpu %ld "
248 "maps to invalid NUMA node %d\n",
249 lcpu, numa_domain);
250 numa_domain = 0;
251 }
252out:
253 node_set_online(numa_domain);
254
255 map_cpu_to_node(lcpu, numa_domain);
256
257 of_node_put(cpu);
258
259 return numa_domain;
260}
261
262static int cpu_numa_callback(struct notifier_block *nfb,
263 unsigned long action,
264 void *hcpu)
265{
266 unsigned long lcpu = (unsigned long)hcpu;
267 int ret = NOTIFY_DONE;
268
269 switch (action) {
270 case CPU_UP_PREPARE:
271 if (min_common_depth == -1 || !numa_enabled)
272 map_cpu_to_node(lcpu, 0);
273 else
274 numa_setup_cpu(lcpu);
275 ret = NOTIFY_OK;
276 break;
277#ifdef CONFIG_HOTPLUG_CPU
278 case CPU_DEAD:
279 case CPU_UP_CANCELED:
280 unmap_cpu_from_node(lcpu);
281 ret = NOTIFY_OK;
282 break;
283#endif
284 }
285 return ret;
286}
287
288/*
289 * Check and possibly modify a memory region to enforce the memory limit.
290 *
291 * Returns the size the region should have to enforce the memory limit.
292 * This will either be the original value of size, a truncated value,
293 * or zero. If the returned value of size is 0 the region should be
294 * discarded as it lies wholly above the memory limit.
295 */
296static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size)
297{
298 /*
299 * We use lmb_end_of_DRAM() in here instead of memory_limit because
300 * we've already adjusted it for the limit and it takes care of
301 * having memory holes below the limit.
302 */
303 extern unsigned long memory_limit;
304
305 if (! memory_limit)
306 return size;
307
308 if (start + size <= lmb_end_of_DRAM())
309 return size;
310
311 if (start >= lmb_end_of_DRAM())
312 return 0;
313
314 return lmb_end_of_DRAM() - start;
315}
316
317static int __init parse_numa_properties(void)
318{
319 struct device_node *cpu = NULL;
320 struct device_node *memory = NULL;
321 int addr_cells, size_cells;
322 int max_domain = 0;
323 long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
324 unsigned long i;
325
326 if (numa_enabled == 0) {
327 printk(KERN_WARNING "NUMA disabled by user\n");
328 return -1;
329 }
330
331 numa_memory_lookup_table =
332 (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
333 memset(numa_memory_lookup_table, 0, entries * sizeof(char));
334
335 for (i = 0; i < entries ; i++)
336 numa_memory_lookup_table[i] = ARRAY_INITIALISER;
337
338 min_common_depth = find_min_common_depth();
339
340 dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
341 if (min_common_depth < 0)
342 return min_common_depth;
343
344 max_domain = numa_setup_cpu(boot_cpuid);
345
346 /*
347 * Even though we connect cpus to numa domains later in SMP init,
348 * we need to know the maximum node id now. This is because each
349 * node id must have NODE_DATA etc backing it.
350 * As a result of hotplug we could still have cpus appear later on
351 * with larger node ids. In that case we force the cpu into node 0.
352 */
353 for_each_cpu(i) {
354 int numa_domain;
355
356 cpu = find_cpu_node(i);
357
358 if (cpu) {
359 numa_domain = of_node_numa_domain(cpu);
360 of_node_put(cpu);
361
362 if (numa_domain < MAX_NUMNODES &&
363 max_domain < numa_domain)
364 max_domain = numa_domain;
365 }
366 }
367
368 addr_cells = get_mem_addr_cells();
369 size_cells = get_mem_size_cells();
370 memory = NULL;
371 while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
372 unsigned long start;
373 unsigned long size;
374 int numa_domain;
375 int ranges;
376 unsigned int *memcell_buf;
377 unsigned int len;
378
379 memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
380 if (!memcell_buf || len <= 0)
381 continue;
382
383 ranges = memory->n_addrs;
384new_range:
385 /* these are order-sensitive, and modify the buffer pointer */
386 start = read_n_cells(addr_cells, &memcell_buf);
387 size = read_n_cells(size_cells, &memcell_buf);
388
389 start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
390 size = _ALIGN_UP(size, MEMORY_INCREMENT);
391
392 numa_domain = of_node_numa_domain(memory);
393
394 if (numa_domain >= MAX_NUMNODES) {
395 if (numa_domain != 0xffff)
396 printk(KERN_ERR "WARNING: memory at %lx maps "
397 "to invalid NUMA node %d\n", start,
398 numa_domain);
399 numa_domain = 0;
400 }
401
402 if (max_domain < numa_domain)
403 max_domain = numa_domain;
404
405 if (! (size = numa_enforce_memory_limit(start, size))) {
406 if (--ranges)
407 goto new_range;
408 else
409 continue;
410 }
411
412 /*
413 * Initialize new node struct, or add to an existing one.
414 */
415 if (init_node_data[numa_domain].node_end_pfn) {
416 if ((start / PAGE_SIZE) <
417 init_node_data[numa_domain].node_start_pfn)
418 init_node_data[numa_domain].node_start_pfn =
419 start / PAGE_SIZE;
420 if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) >
421 init_node_data[numa_domain].node_end_pfn)
422 init_node_data[numa_domain].node_end_pfn =
423 (start / PAGE_SIZE) +
424 (size / PAGE_SIZE);
425
426 init_node_data[numa_domain].node_present_pages +=
427 size / PAGE_SIZE;
428 } else {
429 node_set_online(numa_domain);
430
431 init_node_data[numa_domain].node_start_pfn =
432 start / PAGE_SIZE;
433 init_node_data[numa_domain].node_end_pfn =
434 init_node_data[numa_domain].node_start_pfn +
435 size / PAGE_SIZE;
436 init_node_data[numa_domain].node_present_pages =
437 size / PAGE_SIZE;
438 }
439
440 for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
441 numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
442 numa_domain;
443
444 if (--ranges)
445 goto new_range;
446 }
447
448 for (i = 0; i <= max_domain; i++)
449 node_set_online(i);
450
451 return 0;
452}
453
454static void __init setup_nonnuma(void)
455{
456 unsigned long top_of_ram = lmb_end_of_DRAM();
457 unsigned long total_ram = lmb_phys_mem_size();
458 unsigned long i;
459
460 printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
461 top_of_ram, total_ram);
462 printk(KERN_INFO "Memory hole size: %ldMB\n",
463 (top_of_ram - total_ram) >> 20);
464
465 if (!numa_memory_lookup_table) {
466 long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
467 numa_memory_lookup_table =
468 (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
469 memset(numa_memory_lookup_table, 0, entries * sizeof(char));
470 for (i = 0; i < entries ; i++)
471 numa_memory_lookup_table[i] = ARRAY_INITIALISER;
472 }
473
474 map_cpu_to_node(boot_cpuid, 0);
475
476 node_set_online(0);
477
478 init_node_data[0].node_start_pfn = 0;
479 init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE;
480 init_node_data[0].node_present_pages = total_ram / PAGE_SIZE;
481
482 for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT)
483 numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
484}
485
486static void __init dump_numa_topology(void)
487{
488 unsigned int node;
489 unsigned int count;
490
491 if (min_common_depth == -1 || !numa_enabled)
492 return;
493
494 for_each_online_node(node) {
495 unsigned long i;
496
497 printk(KERN_INFO "Node %d Memory:", node);
498
499 count = 0;
500
501 for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) {
502 if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) {
503 if (count == 0)
504 printk(" 0x%lx", i);
505 ++count;
506 } else {
507 if (count > 0)
508 printk("-0x%lx", i);
509 count = 0;
510 }
511 }
512
513 if (count > 0)
514 printk("-0x%lx", i);
515 printk("\n");
516 }
517 return;
518}
519
520/*
521 * Allocate some memory, using the lmb or bootmem allocator as
522 * required. nid is the preferred node and end is the highest
523 * physical address in the node.
524 *
525 * Returns the physical address of the memory.
526 */
527static unsigned long careful_allocation(int nid, unsigned long size,
528 unsigned long align, unsigned long end)
529{
530 unsigned long ret = lmb_alloc_base(size, align, end);
531
532 /* retry over all memory */
533 if (!ret)
534 ret = lmb_alloc_base(size, align, lmb_end_of_DRAM());
535
536 if (!ret)
537 panic("numa.c: cannot allocate %lu bytes on node %d",
538 size, nid);
539
540 /*
541 * If the memory came from a previously allocated node, we must
542 * retry with the bootmem allocator.
543 */
544 if (pa_to_nid(ret) < nid) {
545 nid = pa_to_nid(ret);
546 ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid),
547 size, align, 0);
548
549 if (!ret)
550 panic("numa.c: cannot allocate %lu bytes on node %d",
551 size, nid);
552
553 ret = virt_to_abs(ret);
554
555 dbg("alloc_bootmem %lx %lx\n", ret, size);
556 }
557
558 return ret;
559}
560
561void __init do_init_bootmem(void)
562{
563 int nid;
564 int addr_cells, size_cells;
565 struct device_node *memory = NULL;
566 static struct notifier_block ppc64_numa_nb = {
567 .notifier_call = cpu_numa_callback,
568 .priority = 1 /* Must run before sched domains notifier. */
569 };
570
571 min_low_pfn = 0;
572 max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
573 max_pfn = max_low_pfn;
574
575 if (parse_numa_properties())
576 setup_nonnuma();
577 else
578 dump_numa_topology();
579
580 register_cpu_notifier(&ppc64_numa_nb);
581
582 for_each_online_node(nid) {
583 unsigned long start_paddr, end_paddr;
584 int i;
585 unsigned long bootmem_paddr;
586 unsigned long bootmap_pages;
587
588 start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE;
589 end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE;
590
591 /* Allocate the node structure node local if possible */
592 NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid,
593 sizeof(struct pglist_data),
594 SMP_CACHE_BYTES, end_paddr);
595 NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid));
596 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
597
598 dbg("node %d\n", nid);
599 dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
600
601 NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
602 NODE_DATA(nid)->node_start_pfn =
603 init_node_data[nid].node_start_pfn;
604 NODE_DATA(nid)->node_spanned_pages =
605 end_paddr - start_paddr;
606
607 if (NODE_DATA(nid)->node_spanned_pages == 0)
608 continue;
609
610 dbg("start_paddr = %lx\n", start_paddr);
611 dbg("end_paddr = %lx\n", end_paddr);
612
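		/* The bootmem allocator needs a bitmap with one bit per page of
		 * the node; size it and allocate it node-local where possible. */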
613 bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT);
614
615 bootmem_paddr = careful_allocation(nid,
616 bootmap_pages << PAGE_SHIFT,
617 PAGE_SIZE, end_paddr);
618 memset(abs_to_virt(bootmem_paddr), 0,
619 bootmap_pages << PAGE_SHIFT);
620 dbg("bootmap_paddr = %lx\n", bootmem_paddr);
621
622 init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
623 start_paddr >> PAGE_SHIFT,
624 end_paddr >> PAGE_SHIFT);
625
626 /*
627 * We need to do another scan of all memory sections to
628 * associate memory with the correct node.
629 */
630 addr_cells = get_mem_addr_cells();
631 size_cells = get_mem_size_cells();
632 memory = NULL;
633 while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
634 unsigned long mem_start, mem_size;
635 int numa_domain, ranges;
636 unsigned int *memcell_buf;
637 unsigned int len;
638
639 memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
640 if (!memcell_buf || len <= 0)
641 continue;
642
643 ranges = memory->n_addrs; /* ranges in cell */
644new_range:
645 mem_start = read_n_cells(addr_cells, &memcell_buf);
646 mem_size = read_n_cells(size_cells, &memcell_buf);
647 numa_domain = numa_enabled ? of_node_numa_domain(memory) : 0;
648
649 if (numa_domain != nid)
650 continue;
651
652 mem_size = numa_enforce_memory_limit(mem_start, mem_size);
653 if (mem_size) {
654 dbg("free_bootmem %lx %lx\n", mem_start, mem_size);
655 free_bootmem_node(NODE_DATA(nid), mem_start, mem_size);
656 }
657
658 if (--ranges) /* process all ranges in cell */
659 goto new_range;
660 }
661
662 /*
663 * Mark reserved regions on this node
664 */
665 for (i = 0; i < lmb.reserved.cnt; i++) {
666 unsigned long physbase = lmb.reserved.region[i].physbase;
667 unsigned long size = lmb.reserved.region[i].size;
668
669 if (pa_to_nid(physbase) != nid &&
670 pa_to_nid(physbase+size-1) != nid)
671 continue;
672
673 if (physbase < end_paddr &&
674 (physbase+size) > start_paddr) {
675 /* overlaps */
676 if (physbase < start_paddr) {
677 size -= start_paddr - physbase;
678 physbase = start_paddr;
679 }
680
681 if (size > end_paddr - physbase)
682 size = end_paddr - physbase;
683
684 dbg("reserve_bootmem %lx %lx\n", physbase,
685 size);
686 reserve_bootmem_node(NODE_DATA(nid), physbase,
687 size);
688 }
689 }
690 }
691}
692
693void __init paging_init(void)
694{
695 unsigned long zones_size[MAX_NR_ZONES];
696 unsigned long zholes_size[MAX_NR_ZONES];
697 int nid;
698
699 memset(zones_size, 0, sizeof(zones_size));
700 memset(zholes_size, 0, sizeof(zholes_size));
701
702 for_each_online_node(nid) {
703 unsigned long start_pfn;
704 unsigned long end_pfn;
705
706 start_pfn = init_node_data[nid].node_start_pfn;
707 end_pfn = init_node_data[nid].node_end_pfn;
708
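		/* Only ZONE_DMA is populated here; the holes are whatever part
		 * of the spanned pfn range is not backed by present pages. */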
709 zones_size[ZONE_DMA] = end_pfn - start_pfn;
710 zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
711 init_node_data[nid].node_present_pages;
712
713 dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
714 zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);
715
716 free_area_init_node(nid, NODE_DATA(nid), zones_size,
717 start_pfn, zholes_size);
718 }
719}
720
721static int __init early_numa(char *p)
722{
723 if (!p)
724 return 0;
725
726 if (strstr(p, "off"))
727 numa_enabled = 0;
728
729 if (strstr(p, "debug"))
730 numa_debug = 1;
731
732 return 0;
733}
734early_param("numa", early_numa);
diff --git a/arch/ppc64/mm/slb.c b/arch/ppc64/mm/slb.c
new file mode 100644
index 000000000000..6a20773f695d
--- /dev/null
+++ b/arch/ppc64/mm/slb.c
@@ -0,0 +1,159 @@
1/*
2 * PowerPC64 SLB support.
3 *
4 * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
5 * Based on earlier code written by:
6 * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
7 * Copyright (c) 2001 Dave Engebretsen
8 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
9 *
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version.
15 */
16
17#include <linux/config.h>
18#include <asm/pgtable.h>
19#include <asm/mmu.h>
20#include <asm/mmu_context.h>
21#include <asm/paca.h>
22#include <asm/cputable.h>
23
24extern void slb_allocate(unsigned long ea);
25
26static inline unsigned long mk_esid_data(unsigned long ea, unsigned long slot)
27{
28 return (ea & ESID_MASK) | SLB_ESID_V | slot;
29}
30
31static inline unsigned long mk_vsid_data(unsigned long ea, unsigned long flags)
32{
33 return (get_kernel_vsid(ea) << SLB_VSID_SHIFT) | flags;
34}
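/*
 * The two helpers above build the operands of the slbmte instruction:
 * the ESID word carries the effective segment id, the valid bit and the
 * SLB slot index, while the VSID word carries the virtual segment id
 * together with the protection and page-size flags.
 */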
35
36static inline void create_slbe(unsigned long ea, unsigned long vsid,
37 unsigned long flags, unsigned long entry)
38{
39 asm volatile("slbmte %0,%1" :
40 : "r" (mk_vsid_data(ea, flags)),
41 "r" (mk_esid_data(ea, entry))
42 : "memory" );
43}
44
45static void slb_flush_and_rebolt(void)
46{
47 /* If you change this make sure you change SLB_NUM_BOLTED
48 * appropriately too. */
49 unsigned long ksp_flags = SLB_VSID_KERNEL;
50 unsigned long ksp_esid_data;
51
52 WARN_ON(!irqs_disabled());
53
54 if (cpu_has_feature(CPU_FTR_16M_PAGE))
55 ksp_flags |= SLB_VSID_L;
56
57 ksp_esid_data = mk_esid_data(get_paca()->kstack, 2);
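	/* If the kernel stack happens to live in the KERNELBASE segment it is
	 * already covered by the bolted entry in slot 0; clearing the valid
	 * bit avoids inserting a duplicate SLB entry for the same ESID. */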
58 if ((ksp_esid_data & ESID_MASK) == KERNELBASE)
59 ksp_esid_data &= ~SLB_ESID_V;
60
61 /* We need to do this all in asm, so we're sure we don't touch
62 * the stack between the slbia and rebolting it. */
63 asm volatile("isync\n"
64 "slbia\n"
65 /* Slot 1 - first VMALLOC segment */
66 "slbmte %0,%1\n"
67 /* Slot 2 - kernel stack */
68 "slbmte %2,%3\n"
69 "isync"
70 :: "r"(mk_vsid_data(VMALLOCBASE, SLB_VSID_KERNEL)),
71 "r"(mk_esid_data(VMALLOCBASE, 1)),
72 "r"(mk_vsid_data(ksp_esid_data, ksp_flags)),
73 "r"(ksp_esid_data)
74 : "memory");
75}
76
77/* Flush all user entries from the segment table of the current processor. */
78void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
79{
80 unsigned long offset = get_paca()->slb_cache_ptr;
81 unsigned long esid_data = 0;
82 unsigned long pc = KSTK_EIP(tsk);
83 unsigned long stack = KSTK_ESP(tsk);
84 unsigned long unmapped_base;
85
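	/* paca->slb_cache records the user ESIDs inserted since the last
	 * switch; if they all fit, invalidate just those entries, otherwise
	 * (the cache overflowed) flush the whole SLB and rebolt the kernel
	 * entries. */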
86 if (offset <= SLB_CACHE_ENTRIES) {
87 int i;
88 asm volatile("isync" : : : "memory");
89 for (i = 0; i < offset; i++) {
90 esid_data = (unsigned long)get_paca()->slb_cache[i]
91 << SID_SHIFT;
92 asm volatile("slbie %0" : : "r" (esid_data));
93 }
94 asm volatile("isync" : : : "memory");
95 } else {
96 slb_flush_and_rebolt();
97 }
98
99 /* Workaround POWER5 < DD2.1 issue */
100 if (offset == 1 || offset > SLB_CACHE_ENTRIES)
101 asm volatile("slbie %0" : : "r" (esid_data));
102
103 get_paca()->slb_cache_ptr = 0;
104 get_paca()->context = mm->context;
105
106 /*
107 * preload some userspace segments into the SLB.
108 */
109 if (test_tsk_thread_flag(tsk, TIF_32BIT))
110 unmapped_base = TASK_UNMAPPED_BASE_USER32;
111 else
112 unmapped_base = TASK_UNMAPPED_BASE_USER64;
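	/* Preload the segments for the PC, the stack and the mmap base; the
	 * ESID comparisons and KERNELBASE checks below skip segments that
	 * are already covered or that are not user addresses. */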
113
114 if (pc >= KERNELBASE)
115 return;
116 slb_allocate(pc);
117
118 if (GET_ESID(pc) == GET_ESID(stack))
119 return;
120
121 if (stack >= KERNELBASE)
122 return;
123 slb_allocate(stack);
124
125 if ((GET_ESID(pc) == GET_ESID(unmapped_base))
126 || (GET_ESID(stack) == GET_ESID(unmapped_base)))
127 return;
128
129 if (unmapped_base >= KERNELBASE)
130 return;
131 slb_allocate(unmapped_base);
132}
133
134void slb_initialize(void)
135{
136 /* On iSeries the bolted entries have already been set up by
137 * the hypervisor from the lparMap data in head.S */
138#ifndef CONFIG_PPC_ISERIES
139 unsigned long flags = SLB_VSID_KERNEL;
140
141 /* Invalidate the entire SLB (even slot 0) & all the ERATS */
142 if (cpu_has_feature(CPU_FTR_16M_PAGE))
143 flags |= SLB_VSID_L;
144
145 asm volatile("isync":::"memory");
146 asm volatile("slbmte %0,%0"::"r" (0) : "memory");
147 asm volatile("isync; slbia; isync":::"memory");
148 create_slbe(KERNELBASE, get_kernel_vsid(KERNELBASE), flags, 0);
149 create_slbe(VMALLOCBASE, get_kernel_vsid(KERNELBASE),
150 SLB_VSID_KERNEL, 1);
151 /* We don't bolt the stack for the time being - we're in boot,
152 * so the stack is in the bolted segment. By the time it goes
153 * elsewhere, we'll call _switch() which will bolt in the new
154 * one. */
155 asm volatile("isync":::"memory");
156#endif
157
158 get_paca()->stab_rr = SLB_NUM_BOLTED;
159}
diff --git a/arch/ppc64/mm/slb_low.S b/arch/ppc64/mm/slb_low.S
new file mode 100644
index 000000000000..8379d678f70f
--- /dev/null
+++ b/arch/ppc64/mm/slb_low.S
@@ -0,0 +1,154 @@
1/*
2 * arch/ppc64/mm/slb_low.S
3 *
4 * Low-level SLB routines
5 *
6 * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
7 *
8 * Based on earlier C version:
9 * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
10 * Copyright (c) 2001 Dave Engebretsen
11 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
17 */
18
19#include <linux/config.h>
20#include <asm/processor.h>
21#include <asm/page.h>
22#include <asm/mmu.h>
23#include <asm/ppc_asm.h>
24#include <asm/offsets.h>
25#include <asm/cputable.h>
26
27/* void slb_allocate(unsigned long ea);
28 *
29 * Create an SLB entry for the given EA (user or kernel).
30 * r3 = faulting address, r13 = PACA
31 * r9, r10, r11 are clobbered by this function
32 * No other registers are examined or changed.
33 */
34_GLOBAL(slb_allocate)
35 /*
36 * First find a slot, round robin. Previously we tried to find
37 * a free slot first but that took too long. Unfortunately we
38 * don't have any LRU information to help us choose a slot.
39 */
40#ifdef CONFIG_PPC_ISERIES
41 /*
42 * On iSeries, the "bolted" stack segment can be cast out on
43 * shared processor switch so we need to check for a miss on
44 * it and restore it to the right slot.
45 */
46 ld r9,PACAKSAVE(r13)
47 clrrdi r9,r9,28
48 clrrdi r11,r3,28
49 li r10,SLB_NUM_BOLTED-1 /* Stack goes in last bolted slot */
50 cmpld r9,r11
51 beq 3f
52#endif /* CONFIG_PPC_ISERIES */
53
54 ld r10,PACASTABRR(r13)
55 addi r10,r10,1
56 /* use a cpu feature mask if we ever change our slb size */
57 cmpldi r10,SLB_NUM_ENTRIES
58
59 blt+ 4f
60 li r10,SLB_NUM_BOLTED
61
624:
63 std r10,PACASTABRR(r13)
643:
65 /* r3 = faulting address, r10 = entry */
66
67 srdi r9,r3,60 /* get region */
68 srdi r3,r3,28 /* get esid */
69 cmpldi cr7,r9,0xc /* cmp KERNELBASE for later use */
70
71 rldimi r10,r3,28,0 /* r10= ESID<<28 | entry */
72 oris r10,r10,SLB_ESID_V@h /* r10 |= SLB_ESID_V */
73
74 /* r3 = esid, r10 = esid_data, cr7 = <>KERNELBASE */
75
76 blt cr7,0f /* user or kernel? */
77
78 /* kernel address: proto-VSID = ESID */
79 /* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but
80 * this code will generate the protoVSID 0xfffffffff for the
81 * top segment. That's ok, the scramble below will translate
82 * it to VSID 0, which is reserved as a bad VSID - one which
83 * will never have any pages in it. */
84 li r11,SLB_VSID_KERNEL
85BEGIN_FTR_SECTION
86 bne cr7,9f
87 li r11,(SLB_VSID_KERNEL|SLB_VSID_L)
88END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
89 b 9f
90
910: /* user address: proto-VSID = context<<15 | ESID */
92 li r11,SLB_VSID_USER
93
94 srdi. r9,r3,13
95 bne- 8f /* invalid ea bits set */
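	/* An EA with ESID bits set above the supported user range cannot be
	 * mapped; label 8 below installs proto-VSID 0, which scrambles to the
	 * reserved bad VSID that never has any pages behind it. */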
96
97#ifdef CONFIG_HUGETLB_PAGE
98BEGIN_FTR_SECTION
99 /* check against the hugepage ranges */
100 cmpldi r3,(TASK_HPAGE_END>>SID_SHIFT)
101 bge 6f /* >= TASK_HPAGE_END */
102 cmpldi r3,(TASK_HPAGE_BASE>>SID_SHIFT)
103 bge 5f /* TASK_HPAGE_BASE..TASK_HPAGE_END */
104 cmpldi r3,16
105 bge 6f /* 4GB..TASK_HPAGE_BASE */
106
107 lhz r9,PACAHTLBSEGS(r13)
108 srd r9,r9,r3
109 andi. r9,r9,1
110 beq 6f
111
1125: /* this is a hugepage user address */
113 li r11,(SLB_VSID_USER|SLB_VSID_L)
114END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
115#endif /* CONFIG_HUGETLB_PAGE */
116
1176: ld r9,PACACONTEXTID(r13)
118 rldimi r3,r9,USER_ESID_BITS,0
119
1209: /* r3 = protovsid, r11 = flags, r10 = esid_data, cr7 = <>KERNELBASE */
121 ASM_VSID_SCRAMBLE(r3,r9)
122
123 rldimi r11,r3,SLB_VSID_SHIFT,16 /* combine VSID and flags */
124
125 /*
126 * No need for an isync before or after this slbmte. The exception
127 * we enter with and the rfid we exit with are context synchronizing.
128 */
129 slbmte r11,r10
130
131 bgelr cr7 /* we're done for kernel addresses */
132
133 /* Update the slb cache */
134 lhz r3,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */
135 cmpldi r3,SLB_CACHE_ENTRIES
136 bge 1f
137
138 /* still room in the slb cache */
139 sldi r11,r3,1 /* r11 = offset * sizeof(u16) */
140 rldicl r10,r10,36,28 /* get low 16 bits of the ESID */
141 add r11,r11,r13 /* r11 = (u16 *)paca + offset */
142 sth r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */
143 addi r3,r3,1 /* offset++ */
144 b 2f
1451: /* offset >= SLB_CACHE_ENTRIES */
146 li r3,SLB_CACHE_ENTRIES+1
1472:
148 sth r3,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */
149 blr
150
1518: /* invalid EA */
152 li r3,0 /* BAD_VSID */
153 li r11,SLB_VSID_USER /* flags don't much matter */
154 b 9b
diff --git a/arch/ppc64/mm/stab.c b/arch/ppc64/mm/stab.c
new file mode 100644
index 000000000000..31491131d5e4
--- /dev/null
+++ b/arch/ppc64/mm/stab.c
@@ -0,0 +1,239 @@
1/*
2 * PowerPC64 Segment Translation Support.
3 *
4 * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
5 * Copyright (c) 2001 Dave Engebretsen
6 *
7 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 */
14
15#include <linux/config.h>
16#include <asm/pgtable.h>
17#include <asm/mmu.h>
18#include <asm/mmu_context.h>
19#include <asm/paca.h>
20#include <asm/cputable.h>
21
22/* Both the segment table and SLB code use the following cache */
23#define NR_STAB_CACHE_ENTRIES 8
24DEFINE_PER_CPU(long, stab_cache_ptr);
25DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]);
26
27/*
28 * Create a segment table entry for the given esid/vsid pair.
29 */
30static int make_ste(unsigned long stab, unsigned long esid, unsigned long vsid)
31{
32 unsigned long esid_data, vsid_data;
33 unsigned long entry, group, old_esid, castout_entry, i;
34 unsigned int global_entry;
35 struct stab_entry *ste, *castout_ste;
36 unsigned long kernel_segment = (esid << SID_SHIFT) >= KERNELBASE;
37
38 vsid_data = vsid << STE_VSID_SHIFT;
39 esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V;
40 if (! kernel_segment)
41 esid_data |= STE_ESID_KS;
42
43 /* Search the primary group first. */
44 global_entry = (esid & 0x1f) << 3;
45 ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
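	/* The segment table is one page of 16-byte entries organised as 32
	 * groups of 8: the primary group index is the low 5 bits of the ESID,
	 * the secondary group (searched below) uses their ones-complement. */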
46
47 /* Find an empty entry, if one exists. */
48 for (group = 0; group < 2; group++) {
49 for (entry = 0; entry < 8; entry++, ste++) {
50 if (!(ste->esid_data & STE_ESID_V)) {
51 ste->vsid_data = vsid_data;
52 asm volatile("eieio":::"memory");
53 ste->esid_data = esid_data;
54 return (global_entry | entry);
55 }
56 }
57 /* Now search the secondary group. */
58 global_entry = ((~esid) & 0x1f) << 3;
59 ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
60 }
61
62 /*
63 * Could not find empty entry, pick one with a round robin selection.
64 * Search all entries in the two groups.
65 */
66 castout_entry = get_paca()->stab_rr;
67 for (i = 0; i < 16; i++) {
68 if (castout_entry < 8) {
69 global_entry = (esid & 0x1f) << 3;
70 ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
71 castout_ste = ste + castout_entry;
72 } else {
73 global_entry = ((~esid) & 0x1f) << 3;
74 ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
75 castout_ste = ste + (castout_entry - 8);
76 }
77
78 /* Don't cast out the first kernel segment */
79 if ((castout_ste->esid_data & ESID_MASK) != KERNELBASE)
80 break;
81
82 castout_entry = (castout_entry + 1) & 0xf;
83 }
84
85 get_paca()->stab_rr = (castout_entry + 1) & 0xf;
86
87 /* Modify the old entry to the new value. */
88
89 /* Force previous translations to complete. DRENG */
90 asm volatile("isync" : : : "memory");
91
92 old_esid = castout_ste->esid_data >> SID_SHIFT;
93 castout_ste->esid_data = 0; /* Invalidate old entry */
94
95 asm volatile("sync" : : : "memory"); /* Order update */
96
97 castout_ste->vsid_data = vsid_data;
98 asm volatile("eieio" : : : "memory"); /* Order update */
99 castout_ste->esid_data = esid_data;
100
101 asm volatile("slbie %0" : : "r" (old_esid << SID_SHIFT));
102 /* Ensure completion of slbie */
103 asm volatile("sync" : : : "memory");
104
105 return (global_entry | (castout_entry & 0x7));
106}
107
108/*
109 * Allocate a segment table entry for the given ea and mm
110 */
111static int __ste_allocate(unsigned long ea, struct mm_struct *mm)
112{
113 unsigned long vsid;
114 unsigned char stab_entry;
115 unsigned long offset;
116
117 /* Kernel or user address? */
118 if (ea >= KERNELBASE) {
119 vsid = get_kernel_vsid(ea);
120 } else {
121 if ((ea >= TASK_SIZE_USER64) || (! mm))
122 return 1;
123
124 vsid = get_vsid(mm->context.id, ea);
125 }
126
127 stab_entry = make_ste(get_paca()->stab_addr, GET_ESID(ea), vsid);
128
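	/* For user addresses, remember the STE index in the per-cpu cache so
	 * switch_stab() can invalidate just those entries; an overflowed
	 * cache pointer makes it fall back to wiping the whole table. */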
129 if (ea < KERNELBASE) {
130 offset = __get_cpu_var(stab_cache_ptr);
131 if (offset < NR_STAB_CACHE_ENTRIES)
132 __get_cpu_var(stab_cache[offset++]) = stab_entry;
133 else
134 offset = NR_STAB_CACHE_ENTRIES+1;
135 __get_cpu_var(stab_cache_ptr) = offset;
136
137 /* Order update */
138 asm volatile("sync":::"memory");
139 }
140
141 return 0;
142}
143
144int ste_allocate(unsigned long ea)
145{
146 return __ste_allocate(ea, current->mm);
147}
148
149/*
150 * Do the segment table work for a context switch: flush all user
151 * entries from the table, then preload some probably useful entries
152 * for the new task
153 */
154void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
155{
156 struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
157 struct stab_entry *ste;
158 unsigned long offset = __get_cpu_var(stab_cache_ptr);
159 unsigned long pc = KSTK_EIP(tsk);
160 unsigned long stack = KSTK_ESP(tsk);
161 unsigned long unmapped_base;
162
163 /* Force previous translations to complete. DRENG */
164 asm volatile("isync" : : : "memory");
165
166 if (offset <= NR_STAB_CACHE_ENTRIES) {
167 int i;
168
169 for (i = 0; i < offset; i++) {
170 ste = stab + __get_cpu_var(stab_cache[i]);
171 ste->esid_data = 0; /* invalidate entry */
172 }
173 } else {
174 unsigned long entry;
175
176 /* Invalidate all entries. */
177 ste = stab;
178
179 /* Never flush the first entry. */
180 ste += 1;
181 for (entry = 1;
182 entry < (PAGE_SIZE / sizeof(struct stab_entry));
183 entry++, ste++) {
184 unsigned long ea;
185 ea = ste->esid_data & ESID_MASK;
186 if (ea < KERNELBASE) {
187 ste->esid_data = 0;
188 }
189 }
190 }
191
192 asm volatile("sync; slbia; sync":::"memory");
193
194 __get_cpu_var(stab_cache_ptr) = 0;
195
196 /* Now preload some entries for the new task */
197 if (test_tsk_thread_flag(tsk, TIF_32BIT))
198 unmapped_base = TASK_UNMAPPED_BASE_USER32;
199 else
200 unmapped_base = TASK_UNMAPPED_BASE_USER64;
201
202 __ste_allocate(pc, mm);
203
204 if (GET_ESID(pc) == GET_ESID(stack))
205 return;
206
207 __ste_allocate(stack, mm);
208
209 if ((GET_ESID(pc) == GET_ESID(unmapped_base))
210 || (GET_ESID(stack) == GET_ESID(unmapped_base)))
211 return;
212
213 __ste_allocate(unmapped_base, mm);
214
215 /* Order update */
216 asm volatile("sync" : : : "memory");
217}
218
219extern void slb_initialize(void);
220
221/*
222 * Build an entry for the base kernel segment and put it into
223 * the segment table or SLB. All other segment table or SLB
224 * entries are faulted in.
225 */
226void stab_initialize(unsigned long stab)
227{
228 unsigned long vsid = get_kernel_vsid(KERNELBASE);
229
230 if (cpu_has_feature(CPU_FTR_SLB)) {
231 slb_initialize();
232 } else {
233 asm volatile("isync; slbia; isync":::"memory");
234 make_ste(stab, GET_ESID(KERNELBASE), vsid);
235
236 /* Order update */
237 asm volatile("sync":::"memory");
238 }
239}
diff --git a/arch/ppc64/mm/tlb.c b/arch/ppc64/mm/tlb.c
new file mode 100644
index 000000000000..26f0172c4527
--- /dev/null
+++ b/arch/ppc64/mm/tlb.c
@@ -0,0 +1,180 @@
1/*
2 * This file contains the routines for flushing entries from the
3 * TLB and MMU hash table.
4 *
5 * Derived from arch/ppc64/mm/init.c:
6 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
7 *
8 * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
9 * and Cort Dougan (PReP) (cort@cs.nmt.edu)
10 * Copyright (C) 1996 Paul Mackerras
11 * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
12 *
13 * Derived from "arch/i386/mm/init.c"
14 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
15 *
16 * Dave Engebretsen <engebret@us.ibm.com>
17 * Rework for PPC64 port.
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 */
24#include <linux/config.h>
25#include <linux/kernel.h>
26#include <linux/mm.h>
27#include <linux/init.h>
28#include <linux/percpu.h>
29#include <linux/hardirq.h>
30#include <asm/pgalloc.h>
31#include <asm/tlbflush.h>
32#include <asm/tlb.h>
33#include <linux/highmem.h>
34
35DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
36
37/* This is declared as we are using the more or less generic
38 * include/asm-ppc64/tlb.h file -- tgall
39 */
40DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
41DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
42unsigned long pte_freelist_forced_free;
43
44void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage)
45{
46 /* This is safe as we are holding page_table_lock */
47 cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
48 struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
49
50 if (atomic_read(&tlb->mm->mm_users) < 2 ||
51 cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
52 pte_free(ptepage);
53 return;
54 }
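	/* Other CPUs may still be walking these page tables (e.g. from the
	 * low-level hash-fault path) without taking page_table_lock, so the
	 * page cannot be freed immediately: batch it and release the batch
	 * through RCU once all CPUs have passed a quiescent state. */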
55
56 if (*batchp == NULL) {
57 *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
58 if (*batchp == NULL) {
59 pte_free_now(ptepage);
60 return;
61 }
62 (*batchp)->index = 0;
63 }
64 (*batchp)->pages[(*batchp)->index++] = ptepage;
65 if ((*batchp)->index == PTE_FREELIST_SIZE) {
66 pte_free_submit(*batchp);
67 *batchp = NULL;
68 }
69}
70
71/*
72 * Update the MMU hash table to correspond with a change to
73 * a Linux PTE. If wrprot is true, it is permissible to
74 * change the existing HPTE to read-only rather than removing it
75 * (if we remove it we should clear the _PTE_HPTEFLAGS bits).
76 */
77void hpte_update(struct mm_struct *mm, unsigned long addr,
78 unsigned long pte, int wrprot)
79{
80 int i;
81 unsigned long context = 0;
82 struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
83
84 if (REGION_ID(addr) == USER_REGION_ID)
85 context = mm->context.id;
86 i = batch->index;
87
88 /*
89 * This can happen when we are in the middle of a TLB batch and
90 * we encounter memory pressure (eg copy_page_range when it tries
91 * to allocate a new pte). If we have to reclaim memory and end
92 * up scanning and resetting referenced bits then our batch context
93 * will change mid stream.
94 */
95 if (unlikely(i != 0 && context != batch->context)) {
96 flush_tlb_pending();
97 i = 0;
98 }
99
100 if (i == 0) {
101 batch->context = context;
102 batch->mm = mm;
103 }
104 batch->pte[i] = __pte(pte);
105 batch->addr[i] = addr;
106 batch->index = ++i;
107 if (i >= PPC64_TLB_BATCH_NR)
108 flush_tlb_pending();
109}
110
111void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
112{
113 int i;
114 int cpu;
115 cpumask_t tmp;
116 int local = 0;
117
118 BUG_ON(in_interrupt());
119
120 cpu = get_cpu();
121 i = batch->index;
122 tmp = cpumask_of_cpu(cpu);
123 if (cpus_equal(batch->mm->cpu_vm_mask, tmp))
124 local = 1;
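	/* If only this CPU has ever run the mm, no other processor can hold
	 * translations for it, so the cheaper non-broadcast invalidation
	 * form can be used. */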
125
126 if (i == 1)
127 flush_hash_page(batch->context, batch->addr[0], batch->pte[0],
128 local);
129 else
130 flush_hash_range(batch->context, i, local);
131 batch->index = 0;
132 put_cpu();
133}
134
135#ifdef CONFIG_SMP
136static void pte_free_smp_sync(void *arg)
137{
138 /* Do nothing, just ensure we sync with all CPUs */
139}
140#endif
141
142/* This is only called when we are critically out of memory
143 * (and fail to get a page in pte_free_tlb).
144 */
145void pte_free_now(struct page *ptepage)
146{
147 pte_freelist_forced_free++;
148
149 smp_call_function(pte_free_smp_sync, NULL, 0, 1);
150
151 pte_free(ptepage);
152}
153
154static void pte_free_rcu_callback(struct rcu_head *head)
155{
156 struct pte_freelist_batch *batch =
157 container_of(head, struct pte_freelist_batch, rcu);
158 unsigned int i;
159
160 for (i = 0; i < batch->index; i++)
161 pte_free(batch->pages[i]);
162 free_page((unsigned long)batch);
163}
164
165void pte_free_submit(struct pte_freelist_batch *batch)
166{
167 INIT_RCU_HEAD(&batch->rcu);
168 call_rcu(&batch->rcu, pte_free_rcu_callback);
169}
170
171void pte_free_finish(void)
172{
173 /* This is safe as we are holding page_table_lock */
174 struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
175
176 if (*batchp == NULL)
177 return;
178 pte_free_submit(*batchp);
179 *batchp = NULL;
180}