author     Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
committer  Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
commit     1da177e4c3f41524e886b7f1b8a0c1fc7321cac2
tree       0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/i386/mm
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'arch/i386/mm')
-rw-r--r--  arch/i386/mm/Makefile          10
-rw-r--r--  arch/i386/mm/boot_ioremap.c    97
-rw-r--r--  arch/i386/mm/discontig.c      383
-rw-r--r--  arch/i386/mm/extable.c         36
-rw-r--r--  arch/i386/mm/fault.c          552
-rw-r--r--  arch/i386/mm/highmem.c         89
-rw-r--r--  arch/i386/mm/hugetlbpage.c    431
-rw-r--r--  arch/i386/mm/init.c           696
-rw-r--r--  arch/i386/mm/ioremap.c        320
-rw-r--r--  arch/i386/mm/mmap.c            76
-rw-r--r--  arch/i386/mm/pageattr.c       221
-rw-r--r--  arch/i386/mm/pgtable.c        260
12 files changed, 3171 insertions, 0 deletions
diff --git a/arch/i386/mm/Makefile b/arch/i386/mm/Makefile
new file mode 100644
index 000000000000..fc3272506846
--- /dev/null
+++ b/arch/i386/mm/Makefile
@@ -0,0 +1,10 @@
1#
2# Makefile for the linux i386-specific parts of the memory manager.
3#
4
5obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o
6
7obj-$(CONFIG_DISCONTIGMEM) += discontig.o
8obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
9obj-$(CONFIG_HIGHMEM) += highmem.o
10obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o
diff --git a/arch/i386/mm/boot_ioremap.c b/arch/i386/mm/boot_ioremap.c
new file mode 100644
index 000000000000..523b30634e0a
--- /dev/null
+++ b/arch/i386/mm/boot_ioremap.c
@@ -0,0 +1,97 @@
1/*
2 * arch/i386/mm/boot_ioremap.c
3 *
4 * Re-map functions for use early at boot time, before paging_init(), when the
5 * boot-time pagetables are still in use.
6 *
7 * Written by Dave Hansen <haveblue@us.ibm.com>
8 */
9
10
11/*
12 * We need to use the 2-level pagetable functions, but CONFIG_X86_PAE
13 * keeps that from happening. If anyone has a better way, I'm listening.
14 *
15 * boot_pte_t is defined only if this all works correctly
16 */
17
18#include <linux/config.h>
19#undef CONFIG_X86_PAE
20#include <asm/page.h>
21#include <asm/pgtable.h>
22#include <asm/tlbflush.h>
23#include <linux/init.h>
24#include <linux/stddef.h>
25
26/*
27 * I'm cheating here. It is known that the two boot PTE pages are
28 * allocated next to each other. I'm pretending that they're just
29 * one big array.
30 */
31
32#define BOOT_PTE_PTRS (PTRS_PER_PTE*2)
33#define boot_pte_index(address) \
34 (((address) >> PAGE_SHIFT) & (BOOT_PTE_PTRS - 1))
35
36static inline boot_pte_t* boot_vaddr_to_pte(void *address)
37{
38 boot_pte_t* boot_pg = (boot_pte_t*)pg0;
39 return &boot_pg[boot_pte_index((unsigned long)address)];
40}
41
42/*
43 * This is only for a caller who is clever enough to page-align
44 * phys_addr and virtual_source, and who also has a preference
45 * about which virtual address from which to steal ptes
46 */
47static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages,
48 void* virtual_source)
49{
50 boot_pte_t* pte;
51 int i;
52 char *vaddr = virtual_source;
53
54 pte = boot_vaddr_to_pte(virtual_source);
55 for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) {
56 set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL));
57 __flush_tlb_one(&vaddr[i*PAGE_SIZE]);
58 }
59}
60
61/* the virtual space we're going to remap comes from this array */
62#define BOOT_IOREMAP_PAGES 4
63#define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE)
64static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE]
65 __attribute__ ((aligned (PAGE_SIZE)));
66
67/*
68 * This only applies to things which need to ioremap before paging_init();
69 * bt_ioremap() and plain ioremap() are both useless at this point.
70 *
71 * When used, we're still using the boot-time pagetables, which only
72 * have 2 PTE pages mapping the first 8MB
73 *
74 * There is no unmap. The boot-time PTE pages aren't used after boot.
75 * If you really want the space back, just remap it yourself.
76 * boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE)
77 */
78__init void* boot_ioremap(unsigned long phys_addr, unsigned long size)
79{
80 unsigned long last_addr, offset;
81 unsigned int nrpages;
82
83 last_addr = phys_addr + size - 1;
84
85 /* page align the requested address */
86 offset = phys_addr & ~PAGE_MASK;
87 phys_addr &= PAGE_MASK;
88 size = PAGE_ALIGN(last_addr) - phys_addr;
89
90 nrpages = size >> PAGE_SHIFT;
91 if (nrpages > BOOT_IOREMAP_PAGES)
92 return NULL;
93
94 __boot_ioremap(phys_addr, nrpages, boot_ioremap_space);
95
96 return &boot_ioremap_space[offset];
97}
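
The page-rounding arithmetic in boot_ioremap() above is compact and easy to misread, so here is a minimal standalone sketch of the same rounding; the addresses are made-up examples and a 4 KiB page size is an assumption, not something taken from this patch.

#include <stdio.h>

#define PAGE_SHIFT         12
#define PAGE_SIZE          (1UL << PAGE_SHIFT)
#define PAGE_MASK          (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(addr)   (((addr) + PAGE_SIZE - 1) & PAGE_MASK)
#define BOOT_IOREMAP_PAGES 4

int main(void)
{
    unsigned long phys_addr = 0xfe123;   /* arbitrary, not page aligned */
    unsigned long size = 0x1800;         /* 6 KiB, crosses a page boundary */
    unsigned long last_addr = phys_addr + size - 1;
    unsigned long offset = phys_addr & ~PAGE_MASK;

    phys_addr &= PAGE_MASK;
    size = PAGE_ALIGN(last_addr) - phys_addr;

    printf("map %lu page(s) starting at %#lx, caller gets base + %#lx\n",
           size >> PAGE_SHIFT, phys_addr, offset);

    /* Anything larger than the fixed remap window is refused, as above. */
    return (size >> PAGE_SHIFT) > BOOT_IOREMAP_PAGES;
}
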
diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
new file mode 100644
index 000000000000..1726b4096b10
--- /dev/null
+++ b/arch/i386/mm/discontig.c
@@ -0,0 +1,383 @@
1/*
2 * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
3 * August 2002: added remote node KVA remap - Martin J. Bligh
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#include <linux/config.h>
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/mmzone.h>
29#include <linux/highmem.h>
30#include <linux/initrd.h>
31#include <linux/nodemask.h>
32#include <asm/e820.h>
33#include <asm/setup.h>
34#include <asm/mmzone.h>
35#include <bios_ebda.h>
36
37struct pglist_data *node_data[MAX_NUMNODES];
38bootmem_data_t node0_bdata;
39
40/*
41 * numa interface - we expect the numa architecture specific code to have
42 * populated the following initialisation:
43 *
44 * 1) node_online_map - the map of all nodes configured (online) in the system
45 * 2) physnode_map - the mapping between a pfn and owning node
46 * 3) node_start_pfn - the starting page frame number for a node
47 * 4) node_end_pfn - the ending page frame number for a node
48 */
49
50/*
51 * physnode_map keeps track of the physical memory layout of a generic
52 * numa node at a 256Mb granularity (each element of the array will
53 * represent 256Mb of memory and will be marked by the node id), so
54 * if the first gig is on node 0 and the second gig is on node 1,
55 * physnode_map will contain:
56 *
57 * physnode_map[0-3] = 0;
58 * physnode_map[4-7] = 1;
59 * physnode_map[8- ] = -1;
60 */
61s8 physnode_map[MAX_ELEMENTS] = { [0 ... (MAX_ELEMENTS - 1)] = -1};
62
63void memory_present(int nid, unsigned long start, unsigned long end)
64{
65 unsigned long pfn;
66
67 printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n",
68 nid, start, end);
69 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
70 printk(KERN_DEBUG " ");
71 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
72 physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
73 printk("%ld ", pfn);
74 }
75 printk("\n");
76}
77
78unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
79 unsigned long end_pfn)
80{
81 unsigned long nr_pages = end_pfn - start_pfn;
82
83 if (!nr_pages)
84 return 0;
85
86 return (nr_pages + 1) * sizeof(struct page);
87}
88
89unsigned long node_start_pfn[MAX_NUMNODES];
90unsigned long node_end_pfn[MAX_NUMNODES];
91
92extern unsigned long find_max_low_pfn(void);
93extern void find_max_pfn(void);
94extern void one_highpage_init(struct page *, int, int);
95
96extern struct e820map e820;
97extern unsigned long init_pg_tables_end;
98extern unsigned long highend_pfn, highstart_pfn;
99extern unsigned long max_low_pfn;
100extern unsigned long totalram_pages;
101extern unsigned long totalhigh_pages;
102
103#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
104
105unsigned long node_remap_start_pfn[MAX_NUMNODES];
106unsigned long node_remap_size[MAX_NUMNODES];
107unsigned long node_remap_offset[MAX_NUMNODES];
108void *node_remap_start_vaddr[MAX_NUMNODES];
109void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
110
111/*
112 * FLAT - support for basic PC memory model with discontig enabled, essentially
113 * a single node with all available processors in it with a flat
114 * memory map.
115 */
116int __init get_memcfg_numa_flat(void)
117{
118 printk("NUMA - single node, flat memory mode\n");
119
120 /* Run the memory configuration and find the top of memory. */
121 find_max_pfn();
122 node_start_pfn[0] = 0;
123 node_end_pfn[0] = max_pfn;
124 memory_present(0, 0, max_pfn);
125
126 /* Indicate there is one node available. */
127 nodes_clear(node_online_map);
128 node_set_online(0);
129 return 1;
130}
131
132/*
133 * Find the highest page frame number we have available for the node
134 */
135static void __init find_max_pfn_node(int nid)
136{
137 if (node_end_pfn[nid] > max_pfn)
138 node_end_pfn[nid] = max_pfn;
139 /*
140 * if a user has given mem=XXXX, then we need to make sure
141 * that the node _starts_ before that, too, not just ends
142 */
143 if (node_start_pfn[nid] > max_pfn)
144 node_start_pfn[nid] = max_pfn;
145 if (node_start_pfn[nid] > node_end_pfn[nid])
146 BUG();
147}
148
149/*
150 * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
151 * method. For node zero take this from the bottom of memory, for
152 * subsequent nodes place them at node_remap_start_vaddr which contains
153 * node local data in physically node local memory. See setup_memory()
154 * for details.
155 */
156static void __init allocate_pgdat(int nid)
157{
158 if (nid && node_has_online_mem(nid))
159 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
160 else {
161 NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT));
162 min_low_pfn += PFN_UP(sizeof(pg_data_t));
163 }
164}
165
166void __init remap_numa_kva(void)
167{
168 void *vaddr;
169 unsigned long pfn;
170 int node;
171
172 for_each_online_node(node) {
173 if (node == 0)
174 continue;
175 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
176 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
177 set_pmd_pfn((ulong) vaddr,
178 node_remap_start_pfn[node] + pfn,
179 PAGE_KERNEL_LARGE);
180 }
181 }
182}
183
184static unsigned long calculate_numa_remap_pages(void)
185{
186 int nid;
187 unsigned long size, reserve_pages = 0;
188
189 for_each_online_node(nid) {
190 if (nid == 0)
191 continue;
192 if (!node_remap_size[nid])
193 continue;
194
195 /*
196 * The acpi/srat node info can show hot-add memory zones
197 * where memory could be added but is not currently present.
198 */
199 if (node_start_pfn[nid] > max_pfn)
200 continue;
201 if (node_end_pfn[nid] > max_pfn)
202 node_end_pfn[nid] = max_pfn;
203
204 /* ensure the remap includes space for the pgdat. */
205 size = node_remap_size[nid] + sizeof(pg_data_t);
206
207 /* convert size to large (pmd size) pages, rounding up */
208 size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
209 /* now the roundup is correct, convert to PAGE_SIZE pages */
210 size = size * PTRS_PER_PTE;
211 printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
212 size, nid);
213 node_remap_size[nid] = size;
214 reserve_pages += size;
215 node_remap_offset[nid] = reserve_pages;
216 printk("Shrinking node %d from %ld pages to %ld pages\n",
217 nid, node_end_pfn[nid], node_end_pfn[nid] - size);
218 node_end_pfn[nid] -= size;
219 node_remap_start_pfn[nid] = node_end_pfn[nid];
220 }
221 printk("Reserving total of %ld pages for numa KVA remap\n",
222 reserve_pages);
223 return reserve_pages;
224}
225
226extern void setup_bootmem_allocator(void);
227unsigned long __init setup_memory(void)
228{
229 int nid;
230 unsigned long system_start_pfn, system_max_low_pfn;
231 unsigned long reserve_pages;
232
233 /*
234 * When mapping a NUMA machine we allocate the node_mem_map arrays
235 * from node local memory. They are then mapped directly into KVA
236 * between zone normal and vmalloc space. Calculate the size of
237 * this space and use it to adjust the boundary between ZONE_NORMAL
238 * and ZONE_HIGHMEM.
239 */
240 find_max_pfn();
241 get_memcfg_numa();
242
243 reserve_pages = calculate_numa_remap_pages();
244
245 /* partially used pages are not usable - thus round upwards */
246 system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
247
248 system_max_low_pfn = max_low_pfn = find_max_low_pfn() - reserve_pages;
249 printk("reserve_pages = %ld find_max_low_pfn() ~ %ld\n",
250 reserve_pages, max_low_pfn + reserve_pages);
251 printk("max_pfn = %ld\n", max_pfn);
252#ifdef CONFIG_HIGHMEM
253 highstart_pfn = highend_pfn = max_pfn;
254 if (max_pfn > system_max_low_pfn)
255 highstart_pfn = system_max_low_pfn;
256 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
257 pages_to_mb(highend_pfn - highstart_pfn));
258#endif
259 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
260 pages_to_mb(system_max_low_pfn));
261 printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
262 min_low_pfn, max_low_pfn, highstart_pfn);
263
264 printk("Low memory ends at vaddr %08lx\n",
265 (ulong) pfn_to_kaddr(max_low_pfn));
266 for_each_online_node(nid) {
267 node_remap_start_vaddr[nid] = pfn_to_kaddr(
268 (highstart_pfn + reserve_pages) - node_remap_offset[nid]);
269 allocate_pgdat(nid);
270 printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
271 (ulong) node_remap_start_vaddr[nid],
272 (ulong) pfn_to_kaddr(highstart_pfn + reserve_pages
273 - node_remap_offset[nid] + node_remap_size[nid]));
274 }
275 printk("High memory starts at vaddr %08lx\n",
276 (ulong) pfn_to_kaddr(highstart_pfn));
277 vmalloc_earlyreserve = reserve_pages * PAGE_SIZE;
278 for_each_online_node(nid)
279 find_max_pfn_node(nid);
280
281 memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
282 NODE_DATA(0)->bdata = &node0_bdata;
283 setup_bootmem_allocator();
284 return max_low_pfn;
285}
286
287void __init zone_sizes_init(void)
288{
289 int nid;
290
291 /*
292 * Insert nodes into pgdat_list backward so they appear in order.
293 * Clobber node 0's links and NULL out pgdat_list before starting.
294 */
295 pgdat_list = NULL;
296 for (nid = MAX_NUMNODES - 1; nid >= 0; nid--) {
297 if (!node_online(nid))
298 continue;
299 NODE_DATA(nid)->pgdat_next = pgdat_list;
300 pgdat_list = NODE_DATA(nid);
301 }
302
303 for_each_online_node(nid) {
304 unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
305 unsigned long *zholes_size;
306 unsigned int max_dma;
307
308 unsigned long low = max_low_pfn;
309 unsigned long start = node_start_pfn[nid];
310 unsigned long high = node_end_pfn[nid];
311
312 max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
313
314 if (node_has_online_mem(nid)){
315 if (start > low) {
316#ifdef CONFIG_HIGHMEM
317 BUG_ON(start > high);
318 zones_size[ZONE_HIGHMEM] = high - start;
319#endif
320 } else {
321 if (low < max_dma)
322 zones_size[ZONE_DMA] = low;
323 else {
324 BUG_ON(max_dma > low);
325 BUG_ON(low > high);
326 zones_size[ZONE_DMA] = max_dma;
327 zones_size[ZONE_NORMAL] = low - max_dma;
328#ifdef CONFIG_HIGHMEM
329 zones_size[ZONE_HIGHMEM] = high - low;
330#endif
331 }
332 }
333 }
334
335 zholes_size = get_zholes_size(nid);
336 /*
337 * We let the lmem_map for node 0 be allocated from the
338 * normal bootmem allocator, but other nodes come from the
339 * remapped KVA area - mbligh
340 */
341 if (!nid)
342 free_area_init_node(nid, NODE_DATA(nid),
343 zones_size, start, zholes_size);
344 else {
345 unsigned long lmem_map;
346 lmem_map = (unsigned long)node_remap_start_vaddr[nid];
347 lmem_map += sizeof(pg_data_t) + PAGE_SIZE - 1;
348 lmem_map &= PAGE_MASK;
349 NODE_DATA(nid)->node_mem_map = (struct page *)lmem_map;
350 free_area_init_node(nid, NODE_DATA(nid), zones_size,
351 start, zholes_size);
352 }
353 }
354 return;
355}
356
357void __init set_highmem_pages_init(int bad_ppro)
358{
359#ifdef CONFIG_HIGHMEM
360 struct zone *zone;
361
362 for_each_zone(zone) {
363 unsigned long node_pfn, node_high_size, zone_start_pfn;
364 struct page * zone_mem_map;
365
366 if (!is_highmem(zone))
367 continue;
368
369 printk("Initializing %s for node %d\n", zone->name,
370 zone->zone_pgdat->node_id);
371
372 node_high_size = zone->spanned_pages;
373 zone_mem_map = zone->zone_mem_map;
374 zone_start_pfn = zone->zone_start_pfn;
375
376 for (node_pfn = 0; node_pfn < node_high_size; node_pfn++) {
377 one_highpage_init((struct page *)(zone_mem_map + node_pfn),
378 zone_start_pfn + node_pfn, bad_ppro);
379 }
380 }
381 totalram_pages += totalhigh_pages;
382#endif
383}
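
The physnode_map comment near the top of this file describes a pfn-to-node lookup with one array element per 256 MB of physical memory. The standalone sketch below re-creates that lookup in user space using the same example layout (first gigabyte on node 0, second on node 1); the array size, helper name and pfn values are illustrative assumptions, not data from real hardware.

#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT        12
#define PAGES_PER_ELEMENT ((256UL * 1024 * 1024) >> PAGE_SHIFT)  /* 65536 pfns per 256 MB */
#define MAX_ELEMENTS      64                                     /* covers 16 GB here */

static signed char physnode_map[MAX_ELEMENTS];

/* One array slot per 256 MB, as described in the physnode_map comment above. */
static int pfn_to_node(unsigned long pfn)
{
    return physnode_map[pfn / PAGES_PER_ELEMENT];
}

int main(void)
{
    int i;

    memset(physnode_map, -1, sizeof(physnode_map));   /* -1 means "no node" */
    for (i = 0; i < 4; i++)
        physnode_map[i] = 0;                           /* first gig  -> node 0 */
    for (i = 4; i < 8; i++)
        physnode_map[i] = 1;                           /* second gig -> node 1 */

    printf("pfn 0x10000 (256 MB)  -> node %d\n", pfn_to_node(0x10000));
    printf("pfn 0x50000 (1.25 GB) -> node %d\n", pfn_to_node(0x50000));
    printf("pfn 0x90000 (2.25 GB) -> node %d\n", pfn_to_node(0x90000));
    return 0;
}
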
diff --git a/arch/i386/mm/extable.c b/arch/i386/mm/extable.c
new file mode 100644
index 000000000000..f706449319c4
--- /dev/null
+++ b/arch/i386/mm/extable.c
@@ -0,0 +1,36 @@
1/*
2 * linux/arch/i386/mm/extable.c
3 */
4
5#include <linux/config.h>
6#include <linux/module.h>
7#include <linux/spinlock.h>
8#include <asm/uaccess.h>
9
10int fixup_exception(struct pt_regs *regs)
11{
12 const struct exception_table_entry *fixup;
13
14#ifdef CONFIG_PNPBIOS
15 if (unlikely((regs->xcs & ~15) == (GDT_ENTRY_PNPBIOS_BASE << 3)))
16 {
17 extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
18 extern u32 pnp_bios_is_utter_crap;
19 pnp_bios_is_utter_crap = 1;
20 printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
21 __asm__ volatile(
22 "movl %0, %%esp\n\t"
23 "jmp *%1\n\t"
24 : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
25 panic("do_trap: can't hit this");
26 }
27#endif
28
29 fixup = search_exception_tables(regs->eip);
30 if (fixup) {
31 regs->eip = fixup->fixup;
32 return 1;
33 }
34
35 return 0;
36}
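
fixup_exception() above resolves a faulting EIP to recovery code through the kernel's exception table. The toy program below shows the shape of that lookup with a hand-built table and invented addresses; the two-word entry layout mirrors the i386 exception_table_entry, but the linear search and all names and values here are assumptions for illustration, not the kernel's implementation.

#include <stdio.h>

struct exception_table_entry {
    unsigned long insn;    /* address of an instruction that may fault */
    unsigned long fixup;   /* address to resume at if it does */
};

/* Hypothetical entries, like those emitted next to __get_user/__put_user. */
static const struct exception_table_entry ex_table[] = {
    { 0xc0100010, 0xc01ff000 },
    { 0xc0100230, 0xc01ff020 },
};

static const struct exception_table_entry *search_ex_table(unsigned long eip)
{
    unsigned int i;

    for (i = 0; i < sizeof(ex_table) / sizeof(ex_table[0]); i++)
        if (ex_table[i].insn == eip)
            return &ex_table[i];
    return NULL;   /* no fixup registered: the fault would become an oops */
}

int main(void)
{
    unsigned long faulting_eip = 0xc0100230;
    const struct exception_table_entry *fix = search_ex_table(faulting_eip);

    if (fix)
        printf("eip %#lx -> resume at %#lx\n", faulting_eip, fix->fixup);
    return 0;
}
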
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
new file mode 100644
index 000000000000..a509237c4815
--- /dev/null
+++ b/arch/i386/mm/fault.c
@@ -0,0 +1,552 @@
1/*
2 * linux/arch/i386/mm/fault.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 */
6
7#include <linux/signal.h>
8#include <linux/sched.h>
9#include <linux/kernel.h>
10#include <linux/errno.h>
11#include <linux/string.h>
12#include <linux/types.h>
13#include <linux/ptrace.h>
14#include <linux/mman.h>
15#include <linux/mm.h>
16#include <linux/smp.h>
17#include <linux/smp_lock.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20#include <linux/tty.h>
21#include <linux/vt_kern.h> /* For unblank_screen() */
22#include <linux/highmem.h>
23#include <linux/module.h>
24
25#include <asm/system.h>
26#include <asm/uaccess.h>
27#include <asm/desc.h>
28#include <asm/kdebug.h>
29
30extern void die(const char *,struct pt_regs *,long);
31
32/*
33 * Unlock any spinlocks which will prevent us from getting the
34 * message out
35 */
36void bust_spinlocks(int yes)
37{
38 int loglevel_save = console_loglevel;
39
40 if (yes) {
41 oops_in_progress = 1;
42 return;
43 }
44#ifdef CONFIG_VT
45 unblank_screen();
46#endif
47 oops_in_progress = 0;
48 /*
49 * OK, the message is on the console. Now we call printk()
50 * without oops_in_progress set so that printk will give klogd
51 * a poke. Hold onto your hats...
52 */
53 console_loglevel = 15; /* NMI oopser may have shut the console up */
54 printk(" ");
55 console_loglevel = loglevel_save;
56}
57
58/*
59 * Return EIP plus the CS segment base. The segment limit is also
60 * adjusted, clamped to the kernel/user address space (whichever is
61 * appropriate), and returned in *eip_limit.
62 *
63 * The segment is checked, because it might have been changed by another
64 * task between the original faulting instruction and here.
65 *
66 * If CS is no longer a valid code segment, or if EIP is beyond the
67 * limit, or if it is a kernel address when CS is not a kernel segment,
68 * then the returned value will be greater than *eip_limit.
69 *
70 * This is slow, but is very rarely executed.
71 */
72static inline unsigned long get_segment_eip(struct pt_regs *regs,
73 unsigned long *eip_limit)
74{
75 unsigned long eip = regs->eip;
76 unsigned seg = regs->xcs & 0xffff;
77 u32 seg_ar, seg_limit, base, *desc;
78
79 /* The standard kernel/user address space limit. */
80 *eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg;
81
82 /* Unlikely, but must come before segment checks. */
83 if (unlikely((regs->eflags & VM_MASK) != 0))
84 return eip + (seg << 4);
85
86 /* By far the most common cases. */
87 if (likely(seg == __USER_CS || seg == __KERNEL_CS))
88 return eip;
89
90 /* Check the segment exists, is within the current LDT/GDT size,
91 that kernel/user (ring 0..3) has the appropriate privilege,
92 that it's a code segment, and get the limit. */
93 __asm__ ("larl %3,%0; lsll %3,%1"
94 : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
95 if ((~seg_ar & 0x9800) || eip > seg_limit) {
96 *eip_limit = 0;
97 return 1; /* So that returned eip > *eip_limit. */
98 }
99
100 /* Get the GDT/LDT descriptor base.
101 When you look for races in this code remember that
102 LDT and other horrors are only used in user space. */
103 if (seg & (1<<2)) {
104 /* Must lock the LDT while reading it. */
105 down(&current->mm->context.sem);
106 desc = current->mm->context.ldt;
107 desc = (void *)desc + (seg & ~7);
108 } else {
109 /* Must disable preemption while reading the GDT. */
110 desc = (u32 *)&per_cpu(cpu_gdt_table, get_cpu());
111 desc = (void *)desc + (seg & ~7);
112 }
113
114 /* Decode the code segment base from the descriptor */
115 base = get_desc_base((unsigned long *)desc);
116
117 if (seg & (1<<2)) {
118 up(&current->mm->context.sem);
119 } else
120 put_cpu();
121
122 /* Adjust EIP and segment limit, and clamp at the kernel limit.
123 It's legitimate for segments to wrap at 0xffffffff. */
124 seg_limit += base;
125 if (seg_limit < *eip_limit && seg_limit >= base)
126 *eip_limit = seg_limit;
127 return eip + base;
128}
129
130/*
131 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
132 * Check that here and ignore it.
133 */
134static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
135{
136 unsigned long limit;
137 unsigned long instr = get_segment_eip (regs, &limit);
138 int scan_more = 1;
139 int prefetch = 0;
140 int i;
141
142 for (i = 0; scan_more && i < 15; i++) {
143 unsigned char opcode;
144 unsigned char instr_hi;
145 unsigned char instr_lo;
146
147 if (instr > limit)
148 break;
149 if (__get_user(opcode, (unsigned char *) instr))
150 break;
151
152 instr_hi = opcode & 0xf0;
153 instr_lo = opcode & 0x0f;
154 instr++;
155
156 switch (instr_hi) {
157 case 0x20:
158 case 0x30:
159 /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
160 scan_more = ((instr_lo & 7) == 0x6);
161 break;
162
163 case 0x60:
164 /* 0x64 thru 0x67 are valid prefixes in all modes. */
165 scan_more = (instr_lo & 0xC) == 0x4;
166 break;
167 case 0xF0:
168 /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
169 scan_more = !instr_lo || (instr_lo>>1) == 1;
170 break;
171 case 0x00:
172 /* Prefetch instruction is 0x0F0D or 0x0F18 */
173 scan_more = 0;
174 if (instr > limit)
175 break;
176 if (__get_user(opcode, (unsigned char *) instr))
177 break;
178 prefetch = (instr_lo == 0xF) &&
179 (opcode == 0x0D || opcode == 0x18);
180 break;
181 default:
182 scan_more = 0;
183 break;
184 }
185 }
186 return prefetch;
187}
188
189static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
190 unsigned long error_code)
191{
192 if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
193 boot_cpu_data.x86 >= 6)) {
194 /* Catch an obscure case of prefetch inside an NX page. */
195 if (nx_enabled && (error_code & 16))
196 return 0;
197 return __is_prefetch(regs, addr);
198 }
199 return 0;
200}
201
202fastcall void do_invalid_op(struct pt_regs *, unsigned long);
203
204/*
205 * This routine handles page faults. It determines the address,
206 * and the problem, and then passes it off to one of the appropriate
207 * routines.
208 *
209 * error_code:
210 * bit 0 == 0 means no page found, 1 means protection fault
211 * bit 1 == 0 means read, 1 means write
212 * bit 2 == 0 means kernel, 1 means user-mode
213 */
214fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code)
215{
216 struct task_struct *tsk;
217 struct mm_struct *mm;
218 struct vm_area_struct * vma;
219 unsigned long address;
220 unsigned long page;
221 int write;
222 siginfo_t info;
223
224 /* get the address */
225 __asm__("movl %%cr2,%0":"=r" (address));
226
227 if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
228 SIGSEGV) == NOTIFY_STOP)
229 return;
230 /* It's safe to allow irq's after cr2 has been saved */
231 if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
232 local_irq_enable();
233
234 tsk = current;
235
236 info.si_code = SEGV_MAPERR;
237
238 /*
239 * We fault-in kernel-space virtual memory on-demand. The
240 * 'reference' page table is init_mm.pgd.
241 *
242 * NOTE! We MUST NOT take any locks for this case. We may
243 * be in an interrupt or a critical region, and should
244 * only copy the information from the master page table,
245 * nothing more.
246 *
247 * This verifies that the fault happens in kernel space
248 * (error_code & 4) == 0, and that the fault was not a
249 * protection error (error_code & 1) == 0.
250 */
251 if (unlikely(address >= TASK_SIZE)) {
252 if (!(error_code & 5))
253 goto vmalloc_fault;
254 /*
255 * Don't take the mm semaphore here. If we fixup a prefetch
256 * fault we could otherwise deadlock.
257 */
258 goto bad_area_nosemaphore;
259 }
260
261 mm = tsk->mm;
262
263 /*
264 * If we're in an interrupt, have no user context or are running in an
265 * atomic region then we must not take the fault..
266 */
267 if (in_atomic() || !mm)
268 goto bad_area_nosemaphore;
269
270 /* When running in the kernel we expect faults to occur only to
271 * addresses in user space. All other faults represent errors in the
272 * kernel and should generate an OOPS. Unfortunately, in the case of an
273 * erroneous fault occurring in a code path which already holds mmap_sem
274 * we will deadlock attempting to validate the fault against the
275 * address space. Luckily the kernel only validly references user
276 * space from well defined areas of code, which are listed in the
277 * exceptions table.
278 *
279 * As the vast majority of faults will be valid we will only perform
280 * the source reference check when there is a possibility of a deadlock.
281 * Attempt to lock the address space, if we cannot we then validate the
282 * source. If this is invalid we can skip the address space check,
283 * thus avoiding the deadlock.
284 */
285 if (!down_read_trylock(&mm->mmap_sem)) {
286 if ((error_code & 4) == 0 &&
287 !search_exception_tables(regs->eip))
288 goto bad_area_nosemaphore;
289 down_read(&mm->mmap_sem);
290 }
291
292 vma = find_vma(mm, address);
293 if (!vma)
294 goto bad_area;
295 if (vma->vm_start <= address)
296 goto good_area;
297 if (!(vma->vm_flags & VM_GROWSDOWN))
298 goto bad_area;
299 if (error_code & 4) {
300 /*
301 * accessing the stack below %esp is always a bug.
302 * The "+ 32" is there due to some instructions (like
303 * pusha) doing post-decrement on the stack and that
304 * doesn't show up until later..
305 */
306 if (address + 32 < regs->esp)
307 goto bad_area;
308 }
309 if (expand_stack(vma, address))
310 goto bad_area;
311/*
312 * Ok, we have a good vm_area for this memory access, so
313 * we can handle it..
314 */
315good_area:
316 info.si_code = SEGV_ACCERR;
317 write = 0;
318 switch (error_code & 3) {
319 default: /* 3: write, present */
320#ifdef TEST_VERIFY_AREA
321 if (regs->cs == KERNEL_CS)
322 printk("WP fault at %08lx\n", regs->eip);
323#endif
324 /* fall through */
325 case 2: /* write, not present */
326 if (!(vma->vm_flags & VM_WRITE))
327 goto bad_area;
328 write++;
329 break;
330 case 1: /* read, present */
331 goto bad_area;
332 case 0: /* read, not present */
333 if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
334 goto bad_area;
335 }
336
337 survive:
338 /*
339 * If for any reason at all we couldn't handle the fault,
340 * make sure we exit gracefully rather than endlessly redo
341 * the fault.
342 */
343 switch (handle_mm_fault(mm, vma, address, write)) {
344 case VM_FAULT_MINOR:
345 tsk->min_flt++;
346 break;
347 case VM_FAULT_MAJOR:
348 tsk->maj_flt++;
349 break;
350 case VM_FAULT_SIGBUS:
351 goto do_sigbus;
352 case VM_FAULT_OOM:
353 goto out_of_memory;
354 default:
355 BUG();
356 }
357
358 /*
359 * Did it hit the DOS screen memory VA from vm86 mode?
360 */
361 if (regs->eflags & VM_MASK) {
362 unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
363 if (bit < 32)
364 tsk->thread.screen_bitmap |= 1 << bit;
365 }
366 up_read(&mm->mmap_sem);
367 return;
368
369/*
370 * Something tried to access memory that isn't in our memory map..
371 * Fix it, but check if it's kernel or user first..
372 */
373bad_area:
374 up_read(&mm->mmap_sem);
375
376bad_area_nosemaphore:
377 /* User mode accesses just cause a SIGSEGV */
378 if (error_code & 4) {
379 /*
380 * Valid to do another page fault here because this one came
381 * from user space.
382 */
383 if (is_prefetch(regs, address, error_code))
384 return;
385
386 tsk->thread.cr2 = address;
387 /* Kernel addresses are always protection faults */
388 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
389 tsk->thread.trap_no = 14;
390 info.si_signo = SIGSEGV;
391 info.si_errno = 0;
392 /* info.si_code has been set above */
393 info.si_addr = (void __user *)address;
394 force_sig_info(SIGSEGV, &info, tsk);
395 return;
396 }
397
398#ifdef CONFIG_X86_F00F_BUG
399 /*
400 * Pentium F0 0F C7 C8 bug workaround.
401 */
402 if (boot_cpu_data.f00f_bug) {
403 unsigned long nr;
404
405 nr = (address - idt_descr.address) >> 3;
406
407 if (nr == 6) {
408 do_invalid_op(regs, 0);
409 return;
410 }
411 }
412#endif
413
414no_context:
415 /* Are we prepared to handle this kernel fault? */
416 if (fixup_exception(regs))
417 return;
418
419 /*
420 * Valid to do another page fault here, because if this fault
421 * had been triggered by is_prefetch fixup_exception would have
422 * handled it.
423 */
424 if (is_prefetch(regs, address, error_code))
425 return;
426
427/*
428 * Oops. The kernel tried to access some bad page. We'll have to
429 * terminate things with extreme prejudice.
430 */
431
432 bust_spinlocks(1);
433
434#ifdef CONFIG_X86_PAE
435 if (error_code & 16) {
436 pte_t *pte = lookup_address(address);
437
438 if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
439 printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid);
440 }
441#endif
442 if (address < PAGE_SIZE)
443 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
444 else
445 printk(KERN_ALERT "Unable to handle kernel paging request");
446 printk(" at virtual address %08lx\n",address);
447 printk(KERN_ALERT " printing eip:\n");
448 printk("%08lx\n", regs->eip);
449 asm("movl %%cr3,%0":"=r" (page));
450 page = ((unsigned long *) __va(page))[address >> 22];
451 printk(KERN_ALERT "*pde = %08lx\n", page);
452 /*
453 * We must not directly access the pte in the highpte
454 * case, the page table might be allocated in highmem.
456 * And let's rather not kmap-atomic the pte, just in case
456 * it's allocated already.
457 */
458#ifndef CONFIG_HIGHPTE
459 if (page & 1) {
460 page &= PAGE_MASK;
461 address &= 0x003ff000;
462 page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
463 printk(KERN_ALERT "*pte = %08lx\n", page);
464 }
465#endif
466 die("Oops", regs, error_code);
467 bust_spinlocks(0);
468 do_exit(SIGKILL);
469
470/*
471 * We ran out of memory, or some other thing happened to us that made
472 * us unable to handle the page fault gracefully.
473 */
474out_of_memory:
475 up_read(&mm->mmap_sem);
476 if (tsk->pid == 1) {
477 yield();
478 down_read(&mm->mmap_sem);
479 goto survive;
480 }
481 printk("VM: killing process %s\n", tsk->comm);
482 if (error_code & 4)
483 do_exit(SIGKILL);
484 goto no_context;
485
486do_sigbus:
487 up_read(&mm->mmap_sem);
488
489 /* Kernel mode? Handle exceptions or die */
490 if (!(error_code & 4))
491 goto no_context;
492
493 /* User space => ok to do another page fault */
494 if (is_prefetch(regs, address, error_code))
495 return;
496
497 tsk->thread.cr2 = address;
498 tsk->thread.error_code = error_code;
499 tsk->thread.trap_no = 14;
500 info.si_signo = SIGBUS;
501 info.si_errno = 0;
502 info.si_code = BUS_ADRERR;
503 info.si_addr = (void __user *)address;
504 force_sig_info(SIGBUS, &info, tsk);
505 return;
506
507vmalloc_fault:
508 {
509 /*
510 * Synchronize this task's top level page-table
511 * with the 'reference' page table.
512 *
513 * Do _not_ use "tsk" here. We might be inside
514 * an interrupt in the middle of a task switch..
515 */
516 int index = pgd_index(address);
517 unsigned long pgd_paddr;
518 pgd_t *pgd, *pgd_k;
519 pud_t *pud, *pud_k;
520 pmd_t *pmd, *pmd_k;
521 pte_t *pte_k;
522
523 asm("movl %%cr3,%0":"=r" (pgd_paddr));
524 pgd = index + (pgd_t *)__va(pgd_paddr);
525 pgd_k = init_mm.pgd + index;
526
527 if (!pgd_present(*pgd_k))
528 goto no_context;
529
530 /*
531 * set_pgd(pgd, *pgd_k); here would be useless on PAE
532 * and redundant with the set_pmd() on non-PAE. As would
533 * set_pud.
534 */
535
536 pud = pud_offset(pgd, address);
537 pud_k = pud_offset(pgd_k, address);
538 if (!pud_present(*pud_k))
539 goto no_context;
540
541 pmd = pmd_offset(pud, address);
542 pmd_k = pmd_offset(pud_k, address);
543 if (!pmd_present(*pmd_k))
544 goto no_context;
545 set_pmd(pmd, *pmd_k);
546
547 pte_k = pte_offset_kernel(pmd_k, address);
548 if (!pte_present(*pte_k))
549 goto no_context;
550 return;
551 }
552}
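
The error_code bit layout documented above do_page_fault(), plus the "& 16" test used by the prefetch and NX paths, decodes as in this small standalone sketch; the sample values are arbitrary.

#include <stdio.h>

static void decode_error_code(unsigned long error_code)
{
    printf("%s, %s access, %s mode%s\n",
           error_code & 1  ? "protection fault" : "page not present",
           error_code & 2  ? "write" : "read",
           error_code & 4  ? "user" : "kernel",
           error_code & 16 ? ", instruction fetch (NX)" : "");
}

int main(void)
{
    decode_error_code(6);    /* user-mode write to a page that is not present */
    decode_error_code(17);   /* kernel fetch from a present but NX-protected page */
    return 0;
}
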
diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c
new file mode 100644
index 000000000000..fc4c4cad4e98
--- /dev/null
+++ b/arch/i386/mm/highmem.c
@@ -0,0 +1,89 @@
1#include <linux/highmem.h>
2
3void *kmap(struct page *page)
4{
5 might_sleep();
6 if (!PageHighMem(page))
7 return page_address(page);
8 return kmap_high(page);
9}
10
11void kunmap(struct page *page)
12{
13 if (in_interrupt())
14 BUG();
15 if (!PageHighMem(page))
16 return;
17 kunmap_high(page);
18}
19
20/*
21 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
22 * no global lock is needed and because the kmap code must perform a global TLB
23 * invalidation when the kmap pool wraps.
24 *
25 * However, when holding an atomic kmap it is not legal to sleep, so atomic
26 * kmaps are appropriate for short, tight code paths only.
27 */
28void *kmap_atomic(struct page *page, enum km_type type)
29{
30 enum fixed_addresses idx;
31 unsigned long vaddr;
32
33 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
34 inc_preempt_count();
35 if (!PageHighMem(page))
36 return page_address(page);
37
38 idx = type + KM_TYPE_NR*smp_processor_id();
39 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
40#ifdef CONFIG_DEBUG_HIGHMEM
41 if (!pte_none(*(kmap_pte-idx)))
42 BUG();
43#endif
44 set_pte(kmap_pte-idx, mk_pte(page, kmap_prot));
45 __flush_tlb_one(vaddr);
46
47 return (void*) vaddr;
48}
49
50void kunmap_atomic(void *kvaddr, enum km_type type)
51{
52#ifdef CONFIG_DEBUG_HIGHMEM
53 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
54 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
55
56 if (vaddr < FIXADDR_START) { // FIXME
57 dec_preempt_count();
58 preempt_check_resched();
59 return;
60 }
61
62 if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
63 BUG();
64
65 /*
66 * force other mappings to Oops if they try to access
67 * this pte without first remapping it
68 */
69 pte_clear(&init_mm, vaddr, kmap_pte-idx);
70 __flush_tlb_one(vaddr);
71#endif
72
73 dec_preempt_count();
74 preempt_check_resched();
75}
76
77struct page *kmap_atomic_to_page(void *ptr)
78{
79 unsigned long idx, vaddr = (unsigned long)ptr;
80 pte_t *pte;
81
82 if (vaddr < FIXADDR_START)
83 return virt_to_page(ptr);
84
85 idx = virt_to_fix(vaddr);
86 pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
87 return pte_page(*pte);
88}
89
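As a usage note for the kmap_atomic()/kunmap_atomic() pair above, a caller needing a short-lived mapping of a possibly-highmem page typically looks like the sketch below. This is a kernel-context sketch, not a standalone program; the helper name copy_from_page and the choice of the KM_USER0 slot are illustrative assumptions, not part of this file.

#include <linux/highmem.h>
#include <linux/string.h>

/* Copy the first 'len' bytes of a (possibly highmem) page into 'dst'. */
static void copy_from_page(struct page *page, void *dst, size_t len)
{
        /* Maps into a per-CPU fixmap slot; sleeping is not allowed until
         * the matching kunmap_atomic() below. */
        char *vaddr = kmap_atomic(page, KM_USER0);

        memcpy(dst, vaddr, len);
        kunmap_atomic(vaddr, KM_USER0);
}
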
diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
new file mode 100644
index 000000000000..a8c45143088b
--- /dev/null
+++ b/arch/i386/mm/hugetlbpage.c
@@ -0,0 +1,431 @@
1/*
2 * IA-32 Huge TLB Page Support for Kernel.
3 *
4 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
5 */
6
7#include <linux/config.h>
8#include <linux/init.h>
9#include <linux/fs.h>
10#include <linux/mm.h>
11#include <linux/hugetlb.h>
12#include <linux/pagemap.h>
13#include <linux/smp_lock.h>
14#include <linux/slab.h>
15#include <linux/err.h>
16#include <linux/sysctl.h>
17#include <asm/mman.h>
18#include <asm/tlb.h>
19#include <asm/tlbflush.h>
20
21static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
22{
23 pgd_t *pgd;
24 pud_t *pud;
25 pmd_t *pmd = NULL;
26
27 pgd = pgd_offset(mm, addr);
28 pud = pud_alloc(mm, pgd, addr);
29 pmd = pmd_alloc(mm, pud, addr);
30 return (pte_t *) pmd;
31}
32
33static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
34{
35 pgd_t *pgd;
36 pud_t *pud;
37 pmd_t *pmd = NULL;
38
39 pgd = pgd_offset(mm, addr);
40 pud = pud_offset(pgd, addr);
41 pmd = pmd_offset(pud, addr);
42 return (pte_t *) pmd;
43}
44
45static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, pte_t * page_table, int write_access)
46{
47 pte_t entry;
48
49 add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
50 if (write_access) {
51 entry =
52 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
53 } else
54 entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
55 entry = pte_mkyoung(entry);
56 mk_pte_huge(entry);
57 set_pte(page_table, entry);
58}
59
60/*
61 * This function checks for proper alignment of input addr and len parameters.
62 */
63int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
64{
65 if (len & ~HPAGE_MASK)
66 return -EINVAL;
67 if (addr & ~HPAGE_MASK)
68 return -EINVAL;
69 return 0;
70}
71
72int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
73 struct vm_area_struct *vma)
74{
75 pte_t *src_pte, *dst_pte, entry;
76 struct page *ptepage;
77 unsigned long addr = vma->vm_start;
78 unsigned long end = vma->vm_end;
79
80 while (addr < end) {
81 dst_pte = huge_pte_alloc(dst, addr);
82 if (!dst_pte)
83 goto nomem;
84 src_pte = huge_pte_offset(src, addr);
85 entry = *src_pte;
86 ptepage = pte_page(entry);
87 get_page(ptepage);
88 set_pte(dst_pte, entry);
89 add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
90 addr += HPAGE_SIZE;
91 }
92 return 0;
93
94nomem:
95 return -ENOMEM;
96}
97
98int
99follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
100 struct page **pages, struct vm_area_struct **vmas,
101 unsigned long *position, int *length, int i)
102{
103 unsigned long vpfn, vaddr = *position;
104 int remainder = *length;
105
106 WARN_ON(!is_vm_hugetlb_page(vma));
107
108 vpfn = vaddr/PAGE_SIZE;
109 while (vaddr < vma->vm_end && remainder) {
110
111 if (pages) {
112 pte_t *pte;
113 struct page *page;
114
115 pte = huge_pte_offset(mm, vaddr);
116
117 /* hugetlb should be locked, and hence, prefaulted */
118 WARN_ON(!pte || pte_none(*pte));
119
120 page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
121
122 WARN_ON(!PageCompound(page));
123
124 get_page(page);
125 pages[i] = page;
126 }
127
128 if (vmas)
129 vmas[i] = vma;
130
131 vaddr += PAGE_SIZE;
132 ++vpfn;
133 --remainder;
134 ++i;
135 }
136
137 *length = remainder;
138 *position = vaddr;
139
140 return i;
141}
142
143#if 0 /* This is just for testing */
144struct page *
145follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
146{
147 unsigned long start = address;
148 int length = 1;
149 int nr;
150 struct page *page;
151 struct vm_area_struct *vma;
152
153 vma = find_vma(mm, addr);
154 if (!vma || !is_vm_hugetlb_page(vma))
155 return ERR_PTR(-EINVAL);
156
157 pte = huge_pte_offset(mm, address);
158
159 /* hugetlb should be locked, and hence, prefaulted */
160 WARN_ON(!pte || pte_none(*pte));
161
162 page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
163
164 WARN_ON(!PageCompound(page));
165
166 return page;
167}
168
169int pmd_huge(pmd_t pmd)
170{
171 return 0;
172}
173
174struct page *
175follow_huge_pmd(struct mm_struct *mm, unsigned long address,
176 pmd_t *pmd, int write)
177{
178 return NULL;
179}
180
181#else
182
183struct page *
184follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
185{
186 return ERR_PTR(-EINVAL);
187}
188
189int pmd_huge(pmd_t pmd)
190{
191 return !!(pmd_val(pmd) & _PAGE_PSE);
192}
193
194struct page *
195follow_huge_pmd(struct mm_struct *mm, unsigned long address,
196 pmd_t *pmd, int write)
197{
198 struct page *page;
199
200 page = pte_page(*(pte_t *)pmd);
201 if (page)
202 page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
203 return page;
204}
205#endif
206
207void unmap_hugepage_range(struct vm_area_struct *vma,
208 unsigned long start, unsigned long end)
209{
210 struct mm_struct *mm = vma->vm_mm;
211 unsigned long address;
212 pte_t pte, *ptep;
213 struct page *page;
214
215 BUG_ON(start & (HPAGE_SIZE - 1));
216 BUG_ON(end & (HPAGE_SIZE - 1));
217
218 for (address = start; address < end; address += HPAGE_SIZE) {
219 ptep = huge_pte_offset(mm, address);
220 if (!ptep)
221 continue;
222 pte = ptep_get_and_clear(mm, address, ptep);
223 if (pte_none(pte))
224 continue;
225 page = pte_page(pte);
226 put_page(page);
227 }
228 add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
229 flush_tlb_range(vma, start, end);
230}
231
232int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
233{
234 struct mm_struct *mm = current->mm;
235 unsigned long addr;
236 int ret = 0;
237
238 BUG_ON(vma->vm_start & ~HPAGE_MASK);
239 BUG_ON(vma->vm_end & ~HPAGE_MASK);
240
241 spin_lock(&mm->page_table_lock);
242 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
243 unsigned long idx;
244 pte_t *pte = huge_pte_alloc(mm, addr);
245 struct page *page;
246
247 if (!pte) {
248 ret = -ENOMEM;
249 goto out;
250 }
251
252 if (!pte_none(*pte)) {
253 pmd_t *pmd = (pmd_t *) pte;
254
255 page = pmd_page(*pmd);
256 pmd_clear(pmd);
257 mm->nr_ptes--;
258 dec_page_state(nr_page_table_pages);
259 page_cache_release(page);
260 }
261
262 idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
263 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
264 page = find_get_page(mapping, idx);
265 if (!page) {
266 /* charge the fs quota first */
267 if (hugetlb_get_quota(mapping)) {
268 ret = -ENOMEM;
269 goto out;
270 }
271 page = alloc_huge_page();
272 if (!page) {
273 hugetlb_put_quota(mapping);
274 ret = -ENOMEM;
275 goto out;
276 }
277 ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
278 if (! ret) {
279 unlock_page(page);
280 } else {
281 hugetlb_put_quota(mapping);
282 free_huge_page(page);
283 goto out;
284 }
285 }
286 set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
287 }
288out:
289 spin_unlock(&mm->page_table_lock);
290 return ret;
291}
292
293/* x86_64 also uses this file */
294
295#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
296static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
297 unsigned long addr, unsigned long len,
298 unsigned long pgoff, unsigned long flags)
299{
300 struct mm_struct *mm = current->mm;
301 struct vm_area_struct *vma;
302 unsigned long start_addr;
303
304 start_addr = mm->free_area_cache;
305
306full_search:
307 addr = ALIGN(start_addr, HPAGE_SIZE);
308
309 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
310 /* At this point: (!vma || addr < vma->vm_end). */
311 if (TASK_SIZE - len < addr) {
312 /*
313 * Start a new search - just in case we missed
314 * some holes.
315 */
316 if (start_addr != TASK_UNMAPPED_BASE) {
317 start_addr = TASK_UNMAPPED_BASE;
318 goto full_search;
319 }
320 return -ENOMEM;
321 }
322 if (!vma || addr + len <= vma->vm_start) {
323 mm->free_area_cache = addr + len;
324 return addr;
325 }
326 addr = ALIGN(vma->vm_end, HPAGE_SIZE);
327 }
328}
329
330static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
331 unsigned long addr0, unsigned long len,
332 unsigned long pgoff, unsigned long flags)
333{
334 struct mm_struct *mm = current->mm;
335 struct vm_area_struct *vma, *prev_vma;
336 unsigned long base = mm->mmap_base, addr = addr0;
337 int first_time = 1;
338
339 /* don't allow allocations above current base */
340 if (mm->free_area_cache > base)
341 mm->free_area_cache = base;
342
343try_again:
344 /* make sure it can fit in the remaining address space */
345 if (mm->free_area_cache < len)
346 goto fail;
347
348 /* either no address requested or can't fit in requested address hole */
349 addr = (mm->free_area_cache - len) & HPAGE_MASK;
350 do {
351 /*
352 * Lookup failure means no vma is above this address,
353 * i.e. return with success:
354 */
355 if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
356 return addr;
357
358 /*
359 * new region fits between prev_vma->vm_end and
360 * vma->vm_start, use it:
361 */
362 if (addr + len <= vma->vm_start &&
363 (!prev_vma || (addr >= prev_vma->vm_end)))
364 /* remember the address as a hint for next time */
365 return (mm->free_area_cache = addr);
366 else
367 /* pull free_area_cache down to the first hole */
368 if (mm->free_area_cache == vma->vm_end)
369 mm->free_area_cache = vma->vm_start;
370
371 /* try just below the current vma->vm_start */
372 addr = (vma->vm_start - len) & HPAGE_MASK;
373 } while (len <= vma->vm_start);
374
375fail:
376 /*
377 * if hint left us with no space for the requested
378 * mapping then try again:
379 */
380 if (first_time) {
381 mm->free_area_cache = base;
382 first_time = 0;
383 goto try_again;
384 }
385 /*
386 * A failed mmap() very likely causes application failure,
387 * so fall back to the bottom-up function here. This scenario
388 * can happen with large stack limits and large mmap()
389 * allocations.
390 */
391 mm->free_area_cache = TASK_UNMAPPED_BASE;
392 addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
393 len, pgoff, flags);
394
395 /*
396 * Restore the topdown base:
397 */
398 mm->free_area_cache = base;
399
400 return addr;
401}
402
403unsigned long
404hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
405 unsigned long len, unsigned long pgoff, unsigned long flags)
406{
407 struct mm_struct *mm = current->mm;
408 struct vm_area_struct *vma;
409
410 if (len & ~HPAGE_MASK)
411 return -EINVAL;
412 if (len > TASK_SIZE)
413 return -ENOMEM;
414
415 if (addr) {
416 addr = ALIGN(addr, HPAGE_SIZE);
417 vma = find_vma(mm, addr);
418 if (TASK_SIZE - len >= addr &&
419 (!vma || addr + len <= vma->vm_start))
420 return addr;
421 }
422 if (mm->get_unmapped_area == arch_get_unmapped_area)
423 return hugetlb_get_unmapped_area_bottomup(file, addr, len,
424 pgoff, flags);
425 else
426 return hugetlb_get_unmapped_area_topdown(file, addr, len,
427 pgoff, flags);
428}
429
430#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
431
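The alignment rule that is_aligned_hugepage_range() and the BUG_ON()s in this file enforce is easy to check in isolation. The standalone sketch below assumes 4 MB huge pages (non-PAE i386) and uses made-up addresses; the locally defined macros mirror the kernel names only for readability.

#include <stdio.h>

#define HPAGE_SHIFT 22                        /* 4 MB huge pages on non-PAE i386 */
#define HPAGE_SIZE  (1UL << HPAGE_SHIFT)
#define HPAGE_MASK  (~(HPAGE_SIZE - 1))
#define EINVAL      22

/* Both the start address and the length must be multiples of the huge page size. */
static int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
{
    if (len & ~HPAGE_MASK)
        return -EINVAL;
    if (addr & ~HPAGE_MASK)
        return -EINVAL;
    return 0;
}

int main(void)
{
    printf("aligned:   %d\n", is_aligned_hugepage_range(0x40000000, 0x00800000));
    printf("unaligned: %d\n", is_aligned_hugepage_range(0x40001000, 0x00400000));
    return 0;
}
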
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
new file mode 100644
index 000000000000..7a7ea3737265
--- /dev/null
+++ b/arch/i386/mm/init.c
@@ -0,0 +1,696 @@
1/*
2 * linux/arch/i386/mm/init.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
7 */
8
9#include <linux/config.h>
10#include <linux/module.h>
11#include <linux/signal.h>
12#include <linux/sched.h>
13#include <linux/kernel.h>
14#include <linux/errno.h>
15#include <linux/string.h>
16#include <linux/types.h>
17#include <linux/ptrace.h>
18#include <linux/mman.h>
19#include <linux/mm.h>
20#include <linux/hugetlb.h>
21#include <linux/swap.h>
22#include <linux/smp.h>
23#include <linux/init.h>
24#include <linux/highmem.h>
25#include <linux/pagemap.h>
26#include <linux/bootmem.h>
27#include <linux/slab.h>
28#include <linux/proc_fs.h>
29#include <linux/efi.h>
30
31#include <asm/processor.h>
32#include <asm/system.h>
33#include <asm/uaccess.h>
34#include <asm/pgtable.h>
35#include <asm/dma.h>
36#include <asm/fixmap.h>
37#include <asm/e820.h>
38#include <asm/apic.h>
39#include <asm/tlb.h>
40#include <asm/tlbflush.h>
41#include <asm/sections.h>
42
43unsigned int __VMALLOC_RESERVE = 128 << 20;
44
45DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
46unsigned long highstart_pfn, highend_pfn;
47
48static int noinline do_test_wp_bit(void);
49
50/*
51 * Creates a middle page table and puts a pointer to it in the
52 * given global directory entry. This only returns the gd entry
53 * in non-PAE compilation mode, since the middle layer is folded.
54 */
55static pmd_t * __init one_md_table_init(pgd_t *pgd)
56{
57 pud_t *pud;
58 pmd_t *pmd_table;
59
60#ifdef CONFIG_X86_PAE
61 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
62 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
63 pud = pud_offset(pgd, 0);
64 if (pmd_table != pmd_offset(pud, 0))
65 BUG();
66#else
67 pud = pud_offset(pgd, 0);
68 pmd_table = pmd_offset(pud, 0);
69#endif
70
71 return pmd_table;
72}
73
74/*
75 * Create a page table and place a pointer to it in a middle page
76 * directory entry.
77 */
78static pte_t * __init one_page_table_init(pmd_t *pmd)
79{
80 if (pmd_none(*pmd)) {
81 pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
82 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
83 if (page_table != pte_offset_kernel(pmd, 0))
84 BUG();
85
86 return page_table;
87 }
88
89 return pte_offset_kernel(pmd, 0);
90}
91
92/*
93 * This function initializes a certain range of kernel virtual memory
94 * with new bootmem page tables, everywhere page tables are missing in
95 * the given range.
96 */
97
98/*
99 * NOTE: The pagetables are allocated contiguously in physical memory,
100 * so we can cache the place of the first one and move around without
101 * checking the pgd every time.
102 */
103static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
104{
105 pgd_t *pgd;
106 pud_t *pud;
107 pmd_t *pmd;
108 int pgd_idx, pmd_idx;
109 unsigned long vaddr;
110
111 vaddr = start;
112 pgd_idx = pgd_index(vaddr);
113 pmd_idx = pmd_index(vaddr);
114 pgd = pgd_base + pgd_idx;
115
116 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
117 if (pgd_none(*pgd))
118 one_md_table_init(pgd);
119 pud = pud_offset(pgd, vaddr);
120 pmd = pmd_offset(pud, vaddr);
121 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
122 if (pmd_none(*pmd))
123 one_page_table_init(pmd);
124
125 vaddr += PMD_SIZE;
126 }
127 pmd_idx = 0;
128 }
129}
130
131static inline int is_kernel_text(unsigned long addr)
132{
133 if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
134 return 1;
135 return 0;
136}
137
138/*
139 * This maps the physical memory to kernel virtual address space, a total
140 * of max_low_pfn pages, by creating page tables starting from address
141 * PAGE_OFFSET.
142 */
143static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
144{
145 unsigned long pfn;
146 pgd_t *pgd;
147 pmd_t *pmd;
148 pte_t *pte;
149 int pgd_idx, pmd_idx, pte_ofs;
150
151 pgd_idx = pgd_index(PAGE_OFFSET);
152 pgd = pgd_base + pgd_idx;
153 pfn = 0;
154
155 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
156 pmd = one_md_table_init(pgd);
157 if (pfn >= max_low_pfn)
158 continue;
159 for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
160 unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
161
162 /* Map with big pages if possible, otherwise create normal page tables. */
163 if (cpu_has_pse) {
164 unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
165
166 if (is_kernel_text(address) || is_kernel_text(address2))
167 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
168 else
169 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
170 pfn += PTRS_PER_PTE;
171 } else {
172 pte = one_page_table_init(pmd);
173
174 for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
175 if (is_kernel_text(address))
176 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
177 else
178 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
179 }
180 }
181 }
182 }
183}
184
185static inline int page_kills_ppro(unsigned long pagenr)
186{
187 if (pagenr >= 0x70000 && pagenr <= 0x7003F)
188 return 1;
189 return 0;
190}
191
192extern int is_available_memory(efi_memory_desc_t *);
193
194static inline int page_is_ram(unsigned long pagenr)
195{
196 int i;
197 unsigned long addr, end;
198
199 if (efi_enabled) {
200 efi_memory_desc_t *md;
201
202 for (i = 0; i < memmap.nr_map; i++) {
203 md = &memmap.map[i];
204 if (!is_available_memory(md))
205 continue;
206 addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
207 end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
208
209 if ((pagenr >= addr) && (pagenr < end))
210 return 1;
211 }
212 return 0;
213 }
214
215 for (i = 0; i < e820.nr_map; i++) {
216
217 if (e820.map[i].type != E820_RAM) /* not usable memory */
218 continue;
219 /*
220 * !!!FIXME!!! Some BIOSen report areas as RAM that
221 * are not. Notably the 640->1Mb area. We need a sanity
222 * check here.
223 */
224 addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
225 end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
226 if ((pagenr >= addr) && (pagenr < end))
227 return 1;
228 }
229 return 0;
230}
231
232#ifdef CONFIG_HIGHMEM
233pte_t *kmap_pte;
234pgprot_t kmap_prot;
235
236#define kmap_get_fixmap_pte(vaddr) \
237 pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
238
239static void __init kmap_init(void)
240{
241 unsigned long kmap_vstart;
242
243 /* cache the first kmap pte */
244 kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
245 kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
246
247 kmap_prot = PAGE_KERNEL;
248}
249
250static void __init permanent_kmaps_init(pgd_t *pgd_base)
251{
252 pgd_t *pgd;
253 pud_t *pud;
254 pmd_t *pmd;
255 pte_t *pte;
256 unsigned long vaddr;
257
258 vaddr = PKMAP_BASE;
259 page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
260
261 pgd = swapper_pg_dir + pgd_index(vaddr);
262 pud = pud_offset(pgd, vaddr);
263 pmd = pmd_offset(pud, vaddr);
264 pte = pte_offset_kernel(pmd, vaddr);
265 pkmap_page_table = pte;
266}
267
268void __init one_highpage_init(struct page *page, int pfn, int bad_ppro)
269{
270 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
271 ClearPageReserved(page);
272 set_bit(PG_highmem, &page->flags);
273 set_page_count(page, 1);
274 __free_page(page);
275 totalhigh_pages++;
276 } else
277 SetPageReserved(page);
278}
279
280#ifndef CONFIG_DISCONTIGMEM
281static void __init set_highmem_pages_init(int bad_ppro)
282{
283 int pfn;
284 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
285 one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
286 totalram_pages += totalhigh_pages;
287}
288#else
289extern void set_highmem_pages_init(int);
290#endif /* !CONFIG_DISCONTIGMEM */
291
292#else
293#define kmap_init() do { } while (0)
294#define permanent_kmaps_init(pgd_base) do { } while (0)
295#define set_highmem_pages_init(bad_ppro) do { } while (0)
296#endif /* CONFIG_HIGHMEM */
297
298unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
299unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
300
301#ifndef CONFIG_DISCONTIGMEM
302#define remap_numa_kva() do {} while (0)
303#else
304extern void __init remap_numa_kva(void);
305#endif
306
307static void __init pagetable_init (void)
308{
309 unsigned long vaddr;
310 pgd_t *pgd_base = swapper_pg_dir;
311
312#ifdef CONFIG_X86_PAE
313 int i;
314 /* Init entries of the first-level page table to the zero page */
315 for (i = 0; i < PTRS_PER_PGD; i++)
316 set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
317#endif
318
319 /* Enable PSE if available */
320 if (cpu_has_pse) {
321 set_in_cr4(X86_CR4_PSE);
322 }
323
324 /* Enable PGE if available */
325 if (cpu_has_pge) {
326 set_in_cr4(X86_CR4_PGE);
327 __PAGE_KERNEL |= _PAGE_GLOBAL;
328 __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
329 }
330
331 kernel_physical_mapping_init(pgd_base);
332 remap_numa_kva();
333
334 /*
335 * Fixed mappings, only the page table structure has to be
336 * created - mappings will be set by set_fixmap():
337 */
338 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
339 page_table_range_init(vaddr, 0, pgd_base);
340
341 permanent_kmaps_init(pgd_base);
342
343#ifdef CONFIG_X86_PAE
344 /*
345 * Add low memory identity-mappings - SMP needs it when
346 * starting up on an AP from real-mode. In the non-PAE
347 * case we already have these mappings through head.S.
348 * All user-space mappings are explicitly cleared after
349 * SMP startup.
350 */
351 pgd_base[0] = pgd_base[USER_PTRS_PER_PGD];
352#endif
353}
354
355#if defined(CONFIG_PM_DISK) || defined(CONFIG_SOFTWARE_SUSPEND)
356/*
357 * Swap suspend & friends need this for resume because things like the intel-agp
358 * driver might have split up a kernel 4MB mapping.
359 */
360char __nosavedata swsusp_pg_dir[PAGE_SIZE]
361 __attribute__ ((aligned (PAGE_SIZE)));
362
363static inline void save_pg_dir(void)
364{
365 memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
366}
367#else
368static inline void save_pg_dir(void)
369{
370}
371#endif
372
373void zap_low_mappings (void)
374{
375 int i;
376
377 save_pg_dir();
378
379 /*
380 * Zap initial low-memory mappings.
381 *
382 * Note that "pgd_clear()" doesn't do it for
383 * us, because pgd_clear() is a no-op on i386.
384 */
385 for (i = 0; i < USER_PTRS_PER_PGD; i++)
386#ifdef CONFIG_X86_PAE
387 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
388#else
389 set_pgd(swapper_pg_dir+i, __pgd(0));
390#endif
391 flush_tlb_all();
392}
393
394static int disable_nx __initdata = 0;
395u64 __supported_pte_mask = ~_PAGE_NX;
396
397/*
398 * noexec = on|off
399 *
400 * Control non-executable mappings.
401 *
402 * on Enable
403 * off Disable
404 */
405void __init noexec_setup(const char *str)
406{
407 if (!strncmp(str, "on",2) && cpu_has_nx) {
408 __supported_pte_mask |= _PAGE_NX;
409 disable_nx = 0;
410 } else if (!strncmp(str,"off",3)) {
411 disable_nx = 1;
412 __supported_pte_mask &= ~_PAGE_NX;
413 }
414}
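/*
 * A worked example, assuming a typical bootloader entry: passing
 *
 *     noexec=off
 *
 * on the kernel command line takes the second branch above, sets
 * disable_nx and strips _PAGE_NX from __supported_pte_mask, so NX stays
 * off even on CPUs that advertise it; "noexec=on" does the opposite.
 */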
415
416int nx_enabled = 0;
417#ifdef CONFIG_X86_PAE
418
419static void __init set_nx(void)
420{
421 unsigned int v[4], l, h;
422
423 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
424 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
425 if ((v[3] & (1 << 20)) && !disable_nx) {
426 rdmsr(MSR_EFER, l, h);
427 l |= EFER_NX;
428 wrmsr(MSR_EFER, l, h);
429 nx_enabled = 1;
430 __supported_pte_mask |= _PAGE_NX;
431 }
432 }
433}
434
435/*
436 * Enables/disables executability of a given kernel page and
437 * returns the previous setting.
438 */
439int __init set_kernel_exec(unsigned long vaddr, int enable)
440{
441 pte_t *pte;
442 int ret = 1;
443
444 if (!nx_enabled)
445 goto out;
446
447 pte = lookup_address(vaddr);
448 BUG_ON(!pte);
449
450 if (!pte_exec_kernel(*pte))
451 ret = 0;
452
453 if (enable)
454 pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
455 else
456 pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
457 __flush_tlb_all();
458out:
459 return ret;
460}
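/*
 * A minimal usage sketch: since set_kernel_exec() returns the previous
 * setting, a caller can flip a kernel page and restore it afterwards.
 *
 *     int was_exec = set_kernel_exec(vaddr, 0);
 *     ...
 *     set_kernel_exec(vaddr, was_exec);
 *
 * When nx_enabled is 0 the call leaves the page tables untouched and
 * simply reports the page as executable.
 */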
461
462#endif
463
464/*
465 * paging_init() sets up the page tables - note that the first 8MB are
466 * already mapped by head.S.
467 *
468 * This routine also unmaps the page at virtual kernel address 0, so
469 * that we can trap those pesky NULL-reference errors in the kernel.
470 */
471void __init paging_init(void)
472{
473#ifdef CONFIG_X86_PAE
474 set_nx();
475 if (nx_enabled)
476 printk("NX (Execute Disable) protection: active\n");
477#endif
478
479 pagetable_init();
480
481 load_cr3(swapper_pg_dir);
482
483#ifdef CONFIG_X86_PAE
484 /*
485 * We will bail out later - printk doesn't work right now so
486 * the user would just see a hanging kernel.
487 */
488 if (cpu_has_pae)
489 set_in_cr4(X86_CR4_PAE);
490#endif
491 __flush_tlb_all();
492
493 kmap_init();
494}
495
496/*
497 * Test if the WP bit works in supervisor mode. It isn't supported on 386s
498 * and also on some strange 486s (NexGen etc.). All 586+ CPUs are OK. This
499 * used to involve black magic jumps to work around some nasty CPU bugs,
500 * but fortunately the switch to using exceptions got rid of all that.
501 */
502
503static void __init test_wp_bit(void)
504{
505 printk("Checking if this processor honours the WP bit even in supervisor mode... ");
506
507 /* Any page-aligned address will do, the test is non-destructive */
508 __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
509 boot_cpu_data.wp_works_ok = do_test_wp_bit();
510 clear_fixmap(FIX_WP_TEST);
511
512 if (!boot_cpu_data.wp_works_ok) {
513 printk("No.\n");
514#ifdef CONFIG_X86_WP_WORKS_OK
515		panic("This kernel doesn't support CPUs with broken WP. Recompile it for a 386!");
516#endif
517 } else {
518 printk("Ok.\n");
519 }
520}
521
522static void __init set_max_mapnr_init(void)
523{
524#ifdef CONFIG_HIGHMEM
525 num_physpages = highend_pfn;
526#else
527 num_physpages = max_low_pfn;
528#endif
529#ifndef CONFIG_DISCONTIGMEM
530 max_mapnr = num_physpages;
531#endif
532}
533
534static struct kcore_list kcore_mem, kcore_vmalloc;
535
536void __init mem_init(void)
537{
538 extern int ppro_with_ram_bug(void);
539 int codesize, reservedpages, datasize, initsize;
540 int tmp;
541 int bad_ppro;
542
543#ifndef CONFIG_DISCONTIGMEM
544 if (!mem_map)
545 BUG();
546#endif
547
548 bad_ppro = ppro_with_ram_bug();
549
550#ifdef CONFIG_HIGHMEM
551 /* check that fixmap and pkmap do not overlap */
552 if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
553 printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
554 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
555 PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
556 BUG();
557 }
558#endif
559
560 set_max_mapnr_init();
561
562#ifdef CONFIG_HIGHMEM
563 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
564#else
565 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
566#endif
567
568 /* this will put all low memory onto the freelists */
569 totalram_pages += free_all_bootmem();
570
571 reservedpages = 0;
572 for (tmp = 0; tmp < max_low_pfn; tmp++)
573 /*
574 * Only count reserved RAM pages
575 */
576 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
577 reservedpages++;
578
579 set_highmem_pages_init(bad_ppro);
580
581 codesize = (unsigned long) &_etext - (unsigned long) &_text;
582 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
583 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
584
585 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
586 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
587 VMALLOC_END-VMALLOC_START);
588
589 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
590 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
591 num_physpages << (PAGE_SHIFT-10),
592 codesize >> 10,
593 reservedpages << (PAGE_SHIFT-10),
594 datasize >> 10,
595 initsize >> 10,
596 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
597 );
598
599#ifdef CONFIG_X86_PAE
600 if (!cpu_has_pae)
601 panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
602#endif
603 if (boot_cpu_data.wp_works_ok < 0)
604 test_wp_bit();
605
606 /*
607	 * Subtle. SMP is doing its boot stuff late (because it has to
608 * fork idle threads) - but it also needs low mappings for the
609 * protected-mode entry to work. We zap these entries only after
610 * the WP-bit has been tested.
611 */
612#ifndef CONFIG_SMP
613 zap_low_mappings();
614#endif
615}
616
617kmem_cache_t *pgd_cache;
618kmem_cache_t *pmd_cache;
619
620void __init pgtable_cache_init(void)
621{
622 if (PTRS_PER_PMD > 1) {
623 pmd_cache = kmem_cache_create("pmd",
624 PTRS_PER_PMD*sizeof(pmd_t),
625 PTRS_PER_PMD*sizeof(pmd_t),
626 0,
627 pmd_ctor,
628 NULL);
629 if (!pmd_cache)
630 panic("pgtable_cache_init(): cannot create pmd cache");
631 }
632 pgd_cache = kmem_cache_create("pgd",
633 PTRS_PER_PGD*sizeof(pgd_t),
634 PTRS_PER_PGD*sizeof(pgd_t),
635 0,
636 pgd_ctor,
637 PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
638 if (!pgd_cache)
639 panic("pgtable_cache_init(): Cannot create pgd cache");
640}
641
642/*
643 * This function cannot be __init, since exceptions don't work in that
644 * section. Put this after the callers, so that it cannot be inlined.
645 */
646static int noinline do_test_wp_bit(void)
647{
648 char tmp_reg;
649 int flag;
650
651 __asm__ __volatile__(
652 " movb %0,%1 \n"
653 "1: movb %1,%0 \n"
654 " xorl %2,%2 \n"
655 "2: \n"
656 ".section __ex_table,\"a\"\n"
657 " .align 4 \n"
658 " .long 1b,2b \n"
659 ".previous \n"
660 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
661 "=q" (tmp_reg),
662 "=r" (flag)
663 :"2" (1)
664 :"memory");
665
666 return flag;
667}
668
669void free_initmem(void)
670{
671 unsigned long addr;
672
673 addr = (unsigned long)(&__init_begin);
674 for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
675 ClearPageReserved(virt_to_page(addr));
676 set_page_count(virt_to_page(addr), 1);
677 memset((void *)addr, 0xcc, PAGE_SIZE);
678 free_page(addr);
679 totalram_pages++;
680 }
681 printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (__init_end - __init_begin) >> 10);
682}
683
684#ifdef CONFIG_BLK_DEV_INITRD
685void free_initrd_mem(unsigned long start, unsigned long end)
686{
687 if (start < end)
688 printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
689 for (; start < end; start += PAGE_SIZE) {
690 ClearPageReserved(virt_to_page(start));
691 set_page_count(virt_to_page(start), 1);
692 free_page(start);
693 totalram_pages++;
694 }
695}
696#endif
diff --git a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c
new file mode 100644
index 000000000000..db06f7399913
--- /dev/null
+++ b/arch/i386/mm/ioremap.c
@@ -0,0 +1,320 @@
1/*
2 * arch/i386/mm/ioremap.c
3 *
4 * Re-map IO memory to kernel address space so that we can access it.
5 * This is needed for high PCI addresses that aren't mapped in the
6 * 640k-1MB IO memory area on PC's
7 *
8 * (C) Copyright 1995 1996 Linus Torvalds
9 */
10
11#include <linux/vmalloc.h>
12#include <linux/init.h>
13#include <linux/slab.h>
14#include <asm/io.h>
15#include <asm/fixmap.h>
16#include <asm/cacheflush.h>
17#include <asm/tlbflush.h>
18#include <asm/pgtable.h>
19
20#define ISA_START_ADDRESS 0xa0000
21#define ISA_END_ADDRESS 0x100000
22
23static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
24 unsigned long end, unsigned long phys_addr, unsigned long flags)
25{
26 pte_t *pte;
27 unsigned long pfn;
28
29 pfn = phys_addr >> PAGE_SHIFT;
30 pte = pte_alloc_kernel(&init_mm, pmd, addr);
31 if (!pte)
32 return -ENOMEM;
33 do {
34 BUG_ON(!pte_none(*pte));
35 set_pte(pte, pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW |
36 _PAGE_DIRTY | _PAGE_ACCESSED | flags)));
37 pfn++;
38 } while (pte++, addr += PAGE_SIZE, addr != end);
39 return 0;
40}
41
42static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
43 unsigned long end, unsigned long phys_addr, unsigned long flags)
44{
45 pmd_t *pmd;
46 unsigned long next;
47
48 phys_addr -= addr;
49 pmd = pmd_alloc(&init_mm, pud, addr);
50 if (!pmd)
51 return -ENOMEM;
52 do {
53 next = pmd_addr_end(addr, end);
54 if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, flags))
55 return -ENOMEM;
56 } while (pmd++, addr = next, addr != end);
57 return 0;
58}
59
60static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr,
61 unsigned long end, unsigned long phys_addr, unsigned long flags)
62{
63 pud_t *pud;
64 unsigned long next;
65
66 phys_addr -= addr;
67 pud = pud_alloc(&init_mm, pgd, addr);
68 if (!pud)
69 return -ENOMEM;
70 do {
71 next = pud_addr_end(addr, end);
72 if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, flags))
73 return -ENOMEM;
74 } while (pud++, addr = next, addr != end);
75 return 0;
76}
77
78static int ioremap_page_range(unsigned long addr,
79 unsigned long end, unsigned long phys_addr, unsigned long flags)
80{
81 pgd_t *pgd;
82 unsigned long next;
83 int err;
84
85 BUG_ON(addr >= end);
86 flush_cache_all();
87 phys_addr -= addr;
88 pgd = pgd_offset_k(addr);
89 spin_lock(&init_mm.page_table_lock);
90 do {
91 next = pgd_addr_end(addr, end);
92 err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, flags);
93 if (err)
94 break;
95 } while (pgd++, addr = next, addr != end);
96 spin_unlock(&init_mm.page_table_lock);
97 flush_tlb_all();
98 return err;
99}
100
101/*
102 * Generic mapping function (not visible outside):
103 */
104
105/*
106 * Remap an arbitrary physical address space into the kernel virtual
107 * address space. Needed when the kernel wants to access high addresses
108 * directly.
109 *
110 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
111 * have to convert them into an offset in a page-aligned mapping, but the
112 * caller shouldn't need to know that small detail.
113 */
114void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
115{
116 void __iomem * addr;
117 struct vm_struct * area;
118 unsigned long offset, last_addr;
119
120 /* Don't allow wraparound or zero size */
121 last_addr = phys_addr + size - 1;
122 if (!size || last_addr < phys_addr)
123 return NULL;
124
125 /*
126 * Don't remap the low PCI/ISA area, it's always mapped..
127 */
128 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
129 return (void __iomem *) phys_to_virt(phys_addr);
130
131 /*
132 * Don't allow anybody to remap normal RAM that we're using..
133 */
134 if (phys_addr <= virt_to_phys(high_memory - 1)) {
135 char *t_addr, *t_end;
136 struct page *page;
137
138 t_addr = __va(phys_addr);
139 t_end = t_addr + (size - 1);
140
141 for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
142 if(!PageReserved(page))
143 return NULL;
144 }
145
146 /*
147 * Mappings have to be page-aligned
148 */
149 offset = phys_addr & ~PAGE_MASK;
150 phys_addr &= PAGE_MASK;
151 size = PAGE_ALIGN(last_addr+1) - phys_addr;
152
153 /*
154 * Ok, go for it..
155 */
156 area = get_vm_area(size, VM_IOREMAP | (flags << 20));
157 if (!area)
158 return NULL;
159 area->phys_addr = phys_addr;
160 addr = (void __iomem *) area->addr;
161 if (ioremap_page_range((unsigned long) addr,
162 (unsigned long) addr + size, phys_addr, flags)) {
163 vunmap((void __force *) addr);
164 return NULL;
165 }
166 return (void __iomem *) (offset + (char __iomem *)addr);
167}
168
169
170/**
171 * ioremap_nocache - map bus memory into CPU space
172 * @offset: bus address of the memory
173 * @size: size of the resource to map
174 *
175 * ioremap_nocache performs a platform specific sequence of operations to
176 * make bus memory CPU accessible via the readb/readw/readl/writeb/
177 * writew/writel functions and the other mmio helpers. The returned
178 * address is not guaranteed to be usable directly as a virtual
179 * address.
180 *
181 * This version of ioremap ensures that the memory is marked uncachable
182 * on the CPU as well as honouring existing caching rules from things like
183 * the PCI bus. Note that there are other caches and buffers on many
184 * busses. In particular driver authors should read up on PCI writes
185 *
186 * It's useful if some control registers are in such an area and
187 * write combining or read caching is not desirable:
188 *
189 * Must be freed with iounmap.
190 */
191
192void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
193{
194 unsigned long last_addr;
195 void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
196 if (!p)
197 return p;
198
199 /* Guaranteed to be > phys_addr, as per __ioremap() */
200 last_addr = phys_addr + size - 1;
201
202 if (last_addr < virt_to_phys(high_memory) - 1) {
203 struct page *ppage = virt_to_page(__va(phys_addr));
204 unsigned long npages;
205
206 phys_addr &= PAGE_MASK;
207
208 /* This might overflow and become zero.. */
209 last_addr = PAGE_ALIGN(last_addr);
210
211 /* .. but that's ok, because modulo-2**n arithmetic will make
212 * the page-aligned "last - first" come out right.
213 */
214 npages = (last_addr - phys_addr) >> PAGE_SHIFT;
215
216 if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
217 iounmap(p);
218 p = NULL;
219 }
220 global_flush_tlb();
221 }
222
223 return p;
224}
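/*
 * A minimal driver-side sketch of the API above; MMIO_PHYS and MMIO_LEN
 * are hypothetical values for an imagined device, not anything defined in
 * this file. The mapping is accessed through readl()/writel() and must be
 * released with iounmap().
 */
#if 0	/* usage sketch only */
#define MMIO_PHYS	0xfeb00000UL	/* assumed register base */
#define MMIO_LEN	0x1000UL

static int example_map_registers(void)
{
	void __iomem *regs = ioremap_nocache(MMIO_PHYS, MMIO_LEN);
	unsigned int id;

	if (!regs)
		return -ENOMEM;
	id = readl(regs);		/* 32-bit read at offset 0 */
	writel(id, regs + 4);		/* 32-bit write at offset 4 */
	iounmap(regs);
	return 0;
}
#endif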
225
226void iounmap(volatile void __iomem *addr)
227{
228 struct vm_struct *p;
229 if ((void __force *) addr <= high_memory)
230 return;
231
232 /*
233 * __ioremap special-cases the PCI/ISA range by not instantiating a
234 * vm_area and by simply returning an address into the kernel mapping
235 * of ISA space. So handle that here.
236 */
237 if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
238 addr < phys_to_virt(ISA_END_ADDRESS))
239 return;
240
241 p = remove_vm_area((void *) (PAGE_MASK & (unsigned long __force) addr));
242 if (!p) {
243 printk("__iounmap: bad address %p\n", addr);
244 return;
245 }
246
247 if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) {
248 /* p->size includes the guard page, but cpa doesn't like that */
249 change_page_attr(virt_to_page(__va(p->phys_addr)),
250 p->size >> PAGE_SHIFT,
251 PAGE_KERNEL);
252 global_flush_tlb();
253 }
254 kfree(p);
255}
256
257void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
258{
259 unsigned long offset, last_addr;
260 unsigned int nrpages;
261 enum fixed_addresses idx;
262
263 /* Don't allow wraparound or zero size */
264 last_addr = phys_addr + size - 1;
265 if (!size || last_addr < phys_addr)
266 return NULL;
267
268 /*
269 * Don't remap the low PCI/ISA area, it's always mapped..
270 */
271 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
272 return phys_to_virt(phys_addr);
273
274 /*
275 * Mappings have to be page-aligned
276 */
277 offset = phys_addr & ~PAGE_MASK;
278 phys_addr &= PAGE_MASK;
279 size = PAGE_ALIGN(last_addr) - phys_addr;
280
281 /*
282 * Mappings have to fit in the FIX_BTMAP area.
283 */
284 nrpages = size >> PAGE_SHIFT;
285 if (nrpages > NR_FIX_BTMAPS)
286 return NULL;
287
288 /*
289 * Ok, go for it..
290 */
291 idx = FIX_BTMAP_BEGIN;
292 while (nrpages > 0) {
293 set_fixmap(idx, phys_addr);
294 phys_addr += PAGE_SIZE;
295 --idx;
296 --nrpages;
297 }
298 return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
299}
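/*
 * A usage sketch, assuming a hypothetical firmware table at TABLE_PHYS:
 * early in boot, before the regular ioremap()/vmalloc machinery is usable,
 * the fixmap-backed pair above lets code peek at physical memory and then
 * drop the temporary mapping again.
 *
 *     char *p = bt_ioremap(TABLE_PHYS, 128);
 *     if (p) {
 *             ... inspect up to 128 bytes at p ...
 *             bt_iounmap(p, 128);
 *     }
 */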
300
301void __init bt_iounmap(void *addr, unsigned long size)
302{
303 unsigned long virt_addr;
304 unsigned long offset;
305 unsigned int nrpages;
306 enum fixed_addresses idx;
307
308 virt_addr = (unsigned long)addr;
309 if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
310 return;
311 offset = virt_addr & ~PAGE_MASK;
312 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
313
314 idx = FIX_BTMAP_BEGIN;
315 while (nrpages > 0) {
316 clear_fixmap(idx);
317 --idx;
318 --nrpages;
319 }
320}
diff --git a/arch/i386/mm/mmap.c b/arch/i386/mm/mmap.c
new file mode 100644
index 000000000000..e4730a1a43dd
--- /dev/null
+++ b/arch/i386/mm/mmap.c
@@ -0,0 +1,76 @@
1/*
2 * linux/arch/i386/mm/mmap.c
3 *
4 * flexible mmap layout support
5 *
6 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
7 * All Rights Reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 *
23 *
24 * Started by Ingo Molnar <mingo@elte.hu>
25 */
26
27#include <linux/personality.h>
28#include <linux/mm.h>
29#include <linux/random.h>
30
31/*
32 * Top of mmap area (just below the process stack).
33 *
34 * Leave at least a ~128 MB hole.
35 */
36#define MIN_GAP (128*1024*1024)
37#define MAX_GAP (TASK_SIZE/6*5)
38
39static inline unsigned long mmap_base(struct mm_struct *mm)
40{
41 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
42 unsigned long random_factor = 0;
43
44 if (current->flags & PF_RANDOMIZE)
45 random_factor = get_random_int() % (1024*1024);
46
47 if (gap < MIN_GAP)
48 gap = MIN_GAP;
49 else if (gap > MAX_GAP)
50 gap = MAX_GAP;
51
52 return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
53}
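/*
 * Worked example, assuming the usual 3GB TASK_SIZE (0xc0000000), an 8MB
 * RLIMIT_STACK and randomization disabled:
 *
 *     mmap_base = PAGE_ALIGN(0xc0000000 - 0x00800000 - 0) = 0xbf800000
 *
 * i.e. the top-down mmap area begins 8MB below the top of user space;
 * with PF_RANDOMIZE set, up to a further ~1MB is subtracted.
 */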
54
55/*
56 * This function, called very early during the creation of a new
57 * process VM image, sets up which VM layout function to use:
58 */
59void arch_pick_mmap_layout(struct mm_struct *mm)
60{
61 /*
62 * Fall back to the standard layout if the personality
63 * bit is set, or if the expected stack growth is unlimited:
64 */
65 if (sysctl_legacy_va_layout ||
66 (current->personality & ADDR_COMPAT_LAYOUT) ||
67 current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
68 mm->mmap_base = TASK_UNMAPPED_BASE;
69 mm->get_unmapped_area = arch_get_unmapped_area;
70 mm->unmap_area = arch_unmap_area;
71 } else {
72 mm->mmap_base = mmap_base(mm);
73 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
74 mm->unmap_area = arch_unmap_area_topdown;
75 }
76}
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
new file mode 100644
index 000000000000..cb3da6baa704
--- /dev/null
+++ b/arch/i386/mm/pageattr.c
@@ -0,0 +1,221 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * Thanks to Ben LaHaise for precious feedback.
4 */
5
6#include <linux/config.h>
7#include <linux/mm.h>
8#include <linux/sched.h>
9#include <linux/highmem.h>
10#include <linux/module.h>
11#include <linux/slab.h>
12#include <asm/uaccess.h>
13#include <asm/processor.h>
14#include <asm/tlbflush.h>
15
16static DEFINE_SPINLOCK(cpa_lock);
17static struct list_head df_list = LIST_HEAD_INIT(df_list);
18
19
20pte_t *lookup_address(unsigned long address)
21{
22 pgd_t *pgd = pgd_offset_k(address);
23 pud_t *pud;
24 pmd_t *pmd;
25 if (pgd_none(*pgd))
26 return NULL;
27 pud = pud_offset(pgd, address);
28 if (pud_none(*pud))
29 return NULL;
30 pmd = pmd_offset(pud, address);
31 if (pmd_none(*pmd))
32 return NULL;
33 if (pmd_large(*pmd))
34 return (pte_t *)pmd;
35 return pte_offset_kernel(pmd, address);
36}
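/*
 * A minimal sketch of how a caller might use lookup_address(); the helper
 * name is hypothetical. Note that for a 2/4MB mapping the returned pointer
 * is really the pmd entry cast to a pte.
 */
#if 0	/* usage sketch only */
static int example_addr_is_mapped(unsigned long addr)
{
	pte_t *pte = lookup_address(addr);

	return pte && pte_present(*pte);
}
#endif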
37
38static struct page *split_large_page(unsigned long address, pgprot_t prot)
39{
40 int i;
41 unsigned long addr;
42 struct page *base;
43 pte_t *pbase;
44
45 spin_unlock_irq(&cpa_lock);
46 base = alloc_pages(GFP_KERNEL, 0);
47 spin_lock_irq(&cpa_lock);
48 if (!base)
49 return NULL;
50
51 address = __pa(address);
52 addr = address & LARGE_PAGE_MASK;
53 pbase = (pte_t *)page_address(base);
54 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
55 pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
56 addr == address ? prot : PAGE_KERNEL);
57 }
58 return base;
59}
60
61static void flush_kernel_map(void *dummy)
62{
63 /* Could use CLFLUSH here if the CPU supports it (Hammer,P4) */
64 if (boot_cpu_data.x86_model >= 4)
65 asm volatile("wbinvd":::"memory");
66	/* Flush all to work around errata in early Athlons regarding
67 * large page flushing.
68 */
69 __flush_tlb_all();
70}
71
72static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
73{
74 struct page *page;
75 unsigned long flags;
76
77 set_pte_atomic(kpte, pte); /* change init_mm */
78 if (PTRS_PER_PMD > 1)
79 return;
80
81 spin_lock_irqsave(&pgd_lock, flags);
82 for (page = pgd_list; page; page = (struct page *)page->index) {
83 pgd_t *pgd;
84 pud_t *pud;
85 pmd_t *pmd;
86 pgd = (pgd_t *)page_address(page) + pgd_index(address);
87 pud = pud_offset(pgd, address);
88 pmd = pmd_offset(pud, address);
89 set_pte_atomic((pte_t *)pmd, pte);
90 }
91 spin_unlock_irqrestore(&pgd_lock, flags);
92}
93
94/*
95 * No more special protections in this 2/4MB area - revert to a
96 * large page again.
97 */
98static inline void revert_page(struct page *kpte_page, unsigned long address)
99{
100 pte_t *linear = (pte_t *)
101 pmd_offset(pud_offset(pgd_offset_k(address), address), address);
102 set_pmd_pte(linear, address,
103 pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
104 PAGE_KERNEL_LARGE));
105}
106
107static int
108__change_page_attr(struct page *page, pgprot_t prot)
109{
110 pte_t *kpte;
111 unsigned long address;
112 struct page *kpte_page;
113
114 BUG_ON(PageHighMem(page));
115 address = (unsigned long)page_address(page);
116
117 kpte = lookup_address(address);
118 if (!kpte)
119 return -EINVAL;
120 kpte_page = virt_to_page(kpte);
121 if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
122 if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
123 set_pte_atomic(kpte, mk_pte(page, prot));
124 } else {
125 struct page *split = split_large_page(address, prot);
126 if (!split)
127 return -ENOMEM;
128 set_pmd_pte(kpte,address,mk_pte(split, PAGE_KERNEL));
129 kpte_page = split;
130 }
131 get_page(kpte_page);
132 } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
133 set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
134 __put_page(kpte_page);
135 } else
136 BUG();
137
138 /*
139 * If the pte was reserved, it means it was created at boot
140 * time (not via split_large_page) and in turn we must not
141 * replace it with a largepage.
142	 * replace it with a large page.
143 if (!PageReserved(kpte_page)) {
144 /* memleak and potential failed 2M page regeneration */
145 BUG_ON(!page_count(kpte_page));
146
147 if (cpu_has_pse && (page_count(kpte_page) == 1)) {
148 list_add(&kpte_page->lru, &df_list);
149 revert_page(kpte_page, address);
150 }
151 }
152 return 0;
153}
154
155static inline void flush_map(void)
156{
157 on_each_cpu(flush_kernel_map, NULL, 1, 1);
158}
159
160/*
161 * Change the page attributes of a page in the linear mapping.
162 *
163 * This should be used when a page is mapped with a different caching policy
164 * than write-back somewhere - some CPUs do not like it when mappings with
165 * different caching policies exist. This changes the page attributes of the
166 * in kernel linear mapping too.
167 *
168 * The caller needs to ensure that there are no conflicting mappings elsewhere.
169 * This function only deals with the kernel linear map.
170 *
171 * Caller must call global_flush_tlb() after this.
172 */
173int change_page_attr(struct page *page, int numpages, pgprot_t prot)
174{
175 int err = 0;
176 int i;
177 unsigned long flags;
178
179 spin_lock_irqsave(&cpa_lock, flags);
180 for (i = 0; i < numpages; i++, page++) {
181 err = __change_page_attr(page, prot);
182 if (err)
183 break;
184 }
185 spin_unlock_irqrestore(&cpa_lock, flags);
186 return err;
187}
188
189void global_flush_tlb(void)
190{
191 LIST_HEAD(l);
192 struct page *pg, *next;
193
194 BUG_ON(irqs_disabled());
195
196 spin_lock_irq(&cpa_lock);
197 list_splice_init(&df_list, &l);
198 spin_unlock_irq(&cpa_lock);
199 flush_map();
200 list_for_each_entry_safe(pg, next, &l, lru)
201 __free_page(pg);
202}
203
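/*
 * A caller-side sketch of the contract described above change_page_attr():
 * change the attribute, then issue global_flush_tlb(). The helper names
 * are hypothetical.
 */
#if 0	/* usage sketch only */
static void example_make_uncached(struct page *page)
{
	if (change_page_attr(page, 1, PAGE_KERNEL_NOCACHE) == 0)
		global_flush_tlb();	/* required after change_page_attr() */
}

static void example_restore_cached(struct page *page)
{
	if (change_page_attr(page, 1, PAGE_KERNEL) == 0)
		global_flush_tlb();
}
#endif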
204#ifdef CONFIG_DEBUG_PAGEALLOC
205void kernel_map_pages(struct page *page, int numpages, int enable)
206{
207 if (PageHighMem(page))
208 return;
209 /* the return value is ignored - the calls cannot fail,
210 * large pages are disabled at boot time.
211 */
212 change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
213 /* we should perform an IPI and flush all tlbs,
214	 * but that can deadlock, so flush only the current CPU.
215 */
216 __flush_tlb_all();
217}
218#endif
219
220EXPORT_SYMBOL(change_page_attr);
221EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
new file mode 100644
index 000000000000..0742d54f8bb0
--- /dev/null
+++ b/arch/i386/mm/pgtable.c
@@ -0,0 +1,260 @@
1/*
2 * linux/arch/i386/mm/pgtable.c
3 */
4
5#include <linux/config.h>
6#include <linux/sched.h>
7#include <linux/kernel.h>
8#include <linux/errno.h>
9#include <linux/mm.h>
10#include <linux/swap.h>
11#include <linux/smp.h>
12#include <linux/highmem.h>
13#include <linux/slab.h>
14#include <linux/pagemap.h>
15#include <linux/spinlock.h>
16
17#include <asm/system.h>
18#include <asm/pgtable.h>
19#include <asm/pgalloc.h>
20#include <asm/fixmap.h>
21#include <asm/e820.h>
22#include <asm/tlb.h>
23#include <asm/tlbflush.h>
24
25void show_mem(void)
26{
27 int total = 0, reserved = 0;
28 int shared = 0, cached = 0;
29 int highmem = 0;
30 struct page *page;
31 pg_data_t *pgdat;
32 unsigned long i;
33
34 printk("Mem-info:\n");
35 show_free_areas();
36 printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
37 for_each_pgdat(pgdat) {
38 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
39 page = pgdat->node_mem_map + i;
40 total++;
41 if (PageHighMem(page))
42 highmem++;
43 if (PageReserved(page))
44 reserved++;
45 else if (PageSwapCache(page))
46 cached++;
47 else if (page_count(page))
48 shared += page_count(page) - 1;
49 }
50 }
51 printk("%d pages of RAM\n", total);
52 printk("%d pages of HIGHMEM\n",highmem);
53 printk("%d reserved pages\n",reserved);
54 printk("%d pages shared\n",shared);
55 printk("%d pages swap cached\n",cached);
56}
57
58/*
59 * Associate a virtual page frame with a given physical page frame
60 * and protection flags for that frame.
61 */
62static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
63{
64 pgd_t *pgd;
65 pud_t *pud;
66 pmd_t *pmd;
67 pte_t *pte;
68
69 pgd = swapper_pg_dir + pgd_index(vaddr);
70 if (pgd_none(*pgd)) {
71 BUG();
72 return;
73 }
74 pud = pud_offset(pgd, vaddr);
75 if (pud_none(*pud)) {
76 BUG();
77 return;
78 }
79 pmd = pmd_offset(pud, vaddr);
80 if (pmd_none(*pmd)) {
81 BUG();
82 return;
83 }
84 pte = pte_offset_kernel(pmd, vaddr);
85 /* <pfn,flags> stored as-is, to permit clearing entries */
86 set_pte(pte, pfn_pte(pfn, flags));
87
88 /*
89 * It's enough to flush this one mapping.
90 * (PGE mappings get flushed as well)
91 */
92 __flush_tlb_one(vaddr);
93}
94
95/*
96 * Associate a large virtual page frame with a given physical page frame
97 * and protection flags for that frame. pfn is for the base of the page,
98 * vaddr is what the page gets mapped to - both must be properly aligned.
99 * The pmd must already be instantiated. Assumes PAE mode.
100 */
101void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
102{
103 pgd_t *pgd;
104 pud_t *pud;
105 pmd_t *pmd;
106
107 if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
108 printk ("set_pmd_pfn: vaddr misaligned\n");
109 return; /* BUG(); */
110 }
111 if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
112 printk ("set_pmd_pfn: pfn misaligned\n");
113 return; /* BUG(); */
114 }
115 pgd = swapper_pg_dir + pgd_index(vaddr);
116 if (pgd_none(*pgd)) {
117 printk ("set_pmd_pfn: pgd_none\n");
118 return; /* BUG(); */
119 }
120 pud = pud_offset(pgd, vaddr);
121 pmd = pmd_offset(pud, vaddr);
122 set_pmd(pmd, pfn_pmd(pfn, flags));
123 /*
124 * It's enough to flush this one mapping.
125 * (PGE mappings get flushed as well)
126 */
127 __flush_tlb_one(vaddr);
128}
129
130void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
131{
132 unsigned long address = __fix_to_virt(idx);
133
134 if (idx >= __end_of_fixed_addresses) {
135 BUG();
136 return;
137 }
138 set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
139}
140
141pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
142{
143 return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
144}
145
146struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
147{
148 struct page *pte;
149
150#ifdef CONFIG_HIGHPTE
151 pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
152#else
153 pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
154#endif
155 return pte;
156}
157
158void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
159{
160 memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
161}
162
163/*
164 * List of all pgd's needed for non-PAE so it can invalidate entries
165 * in both cached and uncached pgd's; not needed for PAE since the
166 * kernel pmd is shared. If PAE were not to share the pmd a similar
167 * tactic would be needed. This is essentially codepath-based locking
168 * against pageattr.c; it is the unique case in which a valid change
169 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
170 * vmalloc faults work because attached pagetables are never freed.
171 * The locking scheme was chosen on the basis of manfred's
172 * recommendations and having no core impact whatsoever.
173 * -- wli
174 */
175DEFINE_SPINLOCK(pgd_lock);
176struct page *pgd_list;
177
178static inline void pgd_list_add(pgd_t *pgd)
179{
180 struct page *page = virt_to_page(pgd);
181 page->index = (unsigned long)pgd_list;
182 if (pgd_list)
183 pgd_list->private = (unsigned long)&page->index;
184 pgd_list = page;
185 page->private = (unsigned long)&pgd_list;
186}
187
188static inline void pgd_list_del(pgd_t *pgd)
189{
190 struct page *next, **pprev, *page = virt_to_page(pgd);
191 next = (struct page *)page->index;
192 pprev = (struct page **)page->private;
193 *pprev = next;
194 if (next)
195 next->private = (unsigned long)pprev;
196}
197
198void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
199{
200 unsigned long flags;
201
202 if (PTRS_PER_PMD == 1)
203 spin_lock_irqsave(&pgd_lock, flags);
204
205 memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD,
206 swapper_pg_dir + USER_PTRS_PER_PGD,
207 (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
208
209 if (PTRS_PER_PMD > 1)
210 return;
211
212 pgd_list_add(pgd);
213 spin_unlock_irqrestore(&pgd_lock, flags);
214 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
215}
216
217/* never called when PTRS_PER_PMD > 1 */
218void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
219{
220 unsigned long flags; /* can be called from interrupt context */
221
222 spin_lock_irqsave(&pgd_lock, flags);
223 pgd_list_del(pgd);
224 spin_unlock_irqrestore(&pgd_lock, flags);
225}
226
227pgd_t *pgd_alloc(struct mm_struct *mm)
228{
229 int i;
230 pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
231
232 if (PTRS_PER_PMD == 1 || !pgd)
233 return pgd;
234
235 for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
236 pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
237 if (!pmd)
238 goto out_oom;
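		/* the "+ 1" sets _PAGE_PRESENT; each pgd slot points at its pmd page */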
239 set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
240 }
241 return pgd;
242
243out_oom:
244 for (i--; i >= 0; i--)
245 kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
246 kmem_cache_free(pgd_cache, pgd);
247 return NULL;
248}
249
250void pgd_free(pgd_t *pgd)
251{
252 int i;
253
254 /* in the PAE case user pgd entries are overwritten before usage */
255 if (PTRS_PER_PMD > 1)
256 for (i = 0; i < USER_PTRS_PER_PGD; ++i)
257 kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
258 /* in the non-PAE case, clear_page_range() clears user pgd entries */
259 kmem_cache_free(pgd_cache, pgd);
260}