aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/mm/Makefile5
-rw-r--r--arch/x86/mm/Makefile_3210
-rw-r--r--arch/x86/mm/boot_ioremap_32.c100
-rw-r--r--arch/x86/mm/discontig_32.c431
-rw-r--r--arch/x86/mm/extable_32.c35
-rw-r--r--arch/x86/mm/fault_32.c657
-rw-r--r--arch/x86/mm/highmem_32.c113
-rw-r--r--arch/x86/mm/hugetlbpage.c391
-rw-r--r--arch/x86/mm/init_32.c858
-rw-r--r--arch/x86/mm/ioremap_32.c274
-rw-r--r--arch/x86/mm/mmap_32.c77
-rw-r--r--arch/x86/mm/pageattr_32.c278
-rw-r--r--arch/x86/mm/pgtable_32.c373
13 files changed, 3602 insertions, 0 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
new file mode 100644
index 000000000000..7317648e6587
--- /dev/null
+++ b/arch/x86/mm/Makefile
@@ -0,0 +1,5 @@
# Dispatch to the word-size-specific mm Makefile.  At this stage of the
# i386/x86_64 tree merge the 64-bit half still lives under arch/x86_64.
ifeq ($(CONFIG_X86_32),y)
include ${srctree}/arch/x86/mm/Makefile_32
else
include ${srctree}/arch/x86_64/mm/Makefile_64
endif
diff --git a/arch/x86/mm/Makefile_32 b/arch/x86/mm/Makefile_32
new file mode 100644
index 000000000000..362b4ad082de
--- /dev/null
+++ b/arch/x86/mm/Makefile_32
@@ -0,0 +1,10 @@
#
# Makefile for the linux i386-specific parts of the memory manager.
#

obj-y	:= init_32.o pgtable_32.o fault_32.o ioremap_32.o extable_32.o pageattr_32.o mmap_32.o

# Optional pieces, selected by their Kconfig options:
obj-$(CONFIG_NUMA) += discontig_32.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap_32.o
diff --git a/arch/x86/mm/boot_ioremap_32.c b/arch/x86/mm/boot_ioremap_32.c
new file mode 100644
index 000000000000..4de95a17a7d4
--- /dev/null
+++ b/arch/x86/mm/boot_ioremap_32.c
@@ -0,0 +1,100 @@
1/*
2 * arch/i386/mm/boot_ioremap.c
3 *
4 * Re-map functions for early boot-time before paging_init() when the
5 * boot-time pagetables are still in use
6 *
7 * Written by Dave Hansen <haveblue@us.ibm.com>
8 */
9
10
11/*
12 * We need to use the 2-level pagetable functions, but CONFIG_X86_PAE
13 * keeps that from happening. If anyone has a better way, I'm listening.
14 *
15 * boot_pte_t is defined only if this all works correctly
16 */
17
18#undef CONFIG_X86_PAE
19#undef CONFIG_PARAVIRT
20#include <asm/page.h>
21#include <asm/pgtable.h>
22#include <asm/tlbflush.h>
23#include <linux/init.h>
24#include <linux/stddef.h>
25
26/*
27 * I'm cheating here. It is known that the two boot PTE pages are
28 * allocated next to each other. I'm pretending that they're just
29 * one big array.
30 */
31
32#define BOOT_PTE_PTRS (PTRS_PER_PTE*2)
33
34static unsigned long boot_pte_index(unsigned long vaddr)
35{
36 return __pa(vaddr) >> PAGE_SHIFT;
37}
38
39static inline boot_pte_t* boot_vaddr_to_pte(void *address)
40{
41 boot_pte_t* boot_pg = (boot_pte_t*)pg0;
42 return &boot_pg[boot_pte_index((unsigned long)address)];
43}
44
45/*
46 * This is only for a caller who is clever enough to page-align
47 * phys_addr and virtual_source, and who also has a preference
48 * about which virtual address from which to steal ptes
49 */
50static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages,
51 void* virtual_source)
52{
53 boot_pte_t* pte;
54 int i;
55 char *vaddr = virtual_source;
56
57 pte = boot_vaddr_to_pte(virtual_source);
58 for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) {
59 set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL));
60 __flush_tlb_one(&vaddr[i*PAGE_SIZE]);
61 }
62}
63
64/* the virtual space we're going to remap comes from this array */
65#define BOOT_IOREMAP_PAGES 4
66#define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE)
67static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE]
68 __attribute__ ((aligned (PAGE_SIZE)));
69
70/*
71 * This only applies to things which need to ioremap before paging_init()
72 * bt_ioremap() and plain ioremap() are both useless at this point.
73 *
74 * When used, we're still using the boot-time pagetables, which only
75 * have 2 PTE pages mapping the first 8MB
76 *
77 * There is no unmap. The boot-time PTE pages aren't used after boot.
78 * If you really want the space back, just remap it yourself.
79 * boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE)
80 */
81__init void* boot_ioremap(unsigned long phys_addr, unsigned long size)
82{
83 unsigned long last_addr, offset;
84 unsigned int nrpages;
85
86 last_addr = phys_addr + size - 1;
87
88 /* page align the requested address */
89 offset = phys_addr & ~PAGE_MASK;
90 phys_addr &= PAGE_MASK;
91 size = PAGE_ALIGN(last_addr) - phys_addr;
92
93 nrpages = size >> PAGE_SHIFT;
94 if (nrpages > BOOT_IOREMAP_PAGES)
95 return NULL;
96
97 __boot_ioremap(phys_addr, nrpages, boot_ioremap_space);
98
99 return &boot_ioremap_space[offset];
100}
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
new file mode 100644
index 000000000000..860e912a3fbb
--- /dev/null
+++ b/arch/x86/mm/discontig_32.c
@@ -0,0 +1,431 @@
1/*
2 * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
3 * August 2002: added remote node KVA remap - Martin J. Bligh
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#include <linux/mm.h>
26#include <linux/bootmem.h>
27#include <linux/mmzone.h>
28#include <linux/highmem.h>
29#include <linux/initrd.h>
30#include <linux/nodemask.h>
31#include <linux/module.h>
32#include <linux/kexec.h>
33#include <linux/pfn.h>
34#include <linux/swap.h>
35
36#include <asm/e820.h>
37#include <asm/setup.h>
38#include <asm/mmzone.h>
39#include <bios_ebda.h>
40
41struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
42EXPORT_SYMBOL(node_data);
43bootmem_data_t node0_bdata;
44
45/*
46 * numa interface - we expect the numa architecture specific code to have
47 * populated the following initialisation.
48 *
49 * 1) node_online_map - the map of all nodes configured (online) in the system
50 * 2) node_start_pfn - the starting page frame number for a node
51 * 3) node_end_pfn - the ending page frame number for a node
52 */
53unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
54unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
55
56
57#ifdef CONFIG_DISCONTIGMEM
58/*
59 * 4) physnode_map - the mapping between a pfn and owning node
60 * physnode_map keeps track of the physical memory layout of a generic
61 * numa node on a 256Mb break (each element of the array will
62 * represent 256Mb of memory and will be marked by the node id. so,
63 * if the first gig is on node 0, and the second gig is on node 1
64 * physnode_map will contain:
65 *
66 * physnode_map[0-3] = 0;
67 * physnode_map[4-7] = 1;
68 * physnode_map[8- ] = -1;
69 */
70s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
71EXPORT_SYMBOL(physnode_map);
72
73void memory_present(int nid, unsigned long start, unsigned long end)
74{
75 unsigned long pfn;
76
77 printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n",
78 nid, start, end);
79 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
80 printk(KERN_DEBUG " ");
81 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
82 physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
83 printk("%ld ", pfn);
84 }
85 printk("\n");
86}
87
88unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
89 unsigned long end_pfn)
90{
91 unsigned long nr_pages = end_pfn - start_pfn;
92
93 if (!nr_pages)
94 return 0;
95
96 return (nr_pages + 1) * sizeof(struct page);
97}
98#endif
99
100extern unsigned long find_max_low_pfn(void);
101extern void add_one_highpage_init(struct page *, int, int);
102extern unsigned long highend_pfn, highstart_pfn;
103
104#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
105
106unsigned long node_remap_start_pfn[MAX_NUMNODES];
107unsigned long node_remap_size[MAX_NUMNODES];
108unsigned long node_remap_offset[MAX_NUMNODES];
109void *node_remap_start_vaddr[MAX_NUMNODES];
110void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
111
112void *node_remap_end_vaddr[MAX_NUMNODES];
113void *node_remap_alloc_vaddr[MAX_NUMNODES];
114static unsigned long kva_start_pfn;
115static unsigned long kva_pages;
116/*
117 * FLAT - support for basic PC memory model with discontig enabled, essentially
118 * a single node with all available processors in it with a flat
119 * memory map.
120 */
int __init get_memcfg_numa_flat(void)
{
	printk("NUMA - single node, flat memory mode\n");

	/* Run the memory configuration and find the top of memory. */
	find_max_pfn();
	/* All of memory [0, max_pfn) belongs to the single node 0 */
	node_start_pfn[0] = 0;
	node_end_pfn[0] = max_pfn;
	memory_present(0, 0, max_pfn);

	/* Indicate there is one node available. */
	nodes_clear(node_online_map);
	node_set_online(0);
	/* non-zero return = configuration succeeded */
	return 1;
}
136
137/*
138 * Find the highest page frame number we have available for the node
139 */
140static void __init find_max_pfn_node(int nid)
141{
142 if (node_end_pfn[nid] > max_pfn)
143 node_end_pfn[nid] = max_pfn;
144 /*
145 * if a user has given mem=XXXX, then we need to make sure
146 * that the node _starts_ before that, too, not just ends
147 */
148 if (node_start_pfn[nid] > max_pfn)
149 node_start_pfn[nid] = max_pfn;
150 BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
151}
152
153/*
154 * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
155 * method. For node zero take this from the bottom of memory, for
156 * subsequent nodes place them at node_remap_start_vaddr which contains
157 * node local data in physically node local memory. See setup_memory()
158 * for details.
159 */
static void __init allocate_pgdat(int nid)
{
	if (nid && node_has_online_mem(nid))
		/* Non-zero nodes with memory: the pgdat lives at the start
		 * of that node's KVA remap area (node-local memory). */
		NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
	else {
		/* Node 0 (or a memoryless node): carve the pgdat out of the
		 * bottom of low memory and bump min_low_pfn past it. */
		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
		min_low_pfn += PFN_UP(sizeof(pg_data_t));
	}
}
169
170void *alloc_remap(int nid, unsigned long size)
171{
172 void *allocation = node_remap_alloc_vaddr[nid];
173
174 size = ALIGN(size, L1_CACHE_BYTES);
175
176 if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
177 return 0;
178
179 node_remap_alloc_vaddr[nid] += size;
180 memset(allocation, 0, size);
181
182 return allocation;
183}
184
/*
 * Install the node-local KVA remap mappings: for every online node, map
 * its remap window with large (pmd-sized) pages pointing at the pfns
 * reserved by calculate_numa_remap_pages().
 */
void __init remap_numa_kva(void)
{
	void *vaddr;
	unsigned long pfn;
	int node;

	for_each_online_node(node) {
		/* node_remap_size[] is in PAGE_SIZE units, so advance one
		 * pmd's worth (PTRS_PER_PTE pages) per iteration */
		for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
			vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
			set_pmd_pfn((ulong) vaddr,
				node_remap_start_pfn[node] + pfn,
				PAGE_KERNEL_LARGE);
		}
	}
}
200
/*
 * Work out, per online node, how many pages of kernel virtual address
 * space must be reserved to remap that node's lmem_map (plus its pgdat)
 * into node-local memory.  Steals the pages from the top of each node
 * (shrinking node_end_pfn[]) and returns the total page count to
 * reserve.  Fills in node_remap_size[], node_remap_offset[] and
 * node_remap_start_pfn[] as a side effect.
 */
static unsigned long calculate_numa_remap_pages(void)
{
	int nid;
	unsigned long size, reserve_pages = 0;
	unsigned long pfn;

	for_each_online_node(nid) {
		unsigned old_end_pfn = node_end_pfn[nid];

		/*
		 * The acpi/srat node info can show hot-add memory zones
		 * where memory could be added but not currently present.
		 */
		if (node_start_pfn[nid] > max_pfn)
			continue;
		if (node_end_pfn[nid] > max_pfn)
			node_end_pfn[nid] = max_pfn;

		/* ensure the remap includes space for the pgdat. */
		size = node_remap_size[nid] + sizeof(pg_data_t);

		/* convert size to large (pmd size) pages, rounding up */
		size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
		/* now the roundup is correct, convert to PAGE_SIZE pages */
		size = size * PTRS_PER_PTE;

		/*
		 * Validate the region we are allocating only contains valid
		 * pages.
		 */
		for (pfn = node_end_pfn[nid] - size;
		     pfn < node_end_pfn[nid]; pfn++)
			if (!page_is_ram(pfn))
				break;

		/* the scan only stops early on a hole; if pfn did not reach
		 * the end, give up on remapping this node entirely */
		if (pfn != node_end_pfn[nid])
			size = 0;

		printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
				size, nid);
		node_remap_size[nid] = size;
		node_remap_offset[nid] = reserve_pages;
		reserve_pages += size;
		printk("Shrinking node %d from %ld pages to %ld pages\n",
			nid, node_end_pfn[nid], node_end_pfn[nid] - size);

		if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
			/*
			 * Align node_end_pfn[] and node_remap_start_pfn[] to
			 * pmd boundary. remap_numa_kva will barf otherwise.
			 */
			printk("Shrinking node %d further by %ld pages for proper alignment\n",
			       nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
			size += node_end_pfn[nid] & (PTRS_PER_PTE-1);
		}

		node_end_pfn[nid] -= size;
		node_remap_start_pfn[nid] = node_end_pfn[nid];
		shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]);
	}
	printk("Reserving total of %ld pages for numa KVA remap\n",
			reserve_pages);
	return reserve_pages;
}
265
266extern void setup_bootmem_allocator(void);
/*
 * Top-level NUMA memory setup for 32-bit: discovers the memory map,
 * sizes and places the per-node KVA remap areas below lowmem top (and
 * below the initrd if present), establishes high/low memory boundaries,
 * allocates per-node pgdats, and hands off to the bootmem allocator.
 * Returns max_low_pfn.
 */
unsigned long __init setup_memory(void)
{
	int nid;
	unsigned long system_start_pfn, system_max_low_pfn;

	/*
	 * When mapping a NUMA machine we allocate the node_mem_map arrays
	 * from node local memory.  They are then mapped directly into KVA
	 * between zone normal and vmalloc space.  Calculate the size of
	 * this space and use it to adjust the boundary between ZONE_NORMAL
	 * and ZONE_HIGHMEM.
	 */
	find_max_pfn();
	get_memcfg_numa();

	kva_pages = calculate_numa_remap_pages();

	/* partially used pages are not usable - thus round upwards */
	system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);

	kva_start_pfn = find_max_low_pfn() - kva_pages;

#ifdef CONFIG_BLK_DEV_INITRD
	/* Numa kva area is below the initrd */
	if (LOADER_TYPE && INITRD_START)
		kva_start_pfn = PFN_DOWN(INITRD_START) - kva_pages;
#endif
	/* round the KVA window start down to a pmd boundary */
	kva_start_pfn -= kva_start_pfn & (PTRS_PER_PTE-1);

	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
	printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
		kva_start_pfn, max_low_pfn);
	printk("max_pfn = %ld\n", max_pfn);
#ifdef CONFIG_HIGHMEM
	highstart_pfn = highend_pfn = max_pfn;
	if (max_pfn > system_max_low_pfn)
		highstart_pfn = system_max_low_pfn;
	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
	       pages_to_mb(highend_pfn - highstart_pfn));
	num_physpages = highend_pfn;
	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
	num_physpages = system_max_low_pfn;
	high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
			pages_to_mb(system_max_low_pfn));
	printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
			min_low_pfn, max_low_pfn, highstart_pfn);

	printk("Low memory ends at vaddr %08lx\n",
			(ulong) pfn_to_kaddr(max_low_pfn));
	for_each_online_node(nid) {
		node_remap_start_vaddr[nid] = pfn_to_kaddr(
				kva_start_pfn + node_remap_offset[nid]);
		/* Init the node remap allocator */
		node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
			(node_remap_size[nid] * PAGE_SIZE);
		node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
			ALIGN(sizeof(pg_data_t), PAGE_SIZE);

		allocate_pgdat(nid);
		printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
			(ulong) node_remap_start_vaddr[nid],
			(ulong) pfn_to_kaddr(highstart_pfn
			   + node_remap_offset[nid] + node_remap_size[nid]));
	}
	printk("High memory starts at vaddr %08lx\n",
			(ulong) pfn_to_kaddr(highstart_pfn));
	for_each_online_node(nid)
		find_max_pfn_node(nid);

	/* node 0's pgdat came from low memory: zero it and hook up bootmem */
	memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
	NODE_DATA(0)->bdata = &node0_bdata;
	setup_bootmem_allocator();
	return max_low_pfn;
}
344
/* Mark the NUMA KVA remap window as reserved with the bootmem
 * allocator so nothing else hands those pages out. */
void __init numa_kva_reserve(void)
{
	reserve_bootmem(PFN_PHYS(kva_start_pfn),PFN_PHYS(kva_pages));
}
349
350void __init zone_sizes_init(void)
351{
352 int nid;
353 unsigned long max_zone_pfns[MAX_NR_ZONES];
354 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
355 max_zone_pfns[ZONE_DMA] =
356 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
357 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
358#ifdef CONFIG_HIGHMEM
359 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
360#endif
361
362 /* If SRAT has not registered memory, register it now */
363 if (find_max_pfn_with_active_regions() == 0) {
364 for_each_online_node(nid) {
365 if (node_has_online_mem(nid))
366 add_active_range(nid, node_start_pfn[nid],
367 node_end_pfn[nid]);
368 }
369 }
370
371 free_area_init_nodes(max_zone_pfns);
372 return;
373}
374
375void __init set_highmem_pages_init(int bad_ppro)
376{
377#ifdef CONFIG_HIGHMEM
378 struct zone *zone;
379 struct page *page;
380
381 for_each_zone(zone) {
382 unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
383
384 if (!is_highmem(zone))
385 continue;
386
387 zone_start_pfn = zone->zone_start_pfn;
388 zone_end_pfn = zone_start_pfn + zone->spanned_pages;
389
390 printk("Initializing %s for node %d (%08lx:%08lx)\n",
391 zone->name, zone_to_nid(zone),
392 zone_start_pfn, zone_end_pfn);
393
394 for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
395 if (!pfn_valid(node_pfn))
396 continue;
397 page = pfn_to_page(node_pfn);
398 add_one_highpage_init(page, node_pfn, bad_ppro);
399 }
400 }
401 totalram_pages += totalhigh_pages;
402#endif
403}
404
405#ifdef CONFIG_MEMORY_HOTPLUG
406int paddr_to_nid(u64 addr)
407{
408 int nid;
409 unsigned long pfn = PFN_DOWN(addr);
410
411 for_each_node(nid)
412 if (node_start_pfn[nid] <= pfn &&
413 pfn < node_end_pfn[nid])
414 return nid;
415
416 return -1;
417}
418
419/*
420 * This function is used to ask node id BEFORE memmap and mem_section's
421 * initialization (pfn_to_nid() can't be used yet).
422 * If _PXM is not defined on ACPI's DSDT, node id must be found by this.
423 */
424int memory_add_physaddr_to_nid(u64 addr)
425{
426 int nid = paddr_to_nid(addr);
427 return (nid >= 0) ? nid : 0;
428}
429
430EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
431#endif
diff --git a/arch/x86/mm/extable_32.c b/arch/x86/mm/extable_32.c
new file mode 100644
index 000000000000..0ce4f22a2635
--- /dev/null
+++ b/arch/x86/mm/extable_32.c
@@ -0,0 +1,35 @@
1/*
2 * linux/arch/i386/mm/extable.c
3 */
4
5#include <linux/module.h>
6#include <linux/spinlock.h>
7#include <asm/uaccess.h>
8
/*
 * Attempt to recover from a kernel-mode fault by consulting the
 * exception tables.  Returns 1 after rewriting regs->eip with the
 * fixup address when an entry covers the faulting instruction,
 * 0 if the fault is not fixable.
 */
int fixup_exception(struct pt_regs *regs)
{
	const struct exception_table_entry *fixup;

#ifdef CONFIG_PNPBIOS
	if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs)))
	{
		extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
		extern u32 pnp_bios_is_utter_crap;
		pnp_bios_is_utter_crap = 1;
		printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
		/* Restore the stack/ip saved before the PNP BIOS call and
		 * jump back; this never returns to the faulting code. */
		__asm__ volatile(
			"movl %0, %%esp\n\t"
			"jmp *%1\n\t"
			: : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
		panic("do_trap: can't hit this");
	}
#endif

	fixup = search_exception_tables(regs->eip);
	if (fixup) {
		regs->eip = fixup->fixup;
		return 1;
	}

	return 0;
}
diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c
new file mode 100644
index 000000000000..fcb38e7f3543
--- /dev/null
+++ b/arch/x86/mm/fault_32.c
@@ -0,0 +1,657 @@
1/*
2 * linux/arch/i386/mm/fault.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 */
6
7#include <linux/signal.h>
8#include <linux/sched.h>
9#include <linux/kernel.h>
10#include <linux/errno.h>
11#include <linux/string.h>
12#include <linux/types.h>
13#include <linux/ptrace.h>
14#include <linux/mman.h>
15#include <linux/mm.h>
16#include <linux/smp.h>
17#include <linux/interrupt.h>
18#include <linux/init.h>
19#include <linux/tty.h>
20#include <linux/vt_kern.h> /* For unblank_screen() */
21#include <linux/highmem.h>
22#include <linux/bootmem.h> /* for max_low_pfn */
23#include <linux/vmalloc.h>
24#include <linux/module.h>
25#include <linux/kprobes.h>
26#include <linux/uaccess.h>
27#include <linux/kdebug.h>
28
29#include <asm/system.h>
30#include <asm/desc.h>
31#include <asm/segment.h>
32
33extern void die(const char *,struct pt_regs *,long);
34
35static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
36
/*
 * Register a callback on the page-fault notifier chain.
 * vmalloc_sync_all() first propagates all vmalloc mappings into every
 * pgd so the notifier itself cannot trigger a recursive vmalloc fault.
 */
int register_page_fault_notifier(struct notifier_block *nb)
{
	vmalloc_sync_all();
	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
}
EXPORT_SYMBOL_GPL(register_page_fault_notifier);
43
/* Remove a previously registered page-fault notifier. */
int unregister_page_fault_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
49
/* Run the page-fault notifier chain (e.g. kprobes); a NOTIFY_STOP
 * return means a handler consumed the fault. */
static inline int notify_page_fault(struct pt_regs *regs, long err)
{
	struct die_args args = {
		.regs = regs,
		.str = "page fault",
		.err = err,
		.trapnr = 14,	/* x86 #PF vector */
		.signr = SIGSEGV
	};
	return atomic_notifier_call_chain(&notify_page_fault_chain,
					  DIE_PAGE_FAULT, &args);
}
62
63/*
64 * Return EIP plus the CS segment base. The segment limit is also
65 * adjusted, clamped to the kernel/user address space (whichever is
66 * appropriate), and returned in *eip_limit.
67 *
68 * The segment is checked, because it might have been changed by another
69 * task between the original faulting instruction and here.
70 *
71 * If CS is no longer a valid code segment, or if EIP is beyond the
72 * limit, or if it is a kernel address when CS is not a kernel segment,
73 * then the returned value will be greater than *eip_limit.
74 *
75 * This is slow, but is very rarely executed.
76 */
static inline unsigned long get_segment_eip(struct pt_regs *regs,
					    unsigned long *eip_limit)
{
	unsigned long eip = regs->eip;
	unsigned seg = regs->xcs & 0xffff;
	u32 seg_ar, seg_limit, base, *desc;

	/* Unlikely, but must come before segment checks. */
	if (unlikely(regs->eflags & VM_MASK)) {
		/* vm86 mode: segment base is simply selector << 4,
		 * limit is the 16-bit real-mode 64K window */
		base = seg << 4;
		*eip_limit = base + 0xffff;
		return base + (eip & 0xffff);
	}

	/* The standard kernel/user address space limit. */
	*eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;

	/* By far the most common cases. */
	if (likely(SEGMENT_IS_FLAT_CODE(seg)))
		return eip;

	/* Check the segment exists, is within the current LDT/GDT size,
	   that kernel/user (ring 0..3) has the appropriate privilege,
	   that it's a code segment, and get the limit. */
	__asm__ ("larl %3,%0; lsll %3,%1"
		 : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
	if ((~seg_ar & 0x9800) || eip > seg_limit) {
		*eip_limit = 0;
		return 1;	 /* So that returned eip > *eip_limit. */
	}

	/* Get the GDT/LDT descriptor base.
	   When you look for races in this code remember that
	   LDT and other horrors are only used in user space. */
	if (seg & (1<<2)) {
		/* Must lock the LDT while reading it. */
		down(&current->mm->context.sem);
		desc = current->mm->context.ldt;
		desc = (void *)desc + (seg & ~7);
	} else {
		/* Must disable preemption while reading the GDT. */
		desc = (u32 *)get_cpu_gdt_table(get_cpu());
		desc = (void *)desc + (seg & ~7);
	}

	/* Decode the code segment base from the descriptor */
	base = get_desc_base((unsigned long *)desc);

	/* release whichever lock/cpu reference was taken above */
	if (seg & (1<<2)) {
		up(&current->mm->context.sem);
	} else
		put_cpu();

	/* Adjust EIP and segment limit, and clamp at the kernel limit.
	   It's legitimate for segments to wrap at 0xffffffff. */
	seg_limit += base;
	if (seg_limit < *eip_limit && seg_limit >= base)
		*eip_limit = seg_limit;
	return eip + base;
}
137
138/*
139 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
140 * Check that here and ignore it.
141 */
/*
 * Decode up to 15 bytes (the x86 maximum instruction length) starting
 * at the faulting EIP, skipping legal prefix bytes, to decide whether
 * the instruction is a prefetch (opcode 0x0F 0x0D or 0x0F 0x18).
 */
static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
{
	unsigned long limit;
	unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
	int scan_more = 1;
	int prefetch = 0;
	int i;

	for (i = 0; scan_more && i < 15; i++) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		/* stop at the segment limit or on an unreadable byte */
		if (instr > (unsigned char *)limit)
			break;
		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
			scan_more = ((instr_lo & 7) == 0x6);
			break;

		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;
			if (instr > (unsigned char *)limit)
				break;
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
196
197static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
198 unsigned long error_code)
199{
200 if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
201 boot_cpu_data.x86 >= 6)) {
202 /* Catch an obscure case of prefetch inside an NX page. */
203 if (nx_enabled && (error_code & 16))
204 return 0;
205 return __is_prefetch(regs, addr);
206 }
207 return 0;
208}
209
/* Build a minimal siginfo for a fault at @address and force-deliver
 * @si_signo to @tsk.  Only the four fields below are filled in; the
 * rest of the siginfo is deliberately left as-is. */
static noinline void force_sig_info_fault(int si_signo, int si_code,
	unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}
221
222fastcall void do_invalid_op(struct pt_regs *, unsigned long);
223
/*
 * Copy the kernel pmd entry covering @address from init_mm's reference
 * page table into @pgd's tree.  Returns the reference pmd on success,
 * NULL when init_mm has no mapping there yet.
 */
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		/* install the missing pmd entry in this pgd */
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		/* already present: it must agree with the reference table */
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
259
260/*
261 * Handle a fault on the vmalloc or module mapping area
262 *
263 * This assumes no large pages in there.
264 */
/*
 * Handle a fault on the vmalloc or module mapping area
 *
 * This assumes no large pages in there.
 *
 * Returns 0 when the fault was resolved by syncing from init_mm,
 * -1 when the address genuinely has no mapping.
 */
static inline int vmalloc_fault(unsigned long address)
{
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;
	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
}
286
287int show_unhandled_signals = 1;
288
289/*
290 * This routine handles page faults. It determines the address,
291 * and the problem, and then passes it off to one of the appropriate
292 * routines.
293 *
294 * error_code:
295 * bit 0 == 0 means no page found, 1 means protection fault
296 * bit 1 == 0 means read, 1 means write
297 * bit 2 == 0 means kernel, 1 means user-mode
298 * bit 3 == 1 means use of reserved bit detected
299 * bit 4 == 1 means fault was an instruction fetch
300 */
301fastcall void __kprobes do_page_fault(struct pt_regs *regs,
302 unsigned long error_code)
303{
304 struct task_struct *tsk;
305 struct mm_struct *mm;
306 struct vm_area_struct * vma;
307 unsigned long address;
308 int write, si_code;
309 int fault;
310
311 /* get the address */
312 address = read_cr2();
313
314 tsk = current;
315
316 si_code = SEGV_MAPERR;
317
318 /*
319 * We fault-in kernel-space virtual memory on-demand. The
320 * 'reference' page table is init_mm.pgd.
321 *
322 * NOTE! We MUST NOT take any locks for this case. We may
323 * be in an interrupt or a critical region, and should
324 * only copy the information from the master page table,
325 * nothing more.
326 *
327 * This verifies that the fault happens in kernel space
328 * (error_code & 4) == 0, and that the fault was not a
329 * protection error (error_code & 9) == 0.
330 */
331 if (unlikely(address >= TASK_SIZE)) {
332 if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
333 return;
334 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
335 return;
336 /*
337 * Don't take the mm semaphore here. If we fixup a prefetch
338 * fault we could otherwise deadlock.
339 */
340 goto bad_area_nosemaphore;
341 }
342
343 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
344 return;
345
346 /* It's safe to allow irq's after cr2 has been saved and the vmalloc
347 fault has been handled. */
348 if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
349 local_irq_enable();
350
351 mm = tsk->mm;
352
353 /*
354 * If we're in an interrupt, have no user context or are running in an
355 * atomic region then we must not take the fault..
356 */
357 if (in_atomic() || !mm)
358 goto bad_area_nosemaphore;
359
360 /* When running in the kernel we expect faults to occur only to
361 * addresses in user space. All other faults represent errors in the
362 * kernel and should generate an OOPS. Unfortunately, in the case of an
363 * erroneous fault occurring in a code path which already holds mmap_sem
364 * we will deadlock attempting to validate the fault against the
365 * address space. Luckily the kernel only validly references user
366 * space from well defined areas of code, which are listed in the
367 * exceptions table.
368 *
369 * As the vast majority of faults will be valid we will only perform
370 * the source reference check when there is a possibility of a deadlock.
371 * Attempt to lock the address space, if we cannot we then validate the
372 * source. If this is invalid we can skip the address space check,
373 * thus avoiding the deadlock.
374 */
375 if (!down_read_trylock(&mm->mmap_sem)) {
376 if ((error_code & 4) == 0 &&
377 !search_exception_tables(regs->eip))
378 goto bad_area_nosemaphore;
379 down_read(&mm->mmap_sem);
380 }
381
382 vma = find_vma(mm, address);
383 if (!vma)
384 goto bad_area;
385 if (vma->vm_start <= address)
386 goto good_area;
387 if (!(vma->vm_flags & VM_GROWSDOWN))
388 goto bad_area;
389 if (error_code & 4) {
390 /*
391 * Accessing the stack below %esp is always a bug.
392 * The large cushion allows instructions like enter
393 * and pusha to work. ("enter $65535,$31" pushes
394 * 32 pointers and then decrements %esp by 65535.)
395 */
396 if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
397 goto bad_area;
398 }
399 if (expand_stack(vma, address))
400 goto bad_area;
401/*
402 * Ok, we have a good vm_area for this memory access, so
403 * we can handle it..
404 */
405good_area:
406 si_code = SEGV_ACCERR;
407 write = 0;
408 switch (error_code & 3) {
409 default: /* 3: write, present */
410 /* fall through */
411 case 2: /* write, not present */
412 if (!(vma->vm_flags & VM_WRITE))
413 goto bad_area;
414 write++;
415 break;
416 case 1: /* read, present */
417 goto bad_area;
418 case 0: /* read, not present */
419 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
420 goto bad_area;
421 }
422
423 survive:
424 /*
425 * If for any reason at all we couldn't handle the fault,
426 * make sure we exit gracefully rather than endlessly redo
427 * the fault.
428 */
429 fault = handle_mm_fault(mm, vma, address, write);
430 if (unlikely(fault & VM_FAULT_ERROR)) {
431 if (fault & VM_FAULT_OOM)
432 goto out_of_memory;
433 else if (fault & VM_FAULT_SIGBUS)
434 goto do_sigbus;
435 BUG();
436 }
437 if (fault & VM_FAULT_MAJOR)
438 tsk->maj_flt++;
439 else
440 tsk->min_flt++;
441
442 /*
443 * Did it hit the DOS screen memory VA from vm86 mode?
444 */
445 if (regs->eflags & VM_MASK) {
446 unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
447 if (bit < 32)
448 tsk->thread.screen_bitmap |= 1 << bit;
449 }
450 up_read(&mm->mmap_sem);
451 return;
452
453/*
454 * Something tried to access memory that isn't in our memory map..
455 * Fix it, but check if it's kernel or user first..
456 */
457bad_area:
458 up_read(&mm->mmap_sem);
459
460bad_area_nosemaphore:
461 /* User mode accesses just cause a SIGSEGV */
462 if (error_code & 4) {
463 /*
464 * It's possible to have interrupts off here.
465 */
466 local_irq_enable();
467
468 /*
469 * Valid to do another page fault here because this one came
470 * from user space.
471 */
472 if (is_prefetch(regs, address, error_code))
473 return;
474
475 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
476 printk_ratelimit()) {
477 printk("%s%s[%d]: segfault at %08lx eip %08lx "
478 "esp %08lx error %lx\n",
479 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
480 tsk->comm, tsk->pid, address, regs->eip,
481 regs->esp, error_code);
482 }
483 tsk->thread.cr2 = address;
484 /* Kernel addresses are always protection faults */
485 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
486 tsk->thread.trap_no = 14;
487 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
488 return;
489 }
490
491#ifdef CONFIG_X86_F00F_BUG
492 /*
493 * Pentium F0 0F C7 C8 bug workaround.
494 */
495 if (boot_cpu_data.f00f_bug) {
496 unsigned long nr;
497
498 nr = (address - idt_descr.address) >> 3;
499
500 if (nr == 6) {
501 do_invalid_op(regs, 0);
502 return;
503 }
504 }
505#endif
506
507no_context:
508 /* Are we prepared to handle this kernel fault? */
509 if (fixup_exception(regs))
510 return;
511
512 /*
513 * Valid to do another page fault here, because if this fault
514 * had been triggered by is_prefetch fixup_exception would have
515 * handled it.
516 */
517 if (is_prefetch(regs, address, error_code))
518 return;
519
520/*
521 * Oops. The kernel tried to access some bad page. We'll have to
522 * terminate things with extreme prejudice.
523 */
524
525 bust_spinlocks(1);
526
527 if (oops_may_print()) {
528 __typeof__(pte_val(__pte(0))) page;
529
530#ifdef CONFIG_X86_PAE
531 if (error_code & 16) {
532 pte_t *pte = lookup_address(address);
533
534 if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
535 printk(KERN_CRIT "kernel tried to execute "
536 "NX-protected page - exploit attempt? "
537 "(uid: %d)\n", current->uid);
538 }
539#endif
540 if (address < PAGE_SIZE)
541 printk(KERN_ALERT "BUG: unable to handle kernel NULL "
542 "pointer dereference");
543 else
544 printk(KERN_ALERT "BUG: unable to handle kernel paging"
545 " request");
546 printk(" at virtual address %08lx\n",address);
547 printk(KERN_ALERT " printing eip:\n");
548 printk("%08lx\n", regs->eip);
549
550 page = read_cr3();
551 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
552#ifdef CONFIG_X86_PAE
553 printk(KERN_ALERT "*pdpt = %016Lx\n", page);
554 if ((page >> PAGE_SHIFT) < max_low_pfn
555 && page & _PAGE_PRESENT) {
556 page &= PAGE_MASK;
557 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
558 & (PTRS_PER_PMD - 1)];
559 printk(KERN_ALERT "*pde = %016Lx\n", page);
560 page &= ~_PAGE_NX;
561 }
562#else
563 printk(KERN_ALERT "*pde = %08lx\n", page);
564#endif
565
566 /*
567 * We must not directly access the pte in the highpte
568 * case if the page table is located in highmem.
569 * And let's rather not kmap-atomic the pte, just in case
570 * it's allocated already.
571 */
572 if ((page >> PAGE_SHIFT) < max_low_pfn
573 && (page & _PAGE_PRESENT)) {
574 page &= PAGE_MASK;
575 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
576 & (PTRS_PER_PTE - 1)];
577 printk(KERN_ALERT "*pte = %0*Lx\n", sizeof(page)*2, (u64)page);
578 }
579 }
580
581 tsk->thread.cr2 = address;
582 tsk->thread.trap_no = 14;
583 tsk->thread.error_code = error_code;
584 die("Oops", regs, error_code);
585 bust_spinlocks(0);
586 do_exit(SIGKILL);
587
588/*
589 * We ran out of memory, or some other thing happened to us that made
590 * us unable to handle the page fault gracefully.
591 */
592out_of_memory:
593 up_read(&mm->mmap_sem);
594 if (is_init(tsk)) {
595 yield();
596 down_read(&mm->mmap_sem);
597 goto survive;
598 }
599 printk("VM: killing process %s\n", tsk->comm);
600 if (error_code & 4)
601 do_exit(SIGKILL);
602 goto no_context;
603
604do_sigbus:
605 up_read(&mm->mmap_sem);
606
607 /* Kernel mode? Handle exceptions or die */
608 if (!(error_code & 4))
609 goto no_context;
610
611 /* User space => ok to do another page fault */
612 if (is_prefetch(regs, address, error_code))
613 return;
614
615 tsk->thread.cr2 = address;
616 tsk->thread.error_code = error_code;
617 tsk->thread.trap_no = 14;
618 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
619}
620
/*
 * Bring every process pgd's kernel (vmalloc-area) mappings in sync with
 * init_mm by copying any missing pmd entries into each pgd on pgd_list
 * (via vmalloc_sync_one).  No-op when the kernel pmd is shared.
 */
void vmalloc_sync_all(void)
{
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = TASK_SIZE;
	unsigned long address;

	if (SHARED_KERNEL_PMD)
		return;

	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
	/* Loop ends when address wraps below TASK_SIZE past the address-space top. */
	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			unsigned long flags;
			struct page *page;

			/* pgd_lock protects the pgd_list walk. */
			spin_lock_irqsave(&pgd_lock, flags);
			for (page = pgd_list; page; page =
					(struct page *)page->index)
				if (!vmalloc_sync_one(page_address(page),
								address)) {
					/*
					 * Presumably a sync failure can only
					 * occur on the list head — verify
					 * against vmalloc_sync_one().
					 */
					BUG_ON(page != pgd_list);
					break;
				}
			spin_unlock_irqrestore(&pgd_lock, flags);
			/* Mark in-sync only if every pgd was walked successfully. */
			if (!page)
				set_bit(pgd_index(address), insync);
		}
		if (address == start && test_bit(pgd_index(address), insync))
			start = address + PGDIR_SIZE;
	}
}
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
new file mode 100644
index 000000000000..1c3bf95f7356
--- /dev/null
+++ b/arch/x86/mm/highmem_32.c
@@ -0,0 +1,113 @@
1#include <linux/highmem.h>
2#include <linux/module.h>
3
/*
 * Map a page into kernel virtual address space.  Highmem pages go
 * through the persistent kmap pool; lowmem pages are already mapped.
 * May sleep, so this must not be called from atomic context.
 */
void *kmap(struct page *page)
{
	might_sleep();
	if (PageHighMem(page))
		return kmap_high(page);
	return page_address(page);
}
11
/*
 * Undo a kmap().  Only highmem pages have a pool entry to release;
 * calling this from interrupt context is a bug.
 */
void kunmap(struct page *page)
{
	if (in_interrupt())
		BUG();
	if (PageHighMem(page))
		kunmap_high(page);
}
20
21/*
22 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
23 * no global lock is needed and because the kmap code must perform a global TLB
24 * invalidation when the kmap pool wraps.
25 *
 * However when holding an atomic kmap it is not legal to sleep, so atomic
27 * kmaps are appropriate for short, tight code paths only.
28 */
/*
 * Atomically map a page with the given protection bits.  Uses a
 * per-CPU, per-type fixmap slot, so the caller may not sleep until the
 * matching kunmap_atomic().
 */
void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
{
	enum fixed_addresses idx;
	unsigned long vaddr;

	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
	pagefault_disable();

	/*
	 * Lowmem pages are permanently mapped; note that pagefaults remain
	 * disabled until kunmap_atomic() in this case too.
	 */
	if (!PageHighMem(page))
		return page_address(page);

	/* Each CPU owns a window of KM_TYPE_NR fixmap slots. */
	idx = type + KM_TYPE_NR*smp_processor_id();
	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
	/* The slot must be empty — the same km_type may not be nested. */
	BUG_ON(!pte_none(*(kmap_pte-idx)));
	set_pte(kmap_pte-idx, mk_pte(page, prot));
	arch_flush_lazy_mmu_mode();

	return (void *)vaddr;
}
48
/* Atomic kmap with the default kmap_prot protection bits. */
void *kmap_atomic(struct page *page, enum km_type type)
{
	return kmap_atomic_prot(page, type, kmap_prot);
}
53
/* Tear down an atomic kmap and re-enable pagefaults. */
void kunmap_atomic(void *kvaddr, enum km_type type)
{
	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
	enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();

	/*
	 * Force other mappings to Oops if they'll try to access this pte
	 * without first remap it. Keeping stale mappings around is a bad idea
	 * also, in case the page changes cacheability attributes or becomes
	 * a protected page in a hypervisor.
	 */
	if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
		kpte_clear_flush(kmap_pte-idx, vaddr);
	else {
		/*
		 * Not a fixmap address: kmap_atomic() returned a lowmem
		 * address directly, so there is no pte to clear.
		 */
#ifdef CONFIG_DEBUG_HIGHMEM
		BUG_ON(vaddr < PAGE_OFFSET);
		BUG_ON(vaddr >= (unsigned long)high_memory);
#endif
	}

	arch_flush_lazy_mmu_mode();
	/* Pairs with pagefault_disable() in kmap_atomic_prot(). */
	pagefault_enable();
}
77
78/* This is the same as kmap_atomic() but can map memory that doesn't
79 * have a struct page associated with it.
80 */
81void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
82{
83 enum fixed_addresses idx;
84 unsigned long vaddr;
85
86 pagefault_disable();
87
88 idx = type + KM_TYPE_NR*smp_processor_id();
89 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
90 set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
91 arch_flush_lazy_mmu_mode();
92
93 return (void*) vaddr;
94}
95
/*
 * Translate an address returned by kmap_atomic()/kmap_atomic_pfn()
 * back to its struct page.  Lowmem addresses translate directly;
 * fixmap addresses are resolved through the cached kmap ptes.
 */
struct page *kmap_atomic_to_page(void *ptr)
{
	unsigned long idx, vaddr = (unsigned long)ptr;
	pte_t *pte;

	if (vaddr < FIXADDR_START)
		return virt_to_page(ptr);

	idx = virt_to_fix(vaddr);
	pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
	return pte_page(*pte);
}
108
109EXPORT_SYMBOL(kmap);
110EXPORT_SYMBOL(kunmap);
111EXPORT_SYMBOL(kmap_atomic);
112EXPORT_SYMBOL(kunmap_atomic);
113EXPORT_SYMBOL(kmap_atomic_to_page);
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
new file mode 100644
index 000000000000..6c06d9c0488e
--- /dev/null
+++ b/arch/x86/mm/hugetlbpage.c
@@ -0,0 +1,391 @@
1/*
2 * IA-32 Huge TLB Page Support for Kernel.
3 *
4 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
5 */
6
7#include <linux/init.h>
8#include <linux/fs.h>
9#include <linux/mm.h>
10#include <linux/hugetlb.h>
11#include <linux/pagemap.h>
12#include <linux/slab.h>
13#include <linux/err.h>
14#include <linux/sysctl.h>
15#include <asm/mman.h>
16#include <asm/tlb.h>
17#include <asm/tlbflush.h>
18
/*
 * Decide whether svma could share a hugetlb pmd page with vma for the
 * mapping of file offset idx at address addr.  Returns the address in
 * svma corresponding to addr when sharing is possible, 0 otherwise.
 */
static unsigned long page_table_shareable(struct vm_area_struct *svma,
				struct vm_area_struct *vma,
				unsigned long addr, pgoff_t idx)
{
	/* Address in svma that maps the same file offset as addr in vma. */
	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
				svma->vm_start;
	unsigned long sbase = saddr & PUD_MASK;
	unsigned long s_end = sbase + PUD_SIZE;

	/*
	 * match the virtual addresses, permission and the alignment of the
	 * page table page.
	 */
	if (pmd_index(addr) != pmd_index(saddr) ||
	    vma->vm_flags != svma->vm_flags ||
	    sbase < svma->vm_start || svma->vm_end < s_end)
		return 0;

	return saddr;
}
39
40static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
41{
42 unsigned long base = addr & PUD_MASK;
43 unsigned long end = base + PUD_SIZE;
44
45 /*
46 * check on proper vm_flags and page table alignment
47 */
48 if (vma->vm_flags & VM_MAYSHARE &&
49 vma->vm_start <= base && end <= vma->vm_end)
50 return 1;
51 return 0;
52}
53
/*
 * search for a shareable pmd page for hugetlb.
 *
 * If another mapping of the same file already has a hugetlb pmd page
 * covering the PUD-aligned region around addr, take a reference on it
 * and install it in this mm's pud so the ptes are shared.  Silently
 * does nothing when sharing is not possible.
 */
static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
	struct vm_area_struct *vma = find_vma(mm, addr);
	struct address_space *mapping = vma->vm_file->f_mapping;
	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
			vma->vm_pgoff;
	struct prio_tree_iter iter;
	struct vm_area_struct *svma;
	unsigned long saddr;
	pte_t *spte = NULL;

	if (!vma_shareable(vma, addr))
		return;

	/* i_mmap_lock stabilizes the prio tree of the file's mappings. */
	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
		if (svma == vma)
			continue;

		saddr = page_table_shareable(svma, vma, addr, idx);
		if (saddr) {
			spte = huge_pte_offset(svma->vm_mm, saddr);
			if (spte) {
				/* Pin the pmd page before publishing it. */
				get_page(virt_to_page(spte));
				break;
			}
		}
	}

	if (!spte)
		goto out;

	spin_lock(&mm->page_table_lock);
	if (pud_none(*pud))
		pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK);
	else
		/* pud got populated meanwhile; drop our extra reference. */
		put_page(virt_to_page(spte));
	spin_unlock(&mm->page_table_lock);
out:
	spin_unlock(&mapping->i_mmap_lock);
}
98
99/*
100 * unmap huge page backed by shared pte.
101 *
102 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
103 * indicated by page_count > 1, unmap is achieved by clearing pud and
104 * decrementing the ref count. If count == 1, the pte page is not shared.
105 *
106 * called with vma->vm_mm->page_table_lock held.
107 *
108 * returns: 1 successfully unmapped a shared pte page
109 * 0 the underlying pte page is not shared, or it is the last user
110 */
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	pgd_t *pgd = pgd_offset(mm, *addr);
	pud_t *pud = pud_offset(pgd, *addr);

	BUG_ON(page_count(virt_to_page(ptep)) == 0);
	/* count == 1: page not shared (or we are the last user). */
	if (page_count(virt_to_page(ptep)) == 1)
		return 0;

	/* Detach this mm's reference to the shared pmd page. */
	pud_clear(pud);
	put_page(virt_to_page(ptep));
	/*
	 * Advance the caller's cursor to the end of the range covered by
	 * this pmd page (minus one huge page — presumably to offset the
	 * caller's own loop increment; verify against callers).
	 */
	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
	return 1;
}
125
/*
 * Find or allocate the pmd-level entry serving as the huge pte for
 * addr, first trying to share an existing pmd page with another mapping
 * of the same file (huge_pmd_share).  Returns NULL on allocation
 * failure.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pte_t *pte = NULL;

	pgd = pgd_offset(mm, addr);
	pud = pud_alloc(mm, pgd, addr);
	if (pud) {
		if (pud_none(*pud))
			huge_pmd_share(mm, addr, pud);
		pte = (pte_t *) pmd_alloc(mm, pud, addr);
	}
	/* Any existing entry must be empty or huge — never a normal pmd. */
	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));

	return pte;
}
143
144pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
145{
146 pgd_t *pgd;
147 pud_t *pud;
148 pmd_t *pmd = NULL;
149
150 pgd = pgd_offset(mm, addr);
151 if (pgd_present(*pgd)) {
152 pud = pud_offset(pgd, addr);
153 if (pud_present(*pud))
154 pmd = pmd_offset(pud, addr);
155 }
156 return (pte_t *) pmd;
157}
158
#if 0	/* This is just for testing */
/*
 * NOTE(review): this testing branch does not compile as written — it
 * references 'addr', 'pte' and 'vpfn' which are never declared.  It is
 * dead code kept behind #if 0 for debugging; the live implementations
 * are in the #else branch below.
 */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	unsigned long start = address;
	int length = 1;
	int nr;
	struct page *page;
	struct vm_area_struct *vma;

	vma = find_vma(mm, addr);
	if (!vma || !is_vm_hugetlb_page(vma))
		return ERR_PTR(-EINVAL);

	pte = huge_pte_offset(mm, address);

	/* hugetlb should be locked, and hence, prefaulted */
	WARN_ON(!pte || pte_none(*pte));

	page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];

	WARN_ON(!PageCompound(page));

	return page;
}

int pmd_huge(pmd_t pmd)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	return NULL;
}

#else

/* Stub: always reports "no huge page at this address". */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	return ERR_PTR(-EINVAL);
}

/* A pmd maps a huge page iff its PSE (page size) bit is set. */
int pmd_huge(pmd_t pmd)
{
	return !!(pmd_val(pmd) & _PAGE_PSE);
}

/* Return the struct page of the 4k subpage within a huge pmd mapping. */
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	struct page *page;

	page = pte_page(*(pte_t *)pmd);
	if (page)
		page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
	return page;
}
#endif
222
223/* x86_64 also uses this file */
224
225#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
/*
 * Bottom-up search for a free, HPAGE_SIZE-aligned region of length len,
 * starting at the cached free_area_cache hint.  Returns the address or
 * -ENOMEM.
 */
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
		unsigned long addr, unsigned long len,
		unsigned long pgoff, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long start_addr;

	/* Use the cached hint only if the largest known hole could fit us. */
	if (len > mm->cached_hole_size) {
		start_addr = mm->free_area_cache;
	} else {
		start_addr = TASK_UNMAPPED_BASE;
		mm->cached_hole_size = 0;
	}

full_search:
	addr = ALIGN(start_addr, HPAGE_SIZE);

	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
		/* At this point: (!vma || addr < vma->vm_end). */
		if (TASK_SIZE - len < addr) {
			/*
			 * Start a new search - just in case we missed
			 * some holes.
			 */
			if (start_addr != TASK_UNMAPPED_BASE) {
				start_addr = TASK_UNMAPPED_BASE;
				mm->cached_hole_size = 0;
				goto full_search;
			}
			return -ENOMEM;
		}
		if (!vma || addr + len <= vma->vm_start) {
			/* Found a hole; remember where the next search starts. */
			mm->free_area_cache = addr + len;
			return addr;
		}
		/* Track the largest hole seen so far for later callers. */
		if (addr + mm->cached_hole_size < vma->vm_start)
			mm->cached_hole_size = vma->vm_start - addr;
		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
	}
}
267
/*
 * Top-down search below mm->mmap_base for a free, HPAGE_SIZE-aligned
 * region of length len.  Retries once from the base, then falls back to
 * the bottom-up allocator if no hole is found.
 */
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
		unsigned long addr0, unsigned long len,
		unsigned long pgoff, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev_vma;
	unsigned long base = mm->mmap_base, addr = addr0;
	unsigned long largest_hole = mm->cached_hole_size;
	int first_time = 1;

	/* don't allow allocations above current base */
	if (mm->free_area_cache > base)
		mm->free_area_cache = base;

	/* Known holes are all too small: restart the scan from the base. */
	if (len <= largest_hole) {
		largest_hole = 0;
		mm->free_area_cache = base;
	}
try_again:
	/* make sure it can fit in the remaining address space */
	if (mm->free_area_cache < len)
		goto fail;

	/* either no address requested or cant fit in requested address hole */
	addr = (mm->free_area_cache - len) & HPAGE_MASK;
	do {
		/*
		 * Lookup failure means no vma is above this address,
		 * i.e. return with success:
		 */
		if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
			return addr;

		/*
		 * new region fits between prev_vma->vm_end and
		 * vma->vm_start, use it:
		 */
		if (addr + len <= vma->vm_start &&
		    (!prev_vma || (addr >= prev_vma->vm_end))) {
			/* remember the address as a hint for next time */
			mm->cached_hole_size = largest_hole;
			return (mm->free_area_cache = addr);
		} else {
			/* pull free_area_cache down to the first hole */
			if (mm->free_area_cache == vma->vm_end) {
				mm->free_area_cache = vma->vm_start;
				mm->cached_hole_size = largest_hole;
			}
		}

		/* remember the largest hole we saw so far */
		if (addr + largest_hole < vma->vm_start)
			largest_hole = vma->vm_start - addr;

		/* try just below the current vma->vm_start */
		addr = (vma->vm_start - len) & HPAGE_MASK;
	} while (len <= vma->vm_start);

fail:
	/*
	 * if hint left us with no space for the requested
	 * mapping then try again:
	 */
	if (first_time) {
		mm->free_area_cache = base;
		largest_hole = 0;
		first_time = 0;
		goto try_again;
	}
	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	mm->free_area_cache = TASK_UNMAPPED_BASE;
	mm->cached_hole_size = ~0UL;
	addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
			len, pgoff, flags);

	/*
	 * Restore the topdown base:
	 */
	mm->free_area_cache = base;
	mm->cached_hole_size = ~0UL;

	return addr;
}
356
/*
 * Arch hook for placing hugetlb mappings: validate len, honour
 * MAP_FIXED and an explicit hint, then use the bottom-up or top-down
 * allocator matching the mm's layout.
 */
unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	/* Length must be a whole number of huge pages. */
	if (len & ~HPAGE_MASK)
		return -EINVAL;
	if (len > TASK_SIZE)
		return -ENOMEM;

	if (flags & MAP_FIXED) {
		if (prepare_hugepage_range(addr, len))
			return -EINVAL;
		return addr;
	}

	/* Try the caller's hint first, rounded up to a huge page. */
	if (addr) {
		addr = ALIGN(addr, HPAGE_SIZE);
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start))
			return addr;
	}
	if (mm->get_unmapped_area == arch_get_unmapped_area)
		return hugetlb_get_unmapped_area_bottomup(file, addr, len,
				pgoff, flags);
	else
		return hugetlb_get_unmapped_area_topdown(file, addr, len,
				pgoff, flags);
}
389
390#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
391
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
new file mode 100644
index 000000000000..730a5b177b1f
--- /dev/null
+++ b/arch/x86/mm/init_32.c
@@ -0,0 +1,858 @@
1/*
2 * linux/arch/i386/mm/init.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
7 */
8
9#include <linux/module.h>
10#include <linux/signal.h>
11#include <linux/sched.h>
12#include <linux/kernel.h>
13#include <linux/errno.h>
14#include <linux/string.h>
15#include <linux/types.h>
16#include <linux/ptrace.h>
17#include <linux/mman.h>
18#include <linux/mm.h>
19#include <linux/hugetlb.h>
20#include <linux/swap.h>
21#include <linux/smp.h>
22#include <linux/init.h>
23#include <linux/highmem.h>
24#include <linux/pagemap.h>
25#include <linux/pfn.h>
26#include <linux/poison.h>
27#include <linux/bootmem.h>
28#include <linux/slab.h>
29#include <linux/proc_fs.h>
30#include <linux/efi.h>
31#include <linux/memory_hotplug.h>
32#include <linux/initrd.h>
33#include <linux/cpumask.h>
34
35#include <asm/processor.h>
36#include <asm/system.h>
37#include <asm/uaccess.h>
38#include <asm/pgtable.h>
39#include <asm/dma.h>
40#include <asm/fixmap.h>
41#include <asm/e820.h>
42#include <asm/apic.h>
43#include <asm/tlb.h>
44#include <asm/tlbflush.h>
45#include <asm/sections.h>
46#include <asm/paravirt.h>
47
48unsigned int __VMALLOC_RESERVE = 128 << 20;
49
50DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
51unsigned long highstart_pfn, highend_pfn;
52
53static int noinline do_test_wp_bit(void);
54
55/*
56 * Creates a middle page table and puts a pointer to it in the
57 * given global directory entry. This only returns the gd entry
58 * in non-PAE compilation mode, since the middle layer is folded.
59 */
static pmd_t * __init one_md_table_init(pgd_t *pgd)
{
	pud_t *pud;
	pmd_t *pmd_table;

#ifdef CONFIG_X86_PAE
	if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
		pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);

		/* Notify the paravirt backend before installing the new pmd. */
		paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
		set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
		pud = pud_offset(pgd, 0);
		/* Sanity check: the walk must find the table we installed. */
		if (pmd_table != pmd_offset(pud, 0))
			BUG();
	}
#endif
	/* With the pud/pmd levels folded this just re-derives the pgd entry. */
	pud = pud_offset(pgd, 0);
	pmd_table = pmd_offset(pud, 0);
	return pmd_table;
}
80
81/*
82 * Create a page table and place a pointer to it in a middle page
83 * directory entry.
84 */
static pte_t * __init one_page_table_init(pmd_t *pmd)
{
	if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
		pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);

		/* Notify the paravirt backend before installing the new pt. */
		paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
		/* The walk must find the table we just installed. */
		BUG_ON(page_table != pte_offset_kernel(pmd, 0));
	}

	return pte_offset_kernel(pmd, 0);
}
97
98/*
99 * This function initializes a certain range of kernel virtual memory
100 * with new bootmem page tables, everywhere page tables are missing in
101 * the given range.
102 */
103
104/*
105 * NOTE: The pagetables are allocated contiguous on the physical space
106 * so we can cache the place of the first one and move around without
107 * checking the pgd every time.
108 */
static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
{
	pgd_t *pgd;
	pmd_t *pmd;
	int pgd_idx, pmd_idx;
	unsigned long vaddr;

	/*
	 * NOTE(review): the loops terminate on (vaddr != end), which
	 * assumes start and end are PMD-aligned — confirm at call sites.
	 */
	vaddr = start;
	pgd_idx = pgd_index(vaddr);
	pmd_idx = pmd_index(vaddr);
	pgd = pgd_base + pgd_idx;

	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
		pmd = one_md_table_init(pgd);
		pmd = pmd + pmd_index(vaddr);
		/* pmd_idx carries over only for the first pgd; reset below. */
		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
			one_page_table_init(pmd);

			vaddr += PMD_SIZE;
		}
		pmd_idx = 0;
	}
}
132
133static inline int is_kernel_text(unsigned long addr)
134{
135 if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
136 return 1;
137 return 0;
138}
139
140/*
141 * This maps the physical memory to kernel virtual address space, a total
142 * of max_low_pfn pages, by creating page tables starting from address
143 * PAGE_OFFSET.
144 */
static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
{
	unsigned long pfn;
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte;
	int pgd_idx, pmd_idx, pte_ofs;

	pgd_idx = pgd_index(PAGE_OFFSET);
	pgd = pgd_base + pgd_idx;
	pfn = 0;

	for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
		pmd = one_md_table_init(pgd);
		/*
		 * Past max_low_pfn we keep allocating pmd tables (continue,
		 * not break) but map nothing into them.
		 */
		if (pfn >= max_low_pfn)
			continue;
		for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
			unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;

			/* Map with big pages if possible, otherwise create normal page tables. */
			if (cpu_has_pse) {
				/* address2: last byte the large page would cover. */
				unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
				/* Ranges touching kernel text must stay executable. */
				if (is_kernel_text(address) || is_kernel_text(address2))
					set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
				else
					set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));

				pfn += PTRS_PER_PTE;
			} else {
				pte = one_page_table_init(pmd);

				for (pte_ofs = 0;
				     pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
				     pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
					if (is_kernel_text(address))
						set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
					else
						set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
				}
			}
		}
	}
}
188
/*
 * True for page frames 0x70000-0x7003F — the range the "ppro" (name
 * suggests Pentium Pro; confirm against the erratum) workaround must
 * keep reserved.
 */
static inline int page_kills_ppro(unsigned long pagenr)
{
	return pagenr >= 0x70000 && pagenr <= 0x7003F;
}
195
/*
 * Return 1 if page frame 'pagenr' is usable RAM according to the
 * firmware memory map (EFI if present, else e820), 0 otherwise.
 */
int page_is_ram(unsigned long pagenr)
{
	int i;
	unsigned long addr, end;

	if (efi_enabled) {
		efi_memory_desc_t *md;
		void *p;

		for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
			md = p;
			if (!is_available_memory(md))
				continue;
			/* Round the region inward to whole page frames. */
			addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
			end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;

			if ((pagenr >= addr) && (pagenr < end))
				return 1;
		}
		return 0;
	}

	for (i = 0; i < e820.nr_map; i++) {

		if (e820.map[i].type != E820_RAM)	/* not usable memory */
			continue;
		/*
		 * !!!FIXME!!! Some BIOSen report areas as RAM that
		 * are not. Notably the 640->1Mb area. We need a sanity
		 * check here.
		 */
		addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
		end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
		if ((pagenr >= addr) && (pagenr < end))
			return 1;
	}
	return 0;
}
234
235#ifdef CONFIG_HIGHMEM
236pte_t *kmap_pte;
237pgprot_t kmap_prot;
238
239#define kmap_get_fixmap_pte(vaddr) \
240 pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
241
/* Cache the first kmap fixmap pte and the default kmap protection. */
static void __init kmap_init(void)
{
	unsigned long kmap_vstart;

	/* cache the first kmap pte */
	kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
	kmap_pte = kmap_get_fixmap_pte(kmap_vstart);

	kmap_prot = PAGE_KERNEL;
}
252
/*
 * Build the page tables for the persistent kmap window at PKMAP_BASE
 * and cache its first pte in pkmap_page_table for the highmem code.
 */
static void __init permanent_kmaps_init(pgd_t *pgd_base)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	unsigned long vaddr;

	vaddr = PKMAP_BASE;
	page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);

	pgd = swapper_pg_dir + pgd_index(vaddr);
	pud = pud_offset(pgd, vaddr);
	pmd = pmd_offset(pud, vaddr);
	pte = pte_offset_kernel(pmd, vaddr);
	pkmap_page_table = pte;
}
270
/* Hand a newly discovered highmem page to the page allocator. */
static void __meminit free_new_highpage(struct page *page)
{
	init_page_count(page);
	__free_page(page);
	totalhigh_pages++;
}
277
278void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
279{
280 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
281 ClearPageReserved(page);
282 free_new_highpage(page);
283 } else
284 SetPageReserved(page);
285}
286
/*
 * Add one hot-plugged highmem page to the allocator and update the
 * global page accounting.  Always returns 0.
 */
static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
{
	free_new_highpage(page);
	totalram_pages++;
#ifdef CONFIG_FLATMEM
	max_mapnr = max(pfn, max_mapnr);
#endif
	num_physpages++;
	return 0;
}
297
298/*
299 * Not currently handling the NUMA case.
300 * Assuming single node and all memory that
301 * has been added dynamically that would be
302 * onlined here is in HIGHMEM
303 */
/* Memory-hotplug callback: make a newly onlined page available. */
void __meminit online_page(struct page *page)
{
	ClearPageReserved(page);
	add_one_highpage_hotplug(page, page_to_pfn(page));
}
309
310
#ifdef CONFIG_NUMA
extern void set_highmem_pages_init(int);
#else
/* Release all highmem page frames to the allocator at boot. */
static void __init set_highmem_pages_init(int bad_ppro)
{
	int pfn;
	for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
		add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
	totalram_pages += totalhigh_pages;
}
#endif /* CONFIG_NUMA */
322
323#else
324#define kmap_init() do { } while (0)
325#define permanent_kmaps_init(pgd_base) do { } while (0)
326#define set_highmem_pages_init(bad_ppro) do { } while (0)
327#endif /* CONFIG_HIGHMEM */
328
329unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
330EXPORT_SYMBOL(__PAGE_KERNEL);
331unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
332
333#ifdef CONFIG_NUMA
334extern void __init remap_numa_kva(void);
335#else
336#define remap_numa_kva() do {} while (0)
337#endif
338
/* Prepare the boot pgd for pagetable construction on native hardware. */
void __init native_pagetable_setup_start(pgd_t *base)
{
#ifdef CONFIG_X86_PAE
	int i;

	/*
	 * Init entries of the first-level page table to the
	 * zero page, if they haven't already been set up.
	 *
	 * In a normal native boot, we'll be running on a
	 * pagetable rooted in swapper_pg_dir, but not in PAE
	 * mode, so this will end up clobbering the mappings
	 * for the lower 24Mbytes of the address space,
	 * without affecting the kernel address space.
	 */
	for (i = 0; i < USER_PTRS_PER_PGD; i++)
		set_pgd(&base[i],
			__pgd(__pa(empty_zero_page) | _PAGE_PRESENT));

	/* Make sure kernel address space is empty so that a pagetable
	   will be allocated for it. */
	memset(&base[USER_PTRS_PER_PGD], 0,
	       KERNEL_PGD_PTRS * sizeof(pgd_t));
#else
	paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
#endif
}
366
/* Finish pagetable setup; counterpart of native_pagetable_setup_start(). */
void __init native_pagetable_setup_done(pgd_t *base)
{
#ifdef CONFIG_X86_PAE
	/*
	 * Add low memory identity-mappings - SMP needs it when
	 * starting up on an AP from real-mode. In the non-PAE
	 * case we already have these mappings through head.S.
	 * All user-space mappings are explicitly cleared after
	 * SMP startup.
	 */
	set_pgd(&base[0], base[USER_PTRS_PER_PGD]);
#endif
}
380
381/*
382 * Build a proper pagetable for the kernel mappings. Up until this
383 * point, we've been running on some set of pagetables constructed by
384 * the boot process.
385 *
386 * If we're booting on native hardware, this will be a pagetable
387 * constructed in arch/i386/kernel/head.S, and not running in PAE mode
388 * (even if we'll end up running in PAE). The root of the pagetable
389 * will be swapper_pg_dir.
390 *
391 * If we're booting paravirtualized under a hypervisor, then there are
392 * more options: we may already be running PAE, and the pagetable may
393 * or may not be based in swapper_pg_dir. In any case,
394 * paravirt_pagetable_setup_start() will set up swapper_pg_dir
395 * appropriately for the rest of the initialization to work.
396 *
397 * In general, pagetable_init() assumes that the pagetable may already
398 * be partially populated, and so it avoids stomping on any existing
399 * mappings.
400 */
static void __init pagetable_init (void)
{
	unsigned long vaddr, end;
	pgd_t *pgd_base = swapper_pg_dir;

	paravirt_pagetable_setup_start(pgd_base);

	/* Enable PSE if available */
	if (cpu_has_pse)
		set_in_cr4(X86_CR4_PSE);

	/* Enable PGE if available */
	if (cpu_has_pge) {
		set_in_cr4(X86_CR4_PGE);
		/* _PAGE_GLOBAL: kernel mappings are kept across CR3 reloads. */
		__PAGE_KERNEL |= _PAGE_GLOBAL;
		__PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
	}

	kernel_physical_mapping_init(pgd_base);
	remap_numa_kva();

	/*
	 * Fixed mappings, only the page table structure has to be
	 * created - mappings will be set by set_fixmap():
	 */
	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
	page_table_range_init(vaddr, end, pgd_base);

	permanent_kmaps_init(pgd_base);

	paravirt_pagetable_setup_done(pgd_base);
}
434
#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI)
/*
 * Swap suspend & friends need this for resume because things like the intel-agp
 * driver might have split up a kernel 4MB mapping.
 */
char __nosavedata swsusp_pg_dir[PAGE_SIZE]
	__attribute__ ((aligned (PAGE_SIZE)));

/* Preserve a copy of the kernel page directory for the resume path. */
static inline void save_pg_dir(void)
{
	memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
}
#else
/* No suspend/resume support configured - nothing to save. */
static inline void save_pg_dir(void)
{
}
#endif
452
/*
 * Remove the low identity mappings that were only needed for booting
 * (e.g. real-mode SMP trampolines).  The page directory is saved
 * first so suspend/resume can restore it.
 */
void zap_low_mappings (void)
{
	int i;

	save_pg_dir();

	/*
	 * Zap initial low-memory mappings.
	 *
	 * Note that "pgd_clear()" doesn't do it for
	 * us, because pgd_clear() is a no-op on i386.
	 */
	for (i = 0; i < USER_PTRS_PER_PGD; i++)
#ifdef CONFIG_X86_PAE
		/* PAE: the PDPTE must stay present; point it at the zero page. */
		set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
#else
		set_pgd(swapper_pg_dir+i, __pgd(0));
#endif
	flush_tlb_all();
}
473
/* Non-zero once the NX (no-execute) feature has been enabled in EFER. */
int nx_enabled = 0;

#ifdef CONFIG_X86_PAE

static int disable_nx __initdata = 0;
/* PTE bits supported by this CPU; _PAGE_NX is added by set_nx(). */
u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
EXPORT_SYMBOL_GPL(__supported_pte_mask);

/*
 * noexec = on|off
 *
 * Control non executable mappings.
 *
 * on      Enable (default when the CPU supports NX)
 * off     Disable
 */
static int __init noexec_setup(char *str)
{
	if (!str || !strcmp(str, "on")) {
		if (cpu_has_nx) {
			__supported_pte_mask |= _PAGE_NX;
			disable_nx = 0;
		}
	} else if (!strcmp(str,"off")) {
		disable_nx = 1;
		__supported_pte_mask &= ~_PAGE_NX;
	} else
		return -EINVAL;

	return 0;
}
early_param("noexec", noexec_setup);

/*
 * Enable the NX feature in EFER when the CPU advertises it
 * (CPUID 0x80000001, EDX bit 20) and it was not disabled via the
 * "noexec=off" command line option.
 */
static void __init set_nx(void)
{
	unsigned int v[4], l, h;

	if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
		cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
		if ((v[3] & (1 << 20)) && !disable_nx) {
			rdmsr(MSR_EFER, l, h);
			l |= EFER_NX;
			wrmsr(MSR_EFER, l, h);
			nx_enabled = 1;
			__supported_pte_mask |= _PAGE_NX;
		}
	}
}

/*
 * Enables/disables executability of a given kernel page and
 * returns the previous setting.
 */
int __init set_kernel_exec(unsigned long vaddr, int enable)
{
	pte_t *pte;
	int ret = 1;

	if (!nx_enabled)
		goto out;

	pte = lookup_address(vaddr);
	BUG_ON(!pte);

	if (!pte_exec_kernel(*pte))
		ret = 0;

	/* The NX bit lives in the high word of the 64-bit PAE pte. */
	if (enable)
		pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
	else
		pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
	pte_update_defer(&init_mm, vaddr, pte);
	__flush_tlb_all();
out:
	return ret;
}

#endif
552
/*
 * paging_init() sets up the page tables - note that the first 8MB are
 * already mapped by head.S.
 *
 * This routine also unmaps the page at virtual kernel address 0, so
 * that we can trap those pesky NULL-reference errors in the kernel.
 */
void __init paging_init(void)
{
#ifdef CONFIG_X86_PAE
	set_nx();
	if (nx_enabled)
		printk("NX (Execute Disable) protection: active\n");
#endif

	pagetable_init();

	/* Switch to the freshly built kernel pagetable. */
	load_cr3(swapper_pg_dir);

#ifdef CONFIG_X86_PAE
	/*
	 * We will bail out later - printk doesn't work right now so
	 * the user would just see a hanging kernel.
	 */
	if (cpu_has_pae)
		set_in_cr4(X86_CR4_PAE);
#endif
	__flush_tlb_all();

	kmap_init();
}
584
/*
 * Test if the WP bit works in supervisor mode. It isn't supported on 386's
 * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
 * used to involve black magic jumps to work around some nasty CPU bugs,
 * but fortunately the switch to using exceptions got rid of all that.
 */

static void __init test_wp_bit(void)
{
	printk("Checking if this processor honours the WP bit even in supervisor mode... ");

	/* Any page-aligned address will do, the test is non-destructive */
	__set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
	/* 1 = write faulted (WP honoured), 0 = write went through. */
	boot_cpu_data.wp_works_ok = do_test_wp_bit();
	clear_fixmap(FIX_WP_TEST);

	if (!boot_cpu_data.wp_works_ok) {
		printk("No.\n");
#ifdef CONFIG_X86_WP_WORKS_OK
		panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
#endif
	} else {
		printk("Ok.\n");
	}
}
610
/* /proc/kcore descriptors for the lowmem and vmalloc ranges. */
static struct kcore_list kcore_mem, kcore_vmalloc;

/*
 * Late boot memory initialization: hand all of low memory (and
 * highmem, if configured) to the page allocator, register /proc/kcore
 * ranges, print the memory and virtual-layout summary, sanity-check
 * the layout, run the WP-bit test and finally (on UP) zap the low
 * identity mappings.
 */
void __init mem_init(void)
{
	extern int ppro_with_ram_bug(void);
	int codesize, reservedpages, datasize, initsize;
	int tmp;
	int bad_ppro;

#ifdef CONFIG_FLATMEM
	BUG_ON(!mem_map);
#endif

	bad_ppro = ppro_with_ram_bug();

#ifdef CONFIG_HIGHMEM
	/* check that fixmap and pkmap do not overlap */
	if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
		printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
		printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
				PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
		BUG();
	}
#endif

	/* this will put all low memory onto the freelists */
	totalram_pages += free_all_bootmem();

	reservedpages = 0;
	for (tmp = 0; tmp < max_low_pfn; tmp++)
		/*
		 * Only count reserved RAM pages
		 */
		if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
			reservedpages++;

	set_highmem_pages_init(bad_ppro);

	codesize = (unsigned long) &_etext - (unsigned long) &_text;
	datasize = (unsigned long) &_edata - (unsigned long) &_etext;
	initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);

	printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		num_physpages << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10,
		(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
	       );

#if 1 /* double-sanity-check paranoia */
	printk("virtual kernel memory layout:\n"
		" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
		" pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif
		" vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
		" lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
		" .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
		" .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
		" .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
		FIXADDR_START, FIXADDR_TOP,
		(FIXADDR_TOP - FIXADDR_START) >> 10,

#ifdef CONFIG_HIGHMEM
		PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
		(LAST_PKMAP*PAGE_SIZE) >> 10,
#endif

		VMALLOC_START, VMALLOC_END,
		(VMALLOC_END - VMALLOC_START) >> 20,

		(unsigned long)__va(0), (unsigned long)high_memory,
		((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,

		(unsigned long)&__init_begin, (unsigned long)&__init_end,
		((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,

		(unsigned long)&_etext, (unsigned long)&_edata,
		((unsigned long)&_edata - (unsigned long)&_etext) >> 10,

		(unsigned long)&_text, (unsigned long)&_etext,
		((unsigned long)&_etext - (unsigned long)&_text) >> 10);

#ifdef CONFIG_HIGHMEM
	BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
	BUG_ON(VMALLOC_END > PKMAP_BASE);
#endif
	BUG_ON(VMALLOC_START > VMALLOC_END);
	BUG_ON((unsigned long)high_memory > VMALLOC_START);
#endif /* double-sanity-check paranoia */

#ifdef CONFIG_X86_PAE
	if (!cpu_has_pae)
		panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
#endif
	if (boot_cpu_data.wp_works_ok < 0)
		test_wp_bit();

	/*
	 * Subtle. SMP is doing its boot stuff late (because it has to
	 * fork idle threads) - but it also needs low mappings for the
	 * protected-mode entry to work. We zap these entries only after
	 * the WP-bit has been tested.
	 */
#ifndef CONFIG_SMP
	zap_low_mappings();
#endif
}
726
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Hot-add @size bytes of memory at physical address @start to node
 * @nid.  On i386 hot-added memory always goes into ZONE_HIGHMEM.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdata = NODE_DATA(nid);
	struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	return __add_pages(zone, start_pfn, nr_pages);
}

/* Memory removal is not supported on this architecture. */
int remove_memory(u64 start, u64 size)
{
	return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);
#endif
744
745struct kmem_cache *pmd_cache;
746
747void __init pgtable_cache_init(void)
748{
749 size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
750
751 if (PTRS_PER_PMD > 1) {
752 pmd_cache = kmem_cache_create("pmd",
753 PTRS_PER_PMD*sizeof(pmd_t),
754 PTRS_PER_PMD*sizeof(pmd_t),
755 SLAB_PANIC,
756 pmd_ctor);
757 if (!SHARED_KERNEL_PMD) {
758 /* If we're in PAE mode and have a non-shared
759 kernel pmd, then the pgd size must be a
760 page size. This is because the pgd_list
761 links through the page structure, so there
762 can only be one pgd per page for this to
763 work. */
764 pgd_size = PAGE_SIZE;
765 }
766 }
767}
768
/*
 * This function cannot be __init, since exceptions don't work in that
 * section. Put this after the callers, so that it cannot be inlined.
 *
 * Attempts a supervisor-mode write through the read-only FIX_WP_TEST
 * mapping.  flag is preloaded with 1; if the write at label 1 faults,
 * the __ex_table entry resumes at label 2 with flag still 1 (WP
 * honoured).  If the write succeeds, the xorl clears flag to 0.
 */
static int noinline do_test_wp_bit(void)
{
	char tmp_reg;
	int flag;

	__asm__ __volatile__(
		" movb %0,%1 \n"
		"1: movb %1,%0 \n"
		" xorl %2,%2 \n"
		"2: \n"
		".section __ex_table,\"a\"\n"
		" .align 4 \n"
		" .long 1b,2b \n"
		".previous \n"
		:"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
		 "=q" (tmp_reg),
		 "=r" (flag)
		:"2" (1)
		:"memory");

	return flag;
}
795
#ifdef CONFIG_DEBUG_RODATA

/*
 * Write-protect the kernel's text and read-only data sections in the
 * linear mapping.  Text is skipped when kprobes are enabled, or when
 * more CPUs may come up (SMP alternatives still need to patch it).
 */
void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long size = PFN_ALIGN(_etext) - start;

#ifndef CONFIG_KPROBES
#ifdef CONFIG_HOTPLUG_CPU
	/* It must still be possible to apply SMP alternatives. */
	if (num_possible_cpus() <= 1)
#endif
	{
		change_page_attr(virt_to_page(start),
			 size >> PAGE_SHIFT, PAGE_KERNEL_RX);
		printk("Write protecting the kernel text: %luk\n", size >> 10);
	}
#endif
	start += size;
	size = (unsigned long)__end_rodata - start;
	change_page_attr(virt_to_page(start),
		 size >> PAGE_SHIFT, PAGE_KERNEL_RO);
	printk("Write protecting the kernel read-only data: %luk\n",
	       size >> 10);

	/*
	 * change_page_attr() requires a global_flush_tlb() call after it.
	 * We do this after the printk so that if something went wrong in the
	 * change, the printk gets out at least to give a better debug hint
	 * of who is the culprit.
	 */
	global_flush_tlb();
}
#endif
830
831void free_init_pages(char *what, unsigned long begin, unsigned long end)
832{
833 unsigned long addr;
834
835 for (addr = begin; addr < end; addr += PAGE_SIZE) {
836 ClearPageReserved(virt_to_page(addr));
837 init_page_count(virt_to_page(addr));
838 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
839 free_page(addr);
840 totalram_pages++;
841 }
842 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
843}
844
/* Release the memory occupied by the kernel's __init sections. */
void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}
851
#ifdef CONFIG_BLK_DEV_INITRD
/* Release the initrd image once it is no longer needed. */
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif
858
diff --git a/arch/x86/mm/ioremap_32.c b/arch/x86/mm/ioremap_32.c
new file mode 100644
index 000000000000..0b278315d737
--- /dev/null
+++ b/arch/x86/mm/ioremap_32.c
@@ -0,0 +1,274 @@
1/*
2 * arch/i386/mm/ioremap.c
3 *
4 * Re-map IO memory to kernel address space so that we can access it.
5 * This is needed for high PCI addresses that aren't mapped in the
6 * 640k-1MB IO memory area on PC's
7 *
8 * (C) Copyright 1995 1996 Linus Torvalds
9 */
10
11#include <linux/vmalloc.h>
12#include <linux/init.h>
13#include <linux/slab.h>
14#include <linux/module.h>
15#include <linux/io.h>
16#include <asm/fixmap.h>
17#include <asm/cacheflush.h>
18#include <asm/tlbflush.h>
19#include <asm/pgtable.h>
20
/* The legacy VGA/ISA hole, permanently covered by the direct mapping. */
#define ISA_START_ADDRESS	0xa0000
#define ISA_END_ADDRESS		0x100000

/*
 * Generic mapping function (not visible outside):
 */

/*
 * Remap an arbitrary physical address space into the kernel virtual
 * address space. Needed when the kernel wants to access high addresses
 * directly.
 *
 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
 * have to convert them into an offset in a page-aligned mapping, but the
 * caller shouldn't need to know that small detail.
 */
void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
{
	void __iomem * addr;
	struct vm_struct * area;
	unsigned long offset, last_addr;
	pgprot_t prot;

	/* Don't allow wraparound or zero size */
	last_addr = phys_addr + size - 1;
	if (!size || last_addr < phys_addr)
		return NULL;

	/*
	 * Don't remap the low PCI/ISA area, it's always mapped..
	 */
	if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
		return (void __iomem *) phys_to_virt(phys_addr);

	/*
	 * Don't allow anybody to remap normal RAM that we're using..
	 * Only ranges consisting entirely of reserved pages (e.g. a
	 * BIOS hole) may be remapped.
	 */
	if (phys_addr <= virt_to_phys(high_memory - 1)) {
		char *t_addr, *t_end;
		struct page *page;

		t_addr = __va(phys_addr);
		t_end = t_addr + (size - 1);

		for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
			if(!PageReserved(page))
				return NULL;
	}

	prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY
			| _PAGE_ACCESSED | flags);

	/*
	 * Mappings have to be page-aligned
	 */
	offset = phys_addr & ~PAGE_MASK;
	phys_addr &= PAGE_MASK;
	size = PAGE_ALIGN(last_addr+1) - phys_addr;

	/*
	 * Ok, go for it..
	 *
	 * The pte flags are stashed in the upper bits of the vm_struct
	 * flags; iounmap() reads them back (p->flags >> 20) to decide
	 * whether the direct mapping must be restored.
	 */
	area = get_vm_area(size, VM_IOREMAP | (flags << 20));
	if (!area)
		return NULL;
	area->phys_addr = phys_addr;
	addr = (void __iomem *) area->addr;
	if (ioremap_page_range((unsigned long) addr,
			(unsigned long) addr + size, phys_addr, prot)) {
		vunmap((void __force *) addr);
		return NULL;
	}
	return (void __iomem *) (offset + (char __iomem *)addr);
}
EXPORT_SYMBOL(__ioremap);
96
/**
 * ioremap_nocache - map bus memory into CPU space
 * @offset: bus address of the memory
 * @size: size of the resource to map
 *
 * ioremap_nocache performs a platform specific sequence of operations to
 * make bus memory CPU accessible via the readb/readw/readl/writeb/
 * writew/writel functions and the other mmio helpers. The returned
 * address is not guaranteed to be usable directly as a virtual
 * address.
 *
 * This version of ioremap ensures that the memory is marked uncachable
 * on the CPU as well as honouring existing caching rules from things like
 * the PCI bus. Note that there are other caches and buffers on many
 * busses. In particular driver authors should read up on PCI writes
 *
 * It's useful if some control registers are in such an area and
 * write combining or read caching is not desirable:
 *
 * Must be freed with iounmap.
 */

void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
{
	unsigned long last_addr;
	void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
	if (!p)
		return p;

	/* Guaranteed to be > phys_addr, as per __ioremap() */
	last_addr = phys_addr + size - 1;

	/*
	 * If the range also lies in the lowmem direct mapping, those
	 * ptes must be made uncachable too, or the CPU would see the
	 * same memory with conflicting cache attributes.
	 */
	if (last_addr < virt_to_phys(high_memory) - 1) {
		struct page *ppage = virt_to_page(__va(phys_addr));
		unsigned long npages;

		phys_addr &= PAGE_MASK;

		/* This might overflow and become zero.. */
		last_addr = PAGE_ALIGN(last_addr);

		/* .. but that's ok, because modulo-2**n arithmetic will make
		 * the page-aligned "last - first" come out right.
		 */
		npages = (last_addr - phys_addr) >> PAGE_SHIFT;

		if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
			iounmap(p);
			p = NULL;
		}
		global_flush_tlb();
	}

	return p;
}
EXPORT_SYMBOL(ioremap_nocache);
153
/**
 * iounmap - Free a IO remapping
 * @addr: virtual address from ioremap_*
 *
 * Caller must ensure there is only one unmapping for the same pointer.
 */
void iounmap(volatile void __iomem *addr)
{
	struct vm_struct *p, *o;

	/* Lowmem direct-map addresses were never remapped by __ioremap(). */
	if ((void __force *)addr <= high_memory)
		return;

	/*
	 * __ioremap special-cases the PCI/ISA range by not instantiating a
	 * vm_area and by simply returning an address into the kernel mapping
	 * of ISA space. So handle that here.
	 */
	if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
			addr < phys_to_virt(ISA_END_ADDRESS))
		return;

	/* Strip the sub-page offset that __ioremap() added. */
	addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);

	/* Use the vm area unlocked, assuming the caller
	   ensures there isn't another iounmap for the same address
	   in parallel. Reuse of the virtual address is prevented by
	   leaving it in the global lists until we're done with it.
	   cpa takes care of the direct mappings. */
	read_lock(&vmlist_lock);
	for (p = vmlist; p; p = p->next) {
		if (p->addr == addr)
			break;
	}
	read_unlock(&vmlist_lock);

	if (!p) {
		printk("iounmap: bad address %p\n", addr);
		dump_stack();
		return;
	}

	/* Reset the direct mapping. Can block */
	/* flags >> 20 recovers the pte flags __ioremap() stashed there. */
	if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) {
		change_page_attr(virt_to_page(__va(p->phys_addr)),
				 get_vm_area_size(p) >> PAGE_SHIFT,
				 PAGE_KERNEL);
		global_flush_tlb();
	}

	/* Finally remove it */
	o = remove_vm_area((void *)addr);
	BUG_ON(p != o || o == NULL);
	kfree(p);
}
EXPORT_SYMBOL(iounmap);
210
211void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
212{
213 unsigned long offset, last_addr;
214 unsigned int nrpages;
215 enum fixed_addresses idx;
216
217 /* Don't allow wraparound or zero size */
218 last_addr = phys_addr + size - 1;
219 if (!size || last_addr < phys_addr)
220 return NULL;
221
222 /*
223 * Don't remap the low PCI/ISA area, it's always mapped..
224 */
225 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
226 return phys_to_virt(phys_addr);
227
228 /*
229 * Mappings have to be page-aligned
230 */
231 offset = phys_addr & ~PAGE_MASK;
232 phys_addr &= PAGE_MASK;
233 size = PAGE_ALIGN(last_addr) - phys_addr;
234
235 /*
236 * Mappings have to fit in the FIX_BTMAP area.
237 */
238 nrpages = size >> PAGE_SHIFT;
239 if (nrpages > NR_FIX_BTMAPS)
240 return NULL;
241
242 /*
243 * Ok, go for it..
244 */
245 idx = FIX_BTMAP_BEGIN;
246 while (nrpages > 0) {
247 set_fixmap(idx, phys_addr);
248 phys_addr += PAGE_SIZE;
249 --idx;
250 --nrpages;
251 }
252 return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
253}
254
255void __init bt_iounmap(void *addr, unsigned long size)
256{
257 unsigned long virt_addr;
258 unsigned long offset;
259 unsigned int nrpages;
260 enum fixed_addresses idx;
261
262 virt_addr = (unsigned long)addr;
263 if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
264 return;
265 offset = virt_addr & ~PAGE_MASK;
266 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
267
268 idx = FIX_BTMAP_BEGIN;
269 while (nrpages > 0) {
270 clear_fixmap(idx);
271 --idx;
272 --nrpages;
273 }
274}
diff --git a/arch/x86/mm/mmap_32.c b/arch/x86/mm/mmap_32.c
new file mode 100644
index 000000000000..552e08473755
--- /dev/null
+++ b/arch/x86/mm/mmap_32.c
@@ -0,0 +1,77 @@
1/*
2 * linux/arch/i386/mm/mmap.c
3 *
4 * flexible mmap layout support
5 *
6 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
7 * All Rights Reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 *
23 *
24 * Started by Ingo Molnar <mingo@elte.hu>
25 */
26
27#include <linux/personality.h>
28#include <linux/mm.h>
29#include <linux/random.h>
30#include <linux/sched.h>
31
32/*
33 * Top of mmap area (just below the process stack).
34 *
35 * Leave an at least ~128 MB hole.
36 */
37#define MIN_GAP (128*1024*1024)
38#define MAX_GAP (TASK_SIZE/6*5)
39
40static inline unsigned long mmap_base(struct mm_struct *mm)
41{
42 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
43 unsigned long random_factor = 0;
44
45 if (current->flags & PF_RANDOMIZE)
46 random_factor = get_random_int() % (1024*1024);
47
48 if (gap < MIN_GAP)
49 gap = MIN_GAP;
50 else if (gap > MAX_GAP)
51 gap = MAX_GAP;
52
53 return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
54}
55
56/*
57 * This function, called very early during the creation of a new
58 * process VM image, sets up which VM layout function to use:
59 */
60void arch_pick_mmap_layout(struct mm_struct *mm)
61{
62 /*
63 * Fall back to the standard layout if the personality
64 * bit is set, or if the expected stack growth is unlimited:
65 */
66 if (sysctl_legacy_va_layout ||
67 (current->personality & ADDR_COMPAT_LAYOUT) ||
68 current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
69 mm->mmap_base = TASK_UNMAPPED_BASE;
70 mm->get_unmapped_area = arch_get_unmapped_area;
71 mm->unmap_area = arch_unmap_area;
72 } else {
73 mm->mmap_base = mmap_base(mm);
74 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
75 mm->unmap_area = arch_unmap_area_topdown;
76 }
77}
diff --git a/arch/x86/mm/pageattr_32.c b/arch/x86/mm/pageattr_32.c
new file mode 100644
index 000000000000..4241a74d16c8
--- /dev/null
+++ b/arch/x86/mm/pageattr_32.c
@@ -0,0 +1,278 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * Thanks to Ben LaHaise for precious feedback.
4 */
5
6#include <linux/mm.h>
7#include <linux/sched.h>
8#include <linux/highmem.h>
9#include <linux/module.h>
10#include <linux/slab.h>
11#include <asm/uaccess.h>
12#include <asm/processor.h>
13#include <asm/tlbflush.h>
14#include <asm/pgalloc.h>
15#include <asm/sections.h>
16
/* Serializes attribute changes and access to the deferred-flush list. */
static DEFINE_SPINLOCK(cpa_lock);
/* Pagetable pages queued for flushing/possible freeing in global_flush_tlb(). */
static struct list_head df_list = LIST_HEAD_INIT(df_list);


/*
 * Walk the kernel pagetable for @address.  Returns the pte mapping
 * the address, the pmd entry itself (cast to pte_t *) if the address
 * is covered by a large page, or NULL if no mapping exists at any
 * level.
 */
pte_t *lookup_address(unsigned long address)
{
	pgd_t *pgd = pgd_offset_k(address);
	pud_t *pud;
	pmd_t *pmd;
	if (pgd_none(*pgd))
		return NULL;
	pud = pud_offset(pgd, address);
	if (pud_none(*pud))
		return NULL;
	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		return NULL;
	if (pmd_large(*pmd))
		return (pte_t *)pmd;
	return pte_offset_kernel(pmd, address);
}
38
/*
 * Allocate a pagetable page covering the 2/4MB range around @address
 * with 4k ptes: the pte for @address gets @prot, all others get
 * @ref_prot.  Returns the new pagetable page, or NULL on allocation
 * failure.  Called with cpa_lock held; the lock is dropped around the
 * possibly-sleeping allocation.
 */
static struct page *split_large_page(unsigned long address, pgprot_t prot,
					pgprot_t ref_prot)
{
	int i;
	unsigned long addr;
	struct page *base;
	pte_t *pbase;

	/* GFP_KERNEL may sleep - cannot hold the spinlock across it. */
	spin_unlock_irq(&cpa_lock);
	base = alloc_pages(GFP_KERNEL, 0);
	spin_lock_irq(&cpa_lock);
	if (!base)
		return NULL;

	/*
	 * page_private is used to track the number of entries in
	 * the page table page that have non standard attributes.
	 */
	SetPagePrivate(base);
	page_private(base) = 0;

	address = __pa(address);
	addr = address & LARGE_PAGE_MASK;
	pbase = (pte_t *)page_address(base);
	paravirt_alloc_pt(&init_mm, page_to_pfn(base));
	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
					   addr == address ? prot : ref_prot));
	}
	return base;
}
70
71static void cache_flush_page(struct page *p)
72{
73 unsigned long adr = (unsigned long)page_address(p);
74 int i;
75 for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
76 asm volatile("clflush (%0)" :: "r" (adr + i));
77}
78
/*
 * Cross-CPU callback: flush caches and TLBs so that attribute changes
 * on the pagetable pages listed in @arg (a struct list_head) take
 * effect on this CPU.
 */
static void flush_kernel_map(void *arg)
{
	struct list_head *lh = (struct list_head *)arg;
	struct page *p;

	/* High level code is not ready for clflush yet */
	if (0 && cpu_has_clflush) {
		list_for_each_entry (p, lh, lru)
			cache_flush_page(p);
	} else if (boot_cpu_data.x86_model >= 4)
		wbinvd();

	/* Flush all to work around Errata in early athlons regarding
	 * large page flushing.
	 */
	__flush_tlb_all();
}
96
/*
 * Install a pmd-level entry in init_mm and, when kernel pmds are not
 * shared across pgds (!SHARED_KERNEL_PMD), propagate the change to
 * every pgd on pgd_list so all address spaces stay consistent.
 */
static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
{
	struct page *page;
	unsigned long flags;

	set_pte_atomic(kpte, pte); /* change init_mm */
	if (SHARED_KERNEL_PMD)
		return;

	/* pgd_list is chained through page->index; walk it under pgd_lock. */
	spin_lock_irqsave(&pgd_lock, flags);
	for (page = pgd_list; page; page = (struct page *)page->index) {
		pgd_t *pgd;
		pud_t *pud;
		pmd_t *pmd;
		pgd = (pgd_t *)page_address(page) + pgd_index(address);
		pud = pud_offset(pgd, address);
		pmd = pmd_offset(pud, address);
		set_pte_atomic((pte_t *)pmd, pte);
	}
	spin_unlock_irqrestore(&pgd_lock, flags);
}
118
/*
 * No more special protections in this 2/4MB area - revert to a
 * large page again.
 */
static inline void revert_page(struct page *kpte_page, unsigned long address)
{
	pgprot_t ref_prot;
	pte_t *linear;

	/* Kernel text must stay executable; everything else need not. */
	ref_prot =
	((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
		? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;

	linear = (pte_t *)
		pmd_offset(pud_offset(pgd_offset_k(address), address), address);
	set_pmd_pte(linear, address,
		    pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
			    ref_prot));
}
138
/*
 * Queue a pagetable page for the deferred flush/free pass in
 * global_flush_tlb(); PG_arch_1 guards against adding it twice.
 */
static inline void save_page(struct page *kpte_page)
{
	if (!test_and_set_bit(PG_arch_1, &kpte_page->flags))
		list_add(&kpte_page->lru, &df_list);
}
144
/*
 * Apply @prot to the linear-map pte of @page.
 *
 * Setting a non-default protection bumps the containing pagetable
 * page's private count (splitting a large page first if needed);
 * restoring PAGE_KERNEL drops it.  When the count reaches zero the
 * 2/4MB range is reverted to a single large page.
 *
 * Called with cpa_lock held.  Returns 0, -EINVAL or -ENOMEM.
 */
static int
__change_page_attr(struct page *page, pgprot_t prot)
{
	pte_t *kpte;
	unsigned long address;
	struct page *kpte_page;

	BUG_ON(PageHighMem(page));
	address = (unsigned long)page_address(page);

	kpte = lookup_address(address);
	if (!kpte)
		return -EINVAL;
	kpte_page = virt_to_page(kpte);
	BUG_ON(PageLRU(kpte_page));
	BUG_ON(PageCompound(kpte_page));

	if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
		if (!pte_huge(*kpte)) {
			set_pte_atomic(kpte, mk_pte(page, prot));
		} else {
			pgprot_t ref_prot;
			struct page *split;

			/* Neighbouring ptes keep default (text = exec) prot. */
			ref_prot =
			((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
				? PAGE_KERNEL_EXEC : PAGE_KERNEL;
			split = split_large_page(address, prot, ref_prot);
			if (!split)
				return -ENOMEM;
			set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
			kpte_page = split;
		}
		page_private(kpte_page)++;
	} else if (!pte_huge(*kpte)) {
		set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
		BUG_ON(page_private(kpte_page) == 0);
		page_private(kpte_page)--;
	} else
		BUG();

	/*
	 * If the pte was reserved, it means it was created at boot
	 * time (not via split_large_page) and in turn we must not
	 * replace it with a largepage.
	 */

	save_page(kpte_page);
	if (!PageReserved(kpte_page)) {
		if (cpu_has_pse && (page_private(kpte_page) == 0)) {
			paravirt_release_pt(page_to_pfn(kpte_page));
			revert_page(kpte_page, address);
		}
	}
	return 0;
}
201
/* Run flush_kernel_map() on every CPU (including this one) and wait. */
static inline void flush_map(struct list_head *l)
{
	on_each_cpu(flush_kernel_map, l, 1, 1);
}
206
207/*
208 * Change the page attributes of an page in the linear mapping.
209 *
210 * This should be used when a page is mapped with a different caching policy
211 * than write-back somewhere - some CPUs do not like it when mappings with
212 * different caching policies exist. This changes the page attributes of the
213 * in kernel linear mapping too.
214 *
215 * The caller needs to ensure that there are no conflicting mappings elsewhere.
216 * This function only deals with the kernel linear map.
217 *
218 * Caller must call global_flush_tlb() after this.
219 */
220int change_page_attr(struct page *page, int numpages, pgprot_t prot)
221{
222 int err = 0;
223 int i;
224 unsigned long flags;
225
226 spin_lock_irqsave(&cpa_lock, flags);
227 for (i = 0; i < numpages; i++, page++) {
228 err = __change_page_attr(page, prot);
229 if (err)
230 break;
231 }
232 spin_unlock_irqrestore(&cpa_lock, flags);
233 return err;
234}
235
/*
 * Flush the deferred page-attribute changes on all CPUs and free any
 * split pagetable pages that have reverted to a large page.  Must be
 * called (with irqs enabled) after change_page_attr().
 */
void global_flush_tlb(void)
{
	struct list_head l;
	struct page *pg, *next;

	BUG_ON(irqs_disabled());

	/* Detach the deferred list so new entries can keep accumulating. */
	spin_lock_irq(&cpa_lock);
	list_replace_init(&df_list, &l);
	spin_unlock_irq(&cpa_lock);
	flush_map(&l);
	list_for_each_entry_safe(pg, next, &l, lru) {
		list_del(&pg->lru);
		clear_bit(PG_arch_1, &pg->flags);
		/* Only free split pagetable pages with no special ptes left. */
		if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0)
			continue;
		ClearPagePrivate(pg);
		__free_page(pg);
	}
}
256
#ifdef CONFIG_DEBUG_PAGEALLOC
/*
 * Map (@enable != 0) or unmap @numpages lowmem pages in the kernel
 * linear mapping so stray accesses to freed pages fault immediately.
 */
void kernel_map_pages(struct page *page, int numpages, int enable)
{
	if (PageHighMem(page))
		return;
	if (!enable)
		debug_check_no_locks_freed(page_address(page),
						numpages * PAGE_SIZE);

	/* the return value is ignored - the calls cannot fail,
	 * large pages are disabled at boot time.
	 */
	change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
	/* we should perform an IPI and flush all tlbs,
	 * but that can deadlock->flush only current cpu.
	 */
	__flush_tlb_all();
}
#endif
276
277EXPORT_SYMBOL(change_page_attr);
278EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
new file mode 100644
index 000000000000..01437c46baae
--- /dev/null
+++ b/arch/x86/mm/pgtable_32.c
@@ -0,0 +1,373 @@
1/*
2 * linux/arch/i386/mm/pgtable.c
3 */
4
5#include <linux/sched.h>
6#include <linux/kernel.h>
7#include <linux/errno.h>
8#include <linux/mm.h>
9#include <linux/swap.h>
10#include <linux/smp.h>
11#include <linux/highmem.h>
12#include <linux/slab.h>
13#include <linux/pagemap.h>
14#include <linux/spinlock.h>
15#include <linux/module.h>
16#include <linux/quicklist.h>
17
18#include <asm/system.h>
19#include <asm/pgtable.h>
20#include <asm/pgalloc.h>
21#include <asm/fixmap.h>
22#include <asm/e820.h>
23#include <asm/tlb.h>
24#include <asm/tlbflush.h>
25
26void show_mem(void)
27{
28 int total = 0, reserved = 0;
29 int shared = 0, cached = 0;
30 int highmem = 0;
31 struct page *page;
32 pg_data_t *pgdat;
33 unsigned long i;
34 unsigned long flags;
35
36 printk(KERN_INFO "Mem-info:\n");
37 show_free_areas();
38 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
39 for_each_online_pgdat(pgdat) {
40 pgdat_resize_lock(pgdat, &flags);
41 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
42 page = pgdat_page_nr(pgdat, i);
43 total++;
44 if (PageHighMem(page))
45 highmem++;
46 if (PageReserved(page))
47 reserved++;
48 else if (PageSwapCache(page))
49 cached++;
50 else if (page_count(page))
51 shared += page_count(page) - 1;
52 }
53 pgdat_resize_unlock(pgdat, &flags);
54 }
55 printk(KERN_INFO "%d pages of RAM\n", total);
56 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
57 printk(KERN_INFO "%d reserved pages\n", reserved);
58 printk(KERN_INFO "%d pages shared\n", shared);
59 printk(KERN_INFO "%d pages swap cached\n", cached);
60
61 printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
62 printk(KERN_INFO "%lu pages writeback\n",
63 global_page_state(NR_WRITEBACK));
64 printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
65 printk(KERN_INFO "%lu pages slab\n",
66 global_page_state(NR_SLAB_RECLAIMABLE) +
67 global_page_state(NR_SLAB_UNRECLAIMABLE));
68 printk(KERN_INFO "%lu pages pagetables\n",
69 global_page_state(NR_PAGETABLE));
70}
71
72/*
73 * Associate a virtual page frame with a given physical page frame
74 * and protection flags for that frame.
75 */
76static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
77{
78 pgd_t *pgd;
79 pud_t *pud;
80 pmd_t *pmd;
81 pte_t *pte;
82
83 pgd = swapper_pg_dir + pgd_index(vaddr);
84 if (pgd_none(*pgd)) {
85 BUG();
86 return;
87 }
88 pud = pud_offset(pgd, vaddr);
89 if (pud_none(*pud)) {
90 BUG();
91 return;
92 }
93 pmd = pmd_offset(pud, vaddr);
94 if (pmd_none(*pmd)) {
95 BUG();
96 return;
97 }
98 pte = pte_offset_kernel(pmd, vaddr);
99 if (pgprot_val(flags))
100 /* <pfn,flags> stored as-is, to permit clearing entries */
101 set_pte(pte, pfn_pte(pfn, flags));
102 else
103 pte_clear(&init_mm, vaddr, pte);
104
105 /*
106 * It's enough to flush this one mapping.
107 * (PGE mappings get flushed as well)
108 */
109 __flush_tlb_one(vaddr);
110}
111
112/*
113 * Associate a large virtual page frame with a given physical page frame
114 * and protection flags for that frame. pfn is for the base of the page,
115 * vaddr is what the page gets mapped to - both must be properly aligned.
116 * The pmd must already be instantiated. Assumes PAE mode.
117 */
118void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
119{
120 pgd_t *pgd;
121 pud_t *pud;
122 pmd_t *pmd;
123
124 if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
125 printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
126 return; /* BUG(); */
127 }
128 if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
129 printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
130 return; /* BUG(); */
131 }
132 pgd = swapper_pg_dir + pgd_index(vaddr);
133 if (pgd_none(*pgd)) {
134 printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
135 return; /* BUG(); */
136 }
137 pud = pud_offset(pgd, vaddr);
138 pmd = pmd_offset(pud, vaddr);
139 set_pmd(pmd, pfn_pmd(pfn, flags));
140 /*
141 * It's enough to flush this one mapping.
142 * (PGE mappings get flushed as well)
143 */
144 __flush_tlb_one(vaddr);
145}
146
/* Number of fixmap entries installed so far; reserve_top_address()
 * refuses to run once any exist (see BUG_ON there). */
static int fixmaps;
/* Top of the fixmap area; may be lowered by reserve_top_address(). */
unsigned long __FIXADDR_TOP = 0xfffff000;
EXPORT_SYMBOL(__FIXADDR_TOP);
150
151void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
152{
153 unsigned long address = __fix_to_virt(idx);
154
155 if (idx >= __end_of_fixed_addresses) {
156 BUG();
157 return;
158 }
159 set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
160 fixmaps++;
161}
162
/**
 * reserve_top_address - reserves a hole in the top of kernel address space
 * @reserve - size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of kernel address space to make room for a hypervisor.
 * Must run before any fixmap entry is installed (hence the BUG_ON),
 * since it moves __FIXADDR_TOP underneath the fixmap calculations.
 */
void reserve_top_address(unsigned long reserve)
{
	BUG_ON(fixmaps > 0);
	printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
	       (int)-reserve);
	/* Leave one unused guard page between the hole and the fixmaps. */
	__FIXADDR_TOP = -reserve - PAGE_SIZE;
	__VMALLOC_RESERVE += reserve;
}
178
179pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
180{
181 return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
182}
183
184struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
185{
186 struct page *pte;
187
188#ifdef CONFIG_HIGHPTE
189 pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
190#else
191 pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
192#endif
193 return pte;
194}
195
196void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags)
197{
198 memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
199}
200
/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- wli
 */
DEFINE_SPINLOCK(pgd_lock);	/* protects pgd_list and its links */
struct page *pgd_list;		/* head of the intrusive pgd page list */
213
/*
 * Link @pgd's backing page onto the global pgd_list.
 *
 * The list is intrusive and hand-rolled: page->index holds the next
 * page pointer, and page_private() holds the address of the previous
 * element's next-slot (&prev->index, or &pgd_list for the head), so
 * unlinking is O(1) without a real prev pointer.
 * Callers in this file hold pgd_lock.
 */
static inline void pgd_list_add(pgd_t *pgd)
{
	struct page *page = virt_to_page(pgd);
	/* new node's next = old head */
	page->index = (unsigned long)pgd_list;
	if (pgd_list)
		/* old head's back-link now points at the new node's next-slot */
		set_page_private(pgd_list, (unsigned long)&page->index);
	pgd_list = page;
	/* head's back-link points at the list head itself */
	set_page_private(page, (unsigned long)&pgd_list);
}
223
/*
 * Unlink @pgd's backing page from pgd_list in O(1), using the next
 * pointer stashed in page->index and the back-link (address of the
 * previous next-slot) stashed in page_private().
 * Callers in this file hold pgd_lock.
 */
static inline void pgd_list_del(pgd_t *pgd)
{
	struct page *next, **pprev, *page = virt_to_page(pgd);
	next = (struct page *)page->index;
	pprev = (struct page **)page_private(page);
	/* splice ourselves out ... */
	*pprev = next;
	/* ... and repoint the successor's back-link, if any */
	if (next)
		set_page_private(next, (unsigned long)pprev);
}
233
234
235
#if (PTRS_PER_PMD == 1)
/* Non-PAE pgd constructor (passed to quicklist_alloc() in pgd_alloc()) */
static void pgd_ctor(void *pgd)
{
	unsigned long flags;

	/* !PAE, no pagetable sharing */
	memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));

	spin_lock_irqsave(&pgd_lock, flags);

	/* must happen under lock */
	clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
			swapper_pg_dir + USER_PTRS_PER_PGD,
			KERNEL_PGD_PTRS);
	paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
				__pa(swapper_pg_dir) >> PAGE_SHIFT,
				USER_PTRS_PER_PGD,
				KERNEL_PGD_PTRS);
	/* track this pgd so pageattr.c can update it (see comment above) */
	pgd_list_add(pgd);
	spin_unlock_irqrestore(&pgd_lock, flags);
}
#else	/* PTRS_PER_PMD > 1 */
/* PAE pgd constructor (passed to quicklist_alloc() in pgd_alloc()) */
static void pgd_ctor(void *pgd)
{
	/* PAE, kernel PMD may be shared */

	if (SHARED_KERNEL_PMD) {
		/* kernel pmd shared: copying the kernel slots is enough,
		 * no list tracking needed (see comment above pgd_lock) */
		clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
				swapper_pg_dir + USER_PTRS_PER_PGD,
				KERNEL_PGD_PTRS);
	} else {
		unsigned long flags;

		/* unshared kernel pmd: clear the user slots and track the
		 * pgd; kernel slots get their pmds from pgd_alloc() */
		memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
		spin_lock_irqsave(&pgd_lock, flags);
		pgd_list_add(pgd);
		spin_unlock_irqrestore(&pgd_lock, flags);
	}
}
#endif	/* PTRS_PER_PMD */
278
/*
 * Destructor passed to quicklist_free()/quicklist_trim(): undo the
 * list tracking done by pgd_ctor() before the pgd page is freed.
 * With a shared kernel pmd the pgd was never tracked, so nothing to do.
 */
static void pgd_dtor(void *pgd)
{
	unsigned long flags; /* can be called from interrupt context */

	if (SHARED_KERNEL_PMD)
		return;

	paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
	spin_lock_irqsave(&pgd_lock, flags);
	pgd_list_del(pgd);
	spin_unlock_irqrestore(&pgd_lock, flags);
}
291
/* With a shared kernel pmd only the user slots of a pgd are per-process;
 * otherwise every slot, kernel ones included, needs a private pmd. */
#define UNSHARED_PTRS_PER_PGD \
	(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)



/* If we allocate a pmd for part of the kernel address space, then
   make sure its initialized with the appropriate kernel mappings.
   Otherwise use a cached zeroed pmd. */
static pmd_t *pmd_cache_alloc(int idx)
{
	pmd_t *pmd;

	/* kernel-range slot: copy the kernel pmd from swapper_pg_dir */
	if (idx >= USER_PTRS_PER_PGD) {
		pmd = (pmd_t *)__get_free_page(GFP_KERNEL);

		if (pmd)
			memcpy(pmd,
			       (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
			       sizeof(pmd_t) * PTRS_PER_PMD);
	} else
		/* user slot: zeroed pmd from the slab cache */
		pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);

	return pmd;
}
314
315static void pmd_cache_free(pmd_t *pmd, int idx)
316{
317 if (idx >= USER_PTRS_PER_PGD)
318 free_page((unsigned long)pmd);
319 else
320 kmem_cache_free(pmd_cache, pmd);
321}
322
/*
 * Allocate a fresh pgd for @mm.
 *
 * The pgd page comes from the quicklist with pgd_ctor() applied, so the
 * kernel mappings are already cloned in.  In the non-PAE case that is
 * all that is needed.  For PAE, additionally allocate one pmd per
 * unshared pgd slot and install it; the "1 +" below sets the low bit
 * (the x86 present flag) in the pgd entry.
 * Returns NULL on allocation failure, with everything unwound.
 */
pgd_t *pgd_alloc(struct mm_struct *mm)
{
	int i;
	pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);

	if (PTRS_PER_PMD == 1 || !pgd)
		return pgd;

	for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
		pmd_t *pmd = pmd_cache_alloc(i);

		if (!pmd)
			goto out_oom;

		paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
		set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
	}
	return pgd;

out_oom:
	/* unwind the pmds installed so far; "-1" strips the present bit
	 * that was added with "1 +" above */
	for (i--; i >= 0; i--) {
		pgd_t pgdent = pgd[i];
		void* pmd = (void *)__va(pgd_val(pgdent)-1);
		paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
		pmd_cache_free(pmd, i);
	}
	quicklist_free(0, pgd_dtor, pgd);
	return NULL;
}
352
/*
 * Free a pgd obtained from pgd_alloc().  In the PAE case, first release
 * the per-slot pmds installed by pgd_alloc() ("-1" strips the low
 * present bit that pgd_alloc() added to each entry).  The pgd page then
 * goes back to the quicklist with pgd_dtor as destructor.
 */
void pgd_free(pgd_t *pgd)
{
	int i;

	/* in the PAE case user pgd entries are overwritten before usage */
	if (PTRS_PER_PMD > 1)
		for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
			pgd_t pgdent = pgd[i];
			void* pmd = (void *)__va(pgd_val(pgdent)-1);
			paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
			pmd_cache_free(pmd, i);
		}
	/* in the non-PAE case, free_pgtables() clears user pgd entries */
	quicklist_free(0, pgd_dtor, pgd);
}
368
/*
 * Bound the quicklist of cached pgd pages, running pgd_dtor on each
 * page actually released.
 * NOTE(review): the 25/16 arguments look like max-cached/min-keep
 * thresholds -- confirm against the quicklist API.
 */
void check_pgt_cache(void)
{
	quicklist_trim(0, pgd_dtor, 25, 16);
}
373