author     Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
committer  Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
commit     1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree       0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/i386/mm
tag        Linux-2.6.12-rc2 (v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'arch/i386/mm')
-rw-r--r--  arch/i386/mm/Makefile        10
-rw-r--r--  arch/i386/mm/boot_ioremap.c  97
-rw-r--r--  arch/i386/mm/discontig.c    383
-rw-r--r--  arch/i386/mm/extable.c       36
-rw-r--r--  arch/i386/mm/fault.c        552
-rw-r--r--  arch/i386/mm/highmem.c       89
-rw-r--r--  arch/i386/mm/hugetlbpage.c  431
-rw-r--r--  arch/i386/mm/init.c         696
-rw-r--r--  arch/i386/mm/ioremap.c      320
-rw-r--r--  arch/i386/mm/mmap.c          76
-rw-r--r--  arch/i386/mm/pageattr.c     221
-rw-r--r--  arch/i386/mm/pgtable.c      260
12 files changed, 3171 insertions, 0 deletions
diff --git a/arch/i386/mm/Makefile b/arch/i386/mm/Makefile
new file mode 100644
index 000000000000..fc3272506846
--- /dev/null
+++ b/arch/i386/mm/Makefile
@@ -0,0 +1,10 @@
1 | # | ||
2 | # Makefile for the linux i386-specific parts of the memory manager. | ||
3 | # | ||
4 | |||
5 | obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o | ||
6 | |||
7 | obj-$(CONFIG_DISCONTIGMEM) += discontig.o | ||
8 | obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o | ||
9 | obj-$(CONFIG_HIGHMEM) += highmem.o | ||
10 | obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o | ||
diff --git a/arch/i386/mm/boot_ioremap.c b/arch/i386/mm/boot_ioremap.c
new file mode 100644
index 000000000000..523b30634e0a
--- /dev/null
+++ b/arch/i386/mm/boot_ioremap.c
@@ -0,0 +1,97 @@
1 | /* | ||
2 | * arch/i386/mm/boot_ioremap.c | ||
3 | * | ||
4 | * Re-map functions for early boot-time before paging_init() when the | ||
5 | * boot-time pagetables are still in use | ||
6 | * | ||
7 | * Written by Dave Hansen <haveblue@us.ibm.com> | ||
8 | */ | ||
9 | |||
10 | |||
11 | /* | ||
12 | * We need to use the 2-level pagetable functions, but CONFIG_X86_PAE | ||
13 | * keeps that from happening. If anyone has a better way, I'm listening. | ||
14 | * | ||
15 | * boot_pte_t is defined only if this all works correctly | ||
16 | */ | ||
17 | |||
18 | #include <linux/config.h> | ||
19 | #undef CONFIG_X86_PAE | ||
20 | #include <asm/page.h> | ||
21 | #include <asm/pgtable.h> | ||
22 | #include <asm/tlbflush.h> | ||
23 | #include <linux/init.h> | ||
24 | #include <linux/stddef.h> | ||
25 | |||
26 | /* | ||
27 | * I'm cheating here. It is known that the two boot PTE pages are | ||
28 | * allocated next to each other. I'm pretending that they're just | ||
29 | * one big array. | ||
30 | */ | ||
31 | |||
32 | #define BOOT_PTE_PTRS (PTRS_PER_PTE*2) | ||
33 | #define boot_pte_index(address) \ | ||
34 | (((address) >> PAGE_SHIFT) & (BOOT_PTE_PTRS - 1)) | ||
35 | |||
36 | static inline boot_pte_t* boot_vaddr_to_pte(void *address) | ||
37 | { | ||
38 | boot_pte_t* boot_pg = (boot_pte_t*)pg0; | ||
39 | return &boot_pg[boot_pte_index((unsigned long)address)]; | ||
40 | } | ||
41 | |||
42 | /* | ||
43 | * This is only for a caller who is clever enough to page-align | ||
44 | * phys_addr and virtual_source, and who also has a preference | ||
45 | * about which virtual address to steal ptes from | ||
46 | */ | ||
47 | static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages, | ||
48 | void* virtual_source) | ||
49 | { | ||
50 | boot_pte_t* pte; | ||
51 | int i; | ||
52 | char *vaddr = virtual_source; | ||
53 | |||
54 | pte = boot_vaddr_to_pte(virtual_source); | ||
55 | for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) { | ||
56 | set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL)); | ||
57 | __flush_tlb_one(&vaddr[i*PAGE_SIZE]); | ||
58 | } | ||
59 | } | ||
60 | |||
61 | /* the virtual space we're going to remap comes from this array */ | ||
62 | #define BOOT_IOREMAP_PAGES 4 | ||
63 | #define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE) | ||
64 | static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE] | ||
65 | __attribute__ ((aligned (PAGE_SIZE))); | ||
66 | |||
67 | /* | ||
68 | * This only applies to things which need to ioremap before paging_init(). | ||
69 | * bt_ioremap() and plain ioremap() are both useless at this point. | ||
70 | * | ||
71 | * When used, we're still using the boot-time pagetables, which only | ||
72 | * have 2 PTE pages mapping the first 8MB | ||
73 | * | ||
74 | * There is no unmap. The boot-time PTE pages aren't used after boot. | ||
75 | * If you really want the space back, just remap it yourself. | ||
76 | * boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE) | ||
77 | */ | ||
78 | __init void* boot_ioremap(unsigned long phys_addr, unsigned long size) | ||
79 | { | ||
80 | unsigned long last_addr, offset; | ||
81 | unsigned int nrpages; | ||
82 | |||
83 | last_addr = phys_addr + size - 1; | ||
84 | |||
85 | /* page align the requested address */ | ||
86 | offset = phys_addr & ~PAGE_MASK; | ||
87 | phys_addr &= PAGE_MASK; | ||
88 | size = PAGE_ALIGN(last_addr) - phys_addr; | ||
89 | |||
90 | nrpages = size >> PAGE_SHIFT; | ||
91 | if (nrpages > BOOT_IOREMAP_PAGES) | ||
92 | return NULL; | ||
93 | |||
94 | __boot_ioremap(phys_addr, nrpages, boot_ioremap_space); | ||
95 | |||
96 | return &boot_ioremap_space[offset]; | ||
97 | } | ||
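To make the calling convention above concrete, here is a minimal usage sketch; the function name is hypothetical and not part of this commit, and it assumes the caller runs before paging_init() while the boot-time pagetables are still live.

/* Hypothetical caller (illustration only): read one dword from a firmware
 * table before paging_init(). boot_ioremap() returns NULL if the rounded
 * request would need more than BOOT_IOREMAP_PAGES pages of boot-time KVA. */
static u32 __init sketch_read_firmware_dword(unsigned long table_phys)
{
        u32 *p = boot_ioremap(table_phys, sizeof(u32));

        return p ? *p : 0;      /* nothing to unmap afterwards */
}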
diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
new file mode 100644
index 000000000000..1726b4096b10
--- /dev/null
+++ b/arch/i386/mm/discontig.c
@@ -0,0 +1,383 @@
1 | /* | ||
2 | * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation | ||
3 | * August 2002: added remote node KVA remap - Martin J. Bligh | ||
4 | * | ||
5 | * Copyright (C) 2002, IBM Corp. | ||
6 | * | ||
7 | * All rights reserved. | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2 of the License, or | ||
12 | * (at your option) any later version. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, but | ||
15 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
17 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
18 | * details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public License | ||
21 | * along with this program; if not, write to the Free Software | ||
22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | */ | ||
24 | |||
25 | #include <linux/config.h> | ||
26 | #include <linux/mm.h> | ||
27 | #include <linux/bootmem.h> | ||
28 | #include <linux/mmzone.h> | ||
29 | #include <linux/highmem.h> | ||
30 | #include <linux/initrd.h> | ||
31 | #include <linux/nodemask.h> | ||
32 | #include <asm/e820.h> | ||
33 | #include <asm/setup.h> | ||
34 | #include <asm/mmzone.h> | ||
35 | #include <bios_ebda.h> | ||
36 | |||
37 | struct pglist_data *node_data[MAX_NUMNODES]; | ||
38 | bootmem_data_t node0_bdata; | ||
39 | |||
40 | /* | ||
41 | * numa interface - we expect the numa architecture specific code to have | ||
42 | * populated the following initialisation. | ||
43 | * | ||
44 | * 1) node_online_map - the map of all nodes configured (online) in the system | ||
45 | * 2) physnode_map - the mapping between a pfn and owning node | ||
46 | * 3) node_start_pfn - the starting page frame number for a node | ||
47 | * 4) node_end_pfn - the ending page frame number for a node | ||
48 | */ | ||
49 | |||
50 | /* | ||
51 | * physnode_map keeps track of the physical memory layout of a generic | ||
52 | * numa node on a 256MB break (each element of the array will | ||
53 | * represent 256MB of memory and will be marked by the node id. So, | ||
54 | * if the first gig is on node 0, and the second gig is on node 1 | ||
55 | * physnode_map will contain: | ||
56 | * | ||
57 | * physnode_map[0-3] = 0; | ||
58 | * physnode_map[4-7] = 1; | ||
59 | * physnode_map[8- ] = -1; | ||
60 | */ | ||
61 | s8 physnode_map[MAX_ELEMENTS] = { [0 ... (MAX_ELEMENTS - 1)] = -1}; | ||
62 | |||
63 | void memory_present(int nid, unsigned long start, unsigned long end) | ||
64 | { | ||
65 | unsigned long pfn; | ||
66 | |||
67 | printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n", | ||
68 | nid, start, end); | ||
69 | printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); | ||
70 | printk(KERN_DEBUG " "); | ||
71 | for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { | ||
72 | physnode_map[pfn / PAGES_PER_ELEMENT] = nid; | ||
73 | printk("%ld ", pfn); | ||
74 | } | ||
75 | printk("\n"); | ||
76 | } | ||
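For illustration, the consumer side of the physnode_map table that memory_present() fills in above reduces to a single array lookup. This is only a sketch of that lookup under the 256 MB-per-element layout described in the comment; the real helper of this era lives in the i386 mmzone headers, not in this file.

/* Sketch: resolve a pfn to its owning node; -1 means no node claims it. */
static inline int sketch_pfn_to_nid(unsigned long pfn)
{
        return (int)physnode_map[pfn / PAGES_PER_ELEMENT];
}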
77 | |||
78 | unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, | ||
79 | unsigned long end_pfn) | ||
80 | { | ||
81 | unsigned long nr_pages = end_pfn - start_pfn; | ||
82 | |||
83 | if (!nr_pages) | ||
84 | return 0; | ||
85 | |||
86 | return (nr_pages + 1) * sizeof(struct page); | ||
87 | } | ||
88 | |||
89 | unsigned long node_start_pfn[MAX_NUMNODES]; | ||
90 | unsigned long node_end_pfn[MAX_NUMNODES]; | ||
91 | |||
92 | extern unsigned long find_max_low_pfn(void); | ||
93 | extern void find_max_pfn(void); | ||
94 | extern void one_highpage_init(struct page *, int, int); | ||
95 | |||
96 | extern struct e820map e820; | ||
97 | extern unsigned long init_pg_tables_end; | ||
98 | extern unsigned long highend_pfn, highstart_pfn; | ||
99 | extern unsigned long max_low_pfn; | ||
100 | extern unsigned long totalram_pages; | ||
101 | extern unsigned long totalhigh_pages; | ||
102 | |||
103 | #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) | ||
104 | |||
105 | unsigned long node_remap_start_pfn[MAX_NUMNODES]; | ||
106 | unsigned long node_remap_size[MAX_NUMNODES]; | ||
107 | unsigned long node_remap_offset[MAX_NUMNODES]; | ||
108 | void *node_remap_start_vaddr[MAX_NUMNODES]; | ||
109 | void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); | ||
110 | |||
111 | /* | ||
112 | * FLAT - support for basic PC memory model with discontig enabled, essentially | ||
113 | * a single node with all available processors in it with a flat | ||
114 | * memory map. | ||
115 | */ | ||
116 | int __init get_memcfg_numa_flat(void) | ||
117 | { | ||
118 | printk("NUMA - single node, flat memory mode\n"); | ||
119 | |||
120 | /* Run the memory configuration and find the top of memory. */ | ||
121 | find_max_pfn(); | ||
122 | node_start_pfn[0] = 0; | ||
123 | node_end_pfn[0] = max_pfn; | ||
124 | memory_present(0, 0, max_pfn); | ||
125 | |||
126 | /* Indicate there is one node available. */ | ||
127 | nodes_clear(node_online_map); | ||
128 | node_set_online(0); | ||
129 | return 1; | ||
130 | } | ||
131 | |||
132 | /* | ||
133 | * Find the highest page frame number we have available for the node | ||
134 | */ | ||
135 | static void __init find_max_pfn_node(int nid) | ||
136 | { | ||
137 | if (node_end_pfn[nid] > max_pfn) | ||
138 | node_end_pfn[nid] = max_pfn; | ||
139 | /* | ||
140 | * if a user has given mem=XXXX, then we need to make sure | ||
141 | * that the node _starts_ before that, too, not just ends | ||
142 | */ | ||
143 | if (node_start_pfn[nid] > max_pfn) | ||
144 | node_start_pfn[nid] = max_pfn; | ||
145 | if (node_start_pfn[nid] > node_end_pfn[nid]) | ||
146 | BUG(); | ||
147 | } | ||
148 | |||
149 | /* | ||
150 | * Allocate memory for the pg_data_t for this node via a crude pre-bootmem | ||
151 | * method. For node zero take this from the bottom of memory, for | ||
152 | * subsequent nodes place them at node_remap_start_vaddr which contains | ||
153 | * node local data in physically node local memory. See setup_memory() | ||
154 | * for details. | ||
155 | */ | ||
156 | static void __init allocate_pgdat(int nid) | ||
157 | { | ||
158 | if (nid && node_has_online_mem(nid)) | ||
159 | NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; | ||
160 | else { | ||
161 | NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT)); | ||
162 | min_low_pfn += PFN_UP(sizeof(pg_data_t)); | ||
163 | } | ||
164 | } | ||
165 | |||
166 | void __init remap_numa_kva(void) | ||
167 | { | ||
168 | void *vaddr; | ||
169 | unsigned long pfn; | ||
170 | int node; | ||
171 | |||
172 | for_each_online_node(node) { | ||
173 | if (node == 0) | ||
174 | continue; | ||
175 | for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { | ||
176 | vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT); | ||
177 | set_pmd_pfn((ulong) vaddr, | ||
178 | node_remap_start_pfn[node] + pfn, | ||
179 | PAGE_KERNEL_LARGE); | ||
180 | } | ||
181 | } | ||
182 | } | ||
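As a worked example of the loop above (assuming a non-PAE build, where PTRS_PER_PTE is 1024 and a pmd therefore maps 4 MB): a node with node_remap_size of 2048 pages is covered by exactly two iterations, pfn = 0 and pfn = 1024, i.e. two set_pmd_pfn() calls that install two 4 MB large-page mappings pointing back at the node-local physical pages reserved at the end of that node.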
183 | |||
184 | static unsigned long calculate_numa_remap_pages(void) | ||
185 | { | ||
186 | int nid; | ||
187 | unsigned long size, reserve_pages = 0; | ||
188 | |||
189 | for_each_online_node(nid) { | ||
190 | if (nid == 0) | ||
191 | continue; | ||
192 | if (!node_remap_size[nid]) | ||
193 | continue; | ||
194 | |||
195 | /* | ||
196 | * The acpi/srat node info can show hot-add memory zones | ||
197 | * where memory could be added but not currently present. | ||
198 | */ | ||
199 | if (node_start_pfn[nid] > max_pfn) | ||
200 | continue; | ||
201 | if (node_end_pfn[nid] > max_pfn) | ||
202 | node_end_pfn[nid] = max_pfn; | ||
203 | |||
204 | /* ensure the remap includes space for the pgdat. */ | ||
205 | size = node_remap_size[nid] + sizeof(pg_data_t); | ||
206 | |||
207 | /* convert size to large (pmd size) pages, rounding up */ | ||
208 | size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; | ||
209 | /* now the roundup is correct, convert to PAGE_SIZE pages */ | ||
210 | size = size * PTRS_PER_PTE; | ||
211 | printk("Reserving %ld pages of KVA for lmem_map of node %d\n", | ||
212 | size, nid); | ||
213 | node_remap_size[nid] = size; | ||
214 | reserve_pages += size; | ||
215 | node_remap_offset[nid] = reserve_pages; | ||
216 | printk("Shrinking node %d from %ld pages to %ld pages\n", | ||
217 | nid, node_end_pfn[nid], node_end_pfn[nid] - size); | ||
218 | node_end_pfn[nid] -= size; | ||
219 | node_remap_start_pfn[nid] = node_end_pfn[nid]; | ||
220 | } | ||
221 | printk("Reserving total of %ld pages for numa KVA remap\n", | ||
222 | reserve_pages); | ||
223 | return reserve_pages; | ||
224 | } | ||
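To illustrate the rounding above (again assuming a non-PAE build, so LARGE_PAGE_BYTES is 4 MB and PTRS_PER_PTE is 1024): a raw size of 6 MB plus the pgdat rounds up to 2 pmd-sized chunks, which converts to 2 * 1024 = 2048 ordinary 4 KB pages; that is the amount added to reserve_pages and subtracted from node_end_pfn[nid].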
225 | |||
226 | extern void setup_bootmem_allocator(void); | ||
227 | unsigned long __init setup_memory(void) | ||
228 | { | ||
229 | int nid; | ||
230 | unsigned long system_start_pfn, system_max_low_pfn; | ||
231 | unsigned long reserve_pages; | ||
232 | |||
233 | /* | ||
234 | * When mapping a NUMA machine we allocate the node_mem_map arrays | ||
235 | * from node local memory. They are then mapped directly into KVA | ||
236 | * between zone normal and vmalloc space. Calculate the size of | ||
237 | * this space and use it to adjust the boundary between ZONE_NORMAL | ||
238 | * and ZONE_HIGHMEM. | ||
239 | */ | ||
240 | find_max_pfn(); | ||
241 | get_memcfg_numa(); | ||
242 | |||
243 | reserve_pages = calculate_numa_remap_pages(); | ||
244 | |||
245 | /* partially used pages are not usable - thus round upwards */ | ||
246 | system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); | ||
247 | |||
248 | system_max_low_pfn = max_low_pfn = find_max_low_pfn() - reserve_pages; | ||
249 | printk("reserve_pages = %ld find_max_low_pfn() ~ %ld\n", | ||
250 | reserve_pages, max_low_pfn + reserve_pages); | ||
251 | printk("max_pfn = %ld\n", max_pfn); | ||
252 | #ifdef CONFIG_HIGHMEM | ||
253 | highstart_pfn = highend_pfn = max_pfn; | ||
254 | if (max_pfn > system_max_low_pfn) | ||
255 | highstart_pfn = system_max_low_pfn; | ||
256 | printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", | ||
257 | pages_to_mb(highend_pfn - highstart_pfn)); | ||
258 | #endif | ||
259 | printk(KERN_NOTICE "%ldMB LOWMEM available.\n", | ||
260 | pages_to_mb(system_max_low_pfn)); | ||
261 | printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", | ||
262 | min_low_pfn, max_low_pfn, highstart_pfn); | ||
263 | |||
264 | printk("Low memory ends at vaddr %08lx\n", | ||
265 | (ulong) pfn_to_kaddr(max_low_pfn)); | ||
266 | for_each_online_node(nid) { | ||
267 | node_remap_start_vaddr[nid] = pfn_to_kaddr( | ||
268 | (highstart_pfn + reserve_pages) - node_remap_offset[nid]); | ||
269 | allocate_pgdat(nid); | ||
270 | printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, | ||
271 | (ulong) node_remap_start_vaddr[nid], | ||
272 | (ulong) pfn_to_kaddr(highstart_pfn + reserve_pages | ||
273 | - node_remap_offset[nid] + node_remap_size[nid])); | ||
274 | } | ||
275 | printk("High memory starts at vaddr %08lx\n", | ||
276 | (ulong) pfn_to_kaddr(highstart_pfn)); | ||
277 | vmalloc_earlyreserve = reserve_pages * PAGE_SIZE; | ||
278 | for_each_online_node(nid) | ||
279 | find_max_pfn_node(nid); | ||
280 | |||
281 | memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); | ||
282 | NODE_DATA(0)->bdata = &node0_bdata; | ||
283 | setup_bootmem_allocator(); | ||
284 | return max_low_pfn; | ||
285 | } | ||
286 | |||
287 | void __init zone_sizes_init(void) | ||
288 | { | ||
289 | int nid; | ||
290 | |||
291 | /* | ||
292 | * Insert nodes into pgdat_list backward so they appear in order. | ||
293 | * Clobber node 0's links and NULL out pgdat_list before starting. | ||
294 | */ | ||
295 | pgdat_list = NULL; | ||
296 | for (nid = MAX_NUMNODES - 1; nid >= 0; nid--) { | ||
297 | if (!node_online(nid)) | ||
298 | continue; | ||
299 | NODE_DATA(nid)->pgdat_next = pgdat_list; | ||
300 | pgdat_list = NODE_DATA(nid); | ||
301 | } | ||
302 | |||
303 | for_each_online_node(nid) { | ||
304 | unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; | ||
305 | unsigned long *zholes_size; | ||
306 | unsigned int max_dma; | ||
307 | |||
308 | unsigned long low = max_low_pfn; | ||
309 | unsigned long start = node_start_pfn[nid]; | ||
310 | unsigned long high = node_end_pfn[nid]; | ||
311 | |||
312 | max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; | ||
313 | |||
314 | if (node_has_online_mem(nid)){ | ||
315 | if (start > low) { | ||
316 | #ifdef CONFIG_HIGHMEM | ||
317 | BUG_ON(start > high); | ||
318 | zones_size[ZONE_HIGHMEM] = high - start; | ||
319 | #endif | ||
320 | } else { | ||
321 | if (low < max_dma) | ||
322 | zones_size[ZONE_DMA] = low; | ||
323 | else { | ||
324 | BUG_ON(max_dma > low); | ||
325 | BUG_ON(low > high); | ||
326 | zones_size[ZONE_DMA] = max_dma; | ||
327 | zones_size[ZONE_NORMAL] = low - max_dma; | ||
328 | #ifdef CONFIG_HIGHMEM | ||
329 | zones_size[ZONE_HIGHMEM] = high - low; | ||
330 | #endif | ||
331 | } | ||
332 | } | ||
333 | } | ||
334 | |||
335 | zholes_size = get_zholes_size(nid); | ||
336 | /* | ||
337 | * We let the lmem_map for node 0 be allocated from the | ||
338 | * normal bootmem allocator, but other nodes come from the | ||
339 | * remapped KVA area - mbligh | ||
340 | */ | ||
341 | if (!nid) | ||
342 | free_area_init_node(nid, NODE_DATA(nid), | ||
343 | zones_size, start, zholes_size); | ||
344 | else { | ||
345 | unsigned long lmem_map; | ||
346 | lmem_map = (unsigned long)node_remap_start_vaddr[nid]; | ||
347 | lmem_map += sizeof(pg_data_t) + PAGE_SIZE - 1; | ||
348 | lmem_map &= PAGE_MASK; | ||
349 | NODE_DATA(nid)->node_mem_map = (struct page *)lmem_map; | ||
350 | free_area_init_node(nid, NODE_DATA(nid), zones_size, | ||
351 | start, zholes_size); | ||
352 | } | ||
353 | } | ||
354 | return; | ||
355 | } | ||
356 | |||
357 | void __init set_highmem_pages_init(int bad_ppro) | ||
358 | { | ||
359 | #ifdef CONFIG_HIGHMEM | ||
360 | struct zone *zone; | ||
361 | |||
362 | for_each_zone(zone) { | ||
363 | unsigned long node_pfn, node_high_size, zone_start_pfn; | ||
364 | struct page * zone_mem_map; | ||
365 | |||
366 | if (!is_highmem(zone)) | ||
367 | continue; | ||
368 | |||
369 | printk("Initializing %s for node %d\n", zone->name, | ||
370 | zone->zone_pgdat->node_id); | ||
371 | |||
372 | node_high_size = zone->spanned_pages; | ||
373 | zone_mem_map = zone->zone_mem_map; | ||
374 | zone_start_pfn = zone->zone_start_pfn; | ||
375 | |||
376 | for (node_pfn = 0; node_pfn < node_high_size; node_pfn++) { | ||
377 | one_highpage_init((struct page *)(zone_mem_map + node_pfn), | ||
378 | zone_start_pfn + node_pfn, bad_ppro); | ||
379 | } | ||
380 | } | ||
381 | totalram_pages += totalhigh_pages; | ||
382 | #endif | ||
383 | } | ||
diff --git a/arch/i386/mm/extable.c b/arch/i386/mm/extable.c
new file mode 100644
index 000000000000..f706449319c4
--- /dev/null
+++ b/arch/i386/mm/extable.c
@@ -0,0 +1,36 @@
1 | /* | ||
2 | * linux/arch/i386/mm/extable.c | ||
3 | */ | ||
4 | |||
5 | #include <linux/config.h> | ||
6 | #include <linux/module.h> | ||
7 | #include <linux/spinlock.h> | ||
8 | #include <asm/uaccess.h> | ||
9 | |||
10 | int fixup_exception(struct pt_regs *regs) | ||
11 | { | ||
12 | const struct exception_table_entry *fixup; | ||
13 | |||
14 | #ifdef CONFIG_PNPBIOS | ||
15 | if (unlikely((regs->xcs & ~15) == (GDT_ENTRY_PNPBIOS_BASE << 3))) | ||
16 | { | ||
17 | extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; | ||
18 | extern u32 pnp_bios_is_utter_crap; | ||
19 | pnp_bios_is_utter_crap = 1; | ||
20 | printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n"); | ||
21 | __asm__ volatile( | ||
22 | "movl %0, %%esp\n\t" | ||
23 | "jmp *%1\n\t" | ||
24 | : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip)); | ||
25 | panic("do_trap: can't hit this"); | ||
26 | } | ||
27 | #endif | ||
28 | |||
29 | fixup = search_exception_tables(regs->eip); | ||
30 | if (fixup) { | ||
31 | regs->eip = fixup->fixup; | ||
32 | return 1; | ||
33 | } | ||
34 | |||
35 | return 0; | ||
36 | } | ||
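For context, fixup_exception() above relies on records like the following, emitted into the __ex_table section by the user-access macros elsewhere in the tree; this struct is only a sketch of that layout (the real definition of this era lives in the i386 uaccess header), not code from this commit.

/* Sketch of an exception-table record: the address of an instruction that
 * may fault paired with the address execution should resume at. The table
 * is sorted by insn so search_exception_tables() can binary-search it. */
struct sketch_exception_table_entry {
        unsigned long insn;     /* potentially faulting instruction */
        unsigned long fixup;    /* value loaded into regs->eip on a fault */
};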
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
new file mode 100644
index 000000000000..a509237c4815
--- /dev/null
+++ b/arch/i386/mm/fault.c
@@ -0,0 +1,552 @@
1 | /* | ||
2 | * linux/arch/i386/mm/fault.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | */ | ||
6 | |||
7 | #include <linux/signal.h> | ||
8 | #include <linux/sched.h> | ||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/errno.h> | ||
11 | #include <linux/string.h> | ||
12 | #include <linux/types.h> | ||
13 | #include <linux/ptrace.h> | ||
14 | #include <linux/mman.h> | ||
15 | #include <linux/mm.h> | ||
16 | #include <linux/smp.h> | ||
17 | #include <linux/smp_lock.h> | ||
18 | #include <linux/interrupt.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/tty.h> | ||
21 | #include <linux/vt_kern.h> /* For unblank_screen() */ | ||
22 | #include <linux/highmem.h> | ||
23 | #include <linux/module.h> | ||
24 | |||
25 | #include <asm/system.h> | ||
26 | #include <asm/uaccess.h> | ||
27 | #include <asm/desc.h> | ||
28 | #include <asm/kdebug.h> | ||
29 | |||
30 | extern void die(const char *,struct pt_regs *,long); | ||
31 | |||
32 | /* | ||
33 | * Unlock any spinlocks which will prevent us from getting the | ||
34 | * message out | ||
35 | */ | ||
36 | void bust_spinlocks(int yes) | ||
37 | { | ||
38 | int loglevel_save = console_loglevel; | ||
39 | |||
40 | if (yes) { | ||
41 | oops_in_progress = 1; | ||
42 | return; | ||
43 | } | ||
44 | #ifdef CONFIG_VT | ||
45 | unblank_screen(); | ||
46 | #endif | ||
47 | oops_in_progress = 0; | ||
48 | /* | ||
49 | * OK, the message is on the console. Now we call printk() | ||
50 | * without oops_in_progress set so that printk will give klogd | ||
51 | * a poke. Hold onto your hats... | ||
52 | */ | ||
53 | console_loglevel = 15; /* NMI oopser may have shut the console up */ | ||
54 | printk(" "); | ||
55 | console_loglevel = loglevel_save; | ||
56 | } | ||
57 | |||
58 | /* | ||
59 | * Return EIP plus the CS segment base. The segment limit is also | ||
60 | * adjusted, clamped to the kernel/user address space (whichever is | ||
61 | * appropriate), and returned in *eip_limit. | ||
62 | * | ||
63 | * The segment is checked, because it might have been changed by another | ||
64 | * task between the original faulting instruction and here. | ||
65 | * | ||
66 | * If CS is no longer a valid code segment, or if EIP is beyond the | ||
67 | * limit, or if it is a kernel address when CS is not a kernel segment, | ||
68 | * then the returned value will be greater than *eip_limit. | ||
69 | * | ||
70 | * This is slow, but is very rarely executed. | ||
71 | */ | ||
72 | static inline unsigned long get_segment_eip(struct pt_regs *regs, | ||
73 | unsigned long *eip_limit) | ||
74 | { | ||
75 | unsigned long eip = regs->eip; | ||
76 | unsigned seg = regs->xcs & 0xffff; | ||
77 | u32 seg_ar, seg_limit, base, *desc; | ||
78 | |||
79 | /* The standard kernel/user address space limit. */ | ||
80 | *eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg; | ||
81 | |||
82 | /* Unlikely, but must come before segment checks. */ | ||
83 | if (unlikely((regs->eflags & VM_MASK) != 0)) | ||
84 | return eip + (seg << 4); | ||
85 | |||
86 | /* By far the most common cases. */ | ||
87 | if (likely(seg == __USER_CS || seg == __KERNEL_CS)) | ||
88 | return eip; | ||
89 | |||
90 | /* Check the segment exists, is within the current LDT/GDT size, | ||
91 | that kernel/user (ring 0..3) has the appropriate privilege, | ||
92 | that it's a code segment, and get the limit. */ | ||
93 | __asm__ ("larl %3,%0; lsll %3,%1" | ||
94 | : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg)); | ||
95 | if ((~seg_ar & 0x9800) || eip > seg_limit) { | ||
96 | *eip_limit = 0; | ||
97 | return 1; /* So that returned eip > *eip_limit. */ | ||
98 | } | ||
99 | |||
100 | /* Get the GDT/LDT descriptor base. | ||
101 | When you look for races in this code remember that | ||
102 | LDT and other horrors are only used in user space. */ | ||
103 | if (seg & (1<<2)) { | ||
104 | /* Must lock the LDT while reading it. */ | ||
105 | down(¤t->mm->context.sem); | ||
106 | desc = current->mm->context.ldt; | ||
107 | desc = (void *)desc + (seg & ~7); | ||
108 | } else { | ||
109 | /* Must disable preemption while reading the GDT. */ | ||
110 | desc = (u32 *)&per_cpu(cpu_gdt_table, get_cpu()); | ||
111 | desc = (void *)desc + (seg & ~7); | ||
112 | } | ||
113 | |||
114 | /* Decode the code segment base from the descriptor */ | ||
115 | base = get_desc_base((unsigned long *)desc); | ||
116 | |||
117 | if (seg & (1<<2)) { | ||
118 | up(¤t->mm->context.sem); | ||
119 | } else | ||
120 | put_cpu(); | ||
121 | |||
122 | /* Adjust EIP and segment limit, and clamp at the kernel limit. | ||
123 | It's legitimate for segments to wrap at 0xffffffff. */ | ||
124 | seg_limit += base; | ||
125 | if (seg_limit < *eip_limit && seg_limit >= base) | ||
126 | *eip_limit = seg_limit; | ||
127 | return eip + base; | ||
128 | } | ||
129 | |||
130 | /* | ||
131 | * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. | ||
132 | * Check that here and ignore it. | ||
133 | */ | ||
134 | static int __is_prefetch(struct pt_regs *regs, unsigned long addr) | ||
135 | { | ||
136 | unsigned long limit; | ||
137 | unsigned long instr = get_segment_eip (regs, &limit); | ||
138 | int scan_more = 1; | ||
139 | int prefetch = 0; | ||
140 | int i; | ||
141 | |||
142 | for (i = 0; scan_more && i < 15; i++) { | ||
143 | unsigned char opcode; | ||
144 | unsigned char instr_hi; | ||
145 | unsigned char instr_lo; | ||
146 | |||
147 | if (instr > limit) | ||
148 | break; | ||
149 | if (__get_user(opcode, (unsigned char *) instr)) | ||
150 | break; | ||
151 | |||
152 | instr_hi = opcode & 0xf0; | ||
153 | instr_lo = opcode & 0x0f; | ||
154 | instr++; | ||
155 | |||
156 | switch (instr_hi) { | ||
157 | case 0x20: | ||
158 | case 0x30: | ||
159 | /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */ | ||
160 | scan_more = ((instr_lo & 7) == 0x6); | ||
161 | break; | ||
162 | |||
163 | case 0x60: | ||
164 | /* 0x64 thru 0x67 are valid prefixes in all modes. */ | ||
165 | scan_more = (instr_lo & 0xC) == 0x4; | ||
166 | break; | ||
167 | case 0xF0: | ||
168 | /* 0xF0, 0xF2, and 0xF3 are valid prefixes */ | ||
169 | scan_more = !instr_lo || (instr_lo>>1) == 1; | ||
170 | break; | ||
171 | case 0x00: | ||
172 | /* Prefetch instruction is 0x0F0D or 0x0F18 */ | ||
173 | scan_more = 0; | ||
174 | if (instr > limit) | ||
175 | break; | ||
176 | if (__get_user(opcode, (unsigned char *) instr)) | ||
177 | break; | ||
178 | prefetch = (instr_lo == 0xF) && | ||
179 | (opcode == 0x0D || opcode == 0x18); | ||
180 | break; | ||
181 | default: | ||
182 | scan_more = 0; | ||
183 | break; | ||
184 | } | ||
185 | } | ||
186 | return prefetch; | ||
187 | } | ||
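As a worked example of the decoder above: the byte sequence 3E 0F 0D 01 (a ds-prefixed 3DNow! prefetch) is accepted, because 0x3E falls into the 0x30 case and keeps scan_more set, and the following 0x0F lands in the 0x00 case where the next byte 0x0D makes prefetch true; an unrelated faulting instruction such as 89 01 (mov %eax,(%ecx)) hits the default case on its first byte and is rejected.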
188 | |||
189 | static inline int is_prefetch(struct pt_regs *regs, unsigned long addr, | ||
190 | unsigned long error_code) | ||
191 | { | ||
192 | if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD && | ||
193 | boot_cpu_data.x86 >= 6)) { | ||
194 | /* Catch an obscure case of prefetch inside an NX page. */ | ||
195 | if (nx_enabled && (error_code & 16)) | ||
196 | return 0; | ||
197 | return __is_prefetch(regs, addr); | ||
198 | } | ||
199 | return 0; | ||
200 | } | ||
201 | |||
202 | fastcall void do_invalid_op(struct pt_regs *, unsigned long); | ||
203 | |||
204 | /* | ||
205 | * This routine handles page faults. It determines the address, | ||
206 | * and the problem, and then passes it off to one of the appropriate | ||
207 | * routines. | ||
208 | * | ||
209 | * error_code: | ||
210 | * bit 0 == 0 means no page found, 1 means protection fault | ||
211 | * bit 1 == 0 means read, 1 means write | ||
212 | * bit 2 == 0 means kernel, 1 means user-mode | ||
213 | */ | ||
214 | fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code) | ||
215 | { | ||
216 | struct task_struct *tsk; | ||
217 | struct mm_struct *mm; | ||
218 | struct vm_area_struct * vma; | ||
219 | unsigned long address; | ||
220 | unsigned long page; | ||
221 | int write; | ||
222 | siginfo_t info; | ||
223 | |||
224 | /* get the address */ | ||
225 | __asm__("movl %%cr2,%0":"=r" (address)); | ||
226 | |||
227 | if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, | ||
228 | SIGSEGV) == NOTIFY_STOP) | ||
229 | return; | ||
230 | /* It's safe to allow irq's after cr2 has been saved */ | ||
231 | if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) | ||
232 | local_irq_enable(); | ||
233 | |||
234 | tsk = current; | ||
235 | |||
236 | info.si_code = SEGV_MAPERR; | ||
237 | |||
238 | /* | ||
239 | * We fault-in kernel-space virtual memory on-demand. The | ||
240 | * 'reference' page table is init_mm.pgd. | ||
241 | * | ||
242 | * NOTE! We MUST NOT take any locks for this case. We may | ||
243 | * be in an interrupt or a critical region, and should | ||
244 | * only copy the information from the master page table, | ||
245 | * nothing more. | ||
246 | * | ||
247 | * This verifies that the fault happens in kernel space | ||
248 | * (error_code & 4) == 0, and that the fault was not a | ||
249 | * protection error (error_code & 1) == 0. | ||
250 | */ | ||
251 | if (unlikely(address >= TASK_SIZE)) { | ||
252 | if (!(error_code & 5)) | ||
253 | goto vmalloc_fault; | ||
254 | /* | ||
255 | * Don't take the mm semaphore here. If we fixup a prefetch | ||
256 | * fault we could otherwise deadlock. | ||
257 | */ | ||
258 | goto bad_area_nosemaphore; | ||
259 | } | ||
260 | |||
261 | mm = tsk->mm; | ||
262 | |||
263 | /* | ||
264 | * If we're in an interrupt, have no user context or are running in an | ||
265 | * atomic region then we must not take the fault.. | ||
266 | */ | ||
267 | if (in_atomic() || !mm) | ||
268 | goto bad_area_nosemaphore; | ||
269 | |||
270 | /* When running in the kernel we expect faults to occur only to | ||
271 | * addresses in user space. All other faults represent errors in the | ||
272 | * kernel and should generate an OOPS. Unfortunately, in the case of an | ||
273 | * erroneous fault occurring in a code path which already holds mmap_sem | ||
274 | * we will deadlock attempting to validate the fault against the | ||
275 | * address space. Luckily the kernel only validly references user | ||
276 | * space from well defined areas of code, which are listed in the | ||
277 | * exceptions table. | ||
278 | * | ||
279 | * As the vast majority of faults will be valid we will only perform | ||
280 | * the source reference check when there is a possibility of a deadlock. | ||
281 | * Attempt to lock the address space, if we cannot we then validate the | ||
282 | * source. If this is invalid we can skip the address space check, | ||
283 | * thus avoiding the deadlock. | ||
284 | */ | ||
285 | if (!down_read_trylock(&mm->mmap_sem)) { | ||
286 | if ((error_code & 4) == 0 && | ||
287 | !search_exception_tables(regs->eip)) | ||
288 | goto bad_area_nosemaphore; | ||
289 | down_read(&mm->mmap_sem); | ||
290 | } | ||
291 | |||
292 | vma = find_vma(mm, address); | ||
293 | if (!vma) | ||
294 | goto bad_area; | ||
295 | if (vma->vm_start <= address) | ||
296 | goto good_area; | ||
297 | if (!(vma->vm_flags & VM_GROWSDOWN)) | ||
298 | goto bad_area; | ||
299 | if (error_code & 4) { | ||
300 | /* | ||
301 | * accessing the stack below %esp is always a bug. | ||
302 | * The "+ 32" is there due to some instructions (like | ||
303 | * pusha) doing post-decrement on the stack and that | ||
304 | * doesn't show up until later.. | ||
305 | */ | ||
306 | if (address + 32 < regs->esp) | ||
307 | goto bad_area; | ||
308 | } | ||
309 | if (expand_stack(vma, address)) | ||
310 | goto bad_area; | ||
311 | /* | ||
312 | * Ok, we have a good vm_area for this memory access, so | ||
313 | * we can handle it.. | ||
314 | */ | ||
315 | good_area: | ||
316 | info.si_code = SEGV_ACCERR; | ||
317 | write = 0; | ||
318 | switch (error_code & 3) { | ||
319 | default: /* 3: write, present */ | ||
320 | #ifdef TEST_VERIFY_AREA | ||
321 | if (regs->cs == KERNEL_CS) | ||
322 | printk("WP fault at %08lx\n", regs->eip); | ||
323 | #endif | ||
324 | /* fall through */ | ||
325 | case 2: /* write, not present */ | ||
326 | if (!(vma->vm_flags & VM_WRITE)) | ||
327 | goto bad_area; | ||
328 | write++; | ||
329 | break; | ||
330 | case 1: /* read, present */ | ||
331 | goto bad_area; | ||
332 | case 0: /* read, not present */ | ||
333 | if (!(vma->vm_flags & (VM_READ | VM_EXEC))) | ||
334 | goto bad_area; | ||
335 | } | ||
336 | |||
337 | survive: | ||
338 | /* | ||
339 | * If for any reason at all we couldn't handle the fault, | ||
340 | * make sure we exit gracefully rather than endlessly redo | ||
341 | * the fault. | ||
342 | */ | ||
343 | switch (handle_mm_fault(mm, vma, address, write)) { | ||
344 | case VM_FAULT_MINOR: | ||
345 | tsk->min_flt++; | ||
346 | break; | ||
347 | case VM_FAULT_MAJOR: | ||
348 | tsk->maj_flt++; | ||
349 | break; | ||
350 | case VM_FAULT_SIGBUS: | ||
351 | goto do_sigbus; | ||
352 | case VM_FAULT_OOM: | ||
353 | goto out_of_memory; | ||
354 | default: | ||
355 | BUG(); | ||
356 | } | ||
357 | |||
358 | /* | ||
359 | * Did it hit the DOS screen memory VA from vm86 mode? | ||
360 | */ | ||
361 | if (regs->eflags & VM_MASK) { | ||
362 | unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; | ||
363 | if (bit < 32) | ||
364 | tsk->thread.screen_bitmap |= 1 << bit; | ||
365 | } | ||
366 | up_read(&mm->mmap_sem); | ||
367 | return; | ||
368 | |||
369 | /* | ||
370 | * Something tried to access memory that isn't in our memory map.. | ||
371 | * Fix it, but check if it's kernel or user first.. | ||
372 | */ | ||
373 | bad_area: | ||
374 | up_read(&mm->mmap_sem); | ||
375 | |||
376 | bad_area_nosemaphore: | ||
377 | /* User mode accesses just cause a SIGSEGV */ | ||
378 | if (error_code & 4) { | ||
379 | /* | ||
380 | * Valid to do another page fault here because this one came | ||
381 | * from user space. | ||
382 | */ | ||
383 | if (is_prefetch(regs, address, error_code)) | ||
384 | return; | ||
385 | |||
386 | tsk->thread.cr2 = address; | ||
387 | /* Kernel addresses are always protection faults */ | ||
388 | tsk->thread.error_code = error_code | (address >= TASK_SIZE); | ||
389 | tsk->thread.trap_no = 14; | ||
390 | info.si_signo = SIGSEGV; | ||
391 | info.si_errno = 0; | ||
392 | /* info.si_code has been set above */ | ||
393 | info.si_addr = (void __user *)address; | ||
394 | force_sig_info(SIGSEGV, &info, tsk); | ||
395 | return; | ||
396 | } | ||
397 | |||
398 | #ifdef CONFIG_X86_F00F_BUG | ||
399 | /* | ||
400 | * Pentium F0 0F C7 C8 bug workaround. | ||
401 | */ | ||
402 | if (boot_cpu_data.f00f_bug) { | ||
403 | unsigned long nr; | ||
404 | |||
405 | nr = (address - idt_descr.address) >> 3; | ||
406 | |||
407 | if (nr == 6) { | ||
408 | do_invalid_op(regs, 0); | ||
409 | return; | ||
410 | } | ||
411 | } | ||
412 | #endif | ||
413 | |||
414 | no_context: | ||
415 | /* Are we prepared to handle this kernel fault? */ | ||
416 | if (fixup_exception(regs)) | ||
417 | return; | ||
418 | |||
419 | /* | ||
420 | * Valid to do another page fault here, because if this fault | ||
421 | * had been triggered by is_prefetch fixup_exception would have | ||
422 | * handled it. | ||
423 | */ | ||
424 | if (is_prefetch(regs, address, error_code)) | ||
425 | return; | ||
426 | |||
427 | /* | ||
428 | * Oops. The kernel tried to access some bad page. We'll have to | ||
429 | * terminate things with extreme prejudice. | ||
430 | */ | ||
431 | |||
432 | bust_spinlocks(1); | ||
433 | |||
434 | #ifdef CONFIG_X86_PAE | ||
435 | if (error_code & 16) { | ||
436 | pte_t *pte = lookup_address(address); | ||
437 | |||
438 | if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) | ||
439 | printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid); | ||
440 | } | ||
441 | #endif | ||
442 | if (address < PAGE_SIZE) | ||
443 | printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); | ||
444 | else | ||
445 | printk(KERN_ALERT "Unable to handle kernel paging request"); | ||
446 | printk(" at virtual address %08lx\n",address); | ||
447 | printk(KERN_ALERT " printing eip:\n"); | ||
448 | printk("%08lx\n", regs->eip); | ||
449 | asm("movl %%cr3,%0":"=r" (page)); | ||
450 | page = ((unsigned long *) __va(page))[address >> 22]; | ||
451 | printk(KERN_ALERT "*pde = %08lx\n", page); | ||
452 | /* | ||
453 | * We must not directly access the pte in the highpte | ||
454 | * case, the page table might be allocated in highmem. | ||
456 | * And let's rather not kmap-atomic the pte, just in case | ||
456 | * it's allocated already. | ||
457 | */ | ||
458 | #ifndef CONFIG_HIGHPTE | ||
459 | if (page & 1) { | ||
460 | page &= PAGE_MASK; | ||
461 | address &= 0x003ff000; | ||
462 | page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; | ||
463 | printk(KERN_ALERT "*pte = %08lx\n", page); | ||
464 | } | ||
465 | #endif | ||
466 | die("Oops", regs, error_code); | ||
467 | bust_spinlocks(0); | ||
468 | do_exit(SIGKILL); | ||
469 | |||
470 | /* | ||
471 | * We ran out of memory, or some other thing happened to us that made | ||
472 | * us unable to handle the page fault gracefully. | ||
473 | */ | ||
474 | out_of_memory: | ||
475 | up_read(&mm->mmap_sem); | ||
476 | if (tsk->pid == 1) { | ||
477 | yield(); | ||
478 | down_read(&mm->mmap_sem); | ||
479 | goto survive; | ||
480 | } | ||
481 | printk("VM: killing process %s\n", tsk->comm); | ||
482 | if (error_code & 4) | ||
483 | do_exit(SIGKILL); | ||
484 | goto no_context; | ||
485 | |||
486 | do_sigbus: | ||
487 | up_read(&mm->mmap_sem); | ||
488 | |||
489 | /* Kernel mode? Handle exceptions or die */ | ||
490 | if (!(error_code & 4)) | ||
491 | goto no_context; | ||
492 | |||
493 | /* User space => ok to do another page fault */ | ||
494 | if (is_prefetch(regs, address, error_code)) | ||
495 | return; | ||
496 | |||
497 | tsk->thread.cr2 = address; | ||
498 | tsk->thread.error_code = error_code; | ||
499 | tsk->thread.trap_no = 14; | ||
500 | info.si_signo = SIGBUS; | ||
501 | info.si_errno = 0; | ||
502 | info.si_code = BUS_ADRERR; | ||
503 | info.si_addr = (void __user *)address; | ||
504 | force_sig_info(SIGBUS, &info, tsk); | ||
505 | return; | ||
506 | |||
507 | vmalloc_fault: | ||
508 | { | ||
509 | /* | ||
510 | * Synchronize this task's top level page-table | ||
511 | * with the 'reference' page table. | ||
512 | * | ||
513 | * Do _not_ use "tsk" here. We might be inside | ||
514 | * an interrupt in the middle of a task switch.. | ||
515 | */ | ||
516 | int index = pgd_index(address); | ||
517 | unsigned long pgd_paddr; | ||
518 | pgd_t *pgd, *pgd_k; | ||
519 | pud_t *pud, *pud_k; | ||
520 | pmd_t *pmd, *pmd_k; | ||
521 | pte_t *pte_k; | ||
522 | |||
523 | asm("movl %%cr3,%0":"=r" (pgd_paddr)); | ||
524 | pgd = index + (pgd_t *)__va(pgd_paddr); | ||
525 | pgd_k = init_mm.pgd + index; | ||
526 | |||
527 | if (!pgd_present(*pgd_k)) | ||
528 | goto no_context; | ||
529 | |||
530 | /* | ||
531 | * set_pgd(pgd, *pgd_k); here would be useless on PAE | ||
532 | * and redundant with the set_pmd() on non-PAE. As would | ||
533 | * set_pud. | ||
534 | */ | ||
535 | |||
536 | pud = pud_offset(pgd, address); | ||
537 | pud_k = pud_offset(pgd_k, address); | ||
538 | if (!pud_present(*pud_k)) | ||
539 | goto no_context; | ||
540 | |||
541 | pmd = pmd_offset(pud, address); | ||
542 | pmd_k = pmd_offset(pud_k, address); | ||
543 | if (!pmd_present(*pmd_k)) | ||
544 | goto no_context; | ||
545 | set_pmd(pmd, *pmd_k); | ||
546 | |||
547 | pte_k = pte_offset_kernel(pmd_k, address); | ||
548 | if (!pte_present(*pte_k)) | ||
549 | goto no_context; | ||
550 | return; | ||
551 | } | ||
552 | } | ||
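As a quick reference for the raw error_code bits tested throughout do_page_fault() above (and documented in the comment before it), the following flag names are purely illustrative, not definitions from this commit.

/* Illustrative names only; the code above tests the raw bit values. */
#define SKETCH_PF_PROT   0x01   /* 0: page not present, 1: protection fault */
#define SKETCH_PF_WRITE  0x02   /* 0: read access,      1: write access */
#define SKETCH_PF_USER   0x04   /* 0: kernel mode,      1: user mode */
#define SKETCH_PF_INSN   0x10   /* instruction fetch; checked with nx_enabled */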
diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c
new file mode 100644
index 000000000000..fc4c4cad4e98
--- /dev/null
+++ b/arch/i386/mm/highmem.c
@@ -0,0 +1,89 @@
1 | #include <linux/highmem.h> | ||
2 | |||
3 | void *kmap(struct page *page) | ||
4 | { | ||
5 | might_sleep(); | ||
6 | if (!PageHighMem(page)) | ||
7 | return page_address(page); | ||
8 | return kmap_high(page); | ||
9 | } | ||
10 | |||
11 | void kunmap(struct page *page) | ||
12 | { | ||
13 | if (in_interrupt()) | ||
14 | BUG(); | ||
15 | if (!PageHighMem(page)) | ||
16 | return; | ||
17 | kunmap_high(page); | ||
18 | } | ||
19 | |||
20 | /* | ||
21 | * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because | ||
22 | * no global lock is needed and because the kmap code must perform a global TLB | ||
23 | * invalidation when the kmap pool wraps. | ||
24 | * | ||
25 | * However, when holding an atomic kmap it is not legal to sleep, so atomic | ||
26 | * kmaps are appropriate for short, tight code paths only. | ||
27 | */ | ||
28 | void *kmap_atomic(struct page *page, enum km_type type) | ||
29 | { | ||
30 | enum fixed_addresses idx; | ||
31 | unsigned long vaddr; | ||
32 | |||
33 | /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ | ||
34 | inc_preempt_count(); | ||
35 | if (!PageHighMem(page)) | ||
36 | return page_address(page); | ||
37 | |||
38 | idx = type + KM_TYPE_NR*smp_processor_id(); | ||
39 | vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); | ||
40 | #ifdef CONFIG_DEBUG_HIGHMEM | ||
41 | if (!pte_none(*(kmap_pte-idx))) | ||
42 | BUG(); | ||
43 | #endif | ||
44 | set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); | ||
45 | __flush_tlb_one(vaddr); | ||
46 | |||
47 | return (void*) vaddr; | ||
48 | } | ||
49 | |||
50 | void kunmap_atomic(void *kvaddr, enum km_type type) | ||
51 | { | ||
52 | #ifdef CONFIG_DEBUG_HIGHMEM | ||
53 | unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; | ||
54 | enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); | ||
55 | |||
56 | if (vaddr < FIXADDR_START) { // FIXME | ||
57 | dec_preempt_count(); | ||
58 | preempt_check_resched(); | ||
59 | return; | ||
60 | } | ||
61 | |||
62 | if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) | ||
63 | BUG(); | ||
64 | |||
65 | /* | ||
66 | * force other mappings to Oops if they'll try to access | ||
67 | * this pte without first remap it | ||
68 | */ | ||
69 | pte_clear(&init_mm, vaddr, kmap_pte-idx); | ||
70 | __flush_tlb_one(vaddr); | ||
71 | #endif | ||
72 | |||
73 | dec_preempt_count(); | ||
74 | preempt_check_resched(); | ||
75 | } | ||
76 | |||
77 | struct page *kmap_atomic_to_page(void *ptr) | ||
78 | { | ||
79 | unsigned long idx, vaddr = (unsigned long)ptr; | ||
80 | pte_t *pte; | ||
81 | |||
82 | if (vaddr < FIXADDR_START) | ||
83 | return virt_to_page(ptr); | ||
84 | |||
85 | idx = virt_to_fix(vaddr); | ||
86 | pte = kmap_pte - (idx - FIX_KMAP_BEGIN); | ||
87 | return pte_page(*pte); | ||
88 | } | ||
89 | |||
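To illustrate the short, non-sleeping usage the kmap_atomic() comment above calls for, here is a minimal caller sketch; the function name is hypothetical, and it assumes the generic KM_USER0 slot is free in the calling context.

#include <linux/highmem.h>
#include <linux/string.h>

/* Hypothetical caller: copy a (possibly highmem) page without sleeping. */
static void sketch_copy_from_page(void *dst, struct page *page)
{
        char *src = kmap_atomic(page, KM_USER0);

        memcpy(dst, src, PAGE_SIZE);
        kunmap_atomic(src, KM_USER0);   /* must pair with the same km_type */
}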
diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
new file mode 100644
index 000000000000..a8c45143088b
--- /dev/null
+++ b/arch/i386/mm/hugetlbpage.c
@@ -0,0 +1,431 @@
1 | /* | ||
2 | * IA-32 Huge TLB Page Support for Kernel. | ||
3 | * | ||
4 | * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> | ||
5 | */ | ||
6 | |||
7 | #include <linux/config.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/fs.h> | ||
10 | #include <linux/mm.h> | ||
11 | #include <linux/hugetlb.h> | ||
12 | #include <linux/pagemap.h> | ||
13 | #include <linux/smp_lock.h> | ||
14 | #include <linux/slab.h> | ||
15 | #include <linux/err.h> | ||
16 | #include <linux/sysctl.h> | ||
17 | #include <asm/mman.h> | ||
18 | #include <asm/tlb.h> | ||
19 | #include <asm/tlbflush.h> | ||
20 | |||
21 | static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) | ||
22 | { | ||
23 | pgd_t *pgd; | ||
24 | pud_t *pud; | ||
25 | pmd_t *pmd = NULL; | ||
26 | |||
27 | pgd = pgd_offset(mm, addr); | ||
28 | pud = pud_alloc(mm, pgd, addr); | ||
29 | pmd = pmd_alloc(mm, pud, addr); | ||
30 | return (pte_t *) pmd; | ||
31 | } | ||
32 | |||
33 | static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | ||
34 | { | ||
35 | pgd_t *pgd; | ||
36 | pud_t *pud; | ||
37 | pmd_t *pmd = NULL; | ||
38 | |||
39 | pgd = pgd_offset(mm, addr); | ||
40 | pud = pud_offset(pgd, addr); | ||
41 | pmd = pmd_offset(pud, addr); | ||
42 | return (pte_t *) pmd; | ||
43 | } | ||
44 | |||
45 | static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, pte_t * page_table, int write_access) | ||
46 | { | ||
47 | pte_t entry; | ||
48 | |||
49 | add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); | ||
50 | if (write_access) { | ||
51 | entry = | ||
52 | pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); | ||
53 | } else | ||
54 | entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); | ||
55 | entry = pte_mkyoung(entry); | ||
56 | mk_pte_huge(entry); | ||
57 | set_pte(page_table, entry); | ||
58 | } | ||
59 | |||
60 | /* | ||
61 | * This function checks for proper alignment of input addr and len parameters. | ||
62 | */ | ||
63 | int is_aligned_hugepage_range(unsigned long addr, unsigned long len) | ||
64 | { | ||
65 | if (len & ~HPAGE_MASK) | ||
66 | return -EINVAL; | ||
67 | if (addr & ~HPAGE_MASK) | ||
68 | return -EINVAL; | ||
69 | return 0; | ||
70 | } | ||
71 | |||
72 | int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | ||
73 | struct vm_area_struct *vma) | ||
74 | { | ||
75 | pte_t *src_pte, *dst_pte, entry; | ||
76 | struct page *ptepage; | ||
77 | unsigned long addr = vma->vm_start; | ||
78 | unsigned long end = vma->vm_end; | ||
79 | |||
80 | while (addr < end) { | ||
81 | dst_pte = huge_pte_alloc(dst, addr); | ||
82 | if (!dst_pte) | ||
83 | goto nomem; | ||
84 | src_pte = huge_pte_offset(src, addr); | ||
85 | entry = *src_pte; | ||
86 | ptepage = pte_page(entry); | ||
87 | get_page(ptepage); | ||
88 | set_pte(dst_pte, entry); | ||
89 | add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); | ||
90 | addr += HPAGE_SIZE; | ||
91 | } | ||
92 | return 0; | ||
93 | |||
94 | nomem: | ||
95 | return -ENOMEM; | ||
96 | } | ||
97 | |||
98 | int | ||
99 | follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
100 | struct page **pages, struct vm_area_struct **vmas, | ||
101 | unsigned long *position, int *length, int i) | ||
102 | { | ||
103 | unsigned long vpfn, vaddr = *position; | ||
104 | int remainder = *length; | ||
105 | |||
106 | WARN_ON(!is_vm_hugetlb_page(vma)); | ||
107 | |||
108 | vpfn = vaddr/PAGE_SIZE; | ||
109 | while (vaddr < vma->vm_end && remainder) { | ||
110 | |||
111 | if (pages) { | ||
112 | pte_t *pte; | ||
113 | struct page *page; | ||
114 | |||
115 | pte = huge_pte_offset(mm, vaddr); | ||
116 | |||
117 | /* hugetlb should be locked, and hence, prefaulted */ | ||
118 | WARN_ON(!pte || pte_none(*pte)); | ||
119 | |||
120 | page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; | ||
121 | |||
122 | WARN_ON(!PageCompound(page)); | ||
123 | |||
124 | get_page(page); | ||
125 | pages[i] = page; | ||
126 | } | ||
127 | |||
128 | if (vmas) | ||
129 | vmas[i] = vma; | ||
130 | |||
131 | vaddr += PAGE_SIZE; | ||
132 | ++vpfn; | ||
133 | --remainder; | ||
134 | ++i; | ||
135 | } | ||
136 | |||
137 | *length = remainder; | ||
138 | *position = vaddr; | ||
139 | |||
140 | return i; | ||
141 | } | ||
142 | |||
143 | #if 0 /* This is just for testing */ | ||
144 | struct page * | ||
145 | follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) | ||
146 | { | ||
147 | unsigned long start = address; | ||
148 | int length = 1; | ||
149 | int nr; | ||
150 | struct page *page; | ||
151 | struct vm_area_struct *vma; | ||
152 | |||
153 | vma = find_vma(mm, addr); | ||
154 | if (!vma || !is_vm_hugetlb_page(vma)) | ||
155 | return ERR_PTR(-EINVAL); | ||
156 | |||
157 | pte = huge_pte_offset(mm, address); | ||
158 | |||
159 | /* hugetlb should be locked, and hence, prefaulted */ | ||
160 | WARN_ON(!pte || pte_none(*pte)); | ||
161 | |||
162 | page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; | ||
163 | |||
164 | WARN_ON(!PageCompound(page)); | ||
165 | |||
166 | return page; | ||
167 | } | ||
168 | |||
169 | int pmd_huge(pmd_t pmd) | ||
170 | { | ||
171 | return 0; | ||
172 | } | ||
173 | |||
174 | struct page * | ||
175 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, | ||
176 | pmd_t *pmd, int write) | ||
177 | { | ||
178 | return NULL; | ||
179 | } | ||
180 | |||
181 | #else | ||
182 | |||
183 | struct page * | ||
184 | follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) | ||
185 | { | ||
186 | return ERR_PTR(-EINVAL); | ||
187 | } | ||
188 | |||
189 | int pmd_huge(pmd_t pmd) | ||
190 | { | ||
191 | return !!(pmd_val(pmd) & _PAGE_PSE); | ||
192 | } | ||
193 | |||
194 | struct page * | ||
195 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, | ||
196 | pmd_t *pmd, int write) | ||
197 | { | ||
198 | struct page *page; | ||
199 | |||
200 | page = pte_page(*(pte_t *)pmd); | ||
201 | if (page) | ||
202 | page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); | ||
203 | return page; | ||
204 | } | ||
205 | #endif | ||
206 | |||
207 | void unmap_hugepage_range(struct vm_area_struct *vma, | ||
208 | unsigned long start, unsigned long end) | ||
209 | { | ||
210 | struct mm_struct *mm = vma->vm_mm; | ||
211 | unsigned long address; | ||
212 | pte_t pte, *ptep; | ||
213 | struct page *page; | ||
214 | |||
215 | BUG_ON(start & (HPAGE_SIZE - 1)); | ||
216 | BUG_ON(end & (HPAGE_SIZE - 1)); | ||
217 | |||
218 | for (address = start; address < end; address += HPAGE_SIZE) { | ||
219 | ptep = huge_pte_offset(mm, address); | ||
220 | if (!ptep) | ||
221 | continue; | ||
222 | pte = ptep_get_and_clear(mm, address, ptep); | ||
223 | if (pte_none(pte)) | ||
224 | continue; | ||
225 | page = pte_page(pte); | ||
226 | put_page(page); | ||
227 | } | ||
228 | add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT)); | ||
229 | flush_tlb_range(vma, start, end); | ||
230 | } | ||
231 | |||
232 | int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) | ||
233 | { | ||
234 | struct mm_struct *mm = current->mm; | ||
235 | unsigned long addr; | ||
236 | int ret = 0; | ||
237 | |||
238 | BUG_ON(vma->vm_start & ~HPAGE_MASK); | ||
239 | BUG_ON(vma->vm_end & ~HPAGE_MASK); | ||
240 | |||
241 | spin_lock(&mm->page_table_lock); | ||
242 | for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { | ||
243 | unsigned long idx; | ||
244 | pte_t *pte = huge_pte_alloc(mm, addr); | ||
245 | struct page *page; | ||
246 | |||
247 | if (!pte) { | ||
248 | ret = -ENOMEM; | ||
249 | goto out; | ||
250 | } | ||
251 | |||
252 | if (!pte_none(*pte)) { | ||
253 | pmd_t *pmd = (pmd_t *) pte; | ||
254 | |||
255 | page = pmd_page(*pmd); | ||
256 | pmd_clear(pmd); | ||
257 | mm->nr_ptes--; | ||
258 | dec_page_state(nr_page_table_pages); | ||
259 | page_cache_release(page); | ||
260 | } | ||
261 | |||
262 | idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) | ||
263 | + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | ||
264 | page = find_get_page(mapping, idx); | ||
265 | if (!page) { | ||
266 | /* charge the fs quota first */ | ||
267 | if (hugetlb_get_quota(mapping)) { | ||
268 | ret = -ENOMEM; | ||
269 | goto out; | ||
270 | } | ||
271 | page = alloc_huge_page(); | ||
272 | if (!page) { | ||
273 | hugetlb_put_quota(mapping); | ||
274 | ret = -ENOMEM; | ||
275 | goto out; | ||
276 | } | ||
277 | ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); | ||
278 | if (! ret) { | ||
279 | unlock_page(page); | ||
280 | } else { | ||
281 | hugetlb_put_quota(mapping); | ||
282 | free_huge_page(page); | ||
283 | goto out; | ||
284 | } | ||
285 | } | ||
286 | set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); | ||
287 | } | ||
288 | out: | ||
289 | spin_unlock(&mm->page_table_lock); | ||
290 | return ret; | ||
291 | } | ||
292 | |||
293 | /* x86_64 also uses this file */ | ||
294 | |||
295 | #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA | ||
296 | static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, | ||
297 | unsigned long addr, unsigned long len, | ||
298 | unsigned long pgoff, unsigned long flags) | ||
299 | { | ||
300 | struct mm_struct *mm = current->mm; | ||
301 | struct vm_area_struct *vma; | ||
302 | unsigned long start_addr; | ||
303 | |||
304 | start_addr = mm->free_area_cache; | ||
305 | |||
306 | full_search: | ||
307 | addr = ALIGN(start_addr, HPAGE_SIZE); | ||
308 | |||
309 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | ||
310 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
311 | if (TASK_SIZE - len < addr) { | ||
312 | /* | ||
313 | * Start a new search - just in case we missed | ||
314 | * some holes. | ||
315 | */ | ||
316 | if (start_addr != TASK_UNMAPPED_BASE) { | ||
317 | start_addr = TASK_UNMAPPED_BASE; | ||
318 | goto full_search; | ||
319 | } | ||
320 | return -ENOMEM; | ||
321 | } | ||
322 | if (!vma || addr + len <= vma->vm_start) { | ||
323 | mm->free_area_cache = addr + len; | ||
324 | return addr; | ||
325 | } | ||
326 | addr = ALIGN(vma->vm_end, HPAGE_SIZE); | ||
327 | } | ||
328 | } | ||
329 | |||
330 | static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, | ||
331 | unsigned long addr0, unsigned long len, | ||
332 | unsigned long pgoff, unsigned long flags) | ||
333 | { | ||
334 | struct mm_struct *mm = current->mm; | ||
335 | struct vm_area_struct *vma, *prev_vma; | ||
336 | unsigned long base = mm->mmap_base, addr = addr0; | ||
337 | int first_time = 1; | ||
338 | |||
339 | /* don't allow allocations above current base */ | ||
340 | if (mm->free_area_cache > base) | ||
341 | mm->free_area_cache = base; | ||
342 | |||
343 | try_again: | ||
344 | /* make sure it can fit in the remaining address space */ | ||
345 | if (mm->free_area_cache < len) | ||
346 | goto fail; | ||
347 | |||
348 | /* either no address requested or can't fit in requested address hole */ | ||
349 | addr = (mm->free_area_cache - len) & HPAGE_MASK; | ||
350 | do { | ||
351 | /* | ||
352 | * Lookup failure means no vma is above this address, | ||
353 | * i.e. return with success: | ||
354 | */ | ||
355 | if (!(vma = find_vma_prev(mm, addr, &prev_vma))) | ||
356 | return addr; | ||
357 | |||
358 | /* | ||
359 | * new region fits between prev_vma->vm_end and | ||
360 | * vma->vm_start, use it: | ||
361 | */ | ||
362 | if (addr + len <= vma->vm_start && | ||
363 | (!prev_vma || (addr >= prev_vma->vm_end))) | ||
364 | /* remember the address as a hint for next time */ | ||
365 | return (mm->free_area_cache = addr); | ||
366 | else | ||
367 | /* pull free_area_cache down to the first hole */ | ||
368 | if (mm->free_area_cache == vma->vm_end) | ||
369 | mm->free_area_cache = vma->vm_start; | ||
370 | |||
371 | /* try just below the current vma->vm_start */ | ||
372 | addr = (vma->vm_start - len) & HPAGE_MASK; | ||
373 | } while (len <= vma->vm_start); | ||
374 | |||
375 | fail: | ||
376 | /* | ||
377 | * if hint left us with no space for the requested | ||
378 | * mapping then try again: | ||
379 | */ | ||
380 | if (first_time) { | ||
381 | mm->free_area_cache = base; | ||
382 | first_time = 0; | ||
383 | goto try_again; | ||
384 | } | ||
385 | /* | ||
386 | * A failed mmap() very likely causes application failure, | ||
387 | * so fall back to the bottom-up function here. This scenario | ||
388 | * can happen with large stack limits and large mmap() | ||
389 | * allocations. | ||
390 | */ | ||
391 | mm->free_area_cache = TASK_UNMAPPED_BASE; | ||
392 | addr = hugetlb_get_unmapped_area_bottomup(file, addr0, | ||
393 | len, pgoff, flags); | ||
394 | |||
395 | /* | ||
396 | * Restore the topdown base: | ||
397 | */ | ||
398 | mm->free_area_cache = base; | ||
399 | |||
400 | return addr; | ||
401 | } | ||
402 | |||
403 | unsigned long | ||
404 | hugetlb_get_unmapped_area(struct file *file, unsigned long addr, | ||
405 | unsigned long len, unsigned long pgoff, unsigned long flags) | ||
406 | { | ||
407 | struct mm_struct *mm = current->mm; | ||
408 | struct vm_area_struct *vma; | ||
409 | |||
410 | if (len & ~HPAGE_MASK) | ||
411 | return -EINVAL; | ||
412 | if (len > TASK_SIZE) | ||
413 | return -ENOMEM; | ||
414 | |||
415 | if (addr) { | ||
416 | addr = ALIGN(addr, HPAGE_SIZE); | ||
417 | vma = find_vma(mm, addr); | ||
418 | if (TASK_SIZE - len >= addr && | ||
419 | (!vma || addr + len <= vma->vm_start)) | ||
420 | return addr; | ||
421 | } | ||
422 | if (mm->get_unmapped_area == arch_get_unmapped_area) | ||
423 | return hugetlb_get_unmapped_area_bottomup(file, addr, len, | ||
424 | pgoff, flags); | ||
425 | else | ||
426 | return hugetlb_get_unmapped_area_topdown(file, addr, len, | ||
427 | pgoff, flags); | ||
428 | } | ||
429 | |||
430 | #endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ | ||
431 | |||
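The bottom-up helper in hugetlbpage.c above walks the existing vmas and returns the first HPAGE_SIZE-aligned hole large enough for the request; the top-down variant does the same downward from mmap_base. Below is a minimal userspace sketch of that search, assuming 4MB huge pages; the region list, base address and helper names are invented for illustration and are not part of the kernel code.

/*
 * Illustrative sketch only: walk a sorted list of existing regions and
 * return the first HPAGE-aligned address where 'len' bytes fit.
 */
#include <stdio.h>

#define HPAGE_SZ   (4UL << 20)                      /* assume 4MB huge pages */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

struct region { unsigned long start, end; };        /* existing mappings, sorted */

static unsigned long find_hugepage_hole(const struct region *r, int n,
                                        unsigned long base, unsigned long len)
{
        unsigned long addr = ALIGN_UP(base, HPAGE_SZ);
        int i;

        for (i = 0; i < n; i++) {
                if (addr + len <= r[i].start)       /* hole before this region */
                        return addr;
                if (addr < r[i].end)                /* overlaps: skip past it */
                        addr = ALIGN_UP(r[i].end, HPAGE_SZ);
        }
        return addr;                                /* free space above all regions */
}

int main(void)
{
        struct region used[] = { { 0x40000000, 0x40800000 },
                                 { 0x41000000, 0x41400000 } };
        printf("hole at %#lx\n",
               find_hugepage_hole(used, 2, 0x40000000, 2 * HPAGE_SZ));
        return 0;
}

The kernel version additionally restarts the scan from TASK_UNMAPPED_BASE and caches the result in mm->free_area_cache, which the sketch omits.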
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c new file mode 100644 index 000000000000..7a7ea3737265 --- /dev/null +++ b/arch/i386/mm/init.c | |||
@@ -0,0 +1,696 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/mm/init.c | ||
3 | * | ||
4 | * Copyright (C) 1995 Linus Torvalds | ||
5 | * | ||
6 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 | ||
7 | */ | ||
8 | |||
9 | #include <linux/config.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/signal.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/kernel.h> | ||
14 | #include <linux/errno.h> | ||
15 | #include <linux/string.h> | ||
16 | #include <linux/types.h> | ||
17 | #include <linux/ptrace.h> | ||
18 | #include <linux/mman.h> | ||
19 | #include <linux/mm.h> | ||
20 | #include <linux/hugetlb.h> | ||
21 | #include <linux/swap.h> | ||
22 | #include <linux/smp.h> | ||
23 | #include <linux/init.h> | ||
24 | #include <linux/highmem.h> | ||
25 | #include <linux/pagemap.h> | ||
26 | #include <linux/bootmem.h> | ||
27 | #include <linux/slab.h> | ||
28 | #include <linux/proc_fs.h> | ||
29 | #include <linux/efi.h> | ||
30 | |||
31 | #include <asm/processor.h> | ||
32 | #include <asm/system.h> | ||
33 | #include <asm/uaccess.h> | ||
34 | #include <asm/pgtable.h> | ||
35 | #include <asm/dma.h> | ||
36 | #include <asm/fixmap.h> | ||
37 | #include <asm/e820.h> | ||
38 | #include <asm/apic.h> | ||
39 | #include <asm/tlb.h> | ||
40 | #include <asm/tlbflush.h> | ||
41 | #include <asm/sections.h> | ||
42 | |||
43 | unsigned int __VMALLOC_RESERVE = 128 << 20; | ||
44 | |||
45 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | ||
46 | unsigned long highstart_pfn, highend_pfn; | ||
47 | |||
48 | static int noinline do_test_wp_bit(void); | ||
49 | |||
50 | /* | ||
51 | * Creates a middle page table and puts a pointer to it in the | ||
52 | * given global directory entry. In non-PAE builds this just | ||
53 | * returns the pgd entry, since the middle layer is folded. | ||
54 | */ | ||
55 | static pmd_t * __init one_md_table_init(pgd_t *pgd) | ||
56 | { | ||
57 | pud_t *pud; | ||
58 | pmd_t *pmd_table; | ||
59 | |||
60 | #ifdef CONFIG_X86_PAE | ||
61 | pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); | ||
62 | set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | ||
63 | pud = pud_offset(pgd, 0); | ||
64 | if (pmd_table != pmd_offset(pud, 0)) | ||
65 | BUG(); | ||
66 | #else | ||
67 | pud = pud_offset(pgd, 0); | ||
68 | pmd_table = pmd_offset(pud, 0); | ||
69 | #endif | ||
70 | |||
71 | return pmd_table; | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * Create a page table and place a pointer to it in a middle page | ||
76 | * directory entry. | ||
77 | */ | ||
78 | static pte_t * __init one_page_table_init(pmd_t *pmd) | ||
79 | { | ||
80 | if (pmd_none(*pmd)) { | ||
81 | pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); | ||
82 | set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); | ||
83 | if (page_table != pte_offset_kernel(pmd, 0)) | ||
84 | BUG(); | ||
85 | |||
86 | return page_table; | ||
87 | } | ||
88 | |||
89 | return pte_offset_kernel(pmd, 0); | ||
90 | } | ||
91 | |||
92 | /* | ||
93 | * This function initializes a certain range of kernel virtual memory | ||
94 | * with new bootmem page tables, wherever page tables are missing in | ||
95 | * the given range. | ||
96 | */ | ||
97 | |||
98 | /* | ||
99 | * NOTE: The pagetables are allocated contiguously in physical memory, | ||
100 | * so we can cache the place of the first one and move around without | ||
101 | * checking the pgd every time. | ||
102 | */ | ||
103 | static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) | ||
104 | { | ||
105 | pgd_t *pgd; | ||
106 | pud_t *pud; | ||
107 | pmd_t *pmd; | ||
108 | int pgd_idx, pmd_idx; | ||
109 | unsigned long vaddr; | ||
110 | |||
111 | vaddr = start; | ||
112 | pgd_idx = pgd_index(vaddr); | ||
113 | pmd_idx = pmd_index(vaddr); | ||
114 | pgd = pgd_base + pgd_idx; | ||
115 | |||
116 | for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { | ||
117 | if (pgd_none(*pgd)) | ||
118 | one_md_table_init(pgd); | ||
119 | pud = pud_offset(pgd, vaddr); | ||
120 | pmd = pmd_offset(pud, vaddr); | ||
121 | for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { | ||
122 | if (pmd_none(*pmd)) | ||
123 | one_page_table_init(pmd); | ||
124 | |||
125 | vaddr += PMD_SIZE; | ||
126 | } | ||
127 | pmd_idx = 0; | ||
128 | } | ||
129 | } | ||
130 | |||
131 | static inline int is_kernel_text(unsigned long addr) | ||
132 | { | ||
133 | if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) | ||
134 | return 1; | ||
135 | return 0; | ||
136 | } | ||
137 | |||
138 | /* | ||
139 | * This maps the physical memory to kernel virtual address space, a total | ||
140 | * of max_low_pfn pages, by creating page tables starting from address | ||
141 | * PAGE_OFFSET. | ||
142 | */ | ||
143 | static void __init kernel_physical_mapping_init(pgd_t *pgd_base) | ||
144 | { | ||
145 | unsigned long pfn; | ||
146 | pgd_t *pgd; | ||
147 | pmd_t *pmd; | ||
148 | pte_t *pte; | ||
149 | int pgd_idx, pmd_idx, pte_ofs; | ||
150 | |||
151 | pgd_idx = pgd_index(PAGE_OFFSET); | ||
152 | pgd = pgd_base + pgd_idx; | ||
153 | pfn = 0; | ||
154 | |||
155 | for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { | ||
156 | pmd = one_md_table_init(pgd); | ||
157 | if (pfn >= max_low_pfn) | ||
158 | continue; | ||
159 | for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { | ||
160 | unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET; | ||
161 | |||
162 | /* Map with big pages if possible, otherwise create normal page tables. */ | ||
163 | if (cpu_has_pse) { | ||
164 | unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; | ||
165 | |||
166 | if (is_kernel_text(address) || is_kernel_text(address2)) | ||
167 | set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); | ||
168 | else | ||
169 | set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); | ||
170 | pfn += PTRS_PER_PTE; | ||
171 | } else { | ||
172 | pte = one_page_table_init(pmd); | ||
173 | |||
174 | for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) { | ||
175 | if (is_kernel_text(address)) | ||
176 | set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); | ||
177 | else | ||
178 | set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); | ||
179 | } | ||
180 | } | ||
181 | } | ||
182 | } | ||
183 | } | ||
184 | |||
185 | static inline int page_kills_ppro(unsigned long pagenr) | ||
186 | { | ||
187 | if (pagenr >= 0x70000 && pagenr <= 0x7003F) | ||
188 | return 1; | ||
189 | return 0; | ||
190 | } | ||
191 | |||
192 | extern int is_available_memory(efi_memory_desc_t *); | ||
193 | |||
194 | static inline int page_is_ram(unsigned long pagenr) | ||
195 | { | ||
196 | int i; | ||
197 | unsigned long addr, end; | ||
198 | |||
199 | if (efi_enabled) { | ||
200 | efi_memory_desc_t *md; | ||
201 | |||
202 | for (i = 0; i < memmap.nr_map; i++) { | ||
203 | md = &memmap.map[i]; | ||
204 | if (!is_available_memory(md)) | ||
205 | continue; | ||
206 | addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; | ||
207 | end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; | ||
208 | |||
209 | if ((pagenr >= addr) && (pagenr < end)) | ||
210 | return 1; | ||
211 | } | ||
212 | return 0; | ||
213 | } | ||
214 | |||
215 | for (i = 0; i < e820.nr_map; i++) { | ||
216 | |||
217 | if (e820.map[i].type != E820_RAM) /* not usable memory */ | ||
218 | continue; | ||
219 | /* | ||
220 | * !!!FIXME!!! Some BIOSen report areas as RAM that | ||
221 | * are not. Notably the 640->1Mb area. We need a sanity | ||
222 | * check here. | ||
223 | */ | ||
224 | addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; | ||
225 | end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; | ||
226 | if ((pagenr >= addr) && (pagenr < end)) | ||
227 | return 1; | ||
228 | } | ||
229 | return 0; | ||
230 | } | ||
231 | |||
232 | #ifdef CONFIG_HIGHMEM | ||
233 | pte_t *kmap_pte; | ||
234 | pgprot_t kmap_prot; | ||
235 | |||
236 | #define kmap_get_fixmap_pte(vaddr) \ | ||
237 | pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr)) | ||
238 | |||
239 | static void __init kmap_init(void) | ||
240 | { | ||
241 | unsigned long kmap_vstart; | ||
242 | |||
243 | /* cache the first kmap pte */ | ||
244 | kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); | ||
245 | kmap_pte = kmap_get_fixmap_pte(kmap_vstart); | ||
246 | |||
247 | kmap_prot = PAGE_KERNEL; | ||
248 | } | ||
249 | |||
250 | static void __init permanent_kmaps_init(pgd_t *pgd_base) | ||
251 | { | ||
252 | pgd_t *pgd; | ||
253 | pud_t *pud; | ||
254 | pmd_t *pmd; | ||
255 | pte_t *pte; | ||
256 | unsigned long vaddr; | ||
257 | |||
258 | vaddr = PKMAP_BASE; | ||
259 | page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); | ||
260 | |||
261 | pgd = swapper_pg_dir + pgd_index(vaddr); | ||
262 | pud = pud_offset(pgd, vaddr); | ||
263 | pmd = pmd_offset(pud, vaddr); | ||
264 | pte = pte_offset_kernel(pmd, vaddr); | ||
265 | pkmap_page_table = pte; | ||
266 | } | ||
267 | |||
268 | void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) | ||
269 | { | ||
270 | if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { | ||
271 | ClearPageReserved(page); | ||
272 | set_bit(PG_highmem, &page->flags); | ||
273 | set_page_count(page, 1); | ||
274 | __free_page(page); | ||
275 | totalhigh_pages++; | ||
276 | } else | ||
277 | SetPageReserved(page); | ||
278 | } | ||
279 | |||
280 | #ifndef CONFIG_DISCONTIGMEM | ||
281 | static void __init set_highmem_pages_init(int bad_ppro) | ||
282 | { | ||
283 | int pfn; | ||
284 | for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) | ||
285 | one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); | ||
286 | totalram_pages += totalhigh_pages; | ||
287 | } | ||
288 | #else | ||
289 | extern void set_highmem_pages_init(int); | ||
290 | #endif /* !CONFIG_DISCONTIGMEM */ | ||
291 | |||
292 | #else | ||
293 | #define kmap_init() do { } while (0) | ||
294 | #define permanent_kmaps_init(pgd_base) do { } while (0) | ||
295 | #define set_highmem_pages_init(bad_ppro) do { } while (0) | ||
296 | #endif /* CONFIG_HIGHMEM */ | ||
297 | |||
298 | unsigned long long __PAGE_KERNEL = _PAGE_KERNEL; | ||
299 | unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; | ||
300 | |||
301 | #ifndef CONFIG_DISCONTIGMEM | ||
302 | #define remap_numa_kva() do {} while (0) | ||
303 | #else | ||
304 | extern void __init remap_numa_kva(void); | ||
305 | #endif | ||
306 | |||
307 | static void __init pagetable_init (void) | ||
308 | { | ||
309 | unsigned long vaddr; | ||
310 | pgd_t *pgd_base = swapper_pg_dir; | ||
311 | |||
312 | #ifdef CONFIG_X86_PAE | ||
313 | int i; | ||
314 | /* Init entries of the first-level page table to the zero page */ | ||
315 | for (i = 0; i < PTRS_PER_PGD; i++) | ||
316 | set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); | ||
317 | #endif | ||
318 | |||
319 | /* Enable PSE if available */ | ||
320 | if (cpu_has_pse) { | ||
321 | set_in_cr4(X86_CR4_PSE); | ||
322 | } | ||
323 | |||
324 | /* Enable PGE if available */ | ||
325 | if (cpu_has_pge) { | ||
326 | set_in_cr4(X86_CR4_PGE); | ||
327 | __PAGE_KERNEL |= _PAGE_GLOBAL; | ||
328 | __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; | ||
329 | } | ||
330 | |||
331 | kernel_physical_mapping_init(pgd_base); | ||
332 | remap_numa_kva(); | ||
333 | |||
334 | /* | ||
335 | * Fixed mappings, only the page table structure has to be | ||
336 | * created - mappings will be set by set_fixmap(): | ||
337 | */ | ||
338 | vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; | ||
339 | page_table_range_init(vaddr, 0, pgd_base); | ||
340 | |||
341 | permanent_kmaps_init(pgd_base); | ||
342 | |||
343 | #ifdef CONFIG_X86_PAE | ||
344 | /* | ||
345 | * Add low memory identity-mappings - SMP needs it when | ||
346 | * starting up on an AP from real-mode. In the non-PAE | ||
347 | * case we already have these mappings through head.S. | ||
348 | * All user-space mappings are explicitly cleared after | ||
349 | * SMP startup. | ||
350 | */ | ||
351 | pgd_base[0] = pgd_base[USER_PTRS_PER_PGD]; | ||
352 | #endif | ||
353 | } | ||
354 | |||
355 | #if defined(CONFIG_PM_DISK) || defined(CONFIG_SOFTWARE_SUSPEND) | ||
356 | /* | ||
357 | * Swap suspend & friends need this for resume because things like the intel-agp | ||
358 | * driver might have split up a kernel 4MB mapping. | ||
359 | */ | ||
360 | char __nosavedata swsusp_pg_dir[PAGE_SIZE] | ||
361 | __attribute__ ((aligned (PAGE_SIZE))); | ||
362 | |||
363 | static inline void save_pg_dir(void) | ||
364 | { | ||
365 | memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); | ||
366 | } | ||
367 | #else | ||
368 | static inline void save_pg_dir(void) | ||
369 | { | ||
370 | } | ||
371 | #endif | ||
372 | |||
373 | void zap_low_mappings (void) | ||
374 | { | ||
375 | int i; | ||
376 | |||
377 | save_pg_dir(); | ||
378 | |||
379 | /* | ||
380 | * Zap initial low-memory mappings. | ||
381 | * | ||
382 | * Note that "pgd_clear()" doesn't do it for | ||
383 | * us, because pgd_clear() is a no-op on i386. | ||
384 | */ | ||
385 | for (i = 0; i < USER_PTRS_PER_PGD; i++) | ||
386 | #ifdef CONFIG_X86_PAE | ||
387 | set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); | ||
388 | #else | ||
389 | set_pgd(swapper_pg_dir+i, __pgd(0)); | ||
390 | #endif | ||
391 | flush_tlb_all(); | ||
392 | } | ||
393 | |||
394 | static int disable_nx __initdata = 0; | ||
395 | u64 __supported_pte_mask = ~_PAGE_NX; | ||
396 | |||
397 | /* | ||
398 | * noexec = on|off | ||
399 | * | ||
400 | * Control non-executable mappings. | ||
401 | * | ||
402 | * on Enable | ||
403 | * off Disable | ||
404 | */ | ||
405 | void __init noexec_setup(const char *str) | ||
406 | { | ||
407 | if (!strncmp(str, "on",2) && cpu_has_nx) { | ||
408 | __supported_pte_mask |= _PAGE_NX; | ||
409 | disable_nx = 0; | ||
410 | } else if (!strncmp(str,"off",3)) { | ||
411 | disable_nx = 1; | ||
412 | __supported_pte_mask &= ~_PAGE_NX; | ||
413 | } | ||
414 | } | ||
415 | |||
416 | int nx_enabled = 0; | ||
417 | #ifdef CONFIG_X86_PAE | ||
418 | |||
419 | static void __init set_nx(void) | ||
420 | { | ||
421 | unsigned int v[4], l, h; | ||
422 | |||
423 | if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { | ||
424 | cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); | ||
425 | if ((v[3] & (1 << 20)) && !disable_nx) { | ||
426 | rdmsr(MSR_EFER, l, h); | ||
427 | l |= EFER_NX; | ||
428 | wrmsr(MSR_EFER, l, h); | ||
429 | nx_enabled = 1; | ||
430 | __supported_pte_mask |= _PAGE_NX; | ||
431 | } | ||
432 | } | ||
433 | } | ||
434 | |||
435 | /* | ||
436 | * Enables/disables executability of a given kernel page and | ||
437 | * returns the previous setting. | ||
438 | */ | ||
439 | int __init set_kernel_exec(unsigned long vaddr, int enable) | ||
440 | { | ||
441 | pte_t *pte; | ||
442 | int ret = 1; | ||
443 | |||
444 | if (!nx_enabled) | ||
445 | goto out; | ||
446 | |||
447 | pte = lookup_address(vaddr); | ||
448 | BUG_ON(!pte); | ||
449 | |||
450 | if (!pte_exec_kernel(*pte)) | ||
451 | ret = 0; | ||
452 | |||
453 | if (enable) | ||
454 | pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32)); | ||
455 | else | ||
456 | pte->pte_high |= 1 << (_PAGE_BIT_NX - 32); | ||
457 | __flush_tlb_all(); | ||
458 | out: | ||
459 | return ret; | ||
460 | } | ||
461 | |||
462 | #endif | ||
463 | |||
464 | /* | ||
465 | * paging_init() sets up the page tables - note that the first 8MB are | ||
466 | * already mapped by head.S. | ||
467 | * | ||
468 | * This routine also unmaps the page at virtual kernel address 0, so | ||
469 | * that we can trap those pesky NULL-reference errors in the kernel. | ||
470 | */ | ||
471 | void __init paging_init(void) | ||
472 | { | ||
473 | #ifdef CONFIG_X86_PAE | ||
474 | set_nx(); | ||
475 | if (nx_enabled) | ||
476 | printk("NX (Execute Disable) protection: active\n"); | ||
477 | #endif | ||
478 | |||
479 | pagetable_init(); | ||
480 | |||
481 | load_cr3(swapper_pg_dir); | ||
482 | |||
483 | #ifdef CONFIG_X86_PAE | ||
484 | /* | ||
485 | * We will bail out later - printk doesn't work right now so | ||
486 | * the user would just see a hanging kernel. | ||
487 | */ | ||
488 | if (cpu_has_pae) | ||
489 | set_in_cr4(X86_CR4_PAE); | ||
490 | #endif | ||
491 | __flush_tlb_all(); | ||
492 | |||
493 | kmap_init(); | ||
494 | } | ||
495 | |||
496 | /* | ||
497 | * Test if the WP bit works in supervisor mode. It isn't supported on 386's | ||
498 | * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This | ||
499 | * used to involve black magic jumps to work around some nasty CPU bugs, | ||
500 | * but fortunately the switch to using exceptions got rid of all that. | ||
501 | */ | ||
502 | |||
503 | static void __init test_wp_bit(void) | ||
504 | { | ||
505 | printk("Checking if this processor honours the WP bit even in supervisor mode... "); | ||
506 | |||
507 | /* Any page-aligned address will do, the test is non-destructive */ | ||
508 | __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); | ||
509 | boot_cpu_data.wp_works_ok = do_test_wp_bit(); | ||
510 | clear_fixmap(FIX_WP_TEST); | ||
511 | |||
512 | if (!boot_cpu_data.wp_works_ok) { | ||
513 | printk("No.\n"); | ||
514 | #ifdef CONFIG_X86_WP_WORKS_OK | ||
515 | panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); | ||
516 | #endif | ||
517 | } else { | ||
518 | printk("Ok.\n"); | ||
519 | } | ||
520 | } | ||
521 | |||
522 | static void __init set_max_mapnr_init(void) | ||
523 | { | ||
524 | #ifdef CONFIG_HIGHMEM | ||
525 | num_physpages = highend_pfn; | ||
526 | #else | ||
527 | num_physpages = max_low_pfn; | ||
528 | #endif | ||
529 | #ifndef CONFIG_DISCONTIGMEM | ||
530 | max_mapnr = num_physpages; | ||
531 | #endif | ||
532 | } | ||
533 | |||
534 | static struct kcore_list kcore_mem, kcore_vmalloc; | ||
535 | |||
536 | void __init mem_init(void) | ||
537 | { | ||
538 | extern int ppro_with_ram_bug(void); | ||
539 | int codesize, reservedpages, datasize, initsize; | ||
540 | int tmp; | ||
541 | int bad_ppro; | ||
542 | |||
543 | #ifndef CONFIG_DISCONTIGMEM | ||
544 | if (!mem_map) | ||
545 | BUG(); | ||
546 | #endif | ||
547 | |||
548 | bad_ppro = ppro_with_ram_bug(); | ||
549 | |||
550 | #ifdef CONFIG_HIGHMEM | ||
551 | /* check that fixmap and pkmap do not overlap */ | ||
552 | if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { | ||
553 | printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); | ||
554 | printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", | ||
555 | PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); | ||
556 | BUG(); | ||
557 | } | ||
558 | #endif | ||
559 | |||
560 | set_max_mapnr_init(); | ||
561 | |||
562 | #ifdef CONFIG_HIGHMEM | ||
563 | high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; | ||
564 | #else | ||
565 | high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; | ||
566 | #endif | ||
567 | |||
568 | /* this will put all low memory onto the freelists */ | ||
569 | totalram_pages += free_all_bootmem(); | ||
570 | |||
571 | reservedpages = 0; | ||
572 | for (tmp = 0; tmp < max_low_pfn; tmp++) | ||
573 | /* | ||
574 | * Only count reserved RAM pages | ||
575 | */ | ||
576 | if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) | ||
577 | reservedpages++; | ||
578 | |||
579 | set_highmem_pages_init(bad_ppro); | ||
580 | |||
581 | codesize = (unsigned long) &_etext - (unsigned long) &_text; | ||
582 | datasize = (unsigned long) &_edata - (unsigned long) &_etext; | ||
583 | initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; | ||
584 | |||
585 | kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); | ||
586 | kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, | ||
587 | VMALLOC_END-VMALLOC_START); | ||
588 | |||
589 | printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", | ||
590 | (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), | ||
591 | num_physpages << (PAGE_SHIFT-10), | ||
592 | codesize >> 10, | ||
593 | reservedpages << (PAGE_SHIFT-10), | ||
594 | datasize >> 10, | ||
595 | initsize >> 10, | ||
596 | (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) | ||
597 | ); | ||
598 | |||
599 | #ifdef CONFIG_X86_PAE | ||
600 | if (!cpu_has_pae) | ||
601 | panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); | ||
602 | #endif | ||
603 | if (boot_cpu_data.wp_works_ok < 0) | ||
604 | test_wp_bit(); | ||
605 | |||
606 | /* | ||
607 | * Subtle. SMP is doing its boot stuff late (because it has to | ||
608 | * fork idle threads) - but it also needs low mappings for the | ||
609 | * protected-mode entry to work. We zap these entries only after | ||
610 | * the WP-bit has been tested. | ||
611 | */ | ||
612 | #ifndef CONFIG_SMP | ||
613 | zap_low_mappings(); | ||
614 | #endif | ||
615 | } | ||
616 | |||
617 | kmem_cache_t *pgd_cache; | ||
618 | kmem_cache_t *pmd_cache; | ||
619 | |||
620 | void __init pgtable_cache_init(void) | ||
621 | { | ||
622 | if (PTRS_PER_PMD > 1) { | ||
623 | pmd_cache = kmem_cache_create("pmd", | ||
624 | PTRS_PER_PMD*sizeof(pmd_t), | ||
625 | PTRS_PER_PMD*sizeof(pmd_t), | ||
626 | 0, | ||
627 | pmd_ctor, | ||
628 | NULL); | ||
629 | if (!pmd_cache) | ||
630 | panic("pgtable_cache_init(): cannot create pmd cache"); | ||
631 | } | ||
632 | pgd_cache = kmem_cache_create("pgd", | ||
633 | PTRS_PER_PGD*sizeof(pgd_t), | ||
634 | PTRS_PER_PGD*sizeof(pgd_t), | ||
635 | 0, | ||
636 | pgd_ctor, | ||
637 | PTRS_PER_PMD == 1 ? pgd_dtor : NULL); | ||
638 | if (!pgd_cache) | ||
639 | panic("pgtable_cache_init(): Cannot create pgd cache"); | ||
640 | } | ||
641 | |||
642 | /* | ||
643 | * This function cannot be __init, since exceptions don't work in that | ||
644 | * section. Put this after the callers, so that it cannot be inlined. | ||
645 | */ | ||
646 | static int noinline do_test_wp_bit(void) | ||
647 | { | ||
648 | char tmp_reg; | ||
649 | int flag; | ||
650 | |||
651 | __asm__ __volatile__( | ||
652 | " movb %0,%1 \n" | ||
653 | "1: movb %1,%0 \n" | ||
654 | " xorl %2,%2 \n" | ||
655 | "2: \n" | ||
656 | ".section __ex_table,\"a\"\n" | ||
657 | " .align 4 \n" | ||
658 | " .long 1b,2b \n" | ||
659 | ".previous \n" | ||
660 | :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), | ||
661 | "=q" (tmp_reg), | ||
662 | "=r" (flag) | ||
663 | :"2" (1) | ||
664 | :"memory"); | ||
665 | |||
666 | return flag; | ||
667 | } | ||
668 | |||
669 | void free_initmem(void) | ||
670 | { | ||
671 | unsigned long addr; | ||
672 | |||
673 | addr = (unsigned long)(&__init_begin); | ||
674 | for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { | ||
675 | ClearPageReserved(virt_to_page(addr)); | ||
676 | set_page_count(virt_to_page(addr), 1); | ||
677 | memset((void *)addr, 0xcc, PAGE_SIZE); | ||
678 | free_page(addr); | ||
679 | totalram_pages++; | ||
680 | } | ||
681 | printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (__init_end - __init_begin) >> 10); | ||
682 | } | ||
683 | |||
684 | #ifdef CONFIG_BLK_DEV_INITRD | ||
685 | void free_initrd_mem(unsigned long start, unsigned long end) | ||
686 | { | ||
687 | if (start < end) | ||
688 | printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); | ||
689 | for (; start < end; start += PAGE_SIZE) { | ||
690 | ClearPageReserved(virt_to_page(start)); | ||
691 | set_page_count(virt_to_page(start), 1); | ||
692 | free_page(start); | ||
693 | totalram_pages++; | ||
694 | } | ||
695 | } | ||
696 | #endif | ||
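page_is_ram() in init.c above rounds each usable range's start up to a page frame, truncates its end down, and tests whether a pfn falls inside. The standalone sketch below does the same walk over an invented memory map; the real code reads the EFI memmap or the e820 table, and the range values and struct names here are assumptions.

/*
 * Illustrative sketch only: decide whether a page frame number lies in
 * a usable range of a (made-up) firmware memory map.
 */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

struct range { unsigned long long addr, size; int usable; };

static int pfn_is_ram(const struct range *map, int n, unsigned long pfn)
{
        int i;

        for (i = 0; i < n; i++) {
                unsigned long long start, end;

                if (!map[i].usable)
                        continue;
                start = (map[i].addr + PAGE_SIZE - 1) >> PAGE_SHIFT;
                end   = (map[i].addr + map[i].size) >> PAGE_SHIFT;
                if (pfn >= start && pfn < end)
                        return 1;
        }
        return 0;
}

int main(void)
{
        struct range map[] = {
                { 0x0000000000, 0x0009f000, 1 },    /* low RAM      */
                { 0x00000f0000, 0x00010000, 0 },    /* reserved ROM */
                { 0x0000100000, 0x1ff00000, 1 },    /* main RAM     */
        };
        printf("pfn 0x100 is %sRAM\n", pfn_is_ram(map, 3, 0x100) ? "" : "not ");
        return 0;
}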
diff --git a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c new file mode 100644 index 000000000000..db06f7399913 --- /dev/null +++ b/arch/i386/mm/ioremap.c | |||
@@ -0,0 +1,320 @@ | |||
1 | /* | ||
2 | * arch/i386/mm/ioremap.c | ||
3 | * | ||
4 | * Re-map IO memory to kernel address space so that we can access it. | ||
5 | * This is needed for high PCI addresses that aren't mapped in the | ||
6 | * 640k-1MB IO memory area on PC's | ||
7 | * | ||
8 | * (C) Copyright 1995 1996 Linus Torvalds | ||
9 | */ | ||
10 | |||
11 | #include <linux/vmalloc.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <asm/io.h> | ||
15 | #include <asm/fixmap.h> | ||
16 | #include <asm/cacheflush.h> | ||
17 | #include <asm/tlbflush.h> | ||
18 | #include <asm/pgtable.h> | ||
19 | |||
20 | #define ISA_START_ADDRESS 0xa0000 | ||
21 | #define ISA_END_ADDRESS 0x100000 | ||
22 | |||
23 | static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, | ||
24 | unsigned long end, unsigned long phys_addr, unsigned long flags) | ||
25 | { | ||
26 | pte_t *pte; | ||
27 | unsigned long pfn; | ||
28 | |||
29 | pfn = phys_addr >> PAGE_SHIFT; | ||
30 | pte = pte_alloc_kernel(&init_mm, pmd, addr); | ||
31 | if (!pte) | ||
32 | return -ENOMEM; | ||
33 | do { | ||
34 | BUG_ON(!pte_none(*pte)); | ||
35 | set_pte(pte, pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW | | ||
36 | _PAGE_DIRTY | _PAGE_ACCESSED | flags))); | ||
37 | pfn++; | ||
38 | } while (pte++, addr += PAGE_SIZE, addr != end); | ||
39 | return 0; | ||
40 | } | ||
41 | |||
42 | static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, | ||
43 | unsigned long end, unsigned long phys_addr, unsigned long flags) | ||
44 | { | ||
45 | pmd_t *pmd; | ||
46 | unsigned long next; | ||
47 | |||
48 | phys_addr -= addr; | ||
49 | pmd = pmd_alloc(&init_mm, pud, addr); | ||
50 | if (!pmd) | ||
51 | return -ENOMEM; | ||
52 | do { | ||
53 | next = pmd_addr_end(addr, end); | ||
54 | if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, flags)) | ||
55 | return -ENOMEM; | ||
56 | } while (pmd++, addr = next, addr != end); | ||
57 | return 0; | ||
58 | } | ||
59 | |||
60 | static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr, | ||
61 | unsigned long end, unsigned long phys_addr, unsigned long flags) | ||
62 | { | ||
63 | pud_t *pud; | ||
64 | unsigned long next; | ||
65 | |||
66 | phys_addr -= addr; | ||
67 | pud = pud_alloc(&init_mm, pgd, addr); | ||
68 | if (!pud) | ||
69 | return -ENOMEM; | ||
70 | do { | ||
71 | next = pud_addr_end(addr, end); | ||
72 | if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, flags)) | ||
73 | return -ENOMEM; | ||
74 | } while (pud++, addr = next, addr != end); | ||
75 | return 0; | ||
76 | } | ||
77 | |||
78 | static int ioremap_page_range(unsigned long addr, | ||
79 | unsigned long end, unsigned long phys_addr, unsigned long flags) | ||
80 | { | ||
81 | pgd_t *pgd; | ||
82 | unsigned long next; | ||
83 | int err; | ||
84 | |||
85 | BUG_ON(addr >= end); | ||
86 | flush_cache_all(); | ||
87 | phys_addr -= addr; | ||
88 | pgd = pgd_offset_k(addr); | ||
89 | spin_lock(&init_mm.page_table_lock); | ||
90 | do { | ||
91 | next = pgd_addr_end(addr, end); | ||
92 | err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, flags); | ||
93 | if (err) | ||
94 | break; | ||
95 | } while (pgd++, addr = next, addr != end); | ||
96 | spin_unlock(&init_mm.page_table_lock); | ||
97 | flush_tlb_all(); | ||
98 | return err; | ||
99 | } | ||
100 | |||
101 | /* | ||
102 | * Generic mapping function (not visible outside): | ||
103 | */ | ||
104 | |||
105 | /* | ||
106 | * Remap an arbitrary physical address space into the kernel virtual | ||
107 | * address space. Needed when the kernel wants to access high addresses | ||
108 | * directly. | ||
109 | * | ||
110 | * NOTE! We need to allow non-page-aligned mappings too: we will obviously | ||
111 | * have to convert them into an offset in a page-aligned mapping, but the | ||
112 | * caller shouldn't need to know that small detail. | ||
113 | */ | ||
114 | void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) | ||
115 | { | ||
116 | void __iomem * addr; | ||
117 | struct vm_struct * area; | ||
118 | unsigned long offset, last_addr; | ||
119 | |||
120 | /* Don't allow wraparound or zero size */ | ||
121 | last_addr = phys_addr + size - 1; | ||
122 | if (!size || last_addr < phys_addr) | ||
123 | return NULL; | ||
124 | |||
125 | /* | ||
126 | * Don't remap the low PCI/ISA area, it's always mapped.. | ||
127 | */ | ||
128 | if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) | ||
129 | return (void __iomem *) phys_to_virt(phys_addr); | ||
130 | |||
131 | /* | ||
132 | * Don't allow anybody to remap normal RAM that we're using.. | ||
133 | */ | ||
134 | if (phys_addr <= virt_to_phys(high_memory - 1)) { | ||
135 | char *t_addr, *t_end; | ||
136 | struct page *page; | ||
137 | |||
138 | t_addr = __va(phys_addr); | ||
139 | t_end = t_addr + (size - 1); | ||
140 | |||
141 | for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) | ||
142 | if(!PageReserved(page)) | ||
143 | return NULL; | ||
144 | } | ||
145 | |||
146 | /* | ||
147 | * Mappings have to be page-aligned | ||
148 | */ | ||
149 | offset = phys_addr & ~PAGE_MASK; | ||
150 | phys_addr &= PAGE_MASK; | ||
151 | size = PAGE_ALIGN(last_addr+1) - phys_addr; | ||
152 | |||
153 | /* | ||
154 | * Ok, go for it.. | ||
155 | */ | ||
156 | area = get_vm_area(size, VM_IOREMAP | (flags << 20)); | ||
157 | if (!area) | ||
158 | return NULL; | ||
159 | area->phys_addr = phys_addr; | ||
160 | addr = (void __iomem *) area->addr; | ||
161 | if (ioremap_page_range((unsigned long) addr, | ||
162 | (unsigned long) addr + size, phys_addr, flags)) { | ||
163 | vunmap((void __force *) addr); | ||
164 | return NULL; | ||
165 | } | ||
166 | return (void __iomem *) (offset + (char __iomem *)addr); | ||
167 | } | ||
168 | |||
169 | |||
170 | /** | ||
171 | * ioremap_nocache - map bus memory into CPU space | ||
172 | * @offset: bus address of the memory | ||
173 | * @size: size of the resource to map | ||
174 | * | ||
175 | * ioremap_nocache performs a platform specific sequence of operations to | ||
176 | * make bus memory CPU accessible via the readb/readw/readl/writeb/ | ||
177 | * writew/writel functions and the other mmio helpers. The returned | ||
178 | * address is not guaranteed to be usable directly as a virtual | ||
179 | * address. | ||
180 | * | ||
181 | * This version of ioremap ensures that the memory is marked uncachable | ||
182 | * on the CPU as well as honouring existing caching rules from things like | ||
183 | * the PCI bus. Note that there are other caches and buffers on many | ||
185 | * busses. In particular, driver authors should read up on PCI writes. | ||
185 | * | ||
186 | * It's useful if some control registers are in such an area and | ||
187 | * write combining or read caching is not desirable: | ||
188 | * | ||
189 | * Must be freed with iounmap. | ||
190 | */ | ||
191 | |||
192 | void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) | ||
193 | { | ||
194 | unsigned long last_addr; | ||
195 | void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD); | ||
196 | if (!p) | ||
197 | return p; | ||
198 | |||
199 | /* Guaranteed to be > phys_addr, as per __ioremap() */ | ||
200 | last_addr = phys_addr + size - 1; | ||
201 | |||
202 | if (last_addr < virt_to_phys(high_memory) - 1) { | ||
203 | struct page *ppage = virt_to_page(__va(phys_addr)); | ||
204 | unsigned long npages; | ||
205 | |||
206 | phys_addr &= PAGE_MASK; | ||
207 | |||
208 | /* This might overflow and become zero.. */ | ||
209 | last_addr = PAGE_ALIGN(last_addr); | ||
210 | |||
211 | /* .. but that's ok, because modulo-2**n arithmetic will make | ||
212 | * the page-aligned "last - first" come out right. | ||
213 | */ | ||
214 | npages = (last_addr - phys_addr) >> PAGE_SHIFT; | ||
215 | |||
216 | if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { | ||
217 | iounmap(p); | ||
218 | p = NULL; | ||
219 | } | ||
220 | global_flush_tlb(); | ||
221 | } | ||
222 | |||
223 | return p; | ||
224 | } | ||
225 | |||
226 | void iounmap(volatile void __iomem *addr) | ||
227 | { | ||
228 | struct vm_struct *p; | ||
229 | if ((void __force *) addr <= high_memory) | ||
230 | return; | ||
231 | |||
232 | /* | ||
233 | * __ioremap special-cases the PCI/ISA range by not instantiating a | ||
234 | * vm_area and by simply returning an address into the kernel mapping | ||
235 | * of ISA space. So handle that here. | ||
236 | */ | ||
237 | if (addr >= phys_to_virt(ISA_START_ADDRESS) && | ||
238 | addr < phys_to_virt(ISA_END_ADDRESS)) | ||
239 | return; | ||
240 | |||
241 | p = remove_vm_area((void *) (PAGE_MASK & (unsigned long __force) addr)); | ||
242 | if (!p) { | ||
243 | printk("iounmap: bad address %p\n", addr); | ||
244 | return; | ||
245 | } | ||
246 | |||
247 | if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) { | ||
248 | /* p->size includes the guard page, but cpa doesn't like that */ | ||
249 | change_page_attr(virt_to_page(__va(p->phys_addr)), | ||
250 | p->size >> PAGE_SHIFT, | ||
251 | PAGE_KERNEL); | ||
252 | global_flush_tlb(); | ||
253 | } | ||
254 | kfree(p); | ||
255 | } | ||
256 | |||
257 | void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) | ||
258 | { | ||
259 | unsigned long offset, last_addr; | ||
260 | unsigned int nrpages; | ||
261 | enum fixed_addresses idx; | ||
262 | |||
263 | /* Don't allow wraparound or zero size */ | ||
264 | last_addr = phys_addr + size - 1; | ||
265 | if (!size || last_addr < phys_addr) | ||
266 | return NULL; | ||
267 | |||
268 | /* | ||
269 | * Don't remap the low PCI/ISA area, it's always mapped.. | ||
270 | */ | ||
271 | if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) | ||
272 | return phys_to_virt(phys_addr); | ||
273 | |||
274 | /* | ||
275 | * Mappings have to be page-aligned | ||
276 | */ | ||
277 | offset = phys_addr & ~PAGE_MASK; | ||
278 | phys_addr &= PAGE_MASK; | ||
279 | size = PAGE_ALIGN(last_addr) - phys_addr; | ||
280 | |||
281 | /* | ||
282 | * Mappings have to fit in the FIX_BTMAP area. | ||
283 | */ | ||
284 | nrpages = size >> PAGE_SHIFT; | ||
285 | if (nrpages > NR_FIX_BTMAPS) | ||
286 | return NULL; | ||
287 | |||
288 | /* | ||
289 | * Ok, go for it.. | ||
290 | */ | ||
291 | idx = FIX_BTMAP_BEGIN; | ||
292 | while (nrpages > 0) { | ||
293 | set_fixmap(idx, phys_addr); | ||
294 | phys_addr += PAGE_SIZE; | ||
295 | --idx; | ||
296 | --nrpages; | ||
297 | } | ||
298 | return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN)); | ||
299 | } | ||
300 | |||
301 | void __init bt_iounmap(void *addr, unsigned long size) | ||
302 | { | ||
303 | unsigned long virt_addr; | ||
304 | unsigned long offset; | ||
305 | unsigned int nrpages; | ||
306 | enum fixed_addresses idx; | ||
307 | |||
308 | virt_addr = (unsigned long)addr; | ||
309 | if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) | ||
310 | return; | ||
311 | offset = virt_addr & ~PAGE_MASK; | ||
312 | nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; | ||
313 | |||
314 | idx = FIX_BTMAP_BEGIN; | ||
315 | while (nrpages > 0) { | ||
316 | clear_fixmap(idx); | ||
317 | --idx; | ||
318 | --nrpages; | ||
319 | } | ||
320 | } | ||
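__ioremap() in ioremap.c above keeps the sub-page offset for the caller and maps only whole pages. The runnable sketch below shows just that bookkeeping; the device address and size are hypothetical and nothing here calls the kernel API.

/*
 * Illustrative sketch only: split an arbitrary physical address into a
 * page-aligned base plus an offset, and compute the length to map.
 */
#include <stdio.h>

#define PAGE_SIZE   4096UL
#define PAGE_MASK   (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
        unsigned long phys = 0xfebc1010, size = 0x30;   /* hypothetical BAR */
        unsigned long last = phys + size - 1;
        unsigned long offset = phys & ~PAGE_MASK;       /* kept by the caller  */
        unsigned long base   = phys & PAGE_MASK;        /* what gets mapped    */
        unsigned long maplen = PAGE_ALIGN(last + 1) - base;

        printf("map %#lx..%#lx (%lu bytes), return mapping + %#lx\n",
               base, base + maplen - 1, maplen, offset);
        return 0;
}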
diff --git a/arch/i386/mm/mmap.c b/arch/i386/mm/mmap.c new file mode 100644 index 000000000000..e4730a1a43dd --- /dev/null +++ b/arch/i386/mm/mmap.c | |||
@@ -0,0 +1,76 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/mm/mmap.c | ||
3 | * | ||
4 | * flexible mmap layout support | ||
5 | * | ||
6 | * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. | ||
7 | * All Rights Reserved. | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2 of the License, or | ||
12 | * (at your option) any later version. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
17 | * GNU General Public License for more details. | ||
18 | * | ||
19 | * You should have received a copy of the GNU General Public License | ||
20 | * along with this program; if not, write to the Free Software | ||
21 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
22 | * | ||
23 | * | ||
24 | * Started by Ingo Molnar <mingo@elte.hu> | ||
25 | */ | ||
26 | |||
27 | #include <linux/personality.h> | ||
28 | #include <linux/mm.h> | ||
29 | #include <linux/random.h> | ||
30 | |||
31 | /* | ||
32 | * Top of mmap area (just below the process stack). | ||
33 | * | ||
34 | * Leave at least a ~128 MB hole. | ||
35 | */ | ||
36 | #define MIN_GAP (128*1024*1024) | ||
37 | #define MAX_GAP (TASK_SIZE/6*5) | ||
38 | |||
39 | static inline unsigned long mmap_base(struct mm_struct *mm) | ||
40 | { | ||
41 | unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; | ||
42 | unsigned long random_factor = 0; | ||
43 | |||
44 | if (current->flags & PF_RANDOMIZE) | ||
45 | random_factor = get_random_int() % (1024*1024); | ||
46 | |||
47 | if (gap < MIN_GAP) | ||
48 | gap = MIN_GAP; | ||
49 | else if (gap > MAX_GAP) | ||
50 | gap = MAX_GAP; | ||
51 | |||
52 | return PAGE_ALIGN(TASK_SIZE - gap - random_factor); | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * This function, called very early during the creation of a new | ||
57 | * process VM image, sets up which VM layout function to use: | ||
58 | */ | ||
59 | void arch_pick_mmap_layout(struct mm_struct *mm) | ||
60 | { | ||
61 | /* | ||
62 | * Fall back to the standard layout if the personality | ||
63 | * bit is set, or if the expected stack growth is unlimited: | ||
64 | */ | ||
65 | if (sysctl_legacy_va_layout || | ||
66 | (current->personality & ADDR_COMPAT_LAYOUT) || | ||
67 | current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) { | ||
68 | mm->mmap_base = TASK_UNMAPPED_BASE; | ||
69 | mm->get_unmapped_area = arch_get_unmapped_area; | ||
70 | mm->unmap_area = arch_unmap_area; | ||
71 | } else { | ||
72 | mm->mmap_base = mmap_base(mm); | ||
73 | mm->get_unmapped_area = arch_get_unmapped_area_topdown; | ||
74 | mm->unmap_area = arch_unmap_area_topdown; | ||
75 | } | ||
76 | } | ||
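mmap_base() in mmap.c above clamps the stack rlimit between MIN_GAP and MAX_GAP and subtracts it, plus an optional random factor, from the top of user space. The same arithmetic as a standalone sketch; TASK_SZ and the sample rlimit are assumed values, not the kernel's.

/*
 * Illustrative sketch only: compute a top-down mmap base from a stack
 * rlimit, mirroring the clamping in mmap_base() above.
 */
#include <stdio.h>

#define PAGE_SZ    4096UL
#define TASK_SZ    0xc0000000UL                   /* assume a 3GB user space */
#define MIN_GAP    (128UL * 1024 * 1024)
#define MAX_GAP    (TASK_SZ / 6 * 5)
#define PAGE_ALIGN(x) (((x) + PAGE_SZ - 1) & ~(PAGE_SZ - 1))

static unsigned long mmap_base(unsigned long stack_rlimit,
                               unsigned long random_factor)
{
        unsigned long gap = stack_rlimit;

        if (gap < MIN_GAP)
                gap = MIN_GAP;
        else if (gap > MAX_GAP)
                gap = MAX_GAP;

        return PAGE_ALIGN(TASK_SZ - gap - random_factor);
}

int main(void)
{
        /* an 8MB stack rlimit is bumped up to the 128MB minimum gap */
        printf("mmap base: %#lx\n", mmap_base(8UL << 20, 0));
        return 0;
}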
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c new file mode 100644 index 000000000000..cb3da6baa704 --- /dev/null +++ b/arch/i386/mm/pageattr.c | |||
@@ -0,0 +1,221 @@ | |||
1 | /* | ||
2 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
3 | * Thanks to Ben LaHaise for precious feedback. | ||
4 | */ | ||
5 | |||
6 | #include <linux/config.h> | ||
7 | #include <linux/mm.h> | ||
8 | #include <linux/sched.h> | ||
9 | #include <linux/highmem.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/slab.h> | ||
12 | #include <asm/uaccess.h> | ||
13 | #include <asm/processor.h> | ||
14 | #include <asm/tlbflush.h> | ||
15 | |||
16 | static DEFINE_SPINLOCK(cpa_lock); | ||
17 | static struct list_head df_list = LIST_HEAD_INIT(df_list); | ||
18 | |||
19 | |||
20 | pte_t *lookup_address(unsigned long address) | ||
21 | { | ||
22 | pgd_t *pgd = pgd_offset_k(address); | ||
23 | pud_t *pud; | ||
24 | pmd_t *pmd; | ||
25 | if (pgd_none(*pgd)) | ||
26 | return NULL; | ||
27 | pud = pud_offset(pgd, address); | ||
28 | if (pud_none(*pud)) | ||
29 | return NULL; | ||
30 | pmd = pmd_offset(pud, address); | ||
31 | if (pmd_none(*pmd)) | ||
32 | return NULL; | ||
33 | if (pmd_large(*pmd)) | ||
34 | return (pte_t *)pmd; | ||
35 | return pte_offset_kernel(pmd, address); | ||
36 | } | ||
37 | |||
38 | static struct page *split_large_page(unsigned long address, pgprot_t prot) | ||
39 | { | ||
40 | int i; | ||
41 | unsigned long addr; | ||
42 | struct page *base; | ||
43 | pte_t *pbase; | ||
44 | |||
45 | spin_unlock_irq(&cpa_lock); | ||
46 | base = alloc_pages(GFP_KERNEL, 0); | ||
47 | spin_lock_irq(&cpa_lock); | ||
48 | if (!base) | ||
49 | return NULL; | ||
50 | |||
51 | address = __pa(address); | ||
52 | addr = address & LARGE_PAGE_MASK; | ||
53 | pbase = (pte_t *)page_address(base); | ||
54 | for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { | ||
55 | pbase[i] = pfn_pte(addr >> PAGE_SHIFT, | ||
56 | addr == address ? prot : PAGE_KERNEL); | ||
57 | } | ||
58 | return base; | ||
59 | } | ||
60 | |||
61 | static void flush_kernel_map(void *dummy) | ||
62 | { | ||
63 | /* Could use CLFLUSH here if the CPU supports it (Hammer,P4) */ | ||
64 | if (boot_cpu_data.x86_model >= 4) | ||
65 | asm volatile("wbinvd":::"memory"); | ||
66 | /* Flush all to work around errata in early Athlons regarding | ||
67 | * large page flushing. | ||
68 | */ | ||
69 | __flush_tlb_all(); | ||
70 | } | ||
71 | |||
72 | static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) | ||
73 | { | ||
74 | struct page *page; | ||
75 | unsigned long flags; | ||
76 | |||
77 | set_pte_atomic(kpte, pte); /* change init_mm */ | ||
78 | if (PTRS_PER_PMD > 1) | ||
79 | return; | ||
80 | |||
81 | spin_lock_irqsave(&pgd_lock, flags); | ||
82 | for (page = pgd_list; page; page = (struct page *)page->index) { | ||
83 | pgd_t *pgd; | ||
84 | pud_t *pud; | ||
85 | pmd_t *pmd; | ||
86 | pgd = (pgd_t *)page_address(page) + pgd_index(address); | ||
87 | pud = pud_offset(pgd, address); | ||
88 | pmd = pmd_offset(pud, address); | ||
89 | set_pte_atomic((pte_t *)pmd, pte); | ||
90 | } | ||
91 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * No more special protections in this 2/4MB area - revert to a | ||
96 | * large page again. | ||
97 | */ | ||
98 | static inline void revert_page(struct page *kpte_page, unsigned long address) | ||
99 | { | ||
100 | pte_t *linear = (pte_t *) | ||
101 | pmd_offset(pud_offset(pgd_offset_k(address), address), address); | ||
102 | set_pmd_pte(linear, address, | ||
103 | pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT, | ||
104 | PAGE_KERNEL_LARGE)); | ||
105 | } | ||
106 | |||
107 | static int | ||
108 | __change_page_attr(struct page *page, pgprot_t prot) | ||
109 | { | ||
110 | pte_t *kpte; | ||
111 | unsigned long address; | ||
112 | struct page *kpte_page; | ||
113 | |||
114 | BUG_ON(PageHighMem(page)); | ||
115 | address = (unsigned long)page_address(page); | ||
116 | |||
117 | kpte = lookup_address(address); | ||
118 | if (!kpte) | ||
119 | return -EINVAL; | ||
120 | kpte_page = virt_to_page(kpte); | ||
121 | if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { | ||
122 | if ((pte_val(*kpte) & _PAGE_PSE) == 0) { | ||
123 | set_pte_atomic(kpte, mk_pte(page, prot)); | ||
124 | } else { | ||
125 | struct page *split = split_large_page(address, prot); | ||
126 | if (!split) | ||
127 | return -ENOMEM; | ||
128 | set_pmd_pte(kpte,address,mk_pte(split, PAGE_KERNEL)); | ||
129 | kpte_page = split; | ||
130 | } | ||
131 | get_page(kpte_page); | ||
132 | } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) { | ||
133 | set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL)); | ||
134 | __put_page(kpte_page); | ||
135 | } else | ||
136 | BUG(); | ||
137 | |||
138 | /* | ||
139 | * If the pte was reserved, it means it was created at boot | ||
140 | * time (not via split_large_page), and therefore we must not | ||
141 | * replace it with a large page. | ||
142 | */ | ||
143 | if (!PageReserved(kpte_page)) { | ||
144 | /* memleak and potential failed 2M page regeneration */ | ||
145 | BUG_ON(!page_count(kpte_page)); | ||
146 | |||
147 | if (cpu_has_pse && (page_count(kpte_page) == 1)) { | ||
148 | list_add(&kpte_page->lru, &df_list); | ||
149 | revert_page(kpte_page, address); | ||
150 | } | ||
151 | } | ||
152 | return 0; | ||
153 | } | ||
154 | |||
155 | static inline void flush_map(void) | ||
156 | { | ||
157 | on_each_cpu(flush_kernel_map, NULL, 1, 1); | ||
158 | } | ||
159 | |||
160 | /* | ||
161 | * Change the page attributes of a page in the linear mapping. | ||
162 | * | ||
163 | * This should be used when a page is mapped with a different caching policy | ||
164 | * than write-back somewhere - some CPUs do not like it when mappings with | ||
165 | * different caching policies exist. This changes the page attributes of the | ||
166 | * in-kernel linear mapping too. | ||
167 | * | ||
168 | * The caller needs to ensure that there are no conflicting mappings elsewhere. | ||
169 | * This function only deals with the kernel linear map. | ||
170 | * | ||
171 | * Caller must call global_flush_tlb() after this. | ||
172 | */ | ||
173 | int change_page_attr(struct page *page, int numpages, pgprot_t prot) | ||
174 | { | ||
175 | int err = 0; | ||
176 | int i; | ||
177 | unsigned long flags; | ||
178 | |||
179 | spin_lock_irqsave(&cpa_lock, flags); | ||
180 | for (i = 0; i < numpages; i++, page++) { | ||
181 | err = __change_page_attr(page, prot); | ||
182 | if (err) | ||
183 | break; | ||
184 | } | ||
185 | spin_unlock_irqrestore(&cpa_lock, flags); | ||
186 | return err; | ||
187 | } | ||
188 | |||
189 | void global_flush_tlb(void) | ||
190 | { | ||
191 | LIST_HEAD(l); | ||
192 | struct page *pg, *next; | ||
193 | |||
194 | BUG_ON(irqs_disabled()); | ||
195 | |||
196 | spin_lock_irq(&cpa_lock); | ||
197 | list_splice_init(&df_list, &l); | ||
198 | spin_unlock_irq(&cpa_lock); | ||
199 | flush_map(); | ||
200 | list_for_each_entry_safe(pg, next, &l, lru) | ||
201 | __free_page(pg); | ||
202 | } | ||
203 | |||
204 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
205 | void kernel_map_pages(struct page *page, int numpages, int enable) | ||
206 | { | ||
207 | if (PageHighMem(page)) | ||
208 | return; | ||
209 | /* the return value is ignored - the calls cannot fail because | ||
210 | * large pages are disabled at boot time. | ||
211 | */ | ||
212 | change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); | ||
213 | /* we should perform an IPI and flush all tlbs, | ||
214 | * but that can deadlock, so flush only the current cpu. | ||
215 | */ | ||
216 | __flush_tlb_all(); | ||
217 | } | ||
218 | #endif | ||
219 | |||
220 | EXPORT_SYMBOL(change_page_attr); | ||
221 | EXPORT_SYMBOL(global_flush_tlb); | ||
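split_large_page() in pageattr.c above materialises a full table of small-page entries that inherit the large page's protection, except for the single page whose attributes are changing. Below is a plain-C sketch of that fill loop; the addresses are invented and a plain int stands in for pgprot_t.

/*
 * Illustrative sketch only: expand one large mapping into small-page
 * entries, giving the target page a different protection value.
 */
#include <stdio.h>

#define SMALL_PAGE   4096UL
#define ENTRIES      1024                 /* 4MB large page / 4KB entries */
#define PROT_DEFAULT 0
#define PROT_NOCACHE 1

int main(void)
{
        unsigned long large_base = 0xc0400000;      /* hypothetical large page */
        unsigned long target     = 0xc0512000;      /* page to re-protect      */
        int pte[ENTRIES];
        unsigned long addr;
        int i;

        for (i = 0, addr = large_base; i < ENTRIES; i++, addr += SMALL_PAGE)
                pte[i] = (addr == target) ? PROT_NOCACHE : PROT_DEFAULT;

        printf("entry %lu carries prot %d\n",
               (target - large_base) / SMALL_PAGE,
               pte[(target - large_base) / SMALL_PAGE]);
        return 0;
}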
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c new file mode 100644 index 000000000000..0742d54f8bb0 --- /dev/null +++ b/arch/i386/mm/pgtable.c | |||
@@ -0,0 +1,260 @@ | |||
1 | /* | ||
2 | * linux/arch/i386/mm/pgtable.c | ||
3 | */ | ||
4 | |||
5 | #include <linux/config.h> | ||
6 | #include <linux/sched.h> | ||
7 | #include <linux/kernel.h> | ||
8 | #include <linux/errno.h> | ||
9 | #include <linux/mm.h> | ||
10 | #include <linux/swap.h> | ||
11 | #include <linux/smp.h> | ||
12 | #include <linux/highmem.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <linux/pagemap.h> | ||
15 | #include <linux/spinlock.h> | ||
16 | |||
17 | #include <asm/system.h> | ||
18 | #include <asm/pgtable.h> | ||
19 | #include <asm/pgalloc.h> | ||
20 | #include <asm/fixmap.h> | ||
21 | #include <asm/e820.h> | ||
22 | #include <asm/tlb.h> | ||
23 | #include <asm/tlbflush.h> | ||
24 | |||
25 | void show_mem(void) | ||
26 | { | ||
27 | int total = 0, reserved = 0; | ||
28 | int shared = 0, cached = 0; | ||
29 | int highmem = 0; | ||
30 | struct page *page; | ||
31 | pg_data_t *pgdat; | ||
32 | unsigned long i; | ||
33 | |||
34 | printk("Mem-info:\n"); | ||
35 | show_free_areas(); | ||
36 | printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); | ||
37 | for_each_pgdat(pgdat) { | ||
38 | for (i = 0; i < pgdat->node_spanned_pages; ++i) { | ||
39 | page = pgdat->node_mem_map + i; | ||
40 | total++; | ||
41 | if (PageHighMem(page)) | ||
42 | highmem++; | ||
43 | if (PageReserved(page)) | ||
44 | reserved++; | ||
45 | else if (PageSwapCache(page)) | ||
46 | cached++; | ||
47 | else if (page_count(page)) | ||
48 | shared += page_count(page) - 1; | ||
49 | } | ||
50 | } | ||
51 | printk("%d pages of RAM\n", total); | ||
52 | printk("%d pages of HIGHMEM\n",highmem); | ||
53 | printk("%d reserved pages\n",reserved); | ||
54 | printk("%d pages shared\n",shared); | ||
55 | printk("%d pages swap cached\n",cached); | ||
56 | } | ||
57 | |||
58 | /* | ||
59 | * Associate a virtual page frame with a given physical page frame | ||
60 | * and protection flags for that frame. | ||
61 | */ | ||
62 | static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) | ||
63 | { | ||
64 | pgd_t *pgd; | ||
65 | pud_t *pud; | ||
66 | pmd_t *pmd; | ||
67 | pte_t *pte; | ||
68 | |||
69 | pgd = swapper_pg_dir + pgd_index(vaddr); | ||
70 | if (pgd_none(*pgd)) { | ||
71 | BUG(); | ||
72 | return; | ||
73 | } | ||
74 | pud = pud_offset(pgd, vaddr); | ||
75 | if (pud_none(*pud)) { | ||
76 | BUG(); | ||
77 | return; | ||
78 | } | ||
79 | pmd = pmd_offset(pud, vaddr); | ||
80 | if (pmd_none(*pmd)) { | ||
81 | BUG(); | ||
82 | return; | ||
83 | } | ||
84 | pte = pte_offset_kernel(pmd, vaddr); | ||
85 | /* <pfn,flags> stored as-is, to permit clearing entries */ | ||
86 | set_pte(pte, pfn_pte(pfn, flags)); | ||
87 | |||
88 | /* | ||
89 | * It's enough to flush this one mapping. | ||
90 | * (PGE mappings get flushed as well) | ||
91 | */ | ||
92 | __flush_tlb_one(vaddr); | ||
93 | } | ||
94 | |||
95 | /* | ||
96 | * Associate a large virtual page frame with a given physical page frame | ||
97 | * and protection flags for that frame. pfn is for the base of the page, | ||
98 | * vaddr is what the page gets mapped to - both must be properly aligned. | ||
99 | * The pmd must already be instantiated. Assumes PAE mode. | ||
100 | */ | ||
101 | void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) | ||
102 | { | ||
103 | pgd_t *pgd; | ||
104 | pud_t *pud; | ||
105 | pmd_t *pmd; | ||
106 | |||
107 | if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ | ||
108 | printk ("set_pmd_pfn: vaddr misaligned\n"); | ||
109 | return; /* BUG(); */ | ||
110 | } | ||
111 | if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ | ||
112 | printk ("set_pmd_pfn: pfn misaligned\n"); | ||
113 | return; /* BUG(); */ | ||
114 | } | ||
115 | pgd = swapper_pg_dir + pgd_index(vaddr); | ||
116 | if (pgd_none(*pgd)) { | ||
117 | printk ("set_pmd_pfn: pgd_none\n"); | ||
118 | return; /* BUG(); */ | ||
119 | } | ||
120 | pud = pud_offset(pgd, vaddr); | ||
121 | pmd = pmd_offset(pud, vaddr); | ||
122 | set_pmd(pmd, pfn_pmd(pfn, flags)); | ||
123 | /* | ||
124 | * It's enough to flush this one mapping. | ||
125 | * (PGE mappings get flushed as well) | ||
126 | */ | ||
127 | __flush_tlb_one(vaddr); | ||
128 | } | ||
129 | |||
130 | void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) | ||
131 | { | ||
132 | unsigned long address = __fix_to_virt(idx); | ||
133 | |||
134 | if (idx >= __end_of_fixed_addresses) { | ||
135 | BUG(); | ||
136 | return; | ||
137 | } | ||
138 | set_pte_pfn(address, phys >> PAGE_SHIFT, flags); | ||
139 | } | ||
140 | |||
141 | pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) | ||
142 | { | ||
143 | return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); | ||
144 | } | ||
145 | |||
146 | struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) | ||
147 | { | ||
148 | struct page *pte; | ||
149 | |||
150 | #ifdef CONFIG_HIGHPTE | ||
151 | pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); | ||
152 | #else | ||
153 | pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); | ||
154 | #endif | ||
155 | return pte; | ||
156 | } | ||
157 | |||
158 | void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) | ||
159 | { | ||
160 | memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); | ||
161 | } | ||
162 | |||
163 | /* | ||
164 | * List of all pgd's needed for non-PAE so it can invalidate entries | ||
165 | * in both cached and uncached pgd's; not needed for PAE since the | ||
166 | * kernel pmd is shared. If PAE were not to share the pmd a similar | ||
167 | * tactic would be needed. This is essentially codepath-based locking | ||
168 | * against pageattr.c; it is the unique case in which a valid change | ||
169 | * of kernel pagetables can't be lazily synchronized by vmalloc faults. | ||
170 | * vmalloc faults work because attached pagetables are never freed. | ||
171 | * The locking scheme was chosen on the basis of manfred's | ||
172 | * recommendations and having no core impact whatsoever. | ||
173 | * -- wli | ||
174 | */ | ||
175 | DEFINE_SPINLOCK(pgd_lock); | ||
176 | struct page *pgd_list; | ||
177 | |||
178 | static inline void pgd_list_add(pgd_t *pgd) | ||
179 | { | ||
180 | struct page *page = virt_to_page(pgd); | ||
181 | page->index = (unsigned long)pgd_list; | ||
182 | if (pgd_list) | ||
183 | pgd_list->private = (unsigned long)&page->index; | ||
184 | pgd_list = page; | ||
185 | page->private = (unsigned long)&pgd_list; | ||
186 | } | ||
187 | |||
188 | static inline void pgd_list_del(pgd_t *pgd) | ||
189 | { | ||
190 | struct page *next, **pprev, *page = virt_to_page(pgd); | ||
191 | next = (struct page *)page->index; | ||
192 | pprev = (struct page **)page->private; | ||
193 | *pprev = next; | ||
194 | if (next) | ||
195 | next->private = (unsigned long)pprev; | ||
196 | } | ||
197 | |||
198 | void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) | ||
199 | { | ||
200 | unsigned long flags; | ||
201 | |||
202 | if (PTRS_PER_PMD == 1) | ||
203 | spin_lock_irqsave(&pgd_lock, flags); | ||
204 | |||
205 | memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD, | ||
206 | swapper_pg_dir + USER_PTRS_PER_PGD, | ||
207 | (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); | ||
208 | |||
209 | if (PTRS_PER_PMD > 1) | ||
210 | return; | ||
211 | |||
212 | pgd_list_add(pgd); | ||
213 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
214 | memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); | ||
215 | } | ||
216 | |||
217 | /* never called when PTRS_PER_PMD > 1 */ | ||
218 | void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) | ||
219 | { | ||
220 | unsigned long flags; /* can be called from interrupt context */ | ||
221 | |||
222 | spin_lock_irqsave(&pgd_lock, flags); | ||
223 | pgd_list_del(pgd); | ||
224 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
225 | } | ||
226 | |||
227 | pgd_t *pgd_alloc(struct mm_struct *mm) | ||
228 | { | ||
229 | int i; | ||
230 | pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); | ||
231 | |||
232 | if (PTRS_PER_PMD == 1 || !pgd) | ||
233 | return pgd; | ||
234 | |||
235 | for (i = 0; i < USER_PTRS_PER_PGD; ++i) { | ||
236 | pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); | ||
237 | if (!pmd) | ||
238 | goto out_oom; | ||
239 | set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); | ||
240 | } | ||
241 | return pgd; | ||
242 | |||
243 | out_oom: | ||
244 | for (i--; i >= 0; i--) | ||
245 | kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); | ||
246 | kmem_cache_free(pgd_cache, pgd); | ||
247 | return NULL; | ||
248 | } | ||
249 | |||
250 | void pgd_free(pgd_t *pgd) | ||
251 | { | ||
252 | int i; | ||
253 | |||
254 | /* in the PAE case user pgd entries are overwritten before usage */ | ||
255 | if (PTRS_PER_PMD > 1) | ||
256 | for (i = 0; i < USER_PTRS_PER_PGD; ++i) | ||
257 | kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); | ||
258 | /* in the non-PAE case, clear_page_range() clears user pgd entries */ | ||
259 | kmem_cache_free(pgd_cache, pgd); | ||
260 | } | ||
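pgd_list_add()/pgd_list_del() in pgtable.c above keep a singly linked list whose back link is the address of the previous element's next pointer (stashed in page->index and page->private), so an entry can be unlinked without walking the list. The same pattern with an ordinary struct, as an illustrative userspace sketch; the node type and ids are invented.

/*
 * Illustrative sketch only: intrusive list with a pointer-to-previous-next
 * back link, giving O(1) deletion from anywhere in the list.
 */
#include <stdio.h>

struct node {
        struct node *next;
        struct node **pprev;        /* points at whatever points at us */
        int id;
};

static struct node *list_head;

static void node_add(struct node *n)
{
        n->next = list_head;
        if (list_head)
                list_head->pprev = &n->next;
        list_head = n;
        n->pprev = &list_head;
}

static void node_del(struct node *n)
{
        *n->pprev = n->next;
        if (n->next)
                n->next->pprev = n->pprev;
}

int main(void)
{
        struct node a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
        struct node *p;

        node_add(&a);
        node_add(&b);
        node_add(&c);
        node_del(&b);               /* O(1) removal from the middle */

        for (p = list_head; p; p = p->next)
                printf("node %d\n", p->id);
        return 0;
}

Storing the address of the previous next pointer instead of a full prev pointer keeps deletion constant-time even when the element being removed is the list head.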