author     Ingo Molnar <mingo@elte.hu>    2008-07-10 12:55:17 -0400
committer  Ingo Molnar <mingo@elte.hu>    2008-07-10 12:55:17 -0400
commit     520b9617ab4aea764ddfc5d58cae21c16b3318e1 (patch)
tree       1612249d11d455cfd6a0d691f5564673ae179c5f /arch/x86/mm
parent     f57e91682d141ea50d8c6d42cdc251b6256a3755 (diff)
parent     f87f38ec5a5157aa39f44f6018dc58ea62f8e0e2 (diff)
Merge branch 'x86/core' into x86/generalize-visws
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile          |   3
-rw-r--r--  arch/x86/mm/discontig_32.c    | 285
-rw-r--r--  arch/x86/mm/dump_pagetables.c |   2
-rw-r--r--  arch/x86/mm/fault.c           |  97
-rw-r--r--  arch/x86/mm/init_32.c         | 522
-rw-r--r--  arch/x86/mm/init_64.c         | 539
-rw-r--r--  arch/x86/mm/ioremap.c         |  26
-rw-r--r--  arch/x86/mm/k8topology_64.c   |  21
-rw-r--r--  arch/x86/mm/numa_64.c         |  93
-rw-r--r--  arch/x86/mm/pageattr-test.c   |  21
-rw-r--r--  arch/x86/mm/pageattr.c        |  41
-rw-r--r--  arch/x86/mm/pat.c             | 375
-rw-r--r--  arch/x86/mm/pgtable.c         | 190
-rw-r--r--  arch/x86/mm/pgtable_32.c      |  56
-rw-r--r--  arch/x86/mm/srat_32.c         | 280
-rw-r--r--  arch/x86/mm/srat_64.c         |   7
16 files changed, 1662 insertions, 896 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index b7b3e4c7cfc9..c107641cd39b 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -13,5 +13,6 @@ obj-$(CONFIG_NUMA) += discontig_32.o
 else
 obj-$(CONFIG_NUMA) += numa_64.o
 obj-$(CONFIG_K8_NUMA) += k8topology_64.o
-obj-$(CONFIG_ACPI_NUMA) += srat_64.o
 endif
+obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
+
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 914ccf983687..5dfef9fa061a 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -38,6 +38,7 @@
38#include <asm/setup.h> 38#include <asm/setup.h>
39#include <asm/mmzone.h> 39#include <asm/mmzone.h>
40#include <asm/bios_ebda.h> 40#include <asm/bios_ebda.h>
41#include <asm/proto.h>
41 42
42struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 43struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
43EXPORT_SYMBOL(node_data); 44EXPORT_SYMBOL(node_data);
@@ -59,14 +60,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
59/* 60/*
60 * 4) physnode_map - the mapping between a pfn and owning node 61 * 4) physnode_map - the mapping between a pfn and owning node
61 * physnode_map keeps track of the physical memory layout of a generic 62 * physnode_map keeps track of the physical memory layout of a generic
62 * numa node on a 256Mb break (each element of the array will 63 * numa node on a 64Mb break (each element of the array will
63 * represent 256Mb of memory and will be marked by the node id. so, 64 * represent 64Mb of memory and will be marked by the node id. so,
64 * if the first gig is on node 0, and the second gig is on node 1 65 * if the first gig is on node 0, and the second gig is on node 1
65 * physnode_map will contain: 66 * physnode_map will contain:
66 * 67 *
67 * physnode_map[0-3] = 0; 68 * physnode_map[0-15] = 0;
68 * physnode_map[4-7] = 1; 69 * physnode_map[16-31] = 1;
69 * physnode_map[8- ] = -1; 70 * physnode_map[32- ] = -1;
70 */ 71 */
71s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1}; 72s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
72EXPORT_SYMBOL(physnode_map); 73EXPORT_SYMBOL(physnode_map);
@@ -75,15 +76,15 @@ void memory_present(int nid, unsigned long start, unsigned long end)
75{ 76{
76 unsigned long pfn; 77 unsigned long pfn;
77 78
78 printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n", 79 printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
79 nid, start, end); 80 nid, start, end);
80 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); 81 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
81 printk(KERN_DEBUG " "); 82 printk(KERN_DEBUG " ");
82 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { 83 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
83 physnode_map[pfn / PAGES_PER_ELEMENT] = nid; 84 physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
84 printk("%ld ", pfn); 85 printk(KERN_CONT "%lx ", pfn);
85 } 86 }
86 printk("\n"); 87 printk(KERN_CONT "\n");
87} 88}
88 89
89unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, 90unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
@@ -99,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
99#endif 100#endif
100 101
101extern unsigned long find_max_low_pfn(void); 102extern unsigned long find_max_low_pfn(void);
102extern void add_one_highpage_init(struct page *, int, int);
103extern unsigned long highend_pfn, highstart_pfn; 103extern unsigned long highend_pfn, highstart_pfn;
104 104
105#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) 105#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
@@ -117,13 +117,13 @@ static unsigned long kva_pages;
117 */ 117 */
118int __init get_memcfg_numa_flat(void) 118int __init get_memcfg_numa_flat(void)
119{ 119{
120 printk("NUMA - single node, flat memory mode\n"); 120 printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
121 121
122 /* Run the memory configuration and find the top of memory. */
123 propagate_e820_map();
124 node_start_pfn[0] = 0; 122 node_start_pfn[0] = 0;
125 node_end_pfn[0] = max_pfn; 123 node_end_pfn[0] = max_pfn;
124 e820_register_active_regions(0, 0, max_pfn);
126 memory_present(0, 0, max_pfn); 125 memory_present(0, 0, max_pfn);
126 node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
127 127
128 /* Indicate there is one node available. */ 128 /* Indicate there is one node available. */
129 nodes_clear(node_online_map); 129 nodes_clear(node_online_map);
@@ -156,24 +156,32 @@ static void __init propagate_e820_map_node(int nid)
156 */ 156 */
157static void __init allocate_pgdat(int nid) 157static void __init allocate_pgdat(int nid)
158{ 158{
159 if (nid && node_has_online_mem(nid)) 159 char buf[16];
160
161 if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
160 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; 162 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
161 else { 163 else {
162 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn)); 164 unsigned long pgdat_phys;
163 min_low_pfn += PFN_UP(sizeof(pg_data_t)); 165 pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
166 max_pfn_mapped<<PAGE_SHIFT,
167 sizeof(pg_data_t),
168 PAGE_SIZE);
169 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
170 memset(buf, 0, sizeof(buf));
171 sprintf(buf, "NODE_DATA %d", nid);
172 reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
164 } 173 }
174 printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
175 nid, (unsigned long)NODE_DATA(nid));
165} 176}
166 177
167#ifdef CONFIG_DISCONTIGMEM
168/* 178/*
169 * In the discontig memory model, a portion of the kernel virtual area (KVA) 179 * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
170 * is reserved and portions of nodes are mapped using it. This is to allow 180 * virtual address space (KVA) is reserved and portions of nodes are mapped
171 * node-local memory to be allocated for structures that would normally require 181 * using it. This is to allow node-local memory to be allocated for
172 * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers 182 * structures that would normally require ZONE_NORMAL. The memory is
173 * should be prepared to allocate from the bootmem allocator instead. This KVA 183 * allocated with alloc_remap() and callers should be prepared to allocate
174 * mechanism is incompatible with SPARSEMEM as it makes assumptions about the 184 * from the bootmem allocator instead.
175 * layout of memory that are broken if alloc_remap() succeeds for some of the
176 * map and fails for others
177 */ 185 */
178static unsigned long node_remap_start_pfn[MAX_NUMNODES]; 186static unsigned long node_remap_start_pfn[MAX_NUMNODES];
179static void *node_remap_end_vaddr[MAX_NUMNODES]; 187static void *node_remap_end_vaddr[MAX_NUMNODES];
@@ -195,15 +203,19 @@ void *alloc_remap(int nid, unsigned long size)
195 return allocation; 203 return allocation;
196} 204}
197 205
198void __init remap_numa_kva(void) 206static void __init remap_numa_kva(void)
199{ 207{
200 void *vaddr; 208 void *vaddr;
201 unsigned long pfn; 209 unsigned long pfn;
202 int node; 210 int node;
203 211
204 for_each_online_node(node) { 212 for_each_online_node(node) {
213 printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
205 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { 214 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
206 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT); 215 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
216 printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
217 (unsigned long)vaddr,
218 node_remap_start_pfn[node] + pfn);
207 set_pmd_pfn((ulong) vaddr, 219 set_pmd_pfn((ulong) vaddr,
208 node_remap_start_pfn[node] + pfn, 220 node_remap_start_pfn[node] + pfn,
209 PAGE_KERNEL_LARGE); 221 PAGE_KERNEL_LARGE);
@@ -215,17 +227,21 @@ static unsigned long calculate_numa_remap_pages(void)
215{ 227{
216 int nid; 228 int nid;
217 unsigned long size, reserve_pages = 0; 229 unsigned long size, reserve_pages = 0;
218 unsigned long pfn;
219 230
220 for_each_online_node(nid) { 231 for_each_online_node(nid) {
221 unsigned old_end_pfn = node_end_pfn[nid]; 232 u64 node_kva_target;
233 u64 node_kva_final;
222 234
223 /* 235 /*
224 * The acpi/srat node info can show hot-add memroy zones 236 * The acpi/srat node info can show hot-add memroy zones
225 * where memory could be added but not currently present. 237 * where memory could be added but not currently present.
226 */ 238 */
239 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
240 nid, node_start_pfn[nid], node_end_pfn[nid]);
227 if (node_start_pfn[nid] > max_pfn) 241 if (node_start_pfn[nid] > max_pfn)
228 continue; 242 continue;
243 if (!node_end_pfn[nid])
244 continue;
229 if (node_end_pfn[nid] > max_pfn) 245 if (node_end_pfn[nid] > max_pfn)
230 node_end_pfn[nid] = max_pfn; 246 node_end_pfn[nid] = max_pfn;
231 247
@@ -237,41 +253,48 @@ static unsigned long calculate_numa_remap_pages(void)
237 /* now the roundup is correct, convert to PAGE_SIZE pages */ 253 /* now the roundup is correct, convert to PAGE_SIZE pages */
238 size = size * PTRS_PER_PTE; 254 size = size * PTRS_PER_PTE;
239 255
240 /* 256 node_kva_target = round_down(node_end_pfn[nid] - size,
241 * Validate the region we are allocating only contains valid 257 PTRS_PER_PTE);
242 * pages. 258 node_kva_target <<= PAGE_SHIFT;
243 */ 259 do {
244 for (pfn = node_end_pfn[nid] - size; 260 node_kva_final = find_e820_area(node_kva_target,
245 pfn < node_end_pfn[nid]; pfn++) 261 ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
246 if (!page_is_ram(pfn)) 262 ((u64)size)<<PAGE_SHIFT,
247 break; 263 LARGE_PAGE_BYTES);
248 264 node_kva_target -= LARGE_PAGE_BYTES;
249 if (pfn != node_end_pfn[nid]) 265 } while (node_kva_final == -1ULL &&
250 size = 0; 266 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
267
268 if (node_kva_final == -1ULL)
269 panic("Can not get kva ram\n");
251 270
252 printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
253 size, nid);
254 node_remap_size[nid] = size; 271 node_remap_size[nid] = size;
255 node_remap_offset[nid] = reserve_pages; 272 node_remap_offset[nid] = reserve_pages;
256 reserve_pages += size; 273 reserve_pages += size;
257 printk("Shrinking node %d from %ld pages to %ld pages\n", 274 printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
258 nid, node_end_pfn[nid], node_end_pfn[nid] - size); 275 " node %d at %llx\n",
259 276 size, nid, node_kva_final>>PAGE_SHIFT);
260 if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) { 277
261 /* 278 /*
262 * Align node_end_pfn[] and node_remap_start_pfn[] to 279 * prevent kva address below max_low_pfn want it on system
263 * pmd boundary. remap_numa_kva will barf otherwise. 280 * with less memory later.
264 */ 281 * layout will be: KVA address , KVA RAM
265 printk("Shrinking node %d further by %ld pages for proper alignment\n", 282 *
266 nid, node_end_pfn[nid] & (PTRS_PER_PTE-1)); 283 * we are supposed to only record the one less then max_low_pfn
267 size += node_end_pfn[nid] & (PTRS_PER_PTE-1); 284 * but we could have some hole in high memory, and it will only
268 } 285 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
286 * to use it as free.
287 * So reserve_early here, hope we don't run out of that array
288 */
289 reserve_early(node_kva_final,
290 node_kva_final+(((u64)size)<<PAGE_SHIFT),
291 "KVA RAM");
269 292
270 node_end_pfn[nid] -= size; 293 node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
271 node_remap_start_pfn[nid] = node_end_pfn[nid]; 294 remove_active_range(nid, node_remap_start_pfn[nid],
272 shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]); 295 node_remap_start_pfn[nid] + size);
273 } 296 }
274 printk("Reserving total of %ld pages for numa KVA remap\n", 297 printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
275 reserve_pages); 298 reserve_pages);
276 return reserve_pages; 299 return reserve_pages;
277} 300}
@@ -285,37 +308,16 @@ static void init_remap_allocator(int nid)
285 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + 308 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
286 ALIGN(sizeof(pg_data_t), PAGE_SIZE); 309 ALIGN(sizeof(pg_data_t), PAGE_SIZE);
287 310
288 printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, 311 printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
289 (ulong) node_remap_start_vaddr[nid], 312 (ulong) node_remap_start_vaddr[nid],
290 (ulong) pfn_to_kaddr(highstart_pfn 313 (ulong) node_remap_end_vaddr[nid]);
291 + node_remap_offset[nid] + node_remap_size[nid]));
292}
293#else
294void *alloc_remap(int nid, unsigned long size)
295{
296 return NULL;
297}
298
299static unsigned long calculate_numa_remap_pages(void)
300{
301 return 0;
302}
303
304static void init_remap_allocator(int nid)
305{
306}
307
308void __init remap_numa_kva(void)
309{
310} 314}
311#endif /* CONFIG_DISCONTIGMEM */
312 315
313extern void setup_bootmem_allocator(void); 316void __init initmem_init(unsigned long start_pfn,
314unsigned long __init setup_memory(void) 317 unsigned long end_pfn)
315{ 318{
316 int nid; 319 int nid;
317 unsigned long system_start_pfn, system_max_low_pfn; 320 long kva_target_pfn;
318 unsigned long wasted_pages;
319 321
320 /* 322 /*
321 * When mapping a NUMA machine we allocate the node_mem_map arrays 323 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -324,109 +326,77 @@ unsigned long __init setup_memory(void)
324 * this space and use it to adjust the boundary between ZONE_NORMAL 326 * this space and use it to adjust the boundary between ZONE_NORMAL
325 * and ZONE_HIGHMEM. 327 * and ZONE_HIGHMEM.
326 */ 328 */
327 get_memcfg_numa();
328 329
329 kva_pages = calculate_numa_remap_pages(); 330 get_memcfg_numa();
330 331
331 /* partially used pages are not usable - thus round upwards */ 332 kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
332 system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
333 333
334 kva_start_pfn = find_max_low_pfn() - kva_pages; 334 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
335 do {
336 kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
337 max_low_pfn<<PAGE_SHIFT,
338 kva_pages<<PAGE_SHIFT,
339 PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
340 kva_target_pfn -= PTRS_PER_PTE;
341 } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
335 342
336#ifdef CONFIG_BLK_DEV_INITRD 343 if (kva_start_pfn == -1UL)
337 /* Numa kva area is below the initrd */ 344 panic("Can not get kva space\n");
338 if (initrd_start)
339 kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
340 - kva_pages;
341#endif
342 345
343 /* 346 printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
344 * We waste pages past at the end of the KVA for no good reason other
345 * than how it is located. This is bad.
346 */
347 wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
348 kva_start_pfn -= wasted_pages;
349 kva_pages += wasted_pages;
350
351 system_max_low_pfn = max_low_pfn = find_max_low_pfn();
352 printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
353 kva_start_pfn, max_low_pfn); 347 kva_start_pfn, max_low_pfn);
354 printk("max_pfn = %ld\n", max_pfn); 348 printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
349
350 /* avoid clash with initrd */
351 reserve_early(kva_start_pfn<<PAGE_SHIFT,
352 (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
353 "KVA PG");
355#ifdef CONFIG_HIGHMEM 354#ifdef CONFIG_HIGHMEM
356 highstart_pfn = highend_pfn = max_pfn; 355 highstart_pfn = highend_pfn = max_pfn;
357 if (max_pfn > system_max_low_pfn) 356 if (max_pfn > max_low_pfn)
358 highstart_pfn = system_max_low_pfn; 357 highstart_pfn = max_low_pfn;
359 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", 358 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
360 pages_to_mb(highend_pfn - highstart_pfn)); 359 pages_to_mb(highend_pfn - highstart_pfn));
361 num_physpages = highend_pfn; 360 num_physpages = highend_pfn;
362 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; 361 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
363#else 362#else
364 num_physpages = system_max_low_pfn; 363 num_physpages = max_low_pfn;
365 high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1; 364 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
366#endif 365#endif
367 printk(KERN_NOTICE "%ldMB LOWMEM available.\n", 366 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
368 pages_to_mb(system_max_low_pfn)); 367 pages_to_mb(max_low_pfn));
369 printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", 368 printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
370 min_low_pfn, max_low_pfn, highstart_pfn); 369 max_low_pfn, highstart_pfn);
371 370
372 printk("Low memory ends at vaddr %08lx\n", 371 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
373 (ulong) pfn_to_kaddr(max_low_pfn)); 372 (ulong) pfn_to_kaddr(max_low_pfn));
374 for_each_online_node(nid) { 373 for_each_online_node(nid) {
375 init_remap_allocator(nid); 374 init_remap_allocator(nid);
376 375
377 allocate_pgdat(nid); 376 allocate_pgdat(nid);
378 } 377 }
379 printk("High memory starts at vaddr %08lx\n", 378 remap_numa_kva();
379
380 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
380 (ulong) pfn_to_kaddr(highstart_pfn)); 381 (ulong) pfn_to_kaddr(highstart_pfn));
381 for_each_online_node(nid) 382 for_each_online_node(nid)
382 propagate_e820_map_node(nid); 383 propagate_e820_map_node(nid);
383 384
384 memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); 385 for_each_online_node(nid)
386 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
387
385 NODE_DATA(0)->bdata = &node0_bdata; 388 NODE_DATA(0)->bdata = &node0_bdata;
386 setup_bootmem_allocator(); 389 setup_bootmem_allocator();
387 return max_low_pfn;
388}
389
390void __init numa_kva_reserve(void)
391{
392 if (kva_pages)
393 reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
394 BOOTMEM_DEFAULT);
395} 390}
396 391
397void __init zone_sizes_init(void) 392void __init set_highmem_pages_init(void)
398{
399 int nid;
400 unsigned long max_zone_pfns[MAX_NR_ZONES];
401 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
402 max_zone_pfns[ZONE_DMA] =
403 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
404 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
405#ifdef CONFIG_HIGHMEM
406 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
407#endif
408
409 /* If SRAT has not registered memory, register it now */
410 if (find_max_pfn_with_active_regions() == 0) {
411 for_each_online_node(nid) {
412 if (node_has_online_mem(nid))
413 add_active_range(nid, node_start_pfn[nid],
414 node_end_pfn[nid]);
415 }
416 }
417
418 free_area_init_nodes(max_zone_pfns);
419 return;
420}
421
422void __init set_highmem_pages_init(int bad_ppro)
423{ 393{
424#ifdef CONFIG_HIGHMEM 394#ifdef CONFIG_HIGHMEM
425 struct zone *zone; 395 struct zone *zone;
426 struct page *page; 396 int nid;
427 397
428 for_each_zone(zone) { 398 for_each_zone(zone) {
429 unsigned long node_pfn, zone_start_pfn, zone_end_pfn; 399 unsigned long zone_start_pfn, zone_end_pfn;
430 400
431 if (!is_highmem(zone)) 401 if (!is_highmem(zone))
432 continue; 402 continue;
@@ -434,16 +404,12 @@ void __init set_highmem_pages_init(int bad_ppro)
434 zone_start_pfn = zone->zone_start_pfn; 404 zone_start_pfn = zone->zone_start_pfn;
435 zone_end_pfn = zone_start_pfn + zone->spanned_pages; 405 zone_end_pfn = zone_start_pfn + zone->spanned_pages;
436 406
437 printk("Initializing %s for node %d (%08lx:%08lx)\n", 407 nid = zone_to_nid(zone);
438 zone->name, zone_to_nid(zone), 408 printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
439 zone_start_pfn, zone_end_pfn); 409 zone->name, nid, zone_start_pfn, zone_end_pfn);
440 410
441 for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { 411 add_highpages_with_active_regions(nid, zone_start_pfn,
442 if (!pfn_valid(node_pfn)) 412 zone_end_pfn);
443 continue;
444 page = pfn_to_page(node_pfn);
445 add_one_highpage_init(page, node_pfn, bad_ppro);
446 }
447 } 413 }
448 totalram_pages += totalhigh_pages; 414 totalram_pages += totalhigh_pages;
449#endif 415#endif
@@ -476,3 +442,4 @@ int memory_add_physaddr_to_nid(u64 addr)
476 442
477EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 443EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
478#endif 444#endif
445
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 2c24bea92c66..0bb0caed8971 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -42,7 +42,7 @@ static struct addr_marker address_markers[] = {
 	{ 0, "User Space" },
 #ifdef CONFIG_X86_64
 	{ 0x8000000000000000UL, "Kernel Space" },
-	{ 0xffff810000000000UL, "Low Kernel Mapping" },
+	{ PAGE_OFFSET, "Low Kernel Mapping" },
 	{ VMALLOC_START, "vmalloc() Area" },
 	{ VMEMMAP_START, "Vmemmap" },
 	{ __START_KERNEL_map, "High Kernel Mapping" },
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8bcb6f40ccb6..d0f5fce77d95 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -55,11 +55,7 @@ static inline int notify_page_fault(struct pt_regs *regs)
 	int ret = 0;
 
 	/* kprobe_running() needs smp_processor_id() */
-#ifdef CONFIG_X86_32
 	if (!user_mode_vm(regs)) {
-#else
-	if (!user_mode(regs)) {
-#endif
 		preempt_disable();
 		if (kprobe_running() && kprobe_fault_handler(regs, 14))
 			ret = 1;
@@ -396,11 +392,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		printk(KERN_CONT "NULL pointer dereference");
 	else
 		printk(KERN_CONT "paging request");
-#ifdef CONFIG_X86_32
-	printk(KERN_CONT " at %08lx\n", address);
-#else
-	printk(KERN_CONT " at %016lx\n", address);
-#endif
+	printk(KERN_CONT " at %p\n", (void *) address);
 	printk(KERN_ALERT "IP:");
 	printk_address(regs->ip, 1);
 	dump_pagetable(address);
@@ -800,14 +792,10 @@ bad_area_nosemaphore:
 	if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
 	    printk_ratelimit()) {
 		printk(
-#ifdef CONFIG_X86_32
-		"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
-#else
-		"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
-#endif
+		"%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
 			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
-			tsk->comm, task_pid_nr(tsk), address, regs->ip,
-			regs->sp, error_code);
+			tsk->comm, task_pid_nr(tsk), address,
+			(void *) regs->ip, (void *) regs->sp, error_code);
 		print_vma_addr(" in ", regs->ip);
 		printk("\n");
 	}
@@ -915,14 +903,7 @@ LIST_HEAD(pgd_list);
 void vmalloc_sync_all(void)
 {
 #ifdef CONFIG_X86_32
-	/*
-	 * Note that races in the updates of insync and start aren't
-	 * problematic: insync can only get set bits added, and updates to
-	 * start are only improving performance (without affecting correctness
-	 * if undone).
-	 */
-	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
-	static unsigned long start = TASK_SIZE;
+	unsigned long start = VMALLOC_START & PGDIR_MASK;
 	unsigned long address;
 
 	if (SHARED_KERNEL_PMD)
@@ -930,56 +911,38 @@ void vmalloc_sync_all(void)
 
 	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
 	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
-		if (!test_bit(pgd_index(address), insync)) {
-			unsigned long flags;
-			struct page *page;
-
-			spin_lock_irqsave(&pgd_lock, flags);
-			list_for_each_entry(page, &pgd_list, lru) {
-				if (!vmalloc_sync_one(page_address(page),
-						      address))
-					break;
-			}
-			spin_unlock_irqrestore(&pgd_lock, flags);
-			if (!page)
-				set_bit(pgd_index(address), insync);
+		unsigned long flags;
+		struct page *page;
+
+		spin_lock_irqsave(&pgd_lock, flags);
+		list_for_each_entry(page, &pgd_list, lru) {
+			if (!vmalloc_sync_one(page_address(page),
+					      address))
+				break;
 		}
-		if (address == start && test_bit(pgd_index(address), insync))
-			start = address + PGDIR_SIZE;
+		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
 #else /* CONFIG_X86_64 */
-	/*
-	 * Note that races in the updates of insync and start aren't
-	 * problematic: insync can only get set bits added, and updates to
-	 * start are only improving performance (without affecting correctness
-	 * if undone).
-	 */
-	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
-	static unsigned long start = VMALLOC_START & PGDIR_MASK;
+	unsigned long start = VMALLOC_START & PGDIR_MASK;
 	unsigned long address;
 
 	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
-		if (!test_bit(pgd_index(address), insync)) {
-			const pgd_t *pgd_ref = pgd_offset_k(address);
-			unsigned long flags;
-			struct page *page;
-
-			if (pgd_none(*pgd_ref))
-				continue;
-			spin_lock_irqsave(&pgd_lock, flags);
-			list_for_each_entry(page, &pgd_list, lru) {
-				pgd_t *pgd;
-				pgd = (pgd_t *)page_address(page) + pgd_index(address);
-				if (pgd_none(*pgd))
-					set_pgd(pgd, *pgd_ref);
-				else
-					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-			}
-			spin_unlock_irqrestore(&pgd_lock, flags);
-			set_bit(pgd_index(address), insync);
+		const pgd_t *pgd_ref = pgd_offset_k(address);
+		unsigned long flags;
+		struct page *page;
+
+		if (pgd_none(*pgd_ref))
+			continue;
+		spin_lock_irqsave(&pgd_lock, flags);
+		list_for_each_entry(page, &pgd_list, lru) {
+			pgd_t *pgd;
+			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			if (pgd_none(*pgd))
+				set_pgd(pgd, *pgd_ref);
+			else
+				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
 		}
-		if (address == start)
-			start = address + PGDIR_SIZE;
+		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
 #endif
 }
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ec30d10154b6..b5a0fd5f4c5f 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -57,6 +57,27 @@ unsigned long highstart_pfn, highend_pfn;
57 57
58static noinline int do_test_wp_bit(void); 58static noinline int do_test_wp_bit(void);
59 59
60
61static unsigned long __initdata table_start;
62static unsigned long __meminitdata table_end;
63static unsigned long __meminitdata table_top;
64
65static int __initdata after_init_bootmem;
66
67static __init void *alloc_low_page(unsigned long *phys)
68{
69 unsigned long pfn = table_end++;
70 void *adr;
71
72 if (pfn >= table_top)
73 panic("alloc_low_page: ran out of memory");
74
75 adr = __va(pfn * PAGE_SIZE);
76 memset(adr, 0, PAGE_SIZE);
77 *phys = pfn * PAGE_SIZE;
78 return adr;
79}
80
60/* 81/*
61 * Creates a middle page table and puts a pointer to it in the 82 * Creates a middle page table and puts a pointer to it in the
62 * given global directory entry. This only returns the gd entry 83 * given global directory entry. This only returns the gd entry
@@ -68,9 +89,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
68 pmd_t *pmd_table; 89 pmd_t *pmd_table;
69 90
70#ifdef CONFIG_X86_PAE 91#ifdef CONFIG_X86_PAE
92 unsigned long phys;
71 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 93 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
72 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); 94 if (after_init_bootmem)
73 95 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
96 else
97 pmd_table = (pmd_t *)alloc_low_page(&phys);
74 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); 98 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
75 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 99 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
76 pud = pud_offset(pgd, 0); 100 pud = pud_offset(pgd, 0);
@@ -92,12 +116,16 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
92 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { 116 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
93 pte_t *page_table = NULL; 117 pte_t *page_table = NULL;
94 118
119 if (after_init_bootmem) {
95#ifdef CONFIG_DEBUG_PAGEALLOC 120#ifdef CONFIG_DEBUG_PAGEALLOC
96 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 121 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
97#endif 122#endif
98 if (!page_table) { 123 if (!page_table)
99 page_table = 124 page_table =
100 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); 125 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
126 } else {
127 unsigned long phys;
128 page_table = (pte_t *)alloc_low_page(&phys);
101 } 129 }
102 130
103 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); 131 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
@@ -155,38 +183,44 @@ static inline int is_kernel_text(unsigned long addr)
155 * of max_low_pfn pages, by creating page tables starting from address 183 * of max_low_pfn pages, by creating page tables starting from address
156 * PAGE_OFFSET: 184 * PAGE_OFFSET:
157 */ 185 */
158static void __init kernel_physical_mapping_init(pgd_t *pgd_base) 186static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
187 unsigned long start_pfn,
188 unsigned long end_pfn,
189 int use_pse)
159{ 190{
160 int pgd_idx, pmd_idx, pte_ofs; 191 int pgd_idx, pmd_idx, pte_ofs;
161 unsigned long pfn; 192 unsigned long pfn;
162 pgd_t *pgd; 193 pgd_t *pgd;
163 pmd_t *pmd; 194 pmd_t *pmd;
164 pte_t *pte; 195 pte_t *pte;
196 unsigned pages_2m = 0, pages_4k = 0;
165 197
166 pgd_idx = pgd_index(PAGE_OFFSET); 198 if (!cpu_has_pse)
167 pgd = pgd_base + pgd_idx; 199 use_pse = 0;
168 pfn = 0;
169 200
201 pfn = start_pfn;
202 pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
203 pgd = pgd_base + pgd_idx;
170 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { 204 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
171 pmd = one_md_table_init(pgd); 205 pmd = one_md_table_init(pgd);
172 if (pfn >= max_low_pfn)
173 continue;
174 206
175 for (pmd_idx = 0; 207 if (pfn >= end_pfn)
176 pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; 208 continue;
209#ifdef CONFIG_X86_PAE
210 pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
211 pmd += pmd_idx;
212#else
213 pmd_idx = 0;
214#endif
215 for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
177 pmd++, pmd_idx++) { 216 pmd++, pmd_idx++) {
178 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; 217 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
179 218
180 /* 219 /*
181 * Map with big pages if possible, otherwise 220 * Map with big pages if possible, otherwise
182 * create normal page tables: 221 * create normal page tables:
183 *
184 * Don't use a large page for the first 2/4MB of memory
185 * because there are often fixed size MTRRs in there
186 * and overlapping MTRRs into large pages can cause
187 * slowdowns.
188 */ 222 */
189 if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) { 223 if (use_pse) {
190 unsigned int addr2; 224 unsigned int addr2;
191 pgprot_t prot = PAGE_KERNEL_LARGE; 225 pgprot_t prot = PAGE_KERNEL_LARGE;
192 226
@@ -197,34 +231,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
197 is_kernel_text(addr2)) 231 is_kernel_text(addr2))
198 prot = PAGE_KERNEL_LARGE_EXEC; 232 prot = PAGE_KERNEL_LARGE_EXEC;
199 233
234 pages_2m++;
200 set_pmd(pmd, pfn_pmd(pfn, prot)); 235 set_pmd(pmd, pfn_pmd(pfn, prot));
201 236
202 pfn += PTRS_PER_PTE; 237 pfn += PTRS_PER_PTE;
203 max_pfn_mapped = pfn;
204 continue; 238 continue;
205 } 239 }
206 pte = one_page_table_init(pmd); 240 pte = one_page_table_init(pmd);
207 241
208 for (pte_ofs = 0; 242 pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
209 pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; 243 pte += pte_ofs;
244 for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
210 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { 245 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
211 pgprot_t prot = PAGE_KERNEL; 246 pgprot_t prot = PAGE_KERNEL;
212 247
213 if (is_kernel_text(addr)) 248 if (is_kernel_text(addr))
214 prot = PAGE_KERNEL_EXEC; 249 prot = PAGE_KERNEL_EXEC;
215 250
251 pages_4k++;
216 set_pte(pte, pfn_pte(pfn, prot)); 252 set_pte(pte, pfn_pte(pfn, prot));
217 } 253 }
218 max_pfn_mapped = pfn;
219 } 254 }
220 } 255 }
221} 256 update_page_count(PG_LEVEL_2M, pages_2m);
222 257 update_page_count(PG_LEVEL_4K, pages_4k);
223static inline int page_kills_ppro(unsigned long pagenr)
224{
225 if (pagenr >= 0x70000 && pagenr <= 0x7003F)
226 return 1;
227 return 0;
228} 258}
229 259
230/* 260/*
@@ -287,29 +317,62 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
287 pkmap_page_table = pte; 317 pkmap_page_table = pte;
288} 318}
289 319
290void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) 320static void __init add_one_highpage_init(struct page *page, int pfn)
291{ 321{
292 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { 322 ClearPageReserved(page);
293 ClearPageReserved(page); 323 init_page_count(page);
294 init_page_count(page); 324 __free_page(page);
295 __free_page(page); 325 totalhigh_pages++;
296 totalhigh_pages++;
297 } else
298 SetPageReserved(page);
299} 326}
300 327
301#ifndef CONFIG_NUMA 328struct add_highpages_data {
302static void __init set_highmem_pages_init(int bad_ppro) 329 unsigned long start_pfn;
330 unsigned long end_pfn;
331};
332
333static int __init add_highpages_work_fn(unsigned long start_pfn,
334 unsigned long end_pfn, void *datax)
303{ 335{
304 int pfn; 336 int node_pfn;
337 struct page *page;
338 unsigned long final_start_pfn, final_end_pfn;
339 struct add_highpages_data *data;
305 340
306 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { 341 data = (struct add_highpages_data *)datax;
307 /* 342
308 * Holes under sparsemem might not have no mem_map[]: 343 final_start_pfn = max(start_pfn, data->start_pfn);
309 */ 344 final_end_pfn = min(end_pfn, data->end_pfn);
310 if (pfn_valid(pfn)) 345 if (final_start_pfn >= final_end_pfn)
311 add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); 346 return 0;
347
348 for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
349 node_pfn++) {
350 if (!pfn_valid(node_pfn))
351 continue;
352 page = pfn_to_page(node_pfn);
353 add_one_highpage_init(page, node_pfn);
312 } 354 }
355
356 return 0;
357
358}
359
360void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
361 unsigned long end_pfn)
362{
363 struct add_highpages_data data;
364
365 data.start_pfn = start_pfn;
366 data.end_pfn = end_pfn;
367
368 work_with_active_regions(nid, add_highpages_work_fn, &data);
369}
370
371#ifndef CONFIG_NUMA
372static void __init set_highmem_pages_init(void)
373{
374 add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
375
313 totalram_pages += totalhigh_pages; 376 totalram_pages += totalhigh_pages;
314} 377}
315#endif /* !CONFIG_NUMA */ 378#endif /* !CONFIG_NUMA */
@@ -317,14 +380,9 @@ static void __init set_highmem_pages_init(int bad_ppro)
317#else 380#else
318# define kmap_init() do { } while (0) 381# define kmap_init() do { } while (0)
319# define permanent_kmaps_init(pgd_base) do { } while (0) 382# define permanent_kmaps_init(pgd_base) do { } while (0)
320# define set_highmem_pages_init(bad_ppro) do { } while (0) 383# define set_highmem_pages_init() do { } while (0)
321#endif /* CONFIG_HIGHMEM */ 384#endif /* CONFIG_HIGHMEM */
322 385
323pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
324EXPORT_SYMBOL(__PAGE_KERNEL);
325
326pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
327
328void __init native_pagetable_setup_start(pgd_t *base) 386void __init native_pagetable_setup_start(pgd_t *base)
329{ 387{
330 unsigned long pfn, va; 388 unsigned long pfn, va;
@@ -380,27 +438,10 @@ void __init native_pagetable_setup_done(pgd_t *base)
380 * be partially populated, and so it avoids stomping on any existing 438 * be partially populated, and so it avoids stomping on any existing
381 * mappings. 439 * mappings.
382 */ 440 */
383static void __init pagetable_init(void) 441static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
384{ 442{
385 pgd_t *pgd_base = swapper_pg_dir;
386 unsigned long vaddr, end; 443 unsigned long vaddr, end;
387 444
388 paravirt_pagetable_setup_start(pgd_base);
389
390 /* Enable PSE if available */
391 if (cpu_has_pse)
392 set_in_cr4(X86_CR4_PSE);
393
394 /* Enable PGE if available */
395 if (cpu_has_pge) {
396 set_in_cr4(X86_CR4_PGE);
397 __PAGE_KERNEL |= _PAGE_GLOBAL;
398 __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
399 }
400
401 kernel_physical_mapping_init(pgd_base);
402 remap_numa_kva();
403
404 /* 445 /*
405 * Fixed mappings, only the page table structure has to be 446 * Fixed mappings, only the page table structure has to be
406 * created - mappings will be set by set_fixmap(): 447 * created - mappings will be set by set_fixmap():
@@ -410,6 +451,13 @@ static void __init pagetable_init(void)
410 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; 451 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
411 page_table_range_init(vaddr, end, pgd_base); 452 page_table_range_init(vaddr, end, pgd_base);
412 early_ioremap_reset(); 453 early_ioremap_reset();
454}
455
456static void __init pagetable_init(void)
457{
458 pgd_t *pgd_base = swapper_pg_dir;
459
460 paravirt_pagetable_setup_start(pgd_base);
413 461
414 permanent_kmaps_init(pgd_base); 462 permanent_kmaps_init(pgd_base);
415 463
@@ -456,7 +504,7 @@ void zap_low_mappings(void)
456 504
457int nx_enabled; 505int nx_enabled;
458 506
459pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX; 507pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
460EXPORT_SYMBOL_GPL(__supported_pte_mask); 508EXPORT_SYMBOL_GPL(__supported_pte_mask);
461 509
462#ifdef CONFIG_X86_PAE 510#ifdef CONFIG_X86_PAE
@@ -509,27 +557,318 @@ static void __init set_nx(void)
509} 557}
510#endif 558#endif
511 559
560/* user-defined highmem size */
561static unsigned int highmem_pages = -1;
562
512/* 563/*
513 * paging_init() sets up the page tables - note that the first 8MB are 564 * highmem=size forces highmem to be exactly 'size' bytes.
514 * already mapped by head.S. 565 * This works even on boxes that have no highmem otherwise.
515 * 566 * This also works to reduce highmem size on bigger boxes.
516 * This routines also unmaps the page at virtual kernel address 0, so
517 * that we can trap those pesky NULL-reference errors in the kernel.
518 */ 567 */
519void __init paging_init(void) 568static int __init parse_highmem(char *arg)
569{
570 if (!arg)
571 return -EINVAL;
572
573 highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
574 return 0;
575}
576early_param("highmem", parse_highmem);
577
578/*
579 * Determine low and high memory ranges:
580 */
581void __init find_low_pfn_range(void)
520{ 582{
583 /* it could update max_pfn */
584
585 /* max_low_pfn is 0, we already have early_res support */
586
587 max_low_pfn = max_pfn;
588 if (max_low_pfn > MAXMEM_PFN) {
589 if (highmem_pages == -1)
590 highmem_pages = max_pfn - MAXMEM_PFN;
591 if (highmem_pages + MAXMEM_PFN < max_pfn)
592 max_pfn = MAXMEM_PFN + highmem_pages;
593 if (highmem_pages + MAXMEM_PFN > max_pfn) {
594 printk(KERN_WARNING "only %luMB highmem pages "
595 "available, ignoring highmem size of %uMB.\n",
596 pages_to_mb(max_pfn - MAXMEM_PFN),
597 pages_to_mb(highmem_pages));
598 highmem_pages = 0;
599 }
600 max_low_pfn = MAXMEM_PFN;
601#ifndef CONFIG_HIGHMEM
602 /* Maximum memory usable is what is directly addressable */
603 printk(KERN_WARNING "Warning only %ldMB will be used.\n",
604 MAXMEM>>20);
605 if (max_pfn > MAX_NONPAE_PFN)
606 printk(KERN_WARNING
607 "Use a HIGHMEM64G enabled kernel.\n");
608 else
609 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
610 max_pfn = MAXMEM_PFN;
611#else /* !CONFIG_HIGHMEM */
612#ifndef CONFIG_HIGHMEM64G
613 if (max_pfn > MAX_NONPAE_PFN) {
614 max_pfn = MAX_NONPAE_PFN;
615 printk(KERN_WARNING "Warning only 4GB will be used."
616 "Use a HIGHMEM64G enabled kernel.\n");
617 }
618#endif /* !CONFIG_HIGHMEM64G */
619#endif /* !CONFIG_HIGHMEM */
620 } else {
621 if (highmem_pages == -1)
622 highmem_pages = 0;
623#ifdef CONFIG_HIGHMEM
624 if (highmem_pages >= max_pfn) {
625 printk(KERN_ERR "highmem size specified (%uMB) is "
626 "bigger than pages available (%luMB)!.\n",
627 pages_to_mb(highmem_pages),
628 pages_to_mb(max_pfn));
629 highmem_pages = 0;
630 }
631 if (highmem_pages) {
632 if (max_low_pfn - highmem_pages <
633 64*1024*1024/PAGE_SIZE){
634 printk(KERN_ERR "highmem size %uMB results in "
635 "smaller than 64MB lowmem, ignoring it.\n"
636 , pages_to_mb(highmem_pages));
637 highmem_pages = 0;
638 }
639 max_low_pfn -= highmem_pages;
640 }
641#else
642 if (highmem_pages)
643 printk(KERN_ERR "ignoring highmem size on non-highmem"
644 " kernel!\n");
645#endif
646 }
647}
648
649#ifndef CONFIG_NEED_MULTIPLE_NODES
650void __init initmem_init(unsigned long start_pfn,
651 unsigned long end_pfn)
652{
653#ifdef CONFIG_HIGHMEM
654 highstart_pfn = highend_pfn = max_pfn;
655 if (max_pfn > max_low_pfn)
656 highstart_pfn = max_low_pfn;
657 memory_present(0, 0, highend_pfn);
658 e820_register_active_regions(0, 0, highend_pfn);
659 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
660 pages_to_mb(highend_pfn - highstart_pfn));
661 num_physpages = highend_pfn;
662 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
663#else
664 memory_present(0, 0, max_low_pfn);
665 e820_register_active_regions(0, 0, max_low_pfn);
666 num_physpages = max_low_pfn;
667 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
668#endif
669#ifdef CONFIG_FLATMEM
670 max_mapnr = num_physpages;
671#endif
672 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
673 pages_to_mb(max_low_pfn));
674
675 setup_bootmem_allocator();
676}
677#endif /* !CONFIG_NEED_MULTIPLE_NODES */
678
679static void __init zone_sizes_init(void)
680{
681 unsigned long max_zone_pfns[MAX_NR_ZONES];
682 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
683 max_zone_pfns[ZONE_DMA] =
684 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
685 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
686#ifdef CONFIG_HIGHMEM
687 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
688#endif
689
690 free_area_init_nodes(max_zone_pfns);
691}
692
693void __init setup_bootmem_allocator(void)
694{
695 int i;
696 unsigned long bootmap_size, bootmap;
697 /*
698 * Initialize the boot-time allocator (with low memory only):
699 */
700 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
701 bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
702 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
703 PAGE_SIZE);
704 if (bootmap == -1L)
705 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
706 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
707
708 /* don't touch min_low_pfn */
709 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
710 min_low_pfn, max_low_pfn);
711 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
712 max_pfn_mapped<<PAGE_SHIFT);
713 printk(KERN_INFO " low ram: %08lx - %08lx\n",
714 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
715 printk(KERN_INFO " bootmap %08lx - %08lx\n",
716 bootmap, bootmap + bootmap_size);
717 for_each_online_node(i)
718 free_bootmem_with_active_regions(i, max_low_pfn);
719 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
720
721 after_init_bootmem = 1;
722}
723
724static void __init find_early_table_space(unsigned long end)
725{
726 unsigned long puds, pmds, ptes, tables, start;
727
728 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
729 tables = PAGE_ALIGN(puds * sizeof(pud_t));
730
731 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
732 tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
733
734 if (cpu_has_pse) {
735 unsigned long extra;
736
737 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
738 extra += PMD_SIZE;
739 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
740 } else
741 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
742
743 tables += PAGE_ALIGN(ptes * sizeof(pte_t));
744
745 /* for fixmap */
746 tables += PAGE_SIZE * 2;
747
748 /*
749 * RED-PEN putting page tables only on node 0 could
750 * cause a hotspot and fill up ZONE_DMA. The page tables
751 * need roughly 0.5KB per GB.
752 */
753 start = 0x7000;
754 table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
755 tables, PAGE_SIZE);
756 if (table_start == -1UL)
757 panic("Cannot find space for the kernel page tables");
758
759 table_start >>= PAGE_SHIFT;
760 table_end = table_start;
761 table_top = table_start + (tables>>PAGE_SHIFT);
762
763 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
764 end, table_start << PAGE_SHIFT,
765 (table_start << PAGE_SHIFT) + tables);
766}
767
768unsigned long __init_refok init_memory_mapping(unsigned long start,
769 unsigned long end)
770{
771 pgd_t *pgd_base = swapper_pg_dir;
772 unsigned long start_pfn, end_pfn;
773 unsigned long big_page_start;
774
775 /*
776 * Find space for the kernel direct mapping tables.
777 */
778 if (!after_init_bootmem)
779 find_early_table_space(end);
780
521#ifdef CONFIG_X86_PAE 781#ifdef CONFIG_X86_PAE
522 set_nx(); 782 set_nx();
523 if (nx_enabled) 783 if (nx_enabled)
524 printk(KERN_INFO "NX (Execute Disable) protection: active\n"); 784 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
525#endif 785#endif
526 pagetable_init(); 786
787 /* Enable PSE if available */
788 if (cpu_has_pse)
789 set_in_cr4(X86_CR4_PSE);
790
791 /* Enable PGE if available */
792 if (cpu_has_pge) {
793 set_in_cr4(X86_CR4_PGE);
794 __supported_pte_mask |= _PAGE_GLOBAL;
795 }
796
797 /*
798 * Don't use a large page for the first 2/4MB of memory
799 * because there are often fixed size MTRRs in there
800 * and overlapping MTRRs into large pages can cause
801 * slowdowns.
802 */
803 big_page_start = PMD_SIZE;
804
805 if (start < big_page_start) {
806 start_pfn = start >> PAGE_SHIFT;
807 end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
808 } else {
809 /* head is not big page alignment ? */
810 start_pfn = start >> PAGE_SHIFT;
811 end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
812 << (PMD_SHIFT - PAGE_SHIFT);
813 }
814 if (start_pfn < end_pfn)
815 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
816
817 /* big page range */
818 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
819 << (PMD_SHIFT - PAGE_SHIFT);
820 if (start_pfn < (big_page_start >> PAGE_SHIFT))
821 start_pfn = big_page_start >> PAGE_SHIFT;
822 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
823 if (start_pfn < end_pfn)
824 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
825 cpu_has_pse);
826
827 /* tail is not big page alignment ? */
828 start_pfn = end_pfn;
829 if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
830 end_pfn = end >> PAGE_SHIFT;
831 if (start_pfn < end_pfn)
832 kernel_physical_mapping_init(pgd_base, start_pfn,
833 end_pfn, 0);
834 }
835
836 early_ioremap_page_table_range_init(pgd_base);
527 837
528 load_cr3(swapper_pg_dir); 838 load_cr3(swapper_pg_dir);
529 839
530 __flush_tlb_all(); 840 __flush_tlb_all();
531 841
842 if (!after_init_bootmem)
843 reserve_early(table_start << PAGE_SHIFT,
844 table_end << PAGE_SHIFT, "PGTABLE");
845
846 return end >> PAGE_SHIFT;
847}
848
849
850/*
851 * paging_init() sets up the page tables - note that the first 8MB are
852 * already mapped by head.S.
853 *
854 * This routines also unmaps the page at virtual kernel address 0, so
855 * that we can trap those pesky NULL-reference errors in the kernel.
856 */
857void __init paging_init(void)
858{
859 pagetable_init();
860
861 __flush_tlb_all();
862
532 kmap_init(); 863 kmap_init();
864
865 /*
866 * NOTE: at this point the bootmem allocator is fully available.
867 */
868 sparse_init();
869 zone_sizes_init();
870
871 paravirt_post_allocator_init();
533} 872}
534 873
535/* 874/*
@@ -564,24 +903,11 @@ static struct kcore_list kcore_mem, kcore_vmalloc;
564void __init mem_init(void) 903void __init mem_init(void)
565{ 904{
566 int codesize, reservedpages, datasize, initsize; 905 int codesize, reservedpages, datasize, initsize;
567 int tmp, bad_ppro; 906 int tmp;
568 907
569#ifdef CONFIG_FLATMEM 908#ifdef CONFIG_FLATMEM
570 BUG_ON(!mem_map); 909 BUG_ON(!mem_map);
571#endif 910#endif
572 bad_ppro = ppro_with_ram_bug();
573
574#ifdef CONFIG_HIGHMEM
575 /* check that fixmap and pkmap do not overlap */
576 if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
577 printk(KERN_ERR
578 "fixmap and kmap areas overlap - this will crash\n");
579 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
580 PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
581 FIXADDR_START);
582 BUG();
583 }
584#endif
585 /* this will put all low memory onto the freelists */ 911 /* this will put all low memory onto the freelists */
586 totalram_pages += free_all_bootmem(); 912 totalram_pages += free_all_bootmem();
587 913
@@ -593,7 +919,7 @@ void __init mem_init(void)
593 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) 919 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
594 reservedpages++; 920 reservedpages++;
595 921
596 set_highmem_pages_init(bad_ppro); 922 set_highmem_pages_init();
597 923
598 codesize = (unsigned long) &_etext - (unsigned long) &_text; 924 codesize = (unsigned long) &_etext - (unsigned long) &_text;
599 datasize = (unsigned long) &_edata - (unsigned long) &_etext; 925 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
@@ -614,7 +940,6 @@ void __init mem_init(void)
614 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) 940 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
615 ); 941 );
616 942
617#if 1 /* double-sanity-check paranoia */
618 printk(KERN_INFO "virtual kernel memory layout:\n" 943 printk(KERN_INFO "virtual kernel memory layout:\n"
619 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 944 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
620#ifdef CONFIG_HIGHMEM 945#ifdef CONFIG_HIGHMEM
@@ -655,7 +980,6 @@ void __init mem_init(void)
655#endif 980#endif
656 BUG_ON(VMALLOC_START > VMALLOC_END); 981 BUG_ON(VMALLOC_START > VMALLOC_END);
657 BUG_ON((unsigned long)high_memory > VMALLOC_START); 982 BUG_ON((unsigned long)high_memory > VMALLOC_START);
658#endif /* double-sanity-check paranoia */
659 983
660 if (boot_cpu_data.wp_works_ok < 0) 984 if (boot_cpu_data.wp_works_ok < 0)
661 test_wp_bit(); 985 test_wp_bit();
@@ -784,3 +1108,9 @@ void free_initrd_mem(unsigned long start, unsigned long end)
784 free_init_pages("initrd memory", start, end); 1108 free_init_pages("initrd memory", start, end);
785} 1109}
786#endif 1110#endif
1111
1112int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1113 int flags)
1114{
1115 return reserve_bootmem(phys, len, flags);
1116}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 819dad973b13..48548ef7ddf8 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -18,6 +18,7 @@
18#include <linux/swap.h> 18#include <linux/swap.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/initrd.h>
21#include <linux/pagemap.h> 22#include <linux/pagemap.h>
22#include <linux/bootmem.h> 23#include <linux/bootmem.h>
23#include <linux/proc_fs.h> 24#include <linux/proc_fs.h>
@@ -47,6 +48,13 @@
47#include <asm/numa.h> 48#include <asm/numa.h>
48#include <asm/cacheflush.h> 49#include <asm/cacheflush.h>
49 50
51/*
52 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
53 * The direct mapping extends to max_pfn_mapped, so that we can directly access
54 * apertures, ACPI and other tables without having to play with fixmaps.
55 */
56unsigned long max_pfn_mapped;
57
50static unsigned long dma_reserve __initdata; 58static unsigned long dma_reserve __initdata;
51 59
52DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 60DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -135,26 +143,17 @@ static __init void *spp_getpage(void)
135 return ptr; 143 return ptr;
136} 144}
137 145
138static __init void 146void
139set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) 147set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
140{ 148{
141 pgd_t *pgd;
142 pud_t *pud; 149 pud_t *pud;
143 pmd_t *pmd; 150 pmd_t *pmd;
144 pte_t *pte, new_pte; 151 pte_t *pte;
145
146 pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
147 152
148 pgd = pgd_offset_k(vaddr); 153 pud = pud_page + pud_index(vaddr);
149 if (pgd_none(*pgd)) {
150 printk(KERN_ERR
151 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
152 return;
153 }
154 pud = pud_offset(pgd, vaddr);
155 if (pud_none(*pud)) { 154 if (pud_none(*pud)) {
156 pmd = (pmd_t *) spp_getpage(); 155 pmd = (pmd_t *) spp_getpage();
157 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); 156 pud_populate(&init_mm, pud, pmd);
158 if (pmd != pmd_offset(pud, 0)) { 157 if (pmd != pmd_offset(pud, 0)) {
159 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", 158 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
160 pmd, pmd_offset(pud, 0)); 159 pmd, pmd_offset(pud, 0));
@@ -164,13 +163,12 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
164 pmd = pmd_offset(pud, vaddr); 163 pmd = pmd_offset(pud, vaddr);
165 if (pmd_none(*pmd)) { 164 if (pmd_none(*pmd)) {
166 pte = (pte_t *) spp_getpage(); 165 pte = (pte_t *) spp_getpage();
167 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); 166 pmd_populate_kernel(&init_mm, pmd, pte);
168 if (pte != pte_offset_kernel(pmd, 0)) { 167 if (pte != pte_offset_kernel(pmd, 0)) {
169 printk(KERN_ERR "PAGETABLE BUG #02!\n"); 168 printk(KERN_ERR "PAGETABLE BUG #02!\n");
170 return; 169 return;
171 } 170 }
172 } 171 }
173 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
174 172
175 pte = pte_offset_kernel(pmd, vaddr); 173 pte = pte_offset_kernel(pmd, vaddr);
176 if (!pte_none(*pte) && pte_val(new_pte) && 174 if (!pte_none(*pte) && pte_val(new_pte) &&
@@ -185,6 +183,64 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
185 __flush_tlb_one(vaddr); 183 __flush_tlb_one(vaddr);
186} 184}
187 185
186void
187set_pte_vaddr(unsigned long vaddr, pte_t pteval)
188{
189 pgd_t *pgd;
190 pud_t *pud_page;
191
192 pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
193
194 pgd = pgd_offset_k(vaddr);
195 if (pgd_none(*pgd)) {
196 printk(KERN_ERR
197 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
198 return;
199 }
200 pud_page = (pud_t*)pgd_page_vaddr(*pgd);
201 set_pte_vaddr_pud(pud_page, vaddr, pteval);
202}
203
204/*
205 * Create large page table mappings for a range of physical addresses.
206 */
207static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
208 pgprot_t prot)
209{
210 pgd_t *pgd;
211 pud_t *pud;
212 pmd_t *pmd;
213
214 BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
215 for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
216 pgd = pgd_offset_k((unsigned long)__va(phys));
217 if (pgd_none(*pgd)) {
218 pud = (pud_t *) spp_getpage();
219 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
220 _PAGE_USER));
221 }
222 pud = pud_offset(pgd, (unsigned long)__va(phys));
223 if (pud_none(*pud)) {
224 pmd = (pmd_t *) spp_getpage();
225 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
226 _PAGE_USER));
227 }
228 pmd = pmd_offset(pud, phys);
229 BUG_ON(!pmd_none(*pmd));
230 set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
231 }
232}
233
234void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
235{
236 __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
237}
238
239void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
240{
241 __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
242}
243
188/* 244/*
189 * The head.S code sets up the kernel high mapping: 245 * The head.S code sets up the kernel high mapping:
190 * 246 *
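The new init_extra_mapping_wb()/init_extra_mapping_uc() helpers above cover a physical range with 2MB kernel mappings, so both the base and the size must be PMD-aligned before the per-PMD loop runs. A minimal userspace sketch of that contract; the PMD_* constants restate the x86_64 values purely for illustration, and map_extra_range() is a made-up name, not a kernel function:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PMD_SHIFT 21
#define PMD_SIZE  (1UL << PMD_SHIFT)          /* 2MB */
#define PMD_MASK  (~(PMD_SIZE - 1))

static void map_extra_range(uint64_t phys, uint64_t size)
{
	/* mirrors the BUG_ON() in the patch: reject unaligned requests */
	assert((phys & ~PMD_MASK) == 0 && (size & ~PMD_MASK) == 0);

	for (; size; phys += PMD_SIZE, size -= PMD_SIZE)
		printf("would install a 2MB PMD for phys %#llx\n",
		       (unsigned long long)phys);
}

int main(void)
{
	map_extra_range(0x40000000ULL, 4 * PMD_SIZE);	/* 8MB at the 1GB mark */
	return 0;
}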
@@ -213,20 +269,9 @@ void __init cleanup_highmap(void)
213 } 269 }
214} 270}
215 271
216/* NOTE: this is meant to be run only at boot */
217void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
218{
219 unsigned long address = __fix_to_virt(idx);
220
221 if (idx >= __end_of_fixed_addresses) {
222 printk(KERN_ERR "Invalid __set_fixmap\n");
223 return;
224 }
225 set_pte_phys(address, phys, prot);
226}
227
228static unsigned long __initdata table_start; 272static unsigned long __initdata table_start;
229static unsigned long __meminitdata table_end; 273static unsigned long __meminitdata table_end;
274static unsigned long __meminitdata table_top;
230 275
231static __meminit void *alloc_low_page(unsigned long *phys) 276static __meminit void *alloc_low_page(unsigned long *phys)
232{ 277{
@@ -240,7 +285,7 @@ static __meminit void *alloc_low_page(unsigned long *phys)
240 return adr; 285 return adr;
241 } 286 }
242 287
243 if (pfn >= end_pfn) 288 if (pfn >= table_top)
244 panic("alloc_low_page: ran out of memory"); 289 panic("alloc_low_page: ran out of memory");
245 290
246 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); 291 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -257,65 +302,61 @@ static __meminit void unmap_low_page(void *adr)
257 early_iounmap(adr, PAGE_SIZE); 302 early_iounmap(adr, PAGE_SIZE);
258} 303}
259 304
260/* Must run before zap_low_mappings */ 305static unsigned long __meminit
261__meminit void *early_ioremap(unsigned long addr, unsigned long size) 306phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
262{ 307{
263 pmd_t *pmd, *last_pmd; 308 unsigned pages = 0;
264 unsigned long vaddr; 309 unsigned long last_map_addr = end;
265 int i, pmds; 310 int i;
311
312 pte_t *pte = pte_page + pte_index(addr);
266 313
267 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; 314 for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
268 vaddr = __START_KERNEL_map;
269 pmd = level2_kernel_pgt;
270 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
271 315
272 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { 316 if (addr >= end) {
273 for (i = 0; i < pmds; i++) { 317 if (!after_bootmem) {
274 if (pmd_present(pmd[i])) 318 for(; i < PTRS_PER_PTE; i++, pte++)
275 goto continue_outer_loop; 319 set_pte(pte, __pte(0));
320 }
321 break;
276 } 322 }
277 vaddr += addr & ~PMD_MASK;
278 addr &= PMD_MASK;
279 323
280 for (i = 0; i < pmds; i++, addr += PMD_SIZE) 324 if (pte_val(*pte))
281 set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); 325 continue;
282 __flush_tlb_all();
283 326
284 return (void *)vaddr; 327 if (0)
285continue_outer_loop: 328 printk(" pte=%p addr=%lx pte=%016lx\n",
286 ; 329 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
330 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
331 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
332 pages++;
287 } 333 }
288 printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size); 334 update_page_count(PG_LEVEL_4K, pages);
289 335
290 return NULL; 336 return last_map_addr;
291} 337}
292 338
293/* 339static unsigned long __meminit
294 * To avoid virtual aliases later: 340phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
295 */
296__meminit void early_iounmap(void *addr, unsigned long size)
297{ 341{
298 unsigned long vaddr; 342 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
299 pmd_t *pmd;
300 int i, pmds;
301
302 vaddr = (unsigned long)addr;
303 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
304 pmd = level2_kernel_pgt + pmd_index(vaddr);
305
306 for (i = 0; i < pmds; i++)
307 pmd_clear(pmd + i);
308 343
309 __flush_tlb_all(); 344 return phys_pte_init(pte, address, end);
310} 345}
311 346
312static unsigned long __meminit 347static unsigned long __meminit
313phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) 348phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
349 unsigned long page_size_mask)
314{ 350{
351 unsigned long pages = 0;
352 unsigned long last_map_addr = end;
353
315 int i = pmd_index(address); 354 int i = pmd_index(address);
316 355
317 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { 356 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
357 unsigned long pte_phys;
318 pmd_t *pmd = pmd_page + pmd_index(address); 358 pmd_t *pmd = pmd_page + pmd_index(address);
359 pte_t *pte;
319 360
320 if (address >= end) { 361 if (address >= end) {
321 if (!after_bootmem) { 362 if (!after_bootmem) {
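phys_pte_init() above starts from pte_index(addr) inside a single PTE page and walks to the end of that page; the pmd and pud variants do the same at their levels. The index helpers are plain bit-field extractions of the virtual address. A small sketch with the usual x86_64 4-level constants, restated here only for illustration:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT  12
#define PMD_SHIFT   21
#define PUD_SHIFT   30
#define PGDIR_SHIFT 39
#define PTRS_PER_TABLE 512		/* 512 eight-byte entries per 4KB table */

static unsigned pte_index(uint64_t addr) { return (addr >> PAGE_SHIFT)  & (PTRS_PER_TABLE - 1); }
static unsigned pmd_index(uint64_t addr) { return (addr >> PMD_SHIFT)   & (PTRS_PER_TABLE - 1); }
static unsigned pud_index(uint64_t addr) { return (addr >> PUD_SHIFT)   & (PTRS_PER_TABLE - 1); }
static unsigned pgd_index(uint64_t addr) { return (addr >> PGDIR_SHIFT) & (PTRS_PER_TABLE - 1); }

int main(void)
{
	uint64_t addr = 0xffff880000200000ULL;	/* example direct-mapping address */

	printf("pgd %u pud %u pmd %u pte %u\n",
	       pgd_index(addr), pud_index(addr), pmd_index(addr), pte_index(addr));
	return 0;
}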
@@ -325,31 +366,50 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
325 break; 366 break;
326 } 367 }
327 368
328 if (pmd_val(*pmd)) 369 if (pmd_val(*pmd)) {
370 if (!pmd_large(*pmd))
371 last_map_addr = phys_pte_update(pmd, address,
372 end);
329 continue; 373 continue;
374 }
330 375
331 set_pte((pte_t *)pmd, 376 if (page_size_mask & (1<<PG_LEVEL_2M)) {
332 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 377 pages++;
378 set_pte((pte_t *)pmd,
379 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
380 last_map_addr = (address & PMD_MASK) + PMD_SIZE;
381 continue;
382 }
383
384 pte = alloc_low_page(&pte_phys);
385 last_map_addr = phys_pte_init(pte, address, end);
386 unmap_low_page(pte);
387
388 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
333 } 389 }
334 return address; 390 update_page_count(PG_LEVEL_2M, pages);
391 return last_map_addr;
335} 392}
336 393
337static unsigned long __meminit 394static unsigned long __meminit
338phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) 395phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
396 unsigned long page_size_mask)
339{ 397{
340 pmd_t *pmd = pmd_offset(pud, 0); 398 pmd_t *pmd = pmd_offset(pud, 0);
341 unsigned long last_map_addr; 399 unsigned long last_map_addr;
342 400
343 spin_lock(&init_mm.page_table_lock); 401 spin_lock(&init_mm.page_table_lock);
344 last_map_addr = phys_pmd_init(pmd, address, end); 402 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
345 spin_unlock(&init_mm.page_table_lock); 403 spin_unlock(&init_mm.page_table_lock);
346 __flush_tlb_all(); 404 __flush_tlb_all();
347 return last_map_addr; 405 return last_map_addr;
348} 406}
349 407
350static unsigned long __meminit 408static unsigned long __meminit
351phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) 409phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
410 unsigned long page_size_mask)
352{ 411{
412 unsigned long pages = 0;
353 unsigned long last_map_addr = end; 413 unsigned long last_map_addr = end;
354 int i = pud_index(addr); 414 int i = pud_index(addr);
355 415
@@ -369,11 +429,13 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
369 429
370 if (pud_val(*pud)) { 430 if (pud_val(*pud)) {
371 if (!pud_large(*pud)) 431 if (!pud_large(*pud))
372 last_map_addr = phys_pmd_update(pud, addr, end); 432 last_map_addr = phys_pmd_update(pud, addr, end,
433 page_size_mask);
373 continue; 434 continue;
374 } 435 }
375 436
376 if (direct_gbpages) { 437 if (page_size_mask & (1<<PG_LEVEL_1G)) {
438 pages++;
377 set_pte((pte_t *)pud, 439 set_pte((pte_t *)pud,
378 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 440 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
379 last_map_addr = (addr & PUD_MASK) + PUD_SIZE; 441 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
@@ -383,27 +445,50 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
383 pmd = alloc_low_page(&pmd_phys); 445 pmd = alloc_low_page(&pmd_phys);
384 446
385 spin_lock(&init_mm.page_table_lock); 447 spin_lock(&init_mm.page_table_lock);
386 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); 448 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
387 last_map_addr = phys_pmd_init(pmd, addr, end); 449 unmap_low_page(pmd);
450 pud_populate(&init_mm, pud, __va(pmd_phys));
388 spin_unlock(&init_mm.page_table_lock); 451 spin_unlock(&init_mm.page_table_lock);
389 452
390 unmap_low_page(pmd);
391 } 453 }
392 __flush_tlb_all(); 454 __flush_tlb_all();
455 update_page_count(PG_LEVEL_1G, pages);
393 456
394 return last_map_addr >> PAGE_SHIFT; 457 return last_map_addr;
458}
459
460static unsigned long __meminit
461phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
462 unsigned long page_size_mask)
463{
464 pud_t *pud;
465
466 pud = (pud_t *)pgd_page_vaddr(*pgd);
467
468 return phys_pud_init(pud, addr, end, page_size_mask);
395} 469}
396 470
397static void __init find_early_table_space(unsigned long end) 471static void __init find_early_table_space(unsigned long end)
398{ 472{
399 unsigned long puds, pmds, tables, start; 473 unsigned long puds, pmds, ptes, tables, start;
400 474
401 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 475 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
402 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); 476 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
403 if (!direct_gbpages) { 477 if (direct_gbpages) {
478 unsigned long extra;
479 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
480 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
481 } else
404 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 482 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
405 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); 483 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
406 } 484
485 if (cpu_has_pse) {
486 unsigned long extra;
487 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
488 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
489 } else
490 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
491 tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
407 492
408 /* 493 /*
409 * RED-PEN putting page tables only on node 0 could 494 * RED-PEN putting page tables only on node 0 could
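find_early_table_space() above sizes the scratch area for the direct-mapping page tables: one pud_t per 1GB of address space, pmd_t entries only for the part that cannot be covered by 1GB pages when gbpages are available, and pte_t entries only for the tail that is not 2MB aligned when PSE is available. A userspace sketch of the same arithmetic, with x86_64 sizes hard-coded for illustration:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21
#define PUD_SHIFT  30
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PMD_SIZE   (1UL << PMD_SHIFT)
#define PUD_SIZE   (1UL << PUD_SHIFT)
#define ENTRY_SIZE 8UL			/* pte_t/pmd_t/pud_t are 8 bytes each */

static uint64_t round_up(uint64_t x, uint64_t a) { return (x + a - 1) & ~(a - 1); }

static uint64_t table_space(uint64_t end, int gbpages, int pse)
{
	uint64_t puds, pmds, ptes, tables, extra;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	tables = round_up(puds * ENTRY_SIZE, PAGE_SIZE);

	if (gbpages) {			/* only the non-1GB-aligned tail needs pmds */
		extra = end - ((end >> PUD_SHIFT) << PUD_SHIFT);
		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
	} else
		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables += round_up(pmds * ENTRY_SIZE, PAGE_SIZE);

	if (pse) {			/* only the non-2MB-aligned tail needs ptes */
		extra = end - ((end >> PMD_SHIFT) << PMD_SHIFT);
		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
	} else
		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	tables += round_up(ptes * ENTRY_SIZE, PAGE_SIZE);

	return tables;
}

int main(void)
{
	uint64_t end = 4ULL << 30;	/* 4GB of RAM */

	printf("no large pages: %llu KB\n", (unsigned long long)table_space(end, 0, 0) >> 10);
	printf("PSE only      : %llu KB\n", (unsigned long long)table_space(end, 0, 1) >> 10);
	printf("PSE + gbpages : %llu KB\n", (unsigned long long)table_space(end, 1, 1) >> 10);
	return 0;
}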
@@ -417,10 +502,10 @@ static void __init find_early_table_space(unsigned long end)
417 502
418 table_start >>= PAGE_SHIFT; 503 table_start >>= PAGE_SHIFT;
419 table_end = table_start; 504 table_end = table_start;
505 table_top = table_start + (tables >> PAGE_SHIFT);
420 506
421 early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", 507 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
422 end, table_start << PAGE_SHIFT, 508 end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
423 (table_start << PAGE_SHIFT) + tables);
424} 509}
425 510
426static void __init init_gbpages(void) 511static void __init init_gbpages(void)
@@ -431,7 +516,7 @@ static void __init init_gbpages(void)
431 direct_gbpages = 0; 516 direct_gbpages = 0;
432} 517}
433 518
434#ifdef CONFIG_MEMTEST_BOOTPARAM 519#ifdef CONFIG_MEMTEST
435 520
436static void __init memtest(unsigned long start_phys, unsigned long size, 521static void __init memtest(unsigned long start_phys, unsigned long size,
437 unsigned pattern) 522 unsigned pattern)
@@ -493,7 +578,8 @@ static void __init memtest(unsigned long start_phys, unsigned long size,
493 578
494} 579}
495 580
496static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE; 581/* default is disabled */
582static int memtest_pattern __initdata;
497 583
498static int __init parse_memtest(char *arg) 584static int __init parse_memtest(char *arg)
499{ 585{
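The memtest hook above is now gated by CONFIG_MEMTEST and defaults to off; memtest=N on the command line enables N test patterns. The core idea is simply write-pattern-then-verify over memory that is not yet handed to the allocator. A rough userspace illustration of that idea only, not the kernel's memtest, which walks e820 ranges before bootmem is up:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Write a pattern over a buffer and verify it reads back; report bad words. */
static size_t check_pattern(uint64_t *buf, size_t words, uint64_t pattern)
{
	size_t i, bad = 0;

	for (i = 0; i < words; i++)
		buf[i] = pattern;
	for (i = 0; i < words; i++)
		if (buf[i] != pattern)
			bad++;
	return bad;
}

int main(void)
{
	enum { WORDS = 1 << 20 };		/* 8MB test buffer */
	uint64_t *buf = malloc(WORDS * sizeof(*buf));
	unsigned p;

	if (!buf)
		return 1;
	/* memtest=N cycles N patterns in a similar spirit */
	for (p = 1; p <= 4; p++)
		printf("pattern %u: %zu bad words\n", p,
		       check_pattern(buf, WORDS, 0x0101010101010101ULL * p));
	free(buf);
	return 0;
}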
@@ -542,15 +628,85 @@ static void __init early_memtest(unsigned long start, unsigned long end)
542} 628}
543#endif 629#endif
544 630
631static unsigned long __init kernel_physical_mapping_init(unsigned long start,
632 unsigned long end,
633 unsigned long page_size_mask)
634{
635
636 unsigned long next, last_map_addr = end;
637
638 start = (unsigned long)__va(start);
639 end = (unsigned long)__va(end);
640
641 for (; start < end; start = next) {
642 pgd_t *pgd = pgd_offset_k(start);
643 unsigned long pud_phys;
644 pud_t *pud;
645
646 next = start + PGDIR_SIZE;
647 if (next > end)
648 next = end;
649
650 if (pgd_val(*pgd)) {
651 last_map_addr = phys_pud_update(pgd, __pa(start),
652 __pa(end), page_size_mask);
653 continue;
654 }
655
656 if (after_bootmem)
657 pud = pud_offset(pgd, start & PGDIR_MASK);
658 else
659 pud = alloc_low_page(&pud_phys);
660
661 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
662 page_size_mask);
663 unmap_low_page(pud);
664 pgd_populate(&init_mm, pgd_offset_k(start),
665 __va(pud_phys));
666 }
667
668 return last_map_addr;
669}
670
671struct map_range {
672 unsigned long start;
673 unsigned long end;
674 unsigned page_size_mask;
675};
676
677#define NR_RANGE_MR 5
678
679static int save_mr(struct map_range *mr, int nr_range,
680 unsigned long start_pfn, unsigned long end_pfn,
681 unsigned long page_size_mask)
682{
683
684 if (start_pfn < end_pfn) {
685 if (nr_range >= NR_RANGE_MR)
686 panic("run out of range for init_memory_mapping\n");
687 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
688 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
689 mr[nr_range].page_size_mask = page_size_mask;
690 nr_range++;
691 }
692
693 return nr_range;
694}
695
545/* 696/*
546 * Setup the direct mapping of the physical memory at PAGE_OFFSET. 697 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
547 * This runs before bootmem is initialized and gets pages directly from 698 * This runs before bootmem is initialized and gets pages directly from
548 * the physical memory. To access them they are temporarily mapped. 699 * the physical memory. To access them they are temporarily mapped.
549 */ 700 */
550unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) 701unsigned long __init_refok init_memory_mapping(unsigned long start,
702 unsigned long end)
551{ 703{
552 unsigned long next, last_map_addr = end; 704 unsigned long last_map_addr = 0;
553 unsigned long start_phys = start, end_phys = end; 705 unsigned long page_size_mask = 0;
706 unsigned long start_pfn, end_pfn;
707
708 struct map_range mr[NR_RANGE_MR];
709 int nr_range, i;
554 710
555 printk(KERN_INFO "init_memory_mapping\n"); 711 printk(KERN_INFO "init_memory_mapping\n");
556 712
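kernel_physical_mapping_init() above converts the physical range to direct-mapping virtual addresses and walks it one pgd entry (512GB) at a time, reusing an existing pud page when the pgd slot is already populated and allocating a fresh one otherwise. The stepping and clamping of the final chunk is worth seeing in isolation; a userspace sketch where the direct-map base is just an example value:

#include <stdint.h>
#include <stdio.h>

#define PGDIR_SHIFT 39
#define PGDIR_SIZE  (1ULL << PGDIR_SHIFT)	/* 512GB per pgd entry on x86_64 */

int main(void)
{
	uint64_t start = 0xffff880000000000ULL;		/* example direct-map base */
	uint64_t end   = start + (600ULL << 30);	/* pretend 600GB of RAM */
	uint64_t next;

	for (; start < end; start = next) {
		next = start + PGDIR_SIZE;		/* clamp the final chunk */
		if (next > end)
			next = end;
		printf("pgd chunk %#llx - %#llx\n",
		       (unsigned long long)start, (unsigned long long)next);
	}
	return 0;
}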
@@ -561,48 +717,101 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon
561 * memory mapped. Unfortunately this is done currently before the 717 * memory mapped. Unfortunately this is done currently before the
562 * nodes are discovered. 718 * nodes are discovered.
563 */ 719 */
564 if (!after_bootmem) { 720 if (!after_bootmem)
565 init_gbpages(); 721 init_gbpages();
566 find_early_table_space(end);
567 }
568 722
569 start = (unsigned long)__va(start); 723 if (direct_gbpages)
570 end = (unsigned long)__va(end); 724 page_size_mask |= 1 << PG_LEVEL_1G;
725 if (cpu_has_pse)
726 page_size_mask |= 1 << PG_LEVEL_2M;
727
728 memset(mr, 0, sizeof(mr));
729 nr_range = 0;
730
731 /* head if not big page alignment ?*/
731 /* head if not big page alignment? */
732 start_pfn = start >> PAGE_SHIFT;
733 end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
734 << (PMD_SHIFT - PAGE_SHIFT);
735 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
736
737 /* big page (2M) range*/
738 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
739 << (PMD_SHIFT - PAGE_SHIFT);
740 end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
741 << (PUD_SHIFT - PAGE_SHIFT);
742 if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
743 end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
744 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
745 page_size_mask & (1<<PG_LEVEL_2M));
746
747 /* big page (1G) range */
748 start_pfn = end_pfn;
749 end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
750 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
751 page_size_mask &
752 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
753
754 /* tail is not big page (1G) alignment */
755 start_pfn = end_pfn;
756 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
757 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
758 page_size_mask & (1<<PG_LEVEL_2M));
759
760 /* tail is not big page (2M) alignment */
761 start_pfn = end_pfn;
762 end_pfn = end>>PAGE_SHIFT;
763 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
764
765 for (i = 0; i < nr_range; i++)
766 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
767 mr[i].start, mr[i].end,
768 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
769 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
571 770
572 for (; start < end; start = next) { 771 if (!after_bootmem)
573 pgd_t *pgd = pgd_offset_k(start); 772 find_early_table_space(end);
574 unsigned long pud_phys;
575 pud_t *pud;
576
577 if (after_bootmem)
578 pud = pud_offset(pgd, start & PGDIR_MASK);
579 else
580 pud = alloc_low_page(&pud_phys);
581 773
582 next = start + PGDIR_SIZE; 774 for (i = 0; i < nr_range; i++)
583 if (next > end) 775 last_map_addr = kernel_physical_mapping_init(
584 next = end; 776 mr[i].start, mr[i].end,
585 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); 777 mr[i].page_size_mask);
586 if (!after_bootmem)
587 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
588 unmap_low_page(pud);
589 }
590 778
591 if (!after_bootmem) 779 if (!after_bootmem)
592 mmu_cr4_features = read_cr4(); 780 mmu_cr4_features = read_cr4();
593 __flush_tlb_all(); 781 __flush_tlb_all();
594 782
595 if (!after_bootmem) 783 if (!after_bootmem && table_end > table_start)
596 reserve_early(table_start << PAGE_SHIFT, 784 reserve_early(table_start << PAGE_SHIFT,
597 table_end << PAGE_SHIFT, "PGTABLE"); 785 table_end << PAGE_SHIFT, "PGTABLE");
598 786
787 printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
788 last_map_addr, end);
789
599 if (!after_bootmem) 790 if (!after_bootmem)
600 early_memtest(start_phys, end_phys); 791 early_memtest(start, end);
601 792
602 return last_map_addr; 793 return last_map_addr >> PAGE_SHIFT;
603} 794}
604 795
605#ifndef CONFIG_NUMA 796#ifndef CONFIG_NUMA
797void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
798{
799 unsigned long bootmap_size, bootmap;
800
801 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
802 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
803 PAGE_SIZE);
804 if (bootmap == -1L)
805 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
806 /* don't touch min_low_pfn */
807 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
808 0, end_pfn);
809 e820_register_active_regions(0, start_pfn, end_pfn);
810 free_bootmem_with_active_regions(0, end_pfn);
811 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
812 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
813}
814
606void __init paging_init(void) 815void __init paging_init(void)
607{ 816{
608 unsigned long max_zone_pfns[MAX_NR_ZONES]; 817 unsigned long max_zone_pfns[MAX_NR_ZONES];
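init_memory_mapping() above now pre-splits [start, end) into at most NR_RANGE_MR pieces so each piece can be mapped with the largest page size that fits: an unaligned 4k head, a 2M-mapped stretch up to the first 1G boundary, a 1G-mapped middle, then 2M and 4k tails, with save_mr() dropping any empty piece. The sketch below reproduces that splitting in userspace; the x86_64 shifts are hard-coded and the LVL_* bits are demo values, not the kernel's PG_LEVEL_* numbering:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21
#define PUD_SHIFT  30
#define PMD_SIZE   (1ULL << PMD_SHIFT)
#define PUD_SIZE   (1ULL << PUD_SHIFT)

enum { LVL_2M = 1 << 0, LVL_1G = 1 << 1 };	/* demo bits only */

struct map_range { uint64_t start, end; unsigned mask; };

static int save_mr(struct map_range *mr, int nr, uint64_t spfn, uint64_t epfn, unsigned mask)
{
	if (spfn < epfn) {			/* skip empty pieces */
		mr[nr].start = spfn << PAGE_SHIFT;
		mr[nr].end   = epfn << PAGE_SHIFT;
		mr[nr].mask  = mask;
		nr++;
	}
	return nr;
}

static int split(struct map_range *mr, uint64_t start, uint64_t end, unsigned mask)
{
	uint64_t spfn, epfn;
	int nr = 0;

	/* 4k head up to the first 2M boundary */
	spfn = start >> PAGE_SHIFT;
	epfn = ((start + PMD_SIZE - 1) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	nr = save_mr(mr, nr, spfn, epfn, 0);

	/* 2M range up to the first 1G boundary, clamped to a 1G-aligned end */
	spfn = epfn;
	epfn = ((start + PUD_SIZE - 1) >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
	if (epfn > (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT))
		epfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
	nr = save_mr(mr, nr, spfn, epfn, mask & LVL_2M);

	/* 1G-mapped middle */
	spfn = epfn;
	epfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
	nr = save_mr(mr, nr, spfn, epfn, mask & (LVL_2M | LVL_1G));

	/* 2M tail, then 4k tail */
	spfn = epfn;
	epfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	nr = save_mr(mr, nr, spfn, epfn, mask & LVL_2M);

	spfn = epfn;
	epfn = end >> PAGE_SHIFT;
	nr = save_mr(mr, nr, spfn, epfn, 0);

	return nr;
}

int main(void)
{
	struct map_range mr[5];
	int i, nr = split(mr, 0x123000ULL, 0x87654000ULL, LVL_2M | LVL_1G);

	for (i = 0; i < nr; i++)
		printf("%010llx - %010llx page %s\n",
		       (unsigned long long)mr[i].start, (unsigned long long)mr[i].end,
		       (mr[i].mask & LVL_1G) ? "1G" : (mr[i].mask & LVL_2M) ? "2M" : "4k");
	return 0;
}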
@@ -610,9 +819,9 @@ void __init paging_init(void)
610 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 819 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
611 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 820 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
612 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 821 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
613 max_zone_pfns[ZONE_NORMAL] = end_pfn; 822 max_zone_pfns[ZONE_NORMAL] = max_pfn;
614 823
615 memory_present(0, 0, end_pfn); 824 memory_present(0, 0, max_pfn);
616 sparse_init(); 825 sparse_init();
617 free_area_init_nodes(max_zone_pfns); 826 free_area_init_nodes(max_zone_pfns);
618} 827}
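paging_init() above switches from end_pfn to max_pfn for the ZONE_NORMAL ceiling. The zone boundaries on x86_64 are simple pfn cut-offs: 16MB for ZONE_DMA, 4GB for ZONE_DMA32, and everything else in ZONE_NORMAL. A quick arithmetic check; the 16MB/4GB limits are the conventional x86_64 values restated here rather than taken from the headers:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	uint64_t max_dma_pfn   = (16ULL << 20) >> PAGE_SHIFT;	/* ZONE_DMA   ceiling */
	uint64_t max_dma32_pfn = (4ULL  << 30) >> PAGE_SHIFT;	/* ZONE_DMA32 ceiling */
	uint64_t max_pfn       = (8ULL  << 30) >> PAGE_SHIFT;	/* example: 8GB box   */

	printf("ZONE_DMA   : pfn 0 - %llu\n", (unsigned long long)max_dma_pfn);
	printf("ZONE_DMA32 : pfn %llu - %llu\n",
	       (unsigned long long)max_dma_pfn, (unsigned long long)max_dma32_pfn);
	printf("ZONE_NORMAL: pfn %llu - %llu\n",
	       (unsigned long long)max_dma32_pfn, (unsigned long long)max_pfn);
	return 0;
}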
@@ -694,8 +903,8 @@ void __init mem_init(void)
694#else 903#else
695 totalram_pages = free_all_bootmem(); 904 totalram_pages = free_all_bootmem();
696#endif 905#endif
697 reservedpages = end_pfn - totalram_pages - 906 reservedpages = max_pfn - totalram_pages -
698 absent_pages_in_range(0, end_pfn); 907 absent_pages_in_range(0, max_pfn);
699 after_bootmem = 1; 908 after_bootmem = 1;
700 909
701 codesize = (unsigned long) &_etext - (unsigned long) &_text; 910 codesize = (unsigned long) &_etext - (unsigned long) &_text;
@@ -714,7 +923,7 @@ void __init mem_init(void)
714 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " 923 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
715 "%ldk reserved, %ldk data, %ldk init)\n", 924 "%ldk reserved, %ldk data, %ldk init)\n",
716 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 925 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
717 end_pfn << (PAGE_SHIFT-10), 926 max_pfn << (PAGE_SHIFT-10),
718 codesize >> 10, 927 codesize >> 10,
719 reservedpages << (PAGE_SHIFT-10), 928 reservedpages << (PAGE_SHIFT-10),
720 datasize >> 10, 929 datasize >> 10,
@@ -799,24 +1008,26 @@ void free_initrd_mem(unsigned long start, unsigned long end)
799} 1008}
800#endif 1009#endif
801 1010
802void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 1011int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1012 int flags)
803{ 1013{
804#ifdef CONFIG_NUMA 1014#ifdef CONFIG_NUMA
805 int nid, next_nid; 1015 int nid, next_nid;
1016 int ret;
806#endif 1017#endif
807 unsigned long pfn = phys >> PAGE_SHIFT; 1018 unsigned long pfn = phys >> PAGE_SHIFT;
808 1019
809 if (pfn >= end_pfn) { 1020 if (pfn >= max_pfn) {
810 /* 1021 /*
811 * This can happen with kdump kernels when accessing 1022 * This can happen with kdump kernels when accessing
812 * firmware tables: 1023 * firmware tables:
813 */ 1024 */
814 if (pfn < max_pfn_mapped) 1025 if (pfn < max_pfn_mapped)
815 return; 1026 return -EFAULT;
816 1027
817 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", 1028 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
818 phys, len); 1029 phys, len);
819 return; 1030 return -EFAULT;
820 } 1031 }
821 1032
822 /* Should check here against the e820 map to avoid double free */ 1033 /* Should check here against the e820 map to avoid double free */
@@ -824,9 +1035,13 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
824 nid = phys_to_nid(phys); 1035 nid = phys_to_nid(phys);
825 next_nid = phys_to_nid(phys + len - 1); 1036 next_nid = phys_to_nid(phys + len - 1);
826 if (nid == next_nid) 1037 if (nid == next_nid)
827 reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); 1038 ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
828 else 1039 else
829 reserve_bootmem(phys, len, BOOTMEM_DEFAULT); 1040 ret = reserve_bootmem(phys, len, flags);
1041
1042 if (ret != 0)
1043 return ret;
1044
830#else 1045#else
831 reserve_bootmem(phys, len, BOOTMEM_DEFAULT); 1046 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
832#endif 1047#endif
@@ -835,6 +1050,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
835 dma_reserve += len / PAGE_SIZE; 1050 dma_reserve += len / PAGE_SIZE;
836 set_dma_reserve(dma_reserve); 1051 set_dma_reserve(dma_reserve);
837 } 1052 }
1053
1054 return 0;
838} 1055}
839 1056
840int kern_addr_valid(unsigned long addr) 1057int kern_addr_valid(unsigned long addr)
@@ -939,7 +1156,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
939 pmd_t *pmd; 1156 pmd_t *pmd;
940 1157
941 for (; addr < end; addr = next) { 1158 for (; addr < end; addr = next) {
942 next = pmd_addr_end(addr, end); 1159 void *p = NULL;
943 1160
944 pgd = vmemmap_pgd_populate(addr, node); 1161 pgd = vmemmap_pgd_populate(addr, node);
945 if (!pgd) 1162 if (!pgd)
@@ -949,33 +1166,51 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
949 if (!pud) 1166 if (!pud)
950 return -ENOMEM; 1167 return -ENOMEM;
951 1168
952 pmd = pmd_offset(pud, addr); 1169 if (!cpu_has_pse) {
953 if (pmd_none(*pmd)) { 1170 next = (addr + PAGE_SIZE) & PAGE_MASK;
954 pte_t entry; 1171 pmd = vmemmap_pmd_populate(pud, addr, node);
955 void *p; 1172
1173 if (!pmd)
1174 return -ENOMEM;
1175
1176 p = vmemmap_pte_populate(pmd, addr, node);
956 1177
957 p = vmemmap_alloc_block(PMD_SIZE, node);
958 if (!p) 1178 if (!p)
959 return -ENOMEM; 1179 return -ENOMEM;
960 1180
961 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, 1181 addr_end = addr + PAGE_SIZE;
962 PAGE_KERNEL_LARGE); 1182 p_end = p + PAGE_SIZE;
963 set_pmd(pmd, __pmd(pte_val(entry)));
964
965 /* check to see if we have contiguous blocks */
966 if (p_end != p || node_start != node) {
967 if (p_start)
968 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
969 addr_start, addr_end-1, p_start, p_end-1, node_start);
970 addr_start = addr;
971 node_start = node;
972 p_start = p;
973 }
974 addr_end = addr + PMD_SIZE;
975 p_end = p + PMD_SIZE;
976 } else { 1183 } else {
977 vmemmap_verify((pte_t *)pmd, node, addr, next); 1184 next = pmd_addr_end(addr, end);
1185
1186 pmd = pmd_offset(pud, addr);
1187 if (pmd_none(*pmd)) {
1188 pte_t entry;
1189
1190 p = vmemmap_alloc_block(PMD_SIZE, node);
1191 if (!p)
1192 return -ENOMEM;
1193
1194 entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
1195 PAGE_KERNEL_LARGE);
1196 set_pmd(pmd, __pmd(pte_val(entry)));
1197
1198 /* check to see if we have contiguous blocks */
1199 if (p_end != p || node_start != node) {
1200 if (p_start)
1201 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1202 addr_start, addr_end-1, p_start, p_end-1, node_start);
1203 addr_start = addr;
1204 node_start = node;
1205 p_start = p;
1206 }
1207
1208 addr_end = addr + PMD_SIZE;
1209 p_end = p + PMD_SIZE;
1210 } else
1211 vmemmap_verify((pte_t *)pmd, node, addr, next);
978 } 1212 }
1213
979 } 1214 }
980 return 0; 1215 return 0;
981} 1216}
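vmemmap_populate() above now falls back to 4k PTE mappings when the CPU lacks PSE and keeps the 2MB-block path otherwise. Each 2MB vmemmap block holds PMD_SIZE / sizeof(struct page) page descriptors, so one block describes a fixed chunk of physical memory. A small worked example; the 64-byte struct page size is an assumption for the arithmetic, the real size depends on the configuration:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	 4096UL
#define PMD_SIZE	 (2UL << 20)	/* 2MB vmemmap block */
#define STRUCT_PAGE_SIZE 64UL		/* assumed; config dependent */

int main(void)
{
	uint64_t pages_per_block = PMD_SIZE / STRUCT_PAGE_SIZE;
	uint64_t ram_per_block   = pages_per_block * PAGE_SIZE;

	printf("one 2MB vmemmap block covers %llu struct pages = %llu MB of RAM\n",
	       (unsigned long long)pages_per_block,
	       (unsigned long long)(ram_per_block >> 20));
	return 0;
}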
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 2b2bb3f9b683..45e546c4ba78 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -142,7 +142,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
142 /* 142 /*
143 * Don't remap the low PCI/ISA area, it's always mapped.. 143 * Don't remap the low PCI/ISA area, it's always mapped..
144 */ 144 */
145 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) 145 if (is_ISA_range(phys_addr, last_addr))
146 return (__force void __iomem *)phys_to_virt(phys_addr); 146 return (__force void __iomem *)phys_to_virt(phys_addr);
147 147
148 /* 148 /*
@@ -261,7 +261,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
261{ 261{
262 /* 262 /*
263 * Ideally, this should be: 263 * Ideally, this should be:
264 * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS; 264 * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
265 * 265 *
266 * Till we fix all X drivers to use ioremap_wc(), we will use 266 * Till we fix all X drivers to use ioremap_wc(), we will use
267 * UC MINUS. 267 * UC MINUS.
@@ -285,7 +285,7 @@ EXPORT_SYMBOL(ioremap_nocache);
285 */ 285 */
286void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) 286void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
287{ 287{
288 if (pat_wc_enabled) 288 if (pat_enabled)
289 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, 289 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
290 __builtin_return_address(0)); 290 __builtin_return_address(0));
291 else 291 else
@@ -318,8 +318,8 @@ void iounmap(volatile void __iomem *addr)
318 * vm_area and by simply returning an address into the kernel mapping 318 * vm_area and by simply returning an address into the kernel mapping
319 * of ISA space. So handle that here. 319 * of ISA space. So handle that here.
320 */ 320 */
321 if (addr >= phys_to_virt(ISA_START_ADDRESS) && 321 if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
322 addr < phys_to_virt(ISA_END_ADDRESS)) 322 (void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
323 return; 323 return;
324 324
325 addr = (volatile void __iomem *) 325 addr = (volatile void __iomem *)
@@ -332,7 +332,7 @@ void iounmap(volatile void __iomem *addr)
332 cpa takes care of the direct mappings. */ 332 cpa takes care of the direct mappings. */
333 read_lock(&vmlist_lock); 333 read_lock(&vmlist_lock);
334 for (p = vmlist; p; p = p->next) { 334 for (p = vmlist; p; p = p->next) {
335 if (p->addr == addr) 335 if (p->addr == (void __force *)addr)
336 break; 336 break;
337 } 337 }
338 read_unlock(&vmlist_lock); 338 read_unlock(&vmlist_lock);
@@ -346,7 +346,7 @@ void iounmap(volatile void __iomem *addr)
346 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); 346 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
347 347
348 /* Finally remove it */ 348 /* Finally remove it */
349 o = remove_vm_area((void *)addr); 349 o = remove_vm_area((void __force *)addr);
350 BUG_ON(p != o || o == NULL); 350 BUG_ON(p != o || o == NULL);
351 kfree(p); 351 kfree(p);
352} 352}
@@ -365,7 +365,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
365 if (page_is_ram(start >> PAGE_SHIFT)) 365 if (page_is_ram(start >> PAGE_SHIFT))
366 return __va(phys); 366 return __va(phys);
367 367
368 addr = (void *)ioremap(start, PAGE_SIZE); 368 addr = (void __force *)ioremap(start, PAGE_SIZE);
369 if (addr) 369 if (addr)
370 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); 370 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
371 371
@@ -381,8 +381,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
381 return; 381 return;
382} 382}
383 383
384#ifdef CONFIG_X86_32
385
386int __initdata early_ioremap_debug; 384int __initdata early_ioremap_debug;
387 385
388static int __init early_ioremap_debug_setup(char *str) 386static int __init early_ioremap_debug_setup(char *str)
@@ -394,8 +392,7 @@ static int __init early_ioremap_debug_setup(char *str)
394early_param("early_ioremap_debug", early_ioremap_debug_setup); 392early_param("early_ioremap_debug", early_ioremap_debug_setup);
395 393
396static __initdata int after_paging_init; 394static __initdata int after_paging_init;
397static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] 395static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
398 __section(.bss.page_aligned);
399 396
400static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) 397static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
401{ 398{
@@ -484,10 +481,11 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
484 return; 481 return;
485 } 482 }
486 pte = early_ioremap_pte(addr); 483 pte = early_ioremap_pte(addr);
484
487 if (pgprot_val(flags)) 485 if (pgprot_val(flags))
488 set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); 486 set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
489 else 487 else
490 pte_clear(NULL, addr, pte); 488 pte_clear(&init_mm, addr, pte);
491 __flush_tlb_one(addr); 489 __flush_tlb_one(addr);
492} 490}
493 491
@@ -625,5 +623,3 @@ void __this_fixmap_does_not_exist(void)
625{ 623{
626 WARN_ON(1); 624 WARN_ON(1);
627} 625}
628
629#endif /* CONFIG_X86_32 */
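Two small points in the ioremap.c changes above: the low PCI/ISA window is never remapped because it is already covered by the fixed kernel mapping, and ioremap_wc() quietly degrades to UC- when PAT is disabled. The is_ISA_range() check is essentially a bounds test on the legacy window below 1MB; a hedged restatement, with the constants written out as the conventional 0xA0000-0xFFFFF legacy VGA/BIOS range rather than copied from the header:

#include <stdint.h>
#include <stdio.h>

#define ISA_START 0xA0000UL	/* legacy VGA/BIOS window start (assumed value) */
#define ISA_END   0x100000UL	/* first byte above the 1MB legacy area */

/* true if [phys, last] lies entirely inside the always-mapped ISA window */
static int in_isa_window(uint64_t phys, uint64_t last)
{
	return phys >= ISA_START && last < ISA_END;
}

int main(void)
{
	printf("VGA text buffer 0xB8000+4k : %d\n", in_isa_window(0xB8000, 0xB8FFF));
	printf("RAM at 16MB               : %d\n", in_isa_window(0x1000000, 0x1000FFF));
	return 0;
}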
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 1f476e477844..41f1b5c00a1d 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -22,6 +22,7 @@
22#include <asm/numa.h> 22#include <asm/numa.h>
23#include <asm/mpspec.h> 23#include <asm/mpspec.h>
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/k8.h>
25 26
26static __init int find_northbridge(void) 27static __init int find_northbridge(void)
27{ 28{
@@ -56,34 +57,33 @@ static __init void early_get_boot_cpu_id(void)
56 /* 57 /*
57 * Find possible boot-time SMP configuration: 58 * Find possible boot-time SMP configuration:
58 */ 59 */
60#ifdef CONFIG_X86_MPPARSE
59 early_find_smp_config(); 61 early_find_smp_config();
62#endif
60#ifdef CONFIG_ACPI 63#ifdef CONFIG_ACPI
61 /* 64 /*
62 * Read APIC information from ACPI tables. 65 * Read APIC information from ACPI tables.
63 */ 66 */
64 early_acpi_boot_init(); 67 early_acpi_boot_init();
65#endif 68#endif
69#ifdef CONFIG_X86_MPPARSE
66 /* 70 /*
67 * get boot-time SMP configuration: 71 * get boot-time SMP configuration:
68 */ 72 */
69 if (smp_found_config) 73 if (smp_found_config)
70 early_get_smp_config(); 74 early_get_smp_config();
75#endif
71 early_init_lapic_mapping(); 76 early_init_lapic_mapping();
72} 77}
73 78
74int __init k8_scan_nodes(unsigned long start, unsigned long end) 79int __init k8_scan_nodes(unsigned long start, unsigned long end)
75{ 80{
81 unsigned numnodes, cores, bits, apicid_base;
76 unsigned long prevbase; 82 unsigned long prevbase;
77 struct bootnode nodes[8]; 83 struct bootnode nodes[8];
78 int nodeid, i, nb;
79 unsigned char nodeids[8]; 84 unsigned char nodeids[8];
80 int found = 0; 85 int i, j, nb, found = 0;
81 u32 reg; 86 u32 nodeid, reg;
82 unsigned numnodes;
83 unsigned cores;
84 unsigned bits;
85 int j;
86 unsigned apicid_base;
87 87
88 if (!early_pci_allowed()) 88 if (!early_pci_allowed())
89 return -1; 89 return -1;
@@ -105,7 +105,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
105 prevbase = 0; 105 prevbase = 0;
106 for (i = 0; i < 8; i++) { 106 for (i = 0; i < 8; i++) {
107 unsigned long base, limit; 107 unsigned long base, limit;
108 u32 nodeid;
109 108
110 base = read_pci_config(0, nb, 1, 0x40 + i*8); 109 base = read_pci_config(0, nb, 1, 0x40 + i*8);
111 limit = read_pci_config(0, nb, 1, 0x44 + i*8); 110 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
@@ -144,8 +143,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
144 limit |= (1<<24)-1; 143 limit |= (1<<24)-1;
145 limit++; 144 limit++;
146 145
147 if (limit > end_pfn << PAGE_SHIFT) 146 if (limit > max_pfn << PAGE_SHIFT)
148 limit = end_pfn << PAGE_SHIFT; 147 limit = max_pfn << PAGE_SHIFT;
149 if (limit <= base) 148 if (limit <= base)
150 continue; 149 continue;
151 150
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index c5066d519e5d..b432d5781773 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -27,30 +27,17 @@
27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
28EXPORT_SYMBOL(node_data); 28EXPORT_SYMBOL(node_data);
29 29
30bootmem_data_t plat_node_bdata[MAX_NUMNODES]; 30static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
31 31
32struct memnode memnode; 32struct memnode memnode;
33 33
34#ifdef CONFIG_SMP
35int x86_cpu_to_node_map_init[NR_CPUS] = {
36 [0 ... NR_CPUS-1] = NUMA_NO_NODE
37};
38void *x86_cpu_to_node_map_early_ptr;
39EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
40#endif
41DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
42EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
43
44s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 34s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
45 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 35 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
46}; 36};
47 37
48cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
49EXPORT_SYMBOL(node_to_cpumask_map);
50
51int numa_off __initdata; 38int numa_off __initdata;
52unsigned long __initdata nodemap_addr; 39static unsigned long __initdata nodemap_addr;
53unsigned long __initdata nodemap_size; 40static unsigned long __initdata nodemap_size;
54 41
55/* 42/*
56 * Given a shift value, try to populate memnodemap[] 43 * Given a shift value, try to populate memnodemap[]
@@ -99,7 +86,7 @@ static int __init allocate_cachealigned_memnodemap(void)
99 86
100 addr = 0x8000; 87 addr = 0x8000;
101 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 88 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
102 nodemap_addr = find_e820_area(addr, end_pfn<<PAGE_SHIFT, 89 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
103 nodemap_size, L1_CACHE_BYTES); 90 nodemap_size, L1_CACHE_BYTES);
104 if (nodemap_addr == -1UL) { 91 if (nodemap_addr == -1UL) {
105 printk(KERN_ERR 92 printk(KERN_ERR
@@ -192,7 +179,7 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
192void __init setup_node_bootmem(int nodeid, unsigned long start, 179void __init setup_node_bootmem(int nodeid, unsigned long start,
193 unsigned long end) 180 unsigned long end)
194{ 181{
195 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size; 182 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
196 unsigned long bootmap_start, nodedata_phys; 183 unsigned long bootmap_start, nodedata_phys;
197 void *bootmap; 184 void *bootmap;
198 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 185 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
@@ -204,7 +191,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
204 start, end); 191 start, end);
205 192
206 start_pfn = start >> PAGE_SHIFT; 193 start_pfn = start >> PAGE_SHIFT;
207 end_pfn = end >> PAGE_SHIFT; 194 last_pfn = end >> PAGE_SHIFT;
208 195
209 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, 196 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
210 SMP_CACHE_BYTES); 197 SMP_CACHE_BYTES);
@@ -217,7 +204,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
217 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 204 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
218 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 205 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
219 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 206 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
220 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; 207 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
221 208
222 /* 209 /*
223 * Find a place for the bootmem map 210 * Find a place for the bootmem map
@@ -226,14 +213,14 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
226 * early_node_mem will get that with find_e820_area instead 213 * early_node_mem will get that with find_e820_area instead
227 * of alloc_bootmem, that could clash with reserved range 214 * of alloc_bootmem, that could clash with reserved range
228 */ 215 */
229 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 216 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
230 nid = phys_to_nid(nodedata_phys); 217 nid = phys_to_nid(nodedata_phys);
231 if (nid == nodeid) 218 if (nid == nodeid)
232 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 219 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
233 else 220 else
234 bootmap_start = round_up(start, PAGE_SIZE); 221 bootmap_start = round_up(start, PAGE_SIZE);
235 /* 222 /*
236 * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like 223 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
237 * to use that to align to PAGE_SIZE 224 * to use that to align to PAGE_SIZE
238 */ 225 */
239 bootmap = early_node_mem(nodeid, bootmap_start, end, 226 bootmap = early_node_mem(nodeid, bootmap_start, end,
@@ -248,7 +235,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
248 235
249 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 236 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
250 bootmap_start >> PAGE_SHIFT, 237 bootmap_start >> PAGE_SHIFT,
251 start_pfn, end_pfn); 238 start_pfn, last_pfn);
252 239
253 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", 240 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
254 bootmap_start, bootmap_start + bootmap_size - 1, 241 bootmap_start, bootmap_start + bootmap_size - 1,
@@ -309,7 +296,7 @@ void __init numa_init_array(void)
309 296
310#ifdef CONFIG_NUMA_EMU 297#ifdef CONFIG_NUMA_EMU
311/* Numa emulation */ 298/* Numa emulation */
312char *cmdline __initdata; 299static char *cmdline __initdata;
313 300
314/* 301/*
315 * Setups up nid to range from addr to addr + size. If the end 302 * Setups up nid to range from addr to addr + size. If the end
315 * Sets up nid to range from addr to addr + size. If the end 302 * Sets up nid to range from addr to addr + size. If the end
@@ -413,15 +400,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
413} 400}
414 401
415/* 402/*
416 * Sets up the system RAM area from start_pfn to end_pfn according to the 403 * Sets up the system RAM area from start_pfn to last_pfn according to the
417 * numa=fake command-line option. 404 * numa=fake command-line option.
418 */ 405 */
419static struct bootnode nodes[MAX_NUMNODES] __initdata; 406static struct bootnode nodes[MAX_NUMNODES] __initdata;
420 407
421static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 408static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
422{ 409{
423 u64 size, addr = start_pfn << PAGE_SHIFT; 410 u64 size, addr = start_pfn << PAGE_SHIFT;
424 u64 max_addr = end_pfn << PAGE_SHIFT; 411 u64 max_addr = last_pfn << PAGE_SHIFT;
425 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; 412 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
426 413
427 memset(&nodes, 0, sizeof(nodes)); 414 memset(&nodes, 0, sizeof(nodes));
@@ -527,7 +514,7 @@ out:
527} 514}
528#endif /* CONFIG_NUMA_EMU */ 515#endif /* CONFIG_NUMA_EMU */
529 516
530void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 517void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
531{ 518{
532 int i; 519 int i;
533 520
@@ -535,7 +522,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
535 nodes_clear(node_online_map); 522 nodes_clear(node_online_map);
536 523
537#ifdef CONFIG_NUMA_EMU 524#ifdef CONFIG_NUMA_EMU
538 if (cmdline && !numa_emulation(start_pfn, end_pfn)) 525 if (cmdline && !numa_emulation(start_pfn, last_pfn))
539 return; 526 return;
540 nodes_clear(node_possible_map); 527 nodes_clear(node_possible_map);
541 nodes_clear(node_online_map); 528 nodes_clear(node_online_map);
@@ -543,7 +530,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
543 530
544#ifdef CONFIG_ACPI_NUMA 531#ifdef CONFIG_ACPI_NUMA
545 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 532 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
546 end_pfn << PAGE_SHIFT)) 533 last_pfn << PAGE_SHIFT))
547 return; 534 return;
548 nodes_clear(node_possible_map); 535 nodes_clear(node_possible_map);
549 nodes_clear(node_online_map); 536 nodes_clear(node_online_map);
@@ -551,7 +538,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
551 538
552#ifdef CONFIG_K8_NUMA 539#ifdef CONFIG_K8_NUMA
553 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, 540 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
554 end_pfn<<PAGE_SHIFT)) 541 last_pfn<<PAGE_SHIFT))
555 return; 542 return;
556 nodes_clear(node_possible_map); 543 nodes_clear(node_possible_map);
557 nodes_clear(node_online_map); 544 nodes_clear(node_online_map);
@@ -561,7 +548,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
561 548
562 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 549 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
563 start_pfn << PAGE_SHIFT, 550 start_pfn << PAGE_SHIFT,
564 end_pfn << PAGE_SHIFT); 551 last_pfn << PAGE_SHIFT);
565 /* setup dummy node covering all memory */ 552 /* setup dummy node covering all memory */
566 memnode_shift = 63; 553 memnode_shift = 63;
567 memnodemap = memnode.embedded_map; 554 memnodemap = memnode.embedded_map;
@@ -570,29 +557,8 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
570 node_set(0, node_possible_map); 557 node_set(0, node_possible_map);
571 for (i = 0; i < NR_CPUS; i++) 558 for (i = 0; i < NR_CPUS; i++)
572 numa_set_node(i, 0); 559 numa_set_node(i, 0);
573 /* cpumask_of_cpu() may not be available during early startup */ 560 e820_register_active_regions(0, start_pfn, last_pfn);
574 memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0])); 561 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
575 cpu_set(0, node_to_cpumask_map[0]);
576 e820_register_active_regions(0, start_pfn, end_pfn);
577 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
578}
579
580__cpuinit void numa_add_cpu(int cpu)
581{
582 set_bit(cpu,
583 (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
584}
585
586void __cpuinit numa_set_node(int cpu, int node)
587{
588 int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
589
590 if(cpu_to_node_map)
591 cpu_to_node_map[cpu] = node;
592 else if(per_cpu_offset(cpu))
593 per_cpu(x86_cpu_to_node_map, cpu) = node;
594 else
595 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
596} 562}
597 563
598unsigned long __init numa_free_all_bootmem(void) 564unsigned long __init numa_free_all_bootmem(void)
@@ -613,7 +579,7 @@ void __init paging_init(void)
613 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 579 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
614 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 580 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
615 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 581 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
616 max_zone_pfns[ZONE_NORMAL] = end_pfn; 582 max_zone_pfns[ZONE_NORMAL] = max_pfn;
617 583
618 sparse_memory_present_with_active_regions(MAX_NUMNODES); 584 sparse_memory_present_with_active_regions(MAX_NUMNODES);
619 sparse_init(); 585 sparse_init();
@@ -641,6 +607,7 @@ static __init int numa_setup(char *opt)
641} 607}
642early_param("numa", numa_setup); 608early_param("numa", numa_setup);
643 609
610#ifdef CONFIG_NUMA
644/* 611/*
645 * Setup early cpu_to_node. 612 * Setup early cpu_to_node.
646 * 613 *
@@ -652,14 +619,19 @@ early_param("numa", numa_setup);
652 * is already initialized in a round robin manner at numa_init_array, 619 * is already initialized in a round robin manner at numa_init_array,
653 * prior to this call, and this initialization is good enough 620 * prior to this call, and this initialization is good enough
654 * for the fake NUMA cases. 621 * for the fake NUMA cases.
622 *
623 * Called before the per_cpu areas are setup.
655 */ 624 */
656void __init init_cpu_to_node(void) 625void __init init_cpu_to_node(void)
657{ 626{
658 int i; 627 int cpu;
628 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
659 629
660 for (i = 0; i < NR_CPUS; i++) { 630 BUG_ON(cpu_to_apicid == NULL);
631
632 for_each_possible_cpu(cpu) {
661 int node; 633 int node;
662 u16 apicid = x86_cpu_to_apicid_init[i]; 634 u16 apicid = cpu_to_apicid[cpu];
663 635
664 if (apicid == BAD_APICID) 636 if (apicid == BAD_APICID)
665 continue; 637 continue;
@@ -668,8 +640,9 @@ void __init init_cpu_to_node(void)
668 continue; 640 continue;
669 if (!node_online(node)) 641 if (!node_online(node))
670 continue; 642 continue;
671 numa_set_node(i, node); 643 numa_set_node(cpu, node);
672 } 644 }
673} 645}
646#endif
674 647
675 648
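numa_64.c above makes plat_node_bdata, nodemap_addr/size and the emulation cmdline static, and the single fake-node path sets memnode_shift = 63 so that every physical address hashes to entry 0 of the embedded memnodemap. The address-to-node lookup itself is just an array index by (addr >> memnode_shift); a toy model of that lookup, with the node layout and shift invented for the demo:

#include <stdint.h>
#include <stdio.h>

/* Toy memnodemap: one signed byte per chunk of (1 << memnode_shift) bytes. */
static const int8_t memnodemap[] = { 0, 0, 1, 1, -1, -1, -1, -1 };
static const unsigned memnode_shift = 30;	/* 1GB chunks, demo value */

static int phys_to_nid(uint64_t addr)
{
	return memnodemap[addr >> memnode_shift];
}

int main(void)
{
	printf("0x20000000 -> node %d\n", phys_to_nid(0x20000000ULL));	/* 512MB */
	printf("0xC0000000 -> node %d\n", phys_to_nid(0xC0000000ULL));	/* 3GB   */
	return 0;
}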
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 75f1b109aae8..0dcd42eb94e6 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * self test for change_page_attr. 2 * self test for change_page_attr.
3 * 3 *
4 * Clears the global bit on random pages in the direct mapping, then reverts 4 * Clears the a test pte bit on random pages in the direct mapping,
4 * Clears the global bit on random pages in the direct mapping, then reverts 4 * Clears a test pte bit on random pages in the direct mapping,
5 * and compares page tables forwards and afterwards. 5 * then reverts and compares page tables forwards and afterwards.
6 */ 6 */
7#include <linux/bootmem.h> 7#include <linux/bootmem.h>
8#include <linux/kthread.h> 8#include <linux/kthread.h>
@@ -32,6 +32,13 @@ enum {
32 GPS = (1<<30) 32 GPS = (1<<30)
33}; 33};
34 34
35#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1)
36
37static int pte_testbit(pte_t pte)
38{
39 return pte_flags(pte) & _PAGE_UNUSED1;
40}
41
35struct split_state { 42struct split_state {
36 long lpg, gpg, spg, exec; 43 long lpg, gpg, spg, exec;
37 long min_exec, max_exec; 44 long min_exec, max_exec;
@@ -165,15 +172,14 @@ static int pageattr_test(void)
165 continue; 172 continue;
166 } 173 }
167 174
168 err = change_page_attr_clear(addr[i], len[i], 175 err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT);
169 __pgprot(_PAGE_GLOBAL));
170 if (err < 0) { 176 if (err < 0) {
171 printk(KERN_ERR "CPA %d failed %d\n", i, err); 177 printk(KERN_ERR "CPA %d failed %d\n", i, err);
172 failed++; 178 failed++;
173 } 179 }
174 180
175 pte = lookup_address(addr[i], &level); 181 pte = lookup_address(addr[i], &level);
176 if (!pte || pte_global(*pte) || pte_huge(*pte)) { 182 if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) {
177 printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i], 183 printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
178 pte ? (u64)pte_val(*pte) : 0ULL); 184 pte ? (u64)pte_val(*pte) : 0ULL);
179 failed++; 185 failed++;
@@ -198,14 +204,13 @@ static int pageattr_test(void)
198 failed++; 204 failed++;
199 continue; 205 continue;
200 } 206 }
201 err = change_page_attr_set(addr[i], len[i], 207 err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT);
202 __pgprot(_PAGE_GLOBAL));
203 if (err < 0) { 208 if (err < 0) {
204 printk(KERN_ERR "CPA reverting failed: %d\n", err); 209 printk(KERN_ERR "CPA reverting failed: %d\n", err);
205 failed++; 210 failed++;
206 } 211 }
207 pte = lookup_address(addr[i], &level); 212 pte = lookup_address(addr[i], &level);
208 if (!pte || !pte_global(*pte)) { 213 if (!pte || pte_testbit(*pte)) {
209 printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n", 214 printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
210 addr[i], pte ? (u64)pte_val(*pte) : 0ULL); 215 addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
211 failed++; 216 failed++;
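The self-test above now toggles a scratch PTE flag (_PAGE_UNUSED1 via PAGE_TESTBIT) instead of the global bit, so it no longer interferes with the regular use of _PAGE_GLOBAL. The pattern is set-flag, verify, clear-flag, verify. A toy model of that round trip on a plain flags word; the bit position is a stand-in, not taken from the headers:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TESTBIT (1UL << 9)	/* stand-in for a software-available PTE bit */

int main(void)
{
	uint64_t pte = 0x8000000000000063ULL;	/* some populated pte-like value */

	pte |= TESTBIT;				/* like change_page_attr_set(..., PAGE_TESTBIT) */
	assert(pte & TESTBIT);

	pte &= ~TESTBIT;			/* like change_page_attr_clear(..., PAGE_TESTBIT) */
	assert(!(pte & TESTBIT));

	printf("test bit round trip ok, pte %#llx\n", (unsigned long long)pte);
	return 0;
}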
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 60bcb5b6a37e..afd40054d157 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -34,6 +34,41 @@ struct cpa_data {
34 unsigned force_split : 1; 34 unsigned force_split : 1;
35}; 35};
36 36
37#ifdef CONFIG_PROC_FS
38static unsigned long direct_pages_count[PG_LEVEL_NUM];
39
40void update_page_count(int level, unsigned long pages)
41{
42 unsigned long flags;
43
44 /* Protect against CPA */
45 spin_lock_irqsave(&pgd_lock, flags);
46 direct_pages_count[level] += pages;
47 spin_unlock_irqrestore(&pgd_lock, flags);
48}
49
50static void split_page_count(int level)
51{
52 direct_pages_count[level]--;
53 direct_pages_count[level - 1] += PTRS_PER_PTE;
54}
55
56int arch_report_meminfo(char *page)
57{
58 int n = sprintf(page, "DirectMap4k: %8lu\n"
59 "DirectMap2M: %8lu\n",
60 direct_pages_count[PG_LEVEL_4K],
61 direct_pages_count[PG_LEVEL_2M]);
62#ifdef CONFIG_X86_64
63 n += sprintf(page + n, "DirectMap1G: %8lu\n",
64 direct_pages_count[PG_LEVEL_1G]);
65#endif
66 return n;
67}
68#else
69static inline void split_page_count(int level) { }
70#endif
71
37#ifdef CONFIG_X86_64 72#ifdef CONFIG_X86_64
38 73
39static inline unsigned long highmap_start_pfn(void) 74static inline unsigned long highmap_start_pfn(void)
@@ -500,6 +535,10 @@ static int split_large_page(pte_t *kpte, unsigned long address)
500 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) 535 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
501 set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); 536 set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
502 537
538 if (address >= (unsigned long)__va(0) &&
539 address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
540 split_page_count(level);
541
503 /* 542 /*
504 * Install the new, split up pagetable. Important details here: 543 * Install the new, split up pagetable. Important details here:
505 * 544 *
@@ -805,7 +844,7 @@ int _set_memory_wc(unsigned long addr, int numpages)
805 844
806int set_memory_wc(unsigned long addr, int numpages) 845int set_memory_wc(unsigned long addr, int numpages)
807{ 846{
808 if (!pat_wc_enabled) 847 if (!pat_enabled)
809 return set_memory_uc(addr, numpages); 848 return set_memory_uc(addr, numpages);
810 849
811 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 850 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
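update_page_count()/split_page_count() above maintain per-page-size counters for the kernel direct mapping, exported by arch_report_meminfo() as DirectMap4k/2M/1G lines. Splitting one large page converts a single 2M (or 1G) entry into PTRS_PER_PTE smaller ones. A toy model of that bookkeeping; the level names and counts are illustrative:

#include <stdio.h>

enum { LVL_4K, LVL_2M, LVL_1G, LVL_NUM };
#define PTRS_PER_PTE 512

static unsigned long direct_pages[LVL_NUM];

static void update_page_count(int level, unsigned long pages)
{
	direct_pages[level] += pages;
}

/* one large page at 'level' becomes 512 entries one level down */
static void split_page_count(int level)
{
	direct_pages[level]--;
	direct_pages[level - 1] += PTRS_PER_PTE;
}

int main(void)
{
	update_page_count(LVL_2M, 2048);	/* say 4GB mapped with 2MB pages */
	split_page_count(LVL_2M);		/* one of them gets split by CPA */

	printf("DirectMap4k: %8lu\nDirectMap2M: %8lu\n",
	       direct_pages[LVL_4K], direct_pages[LVL_2M]);
	return 0;
}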
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 06b7a1c90fb8..a885a1019b8a 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -26,11 +26,11 @@
26#include <asm/io.h> 26#include <asm/io.h>
27 27
28#ifdef CONFIG_X86_PAT 28#ifdef CONFIG_X86_PAT
29int __read_mostly pat_wc_enabled = 1; 29int __read_mostly pat_enabled = 1;
30 30
31void __cpuinit pat_disable(char *reason) 31void __cpuinit pat_disable(char *reason)
32{ 32{
33 pat_wc_enabled = 0; 33 pat_enabled = 0;
34 printk(KERN_INFO "%s\n", reason); 34 printk(KERN_INFO "%s\n", reason);
35} 35}
36 36
@@ -42,6 +42,19 @@ static int __init nopat(char *str)
42early_param("nopat", nopat); 42early_param("nopat", nopat);
43#endif 43#endif
44 44
45
46static int debug_enable;
47static int __init pat_debug_setup(char *str)
48{
49 debug_enable = 1;
50 return 0;
51}
52__setup("debugpat", pat_debug_setup);
53
54#define dprintk(fmt, arg...) \
55 do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
56
57
45static u64 __read_mostly boot_pat_state; 58static u64 __read_mostly boot_pat_state;
46 59
47enum { 60enum {
@@ -53,24 +66,25 @@ enum {
53 PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */ 66 PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
53 PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ 66 PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */
54}; 67};
55 68
56#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8)) 69#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
57 70
58void pat_init(void) 71void pat_init(void)
59{ 72{
60 u64 pat; 73 u64 pat;
61 74
62 if (!pat_wc_enabled) 75 if (!pat_enabled)
63 return; 76 return;
64 77
65 /* Paranoia check. */ 78 /* Paranoia check. */
66 if (!cpu_has_pat) { 79 if (!cpu_has_pat && boot_pat_state) {
67 printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
68 /* 80 /*
69 * Panic if this happens on the secondary CPU, and we 81 * If this happens we are on a secondary CPU, but
70 * switched to PAT on the boot CPU. We have no way to 82 * switched to PAT on the boot CPU. We have no way to
71 * undo PAT. 83 * undo PAT.
72 */ 84 */
73 BUG_ON(boot_pat_state); 85 printk(KERN_ERR "PAT enabled, "
86 "but not supported by secondary CPU\n");
87 BUG();
74 } 88 }
75 89
76 /* Set PWT to Write-Combining. All other bits stay the same */ 90 /* Set PWT to Write-Combining. All other bits stay the same */
@@ -86,8 +100,8 @@ void pat_init(void)
86 * 011 UC _PAGE_CACHE_UC 100 * 011 UC _PAGE_CACHE_UC
87 * PAT bit unused 101 * PAT bit unused
88 */ 102 */
89 pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) | 103 pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
90 PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC); 104 PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
91 105
92 /* Boot CPU check */ 106 /* Boot CPU check */
93 if (!boot_pat_state) 107 if (!boot_pat_state)
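pat_init() above programs the PAT MSR so the four PAT/PCD/PWT combinations map to WB, WC, UC- and UC, with the high four entries mirroring the low ones. Using the architectural memory-type encodings (UC=0, WC=1, WT=4, WP=5, WB=6, UC-=7), the per-slot construction yields a single 64-bit value. A quick userspace check of that arithmetic; PAT_ENTRY() here is a local helper, not the kernel's PAT() macro:

#include <stdint.h>
#include <stdio.h>

/* architectural x86 memory-type encodings used in the IA32_PAT MSR */
enum { PAT_UC = 0, PAT_WC = 1, PAT_WT = 4, PAT_WP = 5, PAT_WB = 6, PAT_UC_MINUS = 7 };

#define PAT_ENTRY(idx, type) ((uint64_t)(type) << ((idx) * 8))

int main(void)
{
	uint64_t pat = PAT_ENTRY(0, PAT_WB) | PAT_ENTRY(1, PAT_WC) |
		       PAT_ENTRY(2, PAT_UC_MINUS) | PAT_ENTRY(3, PAT_UC) |
		       PAT_ENTRY(4, PAT_WB) | PAT_ENTRY(5, PAT_WC) |
		       PAT_ENTRY(6, PAT_UC_MINUS) | PAT_ENTRY(7, PAT_UC);

	printf("IA32_PAT value: %#018llx\n", (unsigned long long)pat);	/* 0x0007010600070106 */
	return 0;
}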
@@ -103,11 +117,11 @@ void pat_init(void)
103static char *cattr_name(unsigned long flags) 117static char *cattr_name(unsigned long flags)
104{ 118{
105 switch (flags & _PAGE_CACHE_MASK) { 119 switch (flags & _PAGE_CACHE_MASK) {
106 case _PAGE_CACHE_UC: return "uncached"; 120 case _PAGE_CACHE_UC: return "uncached";
107 case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; 121 case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
108 case _PAGE_CACHE_WB: return "write-back"; 122 case _PAGE_CACHE_WB: return "write-back";
109 case _PAGE_CACHE_WC: return "write-combining"; 123 case _PAGE_CACHE_WC: return "write-combining";
110 default: return "broken"; 124 default: return "broken";
111 } 125 }
112} 126}
113 127
@@ -145,47 +159,50 @@ static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
145 * The intersection is based on "Effective Memory Type" tables in IA-32 159 * The intersection is based on "Effective Memory Type" tables in IA-32
146 * SDM vol 3a 160 * SDM vol 3a
147 */ 161 */
148static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, 162static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
149 unsigned long *ret_prot)
150{ 163{
151 unsigned long pat_type;
152 u8 mtrr_type;
153
154 pat_type = prot & _PAGE_CACHE_MASK;
155 prot &= (~_PAGE_CACHE_MASK);
156
157 /*
158 * We return the PAT request directly for types where PAT takes
159 * precedence with respect to MTRR and for UC_MINUS.
160 * Consistency checks with other PAT requests is done later
161 * while going through memtype list.
162 */
163 if (pat_type == _PAGE_CACHE_WC) {
164 *ret_prot = prot | _PAGE_CACHE_WC;
165 return 0;
166 } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
167 *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
168 return 0;
169 } else if (pat_type == _PAGE_CACHE_UC) {
170 *ret_prot = prot | _PAGE_CACHE_UC;
171 return 0;
172 }
173
174 /* 164 /*
175 * Look for MTRR hint to get the effective type in case where PAT 165 * Look for MTRR hint to get the effective type in case where PAT
176 * request is for WB. 166 * request is for WB.
177 */ 167 */
178 mtrr_type = mtrr_type_lookup(start, end); 168 if (req_type == _PAGE_CACHE_WB) {
169 u8 mtrr_type;
170
171 mtrr_type = mtrr_type_lookup(start, end);
172 if (mtrr_type == MTRR_TYPE_UNCACHABLE)
173 return _PAGE_CACHE_UC;
174 if (mtrr_type == MTRR_TYPE_WRCOMB)
175 return _PAGE_CACHE_WC;
176 }
179 177
180 if (mtrr_type == MTRR_TYPE_UNCACHABLE) { 178 return req_type;
181 *ret_prot = prot | _PAGE_CACHE_UC; 179}
182 } else if (mtrr_type == MTRR_TYPE_WRCOMB) { 180
183 *ret_prot = prot | _PAGE_CACHE_WC; 181static int chk_conflict(struct memtype *new, struct memtype *entry,
184 } else { 182 unsigned long *type)
185 *ret_prot = prot | _PAGE_CACHE_WB; 183{
184 if (new->type != entry->type) {
185 if (type) {
186 new->type = entry->type;
187 *type = entry->type;
188 } else
189 goto conflict;
186 } 190 }
187 191
192 /* check overlaps with more than one entry in the list */
193 list_for_each_entry_continue(entry, &memtype_list, nd) {
194 if (new->end <= entry->start)
195 break;
196 else if (new->type != entry->type)
197 goto conflict;
198 }
188 return 0; 199 return 0;
200
201 conflict:
202 printk(KERN_INFO "%s:%d conflicting memory types "
203 "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
204 new->end, cattr_name(new->type), cattr_name(entry->type));
205 return -EBUSY;
189} 206}
190 207
191/* 208/*
@@ -198,37 +215,36 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
198 * req_type will have a special case value '-1', when the requester wants to inherit 215 * req_type will have a special case value '-1', when the requester wants to inherit
199 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. 216 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
200 * 217 *
201 * If ret_type is NULL, function will return an error if it cannot reserve the 218 * If new_type is NULL, function will return an error if it cannot reserve the
202 * region with req_type. If ret_type is non-null, function will return 219 * region with req_type. If new_type is non-NULL, function will return
203 * available type in ret_type in case of no error. In case of any error 220 * available type in new_type in case of no error. In case of any error
204 * it will return a negative return value. 221 * it will return a negative return value.
205 */ 222 */
206int reserve_memtype(u64 start, u64 end, unsigned long req_type, 223int reserve_memtype(u64 start, u64 end, unsigned long req_type,
207 unsigned long *ret_type) 224 unsigned long *new_type)
208{ 225{
209 struct memtype *new_entry = NULL; 226 struct memtype *new, *entry;
210 struct memtype *parse;
211 unsigned long actual_type; 227 unsigned long actual_type;
228 struct list_head *where;
212 int err = 0; 229 int err = 0;
213 230
214 /* Only track when pat_wc_enabled */ 231 BUG_ON(start >= end); /* end is exclusive */
215 if (!pat_wc_enabled) { 232
233 if (!pat_enabled) {
216 /* This is identical to page table setting without PAT */ 234 /* This is identical to page table setting without PAT */
217 if (ret_type) { 235 if (new_type) {
218 if (req_type == -1) { 236 if (req_type == -1)
219 *ret_type = _PAGE_CACHE_WB; 237 *new_type = _PAGE_CACHE_WB;
220 } else { 238 else
221 *ret_type = req_type; 239 *new_type = req_type & _PAGE_CACHE_MASK;
222 }
223 } 240 }
224 return 0; 241 return 0;
225 } 242 }
226 243
227 /* Low ISA region is always mapped WB in page table. No need to track */ 244 /* Low ISA region is always mapped WB in page table. No need to track */
228 if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) { 245 if (is_ISA_range(start, end - 1)) {
229 if (ret_type) 246 if (new_type)
230 *ret_type = _PAGE_CACHE_WB; 247 *new_type = _PAGE_CACHE_WB;
231
232 return 0; 248 return 0;
233 } 249 }
234 250
@@ -241,206 +257,92 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
241 */ 257 */
242 u8 mtrr_type = mtrr_type_lookup(start, end); 258 u8 mtrr_type = mtrr_type_lookup(start, end);
243 259
244 if (mtrr_type == MTRR_TYPE_WRBACK) { 260 if (mtrr_type == MTRR_TYPE_WRBACK)
245 req_type = _PAGE_CACHE_WB;
246 actual_type = _PAGE_CACHE_WB; 261 actual_type = _PAGE_CACHE_WB;
247 } else { 262 else
248 req_type = _PAGE_CACHE_UC_MINUS;
249 actual_type = _PAGE_CACHE_UC_MINUS; 263 actual_type = _PAGE_CACHE_UC_MINUS;
250 } 264 } else
251 } else { 265 actual_type = pat_x_mtrr_type(start, end,
252 req_type &= _PAGE_CACHE_MASK; 266 req_type & _PAGE_CACHE_MASK);
253 err = pat_x_mtrr_type(start, end, req_type, &actual_type);
254 }
255
256 if (err) {
257 if (ret_type)
258 *ret_type = actual_type;
259
260 return -EINVAL;
261 }
262 267
263 new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); 268 new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
264 if (!new_entry) 269 if (!new)
265 return -ENOMEM; 270 return -ENOMEM;
266 271
267 new_entry->start = start; 272 new->start = start;
268 new_entry->end = end; 273 new->end = end;
269 new_entry->type = actual_type; 274 new->type = actual_type;
270 275
271 if (ret_type) 276 if (new_type)
272 *ret_type = actual_type; 277 *new_type = actual_type;
273 278
274 spin_lock(&memtype_lock); 279 spin_lock(&memtype_lock);
275 280
276 /* Search for existing mapping that overlaps the current range */ 281 /* Search for existing mapping that overlaps the current range */
277 list_for_each_entry(parse, &memtype_list, nd) { 282 where = NULL;
278 struct memtype *saved_ptr; 283 list_for_each_entry(entry, &memtype_list, nd) {
279 284 if (end <= entry->start) {
280 if (parse->start >= end) { 285 where = entry->nd.prev;
281 pr_debug("New Entry\n");
282 list_add(&new_entry->nd, parse->nd.prev);
283 new_entry = NULL;
284 break; 286 break;
285 } 287 } else if (start <= entry->start) { /* end > entry->start */
286 288 err = chk_conflict(new, entry, new_type);
287 if (start <= parse->start && end >= parse->start) { 289 if (!err) {
288 if (actual_type != parse->type && ret_type) { 290 dprintk("Overlap at 0x%Lx-0x%Lx\n",
289 actual_type = parse->type; 291 entry->start, entry->end);
290 *ret_type = actual_type; 292 where = entry->nd.prev;
291 new_entry->type = actual_type;
292 }
293
294 if (actual_type != parse->type) {
295 printk(
296 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
297 current->comm, current->pid,
298 start, end,
299 cattr_name(actual_type),
300 cattr_name(parse->type));
301 err = -EBUSY;
302 break;
303 } 293 }
304
305 saved_ptr = parse;
306 /*
307 * Check to see whether the request overlaps more
308 * than one entry in the list
309 */
310 list_for_each_entry_continue(parse, &memtype_list, nd) {
311 if (end <= parse->start) {
312 break;
313 }
314
315 if (actual_type != parse->type) {
316 printk(
317 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
318 current->comm, current->pid,
319 start, end,
320 cattr_name(actual_type),
321 cattr_name(parse->type));
322 err = -EBUSY;
323 break;
324 }
325 }
326
327 if (err) {
328 break;
329 }
330
331 pr_debug("Overlap at 0x%Lx-0x%Lx\n",
332 saved_ptr->start, saved_ptr->end);
333 /* No conflict. Go ahead and add this new entry */
334 list_add(&new_entry->nd, saved_ptr->nd.prev);
335 new_entry = NULL;
336 break; 294 break;
337 } 295 } else if (start < entry->end) { /* start > entry->start */
338 296 err = chk_conflict(new, entry, new_type);
339 if (start < parse->end) { 297 if (!err) {
340 if (actual_type != parse->type && ret_type) { 298 dprintk("Overlap at 0x%Lx-0x%Lx\n",
341 actual_type = parse->type; 299 entry->start, entry->end);
342 *ret_type = actual_type; 300 where = &entry->nd;
343 new_entry->type = actual_type;
344 }
345
346 if (actual_type != parse->type) {
347 printk(
348 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
349 current->comm, current->pid,
350 start, end,
351 cattr_name(actual_type),
352 cattr_name(parse->type));
353 err = -EBUSY;
354 break;
355 }
356
357 saved_ptr = parse;
358 /*
359 * Check to see whether the request overlaps more
360 * than one entry in the list
361 */
362 list_for_each_entry_continue(parse, &memtype_list, nd) {
363 if (end <= parse->start) {
364 break;
365 }
366
367 if (actual_type != parse->type) {
368 printk(
369 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
370 current->comm, current->pid,
371 start, end,
372 cattr_name(actual_type),
373 cattr_name(parse->type));
374 err = -EBUSY;
375 break;
376 }
377 }
378
379 if (err) {
380 break;
381 } 301 }
382
383 pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
384 saved_ptr->start, saved_ptr->end);
385 /* No conflict. Go ahead and add this new entry */
386 list_add(&new_entry->nd, &saved_ptr->nd);
387 new_entry = NULL;
388 break; 302 break;
389 } 303 }
390 } 304 }
391 305
392 if (err) { 306 if (err) {
393 printk(KERN_INFO 307 printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
394 "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", 308 "track %s, req %s\n",
395 start, end, cattr_name(new_entry->type), 309 start, end, cattr_name(new->type), cattr_name(req_type));
396 cattr_name(req_type)); 310 kfree(new);
397 kfree(new_entry);
398 spin_unlock(&memtype_lock); 311 spin_unlock(&memtype_lock);
399 return err; 312 return err;
400 } 313 }
401 314
402 if (new_entry) { 315 if (where)
403 /* No conflict. Not yet added to the list. Add to the tail */ 316 list_add(&new->nd, where);
404 list_add_tail(&new_entry->nd, &memtype_list); 317 else
405 pr_debug("New Entry\n"); 318 list_add_tail(&new->nd, &memtype_list);
406 }
407
408 if (ret_type) {
409 pr_debug(
410 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
411 start, end, cattr_name(actual_type),
412 cattr_name(req_type), cattr_name(*ret_type));
413 } else {
414 pr_debug(
415 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
416 start, end, cattr_name(actual_type),
417 cattr_name(req_type));
418 }
419 319
420 spin_unlock(&memtype_lock); 320 spin_unlock(&memtype_lock);
321
322 dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
323 start, end, cattr_name(new->type), cattr_name(req_type),
324 new_type ? cattr_name(*new_type) : "-");
325
421 return err; 326 return err;
422} 327}
423 328
424int free_memtype(u64 start, u64 end) 329int free_memtype(u64 start, u64 end)
425{ 330{
426 struct memtype *ml; 331 struct memtype *entry;
427 int err = -EINVAL; 332 int err = -EINVAL;
428 333
429 /* Only track when pat_wc_enabled */ 334 if (!pat_enabled)
430 if (!pat_wc_enabled) {
431 return 0; 335 return 0;
432 }
433 336
434 /* Low ISA region is always mapped WB. No need to track */ 337 /* Low ISA region is always mapped WB. No need to track */
435 if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) { 338 if (is_ISA_range(start, end - 1))
436 return 0; 339 return 0;
437 }
438 340
439 spin_lock(&memtype_lock); 341 spin_lock(&memtype_lock);
440 list_for_each_entry(ml, &memtype_list, nd) { 342 list_for_each_entry(entry, &memtype_list, nd) {
441 if (ml->start == start && ml->end == end) { 343 if (entry->start == start && entry->end == end) {
442 list_del(&ml->nd); 344 list_del(&entry->nd);
443 kfree(ml); 345 kfree(entry);
444 err = 0; 346 err = 0;
445 break; 347 break;
446 } 348 }
@@ -452,7 +354,7 @@ int free_memtype(u64 start, u64 end)
452 current->comm, current->pid, start, end); 354 current->comm, current->pid, start, end);
453 } 355 }
454 356
455 pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end); 357 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
456 return err; 358 return err;
457} 359}
458 360
@@ -521,12 +423,12 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
521 * caching for the high addresses through the KEN pin, but 423 * caching for the high addresses through the KEN pin, but
522 * we maintain the tradition of paranoia in this code. 424 * we maintain the tradition of paranoia in this code.
523 */ 425 */
524 if (!pat_wc_enabled && 426 if (!pat_enabled &&
525 ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || 427 !(boot_cpu_has(X86_FEATURE_MTRR) ||
526 test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || 428 boot_cpu_has(X86_FEATURE_K6_MTRR) ||
527 test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || 429 boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
528 test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) && 430 boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
529 (pfn << PAGE_SHIFT) >= __pa(high_memory)) { 431 (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
530 flags = _PAGE_CACHE_UC; 432 flags = _PAGE_CACHE_UC;
531 } 433 }
532#endif 434#endif
@@ -548,7 +450,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
548 return 0; 450 return 0;
549 451
550 if (pfn <= max_pfn_mapped && 452 if (pfn <= max_pfn_mapped &&
551 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { 453 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
552 free_memtype(offset, offset + size); 454 free_memtype(offset, offset + size);
553 printk(KERN_INFO 455 printk(KERN_INFO
554 "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", 456 "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
@@ -586,4 +488,3 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
586 488
587 free_memtype(addr, addr + size); 489 free_memtype(addr, addr + size);
588} 490}
589
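
To make the reserve_memtype()/chk_conflict() rewrite above easier to follow, here is a small user-space sketch of the same bookkeeping, assuming a plain singly linked list is a fair stand-in for memtype_list (struct range and reserve_range are invented names, and locking is omitted): the list is kept sorted by start address, every entry overlapping the request must carry the same type, and the new entry is linked in at its sorted position.

#include <stdio.h>
#include <stdlib.h>

struct range {
	unsigned long long start, end;	/* end is exclusive */
	int type;
	struct range *next;
};

static struct range *head;		/* kept sorted by ->start */

/* 0 on success, -1 on a type conflict or allocation failure */
static int reserve_range(unsigned long long start, unsigned long long end,
			 int type)
{
	struct range **link, *cur, *new;

	/* every entry overlapping [start, end) must have the same type */
	for (cur = head; cur && cur->start < end; cur = cur->next)
		if (cur->end > start && cur->type != type)
			return -1;	/* chk_conflict() would return -EBUSY here */

	new = malloc(sizeof(*new));
	if (!new)
		return -1;
	new->start = start;
	new->end = end;
	new->type = type;

	/* link in at the position that keeps the list sorted by start */
	for (link = &head; *link && (*link)->start < start; link = &(*link)->next)
		;
	new->next = *link;
	*link = new;
	return 0;
}

int main(void)
{
	printf("%d\n", reserve_range(0x1000, 0x2000, 1));	/*  0 */
	printf("%d\n", reserve_range(0x1800, 0x3000, 1));	/*  0: overlap, same type */
	printf("%d\n", reserve_range(0x2800, 0x3800, 2));	/* -1: overlap, conflicting type */
	return 0;
}
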
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 50159764f694..557b2abceef8 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -2,6 +2,7 @@
2#include <asm/pgalloc.h> 2#include <asm/pgalloc.h>
3#include <asm/pgtable.h> 3#include <asm/pgtable.h>
4#include <asm/tlb.h> 4#include <asm/tlb.h>
5#include <asm/fixmap.h>
5 6
6pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 7pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
7{ 8{
@@ -65,12 +66,6 @@ static inline void pgd_list_del(pgd_t *pgd)
65static void pgd_ctor(void *p) 66static void pgd_ctor(void *p)
66{ 67{
67 pgd_t *pgd = p; 68 pgd_t *pgd = p;
68 unsigned long flags;
69
70 /* Clear usermode parts of PGD */
71 memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
72
73 spin_lock_irqsave(&pgd_lock, flags);
74 69
75 /* If the pgd points to a shared pagetable level (either the 70 /* If the pgd points to a shared pagetable level (either the
76 ptes in non-PAE, or shared PMD in PAE), then just copy the 71 ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -90,8 +85,6 @@ static void pgd_ctor(void *p)
90 /* list required to sync kernel mapping updates */ 85 /* list required to sync kernel mapping updates */
91 if (!SHARED_KERNEL_PMD) 86 if (!SHARED_KERNEL_PMD)
92 pgd_list_add(pgd); 87 pgd_list_add(pgd);
93
94 spin_unlock_irqrestore(&pgd_lock, flags);
95} 88}
96 89
97static void pgd_dtor(void *pgd) 90static void pgd_dtor(void *pgd)
@@ -119,6 +112,72 @@ static void pgd_dtor(void *pgd)
119 112
120#ifdef CONFIG_X86_PAE 113#ifdef CONFIG_X86_PAE
121/* 114/*
115 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
116 * updating the top-level pagetable entries to guarantee the
117 * processor notices the update. Since this is expensive, and
118 * all 4 top-level entries are used almost immediately in a
119 * new process's life, we just pre-populate them here.
120 *
121 * Also, if we're in a paravirt environment where the kernel pmd is
122 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
123 * and initialize the kernel pmds here.
124 */
125#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
126
127void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
128{
129 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
130
131 /* Note: almost everything apart from _PAGE_PRESENT is
132 reserved at the pmd (PDPT) level. */
133 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
134
135 /*
136 * According to Intel App note "TLBs, Paging-Structure Caches,
137 * and Their Invalidation", April 2007, document 317080-001,
138 * section 8.1: in PAE mode we explicitly have to flush the
139 * TLB via cr3 if the top-level pgd is changed...
140 */
141 if (mm == current->active_mm)
142 write_cr3(read_cr3());
143}
144#else /* !CONFIG_X86_PAE */
145
146/* No need to prepopulate any pagetable entries in non-PAE modes. */
147#define PREALLOCATED_PMDS 0
148
149#endif /* CONFIG_X86_PAE */
150
151static void free_pmds(pmd_t *pmds[])
152{
153 int i;
154
155 for(i = 0; i < PREALLOCATED_PMDS; i++)
156 if (pmds[i])
157 free_page((unsigned long)pmds[i]);
158}
159
160static int preallocate_pmds(pmd_t *pmds[])
161{
162 int i;
163 bool failed = false;
164
165 for(i = 0; i < PREALLOCATED_PMDS; i++) {
166 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
167 if (pmd == NULL)
168 failed = true;
169 pmds[i] = pmd;
170 }
171
172 if (failed) {
173 free_pmds(pmds);
174 return -ENOMEM;
175 }
176
177 return 0;
178}
179
180/*
122 * Mop up any pmd pages which may still be attached to the pgd. 181 * Mop up any pmd pages which may still be attached to the pgd.
123 * Normally they will be freed by munmap/exit_mmap, but any pmd we 182 * Normally they will be freed by munmap/exit_mmap, but any pmd we
124 * preallocate which never got a corresponding vma will need to be 183 * preallocate which never got a corresponding vma will need to be
@@ -128,7 +187,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
128{ 187{
129 int i; 188 int i;
130 189
131 for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { 190 for(i = 0; i < PREALLOCATED_PMDS; i++) {
132 pgd_t pgd = pgdp[i]; 191 pgd_t pgd = pgdp[i];
133 192
134 if (pgd_val(pgd) != 0) { 193 if (pgd_val(pgd) != 0) {
@@ -142,32 +201,17 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
142 } 201 }
143} 202}
144 203
145/* 204static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
146 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
147 * updating the top-level pagetable entries to guarantee the
148 * processor notices the update. Since this is expensive, and
149 * all 4 top-level entries are used almost immediately in a
150 * new process's life, we just pre-populate them here.
151 *
152 * Also, if we're in a paravirt environment where the kernel pmd is
153 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
154 * and initialize the kernel pmds here.
155 */
156static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
157{ 205{
158 pud_t *pud; 206 pud_t *pud;
159 unsigned long addr; 207 unsigned long addr;
160 int i; 208 int i;
161 209
162 pud = pud_offset(pgd, 0); 210 pud = pud_offset(pgd, 0);
163 for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
164 i++, pud++, addr += PUD_SIZE) {
165 pmd_t *pmd = pmd_alloc_one(mm, addr);
166 211
167 if (!pmd) { 212 for (addr = i = 0; i < PREALLOCATED_PMDS;
168 pgd_mop_up_pmds(mm, pgd); 213 i++, pud++, addr += PUD_SIZE) {
169 return 0; 214 pmd_t *pmd = pmds[i];
170 }
171 215
172 if (i >= KERNEL_PGD_BOUNDARY) 216 if (i >= KERNEL_PGD_BOUNDARY)
173 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), 217 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
@@ -175,61 +219,54 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
175 219
176 pud_populate(mm, pud, pmd); 220 pud_populate(mm, pud, pmd);
177 } 221 }
178
179 return 1;
180} 222}
181 223
182void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) 224pgd_t *pgd_alloc(struct mm_struct *mm)
183{ 225{
184 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); 226 pgd_t *pgd;
227 pmd_t *pmds[PREALLOCATED_PMDS];
228 unsigned long flags;
185 229
186 /* Note: almost everything apart from _PAGE_PRESENT is 230 pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
187 reserved at the pmd (PDPT) level. */
188 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
189 231
190 /* 232 if (pgd == NULL)
191 * According to Intel App note "TLBs, Paging-Structure Caches, 233 goto out;
192 * and Their Invalidation", April 2007, document 317080-001,
193 * section 8.1: in PAE mode we explicitly have to flush the
194 * TLB via cr3 if the top-level pgd is changed...
195 */
196 if (mm == current->active_mm)
197 write_cr3(read_cr3());
198}
199#else /* !CONFIG_X86_PAE */
200/* No need to prepopulate any pagetable entries in non-PAE modes. */
201static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
202{
203 return 1;
204}
205 234
206static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd) 235 mm->pgd = pgd;
207{
208}
209#endif /* CONFIG_X86_PAE */
210 236
211pgd_t *pgd_alloc(struct mm_struct *mm) 237 if (preallocate_pmds(pmds) != 0)
212{ 238 goto out_free_pgd;
213 pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
214 239
215 /* so that alloc_pmd can use it */ 240 if (paravirt_pgd_alloc(mm) != 0)
216 mm->pgd = pgd; 241 goto out_free_pmds;
217 if (pgd)
218 pgd_ctor(pgd);
219 242
220 if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { 243 /*
221 pgd_dtor(pgd); 244 * Make sure that pre-populating the pmds is atomic with
222 free_page((unsigned long)pgd); 245 * respect to anything walking the pgd_list, so that they
223 pgd = NULL; 246 * never see a partially populated pgd.
224 } 247 */
248 spin_lock_irqsave(&pgd_lock, flags);
249
250 pgd_ctor(pgd);
251 pgd_prepopulate_pmd(mm, pgd, pmds);
252
253 spin_unlock_irqrestore(&pgd_lock, flags);
225 254
226 return pgd; 255 return pgd;
256
257out_free_pmds:
258 free_pmds(pmds);
259out_free_pgd:
260 free_page((unsigned long)pgd);
261out:
262 return NULL;
227} 263}
228 264
229void pgd_free(struct mm_struct *mm, pgd_t *pgd) 265void pgd_free(struct mm_struct *mm, pgd_t *pgd)
230{ 266{
231 pgd_mop_up_pmds(mm, pgd); 267 pgd_mop_up_pmds(mm, pgd);
232 pgd_dtor(pgd); 268 pgd_dtor(pgd);
269 paravirt_pgd_free(mm, pgd);
233 free_page((unsigned long)pgd); 270 free_page((unsigned long)pgd);
234} 271}
235 272
@@ -255,7 +292,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
255 292
256 if (pte_young(*ptep)) 293 if (pte_young(*ptep))
257 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, 294 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
258 &ptep->pte); 295 (unsigned long *) &ptep->pte);
259 296
260 if (ret) 297 if (ret)
261 pte_update(vma->vm_mm, addr, ptep); 298 pte_update(vma->vm_mm, addr, ptep);
@@ -274,3 +311,22 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
274 311
275 return young; 312 return young;
276} 313}
314
315int fixmaps_set;
316
317void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
318{
319 unsigned long address = __fix_to_virt(idx);
320
321 if (idx >= __end_of_fixed_addresses) {
322 BUG();
323 return;
324 }
325 set_pte_vaddr(address, pte);
326 fixmaps_set++;
327}
328
329void native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
330{
331 __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
332}
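
The reworked pgd_alloc() above follows an allocate-first, publish-under-lock pattern: everything that can fail (the pgd page, the preallocated pmds, the paravirt hook) is done before pgd_lock is taken, and pgd_ctor() plus pgd_prepopulate_pmd() then run inside the lock so that pgd_list walkers never see a half-built pgd. A user-space sketch of that shape, with invented names (table_alloc, published, NPARTS) and a pthread mutex standing in for the spinlock:

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

#define NPARTS 4

struct table {
	void *parts[NPARTS];
	struct table *next;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct table *published;		/* list walked by other threads */

static struct table *table_alloc(void)
{
	struct table *t;
	void *parts[NPARTS];
	int i;

	t = calloc(1, sizeof(*t));
	if (!t)
		goto out;

	/* everything that can fail is allocated before taking the lock */
	for (i = 0; i < NPARTS; i++) {
		parts[i] = calloc(1, 4096);
		if (!parts[i])
			goto out_free_parts;
	}

	/* publication happens atomically with respect to anyone walking
	 * the list, so a walker never observes a table with missing parts */
	pthread_mutex_lock(&table_lock);
	for (i = 0; i < NPARTS; i++)
		t->parts[i] = parts[i];
	t->next = published;
	published = t;
	pthread_mutex_unlock(&table_lock);

	return t;

out_free_parts:
	while (i--)
		free(parts[i]);
	free(t);
out:
	return NULL;
}

int main(void)
{
	printf("%s\n", table_alloc() ? "allocated" : "failed");
	return 0;
}
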
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 369cf065b6a4..b4becbf8c570 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -71,7 +71,7 @@ void show_mem(void)
71 * Associate a virtual page frame with a given physical page frame 71 * Associate a virtual page frame with a given physical page frame
72 * and protection flags for that frame. 72 * and protection flags for that frame.
73 */ 73 */
74static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) 74void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
75{ 75{
76 pgd_t *pgd; 76 pgd_t *pgd;
77 pud_t *pud; 77 pud_t *pud;
@@ -94,8 +94,8 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
94 return; 94 return;
95 } 95 }
96 pte = pte_offset_kernel(pmd, vaddr); 96 pte = pte_offset_kernel(pmd, vaddr);
97 if (pgprot_val(flags)) 97 if (pte_val(pteval))
98 set_pte_present(&init_mm, vaddr, pte, pfn_pte(pfn, flags)); 98 set_pte_present(&init_mm, vaddr, pte, pteval);
99 else 99 else
100 pte_clear(&init_mm, vaddr, pte); 100 pte_clear(&init_mm, vaddr, pte);
101 101
@@ -141,22 +141,9 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
141 __flush_tlb_one(vaddr); 141 __flush_tlb_one(vaddr);
142} 142}
143 143
144static int fixmaps;
145unsigned long __FIXADDR_TOP = 0xfffff000; 144unsigned long __FIXADDR_TOP = 0xfffff000;
146EXPORT_SYMBOL(__FIXADDR_TOP); 145EXPORT_SYMBOL(__FIXADDR_TOP);
147 146
148void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
149{
150 unsigned long address = __fix_to_virt(idx);
151
152 if (idx >= __end_of_fixed_addresses) {
153 BUG();
154 return;
155 }
156 set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
157 fixmaps++;
158}
159
160/** 147/**
161 * reserve_top_address - reserves a hole in the top of kernel address space 148 * reserve_top_address - reserves a hole in the top of kernel address space
162 * @reserve - size of hole to reserve 149 * @reserve - size of hole to reserve
@@ -164,11 +151,44 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
164 * Can be used to relocate the fixmap area and poke a hole in the top 151 * Can be used to relocate the fixmap area and poke a hole in the top
165 * of kernel address space to make room for a hypervisor. 152 * of kernel address space to make room for a hypervisor.
166 */ 153 */
167void reserve_top_address(unsigned long reserve) 154void __init reserve_top_address(unsigned long reserve)
168{ 155{
169 BUG_ON(fixmaps > 0); 156 BUG_ON(fixmaps_set > 0);
170 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", 157 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
171 (int)-reserve); 158 (int)-reserve);
172 __FIXADDR_TOP = -reserve - PAGE_SIZE; 159 __FIXADDR_TOP = -reserve - PAGE_SIZE;
173 __VMALLOC_RESERVE += reserve; 160 __VMALLOC_RESERVE += reserve;
174} 161}
162
163/*
164 * vmalloc=size forces the vmalloc area to be exactly 'size'
165 * bytes. This can be used to increase (or decrease) the
166 * vmalloc area - the default is 128m.
167 */
168static int __init parse_vmalloc(char *arg)
169{
170 if (!arg)
171 return -EINVAL;
172
173 __VMALLOC_RESERVE = memparse(arg, &arg);
174 return 0;
175}
176early_param("vmalloc", parse_vmalloc);
177
178/*
179 * reservetop=size reserves a hole at the top of the kernel address space which
180 * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
181 * so relocating the fixmap can be done before paging initialization.
182 */
183static int __init parse_reservetop(char *arg)
184{
185 unsigned long address;
186
187 if (!arg)
188 return -EINVAL;
189
190 address = memparse(arg, &arg);
191 reserve_top_address(address);
192 return 0;
193}
194early_param("reservetop", parse_reservetop);
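
Both early parameters above lean on memparse(), which accepts a plain number (decimal or hex) with an optional K/M/G suffix, e.g. vmalloc=192m or reservetop=0x1000000. A rough user-space equivalent, for illustration only (parse_size is not a kernel function):

#include <stdio.h>
#include <stdlib.h>

/* simplified stand-in for the kernel's memparse(): number + optional suffix */
static unsigned long long parse_size(const char *s)
{
	char *end;
	unsigned long long val = strtoull(s, &end, 0);

	switch (*end) {
	case 'G': case 'g': val <<= 10;	/* fall through */
	case 'M': case 'm': val <<= 10;	/* fall through */
	case 'K': case 'k': val <<= 10;
	default: break;
	}
	return val;
}

int main(void)
{
	printf("%llu\n", parse_size("192m"));		/* 201326592 */
	printf("%llu\n", parse_size("0x1000000"));	/* 16777216  */
	return 0;
}
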
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
new file mode 100644
index 000000000000..f41d67f8f831
--- /dev/null
+++ b/arch/x86/mm/srat_32.c
@@ -0,0 +1,280 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/mmzone.h>
29#include <linux/acpi.h>
30#include <linux/nodemask.h>
31#include <asm/srat.h>
32#include <asm/topology.h>
33#include <asm/smp.h>
34#include <asm/e820.h>
35
36/*
37 * proximity macros and definitions
38 */
39#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
40#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
41#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
42#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
43/* bitmap length; _PXM is at most 255 */
44#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
45static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
46
47#define MAX_CHUNKS_PER_NODE 3
48#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
49struct node_memory_chunk_s {
50 unsigned long start_pfn;
51 unsigned long end_pfn;
52 u8 pxm; // proximity domain of node
53 u8 nid; // which cnode contains this chunk?
54 u8 bank; // which mem bank on this node
55};
56static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
57
58static int __initdata num_memory_chunks; /* total number of memory chunks */
59static u8 __initdata apicid_to_pxm[MAX_APICID];
60
61int numa_off __initdata;
62int acpi_numa __initdata;
63
64static __init void bad_srat(void)
65{
66 printk(KERN_ERR "SRAT: SRAT not used.\n");
67 acpi_numa = -1;
68 num_memory_chunks = 0;
69}
70
71static __init inline int srat_disabled(void)
72{
73 return numa_off || acpi_numa < 0;
74}
75
76/* Identify CPU proximity domains */
77void __init
78acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
79{
80 if (srat_disabled())
81 return;
82 if (cpu_affinity->header.length !=
83 sizeof(struct acpi_srat_cpu_affinity)) {
84 bad_srat();
85 return;
86 }
87
88 if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
89 return; /* empty entry */
90
91 /* mark this node as "seen" in node bitmap */
92 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
93
94 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
95
96 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
97 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
98}
99
100/*
101 * Identify memory proximity domains and hot-remove capabilities.
102 * Fill node memory chunk list structure.
103 */
104void __init
105acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
106{
107 unsigned long long paddr, size;
108 unsigned long start_pfn, end_pfn;
109 u8 pxm;
110 struct node_memory_chunk_s *p, *q, *pend;
111
112 if (srat_disabled())
113 return;
114 if (memory_affinity->header.length !=
115 sizeof(struct acpi_srat_mem_affinity)) {
116 bad_srat();
117 return;
118 }
119
120 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
121 return; /* empty entry */
122
123 pxm = memory_affinity->proximity_domain & 0xff;
124
125 /* mark this node as "seen" in node bitmap */
126 BMAP_SET(pxm_bitmap, pxm);
127
128 /* calculate info for memory chunk structure */
129 paddr = memory_affinity->base_address;
130 size = memory_affinity->length;
131
132 start_pfn = paddr >> PAGE_SHIFT;
133 end_pfn = (paddr + size) >> PAGE_SHIFT;
134
135
136 if (num_memory_chunks >= MAXCHUNKS) {
137 printk(KERN_WARNING "Too many mem chunks in SRAT."
138 " Ignoring %lld MBytes at %llx\n",
139 size/(1024*1024), paddr);
140 return;
141 }
142
143 /* Insertion sort based on base address */
144 pend = &node_memory_chunk[num_memory_chunks];
145 for (p = &node_memory_chunk[0]; p < pend; p++) {
146 if (start_pfn < p->start_pfn)
147 break;
148 }
149 if (p < pend) {
150 for (q = pend; q >= p; q--)
151 *(q + 1) = *q;
152 }
153 p->start_pfn = start_pfn;
154 p->end_pfn = end_pfn;
155 p->pxm = pxm;
156
157 num_memory_chunks++;
158
159 printk(KERN_DEBUG "Memory range %08lx to %08lx (type %x)"
160 " in proximity domain %02x %s\n",
161 start_pfn, end_pfn,
162 memory_affinity->memory_type,
163 pxm,
164 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
165 "enabled and removable" : "enabled" ) );
166}
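
The insertion-sort step in acpi_numa_memory_affinity_init() above keeps node_memory_chunk[] ordered by base address as SRAT entries arrive. A stand-alone sketch of the same idea with simplified, invented names (struct chunk, insert_chunk); the bounds check is assumed to have happened already, as it does via MAXCHUNKS above:

#include <stdio.h>

struct chunk {
	unsigned long start_pfn, end_pfn;
};

static struct chunk chunks[16];
static int nr_chunks;

/* slide larger entries right, drop the new chunk into its sorted slot */
static void insert_chunk(unsigned long start_pfn, unsigned long end_pfn)
{
	int i = nr_chunks;

	while (i > 0 && chunks[i - 1].start_pfn > start_pfn) {
		chunks[i] = chunks[i - 1];
		i--;
	}
	chunks[i].start_pfn = start_pfn;
	chunks[i].end_pfn = end_pfn;
	nr_chunks++;
}

int main(void)
{
	int i;

	insert_chunk(0x40000, 0x80000);
	insert_chunk(0x00000, 0x40000);
	for (i = 0; i < nr_chunks; i++)
		printf("%05lx-%05lx\n", chunks[i].start_pfn, chunks[i].end_pfn);
	return 0;
}
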
167
168/* Callback for SLIT parsing */
169void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
170{
171}
172
173void acpi_numa_arch_fixup(void)
174{
175}
176/*
177 * The SRAT table always lists ascending addresses, so can always
178 * assume that the first "start" address that you see is the real
179 * start of the node, and that the current "end" address is after
180 * the previous one.
181 */
182static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
183{
184 /*
185 * Only add present memory as told by the e820.
186 * There is no guarantee from the SRAT that the memory it
187 * enumerates is present at boot time because it represents
188 * *possible* memory hotplug areas the same as normal RAM.
189 */
190 if (memory_chunk->start_pfn >= max_pfn) {
191 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
192 memory_chunk->start_pfn, memory_chunk->end_pfn);
193 return;
194 }
195 if (memory_chunk->nid != nid)
196 return;
197
198 if (!node_has_online_mem(nid))
199 node_start_pfn[nid] = memory_chunk->start_pfn;
200
201 if (node_start_pfn[nid] > memory_chunk->start_pfn)
202 node_start_pfn[nid] = memory_chunk->start_pfn;
203
204 if (node_end_pfn[nid] < memory_chunk->end_pfn)
205 node_end_pfn[nid] = memory_chunk->end_pfn;
206}
207
208int __init get_memcfg_from_srat(void)
209{
210 int i, j, nid;
211
212
213 if (srat_disabled())
214 goto out_fail;
215
216 if (num_memory_chunks == 0) {
217 printk(KERN_WARNING
218 "could not finy any ACPI SRAT memory areas.\n");
219 goto out_fail;
220 }
221
222 /* Calculate total number of nodes in system from PXM bitmap and create
223 * a set of sequential node IDs starting at zero. (ACPI doesn't seem
224 * to specify the range of _PXM values.)
225 */
226 /*
227 * MCD - we no longer HAVE to number nodes sequentially. PXM domain
228 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
229 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
230 * approaches MAX_PXM_DOMAINS for i386.
231 */
232 nodes_clear(node_online_map);
233 for (i = 0; i < MAX_PXM_DOMAINS; i++) {
234 if (BMAP_TEST(pxm_bitmap, i)) {
235 int nid = acpi_map_pxm_to_node(i);
236 node_set_online(nid);
237 }
238 }
239 BUG_ON(num_online_nodes() == 0);
240
241 /* set cnode id in memory chunk structure */
242 for (i = 0; i < num_memory_chunks; i++)
243 node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
244
245 printk(KERN_DEBUG "pxm bitmap: ");
246 for (i = 0; i < sizeof(pxm_bitmap); i++) {
247 printk(KERN_CONT "%02x ", pxm_bitmap[i]);
248 }
249 printk(KERN_CONT "\n");
250 printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
251 num_online_nodes());
252 printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
253 num_memory_chunks);
254
255 for (i = 0; i < MAX_APICID; i++)
256 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
257
258 for (j = 0; j < num_memory_chunks; j++){
259 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
260 printk(KERN_DEBUG
261 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
262 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
263 node_read_chunk(chunk->nid, chunk);
264 e820_register_active_regions(chunk->nid, chunk->start_pfn,
265 min(chunk->end_pfn, max_pfn));
266 }
267
268 for_each_online_node(nid) {
269 unsigned long start = node_start_pfn[nid];
270 unsigned long end = min(node_end_pfn[nid], max_pfn);
271
272 memory_present(nid, start, end);
273 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
274 }
275 return 1;
276out_fail:
277 printk(KERN_ERR "failed to get NUMA memory information from SRAT"
278 " table\n");
279 return 0;
280}
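
The pxm_bitmap handling above records each proximity domain seen in the SRAT as one bit (eight domains per byte) and later hands the set bits to acpi_map_pxm_to_node() to obtain compact node ids. A small stand-alone sketch: the BMAP_* macros are copied from the file, MAX_PXM_DOMAINS = 256 is assumed from the "_PXM is at most 255" comment, and the sequential numbering loop is only illustrative.

#include <stdio.h>

#define MAX_PXM_DOMAINS		256	/* _PXM is at most 255 */
#define NODE_ARRAY_INDEX(x)	((x) / 8)
#define NODE_ARRAY_OFFSET(x)	((x) % 8)
#define BMAP_SET(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
#define BMAP_TEST(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))

int main(void)
{
	unsigned char pxm_bitmap[MAX_PXM_DOMAINS / 8] = { 0 };
	int pxm, next_nid = 0;

	/* pretend the SRAT listed proximity domains 0 and 3 */
	BMAP_SET(pxm_bitmap, 0);
	BMAP_SET(pxm_bitmap, 3);

	/* hand out sequential node ids for every domain that was seen,
	 * much as get_memcfg_from_srat() does via acpi_map_pxm_to_node() */
	for (pxm = 0; pxm < MAX_PXM_DOMAINS; pxm++)
		if (BMAP_TEST(pxm_bitmap, pxm))
			printf("pxm %d -> nid %d\n", pxm, next_nid++);
	return 0;
}
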
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 99649dccad28..0fd67b81a8b6 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -299,7 +299,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
299 pxmram = 0; 299 pxmram = 0;
300 } 300 }
301 301
302 e820ram = end_pfn - absent_pages_in_range(0, end_pfn); 302 e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
303 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ 303 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
304 if ((long)(e820ram - pxmram) >= 1*1024*1024) { 304 if ((long)(e820ram - pxmram) >= 1*1024*1024) {
305 printk(KERN_ERR 305 printk(KERN_ERR
@@ -376,7 +376,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
376 if (node == NUMA_NO_NODE) 376 if (node == NUMA_NO_NODE)
377 continue; 377 continue;
378 if (!node_isset(node, node_possible_map)) 378 if (!node_isset(node, node_possible_map))
379 numa_set_node(i, NUMA_NO_NODE); 379 numa_clear_node(i);
380 } 380 }
381 numa_init_array(); 381 numa_init_array();
382 return 0; 382 return 0;
@@ -495,6 +495,7 @@ int __node_distance(int a, int b)
495 495
496EXPORT_SYMBOL(__node_distance); 496EXPORT_SYMBOL(__node_distance);
497 497
498#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
498int memory_add_physaddr_to_nid(u64 start) 499int memory_add_physaddr_to_nid(u64 start)
499{ 500{
500 int i, ret = 0; 501 int i, ret = 0;
@@ -506,4 +507,4 @@ int memory_add_physaddr_to_nid(u64 start)
506 return ret; 507 return ret;
507} 508}
508EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 509EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
509 510#endif