aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2008-07-14 16:43:24 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-07-14 16:43:24 -0400
commita3da5bf84a97d48cfaf66c6842470fc403da5121 (patch)
treecdf66c0cff8c61eedd60601fc9dffdd1ed39b880 /arch/x86/mm
parent3b23e665b68387f5ee7b21f7b75ceea4d9acae4a (diff)
parentd59fdcf2ac501de99c3dfb452af5e254d4342886 (diff)
Merge branch 'x86/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (821 commits) x86: make 64bit hpet_set_mapping to use ioremap too, v2 x86: get x86_phys_bits early x86: max_low_pfn_mapped fix #4 x86: change _node_to_cpumask_ptr to return const ptr x86: I/O APIC: remove an IRQ2-mask hack x86: fix numaq_tsc_disable calling x86, e820: remove end_user_pfn x86: max_low_pfn_mapped fix, #3 x86: max_low_pfn_mapped fix, #2 x86: max_low_pfn_mapped fix, #1 x86_64: fix delayed signals x86: remove conflicting nx6325 and nx6125 quirks x86: Recover timer_ack lost in the merge of the NMI watchdog x86: I/O APIC: Never configure IRQ2 x86: L-APIC: Always fully configure IRQ0 x86: L-APIC: Set IRQ0 as edge-triggered x86: merge dwarf2 headers x86: use AS_CFI instead of UNWIND_INFO x86: use ignore macro instead of hash comment x86: use matching CFI_ENDPROC ...
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile3
-rw-r--r--arch/x86/mm/discontig_32.c285
-rw-r--r--arch/x86/mm/dump_pagetables.c2
-rw-r--r--arch/x86/mm/fault.c97
-rw-r--r--arch/x86/mm/init_32.c523
-rw-r--r--arch/x86/mm/init_64.c552
-rw-r--r--arch/x86/mm/ioremap.c26
-rw-r--r--arch/x86/mm/k8topology_64.c21
-rw-r--r--arch/x86/mm/numa_64.c93
-rw-r--r--arch/x86/mm/pageattr-test.c21
-rw-r--r--arch/x86/mm/pageattr.c62
-rw-r--r--arch/x86/mm/pat.c378
-rw-r--r--arch/x86/mm/pgtable.c190
-rw-r--r--arch/x86/mm/pgtable_32.c56
-rw-r--r--arch/x86/mm/srat_32.c280
-rw-r--r--arch/x86/mm/srat_64.c21
16 files changed, 1710 insertions, 900 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index b7b3e4c7cfc9..c107641cd39b 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -13,5 +13,6 @@ obj-$(CONFIG_NUMA) += discontig_32.o
13else 13else
14obj-$(CONFIG_NUMA) += numa_64.o 14obj-$(CONFIG_NUMA) += numa_64.o
15obj-$(CONFIG_K8_NUMA) += k8topology_64.o 15obj-$(CONFIG_K8_NUMA) += k8topology_64.o
16obj-$(CONFIG_ACPI_NUMA) += srat_64.o
17endif 16endif
17obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
18
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 914ccf983687..5dfef9fa061a 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -38,6 +38,7 @@
38#include <asm/setup.h> 38#include <asm/setup.h>
39#include <asm/mmzone.h> 39#include <asm/mmzone.h>
40#include <asm/bios_ebda.h> 40#include <asm/bios_ebda.h>
41#include <asm/proto.h>
41 42
42struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 43struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
43EXPORT_SYMBOL(node_data); 44EXPORT_SYMBOL(node_data);
@@ -59,14 +60,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
59/* 60/*
60 * 4) physnode_map - the mapping between a pfn and owning node 61 * 4) physnode_map - the mapping between a pfn and owning node
61 * physnode_map keeps track of the physical memory layout of a generic 62 * physnode_map keeps track of the physical memory layout of a generic
62 * numa node on a 256Mb break (each element of the array will 63 * numa node on a 64Mb break (each element of the array will
63 * represent 256Mb of memory and will be marked by the node id. so, 64 * represent 64Mb of memory and will be marked by the node id. so,
64 * if the first gig is on node 0, and the second gig is on node 1 65 * if the first gig is on node 0, and the second gig is on node 1
65 * physnode_map will contain: 66 * physnode_map will contain:
66 * 67 *
67 * physnode_map[0-3] = 0; 68 * physnode_map[0-15] = 0;
68 * physnode_map[4-7] = 1; 69 * physnode_map[16-31] = 1;
69 * physnode_map[8- ] = -1; 70 * physnode_map[32- ] = -1;
70 */ 71 */
71s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1}; 72s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
72EXPORT_SYMBOL(physnode_map); 73EXPORT_SYMBOL(physnode_map);
@@ -75,15 +76,15 @@ void memory_present(int nid, unsigned long start, unsigned long end)
75{ 76{
76 unsigned long pfn; 77 unsigned long pfn;
77 78
78 printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n", 79 printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
79 nid, start, end); 80 nid, start, end);
80 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); 81 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
81 printk(KERN_DEBUG " "); 82 printk(KERN_DEBUG " ");
82 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { 83 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
83 physnode_map[pfn / PAGES_PER_ELEMENT] = nid; 84 physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
84 printk("%ld ", pfn); 85 printk(KERN_CONT "%lx ", pfn);
85 } 86 }
86 printk("\n"); 87 printk(KERN_CONT "\n");
87} 88}
88 89
89unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, 90unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
@@ -99,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
99#endif 100#endif
100 101
101extern unsigned long find_max_low_pfn(void); 102extern unsigned long find_max_low_pfn(void);
102extern void add_one_highpage_init(struct page *, int, int);
103extern unsigned long highend_pfn, highstart_pfn; 103extern unsigned long highend_pfn, highstart_pfn;
104 104
105#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) 105#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
@@ -117,13 +117,13 @@ static unsigned long kva_pages;
117 */ 117 */
118int __init get_memcfg_numa_flat(void) 118int __init get_memcfg_numa_flat(void)
119{ 119{
120 printk("NUMA - single node, flat memory mode\n"); 120 printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
121 121
122 /* Run the memory configuration and find the top of memory. */
123 propagate_e820_map();
124 node_start_pfn[0] = 0; 122 node_start_pfn[0] = 0;
125 node_end_pfn[0] = max_pfn; 123 node_end_pfn[0] = max_pfn;
124 e820_register_active_regions(0, 0, max_pfn);
126 memory_present(0, 0, max_pfn); 125 memory_present(0, 0, max_pfn);
126 node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
127 127
128 /* Indicate there is one node available. */ 128 /* Indicate there is one node available. */
129 nodes_clear(node_online_map); 129 nodes_clear(node_online_map);
@@ -156,24 +156,32 @@ static void __init propagate_e820_map_node(int nid)
156 */ 156 */
157static void __init allocate_pgdat(int nid) 157static void __init allocate_pgdat(int nid)
158{ 158{
159 if (nid && node_has_online_mem(nid)) 159 char buf[16];
160
161 if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
160 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; 162 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
161 else { 163 else {
162 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn)); 164 unsigned long pgdat_phys;
163 min_low_pfn += PFN_UP(sizeof(pg_data_t)); 165 pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
166 max_pfn_mapped<<PAGE_SHIFT,
167 sizeof(pg_data_t),
168 PAGE_SIZE);
169 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
170 memset(buf, 0, sizeof(buf));
171 sprintf(buf, "NODE_DATA %d", nid);
172 reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
164 } 173 }
174 printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
175 nid, (unsigned long)NODE_DATA(nid));
165} 176}
166 177
167#ifdef CONFIG_DISCONTIGMEM
168/* 178/*
169 * In the discontig memory model, a portion of the kernel virtual area (KVA) 179 * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
170 * is reserved and portions of nodes are mapped using it. This is to allow 180 * virtual address space (KVA) is reserved and portions of nodes are mapped
171 * node-local memory to be allocated for structures that would normally require 181 * using it. This is to allow node-local memory to be allocated for
172 * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers 182 * structures that would normally require ZONE_NORMAL. The memory is
173 * should be prepared to allocate from the bootmem allocator instead. This KVA 183 * allocated with alloc_remap() and callers should be prepared to allocate
174 * mechanism is incompatible with SPARSEMEM as it makes assumptions about the 184 * from the bootmem allocator instead.
175 * layout of memory that are broken if alloc_remap() succeeds for some of the
176 * map and fails for others
177 */ 185 */
178static unsigned long node_remap_start_pfn[MAX_NUMNODES]; 186static unsigned long node_remap_start_pfn[MAX_NUMNODES];
179static void *node_remap_end_vaddr[MAX_NUMNODES]; 187static void *node_remap_end_vaddr[MAX_NUMNODES];
@@ -195,15 +203,19 @@ void *alloc_remap(int nid, unsigned long size)
195 return allocation; 203 return allocation;
196} 204}
197 205
198void __init remap_numa_kva(void) 206static void __init remap_numa_kva(void)
199{ 207{
200 void *vaddr; 208 void *vaddr;
201 unsigned long pfn; 209 unsigned long pfn;
202 int node; 210 int node;
203 211
204 for_each_online_node(node) { 212 for_each_online_node(node) {
213 printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
205 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { 214 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
206 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT); 215 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
216 printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
217 (unsigned long)vaddr,
218 node_remap_start_pfn[node] + pfn);
207 set_pmd_pfn((ulong) vaddr, 219 set_pmd_pfn((ulong) vaddr,
208 node_remap_start_pfn[node] + pfn, 220 node_remap_start_pfn[node] + pfn,
209 PAGE_KERNEL_LARGE); 221 PAGE_KERNEL_LARGE);
@@ -215,17 +227,21 @@ static unsigned long calculate_numa_remap_pages(void)
215{ 227{
216 int nid; 228 int nid;
217 unsigned long size, reserve_pages = 0; 229 unsigned long size, reserve_pages = 0;
218 unsigned long pfn;
219 230
220 for_each_online_node(nid) { 231 for_each_online_node(nid) {
221 unsigned old_end_pfn = node_end_pfn[nid]; 232 u64 node_kva_target;
233 u64 node_kva_final;
222 234
223 /* 235 /*
224 * The acpi/srat node info can show hot-add memroy zones 236 * The acpi/srat node info can show hot-add memroy zones
225 * where memory could be added but not currently present. 237 * where memory could be added but not currently present.
226 */ 238 */
239 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
240 nid, node_start_pfn[nid], node_end_pfn[nid]);
227 if (node_start_pfn[nid] > max_pfn) 241 if (node_start_pfn[nid] > max_pfn)
228 continue; 242 continue;
243 if (!node_end_pfn[nid])
244 continue;
229 if (node_end_pfn[nid] > max_pfn) 245 if (node_end_pfn[nid] > max_pfn)
230 node_end_pfn[nid] = max_pfn; 246 node_end_pfn[nid] = max_pfn;
231 247
@@ -237,41 +253,48 @@ static unsigned long calculate_numa_remap_pages(void)
237 /* now the roundup is correct, convert to PAGE_SIZE pages */ 253 /* now the roundup is correct, convert to PAGE_SIZE pages */
238 size = size * PTRS_PER_PTE; 254 size = size * PTRS_PER_PTE;
239 255
240 /* 256 node_kva_target = round_down(node_end_pfn[nid] - size,
241 * Validate the region we are allocating only contains valid 257 PTRS_PER_PTE);
242 * pages. 258 node_kva_target <<= PAGE_SHIFT;
243 */ 259 do {
244 for (pfn = node_end_pfn[nid] - size; 260 node_kva_final = find_e820_area(node_kva_target,
245 pfn < node_end_pfn[nid]; pfn++) 261 ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
246 if (!page_is_ram(pfn)) 262 ((u64)size)<<PAGE_SHIFT,
247 break; 263 LARGE_PAGE_BYTES);
248 264 node_kva_target -= LARGE_PAGE_BYTES;
249 if (pfn != node_end_pfn[nid]) 265 } while (node_kva_final == -1ULL &&
250 size = 0; 266 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
267
268 if (node_kva_final == -1ULL)
269 panic("Can not get kva ram\n");
251 270
252 printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
253 size, nid);
254 node_remap_size[nid] = size; 271 node_remap_size[nid] = size;
255 node_remap_offset[nid] = reserve_pages; 272 node_remap_offset[nid] = reserve_pages;
256 reserve_pages += size; 273 reserve_pages += size;
257 printk("Shrinking node %d from %ld pages to %ld pages\n", 274 printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
258 nid, node_end_pfn[nid], node_end_pfn[nid] - size); 275 " node %d at %llx\n",
259 276 size, nid, node_kva_final>>PAGE_SHIFT);
260 if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) { 277
261 /* 278 /*
262 * Align node_end_pfn[] and node_remap_start_pfn[] to 279 * prevent kva address below max_low_pfn want it on system
263 * pmd boundary. remap_numa_kva will barf otherwise. 280 * with less memory later.
264 */ 281 * layout will be: KVA address , KVA RAM
265 printk("Shrinking node %d further by %ld pages for proper alignment\n", 282 *
266 nid, node_end_pfn[nid] & (PTRS_PER_PTE-1)); 283 * we are supposed to only record the one less then max_low_pfn
267 size += node_end_pfn[nid] & (PTRS_PER_PTE-1); 284 * but we could have some hole in high memory, and it will only
268 } 285 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
286 * to use it as free.
287 * So reserve_early here, hope we don't run out of that array
288 */
289 reserve_early(node_kva_final,
290 node_kva_final+(((u64)size)<<PAGE_SHIFT),
291 "KVA RAM");
269 292
270 node_end_pfn[nid] -= size; 293 node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
271 node_remap_start_pfn[nid] = node_end_pfn[nid]; 294 remove_active_range(nid, node_remap_start_pfn[nid],
272 shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]); 295 node_remap_start_pfn[nid] + size);
273 } 296 }
274 printk("Reserving total of %ld pages for numa KVA remap\n", 297 printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
275 reserve_pages); 298 reserve_pages);
276 return reserve_pages; 299 return reserve_pages;
277} 300}
@@ -285,37 +308,16 @@ static void init_remap_allocator(int nid)
285 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + 308 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
286 ALIGN(sizeof(pg_data_t), PAGE_SIZE); 309 ALIGN(sizeof(pg_data_t), PAGE_SIZE);
287 310
288 printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, 311 printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
289 (ulong) node_remap_start_vaddr[nid], 312 (ulong) node_remap_start_vaddr[nid],
290 (ulong) pfn_to_kaddr(highstart_pfn 313 (ulong) node_remap_end_vaddr[nid]);
291 + node_remap_offset[nid] + node_remap_size[nid]));
292}
293#else
294void *alloc_remap(int nid, unsigned long size)
295{
296 return NULL;
297}
298
299static unsigned long calculate_numa_remap_pages(void)
300{
301 return 0;
302}
303
304static void init_remap_allocator(int nid)
305{
306}
307
308void __init remap_numa_kva(void)
309{
310} 314}
311#endif /* CONFIG_DISCONTIGMEM */
312 315
313extern void setup_bootmem_allocator(void); 316void __init initmem_init(unsigned long start_pfn,
314unsigned long __init setup_memory(void) 317 unsigned long end_pfn)
315{ 318{
316 int nid; 319 int nid;
317 unsigned long system_start_pfn, system_max_low_pfn; 320 long kva_target_pfn;
318 unsigned long wasted_pages;
319 321
320 /* 322 /*
321 * When mapping a NUMA machine we allocate the node_mem_map arrays 323 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -324,109 +326,77 @@ unsigned long __init setup_memory(void)
324 * this space and use it to adjust the boundary between ZONE_NORMAL 326 * this space and use it to adjust the boundary between ZONE_NORMAL
325 * and ZONE_HIGHMEM. 327 * and ZONE_HIGHMEM.
326 */ 328 */
327 get_memcfg_numa();
328 329
329 kva_pages = calculate_numa_remap_pages(); 330 get_memcfg_numa();
330 331
331 /* partially used pages are not usable - thus round upwards */ 332 kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
332 system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
333 333
334 kva_start_pfn = find_max_low_pfn() - kva_pages; 334 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
335 do {
336 kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
337 max_low_pfn<<PAGE_SHIFT,
338 kva_pages<<PAGE_SHIFT,
339 PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
340 kva_target_pfn -= PTRS_PER_PTE;
341 } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
335 342
336#ifdef CONFIG_BLK_DEV_INITRD 343 if (kva_start_pfn == -1UL)
337 /* Numa kva area is below the initrd */ 344 panic("Can not get kva space\n");
338 if (initrd_start)
339 kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
340 - kva_pages;
341#endif
342 345
343 /* 346 printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
344 * We waste pages past at the end of the KVA for no good reason other
345 * than how it is located. This is bad.
346 */
347 wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
348 kva_start_pfn -= wasted_pages;
349 kva_pages += wasted_pages;
350
351 system_max_low_pfn = max_low_pfn = find_max_low_pfn();
352 printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
353 kva_start_pfn, max_low_pfn); 347 kva_start_pfn, max_low_pfn);
354 printk("max_pfn = %ld\n", max_pfn); 348 printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
349
350 /* avoid clash with initrd */
351 reserve_early(kva_start_pfn<<PAGE_SHIFT,
352 (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
353 "KVA PG");
355#ifdef CONFIG_HIGHMEM 354#ifdef CONFIG_HIGHMEM
356 highstart_pfn = highend_pfn = max_pfn; 355 highstart_pfn = highend_pfn = max_pfn;
357 if (max_pfn > system_max_low_pfn) 356 if (max_pfn > max_low_pfn)
358 highstart_pfn = system_max_low_pfn; 357 highstart_pfn = max_low_pfn;
359 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", 358 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
360 pages_to_mb(highend_pfn - highstart_pfn)); 359 pages_to_mb(highend_pfn - highstart_pfn));
361 num_physpages = highend_pfn; 360 num_physpages = highend_pfn;
362 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; 361 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
363#else 362#else
364 num_physpages = system_max_low_pfn; 363 num_physpages = max_low_pfn;
365 high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1; 364 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
366#endif 365#endif
367 printk(KERN_NOTICE "%ldMB LOWMEM available.\n", 366 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
368 pages_to_mb(system_max_low_pfn)); 367 pages_to_mb(max_low_pfn));
369 printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", 368 printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
370 min_low_pfn, max_low_pfn, highstart_pfn); 369 max_low_pfn, highstart_pfn);
371 370
372 printk("Low memory ends at vaddr %08lx\n", 371 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
373 (ulong) pfn_to_kaddr(max_low_pfn)); 372 (ulong) pfn_to_kaddr(max_low_pfn));
374 for_each_online_node(nid) { 373 for_each_online_node(nid) {
375 init_remap_allocator(nid); 374 init_remap_allocator(nid);
376 375
377 allocate_pgdat(nid); 376 allocate_pgdat(nid);
378 } 377 }
379 printk("High memory starts at vaddr %08lx\n", 378 remap_numa_kva();
379
380 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
380 (ulong) pfn_to_kaddr(highstart_pfn)); 381 (ulong) pfn_to_kaddr(highstart_pfn));
381 for_each_online_node(nid) 382 for_each_online_node(nid)
382 propagate_e820_map_node(nid); 383 propagate_e820_map_node(nid);
383 384
384 memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); 385 for_each_online_node(nid)
386 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
387
385 NODE_DATA(0)->bdata = &node0_bdata; 388 NODE_DATA(0)->bdata = &node0_bdata;
386 setup_bootmem_allocator(); 389 setup_bootmem_allocator();
387 return max_low_pfn;
388}
389
390void __init numa_kva_reserve(void)
391{
392 if (kva_pages)
393 reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
394 BOOTMEM_DEFAULT);
395} 390}
396 391
397void __init zone_sizes_init(void) 392void __init set_highmem_pages_init(void)
398{
399 int nid;
400 unsigned long max_zone_pfns[MAX_NR_ZONES];
401 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
402 max_zone_pfns[ZONE_DMA] =
403 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
404 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
405#ifdef CONFIG_HIGHMEM
406 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
407#endif
408
409 /* If SRAT has not registered memory, register it now */
410 if (find_max_pfn_with_active_regions() == 0) {
411 for_each_online_node(nid) {
412 if (node_has_online_mem(nid))
413 add_active_range(nid, node_start_pfn[nid],
414 node_end_pfn[nid]);
415 }
416 }
417
418 free_area_init_nodes(max_zone_pfns);
419 return;
420}
421
422void __init set_highmem_pages_init(int bad_ppro)
423{ 393{
424#ifdef CONFIG_HIGHMEM 394#ifdef CONFIG_HIGHMEM
425 struct zone *zone; 395 struct zone *zone;
426 struct page *page; 396 int nid;
427 397
428 for_each_zone(zone) { 398 for_each_zone(zone) {
429 unsigned long node_pfn, zone_start_pfn, zone_end_pfn; 399 unsigned long zone_start_pfn, zone_end_pfn;
430 400
431 if (!is_highmem(zone)) 401 if (!is_highmem(zone))
432 continue; 402 continue;
@@ -434,16 +404,12 @@ void __init set_highmem_pages_init(int bad_ppro)
434 zone_start_pfn = zone->zone_start_pfn; 404 zone_start_pfn = zone->zone_start_pfn;
435 zone_end_pfn = zone_start_pfn + zone->spanned_pages; 405 zone_end_pfn = zone_start_pfn + zone->spanned_pages;
436 406
437 printk("Initializing %s for node %d (%08lx:%08lx)\n", 407 nid = zone_to_nid(zone);
438 zone->name, zone_to_nid(zone), 408 printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
439 zone_start_pfn, zone_end_pfn); 409 zone->name, nid, zone_start_pfn, zone_end_pfn);
440 410
441 for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { 411 add_highpages_with_active_regions(nid, zone_start_pfn,
442 if (!pfn_valid(node_pfn)) 412 zone_end_pfn);
443 continue;
444 page = pfn_to_page(node_pfn);
445 add_one_highpage_init(page, node_pfn, bad_ppro);
446 }
447 } 413 }
448 totalram_pages += totalhigh_pages; 414 totalram_pages += totalhigh_pages;
449#endif 415#endif
@@ -476,3 +442,4 @@ int memory_add_physaddr_to_nid(u64 addr)
476 442
477EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 443EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
478#endif 444#endif
445
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 2c24bea92c66..0bb0caed8971 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -42,7 +42,7 @@ static struct addr_marker address_markers[] = {
42 { 0, "User Space" }, 42 { 0, "User Space" },
43#ifdef CONFIG_X86_64 43#ifdef CONFIG_X86_64
44 { 0x8000000000000000UL, "Kernel Space" }, 44 { 0x8000000000000000UL, "Kernel Space" },
45 { 0xffff810000000000UL, "Low Kernel Mapping" }, 45 { PAGE_OFFSET, "Low Kernel Mapping" },
46 { VMALLOC_START, "vmalloc() Area" }, 46 { VMALLOC_START, "vmalloc() Area" },
47 { VMEMMAP_START, "Vmemmap" }, 47 { VMEMMAP_START, "Vmemmap" },
48 { __START_KERNEL_map, "High Kernel Mapping" }, 48 { __START_KERNEL_map, "High Kernel Mapping" },
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8bcb6f40ccb6..d0f5fce77d95 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -55,11 +55,7 @@ static inline int notify_page_fault(struct pt_regs *regs)
55 int ret = 0; 55 int ret = 0;
56 56
57 /* kprobe_running() needs smp_processor_id() */ 57 /* kprobe_running() needs smp_processor_id() */
58#ifdef CONFIG_X86_32
59 if (!user_mode_vm(regs)) { 58 if (!user_mode_vm(regs)) {
60#else
61 if (!user_mode(regs)) {
62#endif
63 preempt_disable(); 59 preempt_disable();
64 if (kprobe_running() && kprobe_fault_handler(regs, 14)) 60 if (kprobe_running() && kprobe_fault_handler(regs, 14))
65 ret = 1; 61 ret = 1;
@@ -396,11 +392,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
396 printk(KERN_CONT "NULL pointer dereference"); 392 printk(KERN_CONT "NULL pointer dereference");
397 else 393 else
398 printk(KERN_CONT "paging request"); 394 printk(KERN_CONT "paging request");
399#ifdef CONFIG_X86_32 395 printk(KERN_CONT " at %p\n", (void *) address);
400 printk(KERN_CONT " at %08lx\n", address);
401#else
402 printk(KERN_CONT " at %016lx\n", address);
403#endif
404 printk(KERN_ALERT "IP:"); 396 printk(KERN_ALERT "IP:");
405 printk_address(regs->ip, 1); 397 printk_address(regs->ip, 1);
406 dump_pagetable(address); 398 dump_pagetable(address);
@@ -800,14 +792,10 @@ bad_area_nosemaphore:
800 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && 792 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
801 printk_ratelimit()) { 793 printk_ratelimit()) {
802 printk( 794 printk(
803#ifdef CONFIG_X86_32 795 "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
804 "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
805#else
806 "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
807#endif
808 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, 796 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
809 tsk->comm, task_pid_nr(tsk), address, regs->ip, 797 tsk->comm, task_pid_nr(tsk), address,
810 regs->sp, error_code); 798 (void *) regs->ip, (void *) regs->sp, error_code);
811 print_vma_addr(" in ", regs->ip); 799 print_vma_addr(" in ", regs->ip);
812 printk("\n"); 800 printk("\n");
813 } 801 }
@@ -915,14 +903,7 @@ LIST_HEAD(pgd_list);
915void vmalloc_sync_all(void) 903void vmalloc_sync_all(void)
916{ 904{
917#ifdef CONFIG_X86_32 905#ifdef CONFIG_X86_32
918 /* 906 unsigned long start = VMALLOC_START & PGDIR_MASK;
919 * Note that races in the updates of insync and start aren't
920 * problematic: insync can only get set bits added, and updates to
921 * start are only improving performance (without affecting correctness
922 * if undone).
923 */
924 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
925 static unsigned long start = TASK_SIZE;
926 unsigned long address; 907 unsigned long address;
927 908
928 if (SHARED_KERNEL_PMD) 909 if (SHARED_KERNEL_PMD)
@@ -930,56 +911,38 @@ void vmalloc_sync_all(void)
930 911
931 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); 912 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
932 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { 913 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
933 if (!test_bit(pgd_index(address), insync)) { 914 unsigned long flags;
934 unsigned long flags; 915 struct page *page;
935 struct page *page; 916
936 917 spin_lock_irqsave(&pgd_lock, flags);
937 spin_lock_irqsave(&pgd_lock, flags); 918 list_for_each_entry(page, &pgd_list, lru) {
938 list_for_each_entry(page, &pgd_list, lru) { 919 if (!vmalloc_sync_one(page_address(page),
939 if (!vmalloc_sync_one(page_address(page), 920 address))
940 address)) 921 break;
941 break;
942 }
943 spin_unlock_irqrestore(&pgd_lock, flags);
944 if (!page)
945 set_bit(pgd_index(address), insync);
946 } 922 }
947 if (address == start && test_bit(pgd_index(address), insync)) 923 spin_unlock_irqrestore(&pgd_lock, flags);
948 start = address + PGDIR_SIZE;
949 } 924 }
950#else /* CONFIG_X86_64 */ 925#else /* CONFIG_X86_64 */
951 /* 926 unsigned long start = VMALLOC_START & PGDIR_MASK;
952 * Note that races in the updates of insync and start aren't
953 * problematic: insync can only get set bits added, and updates to
954 * start are only improving performance (without affecting correctness
955 * if undone).
956 */
957 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
958 static unsigned long start = VMALLOC_START & PGDIR_MASK;
959 unsigned long address; 927 unsigned long address;
960 928
961 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { 929 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
962 if (!test_bit(pgd_index(address), insync)) { 930 const pgd_t *pgd_ref = pgd_offset_k(address);
963 const pgd_t *pgd_ref = pgd_offset_k(address); 931 unsigned long flags;
964 unsigned long flags; 932 struct page *page;
965 struct page *page; 933
966 934 if (pgd_none(*pgd_ref))
967 if (pgd_none(*pgd_ref)) 935 continue;
968 continue; 936 spin_lock_irqsave(&pgd_lock, flags);
969 spin_lock_irqsave(&pgd_lock, flags); 937 list_for_each_entry(page, &pgd_list, lru) {
970 list_for_each_entry(page, &pgd_list, lru) { 938 pgd_t *pgd;
971 pgd_t *pgd; 939 pgd = (pgd_t *)page_address(page) + pgd_index(address);
972 pgd = (pgd_t *)page_address(page) + pgd_index(address); 940 if (pgd_none(*pgd))
973 if (pgd_none(*pgd)) 941 set_pgd(pgd, *pgd_ref);
974 set_pgd(pgd, *pgd_ref); 942 else
975 else 943 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
976 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
977 }
978 spin_unlock_irqrestore(&pgd_lock, flags);
979 set_bit(pgd_index(address), insync);
980 } 944 }
981 if (address == start) 945 spin_unlock_irqrestore(&pgd_lock, flags);
982 start = address + PGDIR_SIZE;
983 } 946 }
984#endif 947#endif
985} 948}
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ec30d10154b6..029e8cffca9e 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -50,6 +50,7 @@
50 50
51unsigned int __VMALLOC_RESERVE = 128 << 20; 51unsigned int __VMALLOC_RESERVE = 128 << 20;
52 52
53unsigned long max_low_pfn_mapped;
53unsigned long max_pfn_mapped; 54unsigned long max_pfn_mapped;
54 55
55DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 56DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -57,6 +58,27 @@ unsigned long highstart_pfn, highend_pfn;
57 58
58static noinline int do_test_wp_bit(void); 59static noinline int do_test_wp_bit(void);
59 60
61
62static unsigned long __initdata table_start;
63static unsigned long __meminitdata table_end;
64static unsigned long __meminitdata table_top;
65
66static int __initdata after_init_bootmem;
67
68static __init void *alloc_low_page(unsigned long *phys)
69{
70 unsigned long pfn = table_end++;
71 void *adr;
72
73 if (pfn >= table_top)
74 panic("alloc_low_page: ran out of memory");
75
76 adr = __va(pfn * PAGE_SIZE);
77 memset(adr, 0, PAGE_SIZE);
78 *phys = pfn * PAGE_SIZE;
79 return adr;
80}
81
60/* 82/*
61 * Creates a middle page table and puts a pointer to it in the 83 * Creates a middle page table and puts a pointer to it in the
62 * given global directory entry. This only returns the gd entry 84 * given global directory entry. This only returns the gd entry
@@ -68,9 +90,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
68 pmd_t *pmd_table; 90 pmd_t *pmd_table;
69 91
70#ifdef CONFIG_X86_PAE 92#ifdef CONFIG_X86_PAE
93 unsigned long phys;
71 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 94 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
72 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); 95 if (after_init_bootmem)
73 96 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
97 else
98 pmd_table = (pmd_t *)alloc_low_page(&phys);
74 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); 99 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
75 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 100 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
76 pud = pud_offset(pgd, 0); 101 pud = pud_offset(pgd, 0);
@@ -92,12 +117,16 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
92 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { 117 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
93 pte_t *page_table = NULL; 118 pte_t *page_table = NULL;
94 119
120 if (after_init_bootmem) {
95#ifdef CONFIG_DEBUG_PAGEALLOC 121#ifdef CONFIG_DEBUG_PAGEALLOC
96 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 122 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
97#endif 123#endif
98 if (!page_table) { 124 if (!page_table)
99 page_table = 125 page_table =
100 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); 126 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
127 } else {
128 unsigned long phys;
129 page_table = (pte_t *)alloc_low_page(&phys);
101 } 130 }
102 131
103 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); 132 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
@@ -155,38 +184,44 @@ static inline int is_kernel_text(unsigned long addr)
155 * of max_low_pfn pages, by creating page tables starting from address 184 * of max_low_pfn pages, by creating page tables starting from address
156 * PAGE_OFFSET: 185 * PAGE_OFFSET:
157 */ 186 */
158static void __init kernel_physical_mapping_init(pgd_t *pgd_base) 187static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
188 unsigned long start_pfn,
189 unsigned long end_pfn,
190 int use_pse)
159{ 191{
160 int pgd_idx, pmd_idx, pte_ofs; 192 int pgd_idx, pmd_idx, pte_ofs;
161 unsigned long pfn; 193 unsigned long pfn;
162 pgd_t *pgd; 194 pgd_t *pgd;
163 pmd_t *pmd; 195 pmd_t *pmd;
164 pte_t *pte; 196 pte_t *pte;
197 unsigned pages_2m = 0, pages_4k = 0;
165 198
166 pgd_idx = pgd_index(PAGE_OFFSET); 199 if (!cpu_has_pse)
167 pgd = pgd_base + pgd_idx; 200 use_pse = 0;
168 pfn = 0;
169 201
202 pfn = start_pfn;
203 pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
204 pgd = pgd_base + pgd_idx;
170 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { 205 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
171 pmd = one_md_table_init(pgd); 206 pmd = one_md_table_init(pgd);
172 if (pfn >= max_low_pfn)
173 continue;
174 207
175 for (pmd_idx = 0; 208 if (pfn >= end_pfn)
176 pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; 209 continue;
210#ifdef CONFIG_X86_PAE
211 pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
212 pmd += pmd_idx;
213#else
214 pmd_idx = 0;
215#endif
216 for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
177 pmd++, pmd_idx++) { 217 pmd++, pmd_idx++) {
178 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; 218 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
179 219
180 /* 220 /*
181 * Map with big pages if possible, otherwise 221 * Map with big pages if possible, otherwise
182 * create normal page tables: 222 * create normal page tables:
183 *
184 * Don't use a large page for the first 2/4MB of memory
185 * because there are often fixed size MTRRs in there
186 * and overlapping MTRRs into large pages can cause
187 * slowdowns.
188 */ 223 */
189 if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) { 224 if (use_pse) {
190 unsigned int addr2; 225 unsigned int addr2;
191 pgprot_t prot = PAGE_KERNEL_LARGE; 226 pgprot_t prot = PAGE_KERNEL_LARGE;
192 227
@@ -197,34 +232,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
197 is_kernel_text(addr2)) 232 is_kernel_text(addr2))
198 prot = PAGE_KERNEL_LARGE_EXEC; 233 prot = PAGE_KERNEL_LARGE_EXEC;
199 234
235 pages_2m++;
200 set_pmd(pmd, pfn_pmd(pfn, prot)); 236 set_pmd(pmd, pfn_pmd(pfn, prot));
201 237
202 pfn += PTRS_PER_PTE; 238 pfn += PTRS_PER_PTE;
203 max_pfn_mapped = pfn;
204 continue; 239 continue;
205 } 240 }
206 pte = one_page_table_init(pmd); 241 pte = one_page_table_init(pmd);
207 242
208 for (pte_ofs = 0; 243 pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
209 pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; 244 pte += pte_ofs;
245 for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
210 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { 246 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
211 pgprot_t prot = PAGE_KERNEL; 247 pgprot_t prot = PAGE_KERNEL;
212 248
213 if (is_kernel_text(addr)) 249 if (is_kernel_text(addr))
214 prot = PAGE_KERNEL_EXEC; 250 prot = PAGE_KERNEL_EXEC;
215 251
252 pages_4k++;
216 set_pte(pte, pfn_pte(pfn, prot)); 253 set_pte(pte, pfn_pte(pfn, prot));
217 } 254 }
218 max_pfn_mapped = pfn;
219 } 255 }
220 } 256 }
221} 257 update_page_count(PG_LEVEL_2M, pages_2m);
222 258 update_page_count(PG_LEVEL_4K, pages_4k);
223static inline int page_kills_ppro(unsigned long pagenr)
224{
225 if (pagenr >= 0x70000 && pagenr <= 0x7003F)
226 return 1;
227 return 0;
228} 259}
229 260
230/* 261/*
@@ -287,29 +318,62 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
287 pkmap_page_table = pte; 318 pkmap_page_table = pte;
288} 319}
289 320
290void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) 321static void __init add_one_highpage_init(struct page *page, int pfn)
291{ 322{
292 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { 323 ClearPageReserved(page);
293 ClearPageReserved(page); 324 init_page_count(page);
294 init_page_count(page); 325 __free_page(page);
295 __free_page(page); 326 totalhigh_pages++;
296 totalhigh_pages++;
297 } else
298 SetPageReserved(page);
299} 327}
300 328
301#ifndef CONFIG_NUMA 329struct add_highpages_data {
302static void __init set_highmem_pages_init(int bad_ppro) 330 unsigned long start_pfn;
331 unsigned long end_pfn;
332};
333
334static int __init add_highpages_work_fn(unsigned long start_pfn,
335 unsigned long end_pfn, void *datax)
303{ 336{
304 int pfn; 337 int node_pfn;
338 struct page *page;
339 unsigned long final_start_pfn, final_end_pfn;
340 struct add_highpages_data *data;
305 341
306 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { 342 data = (struct add_highpages_data *)datax;
307 /* 343
308 * Holes under sparsemem might not have no mem_map[]: 344 final_start_pfn = max(start_pfn, data->start_pfn);
309 */ 345 final_end_pfn = min(end_pfn, data->end_pfn);
310 if (pfn_valid(pfn)) 346 if (final_start_pfn >= final_end_pfn)
311 add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); 347 return 0;
348
349 for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
350 node_pfn++) {
351 if (!pfn_valid(node_pfn))
352 continue;
353 page = pfn_to_page(node_pfn);
354 add_one_highpage_init(page, node_pfn);
312 } 355 }
356
357 return 0;
358
359}
360
361void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
362 unsigned long end_pfn)
363{
364 struct add_highpages_data data;
365
366 data.start_pfn = start_pfn;
367 data.end_pfn = end_pfn;
368
369 work_with_active_regions(nid, add_highpages_work_fn, &data);
370}
371
372#ifndef CONFIG_NUMA
373static void __init set_highmem_pages_init(void)
374{
375 add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
376
313 totalram_pages += totalhigh_pages; 377 totalram_pages += totalhigh_pages;
314} 378}
315#endif /* !CONFIG_NUMA */ 379#endif /* !CONFIG_NUMA */
@@ -317,14 +381,9 @@ static void __init set_highmem_pages_init(int bad_ppro)
317#else 381#else
318# define kmap_init() do { } while (0) 382# define kmap_init() do { } while (0)
319# define permanent_kmaps_init(pgd_base) do { } while (0) 383# define permanent_kmaps_init(pgd_base) do { } while (0)
320# define set_highmem_pages_init(bad_ppro) do { } while (0) 384# define set_highmem_pages_init() do { } while (0)
321#endif /* CONFIG_HIGHMEM */ 385#endif /* CONFIG_HIGHMEM */
322 386
323pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
324EXPORT_SYMBOL(__PAGE_KERNEL);
325
326pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
327
328void __init native_pagetable_setup_start(pgd_t *base) 387void __init native_pagetable_setup_start(pgd_t *base)
329{ 388{
330 unsigned long pfn, va; 389 unsigned long pfn, va;
@@ -380,27 +439,10 @@ void __init native_pagetable_setup_done(pgd_t *base)
380 * be partially populated, and so it avoids stomping on any existing 439 * be partially populated, and so it avoids stomping on any existing
381 * mappings. 440 * mappings.
382 */ 441 */
383static void __init pagetable_init(void) 442static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
384{ 443{
385 pgd_t *pgd_base = swapper_pg_dir;
386 unsigned long vaddr, end; 444 unsigned long vaddr, end;
387 445
388 paravirt_pagetable_setup_start(pgd_base);
389
390 /* Enable PSE if available */
391 if (cpu_has_pse)
392 set_in_cr4(X86_CR4_PSE);
393
394 /* Enable PGE if available */
395 if (cpu_has_pge) {
396 set_in_cr4(X86_CR4_PGE);
397 __PAGE_KERNEL |= _PAGE_GLOBAL;
398 __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
399 }
400
401 kernel_physical_mapping_init(pgd_base);
402 remap_numa_kva();
403
404 /* 446 /*
405 * Fixed mappings, only the page table structure has to be 447 * Fixed mappings, only the page table structure has to be
406 * created - mappings will be set by set_fixmap(): 448 * created - mappings will be set by set_fixmap():
@@ -410,6 +452,13 @@ static void __init pagetable_init(void)
410 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; 452 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
411 page_table_range_init(vaddr, end, pgd_base); 453 page_table_range_init(vaddr, end, pgd_base);
412 early_ioremap_reset(); 454 early_ioremap_reset();
455}
456
457static void __init pagetable_init(void)
458{
459 pgd_t *pgd_base = swapper_pg_dir;
460
461 paravirt_pagetable_setup_start(pgd_base);
413 462
414 permanent_kmaps_init(pgd_base); 463 permanent_kmaps_init(pgd_base);
415 464
@@ -456,7 +505,7 @@ void zap_low_mappings(void)
456 505
457int nx_enabled; 506int nx_enabled;
458 507
459pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX; 508pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
460EXPORT_SYMBOL_GPL(__supported_pte_mask); 509EXPORT_SYMBOL_GPL(__supported_pte_mask);
461 510
462#ifdef CONFIG_X86_PAE 511#ifdef CONFIG_X86_PAE
@@ -509,27 +558,318 @@ static void __init set_nx(void)
509} 558}
510#endif 559#endif
511 560
561/* user-defined highmem size */
562static unsigned int highmem_pages = -1;
563
512/* 564/*
513 * paging_init() sets up the page tables - note that the first 8MB are 565 * highmem=size forces highmem to be exactly 'size' bytes.
514 * already mapped by head.S. 566 * This works even on boxes that have no highmem otherwise.
515 * 567 * This also works to reduce highmem size on bigger boxes.
516 * This routines also unmaps the page at virtual kernel address 0, so
517 * that we can trap those pesky NULL-reference errors in the kernel.
518 */ 568 */
519void __init paging_init(void) 569static int __init parse_highmem(char *arg)
570{
571 if (!arg)
572 return -EINVAL;
573
574 highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
575 return 0;
576}
577early_param("highmem", parse_highmem);
578
579/*
580 * Determine low and high memory ranges:
581 */
582void __init find_low_pfn_range(void)
520{ 583{
584 /* it could update max_pfn */
585
586 /* max_low_pfn is 0, we already have early_res support */
587
588 max_low_pfn = max_pfn;
589 if (max_low_pfn > MAXMEM_PFN) {
590 if (highmem_pages == -1)
591 highmem_pages = max_pfn - MAXMEM_PFN;
592 if (highmem_pages + MAXMEM_PFN < max_pfn)
593 max_pfn = MAXMEM_PFN + highmem_pages;
594 if (highmem_pages + MAXMEM_PFN > max_pfn) {
595 printk(KERN_WARNING "only %luMB highmem pages "
596 "available, ignoring highmem size of %uMB.\n",
597 pages_to_mb(max_pfn - MAXMEM_PFN),
598 pages_to_mb(highmem_pages));
599 highmem_pages = 0;
600 }
601 max_low_pfn = MAXMEM_PFN;
602#ifndef CONFIG_HIGHMEM
603 /* Maximum memory usable is what is directly addressable */
604 printk(KERN_WARNING "Warning only %ldMB will be used.\n",
605 MAXMEM>>20);
606 if (max_pfn > MAX_NONPAE_PFN)
607 printk(KERN_WARNING
608 "Use a HIGHMEM64G enabled kernel.\n");
609 else
610 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
611 max_pfn = MAXMEM_PFN;
612#else /* !CONFIG_HIGHMEM */
613#ifndef CONFIG_HIGHMEM64G
614 if (max_pfn > MAX_NONPAE_PFN) {
615 max_pfn = MAX_NONPAE_PFN;
616 printk(KERN_WARNING "Warning only 4GB will be used."
617 "Use a HIGHMEM64G enabled kernel.\n");
618 }
619#endif /* !CONFIG_HIGHMEM64G */
620#endif /* !CONFIG_HIGHMEM */
621 } else {
622 if (highmem_pages == -1)
623 highmem_pages = 0;
624#ifdef CONFIG_HIGHMEM
625 if (highmem_pages >= max_pfn) {
626 printk(KERN_ERR "highmem size specified (%uMB) is "
627 "bigger than pages available (%luMB)!.\n",
628 pages_to_mb(highmem_pages),
629 pages_to_mb(max_pfn));
630 highmem_pages = 0;
631 }
632 if (highmem_pages) {
633 if (max_low_pfn - highmem_pages <
634 64*1024*1024/PAGE_SIZE){
635 printk(KERN_ERR "highmem size %uMB results in "
636 "smaller than 64MB lowmem, ignoring it.\n"
637 , pages_to_mb(highmem_pages));
638 highmem_pages = 0;
639 }
640 max_low_pfn -= highmem_pages;
641 }
642#else
643 if (highmem_pages)
644 printk(KERN_ERR "ignoring highmem size on non-highmem"
645 " kernel!\n");
646#endif
647 }
648}
649
650#ifndef CONFIG_NEED_MULTIPLE_NODES
651void __init initmem_init(unsigned long start_pfn,
652 unsigned long end_pfn)
653{
654#ifdef CONFIG_HIGHMEM
655 highstart_pfn = highend_pfn = max_pfn;
656 if (max_pfn > max_low_pfn)
657 highstart_pfn = max_low_pfn;
658 memory_present(0, 0, highend_pfn);
659 e820_register_active_regions(0, 0, highend_pfn);
660 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
661 pages_to_mb(highend_pfn - highstart_pfn));
662 num_physpages = highend_pfn;
663 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
664#else
665 memory_present(0, 0, max_low_pfn);
666 e820_register_active_regions(0, 0, max_low_pfn);
667 num_physpages = max_low_pfn;
668 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
669#endif
670#ifdef CONFIG_FLATMEM
671 max_mapnr = num_physpages;
672#endif
673 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
674 pages_to_mb(max_low_pfn));
675
676 setup_bootmem_allocator();
677}
678#endif /* !CONFIG_NEED_MULTIPLE_NODES */
679
680static void __init zone_sizes_init(void)
681{
682 unsigned long max_zone_pfns[MAX_NR_ZONES];
683 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
684 max_zone_pfns[ZONE_DMA] =
685 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
686 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
687#ifdef CONFIG_HIGHMEM
688 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
689#endif
690
691 free_area_init_nodes(max_zone_pfns);
692}
693
694void __init setup_bootmem_allocator(void)
695{
696 int i;
697 unsigned long bootmap_size, bootmap;
698 /*
699 * Initialize the boot-time allocator (with low memory only):
700 */
701 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
702 bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
703 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
704 PAGE_SIZE);
705 if (bootmap == -1L)
706 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
707 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
708
709 /* don't touch min_low_pfn */
710 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
711 min_low_pfn, max_low_pfn);
712 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
713 max_pfn_mapped<<PAGE_SHIFT);
714 printk(KERN_INFO " low ram: %08lx - %08lx\n",
715 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
716 printk(KERN_INFO " bootmap %08lx - %08lx\n",
717 bootmap, bootmap + bootmap_size);
718 for_each_online_node(i)
719 free_bootmem_with_active_regions(i, max_low_pfn);
720 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
721
722 after_init_bootmem = 1;
723}
724
725static void __init find_early_table_space(unsigned long end)
726{
727 unsigned long puds, pmds, ptes, tables, start;
728
729 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
730 tables = PAGE_ALIGN(puds * sizeof(pud_t));
731
732 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
733 tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
734
735 if (cpu_has_pse) {
736 unsigned long extra;
737
738 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
739 extra += PMD_SIZE;
740 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
741 } else
742 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
743
744 tables += PAGE_ALIGN(ptes * sizeof(pte_t));
745
746 /* for fixmap */
747 tables += PAGE_SIZE * 2;
748
749 /*
750 * RED-PEN putting page tables only on node 0 could
751 * cause a hotspot and fill up ZONE_DMA. The page tables
752 * need roughly 0.5KB per GB.
753 */
754 start = 0x7000;
755 table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
756 tables, PAGE_SIZE);
757 if (table_start == -1UL)
758 panic("Cannot find space for the kernel page tables");
759
760 table_start >>= PAGE_SHIFT;
761 table_end = table_start;
762 table_top = table_start + (tables>>PAGE_SHIFT);
763
764 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
765 end, table_start << PAGE_SHIFT,
766 (table_start << PAGE_SHIFT) + tables);
767}
768
769unsigned long __init_refok init_memory_mapping(unsigned long start,
770 unsigned long end)
771{
772 pgd_t *pgd_base = swapper_pg_dir;
773 unsigned long start_pfn, end_pfn;
774 unsigned long big_page_start;
775
776 /*
777 * Find space for the kernel direct mapping tables.
778 */
779 if (!after_init_bootmem)
780 find_early_table_space(end);
781
521#ifdef CONFIG_X86_PAE 782#ifdef CONFIG_X86_PAE
522 set_nx(); 783 set_nx();
523 if (nx_enabled) 784 if (nx_enabled)
524 printk(KERN_INFO "NX (Execute Disable) protection: active\n"); 785 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
525#endif 786#endif
526 pagetable_init(); 787
788 /* Enable PSE if available */
789 if (cpu_has_pse)
790 set_in_cr4(X86_CR4_PSE);
791
792 /* Enable PGE if available */
793 if (cpu_has_pge) {
794 set_in_cr4(X86_CR4_PGE);
795 __supported_pte_mask |= _PAGE_GLOBAL;
796 }
797
798 /*
799 * Don't use a large page for the first 2/4MB of memory
800 * because there are often fixed size MTRRs in there
801 * and overlapping MTRRs into large pages can cause
802 * slowdowns.
803 */
804 big_page_start = PMD_SIZE;
805
806 if (start < big_page_start) {
807 start_pfn = start >> PAGE_SHIFT;
808 end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
809 } else {
810 /* head is not big page alignment ? */
811 start_pfn = start >> PAGE_SHIFT;
812 end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
813 << (PMD_SHIFT - PAGE_SHIFT);
814 }
815 if (start_pfn < end_pfn)
816 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
817
818 /* big page range */
819 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
820 << (PMD_SHIFT - PAGE_SHIFT);
821 if (start_pfn < (big_page_start >> PAGE_SHIFT))
822 start_pfn = big_page_start >> PAGE_SHIFT;
823 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
824 if (start_pfn < end_pfn)
825 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
826 cpu_has_pse);
827
828 /* tail is not big page alignment ? */
829 start_pfn = end_pfn;
830 if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
831 end_pfn = end >> PAGE_SHIFT;
832 if (start_pfn < end_pfn)
833 kernel_physical_mapping_init(pgd_base, start_pfn,
834 end_pfn, 0);
835 }
836
837 early_ioremap_page_table_range_init(pgd_base);
527 838
528 load_cr3(swapper_pg_dir); 839 load_cr3(swapper_pg_dir);
529 840
530 __flush_tlb_all(); 841 __flush_tlb_all();
531 842
843 if (!after_init_bootmem)
844 reserve_early(table_start << PAGE_SHIFT,
845 table_end << PAGE_SHIFT, "PGTABLE");
846
847 return end >> PAGE_SHIFT;
848}
849
850
851/*
852 * paging_init() sets up the page tables - note that the first 8MB are
853 * already mapped by head.S.
854 *
855 * This routines also unmaps the page at virtual kernel address 0, so
856 * that we can trap those pesky NULL-reference errors in the kernel.
857 */
858void __init paging_init(void)
859{
860 pagetable_init();
861
862 __flush_tlb_all();
863
532 kmap_init(); 864 kmap_init();
865
866 /*
867 * NOTE: at this point the bootmem allocator is fully available.
868 */
869 sparse_init();
870 zone_sizes_init();
871
872 paravirt_post_allocator_init();
533} 873}
534 874
535/* 875/*
@@ -564,24 +904,11 @@ static struct kcore_list kcore_mem, kcore_vmalloc;
564void __init mem_init(void) 904void __init mem_init(void)
565{ 905{
566 int codesize, reservedpages, datasize, initsize; 906 int codesize, reservedpages, datasize, initsize;
567 int tmp, bad_ppro; 907 int tmp;
568 908
569#ifdef CONFIG_FLATMEM 909#ifdef CONFIG_FLATMEM
570 BUG_ON(!mem_map); 910 BUG_ON(!mem_map);
571#endif 911#endif
572 bad_ppro = ppro_with_ram_bug();
573
574#ifdef CONFIG_HIGHMEM
575 /* check that fixmap and pkmap do not overlap */
576 if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
577 printk(KERN_ERR
578 "fixmap and kmap areas overlap - this will crash\n");
579 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
580 PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
581 FIXADDR_START);
582 BUG();
583 }
584#endif
585 /* this will put all low memory onto the freelists */ 912 /* this will put all low memory onto the freelists */
586 totalram_pages += free_all_bootmem(); 913 totalram_pages += free_all_bootmem();
587 914
@@ -593,7 +920,7 @@ void __init mem_init(void)
593 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) 920 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
594 reservedpages++; 921 reservedpages++;
595 922
596 set_highmem_pages_init(bad_ppro); 923 set_highmem_pages_init();
597 924
598 codesize = (unsigned long) &_etext - (unsigned long) &_text; 925 codesize = (unsigned long) &_etext - (unsigned long) &_text;
599 datasize = (unsigned long) &_edata - (unsigned long) &_etext; 926 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
@@ -614,7 +941,6 @@ void __init mem_init(void)
614 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) 941 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
615 ); 942 );
616 943
617#if 1 /* double-sanity-check paranoia */
618 printk(KERN_INFO "virtual kernel memory layout:\n" 944 printk(KERN_INFO "virtual kernel memory layout:\n"
619 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 945 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
620#ifdef CONFIG_HIGHMEM 946#ifdef CONFIG_HIGHMEM
@@ -655,7 +981,6 @@ void __init mem_init(void)
655#endif 981#endif
656 BUG_ON(VMALLOC_START > VMALLOC_END); 982 BUG_ON(VMALLOC_START > VMALLOC_END);
657 BUG_ON((unsigned long)high_memory > VMALLOC_START); 983 BUG_ON((unsigned long)high_memory > VMALLOC_START);
658#endif /* double-sanity-check paranoia */
659 984
660 if (boot_cpu_data.wp_works_ok < 0) 985 if (boot_cpu_data.wp_works_ok < 0)
661 test_wp_bit(); 986 test_wp_bit();
@@ -784,3 +1109,9 @@ void free_initrd_mem(unsigned long start, unsigned long end)
784 free_init_pages("initrd memory", start, end); 1109 free_init_pages("initrd memory", start, end);
785} 1110}
786#endif 1111#endif
1112
1113int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1114 int flags)
1115{
1116 return reserve_bootmem(phys, len, flags);
1117}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 819dad973b13..a25cc6fa2207 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -18,6 +18,7 @@
18#include <linux/swap.h> 18#include <linux/swap.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/initrd.h>
21#include <linux/pagemap.h> 22#include <linux/pagemap.h>
22#include <linux/bootmem.h> 23#include <linux/bootmem.h>
23#include <linux/proc_fs.h> 24#include <linux/proc_fs.h>
@@ -47,6 +48,14 @@
47#include <asm/numa.h> 48#include <asm/numa.h>
48#include <asm/cacheflush.h> 49#include <asm/cacheflush.h>
49 50
51/*
52 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
53 * The direct mapping extends to max_pfn_mapped, so that we can directly access
54 * apertures, ACPI and other tables without having to play with fixmaps.
55 */
56unsigned long max_low_pfn_mapped;
57unsigned long max_pfn_mapped;
58
50static unsigned long dma_reserve __initdata; 59static unsigned long dma_reserve __initdata;
51 60
52DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 61DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -135,26 +144,17 @@ static __init void *spp_getpage(void)
135 return ptr; 144 return ptr;
136} 145}
137 146
138static __init void 147void
139set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) 148set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
140{ 149{
141 pgd_t *pgd;
142 pud_t *pud; 150 pud_t *pud;
143 pmd_t *pmd; 151 pmd_t *pmd;
144 pte_t *pte, new_pte; 152 pte_t *pte;
145
146 pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
147 153
148 pgd = pgd_offset_k(vaddr); 154 pud = pud_page + pud_index(vaddr);
149 if (pgd_none(*pgd)) {
150 printk(KERN_ERR
151 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
152 return;
153 }
154 pud = pud_offset(pgd, vaddr);
155 if (pud_none(*pud)) { 155 if (pud_none(*pud)) {
156 pmd = (pmd_t *) spp_getpage(); 156 pmd = (pmd_t *) spp_getpage();
157 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); 157 pud_populate(&init_mm, pud, pmd);
158 if (pmd != pmd_offset(pud, 0)) { 158 if (pmd != pmd_offset(pud, 0)) {
159 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", 159 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
160 pmd, pmd_offset(pud, 0)); 160 pmd, pmd_offset(pud, 0));
@@ -164,13 +164,12 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
164 pmd = pmd_offset(pud, vaddr); 164 pmd = pmd_offset(pud, vaddr);
165 if (pmd_none(*pmd)) { 165 if (pmd_none(*pmd)) {
166 pte = (pte_t *) spp_getpage(); 166 pte = (pte_t *) spp_getpage();
167 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); 167 pmd_populate_kernel(&init_mm, pmd, pte);
168 if (pte != pte_offset_kernel(pmd, 0)) { 168 if (pte != pte_offset_kernel(pmd, 0)) {
169 printk(KERN_ERR "PAGETABLE BUG #02!\n"); 169 printk(KERN_ERR "PAGETABLE BUG #02!\n");
170 return; 170 return;
171 } 171 }
172 } 172 }
173 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
174 173
175 pte = pte_offset_kernel(pmd, vaddr); 174 pte = pte_offset_kernel(pmd, vaddr);
176 if (!pte_none(*pte) && pte_val(new_pte) && 175 if (!pte_none(*pte) && pte_val(new_pte) &&
@@ -185,6 +184,64 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
185 __flush_tlb_one(vaddr); 184 __flush_tlb_one(vaddr);
186} 185}
187 186
187void
188set_pte_vaddr(unsigned long vaddr, pte_t pteval)
189{
190 pgd_t *pgd;
191 pud_t *pud_page;
192
193 pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
194
195 pgd = pgd_offset_k(vaddr);
196 if (pgd_none(*pgd)) {
197 printk(KERN_ERR
198 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
199 return;
200 }
201 pud_page = (pud_t*)pgd_page_vaddr(*pgd);
202 set_pte_vaddr_pud(pud_page, vaddr, pteval);
203}
204
205/*
206 * Create large page table mappings for a range of physical addresses.
207 */
208static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
209 pgprot_t prot)
210{
211 pgd_t *pgd;
212 pud_t *pud;
213 pmd_t *pmd;
214
215 BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
216 for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
217 pgd = pgd_offset_k((unsigned long)__va(phys));
218 if (pgd_none(*pgd)) {
219 pud = (pud_t *) spp_getpage();
220 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
221 _PAGE_USER));
222 }
223 pud = pud_offset(pgd, (unsigned long)__va(phys));
224 if (pud_none(*pud)) {
225 pmd = (pmd_t *) spp_getpage();
226 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
227 _PAGE_USER));
228 }
229 pmd = pmd_offset(pud, phys);
230 BUG_ON(!pmd_none(*pmd));
231 set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
232 }
233}
234
235void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
236{
237 __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
238}
239
240void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
241{
242 __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
243}
244
188/* 245/*
189 * The head.S code sets up the kernel high mapping: 246 * The head.S code sets up the kernel high mapping:
190 * 247 *
@@ -213,20 +270,9 @@ void __init cleanup_highmap(void)
213 } 270 }
214} 271}
215 272
216/* NOTE: this is meant to be run only at boot */
217void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
218{
219 unsigned long address = __fix_to_virt(idx);
220
221 if (idx >= __end_of_fixed_addresses) {
222 printk(KERN_ERR "Invalid __set_fixmap\n");
223 return;
224 }
225 set_pte_phys(address, phys, prot);
226}
227
228static unsigned long __initdata table_start; 273static unsigned long __initdata table_start;
229static unsigned long __meminitdata table_end; 274static unsigned long __meminitdata table_end;
275static unsigned long __meminitdata table_top;
230 276
231static __meminit void *alloc_low_page(unsigned long *phys) 277static __meminit void *alloc_low_page(unsigned long *phys)
232{ 278{
@@ -240,7 +286,7 @@ static __meminit void *alloc_low_page(unsigned long *phys)
240 return adr; 286 return adr;
241 } 287 }
242 288
243 if (pfn >= end_pfn) 289 if (pfn >= table_top)
244 panic("alloc_low_page: ran out of memory"); 290 panic("alloc_low_page: ran out of memory");
245 291
246 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); 292 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -257,65 +303,61 @@ static __meminit void unmap_low_page(void *adr)
257 early_iounmap(adr, PAGE_SIZE); 303 early_iounmap(adr, PAGE_SIZE);
258} 304}
259 305
260/* Must run before zap_low_mappings */ 306static unsigned long __meminit
261__meminit void *early_ioremap(unsigned long addr, unsigned long size) 307phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
262{ 308{
263 pmd_t *pmd, *last_pmd; 309 unsigned pages = 0;
264 unsigned long vaddr; 310 unsigned long last_map_addr = end;
265 int i, pmds; 311 int i;
312
313 pte_t *pte = pte_page + pte_index(addr);
266 314
267 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; 315 for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
268 vaddr = __START_KERNEL_map;
269 pmd = level2_kernel_pgt;
270 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
271 316
272 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { 317 if (addr >= end) {
273 for (i = 0; i < pmds; i++) { 318 if (!after_bootmem) {
274 if (pmd_present(pmd[i])) 319 for(; i < PTRS_PER_PTE; i++, pte++)
275 goto continue_outer_loop; 320 set_pte(pte, __pte(0));
321 }
322 break;
276 } 323 }
277 vaddr += addr & ~PMD_MASK;
278 addr &= PMD_MASK;
279 324
280 for (i = 0; i < pmds; i++, addr += PMD_SIZE) 325 if (pte_val(*pte))
281 set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); 326 continue;
282 __flush_tlb_all();
283 327
284 return (void *)vaddr; 328 if (0)
285continue_outer_loop: 329 printk(" pte=%p addr=%lx pte=%016lx\n",
286 ; 330 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
331 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
332 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
333 pages++;
287 } 334 }
288 printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size); 335 update_page_count(PG_LEVEL_4K, pages);
289 336
290 return NULL; 337 return last_map_addr;
291} 338}
292 339
293/* 340static unsigned long __meminit
294 * To avoid virtual aliases later: 341phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
295 */
296__meminit void early_iounmap(void *addr, unsigned long size)
297{ 342{
298 unsigned long vaddr; 343 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
299 pmd_t *pmd;
300 int i, pmds;
301
302 vaddr = (unsigned long)addr;
303 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
304 pmd = level2_kernel_pgt + pmd_index(vaddr);
305 344
306 for (i = 0; i < pmds; i++) 345 return phys_pte_init(pte, address, end);
307 pmd_clear(pmd + i);
308
309 __flush_tlb_all();
310} 346}
311 347
312static unsigned long __meminit 348static unsigned long __meminit
313phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) 349phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
350 unsigned long page_size_mask)
314{ 351{
352 unsigned long pages = 0;
353 unsigned long last_map_addr = end;
354
315 int i = pmd_index(address); 355 int i = pmd_index(address);
316 356
317 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { 357 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
358 unsigned long pte_phys;
318 pmd_t *pmd = pmd_page + pmd_index(address); 359 pmd_t *pmd = pmd_page + pmd_index(address);
360 pte_t *pte;
319 361
320 if (address >= end) { 362 if (address >= end) {
321 if (!after_bootmem) { 363 if (!after_bootmem) {
@@ -325,31 +367,50 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
325 break; 367 break;
326 } 368 }
327 369
328 if (pmd_val(*pmd)) 370 if (pmd_val(*pmd)) {
371 if (!pmd_large(*pmd))
372 last_map_addr = phys_pte_update(pmd, address,
373 end);
374 continue;
375 }
376
377 if (page_size_mask & (1<<PG_LEVEL_2M)) {
378 pages++;
379 set_pte((pte_t *)pmd,
380 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
381 last_map_addr = (address & PMD_MASK) + PMD_SIZE;
329 continue; 382 continue;
383 }
330 384
331 set_pte((pte_t *)pmd, 385 pte = alloc_low_page(&pte_phys);
332 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 386 last_map_addr = phys_pte_init(pte, address, end);
387 unmap_low_page(pte);
388
389 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
333 } 390 }
334 return address; 391 update_page_count(PG_LEVEL_2M, pages);
392 return last_map_addr;
335} 393}
336 394
337static unsigned long __meminit 395static unsigned long __meminit
338phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) 396phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
397 unsigned long page_size_mask)
339{ 398{
340 pmd_t *pmd = pmd_offset(pud, 0); 399 pmd_t *pmd = pmd_offset(pud, 0);
341 unsigned long last_map_addr; 400 unsigned long last_map_addr;
342 401
343 spin_lock(&init_mm.page_table_lock); 402 spin_lock(&init_mm.page_table_lock);
344 last_map_addr = phys_pmd_init(pmd, address, end); 403 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
345 spin_unlock(&init_mm.page_table_lock); 404 spin_unlock(&init_mm.page_table_lock);
346 __flush_tlb_all(); 405 __flush_tlb_all();
347 return last_map_addr; 406 return last_map_addr;
348} 407}
349 408
350static unsigned long __meminit 409static unsigned long __meminit
351phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) 410phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
411 unsigned long page_size_mask)
352{ 412{
413 unsigned long pages = 0;
353 unsigned long last_map_addr = end; 414 unsigned long last_map_addr = end;
354 int i = pud_index(addr); 415 int i = pud_index(addr);
355 416
@@ -369,11 +430,13 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
369 430
370 if (pud_val(*pud)) { 431 if (pud_val(*pud)) {
371 if (!pud_large(*pud)) 432 if (!pud_large(*pud))
372 last_map_addr = phys_pmd_update(pud, addr, end); 433 last_map_addr = phys_pmd_update(pud, addr, end,
434 page_size_mask);
373 continue; 435 continue;
374 } 436 }
375 437
376 if (direct_gbpages) { 438 if (page_size_mask & (1<<PG_LEVEL_1G)) {
439 pages++;
377 set_pte((pte_t *)pud, 440 set_pte((pte_t *)pud,
378 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 441 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
379 last_map_addr = (addr & PUD_MASK) + PUD_SIZE; 442 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
@@ -383,27 +446,50 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
383 pmd = alloc_low_page(&pmd_phys); 446 pmd = alloc_low_page(&pmd_phys);
384 447
385 spin_lock(&init_mm.page_table_lock); 448 spin_lock(&init_mm.page_table_lock);
386 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); 449 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
387 last_map_addr = phys_pmd_init(pmd, addr, end); 450 unmap_low_page(pmd);
451 pud_populate(&init_mm, pud, __va(pmd_phys));
388 spin_unlock(&init_mm.page_table_lock); 452 spin_unlock(&init_mm.page_table_lock);
389 453
390 unmap_low_page(pmd);
391 } 454 }
392 __flush_tlb_all(); 455 __flush_tlb_all();
456 update_page_count(PG_LEVEL_1G, pages);
393 457
394 return last_map_addr >> PAGE_SHIFT; 458 return last_map_addr;
459}
460
461static unsigned long __meminit
462phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
463 unsigned long page_size_mask)
464{
465 pud_t *pud;
466
467 pud = (pud_t *)pgd_page_vaddr(*pgd);
468
469 return phys_pud_init(pud, addr, end, page_size_mask);
395} 470}
396 471
397static void __init find_early_table_space(unsigned long end) 472static void __init find_early_table_space(unsigned long end)
398{ 473{
399 unsigned long puds, pmds, tables, start; 474 unsigned long puds, pmds, ptes, tables, start;
400 475
401 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 476 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
402 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); 477 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
403 if (!direct_gbpages) { 478 if (direct_gbpages) {
479 unsigned long extra;
480 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
481 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
482 } else
404 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 483 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
405 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); 484 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
406 } 485
486 if (cpu_has_pse) {
487 unsigned long extra;
488 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
489 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
490 } else
491 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
492 tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
407 493
408 /* 494 /*
409 * RED-PEN putting page tables only on node 0 could 495 * RED-PEN putting page tables only on node 0 could
@@ -417,10 +503,10 @@ static void __init find_early_table_space(unsigned long end)
417 503
418 table_start >>= PAGE_SHIFT; 504 table_start >>= PAGE_SHIFT;
419 table_end = table_start; 505 table_end = table_start;
506 table_top = table_start + (tables >> PAGE_SHIFT);
420 507
421 early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", 508 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
422 end, table_start << PAGE_SHIFT, 509 end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
423 (table_start << PAGE_SHIFT) + tables);
424} 510}
425 511
426static void __init init_gbpages(void) 512static void __init init_gbpages(void)
@@ -431,7 +517,7 @@ static void __init init_gbpages(void)
431 direct_gbpages = 0; 517 direct_gbpages = 0;
432} 518}
433 519
434#ifdef CONFIG_MEMTEST_BOOTPARAM 520#ifdef CONFIG_MEMTEST
435 521
436static void __init memtest(unsigned long start_phys, unsigned long size, 522static void __init memtest(unsigned long start_phys, unsigned long size,
437 unsigned pattern) 523 unsigned pattern)
@@ -493,7 +579,8 @@ static void __init memtest(unsigned long start_phys, unsigned long size,
493 579
494} 580}
495 581
496static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE; 582/* default is disabled */
583static int memtest_pattern __initdata;
497 584
498static int __init parse_memtest(char *arg) 585static int __init parse_memtest(char *arg)
499{ 586{
@@ -542,15 +629,85 @@ static void __init early_memtest(unsigned long start, unsigned long end)
542} 629}
543#endif 630#endif
544 631
632static unsigned long __init kernel_physical_mapping_init(unsigned long start,
633 unsigned long end,
634 unsigned long page_size_mask)
635{
636
637 unsigned long next, last_map_addr = end;
638
639 start = (unsigned long)__va(start);
640 end = (unsigned long)__va(end);
641
642 for (; start < end; start = next) {
643 pgd_t *pgd = pgd_offset_k(start);
644 unsigned long pud_phys;
645 pud_t *pud;
646
647 next = start + PGDIR_SIZE;
648 if (next > end)
649 next = end;
650
651 if (pgd_val(*pgd)) {
652 last_map_addr = phys_pud_update(pgd, __pa(start),
653 __pa(end), page_size_mask);
654 continue;
655 }
656
657 if (after_bootmem)
658 pud = pud_offset(pgd, start & PGDIR_MASK);
659 else
660 pud = alloc_low_page(&pud_phys);
661
662 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
663 page_size_mask);
664 unmap_low_page(pud);
665 pgd_populate(&init_mm, pgd_offset_k(start),
666 __va(pud_phys));
667 }
668
669 return last_map_addr;
670}
671
672struct map_range {
673 unsigned long start;
674 unsigned long end;
675 unsigned page_size_mask;
676};
677
678#define NR_RANGE_MR 5
679
680static int save_mr(struct map_range *mr, int nr_range,
681 unsigned long start_pfn, unsigned long end_pfn,
682 unsigned long page_size_mask)
683{
684
685 if (start_pfn < end_pfn) {
686 if (nr_range >= NR_RANGE_MR)
687 panic("run out of range for init_memory_mapping\n");
688 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
689 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
690 mr[nr_range].page_size_mask = page_size_mask;
691 nr_range++;
692 }
693
694 return nr_range;
695}
696
545/* 697/*
546 * Setup the direct mapping of the physical memory at PAGE_OFFSET. 698 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
547 * This runs before bootmem is initialized and gets pages directly from 699 * This runs before bootmem is initialized and gets pages directly from
548 * the physical memory. To access them they are temporarily mapped. 700 * the physical memory. To access them they are temporarily mapped.
549 */ 701 */
550unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) 702unsigned long __init_refok init_memory_mapping(unsigned long start,
703 unsigned long end)
551{ 704{
552 unsigned long next, last_map_addr = end; 705 unsigned long last_map_addr = 0;
553 unsigned long start_phys = start, end_phys = end; 706 unsigned long page_size_mask = 0;
707 unsigned long start_pfn, end_pfn;
708
709 struct map_range mr[NR_RANGE_MR];
710 int nr_range, i;
554 711
555 printk(KERN_INFO "init_memory_mapping\n"); 712 printk(KERN_INFO "init_memory_mapping\n");
556 713
@@ -561,48 +718,115 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon
561 * memory mapped. Unfortunately this is done currently before the 718 * memory mapped. Unfortunately this is done currently before the
562 * nodes are discovered. 719 * nodes are discovered.
563 */ 720 */
564 if (!after_bootmem) { 721 if (!after_bootmem)
565 init_gbpages(); 722 init_gbpages();
566 find_early_table_space(end); 723
724 if (direct_gbpages)
725 page_size_mask |= 1 << PG_LEVEL_1G;
726 if (cpu_has_pse)
727 page_size_mask |= 1 << PG_LEVEL_2M;
728
729 memset(mr, 0, sizeof(mr));
730 nr_range = 0;
731
732 /* head if not big page alignment ?*/
733 start_pfn = start >> PAGE_SHIFT;
734 end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
735 << (PMD_SHIFT - PAGE_SHIFT);
736 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
737
738 /* big page (2M) range*/
739 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
740 << (PMD_SHIFT - PAGE_SHIFT);
741 end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
742 << (PUD_SHIFT - PAGE_SHIFT);
743 if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
744 end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
745 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
746 page_size_mask & (1<<PG_LEVEL_2M));
747
748 /* big page (1G) range */
749 start_pfn = end_pfn;
750 end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
751 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
752 page_size_mask &
753 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
754
755 /* tail is not big page (1G) alignment */
756 start_pfn = end_pfn;
757 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
758 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
759 page_size_mask & (1<<PG_LEVEL_2M));
760
761 /* tail is not big page (2M) alignment */
762 start_pfn = end_pfn;
763 end_pfn = end>>PAGE_SHIFT;
764 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
765
766 /* try to merge same page size and continuous */
767 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
768 unsigned long old_start;
769 if (mr[i].end != mr[i+1].start ||
770 mr[i].page_size_mask != mr[i+1].page_size_mask)
771 continue;
772 /* move it */
773 old_start = mr[i].start;
774 memmove(&mr[i], &mr[i+1],
775 (nr_range - 1 - i) * sizeof (struct map_range));
776 mr[i].start = old_start;
777 nr_range--;
567 } 778 }
568 779
569 start = (unsigned long)__va(start); 780 for (i = 0; i < nr_range; i++)
570 end = (unsigned long)__va(end); 781 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
782 mr[i].start, mr[i].end,
783 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
784 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
571 785
572 for (; start < end; start = next) { 786 if (!after_bootmem)
573 pgd_t *pgd = pgd_offset_k(start); 787 find_early_table_space(end);
574 unsigned long pud_phys;
575 pud_t *pud;
576
577 if (after_bootmem)
578 pud = pud_offset(pgd, start & PGDIR_MASK);
579 else
580 pud = alloc_low_page(&pud_phys);
581 788
582 next = start + PGDIR_SIZE; 789 for (i = 0; i < nr_range; i++)
583 if (next > end) 790 last_map_addr = kernel_physical_mapping_init(
584 next = end; 791 mr[i].start, mr[i].end,
585 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); 792 mr[i].page_size_mask);
586 if (!after_bootmem)
587 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
588 unmap_low_page(pud);
589 }
590 793
591 if (!after_bootmem) 794 if (!after_bootmem)
592 mmu_cr4_features = read_cr4(); 795 mmu_cr4_features = read_cr4();
593 __flush_tlb_all(); 796 __flush_tlb_all();
594 797
595 if (!after_bootmem) 798 if (!after_bootmem && table_end > table_start)
596 reserve_early(table_start << PAGE_SHIFT, 799 reserve_early(table_start << PAGE_SHIFT,
597 table_end << PAGE_SHIFT, "PGTABLE"); 800 table_end << PAGE_SHIFT, "PGTABLE");
598 801
802 printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
803 last_map_addr, end);
804
599 if (!after_bootmem) 805 if (!after_bootmem)
600 early_memtest(start_phys, end_phys); 806 early_memtest(start, end);
601 807
602 return last_map_addr; 808 return last_map_addr >> PAGE_SHIFT;
603} 809}
604 810
605#ifndef CONFIG_NUMA 811#ifndef CONFIG_NUMA
812void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
813{
814 unsigned long bootmap_size, bootmap;
815
816 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
817 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
818 PAGE_SIZE);
819 if (bootmap == -1L)
820 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
821 /* don't touch min_low_pfn */
822 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
823 0, end_pfn);
824 e820_register_active_regions(0, start_pfn, end_pfn);
825 free_bootmem_with_active_regions(0, end_pfn);
826 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
827 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
828}
829
606void __init paging_init(void) 830void __init paging_init(void)
607{ 831{
608 unsigned long max_zone_pfns[MAX_NR_ZONES]; 832 unsigned long max_zone_pfns[MAX_NR_ZONES];
@@ -610,9 +834,9 @@ void __init paging_init(void)
610 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 834 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
611 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 835 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
612 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 836 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
613 max_zone_pfns[ZONE_NORMAL] = end_pfn; 837 max_zone_pfns[ZONE_NORMAL] = max_pfn;
614 838
615 memory_present(0, 0, end_pfn); 839 memory_present(0, 0, max_pfn);
616 sparse_init(); 840 sparse_init();
617 free_area_init_nodes(max_zone_pfns); 841 free_area_init_nodes(max_zone_pfns);
618} 842}
@@ -694,8 +918,8 @@ void __init mem_init(void)
694#else 918#else
695 totalram_pages = free_all_bootmem(); 919 totalram_pages = free_all_bootmem();
696#endif 920#endif
697 reservedpages = end_pfn - totalram_pages - 921 reservedpages = max_pfn - totalram_pages -
698 absent_pages_in_range(0, end_pfn); 922 absent_pages_in_range(0, max_pfn);
699 after_bootmem = 1; 923 after_bootmem = 1;
700 924
701 codesize = (unsigned long) &_etext - (unsigned long) &_text; 925 codesize = (unsigned long) &_etext - (unsigned long) &_text;
@@ -714,7 +938,7 @@ void __init mem_init(void)
714 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " 938 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
715 "%ldk reserved, %ldk data, %ldk init)\n", 939 "%ldk reserved, %ldk data, %ldk init)\n",
716 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 940 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
717 end_pfn << (PAGE_SHIFT-10), 941 max_pfn << (PAGE_SHIFT-10),
718 codesize >> 10, 942 codesize >> 10,
719 reservedpages << (PAGE_SHIFT-10), 943 reservedpages << (PAGE_SHIFT-10),
720 datasize >> 10, 944 datasize >> 10,
@@ -799,24 +1023,26 @@ void free_initrd_mem(unsigned long start, unsigned long end)
799} 1023}
800#endif 1024#endif
801 1025
802void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 1026int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1027 int flags)
803{ 1028{
804#ifdef CONFIG_NUMA 1029#ifdef CONFIG_NUMA
805 int nid, next_nid; 1030 int nid, next_nid;
1031 int ret;
806#endif 1032#endif
807 unsigned long pfn = phys >> PAGE_SHIFT; 1033 unsigned long pfn = phys >> PAGE_SHIFT;
808 1034
809 if (pfn >= end_pfn) { 1035 if (pfn >= max_pfn) {
810 /* 1036 /*
811 * This can happen with kdump kernels when accessing 1037 * This can happen with kdump kernels when accessing
812 * firmware tables: 1038 * firmware tables:
813 */ 1039 */
814 if (pfn < max_pfn_mapped) 1040 if (pfn < max_pfn_mapped)
815 return; 1041 return -EFAULT;
816 1042
817 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", 1043 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
818 phys, len); 1044 phys, len);
819 return; 1045 return -EFAULT;
820 } 1046 }
821 1047
822 /* Should check here against the e820 map to avoid double free */ 1048 /* Should check here against the e820 map to avoid double free */
@@ -824,9 +1050,13 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
824 nid = phys_to_nid(phys); 1050 nid = phys_to_nid(phys);
825 next_nid = phys_to_nid(phys + len - 1); 1051 next_nid = phys_to_nid(phys + len - 1);
826 if (nid == next_nid) 1052 if (nid == next_nid)
827 reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); 1053 ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
828 else 1054 else
829 reserve_bootmem(phys, len, BOOTMEM_DEFAULT); 1055 ret = reserve_bootmem(phys, len, flags);
1056
1057 if (ret != 0)
1058 return ret;
1059
830#else 1060#else
831 reserve_bootmem(phys, len, BOOTMEM_DEFAULT); 1061 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
832#endif 1062#endif
@@ -835,6 +1065,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
835 dma_reserve += len / PAGE_SIZE; 1065 dma_reserve += len / PAGE_SIZE;
836 set_dma_reserve(dma_reserve); 1066 set_dma_reserve(dma_reserve);
837 } 1067 }
1068
1069 return 0;
838} 1070}
839 1071
840int kern_addr_valid(unsigned long addr) 1072int kern_addr_valid(unsigned long addr)
@@ -939,7 +1171,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
939 pmd_t *pmd; 1171 pmd_t *pmd;
940 1172
941 for (; addr < end; addr = next) { 1173 for (; addr < end; addr = next) {
942 next = pmd_addr_end(addr, end); 1174 void *p = NULL;
943 1175
944 pgd = vmemmap_pgd_populate(addr, node); 1176 pgd = vmemmap_pgd_populate(addr, node);
945 if (!pgd) 1177 if (!pgd)
@@ -949,33 +1181,51 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
949 if (!pud) 1181 if (!pud)
950 return -ENOMEM; 1182 return -ENOMEM;
951 1183
952 pmd = pmd_offset(pud, addr); 1184 if (!cpu_has_pse) {
953 if (pmd_none(*pmd)) { 1185 next = (addr + PAGE_SIZE) & PAGE_MASK;
954 pte_t entry; 1186 pmd = vmemmap_pmd_populate(pud, addr, node);
955 void *p; 1187
1188 if (!pmd)
1189 return -ENOMEM;
1190
1191 p = vmemmap_pte_populate(pmd, addr, node);
956 1192
957 p = vmemmap_alloc_block(PMD_SIZE, node);
958 if (!p) 1193 if (!p)
959 return -ENOMEM; 1194 return -ENOMEM;
960 1195
961 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, 1196 addr_end = addr + PAGE_SIZE;
962 PAGE_KERNEL_LARGE); 1197 p_end = p + PAGE_SIZE;
963 set_pmd(pmd, __pmd(pte_val(entry)));
964
965 /* check to see if we have contiguous blocks */
966 if (p_end != p || node_start != node) {
967 if (p_start)
968 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
969 addr_start, addr_end-1, p_start, p_end-1, node_start);
970 addr_start = addr;
971 node_start = node;
972 p_start = p;
973 }
974 addr_end = addr + PMD_SIZE;
975 p_end = p + PMD_SIZE;
976 } else { 1198 } else {
977 vmemmap_verify((pte_t *)pmd, node, addr, next); 1199 next = pmd_addr_end(addr, end);
1200
1201 pmd = pmd_offset(pud, addr);
1202 if (pmd_none(*pmd)) {
1203 pte_t entry;
1204
1205 p = vmemmap_alloc_block(PMD_SIZE, node);
1206 if (!p)
1207 return -ENOMEM;
1208
1209 entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
1210 PAGE_KERNEL_LARGE);
1211 set_pmd(pmd, __pmd(pte_val(entry)));
1212
1213 /* check to see if we have contiguous blocks */
1214 if (p_end != p || node_start != node) {
1215 if (p_start)
1216 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1217 addr_start, addr_end-1, p_start, p_end-1, node_start);
1218 addr_start = addr;
1219 node_start = node;
1220 p_start = p;
1221 }
1222
1223 addr_end = addr + PMD_SIZE;
1224 p_end = p + PMD_SIZE;
1225 } else
1226 vmemmap_verify((pte_t *)pmd, node, addr, next);
978 } 1227 }
1228
979 } 1229 }
980 return 0; 1230 return 0;
981} 1231}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index d1b867101e5f..115f13ee40c9 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -142,7 +142,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
142 /* 142 /*
143 * Don't remap the low PCI/ISA area, it's always mapped.. 143 * Don't remap the low PCI/ISA area, it's always mapped..
144 */ 144 */
145 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) 145 if (is_ISA_range(phys_addr, last_addr))
146 return (__force void __iomem *)phys_to_virt(phys_addr); 146 return (__force void __iomem *)phys_to_virt(phys_addr);
147 147
148 /* 148 /*
@@ -261,7 +261,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
261{ 261{
262 /* 262 /*
263 * Ideally, this should be: 263 * Ideally, this should be:
264 * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS; 264 * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
265 * 265 *
266 * Till we fix all X drivers to use ioremap_wc(), we will use 266 * Till we fix all X drivers to use ioremap_wc(), we will use
267 * UC MINUS. 267 * UC MINUS.
@@ -285,7 +285,7 @@ EXPORT_SYMBOL(ioremap_nocache);
285 */ 285 */
286void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) 286void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
287{ 287{
288 if (pat_wc_enabled) 288 if (pat_enabled)
289 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, 289 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
290 __builtin_return_address(0)); 290 __builtin_return_address(0));
291 else 291 else
@@ -341,8 +341,8 @@ void iounmap(volatile void __iomem *addr)
341 * vm_area and by simply returning an address into the kernel mapping 341 * vm_area and by simply returning an address into the kernel mapping
342 * of ISA space. So handle that here. 342 * of ISA space. So handle that here.
343 */ 343 */
344 if (addr >= phys_to_virt(ISA_START_ADDRESS) && 344 if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
345 addr < phys_to_virt(ISA_END_ADDRESS)) 345 (void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
346 return; 346 return;
347 347
348 addr = (volatile void __iomem *) 348 addr = (volatile void __iomem *)
@@ -355,7 +355,7 @@ void iounmap(volatile void __iomem *addr)
355 cpa takes care of the direct mappings. */ 355 cpa takes care of the direct mappings. */
356 read_lock(&vmlist_lock); 356 read_lock(&vmlist_lock);
357 for (p = vmlist; p; p = p->next) { 357 for (p = vmlist; p; p = p->next) {
358 if (p->addr == addr) 358 if (p->addr == (void __force *)addr)
359 break; 359 break;
360 } 360 }
361 read_unlock(&vmlist_lock); 361 read_unlock(&vmlist_lock);
@@ -369,7 +369,7 @@ void iounmap(volatile void __iomem *addr)
369 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); 369 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
370 370
371 /* Finally remove it */ 371 /* Finally remove it */
372 o = remove_vm_area((void *)addr); 372 o = remove_vm_area((void __force *)addr);
373 BUG_ON(p != o || o == NULL); 373 BUG_ON(p != o || o == NULL);
374 kfree(p); 374 kfree(p);
375} 375}
@@ -388,7 +388,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
388 if (page_is_ram(start >> PAGE_SHIFT)) 388 if (page_is_ram(start >> PAGE_SHIFT))
389 return __va(phys); 389 return __va(phys);
390 390
391 addr = (void *)ioremap_default(start, PAGE_SIZE); 391 addr = (void __force *)ioremap_default(start, PAGE_SIZE);
392 if (addr) 392 if (addr)
393 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); 393 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
394 394
@@ -404,8 +404,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
404 return; 404 return;
405} 405}
406 406
407#ifdef CONFIG_X86_32
408
409int __initdata early_ioremap_debug; 407int __initdata early_ioremap_debug;
410 408
411static int __init early_ioremap_debug_setup(char *str) 409static int __init early_ioremap_debug_setup(char *str)
@@ -417,8 +415,7 @@ static int __init early_ioremap_debug_setup(char *str)
417early_param("early_ioremap_debug", early_ioremap_debug_setup); 415early_param("early_ioremap_debug", early_ioremap_debug_setup);
418 416
419static __initdata int after_paging_init; 417static __initdata int after_paging_init;
420static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] 418static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
421 __section(.bss.page_aligned);
422 419
423static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) 420static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
424{ 421{
@@ -507,10 +504,11 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
507 return; 504 return;
508 } 505 }
509 pte = early_ioremap_pte(addr); 506 pte = early_ioremap_pte(addr);
507
510 if (pgprot_val(flags)) 508 if (pgprot_val(flags))
511 set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); 509 set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
512 else 510 else
513 pte_clear(NULL, addr, pte); 511 pte_clear(&init_mm, addr, pte);
514 __flush_tlb_one(addr); 512 __flush_tlb_one(addr);
515} 513}
516 514
@@ -648,5 +646,3 @@ void __this_fixmap_does_not_exist(void)
648{ 646{
649 WARN_ON(1); 647 WARN_ON(1);
650} 648}
651
652#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 1f476e477844..41f1b5c00a1d 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -22,6 +22,7 @@
22#include <asm/numa.h> 22#include <asm/numa.h>
23#include <asm/mpspec.h> 23#include <asm/mpspec.h>
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/k8.h>
25 26
26static __init int find_northbridge(void) 27static __init int find_northbridge(void)
27{ 28{
@@ -56,34 +57,33 @@ static __init void early_get_boot_cpu_id(void)
56 /* 57 /*
57 * Find possible boot-time SMP configuration: 58 * Find possible boot-time SMP configuration:
58 */ 59 */
60#ifdef CONFIG_X86_MPPARSE
59 early_find_smp_config(); 61 early_find_smp_config();
62#endif
60#ifdef CONFIG_ACPI 63#ifdef CONFIG_ACPI
61 /* 64 /*
62 * Read APIC information from ACPI tables. 65 * Read APIC information from ACPI tables.
63 */ 66 */
64 early_acpi_boot_init(); 67 early_acpi_boot_init();
65#endif 68#endif
69#ifdef CONFIG_X86_MPPARSE
66 /* 70 /*
67 * get boot-time SMP configuration: 71 * get boot-time SMP configuration:
68 */ 72 */
69 if (smp_found_config) 73 if (smp_found_config)
70 early_get_smp_config(); 74 early_get_smp_config();
75#endif
71 early_init_lapic_mapping(); 76 early_init_lapic_mapping();
72} 77}
73 78
74int __init k8_scan_nodes(unsigned long start, unsigned long end) 79int __init k8_scan_nodes(unsigned long start, unsigned long end)
75{ 80{
81 unsigned numnodes, cores, bits, apicid_base;
76 unsigned long prevbase; 82 unsigned long prevbase;
77 struct bootnode nodes[8]; 83 struct bootnode nodes[8];
78 int nodeid, i, nb;
79 unsigned char nodeids[8]; 84 unsigned char nodeids[8];
80 int found = 0; 85 int i, j, nb, found = 0;
81 u32 reg; 86 u32 nodeid, reg;
82 unsigned numnodes;
83 unsigned cores;
84 unsigned bits;
85 int j;
86 unsigned apicid_base;
87 87
88 if (!early_pci_allowed()) 88 if (!early_pci_allowed())
89 return -1; 89 return -1;
@@ -105,7 +105,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
105 prevbase = 0; 105 prevbase = 0;
106 for (i = 0; i < 8; i++) { 106 for (i = 0; i < 8; i++) {
107 unsigned long base, limit; 107 unsigned long base, limit;
108 u32 nodeid;
109 108
110 base = read_pci_config(0, nb, 1, 0x40 + i*8); 109 base = read_pci_config(0, nb, 1, 0x40 + i*8);
111 limit = read_pci_config(0, nb, 1, 0x44 + i*8); 110 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
@@ -144,8 +143,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
144 limit |= (1<<24)-1; 143 limit |= (1<<24)-1;
145 limit++; 144 limit++;
146 145
147 if (limit > end_pfn << PAGE_SHIFT) 146 if (limit > max_pfn << PAGE_SHIFT)
148 limit = end_pfn << PAGE_SHIFT; 147 limit = max_pfn << PAGE_SHIFT;
149 if (limit <= base) 148 if (limit <= base)
150 continue; 149 continue;
151 150
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index c5066d519e5d..b432d5781773 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -27,30 +27,17 @@
27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
28EXPORT_SYMBOL(node_data); 28EXPORT_SYMBOL(node_data);
29 29
30bootmem_data_t plat_node_bdata[MAX_NUMNODES]; 30static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
31 31
32struct memnode memnode; 32struct memnode memnode;
33 33
34#ifdef CONFIG_SMP
35int x86_cpu_to_node_map_init[NR_CPUS] = {
36 [0 ... NR_CPUS-1] = NUMA_NO_NODE
37};
38void *x86_cpu_to_node_map_early_ptr;
39EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
40#endif
41DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
42EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
43
44s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 34s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
45 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 35 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
46}; 36};
47 37
48cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
49EXPORT_SYMBOL(node_to_cpumask_map);
50
51int numa_off __initdata; 38int numa_off __initdata;
52unsigned long __initdata nodemap_addr; 39static unsigned long __initdata nodemap_addr;
53unsigned long __initdata nodemap_size; 40static unsigned long __initdata nodemap_size;
54 41
55/* 42/*
56 * Given a shift value, try to populate memnodemap[] 43 * Given a shift value, try to populate memnodemap[]
@@ -99,7 +86,7 @@ static int __init allocate_cachealigned_memnodemap(void)
99 86
100 addr = 0x8000; 87 addr = 0x8000;
101 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 88 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
102 nodemap_addr = find_e820_area(addr, end_pfn<<PAGE_SHIFT, 89 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
103 nodemap_size, L1_CACHE_BYTES); 90 nodemap_size, L1_CACHE_BYTES);
104 if (nodemap_addr == -1UL) { 91 if (nodemap_addr == -1UL) {
105 printk(KERN_ERR 92 printk(KERN_ERR
@@ -192,7 +179,7 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
192void __init setup_node_bootmem(int nodeid, unsigned long start, 179void __init setup_node_bootmem(int nodeid, unsigned long start,
193 unsigned long end) 180 unsigned long end)
194{ 181{
195 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size; 182 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
196 unsigned long bootmap_start, nodedata_phys; 183 unsigned long bootmap_start, nodedata_phys;
197 void *bootmap; 184 void *bootmap;
198 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 185 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
@@ -204,7 +191,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
204 start, end); 191 start, end);
205 192
206 start_pfn = start >> PAGE_SHIFT; 193 start_pfn = start >> PAGE_SHIFT;
207 end_pfn = end >> PAGE_SHIFT; 194 last_pfn = end >> PAGE_SHIFT;
208 195
209 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, 196 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
210 SMP_CACHE_BYTES); 197 SMP_CACHE_BYTES);
@@ -217,7 +204,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
217 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 204 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
218 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 205 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
219 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 206 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
220 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; 207 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
221 208
222 /* 209 /*
223 * Find a place for the bootmem map 210 * Find a place for the bootmem map
@@ -226,14 +213,14 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
226 * early_node_mem will get that with find_e820_area instead 213 * early_node_mem will get that with find_e820_area instead
227 * of alloc_bootmem, that could clash with reserved range 214 * of alloc_bootmem, that could clash with reserved range
228 */ 215 */
229 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 216 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
230 nid = phys_to_nid(nodedata_phys); 217 nid = phys_to_nid(nodedata_phys);
231 if (nid == nodeid) 218 if (nid == nodeid)
232 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 219 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
233 else 220 else
234 bootmap_start = round_up(start, PAGE_SIZE); 221 bootmap_start = round_up(start, PAGE_SIZE);
235 /* 222 /*
236 * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like 223 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
237 * to use that to align to PAGE_SIZE 224 * to use that to align to PAGE_SIZE
238 */ 225 */
239 bootmap = early_node_mem(nodeid, bootmap_start, end, 226 bootmap = early_node_mem(nodeid, bootmap_start, end,
@@ -248,7 +235,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
248 235
249 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 236 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
250 bootmap_start >> PAGE_SHIFT, 237 bootmap_start >> PAGE_SHIFT,
251 start_pfn, end_pfn); 238 start_pfn, last_pfn);
252 239
253 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", 240 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
254 bootmap_start, bootmap_start + bootmap_size - 1, 241 bootmap_start, bootmap_start + bootmap_size - 1,
@@ -309,7 +296,7 @@ void __init numa_init_array(void)
309 296
310#ifdef CONFIG_NUMA_EMU 297#ifdef CONFIG_NUMA_EMU
311/* Numa emulation */ 298/* Numa emulation */
312char *cmdline __initdata; 299static char *cmdline __initdata;
313 300
314/* 301/*
315 * Setups up nid to range from addr to addr + size. If the end 302 * Setups up nid to range from addr to addr + size. If the end
@@ -413,15 +400,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
413} 400}
414 401
415/* 402/*
416 * Sets up the system RAM area from start_pfn to end_pfn according to the 403 * Sets up the system RAM area from start_pfn to last_pfn according to the
417 * numa=fake command-line option. 404 * numa=fake command-line option.
418 */ 405 */
419static struct bootnode nodes[MAX_NUMNODES] __initdata; 406static struct bootnode nodes[MAX_NUMNODES] __initdata;
420 407
421static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 408static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
422{ 409{
423 u64 size, addr = start_pfn << PAGE_SHIFT; 410 u64 size, addr = start_pfn << PAGE_SHIFT;
424 u64 max_addr = end_pfn << PAGE_SHIFT; 411 u64 max_addr = last_pfn << PAGE_SHIFT;
425 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; 412 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
426 413
427 memset(&nodes, 0, sizeof(nodes)); 414 memset(&nodes, 0, sizeof(nodes));
@@ -527,7 +514,7 @@ out:
527} 514}
528#endif /* CONFIG_NUMA_EMU */ 515#endif /* CONFIG_NUMA_EMU */
529 516
530void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 517void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
531{ 518{
532 int i; 519 int i;
533 520
@@ -535,7 +522,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
535 nodes_clear(node_online_map); 522 nodes_clear(node_online_map);
536 523
537#ifdef CONFIG_NUMA_EMU 524#ifdef CONFIG_NUMA_EMU
538 if (cmdline && !numa_emulation(start_pfn, end_pfn)) 525 if (cmdline && !numa_emulation(start_pfn, last_pfn))
539 return; 526 return;
540 nodes_clear(node_possible_map); 527 nodes_clear(node_possible_map);
541 nodes_clear(node_online_map); 528 nodes_clear(node_online_map);
@@ -543,7 +530,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
543 530
544#ifdef CONFIG_ACPI_NUMA 531#ifdef CONFIG_ACPI_NUMA
545 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 532 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
546 end_pfn << PAGE_SHIFT)) 533 last_pfn << PAGE_SHIFT))
547 return; 534 return;
548 nodes_clear(node_possible_map); 535 nodes_clear(node_possible_map);
549 nodes_clear(node_online_map); 536 nodes_clear(node_online_map);
@@ -551,7 +538,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
551 538
552#ifdef CONFIG_K8_NUMA 539#ifdef CONFIG_K8_NUMA
553 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, 540 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
554 end_pfn<<PAGE_SHIFT)) 541 last_pfn<<PAGE_SHIFT))
555 return; 542 return;
556 nodes_clear(node_possible_map); 543 nodes_clear(node_possible_map);
557 nodes_clear(node_online_map); 544 nodes_clear(node_online_map);
@@ -561,7 +548,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
561 548
562 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 549 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
563 start_pfn << PAGE_SHIFT, 550 start_pfn << PAGE_SHIFT,
564 end_pfn << PAGE_SHIFT); 551 last_pfn << PAGE_SHIFT);
565 /* setup dummy node covering all memory */ 552 /* setup dummy node covering all memory */
566 memnode_shift = 63; 553 memnode_shift = 63;
567 memnodemap = memnode.embedded_map; 554 memnodemap = memnode.embedded_map;
@@ -570,29 +557,8 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
570 node_set(0, node_possible_map); 557 node_set(0, node_possible_map);
571 for (i = 0; i < NR_CPUS; i++) 558 for (i = 0; i < NR_CPUS; i++)
572 numa_set_node(i, 0); 559 numa_set_node(i, 0);
573 /* cpumask_of_cpu() may not be available during early startup */ 560 e820_register_active_regions(0, start_pfn, last_pfn);
574 memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0])); 561 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
575 cpu_set(0, node_to_cpumask_map[0]);
576 e820_register_active_regions(0, start_pfn, end_pfn);
577 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
578}
579
580__cpuinit void numa_add_cpu(int cpu)
581{
582 set_bit(cpu,
583 (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
584}
585
586void __cpuinit numa_set_node(int cpu, int node)
587{
588 int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
589
590 if(cpu_to_node_map)
591 cpu_to_node_map[cpu] = node;
592 else if(per_cpu_offset(cpu))
593 per_cpu(x86_cpu_to_node_map, cpu) = node;
594 else
595 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
596} 562}
597 563
598unsigned long __init numa_free_all_bootmem(void) 564unsigned long __init numa_free_all_bootmem(void)
@@ -613,7 +579,7 @@ void __init paging_init(void)
613 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 579 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
614 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 580 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
615 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 581 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
616 max_zone_pfns[ZONE_NORMAL] = end_pfn; 582 max_zone_pfns[ZONE_NORMAL] = max_pfn;
617 583
618 sparse_memory_present_with_active_regions(MAX_NUMNODES); 584 sparse_memory_present_with_active_regions(MAX_NUMNODES);
619 sparse_init(); 585 sparse_init();
@@ -641,6 +607,7 @@ static __init int numa_setup(char *opt)
641} 607}
642early_param("numa", numa_setup); 608early_param("numa", numa_setup);
643 609
610#ifdef CONFIG_NUMA
644/* 611/*
645 * Setup early cpu_to_node. 612 * Setup early cpu_to_node.
646 * 613 *
@@ -652,14 +619,19 @@ early_param("numa", numa_setup);
652 * is already initialized in a round robin manner at numa_init_array, 619 * is already initialized in a round robin manner at numa_init_array,
653 * prior to this call, and this initialization is good enough 620 * prior to this call, and this initialization is good enough
654 * for the fake NUMA cases. 621 * for the fake NUMA cases.
622 *
623 * Called before the per_cpu areas are setup.
655 */ 624 */
656void __init init_cpu_to_node(void) 625void __init init_cpu_to_node(void)
657{ 626{
658 int i; 627 int cpu;
628 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
659 629
660 for (i = 0; i < NR_CPUS; i++) { 630 BUG_ON(cpu_to_apicid == NULL);
631
632 for_each_possible_cpu(cpu) {
661 int node; 633 int node;
662 u16 apicid = x86_cpu_to_apicid_init[i]; 634 u16 apicid = cpu_to_apicid[cpu];
663 635
664 if (apicid == BAD_APICID) 636 if (apicid == BAD_APICID)
665 continue; 637 continue;
@@ -668,8 +640,9 @@ void __init init_cpu_to_node(void)
668 continue; 640 continue;
669 if (!node_online(node)) 641 if (!node_online(node))
670 continue; 642 continue;
671 numa_set_node(i, node); 643 numa_set_node(cpu, node);
672 } 644 }
673} 645}
646#endif
674 647
675 648
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 75f1b109aae8..0dcd42eb94e6 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * self test for change_page_attr. 2 * self test for change_page_attr.
3 * 3 *
4 * Clears the global bit on random pages in the direct mapping, then reverts 4 * Clears the a test pte bit on random pages in the direct mapping,
5 * and compares page tables forwards and afterwards. 5 * then reverts and compares page tables forwards and afterwards.
6 */ 6 */
7#include <linux/bootmem.h> 7#include <linux/bootmem.h>
8#include <linux/kthread.h> 8#include <linux/kthread.h>
@@ -32,6 +32,13 @@ enum {
32 GPS = (1<<30) 32 GPS = (1<<30)
33}; 33};
34 34
35#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1)
36
37static int pte_testbit(pte_t pte)
38{
39 return pte_flags(pte) & _PAGE_UNUSED1;
40}
41
35struct split_state { 42struct split_state {
36 long lpg, gpg, spg, exec; 43 long lpg, gpg, spg, exec;
37 long min_exec, max_exec; 44 long min_exec, max_exec;
@@ -165,15 +172,14 @@ static int pageattr_test(void)
165 continue; 172 continue;
166 } 173 }
167 174
168 err = change_page_attr_clear(addr[i], len[i], 175 err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT);
169 __pgprot(_PAGE_GLOBAL));
170 if (err < 0) { 176 if (err < 0) {
171 printk(KERN_ERR "CPA %d failed %d\n", i, err); 177 printk(KERN_ERR "CPA %d failed %d\n", i, err);
172 failed++; 178 failed++;
173 } 179 }
174 180
175 pte = lookup_address(addr[i], &level); 181 pte = lookup_address(addr[i], &level);
176 if (!pte || pte_global(*pte) || pte_huge(*pte)) { 182 if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) {
177 printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i], 183 printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
178 pte ? (u64)pte_val(*pte) : 0ULL); 184 pte ? (u64)pte_val(*pte) : 0ULL);
179 failed++; 185 failed++;
@@ -198,14 +204,13 @@ static int pageattr_test(void)
198 failed++; 204 failed++;
199 continue; 205 continue;
200 } 206 }
201 err = change_page_attr_set(addr[i], len[i], 207 err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT);
202 __pgprot(_PAGE_GLOBAL));
203 if (err < 0) { 208 if (err < 0) {
204 printk(KERN_ERR "CPA reverting failed: %d\n", err); 209 printk(KERN_ERR "CPA reverting failed: %d\n", err);
205 failed++; 210 failed++;
206 } 211 }
207 pte = lookup_address(addr[i], &level); 212 pte = lookup_address(addr[i], &level);
208 if (!pte || !pte_global(*pte)) { 213 if (!pte || pte_testbit(*pte)) {
209 printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n", 214 printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
210 addr[i], pte ? (u64)pte_val(*pte) : 0ULL); 215 addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
211 failed++; 216 failed++;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 60bcb5b6a37e..fb6f2ab40dda 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -34,6 +34,41 @@ struct cpa_data {
34 unsigned force_split : 1; 34 unsigned force_split : 1;
35}; 35};
36 36
37#ifdef CONFIG_PROC_FS
38static unsigned long direct_pages_count[PG_LEVEL_NUM];
39
40void update_page_count(int level, unsigned long pages)
41{
42 unsigned long flags;
43
44 /* Protect against CPA */
45 spin_lock_irqsave(&pgd_lock, flags);
46 direct_pages_count[level] += pages;
47 spin_unlock_irqrestore(&pgd_lock, flags);
48}
49
50static void split_page_count(int level)
51{
52 direct_pages_count[level]--;
53 direct_pages_count[level - 1] += PTRS_PER_PTE;
54}
55
56int arch_report_meminfo(char *page)
57{
58 int n = sprintf(page, "DirectMap4k: %8lu\n"
59 "DirectMap2M: %8lu\n",
60 direct_pages_count[PG_LEVEL_4K],
61 direct_pages_count[PG_LEVEL_2M]);
62#ifdef CONFIG_X86_64
63 n += sprintf(page + n, "DirectMap1G: %8lu\n",
64 direct_pages_count[PG_LEVEL_1G]);
65#endif
66 return n;
67}
68#else
69static inline void split_page_count(int level) { }
70#endif
71
37#ifdef CONFIG_X86_64 72#ifdef CONFIG_X86_64
38 73
39static inline unsigned long highmap_start_pfn(void) 74static inline unsigned long highmap_start_pfn(void)
@@ -500,6 +535,16 @@ static int split_large_page(pte_t *kpte, unsigned long address)
500 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) 535 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
501 set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); 536 set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
502 537
538 if (address >= (unsigned long)__va(0) &&
539 address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
540 split_page_count(level);
541
542#ifdef CONFIG_X86_64
543 if (address >= (unsigned long)__va(1UL<<32) &&
544 address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
545 split_page_count(level);
546#endif
547
503 /* 548 /*
504 * Install the new, split up pagetable. Important details here: 549 * Install the new, split up pagetable. Important details here:
505 * 550 *
@@ -613,15 +658,24 @@ static int cpa_process_alias(struct cpa_data *cpa)
613 struct cpa_data alias_cpa; 658 struct cpa_data alias_cpa;
614 int ret = 0; 659 int ret = 0;
615 660
616 if (cpa->pfn > max_pfn_mapped) 661 if (cpa->pfn >= max_pfn_mapped)
617 return 0; 662 return 0;
618 663
664#ifdef CONFIG_X86_64
665 if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
666 return 0;
667#endif
619 /* 668 /*
620 * No need to redo, when the primary call touched the direct 669 * No need to redo, when the primary call touched the direct
621 * mapping already: 670 * mapping already:
622 */ 671 */
623 if (!within(cpa->vaddr, PAGE_OFFSET, 672 if (!(within(cpa->vaddr, PAGE_OFFSET,
624 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { 673 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
674#ifdef CONFIG_X86_64
675 || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
676 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
677#endif
678 )) {
625 679
626 alias_cpa = *cpa; 680 alias_cpa = *cpa;
627 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 681 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
@@ -805,7 +859,7 @@ int _set_memory_wc(unsigned long addr, int numpages)
805 859
806int set_memory_wc(unsigned long addr, int numpages) 860int set_memory_wc(unsigned long addr, int numpages)
807{ 861{
808 if (!pat_wc_enabled) 862 if (!pat_enabled)
809 return set_memory_uc(addr, numpages); 863 return set_memory_uc(addr, numpages);
810 864
811 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 865 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 06b7a1c90fb8..d4585077977a 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -26,11 +26,11 @@
26#include <asm/io.h> 26#include <asm/io.h>
27 27
28#ifdef CONFIG_X86_PAT 28#ifdef CONFIG_X86_PAT
29int __read_mostly pat_wc_enabled = 1; 29int __read_mostly pat_enabled = 1;
30 30
31void __cpuinit pat_disable(char *reason) 31void __cpuinit pat_disable(char *reason)
32{ 32{
33 pat_wc_enabled = 0; 33 pat_enabled = 0;
34 printk(KERN_INFO "%s\n", reason); 34 printk(KERN_INFO "%s\n", reason);
35} 35}
36 36
@@ -42,6 +42,19 @@ static int __init nopat(char *str)
42early_param("nopat", nopat); 42early_param("nopat", nopat);
43#endif 43#endif
44 44
45
46static int debug_enable;
47static int __init pat_debug_setup(char *str)
48{
49 debug_enable = 1;
50 return 0;
51}
52__setup("debugpat", pat_debug_setup);
53
54#define dprintk(fmt, arg...) \
55 do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
56
57
45static u64 __read_mostly boot_pat_state; 58static u64 __read_mostly boot_pat_state;
46 59
47enum { 60enum {
@@ -53,24 +66,25 @@ enum {
53 PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */ 66 PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
54}; 67};
55 68
56#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8)) 69#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
57 70
58void pat_init(void) 71void pat_init(void)
59{ 72{
60 u64 pat; 73 u64 pat;
61 74
62 if (!pat_wc_enabled) 75 if (!pat_enabled)
63 return; 76 return;
64 77
65 /* Paranoia check. */ 78 /* Paranoia check. */
66 if (!cpu_has_pat) { 79 if (!cpu_has_pat && boot_pat_state) {
67 printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
68 /* 80 /*
69 * Panic if this happens on the secondary CPU, and we 81 * If this happens we are on a secondary CPU, but
70 * switched to PAT on the boot CPU. We have no way to 82 * switched to PAT on the boot CPU. We have no way to
71 * undo PAT. 83 * undo PAT.
72 */ 84 */
73 BUG_ON(boot_pat_state); 85 printk(KERN_ERR "PAT enabled, "
86 "but not supported by secondary CPU\n");
87 BUG();
74 } 88 }
75 89
76 /* Set PWT to Write-Combining. All other bits stay the same */ 90 /* Set PWT to Write-Combining. All other bits stay the same */
@@ -86,8 +100,8 @@ void pat_init(void)
86 * 011 UC _PAGE_CACHE_UC 100 * 011 UC _PAGE_CACHE_UC
87 * PAT bit unused 101 * PAT bit unused
88 */ 102 */
89 pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) | 103 pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
90 PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC); 104 PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
91 105
92 /* Boot CPU check */ 106 /* Boot CPU check */
93 if (!boot_pat_state) 107 if (!boot_pat_state)
@@ -103,11 +117,11 @@ void pat_init(void)
103static char *cattr_name(unsigned long flags) 117static char *cattr_name(unsigned long flags)
104{ 118{
105 switch (flags & _PAGE_CACHE_MASK) { 119 switch (flags & _PAGE_CACHE_MASK) {
106 case _PAGE_CACHE_UC: return "uncached"; 120 case _PAGE_CACHE_UC: return "uncached";
107 case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; 121 case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
108 case _PAGE_CACHE_WB: return "write-back"; 122 case _PAGE_CACHE_WB: return "write-back";
109 case _PAGE_CACHE_WC: return "write-combining"; 123 case _PAGE_CACHE_WC: return "write-combining";
110 default: return "broken"; 124 default: return "broken";
111 } 125 }
112} 126}
113 127
@@ -145,47 +159,50 @@ static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
145 * The intersection is based on "Effective Memory Type" tables in IA-32 159 * The intersection is based on "Effective Memory Type" tables in IA-32
146 * SDM vol 3a 160 * SDM vol 3a
147 */ 161 */
148static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, 162static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
149 unsigned long *ret_prot)
150{ 163{
151 unsigned long pat_type;
152 u8 mtrr_type;
153
154 pat_type = prot & _PAGE_CACHE_MASK;
155 prot &= (~_PAGE_CACHE_MASK);
156
157 /*
158 * We return the PAT request directly for types where PAT takes
159 * precedence with respect to MTRR and for UC_MINUS.
160 * Consistency checks with other PAT requests is done later
161 * while going through memtype list.
162 */
163 if (pat_type == _PAGE_CACHE_WC) {
164 *ret_prot = prot | _PAGE_CACHE_WC;
165 return 0;
166 } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
167 *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
168 return 0;
169 } else if (pat_type == _PAGE_CACHE_UC) {
170 *ret_prot = prot | _PAGE_CACHE_UC;
171 return 0;
172 }
173
174 /* 164 /*
175 * Look for MTRR hint to get the effective type in case where PAT 165 * Look for MTRR hint to get the effective type in case where PAT
176 * request is for WB. 166 * request is for WB.
177 */ 167 */
178 mtrr_type = mtrr_type_lookup(start, end); 168 if (req_type == _PAGE_CACHE_WB) {
169 u8 mtrr_type;
170
171 mtrr_type = mtrr_type_lookup(start, end);
172 if (mtrr_type == MTRR_TYPE_UNCACHABLE)
173 return _PAGE_CACHE_UC;
174 if (mtrr_type == MTRR_TYPE_WRCOMB)
175 return _PAGE_CACHE_WC;
176 }
179 177
180 if (mtrr_type == MTRR_TYPE_UNCACHABLE) { 178 return req_type;
181 *ret_prot = prot | _PAGE_CACHE_UC; 179}
182 } else if (mtrr_type == MTRR_TYPE_WRCOMB) { 180
183 *ret_prot = prot | _PAGE_CACHE_WC; 181static int chk_conflict(struct memtype *new, struct memtype *entry,
184 } else { 182 unsigned long *type)
185 *ret_prot = prot | _PAGE_CACHE_WB; 183{
184 if (new->type != entry->type) {
185 if (type) {
186 new->type = entry->type;
187 *type = entry->type;
188 } else
189 goto conflict;
186 } 190 }
187 191
192 /* check overlaps with more than one entry in the list */
193 list_for_each_entry_continue(entry, &memtype_list, nd) {
194 if (new->end <= entry->start)
195 break;
196 else if (new->type != entry->type)
197 goto conflict;
198 }
188 return 0; 199 return 0;
200
201 conflict:
202 printk(KERN_INFO "%s:%d conflicting memory types "
203 "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
204 new->end, cattr_name(new->type), cattr_name(entry->type));
205 return -EBUSY;
189} 206}
190 207
191/* 208/*
@@ -198,37 +215,36 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
198 * req_type will have a special case value '-1', when requester want to inherit 215 * req_type will have a special case value '-1', when requester want to inherit
199 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. 216 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
200 * 217 *
201 * If ret_type is NULL, function will return an error if it cannot reserve the 218 * If new_type is NULL, function will return an error if it cannot reserve the
202 * region with req_type. If ret_type is non-null, function will return 219 * region with req_type. If new_type is non-NULL, function will return
203 * available type in ret_type in case of no error. In case of any error 220 * available type in new_type in case of no error. In case of any error
204 * it will return a negative return value. 221 * it will return a negative return value.
205 */ 222 */
206int reserve_memtype(u64 start, u64 end, unsigned long req_type, 223int reserve_memtype(u64 start, u64 end, unsigned long req_type,
207 unsigned long *ret_type) 224 unsigned long *new_type)
208{ 225{
209 struct memtype *new_entry = NULL; 226 struct memtype *new, *entry;
210 struct memtype *parse;
211 unsigned long actual_type; 227 unsigned long actual_type;
228 struct list_head *where;
212 int err = 0; 229 int err = 0;
213 230
214 /* Only track when pat_wc_enabled */ 231 BUG_ON(start >= end); /* end is exclusive */
215 if (!pat_wc_enabled) { 232
233 if (!pat_enabled) {
216 /* This is identical to page table setting without PAT */ 234 /* This is identical to page table setting without PAT */
217 if (ret_type) { 235 if (new_type) {
218 if (req_type == -1) { 236 if (req_type == -1)
219 *ret_type = _PAGE_CACHE_WB; 237 *new_type = _PAGE_CACHE_WB;
220 } else { 238 else
221 *ret_type = req_type; 239 *new_type = req_type & _PAGE_CACHE_MASK;
222 }
223 } 240 }
224 return 0; 241 return 0;
225 } 242 }
226 243
227 /* Low ISA region is always mapped WB in page table. No need to track */ 244 /* Low ISA region is always mapped WB in page table. No need to track */
228 if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) { 245 if (is_ISA_range(start, end - 1)) {
229 if (ret_type) 246 if (new_type)
230 *ret_type = _PAGE_CACHE_WB; 247 *new_type = _PAGE_CACHE_WB;
231
232 return 0; 248 return 0;
233 } 249 }
234 250
@@ -241,206 +257,92 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
241 */ 257 */
242 u8 mtrr_type = mtrr_type_lookup(start, end); 258 u8 mtrr_type = mtrr_type_lookup(start, end);
243 259
244 if (mtrr_type == MTRR_TYPE_WRBACK) { 260 if (mtrr_type == MTRR_TYPE_WRBACK)
245 req_type = _PAGE_CACHE_WB;
246 actual_type = _PAGE_CACHE_WB; 261 actual_type = _PAGE_CACHE_WB;
247 } else { 262 else
248 req_type = _PAGE_CACHE_UC_MINUS;
249 actual_type = _PAGE_CACHE_UC_MINUS; 263 actual_type = _PAGE_CACHE_UC_MINUS;
250 } 264 } else
251 } else { 265 actual_type = pat_x_mtrr_type(start, end,
252 req_type &= _PAGE_CACHE_MASK; 266 req_type & _PAGE_CACHE_MASK);
253 err = pat_x_mtrr_type(start, end, req_type, &actual_type);
254 }
255
256 if (err) {
257 if (ret_type)
258 *ret_type = actual_type;
259
260 return -EINVAL;
261 }
262 267
263 new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); 268 new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
264 if (!new_entry) 269 if (!new)
265 return -ENOMEM; 270 return -ENOMEM;
266 271
267 new_entry->start = start; 272 new->start = start;
268 new_entry->end = end; 273 new->end = end;
269 new_entry->type = actual_type; 274 new->type = actual_type;
270 275
271 if (ret_type) 276 if (new_type)
272 *ret_type = actual_type; 277 *new_type = actual_type;
273 278
274 spin_lock(&memtype_lock); 279 spin_lock(&memtype_lock);
275 280
276 /* Search for existing mapping that overlaps the current range */ 281 /* Search for existing mapping that overlaps the current range */
277 list_for_each_entry(parse, &memtype_list, nd) { 282 where = NULL;
278 struct memtype *saved_ptr; 283 list_for_each_entry(entry, &memtype_list, nd) {
279 284 if (end <= entry->start) {
280 if (parse->start >= end) { 285 where = entry->nd.prev;
281 pr_debug("New Entry\n");
282 list_add(&new_entry->nd, parse->nd.prev);
283 new_entry = NULL;
284 break; 286 break;
285 } 287 } else if (start <= entry->start) { /* end > entry->start */
286 288 err = chk_conflict(new, entry, new_type);
287 if (start <= parse->start && end >= parse->start) { 289 if (!err) {
288 if (actual_type != parse->type && ret_type) { 290 dprintk("Overlap at 0x%Lx-0x%Lx\n",
289 actual_type = parse->type; 291 entry->start, entry->end);
290 *ret_type = actual_type; 292 where = entry->nd.prev;
291 new_entry->type = actual_type;
292 }
293
294 if (actual_type != parse->type) {
295 printk(
296 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
297 current->comm, current->pid,
298 start, end,
299 cattr_name(actual_type),
300 cattr_name(parse->type));
301 err = -EBUSY;
302 break;
303 } 293 }
304
305 saved_ptr = parse;
306 /*
307 * Check to see whether the request overlaps more
308 * than one entry in the list
309 */
310 list_for_each_entry_continue(parse, &memtype_list, nd) {
311 if (end <= parse->start) {
312 break;
313 }
314
315 if (actual_type != parse->type) {
316 printk(
317 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
318 current->comm, current->pid,
319 start, end,
320 cattr_name(actual_type),
321 cattr_name(parse->type));
322 err = -EBUSY;
323 break;
324 }
325 }
326
327 if (err) {
328 break;
329 }
330
331 pr_debug("Overlap at 0x%Lx-0x%Lx\n",
332 saved_ptr->start, saved_ptr->end);
333 /* No conflict. Go ahead and add this new entry */
334 list_add(&new_entry->nd, saved_ptr->nd.prev);
335 new_entry = NULL;
336 break; 294 break;
337 } 295 } else if (start < entry->end) { /* start > entry->start */
338 296 err = chk_conflict(new, entry, new_type);
339 if (start < parse->end) { 297 if (!err) {
340 if (actual_type != parse->type && ret_type) { 298 dprintk("Overlap at 0x%Lx-0x%Lx\n",
341 actual_type = parse->type; 299 entry->start, entry->end);
342 *ret_type = actual_type; 300 where = &entry->nd;
343 new_entry->type = actual_type;
344 }
345
346 if (actual_type != parse->type) {
347 printk(
348 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
349 current->comm, current->pid,
350 start, end,
351 cattr_name(actual_type),
352 cattr_name(parse->type));
353 err = -EBUSY;
354 break;
355 }
356
357 saved_ptr = parse;
358 /*
359 * Check to see whether the request overlaps more
360 * than one entry in the list
361 */
362 list_for_each_entry_continue(parse, &memtype_list, nd) {
363 if (end <= parse->start) {
364 break;
365 }
366
367 if (actual_type != parse->type) {
368 printk(
369 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
370 current->comm, current->pid,
371 start, end,
372 cattr_name(actual_type),
373 cattr_name(parse->type));
374 err = -EBUSY;
375 break;
376 }
377 }
378
379 if (err) {
380 break;
381 } 301 }
382
383 pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
384 saved_ptr->start, saved_ptr->end);
385 /* No conflict. Go ahead and add this new entry */
386 list_add(&new_entry->nd, &saved_ptr->nd);
387 new_entry = NULL;
388 break; 302 break;
389 } 303 }
390 } 304 }
391 305
392 if (err) { 306 if (err) {
393 printk(KERN_INFO 307 printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
394 "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", 308 "track %s, req %s\n",
395 start, end, cattr_name(new_entry->type), 309 start, end, cattr_name(new->type), cattr_name(req_type));
396 cattr_name(req_type)); 310 kfree(new);
397 kfree(new_entry);
398 spin_unlock(&memtype_lock); 311 spin_unlock(&memtype_lock);
399 return err; 312 return err;
400 } 313 }
401 314
402 if (new_entry) { 315 if (where)
403 /* No conflict. Not yet added to the list. Add to the tail */ 316 list_add(&new->nd, where);
404 list_add_tail(&new_entry->nd, &memtype_list); 317 else
405 pr_debug("New Entry\n"); 318 list_add_tail(&new->nd, &memtype_list);
406 }
407
408 if (ret_type) {
409 pr_debug(
410 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
411 start, end, cattr_name(actual_type),
412 cattr_name(req_type), cattr_name(*ret_type));
413 } else {
414 pr_debug(
415 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
416 start, end, cattr_name(actual_type),
417 cattr_name(req_type));
418 }
419 319
420 spin_unlock(&memtype_lock); 320 spin_unlock(&memtype_lock);
321
322 dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
323 start, end, cattr_name(new->type), cattr_name(req_type),
324 new_type ? cattr_name(*new_type) : "-");
325
421 return err; 326 return err;
422} 327}
423 328
424int free_memtype(u64 start, u64 end) 329int free_memtype(u64 start, u64 end)
425{ 330{
426 struct memtype *ml; 331 struct memtype *entry;
427 int err = -EINVAL; 332 int err = -EINVAL;
428 333
429 /* Only track when pat_wc_enabled */ 334 if (!pat_enabled)
430 if (!pat_wc_enabled) {
431 return 0; 335 return 0;
432 }
433 336
434 /* Low ISA region is always mapped WB. No need to track */ 337 /* Low ISA region is always mapped WB. No need to track */
435 if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) { 338 if (is_ISA_range(start, end - 1))
436 return 0; 339 return 0;
437 }
438 340
439 spin_lock(&memtype_lock); 341 spin_lock(&memtype_lock);
440 list_for_each_entry(ml, &memtype_list, nd) { 342 list_for_each_entry(entry, &memtype_list, nd) {
441 if (ml->start == start && ml->end == end) { 343 if (entry->start == start && entry->end == end) {
442 list_del(&ml->nd); 344 list_del(&entry->nd);
443 kfree(ml); 345 kfree(entry);
444 err = 0; 346 err = 0;
445 break; 347 break;
446 } 348 }
@@ -452,7 +354,7 @@ int free_memtype(u64 start, u64 end)
452 current->comm, current->pid, start, end); 354 current->comm, current->pid, start, end);
453 } 355 }
454 356
455 pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end); 357 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
456 return err; 358 return err;
457} 359}
458 360
@@ -521,12 +423,12 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
521 * caching for the high addresses through the KEN pin, but 423 * caching for the high addresses through the KEN pin, but
522 * we maintain the tradition of paranoia in this code. 424 * we maintain the tradition of paranoia in this code.
523 */ 425 */
524 if (!pat_wc_enabled && 426 if (!pat_enabled &&
525 ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || 427 !(boot_cpu_has(X86_FEATURE_MTRR) ||
526 test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || 428 boot_cpu_has(X86_FEATURE_K6_MTRR) ||
527 test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || 429 boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
528 test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) && 430 boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
529 (pfn << PAGE_SHIFT) >= __pa(high_memory)) { 431 (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
530 flags = _PAGE_CACHE_UC; 432 flags = _PAGE_CACHE_UC;
531 } 433 }
532#endif 434#endif
@@ -547,8 +449,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
547 if (retval < 0) 449 if (retval < 0)
548 return 0; 450 return 0;
549 451
550 if (pfn <= max_pfn_mapped && 452 if (((pfn < max_low_pfn_mapped) ||
551 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { 453 (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) &&
454 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
552 free_memtype(offset, offset + size); 455 free_memtype(offset, offset + size);
553 printk(KERN_INFO 456 printk(KERN_INFO
554 "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", 457 "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
@@ -586,4 +489,3 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
586 489
587 free_memtype(addr, addr + size); 490 free_memtype(addr, addr + size);
588} 491}
589
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 50159764f694..557b2abceef8 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -2,6 +2,7 @@
2#include <asm/pgalloc.h> 2#include <asm/pgalloc.h>
3#include <asm/pgtable.h> 3#include <asm/pgtable.h>
4#include <asm/tlb.h> 4#include <asm/tlb.h>
5#include <asm/fixmap.h>
5 6
6pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 7pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
7{ 8{
@@ -65,12 +66,6 @@ static inline void pgd_list_del(pgd_t *pgd)
65static void pgd_ctor(void *p) 66static void pgd_ctor(void *p)
66{ 67{
67 pgd_t *pgd = p; 68 pgd_t *pgd = p;
68 unsigned long flags;
69
70 /* Clear usermode parts of PGD */
71 memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
72
73 spin_lock_irqsave(&pgd_lock, flags);
74 69
75 /* If the pgd points to a shared pagetable level (either the 70 /* If the pgd points to a shared pagetable level (either the
76 ptes in non-PAE, or shared PMD in PAE), then just copy the 71 ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -90,8 +85,6 @@ static void pgd_ctor(void *p)
90 /* list required to sync kernel mapping updates */ 85 /* list required to sync kernel mapping updates */
91 if (!SHARED_KERNEL_PMD) 86 if (!SHARED_KERNEL_PMD)
92 pgd_list_add(pgd); 87 pgd_list_add(pgd);
93
94 spin_unlock_irqrestore(&pgd_lock, flags);
95} 88}
96 89
97static void pgd_dtor(void *pgd) 90static void pgd_dtor(void *pgd)
@@ -119,6 +112,72 @@ static void pgd_dtor(void *pgd)
119 112
120#ifdef CONFIG_X86_PAE 113#ifdef CONFIG_X86_PAE
121/* 114/*
115 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
116 * updating the top-level pagetable entries to guarantee the
117 * processor notices the update. Since this is expensive, and
118 * all 4 top-level entries are used almost immediately in a
119 * new process's life, we just pre-populate them here.
120 *
121 * Also, if we're in a paravirt environment where the kernel pmd is
122 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
123 * and initialize the kernel pmds here.
124 */
125#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
126
127void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
128{
129 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
130
131 /* Note: almost everything apart from _PAGE_PRESENT is
132 reserved at the pmd (PDPT) level. */
133 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
134
135 /*
136 * According to Intel App note "TLBs, Paging-Structure Caches,
137 * and Their Invalidation", April 2007, document 317080-001,
138 * section 8.1: in PAE mode we explicitly have to flush the
139 * TLB via cr3 if the top-level pgd is changed...
140 */
141 if (mm == current->active_mm)
142 write_cr3(read_cr3());
143}
144#else /* !CONFIG_X86_PAE */
145
146/* No need to prepopulate any pagetable entries in non-PAE modes. */
147#define PREALLOCATED_PMDS 0
148
149#endif /* CONFIG_X86_PAE */
150
151static void free_pmds(pmd_t *pmds[])
152{
153 int i;
154
155 for(i = 0; i < PREALLOCATED_PMDS; i++)
156 if (pmds[i])
157 free_page((unsigned long)pmds[i]);
158}
159
160static int preallocate_pmds(pmd_t *pmds[])
161{
162 int i;
163 bool failed = false;
164
165 for(i = 0; i < PREALLOCATED_PMDS; i++) {
166 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
167 if (pmd == NULL)
168 failed = true;
169 pmds[i] = pmd;
170 }
171
172 if (failed) {
173 free_pmds(pmds);
174 return -ENOMEM;
175 }
176
177 return 0;
178}
179
180/*
122 * Mop up any pmd pages which may still be attached to the pgd. 181 * Mop up any pmd pages which may still be attached to the pgd.
123 * Normally they will be freed by munmap/exit_mmap, but any pmd we 182 * Normally they will be freed by munmap/exit_mmap, but any pmd we
124 * preallocate which never got a corresponding vma will need to be 183 * preallocate which never got a corresponding vma will need to be
@@ -128,7 +187,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
128{ 187{
129 int i; 188 int i;
130 189
131 for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { 190 for(i = 0; i < PREALLOCATED_PMDS; i++) {
132 pgd_t pgd = pgdp[i]; 191 pgd_t pgd = pgdp[i];
133 192
134 if (pgd_val(pgd) != 0) { 193 if (pgd_val(pgd) != 0) {
@@ -142,32 +201,17 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
142 } 201 }
143} 202}
144 203
145/* 204static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
146 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
147 * updating the top-level pagetable entries to guarantee the
148 * processor notices the update. Since this is expensive, and
149 * all 4 top-level entries are used almost immediately in a
150 * new process's life, we just pre-populate them here.
151 *
152 * Also, if we're in a paravirt environment where the kernel pmd is
153 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
154 * and initialize the kernel pmds here.
155 */
156static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
157{ 205{
158 pud_t *pud; 206 pud_t *pud;
159 unsigned long addr; 207 unsigned long addr;
160 int i; 208 int i;
161 209
162 pud = pud_offset(pgd, 0); 210 pud = pud_offset(pgd, 0);
163 for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
164 i++, pud++, addr += PUD_SIZE) {
165 pmd_t *pmd = pmd_alloc_one(mm, addr);
166 211
167 if (!pmd) { 212 for (addr = i = 0; i < PREALLOCATED_PMDS;
168 pgd_mop_up_pmds(mm, pgd); 213 i++, pud++, addr += PUD_SIZE) {
169 return 0; 214 pmd_t *pmd = pmds[i];
170 }
171 215
172 if (i >= KERNEL_PGD_BOUNDARY) 216 if (i >= KERNEL_PGD_BOUNDARY)
173 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), 217 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
@@ -175,61 +219,54 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
175 219
176 pud_populate(mm, pud, pmd); 220 pud_populate(mm, pud, pmd);
177 } 221 }
178
179 return 1;
180} 222}
181 223
182void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) 224pgd_t *pgd_alloc(struct mm_struct *mm)
183{ 225{
184 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); 226 pgd_t *pgd;
227 pmd_t *pmds[PREALLOCATED_PMDS];
228 unsigned long flags;
185 229
186 /* Note: almost everything apart from _PAGE_PRESENT is 230 pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
187 reserved at the pmd (PDPT) level. */
188 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
189 231
190 /* 232 if (pgd == NULL)
191 * According to Intel App note "TLBs, Paging-Structure Caches, 233 goto out;
192 * and Their Invalidation", April 2007, document 317080-001,
193 * section 8.1: in PAE mode we explicitly have to flush the
194 * TLB via cr3 if the top-level pgd is changed...
195 */
196 if (mm == current->active_mm)
197 write_cr3(read_cr3());
198}
199#else /* !CONFIG_X86_PAE */
200/* No need to prepopulate any pagetable entries in non-PAE modes. */
201static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
202{
203 return 1;
204}
205 234
206static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd) 235 mm->pgd = pgd;
207{
208}
209#endif /* CONFIG_X86_PAE */
210 236
211pgd_t *pgd_alloc(struct mm_struct *mm) 237 if (preallocate_pmds(pmds) != 0)
212{ 238 goto out_free_pgd;
213 pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
214 239
215 /* so that alloc_pmd can use it */ 240 if (paravirt_pgd_alloc(mm) != 0)
216 mm->pgd = pgd; 241 goto out_free_pmds;
217 if (pgd)
218 pgd_ctor(pgd);
219 242
220 if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { 243 /*
221 pgd_dtor(pgd); 244 * Make sure that pre-populating the pmds is atomic with
222 free_page((unsigned long)pgd); 245 * respect to anything walking the pgd_list, so that they
223 pgd = NULL; 246 * never see a partially populated pgd.
224 } 247 */
248 spin_lock_irqsave(&pgd_lock, flags);
249
250 pgd_ctor(pgd);
251 pgd_prepopulate_pmd(mm, pgd, pmds);
252
253 spin_unlock_irqrestore(&pgd_lock, flags);
225 254
226 return pgd; 255 return pgd;
256
257out_free_pmds:
258 free_pmds(pmds);
259out_free_pgd:
260 free_page((unsigned long)pgd);
261out:
262 return NULL;
227} 263}
228 264
229void pgd_free(struct mm_struct *mm, pgd_t *pgd) 265void pgd_free(struct mm_struct *mm, pgd_t *pgd)
230{ 266{
231 pgd_mop_up_pmds(mm, pgd); 267 pgd_mop_up_pmds(mm, pgd);
232 pgd_dtor(pgd); 268 pgd_dtor(pgd);
269 paravirt_pgd_free(mm, pgd);
233 free_page((unsigned long)pgd); 270 free_page((unsigned long)pgd);
234} 271}
235 272
@@ -255,7 +292,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
255 292
256 if (pte_young(*ptep)) 293 if (pte_young(*ptep))
257 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, 294 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
258 &ptep->pte); 295 (unsigned long *) &ptep->pte);
259 296
260 if (ret) 297 if (ret)
261 pte_update(vma->vm_mm, addr, ptep); 298 pte_update(vma->vm_mm, addr, ptep);
@@ -274,3 +311,22 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
274 311
275 return young; 312 return young;
276} 313}
314
315int fixmaps_set;
316
317void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
318{
319 unsigned long address = __fix_to_virt(idx);
320
321 if (idx >= __end_of_fixed_addresses) {
322 BUG();
323 return;
324 }
325 set_pte_vaddr(address, pte);
326 fixmaps_set++;
327}
328
329void native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
330{
331 __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
332}
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 369cf065b6a4..b4becbf8c570 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -71,7 +71,7 @@ void show_mem(void)
71 * Associate a virtual page frame with a given physical page frame 71 * Associate a virtual page frame with a given physical page frame
72 * and protection flags for that frame. 72 * and protection flags for that frame.
73 */ 73 */
74static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) 74void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
75{ 75{
76 pgd_t *pgd; 76 pgd_t *pgd;
77 pud_t *pud; 77 pud_t *pud;
@@ -94,8 +94,8 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
94 return; 94 return;
95 } 95 }
96 pte = pte_offset_kernel(pmd, vaddr); 96 pte = pte_offset_kernel(pmd, vaddr);
97 if (pgprot_val(flags)) 97 if (pte_val(pteval))
98 set_pte_present(&init_mm, vaddr, pte, pfn_pte(pfn, flags)); 98 set_pte_present(&init_mm, vaddr, pte, pteval);
99 else 99 else
100 pte_clear(&init_mm, vaddr, pte); 100 pte_clear(&init_mm, vaddr, pte);
101 101
@@ -141,22 +141,9 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
141 __flush_tlb_one(vaddr); 141 __flush_tlb_one(vaddr);
142} 142}
143 143
144static int fixmaps;
145unsigned long __FIXADDR_TOP = 0xfffff000; 144unsigned long __FIXADDR_TOP = 0xfffff000;
146EXPORT_SYMBOL(__FIXADDR_TOP); 145EXPORT_SYMBOL(__FIXADDR_TOP);
147 146
148void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
149{
150 unsigned long address = __fix_to_virt(idx);
151
152 if (idx >= __end_of_fixed_addresses) {
153 BUG();
154 return;
155 }
156 set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
157 fixmaps++;
158}
159
160/** 147/**
161 * reserve_top_address - reserves a hole in the top of kernel address space 148 * reserve_top_address - reserves a hole in the top of kernel address space
162 * @reserve - size of hole to reserve 149 * @reserve - size of hole to reserve
@@ -164,11 +151,44 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
164 * Can be used to relocate the fixmap area and poke a hole in the top 151 * Can be used to relocate the fixmap area and poke a hole in the top
165 * of kernel address space to make room for a hypervisor. 152 * of kernel address space to make room for a hypervisor.
166 */ 153 */
167void reserve_top_address(unsigned long reserve) 154void __init reserve_top_address(unsigned long reserve)
168{ 155{
169 BUG_ON(fixmaps > 0); 156 BUG_ON(fixmaps_set > 0);
170 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", 157 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
171 (int)-reserve); 158 (int)-reserve);
172 __FIXADDR_TOP = -reserve - PAGE_SIZE; 159 __FIXADDR_TOP = -reserve - PAGE_SIZE;
173 __VMALLOC_RESERVE += reserve; 160 __VMALLOC_RESERVE += reserve;
174} 161}
162
163/*
164 * vmalloc=size forces the vmalloc area to be exactly 'size'
165 * bytes. This can be used to increase (or decrease) the
166 * vmalloc area - the default is 128m.
167 */
168static int __init parse_vmalloc(char *arg)
169{
170 if (!arg)
171 return -EINVAL;
172
173 __VMALLOC_RESERVE = memparse(arg, &arg);
174 return 0;
175}
176early_param("vmalloc", parse_vmalloc);
177
178/*
179 * reservetop=size reserves a hole at the top of the kernel address space which
180 * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
181 * so relocating the fixmap can be done before paging initialization.
182 */
183static int __init parse_reservetop(char *arg)
184{
185 unsigned long address;
186
187 if (!arg)
188 return -EINVAL;
189
190 address = memparse(arg, &arg);
191 reserve_top_address(address);
192 return 0;
193}
194early_param("reservetop", parse_reservetop);
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
new file mode 100644
index 000000000000..f41d67f8f831
--- /dev/null
+++ b/arch/x86/mm/srat_32.c
@@ -0,0 +1,280 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/mmzone.h>
29#include <linux/acpi.h>
30#include <linux/nodemask.h>
31#include <asm/srat.h>
32#include <asm/topology.h>
33#include <asm/smp.h>
34#include <asm/e820.h>
35
36/*
37 * proximity macros and definitions
38 */
39#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
40#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
41#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
42#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
43/* bitmap length; _PXM is at most 255 */
44#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
45static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
46
47#define MAX_CHUNKS_PER_NODE 3
48#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
49struct node_memory_chunk_s {
50 unsigned long start_pfn;
51 unsigned long end_pfn;
52 u8 pxm; // proximity domain of node
53 u8 nid; // which cnode contains this chunk?
54 u8 bank; // which mem bank on this node
55};
56static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
57
58static int __initdata num_memory_chunks; /* total number of memory chunks */
59static u8 __initdata apicid_to_pxm[MAX_APICID];
60
61int numa_off __initdata;
62int acpi_numa __initdata;
63
64static __init void bad_srat(void)
65{
66 printk(KERN_ERR "SRAT: SRAT not used.\n");
67 acpi_numa = -1;
68 num_memory_chunks = 0;
69}
70
71static __init inline int srat_disabled(void)
72{
73 return numa_off || acpi_numa < 0;
74}
75
76/* Identify CPU proximity domains */
77void __init
78acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
79{
80 if (srat_disabled())
81 return;
82 if (cpu_affinity->header.length !=
83 sizeof(struct acpi_srat_cpu_affinity)) {
84 bad_srat();
85 return;
86 }
87
88 if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
89 return; /* empty entry */
90
91 /* mark this node as "seen" in node bitmap */
92 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
93
94 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
95
96 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
97 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
98}
99
100/*
101 * Identify memory proximity domains and hot-remove capabilities.
102 * Fill node memory chunk list structure.
103 */
104void __init
105acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
106{
107 unsigned long long paddr, size;
108 unsigned long start_pfn, end_pfn;
109 u8 pxm;
110 struct node_memory_chunk_s *p, *q, *pend;
111
112 if (srat_disabled())
113 return;
114 if (memory_affinity->header.length !=
115 sizeof(struct acpi_srat_mem_affinity)) {
116 bad_srat();
117 return;
118 }
119
120 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
121 return; /* empty entry */
122
123 pxm = memory_affinity->proximity_domain & 0xff;
124
125 /* mark this node as "seen" in node bitmap */
126 BMAP_SET(pxm_bitmap, pxm);
127
128 /* calculate info for memory chunk structure */
129 paddr = memory_affinity->base_address;
130 size = memory_affinity->length;
131
132 start_pfn = paddr >> PAGE_SHIFT;
133 end_pfn = (paddr + size) >> PAGE_SHIFT;
134
135
136 if (num_memory_chunks >= MAXCHUNKS) {
137 printk(KERN_WARNING "Too many mem chunks in SRAT."
138 " Ignoring %lld MBytes at %llx\n",
139 size/(1024*1024), paddr);
140 return;
141 }
142
143 /* Insertion sort based on base address */
144 pend = &node_memory_chunk[num_memory_chunks];
145 for (p = &node_memory_chunk[0]; p < pend; p++) {
146 if (start_pfn < p->start_pfn)
147 break;
148 }
149 if (p < pend) {
150 for (q = pend; q >= p; q--)
151 *(q + 1) = *q;
152 }
153 p->start_pfn = start_pfn;
154 p->end_pfn = end_pfn;
155 p->pxm = pxm;
156
157 num_memory_chunks++;
158
159 printk(KERN_DEBUG "Memory range %08lx to %08lx (type %x)"
160 " in proximity domain %02x %s\n",
161 start_pfn, end_pfn,
162 memory_affinity->memory_type,
163 pxm,
164 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
165 "enabled and removable" : "enabled" ) );
166}
167
168/* Callback for SLIT parsing */
169void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
170{
171}
172
173void acpi_numa_arch_fixup(void)
174{
175}
176/*
177 * The SRAT table always lists ascending addresses, so can always
178 * assume that the first "start" address that you see is the real
179 * start of the node, and that the current "end" address is after
180 * the previous one.
181 */
182static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
183{
184 /*
185 * Only add present memory as told by the e820.
186 * There is no guarantee from the SRAT that the memory it
187 * enumerates is present at boot time because it represents
188 * *possible* memory hotplug areas the same as normal RAM.
189 */
190 if (memory_chunk->start_pfn >= max_pfn) {
191 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
192 memory_chunk->start_pfn, memory_chunk->end_pfn);
193 return;
194 }
195 if (memory_chunk->nid != nid)
196 return;
197
198 if (!node_has_online_mem(nid))
199 node_start_pfn[nid] = memory_chunk->start_pfn;
200
201 if (node_start_pfn[nid] > memory_chunk->start_pfn)
202 node_start_pfn[nid] = memory_chunk->start_pfn;
203
204 if (node_end_pfn[nid] < memory_chunk->end_pfn)
205 node_end_pfn[nid] = memory_chunk->end_pfn;
206}
207
208int __init get_memcfg_from_srat(void)
209{
210 int i, j, nid;
211
212
213 if (srat_disabled())
214 goto out_fail;
215
216 if (num_memory_chunks == 0) {
217 printk(KERN_WARNING
218 "could not finy any ACPI SRAT memory areas.\n");
219 goto out_fail;
220 }
221
222 /* Calculate total number of nodes in system from PXM bitmap and create
223 * a set of sequential node IDs starting at zero. (ACPI doesn't seem
224 * to specify the range of _PXM values.)
225 */
226 /*
227 * MCD - we no longer HAVE to number nodes sequentially. PXM domain
228 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
229 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
230 * approaches MAX_PXM_DOMAINS for i386.
231 */
232 nodes_clear(node_online_map);
233 for (i = 0; i < MAX_PXM_DOMAINS; i++) {
234 if (BMAP_TEST(pxm_bitmap, i)) {
235 int nid = acpi_map_pxm_to_node(i);
236 node_set_online(nid);
237 }
238 }
239 BUG_ON(num_online_nodes() == 0);
240
241 /* set cnode id in memory chunk structure */
242 for (i = 0; i < num_memory_chunks; i++)
243 node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
244
245 printk(KERN_DEBUG "pxm bitmap: ");
246 for (i = 0; i < sizeof(pxm_bitmap); i++) {
247 printk(KERN_CONT "%02x ", pxm_bitmap[i]);
248 }
249 printk(KERN_CONT "\n");
250 printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
251 num_online_nodes());
252 printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
253 num_memory_chunks);
254
255 for (i = 0; i < MAX_APICID; i++)
256 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
257
258 for (j = 0; j < num_memory_chunks; j++){
259 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
260 printk(KERN_DEBUG
261 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
262 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
263 node_read_chunk(chunk->nid, chunk);
264 e820_register_active_regions(chunk->nid, chunk->start_pfn,
265 min(chunk->end_pfn, max_pfn));
266 }
267
268 for_each_online_node(nid) {
269 unsigned long start = node_start_pfn[nid];
270 unsigned long end = min(node_end_pfn[nid], max_pfn);
271
272 memory_present(nid, start, end);
273 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
274 }
275 return 1;
276out_fail:
277 printk(KERN_ERR "failed to get NUMA memory information from SRAT"
278 " table\n");
279 return 0;
280}
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 99649dccad28..1b4763e26ea9 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -100,7 +100,19 @@ static __init inline int srat_disabled(void)
100/* Callback for SLIT parsing */ 100/* Callback for SLIT parsing */
101void __init acpi_numa_slit_init(struct acpi_table_slit *slit) 101void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
102{ 102{
103 acpi_slit = slit; 103 unsigned length;
104 unsigned long phys;
105
106 length = slit->header.length;
107 phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
108 PAGE_SIZE);
109
110 if (phys == -1L)
111 panic(" Can not save slit!\n");
112
113 acpi_slit = __va(phys);
114 memcpy(acpi_slit, slit, length);
115 reserve_early(phys, phys + length, "ACPI SLIT");
104} 116}
105 117
106/* Callback for Proximity Domain -> LAPIC mapping */ 118/* Callback for Proximity Domain -> LAPIC mapping */
@@ -299,7 +311,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
299 pxmram = 0; 311 pxmram = 0;
300 } 312 }
301 313
302 e820ram = end_pfn - absent_pages_in_range(0, end_pfn); 314 e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
303 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ 315 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
304 if ((long)(e820ram - pxmram) >= 1*1024*1024) { 316 if ((long)(e820ram - pxmram) >= 1*1024*1024) {
305 printk(KERN_ERR 317 printk(KERN_ERR
@@ -376,7 +388,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
376 if (node == NUMA_NO_NODE) 388 if (node == NUMA_NO_NODE)
377 continue; 389 continue;
378 if (!node_isset(node, node_possible_map)) 390 if (!node_isset(node, node_possible_map))
379 numa_set_node(i, NUMA_NO_NODE); 391 numa_clear_node(i);
380 } 392 }
381 numa_init_array(); 393 numa_init_array();
382 return 0; 394 return 0;
@@ -495,6 +507,7 @@ int __node_distance(int a, int b)
495 507
496EXPORT_SYMBOL(__node_distance); 508EXPORT_SYMBOL(__node_distance);
497 509
510#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
498int memory_add_physaddr_to_nid(u64 start) 511int memory_add_physaddr_to_nid(u64 start)
499{ 512{
500 int i, ret = 0; 513 int i, ret = 0;
@@ -506,4 +519,4 @@ int memory_add_physaddr_to_nid(u64 start)
506 return ret; 519 return ret;
507} 520}
508EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 521EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
509 522#endif