Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile              8
-rw-r--r--  arch/x86/mm/discontig_32.c      285
-rw-r--r--  arch/x86/mm/dump_pagetables.c     2
-rw-r--r--  arch/x86/mm/fault.c             115
-rw-r--r--  arch/x86/mm/init_32.c           527
-rw-r--r--  arch/x86/mm/init_64.c           571
-rw-r--r--  arch/x86/mm/ioremap.c            65
-rw-r--r--  arch/x86/mm/k8topology_64.c      21
-rw-r--r--  arch/x86/mm/kmmio.c             510
-rw-r--r--  arch/x86/mm/mmio-mod.c          515
-rw-r--r--  arch/x86/mm/numa_64.c            93
-rw-r--r--  arch/x86/mm/pageattr-test.c      21
-rw-r--r--  arch/x86/mm/pageattr.c           67
-rw-r--r--  arch/x86/mm/pat.c               395
-rw-r--r--  arch/x86/mm/pf_in.c             489
-rw-r--r--  arch/x86/mm/pf_in.h              39
-rw-r--r--  arch/x86/mm/pgtable.c           190
-rw-r--r--  arch/x86/mm/pgtable_32.c         56
-rw-r--r--  arch/x86/mm/srat_32.c           279
-rw-r--r--  arch/x86/mm/srat_64.c            48
-rw-r--r--  arch/x86/mm/testmmiotrace.c      71
21 files changed, 3420 insertions(+), 947 deletions(-)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index b7b3e4c7cfc9..9873716e9f76 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -8,10 +8,16 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
8 8
9obj-$(CONFIG_HIGHMEM) += highmem_32.o 9obj-$(CONFIG_HIGHMEM) += highmem_32.o
10 10
11obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o
12obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
13mmiotrace-y := pf_in.o mmio-mod.o
14obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
15
11ifeq ($(CONFIG_X86_32),y) 16ifeq ($(CONFIG_X86_32),y)
12obj-$(CONFIG_NUMA) += discontig_32.o 17obj-$(CONFIG_NUMA) += discontig_32.o
13else 18else
14obj-$(CONFIG_NUMA) += numa_64.o 19obj-$(CONFIG_NUMA) += numa_64.o
15obj-$(CONFIG_K8_NUMA) += k8topology_64.o 20obj-$(CONFIG_K8_NUMA) += k8topology_64.o
16obj-$(CONFIG_ACPI_NUMA) += srat_64.o
17endif 21endif
22obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
23
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 914ccf983687..5dfef9fa061a 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -38,6 +38,7 @@
38#include <asm/setup.h> 38#include <asm/setup.h>
39#include <asm/mmzone.h> 39#include <asm/mmzone.h>
40#include <asm/bios_ebda.h> 40#include <asm/bios_ebda.h>
41#include <asm/proto.h>
41 42
42struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 43struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
43EXPORT_SYMBOL(node_data); 44EXPORT_SYMBOL(node_data);
@@ -59,14 +60,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
59/* 60/*
60 * 4) physnode_map - the mapping between a pfn and owning node 61 * 4) physnode_map - the mapping between a pfn and owning node
61 * physnode_map keeps track of the physical memory layout of a generic 62 * physnode_map keeps track of the physical memory layout of a generic
62 * numa node on a 256Mb break (each element of the array will 63 * numa node on a 64Mb break (each element of the array will
63 * represent 256Mb of memory and will be marked by the node id. so, 64 * represent 64Mb of memory and will be marked by the node id. so,
64 * if the first gig is on node 0, and the second gig is on node 1 65 * if the first gig is on node 0, and the second gig is on node 1
65 * physnode_map will contain: 66 * physnode_map will contain:
66 * 67 *
67 * physnode_map[0-3] = 0; 68 * physnode_map[0-15] = 0;
68 * physnode_map[4-7] = 1; 69 * physnode_map[16-31] = 1;
69 * physnode_map[8- ] = -1; 70 * physnode_map[32- ] = -1;
70 */ 71 */
71s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1}; 72s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
72EXPORT_SYMBOL(physnode_map); 73EXPORT_SYMBOL(physnode_map);
@@ -75,15 +76,15 @@ void memory_present(int nid, unsigned long start, unsigned long end)
75{ 76{
76 unsigned long pfn; 77 unsigned long pfn;
77 78
78 printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n", 79 printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
79 nid, start, end); 80 nid, start, end);
80 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); 81 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
81 printk(KERN_DEBUG " "); 82 printk(KERN_DEBUG " ");
82 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { 83 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
83 physnode_map[pfn / PAGES_PER_ELEMENT] = nid; 84 physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
84 printk("%ld ", pfn); 85 printk(KERN_CONT "%lx ", pfn);
85 } 86 }
86 printk("\n"); 87 printk(KERN_CONT "\n");
87} 88}
88 89
89unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, 90unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
@@ -99,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
99#endif 100#endif
100 101
101extern unsigned long find_max_low_pfn(void); 102extern unsigned long find_max_low_pfn(void);
102extern void add_one_highpage_init(struct page *, int, int);
103extern unsigned long highend_pfn, highstart_pfn; 103extern unsigned long highend_pfn, highstart_pfn;
104 104
105#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) 105#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
@@ -117,13 +117,13 @@ static unsigned long kva_pages;
117 */ 117 */
118int __init get_memcfg_numa_flat(void) 118int __init get_memcfg_numa_flat(void)
119{ 119{
120 printk("NUMA - single node, flat memory mode\n"); 120 printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
121 121
122 /* Run the memory configuration and find the top of memory. */
123 propagate_e820_map();
124 node_start_pfn[0] = 0; 122 node_start_pfn[0] = 0;
125 node_end_pfn[0] = max_pfn; 123 node_end_pfn[0] = max_pfn;
124 e820_register_active_regions(0, 0, max_pfn);
126 memory_present(0, 0, max_pfn); 125 memory_present(0, 0, max_pfn);
126 node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
127 127
128 /* Indicate there is one node available. */ 128 /* Indicate there is one node available. */
129 nodes_clear(node_online_map); 129 nodes_clear(node_online_map);
@@ -156,24 +156,32 @@ static void __init propagate_e820_map_node(int nid)
156 */ 156 */
157static void __init allocate_pgdat(int nid) 157static void __init allocate_pgdat(int nid)
158{ 158{
159 if (nid && node_has_online_mem(nid)) 159 char buf[16];
160
161 if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
160 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; 162 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
161 else { 163 else {
162 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn)); 164 unsigned long pgdat_phys;
163 min_low_pfn += PFN_UP(sizeof(pg_data_t)); 165 pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
166 max_pfn_mapped<<PAGE_SHIFT,
167 sizeof(pg_data_t),
168 PAGE_SIZE);
169 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
170 memset(buf, 0, sizeof(buf));
171 sprintf(buf, "NODE_DATA %d", nid);
172 reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
164 } 173 }
174 printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
175 nid, (unsigned long)NODE_DATA(nid));
165} 176}
166 177
167#ifdef CONFIG_DISCONTIGMEM
168/* 178/*
169 * In the discontig memory model, a portion of the kernel virtual area (KVA) 179 * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
170 * is reserved and portions of nodes are mapped using it. This is to allow 180 * virtual address space (KVA) is reserved and portions of nodes are mapped
171 * node-local memory to be allocated for structures that would normally require 181 * using it. This is to allow node-local memory to be allocated for
172 * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers 182 * structures that would normally require ZONE_NORMAL. The memory is
173 * should be prepared to allocate from the bootmem allocator instead. This KVA 183 * allocated with alloc_remap() and callers should be prepared to allocate
174 * mechanism is incompatible with SPARSEMEM as it makes assumptions about the 184 * from the bootmem allocator instead.
175 * layout of memory that are broken if alloc_remap() succeeds for some of the
176 * map and fails for others
177 */ 185 */
178static unsigned long node_remap_start_pfn[MAX_NUMNODES]; 186static unsigned long node_remap_start_pfn[MAX_NUMNODES];
179static void *node_remap_end_vaddr[MAX_NUMNODES]; 187static void *node_remap_end_vaddr[MAX_NUMNODES];
@@ -195,15 +203,19 @@ void *alloc_remap(int nid, unsigned long size)
195 return allocation; 203 return allocation;
196} 204}
197 205
198void __init remap_numa_kva(void) 206static void __init remap_numa_kva(void)
199{ 207{
200 void *vaddr; 208 void *vaddr;
201 unsigned long pfn; 209 unsigned long pfn;
202 int node; 210 int node;
203 211
204 for_each_online_node(node) { 212 for_each_online_node(node) {
213 printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
205 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { 214 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
206 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT); 215 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
216 printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
217 (unsigned long)vaddr,
218 node_remap_start_pfn[node] + pfn);
207 set_pmd_pfn((ulong) vaddr, 219 set_pmd_pfn((ulong) vaddr,
208 node_remap_start_pfn[node] + pfn, 220 node_remap_start_pfn[node] + pfn,
209 PAGE_KERNEL_LARGE); 221 PAGE_KERNEL_LARGE);
@@ -215,17 +227,21 @@ static unsigned long calculate_numa_remap_pages(void)
215{ 227{
216 int nid; 228 int nid;
217 unsigned long size, reserve_pages = 0; 229 unsigned long size, reserve_pages = 0;
218 unsigned long pfn;
219 230
220 for_each_online_node(nid) { 231 for_each_online_node(nid) {
221 unsigned old_end_pfn = node_end_pfn[nid]; 232 u64 node_kva_target;
233 u64 node_kva_final;
222 234
223 /* 235 /*
224 * The acpi/srat node info can show hot-add memroy zones 236 * The acpi/srat node info can show hot-add memroy zones
225 * where memory could be added but not currently present. 237 * where memory could be added but not currently present.
226 */ 238 */
239 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
240 nid, node_start_pfn[nid], node_end_pfn[nid]);
227 if (node_start_pfn[nid] > max_pfn) 241 if (node_start_pfn[nid] > max_pfn)
228 continue; 242 continue;
243 if (!node_end_pfn[nid])
244 continue;
229 if (node_end_pfn[nid] > max_pfn) 245 if (node_end_pfn[nid] > max_pfn)
230 node_end_pfn[nid] = max_pfn; 246 node_end_pfn[nid] = max_pfn;
231 247
@@ -237,41 +253,48 @@ static unsigned long calculate_numa_remap_pages(void)
237 /* now the roundup is correct, convert to PAGE_SIZE pages */ 253 /* now the roundup is correct, convert to PAGE_SIZE pages */
238 size = size * PTRS_PER_PTE; 254 size = size * PTRS_PER_PTE;
239 255
240 /* 256 node_kva_target = round_down(node_end_pfn[nid] - size,
241 * Validate the region we are allocating only contains valid 257 PTRS_PER_PTE);
242 * pages. 258 node_kva_target <<= PAGE_SHIFT;
243 */ 259 do {
244 for (pfn = node_end_pfn[nid] - size; 260 node_kva_final = find_e820_area(node_kva_target,
245 pfn < node_end_pfn[nid]; pfn++) 261 ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
246 if (!page_is_ram(pfn)) 262 ((u64)size)<<PAGE_SHIFT,
247 break; 263 LARGE_PAGE_BYTES);
248 264 node_kva_target -= LARGE_PAGE_BYTES;
249 if (pfn != node_end_pfn[nid]) 265 } while (node_kva_final == -1ULL &&
250 size = 0; 266 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
267
268 if (node_kva_final == -1ULL)
269 panic("Can not get kva ram\n");
251 270
252 printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
253 size, nid);
254 node_remap_size[nid] = size; 271 node_remap_size[nid] = size;
255 node_remap_offset[nid] = reserve_pages; 272 node_remap_offset[nid] = reserve_pages;
256 reserve_pages += size; 273 reserve_pages += size;
257 printk("Shrinking node %d from %ld pages to %ld pages\n", 274 printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
258 nid, node_end_pfn[nid], node_end_pfn[nid] - size); 275 " node %d at %llx\n",
259 276 size, nid, node_kva_final>>PAGE_SHIFT);
260 if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) { 277
261 /* 278 /*
262 * Align node_end_pfn[] and node_remap_start_pfn[] to 279 * prevent kva address below max_low_pfn want it on system
263 * pmd boundary. remap_numa_kva will barf otherwise. 280 * with less memory later.
264 */ 281 * layout will be: KVA address , KVA RAM
265 printk("Shrinking node %d further by %ld pages for proper alignment\n", 282 *
266 nid, node_end_pfn[nid] & (PTRS_PER_PTE-1)); 283 * we are supposed to only record the one less then max_low_pfn
267 size += node_end_pfn[nid] & (PTRS_PER_PTE-1); 284 * but we could have some hole in high memory, and it will only
268 } 285 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
286 * to use it as free.
287 * So reserve_early here, hope we don't run out of that array
288 */
289 reserve_early(node_kva_final,
290 node_kva_final+(((u64)size)<<PAGE_SHIFT),
291 "KVA RAM");
269 292
270 node_end_pfn[nid] -= size; 293 node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
271 node_remap_start_pfn[nid] = node_end_pfn[nid]; 294 remove_active_range(nid, node_remap_start_pfn[nid],
272 shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]); 295 node_remap_start_pfn[nid] + size);
273 } 296 }
274 printk("Reserving total of %ld pages for numa KVA remap\n", 297 printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
275 reserve_pages); 298 reserve_pages);
276 return reserve_pages; 299 return reserve_pages;
277} 300}
@@ -285,37 +308,16 @@ static void init_remap_allocator(int nid)
285 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + 308 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
286 ALIGN(sizeof(pg_data_t), PAGE_SIZE); 309 ALIGN(sizeof(pg_data_t), PAGE_SIZE);
287 310
288 printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, 311 printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
289 (ulong) node_remap_start_vaddr[nid], 312 (ulong) node_remap_start_vaddr[nid],
290 (ulong) pfn_to_kaddr(highstart_pfn 313 (ulong) node_remap_end_vaddr[nid]);
291 + node_remap_offset[nid] + node_remap_size[nid]));
292}
293#else
294void *alloc_remap(int nid, unsigned long size)
295{
296 return NULL;
297}
298
299static unsigned long calculate_numa_remap_pages(void)
300{
301 return 0;
302}
303
304static void init_remap_allocator(int nid)
305{
306}
307
308void __init remap_numa_kva(void)
309{
310} 314}
311#endif /* CONFIG_DISCONTIGMEM */
312 315
313extern void setup_bootmem_allocator(void); 316void __init initmem_init(unsigned long start_pfn,
314unsigned long __init setup_memory(void) 317 unsigned long end_pfn)
315{ 318{
316 int nid; 319 int nid;
317 unsigned long system_start_pfn, system_max_low_pfn; 320 long kva_target_pfn;
318 unsigned long wasted_pages;
319 321
320 /* 322 /*
321 * When mapping a NUMA machine we allocate the node_mem_map arrays 323 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -324,109 +326,77 @@ unsigned long __init setup_memory(void)
324 * this space and use it to adjust the boundary between ZONE_NORMAL 326 * this space and use it to adjust the boundary between ZONE_NORMAL
325 * and ZONE_HIGHMEM. 327 * and ZONE_HIGHMEM.
326 */ 328 */
327 get_memcfg_numa();
328 329
329 kva_pages = calculate_numa_remap_pages(); 330 get_memcfg_numa();
330 331
331 /* partially used pages are not usable - thus round upwards */ 332 kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
332 system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
333 333
334 kva_start_pfn = find_max_low_pfn() - kva_pages; 334 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
335 do {
336 kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
337 max_low_pfn<<PAGE_SHIFT,
338 kva_pages<<PAGE_SHIFT,
339 PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
340 kva_target_pfn -= PTRS_PER_PTE;
341 } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
335 342
336#ifdef CONFIG_BLK_DEV_INITRD 343 if (kva_start_pfn == -1UL)
337 /* Numa kva area is below the initrd */ 344 panic("Can not get kva space\n");
338 if (initrd_start)
339 kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
340 - kva_pages;
341#endif
342 345
343 /* 346 printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
344 * We waste pages past at the end of the KVA for no good reason other
345 * than how it is located. This is bad.
346 */
347 wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
348 kva_start_pfn -= wasted_pages;
349 kva_pages += wasted_pages;
350
351 system_max_low_pfn = max_low_pfn = find_max_low_pfn();
352 printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
353 kva_start_pfn, max_low_pfn); 347 kva_start_pfn, max_low_pfn);
354 printk("max_pfn = %ld\n", max_pfn); 348 printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
349
350 /* avoid clash with initrd */
351 reserve_early(kva_start_pfn<<PAGE_SHIFT,
352 (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
353 "KVA PG");
355#ifdef CONFIG_HIGHMEM 354#ifdef CONFIG_HIGHMEM
356 highstart_pfn = highend_pfn = max_pfn; 355 highstart_pfn = highend_pfn = max_pfn;
357 if (max_pfn > system_max_low_pfn) 356 if (max_pfn > max_low_pfn)
358 highstart_pfn = system_max_low_pfn; 357 highstart_pfn = max_low_pfn;
359 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", 358 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
360 pages_to_mb(highend_pfn - highstart_pfn)); 359 pages_to_mb(highend_pfn - highstart_pfn));
361 num_physpages = highend_pfn; 360 num_physpages = highend_pfn;
362 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; 361 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
363#else 362#else
364 num_physpages = system_max_low_pfn; 363 num_physpages = max_low_pfn;
365 high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1; 364 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
366#endif 365#endif
367 printk(KERN_NOTICE "%ldMB LOWMEM available.\n", 366 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
368 pages_to_mb(system_max_low_pfn)); 367 pages_to_mb(max_low_pfn));
369 printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", 368 printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
370 min_low_pfn, max_low_pfn, highstart_pfn); 369 max_low_pfn, highstart_pfn);
371 370
372 printk("Low memory ends at vaddr %08lx\n", 371 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
373 (ulong) pfn_to_kaddr(max_low_pfn)); 372 (ulong) pfn_to_kaddr(max_low_pfn));
374 for_each_online_node(nid) { 373 for_each_online_node(nid) {
375 init_remap_allocator(nid); 374 init_remap_allocator(nid);
376 375
377 allocate_pgdat(nid); 376 allocate_pgdat(nid);
378 } 377 }
379 printk("High memory starts at vaddr %08lx\n", 378 remap_numa_kva();
379
380 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
380 (ulong) pfn_to_kaddr(highstart_pfn)); 381 (ulong) pfn_to_kaddr(highstart_pfn));
381 for_each_online_node(nid) 382 for_each_online_node(nid)
382 propagate_e820_map_node(nid); 383 propagate_e820_map_node(nid);
383 384
384 memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); 385 for_each_online_node(nid)
386 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
387
385 NODE_DATA(0)->bdata = &node0_bdata; 388 NODE_DATA(0)->bdata = &node0_bdata;
386 setup_bootmem_allocator(); 389 setup_bootmem_allocator();
387 return max_low_pfn;
388}
389
390void __init numa_kva_reserve(void)
391{
392 if (kva_pages)
393 reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
394 BOOTMEM_DEFAULT);
395} 390}
396 391
397void __init zone_sizes_init(void) 392void __init set_highmem_pages_init(void)
398{
399 int nid;
400 unsigned long max_zone_pfns[MAX_NR_ZONES];
401 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
402 max_zone_pfns[ZONE_DMA] =
403 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
404 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
405#ifdef CONFIG_HIGHMEM
406 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
407#endif
408
409 /* If SRAT has not registered memory, register it now */
410 if (find_max_pfn_with_active_regions() == 0) {
411 for_each_online_node(nid) {
412 if (node_has_online_mem(nid))
413 add_active_range(nid, node_start_pfn[nid],
414 node_end_pfn[nid]);
415 }
416 }
417
418 free_area_init_nodes(max_zone_pfns);
419 return;
420}
421
422void __init set_highmem_pages_init(int bad_ppro)
423{ 393{
424#ifdef CONFIG_HIGHMEM 394#ifdef CONFIG_HIGHMEM
425 struct zone *zone; 395 struct zone *zone;
426 struct page *page; 396 int nid;
427 397
428 for_each_zone(zone) { 398 for_each_zone(zone) {
429 unsigned long node_pfn, zone_start_pfn, zone_end_pfn; 399 unsigned long zone_start_pfn, zone_end_pfn;
430 400
431 if (!is_highmem(zone)) 401 if (!is_highmem(zone))
432 continue; 402 continue;
@@ -434,16 +404,12 @@ void __init set_highmem_pages_init(int bad_ppro)
434 zone_start_pfn = zone->zone_start_pfn; 404 zone_start_pfn = zone->zone_start_pfn;
435 zone_end_pfn = zone_start_pfn + zone->spanned_pages; 405 zone_end_pfn = zone_start_pfn + zone->spanned_pages;
436 406
437 printk("Initializing %s for node %d (%08lx:%08lx)\n", 407 nid = zone_to_nid(zone);
438 zone->name, zone_to_nid(zone), 408 printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
439 zone_start_pfn, zone_end_pfn); 409 zone->name, nid, zone_start_pfn, zone_end_pfn);
440 410
441 for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { 411 add_highpages_with_active_regions(nid, zone_start_pfn,
442 if (!pfn_valid(node_pfn)) 412 zone_end_pfn);
443 continue;
444 page = pfn_to_page(node_pfn);
445 add_one_highpage_init(page, node_pfn, bad_ppro);
446 }
447 } 413 }
448 totalram_pages += totalhigh_pages; 414 totalram_pages += totalhigh_pages;
449#endif 415#endif
@@ -476,3 +442,4 @@ int memory_add_physaddr_to_nid(u64 addr)
476 442
477EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 443EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
478#endif 444#endif
445
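
Note on the discontig_32.c hunk above: it replaces the old "shrink the node" approach with a downward search of the e820 map for a free, large-page-aligned block near the top of each node (the do/while loops around find_e820_area() in calculate_numa_remap_pages() and initmem_init()). Below is a minimal user-space sketch of that retry loop; find_free_area() is a hypothetical stand-in for the kernel's find_e820_area(), and the memory layout is invented for the example.

/*
 * Sketch of the downward search used for the per-node KVA remap area:
 * try to place a block of `size` bytes ending below the node's top,
 * and on failure slide the target down by one large page and retry
 * until we run out of room in the node.
 */
#include <stdio.h>
#include <stdint.h>

#define LARGE_PAGE_BYTES  (1UL << 22)           /* 4 MB with 2-level paging */

/* Hypothetical stand-in: pretend only [16 MB, 64 MB) is usable RAM. */
static uint64_t find_free_area(uint64_t start, uint64_t end, uint64_t size,
                               uint64_t align)
{
        uint64_t base = (start + align - 1) & ~(align - 1);

        if (base < (16UL << 20))
                return -1ULL;                   /* below usable RAM */
        if (base + size > end || base + size > (64UL << 20))
                return -1ULL;                   /* runs past usable RAM */
        return base;
}

int main(void)
{
        uint64_t node_start = 4UL << 20;        /* node spans 4 MB .. 48 MB */
        uint64_t node_end   = 48UL << 20;
        uint64_t size       = 8UL << 20;        /* want an 8 MB KVA RAM block */
        uint64_t target     = (node_end - size) & ~(LARGE_PAGE_BYTES - 1);
        uint64_t final;

        do {
                final = find_free_area(target, node_end, size,
                                       LARGE_PAGE_BYTES);
                target -= LARGE_PAGE_BYTES;     /* slide down and retry */
        } while (final == -1ULL && target > node_start);

        if (final == -1ULL)
                printf("no room for KVA RAM in this node\n");
        else
                printf("KVA RAM at %#llx\n", (unsigned long long)final);
        return 0;
}

The same pattern, try a target, slide down one large page on failure, give up at the node's start, is what lets the remap area stay PMD-aligned without shrinking node_end_pfn[] as the old code did.
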
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 2c24bea92c66..0bb0caed8971 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -42,7 +42,7 @@ static struct addr_marker address_markers[] = {
42 { 0, "User Space" }, 42 { 0, "User Space" },
43#ifdef CONFIG_X86_64 43#ifdef CONFIG_X86_64
44 { 0x8000000000000000UL, "Kernel Space" }, 44 { 0x8000000000000000UL, "Kernel Space" },
45 { 0xffff810000000000UL, "Low Kernel Mapping" }, 45 { PAGE_OFFSET, "Low Kernel Mapping" },
46 { VMALLOC_START, "vmalloc() Area" }, 46 { VMALLOC_START, "vmalloc() Area" },
47 { VMEMMAP_START, "Vmemmap" }, 47 { VMEMMAP_START, "Vmemmap" },
48 { __START_KERNEL_map, "High Kernel Mapping" }, 48 { __START_KERNEL_map, "High Kernel Mapping" },
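
Note on the dump_pagetables.c hunk above: it swaps a hard-coded 0xffff810000000000UL for PAGE_OFFSET in the address-marker table, so the "Low Kernel Mapping" entry tracks wherever the direct mapping actually starts. A small stand-alone sketch of such a marker table follows; the PAGE_OFFSET value and table contents are illustrative, not the kernel's exact layout.

#include <stdio.h>

#define PAGE_OFFSET 0xffff880000000000ULL      /* assumed direct-map base */

struct addr_marker {
        unsigned long long start_address;
        const char *name;
};

/* Keying the table on PAGE_OFFSET keeps it correct if the base moves. */
static const struct addr_marker markers[] = {
        { 0,                        "User Space" },
        { 0x8000000000000000ULL,    "Kernel Space" },
        { PAGE_OFFSET,              "Low Kernel Mapping" },
};

int main(void)
{
        for (unsigned i = 0; i < sizeof(markers) / sizeof(markers[0]); i++)
                printf("%#018llx  %s\n", markers[i].start_address,
                       markers[i].name);
        return 0;
}
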
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fd7e1798c75a..455f3fe67b42 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -10,6 +10,7 @@
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/types.h> 11#include <linux/types.h>
12#include <linux/ptrace.h> 12#include <linux/ptrace.h>
13#include <linux/mmiotrace.h>
13#include <linux/mman.h> 14#include <linux/mman.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15#include <linux/smp.h> 16#include <linux/smp.h>
@@ -49,17 +50,23 @@
49#define PF_RSVD (1<<3) 50#define PF_RSVD (1<<3)
50#define PF_INSTR (1<<4) 51#define PF_INSTR (1<<4)
51 52
53static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
54{
55#ifdef CONFIG_MMIOTRACE_HOOKS
56 if (unlikely(is_kmmio_active()))
57 if (kmmio_handler(regs, addr) == 1)
58 return -1;
59#endif
60 return 0;
61}
62
52static inline int notify_page_fault(struct pt_regs *regs) 63static inline int notify_page_fault(struct pt_regs *regs)
53{ 64{
54#ifdef CONFIG_KPROBES 65#ifdef CONFIG_KPROBES
55 int ret = 0; 66 int ret = 0;
56 67
57 /* kprobe_running() needs smp_processor_id() */ 68 /* kprobe_running() needs smp_processor_id() */
58#ifdef CONFIG_X86_32
59 if (!user_mode_vm(regs)) { 69 if (!user_mode_vm(regs)) {
60#else
61 if (!user_mode(regs)) {
62#endif
63 preempt_disable(); 70 preempt_disable();
64 if (kprobe_running() && kprobe_fault_handler(regs, 14)) 71 if (kprobe_running() && kprobe_fault_handler(regs, 14))
65 ret = 1; 72 ret = 1;
@@ -396,11 +403,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
396 printk(KERN_CONT "NULL pointer dereference"); 403 printk(KERN_CONT "NULL pointer dereference");
397 else 404 else
398 printk(KERN_CONT "paging request"); 405 printk(KERN_CONT "paging request");
399#ifdef CONFIG_X86_32 406 printk(KERN_CONT " at %p\n", (void *) address);
400 printk(KERN_CONT " at %08lx\n", address);
401#else
402 printk(KERN_CONT " at %016lx\n", address);
403#endif
404 printk(KERN_ALERT "IP:"); 407 printk(KERN_ALERT "IP:");
405 printk_address(regs->ip, 1); 408 printk_address(regs->ip, 1);
406 dump_pagetable(address); 409 dump_pagetable(address);
@@ -497,6 +500,11 @@ static int vmalloc_fault(unsigned long address)
497 unsigned long pgd_paddr; 500 unsigned long pgd_paddr;
498 pmd_t *pmd_k; 501 pmd_t *pmd_k;
499 pte_t *pte_k; 502 pte_t *pte_k;
503
504 /* Make sure we are in vmalloc area */
505 if (!(address >= VMALLOC_START && address < VMALLOC_END))
506 return -1;
507
500 /* 508 /*
501 * Synchronize this task's top level page-table 509 * Synchronize this task's top level page-table
502 * with the 'reference' page table. 510 * with the 'reference' page table.
@@ -601,6 +609,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
601 609
602 if (notify_page_fault(regs)) 610 if (notify_page_fault(regs))
603 return; 611 return;
612 if (unlikely(kmmio_fault(regs, address)))
613 return;
604 614
605 /* 615 /*
606 * We fault-in kernel-space virtual memory on-demand. The 616 * We fault-in kernel-space virtual memory on-demand. The
@@ -795,14 +805,10 @@ bad_area_nosemaphore:
795 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && 805 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
796 printk_ratelimit()) { 806 printk_ratelimit()) {
797 printk( 807 printk(
798#ifdef CONFIG_X86_32 808 "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
799 "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
800#else
801 "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
802#endif
803 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, 809 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
804 tsk->comm, task_pid_nr(tsk), address, regs->ip, 810 tsk->comm, task_pid_nr(tsk), address,
805 regs->sp, error_code); 811 (void *) regs->ip, (void *) regs->sp, error_code);
806 print_vma_addr(" in ", regs->ip); 812 print_vma_addr(" in ", regs->ip);
807 printk("\n"); 813 printk("\n");
808 } 814 }
@@ -910,14 +916,7 @@ LIST_HEAD(pgd_list);
910void vmalloc_sync_all(void) 916void vmalloc_sync_all(void)
911{ 917{
912#ifdef CONFIG_X86_32 918#ifdef CONFIG_X86_32
913 /* 919 unsigned long start = VMALLOC_START & PGDIR_MASK;
914 * Note that races in the updates of insync and start aren't
915 * problematic: insync can only get set bits added, and updates to
916 * start are only improving performance (without affecting correctness
917 * if undone).
918 */
919 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
920 static unsigned long start = TASK_SIZE;
921 unsigned long address; 920 unsigned long address;
922 921
923 if (SHARED_KERNEL_PMD) 922 if (SHARED_KERNEL_PMD)
@@ -925,56 +924,38 @@ void vmalloc_sync_all(void)
925 924
926 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); 925 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
927 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { 926 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
928 if (!test_bit(pgd_index(address), insync)) { 927 unsigned long flags;
929 unsigned long flags; 928 struct page *page;
930 struct page *page; 929
931 930 spin_lock_irqsave(&pgd_lock, flags);
932 spin_lock_irqsave(&pgd_lock, flags); 931 list_for_each_entry(page, &pgd_list, lru) {
933 list_for_each_entry(page, &pgd_list, lru) { 932 if (!vmalloc_sync_one(page_address(page),
934 if (!vmalloc_sync_one(page_address(page), 933 address))
935 address)) 934 break;
936 break;
937 }
938 spin_unlock_irqrestore(&pgd_lock, flags);
939 if (!page)
940 set_bit(pgd_index(address), insync);
941 } 935 }
942 if (address == start && test_bit(pgd_index(address), insync)) 936 spin_unlock_irqrestore(&pgd_lock, flags);
943 start = address + PGDIR_SIZE;
944 } 937 }
945#else /* CONFIG_X86_64 */ 938#else /* CONFIG_X86_64 */
946 /* 939 unsigned long start = VMALLOC_START & PGDIR_MASK;
947 * Note that races in the updates of insync and start aren't
948 * problematic: insync can only get set bits added, and updates to
949 * start are only improving performance (without affecting correctness
950 * if undone).
951 */
952 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
953 static unsigned long start = VMALLOC_START & PGDIR_MASK;
954 unsigned long address; 940 unsigned long address;
955 941
956 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { 942 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
957 if (!test_bit(pgd_index(address), insync)) { 943 const pgd_t *pgd_ref = pgd_offset_k(address);
958 const pgd_t *pgd_ref = pgd_offset_k(address); 944 unsigned long flags;
959 unsigned long flags; 945 struct page *page;
960 struct page *page; 946
961 947 if (pgd_none(*pgd_ref))
962 if (pgd_none(*pgd_ref)) 948 continue;
963 continue; 949 spin_lock_irqsave(&pgd_lock, flags);
964 spin_lock_irqsave(&pgd_lock, flags); 950 list_for_each_entry(page, &pgd_list, lru) {
965 list_for_each_entry(page, &pgd_list, lru) { 951 pgd_t *pgd;
966 pgd_t *pgd; 952 pgd = (pgd_t *)page_address(page) + pgd_index(address);
967 pgd = (pgd_t *)page_address(page) + pgd_index(address); 953 if (pgd_none(*pgd))
968 if (pgd_none(*pgd)) 954 set_pgd(pgd, *pgd_ref);
969 set_pgd(pgd, *pgd_ref); 955 else
970 else 956 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
971 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
972 }
973 spin_unlock_irqrestore(&pgd_lock, flags);
974 set_bit(pgd_index(address), insync);
975 } 957 }
976 if (address == start) 958 spin_unlock_irqrestore(&pgd_lock, flags);
977 start = address + PGDIR_SIZE;
978 } 959 }
979#endif 960#endif
980} 961}
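
Note on the fault.c hunk above: it gives the MMIO tracer first refusal on a page fault — kmmio_fault() returns non-zero when a kmmio probe claims the address, and do_page_fault() bails out before any normal handling. A compact user-space sketch of that early-exit hook pattern follows; kmmio_active() and kmmio_claim() are hypothetical stand-ins for is_kmmio_active() and kmmio_handler(), and the watched address is made up.

#include <stdio.h>
#include <stdbool.h>

struct fault_ctx { unsigned long address; };

static bool kmmio_active(void) { return true; }   /* pretend a probe is armed */

static int kmmio_claim(struct fault_ctx *ctx)
{
        /* Claim faults on one watched MMIO page, e.g. 0xd0000000. */
        return (ctx->address & ~0xfffUL) == 0xd0000000UL;
}

static int trace_fault(struct fault_ctx *ctx)
{
        if (kmmio_active() && kmmio_claim(ctx))
                return -1;                        /* handled by the tracer */
        return 0;                                 /* fall through */
}

static void do_fault(struct fault_ctx *ctx)
{
        if (trace_fault(ctx)) {
                printf("%#lx: consumed by mmio tracer\n", ctx->address);
                return;
        }
        printf("%#lx: normal page-fault handling\n", ctx->address);
}

int main(void)
{
        struct fault_ctx mmio = { .address = 0xd0000123UL };
        struct fault_ctx heap = { .address = 0x00601000UL };

        do_fault(&mmio);
        do_fault(&heap);
        return 0;
}

In the real code the hook is wrapped in CONFIG_MMIOTRACE_HOOKS, so it compiles away entirely when the tracer is not built in.
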
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ec30d10154b6..9689a5138e64 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -50,6 +50,7 @@
50 50
51unsigned int __VMALLOC_RESERVE = 128 << 20; 51unsigned int __VMALLOC_RESERVE = 128 << 20;
52 52
53unsigned long max_low_pfn_mapped;
53unsigned long max_pfn_mapped; 54unsigned long max_pfn_mapped;
54 55
55DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 56DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -57,6 +58,27 @@ unsigned long highstart_pfn, highend_pfn;
57 58
58static noinline int do_test_wp_bit(void); 59static noinline int do_test_wp_bit(void);
59 60
61
62static unsigned long __initdata table_start;
63static unsigned long __meminitdata table_end;
64static unsigned long __meminitdata table_top;
65
66static int __initdata after_init_bootmem;
67
68static __init void *alloc_low_page(unsigned long *phys)
69{
70 unsigned long pfn = table_end++;
71 void *adr;
72
73 if (pfn >= table_top)
74 panic("alloc_low_page: ran out of memory");
75
76 adr = __va(pfn * PAGE_SIZE);
77 memset(adr, 0, PAGE_SIZE);
78 *phys = pfn * PAGE_SIZE;
79 return adr;
80}
81
60/* 82/*
61 * Creates a middle page table and puts a pointer to it in the 83 * Creates a middle page table and puts a pointer to it in the
62 * given global directory entry. This only returns the gd entry 84 * given global directory entry. This only returns the gd entry
@@ -68,9 +90,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
68 pmd_t *pmd_table; 90 pmd_t *pmd_table;
69 91
70#ifdef CONFIG_X86_PAE 92#ifdef CONFIG_X86_PAE
93 unsigned long phys;
71 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 94 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
72 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); 95 if (after_init_bootmem)
73 96 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
97 else
98 pmd_table = (pmd_t *)alloc_low_page(&phys);
74 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); 99 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
75 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 100 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
76 pud = pud_offset(pgd, 0); 101 pud = pud_offset(pgd, 0);
@@ -92,12 +117,16 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
92 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { 117 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
93 pte_t *page_table = NULL; 118 pte_t *page_table = NULL;
94 119
120 if (after_init_bootmem) {
95#ifdef CONFIG_DEBUG_PAGEALLOC 121#ifdef CONFIG_DEBUG_PAGEALLOC
96 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 122 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
97#endif 123#endif
98 if (!page_table) { 124 if (!page_table)
99 page_table = 125 page_table =
100 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); 126 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
127 } else {
128 unsigned long phys;
129 page_table = (pte_t *)alloc_low_page(&phys);
101 } 130 }
102 131
103 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); 132 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
@@ -155,38 +184,44 @@ static inline int is_kernel_text(unsigned long addr)
155 * of max_low_pfn pages, by creating page tables starting from address 184 * of max_low_pfn pages, by creating page tables starting from address
156 * PAGE_OFFSET: 185 * PAGE_OFFSET:
157 */ 186 */
158static void __init kernel_physical_mapping_init(pgd_t *pgd_base) 187static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
188 unsigned long start_pfn,
189 unsigned long end_pfn,
190 int use_pse)
159{ 191{
160 int pgd_idx, pmd_idx, pte_ofs; 192 int pgd_idx, pmd_idx, pte_ofs;
161 unsigned long pfn; 193 unsigned long pfn;
162 pgd_t *pgd; 194 pgd_t *pgd;
163 pmd_t *pmd; 195 pmd_t *pmd;
164 pte_t *pte; 196 pte_t *pte;
197 unsigned pages_2m = 0, pages_4k = 0;
165 198
166 pgd_idx = pgd_index(PAGE_OFFSET); 199 if (!cpu_has_pse)
167 pgd = pgd_base + pgd_idx; 200 use_pse = 0;
168 pfn = 0;
169 201
202 pfn = start_pfn;
203 pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
204 pgd = pgd_base + pgd_idx;
170 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { 205 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
171 pmd = one_md_table_init(pgd); 206 pmd = one_md_table_init(pgd);
172 if (pfn >= max_low_pfn)
173 continue;
174 207
175 for (pmd_idx = 0; 208 if (pfn >= end_pfn)
176 pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; 209 continue;
210#ifdef CONFIG_X86_PAE
211 pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
212 pmd += pmd_idx;
213#else
214 pmd_idx = 0;
215#endif
216 for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
177 pmd++, pmd_idx++) { 217 pmd++, pmd_idx++) {
178 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; 218 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
179 219
180 /* 220 /*
181 * Map with big pages if possible, otherwise 221 * Map with big pages if possible, otherwise
182 * create normal page tables: 222 * create normal page tables:
183 *
184 * Don't use a large page for the first 2/4MB of memory
185 * because there are often fixed size MTRRs in there
186 * and overlapping MTRRs into large pages can cause
187 * slowdowns.
188 */ 223 */
189 if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) { 224 if (use_pse) {
190 unsigned int addr2; 225 unsigned int addr2;
191 pgprot_t prot = PAGE_KERNEL_LARGE; 226 pgprot_t prot = PAGE_KERNEL_LARGE;
192 227
@@ -197,34 +232,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
197 is_kernel_text(addr2)) 232 is_kernel_text(addr2))
198 prot = PAGE_KERNEL_LARGE_EXEC; 233 prot = PAGE_KERNEL_LARGE_EXEC;
199 234
235 pages_2m++;
200 set_pmd(pmd, pfn_pmd(pfn, prot)); 236 set_pmd(pmd, pfn_pmd(pfn, prot));
201 237
202 pfn += PTRS_PER_PTE; 238 pfn += PTRS_PER_PTE;
203 max_pfn_mapped = pfn;
204 continue; 239 continue;
205 } 240 }
206 pte = one_page_table_init(pmd); 241 pte = one_page_table_init(pmd);
207 242
208 for (pte_ofs = 0; 243 pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
209 pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; 244 pte += pte_ofs;
245 for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
210 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { 246 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
211 pgprot_t prot = PAGE_KERNEL; 247 pgprot_t prot = PAGE_KERNEL;
212 248
213 if (is_kernel_text(addr)) 249 if (is_kernel_text(addr))
214 prot = PAGE_KERNEL_EXEC; 250 prot = PAGE_KERNEL_EXEC;
215 251
252 pages_4k++;
216 set_pte(pte, pfn_pte(pfn, prot)); 253 set_pte(pte, pfn_pte(pfn, prot));
217 } 254 }
218 max_pfn_mapped = pfn;
219 } 255 }
220 } 256 }
221} 257 update_page_count(PG_LEVEL_2M, pages_2m);
222 258 update_page_count(PG_LEVEL_4K, pages_4k);
223static inline int page_kills_ppro(unsigned long pagenr)
224{
225 if (pagenr >= 0x70000 && pagenr <= 0x7003F)
226 return 1;
227 return 0;
228} 259}
229 260
230/* 261/*
@@ -287,29 +318,62 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
287 pkmap_page_table = pte; 318 pkmap_page_table = pte;
288} 319}
289 320
290void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) 321static void __init add_one_highpage_init(struct page *page, int pfn)
291{ 322{
292 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { 323 ClearPageReserved(page);
293 ClearPageReserved(page); 324 init_page_count(page);
294 init_page_count(page); 325 __free_page(page);
295 __free_page(page); 326 totalhigh_pages++;
296 totalhigh_pages++;
297 } else
298 SetPageReserved(page);
299} 327}
300 328
301#ifndef CONFIG_NUMA 329struct add_highpages_data {
302static void __init set_highmem_pages_init(int bad_ppro) 330 unsigned long start_pfn;
331 unsigned long end_pfn;
332};
333
334static int __init add_highpages_work_fn(unsigned long start_pfn,
335 unsigned long end_pfn, void *datax)
303{ 336{
304 int pfn; 337 int node_pfn;
338 struct page *page;
339 unsigned long final_start_pfn, final_end_pfn;
340 struct add_highpages_data *data;
305 341
306 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { 342 data = (struct add_highpages_data *)datax;
307 /* 343
308 * Holes under sparsemem might not have no mem_map[]: 344 final_start_pfn = max(start_pfn, data->start_pfn);
309 */ 345 final_end_pfn = min(end_pfn, data->end_pfn);
310 if (pfn_valid(pfn)) 346 if (final_start_pfn >= final_end_pfn)
311 add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); 347 return 0;
348
349 for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
350 node_pfn++) {
351 if (!pfn_valid(node_pfn))
352 continue;
353 page = pfn_to_page(node_pfn);
354 add_one_highpage_init(page, node_pfn);
312 } 355 }
356
357 return 0;
358
359}
360
361void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
362 unsigned long end_pfn)
363{
364 struct add_highpages_data data;
365
366 data.start_pfn = start_pfn;
367 data.end_pfn = end_pfn;
368
369 work_with_active_regions(nid, add_highpages_work_fn, &data);
370}
371
372#ifndef CONFIG_NUMA
373static void __init set_highmem_pages_init(void)
374{
375 add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
376
313 totalram_pages += totalhigh_pages; 377 totalram_pages += totalhigh_pages;
314} 378}
315#endif /* !CONFIG_NUMA */ 379#endif /* !CONFIG_NUMA */
@@ -317,14 +381,9 @@ static void __init set_highmem_pages_init(int bad_ppro)
317#else 381#else
318# define kmap_init() do { } while (0) 382# define kmap_init() do { } while (0)
319# define permanent_kmaps_init(pgd_base) do { } while (0) 383# define permanent_kmaps_init(pgd_base) do { } while (0)
320# define set_highmem_pages_init(bad_ppro) do { } while (0) 384# define set_highmem_pages_init() do { } while (0)
321#endif /* CONFIG_HIGHMEM */ 385#endif /* CONFIG_HIGHMEM */
322 386
323pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
324EXPORT_SYMBOL(__PAGE_KERNEL);
325
326pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
327
328void __init native_pagetable_setup_start(pgd_t *base) 387void __init native_pagetable_setup_start(pgd_t *base)
329{ 388{
330 unsigned long pfn, va; 389 unsigned long pfn, va;
@@ -380,27 +439,10 @@ void __init native_pagetable_setup_done(pgd_t *base)
380 * be partially populated, and so it avoids stomping on any existing 439 * be partially populated, and so it avoids stomping on any existing
381 * mappings. 440 * mappings.
382 */ 441 */
383static void __init pagetable_init(void) 442static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
384{ 443{
385 pgd_t *pgd_base = swapper_pg_dir;
386 unsigned long vaddr, end; 444 unsigned long vaddr, end;
387 445
388 paravirt_pagetable_setup_start(pgd_base);
389
390 /* Enable PSE if available */
391 if (cpu_has_pse)
392 set_in_cr4(X86_CR4_PSE);
393
394 /* Enable PGE if available */
395 if (cpu_has_pge) {
396 set_in_cr4(X86_CR4_PGE);
397 __PAGE_KERNEL |= _PAGE_GLOBAL;
398 __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
399 }
400
401 kernel_physical_mapping_init(pgd_base);
402 remap_numa_kva();
403
404 /* 446 /*
405 * Fixed mappings, only the page table structure has to be 447 * Fixed mappings, only the page table structure has to be
406 * created - mappings will be set by set_fixmap(): 448 * created - mappings will be set by set_fixmap():
@@ -410,6 +452,13 @@ static void __init pagetable_init(void)
410 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; 452 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
411 page_table_range_init(vaddr, end, pgd_base); 453 page_table_range_init(vaddr, end, pgd_base);
412 early_ioremap_reset(); 454 early_ioremap_reset();
455}
456
457static void __init pagetable_init(void)
458{
459 pgd_t *pgd_base = swapper_pg_dir;
460
461 paravirt_pagetable_setup_start(pgd_base);
413 462
414 permanent_kmaps_init(pgd_base); 463 permanent_kmaps_init(pgd_base);
415 464
@@ -456,7 +505,7 @@ void zap_low_mappings(void)
456 505
457int nx_enabled; 506int nx_enabled;
458 507
459pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX; 508pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
460EXPORT_SYMBOL_GPL(__supported_pte_mask); 509EXPORT_SYMBOL_GPL(__supported_pte_mask);
461 510
462#ifdef CONFIG_X86_PAE 511#ifdef CONFIG_X86_PAE
@@ -509,27 +558,318 @@ static void __init set_nx(void)
509} 558}
510#endif 559#endif
511 560
561/* user-defined highmem size */
562static unsigned int highmem_pages = -1;
563
512/* 564/*
513 * paging_init() sets up the page tables - note that the first 8MB are 565 * highmem=size forces highmem to be exactly 'size' bytes.
514 * already mapped by head.S. 566 * This works even on boxes that have no highmem otherwise.
515 * 567 * This also works to reduce highmem size on bigger boxes.
516 * This routines also unmaps the page at virtual kernel address 0, so
517 * that we can trap those pesky NULL-reference errors in the kernel.
518 */ 568 */
519void __init paging_init(void) 569static int __init parse_highmem(char *arg)
570{
571 if (!arg)
572 return -EINVAL;
573
574 highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
575 return 0;
576}
577early_param("highmem", parse_highmem);
578
579/*
580 * Determine low and high memory ranges:
581 */
582void __init find_low_pfn_range(void)
583{
584 /* it could update max_pfn */
585
586 /* max_low_pfn is 0, we already have early_res support */
587
588 max_low_pfn = max_pfn;
589 if (max_low_pfn > MAXMEM_PFN) {
590 if (highmem_pages == -1)
591 highmem_pages = max_pfn - MAXMEM_PFN;
592 if (highmem_pages + MAXMEM_PFN < max_pfn)
593 max_pfn = MAXMEM_PFN + highmem_pages;
594 if (highmem_pages + MAXMEM_PFN > max_pfn) {
595 printk(KERN_WARNING "only %luMB highmem pages "
596 "available, ignoring highmem size of %uMB.\n",
597 pages_to_mb(max_pfn - MAXMEM_PFN),
598 pages_to_mb(highmem_pages));
599 highmem_pages = 0;
600 }
601 max_low_pfn = MAXMEM_PFN;
602#ifndef CONFIG_HIGHMEM
603 /* Maximum memory usable is what is directly addressable */
604 printk(KERN_WARNING "Warning only %ldMB will be used.\n",
605 MAXMEM>>20);
606 if (max_pfn > MAX_NONPAE_PFN)
607 printk(KERN_WARNING
608 "Use a HIGHMEM64G enabled kernel.\n");
609 else
610 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
611 max_pfn = MAXMEM_PFN;
612#else /* !CONFIG_HIGHMEM */
613#ifndef CONFIG_HIGHMEM64G
614 if (max_pfn > MAX_NONPAE_PFN) {
615 max_pfn = MAX_NONPAE_PFN;
616 printk(KERN_WARNING "Warning only 4GB will be used."
617 "Use a HIGHMEM64G enabled kernel.\n");
618 }
619#endif /* !CONFIG_HIGHMEM64G */
620#endif /* !CONFIG_HIGHMEM */
621 } else {
622 if (highmem_pages == -1)
623 highmem_pages = 0;
624#ifdef CONFIG_HIGHMEM
625 if (highmem_pages >= max_pfn) {
626 printk(KERN_ERR "highmem size specified (%uMB) is "
627 "bigger than pages available (%luMB)!.\n",
628 pages_to_mb(highmem_pages),
629 pages_to_mb(max_pfn));
630 highmem_pages = 0;
631 }
632 if (highmem_pages) {
633 if (max_low_pfn - highmem_pages <
634 64*1024*1024/PAGE_SIZE){
635 printk(KERN_ERR "highmem size %uMB results in "
636 "smaller than 64MB lowmem, ignoring it.\n"
637 , pages_to_mb(highmem_pages));
638 highmem_pages = 0;
639 }
640 max_low_pfn -= highmem_pages;
641 }
642#else
643 if (highmem_pages)
644 printk(KERN_ERR "ignoring highmem size on non-highmem"
645 " kernel!\n");
646#endif
647 }
648}
649
650#ifndef CONFIG_NEED_MULTIPLE_NODES
651void __init initmem_init(unsigned long start_pfn,
652 unsigned long end_pfn)
520{ 653{
654#ifdef CONFIG_HIGHMEM
655 highstart_pfn = highend_pfn = max_pfn;
656 if (max_pfn > max_low_pfn)
657 highstart_pfn = max_low_pfn;
658 memory_present(0, 0, highend_pfn);
659 e820_register_active_regions(0, 0, highend_pfn);
660 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
661 pages_to_mb(highend_pfn - highstart_pfn));
662 num_physpages = highend_pfn;
663 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
664#else
665 memory_present(0, 0, max_low_pfn);
666 e820_register_active_regions(0, 0, max_low_pfn);
667 num_physpages = max_low_pfn;
668 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
669#endif
670#ifdef CONFIG_FLATMEM
671 max_mapnr = num_physpages;
672#endif
673 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
674 pages_to_mb(max_low_pfn));
675
676 setup_bootmem_allocator();
677}
678#endif /* !CONFIG_NEED_MULTIPLE_NODES */
679
680static void __init zone_sizes_init(void)
681{
682 unsigned long max_zone_pfns[MAX_NR_ZONES];
683 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
684 max_zone_pfns[ZONE_DMA] =
685 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
686 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
687#ifdef CONFIG_HIGHMEM
688 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
689#endif
690
691 free_area_init_nodes(max_zone_pfns);
692}
693
694void __init setup_bootmem_allocator(void)
695{
696 int i;
697 unsigned long bootmap_size, bootmap;
698 /*
699 * Initialize the boot-time allocator (with low memory only):
700 */
701 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
702 bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
703 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
704 PAGE_SIZE);
705 if (bootmap == -1L)
706 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
707 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
708
709 /* don't touch min_low_pfn */
710 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
711 min_low_pfn, max_low_pfn);
712 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
713 max_pfn_mapped<<PAGE_SHIFT);
714 printk(KERN_INFO " low ram: %08lx - %08lx\n",
715 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
716 printk(KERN_INFO " bootmap %08lx - %08lx\n",
717 bootmap, bootmap + bootmap_size);
718 for_each_online_node(i)
719 free_bootmem_with_active_regions(i, max_low_pfn);
720 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
721
722 after_init_bootmem = 1;
723}
724
725static void __init find_early_table_space(unsigned long end)
726{
727 unsigned long puds, pmds, ptes, tables, start;
728
729 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
730 tables = PAGE_ALIGN(puds * sizeof(pud_t));
731
732 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
733 tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
734
735 if (cpu_has_pse) {
736 unsigned long extra;
737
738 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
739 extra += PMD_SIZE;
740 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
741 } else
742 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
743
744 tables += PAGE_ALIGN(ptes * sizeof(pte_t));
745
746 /* for fixmap */
747 tables += PAGE_SIZE * 2;
748
749 /*
750 * RED-PEN putting page tables only on node 0 could
751 * cause a hotspot and fill up ZONE_DMA. The page tables
752 * need roughly 0.5KB per GB.
753 */
754 start = 0x7000;
755 table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
756 tables, PAGE_SIZE);
757 if (table_start == -1UL)
758 panic("Cannot find space for the kernel page tables");
759
760 table_start >>= PAGE_SHIFT;
761 table_end = table_start;
762 table_top = table_start + (tables>>PAGE_SHIFT);
763
764 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
765 end, table_start << PAGE_SHIFT,
766 (table_start << PAGE_SHIFT) + tables);
767}
768
769unsigned long __init_refok init_memory_mapping(unsigned long start,
770 unsigned long end)
771{
772 pgd_t *pgd_base = swapper_pg_dir;
773 unsigned long start_pfn, end_pfn;
774 unsigned long big_page_start;
775
776 /*
777 * Find space for the kernel direct mapping tables.
778 */
779 if (!after_init_bootmem)
780 find_early_table_space(end);
781
521#ifdef CONFIG_X86_PAE 782#ifdef CONFIG_X86_PAE
522 set_nx(); 783 set_nx();
523 if (nx_enabled) 784 if (nx_enabled)
524 printk(KERN_INFO "NX (Execute Disable) protection: active\n"); 785 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
525#endif 786#endif
526 pagetable_init(); 787
788 /* Enable PSE if available */
789 if (cpu_has_pse)
790 set_in_cr4(X86_CR4_PSE);
791
792 /* Enable PGE if available */
793 if (cpu_has_pge) {
794 set_in_cr4(X86_CR4_PGE);
795 __supported_pte_mask |= _PAGE_GLOBAL;
796 }
797
798 /*
799 * Don't use a large page for the first 2/4MB of memory
800 * because there are often fixed size MTRRs in there
801 * and overlapping MTRRs into large pages can cause
802 * slowdowns.
803 */
804 big_page_start = PMD_SIZE;
805
806 if (start < big_page_start) {
807 start_pfn = start >> PAGE_SHIFT;
808 end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
809 } else {
810 /* head is not big page alignment ? */
811 start_pfn = start >> PAGE_SHIFT;
812 end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
813 << (PMD_SHIFT - PAGE_SHIFT);
814 }
815 if (start_pfn < end_pfn)
816 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
817
818 /* big page range */
819 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
820 << (PMD_SHIFT - PAGE_SHIFT);
821 if (start_pfn < (big_page_start >> PAGE_SHIFT))
822 start_pfn = big_page_start >> PAGE_SHIFT;
823 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
824 if (start_pfn < end_pfn)
825 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
826 cpu_has_pse);
827
828 /* tail is not big page alignment ? */
829 start_pfn = end_pfn;
830 if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
831 end_pfn = end >> PAGE_SHIFT;
832 if (start_pfn < end_pfn)
833 kernel_physical_mapping_init(pgd_base, start_pfn,
834 end_pfn, 0);
835 }
836
837 early_ioremap_page_table_range_init(pgd_base);
527 838
528 load_cr3(swapper_pg_dir); 839 load_cr3(swapper_pg_dir);
529 840
530 __flush_tlb_all(); 841 __flush_tlb_all();
531 842
843 if (!after_init_bootmem)
844 reserve_early(table_start << PAGE_SHIFT,
845 table_end << PAGE_SHIFT, "PGTABLE");
846
847 return end >> PAGE_SHIFT;
848}
849
850
851/*
852 * paging_init() sets up the page tables - note that the first 8MB are
853 * already mapped by head.S.
854 *
855 * This routines also unmaps the page at virtual kernel address 0, so
856 * that we can trap those pesky NULL-reference errors in the kernel.
857 */
858void __init paging_init(void)
859{
860 pagetable_init();
861
862 __flush_tlb_all();
863
532 kmap_init(); 864 kmap_init();
865
866 /*
867 * NOTE: at this point the bootmem allocator is fully available.
868 */
869 sparse_init();
870 zone_sizes_init();
871
872 paravirt_post_allocator_init();
533} 873}
534 874
535/* 875/*
@@ -564,24 +904,11 @@ static struct kcore_list kcore_mem, kcore_vmalloc;
564void __init mem_init(void) 904void __init mem_init(void)
565{ 905{
566 int codesize, reservedpages, datasize, initsize; 906 int codesize, reservedpages, datasize, initsize;
567 int tmp, bad_ppro; 907 int tmp;
568 908
569#ifdef CONFIG_FLATMEM 909#ifdef CONFIG_FLATMEM
570 BUG_ON(!mem_map); 910 BUG_ON(!mem_map);
571#endif 911#endif
572 bad_ppro = ppro_with_ram_bug();
573
574#ifdef CONFIG_HIGHMEM
575 /* check that fixmap and pkmap do not overlap */
576 if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
577 printk(KERN_ERR
578 "fixmap and kmap areas overlap - this will crash\n");
579 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
580 PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
581 FIXADDR_START);
582 BUG();
583 }
584#endif
585 /* this will put all low memory onto the freelists */ 912 /* this will put all low memory onto the freelists */
586 totalram_pages += free_all_bootmem(); 913 totalram_pages += free_all_bootmem();
587 914
@@ -593,7 +920,7 @@ void __init mem_init(void)
593 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) 920 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
594 reservedpages++; 921 reservedpages++;
595 922
596 set_highmem_pages_init(bad_ppro); 923 set_highmem_pages_init();
597 924
598 codesize = (unsigned long) &_etext - (unsigned long) &_text; 925 codesize = (unsigned long) &_etext - (unsigned long) &_text;
599 datasize = (unsigned long) &_edata - (unsigned long) &_etext; 926 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
@@ -614,7 +941,6 @@ void __init mem_init(void)
614 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) 941 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
615 ); 942 );
616 943
617#if 1 /* double-sanity-check paranoia */
618 printk(KERN_INFO "virtual kernel memory layout:\n" 944 printk(KERN_INFO "virtual kernel memory layout:\n"
619 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 945 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
620#ifdef CONFIG_HIGHMEM 946#ifdef CONFIG_HIGHMEM
@@ -655,7 +981,6 @@ void __init mem_init(void)
655#endif 981#endif
656 BUG_ON(VMALLOC_START > VMALLOC_END); 982 BUG_ON(VMALLOC_START > VMALLOC_END);
657 BUG_ON((unsigned long)high_memory > VMALLOC_START); 983 BUG_ON((unsigned long)high_memory > VMALLOC_START);
658#endif /* double-sanity-check paranoia */
659 984
660 if (boot_cpu_data.wp_works_ok < 0) 985 if (boot_cpu_data.wp_works_ok < 0)
661 test_wp_bit(); 986 test_wp_bit();
@@ -710,6 +1035,8 @@ void mark_rodata_ro(void)
710 unsigned long start = PFN_ALIGN(_text); 1035 unsigned long start = PFN_ALIGN(_text);
711 unsigned long size = PFN_ALIGN(_etext) - start; 1036 unsigned long size = PFN_ALIGN(_etext) - start;
712 1037
1038#ifndef CONFIG_DYNAMIC_FTRACE
1039 /* Dynamic tracing modifies the kernel text section */
713 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 1040 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
714 printk(KERN_INFO "Write protecting the kernel text: %luk\n", 1041 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
715 size >> 10); 1042 size >> 10);
@@ -722,6 +1049,8 @@ void mark_rodata_ro(void)
722 printk(KERN_INFO "Testing CPA: write protecting again\n"); 1049 printk(KERN_INFO "Testing CPA: write protecting again\n");
723 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); 1050 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
724#endif 1051#endif
1052#endif /* CONFIG_DYNAMIC_FTRACE */
1053
725 start += size; 1054 start += size;
726 size = (unsigned long)__end_rodata - start; 1055 size = (unsigned long)__end_rodata - start;
727 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 1056 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
@@ -784,3 +1113,9 @@ void free_initrd_mem(unsigned long start, unsigned long end)
784 free_init_pages("initrd memory", start, end); 1113 free_init_pages("initrd memory", start, end);
785} 1114}
786#endif 1115#endif
1116
1117int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1118 int flags)
1119{
1120 return reserve_bootmem(phys, len, flags);
1121}
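
Note on the init_32.c hunk above: the reworked init_memory_mapping() maps a [start, end) range in three passes — a 4 KB-mapped head below big_page_start (so fixed-size MTRRs in the first 2/4 MB never sit under a large page), a PMD-aligned body using large pages when PSE is available, and a 4 KB-mapped tail. The sketch below reproduces just that range arithmetic in user space; the 4 MB PMD size assumes a non-PAE build and the start/end values are made up.

#include <stdio.h>

#define PAGE_SHIFT  12
#define PMD_SHIFT   22                       /* 4 MB large pages, non-PAE */
#define PMD_SIZE    (1UL << PMD_SHIFT)

static void map_range(const char *how, unsigned long s_pfn, unsigned long e_pfn)
{
        if (s_pfn < e_pfn)
                printf("%-10s pfn %#lx - %#lx\n", how, s_pfn, e_pfn);
}

int main(void)
{
        unsigned long start = 0;                     /* bytes */
        unsigned long end   = 0x1f500000;            /* 501 MB, not PMD aligned */
        unsigned long big_page_start = PMD_SIZE;     /* never large-map page 0 */
        unsigned long start_pfn, end_pfn;

        /* head: below big_page_start, or up to the next PMD boundary */
        if (start < big_page_start) {
                start_pfn = start >> PAGE_SHIFT;
                end_pfn = big_page_start >> PAGE_SHIFT;
                if ((end >> PAGE_SHIFT) < end_pfn)
                        end_pfn = end >> PAGE_SHIFT;
        } else {
                start_pfn = start >> PAGE_SHIFT;
                end_pfn = ((start + PMD_SIZE - 1) >> PMD_SHIFT)
                          << (PMD_SHIFT - PAGE_SHIFT);
        }
        map_range("4k head", start_pfn, end_pfn);

        /* body: PMD-aligned middle, mapped with large pages if PSE */
        start_pfn = ((start + PMD_SIZE - 1) >> PMD_SHIFT)
                    << (PMD_SHIFT - PAGE_SHIFT);
        if (start_pfn < (big_page_start >> PAGE_SHIFT))
                start_pfn = big_page_start >> PAGE_SHIFT;
        end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
        map_range("2m/4m body", start_pfn, end_pfn);

        /* tail: whatever is left past the last PMD boundary */
        start_pfn = end_pfn;
        if (start_pfn > (big_page_start >> PAGE_SHIFT)) {
                end_pfn = end >> PAGE_SHIFT;
                map_range("4k tail", start_pfn, end_pfn);
        }
        return 0;
}
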
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 32ba13b0f818..306049edd553 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -18,6 +18,7 @@
18#include <linux/swap.h> 18#include <linux/swap.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/initrd.h>
21#include <linux/pagemap.h> 22#include <linux/pagemap.h>
22#include <linux/bootmem.h> 23#include <linux/bootmem.h>
23#include <linux/proc_fs.h> 24#include <linux/proc_fs.h>
@@ -47,6 +48,14 @@
47#include <asm/numa.h> 48#include <asm/numa.h>
48#include <asm/cacheflush.h> 49#include <asm/cacheflush.h>
49 50
51/*
52 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
53 * The direct mapping extends to max_pfn_mapped, so that we can directly access
54 * apertures, ACPI and other tables without having to play with fixmaps.
55 */
56unsigned long max_low_pfn_mapped;
57unsigned long max_pfn_mapped;
58
50static unsigned long dma_reserve __initdata; 59static unsigned long dma_reserve __initdata;
51 60
52DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 61DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -135,26 +144,17 @@ static __init void *spp_getpage(void)
135 return ptr; 144 return ptr;
136} 145}
137 146
138static void 147void
139set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) 148set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
140{ 149{
141 pgd_t *pgd;
142 pud_t *pud; 150 pud_t *pud;
143 pmd_t *pmd; 151 pmd_t *pmd;
144 pte_t *pte, new_pte; 152 pte_t *pte;
145
146 pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
147 153
148 pgd = pgd_offset_k(vaddr); 154 pud = pud_page + pud_index(vaddr);
149 if (pgd_none(*pgd)) {
150 printk(KERN_ERR
151 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
152 return;
153 }
154 pud = pud_offset(pgd, vaddr);
155 if (pud_none(*pud)) { 155 if (pud_none(*pud)) {
156 pmd = (pmd_t *) spp_getpage(); 156 pmd = (pmd_t *) spp_getpage();
157 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); 157 pud_populate(&init_mm, pud, pmd);
158 if (pmd != pmd_offset(pud, 0)) { 158 if (pmd != pmd_offset(pud, 0)) {
159 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", 159 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
160 pmd, pmd_offset(pud, 0)); 160 pmd, pmd_offset(pud, 0));
@@ -164,13 +164,12 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
164 pmd = pmd_offset(pud, vaddr); 164 pmd = pmd_offset(pud, vaddr);
165 if (pmd_none(*pmd)) { 165 if (pmd_none(*pmd)) {
166 pte = (pte_t *) spp_getpage(); 166 pte = (pte_t *) spp_getpage();
167 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); 167 pmd_populate_kernel(&init_mm, pmd, pte);
168 if (pte != pte_offset_kernel(pmd, 0)) { 168 if (pte != pte_offset_kernel(pmd, 0)) {
169 printk(KERN_ERR "PAGETABLE BUG #02!\n"); 169 printk(KERN_ERR "PAGETABLE BUG #02!\n");
170 return; 170 return;
171 } 171 }
172 } 172 }
173 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
174 173
175 pte = pte_offset_kernel(pmd, vaddr); 174 pte = pte_offset_kernel(pmd, vaddr);
176 if (!pte_none(*pte) && pte_val(new_pte) && 175 if (!pte_none(*pte) && pte_val(new_pte) &&
@@ -185,6 +184,64 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
185 __flush_tlb_one(vaddr); 184 __flush_tlb_one(vaddr);
186} 185}
187 186
187void
188set_pte_vaddr(unsigned long vaddr, pte_t pteval)
189{
190 pgd_t *pgd;
191 pud_t *pud_page;
192
193 pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
194
195 pgd = pgd_offset_k(vaddr);
196 if (pgd_none(*pgd)) {
197 printk(KERN_ERR
198 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
199 return;
200 }
201 pud_page = (pud_t*)pgd_page_vaddr(*pgd);
202 set_pte_vaddr_pud(pud_page, vaddr, pteval);
203}
204
205/*
206 * Create large page table mappings for a range of physical addresses.
207 */
208static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
209 pgprot_t prot)
210{
211 pgd_t *pgd;
212 pud_t *pud;
213 pmd_t *pmd;
214
215 BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
216 for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
217 pgd = pgd_offset_k((unsigned long)__va(phys));
218 if (pgd_none(*pgd)) {
219 pud = (pud_t *) spp_getpage();
220 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
221 _PAGE_USER));
222 }
223 pud = pud_offset(pgd, (unsigned long)__va(phys));
224 if (pud_none(*pud)) {
225 pmd = (pmd_t *) spp_getpage();
226 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
227 _PAGE_USER));
228 }
229 pmd = pmd_offset(pud, phys);
230 BUG_ON(!pmd_none(*pmd));
231 set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
232 }
233}
234
235void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
236{
237 __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
238}
239
240void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
241{
242 __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
243}
244
188/* 245/*
189 * The head.S code sets up the kernel high mapping: 246 * The head.S code sets up the kernel high mapping:
190 * 247 *
@@ -206,27 +263,16 @@ void __init cleanup_highmap(void)
206 pmd_t *last_pmd = pmd + PTRS_PER_PMD; 263 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
207 264
208 for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { 265 for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
209 if (!pmd_present(*pmd)) 266 if (pmd_none(*pmd))
210 continue; 267 continue;
211 if (vaddr < (unsigned long) _text || vaddr > end) 268 if (vaddr < (unsigned long) _text || vaddr > end)
212 set_pmd(pmd, __pmd(0)); 269 set_pmd(pmd, __pmd(0));
213 } 270 }
214} 271}
215 272
216/* NOTE: this is meant to be run only at boot */
217void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
218{
219 unsigned long address = __fix_to_virt(idx);
220
221 if (idx >= __end_of_fixed_addresses) {
222 printk(KERN_ERR "Invalid __set_fixmap\n");
223 return;
224 }
225 set_pte_phys(address, phys, prot);
226}
227
228static unsigned long __initdata table_start; 273static unsigned long __initdata table_start;
229static unsigned long __meminitdata table_end; 274static unsigned long __meminitdata table_end;
275static unsigned long __meminitdata table_top;
230 276
231static __meminit void *alloc_low_page(unsigned long *phys) 277static __meminit void *alloc_low_page(unsigned long *phys)
232{ 278{
@@ -240,7 +286,7 @@ static __meminit void *alloc_low_page(unsigned long *phys)
240 return adr; 286 return adr;
241 } 287 }
242 288
243 if (pfn >= end_pfn) 289 if (pfn >= table_top)
244 panic("alloc_low_page: ran out of memory"); 290 panic("alloc_low_page: ran out of memory");
245 291
246 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); 292 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -257,65 +303,61 @@ static __meminit void unmap_low_page(void *adr)
257 early_iounmap(adr, PAGE_SIZE); 303 early_iounmap(adr, PAGE_SIZE);
258} 304}
259 305
260/* Must run before zap_low_mappings */ 306static unsigned long __meminit
261__meminit void *early_ioremap(unsigned long addr, unsigned long size) 307phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
262{ 308{
263 pmd_t *pmd, *last_pmd; 309 unsigned pages = 0;
264 unsigned long vaddr; 310 unsigned long last_map_addr = end;
265 int i, pmds; 311 int i;
312
313 pte_t *pte = pte_page + pte_index(addr);
266 314
267 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; 315 for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
268 vaddr = __START_KERNEL_map;
269 pmd = level2_kernel_pgt;
270 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
271 316
272 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { 317 if (addr >= end) {
273 for (i = 0; i < pmds; i++) { 318 if (!after_bootmem) {
274 if (pmd_present(pmd[i])) 319 for(; i < PTRS_PER_PTE; i++, pte++)
275 goto continue_outer_loop; 320 set_pte(pte, __pte(0));
321 }
322 break;
276 } 323 }
277 vaddr += addr & ~PMD_MASK;
278 addr &= PMD_MASK;
279 324
280 for (i = 0; i < pmds; i++, addr += PMD_SIZE) 325 if (pte_val(*pte))
281 set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); 326 continue;
282 __flush_tlb_all();
283 327
284 return (void *)vaddr; 328 if (0)
285continue_outer_loop: 329 printk(" pte=%p addr=%lx pte=%016lx\n",
286 ; 330 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
331 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
332 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
333 pages++;
287 } 334 }
288 printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size); 335 update_page_count(PG_LEVEL_4K, pages);
289 336
290 return NULL; 337 return last_map_addr;
291} 338}
292 339
293/* 340static unsigned long __meminit
294 * To avoid virtual aliases later: 341phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
295 */
296__meminit void early_iounmap(void *addr, unsigned long size)
297{ 342{
298 unsigned long vaddr; 343 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
299 pmd_t *pmd;
300 int i, pmds;
301
302 vaddr = (unsigned long)addr;
303 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
304 pmd = level2_kernel_pgt + pmd_index(vaddr);
305 344
306 for (i = 0; i < pmds; i++) 345 return phys_pte_init(pte, address, end);
307 pmd_clear(pmd + i);
308
309 __flush_tlb_all();
310} 346}
311 347
312static unsigned long __meminit 348static unsigned long __meminit
313phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) 349phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
350 unsigned long page_size_mask)
314{ 351{
352 unsigned long pages = 0;
353 unsigned long last_map_addr = end;
354
315 int i = pmd_index(address); 355 int i = pmd_index(address);
316 356
317 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { 357 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
358 unsigned long pte_phys;
318 pmd_t *pmd = pmd_page + pmd_index(address); 359 pmd_t *pmd = pmd_page + pmd_index(address);
360 pte_t *pte;
319 361
320 if (address >= end) { 362 if (address >= end) {
321 if (!after_bootmem) { 363 if (!after_bootmem) {
@@ -325,31 +367,50 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
325 break; 367 break;
326 } 368 }
327 369
328 if (pmd_val(*pmd)) 370 if (pmd_val(*pmd)) {
371 if (!pmd_large(*pmd))
372 last_map_addr = phys_pte_update(pmd, address,
373 end);
374 continue;
375 }
376
377 if (page_size_mask & (1<<PG_LEVEL_2M)) {
378 pages++;
379 set_pte((pte_t *)pmd,
380 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
381 last_map_addr = (address & PMD_MASK) + PMD_SIZE;
329 continue; 382 continue;
383 }
330 384
331 set_pte((pte_t *)pmd, 385 pte = alloc_low_page(&pte_phys);
332 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 386 last_map_addr = phys_pte_init(pte, address, end);
387 unmap_low_page(pte);
388
389 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
333 } 390 }
334 return address; 391 update_page_count(PG_LEVEL_2M, pages);
392 return last_map_addr;
335} 393}
336 394
337static unsigned long __meminit 395static unsigned long __meminit
338phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) 396phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
397 unsigned long page_size_mask)
339{ 398{
340 pmd_t *pmd = pmd_offset(pud, 0); 399 pmd_t *pmd = pmd_offset(pud, 0);
341 unsigned long last_map_addr; 400 unsigned long last_map_addr;
342 401
343 spin_lock(&init_mm.page_table_lock); 402 spin_lock(&init_mm.page_table_lock);
344 last_map_addr = phys_pmd_init(pmd, address, end); 403 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
345 spin_unlock(&init_mm.page_table_lock); 404 spin_unlock(&init_mm.page_table_lock);
346 __flush_tlb_all(); 405 __flush_tlb_all();
347 return last_map_addr; 406 return last_map_addr;
348} 407}
349 408
350static unsigned long __meminit 409static unsigned long __meminit
351phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) 410phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
411 unsigned long page_size_mask)
352{ 412{
413 unsigned long pages = 0;
353 unsigned long last_map_addr = end; 414 unsigned long last_map_addr = end;
354 int i = pud_index(addr); 415 int i = pud_index(addr);
355 416
@@ -369,11 +430,13 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
369 430
370 if (pud_val(*pud)) { 431 if (pud_val(*pud)) {
371 if (!pud_large(*pud)) 432 if (!pud_large(*pud))
372 last_map_addr = phys_pmd_update(pud, addr, end); 433 last_map_addr = phys_pmd_update(pud, addr, end,
434 page_size_mask);
373 continue; 435 continue;
374 } 436 }
375 437
376 if (direct_gbpages) { 438 if (page_size_mask & (1<<PG_LEVEL_1G)) {
439 pages++;
377 set_pte((pte_t *)pud, 440 set_pte((pte_t *)pud,
378 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 441 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
379 last_map_addr = (addr & PUD_MASK) + PUD_SIZE; 442 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
@@ -383,27 +446,50 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
383 pmd = alloc_low_page(&pmd_phys); 446 pmd = alloc_low_page(&pmd_phys);
384 447
385 spin_lock(&init_mm.page_table_lock); 448 spin_lock(&init_mm.page_table_lock);
386 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); 449 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
387 last_map_addr = phys_pmd_init(pmd, addr, end); 450 unmap_low_page(pmd);
451 pud_populate(&init_mm, pud, __va(pmd_phys));
388 spin_unlock(&init_mm.page_table_lock); 452 spin_unlock(&init_mm.page_table_lock);
389 453
390 unmap_low_page(pmd);
391 } 454 }
392 __flush_tlb_all(); 455 __flush_tlb_all();
456 update_page_count(PG_LEVEL_1G, pages);
393 457
394 return last_map_addr >> PAGE_SHIFT; 458 return last_map_addr;
459}
460
461static unsigned long __meminit
462phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
463 unsigned long page_size_mask)
464{
465 pud_t *pud;
466
467 pud = (pud_t *)pgd_page_vaddr(*pgd);
468
469 return phys_pud_init(pud, addr, end, page_size_mask);
395} 470}
396 471
397static void __init find_early_table_space(unsigned long end) 472static void __init find_early_table_space(unsigned long end)
398{ 473{
399 unsigned long puds, pmds, tables, start; 474 unsigned long puds, pmds, ptes, tables, start;
400 475
401 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 476 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
402 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); 477 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
403 if (!direct_gbpages) { 478 if (direct_gbpages) {
479 unsigned long extra;
480 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
481 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
482 } else
404 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 483 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
405 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); 484 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
406 } 485
486 if (cpu_has_pse) {
487 unsigned long extra;
488 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
489 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
490 } else
491 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
492 tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
407 493
408 /* 494 /*
409 * RED-PEN putting page tables only on node 0 could 495 * RED-PEN putting page tables only on node 0 could
@@ -417,10 +503,10 @@ static void __init find_early_table_space(unsigned long end)
417 503
418 table_start >>= PAGE_SHIFT; 504 table_start >>= PAGE_SHIFT;
419 table_end = table_start; 505 table_end = table_start;
506 table_top = table_start + (tables >> PAGE_SHIFT);
420 507
421 early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", 508 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
422 end, table_start << PAGE_SHIFT, 509 end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
423 (table_start << PAGE_SHIFT) + tables);
424} 510}
425 511
426static void __init init_gbpages(void) 512static void __init init_gbpages(void)
@@ -431,7 +517,7 @@ static void __init init_gbpages(void)
431 direct_gbpages = 0; 517 direct_gbpages = 0;
432} 518}
433 519
434#ifdef CONFIG_MEMTEST_BOOTPARAM 520#ifdef CONFIG_MEMTEST
435 521
436static void __init memtest(unsigned long start_phys, unsigned long size, 522static void __init memtest(unsigned long start_phys, unsigned long size,
437 unsigned pattern) 523 unsigned pattern)
@@ -493,7 +579,8 @@ static void __init memtest(unsigned long start_phys, unsigned long size,
493 579
494} 580}
495 581
496static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE; 582/* default is disabled */
583static int memtest_pattern __initdata;
497 584
498static int __init parse_memtest(char *arg) 585static int __init parse_memtest(char *arg)
499{ 586{
@@ -506,7 +593,7 @@ early_param("memtest", parse_memtest);
506 593
507static void __init early_memtest(unsigned long start, unsigned long end) 594static void __init early_memtest(unsigned long start, unsigned long end)
508{ 595{
509 unsigned long t_start, t_size; 596 u64 t_start, t_size;
510 unsigned pattern; 597 unsigned pattern;
511 598
512 if (!memtest_pattern) 599 if (!memtest_pattern)
@@ -525,8 +612,9 @@ static void __init early_memtest(unsigned long start, unsigned long end)
525 if (t_start + t_size > end) 612 if (t_start + t_size > end)
526 t_size = end - t_start; 613 t_size = end - t_start;
527 614
528 printk(KERN_CONT "\n %016lx - %016lx pattern %d", 615 printk(KERN_CONT "\n %016llx - %016llx pattern %d",
529 t_start, t_start + t_size, pattern); 616 (unsigned long long)t_start,
617 (unsigned long long)t_start + t_size, pattern);
530 618
531 memtest(t_start, t_size, pattern); 619 memtest(t_start, t_size, pattern);
532 620
@@ -541,15 +629,85 @@ static void __init early_memtest(unsigned long start, unsigned long end)
541} 629}
542#endif 630#endif
543 631
632static unsigned long __init kernel_physical_mapping_init(unsigned long start,
633 unsigned long end,
634 unsigned long page_size_mask)
635{
636
637 unsigned long next, last_map_addr = end;
638
639 start = (unsigned long)__va(start);
640 end = (unsigned long)__va(end);
641
642 for (; start < end; start = next) {
643 pgd_t *pgd = pgd_offset_k(start);
644 unsigned long pud_phys;
645 pud_t *pud;
646
647 next = (start + PGDIR_SIZE) & PGDIR_MASK;
648 if (next > end)
649 next = end;
650
651 if (pgd_val(*pgd)) {
652 last_map_addr = phys_pud_update(pgd, __pa(start),
653 __pa(end), page_size_mask);
654 continue;
655 }
656
657 if (after_bootmem)
658 pud = pud_offset(pgd, start & PGDIR_MASK);
659 else
660 pud = alloc_low_page(&pud_phys);
661
662 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
663 page_size_mask);
664 unmap_low_page(pud);
665 pgd_populate(&init_mm, pgd_offset_k(start),
666 __va(pud_phys));
667 }
668
669 return last_map_addr;
670}
671
672struct map_range {
673 unsigned long start;
674 unsigned long end;
675 unsigned page_size_mask;
676};
677
678#define NR_RANGE_MR 5
679
680static int save_mr(struct map_range *mr, int nr_range,
681 unsigned long start_pfn, unsigned long end_pfn,
682 unsigned long page_size_mask)
683{
684
685 if (start_pfn < end_pfn) {
686 if (nr_range >= NR_RANGE_MR)
687 panic("run out of range for init_memory_mapping\n");
688 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
689 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
690 mr[nr_range].page_size_mask = page_size_mask;
691 nr_range++;
692 }
693
694 return nr_range;
695}
696
544/* 697/*
545 * Setup the direct mapping of the physical memory at PAGE_OFFSET. 698 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
546 * This runs before bootmem is initialized and gets pages directly from 699 * This runs before bootmem is initialized and gets pages directly from
547 * the physical memory. To access them they are temporarily mapped. 700 * the physical memory. To access them they are temporarily mapped.
548 */ 701 */
549unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) 702unsigned long __init_refok init_memory_mapping(unsigned long start,
703 unsigned long end)
550{ 704{
551 unsigned long next, last_map_addr = end; 705 unsigned long last_map_addr = 0;
552 unsigned long start_phys = start, end_phys = end; 706 unsigned long page_size_mask = 0;
707 unsigned long start_pfn, end_pfn;
708
709 struct map_range mr[NR_RANGE_MR];
710 int nr_range, i;
553 711
554 printk(KERN_INFO "init_memory_mapping\n"); 712 printk(KERN_INFO "init_memory_mapping\n");
555 713
@@ -560,48 +718,115 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon
560 * memory mapped. Unfortunately this is done currently before the 718 * memory mapped. Unfortunately this is done currently before the
561 * nodes are discovered. 719 * nodes are discovered.
562 */ 720 */
563 if (!after_bootmem) { 721 if (!after_bootmem)
564 init_gbpages(); 722 init_gbpages();
565 find_early_table_space(end); 723
724 if (direct_gbpages)
725 page_size_mask |= 1 << PG_LEVEL_1G;
726 if (cpu_has_pse)
727 page_size_mask |= 1 << PG_LEVEL_2M;
728
729 memset(mr, 0, sizeof(mr));
730 nr_range = 0;
731
 732 /* head chunk, if start is not big-page (2M) aligned */
733 start_pfn = start >> PAGE_SHIFT;
734 end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
735 << (PMD_SHIFT - PAGE_SHIFT);
736 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
737
 738 /* big page (2M) range */
739 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
740 << (PMD_SHIFT - PAGE_SHIFT);
741 end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
742 << (PUD_SHIFT - PAGE_SHIFT);
743 if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
744 end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
745 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
746 page_size_mask & (1<<PG_LEVEL_2M));
747
748 /* big page (1G) range */
749 start_pfn = end_pfn;
750 end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
751 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
752 page_size_mask &
753 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
754
 755 /* tail that is not big-page (1G) aligned */
756 start_pfn = end_pfn;
757 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
758 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
759 page_size_mask & (1<<PG_LEVEL_2M));
760
 761 /* tail that is not big-page (2M) aligned */
762 start_pfn = end_pfn;
763 end_pfn = end>>PAGE_SHIFT;
764 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
765
 766 /* try to merge contiguous ranges with the same page size */
767 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
768 unsigned long old_start;
769 if (mr[i].end != mr[i+1].start ||
770 mr[i].page_size_mask != mr[i+1].page_size_mask)
771 continue;
772 /* move it */
773 old_start = mr[i].start;
774 memmove(&mr[i], &mr[i+1],
775 (nr_range - 1 - i) * sizeof (struct map_range));
776 mr[i].start = old_start;
777 nr_range--;
566 } 778 }
567 779
568 start = (unsigned long)__va(start); 780 for (i = 0; i < nr_range; i++)
569 end = (unsigned long)__va(end); 781 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
782 mr[i].start, mr[i].end,
783 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
784 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
570 785
571 for (; start < end; start = next) { 786 if (!after_bootmem)
572 pgd_t *pgd = pgd_offset_k(start); 787 find_early_table_space(end);
573 unsigned long pud_phys;
574 pud_t *pud;
575
576 if (after_bootmem)
577 pud = pud_offset(pgd, start & PGDIR_MASK);
578 else
579 pud = alloc_low_page(&pud_phys);
580 788
581 next = start + PGDIR_SIZE; 789 for (i = 0; i < nr_range; i++)
582 if (next > end) 790 last_map_addr = kernel_physical_mapping_init(
583 next = end; 791 mr[i].start, mr[i].end,
584 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); 792 mr[i].page_size_mask);
585 if (!after_bootmem)
586 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
587 unmap_low_page(pud);
588 }
589 793
590 if (!after_bootmem) 794 if (!after_bootmem)
591 mmu_cr4_features = read_cr4(); 795 mmu_cr4_features = read_cr4();
592 __flush_tlb_all(); 796 __flush_tlb_all();
593 797
594 if (!after_bootmem) 798 if (!after_bootmem && table_end > table_start)
595 reserve_early(table_start << PAGE_SHIFT, 799 reserve_early(table_start << PAGE_SHIFT,
596 table_end << PAGE_SHIFT, "PGTABLE"); 800 table_end << PAGE_SHIFT, "PGTABLE");
597 801
802 printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
803 last_map_addr, end);
804
598 if (!after_bootmem) 805 if (!after_bootmem)
599 early_memtest(start_phys, end_phys); 806 early_memtest(start, end);
600 807
601 return last_map_addr; 808 return last_map_addr >> PAGE_SHIFT;
602} 809}
603 810
604#ifndef CONFIG_NUMA 811#ifndef CONFIG_NUMA
812void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
813{
814 unsigned long bootmap_size, bootmap;
815
816 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
817 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
818 PAGE_SIZE);
819 if (bootmap == -1L)
820 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
821 /* don't touch min_low_pfn */
822 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
823 0, end_pfn);
824 e820_register_active_regions(0, start_pfn, end_pfn);
825 free_bootmem_with_active_regions(0, end_pfn);
826 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
827 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
828}
829
605void __init paging_init(void) 830void __init paging_init(void)
606{ 831{
607 unsigned long max_zone_pfns[MAX_NR_ZONES]; 832 unsigned long max_zone_pfns[MAX_NR_ZONES];
@@ -609,9 +834,9 @@ void __init paging_init(void)
609 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 834 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
610 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 835 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
611 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 836 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
612 max_zone_pfns[ZONE_NORMAL] = end_pfn; 837 max_zone_pfns[ZONE_NORMAL] = max_pfn;
613 838
614 memory_present(0, 0, end_pfn); 839 memory_present(0, 0, max_pfn);
615 sparse_init(); 840 sparse_init();
616 free_area_init_nodes(max_zone_pfns); 841 free_area_init_nodes(max_zone_pfns);
617} 842}
@@ -693,8 +918,8 @@ void __init mem_init(void)
693#else 918#else
694 totalram_pages = free_all_bootmem(); 919 totalram_pages = free_all_bootmem();
695#endif 920#endif
696 reservedpages = end_pfn - totalram_pages - 921 reservedpages = max_pfn - totalram_pages -
697 absent_pages_in_range(0, end_pfn); 922 absent_pages_in_range(0, max_pfn);
698 after_bootmem = 1; 923 after_bootmem = 1;
699 924
700 codesize = (unsigned long) &_etext - (unsigned long) &_text; 925 codesize = (unsigned long) &_etext - (unsigned long) &_text;
@@ -713,7 +938,7 @@ void __init mem_init(void)
713 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " 938 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
714 "%ldk reserved, %ldk data, %ldk init)\n", 939 "%ldk reserved, %ldk data, %ldk init)\n",
715 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 940 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
716 end_pfn << (PAGE_SHIFT-10), 941 max_pfn << (PAGE_SHIFT-10),
717 codesize >> 10, 942 codesize >> 10,
718 reservedpages << (PAGE_SHIFT-10), 943 reservedpages << (PAGE_SHIFT-10),
719 datasize >> 10, 944 datasize >> 10,
@@ -766,6 +991,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
766void mark_rodata_ro(void) 991void mark_rodata_ro(void)
767{ 992{
768 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); 993 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
994 unsigned long rodata_start =
995 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
996
997#ifdef CONFIG_DYNAMIC_FTRACE
998 /* Dynamic tracing modifies the kernel text section */
999 start = rodata_start;
1000#endif
769 1001
770 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 1002 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
771 (end - start) >> 10); 1003 (end - start) >> 10);
@@ -775,8 +1007,7 @@ void mark_rodata_ro(void)
775 * The rodata section (but not the kernel text!) should also be 1007 * The rodata section (but not the kernel text!) should also be
776 * not-executable. 1008 * not-executable.
777 */ 1009 */
778 start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; 1010 set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
779 set_memory_nx(start, (end - start) >> PAGE_SHIFT);
780 1011
781 rodata_test(); 1012 rodata_test();
782 1013
@@ -798,24 +1029,26 @@ void free_initrd_mem(unsigned long start, unsigned long end)
798} 1029}
799#endif 1030#endif
800 1031
801void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 1032int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1033 int flags)
802{ 1034{
803#ifdef CONFIG_NUMA 1035#ifdef CONFIG_NUMA
804 int nid, next_nid; 1036 int nid, next_nid;
1037 int ret;
805#endif 1038#endif
806 unsigned long pfn = phys >> PAGE_SHIFT; 1039 unsigned long pfn = phys >> PAGE_SHIFT;
807 1040
808 if (pfn >= end_pfn) { 1041 if (pfn >= max_pfn) {
809 /* 1042 /*
810 * This can happen with kdump kernels when accessing 1043 * This can happen with kdump kernels when accessing
811 * firmware tables: 1044 * firmware tables:
812 */ 1045 */
813 if (pfn < max_pfn_mapped) 1046 if (pfn < max_pfn_mapped)
814 return; 1047 return -EFAULT;
815 1048
816 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", 1049 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
817 phys, len); 1050 phys, len);
818 return; 1051 return -EFAULT;
819 } 1052 }
820 1053
821 /* Should check here against the e820 map to avoid double free */ 1054 /* Should check here against the e820 map to avoid double free */
@@ -823,9 +1056,13 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
823 nid = phys_to_nid(phys); 1056 nid = phys_to_nid(phys);
824 next_nid = phys_to_nid(phys + len - 1); 1057 next_nid = phys_to_nid(phys + len - 1);
825 if (nid == next_nid) 1058 if (nid == next_nid)
826 reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); 1059 ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
827 else 1060 else
828 reserve_bootmem(phys, len, BOOTMEM_DEFAULT); 1061 ret = reserve_bootmem(phys, len, flags);
1062
1063 if (ret != 0)
1064 return ret;
1065
829#else 1066#else
830 reserve_bootmem(phys, len, BOOTMEM_DEFAULT); 1067 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
831#endif 1068#endif
@@ -834,6 +1071,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
834 dma_reserve += len / PAGE_SIZE; 1071 dma_reserve += len / PAGE_SIZE;
835 set_dma_reserve(dma_reserve); 1072 set_dma_reserve(dma_reserve);
836 } 1073 }
1074
1075 return 0;
837} 1076}
838 1077
839int kern_addr_valid(unsigned long addr) 1078int kern_addr_valid(unsigned long addr)
@@ -938,7 +1177,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
938 pmd_t *pmd; 1177 pmd_t *pmd;
939 1178
940 for (; addr < end; addr = next) { 1179 for (; addr < end; addr = next) {
941 next = pmd_addr_end(addr, end); 1180 void *p = NULL;
942 1181
943 pgd = vmemmap_pgd_populate(addr, node); 1182 pgd = vmemmap_pgd_populate(addr, node);
944 if (!pgd) 1183 if (!pgd)
@@ -948,33 +1187,51 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
948 if (!pud) 1187 if (!pud)
949 return -ENOMEM; 1188 return -ENOMEM;
950 1189
951 pmd = pmd_offset(pud, addr); 1190 if (!cpu_has_pse) {
952 if (pmd_none(*pmd)) { 1191 next = (addr + PAGE_SIZE) & PAGE_MASK;
953 pte_t entry; 1192 pmd = vmemmap_pmd_populate(pud, addr, node);
954 void *p; 1193
1194 if (!pmd)
1195 return -ENOMEM;
1196
1197 p = vmemmap_pte_populate(pmd, addr, node);
955 1198
956 p = vmemmap_alloc_block(PMD_SIZE, node);
957 if (!p) 1199 if (!p)
958 return -ENOMEM; 1200 return -ENOMEM;
959 1201
960 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, 1202 addr_end = addr + PAGE_SIZE;
961 PAGE_KERNEL_LARGE); 1203 p_end = p + PAGE_SIZE;
962 set_pmd(pmd, __pmd(pte_val(entry)));
963
964 /* check to see if we have contiguous blocks */
965 if (p_end != p || node_start != node) {
966 if (p_start)
967 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
968 addr_start, addr_end-1, p_start, p_end-1, node_start);
969 addr_start = addr;
970 node_start = node;
971 p_start = p;
972 }
973 addr_end = addr + PMD_SIZE;
974 p_end = p + PMD_SIZE;
975 } else { 1204 } else {
976 vmemmap_verify((pte_t *)pmd, node, addr, next); 1205 next = pmd_addr_end(addr, end);
1206
1207 pmd = pmd_offset(pud, addr);
1208 if (pmd_none(*pmd)) {
1209 pte_t entry;
1210
1211 p = vmemmap_alloc_block(PMD_SIZE, node);
1212 if (!p)
1213 return -ENOMEM;
1214
1215 entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
1216 PAGE_KERNEL_LARGE);
1217 set_pmd(pmd, __pmd(pte_val(entry)));
1218
1219 /* check to see if we have contiguous blocks */
1220 if (p_end != p || node_start != node) {
1221 if (p_start)
1222 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1223 addr_start, addr_end-1, p_start, p_end-1, node_start);
1224 addr_start = addr;
1225 node_start = node;
1226 p_start = p;
1227 }
1228
1229 addr_end = addr + PMD_SIZE;
1230 p_end = p + PMD_SIZE;
1231 } else
1232 vmemmap_verify((pte_t *)pmd, node, addr, next);
977 } 1233 }
1234
978 } 1235 }
979 return 0; 1236 return 0;
980} 1237}
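
The new init_memory_mapping() above no longer walks the PGD range directly; it first splits [start, end) into up to NR_RANGE_MR map_range entries (unaligned 4k head, 2M run, 1G run, 2M tail, 4k tail), merges neighbouring entries with the same page_size_mask, and only then calls kernel_physical_mapping_init() per range. A minimal user-space sketch of that splitting step follows; the constants, mask bits and example range are assumptions for illustration only, not kernel API, and it assumes a 64-bit host.

#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21		/* 2M pages */
#define PUD_SHIFT	30		/* 1G pages */
#define NR_RANGE_MR	5
#define MASK_2M		(1 << 0)	/* stand-in for 1 << PG_LEVEL_2M */
#define MASK_1G		(1 << 1)	/* stand-in for 1 << PG_LEVEL_1G */

struct map_range {
	unsigned long start;		/* stored as pfns in this sketch */
	unsigned long end;
	unsigned page_size_mask;
};

static int save_mr(struct map_range *mr, int nr, unsigned long s_pfn,
		   unsigned long e_pfn, unsigned mask)
{
	if (s_pfn < e_pfn) {		/* skip empty pieces */
		mr[nr].start = s_pfn;
		mr[nr].end = e_pfn;
		mr[nr].page_size_mask = mask;
		nr++;
	}
	return nr;
}

int main(void)
{
	/* example: map 0x1ff000 .. 4GB+12k with 2M and 1G pages enabled */
	unsigned long start = 0x1ff000UL, end = (1UL << 32) + 3 * 4096;
	unsigned long s, e;
	struct map_range mr[NR_RANGE_MR];
	int nr = 0, i;

	memset(mr, 0, sizeof(mr));

	/* 4k head, up to the first 2M boundary */
	s = start >> PAGE_SHIFT;
	e = ((start + (1UL << PMD_SHIFT) - 1) >> PMD_SHIFT)
		<< (PMD_SHIFT - PAGE_SHIFT);
	nr = save_mr(mr, nr, s, e, 0);

	/* 2M pages, up to the first 1G boundary (capped at the last one) */
	s = e;
	e = ((start + (1UL << PUD_SHIFT) - 1) >> PUD_SHIFT)
		<< (PUD_SHIFT - PAGE_SHIFT);
	if (e > (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT))
		e = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
	nr = save_mr(mr, nr, s, e, MASK_2M);

	/* 1G pages in the middle */
	s = e;
	e = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
	nr = save_mr(mr, nr, s, e, MASK_2M | MASK_1G);

	/* 2M tail, then 4k tail */
	s = e;
	e = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	nr = save_mr(mr, nr, s, e, MASK_2M);
	s = e;
	e = end >> PAGE_SHIFT;
	nr = save_mr(mr, nr, s, e, 0);

	/* print the ranges the way the kernel's debug output does */
	for (i = 0; i < nr; i++)
		printf(" %010lx - %010lx page %s\n",
		       mr[i].start << PAGE_SHIFT, mr[i].end << PAGE_SHIFT,
		       (mr[i].page_size_mask & MASK_1G) ? "1G" :
		       (mr[i].page_size_mask & MASK_2M) ? "2M" : "4k");
	return 0;
}

The real code additionally merges adjacent ranges whose masks are identical before mapping them.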
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 71bb3159031a..24c1d3c30186 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -12,6 +12,7 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/mmiotrace.h>
15 16
16#include <asm/cacheflush.h> 17#include <asm/cacheflush.h>
17#include <asm/e820.h> 18#include <asm/e820.h>
@@ -122,10 +123,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
122{ 123{
123 unsigned long pfn, offset, vaddr; 124 unsigned long pfn, offset, vaddr;
124 resource_size_t last_addr; 125 resource_size_t last_addr;
126 const resource_size_t unaligned_phys_addr = phys_addr;
127 const unsigned long unaligned_size = size;
125 struct vm_struct *area; 128 struct vm_struct *area;
126 unsigned long new_prot_val; 129 unsigned long new_prot_val;
127 pgprot_t prot; 130 pgprot_t prot;
128 int retval; 131 int retval;
132 void __iomem *ret_addr;
129 133
130 /* Don't allow wraparound or zero size */ 134 /* Don't allow wraparound or zero size */
131 last_addr = phys_addr + size - 1; 135 last_addr = phys_addr + size - 1;
@@ -142,7 +146,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
142 /* 146 /*
143 * Don't remap the low PCI/ISA area, it's always mapped.. 147 * Don't remap the low PCI/ISA area, it's always mapped..
144 */ 148 */
145 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) 149 if (is_ISA_range(phys_addr, last_addr))
146 return (__force void __iomem *)phys_to_virt(phys_addr); 150 return (__force void __iomem *)phys_to_virt(phys_addr);
147 151
148 /* 152 /*
@@ -233,7 +237,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
233 return NULL; 237 return NULL;
234 } 238 }
235 239
236 return (void __iomem *) (vaddr + offset); 240 ret_addr = (void __iomem *) (vaddr + offset);
241 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
242
243 return ret_addr;
237} 244}
238 245
239/** 246/**
@@ -261,7 +268,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
261{ 268{
262 /* 269 /*
263 * Ideally, this should be: 270 * Ideally, this should be:
264 * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS; 271 * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
265 * 272 *
266 * Till we fix all X drivers to use ioremap_wc(), we will use 273 * Till we fix all X drivers to use ioremap_wc(), we will use
267 * UC MINUS. 274 * UC MINUS.
@@ -285,7 +292,7 @@ EXPORT_SYMBOL(ioremap_nocache);
285 */ 292 */
286void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) 293void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
287{ 294{
288 if (pat_wc_enabled) 295 if (pat_enabled)
289 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, 296 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
290 __builtin_return_address(0)); 297 __builtin_return_address(0));
291 else 298 else
@@ -300,6 +307,29 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
300} 307}
301EXPORT_SYMBOL(ioremap_cache); 308EXPORT_SYMBOL(ioremap_cache);
302 309
310static void __iomem *ioremap_default(resource_size_t phys_addr,
311 unsigned long size)
312{
313 unsigned long flags;
314 void *ret;
315 int err;
316
317 /*
318 * - WB for WB-able memory and no other conflicting mappings
319 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
 320 * - Inherit from conflicting mappings otherwise
321 */
322 err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
323 if (err < 0)
324 return NULL;
325
326 ret = (void *) __ioremap_caller(phys_addr, size, flags,
327 __builtin_return_address(0));
328
329 free_memtype(phys_addr, phys_addr + size);
330 return (void __iomem *)ret;
331}
332
303/** 333/**
304 * iounmap - Free a IO remapping 334 * iounmap - Free a IO remapping
305 * @addr: virtual address from ioremap_* 335 * @addr: virtual address from ioremap_*
@@ -318,13 +348,15 @@ void iounmap(volatile void __iomem *addr)
318 * vm_area and by simply returning an address into the kernel mapping 348 * vm_area and by simply returning an address into the kernel mapping
319 * of ISA space. So handle that here. 349 * of ISA space. So handle that here.
320 */ 350 */
321 if (addr >= phys_to_virt(ISA_START_ADDRESS) && 351 if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
322 addr < phys_to_virt(ISA_END_ADDRESS)) 352 (void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
323 return; 353 return;
324 354
325 addr = (volatile void __iomem *) 355 addr = (volatile void __iomem *)
326 (PAGE_MASK & (unsigned long __force)addr); 356 (PAGE_MASK & (unsigned long __force)addr);
327 357
358 mmiotrace_iounmap(addr);
359
328 /* Use the vm area unlocked, assuming the caller 360 /* Use the vm area unlocked, assuming the caller
329 ensures there isn't another iounmap for the same address 361 ensures there isn't another iounmap for the same address
330 in parallel. Reuse of the virtual address is prevented by 362 in parallel. Reuse of the virtual address is prevented by
@@ -332,7 +364,7 @@ void iounmap(volatile void __iomem *addr)
332 cpa takes care of the direct mappings. */ 364 cpa takes care of the direct mappings. */
333 read_lock(&vmlist_lock); 365 read_lock(&vmlist_lock);
334 for (p = vmlist; p; p = p->next) { 366 for (p = vmlist; p; p = p->next) {
335 if (p->addr == addr) 367 if (p->addr == (void __force *)addr)
336 break; 368 break;
337 } 369 }
338 read_unlock(&vmlist_lock); 370 read_unlock(&vmlist_lock);
@@ -346,7 +378,7 @@ void iounmap(volatile void __iomem *addr)
346 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); 378 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
347 379
348 /* Finally remove it */ 380 /* Finally remove it */
349 o = remove_vm_area((void *)addr); 381 o = remove_vm_area((void __force *)addr);
350 BUG_ON(p != o || o == NULL); 382 BUG_ON(p != o || o == NULL);
351 kfree(p); 383 kfree(p);
352} 384}
@@ -365,7 +397,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
365 if (page_is_ram(start >> PAGE_SHIFT)) 397 if (page_is_ram(start >> PAGE_SHIFT))
366 return __va(phys); 398 return __va(phys);
367 399
368 addr = (void *)ioremap(start, PAGE_SIZE); 400 addr = (void __force *)ioremap_default(start, PAGE_SIZE);
369 if (addr) 401 if (addr)
370 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); 402 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
371 403
@@ -381,8 +413,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
381 return; 413 return;
382} 414}
383 415
384#ifdef CONFIG_X86_32
385
386int __initdata early_ioremap_debug; 416int __initdata early_ioremap_debug;
387 417
388static int __init early_ioremap_debug_setup(char *str) 418static int __init early_ioremap_debug_setup(char *str)
@@ -394,8 +424,7 @@ static int __init early_ioremap_debug_setup(char *str)
394early_param("early_ioremap_debug", early_ioremap_debug_setup); 424early_param("early_ioremap_debug", early_ioremap_debug_setup);
395 425
396static __initdata int after_paging_init; 426static __initdata int after_paging_init;
397static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] 427static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
398 __section(.bss.page_aligned);
399 428
400static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) 429static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
401{ 430{
@@ -484,10 +513,11 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
484 return; 513 return;
485 } 514 }
486 pte = early_ioremap_pte(addr); 515 pte = early_ioremap_pte(addr);
516
487 if (pgprot_val(flags)) 517 if (pgprot_val(flags))
488 set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); 518 set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
489 else 519 else
490 pte_clear(NULL, addr, pte); 520 pte_clear(&init_mm, addr, pte);
491 __flush_tlb_one(addr); 521 __flush_tlb_one(addr);
492} 522}
493 523
@@ -593,10 +623,11 @@ void __init early_iounmap(void *addr, unsigned long size)
593 unsigned long offset; 623 unsigned long offset;
594 unsigned int nrpages; 624 unsigned int nrpages;
595 enum fixed_addresses idx; 625 enum fixed_addresses idx;
596 unsigned int nesting; 626 int nesting;
597 627
598 nesting = --early_ioremap_nested; 628 nesting = --early_ioremap_nested;
599 WARN_ON(nesting < 0); 629 if (WARN_ON(nesting < 0))
630 return;
600 631
601 if (early_ioremap_debug) { 632 if (early_ioremap_debug) {
602 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, 633 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
@@ -624,5 +655,3 @@ void __this_fixmap_does_not_exist(void)
624{ 655{
625 WARN_ON(1); 656 WARN_ON(1);
626} 657}
627
628#endif /* CONFIG_X86_32 */
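
With mmiotrace_ioremap() and mmiotrace_iounmap() hooked into __ioremap_caller() and iounmap() above, any driver mapping becomes traceable without modification. A rough module-style sketch of such a mapping follows; the device address, length and all names are placeholders, not taken from the patch.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/io.h>

#define DEMO_MMIO_PHYS	0xfebf0000UL	/* placeholder BAR address */
#define DEMO_MMIO_LEN	0x1000

static void __iomem *demo_base;

static int __init demo_init(void)
{
	/* __ioremap_caller() records this mapping via mmiotrace_ioremap() */
	demo_base = ioremap_nocache(DEMO_MMIO_PHYS, DEMO_MMIO_LEN);
	if (!demo_base)
		return -ENOMEM;

	/* MMIO accesses through the mapping can then show up in the trace */
	printk(KERN_INFO "demo: reg0 = 0x%08x\n", readl(demo_base));
	return 0;
}

static void __exit demo_exit(void)
{
	/* iounmap() calls mmiotrace_iounmap() before tearing the map down */
	iounmap(demo_base);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");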
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 1f476e477844..41f1b5c00a1d 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -22,6 +22,7 @@
22#include <asm/numa.h> 22#include <asm/numa.h>
23#include <asm/mpspec.h> 23#include <asm/mpspec.h>
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/k8.h>
25 26
26static __init int find_northbridge(void) 27static __init int find_northbridge(void)
27{ 28{
@@ -56,34 +57,33 @@ static __init void early_get_boot_cpu_id(void)
56 /* 57 /*
57 * Find possible boot-time SMP configuration: 58 * Find possible boot-time SMP configuration:
58 */ 59 */
60#ifdef CONFIG_X86_MPPARSE
59 early_find_smp_config(); 61 early_find_smp_config();
62#endif
60#ifdef CONFIG_ACPI 63#ifdef CONFIG_ACPI
61 /* 64 /*
62 * Read APIC information from ACPI tables. 65 * Read APIC information from ACPI tables.
63 */ 66 */
64 early_acpi_boot_init(); 67 early_acpi_boot_init();
65#endif 68#endif
69#ifdef CONFIG_X86_MPPARSE
66 /* 70 /*
67 * get boot-time SMP configuration: 71 * get boot-time SMP configuration:
68 */ 72 */
69 if (smp_found_config) 73 if (smp_found_config)
70 early_get_smp_config(); 74 early_get_smp_config();
75#endif
71 early_init_lapic_mapping(); 76 early_init_lapic_mapping();
72} 77}
73 78
74int __init k8_scan_nodes(unsigned long start, unsigned long end) 79int __init k8_scan_nodes(unsigned long start, unsigned long end)
75{ 80{
81 unsigned numnodes, cores, bits, apicid_base;
76 unsigned long prevbase; 82 unsigned long prevbase;
77 struct bootnode nodes[8]; 83 struct bootnode nodes[8];
78 int nodeid, i, nb;
79 unsigned char nodeids[8]; 84 unsigned char nodeids[8];
80 int found = 0; 85 int i, j, nb, found = 0;
81 u32 reg; 86 u32 nodeid, reg;
82 unsigned numnodes;
83 unsigned cores;
84 unsigned bits;
85 int j;
86 unsigned apicid_base;
87 87
88 if (!early_pci_allowed()) 88 if (!early_pci_allowed())
89 return -1; 89 return -1;
@@ -105,7 +105,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
105 prevbase = 0; 105 prevbase = 0;
106 for (i = 0; i < 8; i++) { 106 for (i = 0; i < 8; i++) {
107 unsigned long base, limit; 107 unsigned long base, limit;
108 u32 nodeid;
109 108
110 base = read_pci_config(0, nb, 1, 0x40 + i*8); 109 base = read_pci_config(0, nb, 1, 0x40 + i*8);
111 limit = read_pci_config(0, nb, 1, 0x44 + i*8); 110 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
@@ -144,8 +143,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
144 limit |= (1<<24)-1; 143 limit |= (1<<24)-1;
145 limit++; 144 limit++;
146 145
147 if (limit > end_pfn << PAGE_SHIFT) 146 if (limit > max_pfn << PAGE_SHIFT)
148 limit = end_pfn << PAGE_SHIFT; 147 limit = max_pfn << PAGE_SHIFT;
149 if (limit <= base) 148 if (limit <= base)
150 continue; 149 continue;
151 150
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
new file mode 100644
index 000000000000..93d82038af4b
--- /dev/null
+++ b/arch/x86/mm/kmmio.c
@@ -0,0 +1,510 @@
1/* Support for MMIO probes.
  2 * Borrows much code from kprobes
3 * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
4 * 2007 Alexander Eichner
5 * 2008 Pekka Paalanen <pq@iki.fi>
6 */
7
8#include <linux/list.h>
9#include <linux/rculist.h>
10#include <linux/spinlock.h>
11#include <linux/hash.h>
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/kernel.h>
15#include <linux/uaccess.h>
16#include <linux/ptrace.h>
17#include <linux/preempt.h>
18#include <linux/percpu.h>
19#include <linux/kdebug.h>
20#include <linux/mutex.h>
21#include <linux/io.h>
22#include <asm/cacheflush.h>
23#include <asm/tlbflush.h>
24#include <linux/errno.h>
25#include <asm/debugreg.h>
26#include <linux/mmiotrace.h>
27
28#define KMMIO_PAGE_HASH_BITS 4
29#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
30
31struct kmmio_fault_page {
32 struct list_head list;
33 struct kmmio_fault_page *release_next;
34 unsigned long page; /* location of the fault page */
35
36 /*
37 * Number of times this page has been registered as a part
38 * of a probe. If zero, page is disarmed and this may be freed.
39 * Used only by writers (RCU).
40 */
41 int count;
42};
43
44struct kmmio_delayed_release {
45 struct rcu_head rcu;
46 struct kmmio_fault_page *release_list;
47};
48
49struct kmmio_context {
50 struct kmmio_fault_page *fpage;
51 struct kmmio_probe *probe;
52 unsigned long saved_flags;
53 unsigned long addr;
54 int active;
55};
56
57static DEFINE_SPINLOCK(kmmio_lock);
58
59/* Protected by kmmio_lock */
60unsigned int kmmio_count;
61
62/* Read-protected by RCU, write-protected by kmmio_lock. */
63static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
64static LIST_HEAD(kmmio_probes);
65
66static struct list_head *kmmio_page_list(unsigned long page)
67{
68 return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
69}
70
71/* Accessed per-cpu */
72static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
73
74/*
75 * this is basically a dynamic stabbing problem:
76 * Could use the existing prio tree code or
77 * Possible better implementations:
78 * The Interval Skip List: A Data Structure for Finding All Intervals That
79 * Overlap a Point (might be simple)
80 * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
81 */
82/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
83static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
84{
85 struct kmmio_probe *p;
86 list_for_each_entry_rcu(p, &kmmio_probes, list) {
87 if (addr >= p->addr && addr <= (p->addr + p->len))
88 return p;
89 }
90 return NULL;
91}
92
93/* You must be holding RCU read lock. */
94static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
95{
96 struct list_head *head;
97 struct kmmio_fault_page *p;
98
99 page &= PAGE_MASK;
100 head = kmmio_page_list(page);
101 list_for_each_entry_rcu(p, head, list) {
102 if (p->page == page)
103 return p;
104 }
105 return NULL;
106}
107
108static void set_page_present(unsigned long addr, bool present,
109 unsigned int *pglevel)
110{
111 pteval_t pteval;
112 pmdval_t pmdval;
113 unsigned int level;
114 pmd_t *pmd;
115 pte_t *pte = lookup_address(addr, &level);
116
117 if (!pte) {
118 pr_err("kmmio: no pte for page 0x%08lx\n", addr);
119 return;
120 }
121
122 if (pglevel)
123 *pglevel = level;
124
125 switch (level) {
126 case PG_LEVEL_2M:
127 pmd = (pmd_t *)pte;
128 pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT;
129 if (present)
130 pmdval |= _PAGE_PRESENT;
131 set_pmd(pmd, __pmd(pmdval));
132 break;
133
134 case PG_LEVEL_4K:
135 pteval = pte_val(*pte) & ~_PAGE_PRESENT;
136 if (present)
137 pteval |= _PAGE_PRESENT;
138 set_pte_atomic(pte, __pte(pteval));
139 break;
140
141 default:
142 pr_err("kmmio: unexpected page level 0x%x.\n", level);
143 return;
144 }
145
146 __flush_tlb_one(addr);
147}
148
149/** Mark the given page as not present. Access to it will trigger a fault. */
150static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
151{
152 set_page_present(page & PAGE_MASK, false, pglevel);
153}
154
155/** Mark the given page as present. */
156static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
157{
158 set_page_present(page & PAGE_MASK, true, pglevel);
159}
160
161/*
162 * This is being called from do_page_fault().
163 *
 164 * We may be in an interrupt or a critical section. Also prefetching may
 165 * trigger a page fault. We may be in the middle of a process switch.
 166 * We cannot take any locks, because we could already be executing
 167 * within a kmmio critical section.
168 *
169 * Local interrupts are disabled, so preemption cannot happen.
170 * Do not enable interrupts, do not sleep, and watch out for other CPUs.
171 */
172/*
173 * Interrupts are disabled on entry as trap3 is an interrupt gate
 174 * and they remain disabled throughout this function.
175 */
176int kmmio_handler(struct pt_regs *regs, unsigned long addr)
177{
178 struct kmmio_context *ctx;
179 struct kmmio_fault_page *faultpage;
180 int ret = 0; /* default to fault not handled */
181
182 /*
183 * Preemption is now disabled to prevent process switch during
184 * single stepping. We can only handle one active kmmio trace
185 * per cpu, so ensure that we finish it before something else
186 * gets to run. We also hold the RCU read lock over single
187 * stepping to avoid looking up the probe and kmmio_fault_page
188 * again.
189 */
190 preempt_disable();
191 rcu_read_lock();
192
193 faultpage = get_kmmio_fault_page(addr);
194 if (!faultpage) {
195 /*
196 * Either this page fault is not caused by kmmio, or
197 * another CPU just pulled the kmmio probe from under
198 * our feet. The latter case should not be possible.
199 */
200 goto no_kmmio;
201 }
202
203 ctx = &get_cpu_var(kmmio_ctx);
204 if (ctx->active) {
205 disarm_kmmio_fault_page(faultpage->page, NULL);
206 if (addr == ctx->addr) {
207 /*
208 * On SMP we sometimes get recursive probe hits on the
209 * same address. Context is already saved, fall out.
210 */
211 pr_debug("kmmio: duplicate probe hit on CPU %d, for "
212 "address 0x%08lx.\n",
213 smp_processor_id(), addr);
214 ret = 1;
215 goto no_kmmio_ctx;
216 }
217 /*
218 * Prevent overwriting already in-flight context.
219 * This should not happen, let's hope disarming at least
220 * prevents a panic.
221 */
222 pr_emerg("kmmio: recursive probe hit on CPU %d, "
223 "for address 0x%08lx. Ignoring.\n",
224 smp_processor_id(), addr);
225 pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
226 ctx->addr);
227 goto no_kmmio_ctx;
228 }
229 ctx->active++;
230
231 ctx->fpage = faultpage;
232 ctx->probe = get_kmmio_probe(addr);
233 ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
234 ctx->addr = addr;
235
236 if (ctx->probe && ctx->probe->pre_handler)
237 ctx->probe->pre_handler(ctx->probe, regs, addr);
238
239 /*
240 * Enable single-stepping and disable interrupts for the faulting
241 * context. Local interrupts must not get enabled during stepping.
242 */
243 regs->flags |= X86_EFLAGS_TF;
244 regs->flags &= ~X86_EFLAGS_IF;
245
246 /* Now we set present bit in PTE and single step. */
247 disarm_kmmio_fault_page(ctx->fpage->page, NULL);
248
249 /*
250 * If another cpu accesses the same page while we are stepping,
251 * the access will not be caught. It will simply succeed and the
252 * only downside is we lose the event. If this becomes a problem,
253 * the user should drop to single cpu before tracing.
254 */
255
256 put_cpu_var(kmmio_ctx);
257 return 1; /* fault handled */
258
259no_kmmio_ctx:
260 put_cpu_var(kmmio_ctx);
261no_kmmio:
262 rcu_read_unlock();
263 preempt_enable_no_resched();
264 return ret;
265}
266
267/*
268 * Interrupts are disabled on entry as trap1 is an interrupt gate
 269 * and they remain disabled throughout this function.
270 * This must always get called as the pair to kmmio_handler().
271 */
272static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
273{
274 int ret = 0;
275 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
276
277 if (!ctx->active) {
278 pr_debug("kmmio: spurious debug trap on CPU %d.\n",
279 smp_processor_id());
280 goto out;
281 }
282
283 if (ctx->probe && ctx->probe->post_handler)
284 ctx->probe->post_handler(ctx->probe, condition, regs);
285
286 arm_kmmio_fault_page(ctx->fpage->page, NULL);
287
288 regs->flags &= ~X86_EFLAGS_TF;
289 regs->flags |= ctx->saved_flags;
290
291 /* These were acquired in kmmio_handler(). */
292 ctx->active--;
293 BUG_ON(ctx->active);
294 rcu_read_unlock();
295 preempt_enable_no_resched();
296
297 /*
 298 * If somebody else is single-stepping across a probe point, flags
 299 * will have TF set; in that case, continue the remaining processing
 300 * of do_debug, as if this is not a probe hit.
301 */
302 if (!(regs->flags & X86_EFLAGS_TF))
303 ret = 1;
304out:
305 put_cpu_var(kmmio_ctx);
306 return ret;
307}
308
309/* You must be holding kmmio_lock. */
310static int add_kmmio_fault_page(unsigned long page)
311{
312 struct kmmio_fault_page *f;
313
314 page &= PAGE_MASK;
315 f = get_kmmio_fault_page(page);
316 if (f) {
317 if (!f->count)
318 arm_kmmio_fault_page(f->page, NULL);
319 f->count++;
320 return 0;
321 }
322
323 f = kmalloc(sizeof(*f), GFP_ATOMIC);
324 if (!f)
325 return -1;
326
327 f->count = 1;
328 f->page = page;
329 list_add_rcu(&f->list, kmmio_page_list(f->page));
330
331 arm_kmmio_fault_page(f->page, NULL);
332
333 return 0;
334}
335
336/* You must be holding kmmio_lock. */
337static void release_kmmio_fault_page(unsigned long page,
338 struct kmmio_fault_page **release_list)
339{
340 struct kmmio_fault_page *f;
341
342 page &= PAGE_MASK;
343 f = get_kmmio_fault_page(page);
344 if (!f)
345 return;
346
347 f->count--;
348 BUG_ON(f->count < 0);
349 if (!f->count) {
350 disarm_kmmio_fault_page(f->page, NULL);
351 f->release_next = *release_list;
352 *release_list = f;
353 }
354}
355
356/*
357 * With page-unaligned ioremaps, one or two armed pages may contain
358 * addresses from outside the intended mapping. Events for these addresses
359 * are currently silently dropped. The events may result only from programming
360 * mistakes by accessing addresses before the beginning or past the end of a
361 * mapping.
362 */
363int register_kmmio_probe(struct kmmio_probe *p)
364{
365 unsigned long flags;
366 int ret = 0;
367 unsigned long size = 0;
368 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
369
370 spin_lock_irqsave(&kmmio_lock, flags);
371 if (get_kmmio_probe(p->addr)) {
372 ret = -EEXIST;
373 goto out;
374 }
375 kmmio_count++;
376 list_add_rcu(&p->list, &kmmio_probes);
377 while (size < size_lim) {
378 if (add_kmmio_fault_page(p->addr + size))
379 pr_err("kmmio: Unable to set page fault.\n");
380 size += PAGE_SIZE;
381 }
382out:
383 spin_unlock_irqrestore(&kmmio_lock, flags);
384 /*
385 * XXX: What should I do here?
386 * Here was a call to global_flush_tlb(), but it does not exist
387 * anymore. It seems it's not needed after all.
388 */
389 return ret;
390}
391EXPORT_SYMBOL(register_kmmio_probe);
392
393static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
394{
395 struct kmmio_delayed_release *dr = container_of(
396 head,
397 struct kmmio_delayed_release,
398 rcu);
399 struct kmmio_fault_page *p = dr->release_list;
400 while (p) {
401 struct kmmio_fault_page *next = p->release_next;
402 BUG_ON(p->count);
403 kfree(p);
404 p = next;
405 }
406 kfree(dr);
407}
408
409static void remove_kmmio_fault_pages(struct rcu_head *head)
410{
411 struct kmmio_delayed_release *dr = container_of(
412 head,
413 struct kmmio_delayed_release,
414 rcu);
415 struct kmmio_fault_page *p = dr->release_list;
416 struct kmmio_fault_page **prevp = &dr->release_list;
417 unsigned long flags;
418 spin_lock_irqsave(&kmmio_lock, flags);
419 while (p) {
420 if (!p->count)
421 list_del_rcu(&p->list);
422 else
423 *prevp = p->release_next;
424 prevp = &p->release_next;
425 p = p->release_next;
426 }
427 spin_unlock_irqrestore(&kmmio_lock, flags);
428 /* This is the real RCU destroy call. */
429 call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
430}
431
432/*
433 * Remove a kmmio probe. You have to synchronize_rcu() before you can be
434 * sure that the callbacks will not be called anymore. Only after that
435 * you may actually release your struct kmmio_probe.
436 *
437 * Unregistering a kmmio fault page has three steps:
438 * 1. release_kmmio_fault_page()
439 * Disarm the page, wait a grace period to let all faults finish.
440 * 2. remove_kmmio_fault_pages()
441 * Remove the pages from kmmio_page_table.
442 * 3. rcu_free_kmmio_fault_pages()
 443 *    Actually free the kmmio_fault_page structs via RCU.
444 */
445void unregister_kmmio_probe(struct kmmio_probe *p)
446{
447 unsigned long flags;
448 unsigned long size = 0;
449 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
450 struct kmmio_fault_page *release_list = NULL;
451 struct kmmio_delayed_release *drelease;
452
453 spin_lock_irqsave(&kmmio_lock, flags);
454 while (size < size_lim) {
455 release_kmmio_fault_page(p->addr + size, &release_list);
456 size += PAGE_SIZE;
457 }
458 list_del_rcu(&p->list);
459 kmmio_count--;
460 spin_unlock_irqrestore(&kmmio_lock, flags);
461
462 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
463 if (!drelease) {
464 pr_crit("kmmio: leaking kmmio_fault_page objects.\n");
465 return;
466 }
467 drelease->release_list = release_list;
468
469 /*
470 * This is not really RCU here. We have just disarmed a set of
471 * pages so that they cannot trigger page faults anymore. However,
472 * we cannot remove the pages from kmmio_page_table,
473 * because a probe hit might be in flight on another CPU. The
474 * pages are collected into a list, and they will be removed from
475 * kmmio_page_table when it is certain that no probe hit related to
476 * these pages can be in flight. RCU grace period sounds like a
477 * good choice.
478 *
479 * If we removed the pages too early, kmmio page fault handler might
480 * not find the respective kmmio_fault_page and determine it's not
481 * a kmmio fault, when it actually is. This would lead to madness.
482 */
483 call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
484}
485EXPORT_SYMBOL(unregister_kmmio_probe);
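[ Editorial sketch, not part of the patch: the teardown order the comment above
  prescribes, mirroring what iounmap_trace_core() in mmio-mod.c does. ]

	unregister_kmmio_probe(&my_probe);
	synchronize_rcu();	/* no pre/post callback can still be running */
	/* only now may the struct kmmio_probe (and whatever its .private
	   member points to) be freed */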
486
487static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
488 void *args)
489{
490 struct die_args *arg = args;
491
492 if (val == DIE_DEBUG && (arg->err & DR_STEP))
493 if (post_kmmio_handler(arg->err, arg->regs) == 1)
494 return NOTIFY_STOP;
495
496 return NOTIFY_DONE;
497}
498
499static struct notifier_block nb_die = {
500 .notifier_call = kmmio_die_notifier
501};
502
503static int __init init_kmmio(void)
504{
505 int i;
506 for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
507 INIT_LIST_HEAD(&kmmio_page_table[i]);
508 return register_die_notifier(&nb_die);
509}
510fs_initcall(init_kmmio); /* should be before device_initcall() */
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
new file mode 100644
index 000000000000..e7397e108beb
--- /dev/null
+++ b/arch/x86/mm/mmio-mod.c
@@ -0,0 +1,515 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) IBM Corporation, 2005
17 * Jeff Muizelaar, 2006, 2007
18 * Pekka Paalanen, 2008 <pq@iki.fi>
19 *
20 * Derived from the read-mod example from relay-examples by Tom Zanussi.
21 */
22#define DEBUG 1
23
24#include <linux/module.h>
25#include <linux/debugfs.h>
26#include <linux/uaccess.h>
27#include <linux/io.h>
28#include <linux/version.h>
29#include <linux/kallsyms.h>
30#include <asm/pgtable.h>
31#include <linux/mmiotrace.h>
32#include <asm/e820.h> /* for ISA_START_ADDRESS */
33#include <asm/atomic.h>
34#include <linux/percpu.h>
35#include <linux/cpu.h>
36
37#include "pf_in.h"
38
39#define NAME "mmiotrace: "
40
41struct trap_reason {
42 unsigned long addr;
43 unsigned long ip;
44 enum reason_type type;
45 int active_traces;
46};
47
48struct remap_trace {
49 struct list_head list;
50 struct kmmio_probe probe;
51 resource_size_t phys;
52 unsigned long id;
53};
54
55/* Accessed per-cpu. */
56static DEFINE_PER_CPU(struct trap_reason, pf_reason);
57static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
58
 59#if 0 /* XXX: no way to gather this info anymore */
60/* Access to this is not per-cpu. */
61static DEFINE_PER_CPU(atomic_t, dropped);
62#endif
63
64static struct dentry *marker_file;
65
66static DEFINE_MUTEX(mmiotrace_mutex);
67static DEFINE_SPINLOCK(trace_lock);
68static atomic_t mmiotrace_enabled;
69static LIST_HEAD(trace_list); /* struct remap_trace */
70
71/*
72 * Locking in this file:
73 * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections.
74 * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex
75 * and trace_lock.
76 * - Routines depending on is_enabled() must take trace_lock.
77 * - trace_list users must hold trace_lock.
78 * - is_enabled() guarantees that mmio_trace_record is allowed.
79 * - pre/post callbacks assume the effect of is_enabled() being true.
80 */
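[ Editorial sketch, not part of the patch: the code pattern these rules
  translate to in every event-emitting path of this file. ]

	spin_lock_irq(&trace_lock);
	if (!is_enabled())
		goto not_enabled;
	/* ... emit the record and/or touch trace_list ... */
not_enabled:
	spin_unlock_irq(&trace_lock);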
81
82/* module parameters */
83static unsigned long filter_offset;
84static int nommiotrace;
85static int trace_pc;
86
87module_param(filter_offset, ulong, 0);
88module_param(nommiotrace, bool, 0);
89module_param(trace_pc, bool, 0);
90
91MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
92MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
93MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions.");
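[ Editorial note: with mmiotrace built into the kernel, these knobs would be
  given on the kernel command line with the module-name prefix, for example
  mmiotrace.filter_offset=0xfd000000 mmiotrace.trace_pc=1 (the exact prefix is
  an assumption based on the mmiotrace-y line in the Makefile). ]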
94
95static bool is_enabled(void)
96{
97 return atomic_read(&mmiotrace_enabled);
98}
99
100#if 0 /* XXX: needs rewrite */
101/*
102 * Write callback for the debugfs entry:
103 * Read a marker and write it to the mmio trace log
104 */
105static ssize_t write_marker(struct file *file, const char __user *buffer,
106 size_t count, loff_t *ppos)
107{
108 char *event = NULL;
109 struct mm_io_header *headp;
110 ssize_t len = (count > 65535) ? 65535 : count;
111
112 event = kzalloc(sizeof(*headp) + len, GFP_KERNEL);
113 if (!event)
114 return -ENOMEM;
115
116 headp = (struct mm_io_header *)event;
117 headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT);
118 headp->data_len = len;
119
120 if (copy_from_user(event + sizeof(*headp), buffer, len)) {
121 kfree(event);
122 return -EFAULT;
123 }
124
125 spin_lock_irq(&trace_lock);
126#if 0 /* XXX: convert this to use tracing */
127 if (is_enabled())
128 relay_write(chan, event, sizeof(*headp) + len);
129 else
130#endif
131 len = -EINVAL;
132 spin_unlock_irq(&trace_lock);
133 kfree(event);
134 return len;
135}
136#endif
137
138static void print_pte(unsigned long address)
139{
140 unsigned int level;
141 pte_t *pte = lookup_address(address, &level);
142
143 if (!pte) {
144 pr_err(NAME "Error in %s: no pte for page 0x%08lx\n",
145 __func__, address);
146 return;
147 }
148
149 if (level == PG_LEVEL_2M) {
150 pr_emerg(NAME "4MB pages are not currently supported: "
151 "0x%08lx\n", address);
152 BUG();
153 }
154 pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address,
155 (unsigned long long)pte_val(*pte),
156 (unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
157}
158
159/*
160 * For some reason the pre/post pairs have been called in an
161 * unmatched order. Report and die.
162 */
163static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
164{
165 const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
166 pr_emerg(NAME "unexpected fault for address: 0x%08lx, "
167 "last fault for address: 0x%08lx\n",
168 addr, my_reason->addr);
169 print_pte(addr);
170 print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
171 print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip);
172#ifdef __i386__
173 pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
174 regs->ax, regs->bx, regs->cx, regs->dx);
175 pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
176 regs->si, regs->di, regs->bp, regs->sp);
177#else
178 pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n",
179 regs->ax, regs->cx, regs->dx);
180 pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n",
181 regs->si, regs->di, regs->bp, regs->sp);
182#endif
183 put_cpu_var(pf_reason);
184 BUG();
185}
186
187static void pre(struct kmmio_probe *p, struct pt_regs *regs,
188 unsigned long addr)
189{
190 struct trap_reason *my_reason = &get_cpu_var(pf_reason);
191 struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
192 const unsigned long instptr = instruction_pointer(regs);
193 const enum reason_type type = get_ins_type(instptr);
194 struct remap_trace *trace = p->private;
195
196 /* it doesn't make sense to have more than one active trace per cpu */
197 if (my_reason->active_traces)
198 die_kmmio_nesting_error(regs, addr);
199 else
200 my_reason->active_traces++;
201
202 my_reason->type = type;
203 my_reason->addr = addr;
204 my_reason->ip = instptr;
205
206 my_trace->phys = addr - trace->probe.addr + trace->phys;
207 my_trace->map_id = trace->id;
208
209 /*
210 * Only record the program counter when requested.
211 * It may taint clean-room reverse engineering.
212 */
213 if (trace_pc)
214 my_trace->pc = instptr;
215 else
216 my_trace->pc = 0;
217
218 /*
219 * XXX: the timestamp recorded will be *after* the tracing has been
220 * done, not at the time we hit the instruction. SMP implications
221 * on event ordering?
222 */
223
224 switch (type) {
225 case REG_READ:
226 my_trace->opcode = MMIO_READ;
227 my_trace->width = get_ins_mem_width(instptr);
228 break;
229 case REG_WRITE:
230 my_trace->opcode = MMIO_WRITE;
231 my_trace->width = get_ins_mem_width(instptr);
232 my_trace->value = get_ins_reg_val(instptr, regs);
233 break;
234 case IMM_WRITE:
235 my_trace->opcode = MMIO_WRITE;
236 my_trace->width = get_ins_mem_width(instptr);
237 my_trace->value = get_ins_imm_val(instptr);
238 break;
239 default:
240 {
241 unsigned char *ip = (unsigned char *)instptr;
242 my_trace->opcode = MMIO_UNKNOWN_OP;
243 my_trace->width = 0;
244 my_trace->value = (*ip) << 16 | *(ip + 1) << 8 |
245 *(ip + 2);
246 }
247 }
248 put_cpu_var(cpu_trace);
249 put_cpu_var(pf_reason);
250}
251
252static void post(struct kmmio_probe *p, unsigned long condition,
253 struct pt_regs *regs)
254{
255 struct trap_reason *my_reason = &get_cpu_var(pf_reason);
256 struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
257
258 /* this should always return the active_trace count to 0 */
259 my_reason->active_traces--;
260 if (my_reason->active_traces) {
261 pr_emerg(NAME "unexpected post handler");
262 BUG();
263 }
264
265 switch (my_reason->type) {
266 case REG_READ:
267 my_trace->value = get_ins_reg_val(my_reason->ip, regs);
268 break;
269 default:
270 break;
271 }
272
273 mmio_trace_rw(my_trace);
274 put_cpu_var(cpu_trace);
275 put_cpu_var(pf_reason);
276}
277
278static void ioremap_trace_core(resource_size_t offset, unsigned long size,
279 void __iomem *addr)
280{
281 static atomic_t next_id;
282 struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
283 /* These are page-unaligned. */
284 struct mmiotrace_map map = {
285 .phys = offset,
286 .virt = (unsigned long)addr,
287 .len = size,
288 .opcode = MMIO_PROBE
289 };
290
291 if (!trace) {
292 pr_err(NAME "kmalloc failed in ioremap\n");
293 return;
294 }
295
296 *trace = (struct remap_trace) {
297 .probe = {
298 .addr = (unsigned long)addr,
299 .len = size,
300 .pre_handler = pre,
301 .post_handler = post,
302 .private = trace
303 },
304 .phys = offset,
305 .id = atomic_inc_return(&next_id)
306 };
307 map.map_id = trace->id;
308
309 spin_lock_irq(&trace_lock);
310 if (!is_enabled())
311 goto not_enabled;
312
313 mmio_trace_mapping(&map);
314 list_add_tail(&trace->list, &trace_list);
315 if (!nommiotrace)
316 register_kmmio_probe(&trace->probe);
317
318not_enabled:
319 spin_unlock_irq(&trace_lock);
320}
321
322void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
323 void __iomem *addr)
324{
325 if (!is_enabled()) /* recheck and proper locking in *_core() */
326 return;
327
328 pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n",
329 (unsigned long long)offset, size, addr);
330 if ((filter_offset) && (offset != filter_offset))
331 return;
332 ioremap_trace_core(offset, size, addr);
333}
334
335static void iounmap_trace_core(volatile void __iomem *addr)
336{
337 struct mmiotrace_map map = {
338 .phys = 0,
339 .virt = (unsigned long)addr,
340 .len = 0,
341 .opcode = MMIO_UNPROBE
342 };
343 struct remap_trace *trace;
344 struct remap_trace *tmp;
345 struct remap_trace *found_trace = NULL;
346
347 pr_debug(NAME "Unmapping %p.\n", addr);
348
349 spin_lock_irq(&trace_lock);
350 if (!is_enabled())
351 goto not_enabled;
352
353 list_for_each_entry_safe(trace, tmp, &trace_list, list) {
354 if ((unsigned long)addr == trace->probe.addr) {
355 if (!nommiotrace)
356 unregister_kmmio_probe(&trace->probe);
357 list_del(&trace->list);
358 found_trace = trace;
359 break;
360 }
361 }
362 map.map_id = (found_trace) ? found_trace->id : -1;
363 mmio_trace_mapping(&map);
364
365not_enabled:
366 spin_unlock_irq(&trace_lock);
367 if (found_trace) {
368 synchronize_rcu(); /* unregister_kmmio_probe() requirement */
369 kfree(found_trace);
370 }
371}
372
373void mmiotrace_iounmap(volatile void __iomem *addr)
374{
375 might_sleep();
376 if (is_enabled()) /* recheck and proper locking in *_core() */
377 iounmap_trace_core(addr);
378}
379
380static void clear_trace_list(void)
381{
382 struct remap_trace *trace;
383 struct remap_trace *tmp;
384
385 /*
386 * No locking required, because the caller ensures we are in a
387 * critical section via mutex, and is_enabled() is false,
388 * i.e. nothing can traverse or modify this list.
389 * Caller also ensures is_enabled() cannot change.
390 */
391 list_for_each_entry(trace, &trace_list, list) {
392 pr_notice(NAME "purging non-iounmapped "
393 "trace @0x%08lx, size 0x%lx.\n",
394 trace->probe.addr, trace->probe.len);
395 if (!nommiotrace)
396 unregister_kmmio_probe(&trace->probe);
397 }
398 synchronize_rcu(); /* unregister_kmmio_probe() requirement */
399
400 list_for_each_entry_safe(trace, tmp, &trace_list, list) {
401 list_del(&trace->list);
402 kfree(trace);
403 }
404}
405
406#ifdef CONFIG_HOTPLUG_CPU
407static cpumask_t downed_cpus;
408
409static void enter_uniprocessor(void)
410{
411 int cpu;
412 int err;
413
414 get_online_cpus();
415 downed_cpus = cpu_online_map;
416 cpu_clear(first_cpu(cpu_online_map), downed_cpus);
417 if (num_online_cpus() > 1)
418 pr_notice(NAME "Disabling non-boot CPUs...\n");
419 put_online_cpus();
420
421 for_each_cpu_mask(cpu, downed_cpus) {
422 err = cpu_down(cpu);
423 if (!err)
424 pr_info(NAME "CPU%d is down.\n", cpu);
425 else
426 pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err);
427 }
428 if (num_online_cpus() > 1)
429 pr_warning(NAME "multiple CPUs still online, "
430 "may miss events.\n");
431}
432
433static void leave_uniprocessor(void)
434{
435 int cpu;
436 int err;
437
438 if (cpus_weight(downed_cpus) == 0)
439 return;
440 pr_notice(NAME "Re-enabling CPUs...\n");
441 for_each_cpu_mask(cpu, downed_cpus) {
442 err = cpu_up(cpu);
443 if (!err)
444 pr_info(NAME "enabled CPU%d.\n", cpu);
445 else
446 pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err);
447 }
448}
449
450#else /* !CONFIG_HOTPLUG_CPU */
451static void enter_uniprocessor(void)
452{
453 if (num_online_cpus() > 1)
454 pr_warning(NAME "multiple CPUs are online, may miss events. "
455 "Suggest booting with maxcpus=1 kernel argument.\n");
456}
457
458static void leave_uniprocessor(void)
459{
460}
461#endif
462
463#if 0 /* XXX: out of order */
464static struct file_operations fops_marker = {
465 .owner = THIS_MODULE,
466 .write = write_marker
467};
468#endif
469
470void enable_mmiotrace(void)
471{
472 mutex_lock(&mmiotrace_mutex);
473 if (is_enabled())
474 goto out;
475
476#if 0 /* XXX: tracing does not support text entries */
477 marker_file = debugfs_create_file("marker", 0660, dir, NULL,
478 &fops_marker);
479 if (!marker_file)
480 pr_err(NAME "marker file creation failed.\n");
481#endif
482
483 if (nommiotrace)
484 pr_info(NAME "MMIO tracing disabled.\n");
485 enter_uniprocessor();
486 spin_lock_irq(&trace_lock);
487 atomic_inc(&mmiotrace_enabled);
488 spin_unlock_irq(&trace_lock);
489 pr_info(NAME "enabled.\n");
490out:
491 mutex_unlock(&mmiotrace_mutex);
492}
493
494void disable_mmiotrace(void)
495{
496 mutex_lock(&mmiotrace_mutex);
497 if (!is_enabled())
498 goto out;
499
500 spin_lock_irq(&trace_lock);
501 atomic_dec(&mmiotrace_enabled);
502 BUG_ON(is_enabled());
503 spin_unlock_irq(&trace_lock);
504
505 clear_trace_list(); /* guarantees: no more kmmio callbacks */
506 leave_uniprocessor();
507 if (marker_file) {
508 debugfs_remove(marker_file);
509 marker_file = NULL;
510 }
511
512 pr_info(NAME "disabled.\n");
513out:
514 mutex_unlock(&mmiotrace_mutex);
515}
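[ Editorial sketch, not part of the patch: the expected pairing from a caller's
  point of view; the in-kernel tracer is assumed to be that caller. ]

	enable_mmiotrace();	/* takes non-boot CPUs down, sets the enabled flag */
	/* ... ioremap()/iounmap() and the accesses they cover are recorded ... */
	disable_mmiotrace();	/* clears the flag, purges trace_list and
				   brings the downed CPUs back up */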
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index c5066d519e5d..b432d5781773 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -27,30 +27,17 @@
27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
28EXPORT_SYMBOL(node_data); 28EXPORT_SYMBOL(node_data);
29 29
30bootmem_data_t plat_node_bdata[MAX_NUMNODES]; 30static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
31 31
32struct memnode memnode; 32struct memnode memnode;
33 33
34#ifdef CONFIG_SMP
35int x86_cpu_to_node_map_init[NR_CPUS] = {
36 [0 ... NR_CPUS-1] = NUMA_NO_NODE
37};
38void *x86_cpu_to_node_map_early_ptr;
39EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
40#endif
41DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
42EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
43
44s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 34s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
45 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 35 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
46}; 36};
47 37
48cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
49EXPORT_SYMBOL(node_to_cpumask_map);
50
51int numa_off __initdata; 38int numa_off __initdata;
52unsigned long __initdata nodemap_addr; 39static unsigned long __initdata nodemap_addr;
53unsigned long __initdata nodemap_size; 40static unsigned long __initdata nodemap_size;
54 41
55/* 42/*
56 * Given a shift value, try to populate memnodemap[] 43 * Given a shift value, try to populate memnodemap[]
@@ -99,7 +86,7 @@ static int __init allocate_cachealigned_memnodemap(void)
99 86
100 addr = 0x8000; 87 addr = 0x8000;
101 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 88 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
102 nodemap_addr = find_e820_area(addr, end_pfn<<PAGE_SHIFT, 89 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
103 nodemap_size, L1_CACHE_BYTES); 90 nodemap_size, L1_CACHE_BYTES);
104 if (nodemap_addr == -1UL) { 91 if (nodemap_addr == -1UL) {
105 printk(KERN_ERR 92 printk(KERN_ERR
@@ -192,7 +179,7 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
192void __init setup_node_bootmem(int nodeid, unsigned long start, 179void __init setup_node_bootmem(int nodeid, unsigned long start,
193 unsigned long end) 180 unsigned long end)
194{ 181{
195 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size; 182 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
196 unsigned long bootmap_start, nodedata_phys; 183 unsigned long bootmap_start, nodedata_phys;
197 void *bootmap; 184 void *bootmap;
198 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 185 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
@@ -204,7 +191,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
204 start, end); 191 start, end);
205 192
206 start_pfn = start >> PAGE_SHIFT; 193 start_pfn = start >> PAGE_SHIFT;
207 end_pfn = end >> PAGE_SHIFT; 194 last_pfn = end >> PAGE_SHIFT;
208 195
209 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, 196 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
210 SMP_CACHE_BYTES); 197 SMP_CACHE_BYTES);
@@ -217,7 +204,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
217 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 204 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
218 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 205 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
219 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 206 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
220 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; 207 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
221 208
222 /* 209 /*
223 * Find a place for the bootmem map 210 * Find a place for the bootmem map
@@ -226,14 +213,14 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
226 * early_node_mem will get that with find_e820_area instead 213 * early_node_mem will get that with find_e820_area instead
227 * of alloc_bootmem, that could clash with reserved range 214 * of alloc_bootmem, that could clash with reserved range
228 */ 215 */
229 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 216 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
230 nid = phys_to_nid(nodedata_phys); 217 nid = phys_to_nid(nodedata_phys);
231 if (nid == nodeid) 218 if (nid == nodeid)
232 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 219 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
233 else 220 else
234 bootmap_start = round_up(start, PAGE_SIZE); 221 bootmap_start = round_up(start, PAGE_SIZE);
235 /* 222 /*
236 * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like 223 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
237 * to use that to align to PAGE_SIZE 224 * to use that to align to PAGE_SIZE
238 */ 225 */
239 bootmap = early_node_mem(nodeid, bootmap_start, end, 226 bootmap = early_node_mem(nodeid, bootmap_start, end,
@@ -248,7 +235,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
248 235
249 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 236 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
250 bootmap_start >> PAGE_SHIFT, 237 bootmap_start >> PAGE_SHIFT,
251 start_pfn, end_pfn); 238 start_pfn, last_pfn);
252 239
253 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", 240 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
254 bootmap_start, bootmap_start + bootmap_size - 1, 241 bootmap_start, bootmap_start + bootmap_size - 1,
@@ -309,7 +296,7 @@ void __init numa_init_array(void)
309 296
310#ifdef CONFIG_NUMA_EMU 297#ifdef CONFIG_NUMA_EMU
311/* Numa emulation */ 298/* Numa emulation */
312char *cmdline __initdata; 299static char *cmdline __initdata;
313 300
314/* 301/*
 315 * Sets up nid to range from addr to addr + size. If the end 302
@@ -413,15 +400,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
413} 400}
414 401
415/* 402/*
416 * Sets up the system RAM area from start_pfn to end_pfn according to the 403 * Sets up the system RAM area from start_pfn to last_pfn according to the
417 * numa=fake command-line option. 404 * numa=fake command-line option.
418 */ 405 */
419static struct bootnode nodes[MAX_NUMNODES] __initdata; 406static struct bootnode nodes[MAX_NUMNODES] __initdata;
420 407
421static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 408static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
422{ 409{
423 u64 size, addr = start_pfn << PAGE_SHIFT; 410 u64 size, addr = start_pfn << PAGE_SHIFT;
424 u64 max_addr = end_pfn << PAGE_SHIFT; 411 u64 max_addr = last_pfn << PAGE_SHIFT;
425 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; 412 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
426 413
427 memset(&nodes, 0, sizeof(nodes)); 414 memset(&nodes, 0, sizeof(nodes));
@@ -527,7 +514,7 @@ out:
527} 514}
528#endif /* CONFIG_NUMA_EMU */ 515#endif /* CONFIG_NUMA_EMU */
529 516
530void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 517void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
531{ 518{
532 int i; 519 int i;
533 520
@@ -535,7 +522,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
535 nodes_clear(node_online_map); 522 nodes_clear(node_online_map);
536 523
537#ifdef CONFIG_NUMA_EMU 524#ifdef CONFIG_NUMA_EMU
538 if (cmdline && !numa_emulation(start_pfn, end_pfn)) 525 if (cmdline && !numa_emulation(start_pfn, last_pfn))
539 return; 526 return;
540 nodes_clear(node_possible_map); 527 nodes_clear(node_possible_map);
541 nodes_clear(node_online_map); 528 nodes_clear(node_online_map);
@@ -543,7 +530,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
543 530
544#ifdef CONFIG_ACPI_NUMA 531#ifdef CONFIG_ACPI_NUMA
545 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 532 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
546 end_pfn << PAGE_SHIFT)) 533 last_pfn << PAGE_SHIFT))
547 return; 534 return;
548 nodes_clear(node_possible_map); 535 nodes_clear(node_possible_map);
549 nodes_clear(node_online_map); 536 nodes_clear(node_online_map);
@@ -551,7 +538,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
551 538
552#ifdef CONFIG_K8_NUMA 539#ifdef CONFIG_K8_NUMA
553 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, 540 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
554 end_pfn<<PAGE_SHIFT)) 541 last_pfn<<PAGE_SHIFT))
555 return; 542 return;
556 nodes_clear(node_possible_map); 543 nodes_clear(node_possible_map);
557 nodes_clear(node_online_map); 544 nodes_clear(node_online_map);
@@ -561,7 +548,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
561 548
562 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 549 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
563 start_pfn << PAGE_SHIFT, 550 start_pfn << PAGE_SHIFT,
564 end_pfn << PAGE_SHIFT); 551 last_pfn << PAGE_SHIFT);
565 /* setup dummy node covering all memory */ 552 /* setup dummy node covering all memory */
566 memnode_shift = 63; 553 memnode_shift = 63;
567 memnodemap = memnode.embedded_map; 554 memnodemap = memnode.embedded_map;
@@ -570,29 +557,8 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
570 node_set(0, node_possible_map); 557 node_set(0, node_possible_map);
571 for (i = 0; i < NR_CPUS; i++) 558 for (i = 0; i < NR_CPUS; i++)
572 numa_set_node(i, 0); 559 numa_set_node(i, 0);
573 /* cpumask_of_cpu() may not be available during early startup */ 560 e820_register_active_regions(0, start_pfn, last_pfn);
574 memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0])); 561 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
575 cpu_set(0, node_to_cpumask_map[0]);
576 e820_register_active_regions(0, start_pfn, end_pfn);
577 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
578}
579
580__cpuinit void numa_add_cpu(int cpu)
581{
582 set_bit(cpu,
583 (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
584}
585
586void __cpuinit numa_set_node(int cpu, int node)
587{
588 int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
589
590 if(cpu_to_node_map)
591 cpu_to_node_map[cpu] = node;
592 else if(per_cpu_offset(cpu))
593 per_cpu(x86_cpu_to_node_map, cpu) = node;
594 else
595 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
596} 562}
597 563
598unsigned long __init numa_free_all_bootmem(void) 564unsigned long __init numa_free_all_bootmem(void)
@@ -613,7 +579,7 @@ void __init paging_init(void)
613 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 579 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
614 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 580 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
615 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 581 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
616 max_zone_pfns[ZONE_NORMAL] = end_pfn; 582 max_zone_pfns[ZONE_NORMAL] = max_pfn;
617 583
618 sparse_memory_present_with_active_regions(MAX_NUMNODES); 584 sparse_memory_present_with_active_regions(MAX_NUMNODES);
619 sparse_init(); 585 sparse_init();
@@ -641,6 +607,7 @@ static __init int numa_setup(char *opt)
641} 607}
642early_param("numa", numa_setup); 608early_param("numa", numa_setup);
643 609
610#ifdef CONFIG_NUMA
644/* 611/*
645 * Setup early cpu_to_node. 612 * Setup early cpu_to_node.
646 * 613 *
@@ -652,14 +619,19 @@ early_param("numa", numa_setup);
652 * is already initialized in a round robin manner at numa_init_array, 619 * is already initialized in a round robin manner at numa_init_array,
653 * prior to this call, and this initialization is good enough 620 * prior to this call, and this initialization is good enough
654 * for the fake NUMA cases. 621 * for the fake NUMA cases.
622 *
623 * Called before the per_cpu areas are setup.
655 */ 624 */
656void __init init_cpu_to_node(void) 625void __init init_cpu_to_node(void)
657{ 626{
658 int i; 627 int cpu;
628 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
659 629
660 for (i = 0; i < NR_CPUS; i++) { 630 BUG_ON(cpu_to_apicid == NULL);
631
632 for_each_possible_cpu(cpu) {
661 int node; 633 int node;
662 u16 apicid = x86_cpu_to_apicid_init[i]; 634 u16 apicid = cpu_to_apicid[cpu];
663 635
664 if (apicid == BAD_APICID) 636 if (apicid == BAD_APICID)
665 continue; 637 continue;
@@ -668,8 +640,9 @@ void __init init_cpu_to_node(void)
668 continue; 640 continue;
669 if (!node_online(node)) 641 if (!node_online(node))
670 continue; 642 continue;
671 numa_set_node(i, node); 643 numa_set_node(cpu, node);
672 } 644 }
673} 645}
646#endif
674 647
675 648
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 75f1b109aae8..0dcd42eb94e6 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * self test for change_page_attr. 2 * self test for change_page_attr.
3 * 3 *
 4 * Clears the global bit on random pages in the direct mapping, then reverts 4 * Clears a test pte bit on random pages in the direct mapping,
5 * and compares page tables forwards and afterwards. 5 * then reverts and compares page tables forwards and afterwards.
6 */ 6 */
7#include <linux/bootmem.h> 7#include <linux/bootmem.h>
8#include <linux/kthread.h> 8#include <linux/kthread.h>
@@ -32,6 +32,13 @@ enum {
32 GPS = (1<<30) 32 GPS = (1<<30)
33}; 33};
34 34
35#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1)
36
37static int pte_testbit(pte_t pte)
38{
39 return pte_flags(pte) & _PAGE_UNUSED1;
40}
41
35struct split_state { 42struct split_state {
36 long lpg, gpg, spg, exec; 43 long lpg, gpg, spg, exec;
37 long min_exec, max_exec; 44 long min_exec, max_exec;
@@ -165,15 +172,14 @@ static int pageattr_test(void)
165 continue; 172 continue;
166 } 173 }
167 174
168 err = change_page_attr_clear(addr[i], len[i], 175 err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT);
169 __pgprot(_PAGE_GLOBAL));
170 if (err < 0) { 176 if (err < 0) {
171 printk(KERN_ERR "CPA %d failed %d\n", i, err); 177 printk(KERN_ERR "CPA %d failed %d\n", i, err);
172 failed++; 178 failed++;
173 } 179 }
174 180
175 pte = lookup_address(addr[i], &level); 181 pte = lookup_address(addr[i], &level);
176 if (!pte || pte_global(*pte) || pte_huge(*pte)) { 182 if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) {
177 printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i], 183 printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
178 pte ? (u64)pte_val(*pte) : 0ULL); 184 pte ? (u64)pte_val(*pte) : 0ULL);
179 failed++; 185 failed++;
@@ -198,14 +204,13 @@ static int pageattr_test(void)
198 failed++; 204 failed++;
199 continue; 205 continue;
200 } 206 }
201 err = change_page_attr_set(addr[i], len[i], 207 err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT);
202 __pgprot(_PAGE_GLOBAL));
203 if (err < 0) { 208 if (err < 0) {
204 printk(KERN_ERR "CPA reverting failed: %d\n", err); 209 printk(KERN_ERR "CPA reverting failed: %d\n", err);
205 failed++; 210 failed++;
206 } 211 }
207 pte = lookup_address(addr[i], &level); 212 pte = lookup_address(addr[i], &level);
208 if (!pte || !pte_global(*pte)) { 213 if (!pte || pte_testbit(*pte)) {
209 printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n", 214 printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
210 addr[i], pte ? (u64)pte_val(*pte) : 0ULL); 215 addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
211 failed++; 216 failed++;
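[ Editorial sketch, not part of the patch: the set/verify/clear cycle the test
  now performs with the spare PTE bit; addr and len stand for one of the random
  test ranges. ]

	err = change_page_attr_set(addr, len, PAGE_TESTBIT);
	pte = lookup_address(addr, &level);
	/* expect: pte present, pte_testbit(*pte) set, not a huge pte */

	err = change_page_attr_clear(addr, len, PAGE_TESTBIT);
	pte = lookup_address(addr, &level);
	/* expect: pte present and pte_testbit(*pte) clear again */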
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 60bcb5b6a37e..65c6e46bf059 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -34,6 +34,41 @@ struct cpa_data {
34 unsigned force_split : 1; 34 unsigned force_split : 1;
35}; 35};
36 36
37#ifdef CONFIG_PROC_FS
38static unsigned long direct_pages_count[PG_LEVEL_NUM];
39
40void update_page_count(int level, unsigned long pages)
41{
42 unsigned long flags;
43
44 /* Protect against CPA */
45 spin_lock_irqsave(&pgd_lock, flags);
46 direct_pages_count[level] += pages;
47 spin_unlock_irqrestore(&pgd_lock, flags);
48}
49
50static void split_page_count(int level)
51{
52 direct_pages_count[level]--;
53 direct_pages_count[level - 1] += PTRS_PER_PTE;
54}
55
56int arch_report_meminfo(char *page)
57{
58 int n = sprintf(page, "DirectMap4k: %8lu\n"
59 "DirectMap2M: %8lu\n",
60 direct_pages_count[PG_LEVEL_4K],
61 direct_pages_count[PG_LEVEL_2M]);
62#ifdef CONFIG_X86_64
63 n += sprintf(page + n, "DirectMap1G: %8lu\n",
64 direct_pages_count[PG_LEVEL_1G]);
65#endif
66 return n;
67}
68#else
69static inline void split_page_count(int level) { }
70#endif
71
37#ifdef CONFIG_X86_64 72#ifdef CONFIG_X86_64
38 73
39static inline unsigned long highmap_start_pfn(void) 74static inline unsigned long highmap_start_pfn(void)
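[ Editorial illustration, not part of the patch: assuming arch_report_meminfo()
  is hooked into the /proc/meminfo output elsewhere, the counters added above
  would render roughly as follows (hypothetical values, counted in pages of the
  respective size). ]

	DirectMap4k:    16384
	DirectMap2M:      500
	DirectMap1G:        1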
@@ -106,7 +141,7 @@ static void cpa_flush_all(unsigned long cache)
106{ 141{
107 BUG_ON(irqs_disabled()); 142 BUG_ON(irqs_disabled());
108 143
109 on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); 144 on_each_cpu(__cpa_flush_all, (void *) cache, 1);
110} 145}
111 146
112static void __cpa_flush_range(void *arg) 147static void __cpa_flush_range(void *arg)
@@ -127,7 +162,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
127 BUG_ON(irqs_disabled()); 162 BUG_ON(irqs_disabled());
128 WARN_ON(PAGE_ALIGN(start) != start); 163 WARN_ON(PAGE_ALIGN(start) != start);
129 164
130 on_each_cpu(__cpa_flush_range, NULL, 1, 1); 165 on_each_cpu(__cpa_flush_range, NULL, 1);
131 166
132 if (!cache) 167 if (!cache)
133 return; 168 return;
@@ -227,6 +262,7 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
227 262
228 return pte_offset_kernel(pmd, address); 263 return pte_offset_kernel(pmd, address);
229} 264}
265EXPORT_SYMBOL_GPL(lookup_address);
230 266
231/* 267/*
232 * Set the new pmd in all the pgds we know about: 268 * Set the new pmd in all the pgds we know about:
@@ -500,6 +536,16 @@ static int split_large_page(pte_t *kpte, unsigned long address)
500 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) 536 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
501 set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); 537 set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
502 538
539 if (address >= (unsigned long)__va(0) &&
540 address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
541 split_page_count(level);
542
543#ifdef CONFIG_X86_64
544 if (address >= (unsigned long)__va(1UL<<32) &&
545 address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
546 split_page_count(level);
547#endif
548
503 /* 549 /*
504 * Install the new, split up pagetable. Important details here: 550 * Install the new, split up pagetable. Important details here:
505 * 551 *
@@ -613,15 +659,24 @@ static int cpa_process_alias(struct cpa_data *cpa)
613 struct cpa_data alias_cpa; 659 struct cpa_data alias_cpa;
614 int ret = 0; 660 int ret = 0;
615 661
616 if (cpa->pfn > max_pfn_mapped) 662 if (cpa->pfn >= max_pfn_mapped)
617 return 0; 663 return 0;
618 664
665#ifdef CONFIG_X86_64
666 if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
667 return 0;
668#endif
619 /* 669 /*
620 * No need to redo, when the primary call touched the direct 670 * No need to redo, when the primary call touched the direct
621 * mapping already: 671 * mapping already:
622 */ 672 */
623 if (!within(cpa->vaddr, PAGE_OFFSET, 673 if (!(within(cpa->vaddr, PAGE_OFFSET,
624 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { 674 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
675#ifdef CONFIG_X86_64
676 || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
677 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
678#endif
679 )) {
625 680
626 alias_cpa = *cpa; 681 alias_cpa = *cpa;
627 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 682 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
@@ -805,7 +860,7 @@ int _set_memory_wc(unsigned long addr, int numpages)
805 860
806int set_memory_wc(unsigned long addr, int numpages) 861int set_memory_wc(unsigned long addr, int numpages)
807{ 862{
808 if (!pat_wc_enabled) 863 if (!pat_enabled)
809 return set_memory_uc(addr, numpages); 864 return set_memory_uc(addr, numpages);
810 865
811 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 866 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index de3a99812450..d4585077977a 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -26,15 +26,15 @@
26#include <asm/io.h> 26#include <asm/io.h>
27 27
28#ifdef CONFIG_X86_PAT 28#ifdef CONFIG_X86_PAT
29int __read_mostly pat_wc_enabled = 1; 29int __read_mostly pat_enabled = 1;
30 30
31void __cpuinit pat_disable(char *reason) 31void __cpuinit pat_disable(char *reason)
32{ 32{
33 pat_wc_enabled = 0; 33 pat_enabled = 0;
34 printk(KERN_INFO "%s\n", reason); 34 printk(KERN_INFO "%s\n", reason);
35} 35}
36 36
37static int nopat(char *str) 37static int __init nopat(char *str)
38{ 38{
39 pat_disable("PAT support disabled."); 39 pat_disable("PAT support disabled.");
40 return 0; 40 return 0;
@@ -42,6 +42,19 @@ static int nopat(char *str)
42early_param("nopat", nopat); 42early_param("nopat", nopat);
43#endif 43#endif
44 44
45
46static int debug_enable;
47static int __init pat_debug_setup(char *str)
48{
49 debug_enable = 1;
50 return 0;
51}
52__setup("debugpat", pat_debug_setup);
53
54#define dprintk(fmt, arg...) \
55 do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
56
57
45static u64 __read_mostly boot_pat_state; 58static u64 __read_mostly boot_pat_state;
46 59
47enum { 60enum {
@@ -53,24 +66,25 @@ enum {
 53 PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ 66
54}; 67};
55 68
56#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8)) 69#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
57 70
58void pat_init(void) 71void pat_init(void)
59{ 72{
60 u64 pat; 73 u64 pat;
61 74
62 if (!pat_wc_enabled) 75 if (!pat_enabled)
63 return; 76 return;
64 77
65 /* Paranoia check. */ 78 /* Paranoia check. */
66 if (!cpu_has_pat) { 79 if (!cpu_has_pat && boot_pat_state) {
67 printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
68 /* 80 /*
69 * Panic if this happens on the secondary CPU, and we 81 * If this happens we are on a secondary CPU, but
70 * switched to PAT on the boot CPU. We have no way to 82 * switched to PAT on the boot CPU. We have no way to
71 * undo PAT. 83 * undo PAT.
72 */ 84 */
73 BUG_ON(boot_pat_state); 85 printk(KERN_ERR "PAT enabled, "
86 "but not supported by secondary CPU\n");
87 BUG();
74 } 88 }
75 89
76 /* Set PWT to Write-Combining. All other bits stay the same */ 90 /* Set PWT to Write-Combining. All other bits stay the same */
@@ -86,8 +100,8 @@ void pat_init(void)
86 * 011 UC _PAGE_CACHE_UC 100 * 011 UC _PAGE_CACHE_UC
87 * PAT bit unused 101 * PAT bit unused
88 */ 102 */
89 pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) | 103 pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
90 PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC); 104 PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
91 105
92 /* Boot CPU check */ 106 /* Boot CPU check */
93 if (!boot_pat_state) 107 if (!boot_pat_state)
@@ -103,11 +117,11 @@ void pat_init(void)
103static char *cattr_name(unsigned long flags) 117static char *cattr_name(unsigned long flags)
104{ 118{
105 switch (flags & _PAGE_CACHE_MASK) { 119 switch (flags & _PAGE_CACHE_MASK) {
106 case _PAGE_CACHE_UC: return "uncached"; 120 case _PAGE_CACHE_UC: return "uncached";
107 case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; 121 case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
108 case _PAGE_CACHE_WB: return "write-back"; 122 case _PAGE_CACHE_WB: return "write-back";
109 case _PAGE_CACHE_WC: return "write-combining"; 123 case _PAGE_CACHE_WC: return "write-combining";
110 default: return "broken"; 124 default: return "broken";
111 } 125 }
112} 126}
113 127
@@ -145,46 +159,50 @@ static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
145 * The intersection is based on "Effective Memory Type" tables in IA-32 159 * The intersection is based on "Effective Memory Type" tables in IA-32
146 * SDM vol 3a 160 * SDM vol 3a
147 */ 161 */
148static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, 162static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
149 unsigned long *ret_prot)
150{ 163{
151 unsigned long pat_type; 164 /*
152 u8 mtrr_type; 165 * Look for MTRR hint to get the effective type in case where PAT
153 166 * request is for WB.
154 mtrr_type = mtrr_type_lookup(start, end); 167 */
155 if (mtrr_type == 0xFF) { /* MTRR not enabled */ 168 if (req_type == _PAGE_CACHE_WB) {
156 *ret_prot = prot; 169 u8 mtrr_type;
157 return 0; 170
158 } 171 mtrr_type = mtrr_type_lookup(start, end);
159 if (mtrr_type == 0xFE) { /* MTRR match error */ 172 if (mtrr_type == MTRR_TYPE_UNCACHABLE)
160 *ret_prot = _PAGE_CACHE_UC; 173 return _PAGE_CACHE_UC;
161 return -1; 174 if (mtrr_type == MTRR_TYPE_WRCOMB)
162 } 175 return _PAGE_CACHE_WC;
163 if (mtrr_type != MTRR_TYPE_UNCACHABLE &&
164 mtrr_type != MTRR_TYPE_WRBACK &&
165 mtrr_type != MTRR_TYPE_WRCOMB) { /* MTRR type unhandled */
166 *ret_prot = _PAGE_CACHE_UC;
167 return -1;
168 } 176 }
169 177
170 pat_type = prot & _PAGE_CACHE_MASK; 178 return req_type;
171 prot &= (~_PAGE_CACHE_MASK); 179}
172 180
173 /* Currently doing intersection by hand. Optimize it later. */ 181static int chk_conflict(struct memtype *new, struct memtype *entry,
174 if (pat_type == _PAGE_CACHE_WC) { 182 unsigned long *type)
175 *ret_prot = prot | _PAGE_CACHE_WC; 183{
176 } else if (pat_type == _PAGE_CACHE_UC_MINUS) { 184 if (new->type != entry->type) {
177 *ret_prot = prot | _PAGE_CACHE_UC_MINUS; 185 if (type) {
178 } else if (pat_type == _PAGE_CACHE_UC || 186 new->type = entry->type;
179 mtrr_type == MTRR_TYPE_UNCACHABLE) { 187 *type = entry->type;
180 *ret_prot = prot | _PAGE_CACHE_UC; 188 } else
181 } else if (mtrr_type == MTRR_TYPE_WRCOMB) { 189 goto conflict;
182 *ret_prot = prot | _PAGE_CACHE_WC;
183 } else {
184 *ret_prot = prot | _PAGE_CACHE_WB;
185 } 190 }
186 191
192 /* check overlaps with more than one entry in the list */
193 list_for_each_entry_continue(entry, &memtype_list, nd) {
194 if (new->end <= entry->start)
195 break;
196 else if (new->type != entry->type)
197 goto conflict;
198 }
187 return 0; 199 return 0;
200
201 conflict:
202 printk(KERN_INFO "%s:%d conflicting memory types "
203 "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
204 new->end, cattr_name(new->type), cattr_name(entry->type));
205 return -EBUSY;
188} 206}
189 207
190/* 208/*
@@ -197,251 +215,134 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
 197 * req_type will have a special case value '-1', when the requester wants to inherit 215
198 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. 216 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
199 * 217 *
200 * If ret_type is NULL, function will return an error if it cannot reserve the 218 * If new_type is NULL, function will return an error if it cannot reserve the
201 * region with req_type. If ret_type is non-null, function will return 219 * region with req_type. If new_type is non-NULL, function will return
202 * available type in ret_type in case of no error. In case of any error 220 * available type in new_type in case of no error. In case of any error
203 * it will return a negative return value. 221 * it will return a negative return value.
204 */ 222 */
205int reserve_memtype(u64 start, u64 end, unsigned long req_type, 223int reserve_memtype(u64 start, u64 end, unsigned long req_type,
206 unsigned long *ret_type) 224 unsigned long *new_type)
207{ 225{
208 struct memtype *new_entry = NULL; 226 struct memtype *new, *entry;
209 struct memtype *parse;
210 unsigned long actual_type; 227 unsigned long actual_type;
228 struct list_head *where;
211 int err = 0; 229 int err = 0;
212 230
213 /* Only track when pat_wc_enabled */ 231 BUG_ON(start >= end); /* end is exclusive */
214 if (!pat_wc_enabled) { 232
233 if (!pat_enabled) {
215 /* This is identical to page table setting without PAT */ 234 /* This is identical to page table setting without PAT */
216 if (ret_type) { 235 if (new_type) {
217 if (req_type == -1) { 236 if (req_type == -1)
218 *ret_type = _PAGE_CACHE_WB; 237 *new_type = _PAGE_CACHE_WB;
219 } else { 238 else
220 *ret_type = req_type; 239 *new_type = req_type & _PAGE_CACHE_MASK;
221 }
222 } 240 }
223 return 0; 241 return 0;
224 } 242 }
225 243
226 /* Low ISA region is always mapped WB in page table. No need to track */ 244 /* Low ISA region is always mapped WB in page table. No need to track */
227 if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) { 245 if (is_ISA_range(start, end - 1)) {
228 if (ret_type) 246 if (new_type)
229 *ret_type = _PAGE_CACHE_WB; 247 *new_type = _PAGE_CACHE_WB;
230
231 return 0; 248 return 0;
232 } 249 }
233 250
234 if (req_type == -1) { 251 if (req_type == -1) {
235 /* 252 /*
236 * Special case where caller wants to inherit from mtrr or 253 * Call mtrr_lookup to get the type hint. This is an
237 * existing pat mapping, defaulting to UC_MINUS in case of 254 * optimization for /dev/mem mmap'ers into WB memory (BIOS
238 * no match. 255 * tools and ACPI tools). Use WB request for WB memory and use
256 * UC_MINUS otherwise.
239 */ 257 */
240 u8 mtrr_type = mtrr_type_lookup(start, end); 258 u8 mtrr_type = mtrr_type_lookup(start, end);
241 if (mtrr_type == 0xFE) { /* MTRR match error */
242 err = -1;
243 }
244 259
245 if (mtrr_type == MTRR_TYPE_WRBACK) { 260 if (mtrr_type == MTRR_TYPE_WRBACK)
246 req_type = _PAGE_CACHE_WB;
247 actual_type = _PAGE_CACHE_WB; 261 actual_type = _PAGE_CACHE_WB;
248 } else { 262 else
249 req_type = _PAGE_CACHE_UC_MINUS;
250 actual_type = _PAGE_CACHE_UC_MINUS; 263 actual_type = _PAGE_CACHE_UC_MINUS;
251 } 264 } else
252 } else { 265 actual_type = pat_x_mtrr_type(start, end,
253 req_type &= _PAGE_CACHE_MASK; 266 req_type & _PAGE_CACHE_MASK);
254 err = pat_x_mtrr_type(start, end, req_type, &actual_type);
255 }
256 267
257 if (err) { 268 new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
258 if (ret_type) 269 if (!new)
259 *ret_type = actual_type;
260
261 return -EINVAL;
262 }
263
264 new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
265 if (!new_entry)
266 return -ENOMEM; 270 return -ENOMEM;
267 271
268 new_entry->start = start; 272 new->start = start;
269 new_entry->end = end; 273 new->end = end;
270 new_entry->type = actual_type; 274 new->type = actual_type;
271 275
272 if (ret_type) 276 if (new_type)
273 *ret_type = actual_type; 277 *new_type = actual_type;
274 278
275 spin_lock(&memtype_lock); 279 spin_lock(&memtype_lock);
276 280
277 /* Search for existing mapping that overlaps the current range */ 281 /* Search for existing mapping that overlaps the current range */
278 list_for_each_entry(parse, &memtype_list, nd) { 282 where = NULL;
279 struct memtype *saved_ptr; 283 list_for_each_entry(entry, &memtype_list, nd) {
280 284 if (end <= entry->start) {
281 if (parse->start >= end) { 285 where = entry->nd.prev;
282 pr_debug("New Entry\n");
283 list_add(&new_entry->nd, parse->nd.prev);
284 new_entry = NULL;
285 break; 286 break;
286 } 287 } else if (start <= entry->start) { /* end > entry->start */
287 288 err = chk_conflict(new, entry, new_type);
288 if (start <= parse->start && end >= parse->start) { 289 if (!err) {
289 if (actual_type != parse->type && ret_type) { 290 dprintk("Overlap at 0x%Lx-0x%Lx\n",
290 actual_type = parse->type; 291 entry->start, entry->end);
291 *ret_type = actual_type; 292 where = entry->nd.prev;
292 new_entry->type = actual_type;
293 }
294
295 if (actual_type != parse->type) {
296 printk(
297 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
298 current->comm, current->pid,
299 start, end,
300 cattr_name(actual_type),
301 cattr_name(parse->type));
302 err = -EBUSY;
303 break;
304 }
305
306 saved_ptr = parse;
307 /*
308 * Check to see whether the request overlaps more
309 * than one entry in the list
310 */
311 list_for_each_entry_continue(parse, &memtype_list, nd) {
312 if (end <= parse->start) {
313 break;
314 }
315
316 if (actual_type != parse->type) {
317 printk(
318 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
319 current->comm, current->pid,
320 start, end,
321 cattr_name(actual_type),
322 cattr_name(parse->type));
323 err = -EBUSY;
324 break;
325 }
326 } 293 }
327
328 if (err) {
329 break;
330 }
331
332 pr_debug("Overlap at 0x%Lx-0x%Lx\n",
333 saved_ptr->start, saved_ptr->end);
334 /* No conflict. Go ahead and add this new entry */
335 list_add(&new_entry->nd, saved_ptr->nd.prev);
336 new_entry = NULL;
337 break; 294 break;
338 } 295 } else if (start < entry->end) { /* start > entry->start */
339 296 err = chk_conflict(new, entry, new_type);
340 if (start < parse->end) { 297 if (!err) {
341 if (actual_type != parse->type && ret_type) { 298 dprintk("Overlap at 0x%Lx-0x%Lx\n",
342 actual_type = parse->type; 299 entry->start, entry->end);
343 *ret_type = actual_type; 300 where = &entry->nd;
344 new_entry->type = actual_type;
345 }
346
347 if (actual_type != parse->type) {
348 printk(
349 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
350 current->comm, current->pid,
351 start, end,
352 cattr_name(actual_type),
353 cattr_name(parse->type));
354 err = -EBUSY;
355 break;
356 } 301 }
357
358 saved_ptr = parse;
359 /*
360 * Check to see whether the request overlaps more
361 * than one entry in the list
362 */
363 list_for_each_entry_continue(parse, &memtype_list, nd) {
364 if (end <= parse->start) {
365 break;
366 }
367
368 if (actual_type != parse->type) {
369 printk(
370 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
371 current->comm, current->pid,
372 start, end,
373 cattr_name(actual_type),
374 cattr_name(parse->type));
375 err = -EBUSY;
376 break;
377 }
378 }
379
380 if (err) {
381 break;
382 }
383
384 pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
385 saved_ptr->start, saved_ptr->end);
386 /* No conflict. Go ahead and add this new entry */
387 list_add(&new_entry->nd, &saved_ptr->nd);
388 new_entry = NULL;
389 break; 302 break;
390 } 303 }
391 } 304 }
392 305
393 if (err) { 306 if (err) {
394 printk(KERN_INFO 307 printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
395 "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", 308 "track %s, req %s\n",
396 start, end, cattr_name(new_entry->type), 309 start, end, cattr_name(new->type), cattr_name(req_type));
397 cattr_name(req_type)); 310 kfree(new);
398 kfree(new_entry);
399 spin_unlock(&memtype_lock); 311 spin_unlock(&memtype_lock);
400 return err; 312 return err;
401 } 313 }
402 314
403 if (new_entry) { 315 if (where)
404 /* No conflict. Not yet added to the list. Add to the tail */ 316 list_add(&new->nd, where);
405 list_add_tail(&new_entry->nd, &memtype_list); 317 else
406 pr_debug("New Entry\n"); 318 list_add_tail(&new->nd, &memtype_list);
407 }
408
409 if (ret_type) {
410 pr_debug(
411 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
412 start, end, cattr_name(actual_type),
413 cattr_name(req_type), cattr_name(*ret_type));
414 } else {
415 pr_debug(
416 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
417 start, end, cattr_name(actual_type),
418 cattr_name(req_type));
419 }
420 319
421 spin_unlock(&memtype_lock); 320 spin_unlock(&memtype_lock);
321
322 dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
323 start, end, cattr_name(new->type), cattr_name(req_type),
324 new_type ? cattr_name(*new_type) : "-");
325
422 return err; 326 return err;
423} 327}
424 328
425int free_memtype(u64 start, u64 end) 329int free_memtype(u64 start, u64 end)
426{ 330{
427 struct memtype *ml; 331 struct memtype *entry;
428 int err = -EINVAL; 332 int err = -EINVAL;
429 333
430 /* Only track when pat_wc_enabled */ 334 if (!pat_enabled)
431 if (!pat_wc_enabled) {
432 return 0; 335 return 0;
433 }
434 336
435 /* Low ISA region is always mapped WB. No need to track */ 337 /* Low ISA region is always mapped WB. No need to track */
436 if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) { 338 if (is_ISA_range(start, end - 1))
437 return 0; 339 return 0;
438 }
439 340
440 spin_lock(&memtype_lock); 341 spin_lock(&memtype_lock);
441 list_for_each_entry(ml, &memtype_list, nd) { 342 list_for_each_entry(entry, &memtype_list, nd) {
442 if (ml->start == start && ml->end == end) { 343 if (entry->start == start && entry->end == end) {
443 list_del(&ml->nd); 344 list_del(&entry->nd);
444 kfree(ml); 345 kfree(entry);
445 err = 0; 346 err = 0;
446 break; 347 break;
447 } 348 }
@@ -453,7 +354,7 @@ int free_memtype(u64 start, u64 end)
453 current->comm, current->pid, start, end); 354 current->comm, current->pid, start, end);
454 } 355 }
455 356
456 pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end); 357 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
457 return err; 358 return err;
458} 359}
459 360
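[ Editorial sketch, not part of the patch: using the reworked reserve/free API;
  the range and function name are hypothetical. ]

	static int example_reserve_wc(u64 start, u64 end)
	{
		unsigned long actual;
		int err;

		err = reserve_memtype(start, end, _PAGE_CACHE_WC, &actual);
		if (err)
			return err;	/* a conflicting type is already tracked */
		/*
		 * 'actual' is the type actually granted; it may differ from WC
		 * when MTRRs or an existing entry force e.g. UC_MINUS.
		 */
		/* ... set up the mapping using 'actual' ... */
		free_memtype(start, end);	/* the exact range reserved above */
		return 0;
	}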
@@ -522,12 +423,12 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
522 * caching for the high addresses through the KEN pin, but 423 * caching for the high addresses through the KEN pin, but
523 * we maintain the tradition of paranoia in this code. 424 * we maintain the tradition of paranoia in this code.
524 */ 425 */
525 if (!pat_wc_enabled && 426 if (!pat_enabled &&
526 ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || 427 !(boot_cpu_has(X86_FEATURE_MTRR) ||
527 test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || 428 boot_cpu_has(X86_FEATURE_K6_MTRR) ||
528 test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || 429 boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
529 test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) && 430 boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
530 (pfn << PAGE_SHIFT) >= __pa(high_memory)) { 431 (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
531 flags = _PAGE_CACHE_UC; 432 flags = _PAGE_CACHE_UC;
532 } 433 }
533#endif 434#endif
@@ -548,8 +449,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
548 if (retval < 0) 449 if (retval < 0)
549 return 0; 450 return 0;
550 451
551 if (pfn <= max_pfn_mapped && 452 if (((pfn < max_low_pfn_mapped) ||
552 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { 453 (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) &&
454 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
553 free_memtype(offset, offset + size); 455 free_memtype(offset, offset + size);
554 printk(KERN_INFO 456 printk(KERN_INFO
555 "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", 457 "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
@@ -587,4 +489,3 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
587 489
588 free_memtype(addr, addr + size); 490 free_memtype(addr, addr + size);
589} 491}
590
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
new file mode 100644
index 000000000000..efa1911e20ca
--- /dev/null
+++ b/arch/x86/mm/pf_in.c
@@ -0,0 +1,489 @@
1/*
2 * Fault Injection Test harness (FI)
 3 * Copyright (C) Intel Corp.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
18 * USA.
19 *
20 */
21
22/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp
 23 * Copyright by Intel Corp., 2002
24 * Louis Zhuang (louis.zhuang@intel.com)
25 *
26 * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007
27 */
28
29#include <linux/module.h>
30#include <linux/ptrace.h> /* struct pt_regs */
31#include "pf_in.h"
32
33#ifdef __i386__
34/* IA32 Manual 3, 2-1 */
35static unsigned char prefix_codes[] = {
36 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
37 0x65, 0x2E, 0x3E, 0x66, 0x67
38};
39/* IA32 Manual 3, 3-432*/
40static unsigned int reg_rop[] = {
41 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
42};
43static unsigned int reg_wop[] = { 0x88, 0x89 };
44static unsigned int imm_wop[] = { 0xC6, 0xC7 };
45/* IA32 Manual 3, 3-432*/
46static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 };
47static unsigned int rw32[] = {
48 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
49};
50static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F };
51static unsigned int mw16[] = { 0xB70F, 0xBF0F };
52static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 };
53static unsigned int mw64[] = {};
54#else /* not __i386__ */
55static unsigned char prefix_codes[] = {
56 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36,
57 0xF0, 0xF3, 0xF2,
58 /* REX Prefixes */
59 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
60 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f
61};
62/* AMD64 Manual 3, Appendix A*/
63static unsigned int reg_rop[] = {
64 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
65};
66static unsigned int reg_wop[] = { 0x88, 0x89 };
67static unsigned int imm_wop[] = { 0xC6, 0xC7 };
68static unsigned int rw8[] = { 0xC6, 0x88, 0x8A };
69static unsigned int rw32[] = {
70 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
71};
72/* 8 bit only */
73static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F };
74/* 16 bit only */
75static unsigned int mw16[] = { 0xB70F, 0xBF0F };
76/* 16 or 32 bit */
77static unsigned int mw32[] = { 0xC7 };
78/* 16, 32 or 64 bit */
79static unsigned int mw64[] = { 0x89, 0x8B };
80#endif /* not __i386__ */
81
82static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged,
83 int *rexr)
84{
85 int i;
86 unsigned char *p = addr;
87 *shorted = 0;
88 *enlarged = 0;
89 *rexr = 0;
90
91restart:
92 for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
93 if (*p == prefix_codes[i]) {
94 if (*p == 0x66)
95 *shorted = 1;
96#ifdef __amd64__
97 if ((*p & 0xf8) == 0x48)
98 *enlarged = 1;
99 if ((*p & 0xf4) == 0x44)
100 *rexr = 1;
101#endif
102 p++;
103 goto restart;
104 }
105 }
106
107 return (p - addr);
108}
109
110static int get_opcode(unsigned char *addr, unsigned int *opcode)
111{
112 int len;
113
114 if (*addr == 0x0F) {
115 /* 0x0F is an extension (two-byte) opcode */
116 *opcode = *(unsigned short *)addr;
117 len = 2;
118 } else {
119 *opcode = *addr;
120 len = 1;
121 }
122
123 return len;
124}
125
126#define CHECK_OP_TYPE(opcode, array, type) \
127 for (i = 0; i < ARRAY_SIZE(array); i++) { \
128 if (array[i] == opcode) { \
129 rv = type; \
130 goto exit; \
131 } \
132 }
133
134enum reason_type get_ins_type(unsigned long ins_addr)
135{
136 unsigned int opcode;
137 unsigned char *p;
138 int shorted, enlarged, rexr;
139 int i;
140 enum reason_type rv = OTHERS;
141
142 p = (unsigned char *)ins_addr;
143 p += skip_prefix(p, &shorted, &enlarged, &rexr);
144 p += get_opcode(p, &opcode);
145
146 CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
147 CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE);
148 CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE);
149
150exit:
151 return rv;
152}
153#undef CHECK_OP_TYPE
154
155static unsigned int get_ins_reg_width(unsigned long ins_addr)
156{
157 unsigned int opcode;
158 unsigned char *p;
159 int i, shorted, enlarged, rexr;
160
161 p = (unsigned char *)ins_addr;
162 p += skip_prefix(p, &shorted, &enlarged, &rexr);
163 p += get_opcode(p, &opcode);
164
165 for (i = 0; i < ARRAY_SIZE(rw8); i++)
166 if (rw8[i] == opcode)
167 return 1;
168
169 for (i = 0; i < ARRAY_SIZE(rw32); i++)
170 if (rw32[i] == opcode)
171 return (shorted ? 2 : (enlarged ? 8 : 4));
172
173 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
174 return 0;
175}
176
177unsigned int get_ins_mem_width(unsigned long ins_addr)
178{
179 unsigned int opcode;
180 unsigned char *p;
181 int i, shorted, enlarged, rexr;
182
183 p = (unsigned char *)ins_addr;
184 p += skip_prefix(p, &shorted, &enlarged, &rexr);
185 p += get_opcode(p, &opcode);
186
187 for (i = 0; i < ARRAY_SIZE(mw8); i++)
188 if (mw8[i] == opcode)
189 return 1;
190
191 for (i = 0; i < ARRAY_SIZE(mw16); i++)
192 if (mw16[i] == opcode)
193 return 2;
194
195 for (i = 0; i < ARRAY_SIZE(mw32); i++)
196 if (mw32[i] == opcode)
197 return shorted ? 2 : 4;
198
199 for (i = 0; i < ARRAY_SIZE(mw64); i++)
200 if (mw64[i] == opcode)
201 return shorted ? 2 : (enlarged ? 8 : 4);
202
203 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
204 return 0;
205}
206
207/*
208 * Define the register identifiers used in the mod/rm byte.
209 * Note: these are NOT the same as in ptrace-abi.h.
210 */
211enum {
212 arg_AL = 0,
213 arg_CL = 1,
214 arg_DL = 2,
215 arg_BL = 3,
216 arg_AH = 4,
217 arg_CH = 5,
218 arg_DH = 6,
219 arg_BH = 7,
220
221 arg_AX = 0,
222 arg_CX = 1,
223 arg_DX = 2,
224 arg_BX = 3,
225 arg_SP = 4,
226 arg_BP = 5,
227 arg_SI = 6,
228 arg_DI = 7,
229#ifdef __amd64__
230 arg_R8 = 8,
231 arg_R9 = 9,
232 arg_R10 = 10,
233 arg_R11 = 11,
234 arg_R12 = 12,
235 arg_R13 = 13,
236 arg_R14 = 14,
237 arg_R15 = 15
238#endif
239};
240
241static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
242{
243 unsigned char *rv = NULL;
244
245 switch (no) {
246 case arg_AL:
247 rv = (unsigned char *)&regs->ax;
248 break;
249 case arg_BL:
250 rv = (unsigned char *)&regs->bx;
251 break;
252 case arg_CL:
253 rv = (unsigned char *)&regs->cx;
254 break;
255 case arg_DL:
256 rv = (unsigned char *)&regs->dx;
257 break;
258 case arg_AH:
259 rv = 1 + (unsigned char *)&regs->ax;
260 break;
261 case arg_BH:
262 rv = 1 + (unsigned char *)&regs->bx;
263 break;
264 case arg_CH:
265 rv = 1 + (unsigned char *)&regs->cx;
266 break;
267 case arg_DH:
268 rv = 1 + (unsigned char *)&regs->dx;
269 break;
270#ifdef __amd64__
271 case arg_R8:
272 rv = (unsigned char *)&regs->r8;
273 break;
274 case arg_R9:
275 rv = (unsigned char *)&regs->r9;
276 break;
277 case arg_R10:
278 rv = (unsigned char *)&regs->r10;
279 break;
280 case arg_R11:
281 rv = (unsigned char *)&regs->r11;
282 break;
283 case arg_R12:
284 rv = (unsigned char *)&regs->r12;
285 break;
286 case arg_R13:
287 rv = (unsigned char *)&regs->r13;
288 break;
289 case arg_R14:
290 rv = (unsigned char *)&regs->r14;
291 break;
292 case arg_R15:
293 rv = (unsigned char *)&regs->r15;
294 break;
295#endif
296 default:
297 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
298 break;
299 }
300 return rv;
301}
302
303static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
304{
305 unsigned long *rv = NULL;
306
307 switch (no) {
308 case arg_AX:
309 rv = &regs->ax;
310 break;
311 case arg_BX:
312 rv = &regs->bx;
313 break;
314 case arg_CX:
315 rv = &regs->cx;
316 break;
317 case arg_DX:
318 rv = &regs->dx;
319 break;
320 case arg_SP:
321 rv = &regs->sp;
322 break;
323 case arg_BP:
324 rv = &regs->bp;
325 break;
326 case arg_SI:
327 rv = &regs->si;
328 break;
329 case arg_DI:
330 rv = &regs->di;
331 break;
332#ifdef __amd64__
333 case arg_R8:
334 rv = &regs->r8;
335 break;
336 case arg_R9:
337 rv = &regs->r9;
338 break;
339 case arg_R10:
340 rv = &regs->r10;
341 break;
342 case arg_R11:
343 rv = &regs->r11;
344 break;
345 case arg_R12:
346 rv = &regs->r12;
347 break;
348 case arg_R13:
349 rv = &regs->r13;
350 break;
351 case arg_R14:
352 rv = &regs->r14;
353 break;
354 case arg_R15:
355 rv = &regs->r15;
356 break;
357#endif
358 default:
359 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
360 }
361
362 return rv;
363}
364
365unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
366{
367 unsigned int opcode;
368 unsigned char mod_rm;
369 int reg;
370 unsigned char *p;
371 int i, shorted, enlarged, rexr;
372 unsigned long rv;
373
374 p = (unsigned char *)ins_addr;
375 p += skip_prefix(p, &shorted, &enlarged, &rexr);
376 p += get_opcode(p, &opcode);
377 for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
378 if (reg_rop[i] == opcode) {
379 rv = REG_READ;
380 goto do_work;
381 }
382
383 for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
384 if (reg_wop[i] == opcode) {
385 rv = REG_WRITE;
386 goto do_work;
387 }
388
389 printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
390 "0x%02x\n", opcode);
391 goto err;
392
393do_work:
394 mod_rm = *p;
395 reg = ((mod_rm >> 3) & 0x7) | (rexr << 3);
396 switch (get_ins_reg_width(ins_addr)) {
397 case 1:
398 return *get_reg_w8(reg, regs);
399
400 case 2:
401 return *(unsigned short *)get_reg_w32(reg, regs);
402
403 case 4:
404 return *(unsigned int *)get_reg_w32(reg, regs);
405
406#ifdef __amd64__
407 case 8:
408 return *(unsigned long *)get_reg_w32(reg, regs);
409#endif
410
411 default:
412 printk(KERN_ERR "mmiotrace: Error width# %d\n", reg);
413 }
414
415err:
416 return 0;
417}
418
419unsigned long get_ins_imm_val(unsigned long ins_addr)
420{
421 unsigned int opcode;
422 unsigned char mod_rm;
423 unsigned char mod;
424 unsigned char *p;
425 int i, shorted, enlarged, rexr;
426 unsigned long rv;
427
428 p = (unsigned char *)ins_addr;
429 p += skip_prefix(p, &shorted, &enlarged, &rexr);
430 p += get_opcode(p, &opcode);
431 for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
432 if (imm_wop[i] == opcode) {
433 rv = IMM_WRITE;
434 goto do_work;
435 }
436
437 printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
438 "0x%02x\n", opcode);
439 goto err;
440
441do_work:
442 mod_rm = *p;
443 mod = mod_rm >> 6;
444 p++;
445 switch (mod) {
446 case 0:
447 /* if r/m is 5 we have a 32-bit disp (IA32 Manual 3, Table 2-2) */
448 /* AMD64: XXX Check for address size prefix? */
449 if ((mod_rm & 0x7) == 0x5)
450 p += 4;
451 break;
452
453 case 1:
454 p += 1;
455 break;
456
457 case 2:
458 p += 4;
459 break;
460
461 case 3:
462 default:
463 printk(KERN_ERR "mmiotrace: not a memory access instruction "
464 "at 0x%lx, rm_mod=0x%02x\n",
465 ins_addr, mod_rm);
466 }
467
468 switch (get_ins_reg_width(ins_addr)) {
469 case 1:
470 return *(unsigned char *)p;
471
472 case 2:
473 return *(unsigned short *)p;
474
475 case 4:
476 return *(unsigned int *)p;
477
478#ifdef __amd64__
479 case 8:
480 return *(unsigned long *)p;
481#endif
482
483 default:
484 printk(KERN_ERR "mmiotrace: Error: width.\n");
485 }
486
487err:
488 return 0;
489}
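
The decoder above works in three steps: skip any legacy/REX prefixes, pull out a one- or two-byte opcode, then read the register field out of the mod/rm byte. Below is a stand-alone sketch of the same steps for one hard-coded instruction, mov %rcx,(%rax) (bytes 48 89 08); it is an illustration only, not part of the patch and not kernel code.

/*
 * Decode "mov %rcx,(%rax)" (48 89 08) the way pf_in.c does, reduced to
 * the REX.W-only case.  Expected output:
 *   opcode 0x89, reg 1, width 8 bytes
 */
#include <stdio.h>

int main(void)
{
	unsigned char ins[] = { 0x48, 0x89, 0x08 };
	unsigned int opcode, i = 0;
	unsigned char mod_rm;
	int enlarged = 0, reg;

	/* skip_prefix(), simplified: 0x48-0x4f is a REX prefix with W set */
	while ((ins[i] & 0xf8) == 0x48) {
		enlarged = 1;
		i++;
	}

	/* get_opcode(): 0x89 is a one-byte opcode, a register write */
	opcode = ins[i++];

	/* mod/rm byte: bits 3..5 name the source register, 1 == %rcx */
	mod_rm = ins[i];
	reg = (mod_rm >> 3) & 0x7;

	printf("opcode 0x%02x, reg %d, width %d bytes\n",
	       opcode, reg, enlarged ? 8 : 4);
	return 0;
}
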
diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h
new file mode 100644
index 000000000000..e05341a51a27
--- /dev/null
+++ b/arch/x86/mm/pf_in.h
@@ -0,0 +1,39 @@
1/*
2 * Fault Injection Test harness (FI)
3 * Copyright (C) Intel Corp.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
18 * USA.
19 *
20 */
21
22#ifndef __PF_H_
23#define __PF_H_
24
25enum reason_type {
26 NOT_ME, /* page fault is not in regions */
27 NOTHING, /* access to another point in the regions */
28 REG_READ, /* read from addr to reg */
29 REG_WRITE, /* write from reg to addr */
30 IMM_WRITE, /* write from imm to addr */
31 OTHERS /* Other instructions we cannot intercept */
32};
33
34enum reason_type get_ins_type(unsigned long ins_addr);
35unsigned int get_ins_mem_width(unsigned long ins_addr);
36unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs);
37unsigned long get_ins_imm_val(unsigned long ins_addr);
38
39#endif /* __PF_H_ */
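
This header is the whole interface the tracer needs: classify the faulting instruction, then fetch its access width and the value involved. As a hedged sketch of a caller (the function name log_mmio_access is invented here; the real consumer is mmio-mod.c in this series):

/*
 * Hypothetical caller, for illustration only.  Assumes kernel context
 * and the declarations from pf_in.h above.
 */
#include <linux/kernel.h>
#include <linux/ptrace.h>
#include "pf_in.h"

static void log_mmio_access(unsigned long ip, struct pt_regs *regs)
{
	unsigned int width = get_ins_mem_width(ip);

	switch (get_ins_type(ip)) {
	case REG_WRITE:
		printk(KERN_INFO "mmio write, %u bytes, value 0x%lx\n",
		       width, get_ins_reg_val(ip, regs));
		break;
	case IMM_WRITE:
		printk(KERN_INFO "mmio write, %u bytes, immediate 0x%lx\n",
		       width, get_ins_imm_val(ip));
		break;
	case REG_READ:
		printk(KERN_INFO "mmio read, %u bytes\n", width);
		break;
	default:
		break;
	}
}
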
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 50159764f694..557b2abceef8 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -2,6 +2,7 @@
2#include <asm/pgalloc.h> 2#include <asm/pgalloc.h>
3#include <asm/pgtable.h> 3#include <asm/pgtable.h>
4#include <asm/tlb.h> 4#include <asm/tlb.h>
5#include <asm/fixmap.h>
5 6
6pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 7pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
7{ 8{
@@ -65,12 +66,6 @@ static inline void pgd_list_del(pgd_t *pgd)
65static void pgd_ctor(void *p) 66static void pgd_ctor(void *p)
66{ 67{
67 pgd_t *pgd = p; 68 pgd_t *pgd = p;
68 unsigned long flags;
69
70 /* Clear usermode parts of PGD */
71 memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
72
73 spin_lock_irqsave(&pgd_lock, flags);
74 69
75 /* If the pgd points to a shared pagetable level (either the 70 /* If the pgd points to a shared pagetable level (either the
76 ptes in non-PAE, or shared PMD in PAE), then just copy the 71 ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -90,8 +85,6 @@ static void pgd_ctor(void *p)
90 /* list required to sync kernel mapping updates */ 85 /* list required to sync kernel mapping updates */
91 if (!SHARED_KERNEL_PMD) 86 if (!SHARED_KERNEL_PMD)
92 pgd_list_add(pgd); 87 pgd_list_add(pgd);
93
94 spin_unlock_irqrestore(&pgd_lock, flags);
95} 88}
96 89
97static void pgd_dtor(void *pgd) 90static void pgd_dtor(void *pgd)
@@ -119,6 +112,72 @@ static void pgd_dtor(void *pgd)
119 112
120#ifdef CONFIG_X86_PAE 113#ifdef CONFIG_X86_PAE
121/* 114/*
115 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
116 * updating the top-level pagetable entries to guarantee the
117 * processor notices the update. Since this is expensive, and
118 * all 4 top-level entries are used almost immediately in a
119 * new process's life, we just pre-populate them here.
120 *
121 * Also, if we're in a paravirt environment where the kernel pmd is
122 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
123 * and initialize the kernel pmds here.
124 */
125#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
126
127void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
128{
129 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
130
131 /* Note: almost everything apart from _PAGE_PRESENT is
132 reserved at the pmd (PDPT) level. */
133 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
134
135 /*
136 * According to Intel App note "TLBs, Paging-Structure Caches,
137 * and Their Invalidation", April 2007, document 317080-001,
138 * section 8.1: in PAE mode we explicitly have to flush the
139 * TLB via cr3 if the top-level pgd is changed...
140 */
141 if (mm == current->active_mm)
142 write_cr3(read_cr3());
143}
144#else /* !CONFIG_X86_PAE */
145
146/* No need to prepopulate any pagetable entries in non-PAE modes. */
147#define PREALLOCATED_PMDS 0
148
149#endif /* CONFIG_X86_PAE */
150
151static void free_pmds(pmd_t *pmds[])
152{
153 int i;
154
155 for(i = 0; i < PREALLOCATED_PMDS; i++)
156 if (pmds[i])
157 free_page((unsigned long)pmds[i]);
158}
159
160static int preallocate_pmds(pmd_t *pmds[])
161{
162 int i;
163 bool failed = false;
164
165 for(i = 0; i < PREALLOCATED_PMDS; i++) {
166 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
167 if (pmd == NULL)
168 failed = true;
169 pmds[i] = pmd;
170 }
171
172 if (failed) {
173 free_pmds(pmds);
174 return -ENOMEM;
175 }
176
177 return 0;
178}
179
180/*
122 * Mop up any pmd pages which may still be attached to the pgd. 181 * Mop up any pmd pages which may still be attached to the pgd.
123 * Normally they will be freed by munmap/exit_mmap, but any pmd we 182 * Normally they will be freed by munmap/exit_mmap, but any pmd we
124 * preallocate which never got a corresponding vma will need to be 183 * preallocate which never got a corresponding vma will need to be
@@ -128,7 +187,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
128{ 187{
129 int i; 188 int i;
130 189
131 for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { 190 for(i = 0; i < PREALLOCATED_PMDS; i++) {
132 pgd_t pgd = pgdp[i]; 191 pgd_t pgd = pgdp[i];
133 192
134 if (pgd_val(pgd) != 0) { 193 if (pgd_val(pgd) != 0) {
@@ -142,32 +201,17 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
142 } 201 }
143} 202}
144 203
145/* 204static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
146 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
147 * updating the top-level pagetable entries to guarantee the
148 * processor notices the update. Since this is expensive, and
149 * all 4 top-level entries are used almost immediately in a
150 * new process's life, we just pre-populate them here.
151 *
152 * Also, if we're in a paravirt environment where the kernel pmd is
153 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
154 * and initialize the kernel pmds here.
155 */
156static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
157{ 205{
158 pud_t *pud; 206 pud_t *pud;
159 unsigned long addr; 207 unsigned long addr;
160 int i; 208 int i;
161 209
162 pud = pud_offset(pgd, 0); 210 pud = pud_offset(pgd, 0);
163 for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
164 i++, pud++, addr += PUD_SIZE) {
165 pmd_t *pmd = pmd_alloc_one(mm, addr);
166 211
167 if (!pmd) { 212 for (addr = i = 0; i < PREALLOCATED_PMDS;
168 pgd_mop_up_pmds(mm, pgd); 213 i++, pud++, addr += PUD_SIZE) {
169 return 0; 214 pmd_t *pmd = pmds[i];
170 }
171 215
172 if (i >= KERNEL_PGD_BOUNDARY) 216 if (i >= KERNEL_PGD_BOUNDARY)
173 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), 217 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
@@ -175,61 +219,54 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
175 219
176 pud_populate(mm, pud, pmd); 220 pud_populate(mm, pud, pmd);
177 } 221 }
178
179 return 1;
180} 222}
181 223
182void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) 224pgd_t *pgd_alloc(struct mm_struct *mm)
183{ 225{
184 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); 226 pgd_t *pgd;
227 pmd_t *pmds[PREALLOCATED_PMDS];
228 unsigned long flags;
185 229
186 /* Note: almost everything apart from _PAGE_PRESENT is 230 pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
187 reserved at the pmd (PDPT) level. */
188 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
189 231
190 /* 232 if (pgd == NULL)
191 * According to Intel App note "TLBs, Paging-Structure Caches, 233 goto out;
192 * and Their Invalidation", April 2007, document 317080-001,
193 * section 8.1: in PAE mode we explicitly have to flush the
194 * TLB via cr3 if the top-level pgd is changed...
195 */
196 if (mm == current->active_mm)
197 write_cr3(read_cr3());
198}
199#else /* !CONFIG_X86_PAE */
200/* No need to prepopulate any pagetable entries in non-PAE modes. */
201static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
202{
203 return 1;
204}
205 234
206static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd) 235 mm->pgd = pgd;
207{
208}
209#endif /* CONFIG_X86_PAE */
210 236
211pgd_t *pgd_alloc(struct mm_struct *mm) 237 if (preallocate_pmds(pmds) != 0)
212{ 238 goto out_free_pgd;
213 pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
214 239
215 /* so that alloc_pmd can use it */ 240 if (paravirt_pgd_alloc(mm) != 0)
216 mm->pgd = pgd; 241 goto out_free_pmds;
217 if (pgd)
218 pgd_ctor(pgd);
219 242
220 if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { 243 /*
221 pgd_dtor(pgd); 244 * Make sure that pre-populating the pmds is atomic with
222 free_page((unsigned long)pgd); 245 * respect to anything walking the pgd_list, so that they
223 pgd = NULL; 246 * never see a partially populated pgd.
224 } 247 */
248 spin_lock_irqsave(&pgd_lock, flags);
249
250 pgd_ctor(pgd);
251 pgd_prepopulate_pmd(mm, pgd, pmds);
252
253 spin_unlock_irqrestore(&pgd_lock, flags);
225 254
226 return pgd; 255 return pgd;
256
257out_free_pmds:
258 free_pmds(pmds);
259out_free_pgd:
260 free_page((unsigned long)pgd);
261out:
262 return NULL;
227} 263}
228 264
229void pgd_free(struct mm_struct *mm, pgd_t *pgd) 265void pgd_free(struct mm_struct *mm, pgd_t *pgd)
230{ 266{
231 pgd_mop_up_pmds(mm, pgd); 267 pgd_mop_up_pmds(mm, pgd);
232 pgd_dtor(pgd); 268 pgd_dtor(pgd);
269 paravirt_pgd_free(mm, pgd);
233 free_page((unsigned long)pgd); 270 free_page((unsigned long)pgd);
234} 271}
235 272
@@ -255,7 +292,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
255 292
256 if (pte_young(*ptep)) 293 if (pte_young(*ptep))
257 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, 294 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
258 &ptep->pte); 295 (unsigned long *) &ptep->pte);
259 296
260 if (ret) 297 if (ret)
261 pte_update(vma->vm_mm, addr, ptep); 298 pte_update(vma->vm_mm, addr, ptep);
@@ -274,3 +311,22 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
274 311
275 return young; 312 return young;
276} 313}
314
315int fixmaps_set;
316
317void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
318{
319 unsigned long address = __fix_to_virt(idx);
320
321 if (idx >= __end_of_fixed_addresses) {
322 BUG();
323 return;
324 }
325 set_pte_vaddr(address, pte);
326 fixmaps_set++;
327}
328
329void native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
330{
331 __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
332}
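
The new __native_set_fixmap() above turns a compile-time fixmap index into a single pte install at a fixed virtual address; the index-to-address step is plain arithmetic counting pages down from the fixmap top. A small sketch of that arithmetic follows (the real __fix_to_virt() lives in asm/fixmap*.h; the 0xfffff000 top matches the 32-bit default shown in pgtable_32.c below):

/*
 * Sketch of __fix_to_virt(): each fixmap index is one page, counted
 * downwards from FIXADDR_TOP.
 */
#include <stdio.h>

#define PAGE_SHIFT		12
#define FIXADDR_TOP		0xfffff000UL
#define __fix_to_virt(x)	(FIXADDR_TOP - ((x) << PAGE_SHIFT))

int main(void)
{
	unsigned int idx;

	for (idx = 0; idx < 4; idx++)
		printf("fixmap index %u -> 0x%08lx\n", idx, __fix_to_virt(idx));
	return 0;
}
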
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 369cf065b6a4..b4becbf8c570 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -71,7 +71,7 @@ void show_mem(void)
71 * Associate a virtual page frame with a given physical page frame 71 * Associate a virtual page frame with a given physical page frame
72 * and protection flags for that frame. 72 * and protection flags for that frame.
73 */ 73 */
74static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) 74void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
75{ 75{
76 pgd_t *pgd; 76 pgd_t *pgd;
77 pud_t *pud; 77 pud_t *pud;
@@ -94,8 +94,8 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
94 return; 94 return;
95 } 95 }
96 pte = pte_offset_kernel(pmd, vaddr); 96 pte = pte_offset_kernel(pmd, vaddr);
97 if (pgprot_val(flags)) 97 if (pte_val(pteval))
98 set_pte_present(&init_mm, vaddr, pte, pfn_pte(pfn, flags)); 98 set_pte_present(&init_mm, vaddr, pte, pteval);
99 else 99 else
100 pte_clear(&init_mm, vaddr, pte); 100 pte_clear(&init_mm, vaddr, pte);
101 101
@@ -141,22 +141,9 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
141 __flush_tlb_one(vaddr); 141 __flush_tlb_one(vaddr);
142} 142}
143 143
144static int fixmaps;
145unsigned long __FIXADDR_TOP = 0xfffff000; 144unsigned long __FIXADDR_TOP = 0xfffff000;
146EXPORT_SYMBOL(__FIXADDR_TOP); 145EXPORT_SYMBOL(__FIXADDR_TOP);
147 146
148void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
149{
150 unsigned long address = __fix_to_virt(idx);
151
152 if (idx >= __end_of_fixed_addresses) {
153 BUG();
154 return;
155 }
156 set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
157 fixmaps++;
158}
159
160/** 147/**
161 * reserve_top_address - reserves a hole in the top of kernel address space 148 * reserve_top_address - reserves a hole in the top of kernel address space
162 * @reserve - size of hole to reserve 149 * @reserve - size of hole to reserve
@@ -164,11 +151,44 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
164 * Can be used to relocate the fixmap area and poke a hole in the top 151 * Can be used to relocate the fixmap area and poke a hole in the top
165 * of kernel address space to make room for a hypervisor. 152 * of kernel address space to make room for a hypervisor.
166 */ 153 */
167void reserve_top_address(unsigned long reserve) 154void __init reserve_top_address(unsigned long reserve)
168{ 155{
169 BUG_ON(fixmaps > 0); 156 BUG_ON(fixmaps_set > 0);
170 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", 157 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
171 (int)-reserve); 158 (int)-reserve);
172 __FIXADDR_TOP = -reserve - PAGE_SIZE; 159 __FIXADDR_TOP = -reserve - PAGE_SIZE;
173 __VMALLOC_RESERVE += reserve; 160 __VMALLOC_RESERVE += reserve;
174} 161}
162
163/*
164 * vmalloc=size forces the vmalloc area to be exactly 'size'
165 * bytes. This can be used to increase (or decrease) the
166 * vmalloc area - the default is 128m.
167 */
168static int __init parse_vmalloc(char *arg)
169{
170 if (!arg)
171 return -EINVAL;
172
173 __VMALLOC_RESERVE = memparse(arg, &arg);
174 return 0;
175}
176early_param("vmalloc", parse_vmalloc);
177
178/*
179 * reservetop=size reserves a hole at the top of the kernel address space which
180 * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
181 * so relocating the fixmap can be done before paging initialization.
182 */
183static int __init parse_reservetop(char *arg)
184{
185 unsigned long address;
186
187 if (!arg)
188 return -EINVAL;
189
190 address = memparse(arg, &arg);
191 reserve_top_address(address);
192 return 0;
193}
194early_param("reservetop", parse_reservetop);
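
Both new early parameters take a size with memparse() suffixes, so a command line such as vmalloc=192m reservetop=64m (example values only) grows the vmalloc reservation and moves the fixmap top down before paging is set up. A rough user-space approximation of the suffix handling, only to show what the argument strings look like; the real memparse() is in lib/cmdline.c:

/*
 * Approximation of memparse() suffix handling, illustration only.
 */
#include <stdio.h>
#include <stdlib.h>

static unsigned long parse_size(const char *s)
{
	char *end;
	unsigned long val = strtoul(s, &end, 0);

	switch (*end) {
	case 'g': case 'G': val <<= 10;	/* fall through */
	case 'm': case 'M': val <<= 10;	/* fall through */
	case 'k': case 'K': val <<= 10;
	}
	return val;
}

int main(void)
{
	printf("vmalloc=192m   -> %lu bytes\n", parse_size("192m"));
	printf("reservetop=64m -> %lu bytes\n", parse_size("64m"));
	return 0;
}
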
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
new file mode 100644
index 000000000000..1eb2973a301c
--- /dev/null
+++ b/arch/x86/mm/srat_32.c
@@ -0,0 +1,279 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/mmzone.h>
29#include <linux/acpi.h>
30#include <linux/nodemask.h>
31#include <asm/srat.h>
32#include <asm/topology.h>
33#include <asm/smp.h>
34#include <asm/e820.h>
35
36/*
37 * proximity macros and definitions
38 */
39#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
40#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
41#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
42#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
43/* bitmap length; _PXM is at most 255 */
44#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
45static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
46
47#define MAX_CHUNKS_PER_NODE 3
48#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
49struct node_memory_chunk_s {
50 unsigned long start_pfn;
51 unsigned long end_pfn;
52 u8 pxm; // proximity domain of node
53 u8 nid; // which cnode contains this chunk?
54 u8 bank; // which mem bank on this node
55};
56static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
57
58static int __initdata num_memory_chunks; /* total number of memory chunks */
59static u8 __initdata apicid_to_pxm[MAX_APICID];
60
61int numa_off __initdata;
62int acpi_numa __initdata;
63
64static __init void bad_srat(void)
65{
66 printk(KERN_ERR "SRAT: SRAT not used.\n");
67 acpi_numa = -1;
68 num_memory_chunks = 0;
69}
70
71static __init inline int srat_disabled(void)
72{
73 return numa_off || acpi_numa < 0;
74}
75
76/* Identify CPU proximity domains */
77void __init
78acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
79{
80 if (srat_disabled())
81 return;
82 if (cpu_affinity->header.length !=
83 sizeof(struct acpi_srat_cpu_affinity)) {
84 bad_srat();
85 return;
86 }
87
88 if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
89 return; /* empty entry */
90
91 /* mark this node as "seen" in node bitmap */
92 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
93
94 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
95
96 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
97 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
98}
99
100/*
101 * Identify memory proximity domains and hot-remove capabilities.
102 * Fill node memory chunk list structure.
103 */
104void __init
105acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
106{
107 unsigned long long paddr, size;
108 unsigned long start_pfn, end_pfn;
109 u8 pxm;
110 struct node_memory_chunk_s *p, *q, *pend;
111
112 if (srat_disabled())
113 return;
114 if (memory_affinity->header.length !=
115 sizeof(struct acpi_srat_mem_affinity)) {
116 bad_srat();
117 return;
118 }
119
120 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
121 return; /* empty entry */
122
123 pxm = memory_affinity->proximity_domain & 0xff;
124
125 /* mark this node as "seen" in node bitmap */
126 BMAP_SET(pxm_bitmap, pxm);
127
128 /* calculate info for memory chunk structure */
129 paddr = memory_affinity->base_address;
130 size = memory_affinity->length;
131
132 start_pfn = paddr >> PAGE_SHIFT;
133 end_pfn = (paddr + size) >> PAGE_SHIFT;
134
135
136 if (num_memory_chunks >= MAXCHUNKS) {
137 printk(KERN_WARNING "Too many mem chunks in SRAT."
138 " Ignoring %lld MBytes at %llx\n",
139 size/(1024*1024), paddr);
140 return;
141 }
142
143 /* Insertion sort based on base address */
144 pend = &node_memory_chunk[num_memory_chunks];
145 for (p = &node_memory_chunk[0]; p < pend; p++) {
146 if (start_pfn < p->start_pfn)
147 break;
148 }
149 if (p < pend) {
150 for (q = pend; q >= p; q--)
151 *(q + 1) = *q;
152 }
153 p->start_pfn = start_pfn;
154 p->end_pfn = end_pfn;
155 p->pxm = pxm;
156
157 num_memory_chunks++;
158
159 printk(KERN_DEBUG "Memory range %08lx to %08lx"
160 " in proximity domain %02x %s\n",
161 start_pfn, end_pfn,
162 pxm,
163 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
164 "enabled and removable" : "enabled" ) );
165}
166
167/* Callback for SLIT parsing */
168void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
169{
170}
171
172void acpi_numa_arch_fixup(void)
173{
174}
175/*
176 * The SRAT table always lists ascending addresses, so we can always
177 * assume that the first "start" address that you see is the real
178 * start of the node, and that the current "end" address is after
179 * the previous one.
180 */
181static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
182{
183 /*
184 * Only add present memory as told by the e820.
185 * There is no guarantee from the SRAT that the memory it
186 * enumerates is present at boot time because it represents
187 * *possible* memory hotplug areas the same as normal RAM.
188 */
189 if (memory_chunk->start_pfn >= max_pfn) {
190 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
191 memory_chunk->start_pfn, memory_chunk->end_pfn);
192 return;
193 }
194 if (memory_chunk->nid != nid)
195 return;
196
197 if (!node_has_online_mem(nid))
198 node_start_pfn[nid] = memory_chunk->start_pfn;
199
200 if (node_start_pfn[nid] > memory_chunk->start_pfn)
201 node_start_pfn[nid] = memory_chunk->start_pfn;
202
203 if (node_end_pfn[nid] < memory_chunk->end_pfn)
204 node_end_pfn[nid] = memory_chunk->end_pfn;
205}
206
207int __init get_memcfg_from_srat(void)
208{
209 int i, j, nid;
210
211
212 if (srat_disabled())
213 goto out_fail;
214
215 if (num_memory_chunks == 0) {
216 printk(KERN_WARNING
217 "could not find any ACPI SRAT memory areas.\n");
218 goto out_fail;
219 }
220
221 /* Calculate total number of nodes in system from PXM bitmap and create
222 * a set of sequential node IDs starting at zero. (ACPI doesn't seem
223 * to specify the range of _PXM values.)
224 */
225 /*
226 * MCD - we no longer HAVE to number nodes sequentially. PXM domain
227 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
228 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
229 * approaches MAX_PXM_DOMAINS for i386.
230 */
231 nodes_clear(node_online_map);
232 for (i = 0; i < MAX_PXM_DOMAINS; i++) {
233 if (BMAP_TEST(pxm_bitmap, i)) {
234 int nid = acpi_map_pxm_to_node(i);
235 node_set_online(nid);
236 }
237 }
238 BUG_ON(num_online_nodes() == 0);
239
240 /* set cnode id in memory chunk structure */
241 for (i = 0; i < num_memory_chunks; i++)
242 node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
243
244 printk(KERN_DEBUG "pxm bitmap: ");
245 for (i = 0; i < sizeof(pxm_bitmap); i++) {
246 printk(KERN_CONT "%02x ", pxm_bitmap[i]);
247 }
248 printk(KERN_CONT "\n");
249 printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
250 num_online_nodes());
251 printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
252 num_memory_chunks);
253
254 for (i = 0; i < MAX_APICID; i++)
255 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
256
257 for (j = 0; j < num_memory_chunks; j++){
258 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
259 printk(KERN_DEBUG
260 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
261 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
262 node_read_chunk(chunk->nid, chunk);
263 e820_register_active_regions(chunk->nid, chunk->start_pfn,
264 min(chunk->end_pfn, max_pfn));
265 }
266
267 for_each_online_node(nid) {
268 unsigned long start = node_start_pfn[nid];
269 unsigned long end = min(node_end_pfn[nid], max_pfn);
270
271 memory_present(nid, start, end);
272 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
273 }
274 return 1;
275out_fail:
276 printk(KERN_ERR "failed to get NUMA memory information from SRAT"
277 " table\n");
278 return 0;
279}
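
The pxm_bitmap above records which proximity domains the SRAT mentioned, one bit per domain and eight domains per byte; that is all the later PXM-to-node loop walks. A stand-alone sketch of the same bit packing (illustration only; MAX_PXM_DOMAINS is taken as 256 here since _PXM fits in a byte):

/* Illustration of the pxm_bitmap bit packing used above. */
#include <stdio.h>

#define MAX_PXM_DOMAINS		256
#define NODE_ARRAY_INDEX(x)	((x) / 8)
#define NODE_ARRAY_OFFSET(x)	((x) % 8)
#define BMAP_SET(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
#define BMAP_TEST(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))

int main(void)
{
	unsigned char bmap[MAX_PXM_DOMAINS / 8] = { 0 };

	BMAP_SET(bmap, 0x12);			/* proximity domain 18 */
	printf("domain 0x12: byte %d, bit %d, set=%d\n",
	       NODE_ARRAY_INDEX(0x12), NODE_ARRAY_OFFSET(0x12),
	       BMAP_TEST(bmap, 0x12) ? 1 : 0);
	return 0;
}
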
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 3890234e5b26..1b4763e26ea9 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -97,37 +97,22 @@ static __init inline int srat_disabled(void)
97 return numa_off || acpi_numa < 0; 97 return numa_off || acpi_numa < 0;
98} 98}
99 99
100/*
101 * A lot of BIOS fill in 10 (= no distance) everywhere. This messes
102 * up the NUMA heuristics which wants the local node to have a smaller
103 * distance than the others.
104 * Do some quick checks here and only use the SLIT if it passes.
105 */
106static __init int slit_valid(struct acpi_table_slit *slit)
107{
108 int i, j;
109 int d = slit->locality_count;
110 for (i = 0; i < d; i++) {
111 for (j = 0; j < d; j++) {
112 u8 val = slit->entry[d*i + j];
113 if (i == j) {
114 if (val != LOCAL_DISTANCE)
115 return 0;
116 } else if (val <= LOCAL_DISTANCE)
117 return 0;
118 }
119 }
120 return 1;
121}
122
123/* Callback for SLIT parsing */ 100/* Callback for SLIT parsing */
124void __init acpi_numa_slit_init(struct acpi_table_slit *slit) 101void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
125{ 102{
126 if (!slit_valid(slit)) { 103 unsigned length;
127 printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n"); 104 unsigned long phys;
128 return; 105
129 } 106 length = slit->header.length;
130 acpi_slit = slit; 107 phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
108 PAGE_SIZE);
109
110 if (phys == -1L)
111 panic(" Can not save slit!\n");
112
113 acpi_slit = __va(phys);
114 memcpy(acpi_slit, slit, length);
115 reserve_early(phys, phys + length, "ACPI SLIT");
131} 116}
132 117
133/* Callback for Proximity Domain -> LAPIC mapping */ 118/* Callback for Proximity Domain -> LAPIC mapping */
@@ -326,7 +311,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
326 pxmram = 0; 311 pxmram = 0;
327 } 312 }
328 313
329 e820ram = end_pfn - absent_pages_in_range(0, end_pfn); 314 e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
330 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ 315 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
331 if ((long)(e820ram - pxmram) >= 1*1024*1024) { 316 if ((long)(e820ram - pxmram) >= 1*1024*1024) {
332 printk(KERN_ERR 317 printk(KERN_ERR
@@ -403,7 +388,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
403 if (node == NUMA_NO_NODE) 388 if (node == NUMA_NO_NODE)
404 continue; 389 continue;
405 if (!node_isset(node, node_possible_map)) 390 if (!node_isset(node, node_possible_map))
406 numa_set_node(i, NUMA_NO_NODE); 391 numa_clear_node(i);
407 } 392 }
408 numa_init_array(); 393 numa_init_array();
409 return 0; 394 return 0;
@@ -522,6 +507,7 @@ int __node_distance(int a, int b)
522 507
523EXPORT_SYMBOL(__node_distance); 508EXPORT_SYMBOL(__node_distance);
524 509
510#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
525int memory_add_physaddr_to_nid(u64 start) 511int memory_add_physaddr_to_nid(u64 start)
526{ 512{
527 int i, ret = 0; 513 int i, ret = 0;
@@ -533,4 +519,4 @@ int memory_add_physaddr_to_nid(u64 start)
533 return ret; 519 return ret;
534} 520}
535EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 521EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
536 522#endif
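
The SLIT that acpi_numa_slit_init() now copies into an early-reserved buffer is a flat d x d byte matrix of node distances: locality_count followed by entry[d*i + j], as the deleted validation loop spelled out, and __node_distance() reads it back the same way. A minimal sketch with a made-up two-node table (10 is the conventional local distance):

/* Illustration only: indexing a flattened 2x2 SLIT distance matrix. */
#include <stdio.h>

int main(void)
{
	unsigned char locality_count = 2;
	unsigned char entry[] = {
		10, 20,		/* node 0 -> nodes 0, 1 */
		20, 10,		/* node 1 -> nodes 0, 1 */
	};
	int a = 0, b = 1;

	printf("distance(%d,%d) = %d\n", a, b,
	       entry[locality_count * a + b]);
	return 0;
}
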
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
new file mode 100644
index 000000000000..d877c5b423ef
--- /dev/null
+++ b/arch/x86/mm/testmmiotrace.c
@@ -0,0 +1,71 @@
1/*
2 * Written by Pekka Paalanen, 2008 <pq@iki.fi>
3 */
4#include <linux/module.h>
5#include <linux/io.h>
6
7#define MODULE_NAME "testmmiotrace"
8
9static unsigned long mmio_address;
10module_param(mmio_address, ulong, 0);
11MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
12
13static void do_write_test(void __iomem *p)
14{
15 unsigned int i;
16 for (i = 0; i < 256; i++)
17 iowrite8(i, p + i);
18 for (i = 1024; i < (5 * 1024); i += 2)
19 iowrite16(i * 12 + 7, p + i);
20 for (i = (5 * 1024); i < (16 * 1024); i += 4)
21 iowrite32(i * 212371 + 13, p + i);
22}
23
24static void do_read_test(void __iomem *p)
25{
26 unsigned int i;
27 for (i = 0; i < 256; i++)
28 ioread8(p + i);
29 for (i = 1024; i < (5 * 1024); i += 2)
30 ioread16(p + i);
31 for (i = (5 * 1024); i < (16 * 1024); i += 4)
32 ioread32(p + i);
33}
34
35static void do_test(void)
36{
37 void __iomem *p = ioremap_nocache(mmio_address, 0x4000);
38 if (!p) {
39 pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
40 return;
41 }
42 do_write_test(p);
43 do_read_test(p);
44 iounmap(p);
45}
46
47static int __init init(void)
48{
49 if (mmio_address == 0) {
50 pr_err(MODULE_NAME ": you have to use the module argument "
51 "mmio_address.\n");
52 pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
53 " YOU REALLY KNOW WHAT YOU ARE DOING!\n");
54 return -ENXIO;
55 }
56
57 pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx "
58 "in PCI address space, and writing "
59 "rubbish in there.\n", mmio_address);
60 do_test();
61 return 0;
62}
63
64static void __exit cleanup(void)
65{
66 pr_debug(MODULE_NAME ": unloaded.\n");
67}
68
69module_init(init);
70module_exit(cleanup);
71MODULE_LICENSE("GPL");