Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile            11
-rw-r--r--  arch/x86/mm/discontig_32.c     288
-rw-r--r--  arch/x86/mm/dump_pagetables.c   12
-rw-r--r--  arch/x86/mm/fault.c            118
-rw-r--r--  arch/x86/mm/gup.c              298
-rw-r--r--  arch/x86/mm/hugetlbpage.c       78
-rw-r--r--  arch/x86/mm/init_32.c          529
-rw-r--r--  arch/x86/mm/init_64.c          702
-rw-r--r--  arch/x86/mm/ioremap.c           87
-rw-r--r--  arch/x86/mm/k8topology_64.c     21
-rw-r--r--  arch/x86/mm/kmmio.c            510
-rw-r--r--  arch/x86/mm/memtest.c          123
-rw-r--r--  arch/x86/mm/mmio-mod.c         517
-rw-r--r--  arch/x86/mm/numa_64.c           99
-rw-r--r--  arch/x86/mm/pageattr-test.c     24
-rw-r--r--  arch/x86/mm/pageattr.c          82
-rw-r--r--  arch/x86/mm/pat.c              529
-rw-r--r--  arch/x86/mm/pf_in.c            489
-rw-r--r--  arch/x86/mm/pf_in.h             39
-rw-r--r--  arch/x86/mm/pgtable.c          193
-rw-r--r--  arch/x86/mm/pgtable_32.c       103
-rw-r--r--  arch/x86/mm/srat_32.c          283
-rw-r--r--  arch/x86/mm/srat_64.c           48
-rw-r--r--  arch/x86/mm/testmmiotrace.c     71
24 files changed, 4070 insertions, 1184 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index b7b3e4c7cfc9..dfb932dcf136 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,5 @@
 obj-y   := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
-           pat.o pgtable.o
+           pat.o pgtable.o gup.o
 
 obj-$(CONFIG_X86_32)            += pgtable_32.o
 
@@ -8,10 +8,17 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
 
 obj-$(CONFIG_HIGHMEM)           += highmem_32.o
 
+obj-$(CONFIG_MMIOTRACE_HOOKS)   += kmmio.o
+obj-$(CONFIG_MMIOTRACE)         += mmiotrace.o
+mmiotrace-y                     := pf_in.o mmio-mod.o
+obj-$(CONFIG_MMIOTRACE_TEST)    += testmmiotrace.o
+
 ifeq ($(CONFIG_X86_32),y)
 obj-$(CONFIG_NUMA)              += discontig_32.o
 else
 obj-$(CONFIG_NUMA)              += numa_64.o
 obj-$(CONFIG_K8_NUMA)           += k8topology_64.o
-obj-$(CONFIG_ACPI_NUMA)         += srat_64.o
 endif
+obj-$(CONFIG_ACPI_NUMA)         += srat_$(BITS).o
+
+obj-$(CONFIG_MEMTEST)           += memtest.o
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 914ccf983687..62fa440678d8 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -38,10 +38,10 @@
38#include <asm/setup.h> 38#include <asm/setup.h>
39#include <asm/mmzone.h> 39#include <asm/mmzone.h>
40#include <asm/bios_ebda.h> 40#include <asm/bios_ebda.h>
41#include <asm/proto.h>
41 42
42struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 43struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
43EXPORT_SYMBOL(node_data); 44EXPORT_SYMBOL(node_data);
44static bootmem_data_t node0_bdata;
45 45
46/* 46/*
47 * numa interface - we expect the numa architecture specific code to have 47 * numa interface - we expect the numa architecture specific code to have
@@ -59,14 +59,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
59/* 59/*
60 * 4) physnode_map - the mapping between a pfn and owning node 60 * 4) physnode_map - the mapping between a pfn and owning node
61 * physnode_map keeps track of the physical memory layout of a generic 61 * physnode_map keeps track of the physical memory layout of a generic
62 * numa node on a 256Mb break (each element of the array will 62 * numa node on a 64Mb break (each element of the array will
63 * represent 256Mb of memory and will be marked by the node id. so, 63 * represent 64Mb of memory and will be marked by the node id. so,
64 * if the first gig is on node 0, and the second gig is on node 1 64 * if the first gig is on node 0, and the second gig is on node 1
65 * physnode_map will contain: 65 * physnode_map will contain:
66 * 66 *
67 * physnode_map[0-3] = 0; 67 * physnode_map[0-15] = 0;
68 * physnode_map[4-7] = 1; 68 * physnode_map[16-31] = 1;
69 * physnode_map[8- ] = -1; 69 * physnode_map[32- ] = -1;
70 */ 70 */
71s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1}; 71s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
72EXPORT_SYMBOL(physnode_map); 72EXPORT_SYMBOL(physnode_map);
@@ -75,15 +75,15 @@ void memory_present(int nid, unsigned long start, unsigned long end)
75{ 75{
76 unsigned long pfn; 76 unsigned long pfn;
77 77
78 printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n", 78 printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
79 nid, start, end); 79 nid, start, end);
80 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); 80 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
81 printk(KERN_DEBUG " "); 81 printk(KERN_DEBUG " ");
82 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { 82 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
83 physnode_map[pfn / PAGES_PER_ELEMENT] = nid; 83 physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
84 printk("%ld ", pfn); 84 printk(KERN_CONT "%lx ", pfn);
85 } 85 }
86 printk("\n"); 86 printk(KERN_CONT "\n");
87} 87}
88 88
89unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, 89unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
@@ -99,7 +99,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
99#endif 99#endif
100 100
101extern unsigned long find_max_low_pfn(void); 101extern unsigned long find_max_low_pfn(void);
102extern void add_one_highpage_init(struct page *, int, int);
103extern unsigned long highend_pfn, highstart_pfn; 102extern unsigned long highend_pfn, highstart_pfn;
104 103
105#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) 104#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
@@ -117,13 +116,13 @@ static unsigned long kva_pages;
117 */ 116 */
118int __init get_memcfg_numa_flat(void) 117int __init get_memcfg_numa_flat(void)
119{ 118{
120 printk("NUMA - single node, flat memory mode\n"); 119 printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
121 120
122 /* Run the memory configuration and find the top of memory. */
123 propagate_e820_map();
124 node_start_pfn[0] = 0; 121 node_start_pfn[0] = 0;
125 node_end_pfn[0] = max_pfn; 122 node_end_pfn[0] = max_pfn;
123 e820_register_active_regions(0, 0, max_pfn);
126 memory_present(0, 0, max_pfn); 124 memory_present(0, 0, max_pfn);
125 node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
127 126
128 /* Indicate there is one node available. */ 127 /* Indicate there is one node available. */
129 nodes_clear(node_online_map); 128 nodes_clear(node_online_map);
@@ -156,24 +155,32 @@ static void __init propagate_e820_map_node(int nid)
156 */ 155 */
157static void __init allocate_pgdat(int nid) 156static void __init allocate_pgdat(int nid)
158{ 157{
159 if (nid && node_has_online_mem(nid)) 158 char buf[16];
159
160 if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
160 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; 161 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
161 else { 162 else {
162 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn)); 163 unsigned long pgdat_phys;
163 min_low_pfn += PFN_UP(sizeof(pg_data_t)); 164 pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
165 max_pfn_mapped<<PAGE_SHIFT,
166 sizeof(pg_data_t),
167 PAGE_SIZE);
168 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
169 memset(buf, 0, sizeof(buf));
170 sprintf(buf, "NODE_DATA %d", nid);
171 reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
164 } 172 }
173 printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
174 nid, (unsigned long)NODE_DATA(nid));
165} 175}
166 176
167#ifdef CONFIG_DISCONTIGMEM
168/* 177/*
169 * In the discontig memory model, a portion of the kernel virtual area (KVA) 178 * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
170 * is reserved and portions of nodes are mapped using it. This is to allow 179 * virtual address space (KVA) is reserved and portions of nodes are mapped
171 * node-local memory to be allocated for structures that would normally require 180 * using it. This is to allow node-local memory to be allocated for
172 * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers 181 * structures that would normally require ZONE_NORMAL. The memory is
173 * should be prepared to allocate from the bootmem allocator instead. This KVA 182 * allocated with alloc_remap() and callers should be prepared to allocate
174 * mechanism is incompatible with SPARSEMEM as it makes assumptions about the 183 * from the bootmem allocator instead.
175 * layout of memory that are broken if alloc_remap() succeeds for some of the
176 * map and fails for others
177 */ 184 */
178static unsigned long node_remap_start_pfn[MAX_NUMNODES]; 185static unsigned long node_remap_start_pfn[MAX_NUMNODES];
179static void *node_remap_end_vaddr[MAX_NUMNODES]; 186static void *node_remap_end_vaddr[MAX_NUMNODES];
@@ -195,15 +202,19 @@ void *alloc_remap(int nid, unsigned long size)
195 return allocation; 202 return allocation;
196} 203}
197 204
198void __init remap_numa_kva(void) 205static void __init remap_numa_kva(void)
199{ 206{
200 void *vaddr; 207 void *vaddr;
201 unsigned long pfn; 208 unsigned long pfn;
202 int node; 209 int node;
203 210
204 for_each_online_node(node) { 211 for_each_online_node(node) {
212 printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
205 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { 213 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
206 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT); 214 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
215 printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
216 (unsigned long)vaddr,
217 node_remap_start_pfn[node] + pfn);
207 set_pmd_pfn((ulong) vaddr, 218 set_pmd_pfn((ulong) vaddr,
208 node_remap_start_pfn[node] + pfn, 219 node_remap_start_pfn[node] + pfn,
209 PAGE_KERNEL_LARGE); 220 PAGE_KERNEL_LARGE);
@@ -215,17 +226,21 @@ static unsigned long calculate_numa_remap_pages(void)
215{ 226{
216 int nid; 227 int nid;
217 unsigned long size, reserve_pages = 0; 228 unsigned long size, reserve_pages = 0;
218 unsigned long pfn;
219 229
220 for_each_online_node(nid) { 230 for_each_online_node(nid) {
221 unsigned old_end_pfn = node_end_pfn[nid]; 231 u64 node_kva_target;
232 u64 node_kva_final;
222 233
223 /* 234 /*
224 * The acpi/srat node info can show hot-add memory zones 235 * The acpi/srat node info can show hot-add memory zones
225 * where memory could be added but not currently present. 236 * where memory could be added but not currently present.
226 */ 237 */
238 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
239 nid, node_start_pfn[nid], node_end_pfn[nid]);
227 if (node_start_pfn[nid] > max_pfn) 240 if (node_start_pfn[nid] > max_pfn)
228 continue; 241 continue;
242 if (!node_end_pfn[nid])
243 continue;
229 if (node_end_pfn[nid] > max_pfn) 244 if (node_end_pfn[nid] > max_pfn)
230 node_end_pfn[nid] = max_pfn; 245 node_end_pfn[nid] = max_pfn;
231 246
@@ -237,41 +252,48 @@ static unsigned long calculate_numa_remap_pages(void)
237 /* now the roundup is correct, convert to PAGE_SIZE pages */ 252 /* now the roundup is correct, convert to PAGE_SIZE pages */
238 size = size * PTRS_PER_PTE; 253 size = size * PTRS_PER_PTE;
239 254
240 /* 255 node_kva_target = round_down(node_end_pfn[nid] - size,
241 * Validate the region we are allocating only contains valid 256 PTRS_PER_PTE);
242 * pages. 257 node_kva_target <<= PAGE_SHIFT;
243 */ 258 do {
244 for (pfn = node_end_pfn[nid] - size; 259 node_kva_final = find_e820_area(node_kva_target,
245 pfn < node_end_pfn[nid]; pfn++) 260 ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
246 if (!page_is_ram(pfn)) 261 ((u64)size)<<PAGE_SHIFT,
247 break; 262 LARGE_PAGE_BYTES);
248 263 node_kva_target -= LARGE_PAGE_BYTES;
249 if (pfn != node_end_pfn[nid]) 264 } while (node_kva_final == -1ULL &&
250 size = 0; 265 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
266
267 if (node_kva_final == -1ULL)
268 panic("Can not get kva ram\n");
251 269
252 printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
253 size, nid);
254 node_remap_size[nid] = size; 270 node_remap_size[nid] = size;
255 node_remap_offset[nid] = reserve_pages; 271 node_remap_offset[nid] = reserve_pages;
256 reserve_pages += size; 272 reserve_pages += size;
257 printk("Shrinking node %d from %ld pages to %ld pages\n", 273 printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
258 nid, node_end_pfn[nid], node_end_pfn[nid] - size); 274 " node %d at %llx\n",
259 275 size, nid, node_kva_final>>PAGE_SHIFT);
260 if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) { 276
261 /* 277 /*
262 * Align node_end_pfn[] and node_remap_start_pfn[] to 278 * prevent kva address below max_low_pfn want it on system
263 * pmd boundary. remap_numa_kva will barf otherwise. 279 * with less memory later.
264 */ 280 * layout will be: KVA address , KVA RAM
265 printk("Shrinking node %d further by %ld pages for proper alignment\n", 281 *
266 nid, node_end_pfn[nid] & (PTRS_PER_PTE-1)); 282 * we are supposed to only record the one less then max_low_pfn
267 size += node_end_pfn[nid] & (PTRS_PER_PTE-1); 283 * but we could have some hole in high memory, and it will only
268 } 284 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
285 * to use it as free.
286 * So reserve_early here, hope we don't run out of that array
287 */
288 reserve_early(node_kva_final,
289 node_kva_final+(((u64)size)<<PAGE_SHIFT),
290 "KVA RAM");
269 291
270 node_end_pfn[nid] -= size; 292 node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
271 node_remap_start_pfn[nid] = node_end_pfn[nid]; 293 remove_active_range(nid, node_remap_start_pfn[nid],
272 shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]); 294 node_remap_start_pfn[nid] + size);
273 } 295 }
274 printk("Reserving total of %ld pages for numa KVA remap\n", 296 printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
275 reserve_pages); 297 reserve_pages);
276 return reserve_pages; 298 return reserve_pages;
277} 299}
@@ -285,37 +307,16 @@ static void init_remap_allocator(int nid)
285 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + 307 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
286 ALIGN(sizeof(pg_data_t), PAGE_SIZE); 308 ALIGN(sizeof(pg_data_t), PAGE_SIZE);
287 309
288 printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, 310 printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
289 (ulong) node_remap_start_vaddr[nid], 311 (ulong) node_remap_start_vaddr[nid],
290 (ulong) pfn_to_kaddr(highstart_pfn 312 (ulong) node_remap_end_vaddr[nid]);
291 + node_remap_offset[nid] + node_remap_size[nid]));
292}
293#else
294void *alloc_remap(int nid, unsigned long size)
295{
296 return NULL;
297}
298
299static unsigned long calculate_numa_remap_pages(void)
300{
301 return 0;
302}
303
304static void init_remap_allocator(int nid)
305{
306}
307
308void __init remap_numa_kva(void)
309{
310} 313}
311#endif /* CONFIG_DISCONTIGMEM */
312 314
313extern void setup_bootmem_allocator(void); 315void __init initmem_init(unsigned long start_pfn,
314unsigned long __init setup_memory(void) 316 unsigned long end_pfn)
315{ 317{
316 int nid; 318 int nid;
317 unsigned long system_start_pfn, system_max_low_pfn; 319 long kva_target_pfn;
318 unsigned long wasted_pages;
319 320
320 /* 321 /*
321 * When mapping a NUMA machine we allocate the node_mem_map arrays 322 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -324,109 +325,77 @@ unsigned long __init setup_memory(void)
324 * this space and use it to adjust the boundary between ZONE_NORMAL 325 * this space and use it to adjust the boundary between ZONE_NORMAL
325 * and ZONE_HIGHMEM. 326 * and ZONE_HIGHMEM.
326 */ 327 */
327 get_memcfg_numa();
328 328
329 kva_pages = calculate_numa_remap_pages(); 329 get_memcfg_numa();
330 330
331 /* partially used pages are not usable - thus round upwards */ 331 kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
332 system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
333 332
334 kva_start_pfn = find_max_low_pfn() - kva_pages; 333 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
334 do {
335 kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
336 max_low_pfn<<PAGE_SHIFT,
337 kva_pages<<PAGE_SHIFT,
338 PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
339 kva_target_pfn -= PTRS_PER_PTE;
340 } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
335 341
336#ifdef CONFIG_BLK_DEV_INITRD 342 if (kva_start_pfn == -1UL)
337 /* Numa kva area is below the initrd */ 343 panic("Can not get kva space\n");
338 if (initrd_start)
339 kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
340 - kva_pages;
341#endif
342 344
343 /* 345 printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
344 * We waste pages past at the end of the KVA for no good reason other
345 * than how it is located. This is bad.
346 */
347 wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
348 kva_start_pfn -= wasted_pages;
349 kva_pages += wasted_pages;
350
351 system_max_low_pfn = max_low_pfn = find_max_low_pfn();
352 printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
353 kva_start_pfn, max_low_pfn); 346 kva_start_pfn, max_low_pfn);
354 printk("max_pfn = %ld\n", max_pfn); 347 printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
348
349 /* avoid clash with initrd */
350 reserve_early(kva_start_pfn<<PAGE_SHIFT,
351 (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
352 "KVA PG");
355#ifdef CONFIG_HIGHMEM 353#ifdef CONFIG_HIGHMEM
356 highstart_pfn = highend_pfn = max_pfn; 354 highstart_pfn = highend_pfn = max_pfn;
357 if (max_pfn > system_max_low_pfn) 355 if (max_pfn > max_low_pfn)
358 highstart_pfn = system_max_low_pfn; 356 highstart_pfn = max_low_pfn;
359 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", 357 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
360 pages_to_mb(highend_pfn - highstart_pfn)); 358 pages_to_mb(highend_pfn - highstart_pfn));
361 num_physpages = highend_pfn; 359 num_physpages = highend_pfn;
362 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; 360 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
363#else 361#else
364 num_physpages = system_max_low_pfn; 362 num_physpages = max_low_pfn;
365 high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1; 363 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
366#endif 364#endif
367 printk(KERN_NOTICE "%ldMB LOWMEM available.\n", 365 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
368 pages_to_mb(system_max_low_pfn)); 366 pages_to_mb(max_low_pfn));
369 printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", 367 printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
370 min_low_pfn, max_low_pfn, highstart_pfn); 368 max_low_pfn, highstart_pfn);
371 369
372 printk("Low memory ends at vaddr %08lx\n", 370 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
373 (ulong) pfn_to_kaddr(max_low_pfn)); 371 (ulong) pfn_to_kaddr(max_low_pfn));
374 for_each_online_node(nid) { 372 for_each_online_node(nid) {
375 init_remap_allocator(nid); 373 init_remap_allocator(nid);
376 374
377 allocate_pgdat(nid); 375 allocate_pgdat(nid);
378 } 376 }
379 printk("High memory starts at vaddr %08lx\n", 377 remap_numa_kva();
378
379 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
380 (ulong) pfn_to_kaddr(highstart_pfn)); 380 (ulong) pfn_to_kaddr(highstart_pfn));
381 for_each_online_node(nid) 381 for_each_online_node(nid)
382 propagate_e820_map_node(nid); 382 propagate_e820_map_node(nid);
383 383
384 memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); 384 for_each_online_node(nid)
385 NODE_DATA(0)->bdata = &node0_bdata; 385 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
386 setup_bootmem_allocator();
387 return max_low_pfn;
388}
389
390void __init numa_kva_reserve(void)
391{
392 if (kva_pages)
393 reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
394 BOOTMEM_DEFAULT);
395}
396
397void __init zone_sizes_init(void)
398{
399 int nid;
400 unsigned long max_zone_pfns[MAX_NR_ZONES];
401 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
402 max_zone_pfns[ZONE_DMA] =
403 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
404 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
405#ifdef CONFIG_HIGHMEM
406 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
407#endif
408
409 /* If SRAT has not registered memory, register it now */
410 if (find_max_pfn_with_active_regions() == 0) {
411 for_each_online_node(nid) {
412 if (node_has_online_mem(nid))
413 add_active_range(nid, node_start_pfn[nid],
414 node_end_pfn[nid]);
415 }
416 }
417 386
418 free_area_init_nodes(max_zone_pfns); 387 NODE_DATA(0)->bdata = &bootmem_node_data[0];
419 return; 388 setup_bootmem_allocator();
420} 389}
421 390
422void __init set_highmem_pages_init(int bad_ppro) 391void __init set_highmem_pages_init(void)
423{ 392{
424#ifdef CONFIG_HIGHMEM 393#ifdef CONFIG_HIGHMEM
425 struct zone *zone; 394 struct zone *zone;
426 struct page *page; 395 int nid;
427 396
428 for_each_zone(zone) { 397 for_each_zone(zone) {
429 unsigned long node_pfn, zone_start_pfn, zone_end_pfn; 398 unsigned long zone_start_pfn, zone_end_pfn;
430 399
431 if (!is_highmem(zone)) 400 if (!is_highmem(zone))
432 continue; 401 continue;
@@ -434,16 +403,12 @@ void __init set_highmem_pages_init(int bad_ppro)
434 zone_start_pfn = zone->zone_start_pfn; 403 zone_start_pfn = zone->zone_start_pfn;
435 zone_end_pfn = zone_start_pfn + zone->spanned_pages; 404 zone_end_pfn = zone_start_pfn + zone->spanned_pages;
436 405
437 printk("Initializing %s for node %d (%08lx:%08lx)\n", 406 nid = zone_to_nid(zone);
438 zone->name, zone_to_nid(zone), 407 printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
439 zone_start_pfn, zone_end_pfn); 408 zone->name, nid, zone_start_pfn, zone_end_pfn);
440 409
441 for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { 410 add_highpages_with_active_regions(nid, zone_start_pfn,
442 if (!pfn_valid(node_pfn)) 411 zone_end_pfn);
443 continue;
444 page = pfn_to_page(node_pfn);
445 add_one_highpage_init(page, node_pfn, bad_ppro);
446 }
447 } 412 }
448 totalram_pages += totalhigh_pages; 413 totalram_pages += totalhigh_pages;
449#endif 414#endif
@@ -476,3 +441,4 @@ int memory_add_physaddr_to_nid(u64 addr)
476 441
477EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 442EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
478#endif 443#endif
444
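
As an aside on the physnode_map comment above (one array slot per 64 MB of physical memory, each slot holding the owning node id or -1): the lookup is easy to see in a standalone sketch. Everything below is illustrative -- the constants are recomputed for the demo rather than taken from <asm/mmzone_32.h>, and the two-node layout just mirrors the example in the comment.

#include <stdio.h>

#define PAGE_SHIFT        12                    /* 4 KB pages */
#define ELEMENT_SHIFT     (26 - PAGE_SHIFT)     /* 64 MB per element */
#define PAGES_PER_ELEMENT (1UL << ELEMENT_SHIFT)
#define MAX_ELEMENTS      1024                  /* covers 64 GB, illustrative */

static signed char physnode_map[MAX_ELEMENTS] = { [0 ... MAX_ELEMENTS - 1] = -1 };

static int pfn_to_nid(unsigned long pfn)
{
        return physnode_map[pfn / PAGES_PER_ELEMENT];
}

int main(void)
{
        unsigned long pfn;

        /* First gig on node 0, second gig on node 1, as in the comment. */
        for (pfn = 0; pfn < 0x40000; pfn += PAGES_PER_ELEMENT)
                physnode_map[pfn / PAGES_PER_ELEMENT] = 0;
        for (; pfn < 0x80000; pfn += PAGES_PER_ELEMENT)
                physnode_map[pfn / PAGES_PER_ELEMENT] = 1;

        printf("pfn 0x10000 -> node %d\n", pfn_to_nid(0x10000));  /* node 0 */
        printf("pfn 0x50000 -> node %d\n", pfn_to_nid(0x50000));  /* node 1 */
        return 0;
}
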
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 2c24bea92c66..a20d1fa64b4e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -42,7 +42,7 @@ static struct addr_marker address_markers[] = {
 	{ 0, "User Space" },
 #ifdef CONFIG_X86_64
 	{ 0x8000000000000000UL, "Kernel Space" },
-	{ 0xffff810000000000UL, "Low Kernel Mapping" },
+	{ PAGE_OFFSET, "Low Kernel Mapping" },
 	{ VMALLOC_START, "vmalloc() Area" },
 	{ VMEMMAP_START, "Vmemmap" },
 	{ __START_KERNEL_map, "High Kernel Mapping" },
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 	 * we have now. "break" is either changing perms, levels or
 	 * address space marker.
 	 */
-	prot = pgprot_val(new_prot) & ~(PTE_MASK);
-	cur = pgprot_val(st->current_prot) & ~(PTE_MASK);
+	prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK);
+	cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK);
 
 	if (!st->level) {
 		/* First entry */
@@ -221,7 +221,7 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
 	for (i = 0; i < PTRS_PER_PMD; i++) {
 		st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
 		if (!pmd_none(*start)) {
-			pgprotval_t prot = pmd_val(*start) & ~PTE_MASK;
+			pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK;
 
 			if (pmd_large(*start) || !pmd_present(*start))
 				note_page(m, st, __pgprot(prot), 3);
@@ -253,7 +253,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
 	for (i = 0; i < PTRS_PER_PUD; i++) {
 		st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
 		if (!pud_none(*start)) {
-			pgprotval_t prot = pud_val(*start) & ~PTE_MASK;
+			pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK;
 
 			if (pud_large(*start) || !pud_present(*start))
 				note_page(m, st, __pgprot(prot), 2);
@@ -288,7 +288,7 @@ static void walk_pgd_level(struct seq_file *m)
 	for (i = 0; i < PTRS_PER_PGD; i++) {
 		st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
 		if (!pgd_none(*start)) {
-			pgprotval_t prot = pgd_val(*start) & ~PTE_MASK;
+			pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK;
 
 			if (pgd_large(*start) || !pgd_present(*start))
 				note_page(m, &st, __pgprot(prot), 1);
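
The PTE_MASK replacement above splits a page-table entry into its page-frame-number field (PTE_PFN_MASK) and everything else (PTE_FLAGS_MASK). A minimal userspace sketch of that split; the mask values here (4 KB pages, 52 physical address bits) are illustrative, not the kernel's definitions:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT     12
#define PHYS_BITS      52
#define PTE_PFN_MASK   (((1ULL << PHYS_BITS) - 1) & ~((1ULL << PAGE_SHIFT) - 1))
#define PTE_FLAGS_MASK (~PTE_PFN_MASK)

int main(void)
{
        uint64_t pte = 0x80000000075d4063ULL;   /* made-up entry: NX bit, pfn, low flags */

        printf("pfn   = %llx\n", (unsigned long long)((pte & PTE_PFN_MASK) >> PAGE_SHIFT));
        printf("flags = %llx\n", (unsigned long long)(pte & PTE_FLAGS_MASK));
        return 0;
}
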
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fd7e1798c75a..8f92cac4e6db 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -10,6 +10,7 @@
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/types.h> 11#include <linux/types.h>
12#include <linux/ptrace.h> 12#include <linux/ptrace.h>
13#include <linux/mmiotrace.h>
13#include <linux/mman.h> 14#include <linux/mman.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15#include <linux/smp.h> 16#include <linux/smp.h>
@@ -34,6 +35,7 @@
34#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
35#include <asm/proto.h> 36#include <asm/proto.h>
36#include <asm-generic/sections.h> 37#include <asm-generic/sections.h>
38#include <asm/traps.h>
37 39
38/* 40/*
39 * Page fault error code bits 41 * Page fault error code bits
@@ -49,17 +51,23 @@
49#define PF_RSVD (1<<3) 51#define PF_RSVD (1<<3)
50#define PF_INSTR (1<<4) 52#define PF_INSTR (1<<4)
51 53
54static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
55{
56#ifdef CONFIG_MMIOTRACE_HOOKS
57 if (unlikely(is_kmmio_active()))
58 if (kmmio_handler(regs, addr) == 1)
59 return -1;
60#endif
61 return 0;
62}
63
52static inline int notify_page_fault(struct pt_regs *regs) 64static inline int notify_page_fault(struct pt_regs *regs)
53{ 65{
54#ifdef CONFIG_KPROBES 66#ifdef CONFIG_KPROBES
55 int ret = 0; 67 int ret = 0;
56 68
57 /* kprobe_running() needs smp_processor_id() */ 69 /* kprobe_running() needs smp_processor_id() */
58#ifdef CONFIG_X86_32
59 if (!user_mode_vm(regs)) { 70 if (!user_mode_vm(regs)) {
60#else
61 if (!user_mode(regs)) {
62#endif
63 preempt_disable(); 71 preempt_disable();
64 if (kprobe_running() && kprobe_fault_handler(regs, 14)) 72 if (kprobe_running() && kprobe_fault_handler(regs, 14))
65 ret = 1; 73 ret = 1;
@@ -350,8 +358,6 @@ static int is_errata100(struct pt_regs *regs, unsigned long address)
350 return 0; 358 return 0;
351} 359}
352 360
353void do_invalid_op(struct pt_regs *, unsigned long);
354
355static int is_f00f_bug(struct pt_regs *regs, unsigned long address) 361static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
356{ 362{
357#ifdef CONFIG_X86_F00F_BUG 363#ifdef CONFIG_X86_F00F_BUG
@@ -396,11 +402,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
396 printk(KERN_CONT "NULL pointer dereference"); 402 printk(KERN_CONT "NULL pointer dereference");
397 else 403 else
398 printk(KERN_CONT "paging request"); 404 printk(KERN_CONT "paging request");
399#ifdef CONFIG_X86_32 405 printk(KERN_CONT " at %p\n", (void *) address);
400 printk(KERN_CONT " at %08lx\n", address);
401#else
402 printk(KERN_CONT " at %016lx\n", address);
403#endif
404 printk(KERN_ALERT "IP:"); 406 printk(KERN_ALERT "IP:");
405 printk_address(regs->ip, 1); 407 printk_address(regs->ip, 1);
406 dump_pagetable(address); 408 dump_pagetable(address);
@@ -497,6 +499,11 @@ static int vmalloc_fault(unsigned long address)
497 unsigned long pgd_paddr; 499 unsigned long pgd_paddr;
498 pmd_t *pmd_k; 500 pmd_t *pmd_k;
499 pte_t *pte_k; 501 pte_t *pte_k;
502
503 /* Make sure we are in vmalloc area */
504 if (!(address >= VMALLOC_START && address < VMALLOC_END))
505 return -1;
506
500 /* 507 /*
501 * Synchronize this task's top level page-table 508 * Synchronize this task's top level page-table
502 * with the 'reference' page table. 509 * with the 'reference' page table.
@@ -601,6 +608,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
601 608
602 if (notify_page_fault(regs)) 609 if (notify_page_fault(regs))
603 return; 610 return;
611 if (unlikely(kmmio_fault(regs, address)))
612 return;
604 613
605 /* 614 /*
606 * We fault-in kernel-space virtual memory on-demand. The 615 * We fault-in kernel-space virtual memory on-demand. The
@@ -795,14 +804,10 @@ bad_area_nosemaphore:
795 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && 804 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
796 printk_ratelimit()) { 805 printk_ratelimit()) {
797 printk( 806 printk(
798#ifdef CONFIG_X86_32 807 "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
799 "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
800#else
801 "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
802#endif
803 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, 808 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
804 tsk->comm, task_pid_nr(tsk), address, regs->ip, 809 tsk->comm, task_pid_nr(tsk), address,
805 regs->sp, error_code); 810 (void *) regs->ip, (void *) regs->sp, error_code);
806 print_vma_addr(" in ", regs->ip); 811 print_vma_addr(" in ", regs->ip);
807 printk("\n"); 812 printk("\n");
808 } 813 }
@@ -910,14 +915,7 @@ LIST_HEAD(pgd_list);
910void vmalloc_sync_all(void) 915void vmalloc_sync_all(void)
911{ 916{
912#ifdef CONFIG_X86_32 917#ifdef CONFIG_X86_32
913 /* 918 unsigned long start = VMALLOC_START & PGDIR_MASK;
914 * Note that races in the updates of insync and start aren't
915 * problematic: insync can only get set bits added, and updates to
916 * start are only improving performance (without affecting correctness
917 * if undone).
918 */
919 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
920 static unsigned long start = TASK_SIZE;
921 unsigned long address; 919 unsigned long address;
922 920
923 if (SHARED_KERNEL_PMD) 921 if (SHARED_KERNEL_PMD)
@@ -925,56 +923,38 @@ void vmalloc_sync_all(void)
925 923
926 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); 924 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
927 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { 925 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
928 if (!test_bit(pgd_index(address), insync)) { 926 unsigned long flags;
929 unsigned long flags; 927 struct page *page;
930 struct page *page; 928
931 929 spin_lock_irqsave(&pgd_lock, flags);
932 spin_lock_irqsave(&pgd_lock, flags); 930 list_for_each_entry(page, &pgd_list, lru) {
933 list_for_each_entry(page, &pgd_list, lru) { 931 if (!vmalloc_sync_one(page_address(page),
934 if (!vmalloc_sync_one(page_address(page), 932 address))
935 address)) 933 break;
936 break;
937 }
938 spin_unlock_irqrestore(&pgd_lock, flags);
939 if (!page)
940 set_bit(pgd_index(address), insync);
941 } 934 }
942 if (address == start && test_bit(pgd_index(address), insync)) 935 spin_unlock_irqrestore(&pgd_lock, flags);
943 start = address + PGDIR_SIZE;
944 } 936 }
945#else /* CONFIG_X86_64 */ 937#else /* CONFIG_X86_64 */
946 /* 938 unsigned long start = VMALLOC_START & PGDIR_MASK;
947 * Note that races in the updates of insync and start aren't
948 * problematic: insync can only get set bits added, and updates to
949 * start are only improving performance (without affecting correctness
950 * if undone).
951 */
952 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
953 static unsigned long start = VMALLOC_START & PGDIR_MASK;
954 unsigned long address; 939 unsigned long address;
955 940
956 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { 941 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
957 if (!test_bit(pgd_index(address), insync)) { 942 const pgd_t *pgd_ref = pgd_offset_k(address);
958 const pgd_t *pgd_ref = pgd_offset_k(address); 943 unsigned long flags;
959 unsigned long flags; 944 struct page *page;
960 struct page *page; 945
961 946 if (pgd_none(*pgd_ref))
962 if (pgd_none(*pgd_ref)) 947 continue;
963 continue; 948 spin_lock_irqsave(&pgd_lock, flags);
964 spin_lock_irqsave(&pgd_lock, flags); 949 list_for_each_entry(page, &pgd_list, lru) {
965 list_for_each_entry(page, &pgd_list, lru) { 950 pgd_t *pgd;
966 pgd_t *pgd; 951 pgd = (pgd_t *)page_address(page) + pgd_index(address);
967 pgd = (pgd_t *)page_address(page) + pgd_index(address); 952 if (pgd_none(*pgd))
968 if (pgd_none(*pgd)) 953 set_pgd(pgd, *pgd_ref);
969 set_pgd(pgd, *pgd_ref); 954 else
970 else 955 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
971 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
972 }
973 spin_unlock_irqrestore(&pgd_lock, flags);
974 set_bit(pgd_index(address), insync);
975 } 956 }
976 if (address == start) 957 spin_unlock_irqrestore(&pgd_lock, flags);
977 start = address + PGDIR_SIZE;
978 } 958 }
979#endif 959#endif
980} 960}
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 000000000000..007bb06c7504
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,298 @@
1/*
2 * Lockless get_user_pages_fast for x86
3 *
4 * Copyright (C) 2008 Nick Piggin
5 * Copyright (C) 2008 Novell Inc.
6 */
7#include <linux/sched.h>
8#include <linux/mm.h>
9#include <linux/vmstat.h>
10#include <linux/highmem.h>
11
12#include <asm/pgtable.h>
13
14static inline pte_t gup_get_pte(pte_t *ptep)
15{
16#ifndef CONFIG_X86_PAE
17 return *ptep;
18#else
19 /*
20 * With get_user_pages_fast, we walk down the pagetables without taking
21 * any locks. For this we would like to load the pointers atomically,
22 * but that is not possible (without expensive cmpxchg8b) on PAE. What
23 * we do have is the guarantee that a pte will only either go from not
24 * present to present, or present to not present or both -- it will not
25 * switch to a completely different present page without a TLB flush in
26 * between; something that we are blocking by holding interrupts off.
27 *
28 * Setting ptes from not present to present goes:
29 * ptep->pte_high = h;
30 * smp_wmb();
31 * ptep->pte_low = l;
32 *
33 * And present to not present goes:
34 * ptep->pte_low = 0;
35 * smp_wmb();
36 * ptep->pte_high = 0;
37 *
38 * We must ensure here that the load of pte_low sees l iff pte_high
39 * sees h. We load pte_high *after* loading pte_low, which ensures we
40 * don't see an older value of pte_high. *Then* we recheck pte_low,
41 * which ensures that we haven't picked up a changed pte high. We might
42 * have got rubbish values from pte_low and pte_high, but we are
43 * guaranteed that pte_low will not have the present bit set *unless*
44 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
45 * we're safe.
46 *
47 * gup_get_pte should not be used or copied outside gup.c without being
48 * very careful -- it does not atomically load the pte or anything that
49 * is likely to be useful for you.
50 */
51 pte_t pte;
52
53retry:
54 pte.pte_low = ptep->pte_low;
55 smp_rmb();
56 pte.pte_high = ptep->pte_high;
57 smp_rmb();
58 if (unlikely(pte.pte_low != ptep->pte_low))
59 goto retry;
60
61 return pte;
62#endif
63}
64
65/*
66 * The performance critical leaf functions are made noinline otherwise gcc
67 * inlines everything into a single function which results in too much
68 * register pressure.
69 */
70static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
71 unsigned long end, int write, struct page **pages, int *nr)
72{
73 unsigned long mask;
74 pte_t *ptep;
75
76 mask = _PAGE_PRESENT|_PAGE_USER;
77 if (write)
78 mask |= _PAGE_RW;
79
80 ptep = pte_offset_map(&pmd, addr);
81 do {
82 pte_t pte = gup_get_pte(ptep);
83 struct page *page;
84
85 if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
86 pte_unmap(ptep);
87 return 0;
88 }
89 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
90 page = pte_page(pte);
91 get_page(page);
92 pages[*nr] = page;
93 (*nr)++;
94
95 } while (ptep++, addr += PAGE_SIZE, addr != end);
96 pte_unmap(ptep - 1);
97
98 return 1;
99}
100
101static inline void get_head_page_multiple(struct page *page, int nr)
102{
103 VM_BUG_ON(page != compound_head(page));
104 VM_BUG_ON(page_count(page) == 0);
105 atomic_add(nr, &page->_count);
106}
107
108static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
109 unsigned long end, int write, struct page **pages, int *nr)
110{
111 unsigned long mask;
112 pte_t pte = *(pte_t *)&pmd;
113 struct page *head, *page;
114 int refs;
115
116 mask = _PAGE_PRESENT|_PAGE_USER;
117 if (write)
118 mask |= _PAGE_RW;
119 if ((pte_val(pte) & mask) != mask)
120 return 0;
121 /* hugepages are never "special" */
122 VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
123 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
124
125 refs = 0;
126 head = pte_page(pte);
127 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
128 do {
129 VM_BUG_ON(compound_head(page) != head);
130 pages[*nr] = page;
131 (*nr)++;
132 page++;
133 refs++;
134 } while (addr += PAGE_SIZE, addr != end);
135 get_head_page_multiple(head, refs);
136
137 return 1;
138}
139
140static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
141 int write, struct page **pages, int *nr)
142{
143 unsigned long next;
144 pmd_t *pmdp;
145
146 pmdp = pmd_offset(&pud, addr);
147 do {
148 pmd_t pmd = *pmdp;
149
150 next = pmd_addr_end(addr, end);
151 if (pmd_none(pmd))
152 return 0;
153 if (unlikely(pmd_large(pmd))) {
154 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
155 return 0;
156 } else {
157 if (!gup_pte_range(pmd, addr, next, write, pages, nr))
158 return 0;
159 }
160 } while (pmdp++, addr = next, addr != end);
161
162 return 1;
163}
164
165static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
166 unsigned long end, int write, struct page **pages, int *nr)
167{
168 unsigned long mask;
169 pte_t pte = *(pte_t *)&pud;
170 struct page *head, *page;
171 int refs;
172
173 mask = _PAGE_PRESENT|_PAGE_USER;
174 if (write)
175 mask |= _PAGE_RW;
176 if ((pte_val(pte) & mask) != mask)
177 return 0;
178 /* hugepages are never "special" */
179 VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
180 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
181
182 refs = 0;
183 head = pte_page(pte);
184 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
185 do {
186 VM_BUG_ON(compound_head(page) != head);
187 pages[*nr] = page;
188 (*nr)++;
189 page++;
190 refs++;
191 } while (addr += PAGE_SIZE, addr != end);
192 get_head_page_multiple(head, refs);
193
194 return 1;
195}
196
197static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
198 int write, struct page **pages, int *nr)
199{
200 unsigned long next;
201 pud_t *pudp;
202
203 pudp = pud_offset(&pgd, addr);
204 do {
205 pud_t pud = *pudp;
206
207 next = pud_addr_end(addr, end);
208 if (pud_none(pud))
209 return 0;
210 if (unlikely(pud_large(pud))) {
211 if (!gup_huge_pud(pud, addr, next, write, pages, nr))
212 return 0;
213 } else {
214 if (!gup_pmd_range(pud, addr, next, write, pages, nr))
215 return 0;
216 }
217 } while (pudp++, addr = next, addr != end);
218
219 return 1;
220}
221
222int get_user_pages_fast(unsigned long start, int nr_pages, int write,
223 struct page **pages)
224{
225 struct mm_struct *mm = current->mm;
226 unsigned long addr, len, end;
227 unsigned long next;
228 pgd_t *pgdp;
229 int nr = 0;
230
231 start &= PAGE_MASK;
232 addr = start;
233 len = (unsigned long) nr_pages << PAGE_SHIFT;
234 end = start + len;
235 if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
236 start, len)))
237 goto slow_irqon;
238
239 /*
240 * XXX: batch / limit 'nr', to avoid large irq off latency
241 * needs some instrumenting to determine the common sizes used by
242 * important workloads (eg. DB2), and whether limiting the batch size
243 * will decrease performance.
244 *
245 * It seems like we're in the clear for the moment. Direct-IO is
246 * the main guy that batches up lots of get_user_pages, and even
247 * they are limited to 64-at-a-time which is not so many.
248 */
249 /*
250 * This doesn't prevent pagetable teardown, but does prevent
251 * the pagetables and pages from being freed on x86.
252 *
253 * So long as we atomically load page table pointers versus teardown
254 * (which we do on x86, with the above PAE exception), we can follow the
255 * address down to the page and take a ref on it.
256 */
257 local_irq_disable();
258 pgdp = pgd_offset(mm, addr);
259 do {
260 pgd_t pgd = *pgdp;
261
262 next = pgd_addr_end(addr, end);
263 if (pgd_none(pgd))
264 goto slow;
265 if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
266 goto slow;
267 } while (pgdp++, addr = next, addr != end);
268 local_irq_enable();
269
270 VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
271 return nr;
272
273 {
274 int ret;
275
276slow:
277 local_irq_enable();
278slow_irqon:
279 /* Try to get the remaining pages with get_user_pages */
280 start += nr << PAGE_SHIFT;
281 pages += nr;
282
283 down_read(&mm->mmap_sem);
284 ret = get_user_pages(current, mm, start,
285 (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
286 up_read(&mm->mmap_sem);
287
288 /* Have to be a bit careful with return values */
289 if (nr > 0) {
290 if (ret < 0)
291 ret = nr;
292 else
293 ret += nr;
294 }
295
296 return ret;
297 }
298}
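
For context on how the new gup.c entry point is meant to be called: a hypothetical kernel-side user (not part of this patch -- demo_pin_user_buffer and its error handling are made up for illustration) pins the pages, uses them, and drops each reference with put_page(). get_user_pages_fast() falls back to get_user_pages() internally, as the slow path above shows.

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/errno.h>

static int demo_pin_user_buffer(unsigned long uaddr, unsigned long len, int write)
{
        int i, nr;
        int nr_pages = ((uaddr & ~PAGE_MASK) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
        struct page **pages;

        pages = kmalloc(nr_pages * sizeof(*pages), GFP_KERNEL);
        if (!pages)
                return -ENOMEM;

        /* May pin fewer pages than asked for; each pinned page holds a ref. */
        nr = get_user_pages_fast(uaddr, nr_pages, write, pages);

        for (i = 0; i < nr; i++) {
                /* ... access the page contents here, e.g. via kmap_atomic() ... */
                put_page(pages[i]);
        }

        kfree(pages);
        if (nr < 0)
                return nr;
        return nr == nr_pages ? 0 : -EFAULT;
}
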
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 0b3d567e686d..8f307d914c2e 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -124,7 +124,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
124 return 1; 124 return 1;
125} 125}
126 126
127pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) 127pte_t *huge_pte_alloc(struct mm_struct *mm,
128 unsigned long addr, unsigned long sz)
128{ 129{
129 pgd_t *pgd; 130 pgd_t *pgd;
130 pud_t *pud; 131 pud_t *pud;
@@ -133,9 +134,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
133 pgd = pgd_offset(mm, addr); 134 pgd = pgd_offset(mm, addr);
134 pud = pud_alloc(mm, pgd, addr); 135 pud = pud_alloc(mm, pgd, addr);
135 if (pud) { 136 if (pud) {
136 if (pud_none(*pud)) 137 if (sz == PUD_SIZE) {
137 huge_pmd_share(mm, addr, pud); 138 pte = (pte_t *)pud;
138 pte = (pte_t *) pmd_alloc(mm, pud, addr); 139 } else {
140 BUG_ON(sz != PMD_SIZE);
141 if (pud_none(*pud))
142 huge_pmd_share(mm, addr, pud);
143 pte = (pte_t *) pmd_alloc(mm, pud, addr);
144 }
139 } 145 }
140 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); 146 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
141 147
@@ -151,8 +157,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
151 pgd = pgd_offset(mm, addr); 157 pgd = pgd_offset(mm, addr);
152 if (pgd_present(*pgd)) { 158 if (pgd_present(*pgd)) {
153 pud = pud_offset(pgd, addr); 159 pud = pud_offset(pgd, addr);
154 if (pud_present(*pud)) 160 if (pud_present(*pud)) {
161 if (pud_large(*pud))
162 return (pte_t *)pud;
155 pmd = pmd_offset(pud, addr); 163 pmd = pmd_offset(pud, addr);
164 }
156 } 165 }
157 return (pte_t *) pmd; 166 return (pte_t *) pmd;
158} 167}
@@ -188,6 +197,11 @@ int pmd_huge(pmd_t pmd)
188 return 0; 197 return 0;
189} 198}
190 199
200int pud_huge(pud_t pud)
201{
202 return 0;
203}
204
191struct page * 205struct page *
192follow_huge_pmd(struct mm_struct *mm, unsigned long address, 206follow_huge_pmd(struct mm_struct *mm, unsigned long address,
193 pmd_t *pmd, int write) 207 pmd_t *pmd, int write)
@@ -208,6 +222,11 @@ int pmd_huge(pmd_t pmd)
208 return !!(pmd_val(pmd) & _PAGE_PSE); 222 return !!(pmd_val(pmd) & _PAGE_PSE);
209} 223}
210 224
225int pud_huge(pud_t pud)
226{
227 return !!(pud_val(pud) & _PAGE_PSE);
228}
229
211struct page * 230struct page *
212follow_huge_pmd(struct mm_struct *mm, unsigned long address, 231follow_huge_pmd(struct mm_struct *mm, unsigned long address,
213 pmd_t *pmd, int write) 232 pmd_t *pmd, int write)
@@ -216,9 +235,22 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
216 235
217 page = pte_page(*(pte_t *)pmd); 236 page = pte_page(*(pte_t *)pmd);
218 if (page) 237 if (page)
219 page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); 238 page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
239 return page;
240}
241
242struct page *
243follow_huge_pud(struct mm_struct *mm, unsigned long address,
244 pud_t *pud, int write)
245{
246 struct page *page;
247
248 page = pte_page(*(pte_t *)pud);
249 if (page)
250 page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
220 return page; 251 return page;
221} 252}
253
222#endif 254#endif
223 255
224/* x86_64 also uses this file */ 256/* x86_64 also uses this file */
@@ -228,6 +260,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
228 unsigned long addr, unsigned long len, 260 unsigned long addr, unsigned long len,
229 unsigned long pgoff, unsigned long flags) 261 unsigned long pgoff, unsigned long flags)
230{ 262{
263 struct hstate *h = hstate_file(file);
231 struct mm_struct *mm = current->mm; 264 struct mm_struct *mm = current->mm;
232 struct vm_area_struct *vma; 265 struct vm_area_struct *vma;
233 unsigned long start_addr; 266 unsigned long start_addr;
@@ -240,7 +273,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
240 } 273 }
241 274
242full_search: 275full_search:
243 addr = ALIGN(start_addr, HPAGE_SIZE); 276 addr = ALIGN(start_addr, huge_page_size(h));
244 277
245 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 278 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
246 /* At this point: (!vma || addr < vma->vm_end). */ 279 /* At this point: (!vma || addr < vma->vm_end). */
@@ -262,7 +295,7 @@ full_search:
262 } 295 }
263 if (addr + mm->cached_hole_size < vma->vm_start) 296 if (addr + mm->cached_hole_size < vma->vm_start)
264 mm->cached_hole_size = vma->vm_start - addr; 297 mm->cached_hole_size = vma->vm_start - addr;
265 addr = ALIGN(vma->vm_end, HPAGE_SIZE); 298 addr = ALIGN(vma->vm_end, huge_page_size(h));
266 } 299 }
267} 300}
268 301
@@ -270,6 +303,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
270 unsigned long addr0, unsigned long len, 303 unsigned long addr0, unsigned long len,
271 unsigned long pgoff, unsigned long flags) 304 unsigned long pgoff, unsigned long flags)
272{ 305{
306 struct hstate *h = hstate_file(file);
273 struct mm_struct *mm = current->mm; 307 struct mm_struct *mm = current->mm;
274 struct vm_area_struct *vma, *prev_vma; 308 struct vm_area_struct *vma, *prev_vma;
275 unsigned long base = mm->mmap_base, addr = addr0; 309 unsigned long base = mm->mmap_base, addr = addr0;
@@ -290,7 +324,7 @@ try_again:
290 goto fail; 324 goto fail;
291 325
292 /* either no address requested or cant fit in requested address hole */ 326 /* either no address requested or cant fit in requested address hole */
293 addr = (mm->free_area_cache - len) & HPAGE_MASK; 327 addr = (mm->free_area_cache - len) & huge_page_mask(h);
294 do { 328 do {
295 /* 329 /*
296 * Lookup failure means no vma is above this address, 330 * Lookup failure means no vma is above this address,
@@ -321,7 +355,7 @@ try_again:
321 largest_hole = vma->vm_start - addr; 355 largest_hole = vma->vm_start - addr;
322 356
323 /* try just below the current vma->vm_start */ 357 /* try just below the current vma->vm_start */
324 addr = (vma->vm_start - len) & HPAGE_MASK; 358 addr = (vma->vm_start - len) & huge_page_mask(h);
325 } while (len <= vma->vm_start); 359 } while (len <= vma->vm_start);
326 360
327fail: 361fail:
@@ -359,22 +393,23 @@ unsigned long
359hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 393hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
360 unsigned long len, unsigned long pgoff, unsigned long flags) 394 unsigned long len, unsigned long pgoff, unsigned long flags)
361{ 395{
396 struct hstate *h = hstate_file(file);
362 struct mm_struct *mm = current->mm; 397 struct mm_struct *mm = current->mm;
363 struct vm_area_struct *vma; 398 struct vm_area_struct *vma;
364 399
365 if (len & ~HPAGE_MASK) 400 if (len & ~huge_page_mask(h))
366 return -EINVAL; 401 return -EINVAL;
367 if (len > TASK_SIZE) 402 if (len > TASK_SIZE)
368 return -ENOMEM; 403 return -ENOMEM;
369 404
370 if (flags & MAP_FIXED) { 405 if (flags & MAP_FIXED) {
371 if (prepare_hugepage_range(addr, len)) 406 if (prepare_hugepage_range(file, addr, len))
372 return -EINVAL; 407 return -EINVAL;
373 return addr; 408 return addr;
374 } 409 }
375 410
376 if (addr) { 411 if (addr) {
377 addr = ALIGN(addr, HPAGE_SIZE); 412 addr = ALIGN(addr, huge_page_size(h));
378 vma = find_vma(mm, addr); 413 vma = find_vma(mm, addr);
379 if (TASK_SIZE - len >= addr && 414 if (TASK_SIZE - len >= addr &&
380 (!vma || addr + len <= vma->vm_start)) 415 (!vma || addr + len <= vma->vm_start))
@@ -390,3 +425,20 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
390 425
391#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ 426#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
392 427
428#ifdef CONFIG_X86_64
429static __init int setup_hugepagesz(char *opt)
430{
431 unsigned long ps = memparse(opt, &opt);
432 if (ps == PMD_SIZE) {
433 hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
434 } else if (ps == PUD_SIZE && cpu_has_gbpages) {
435 hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
436 } else {
437 printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
438 ps >> 20);
439 return 0;
440 }
441 return 1;
442}
443__setup("hugepagesz=", setup_hugepagesz);
444#endif
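
The hstate conversion above replaces the fixed HPAGE_SIZE/HPAGE_MASK rounding with huge_page_size()/huge_page_mask(), so the same unmapped-area code now handles both 2 MB and 1 GB pages. A small userspace illustration of that alignment step; ALIGN() is re-defined locally, and the sizes and sample address are made up:

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long addr = 0x7f3a12345000UL;
        unsigned long pmd_size = 2UL << 20;     /* 2 MB huge page */
        unsigned long pud_size = 1UL << 30;     /* 1 GB huge page */

        printf("2M-aligned: 0x%lx\n", ALIGN(addr, pmd_size));
        printf("1G-aligned: 0x%lx\n", ALIGN(addr, pud_size));
        return 0;
}
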
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ec30d10154b6..4974e97dedfe 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -47,9 +47,11 @@
47#include <asm/paravirt.h> 47#include <asm/paravirt.h>
48#include <asm/setup.h> 48#include <asm/setup.h>
49#include <asm/cacheflush.h> 49#include <asm/cacheflush.h>
50#include <asm/smp.h>
50 51
51unsigned int __VMALLOC_RESERVE = 128 << 20; 52unsigned int __VMALLOC_RESERVE = 128 << 20;
52 53
54unsigned long max_low_pfn_mapped;
53unsigned long max_pfn_mapped; 55unsigned long max_pfn_mapped;
54 56
55DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 57DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -57,6 +59,27 @@ unsigned long highstart_pfn, highend_pfn;
57 59
58static noinline int do_test_wp_bit(void); 60static noinline int do_test_wp_bit(void);
59 61
62
63static unsigned long __initdata table_start;
64static unsigned long __meminitdata table_end;
65static unsigned long __meminitdata table_top;
66
67static int __initdata after_init_bootmem;
68
69static __init void *alloc_low_page(unsigned long *phys)
70{
71 unsigned long pfn = table_end++;
72 void *adr;
73
74 if (pfn >= table_top)
75 panic("alloc_low_page: ran out of memory");
76
77 adr = __va(pfn * PAGE_SIZE);
78 memset(adr, 0, PAGE_SIZE);
79 *phys = pfn * PAGE_SIZE;
80 return adr;
81}
82
60/* 83/*
61 * Creates a middle page table and puts a pointer to it in the 84 * Creates a middle page table and puts a pointer to it in the
62 * given global directory entry. This only returns the gd entry 85 * given global directory entry. This only returns the gd entry
@@ -68,9 +91,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
68 pmd_t *pmd_table; 91 pmd_t *pmd_table;
69 92
70#ifdef CONFIG_X86_PAE 93#ifdef CONFIG_X86_PAE
94 unsigned long phys;
71 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 95 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
72 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); 96 if (after_init_bootmem)
73 97 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
98 else
99 pmd_table = (pmd_t *)alloc_low_page(&phys);
74 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); 100 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
75 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 101 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
76 pud = pud_offset(pgd, 0); 102 pud = pud_offset(pgd, 0);
@@ -92,12 +118,16 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
92 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { 118 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
93 pte_t *page_table = NULL; 119 pte_t *page_table = NULL;
94 120
121 if (after_init_bootmem) {
95#ifdef CONFIG_DEBUG_PAGEALLOC 122#ifdef CONFIG_DEBUG_PAGEALLOC
96 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 123 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
97#endif 124#endif
98 if (!page_table) { 125 if (!page_table)
99 page_table = 126 page_table =
100 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); 127 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
128 } else {
129 unsigned long phys;
130 page_table = (pte_t *)alloc_low_page(&phys);
101 } 131 }
102 132
103 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); 133 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
@@ -155,38 +185,44 @@ static inline int is_kernel_text(unsigned long addr)
155 * of max_low_pfn pages, by creating page tables starting from address 185 * of max_low_pfn pages, by creating page tables starting from address
156 * PAGE_OFFSET: 186 * PAGE_OFFSET:
157 */ 187 */
158static void __init kernel_physical_mapping_init(pgd_t *pgd_base) 188static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
189 unsigned long start_pfn,
190 unsigned long end_pfn,
191 int use_pse)
159{ 192{
160 int pgd_idx, pmd_idx, pte_ofs; 193 int pgd_idx, pmd_idx, pte_ofs;
161 unsigned long pfn; 194 unsigned long pfn;
162 pgd_t *pgd; 195 pgd_t *pgd;
163 pmd_t *pmd; 196 pmd_t *pmd;
164 pte_t *pte; 197 pte_t *pte;
198 unsigned pages_2m = 0, pages_4k = 0;
165 199
166 pgd_idx = pgd_index(PAGE_OFFSET); 200 if (!cpu_has_pse)
167 pgd = pgd_base + pgd_idx; 201 use_pse = 0;
168 pfn = 0;
169 202
203 pfn = start_pfn;
204 pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
205 pgd = pgd_base + pgd_idx;
170 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { 206 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
171 pmd = one_md_table_init(pgd); 207 pmd = one_md_table_init(pgd);
172 if (pfn >= max_low_pfn)
173 continue;
174 208
175 for (pmd_idx = 0; 209 if (pfn >= end_pfn)
176 pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; 210 continue;
211#ifdef CONFIG_X86_PAE
212 pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
213 pmd += pmd_idx;
214#else
215 pmd_idx = 0;
216#endif
217 for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
177 pmd++, pmd_idx++) { 218 pmd++, pmd_idx++) {
178 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; 219 unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
179 220
180 /* 221 /*
181 * Map with big pages if possible, otherwise 222 * Map with big pages if possible, otherwise
182 * create normal page tables: 223 * create normal page tables:
183 *
184 * Don't use a large page for the first 2/4MB of memory
185 * because there are often fixed size MTRRs in there
186 * and overlapping MTRRs into large pages can cause
187 * slowdowns.
188 */ 224 */
189 if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) { 225 if (use_pse) {
190 unsigned int addr2; 226 unsigned int addr2;
191 pgprot_t prot = PAGE_KERNEL_LARGE; 227 pgprot_t prot = PAGE_KERNEL_LARGE;
192 228
@@ -197,34 +233,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
197 is_kernel_text(addr2)) 233 is_kernel_text(addr2))
198 prot = PAGE_KERNEL_LARGE_EXEC; 234 prot = PAGE_KERNEL_LARGE_EXEC;
199 235
236 pages_2m++;
200 set_pmd(pmd, pfn_pmd(pfn, prot)); 237 set_pmd(pmd, pfn_pmd(pfn, prot));
201 238
202 pfn += PTRS_PER_PTE; 239 pfn += PTRS_PER_PTE;
203 max_pfn_mapped = pfn;
204 continue; 240 continue;
205 } 241 }
206 pte = one_page_table_init(pmd); 242 pte = one_page_table_init(pmd);
207 243
208 for (pte_ofs = 0; 244 pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
209 pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; 245 pte += pte_ofs;
246 for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
210 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { 247 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
211 pgprot_t prot = PAGE_KERNEL; 248 pgprot_t prot = PAGE_KERNEL;
212 249
213 if (is_kernel_text(addr)) 250 if (is_kernel_text(addr))
214 prot = PAGE_KERNEL_EXEC; 251 prot = PAGE_KERNEL_EXEC;
215 252
253 pages_4k++;
216 set_pte(pte, pfn_pte(pfn, prot)); 254 set_pte(pte, pfn_pte(pfn, prot));
217 } 255 }
218 max_pfn_mapped = pfn;
219 } 256 }
220 } 257 }
221} 258 update_page_count(PG_LEVEL_2M, pages_2m);
222 259 update_page_count(PG_LEVEL_4K, pages_4k);
223static inline int page_kills_ppro(unsigned long pagenr)
224{
225 if (pagenr >= 0x70000 && pagenr <= 0x7003F)
226 return 1;
227 return 0;
228} 260}
229 261
230/* 262/*
@@ -287,29 +319,62 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
287 pkmap_page_table = pte; 319 pkmap_page_table = pte;
288} 320}
289 321
290void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) 322static void __init add_one_highpage_init(struct page *page, int pfn)
291{ 323{
292 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { 324 ClearPageReserved(page);
293 ClearPageReserved(page); 325 init_page_count(page);
294 init_page_count(page); 326 __free_page(page);
295 __free_page(page); 327 totalhigh_pages++;
296 totalhigh_pages++;
297 } else
298 SetPageReserved(page);
299} 328}
300 329
301#ifndef CONFIG_NUMA 330struct add_highpages_data {
302static void __init set_highmem_pages_init(int bad_ppro) 331 unsigned long start_pfn;
332 unsigned long end_pfn;
333};
334
335static int __init add_highpages_work_fn(unsigned long start_pfn,
336 unsigned long end_pfn, void *datax)
303{ 337{
304 int pfn; 338 int node_pfn;
339 struct page *page;
340 unsigned long final_start_pfn, final_end_pfn;
341 struct add_highpages_data *data;
305 342
306 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { 343 data = (struct add_highpages_data *)datax;
307 /* 344
308 * Holes under sparsemem might not have no mem_map[]: 345 final_start_pfn = max(start_pfn, data->start_pfn);
309 */ 346 final_end_pfn = min(end_pfn, data->end_pfn);
310 if (pfn_valid(pfn)) 347 if (final_start_pfn >= final_end_pfn)
311 add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); 348 return 0;
349
350 for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
351 node_pfn++) {
352 if (!pfn_valid(node_pfn))
353 continue;
354 page = pfn_to_page(node_pfn);
355 add_one_highpage_init(page, node_pfn);
312 } 356 }
357
358 return 0;
359
360}
361
362void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
363 unsigned long end_pfn)
364{
365 struct add_highpages_data data;
366
367 data.start_pfn = start_pfn;
368 data.end_pfn = end_pfn;
369
370 work_with_active_regions(nid, add_highpages_work_fn, &data);
371}
372
373#ifndef CONFIG_NUMA
374static void __init set_highmem_pages_init(void)
375{
376 add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
377
313 totalram_pages += totalhigh_pages; 378 totalram_pages += totalhigh_pages;
314} 379}
315#endif /* !CONFIG_NUMA */ 380#endif /* !CONFIG_NUMA */
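For reference, the clamping that add_highpages_work_fn() performs on each active region can be modelled in a few lines of user-space C. This is only a sketch with illustrative names, not kernel API; regions that do not intersect the highmem window are simply skipped, which is why the callback returns 0 early.

#include <stdio.h>

struct pfn_range { unsigned long start, end; };	/* [start, end) */

/* Intersect an active region with the requested window, as the
 * work function above does with max()/min(); empty result -> skip. */
static int clamp_range(struct pfn_range region, struct pfn_range window,
		       struct pfn_range *out)
{
	out->start = region.start > window.start ? region.start : window.start;
	out->end   = region.end   < window.end   ? region.end   : window.end;
	return out->start < out->end;
}

int main(void)
{
	struct pfn_range region = { 0x30000, 0x80000 };   /* active region */
	struct pfn_range window = { 0x38000, 0x100000 };  /* highstart..highend */
	struct pfn_range hit;

	if (clamp_range(region, window, &hit))
		printf("free pfns %#lx - %#lx\n", hit.start, hit.end);
	else
		printf("no overlap\n");
	return 0;
}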
@@ -317,14 +382,9 @@ static void __init set_highmem_pages_init(int bad_ppro)
317#else 382#else
318# define kmap_init() do { } while (0) 383# define kmap_init() do { } while (0)
319# define permanent_kmaps_init(pgd_base) do { } while (0) 384# define permanent_kmaps_init(pgd_base) do { } while (0)
320# define set_highmem_pages_init(bad_ppro) do { } while (0) 385# define set_highmem_pages_init() do { } while (0)
321#endif /* CONFIG_HIGHMEM */ 386#endif /* CONFIG_HIGHMEM */
322 387
323pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
324EXPORT_SYMBOL(__PAGE_KERNEL);
325
326pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
327
328void __init native_pagetable_setup_start(pgd_t *base) 388void __init native_pagetable_setup_start(pgd_t *base)
329{ 389{
330 unsigned long pfn, va; 390 unsigned long pfn, va;
@@ -380,27 +440,10 @@ void __init native_pagetable_setup_done(pgd_t *base)
380 * be partially populated, and so it avoids stomping on any existing 440 * be partially populated, and so it avoids stomping on any existing
381 * mappings. 441 * mappings.
382 */ 442 */
383static void __init pagetable_init(void) 443static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
384{ 444{
385 pgd_t *pgd_base = swapper_pg_dir;
386 unsigned long vaddr, end; 445 unsigned long vaddr, end;
387 446
388 paravirt_pagetable_setup_start(pgd_base);
389
390 /* Enable PSE if available */
391 if (cpu_has_pse)
392 set_in_cr4(X86_CR4_PSE);
393
394 /* Enable PGE if available */
395 if (cpu_has_pge) {
396 set_in_cr4(X86_CR4_PGE);
397 __PAGE_KERNEL |= _PAGE_GLOBAL;
398 __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
399 }
400
401 kernel_physical_mapping_init(pgd_base);
402 remap_numa_kva();
403
404 /* 447 /*
405 * Fixed mappings, only the page table structure has to be 448 * Fixed mappings, only the page table structure has to be
406 * created - mappings will be set by set_fixmap(): 449 * created - mappings will be set by set_fixmap():
@@ -410,6 +453,13 @@ static void __init pagetable_init(void)
410 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; 453 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
411 page_table_range_init(vaddr, end, pgd_base); 454 page_table_range_init(vaddr, end, pgd_base);
412 early_ioremap_reset(); 455 early_ioremap_reset();
456}
457
458static void __init pagetable_init(void)
459{
460 pgd_t *pgd_base = swapper_pg_dir;
461
462 paravirt_pagetable_setup_start(pgd_base);
413 463
414 permanent_kmaps_init(pgd_base); 464 permanent_kmaps_init(pgd_base);
415 465
@@ -456,7 +506,7 @@ void zap_low_mappings(void)
456 506
457int nx_enabled; 507int nx_enabled;
458 508
459pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX; 509pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
460EXPORT_SYMBOL_GPL(__supported_pte_mask); 510EXPORT_SYMBOL_GPL(__supported_pte_mask);
461 511
462#ifdef CONFIG_X86_PAE 512#ifdef CONFIG_X86_PAE
@@ -509,27 +559,319 @@ static void __init set_nx(void)
509} 559}
510#endif 560#endif
511 561
562/* user-defined highmem size */
563static unsigned int highmem_pages = -1;
564
512/* 565/*
513 * paging_init() sets up the page tables - note that the first 8MB are 566 * highmem=size forces highmem to be exactly 'size' bytes.
514 * already mapped by head.S. 567 * This works even on boxes that have no highmem otherwise.
515 * 568 * This also works to reduce highmem size on bigger boxes.
516 * This routines also unmaps the page at virtual kernel address 0, so
517 * that we can trap those pesky NULL-reference errors in the kernel.
518 */ 569 */
519void __init paging_init(void) 570static int __init parse_highmem(char *arg)
571{
572 if (!arg)
573 return -EINVAL;
574
575 highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
576 return 0;
577}
578early_param("highmem", parse_highmem);
579
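The highmem= option takes a byte count (memparse() understands K/M/G suffixes) and stores it as a page count. Below is a stand-alone sketch of that conversion, assuming 4 KiB pages; parse_size() is illustrative and not the kernel's memparse().

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12	/* assumed 4 KiB pages */

/* Accept "<number>[KMG]" the way the highmem= boot option does. */
static unsigned long parse_size(const char *arg)
{
	char *end;
	unsigned long val = strtoul(arg, &end, 0);
	unsigned long mult = 1;

	switch (*end) {
	case 'G': case 'g': mult = 1UL << 30; break;
	case 'M': case 'm': mult = 1UL << 20; break;
	case 'K': case 'k': mult = 1UL << 10; break;
	}
	return val * mult;
}

int main(void)
{
	const char *arg = "512M";
	unsigned long pages = parse_size(arg) >> PAGE_SHIFT;

	printf("highmem=%s -> %lu pages\n", arg, pages);	/* 131072 */
	return 0;
}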
580/*
581 * Determine low and high memory ranges:
582 */
583void __init find_low_pfn_range(void)
584{
 585 /* this may update (lower) max_pfn */
586
587 /* max_low_pfn is 0, we already have early_res support */
588
589 max_low_pfn = max_pfn;
590 if (max_low_pfn > MAXMEM_PFN) {
591 if (highmem_pages == -1)
592 highmem_pages = max_pfn - MAXMEM_PFN;
593 if (highmem_pages + MAXMEM_PFN < max_pfn)
594 max_pfn = MAXMEM_PFN + highmem_pages;
595 if (highmem_pages + MAXMEM_PFN > max_pfn) {
596 printk(KERN_WARNING "only %luMB highmem pages "
597 "available, ignoring highmem size of %uMB.\n",
598 pages_to_mb(max_pfn - MAXMEM_PFN),
599 pages_to_mb(highmem_pages));
600 highmem_pages = 0;
601 }
602 max_low_pfn = MAXMEM_PFN;
603#ifndef CONFIG_HIGHMEM
604 /* Maximum memory usable is what is directly addressable */
605 printk(KERN_WARNING "Warning only %ldMB will be used.\n",
606 MAXMEM>>20);
607 if (max_pfn > MAX_NONPAE_PFN)
608 printk(KERN_WARNING
609 "Use a HIGHMEM64G enabled kernel.\n");
610 else
611 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
612 max_pfn = MAXMEM_PFN;
613#else /* !CONFIG_HIGHMEM */
614#ifndef CONFIG_HIGHMEM64G
615 if (max_pfn > MAX_NONPAE_PFN) {
616 max_pfn = MAX_NONPAE_PFN;
617 printk(KERN_WARNING "Warning only 4GB will be used."
618 "Use a HIGHMEM64G enabled kernel.\n");
619 }
620#endif /* !CONFIG_HIGHMEM64G */
621#endif /* !CONFIG_HIGHMEM */
622 } else {
623 if (highmem_pages == -1)
624 highmem_pages = 0;
625#ifdef CONFIG_HIGHMEM
626 if (highmem_pages >= max_pfn) {
627 printk(KERN_ERR "highmem size specified (%uMB) is "
628 "bigger than pages available (%luMB)!.\n",
629 pages_to_mb(highmem_pages),
630 pages_to_mb(max_pfn));
631 highmem_pages = 0;
632 }
633 if (highmem_pages) {
634 if (max_low_pfn - highmem_pages <
635 64*1024*1024/PAGE_SIZE){
636 printk(KERN_ERR "highmem size %uMB results in "
637 "smaller than 64MB lowmem, ignoring it.\n"
638 , pages_to_mb(highmem_pages));
639 highmem_pages = 0;
640 }
641 max_low_pfn -= highmem_pages;
642 }
643#else
644 if (highmem_pages)
645 printk(KERN_ERR "ignoring highmem size on non-highmem"
646 " kernel!\n");
647#endif
648 }
649}
650
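A simplified model of the split computed by find_low_pfn_range() above, leaving out the PAE/HIGHMEM64G warnings. The MAXMEM_PFN value below is an assumption corresponding to roughly 896 MiB of lowmem; the real constant depends on the kernel configuration.

#include <stdio.h>

#define PAGE_SHIFT	12
#define MAXMEM_PFN	(896UL << (20 - PAGE_SHIFT))	/* assumed ~896 MiB */

int main(void)
{
	unsigned long max_pfn = 2048UL << (20 - PAGE_SHIFT);	/* 2 GiB box */
	unsigned long highmem_pages = -1UL;			/* no highmem= given */
	unsigned long max_low_pfn = max_pfn;

	if (max_low_pfn > MAXMEM_PFN) {
		if (highmem_pages == -1UL)
			highmem_pages = max_pfn - MAXMEM_PFN;
		max_low_pfn = MAXMEM_PFN;
	} else if (highmem_pages == -1UL) {
		highmem_pages = 0;
	}

	printf("lowmem: %lu MiB, highmem: %lu MiB\n",
	       max_low_pfn >> (20 - PAGE_SHIFT),
	       highmem_pages >> (20 - PAGE_SHIFT));	/* 896 / 1152 */
	return 0;
}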
651#ifndef CONFIG_NEED_MULTIPLE_NODES
652void __init initmem_init(unsigned long start_pfn,
653 unsigned long end_pfn)
520{ 654{
655#ifdef CONFIG_HIGHMEM
656 highstart_pfn = highend_pfn = max_pfn;
657 if (max_pfn > max_low_pfn)
658 highstart_pfn = max_low_pfn;
659 memory_present(0, 0, highend_pfn);
660 e820_register_active_regions(0, 0, highend_pfn);
661 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
662 pages_to_mb(highend_pfn - highstart_pfn));
663 num_physpages = highend_pfn;
664 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
665#else
666 memory_present(0, 0, max_low_pfn);
667 e820_register_active_regions(0, 0, max_low_pfn);
668 num_physpages = max_low_pfn;
669 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
670#endif
671#ifdef CONFIG_FLATMEM
672 max_mapnr = num_physpages;
673#endif
674 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
675 pages_to_mb(max_low_pfn));
676
677 setup_bootmem_allocator();
678}
679#endif /* !CONFIG_NEED_MULTIPLE_NODES */
680
681static void __init zone_sizes_init(void)
682{
683 unsigned long max_zone_pfns[MAX_NR_ZONES];
684 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
685 max_zone_pfns[ZONE_DMA] =
686 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
687 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
688#ifdef CONFIG_HIGHMEM
689 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
690#endif
691
692 free_area_init_nodes(max_zone_pfns);
693}
694
695void __init setup_bootmem_allocator(void)
696{
697 int i;
698 unsigned long bootmap_size, bootmap;
699 /*
700 * Initialize the boot-time allocator (with low memory only):
701 */
702 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
703 bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
704 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
705 PAGE_SIZE);
706 if (bootmap == -1L)
707 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
708 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
709
710 /* don't touch min_low_pfn */
711 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
712 min_low_pfn, max_low_pfn);
713 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
714 max_pfn_mapped<<PAGE_SHIFT);
715 printk(KERN_INFO " low ram: %08lx - %08lx\n",
716 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
717 printk(KERN_INFO " bootmap %08lx - %08lx\n",
718 bootmap, bootmap + bootmap_size);
719 for_each_online_node(i)
720 free_bootmem_with_active_regions(i, max_low_pfn);
721 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
722
723 after_init_bootmem = 1;
724}
725
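bootmem_bootmap_pages() sizes a bitmap with one bit per page of the range, rounded up to whole pages. A quick sketch of that arithmetic; the helper name below is illustrative.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/* One bit per page, rounded up to whole bitmap pages. */
static unsigned long bootmap_pages(unsigned long pages)
{
	unsigned long bytes = (pages + 7) / 8;

	return (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
	unsigned long max_low_pfn = 229376;	/* ~896 MiB of lowmem */

	printf("bootmap: %lu pages (%lu bytes)\n",
	       bootmap_pages(max_low_pfn),
	       bootmap_pages(max_low_pfn) << PAGE_SHIFT);	/* 7 pages */
	return 0;
}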
726static void __init find_early_table_space(unsigned long end)
727{
728 unsigned long puds, pmds, ptes, tables, start;
729
730 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
731 tables = PAGE_ALIGN(puds * sizeof(pud_t));
732
733 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
734 tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
735
736 if (cpu_has_pse) {
737 unsigned long extra;
738
739 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
740 extra += PMD_SIZE;
741 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
742 } else
743 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
744
745 tables += PAGE_ALIGN(ptes * sizeof(pte_t));
746
747 /* for fixmap */
748 tables += PAGE_SIZE * 2;
749
750 /*
751 * RED-PEN putting page tables only on node 0 could
752 * cause a hotspot and fill up ZONE_DMA. The page tables
753 * need roughly 0.5KB per GB.
754 */
755 start = 0x7000;
756 table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
757 tables, PAGE_SIZE);
758 if (table_start == -1UL)
759 panic("Cannot find space for the kernel page tables");
760
761 table_start >>= PAGE_SHIFT;
762 table_end = table_start;
763 table_top = table_start + (tables>>PAGE_SHIFT);
764
765 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
766 end, table_start << PAGE_SHIFT,
767 (table_start << PAGE_SHIFT) + tables);
768}
769
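The estimate in find_early_table_space() can be reproduced in user space. The sketch below assumes 8-byte (PAE-sized) entries and a 2 MiB big-page size; both are assumptions, the kernel derives them from the configuration.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PMD_SHIFT	21	/* 2 MiB, PAE layout assumed */
#define PUD_SHIFT	30
#define ENTRY_SIZE	8	/* 8-byte PAE entries assumed */

#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long end = 896UL << 20;	/* map the first 896 MiB */
	int use_pse = 1;
	unsigned long puds, pmds, ptes, tables;

	puds = (end + (1UL << PUD_SHIFT) - 1) >> PUD_SHIFT;
	tables = PAGE_ALIGN(puds * ENTRY_SIZE);

	pmds = (end + (1UL << PMD_SHIFT) - 1) >> PMD_SHIFT;
	tables += PAGE_ALIGN(pmds * ENTRY_SIZE);

	if (use_pse) {
		/* only the head/tail that cannot use 2 MiB pages needs PTEs */
		unsigned long extra = end - ((end >> PMD_SHIFT) << PMD_SHIFT);

		extra += 1UL << PMD_SHIFT;
		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
	} else {
		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	}
	tables += PAGE_ALIGN(ptes * ENTRY_SIZE);
	tables += PAGE_SIZE * 2;	/* slack for the fixmap tables */

	printf("~%lu KiB of early page tables for %lu MiB\n",
	       tables >> 10, end >> 20);	/* ~20 KiB for 896 MiB */
	return 0;
}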
770unsigned long __init_refok init_memory_mapping(unsigned long start,
771 unsigned long end)
772{
773 pgd_t *pgd_base = swapper_pg_dir;
774 unsigned long start_pfn, end_pfn;
775 unsigned long big_page_start;
776
777 /*
778 * Find space for the kernel direct mapping tables.
779 */
780 if (!after_init_bootmem)
781 find_early_table_space(end);
782
521#ifdef CONFIG_X86_PAE 783#ifdef CONFIG_X86_PAE
522 set_nx(); 784 set_nx();
523 if (nx_enabled) 785 if (nx_enabled)
524 printk(KERN_INFO "NX (Execute Disable) protection: active\n"); 786 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
525#endif 787#endif
526 pagetable_init(); 788
789 /* Enable PSE if available */
790 if (cpu_has_pse)
791 set_in_cr4(X86_CR4_PSE);
792
793 /* Enable PGE if available */
794 if (cpu_has_pge) {
795 set_in_cr4(X86_CR4_PGE);
796 __supported_pte_mask |= _PAGE_GLOBAL;
797 }
798
799 /*
800 * Don't use a large page for the first 2/4MB of memory
801 * because there are often fixed size MTRRs in there
802 * and overlapping MTRRs into large pages can cause
803 * slowdowns.
804 */
805 big_page_start = PMD_SIZE;
806
807 if (start < big_page_start) {
808 start_pfn = start >> PAGE_SHIFT;
809 end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
810 } else {
 811 /* head is not big page aligned? */
812 start_pfn = start >> PAGE_SHIFT;
813 end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
814 << (PMD_SHIFT - PAGE_SHIFT);
815 }
816 if (start_pfn < end_pfn)
817 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
818
819 /* big page range */
820 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
821 << (PMD_SHIFT - PAGE_SHIFT);
822 if (start_pfn < (big_page_start >> PAGE_SHIFT))
823 start_pfn = big_page_start >> PAGE_SHIFT;
824 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
825 if (start_pfn < end_pfn)
826 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
827 cpu_has_pse);
828
 829 /* tail is not big page aligned? */
830 start_pfn = end_pfn;
831 if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
832 end_pfn = end >> PAGE_SHIFT;
833 if (start_pfn < end_pfn)
834 kernel_physical_mapping_init(pgd_base, start_pfn,
835 end_pfn, 0);
836 }
837
838 early_ioremap_page_table_range_init(pgd_base);
527 839
528 load_cr3(swapper_pg_dir); 840 load_cr3(swapper_pg_dir);
529 841
530 __flush_tlb_all(); 842 __flush_tlb_all();
531 843
844 if (!after_init_bootmem)
845 reserve_early(table_start << PAGE_SHIFT,
846 table_end << PAGE_SHIFT, "PGTABLE");
847
848 if (!after_init_bootmem)
849 early_memtest(start, end);
850
851 return end >> PAGE_SHIFT;
852}
853
854
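The three-way split done above keeps the first 2/4 MiB and any unaligned tail on 4 KiB pages while the middle uses big pages. A user-space sketch of how a sample range is carved up, assuming a 2 MiB big-page size:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21			/* assumed 2 MiB big pages */
#define PMD_SIZE	(1UL << PMD_SHIFT)

static void show(const char *what, unsigned long s, unsigned long e)
{
	if (s < e)
		printf("%-9s %#010lx - %#010lx\n", what,
		       s << PAGE_SHIFT, e << PAGE_SHIFT);
}

int main(void)
{
	unsigned long start = 0, end = 0x37ff0000;	/* ~896 MiB, unaligned end */
	unsigned long big_page_start = PMD_SIZE;
	unsigned long s, e;

	/* 4k head: below big_page_start, or up to the next 2 MiB boundary */
	s = start >> PAGE_SHIFT;
	if (start < big_page_start)
		e = big_page_start >> PAGE_SHIFT;
	else
		e = ((start + PMD_SIZE - 1) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	if (e > (end >> PAGE_SHIFT))
		e = end >> PAGE_SHIFT;
	show("4k head:", s, e);

	/* 2M body: whole big pages between the head and the tail */
	s = e;
	if (s < (big_page_start >> PAGE_SHIFT))
		s = big_page_start >> PAGE_SHIFT;
	e = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	show("2M body:", s, e);

	/* 4k tail: whatever is left above the last 2 MiB boundary */
	s = e;
	if (s > (big_page_start >> PAGE_SHIFT)) {
		e = end >> PAGE_SHIFT;
		show("4k tail:", s, e);
	}
	return 0;
}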
855/*
856 * paging_init() sets up the page tables - note that the first 8MB are
857 * already mapped by head.S.
858 *
859 * This routines also unmaps the page at virtual kernel address 0, so
860 * that we can trap those pesky NULL-reference errors in the kernel.
861 */
862void __init paging_init(void)
863{
864 pagetable_init();
865
866 __flush_tlb_all();
867
532 kmap_init(); 868 kmap_init();
869
870 /*
871 * NOTE: at this point the bootmem allocator is fully available.
872 */
873 sparse_init();
874 zone_sizes_init();
533} 875}
534 876
535/* 877/*
@@ -564,24 +906,11 @@ static struct kcore_list kcore_mem, kcore_vmalloc;
564void __init mem_init(void) 906void __init mem_init(void)
565{ 907{
566 int codesize, reservedpages, datasize, initsize; 908 int codesize, reservedpages, datasize, initsize;
567 int tmp, bad_ppro; 909 int tmp;
568 910
569#ifdef CONFIG_FLATMEM 911#ifdef CONFIG_FLATMEM
570 BUG_ON(!mem_map); 912 BUG_ON(!mem_map);
571#endif 913#endif
572 bad_ppro = ppro_with_ram_bug();
573
574#ifdef CONFIG_HIGHMEM
575 /* check that fixmap and pkmap do not overlap */
576 if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
577 printk(KERN_ERR
578 "fixmap and kmap areas overlap - this will crash\n");
579 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
580 PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
581 FIXADDR_START);
582 BUG();
583 }
584#endif
585 /* this will put all low memory onto the freelists */ 914 /* this will put all low memory onto the freelists */
586 totalram_pages += free_all_bootmem(); 915 totalram_pages += free_all_bootmem();
587 916
@@ -593,7 +922,7 @@ void __init mem_init(void)
593 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) 922 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
594 reservedpages++; 923 reservedpages++;
595 924
596 set_highmem_pages_init(bad_ppro); 925 set_highmem_pages_init();
597 926
598 codesize = (unsigned long) &_etext - (unsigned long) &_text; 927 codesize = (unsigned long) &_etext - (unsigned long) &_text;
599 datasize = (unsigned long) &_edata - (unsigned long) &_etext; 928 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
@@ -614,7 +943,6 @@ void __init mem_init(void)
614 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) 943 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
615 ); 944 );
616 945
617#if 1 /* double-sanity-check paranoia */
618 printk(KERN_INFO "virtual kernel memory layout:\n" 946 printk(KERN_INFO "virtual kernel memory layout:\n"
619 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 947 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
620#ifdef CONFIG_HIGHMEM 948#ifdef CONFIG_HIGHMEM
@@ -655,7 +983,6 @@ void __init mem_init(void)
655#endif 983#endif
656 BUG_ON(VMALLOC_START > VMALLOC_END); 984 BUG_ON(VMALLOC_START > VMALLOC_END);
657 BUG_ON((unsigned long)high_memory > VMALLOC_START); 985 BUG_ON((unsigned long)high_memory > VMALLOC_START);
658#endif /* double-sanity-check paranoia */
659 986
660 if (boot_cpu_data.wp_works_ok < 0) 987 if (boot_cpu_data.wp_works_ok < 0)
661 test_wp_bit(); 988 test_wp_bit();
@@ -710,6 +1037,8 @@ void mark_rodata_ro(void)
710 unsigned long start = PFN_ALIGN(_text); 1037 unsigned long start = PFN_ALIGN(_text);
711 unsigned long size = PFN_ALIGN(_etext) - start; 1038 unsigned long size = PFN_ALIGN(_etext) - start;
712 1039
1040#ifndef CONFIG_DYNAMIC_FTRACE
1041 /* Dynamic tracing modifies the kernel text section */
713 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 1042 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
714 printk(KERN_INFO "Write protecting the kernel text: %luk\n", 1043 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
715 size >> 10); 1044 size >> 10);
@@ -722,6 +1051,8 @@ void mark_rodata_ro(void)
722 printk(KERN_INFO "Testing CPA: write protecting again\n"); 1051 printk(KERN_INFO "Testing CPA: write protecting again\n");
723 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); 1052 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
724#endif 1053#endif
1054#endif /* CONFIG_DYNAMIC_FTRACE */
1055
725 start += size; 1056 start += size;
726 size = (unsigned long)__end_rodata - start; 1057 size = (unsigned long)__end_rodata - start;
727 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 1058 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
@@ -784,3 +1115,9 @@ void free_initrd_mem(unsigned long start, unsigned long end)
784 free_init_pages("initrd memory", start, end); 1115 free_init_pages("initrd memory", start, end);
785} 1116}
786#endif 1117#endif
1118
1119int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1120 int flags)
1121{
1122 return reserve_bootmem(phys, len, flags);
1123}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 32ba13b0f818..d3746efb060d 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -18,6 +18,7 @@
18#include <linux/swap.h> 18#include <linux/swap.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/initrd.h>
21#include <linux/pagemap.h> 22#include <linux/pagemap.h>
22#include <linux/bootmem.h> 23#include <linux/bootmem.h>
23#include <linux/proc_fs.h> 24#include <linux/proc_fs.h>
@@ -47,11 +48,19 @@
47#include <asm/numa.h> 48#include <asm/numa.h>
48#include <asm/cacheflush.h> 49#include <asm/cacheflush.h>
49 50
51/*
52 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
53 * The direct mapping extends to max_pfn_mapped, so that we can directly access
54 * apertures, ACPI and other tables without having to play with fixmaps.
55 */
56unsigned long max_low_pfn_mapped;
57unsigned long max_pfn_mapped;
58
50static unsigned long dma_reserve __initdata; 59static unsigned long dma_reserve __initdata;
51 60
52DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 61DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
53 62
54int direct_gbpages __meminitdata 63int direct_gbpages
55#ifdef CONFIG_DIRECT_GBPAGES 64#ifdef CONFIG_DIRECT_GBPAGES
56 = 1 65 = 1
57#endif 66#endif
@@ -77,46 +86,13 @@ early_param("gbpages", parse_direct_gbpages_on);
77 * around without checking the pgd every time. 86 * around without checking the pgd every time.
78 */ 87 */
79 88
80void show_mem(void)
81{
82 long i, total = 0, reserved = 0;
83 long shared = 0, cached = 0;
84 struct page *page;
85 pg_data_t *pgdat;
86
87 printk(KERN_INFO "Mem-info:\n");
88 show_free_areas();
89 for_each_online_pgdat(pgdat) {
90 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
91 /*
92 * This loop can take a while with 256 GB and
93 * 4k pages so defer the NMI watchdog:
94 */
95 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
96 touch_nmi_watchdog();
97
98 if (!pfn_valid(pgdat->node_start_pfn + i))
99 continue;
100
101 page = pfn_to_page(pgdat->node_start_pfn + i);
102 total++;
103 if (PageReserved(page))
104 reserved++;
105 else if (PageSwapCache(page))
106 cached++;
107 else if (page_count(page))
108 shared += page_count(page) - 1;
109 }
110 }
111 printk(KERN_INFO "%lu pages of RAM\n", total);
112 printk(KERN_INFO "%lu reserved pages\n", reserved);
113 printk(KERN_INFO "%lu pages shared\n", shared);
114 printk(KERN_INFO "%lu pages swap cached\n", cached);
115}
116
117int after_bootmem; 89int after_bootmem;
118 90
119static __init void *spp_getpage(void) 91/*
92 * NOTE: This function is marked __ref because it calls __init function
93 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
94 */
95static __ref void *spp_getpage(void)
120{ 96{
121 void *ptr; 97 void *ptr;
122 98
@@ -135,26 +111,17 @@ static __init void *spp_getpage(void)
135 return ptr; 111 return ptr;
136} 112}
137 113
138static void 114void
139set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) 115set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
140{ 116{
141 pgd_t *pgd;
142 pud_t *pud; 117 pud_t *pud;
143 pmd_t *pmd; 118 pmd_t *pmd;
144 pte_t *pte, new_pte; 119 pte_t *pte;
145
146 pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
147 120
148 pgd = pgd_offset_k(vaddr); 121 pud = pud_page + pud_index(vaddr);
149 if (pgd_none(*pgd)) {
150 printk(KERN_ERR
151 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
152 return;
153 }
154 pud = pud_offset(pgd, vaddr);
155 if (pud_none(*pud)) { 122 if (pud_none(*pud)) {
156 pmd = (pmd_t *) spp_getpage(); 123 pmd = (pmd_t *) spp_getpage();
157 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); 124 pud_populate(&init_mm, pud, pmd);
158 if (pmd != pmd_offset(pud, 0)) { 125 if (pmd != pmd_offset(pud, 0)) {
159 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", 126 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
160 pmd, pmd_offset(pud, 0)); 127 pmd, pmd_offset(pud, 0));
@@ -164,13 +131,12 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
164 pmd = pmd_offset(pud, vaddr); 131 pmd = pmd_offset(pud, vaddr);
165 if (pmd_none(*pmd)) { 132 if (pmd_none(*pmd)) {
166 pte = (pte_t *) spp_getpage(); 133 pte = (pte_t *) spp_getpage();
167 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); 134 pmd_populate_kernel(&init_mm, pmd, pte);
168 if (pte != pte_offset_kernel(pmd, 0)) { 135 if (pte != pte_offset_kernel(pmd, 0)) {
169 printk(KERN_ERR "PAGETABLE BUG #02!\n"); 136 printk(KERN_ERR "PAGETABLE BUG #02!\n");
170 return; 137 return;
171 } 138 }
172 } 139 }
173 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
174 140
175 pte = pte_offset_kernel(pmd, vaddr); 141 pte = pte_offset_kernel(pmd, vaddr);
176 if (!pte_none(*pte) && pte_val(new_pte) && 142 if (!pte_none(*pte) && pte_val(new_pte) &&
@@ -185,6 +151,64 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
185 __flush_tlb_one(vaddr); 151 __flush_tlb_one(vaddr);
186} 152}
187 153
154void
155set_pte_vaddr(unsigned long vaddr, pte_t pteval)
156{
157 pgd_t *pgd;
158 pud_t *pud_page;
159
160 pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
161
162 pgd = pgd_offset_k(vaddr);
163 if (pgd_none(*pgd)) {
164 printk(KERN_ERR
165 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
166 return;
167 }
168 pud_page = (pud_t*)pgd_page_vaddr(*pgd);
169 set_pte_vaddr_pud(pud_page, vaddr, pteval);
170}
171
172/*
173 * Create large page table mappings for a range of physical addresses.
174 */
175static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
176 pgprot_t prot)
177{
178 pgd_t *pgd;
179 pud_t *pud;
180 pmd_t *pmd;
181
182 BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
183 for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
184 pgd = pgd_offset_k((unsigned long)__va(phys));
185 if (pgd_none(*pgd)) {
186 pud = (pud_t *) spp_getpage();
187 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
188 _PAGE_USER));
189 }
190 pud = pud_offset(pgd, (unsigned long)__va(phys));
191 if (pud_none(*pud)) {
192 pmd = (pmd_t *) spp_getpage();
193 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
194 _PAGE_USER));
195 }
196 pmd = pmd_offset(pud, phys);
197 BUG_ON(!pmd_none(*pmd));
198 set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
199 }
200}
201
202void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
203{
204 __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
205}
206
207void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
208{
209 __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
210}
211
188/* 212/*
189 * The head.S code sets up the kernel high mapping: 213 * The head.S code sets up the kernel high mapping:
190 * 214 *
@@ -206,29 +230,18 @@ void __init cleanup_highmap(void)
206 pmd_t *last_pmd = pmd + PTRS_PER_PMD; 230 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
207 231
208 for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { 232 for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
209 if (!pmd_present(*pmd)) 233 if (pmd_none(*pmd))
210 continue; 234 continue;
211 if (vaddr < (unsigned long) _text || vaddr > end) 235 if (vaddr < (unsigned long) _text || vaddr > end)
212 set_pmd(pmd, __pmd(0)); 236 set_pmd(pmd, __pmd(0));
213 } 237 }
214} 238}
215 239
216/* NOTE: this is meant to be run only at boot */
217void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
218{
219 unsigned long address = __fix_to_virt(idx);
220
221 if (idx >= __end_of_fixed_addresses) {
222 printk(KERN_ERR "Invalid __set_fixmap\n");
223 return;
224 }
225 set_pte_phys(address, phys, prot);
226}
227
228static unsigned long __initdata table_start; 240static unsigned long __initdata table_start;
229static unsigned long __meminitdata table_end; 241static unsigned long __meminitdata table_end;
242static unsigned long __meminitdata table_top;
230 243
231static __meminit void *alloc_low_page(unsigned long *phys) 244static __ref void *alloc_low_page(unsigned long *phys)
232{ 245{
233 unsigned long pfn = table_end++; 246 unsigned long pfn = table_end++;
234 void *adr; 247 void *adr;
@@ -240,7 +253,7 @@ static __meminit void *alloc_low_page(unsigned long *phys)
240 return adr; 253 return adr;
241 } 254 }
242 255
243 if (pfn >= end_pfn) 256 if (pfn >= table_top)
244 panic("alloc_low_page: ran out of memory"); 257 panic("alloc_low_page: ran out of memory");
245 258
246 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); 259 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -249,7 +262,7 @@ static __meminit void *alloc_low_page(unsigned long *phys)
249 return adr; 262 return adr;
250} 263}
251 264
252static __meminit void unmap_low_page(void *adr) 265static __ref void unmap_low_page(void *adr)
253{ 266{
254 if (after_bootmem) 267 if (after_bootmem)
255 return; 268 return;
@@ -257,65 +270,62 @@ static __meminit void unmap_low_page(void *adr)
257 early_iounmap(adr, PAGE_SIZE); 270 early_iounmap(adr, PAGE_SIZE);
258} 271}
259 272
260/* Must run before zap_low_mappings */ 273static unsigned long __meminit
261__meminit void *early_ioremap(unsigned long addr, unsigned long size) 274phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
262{ 275{
263 pmd_t *pmd, *last_pmd; 276 unsigned pages = 0;
264 unsigned long vaddr; 277 unsigned long last_map_addr = end;
265 int i, pmds; 278 int i;
279
280 pte_t *pte = pte_page + pte_index(addr);
266 281
267 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; 282 for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
268 vaddr = __START_KERNEL_map;
269 pmd = level2_kernel_pgt;
270 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
271 283
272 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { 284 if (addr >= end) {
273 for (i = 0; i < pmds; i++) { 285 if (!after_bootmem) {
274 if (pmd_present(pmd[i])) 286 for(; i < PTRS_PER_PTE; i++, pte++)
275 goto continue_outer_loop; 287 set_pte(pte, __pte(0));
288 }
289 break;
276 } 290 }
277 vaddr += addr & ~PMD_MASK;
278 addr &= PMD_MASK;
279 291
280 for (i = 0; i < pmds; i++, addr += PMD_SIZE) 292 if (pte_val(*pte))
281 set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); 293 continue;
282 __flush_tlb_all();
283 294
284 return (void *)vaddr; 295 if (0)
285continue_outer_loop: 296 printk(" pte=%p addr=%lx pte=%016lx\n",
286 ; 297 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
298 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
299 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
300 pages++;
287 } 301 }
288 printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size); 302 update_page_count(PG_LEVEL_4K, pages);
289 303
290 return NULL; 304 return last_map_addr;
291} 305}
292 306
293/* 307static unsigned long __meminit
294 * To avoid virtual aliases later: 308phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
295 */
296__meminit void early_iounmap(void *addr, unsigned long size)
297{ 309{
298 unsigned long vaddr; 310 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
299 pmd_t *pmd;
300 int i, pmds;
301
302 vaddr = (unsigned long)addr;
303 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
304 pmd = level2_kernel_pgt + pmd_index(vaddr);
305
306 for (i = 0; i < pmds; i++)
307 pmd_clear(pmd + i);
308 311
309 __flush_tlb_all(); 312 return phys_pte_init(pte, address, end);
310} 313}
311 314
312static unsigned long __meminit 315static unsigned long __meminit
313phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) 316phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
317 unsigned long page_size_mask)
314{ 318{
319 unsigned long pages = 0;
320 unsigned long last_map_addr = end;
321 unsigned long start = address;
322
315 int i = pmd_index(address); 323 int i = pmd_index(address);
316 324
317 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { 325 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
326 unsigned long pte_phys;
318 pmd_t *pmd = pmd_page + pmd_index(address); 327 pmd_t *pmd = pmd_page + pmd_index(address);
328 pte_t *pte;
319 329
320 if (address >= end) { 330 if (address >= end) {
321 if (!after_bootmem) { 331 if (!after_bootmem) {
@@ -325,31 +335,58 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
325 break; 335 break;
326 } 336 }
327 337
328 if (pmd_val(*pmd)) 338 if (pmd_val(*pmd)) {
339 if (!pmd_large(*pmd)) {
340 spin_lock(&init_mm.page_table_lock);
341 last_map_addr = phys_pte_update(pmd, address,
342 end);
343 spin_unlock(&init_mm.page_table_lock);
344 }
345 /* Count entries we're using from level2_ident_pgt */
346 if (start == 0)
347 pages++;
329 continue; 348 continue;
349 }
350
351 if (page_size_mask & (1<<PG_LEVEL_2M)) {
352 pages++;
353 spin_lock(&init_mm.page_table_lock);
354 set_pte((pte_t *)pmd,
355 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
356 spin_unlock(&init_mm.page_table_lock);
357 last_map_addr = (address & PMD_MASK) + PMD_SIZE;
358 continue;
359 }
330 360
331 set_pte((pte_t *)pmd, 361 pte = alloc_low_page(&pte_phys);
332 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 362 last_map_addr = phys_pte_init(pte, address, end);
363 unmap_low_page(pte);
364
365 spin_lock(&init_mm.page_table_lock);
366 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
367 spin_unlock(&init_mm.page_table_lock);
333 } 368 }
334 return address; 369 update_page_count(PG_LEVEL_2M, pages);
370 return last_map_addr;
335} 371}
336 372
337static unsigned long __meminit 373static unsigned long __meminit
338phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) 374phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
375 unsigned long page_size_mask)
339{ 376{
340 pmd_t *pmd = pmd_offset(pud, 0); 377 pmd_t *pmd = pmd_offset(pud, 0);
341 unsigned long last_map_addr; 378 unsigned long last_map_addr;
342 379
343 spin_lock(&init_mm.page_table_lock); 380 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
344 last_map_addr = phys_pmd_init(pmd, address, end);
345 spin_unlock(&init_mm.page_table_lock);
346 __flush_tlb_all(); 381 __flush_tlb_all();
347 return last_map_addr; 382 return last_map_addr;
348} 383}
349 384
350static unsigned long __meminit 385static unsigned long __meminit
351phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) 386phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
387 unsigned long page_size_mask)
352{ 388{
389 unsigned long pages = 0;
353 unsigned long last_map_addr = end; 390 unsigned long last_map_addr = end;
354 int i = pud_index(addr); 391 int i = pud_index(addr);
355 392
@@ -369,41 +406,67 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
369 406
370 if (pud_val(*pud)) { 407 if (pud_val(*pud)) {
371 if (!pud_large(*pud)) 408 if (!pud_large(*pud))
372 last_map_addr = phys_pmd_update(pud, addr, end); 409 last_map_addr = phys_pmd_update(pud, addr, end,
410 page_size_mask);
373 continue; 411 continue;
374 } 412 }
375 413
376 if (direct_gbpages) { 414 if (page_size_mask & (1<<PG_LEVEL_1G)) {
415 pages++;
416 spin_lock(&init_mm.page_table_lock);
377 set_pte((pte_t *)pud, 417 set_pte((pte_t *)pud,
378 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 418 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
419 spin_unlock(&init_mm.page_table_lock);
379 last_map_addr = (addr & PUD_MASK) + PUD_SIZE; 420 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
380 continue; 421 continue;
381 } 422 }
382 423
383 pmd = alloc_low_page(&pmd_phys); 424 pmd = alloc_low_page(&pmd_phys);
425 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
426 unmap_low_page(pmd);
384 427
385 spin_lock(&init_mm.page_table_lock); 428 spin_lock(&init_mm.page_table_lock);
386 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); 429 pud_populate(&init_mm, pud, __va(pmd_phys));
387 last_map_addr = phys_pmd_init(pmd, addr, end);
388 spin_unlock(&init_mm.page_table_lock); 430 spin_unlock(&init_mm.page_table_lock);
389
390 unmap_low_page(pmd);
391 } 431 }
392 __flush_tlb_all(); 432 __flush_tlb_all();
433 update_page_count(PG_LEVEL_1G, pages);
393 434
394 return last_map_addr >> PAGE_SHIFT; 435 return last_map_addr;
436}
437
438static unsigned long __meminit
439phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
440 unsigned long page_size_mask)
441{
442 pud_t *pud;
443
444 pud = (pud_t *)pgd_page_vaddr(*pgd);
445
446 return phys_pud_init(pud, addr, end, page_size_mask);
395} 447}
396 448
397static void __init find_early_table_space(unsigned long end) 449static void __init find_early_table_space(unsigned long end)
398{ 450{
399 unsigned long puds, pmds, tables, start; 451 unsigned long puds, pmds, ptes, tables, start;
400 452
401 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 453 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
402 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); 454 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
403 if (!direct_gbpages) { 455 if (direct_gbpages) {
456 unsigned long extra;
457 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
458 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
459 } else
404 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 460 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
405 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); 461 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
406 } 462
463 if (cpu_has_pse) {
464 unsigned long extra;
465 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
466 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
467 } else
468 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
469 tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
407 470
408 /* 471 /*
409 * RED-PEN putting page tables only on node 0 could 472 * RED-PEN putting page tables only on node 0 could
@@ -417,10 +480,10 @@ static void __init find_early_table_space(unsigned long end)
417 480
418 table_start >>= PAGE_SHIFT; 481 table_start >>= PAGE_SHIFT;
419 table_end = table_start; 482 table_end = table_start;
483 table_top = table_start + (tables >> PAGE_SHIFT);
420 484
421 early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", 485 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
422 end, table_start << PAGE_SHIFT, 486 end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
423 (table_start << PAGE_SHIFT) + tables);
424} 487}
425 488
426static void __init init_gbpages(void) 489static void __init init_gbpages(void)
@@ -431,125 +494,83 @@ static void __init init_gbpages(void)
431 direct_gbpages = 0; 494 direct_gbpages = 0;
432} 495}
433 496
434#ifdef CONFIG_MEMTEST_BOOTPARAM 497static unsigned long __init kernel_physical_mapping_init(unsigned long start,
435 498 unsigned long end,
436static void __init memtest(unsigned long start_phys, unsigned long size, 499 unsigned long page_size_mask)
437 unsigned pattern) 500{
438{
439 unsigned long i;
440 unsigned long *start;
441 unsigned long start_bad;
442 unsigned long last_bad;
443 unsigned long val;
444 unsigned long start_phys_aligned;
445 unsigned long count;
446 unsigned long incr;
447
448 switch (pattern) {
449 case 0:
450 val = 0UL;
451 break;
452 case 1:
453 val = -1UL;
454 break;
455 case 2:
456 val = 0x5555555555555555UL;
457 break;
458 case 3:
459 val = 0xaaaaaaaaaaaaaaaaUL;
460 break;
461 default:
462 return;
463 }
464 501
465 incr = sizeof(unsigned long); 502 unsigned long next, last_map_addr = end;
466 start_phys_aligned = ALIGN(start_phys, incr);
467 count = (size - (start_phys_aligned - start_phys))/incr;
468 start = __va(start_phys_aligned);
469 start_bad = 0;
470 last_bad = 0;
471
472 for (i = 0; i < count; i++)
473 start[i] = val;
474 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
475 if (*start != val) {
476 if (start_phys_aligned == last_bad + incr) {
477 last_bad += incr;
478 } else {
479 if (start_bad) {
480 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
481 val, start_bad, last_bad + incr);
482 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
483 }
484 start_bad = last_bad = start_phys_aligned;
485 }
486 }
487 }
488 if (start_bad) {
489 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
490 val, start_bad, last_bad + incr);
491 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
492 }
493 503
494} 504 start = (unsigned long)__va(start);
505 end = (unsigned long)__va(end);
495 506
496static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE; 507 for (; start < end; start = next) {
508 pgd_t *pgd = pgd_offset_k(start);
509 unsigned long pud_phys;
510 pud_t *pud;
497 511
498static int __init parse_memtest(char *arg) 512 next = (start + PGDIR_SIZE) & PGDIR_MASK;
499{ 513 if (next > end)
500 if (arg) 514 next = end;
501 memtest_pattern = simple_strtoul(arg, NULL, 0);
502 return 0;
503}
504 515
505early_param("memtest", parse_memtest); 516 if (pgd_val(*pgd)) {
517 last_map_addr = phys_pud_update(pgd, __pa(start),
518 __pa(end), page_size_mask);
519 continue;
520 }
506 521
507static void __init early_memtest(unsigned long start, unsigned long end) 522 pud = alloc_low_page(&pud_phys);
508{ 523 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
509 unsigned long t_start, t_size; 524 page_size_mask);
510 unsigned pattern; 525 unmap_low_page(pud);
511 526
512 if (!memtest_pattern) 527 spin_lock(&init_mm.page_table_lock);
513 return; 528 pgd_populate(&init_mm, pgd, __va(pud_phys));
529 spin_unlock(&init_mm.page_table_lock);
530 }
514 531
515 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); 532 return last_map_addr;
516 for (pattern = 0; pattern < memtest_pattern; pattern++) { 533}
517 t_start = start;
518 t_size = 0;
519 while (t_start < end) {
520 t_start = find_e820_area_size(t_start, &t_size, 1);
521 534
522 /* done ? */ 535struct map_range {
523 if (t_start >= end) 536 unsigned long start;
524 break; 537 unsigned long end;
525 if (t_start + t_size > end) 538 unsigned page_size_mask;
526 t_size = end - t_start; 539};
527 540
528 printk(KERN_CONT "\n %016lx - %016lx pattern %d", 541#define NR_RANGE_MR 5
529 t_start, t_start + t_size, pattern);
530 542
531 memtest(t_start, t_size, pattern); 543static int save_mr(struct map_range *mr, int nr_range,
544 unsigned long start_pfn, unsigned long end_pfn,
545 unsigned long page_size_mask)
546{
532 547
533 t_start += t_size; 548 if (start_pfn < end_pfn) {
534 } 549 if (nr_range >= NR_RANGE_MR)
550 panic("run out of range for init_memory_mapping\n");
551 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
552 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
553 mr[nr_range].page_size_mask = page_size_mask;
554 nr_range++;
535 } 555 }
536 printk(KERN_CONT "\n"); 556
537} 557 return nr_range;
538#else
539static void __init early_memtest(unsigned long start, unsigned long end)
540{
541} 558}
542#endif
543 559
544/* 560/*
545 * Setup the direct mapping of the physical memory at PAGE_OFFSET. 561 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
546 * This runs before bootmem is initialized and gets pages directly from 562 * This runs before bootmem is initialized and gets pages directly from
547 * the physical memory. To access them they are temporarily mapped. 563 * the physical memory. To access them they are temporarily mapped.
548 */ 564 */
549unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) 565unsigned long __init_refok init_memory_mapping(unsigned long start,
566 unsigned long end)
550{ 567{
551 unsigned long next, last_map_addr = end; 568 unsigned long last_map_addr = 0;
552 unsigned long start_phys = start, end_phys = end; 569 unsigned long page_size_mask = 0;
570 unsigned long start_pfn, end_pfn;
571
572 struct map_range mr[NR_RANGE_MR];
573 int nr_range, i;
553 574
554 printk(KERN_INFO "init_memory_mapping\n"); 575 printk(KERN_INFO "init_memory_mapping\n");
555 576
@@ -560,48 +581,115 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon
560 * memory mapped. Unfortunately this is done currently before the 581 * memory mapped. Unfortunately this is done currently before the
561 * nodes are discovered. 582 * nodes are discovered.
562 */ 583 */
563 if (!after_bootmem) { 584 if (!after_bootmem)
564 init_gbpages(); 585 init_gbpages();
565 find_early_table_space(end);
566 }
567 586
568 start = (unsigned long)__va(start); 587 if (direct_gbpages)
569 end = (unsigned long)__va(end); 588 page_size_mask |= 1 << PG_LEVEL_1G;
589 if (cpu_has_pse)
590 page_size_mask |= 1 << PG_LEVEL_2M;
591
592 memset(mr, 0, sizeof(mr));
593 nr_range = 0;
594
 595 /* head, if start is not big page aligned */
596 start_pfn = start >> PAGE_SHIFT;
597 end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
598 << (PMD_SHIFT - PAGE_SHIFT);
599 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
600
 601 /* big page (2M) range */
602 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
603 << (PMD_SHIFT - PAGE_SHIFT);
604 end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
605 << (PUD_SHIFT - PAGE_SHIFT);
606 if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
607 end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
608 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
609 page_size_mask & (1<<PG_LEVEL_2M));
610
611 /* big page (1G) range */
612 start_pfn = end_pfn;
613 end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
614 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
615 page_size_mask &
616 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
617
 618 /* tail that is not big page (1G) aligned */
619 start_pfn = end_pfn;
620 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
621 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
622 page_size_mask & (1<<PG_LEVEL_2M));
623
 624 /* tail that is not big page (2M) aligned */
625 start_pfn = end_pfn;
626 end_pfn = end>>PAGE_SHIFT;
627 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
628
 629 /* try to merge adjacent ranges with the same page size */
630 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
631 unsigned long old_start;
632 if (mr[i].end != mr[i+1].start ||
633 mr[i].page_size_mask != mr[i+1].page_size_mask)
634 continue;
635 /* move it */
636 old_start = mr[i].start;
637 memmove(&mr[i], &mr[i+1],
638 (nr_range - 1 - i) * sizeof (struct map_range));
639 mr[i].start = old_start;
640 nr_range--;
641 }
570 642
571 for (; start < end; start = next) { 643 for (i = 0; i < nr_range; i++)
572 pgd_t *pgd = pgd_offset_k(start); 644 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
573 unsigned long pud_phys; 645 mr[i].start, mr[i].end,
574 pud_t *pud; 646 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
647 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
575 648
576 if (after_bootmem) 649 if (!after_bootmem)
577 pud = pud_offset(pgd, start & PGDIR_MASK); 650 find_early_table_space(end);
578 else
579 pud = alloc_low_page(&pud_phys);
580 651
581 next = start + PGDIR_SIZE; 652 for (i = 0; i < nr_range; i++)
582 if (next > end) 653 last_map_addr = kernel_physical_mapping_init(
583 next = end; 654 mr[i].start, mr[i].end,
584 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); 655 mr[i].page_size_mask);
585 if (!after_bootmem)
586 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
587 unmap_low_page(pud);
588 }
589 656
590 if (!after_bootmem) 657 if (!after_bootmem)
591 mmu_cr4_features = read_cr4(); 658 mmu_cr4_features = read_cr4();
592 __flush_tlb_all(); 659 __flush_tlb_all();
593 660
594 if (!after_bootmem) 661 if (!after_bootmem && table_end > table_start)
595 reserve_early(table_start << PAGE_SHIFT, 662 reserve_early(table_start << PAGE_SHIFT,
596 table_end << PAGE_SHIFT, "PGTABLE"); 663 table_end << PAGE_SHIFT, "PGTABLE");
597 664
665 printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
666 last_map_addr, end);
667
598 if (!after_bootmem) 668 if (!after_bootmem)
599 early_memtest(start_phys, end_phys); 669 early_memtest(start, end);
600 670
601 return last_map_addr; 671 return last_map_addr >> PAGE_SHIFT;
602} 672}
603 673
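The 64-bit path splits [start, end) into at most NR_RANGE_MR ranges (4k head, 2M run, 1G run, 2M tail, 4k tail) and then merges neighbours that ended up with the same page sizes. Below is a compact user-space sketch of save_mr() plus the merge pass, with assumed 2 MiB and 1 GiB boundaries and illustrative mask bits.

#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21			/* 2 MiB */
#define PUD_SHIFT	30			/* 1 GiB */
#define NR_RANGE_MR	5

#define LVL_2M		(1UL << 1)		/* illustrative mask bits */
#define LVL_1G		(1UL << 2)

struct map_range {
	unsigned long start, end;		/* physical addresses */
	unsigned long page_size_mask;
};

static int save_mr(struct map_range *mr, int nr, unsigned long s_pfn,
		   unsigned long e_pfn, unsigned long mask)
{
	if (s_pfn < e_pfn) {
		mr[nr].start = s_pfn << PAGE_SHIFT;
		mr[nr].end = e_pfn << PAGE_SHIFT;
		mr[nr].page_size_mask = mask;
		nr++;
	}
	return nr;
}

int main(void)
{
	unsigned long start = 0, end = 0x47ff0000UL;	/* ~1.1 GiB, unaligned end */
	unsigned long mask = LVL_2M | LVL_1G;		/* pse + gbpages assumed */
	unsigned long s_pfn, e_pfn;
	struct map_range mr[NR_RANGE_MR];
	int nr = 0, i;

	memset(mr, 0, sizeof(mr));

	/* 4k head up to the first 2M boundary */
	s_pfn = start >> PAGE_SHIFT;
	e_pfn = ((start + (1UL << PMD_SHIFT) - 1) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	nr = save_mr(mr, nr, s_pfn, e_pfn, 0);

	/* 2M run up to the first 1G boundary (clamped to end) */
	s_pfn = e_pfn;
	e_pfn = ((start + (1UL << PUD_SHIFT) - 1) >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
	if (e_pfn > ((end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT)))
		e_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
	nr = save_mr(mr, nr, s_pfn, e_pfn, mask & LVL_2M);

	/* 1G run */
	s_pfn = e_pfn;
	e_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
	nr = save_mr(mr, nr, s_pfn, e_pfn, mask);

	/* 2M tail, then 4k tail */
	s_pfn = e_pfn;
	e_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	nr = save_mr(mr, nr, s_pfn, e_pfn, mask & LVL_2M);
	s_pfn = e_pfn;
	e_pfn = end >> PAGE_SHIFT;
	nr = save_mr(mr, nr, s_pfn, e_pfn, 0);

	/* merge neighbours with identical page_size_mask */
	for (i = 0; nr > 1 && i < nr - 1; i++) {
		if (mr[i].end != mr[i + 1].start ||
		    mr[i].page_size_mask != mr[i + 1].page_size_mask)
			continue;
		mr[i].end = mr[i + 1].end;
		memmove(&mr[i + 1], &mr[i + 2],
			(nr - 2 - i) * sizeof(struct map_range));
		nr--;
		i--;
	}

	for (i = 0; i < nr; i++)
		printf(" %010lx - %010lx page %s\n", mr[i].start, mr[i].end,
		       (mr[i].page_size_mask & LVL_1G) ? "1G" :
		       (mr[i].page_size_mask & LVL_2M) ? "2M" : "4k");
	return 0;
}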
604#ifndef CONFIG_NUMA 674#ifndef CONFIG_NUMA
675void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
676{
677 unsigned long bootmap_size, bootmap;
678
679 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
680 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
681 PAGE_SIZE);
682 if (bootmap == -1L)
683 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
684 /* don't touch min_low_pfn */
685 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
686 0, end_pfn);
687 e820_register_active_regions(0, start_pfn, end_pfn);
688 free_bootmem_with_active_regions(0, end_pfn);
689 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
690 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
691}
692
605void __init paging_init(void) 693void __init paging_init(void)
606{ 694{
607 unsigned long max_zone_pfns[MAX_NR_ZONES]; 695 unsigned long max_zone_pfns[MAX_NR_ZONES];
@@ -609,9 +697,9 @@ void __init paging_init(void)
609 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 697 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
610 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 698 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
611 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 699 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
612 max_zone_pfns[ZONE_NORMAL] = end_pfn; 700 max_zone_pfns[ZONE_NORMAL] = max_pfn;
613 701
614 memory_present(0, 0, end_pfn); 702 memory_present(0, 0, max_pfn);
615 sparse_init(); 703 sparse_init();
616 free_area_init_nodes(max_zone_pfns); 704 free_area_init_nodes(max_zone_pfns);
617} 705}
@@ -693,8 +781,8 @@ void __init mem_init(void)
693#else 781#else
694 totalram_pages = free_all_bootmem(); 782 totalram_pages = free_all_bootmem();
695#endif 783#endif
696 reservedpages = end_pfn - totalram_pages - 784 reservedpages = max_pfn - totalram_pages -
697 absent_pages_in_range(0, end_pfn); 785 absent_pages_in_range(0, max_pfn);
698 after_bootmem = 1; 786 after_bootmem = 1;
699 787
700 codesize = (unsigned long) &_etext - (unsigned long) &_text; 788 codesize = (unsigned long) &_etext - (unsigned long) &_text;
@@ -713,7 +801,7 @@ void __init mem_init(void)
713 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " 801 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
714 "%ldk reserved, %ldk data, %ldk init)\n", 802 "%ldk reserved, %ldk data, %ldk init)\n",
715 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 803 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
716 end_pfn << (PAGE_SHIFT-10), 804 max_pfn << (PAGE_SHIFT-10),
717 codesize >> 10, 805 codesize >> 10,
718 reservedpages << (PAGE_SHIFT-10), 806 reservedpages << (PAGE_SHIFT-10),
719 datasize >> 10, 807 datasize >> 10,
@@ -766,6 +854,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
766void mark_rodata_ro(void) 854void mark_rodata_ro(void)
767{ 855{
768 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); 856 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
857 unsigned long rodata_start =
858 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
859
860#ifdef CONFIG_DYNAMIC_FTRACE
861 /* Dynamic tracing modifies the kernel text section */
862 start = rodata_start;
863#endif
769 864
770 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 865 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
771 (end - start) >> 10); 866 (end - start) >> 10);
@@ -775,8 +870,7 @@ void mark_rodata_ro(void)
775 * The rodata section (but not the kernel text!) should also be 870 * The rodata section (but not the kernel text!) should also be
776 * not-executable. 871 * not-executable.
777 */ 872 */
778 start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; 873 set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
779 set_memory_nx(start, (end - start) >> PAGE_SHIFT);
780 874
781 rodata_test(); 875 rodata_test();
782 876
@@ -798,24 +892,26 @@ void free_initrd_mem(unsigned long start, unsigned long end)
798} 892}
799#endif 893#endif
800 894
801void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 895int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
896 int flags)
802{ 897{
803#ifdef CONFIG_NUMA 898#ifdef CONFIG_NUMA
804 int nid, next_nid; 899 int nid, next_nid;
900 int ret;
805#endif 901#endif
806 unsigned long pfn = phys >> PAGE_SHIFT; 902 unsigned long pfn = phys >> PAGE_SHIFT;
807 903
808 if (pfn >= end_pfn) { 904 if (pfn >= max_pfn) {
809 /* 905 /*
810 * This can happen with kdump kernels when accessing 906 * This can happen with kdump kernels when accessing
811 * firmware tables: 907 * firmware tables:
812 */ 908 */
813 if (pfn < max_pfn_mapped) 909 if (pfn < max_pfn_mapped)
814 return; 910 return -EFAULT;
815 911
816 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", 912 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
817 phys, len); 913 phys, len);
818 return; 914 return -EFAULT;
819 } 915 }
820 916
821 /* Should check here against the e820 map to avoid double free */ 917 /* Should check here against the e820 map to avoid double free */
@@ -823,9 +919,13 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
823 nid = phys_to_nid(phys); 919 nid = phys_to_nid(phys);
824 next_nid = phys_to_nid(phys + len - 1); 920 next_nid = phys_to_nid(phys + len - 1);
825 if (nid == next_nid) 921 if (nid == next_nid)
826 reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); 922 ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
827 else 923 else
828 reserve_bootmem(phys, len, BOOTMEM_DEFAULT); 924 ret = reserve_bootmem(phys, len, flags);
925
926 if (ret != 0)
927 return ret;
928
829#else 929#else
830 reserve_bootmem(phys, len, BOOTMEM_DEFAULT); 930 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
831#endif 931#endif
@@ -834,6 +934,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
834 dma_reserve += len / PAGE_SIZE; 934 dma_reserve += len / PAGE_SIZE;
835 set_dma_reserve(dma_reserve); 935 set_dma_reserve(dma_reserve);
836 } 936 }
937
938 return 0;
837} 939}
838 940
839int kern_addr_valid(unsigned long addr) 941int kern_addr_valid(unsigned long addr)
@@ -938,7 +1040,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
938 pmd_t *pmd; 1040 pmd_t *pmd;
939 1041
940 for (; addr < end; addr = next) { 1042 for (; addr < end; addr = next) {
941 next = pmd_addr_end(addr, end); 1043 void *p = NULL;
942 1044
943 pgd = vmemmap_pgd_populate(addr, node); 1045 pgd = vmemmap_pgd_populate(addr, node);
944 if (!pgd) 1046 if (!pgd)
@@ -948,33 +1050,51 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
948 if (!pud) 1050 if (!pud)
949 return -ENOMEM; 1051 return -ENOMEM;
950 1052
951 pmd = pmd_offset(pud, addr); 1053 if (!cpu_has_pse) {
952 if (pmd_none(*pmd)) { 1054 next = (addr + PAGE_SIZE) & PAGE_MASK;
953 pte_t entry; 1055 pmd = vmemmap_pmd_populate(pud, addr, node);
954 void *p; 1056
1057 if (!pmd)
1058 return -ENOMEM;
1059
1060 p = vmemmap_pte_populate(pmd, addr, node);
955 1061
956 p = vmemmap_alloc_block(PMD_SIZE, node);
957 if (!p) 1062 if (!p)
958 return -ENOMEM; 1063 return -ENOMEM;
959 1064
960 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, 1065 addr_end = addr + PAGE_SIZE;
961 PAGE_KERNEL_LARGE); 1066 p_end = p + PAGE_SIZE;
962 set_pmd(pmd, __pmd(pte_val(entry)));
963
964 /* check to see if we have contiguous blocks */
965 if (p_end != p || node_start != node) {
966 if (p_start)
967 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
968 addr_start, addr_end-1, p_start, p_end-1, node_start);
969 addr_start = addr;
970 node_start = node;
971 p_start = p;
972 }
973 addr_end = addr + PMD_SIZE;
974 p_end = p + PMD_SIZE;
975 } else { 1067 } else {
976 vmemmap_verify((pte_t *)pmd, node, addr, next); 1068 next = pmd_addr_end(addr, end);
1069
1070 pmd = pmd_offset(pud, addr);
1071 if (pmd_none(*pmd)) {
1072 pte_t entry;
1073
1074 p = vmemmap_alloc_block(PMD_SIZE, node);
1075 if (!p)
1076 return -ENOMEM;
1077
1078 entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
1079 PAGE_KERNEL_LARGE);
1080 set_pmd(pmd, __pmd(pte_val(entry)));
1081
1082 /* check to see if we have contiguous blocks */
1083 if (p_end != p || node_start != node) {
1084 if (p_start)
1085 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1086 addr_start, addr_end-1, p_start, p_end-1, node_start);
1087 addr_start = addr;
1088 node_start = node;
1089 p_start = p;
1090 }
1091
1092 addr_end = addr + PMD_SIZE;
1093 p_end = p + PMD_SIZE;
1094 } else
1095 vmemmap_verify((pte_t *)pmd, node, addr, next);
977 } 1096 }
1097
978 } 1098 }
979 return 0; 1099 return 0;
980} 1100}
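With PSE the vmemmap (the struct page array) is backed by 2 MiB blocks instead of individual 4 KiB pages. Rough arithmetic for the difference, assuming a 64-byte struct page; the real size is configuration-dependent.

#include <stdio.h>

#define PAGE_SHIFT		12
#define PMD_SIZE		(1ULL << 21)
#define STRUCT_PAGE_SIZE	64ULL	/* assumed; config-dependent */

int main(void)
{
	unsigned long long ram = 64ULL << 30;		/* a 64 GiB machine */
	unsigned long long pages = ram >> PAGE_SHIFT;
	unsigned long long vmemmap = pages * STRUCT_PAGE_SIZE;

	printf("vmemmap: %llu MiB -> %llu 2M blocks (vs %llu 4k pages)\n",
	       vmemmap >> 20,
	       (vmemmap + PMD_SIZE - 1) / PMD_SIZE,
	       (vmemmap + (1ULL << PAGE_SHIFT) - 1) >> PAGE_SHIFT);
	return 0;
}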
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 71bb3159031a..cac6da54203b 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -12,6 +12,7 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/mmiotrace.h>
15 16
16#include <asm/cacheflush.h> 17#include <asm/cacheflush.h>
17#include <asm/e820.h> 18#include <asm/e820.h>
@@ -122,10 +123,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
122{ 123{
123 unsigned long pfn, offset, vaddr; 124 unsigned long pfn, offset, vaddr;
124 resource_size_t last_addr; 125 resource_size_t last_addr;
126 const resource_size_t unaligned_phys_addr = phys_addr;
127 const unsigned long unaligned_size = size;
125 struct vm_struct *area; 128 struct vm_struct *area;
126 unsigned long new_prot_val; 129 unsigned long new_prot_val;
127 pgprot_t prot; 130 pgprot_t prot;
128 int retval; 131 int retval;
132 void __iomem *ret_addr;
129 133
130 /* Don't allow wraparound or zero size */ 134 /* Don't allow wraparound or zero size */
131 last_addr = phys_addr + size - 1; 135 last_addr = phys_addr + size - 1;
@@ -142,7 +146,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
142 /* 146 /*
143 * Don't remap the low PCI/ISA area, it's always mapped.. 147 * Don't remap the low PCI/ISA area, it's always mapped..
144 */ 148 */
145 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) 149 if (is_ISA_range(phys_addr, last_addr))
146 return (__force void __iomem *)phys_to_virt(phys_addr); 150 return (__force void __iomem *)phys_to_virt(phys_addr);
147 151
148 /* 152 /*
@@ -166,7 +170,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
166 phys_addr &= PAGE_MASK; 170 phys_addr &= PAGE_MASK;
167 size = PAGE_ALIGN(last_addr+1) - phys_addr; 171 size = PAGE_ALIGN(last_addr+1) - phys_addr;
168 172
169 retval = reserve_memtype(phys_addr, phys_addr + size, 173 retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
170 prot_val, &new_prot_val); 174 prot_val, &new_prot_val);
171 if (retval) { 175 if (retval) {
172 pr_debug("Warning: reserve_memtype returned %d\n", retval); 176 pr_debug("Warning: reserve_memtype returned %d\n", retval);
@@ -233,7 +237,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
233 return NULL; 237 return NULL;
234 } 238 }
235 239
236 return (void __iomem *) (vaddr + offset); 240 ret_addr = (void __iomem *) (vaddr + offset);
241 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
242
243 return ret_addr;
237} 244}
238 245
239/** 246/**
@@ -261,7 +268,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
261{ 268{
262 /* 269 /*
263 * Ideally, this should be: 270 * Ideally, this should be:
264 * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS; 271 * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
265 * 272 *
266 * Till we fix all X drivers to use ioremap_wc(), we will use 273 * Till we fix all X drivers to use ioremap_wc(), we will use
267 * UC MINUS. 274 * UC MINUS.
@@ -285,7 +292,7 @@ EXPORT_SYMBOL(ioremap_nocache);
285 */ 292 */
286void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) 293void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
287{ 294{
288 if (pat_wc_enabled) 295 if (pat_enabled)
289 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, 296 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
290 __builtin_return_address(0)); 297 __builtin_return_address(0));
291 else 298 else
@@ -300,6 +307,37 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
300} 307}
301EXPORT_SYMBOL(ioremap_cache); 308EXPORT_SYMBOL(ioremap_cache);
302 309
310static void __iomem *ioremap_default(resource_size_t phys_addr,
311 unsigned long size)
312{
313 unsigned long flags;
314 void *ret;
315 int err;
316
317 /*
318 * - WB for WB-able memory and no other conflicting mappings
319 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
320 * - Inherit from conflicting mappings otherwise
321 */
322 err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
323 if (err < 0)
324 return NULL;
325
326 ret = (void *) __ioremap_caller(phys_addr, size, flags,
327 __builtin_return_address(0));
328
329 free_memtype(phys_addr, phys_addr + size);
330 return (void __iomem *)ret;
331}
332
333void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
334 unsigned long prot_val)
335{
336 return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
337 __builtin_return_address(0));
338}
339EXPORT_SYMBOL(ioremap_prot);
340
303/** 341/**
304 * iounmap - Free a IO remapping 342 * iounmap - Free a IO remapping
305 * @addr: virtual address from ioremap_* 343 * @addr: virtual address from ioremap_*
@@ -318,13 +356,15 @@ void iounmap(volatile void __iomem *addr)
318 * vm_area and by simply returning an address into the kernel mapping 356 * vm_area and by simply returning an address into the kernel mapping
319 * of ISA space. So handle that here. 357 * of ISA space. So handle that here.
320 */ 358 */
321 if (addr >= phys_to_virt(ISA_START_ADDRESS) && 359 if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
322 addr < phys_to_virt(ISA_END_ADDRESS)) 360 (void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
323 return; 361 return;
324 362
325 addr = (volatile void __iomem *) 363 addr = (volatile void __iomem *)
326 (PAGE_MASK & (unsigned long __force)addr); 364 (PAGE_MASK & (unsigned long __force)addr);
327 365
366 mmiotrace_iounmap(addr);
367
328 /* Use the vm area unlocked, assuming the caller 368 /* Use the vm area unlocked, assuming the caller
329 ensures there isn't another iounmap for the same address 369 ensures there isn't another iounmap for the same address
330 in parallel. Reuse of the virtual address is prevented by 370 in parallel. Reuse of the virtual address is prevented by
@@ -332,7 +372,7 @@ void iounmap(volatile void __iomem *addr)
332 cpa takes care of the direct mappings. */ 372 cpa takes care of the direct mappings. */
333 read_lock(&vmlist_lock); 373 read_lock(&vmlist_lock);
334 for (p = vmlist; p; p = p->next) { 374 for (p = vmlist; p; p = p->next) {
335 if (p->addr == addr) 375 if (p->addr == (void __force *)addr)
336 break; 376 break;
337 } 377 }
338 read_unlock(&vmlist_lock); 378 read_unlock(&vmlist_lock);
@@ -346,7 +386,7 @@ void iounmap(volatile void __iomem *addr)
346 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); 386 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
347 387
348 /* Finally remove it */ 388 /* Finally remove it */
349 o = remove_vm_area((void *)addr); 389 o = remove_vm_area((void __force *)addr);
350 BUG_ON(p != o || o == NULL); 390 BUG_ON(p != o || o == NULL);
351 kfree(p); 391 kfree(p);
352} 392}
@@ -365,7 +405,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
365 if (page_is_ram(start >> PAGE_SHIFT)) 405 if (page_is_ram(start >> PAGE_SHIFT))
366 return __va(phys); 406 return __va(phys);
367 407
368 addr = (void *)ioremap(start, PAGE_SIZE); 408 addr = (void __force *)ioremap_default(start, PAGE_SIZE);
369 if (addr) 409 if (addr)
370 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); 410 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
371 411
@@ -381,9 +421,7 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
381 return; 421 return;
382} 422}
383 423
384#ifdef CONFIG_X86_32 424static int __initdata early_ioremap_debug;
385
386int __initdata early_ioremap_debug;
387 425
388static int __init early_ioremap_debug_setup(char *str) 426static int __init early_ioremap_debug_setup(char *str)
389{ 427{
@@ -394,8 +432,7 @@ static int __init early_ioremap_debug_setup(char *str)
394early_param("early_ioremap_debug", early_ioremap_debug_setup); 432early_param("early_ioremap_debug", early_ioremap_debug_setup);
395 433
396static __initdata int after_paging_init; 434static __initdata int after_paging_init;
397static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] 435static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
398 __section(.bss.page_aligned);
399 436
400static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) 437static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
401{ 438{
@@ -484,10 +521,11 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
484 return; 521 return;
485 } 522 }
486 pte = early_ioremap_pte(addr); 523 pte = early_ioremap_pte(addr);
524
487 if (pgprot_val(flags)) 525 if (pgprot_val(flags))
488 set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); 526 set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
489 else 527 else
490 pte_clear(NULL, addr, pte); 528 pte_clear(&init_mm, addr, pte);
491 __flush_tlb_one(addr); 529 __flush_tlb_one(addr);
492} 530}
493 531
@@ -509,19 +547,17 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
509} 547}
510 548
511 549
512int __initdata early_ioremap_nested; 550static int __initdata early_ioremap_nested;
513 551
514static int __init check_early_ioremap_leak(void) 552static int __init check_early_ioremap_leak(void)
515{ 553{
516 if (!early_ioremap_nested) 554 if (!early_ioremap_nested)
517 return 0; 555 return 0;
518 556 WARN(1, KERN_WARNING
519 printk(KERN_WARNING
520 "Debug warning: early ioremap leak of %d areas detected.\n", 557 "Debug warning: early ioremap leak of %d areas detected.\n",
521 early_ioremap_nested); 558 early_ioremap_nested);
522 printk(KERN_WARNING 559 printk(KERN_WARNING
523 "please boot with early_ioremap_debug and report the dmesg.\n"); 560 "please boot with early_ioremap_debug and report the dmesg.\n");
524 WARN_ON(1);
525 561
526 return 1; 562 return 1;
527} 563}
@@ -593,10 +629,11 @@ void __init early_iounmap(void *addr, unsigned long size)
593 unsigned long offset; 629 unsigned long offset;
594 unsigned int nrpages; 630 unsigned int nrpages;
595 enum fixed_addresses idx; 631 enum fixed_addresses idx;
596 unsigned int nesting; 632 int nesting;
597 633
598 nesting = --early_ioremap_nested; 634 nesting = --early_ioremap_nested;
599 WARN_ON(nesting < 0); 635 if (WARN_ON(nesting < 0))
636 return;
600 637
601 if (early_ioremap_debug) { 638 if (early_ioremap_debug) {
602 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, 639 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
@@ -624,5 +661,3 @@ void __this_fixmap_does_not_exist(void)
624{ 661{
625 WARN_ON(1); 662 WARN_ON(1);
626} 663}
627
628#endif /* CONFIG_X86_32 */
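
Besides the mmiotrace hooks, the ioremap.c diff adds ioremap_default(), which lets the PAT code pick WB, UC- or an inherited type for the /dev/mem access path, and exports ioremap_prot() so callers can request an explicit _PAGE_CACHE_* value. A hedged, driver-style sketch of ioremap_prot() usage follows; the physical base and length are invented placeholders, the _PAGE_CACHE_WC definition is assumed to come in via asm/pgtable.h in this tree, and the snippet only builds inside a kernel with these patches applied.

/* Sketch only: map a hypothetical device aperture write-combined through the
 * new ioremap_prot(); DEMO_PHYS/DEMO_LEN are invented for illustration. */
#include <linux/io.h>
#include <asm/pgtable.h>

#define DEMO_PHYS 0xfd000000UL		/* hypothetical MMIO base */
#define DEMO_LEN  0x4000UL

static void __iomem *demo_map(void)
{
	/* prot_val is masked with _PAGE_CACHE_MASK inside ioremap_prot() */
	return ioremap_prot(DEMO_PHYS, DEMO_LEN, _PAGE_CACHE_WC);
}

static void demo_unmap(void __iomem *regs)
{
	if (regs)
		iounmap(regs);		/* also notifies mmiotrace via mmiotrace_iounmap() */
}
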
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 1f476e477844..41f1b5c00a1d 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -22,6 +22,7 @@
22#include <asm/numa.h> 22#include <asm/numa.h>
23#include <asm/mpspec.h> 23#include <asm/mpspec.h>
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/k8.h>
25 26
26static __init int find_northbridge(void) 27static __init int find_northbridge(void)
27{ 28{
@@ -56,34 +57,33 @@ static __init void early_get_boot_cpu_id(void)
56 /* 57 /*
57 * Find possible boot-time SMP configuration: 58 * Find possible boot-time SMP configuration:
58 */ 59 */
60#ifdef CONFIG_X86_MPPARSE
59 early_find_smp_config(); 61 early_find_smp_config();
62#endif
60#ifdef CONFIG_ACPI 63#ifdef CONFIG_ACPI
61 /* 64 /*
62 * Read APIC information from ACPI tables. 65 * Read APIC information from ACPI tables.
63 */ 66 */
64 early_acpi_boot_init(); 67 early_acpi_boot_init();
65#endif 68#endif
69#ifdef CONFIG_X86_MPPARSE
66 /* 70 /*
67 * get boot-time SMP configuration: 71 * get boot-time SMP configuration:
68 */ 72 */
69 if (smp_found_config) 73 if (smp_found_config)
70 early_get_smp_config(); 74 early_get_smp_config();
75#endif
71 early_init_lapic_mapping(); 76 early_init_lapic_mapping();
72} 77}
73 78
74int __init k8_scan_nodes(unsigned long start, unsigned long end) 79int __init k8_scan_nodes(unsigned long start, unsigned long end)
75{ 80{
81 unsigned numnodes, cores, bits, apicid_base;
76 unsigned long prevbase; 82 unsigned long prevbase;
77 struct bootnode nodes[8]; 83 struct bootnode nodes[8];
78 int nodeid, i, nb;
79 unsigned char nodeids[8]; 84 unsigned char nodeids[8];
80 int found = 0; 85 int i, j, nb, found = 0;
81 u32 reg; 86 u32 nodeid, reg;
82 unsigned numnodes;
83 unsigned cores;
84 unsigned bits;
85 int j;
86 unsigned apicid_base;
87 87
88 if (!early_pci_allowed()) 88 if (!early_pci_allowed())
89 return -1; 89 return -1;
@@ -105,7 +105,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
105 prevbase = 0; 105 prevbase = 0;
106 for (i = 0; i < 8; i++) { 106 for (i = 0; i < 8; i++) {
107 unsigned long base, limit; 107 unsigned long base, limit;
108 u32 nodeid;
109 108
110 base = read_pci_config(0, nb, 1, 0x40 + i*8); 109 base = read_pci_config(0, nb, 1, 0x40 + i*8);
111 limit = read_pci_config(0, nb, 1, 0x44 + i*8); 110 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
@@ -144,8 +143,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
144 limit |= (1<<24)-1; 143 limit |= (1<<24)-1;
145 limit++; 144 limit++;
146 145
147 if (limit > end_pfn << PAGE_SHIFT) 146 if (limit > max_pfn << PAGE_SHIFT)
148 limit = end_pfn << PAGE_SHIFT; 147 limit = max_pfn << PAGE_SHIFT;
149 if (limit <= base) 148 if (limit <= base)
150 continue; 149 continue;
151 150
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
new file mode 100644
index 000000000000..93d82038af4b
--- /dev/null
+++ b/arch/x86/mm/kmmio.c
@@ -0,0 +1,510 @@
1/* Support for MMIO probes.
2 * Borrows much code from kprobes
3 * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
4 * 2007 Alexander Eichner
5 * 2008 Pekka Paalanen <pq@iki.fi>
6 */
7
8#include <linux/list.h>
9#include <linux/rculist.h>
10#include <linux/spinlock.h>
11#include <linux/hash.h>
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/kernel.h>
15#include <linux/uaccess.h>
16#include <linux/ptrace.h>
17#include <linux/preempt.h>
18#include <linux/percpu.h>
19#include <linux/kdebug.h>
20#include <linux/mutex.h>
21#include <linux/io.h>
22#include <asm/cacheflush.h>
23#include <asm/tlbflush.h>
24#include <linux/errno.h>
25#include <asm/debugreg.h>
26#include <linux/mmiotrace.h>
27
28#define KMMIO_PAGE_HASH_BITS 4
29#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
30
31struct kmmio_fault_page {
32 struct list_head list;
33 struct kmmio_fault_page *release_next;
34 unsigned long page; /* location of the fault page */
35
36 /*
37 * Number of times this page has been registered as a part
38 * of a probe. If zero, page is disarmed and this may be freed.
39 * Used only by writers (RCU).
40 */
41 int count;
42};
43
44struct kmmio_delayed_release {
45 struct rcu_head rcu;
46 struct kmmio_fault_page *release_list;
47};
48
49struct kmmio_context {
50 struct kmmio_fault_page *fpage;
51 struct kmmio_probe *probe;
52 unsigned long saved_flags;
53 unsigned long addr;
54 int active;
55};
56
57static DEFINE_SPINLOCK(kmmio_lock);
58
59/* Protected by kmmio_lock */
60unsigned int kmmio_count;
61
62/* Read-protected by RCU, write-protected by kmmio_lock. */
63static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
64static LIST_HEAD(kmmio_probes);
65
66static struct list_head *kmmio_page_list(unsigned long page)
67{
68 return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
69}
70
71/* Accessed per-cpu */
72static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
73
74/*
75 * This is basically a dynamic stabbing problem: we could use the
76 * existing prio tree code, or possibly better implementations such as
77 * "The Interval Skip List: A Data Structure for Finding All Intervals
78 * That Overlap a Point" (might be simple) or
79 * "Space Efficient Dynamic Stabbing with Fast Queries" by Mikkel
80 * Thorup.
81 */
82/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
83static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
84{
85 struct kmmio_probe *p;
86 list_for_each_entry_rcu(p, &kmmio_probes, list) {
87 if (addr >= p->addr && addr <= (p->addr + p->len))
88 return p;
89 }
90 return NULL;
91}
92
93/* You must be holding RCU read lock. */
94static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
95{
96 struct list_head *head;
97 struct kmmio_fault_page *p;
98
99 page &= PAGE_MASK;
100 head = kmmio_page_list(page);
101 list_for_each_entry_rcu(p, head, list) {
102 if (p->page == page)
103 return p;
104 }
105 return NULL;
106}
107
108static void set_page_present(unsigned long addr, bool present,
109 unsigned int *pglevel)
110{
111 pteval_t pteval;
112 pmdval_t pmdval;
113 unsigned int level;
114 pmd_t *pmd;
115 pte_t *pte = lookup_address(addr, &level);
116
117 if (!pte) {
118 pr_err("kmmio: no pte for page 0x%08lx\n", addr);
119 return;
120 }
121
122 if (pglevel)
123 *pglevel = level;
124
125 switch (level) {
126 case PG_LEVEL_2M:
127 pmd = (pmd_t *)pte;
128 pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT;
129 if (present)
130 pmdval |= _PAGE_PRESENT;
131 set_pmd(pmd, __pmd(pmdval));
132 break;
133
134 case PG_LEVEL_4K:
135 pteval = pte_val(*pte) & ~_PAGE_PRESENT;
136 if (present)
137 pteval |= _PAGE_PRESENT;
138 set_pte_atomic(pte, __pte(pteval));
139 break;
140
141 default:
142 pr_err("kmmio: unexpected page level 0x%x.\n", level);
143 return;
144 }
145
146 __flush_tlb_one(addr);
147}
148
149/** Mark the given page as not present. Access to it will trigger a fault. */
150static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
151{
152 set_page_present(page & PAGE_MASK, false, pglevel);
153}
154
155/** Mark the given page as present. */
156static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
157{
158 set_page_present(page & PAGE_MASK, true, pglevel);
159}
160
161/*
162 * This is being called from do_page_fault().
163 *
164 * We may be in an interrupt or a critical section. Also prefetching may
165 * trigger a page fault. We may be in the middle of a process switch.
166 * We cannot take any locks, because we could be executing especially
167 * within a kmmio critical section.
168 *
169 * Local interrupts are disabled, so preemption cannot happen.
170 * Do not enable interrupts, do not sleep, and watch out for other CPUs.
171 */
172/*
173 * Interrupts are disabled on entry as trap3 is an interrupt gate
174 * and they remain disabled throughout this function.
175 */
176int kmmio_handler(struct pt_regs *regs, unsigned long addr)
177{
178 struct kmmio_context *ctx;
179 struct kmmio_fault_page *faultpage;
180 int ret = 0; /* default to fault not handled */
181
182 /*
183 * Preemption is now disabled to prevent process switch during
184 * single stepping. We can only handle one active kmmio trace
185 * per cpu, so ensure that we finish it before something else
186 * gets to run. We also hold the RCU read lock over single
187 * stepping to avoid looking up the probe and kmmio_fault_page
188 * again.
189 */
190 preempt_disable();
191 rcu_read_lock();
192
193 faultpage = get_kmmio_fault_page(addr);
194 if (!faultpage) {
195 /*
196 * Either this page fault is not caused by kmmio, or
197 * another CPU just pulled the kmmio probe from under
198 * our feet. The latter case should not be possible.
199 */
200 goto no_kmmio;
201 }
202
203 ctx = &get_cpu_var(kmmio_ctx);
204 if (ctx->active) {
205 disarm_kmmio_fault_page(faultpage->page, NULL);
206 if (addr == ctx->addr) {
207 /*
208 * On SMP we sometimes get recursive probe hits on the
209 * same address. Context is already saved, fall out.
210 */
211 pr_debug("kmmio: duplicate probe hit on CPU %d, for "
212 "address 0x%08lx.\n",
213 smp_processor_id(), addr);
214 ret = 1;
215 goto no_kmmio_ctx;
216 }
217 /*
218 * Prevent overwriting already in-flight context.
219 * This should not happen, let's hope disarming at least
220 * prevents a panic.
221 */
222 pr_emerg("kmmio: recursive probe hit on CPU %d, "
223 "for address 0x%08lx. Ignoring.\n",
224 smp_processor_id(), addr);
225 pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
226 ctx->addr);
227 goto no_kmmio_ctx;
228 }
229 ctx->active++;
230
231 ctx->fpage = faultpage;
232 ctx->probe = get_kmmio_probe(addr);
233 ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
234 ctx->addr = addr;
235
236 if (ctx->probe && ctx->probe->pre_handler)
237 ctx->probe->pre_handler(ctx->probe, regs, addr);
238
239 /*
240 * Enable single-stepping and disable interrupts for the faulting
241 * context. Local interrupts must not get enabled during stepping.
242 */
243 regs->flags |= X86_EFLAGS_TF;
244 regs->flags &= ~X86_EFLAGS_IF;
245
246 /* Now we set present bit in PTE and single step. */
247 disarm_kmmio_fault_page(ctx->fpage->page, NULL);
248
249 /*
250 * If another cpu accesses the same page while we are stepping,
251 * the access will not be caught. It will simply succeed and the
252 * only downside is we lose the event. If this becomes a problem,
253 * the user should drop to single cpu before tracing.
254 */
255
256 put_cpu_var(kmmio_ctx);
257 return 1; /* fault handled */
258
259no_kmmio_ctx:
260 put_cpu_var(kmmio_ctx);
261no_kmmio:
262 rcu_read_unlock();
263 preempt_enable_no_resched();
264 return ret;
265}
266
267/*
268 * Interrupts are disabled on entry as trap1 is an interrupt gate
269 * and they remain disabled throughout this function.
270 * This must always get called as the pair to kmmio_handler().
271 */
272static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
273{
274 int ret = 0;
275 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
276
277 if (!ctx->active) {
278 pr_debug("kmmio: spurious debug trap on CPU %d.\n",
279 smp_processor_id());
280 goto out;
281 }
282
283 if (ctx->probe && ctx->probe->post_handler)
284 ctx->probe->post_handler(ctx->probe, condition, regs);
285
286 arm_kmmio_fault_page(ctx->fpage->page, NULL);
287
288 regs->flags &= ~X86_EFLAGS_TF;
289 regs->flags |= ctx->saved_flags;
290
291 /* These were acquired in kmmio_handler(). */
292 ctx->active--;
293 BUG_ON(ctx->active);
294 rcu_read_unlock();
295 preempt_enable_no_resched();
296
297 /*
298 * if somebody else is singlestepping across a probe point, flags
299 * will have TF set, in which case, continue the remaining processing
300 * of do_debug, as if this is not a probe hit.
301 */
302 if (!(regs->flags & X86_EFLAGS_TF))
303 ret = 1;
304out:
305 put_cpu_var(kmmio_ctx);
306 return ret;
307}
308
309/* You must be holding kmmio_lock. */
310static int add_kmmio_fault_page(unsigned long page)
311{
312 struct kmmio_fault_page *f;
313
314 page &= PAGE_MASK;
315 f = get_kmmio_fault_page(page);
316 if (f) {
317 if (!f->count)
318 arm_kmmio_fault_page(f->page, NULL);
319 f->count++;
320 return 0;
321 }
322
323 f = kmalloc(sizeof(*f), GFP_ATOMIC);
324 if (!f)
325 return -1;
326
327 f->count = 1;
328 f->page = page;
329 list_add_rcu(&f->list, kmmio_page_list(f->page));
330
331 arm_kmmio_fault_page(f->page, NULL);
332
333 return 0;
334}
335
336/* You must be holding kmmio_lock. */
337static void release_kmmio_fault_page(unsigned long page,
338 struct kmmio_fault_page **release_list)
339{
340 struct kmmio_fault_page *f;
341
342 page &= PAGE_MASK;
343 f = get_kmmio_fault_page(page);
344 if (!f)
345 return;
346
347 f->count--;
348 BUG_ON(f->count < 0);
349 if (!f->count) {
350 disarm_kmmio_fault_page(f->page, NULL);
351 f->release_next = *release_list;
352 *release_list = f;
353 }
354}
355
356/*
357 * With page-unaligned ioremaps, one or two armed pages may contain
358 * addresses from outside the intended mapping. Events for these addresses
359 * are currently silently dropped. The events may result only from programming
360 * mistakes by accessing addresses before the beginning or past the end of a
361 * mapping.
362 */
363int register_kmmio_probe(struct kmmio_probe *p)
364{
365 unsigned long flags;
366 int ret = 0;
367 unsigned long size = 0;
368 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
369
370 spin_lock_irqsave(&kmmio_lock, flags);
371 if (get_kmmio_probe(p->addr)) {
372 ret = -EEXIST;
373 goto out;
374 }
375 kmmio_count++;
376 list_add_rcu(&p->list, &kmmio_probes);
377 while (size < size_lim) {
378 if (add_kmmio_fault_page(p->addr + size))
379 pr_err("kmmio: Unable to set page fault.\n");
380 size += PAGE_SIZE;
381 }
382out:
383 spin_unlock_irqrestore(&kmmio_lock, flags);
384 /*
385 * XXX: What should I do here?
386 * Here was a call to global_flush_tlb(), but it does not exist
387 * anymore. It seems it's not needed after all.
388 */
389 return ret;
390}
391EXPORT_SYMBOL(register_kmmio_probe);
392
393static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
394{
395 struct kmmio_delayed_release *dr = container_of(
396 head,
397 struct kmmio_delayed_release,
398 rcu);
399 struct kmmio_fault_page *p = dr->release_list;
400 while (p) {
401 struct kmmio_fault_page *next = p->release_next;
402 BUG_ON(p->count);
403 kfree(p);
404 p = next;
405 }
406 kfree(dr);
407}
408
409static void remove_kmmio_fault_pages(struct rcu_head *head)
410{
411 struct kmmio_delayed_release *dr = container_of(
412 head,
413 struct kmmio_delayed_release,
414 rcu);
415 struct kmmio_fault_page *p = dr->release_list;
416 struct kmmio_fault_page **prevp = &dr->release_list;
417 unsigned long flags;
418 spin_lock_irqsave(&kmmio_lock, flags);
419 while (p) {
420 if (!p->count)
421 list_del_rcu(&p->list);
422 else
423 *prevp = p->release_next;
424 prevp = &p->release_next;
425 p = p->release_next;
426 }
427 spin_unlock_irqrestore(&kmmio_lock, flags);
428 /* This is the real RCU destroy call. */
429 call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
430}
431
432/*
433 * Remove a kmmio probe. You have to synchronize_rcu() before you can be
434 * sure that the callbacks will not be called anymore. Only after that
435 * you may actually release your struct kmmio_probe.
436 *
437 * Unregistering a kmmio fault page has three steps:
438 * 1. release_kmmio_fault_page()
439 * Disarm the page, wait a grace period to let all faults finish.
440 * 2. remove_kmmio_fault_pages()
441 * Remove the pages from kmmio_page_table.
442 * 3. rcu_free_kmmio_fault_pages()
443 * Actually free the kmmio_fault_page structs with RCU.
444 */
445void unregister_kmmio_probe(struct kmmio_probe *p)
446{
447 unsigned long flags;
448 unsigned long size = 0;
449 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
450 struct kmmio_fault_page *release_list = NULL;
451 struct kmmio_delayed_release *drelease;
452
453 spin_lock_irqsave(&kmmio_lock, flags);
454 while (size < size_lim) {
455 release_kmmio_fault_page(p->addr + size, &release_list);
456 size += PAGE_SIZE;
457 }
458 list_del_rcu(&p->list);
459 kmmio_count--;
460 spin_unlock_irqrestore(&kmmio_lock, flags);
461
462 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
463 if (!drelease) {
464 pr_crit("kmmio: leaking kmmio_fault_page objects.\n");
465 return;
466 }
467 drelease->release_list = release_list;
468
469 /*
470 * This is not really RCU here. We have just disarmed a set of
471 * pages so that they cannot trigger page faults anymore. However,
472 * we cannot remove the pages from kmmio_page_table,
473 * because a probe hit might be in flight on another CPU. The
474 * pages are collected into a list, and they will be removed from
475 * kmmio_page_table when it is certain that no probe hit related to
476 * these pages can be in flight. RCU grace period sounds like a
477 * good choice.
478 *
479 * If we removed the pages too early, kmmio page fault handler might
480 * not find the respective kmmio_fault_page and determine it's not
481 * a kmmio fault, when it actually is. This would lead to madness.
482 */
483 call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
484}
485EXPORT_SYMBOL(unregister_kmmio_probe);
486
487static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
488 void *args)
489{
490 struct die_args *arg = args;
491
492 if (val == DIE_DEBUG && (arg->err & DR_STEP))
493 if (post_kmmio_handler(arg->err, arg->regs) == 1)
494 return NOTIFY_STOP;
495
496 return NOTIFY_DONE;
497}
498
499static struct notifier_block nb_die = {
500 .notifier_call = kmmio_die_notifier
501};
502
503static int __init init_kmmio(void)
504{
505 int i;
506 for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
507 INIT_LIST_HEAD(&kmmio_page_table[i]);
508 return register_die_notifier(&nb_die);
509}
510fs_initcall(init_kmmio); /* should be before device_initcall() */
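
kmmio.c exposes register_kmmio_probe()/unregister_kmmio_probe(); mmio-mod.c later in this patch is the in-tree user. The sketch below shows roughly how a probe is armed over one already-ioremapped range, reusing the handler signatures visible above; it assumes struct kmmio_probe and the prototypes are provided by linux/mmiotrace.h, and the demo_* names are invented.

/* Sketch: arm a kmmio probe over an ioremapped range. Not a drop-in module;
 * mmiotrace (mmio-mod.c) is the real user of this interface. */
#include <linux/mmiotrace.h>
#include <linux/kernel.h>
#include <linux/ptrace.h>
#include <linux/rcupdate.h>

static void demo_pre(struct kmmio_probe *p, struct pt_regs *regs,
		     unsigned long addr)
{
	/* runs from the page fault handler with interrupts disabled */
	pr_info("kmmio demo: access at 0x%08lx\n", addr);
}

static void demo_post(struct kmmio_probe *p, unsigned long condition,
		      struct pt_regs *regs)
{
	/* the single-stepped access has completed by the time we get here */
}

static struct kmmio_probe demo_probe;

static int demo_arm(void __iomem *mapped, unsigned long len)
{
	demo_probe = (struct kmmio_probe) {
		.addr		= (unsigned long)mapped,
		.len		= len,
		.pre_handler	= demo_pre,
		.post_handler	= demo_post,
	};
	return register_kmmio_probe(&demo_probe);	/* -EEXIST if already probed */
}

static void demo_disarm(void)
{
	unregister_kmmio_probe(&demo_probe);
	synchronize_rcu();	/* only now may demo_probe be reused or freed */
}
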
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
new file mode 100644
index 000000000000..672e17f8262a
--- /dev/null
+++ b/arch/x86/mm/memtest.c
@@ -0,0 +1,123 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/string.h>
4#include <linux/types.h>
5#include <linux/mm.h>
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/pfn.h>
9
10#include <asm/e820.h>
11
12static void __init memtest(unsigned long start_phys, unsigned long size,
13 unsigned pattern)
14{
15 unsigned long i;
16 unsigned long *start;
17 unsigned long start_bad;
18 unsigned long last_bad;
19 unsigned long val;
20 unsigned long start_phys_aligned;
21 unsigned long count;
22 unsigned long incr;
23
24 switch (pattern) {
25 case 0:
26 val = 0UL;
27 break;
28 case 1:
29 val = -1UL;
30 break;
31 case 2:
32#ifdef CONFIG_X86_64
33 val = 0x5555555555555555UL;
34#else
35 val = 0x55555555UL;
36#endif
37 break;
38 case 3:
39#ifdef CONFIG_X86_64
40 val = 0xaaaaaaaaaaaaaaaaUL;
41#else
42 val = 0xaaaaaaaaUL;
43#endif
44 break;
45 default:
46 return;
47 }
48
49 incr = sizeof(unsigned long);
50 start_phys_aligned = ALIGN(start_phys, incr);
51 count = (size - (start_phys_aligned - start_phys))/incr;
52 start = __va(start_phys_aligned);
53 start_bad = 0;
54 last_bad = 0;
55
56 for (i = 0; i < count; i++)
57 start[i] = val;
58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
59 if (*start != val) {
60 if (start_phys_aligned == last_bad + incr) {
61 last_bad += incr;
62 } else {
63 if (start_bad) {
64 printk(KERN_CONT "\n %010lx bad mem addr %010lx - %010lx reserved",
65 val, start_bad, last_bad + incr);
66 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
67 }
68 start_bad = last_bad = start_phys_aligned;
69 }
70 }
71 }
72 if (start_bad) {
73 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
74 val, start_bad, last_bad + incr);
75 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
76 }
77
78}
79
80/* default is disabled */
81static int memtest_pattern __initdata;
82
83static int __init parse_memtest(char *arg)
84{
85 if (arg)
86 memtest_pattern = simple_strtoul(arg, NULL, 0);
87 return 0;
88}
89
90early_param("memtest", parse_memtest);
91
92void __init early_memtest(unsigned long start, unsigned long end)
93{
94 u64 t_start, t_size;
95 unsigned pattern;
96
97 if (!memtest_pattern)
98 return;
99
100 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
101 for (pattern = 0; pattern < memtest_pattern; pattern++) {
102 t_start = start;
103 t_size = 0;
104 while (t_start < end) {
105 t_start = find_e820_area_size(t_start, &t_size, 1);
106
107 /* done ? */
108 if (t_start >= end)
109 break;
110 if (t_start + t_size > end)
111 t_size = end - t_start;
112
113 printk(KERN_CONT "\n %010llx - %010llx pattern %d",
114 (unsigned long long)t_start,
115 (unsigned long long)t_start + t_size, pattern);
116
117 memtest(t_start, t_size, pattern);
118
119 t_start += t_size;
120 }
121 }
122 printk(KERN_CONT "\n");
123}
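
memtest.c fills each free e820 range with a pattern, reads it back, and coalesces consecutive failing words into regions that are reserved as "BAD RAM"; booting with memtest=N runs patterns 0..N-1 (zeros, ones, 0x55.., 0xaa..). The following standalone userspace model reproduces only the verify-and-coalesce loop against a malloc'd buffer, with reserve_early() replaced by a printf and a few deliberately corrupted words standing in for failing RAM.

/* Sketch: userspace model of the memtest() verify-and-coalesce loop. */
#include <stdio.h>
#include <stdlib.h>

static void check(unsigned long *buf, unsigned long count,
		  unsigned long val, unsigned long base)
{
	unsigned long incr = sizeof(unsigned long);
	unsigned long addr = base, start_bad = 0, last_bad = 0;
	unsigned long i;

	for (i = 0; i < count; i++, addr += incr) {
		if (buf[i] == val)
			continue;
		if (addr == last_bad + incr) {
			last_bad += incr;		/* extend current bad region */
		} else {
			if (start_bad)
				printf("bad region [%#lx, %#lx)\n",
				       start_bad, last_bad + incr);
			start_bad = last_bad = addr;	/* open a new bad region */
		}
	}
	if (start_bad)
		printf("bad region [%#lx, %#lx)\n", start_bad, last_bad + incr);
}

int main(void)
{
	unsigned long count = 1024, val = ~0UL, base = 0x100000;
	unsigned long *buf = malloc(count * sizeof(*buf));
	unsigned long i;

	if (!buf)
		return 1;
	for (i = 0; i < count; i++)
		buf[i] = val;
	buf[10] = buf[11] = buf[500] = 0;	/* simulate three bad words */
	check(buf, count, val, base);
	free(buf);
	return 0;
}
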
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
new file mode 100644
index 000000000000..635b50e85581
--- /dev/null
+++ b/arch/x86/mm/mmio-mod.c
@@ -0,0 +1,517 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) IBM Corporation, 2005
17 * Jeff Muizelaar, 2006, 2007
18 * Pekka Paalanen, 2008 <pq@iki.fi>
19 *
20 * Derived from the read-mod example from relay-examples by Tom Zanussi.
21 */
22#define DEBUG 1
23
24#include <linux/module.h>
25#include <linux/debugfs.h>
26#include <linux/uaccess.h>
27#include <linux/io.h>
28#include <linux/version.h>
29#include <linux/kallsyms.h>
30#include <asm/pgtable.h>
31#include <linux/mmiotrace.h>
32#include <asm/e820.h> /* for ISA_START_ADDRESS */
33#include <asm/atomic.h>
34#include <linux/percpu.h>
35#include <linux/cpu.h>
36
37#include "pf_in.h"
38
39#define NAME "mmiotrace: "
40
41struct trap_reason {
42 unsigned long addr;
43 unsigned long ip;
44 enum reason_type type;
45 int active_traces;
46};
47
48struct remap_trace {
49 struct list_head list;
50 struct kmmio_probe probe;
51 resource_size_t phys;
52 unsigned long id;
53};
54
55/* Accessed per-cpu. */
56static DEFINE_PER_CPU(struct trap_reason, pf_reason);
57static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
58
59#if 0 /* XXX: no way to gather this info anymore */
60/* Access to this is not per-cpu. */
61static DEFINE_PER_CPU(atomic_t, dropped);
62#endif
63
64static struct dentry *marker_file;
65
66static DEFINE_MUTEX(mmiotrace_mutex);
67static DEFINE_SPINLOCK(trace_lock);
68static atomic_t mmiotrace_enabled;
69static LIST_HEAD(trace_list); /* struct remap_trace */
70
71/*
72 * Locking in this file:
73 * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections.
74 * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex
75 * and trace_lock.
76 * - Routines depending on is_enabled() must take trace_lock.
77 * - trace_list users must hold trace_lock.
78 * - is_enabled() guarantees that mmio_trace_record is allowed.
79 * - pre/post callbacks assume the effect of is_enabled() being true.
80 */
81
82/* module parameters */
83static unsigned long filter_offset;
84static int nommiotrace;
85static int trace_pc;
86
87module_param(filter_offset, ulong, 0);
88module_param(nommiotrace, bool, 0);
89module_param(trace_pc, bool, 0);
90
91MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
92MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
93MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions.");
94
95static bool is_enabled(void)
96{
97 return atomic_read(&mmiotrace_enabled);
98}
99
100#if 0 /* XXX: needs rewrite */
101/*
102 * Write callback for the debugfs entry:
103 * Read a marker and write it to the mmio trace log
104 */
105static ssize_t write_marker(struct file *file, const char __user *buffer,
106 size_t count, loff_t *ppos)
107{
108 char *event = NULL;
109 struct mm_io_header *headp;
110 ssize_t len = (count > 65535) ? 65535 : count;
111
112 event = kzalloc(sizeof(*headp) + len, GFP_KERNEL);
113 if (!event)
114 return -ENOMEM;
115
116 headp = (struct mm_io_header *)event;
117 headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT);
118 headp->data_len = len;
119
120 if (copy_from_user(event + sizeof(*headp), buffer, len)) {
121 kfree(event);
122 return -EFAULT;
123 }
124
125 spin_lock_irq(&trace_lock);
126#if 0 /* XXX: convert this to use tracing */
127 if (is_enabled())
128 relay_write(chan, event, sizeof(*headp) + len);
129 else
130#endif
131 len = -EINVAL;
132 spin_unlock_irq(&trace_lock);
133 kfree(event);
134 return len;
135}
136#endif
137
138static void print_pte(unsigned long address)
139{
140 unsigned int level;
141 pte_t *pte = lookup_address(address, &level);
142
143 if (!pte) {
144 pr_err(NAME "Error in %s: no pte for page 0x%08lx\n",
145 __func__, address);
146 return;
147 }
148
149 if (level == PG_LEVEL_2M) {
150 pr_emerg(NAME "4MB pages are not currently supported: "
151 "0x%08lx\n", address);
152 BUG();
153 }
154 pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address,
155 (unsigned long long)pte_val(*pte),
156 (unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
157}
158
159/*
160 * For some reason the pre/post pairs have been called in an
161 * unmatched order. Report and die.
162 */
163static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
164{
165 const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
166 pr_emerg(NAME "unexpected fault for address: 0x%08lx, "
167 "last fault for address: 0x%08lx\n",
168 addr, my_reason->addr);
169 print_pte(addr);
170 print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
171 print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip);
172#ifdef __i386__
173 pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
174 regs->ax, regs->bx, regs->cx, regs->dx);
175 pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
176 regs->si, regs->di, regs->bp, regs->sp);
177#else
178 pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n",
179 regs->ax, regs->cx, regs->dx);
180 pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n",
181 regs->si, regs->di, regs->bp, regs->sp);
182#endif
183 put_cpu_var(pf_reason);
184 BUG();
185}
186
187static void pre(struct kmmio_probe *p, struct pt_regs *regs,
188 unsigned long addr)
189{
190 struct trap_reason *my_reason = &get_cpu_var(pf_reason);
191 struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
192 const unsigned long instptr = instruction_pointer(regs);
193 const enum reason_type type = get_ins_type(instptr);
194 struct remap_trace *trace = p->private;
195
196 /* it doesn't make sense to have more than one active trace per cpu */
197 if (my_reason->active_traces)
198 die_kmmio_nesting_error(regs, addr);
199 else
200 my_reason->active_traces++;
201
202 my_reason->type = type;
203 my_reason->addr = addr;
204 my_reason->ip = instptr;
205
206 my_trace->phys = addr - trace->probe.addr + trace->phys;
207 my_trace->map_id = trace->id;
208
209 /*
210 * Only record the program counter when requested.
211 * It may taint clean-room reverse engineering.
212 */
213 if (trace_pc)
214 my_trace->pc = instptr;
215 else
216 my_trace->pc = 0;
217
218 /*
219 * XXX: the timestamp recorded will be *after* the tracing has been
220 * done, not at the time we hit the instruction. SMP implications
221 * on event ordering?
222 */
223
224 switch (type) {
225 case REG_READ:
226 my_trace->opcode = MMIO_READ;
227 my_trace->width = get_ins_mem_width(instptr);
228 break;
229 case REG_WRITE:
230 my_trace->opcode = MMIO_WRITE;
231 my_trace->width = get_ins_mem_width(instptr);
232 my_trace->value = get_ins_reg_val(instptr, regs);
233 break;
234 case IMM_WRITE:
235 my_trace->opcode = MMIO_WRITE;
236 my_trace->width = get_ins_mem_width(instptr);
237 my_trace->value = get_ins_imm_val(instptr);
238 break;
239 default:
240 {
241 unsigned char *ip = (unsigned char *)instptr;
242 my_trace->opcode = MMIO_UNKNOWN_OP;
243 my_trace->width = 0;
244 my_trace->value = (*ip) << 16 | *(ip + 1) << 8 |
245 *(ip + 2);
246 }
247 }
248 put_cpu_var(cpu_trace);
249 put_cpu_var(pf_reason);
250}
251
252static void post(struct kmmio_probe *p, unsigned long condition,
253 struct pt_regs *regs)
254{
255 struct trap_reason *my_reason = &get_cpu_var(pf_reason);
256 struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
257
258 /* this should always bring the active_traces count back to 0 */
259 my_reason->active_traces--;
260 if (my_reason->active_traces) {
261 pr_emerg(NAME "unexpected post handler");
262 BUG();
263 }
264
265 switch (my_reason->type) {
266 case REG_READ:
267 my_trace->value = get_ins_reg_val(my_reason->ip, regs);
268 break;
269 default:
270 break;
271 }
272
273 mmio_trace_rw(my_trace);
274 put_cpu_var(cpu_trace);
275 put_cpu_var(pf_reason);
276}
277
278static void ioremap_trace_core(resource_size_t offset, unsigned long size,
279 void __iomem *addr)
280{
281 static atomic_t next_id;
282 struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
283 /* These are page-unaligned. */
284 struct mmiotrace_map map = {
285 .phys = offset,
286 .virt = (unsigned long)addr,
287 .len = size,
288 .opcode = MMIO_PROBE
289 };
290
291 if (!trace) {
292 pr_err(NAME "kmalloc failed in ioremap\n");
293 return;
294 }
295
296 *trace = (struct remap_trace) {
297 .probe = {
298 .addr = (unsigned long)addr,
299 .len = size,
300 .pre_handler = pre,
301 .post_handler = post,
302 .private = trace
303 },
304 .phys = offset,
305 .id = atomic_inc_return(&next_id)
306 };
307 map.map_id = trace->id;
308
309 spin_lock_irq(&trace_lock);
310 if (!is_enabled())
311 goto not_enabled;
312
313 mmio_trace_mapping(&map);
314 list_add_tail(&trace->list, &trace_list);
315 if (!nommiotrace)
316 register_kmmio_probe(&trace->probe);
317
318not_enabled:
319 spin_unlock_irq(&trace_lock);
320}
321
322void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
323 void __iomem *addr)
324{
325 if (!is_enabled()) /* recheck and proper locking in *_core() */
326 return;
327
328 pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n",
329 (unsigned long long)offset, size, addr);
330 if ((filter_offset) && (offset != filter_offset))
331 return;
332 ioremap_trace_core(offset, size, addr);
333}
334
335static void iounmap_trace_core(volatile void __iomem *addr)
336{
337 struct mmiotrace_map map = {
338 .phys = 0,
339 .virt = (unsigned long)addr,
340 .len = 0,
341 .opcode = MMIO_UNPROBE
342 };
343 struct remap_trace *trace;
344 struct remap_trace *tmp;
345 struct remap_trace *found_trace = NULL;
346
347 pr_debug(NAME "Unmapping %p.\n", addr);
348
349 spin_lock_irq(&trace_lock);
350 if (!is_enabled())
351 goto not_enabled;
352
353 list_for_each_entry_safe(trace, tmp, &trace_list, list) {
354 if ((unsigned long)addr == trace->probe.addr) {
355 if (!nommiotrace)
356 unregister_kmmio_probe(&trace->probe);
357 list_del(&trace->list);
358 found_trace = trace;
359 break;
360 }
361 }
362 map.map_id = (found_trace) ? found_trace->id : -1;
363 mmio_trace_mapping(&map);
364
365not_enabled:
366 spin_unlock_irq(&trace_lock);
367 if (found_trace) {
368 synchronize_rcu(); /* unregister_kmmio_probe() requirement */
369 kfree(found_trace);
370 }
371}
372
373void mmiotrace_iounmap(volatile void __iomem *addr)
374{
375 might_sleep();
376 if (is_enabled()) /* recheck and proper locking in *_core() */
377 iounmap_trace_core(addr);
378}
379
380static void clear_trace_list(void)
381{
382 struct remap_trace *trace;
383 struct remap_trace *tmp;
384
385 /*
386 * No locking required, because the caller ensures we are in a
387 * critical section via mutex, and is_enabled() is false,
388 * i.e. nothing can traverse or modify this list.
389 * Caller also ensures is_enabled() cannot change.
390 */
391 list_for_each_entry(trace, &trace_list, list) {
392 pr_notice(NAME "purging non-iounmapped "
393 "trace @0x%08lx, size 0x%lx.\n",
394 trace->probe.addr, trace->probe.len);
395 if (!nommiotrace)
396 unregister_kmmio_probe(&trace->probe);
397 }
398 synchronize_rcu(); /* unregister_kmmio_probe() requirement */
399
400 list_for_each_entry_safe(trace, tmp, &trace_list, list) {
401 list_del(&trace->list);
402 kfree(trace);
403 }
404}
405
406#ifdef CONFIG_HOTPLUG_CPU
407static cpumask_t downed_cpus;
408
409static void enter_uniprocessor(void)
410{
411 int cpu;
412 int err;
413
414 get_online_cpus();
415 downed_cpus = cpu_online_map;
416 cpu_clear(first_cpu(cpu_online_map), downed_cpus);
417 if (num_online_cpus() > 1)
418 pr_notice(NAME "Disabling non-boot CPUs...\n");
419 put_online_cpus();
420
421 for_each_cpu_mask(cpu, downed_cpus) {
422 err = cpu_down(cpu);
423 if (!err)
424 pr_info(NAME "CPU%d is down.\n", cpu);
425 else
426 pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err);
427 }
428 if (num_online_cpus() > 1)
429 pr_warning(NAME "multiple CPUs still online, "
430 "may miss events.\n");
431}
432
433/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit,
434 but this whole function is ifdefed CONFIG_HOTPLUG_CPU */
435static void __ref leave_uniprocessor(void)
436{
437 int cpu;
438 int err;
439
440 if (cpus_weight(downed_cpus) == 0)
441 return;
442 pr_notice(NAME "Re-enabling CPUs...\n");
443 for_each_cpu_mask(cpu, downed_cpus) {
444 err = cpu_up(cpu);
445 if (!err)
446 pr_info(NAME "enabled CPU%d.\n", cpu);
447 else
448 pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err);
449 }
450}
451
452#else /* !CONFIG_HOTPLUG_CPU */
453static void enter_uniprocessor(void)
454{
455 if (num_online_cpus() > 1)
456 pr_warning(NAME "multiple CPUs are online, may miss events. "
457 "Suggest booting with maxcpus=1 kernel argument.\n");
458}
459
460static void leave_uniprocessor(void)
461{
462}
463#endif
464
465#if 0 /* XXX: out of order */
466static struct file_operations fops_marker = {
467 .owner = THIS_MODULE,
468 .write = write_marker
469};
470#endif
471
472void enable_mmiotrace(void)
473{
474 mutex_lock(&mmiotrace_mutex);
475 if (is_enabled())
476 goto out;
477
478#if 0 /* XXX: tracing does not support text entries */
479 marker_file = debugfs_create_file("marker", 0660, dir, NULL,
480 &fops_marker);
481 if (!marker_file)
482 pr_err(NAME "marker file creation failed.\n");
483#endif
484
485 if (nommiotrace)
486 pr_info(NAME "MMIO tracing disabled.\n");
487 enter_uniprocessor();
488 spin_lock_irq(&trace_lock);
489 atomic_inc(&mmiotrace_enabled);
490 spin_unlock_irq(&trace_lock);
491 pr_info(NAME "enabled.\n");
492out:
493 mutex_unlock(&mmiotrace_mutex);
494}
495
496void disable_mmiotrace(void)
497{
498 mutex_lock(&mmiotrace_mutex);
499 if (!is_enabled())
500 goto out;
501
502 spin_lock_irq(&trace_lock);
503 atomic_dec(&mmiotrace_enabled);
504 BUG_ON(is_enabled());
505 spin_unlock_irq(&trace_lock);
506
507 clear_trace_list(); /* guarantees: no more kmmio callbacks */
508 leave_uniprocessor();
509 if (marker_file) {
510 debugfs_remove(marker_file);
511 marker_file = NULL;
512 }
513
514 pr_info(NAME "disabled.\n");
515out:
516 mutex_unlock(&mmiotrace_mutex);
517}
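
mmio-mod.c spells out its locking rules near the top of the file: mmiotrace_mutex serializes enable_mmiotrace()/disable_mmiotrace(), mmiotrace_enabled may only change with both mmiotrace_mutex and trace_lock held, and any routine that depends on is_enabled() must hold trace_lock while it records. Both entry points may sleep (CPU hotplug, synchronize_rcu), so a tracer backend has to drive them from process context, roughly as in this sketch; the demo_* wrappers are invented and assume the prototypes are reachable through linux/mmiotrace.h.

/* Sketch: how a tracer backend would drive mmiotrace. */
#include <linux/mmiotrace.h>

static void demo_start(void)
{
	enable_mmiotrace();	/* takes down non-boot CPUs, then flips enabled */
}

static void demo_stop(void)
{
	disable_mmiotrace();	/* purges leftover probes, brings CPUs back up */
}
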
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index c5066d519e5d..a4dd793d6003 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,37 +20,18 @@
20#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h> 21#include <asm/k8.h>
22 22
23#ifndef Dprintk
24#define Dprintk(x...)
25#endif
26
27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
28EXPORT_SYMBOL(node_data); 24EXPORT_SYMBOL(node_data);
29 25
30bootmem_data_t plat_node_bdata[MAX_NUMNODES];
31
32struct memnode memnode; 26struct memnode memnode;
33 27
34#ifdef CONFIG_SMP
35int x86_cpu_to_node_map_init[NR_CPUS] = {
36 [0 ... NR_CPUS-1] = NUMA_NO_NODE
37};
38void *x86_cpu_to_node_map_early_ptr;
39EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
40#endif
41DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
42EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
43
44s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 28s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
45 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 29 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
46}; 30};
47 31
48cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
49EXPORT_SYMBOL(node_to_cpumask_map);
50
51int numa_off __initdata; 32int numa_off __initdata;
52unsigned long __initdata nodemap_addr; 33static unsigned long __initdata nodemap_addr;
53unsigned long __initdata nodemap_size; 34static unsigned long __initdata nodemap_size;
54 35
55/* 36/*
56 * Given a shift value, try to populate memnodemap[] 37 * Given a shift value, try to populate memnodemap[]
@@ -99,7 +80,7 @@ static int __init allocate_cachealigned_memnodemap(void)
99 80
100 addr = 0x8000; 81 addr = 0x8000;
101 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 82 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
102 nodemap_addr = find_e820_area(addr, end_pfn<<PAGE_SHIFT, 83 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
103 nodemap_size, L1_CACHE_BYTES); 84 nodemap_size, L1_CACHE_BYTES);
104 if (nodemap_addr == -1UL) { 85 if (nodemap_addr == -1UL) {
105 printk(KERN_ERR 86 printk(KERN_ERR
@@ -192,7 +173,7 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
192void __init setup_node_bootmem(int nodeid, unsigned long start, 173void __init setup_node_bootmem(int nodeid, unsigned long start,
193 unsigned long end) 174 unsigned long end)
194{ 175{
195 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size; 176 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
196 unsigned long bootmap_start, nodedata_phys; 177 unsigned long bootmap_start, nodedata_phys;
197 void *bootmap; 178 void *bootmap;
198 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 179 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
@@ -204,7 +185,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
204 start, end); 185 start, end);
205 186
206 start_pfn = start >> PAGE_SHIFT; 187 start_pfn = start >> PAGE_SHIFT;
207 end_pfn = end >> PAGE_SHIFT; 188 last_pfn = end >> PAGE_SHIFT;
208 189
209 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, 190 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
210 SMP_CACHE_BYTES); 191 SMP_CACHE_BYTES);
@@ -215,9 +196,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
215 nodedata_phys + pgdat_size - 1); 196 nodedata_phys + pgdat_size - 1);
216 197
217 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 198 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
218 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 199 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
219 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 200 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
220 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; 201 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
221 202
222 /* 203 /*
223 * Find a place for the bootmem map 204 * Find a place for the bootmem map
@@ -226,14 +207,14 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
226 * early_node_mem will get that with find_e820_area instead 207 * early_node_mem will get that with find_e820_area instead
227 * of alloc_bootmem, that could clash with reserved range 208 * of alloc_bootmem, that could clash with reserved range
228 */ 209 */
229 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 210 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
230 nid = phys_to_nid(nodedata_phys); 211 nid = phys_to_nid(nodedata_phys);
231 if (nid == nodeid) 212 if (nid == nodeid)
232 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 213 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
233 else 214 else
234 bootmap_start = round_up(start, PAGE_SIZE); 215 bootmap_start = round_up(start, PAGE_SIZE);
235 /* 216 /*
236 * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like 217 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
237 * to use that to align to PAGE_SIZE 218 * to use that to align to PAGE_SIZE
238 */ 219 */
239 bootmap = early_node_mem(nodeid, bootmap_start, end, 220 bootmap = early_node_mem(nodeid, bootmap_start, end,
@@ -248,7 +229,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
248 229
249 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 230 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
250 bootmap_start >> PAGE_SHIFT, 231 bootmap_start >> PAGE_SHIFT,
251 start_pfn, end_pfn); 232 start_pfn, last_pfn);
252 233
253 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", 234 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
254 bootmap_start, bootmap_start + bootmap_size - 1, 235 bootmap_start, bootmap_start + bootmap_size - 1,
@@ -309,7 +290,7 @@ void __init numa_init_array(void)
309 290
310#ifdef CONFIG_NUMA_EMU 291#ifdef CONFIG_NUMA_EMU
311/* Numa emulation */ 292/* Numa emulation */
312char *cmdline __initdata; 293static char *cmdline __initdata;
313 294
314/* 295/*
315 * Setups up nid to range from addr to addr + size. If the end 296 * Setups up nid to range from addr to addr + size. If the end
@@ -413,15 +394,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
413} 394}
414 395
415/* 396/*
416 * Sets up the system RAM area from start_pfn to end_pfn according to the 397 * Sets up the system RAM area from start_pfn to last_pfn according to the
417 * numa=fake command-line option. 398 * numa=fake command-line option.
418 */ 399 */
419static struct bootnode nodes[MAX_NUMNODES] __initdata; 400static struct bootnode nodes[MAX_NUMNODES] __initdata;
420 401
421static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 402static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
422{ 403{
423 u64 size, addr = start_pfn << PAGE_SHIFT; 404 u64 size, addr = start_pfn << PAGE_SHIFT;
424 u64 max_addr = end_pfn << PAGE_SHIFT; 405 u64 max_addr = last_pfn << PAGE_SHIFT;
425 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; 406 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
426 407
427 memset(&nodes, 0, sizeof(nodes)); 408 memset(&nodes, 0, sizeof(nodes));
@@ -527,7 +508,7 @@ out:
527} 508}
528#endif /* CONFIG_NUMA_EMU */ 509#endif /* CONFIG_NUMA_EMU */
529 510
530void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 511void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
531{ 512{
532 int i; 513 int i;
533 514
@@ -535,7 +516,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
535 nodes_clear(node_online_map); 516 nodes_clear(node_online_map);
536 517
537#ifdef CONFIG_NUMA_EMU 518#ifdef CONFIG_NUMA_EMU
538 if (cmdline && !numa_emulation(start_pfn, end_pfn)) 519 if (cmdline && !numa_emulation(start_pfn, last_pfn))
539 return; 520 return;
540 nodes_clear(node_possible_map); 521 nodes_clear(node_possible_map);
541 nodes_clear(node_online_map); 522 nodes_clear(node_online_map);
@@ -543,7 +524,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
543 524
544#ifdef CONFIG_ACPI_NUMA 525#ifdef CONFIG_ACPI_NUMA
545 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 526 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
546 end_pfn << PAGE_SHIFT)) 527 last_pfn << PAGE_SHIFT))
547 return; 528 return;
548 nodes_clear(node_possible_map); 529 nodes_clear(node_possible_map);
549 nodes_clear(node_online_map); 530 nodes_clear(node_online_map);
@@ -551,7 +532,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
551 532
552#ifdef CONFIG_K8_NUMA 533#ifdef CONFIG_K8_NUMA
553 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, 534 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
554 end_pfn<<PAGE_SHIFT)) 535 last_pfn<<PAGE_SHIFT))
555 return; 536 return;
556 nodes_clear(node_possible_map); 537 nodes_clear(node_possible_map);
557 nodes_clear(node_online_map); 538 nodes_clear(node_online_map);
@@ -561,7 +542,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
561 542
562 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 543 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
563 start_pfn << PAGE_SHIFT, 544 start_pfn << PAGE_SHIFT,
564 end_pfn << PAGE_SHIFT); 545 last_pfn << PAGE_SHIFT);
565 /* setup dummy node covering all memory */ 546 /* setup dummy node covering all memory */
566 memnode_shift = 63; 547 memnode_shift = 63;
567 memnodemap = memnode.embedded_map; 548 memnodemap = memnode.embedded_map;
@@ -570,29 +551,8 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
570 node_set(0, node_possible_map); 551 node_set(0, node_possible_map);
571 for (i = 0; i < NR_CPUS; i++) 552 for (i = 0; i < NR_CPUS; i++)
572 numa_set_node(i, 0); 553 numa_set_node(i, 0);
573 /* cpumask_of_cpu() may not be available during early startup */ 554 e820_register_active_regions(0, start_pfn, last_pfn);
574 memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0])); 555 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
575 cpu_set(0, node_to_cpumask_map[0]);
576 e820_register_active_regions(0, start_pfn, end_pfn);
577 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
578}
579
580__cpuinit void numa_add_cpu(int cpu)
581{
582 set_bit(cpu,
583 (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
584}
585
586void __cpuinit numa_set_node(int cpu, int node)
587{
588 int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
589
590 if(cpu_to_node_map)
591 cpu_to_node_map[cpu] = node;
592 else if(per_cpu_offset(cpu))
593 per_cpu(x86_cpu_to_node_map, cpu) = node;
594 else
595 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
596} 556}
597 557
598unsigned long __init numa_free_all_bootmem(void) 558unsigned long __init numa_free_all_bootmem(void)
@@ -613,7 +573,7 @@ void __init paging_init(void)
613 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 573 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
614 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 574 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
615 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 575 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
616 max_zone_pfns[ZONE_NORMAL] = end_pfn; 576 max_zone_pfns[ZONE_NORMAL] = max_pfn;
617 577
618 sparse_memory_present_with_active_regions(MAX_NUMNODES); 578 sparse_memory_present_with_active_regions(MAX_NUMNODES);
619 sparse_init(); 579 sparse_init();
@@ -641,6 +601,7 @@ static __init int numa_setup(char *opt)
641} 601}
642early_param("numa", numa_setup); 602early_param("numa", numa_setup);
643 603
604#ifdef CONFIG_NUMA
644/* 605/*
645 * Setup early cpu_to_node. 606 * Setup early cpu_to_node.
646 * 607 *
@@ -652,14 +613,19 @@ early_param("numa", numa_setup);
652 * is already initialized in a round robin manner at numa_init_array, 613 * is already initialized in a round robin manner at numa_init_array,
653 * prior to this call, and this initialization is good enough 614 * prior to this call, and this initialization is good enough
654 * for the fake NUMA cases. 615 * for the fake NUMA cases.
616 *
617 * Called before the per_cpu areas are setup.
655 */ 618 */
656void __init init_cpu_to_node(void) 619void __init init_cpu_to_node(void)
657{ 620{
658 int i; 621 int cpu;
622 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
659 623
660 for (i = 0; i < NR_CPUS; i++) { 624 BUG_ON(cpu_to_apicid == NULL);
625
626 for_each_possible_cpu(cpu) {
661 int node; 627 int node;
662 u16 apicid = x86_cpu_to_apicid_init[i]; 628 u16 apicid = cpu_to_apicid[cpu];
663 629
664 if (apicid == BAD_APICID) 630 if (apicid == BAD_APICID)
665 continue; 631 continue;
@@ -668,8 +634,9 @@ void __init init_cpu_to_node(void)
668 continue; 634 continue;
669 if (!node_online(node)) 635 if (!node_online(node))
670 continue; 636 continue;
671 numa_set_node(i, node); 637 numa_set_node(cpu, node);
672 } 638 }
673} 639}
640#endif
674 641
675 642
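
The init_cpu_to_node() rework above walks every possible CPU, looks up its APIC id in the early per-cpu table, and records a node only when the APIC id is valid and the node is online. A minimal userspace sketch of that resolution order, with made-up table contents; the BAD_APICID and NUMA_NO_NODE values here are placeholders for illustration, not the kernel's definitions:

#include <stdio.h>

#define NR_CPUS       8
#define BAD_APICID    0xFFFFu	/* placeholder value, illustration only */
#define NUMA_NO_NODE  (-1)

/* Mock versions of the early tables consulted by init_cpu_to_node(). */
static unsigned short cpu_to_apicid[NR_CPUS] = {
	0, 1, 2, 3, BAD_APICID, BAD_APICID, BAD_APICID, BAD_APICID
};
static int apicid_to_node[4] = { 0, 0, 1, 1 };
static int node_online_mask = 0x3;		/* nodes 0 and 1 online */
static int cpu_node[NR_CPUS];

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {	/* "for_each_possible_cpu" */
		unsigned short apicid = cpu_to_apicid[cpu];
		int node;

		if (apicid == BAD_APICID)	/* CPU slot never populated */
			continue;
		node = apicid_to_node[apicid];
		if (node == NUMA_NO_NODE || !(node_online_mask & (1 << node)))
			continue;		/* node unknown or offline */
		cpu_node[cpu] = node;
		printf("cpu %d -> node %d\n", cpu, node);
	}
	return 0;
}
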
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 75f1b109aae8..d4aa503caaa2 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * self test for change_page_attr. 2 * self test for change_page_attr.
3 * 3 *
4 * Clears the global bit on random pages in the direct mapping, then reverts 4 * Clears a test pte bit on random pages in the direct mapping,
5 * and compares page tables forwards and afterwards. 5 * then reverts and compares page tables forwards and afterwards.
6 */ 6 */
7#include <linux/bootmem.h> 7#include <linux/bootmem.h>
8#include <linux/kthread.h> 8#include <linux/kthread.h>
@@ -32,6 +32,13 @@ enum {
32 GPS = (1<<30) 32 GPS = (1<<30)
33}; 33};
34 34
35#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1)
36
37static int pte_testbit(pte_t pte)
38{
39 return pte_flags(pte) & _PAGE_UNUSED1;
40}
41
35struct split_state { 42struct split_state {
36 long lpg, gpg, spg, exec; 43 long lpg, gpg, spg, exec;
37 long min_exec, max_exec; 44 long min_exec, max_exec;
@@ -165,15 +172,14 @@ static int pageattr_test(void)
165 continue; 172 continue;
166 } 173 }
167 174
168 err = change_page_attr_clear(addr[i], len[i], 175 err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT);
169 __pgprot(_PAGE_GLOBAL));
170 if (err < 0) { 176 if (err < 0) {
171 printk(KERN_ERR "CPA %d failed %d\n", i, err); 177 printk(KERN_ERR "CPA %d failed %d\n", i, err);
172 failed++; 178 failed++;
173 } 179 }
174 180
175 pte = lookup_address(addr[i], &level); 181 pte = lookup_address(addr[i], &level);
176 if (!pte || pte_global(*pte) || pte_huge(*pte)) { 182 if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) {
177 printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i], 183 printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
178 pte ? (u64)pte_val(*pte) : 0ULL); 184 pte ? (u64)pte_val(*pte) : 0ULL);
179 failed++; 185 failed++;
@@ -198,14 +204,13 @@ static int pageattr_test(void)
198 failed++; 204 failed++;
199 continue; 205 continue;
200 } 206 }
201 err = change_page_attr_set(addr[i], len[i], 207 err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT);
202 __pgprot(_PAGE_GLOBAL));
203 if (err < 0) { 208 if (err < 0) {
204 printk(KERN_ERR "CPA reverting failed: %d\n", err); 209 printk(KERN_ERR "CPA reverting failed: %d\n", err);
205 failed++; 210 failed++;
206 } 211 }
207 pte = lookup_address(addr[i], &level); 212 pte = lookup_address(addr[i], &level);
208 if (!pte || !pte_global(*pte)) { 213 if (!pte || pte_testbit(*pte)) {
209 printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n", 214 printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
210 addr[i], pte ? (u64)pte_val(*pte) : 0ULL); 215 addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
211 failed++; 216 failed++;
@@ -216,8 +221,7 @@ static int pageattr_test(void)
216 failed += print_split(&sc); 221 failed += print_split(&sc);
217 222
218 if (failed) { 223 if (failed) {
219 printk(KERN_ERR "NOT PASSED. Please report.\n"); 224 WARN(1, KERN_ERR "NOT PASSED. Please report.\n");
220 WARN_ON(1);
221 return -EINVAL; 225 return -EINVAL;
222 } else { 226 } else {
223 if (print) 227 if (print)
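
The self test above now exercises a software-available PTE bit instead of the global bit: set it over a random range, confirm it landed via lookup_address(), clear it, and confirm it is gone. A condensed sketch of that round trip for a single range, using only the calls visible in the diff; the helper name check_one_range() is hypothetical, and the huge-page check and statistics handling of the real test are omitted:

/*
 * Condensed sketch of one set/verify/clear/verify pass of the test
 * above.  check_one_range() is a hypothetical helper; only the calls
 * it makes appear in the real test.
 */
static int __init check_one_range(unsigned long addr, int numpages)
{
	unsigned int level;
	pte_t *pte;

	if (change_page_attr_set(addr, numpages, PAGE_TESTBIT) < 0)
		return -EINVAL;

	pte = lookup_address(addr, &level);
	if (!pte || !pte_testbit(*pte))		/* bit did not stick */
		return -EINVAL;

	if (change_page_attr_clear(addr, numpages, PAGE_TESTBIT) < 0)
		return -EINVAL;

	pte = lookup_address(addr, &level);
	if (!pte || pte_testbit(*pte))		/* bit did not clear */
		return -EINVAL;

	return 0;
}
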
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 60bcb5b6a37e..43e2f8483e4f 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -34,6 +34,47 @@ struct cpa_data {
34 unsigned force_split : 1; 34 unsigned force_split : 1;
35}; 35};
36 36
37#ifdef CONFIG_PROC_FS
38static unsigned long direct_pages_count[PG_LEVEL_NUM];
39
40void update_page_count(int level, unsigned long pages)
41{
42 unsigned long flags;
43
44 /* Protect against CPA */
45 spin_lock_irqsave(&pgd_lock, flags);
46 direct_pages_count[level] += pages;
47 spin_unlock_irqrestore(&pgd_lock, flags);
48}
49
50static void split_page_count(int level)
51{
52 direct_pages_count[level]--;
53 direct_pages_count[level - 1] += PTRS_PER_PTE;
54}
55
56int arch_report_meminfo(char *page)
57{
58 int n = sprintf(page, "DirectMap4k: %8lu kB\n",
59 direct_pages_count[PG_LEVEL_4K] << 2);
60#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
61 n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
62 direct_pages_count[PG_LEVEL_2M] << 11);
63#else
64 n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
65 direct_pages_count[PG_LEVEL_2M] << 12);
66#endif
67#ifdef CONFIG_X86_64
68 if (direct_gbpages)
69 n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
70 direct_pages_count[PG_LEVEL_1G] << 20);
71#endif
72 return n;
73}
74#else
75static inline void split_page_count(int level) { }
76#endif
77
37#ifdef CONFIG_X86_64 78#ifdef CONFIG_X86_64
38 79
39static inline unsigned long highmap_start_pfn(void) 80static inline unsigned long highmap_start_pfn(void)
@@ -106,7 +147,7 @@ static void cpa_flush_all(unsigned long cache)
106{ 147{
107 BUG_ON(irqs_disabled()); 148 BUG_ON(irqs_disabled());
108 149
109 on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); 150 on_each_cpu(__cpa_flush_all, (void *) cache, 1);
110} 151}
111 152
112static void __cpa_flush_range(void *arg) 153static void __cpa_flush_range(void *arg)
@@ -127,7 +168,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
127 BUG_ON(irqs_disabled()); 168 BUG_ON(irqs_disabled());
128 WARN_ON(PAGE_ALIGN(start) != start); 169 WARN_ON(PAGE_ALIGN(start) != start);
129 170
130 on_each_cpu(__cpa_flush_range, NULL, 1, 1); 171 on_each_cpu(__cpa_flush_range, NULL, 1);
131 172
132 if (!cache) 173 if (!cache)
133 return; 174 return;
@@ -227,6 +268,7 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
227 268
228 return pte_offset_kernel(pmd, address); 269 return pte_offset_kernel(pmd, address);
229} 270}
271EXPORT_SYMBOL_GPL(lookup_address);
230 272
231/* 273/*
232 * Set the new pmd in all the pgds we know about: 274 * Set the new pmd in all the pgds we know about:
@@ -500,6 +542,16 @@ static int split_large_page(pte_t *kpte, unsigned long address)
500 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) 542 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
501 set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); 543 set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
502 544
545 if (address >= (unsigned long)__va(0) &&
546 address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
547 split_page_count(level);
548
549#ifdef CONFIG_X86_64
550 if (address >= (unsigned long)__va(1UL<<32) &&
551 address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
552 split_page_count(level);
553#endif
554
503 /* 555 /*
504 * Install the new, split up pagetable. Important details here: 556 * Install the new, split up pagetable. Important details here:
505 * 557 *
@@ -546,10 +598,9 @@ repeat:
546 if (!pte_val(old_pte)) { 598 if (!pte_val(old_pte)) {
547 if (!primary) 599 if (!primary)
548 return 0; 600 return 0;
549 printk(KERN_WARNING "CPA: called for zero pte. " 601 WARN(1, KERN_WARNING "CPA: called for zero pte. "
550 "vaddr = %lx cpa->vaddr = %lx\n", address, 602 "vaddr = %lx cpa->vaddr = %lx\n", address,
551 cpa->vaddr); 603 cpa->vaddr);
552 WARN_ON(1);
553 return -EINVAL; 604 return -EINVAL;
554 } 605 }
555 606
@@ -613,15 +664,24 @@ static int cpa_process_alias(struct cpa_data *cpa)
613 struct cpa_data alias_cpa; 664 struct cpa_data alias_cpa;
614 int ret = 0; 665 int ret = 0;
615 666
616 if (cpa->pfn > max_pfn_mapped) 667 if (cpa->pfn >= max_pfn_mapped)
617 return 0; 668 return 0;
618 669
670#ifdef CONFIG_X86_64
671 if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
672 return 0;
673#endif
619 /* 674 /*
620 * No need to redo, when the primary call touched the direct 675 * No need to redo, when the primary call touched the direct
621 * mapping already: 676 * mapping already:
622 */ 677 */
623 if (!within(cpa->vaddr, PAGE_OFFSET, 678 if (!(within(cpa->vaddr, PAGE_OFFSET,
624 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { 679 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
680#ifdef CONFIG_X86_64
681 || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
682 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
683#endif
684 )) {
625 685
626 alias_cpa = *cpa; 686 alias_cpa = *cpa;
627 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 687 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
@@ -789,7 +849,7 @@ int set_memory_uc(unsigned long addr, int numpages)
789 /* 849 /*
790 * for now UC MINUS. see comments in ioremap_nocache() 850 * for now UC MINUS. see comments in ioremap_nocache()
791 */ 851 */
792 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 852 if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
793 _PAGE_CACHE_UC_MINUS, NULL)) 853 _PAGE_CACHE_UC_MINUS, NULL))
794 return -EINVAL; 854 return -EINVAL;
795 855
@@ -805,10 +865,10 @@ int _set_memory_wc(unsigned long addr, int numpages)
805 865
806int set_memory_wc(unsigned long addr, int numpages) 866int set_memory_wc(unsigned long addr, int numpages)
807{ 867{
808 if (!pat_wc_enabled) 868 if (!pat_enabled)
809 return set_memory_uc(addr, numpages); 869 return set_memory_uc(addr, numpages);
810 870
811 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 871 if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
812 _PAGE_CACHE_WC, NULL)) 872 _PAGE_CACHE_WC, NULL))
813 return -EINVAL; 873 return -EINVAL;
814 874
@@ -824,7 +884,7 @@ int _set_memory_wb(unsigned long addr, int numpages)
824 884
825int set_memory_wb(unsigned long addr, int numpages) 885int set_memory_wb(unsigned long addr, int numpages)
826{ 886{
827 free_memtype(addr, addr + numpages * PAGE_SIZE); 887 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
828 888
829 return _set_memory_wb(addr, numpages); 889 return _set_memory_wb(addr, numpages);
830} 890}
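
The new arch_report_meminfo() converts per-level page counts into kB with plain shifts: a 4 kB page is 2^2 kB, a 2 MB page 2^11 kB, a 4 MB page 2^12 kB and a 1 GB page 2^20 kB. A runnable sketch of that arithmetic with made-up counts:

/* Why arch_report_meminfo() shifts the per-level page counts by
 * 2, 11 (or 12) and 20 to print kB.  Plain C, counts are made up. */
#include <stdio.h>

int main(void)
{
	unsigned long n4k = 1000, n2m = 300, n1g = 2;

	/* 4 KB = 2^2 kB,  2 MB = 2^11 kB,  1 GB = 2^20 kB */
	printf("DirectMap4k: %8lu kB\n", n4k << 2);
	printf("DirectMap2M: %8lu kB\n", n2m << 11);
	printf("DirectMap1G: %8lu kB\n", n1g << 20);
	return 0;
}
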
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index de3a99812450..2a50e0fa64a5 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -12,6 +12,8 @@
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/debugfs.h>
16#include <linux/seq_file.h>
15 17
16#include <asm/msr.h> 18#include <asm/msr.h>
17#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
@@ -26,15 +28,15 @@
26#include <asm/io.h> 28#include <asm/io.h>
27 29
28#ifdef CONFIG_X86_PAT 30#ifdef CONFIG_X86_PAT
29int __read_mostly pat_wc_enabled = 1; 31int __read_mostly pat_enabled = 1;
30 32
31void __cpuinit pat_disable(char *reason) 33void __cpuinit pat_disable(char *reason)
32{ 34{
33 pat_wc_enabled = 0; 35 pat_enabled = 0;
34 printk(KERN_INFO "%s\n", reason); 36 printk(KERN_INFO "%s\n", reason);
35} 37}
36 38
37static int nopat(char *str) 39static int __init nopat(char *str)
38{ 40{
39 pat_disable("PAT support disabled."); 41 pat_disable("PAT support disabled.");
40 return 0; 42 return 0;
@@ -42,6 +44,19 @@ static int nopat(char *str)
42early_param("nopat", nopat); 44early_param("nopat", nopat);
43#endif 45#endif
44 46
47
48static int debug_enable;
49static int __init pat_debug_setup(char *str)
50{
51 debug_enable = 1;
52 return 0;
53}
54__setup("debugpat", pat_debug_setup);
55
56#define dprintk(fmt, arg...) \
57 do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
58
59
45static u64 __read_mostly boot_pat_state; 60static u64 __read_mostly boot_pat_state;
46 61
47enum { 62enum {
@@ -53,24 +68,25 @@ enum {
 53 PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ 68 PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */
54}; 69};
55 70
56#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8)) 71#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
57 72
58void pat_init(void) 73void pat_init(void)
59{ 74{
60 u64 pat; 75 u64 pat;
61 76
62 if (!pat_wc_enabled) 77 if (!pat_enabled)
63 return; 78 return;
64 79
65 /* Paranoia check. */ 80 /* Paranoia check. */
66 if (!cpu_has_pat) { 81 if (!cpu_has_pat && boot_pat_state) {
67 printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
68 /* 82 /*
69 * Panic if this happens on the secondary CPU, and we 83 * If this happens we are on a secondary CPU, but
70 * switched to PAT on the boot CPU. We have no way to 84 * switched to PAT on the boot CPU. We have no way to
71 * undo PAT. 85 * undo PAT.
72 */ 86 */
73 BUG_ON(boot_pat_state); 87 printk(KERN_ERR "PAT enabled, "
88 "but not supported by secondary CPU\n");
89 BUG();
74 } 90 }
75 91
76 /* Set PWT to Write-Combining. All other bits stay the same */ 92 /* Set PWT to Write-Combining. All other bits stay the same */
@@ -86,8 +102,8 @@ void pat_init(void)
86 * 011 UC _PAGE_CACHE_UC 102 * 011 UC _PAGE_CACHE_UC
87 * PAT bit unused 103 * PAT bit unused
88 */ 104 */
89 pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) | 105 pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
90 PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC); 106 PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
91 107
92 /* Boot CPU check */ 108 /* Boot CPU check */
93 if (!boot_pat_state) 109 if (!boot_pat_state)
@@ -103,11 +119,11 @@ void pat_init(void)
103static char *cattr_name(unsigned long flags) 119static char *cattr_name(unsigned long flags)
104{ 120{
105 switch (flags & _PAGE_CACHE_MASK) { 121 switch (flags & _PAGE_CACHE_MASK) {
106 case _PAGE_CACHE_UC: return "uncached"; 122 case _PAGE_CACHE_UC: return "uncached";
107 case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; 123 case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
108 case _PAGE_CACHE_WB: return "write-back"; 124 case _PAGE_CACHE_WB: return "write-back";
109 case _PAGE_CACHE_WC: return "write-combining"; 125 case _PAGE_CACHE_WC: return "write-combining";
110 default: return "broken"; 126 default: return "broken";
111 } 127 }
112} 128}
113 129
@@ -145,48 +161,55 @@ static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
145 * The intersection is based on "Effective Memory Type" tables in IA-32 161 * The intersection is based on "Effective Memory Type" tables in IA-32
146 * SDM vol 3a 162 * SDM vol 3a
147 */ 163 */
148static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, 164static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
149 unsigned long *ret_prot)
150{ 165{
151 unsigned long pat_type; 166 /*
152 u8 mtrr_type; 167 * Look for MTRR hint to get the effective type in case where PAT
153 168 * request is for WB.
154 mtrr_type = mtrr_type_lookup(start, end); 169 */
155 if (mtrr_type == 0xFF) { /* MTRR not enabled */ 170 if (req_type == _PAGE_CACHE_WB) {
156 *ret_prot = prot; 171 u8 mtrr_type;
157 return 0; 172
158 } 173 mtrr_type = mtrr_type_lookup(start, end);
159 if (mtrr_type == 0xFE) { /* MTRR match error */ 174 if (mtrr_type == MTRR_TYPE_UNCACHABLE)
160 *ret_prot = _PAGE_CACHE_UC; 175 return _PAGE_CACHE_UC;
161 return -1; 176 if (mtrr_type == MTRR_TYPE_WRCOMB)
162 } 177 return _PAGE_CACHE_WC;
163 if (mtrr_type != MTRR_TYPE_UNCACHABLE &&
164 mtrr_type != MTRR_TYPE_WRBACK &&
165 mtrr_type != MTRR_TYPE_WRCOMB) { /* MTRR type unhandled */
166 *ret_prot = _PAGE_CACHE_UC;
167 return -1;
168 } 178 }
169 179
170 pat_type = prot & _PAGE_CACHE_MASK; 180 return req_type;
171 prot &= (~_PAGE_CACHE_MASK); 181}
172 182
173 /* Currently doing intersection by hand. Optimize it later. */ 183static int chk_conflict(struct memtype *new, struct memtype *entry,
174 if (pat_type == _PAGE_CACHE_WC) { 184 unsigned long *type)
175 *ret_prot = prot | _PAGE_CACHE_WC; 185{
176 } else if (pat_type == _PAGE_CACHE_UC_MINUS) { 186 if (new->type != entry->type) {
177 *ret_prot = prot | _PAGE_CACHE_UC_MINUS; 187 if (type) {
178 } else if (pat_type == _PAGE_CACHE_UC || 188 new->type = entry->type;
179 mtrr_type == MTRR_TYPE_UNCACHABLE) { 189 *type = entry->type;
180 *ret_prot = prot | _PAGE_CACHE_UC; 190 } else
181 } else if (mtrr_type == MTRR_TYPE_WRCOMB) { 191 goto conflict;
182 *ret_prot = prot | _PAGE_CACHE_WC;
183 } else {
184 *ret_prot = prot | _PAGE_CACHE_WB;
185 } 192 }
186 193
194 /* check overlaps with more than one entry in the list */
195 list_for_each_entry_continue(entry, &memtype_list, nd) {
196 if (new->end <= entry->start)
197 break;
198 else if (new->type != entry->type)
199 goto conflict;
200 }
187 return 0; 201 return 0;
202
203 conflict:
204 printk(KERN_INFO "%s:%d conflicting memory types "
205 "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
206 new->end, cattr_name(new->type), cattr_name(entry->type));
207 return -EBUSY;
188} 208}
189 209
210static struct memtype *cached_entry;
211static u64 cached_start;
212
190/* 213/*
191 * req_type typically has one of the: 214 * req_type typically has one of the:
192 * - _PAGE_CACHE_WB 215 * - _PAGE_CACHE_WB
@@ -197,251 +220,160 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
 197 * req_type will have a special case value '-1', when the requester wants to inherit 220
198 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. 221 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
199 * 222 *
200 * If ret_type is NULL, function will return an error if it cannot reserve the 223 * If new_type is NULL, function will return an error if it cannot reserve the
201 * region with req_type. If ret_type is non-null, function will return 224 * region with req_type. If new_type is non-NULL, function will return
202 * available type in ret_type in case of no error. In case of any error 225 * available type in new_type in case of no error. In case of any error
203 * it will return a negative return value. 226 * it will return a negative return value.
204 */ 227 */
205int reserve_memtype(u64 start, u64 end, unsigned long req_type, 228int reserve_memtype(u64 start, u64 end, unsigned long req_type,
206 unsigned long *ret_type) 229 unsigned long *new_type)
207{ 230{
208 struct memtype *new_entry = NULL; 231 struct memtype *new, *entry;
209 struct memtype *parse;
210 unsigned long actual_type; 232 unsigned long actual_type;
233 struct list_head *where;
211 int err = 0; 234 int err = 0;
212 235
213 /* Only track when pat_wc_enabled */ 236 BUG_ON(start >= end); /* end is exclusive */
214 if (!pat_wc_enabled) { 237
238 if (!pat_enabled) {
215 /* This is identical to page table setting without PAT */ 239 /* This is identical to page table setting without PAT */
216 if (ret_type) { 240 if (new_type) {
217 if (req_type == -1) { 241 if (req_type == -1)
218 *ret_type = _PAGE_CACHE_WB; 242 *new_type = _PAGE_CACHE_WB;
219 } else { 243 else
220 *ret_type = req_type; 244 *new_type = req_type & _PAGE_CACHE_MASK;
221 }
222 } 245 }
223 return 0; 246 return 0;
224 } 247 }
225 248
226 /* Low ISA region is always mapped WB in page table. No need to track */ 249 /* Low ISA region is always mapped WB in page table. No need to track */
227 if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) { 250 if (is_ISA_range(start, end - 1)) {
228 if (ret_type) 251 if (new_type)
229 *ret_type = _PAGE_CACHE_WB; 252 *new_type = _PAGE_CACHE_WB;
230
231 return 0; 253 return 0;
232 } 254 }
233 255
234 if (req_type == -1) { 256 if (req_type == -1) {
235 /* 257 /*
236 * Special case where caller wants to inherit from mtrr or 258 * Call mtrr_lookup to get the type hint. This is an
237 * existing pat mapping, defaulting to UC_MINUS in case of 259 * optimization for /dev/mem mmap'ers into WB memory (BIOS
238 * no match. 260 * tools and ACPI tools). Use WB request for WB memory and use
261 * UC_MINUS otherwise.
239 */ 262 */
240 u8 mtrr_type = mtrr_type_lookup(start, end); 263 u8 mtrr_type = mtrr_type_lookup(start, end);
241 if (mtrr_type == 0xFE) { /* MTRR match error */
242 err = -1;
243 }
244 264
245 if (mtrr_type == MTRR_TYPE_WRBACK) { 265 if (mtrr_type == MTRR_TYPE_WRBACK)
246 req_type = _PAGE_CACHE_WB;
247 actual_type = _PAGE_CACHE_WB; 266 actual_type = _PAGE_CACHE_WB;
248 } else { 267 else
249 req_type = _PAGE_CACHE_UC_MINUS;
250 actual_type = _PAGE_CACHE_UC_MINUS; 268 actual_type = _PAGE_CACHE_UC_MINUS;
251 } 269 } else
252 } else { 270 actual_type = pat_x_mtrr_type(start, end,
253 req_type &= _PAGE_CACHE_MASK; 271 req_type & _PAGE_CACHE_MASK);
254 err = pat_x_mtrr_type(start, end, req_type, &actual_type);
255 }
256
257 if (err) {
258 if (ret_type)
259 *ret_type = actual_type;
260 272
261 return -EINVAL; 273 new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
262 } 274 if (!new)
263
264 new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
265 if (!new_entry)
266 return -ENOMEM; 275 return -ENOMEM;
267 276
268 new_entry->start = start; 277 new->start = start;
269 new_entry->end = end; 278 new->end = end;
270 new_entry->type = actual_type; 279 new->type = actual_type;
271 280
272 if (ret_type) 281 if (new_type)
273 *ret_type = actual_type; 282 *new_type = actual_type;
274 283
275 spin_lock(&memtype_lock); 284 spin_lock(&memtype_lock);
276 285
277 /* Search for existing mapping that overlaps the current range */ 286 if (cached_entry && start >= cached_start)
278 list_for_each_entry(parse, &memtype_list, nd) { 287 entry = cached_entry;
279 struct memtype *saved_ptr; 288 else
289 entry = list_entry(&memtype_list, struct memtype, nd);
280 290
281 if (parse->start >= end) { 291 /* Search for existing mapping that overlaps the current range */
282 pr_debug("New Entry\n"); 292 where = NULL;
283 list_add(&new_entry->nd, parse->nd.prev); 293 list_for_each_entry_continue(entry, &memtype_list, nd) {
284 new_entry = NULL; 294 if (end <= entry->start) {
295 where = entry->nd.prev;
296 cached_entry = list_entry(where, struct memtype, nd);
285 break; 297 break;
286 } 298 } else if (start <= entry->start) { /* end > entry->start */
287 299 err = chk_conflict(new, entry, new_type);
288 if (start <= parse->start && end >= parse->start) { 300 if (!err) {
289 if (actual_type != parse->type && ret_type) { 301 dprintk("Overlap at 0x%Lx-0x%Lx\n",
290 actual_type = parse->type; 302 entry->start, entry->end);
291 *ret_type = actual_type; 303 where = entry->nd.prev;
292 new_entry->type = actual_type; 304 cached_entry = list_entry(where,
305 struct memtype, nd);
293 } 306 }
294
295 if (actual_type != parse->type) {
296 printk(
297 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
298 current->comm, current->pid,
299 start, end,
300 cattr_name(actual_type),
301 cattr_name(parse->type));
302 err = -EBUSY;
303 break;
304 }
305
306 saved_ptr = parse;
307 /*
308 * Check to see whether the request overlaps more
309 * than one entry in the list
310 */
311 list_for_each_entry_continue(parse, &memtype_list, nd) {
312 if (end <= parse->start) {
313 break;
314 }
315
316 if (actual_type != parse->type) {
317 printk(
318 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
319 current->comm, current->pid,
320 start, end,
321 cattr_name(actual_type),
322 cattr_name(parse->type));
323 err = -EBUSY;
324 break;
325 }
326 }
327
328 if (err) {
329 break;
330 }
331
332 pr_debug("Overlap at 0x%Lx-0x%Lx\n",
333 saved_ptr->start, saved_ptr->end);
334 /* No conflict. Go ahead and add this new entry */
335 list_add(&new_entry->nd, saved_ptr->nd.prev);
336 new_entry = NULL;
337 break; 307 break;
338 } 308 } else if (start < entry->end) { /* start > entry->start */
339 309 err = chk_conflict(new, entry, new_type);
340 if (start < parse->end) { 310 if (!err) {
341 if (actual_type != parse->type && ret_type) { 311 dprintk("Overlap at 0x%Lx-0x%Lx\n",
342 actual_type = parse->type; 312 entry->start, entry->end);
343 *ret_type = actual_type; 313 cached_entry = list_entry(entry->nd.prev,
344 new_entry->type = actual_type; 314 struct memtype, nd);
345 } 315
346 316 /*
347 if (actual_type != parse->type) { 317 * Move to right position in the linked
348 printk( 318 * list to add this new entry
349 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", 319 */
350 current->comm, current->pid, 320 list_for_each_entry_continue(entry,
351 start, end, 321 &memtype_list, nd) {
352 cattr_name(actual_type), 322 if (start <= entry->start) {
353 cattr_name(parse->type)); 323 where = entry->nd.prev;
354 err = -EBUSY; 324 break;
355 break; 325 }
356 }
357
358 saved_ptr = parse;
359 /*
360 * Check to see whether the request overlaps more
361 * than one entry in the list
362 */
363 list_for_each_entry_continue(parse, &memtype_list, nd) {
364 if (end <= parse->start) {
365 break;
366 }
367
368 if (actual_type != parse->type) {
369 printk(
370 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
371 current->comm, current->pid,
372 start, end,
373 cattr_name(actual_type),
374 cattr_name(parse->type));
375 err = -EBUSY;
376 break;
377 } 326 }
378 } 327 }
379
380 if (err) {
381 break;
382 }
383
384 pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
385 saved_ptr->start, saved_ptr->end);
386 /* No conflict. Go ahead and add this new entry */
387 list_add(&new_entry->nd, &saved_ptr->nd);
388 new_entry = NULL;
389 break; 328 break;
390 } 329 }
391 } 330 }
392 331
393 if (err) { 332 if (err) {
394 printk(KERN_INFO 333 printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
395 "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", 334 "track %s, req %s\n",
396 start, end, cattr_name(new_entry->type), 335 start, end, cattr_name(new->type), cattr_name(req_type));
397 cattr_name(req_type)); 336 kfree(new);
398 kfree(new_entry);
399 spin_unlock(&memtype_lock); 337 spin_unlock(&memtype_lock);
400 return err; 338 return err;
401 } 339 }
402 340
403 if (new_entry) { 341 cached_start = start;
404 /* No conflict. Not yet added to the list. Add to the tail */
405 list_add_tail(&new_entry->nd, &memtype_list);
406 pr_debug("New Entry\n");
407 }
408 342
409 if (ret_type) { 343 if (where)
410 pr_debug( 344 list_add(&new->nd, where);
411 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", 345 else
412 start, end, cattr_name(actual_type), 346 list_add_tail(&new->nd, &memtype_list);
413 cattr_name(req_type), cattr_name(*ret_type));
414 } else {
415 pr_debug(
416 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
417 start, end, cattr_name(actual_type),
418 cattr_name(req_type));
419 }
420 347
421 spin_unlock(&memtype_lock); 348 spin_unlock(&memtype_lock);
349
350 dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
351 start, end, cattr_name(new->type), cattr_name(req_type),
352 new_type ? cattr_name(*new_type) : "-");
353
422 return err; 354 return err;
423} 355}
424 356
425int free_memtype(u64 start, u64 end) 357int free_memtype(u64 start, u64 end)
426{ 358{
427 struct memtype *ml; 359 struct memtype *entry;
428 int err = -EINVAL; 360 int err = -EINVAL;
429 361
430 /* Only track when pat_wc_enabled */ 362 if (!pat_enabled)
431 if (!pat_wc_enabled) {
432 return 0; 363 return 0;
433 }
434 364
435 /* Low ISA region is always mapped WB. No need to track */ 365 /* Low ISA region is always mapped WB. No need to track */
436 if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) { 366 if (is_ISA_range(start, end - 1))
437 return 0; 367 return 0;
438 }
439 368
440 spin_lock(&memtype_lock); 369 spin_lock(&memtype_lock);
441 list_for_each_entry(ml, &memtype_list, nd) { 370 list_for_each_entry(entry, &memtype_list, nd) {
442 if (ml->start == start && ml->end == end) { 371 if (entry->start == start && entry->end == end) {
443 list_del(&ml->nd); 372 if (cached_entry == entry || cached_start == start)
444 kfree(ml); 373 cached_entry = NULL;
374
375 list_del(&entry->nd);
376 kfree(entry);
445 err = 0; 377 err = 0;
446 break; 378 break;
447 } 379 }
@@ -453,27 +385,19 @@ int free_memtype(u64 start, u64 end)
453 current->comm, current->pid, start, end); 385 current->comm, current->pid, start, end);
454 } 386 }
455 387
456 pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end); 388 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
457 return err; 389 return err;
458} 390}
459 391
460 392
461/*
462 * /dev/mem mmap interface. The memtype used for mapping varies:
463 * - Use UC for mappings with O_SYNC flag
464 * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
465 * inherit the memtype from existing mapping.
466 * - Else use UC_MINUS memtype (for backward compatibility with existing
467 * X drivers.
468 */
469pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 393pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
470 unsigned long size, pgprot_t vma_prot) 394 unsigned long size, pgprot_t vma_prot)
471{ 395{
472 return vma_prot; 396 return vma_prot;
473} 397}
474 398
475#ifdef CONFIG_NONPROMISC_DEVMEM 399#ifdef CONFIG_STRICT_DEVMEM
476/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/ 400/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
477static inline int range_is_allowed(unsigned long pfn, unsigned long size) 401static inline int range_is_allowed(unsigned long pfn, unsigned long size)
478{ 402{
479 return 1; 403 return 1;
@@ -497,20 +421,20 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
497 } 421 }
498 return 1; 422 return 1;
499} 423}
500#endif /* CONFIG_NONPROMISC_DEVMEM */ 424#endif /* CONFIG_STRICT_DEVMEM */
501 425
502int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, 426int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
503 unsigned long size, pgprot_t *vma_prot) 427 unsigned long size, pgprot_t *vma_prot)
504{ 428{
505 u64 offset = ((u64) pfn) << PAGE_SHIFT; 429 u64 offset = ((u64) pfn) << PAGE_SHIFT;
506 unsigned long flags = _PAGE_CACHE_UC_MINUS; 430 unsigned long flags = -1;
507 int retval; 431 int retval;
508 432
509 if (!range_is_allowed(pfn, size)) 433 if (!range_is_allowed(pfn, size))
510 return 0; 434 return 0;
511 435
512 if (file->f_flags & O_SYNC) { 436 if (file->f_flags & O_SYNC) {
513 flags = _PAGE_CACHE_UC; 437 flags = _PAGE_CACHE_UC_MINUS;
514 } 438 }
515 439
516#ifdef CONFIG_X86_32 440#ifdef CONFIG_X86_32
@@ -522,24 +446,25 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
522 * caching for the high addresses through the KEN pin, but 446 * caching for the high addresses through the KEN pin, but
523 * we maintain the tradition of paranoia in this code. 447 * we maintain the tradition of paranoia in this code.
524 */ 448 */
525 if (!pat_wc_enabled && 449 if (!pat_enabled &&
526 ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || 450 !(boot_cpu_has(X86_FEATURE_MTRR) ||
527 test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || 451 boot_cpu_has(X86_FEATURE_K6_MTRR) ||
528 test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || 452 boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
529 test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) && 453 boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
530 (pfn << PAGE_SHIFT) >= __pa(high_memory)) { 454 (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
531 flags = _PAGE_CACHE_UC; 455 flags = _PAGE_CACHE_UC;
532 } 456 }
533#endif 457#endif
534 458
535 /* 459 /*
536 * With O_SYNC, we can only take UC mapping. Fail if we cannot. 460 * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
461 *
537 * Without O_SYNC, we want to get 462 * Without O_SYNC, we want to get
538 * - WB for WB-able memory and no other conflicting mappings 463 * - WB for WB-able memory and no other conflicting mappings
539 * - UC_MINUS for non-WB-able memory with no other conflicting mappings 464 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
 540 * - Inherit from conflicting mappings otherwise 465
541 */ 466 */
542 if (flags != _PAGE_CACHE_UC_MINUS) { 467 if (flags != -1) {
543 retval = reserve_memtype(offset, offset + size, flags, NULL); 468 retval = reserve_memtype(offset, offset + size, flags, NULL);
544 } else { 469 } else {
545 retval = reserve_memtype(offset, offset + size, -1, &flags); 470 retval = reserve_memtype(offset, offset + size, -1, &flags);
@@ -548,8 +473,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
548 if (retval < 0) 473 if (retval < 0)
549 return 0; 474 return 0;
550 475
551 if (pfn <= max_pfn_mapped && 476 if (((pfn < max_low_pfn_mapped) ||
552 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { 477 (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) &&
478 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
553 free_memtype(offset, offset + size); 479 free_memtype(offset, offset + size);
554 printk(KERN_INFO 480 printk(KERN_INFO
555 "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", 481 "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
@@ -588,3 +514,88 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
588 free_memtype(addr, addr + size); 514 free_memtype(addr, addr + size);
589} 515}
590 516
517#if defined(CONFIG_DEBUG_FS)
518
519/* get Nth element of the linked list */
520static struct memtype *memtype_get_idx(loff_t pos)
521{
522 struct memtype *list_node, *print_entry;
523 int i = 1;
524
525 print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
526 if (!print_entry)
527 return NULL;
528
529 spin_lock(&memtype_lock);
530 list_for_each_entry(list_node, &memtype_list, nd) {
531 if (pos == i) {
532 *print_entry = *list_node;
533 spin_unlock(&memtype_lock);
534 return print_entry;
535 }
536 ++i;
537 }
538 spin_unlock(&memtype_lock);
539 kfree(print_entry);
540 return NULL;
541}
542
543static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
544{
545 if (*pos == 0) {
546 ++*pos;
547 seq_printf(seq, "PAT memtype list:\n");
548 }
549
550 return memtype_get_idx(*pos);
551}
552
553static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
554{
555 ++*pos;
556 return memtype_get_idx(*pos);
557}
558
559static void memtype_seq_stop(struct seq_file *seq, void *v)
560{
561}
562
563static int memtype_seq_show(struct seq_file *seq, void *v)
564{
565 struct memtype *print_entry = (struct memtype *)v;
566
567 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
568 print_entry->start, print_entry->end);
569 kfree(print_entry);
570 return 0;
571}
572
573static struct seq_operations memtype_seq_ops = {
574 .start = memtype_seq_start,
575 .next = memtype_seq_next,
576 .stop = memtype_seq_stop,
577 .show = memtype_seq_show,
578};
579
580static int memtype_seq_open(struct inode *inode, struct file *file)
581{
582 return seq_open(file, &memtype_seq_ops);
583}
584
585static const struct file_operations memtype_fops = {
586 .open = memtype_seq_open,
587 .read = seq_read,
588 .llseek = seq_lseek,
589 .release = seq_release,
590};
591
592static int __init pat_memtype_list_init(void)
593{
594 debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
595 NULL, &memtype_fops);
596 return 0;
597}
598
599late_initcall(pat_memtype_list_init);
600
601#endif /* CONFIG_DEBUG_FS */
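
The reworked reserve_memtype() keeps memtype_list sorted by start address, lets a new range overlap existing entries only when the cache type matches (chk_conflict), and remembers the last hit in cached_entry to shortcut repeated reservations. A self-contained sketch of the ordering and conflict rule, using an array instead of the kernel's linked list and omitting the cached_entry optimization; the types are arbitrary small integers here:

/* Sketch of the memtype bookkeeping: a list kept sorted by start
 * address, where a new range may overlap existing entries only if
 * the cache type matches. */
#include <stdio.h>

struct range { unsigned long long start, end; int type; };

static struct range list[32];
static int nr;

/* Returns 0 and inserts on success, -1 on a conflicting overlap. */
static int reserve(unsigned long long start, unsigned long long end, int type)
{
	int i, pos = nr;

	/* Reject any overlap whose cache type differs. */
	for (i = 0; i < nr; i++)
		if (start < list[i].end && end > list[i].start &&
		    type != list[i].type)
			return -1;

	/* Keep the list sorted by start address. */
	for (i = 0; i < nr; i++)
		if (start <= list[i].start) {
			pos = i;
			break;
		}

	for (i = nr; i > pos; i--)
		list[i] = list[i - 1];
	list[pos] = (struct range){ start, end, type };
	nr++;
	return 0;
}

int main(void)
{
	printf("%d\n", reserve(0x1000, 0x2000, 1));	/*  0                       */
	printf("%d\n", reserve(0x3000, 0x4000, 2));	/*  0                       */
	printf("%d\n", reserve(0x1800, 0x2800, 2));	/* -1: clashes with type 1  */
	printf("%d\n", reserve(0x1800, 0x1c00, 1));	/*  0: same-type overlap ok */
	return 0;
}
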
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
new file mode 100644
index 000000000000..efa1911e20ca
--- /dev/null
+++ b/arch/x86/mm/pf_in.c
@@ -0,0 +1,489 @@
1/*
2 * Fault Injection Test harness (FI)
 3 * Copyright (C) Intel Corp.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
18 * USA.
19 *
20 */
21
22/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp
 23 * Copyright by Intel Corp., 2002
24 * Louis Zhuang (louis.zhuang@intel.com)
25 *
26 * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007
27 */
28
29#include <linux/module.h>
30#include <linux/ptrace.h> /* struct pt_regs */
31#include "pf_in.h"
32
33#ifdef __i386__
34/* IA32 Manual 3, 2-1 */
35static unsigned char prefix_codes[] = {
36 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
37 0x65, 0x2E, 0x3E, 0x66, 0x67
38};
39/* IA32 Manual 3, 3-432*/
40static unsigned int reg_rop[] = {
41 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
42};
43static unsigned int reg_wop[] = { 0x88, 0x89 };
44static unsigned int imm_wop[] = { 0xC6, 0xC7 };
45/* IA32 Manual 3, 3-432*/
46static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 };
47static unsigned int rw32[] = {
48 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
49};
50static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F };
51static unsigned int mw16[] = { 0xB70F, 0xBF0F };
52static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 };
53static unsigned int mw64[] = {};
54#else /* not __i386__ */
55static unsigned char prefix_codes[] = {
56 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36,
57 0xF0, 0xF3, 0xF2,
58 /* REX Prefixes */
59 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
60 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f
61};
62/* AMD64 Manual 3, Appendix A*/
63static unsigned int reg_rop[] = {
64 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
65};
66static unsigned int reg_wop[] = { 0x88, 0x89 };
67static unsigned int imm_wop[] = { 0xC6, 0xC7 };
68static unsigned int rw8[] = { 0xC6, 0x88, 0x8A };
69static unsigned int rw32[] = {
70 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
71};
72/* 8 bit only */
73static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F };
74/* 16 bit only */
75static unsigned int mw16[] = { 0xB70F, 0xBF0F };
76/* 16 or 32 bit */
77static unsigned int mw32[] = { 0xC7 };
78/* 16, 32 or 64 bit */
79static unsigned int mw64[] = { 0x89, 0x8B };
80#endif /* not __i386__ */
81
82static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged,
83 int *rexr)
84{
85 int i;
86 unsigned char *p = addr;
87 *shorted = 0;
88 *enlarged = 0;
89 *rexr = 0;
90
91restart:
92 for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
93 if (*p == prefix_codes[i]) {
94 if (*p == 0x66)
95 *shorted = 1;
96#ifdef __amd64__
97 if ((*p & 0xf8) == 0x48)
98 *enlarged = 1;
99 if ((*p & 0xf4) == 0x44)
100 *rexr = 1;
101#endif
102 p++;
103 goto restart;
104 }
105 }
106
107 return (p - addr);
108}
109
110static int get_opcode(unsigned char *addr, unsigned int *opcode)
111{
112 int len;
113
114 if (*addr == 0x0F) {
115 /* 0x0F is extension instruction */
116 *opcode = *(unsigned short *)addr;
117 len = 2;
118 } else {
119 *opcode = *addr;
120 len = 1;
121 }
122
123 return len;
124}
125
126#define CHECK_OP_TYPE(opcode, array, type) \
127 for (i = 0; i < ARRAY_SIZE(array); i++) { \
128 if (array[i] == opcode) { \
129 rv = type; \
130 goto exit; \
131 } \
132 }
133
134enum reason_type get_ins_type(unsigned long ins_addr)
135{
136 unsigned int opcode;
137 unsigned char *p;
138 int shorted, enlarged, rexr;
139 int i;
140 enum reason_type rv = OTHERS;
141
142 p = (unsigned char *)ins_addr;
143 p += skip_prefix(p, &shorted, &enlarged, &rexr);
144 p += get_opcode(p, &opcode);
145
146 CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
147 CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE);
148 CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE);
149
150exit:
151 return rv;
152}
153#undef CHECK_OP_TYPE
154
155static unsigned int get_ins_reg_width(unsigned long ins_addr)
156{
157 unsigned int opcode;
158 unsigned char *p;
159 int i, shorted, enlarged, rexr;
160
161 p = (unsigned char *)ins_addr;
162 p += skip_prefix(p, &shorted, &enlarged, &rexr);
163 p += get_opcode(p, &opcode);
164
165 for (i = 0; i < ARRAY_SIZE(rw8); i++)
166 if (rw8[i] == opcode)
167 return 1;
168
169 for (i = 0; i < ARRAY_SIZE(rw32); i++)
170 if (rw32[i] == opcode)
171 return (shorted ? 2 : (enlarged ? 8 : 4));
172
173 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
174 return 0;
175}
176
177unsigned int get_ins_mem_width(unsigned long ins_addr)
178{
179 unsigned int opcode;
180 unsigned char *p;
181 int i, shorted, enlarged, rexr;
182
183 p = (unsigned char *)ins_addr;
184 p += skip_prefix(p, &shorted, &enlarged, &rexr);
185 p += get_opcode(p, &opcode);
186
187 for (i = 0; i < ARRAY_SIZE(mw8); i++)
188 if (mw8[i] == opcode)
189 return 1;
190
191 for (i = 0; i < ARRAY_SIZE(mw16); i++)
192 if (mw16[i] == opcode)
193 return 2;
194
195 for (i = 0; i < ARRAY_SIZE(mw32); i++)
196 if (mw32[i] == opcode)
197 return shorted ? 2 : 4;
198
199 for (i = 0; i < ARRAY_SIZE(mw64); i++)
200 if (mw64[i] == opcode)
201 return shorted ? 2 : (enlarged ? 8 : 4);
202
203 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
204 return 0;
205}
206
207/*
208 * Define register ident in mod/rm byte.
209 * Note: these are NOT the same as in ptrace-abi.h.
210 */
211enum {
212 arg_AL = 0,
213 arg_CL = 1,
214 arg_DL = 2,
215 arg_BL = 3,
216 arg_AH = 4,
217 arg_CH = 5,
218 arg_DH = 6,
219 arg_BH = 7,
220
221 arg_AX = 0,
222 arg_CX = 1,
223 arg_DX = 2,
224 arg_BX = 3,
225 arg_SP = 4,
226 arg_BP = 5,
227 arg_SI = 6,
228 arg_DI = 7,
229#ifdef __amd64__
230 arg_R8 = 8,
231 arg_R9 = 9,
232 arg_R10 = 10,
233 arg_R11 = 11,
234 arg_R12 = 12,
235 arg_R13 = 13,
236 arg_R14 = 14,
237 arg_R15 = 15
238#endif
239};
240
241static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
242{
243 unsigned char *rv = NULL;
244
245 switch (no) {
246 case arg_AL:
247 rv = (unsigned char *)&regs->ax;
248 break;
249 case arg_BL:
250 rv = (unsigned char *)&regs->bx;
251 break;
252 case arg_CL:
253 rv = (unsigned char *)&regs->cx;
254 break;
255 case arg_DL:
256 rv = (unsigned char *)&regs->dx;
257 break;
258 case arg_AH:
259 rv = 1 + (unsigned char *)&regs->ax;
260 break;
261 case arg_BH:
262 rv = 1 + (unsigned char *)&regs->bx;
263 break;
264 case arg_CH:
265 rv = 1 + (unsigned char *)&regs->cx;
266 break;
267 case arg_DH:
268 rv = 1 + (unsigned char *)&regs->dx;
269 break;
270#ifdef __amd64__
271 case arg_R8:
272 rv = (unsigned char *)&regs->r8;
273 break;
274 case arg_R9:
275 rv = (unsigned char *)&regs->r9;
276 break;
277 case arg_R10:
278 rv = (unsigned char *)&regs->r10;
279 break;
280 case arg_R11:
281 rv = (unsigned char *)&regs->r11;
282 break;
283 case arg_R12:
284 rv = (unsigned char *)&regs->r12;
285 break;
286 case arg_R13:
287 rv = (unsigned char *)&regs->r13;
288 break;
289 case arg_R14:
290 rv = (unsigned char *)&regs->r14;
291 break;
292 case arg_R15:
293 rv = (unsigned char *)&regs->r15;
294 break;
295#endif
296 default:
297 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
298 break;
299 }
300 return rv;
301}
302
303static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
304{
305 unsigned long *rv = NULL;
306
307 switch (no) {
308 case arg_AX:
309 rv = &regs->ax;
310 break;
311 case arg_BX:
312 rv = &regs->bx;
313 break;
314 case arg_CX:
315 rv = &regs->cx;
316 break;
317 case arg_DX:
318 rv = &regs->dx;
319 break;
320 case arg_SP:
321 rv = &regs->sp;
322 break;
323 case arg_BP:
324 rv = &regs->bp;
325 break;
326 case arg_SI:
327 rv = &regs->si;
328 break;
329 case arg_DI:
330 rv = &regs->di;
331 break;
332#ifdef __amd64__
333 case arg_R8:
334 rv = &regs->r8;
335 break;
336 case arg_R9:
337 rv = &regs->r9;
338 break;
339 case arg_R10:
340 rv = &regs->r10;
341 break;
342 case arg_R11:
343 rv = &regs->r11;
344 break;
345 case arg_R12:
346 rv = &regs->r12;
347 break;
348 case arg_R13:
349 rv = &regs->r13;
350 break;
351 case arg_R14:
352 rv = &regs->r14;
353 break;
354 case arg_R15:
355 rv = &regs->r15;
356 break;
357#endif
358 default:
359 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
360 }
361
362 return rv;
363}
364
365unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
366{
367 unsigned int opcode;
368 unsigned char mod_rm;
369 int reg;
370 unsigned char *p;
371 int i, shorted, enlarged, rexr;
372 unsigned long rv;
373
374 p = (unsigned char *)ins_addr;
375 p += skip_prefix(p, &shorted, &enlarged, &rexr);
376 p += get_opcode(p, &opcode);
377 for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
378 if (reg_rop[i] == opcode) {
379 rv = REG_READ;
380 goto do_work;
381 }
382
383 for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
384 if (reg_wop[i] == opcode) {
385 rv = REG_WRITE;
386 goto do_work;
387 }
388
389 printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
390 "0x%02x\n", opcode);
391 goto err;
392
393do_work:
394 mod_rm = *p;
395 reg = ((mod_rm >> 3) & 0x7) | (rexr << 3);
396 switch (get_ins_reg_width(ins_addr)) {
397 case 1:
398 return *get_reg_w8(reg, regs);
399
400 case 2:
401 return *(unsigned short *)get_reg_w32(reg, regs);
402
403 case 4:
404 return *(unsigned int *)get_reg_w32(reg, regs);
405
406#ifdef __amd64__
407 case 8:
408 return *(unsigned long *)get_reg_w32(reg, regs);
409#endif
410
411 default:
412 printk(KERN_ERR "mmiotrace: Error width# %d\n", reg);
413 }
414
415err:
416 return 0;
417}
418
419unsigned long get_ins_imm_val(unsigned long ins_addr)
420{
421 unsigned int opcode;
422 unsigned char mod_rm;
423 unsigned char mod;
424 unsigned char *p;
425 int i, shorted, enlarged, rexr;
426 unsigned long rv;
427
428 p = (unsigned char *)ins_addr;
429 p += skip_prefix(p, &shorted, &enlarged, &rexr);
430 p += get_opcode(p, &opcode);
431 for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
432 if (imm_wop[i] == opcode) {
433 rv = IMM_WRITE;
434 goto do_work;
435 }
436
437 printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
438 "0x%02x\n", opcode);
439 goto err;
440
441do_work:
442 mod_rm = *p;
443 mod = mod_rm >> 6;
444 p++;
445 switch (mod) {
446 case 0:
447 /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */
448 /* AMD64: XXX Check for address size prefix? */
449 if ((mod_rm & 0x7) == 0x5)
450 p += 4;
451 break;
452
453 case 1:
454 p += 1;
455 break;
456
457 case 2:
458 p += 4;
459 break;
460
461 case 3:
462 default:
463 printk(KERN_ERR "mmiotrace: not a memory access instruction "
464 "at 0x%lx, rm_mod=0x%02x\n",
465 ins_addr, mod_rm);
466 }
467
468 switch (get_ins_reg_width(ins_addr)) {
469 case 1:
470 return *(unsigned char *)p;
471
472 case 2:
473 return *(unsigned short *)p;
474
475 case 4:
476 return *(unsigned int *)p;
477
478#ifdef __amd64__
479 case 8:
480 return *(unsigned long *)p;
481#endif
482
483 default:
484 printk(KERN_ERR "mmiotrace: Error: width.\n");
485 }
486
487err:
488 return 0;
489}
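
get_ins_reg_val() above skips legacy and REX prefixes and then pulls the register number out of the ModRM byte with ((mod_rm >> 3) & 0x7) | (rexr << 3), so REX.R supplies the fourth register bit. A runnable sketch of just that decode step on two hand-assembled stores; it handles only the 0x66 and REX prefixes and assumes a register-write opcode follows:

/* Prefix skipping plus ModRM reg-field decode, run on two stores. */
#include <stdio.h>

static int decode(const unsigned char *p, int len)
{
	int rexr = 0, shorted = 0;
	int i = 0, reg;

	/* Skip 0x66 and REX prefixes (a subset of the kernel's table). */
	while (i < len - 2 && (p[i] == 0x66 || (p[i] & 0xf0) == 0x40)) {
		if (p[i] == 0x66)
			shorted = 1;	/* operand-size override: 16-bit */
		else if (p[i] & 0x04)
			rexr = 1;	/* REX.R extends the reg field */
		i++;
	}

	/* p[i] is the opcode (0x88/0x89 = reg write), p[i+1] the ModRM byte. */
	reg = ((p[i + 1] >> 3) & 0x7) | (rexr << 3);
	printf("opcode 0x%02x shorted=%d reg#%d\n", p[i], shorted, reg);
	return reg;
}

int main(void)
{
	unsigned char mov_eax[] = { 0x89, 0x07 };	/* mov %eax,(%rdi) */
	unsigned char mov_r9d[] = { 0x44, 0x89, 0x0f };	/* mov %r9d,(%rdi) */

	decode(mov_eax, sizeof(mov_eax));	/* reg #0 (AX) */
	decode(mov_r9d, sizeof(mov_r9d));	/* reg #9 (R9) */
	return 0;
}
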
diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h
new file mode 100644
index 000000000000..e05341a51a27
--- /dev/null
+++ b/arch/x86/mm/pf_in.h
@@ -0,0 +1,39 @@
1/*
2 * Fault Injection Test harness (FI)
 3 * Copyright (C) Intel Corp.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
18 * USA.
19 *
20 */
21
22#ifndef __PF_H_
23#define __PF_H_
24
25enum reason_type {
26 NOT_ME, /* page fault is not in regions */
 27 NOTHING, /* access to some other point in the regions */
28 REG_READ, /* read from addr to reg */
29 REG_WRITE, /* write from reg to addr */
30 IMM_WRITE, /* write from imm to addr */
 31 OTHERS /* Other instructions we cannot intercept */
32};
33
34enum reason_type get_ins_type(unsigned long ins_addr);
35unsigned int get_ins_mem_width(unsigned long ins_addr);
36unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs);
37unsigned long get_ins_imm_val(unsigned long ins_addr);
38
39#endif /* __PF_H_ */
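
A hedged sketch of how a fault handler might consume this interface: classify the faulting instruction and, for writes, fetch the value being stored. Only the four pf_in.h calls and the reason_type values come from the header above; the hook name, its arguments and the pr_debug() reporting are hypothetical plumbing:

#include <linux/kernel.h>
#include <linux/ptrace.h>	/* struct pt_regs */
#include "pf_in.h"

/* Hypothetical consumer: called from an mmiotrace-style fault hook. */
static void trace_mmio_access(struct pt_regs *regs, unsigned long fault_addr)
{
	unsigned long ip = regs->ip;
	unsigned long val = 0;
	unsigned int width = get_ins_mem_width(ip);

	switch (get_ins_type(ip)) {
	case REG_WRITE:
		val = get_ins_reg_val(ip, regs);	/* value being stored */
		break;
	case IMM_WRITE:
		val = get_ins_imm_val(ip);		/* immediate operand */
		break;
	case REG_READ:
		/* value is only known after the load completes */
		break;
	default:
		return;					/* nothing to decode */
	}

	pr_debug("mmio access at 0x%lx: ip=0x%lx width=%u val=0x%lx\n",
		 fault_addr, ip, width, val);
}
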
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 50159764f694..d50302774fe2 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -2,6 +2,7 @@
2#include <asm/pgalloc.h> 2#include <asm/pgalloc.h>
3#include <asm/pgtable.h> 3#include <asm/pgtable.h>
4#include <asm/tlb.h> 4#include <asm/tlb.h>
5#include <asm/fixmap.h>
5 6
6pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 7pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
7{ 8{
@@ -65,12 +66,6 @@ static inline void pgd_list_del(pgd_t *pgd)
65static void pgd_ctor(void *p) 66static void pgd_ctor(void *p)
66{ 67{
67 pgd_t *pgd = p; 68 pgd_t *pgd = p;
68 unsigned long flags;
69
70 /* Clear usermode parts of PGD */
71 memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
72
73 spin_lock_irqsave(&pgd_lock, flags);
74 69
75 /* If the pgd points to a shared pagetable level (either the 70 /* If the pgd points to a shared pagetable level (either the
76 ptes in non-PAE, or shared PMD in PAE), then just copy the 71 ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -90,8 +85,6 @@ static void pgd_ctor(void *p)
90 /* list required to sync kernel mapping updates */ 85 /* list required to sync kernel mapping updates */
91 if (!SHARED_KERNEL_PMD) 86 if (!SHARED_KERNEL_PMD)
92 pgd_list_add(pgd); 87 pgd_list_add(pgd);
93
94 spin_unlock_irqrestore(&pgd_lock, flags);
95} 88}
96 89
97static void pgd_dtor(void *pgd) 90static void pgd_dtor(void *pgd)
@@ -119,6 +112,72 @@ static void pgd_dtor(void *pgd)
119 112
120#ifdef CONFIG_X86_PAE 113#ifdef CONFIG_X86_PAE
121/* 114/*
115 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
116 * updating the top-level pagetable entries to guarantee the
117 * processor notices the update. Since this is expensive, and
118 * all 4 top-level entries are used almost immediately in a
119 * new process's life, we just pre-populate them here.
120 *
121 * Also, if we're in a paravirt environment where the kernel pmd is
122 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
123 * and initialize the kernel pmds here.
124 */
125#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
126
127void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
128{
129 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
130
131 /* Note: almost everything apart from _PAGE_PRESENT is
132 reserved at the pmd (PDPT) level. */
133 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
134
135 /*
136 * According to Intel App note "TLBs, Paging-Structure Caches,
137 * and Their Invalidation", April 2007, document 317080-001,
138 * section 8.1: in PAE mode we explicitly have to flush the
139 * TLB via cr3 if the top-level pgd is changed...
140 */
141 if (mm == current->active_mm)
142 write_cr3(read_cr3());
143}
144#else /* !CONFIG_X86_PAE */
145
146/* No need to prepopulate any pagetable entries in non-PAE modes. */
147#define PREALLOCATED_PMDS 0
148
149#endif /* CONFIG_X86_PAE */
150
151static void free_pmds(pmd_t *pmds[])
152{
153 int i;
154
155 for(i = 0; i < PREALLOCATED_PMDS; i++)
156 if (pmds[i])
157 free_page((unsigned long)pmds[i]);
158}
159
160static int preallocate_pmds(pmd_t *pmds[])
161{
162 int i;
163 bool failed = false;
164
165 for(i = 0; i < PREALLOCATED_PMDS; i++) {
166 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
167 if (pmd == NULL)
168 failed = true;
169 pmds[i] = pmd;
170 }
171
172 if (failed) {
173 free_pmds(pmds);
174 return -ENOMEM;
175 }
176
177 return 0;
178}
179
180/*
122 * Mop up any pmd pages which may still be attached to the pgd. 181 * Mop up any pmd pages which may still be attached to the pgd.
123 * Normally they will be freed by munmap/exit_mmap, but any pmd we 182 * Normally they will be freed by munmap/exit_mmap, but any pmd we
124 * preallocate which never got a corresponding vma will need to be 183 * preallocate which never got a corresponding vma will need to be
@@ -128,7 +187,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
128{ 187{
129 int i; 188 int i;
130 189
131 for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { 190 for(i = 0; i < PREALLOCATED_PMDS; i++) {
132 pgd_t pgd = pgdp[i]; 191 pgd_t pgd = pgdp[i];
133 192
134 if (pgd_val(pgd) != 0) { 193 if (pgd_val(pgd) != 0) {
@@ -142,32 +201,20 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
142 } 201 }
143} 202}
144 203
145/* 204static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
146 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
147 * updating the top-level pagetable entries to guarantee the
148 * processor notices the update. Since this is expensive, and
149 * all 4 top-level entries are used almost immediately in a
150 * new process's life, we just pre-populate them here.
151 *
152 * Also, if we're in a paravirt environment where the kernel pmd is
153 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
154 * and initialize the kernel pmds here.
155 */
156static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
157{ 205{
158 pud_t *pud; 206 pud_t *pud;
159 unsigned long addr; 207 unsigned long addr;
160 int i; 208 int i;
161 209
210 if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
211 return;
212
162 pud = pud_offset(pgd, 0); 213 pud = pud_offset(pgd, 0);
163 for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
164 i++, pud++, addr += PUD_SIZE) {
165 pmd_t *pmd = pmd_alloc_one(mm, addr);
166 214
167 if (!pmd) { 215 for (addr = i = 0; i < PREALLOCATED_PMDS;
168 pgd_mop_up_pmds(mm, pgd); 216 i++, pud++, addr += PUD_SIZE) {
169 return 0; 217 pmd_t *pmd = pmds[i];
170 }
171 218
172 if (i >= KERNEL_PGD_BOUNDARY) 219 if (i >= KERNEL_PGD_BOUNDARY)
173 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), 220 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
@@ -175,61 +222,54 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
175 222
176 pud_populate(mm, pud, pmd); 223 pud_populate(mm, pud, pmd);
177 } 224 }
178
179 return 1;
180} 225}
181 226
182void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) 227pgd_t *pgd_alloc(struct mm_struct *mm)
183{ 228{
184 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); 229 pgd_t *pgd;
230 pmd_t *pmds[PREALLOCATED_PMDS];
231 unsigned long flags;
185 232
186 /* Note: almost everything apart from _PAGE_PRESENT is 233 pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
187 reserved at the pmd (PDPT) level. */
188 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
189 234
190 /* 235 if (pgd == NULL)
191 * According to Intel App note "TLBs, Paging-Structure Caches, 236 goto out;
192 * and Their Invalidation", April 2007, document 317080-001,
193 * section 8.1: in PAE mode we explicitly have to flush the
194 * TLB via cr3 if the top-level pgd is changed...
195 */
196 if (mm == current->active_mm)
197 write_cr3(read_cr3());
198}
199#else /* !CONFIG_X86_PAE */
200/* No need to prepopulate any pagetable entries in non-PAE modes. */
201static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
202{
203 return 1;
204}
205 237
206static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd) 238 mm->pgd = pgd;
207{
208}
209#endif /* CONFIG_X86_PAE */
210 239
211pgd_t *pgd_alloc(struct mm_struct *mm) 240 if (preallocate_pmds(pmds) != 0)
212{ 241 goto out_free_pgd;
213 pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
214 242
215 /* so that alloc_pmd can use it */ 243 if (paravirt_pgd_alloc(mm) != 0)
216 mm->pgd = pgd; 244 goto out_free_pmds;
217 if (pgd)
218 pgd_ctor(pgd);
219 245
220 if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { 246 /*
221 pgd_dtor(pgd); 247 * Make sure that pre-populating the pmds is atomic with
222 free_page((unsigned long)pgd); 248 * respect to anything walking the pgd_list, so that they
223 pgd = NULL; 249 * never see a partially populated pgd.
224 } 250 */
251 spin_lock_irqsave(&pgd_lock, flags);
252
253 pgd_ctor(pgd);
254 pgd_prepopulate_pmd(mm, pgd, pmds);
255
256 spin_unlock_irqrestore(&pgd_lock, flags);
225 257
226 return pgd; 258 return pgd;
259
260out_free_pmds:
261 free_pmds(pmds);
262out_free_pgd:
263 free_page((unsigned long)pgd);
264out:
265 return NULL;
227} 266}
228 267
229void pgd_free(struct mm_struct *mm, pgd_t *pgd) 268void pgd_free(struct mm_struct *mm, pgd_t *pgd)
230{ 269{
231 pgd_mop_up_pmds(mm, pgd); 270 pgd_mop_up_pmds(mm, pgd);
232 pgd_dtor(pgd); 271 pgd_dtor(pgd);
272 paravirt_pgd_free(mm, pgd);
233 free_page((unsigned long)pgd); 273 free_page((unsigned long)pgd);
234} 274}
235 275
@@ -255,7 +295,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
255 295
256 if (pte_young(*ptep)) 296 if (pte_young(*ptep))
257 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, 297 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
258 &ptep->pte); 298 (unsigned long *) &ptep->pte);
259 299
260 if (ret) 300 if (ret)
261 pte_update(vma->vm_mm, addr, ptep); 301 pte_update(vma->vm_mm, addr, ptep);
@@ -274,3 +314,22 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
274 314
275 return young; 315 return young;
276} 316}
317
318int fixmaps_set;
319
320void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
321{
322 unsigned long address = __fix_to_virt(idx);
323
324 if (idx >= __end_of_fixed_addresses) {
325 BUG();
326 return;
327 }
328 set_pte_vaddr(address, pte);
329 fixmaps_set++;
330}
331
332void native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
333{
334 __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
335}
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 369cf065b6a4..cab0abbd1ebe 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,58 +20,11 @@
20#include <asm/tlb.h> 20#include <asm/tlb.h>
21#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
22 22
23void show_mem(void)
24{
25 int total = 0, reserved = 0;
26 int shared = 0, cached = 0;
27 int highmem = 0;
28 struct page *page;
29 pg_data_t *pgdat;
30 unsigned long i;
31 unsigned long flags;
32
33 printk(KERN_INFO "Mem-info:\n");
34 show_free_areas();
35 for_each_online_pgdat(pgdat) {
36 pgdat_resize_lock(pgdat, &flags);
37 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
38 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
39 touch_nmi_watchdog();
40 page = pgdat_page_nr(pgdat, i);
41 total++;
42 if (PageHighMem(page))
43 highmem++;
44 if (PageReserved(page))
45 reserved++;
46 else if (PageSwapCache(page))
47 cached++;
48 else if (page_count(page))
49 shared += page_count(page) - 1;
50 }
51 pgdat_resize_unlock(pgdat, &flags);
52 }
53 printk(KERN_INFO "%d pages of RAM\n", total);
54 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
55 printk(KERN_INFO "%d reserved pages\n", reserved);
56 printk(KERN_INFO "%d pages shared\n", shared);
57 printk(KERN_INFO "%d pages swap cached\n", cached);
58
59 printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
60 printk(KERN_INFO "%lu pages writeback\n",
61 global_page_state(NR_WRITEBACK));
62 printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
63 printk(KERN_INFO "%lu pages slab\n",
64 global_page_state(NR_SLAB_RECLAIMABLE) +
65 global_page_state(NR_SLAB_UNRECLAIMABLE));
66 printk(KERN_INFO "%lu pages pagetables\n",
67 global_page_state(NR_PAGETABLE));
68}
69
70/* 23/*
71 * Associate a virtual page frame with a given physical page frame 24 * Associate a virtual page frame with a given physical page frame
72 * and protection flags for that frame. 25 * and protection flags for that frame.
73 */ 26 */
74static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) 27void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
75{ 28{
76 pgd_t *pgd; 29 pgd_t *pgd;
77 pud_t *pud; 30 pud_t *pud;
@@ -94,8 +47,8 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
94 return; 47 return;
95 } 48 }
96 pte = pte_offset_kernel(pmd, vaddr); 49 pte = pte_offset_kernel(pmd, vaddr);
97 if (pgprot_val(flags)) 50 if (pte_val(pteval))
98 set_pte_present(&init_mm, vaddr, pte, pfn_pte(pfn, flags)); 51 set_pte_present(&init_mm, vaddr, pte, pteval);
99 else 52 else
100 pte_clear(&init_mm, vaddr, pte); 53 pte_clear(&init_mm, vaddr, pte);
101 54
@@ -141,22 +94,9 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
141 __flush_tlb_one(vaddr); 94 __flush_tlb_one(vaddr);
142} 95}
143 96
144static int fixmaps;
145unsigned long __FIXADDR_TOP = 0xfffff000; 97unsigned long __FIXADDR_TOP = 0xfffff000;
146EXPORT_SYMBOL(__FIXADDR_TOP); 98EXPORT_SYMBOL(__FIXADDR_TOP);
147 99
148void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
149{
150 unsigned long address = __fix_to_virt(idx);
151
152 if (idx >= __end_of_fixed_addresses) {
153 BUG();
154 return;
155 }
156 set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
157 fixmaps++;
158}
159
160/** 100/**
161 * reserve_top_address - reserves a hole in the top of kernel address space 101 * reserve_top_address - reserves a hole in the top of kernel address space
162 * @reserve - size of hole to reserve 102 * @reserve - size of hole to reserve
@@ -164,11 +104,44 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
164 * Can be used to relocate the fixmap area and poke a hole in the top 104 * Can be used to relocate the fixmap area and poke a hole in the top
165 * of kernel address space to make room for a hypervisor. 105 * of kernel address space to make room for a hypervisor.
166 */ 106 */
167void reserve_top_address(unsigned long reserve) 107void __init reserve_top_address(unsigned long reserve)
168{ 108{
169 BUG_ON(fixmaps > 0); 109 BUG_ON(fixmaps_set > 0);
170 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", 110 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
171 (int)-reserve); 111 (int)-reserve);
172 __FIXADDR_TOP = -reserve - PAGE_SIZE; 112 __FIXADDR_TOP = -reserve - PAGE_SIZE;
173 __VMALLOC_RESERVE += reserve; 113 __VMALLOC_RESERVE += reserve;
174} 114}
115
116/*
117 * vmalloc=size forces the vmalloc area to be exactly 'size'
118 * bytes. This can be used to increase (or decrease) the
119 * vmalloc area - the default is 128m.
120 */
121static int __init parse_vmalloc(char *arg)
122{
123 if (!arg)
124 return -EINVAL;
125
126 __VMALLOC_RESERVE = memparse(arg, &arg);
127 return 0;
128}
129early_param("vmalloc", parse_vmalloc);
130
131/*
132 * reservetop=size reserves a hole at the top of the kernel address space which
133 * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
134 * so relocating the fixmap can be done before paging initialization.
135 */
136static int __init parse_reservetop(char *arg)
137{
138 unsigned long address;
139
140 if (!arg)
141 return -EINVAL;
142
143 address = memparse(arg, &arg);
144 reserve_top_address(address);
145 return 0;
146}
147early_param("reservetop", parse_reservetop);
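
Both new early parameters are parsed with memparse(), so the usual size suffixes work on the kernel command line, for example "vmalloc=192M" or "reservetop=16M". As a rough userspace illustration of the suffix handling only (this is not the kernel helper itself):

#include <stdio.h>
#include <stdlib.h>

/* Rough sketch of the K/M/G suffix handling that memparse() applies
 * to "vmalloc=" and "reservetop=" arguments. */
static unsigned long long parse_size(const char *arg)
{
	char *end;
	unsigned long long val = strtoull(arg, &end, 0);

	switch (*end) {
	case 'G': case 'g':
		val <<= 10;
		/* fall through */
	case 'M': case 'm':
		val <<= 10;
		/* fall through */
	case 'K': case 'k':
		val <<= 10;
	}
	return val;
}

int main(void)
{
	printf("vmalloc=192M       -> %llu bytes\n", parse_size("192M"));
	printf("reservetop=0x1000000 -> %llu bytes\n", parse_size("0x1000000"));
	return 0;
}
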
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
new file mode 100644
index 000000000000..16ae70fc57e7
--- /dev/null
+++ b/arch/x86/mm/srat_32.c
@@ -0,0 +1,283 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/mmzone.h>
29#include <linux/acpi.h>
30#include <linux/nodemask.h>
31#include <asm/srat.h>
32#include <asm/topology.h>
33#include <asm/smp.h>
34#include <asm/e820.h>
35
36/*
37 * proximity macros and definitions
38 */
39#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
40#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
41#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
42#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
43/* bitmap length; _PXM is at most 255 */
44#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
45static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
46
47#define MAX_CHUNKS_PER_NODE 3
48#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
49struct node_memory_chunk_s {
50 unsigned long start_pfn;
51 unsigned long end_pfn;
52 u8 pxm; // proximity domain of node
53 u8 nid; // which cnode contains this chunk?
54 u8 bank; // which mem bank on this node
55};
56static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
57
58static int __initdata num_memory_chunks; /* total number of memory chunks */
59static u8 __initdata apicid_to_pxm[MAX_APICID];
60
61int numa_off __initdata;
62int acpi_numa __initdata;
63
64static __init void bad_srat(void)
65{
66 printk(KERN_ERR "SRAT: SRAT not used.\n");
67 acpi_numa = -1;
68 num_memory_chunks = 0;
69}
70
71static __init inline int srat_disabled(void)
72{
73 return numa_off || acpi_numa < 0;
74}
75
76/* Identify CPU proximity domains */
77void __init
78acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
79{
80 if (srat_disabled())
81 return;
82 if (cpu_affinity->header.length !=
83 sizeof(struct acpi_srat_cpu_affinity)) {
84 bad_srat();
85 return;
86 }
87
88 if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
89 return; /* empty entry */
90
91 /* mark this node as "seen" in node bitmap */
92 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
93
94 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
95
96 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
97 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
98}
99
100/*
101 * Identify memory proximity domains and hot-remove capabilities.
102 * Fill node memory chunk list structure.
103 */
104void __init
105acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
106{
107 unsigned long long paddr, size;
108 unsigned long start_pfn, end_pfn;
109 u8 pxm;
110 struct node_memory_chunk_s *p, *q, *pend;
111
112 if (srat_disabled())
113 return;
114 if (memory_affinity->header.length !=
115 sizeof(struct acpi_srat_mem_affinity)) {
116 bad_srat();
117 return;
118 }
119
120 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
121 return; /* empty entry */
122
123 pxm = memory_affinity->proximity_domain & 0xff;
124
125 /* mark this node as "seen" in node bitmap */
126 BMAP_SET(pxm_bitmap, pxm);
127
128 /* calculate info for memory chunk structure */
129 paddr = memory_affinity->base_address;
130 size = memory_affinity->length;
131
132 start_pfn = paddr >> PAGE_SHIFT;
133 end_pfn = (paddr + size) >> PAGE_SHIFT;
134
135
136 if (num_memory_chunks >= MAXCHUNKS) {
137 printk(KERN_WARNING "Too many mem chunks in SRAT."
138 " Ignoring %lld MBytes at %llx\n",
139 size/(1024*1024), paddr);
140 return;
141 }
142
143 /* Insertion sort based on base address */
144 pend = &node_memory_chunk[num_memory_chunks];
145 for (p = &node_memory_chunk[0]; p < pend; p++) {
146 if (start_pfn < p->start_pfn)
147 break;
148 }
149 if (p < pend) {
150 for (q = pend; q >= p; q--)
151 *(q + 1) = *q;
152 }
153 p->start_pfn = start_pfn;
154 p->end_pfn = end_pfn;
155 p->pxm = pxm;
156
157 num_memory_chunks++;
158
159 printk(KERN_DEBUG "Memory range %08lx to %08lx"
160 " in proximity domain %02x %s\n",
161 start_pfn, end_pfn,
162 pxm,
163 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
164 "enabled and removable" : "enabled" ) );
165}
166
167/* Callback for SLIT parsing */
168void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
169{
170}
171
172void acpi_numa_arch_fixup(void)
173{
174}
175/*
176 * The SRAT table always lists ascending addresses, so we can always
177 * assume that the first "start" address we see is the real
178 * start of the node, and that the current "end" address is after
179 * the previous one.
180 */
181static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
182{
183 /*
184 * Only add present memory as told by the e820.
185 * There is no guarantee from the SRAT that the memory it
186 * enumerates is present at boot time because it represents
187 * *possible* memory hotplug areas the same as normal RAM.
188 */
189 if (memory_chunk->start_pfn >= max_pfn) {
190 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
191 memory_chunk->start_pfn, memory_chunk->end_pfn);
192 return -1;
193 }
194 if (memory_chunk->nid != nid)
195 return -1;
196
197 if (!node_has_online_mem(nid))
198 node_start_pfn[nid] = memory_chunk->start_pfn;
199
200 if (node_start_pfn[nid] > memory_chunk->start_pfn)
201 node_start_pfn[nid] = memory_chunk->start_pfn;
202
203 if (node_end_pfn[nid] < memory_chunk->end_pfn)
204 node_end_pfn[nid] = memory_chunk->end_pfn;
205
206 return 0;
207}
208
209int __init get_memcfg_from_srat(void)
210{
211 int i, j, nid;
212
213
214 if (srat_disabled())
215 goto out_fail;
216
217 if (num_memory_chunks == 0) {
218 printk(KERN_WARNING
219 "could not find any ACPI SRAT memory areas.\n"
220 goto out_fail;
221 }
222
223 /* Calculate total number of nodes in system from PXM bitmap and create
224 * a set of sequential node IDs starting at zero. (ACPI doesn't seem
225 * to specify the range of _PXM values.)
226 */
227 /*
228 * MCD - we no longer HAVE to number nodes sequentially. PXM domain
229 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
230 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
231 * approaches MAX_PXM_DOMAINS for i386.
232 */
233 nodes_clear(node_online_map);
234 for (i = 0; i < MAX_PXM_DOMAINS; i++) {
235 if (BMAP_TEST(pxm_bitmap, i)) {
236 int nid = acpi_map_pxm_to_node(i);
237 node_set_online(nid);
238 }
239 }
240 BUG_ON(num_online_nodes() == 0);
241
242 /* set cnode id in memory chunk structure */
243 for (i = 0; i < num_memory_chunks; i++)
244 node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
245
246 printk(KERN_DEBUG "pxm bitmap: ");
247 for (i = 0; i < sizeof(pxm_bitmap); i++) {
248 printk(KERN_CONT "%02x ", pxm_bitmap[i]);
249 }
250 printk(KERN_CONT "\n");
251 printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
252 num_online_nodes());
253 printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
254 num_memory_chunks);
255
256 for (i = 0; i < MAX_APICID; i++)
257 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
258
259 for (j = 0; j < num_memory_chunks; j++){
260 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
261 printk(KERN_DEBUG
262 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
263 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
264 if (node_read_chunk(chunk->nid, chunk))
265 continue;
266
267 e820_register_active_regions(chunk->nid, chunk->start_pfn,
268 min(chunk->end_pfn, max_pfn));
269 }
270
271 for_each_online_node(nid) {
272 unsigned long start = node_start_pfn[nid];
273 unsigned long end = min(node_end_pfn[nid], max_pfn);
274
275 memory_present(nid, start, end);
276 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
277 }
278 return 1;
279out_fail:
280 printk(KERN_ERR "failed to get NUMA memory information from SRAT"
281 " table\n");
282 return 0;
283}
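
The srat_32.c code above records each proximity domain it sees by setting one bit per _PXM value in pxm_bitmap, and get_memcfg_from_srat() later walks that bitmap to hand out sequential node IDs. A standalone sketch of the same bitmap macros (the MAX_PXM_DOMAINS value here is only assumed for illustration):

#include <stdio.h>

#define MAX_PXM_DOMAINS		256	/* assumed for this sketch */
#define NODE_ARRAY_INDEX(x)	((x) / 8)
#define NODE_ARRAY_OFFSET(x)	((x) % 8)
#define BMAP_SET(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
#define BMAP_TEST(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
#define PXM_BITMAP_LEN		(MAX_PXM_DOMAINS / 8)

int main(void)
{
	unsigned char pxm_bitmap[PXM_BITMAP_LEN] = { 0 };
	int i;

	/* Mark proximity domains 0 and 17 as "seen", as the SRAT callbacks do. */
	BMAP_SET(pxm_bitmap, 0);
	BMAP_SET(pxm_bitmap, 17);

	/* Walk the bitmap the way get_memcfg_from_srat() does. */
	for (i = 0; i < MAX_PXM_DOMAINS; i++)
		if (BMAP_TEST(pxm_bitmap, i))
			printf("proximity domain %02x seen\n", i);

	return 0;
}
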
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 3890234e5b26..1b4763e26ea9 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -97,37 +97,22 @@ static __init inline int srat_disabled(void)
97 return numa_off || acpi_numa < 0; 97 return numa_off || acpi_numa < 0;
98} 98}
99 99
100/*
101 * A lot of BIOS fill in 10 (= no distance) everywhere. This messes
102 * up the NUMA heuristics which wants the local node to have a smaller
103 * distance than the others.
104 * Do some quick checks here and only use the SLIT if it passes.
105 */
106static __init int slit_valid(struct acpi_table_slit *slit)
107{
108 int i, j;
109 int d = slit->locality_count;
110 for (i = 0; i < d; i++) {
111 for (j = 0; j < d; j++) {
112 u8 val = slit->entry[d*i + j];
113 if (i == j) {
114 if (val != LOCAL_DISTANCE)
115 return 0;
116 } else if (val <= LOCAL_DISTANCE)
117 return 0;
118 }
119 }
120 return 1;
121}
122
123/* Callback for SLIT parsing */ 100/* Callback for SLIT parsing */
124void __init acpi_numa_slit_init(struct acpi_table_slit *slit) 101void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
125{ 102{
126 if (!slit_valid(slit)) { 103 unsigned length;
127 printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n"); 104 unsigned long phys;
128 return; 105
129 } 106 length = slit->header.length;
130 acpi_slit = slit; 107 phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
108 PAGE_SIZE);
109
110 if (phys == -1L)
 111 panic("Cannot save slit!\n");
112
113 acpi_slit = __va(phys);
114 memcpy(acpi_slit, slit, length);
115 reserve_early(phys, phys + length, "ACPI SLIT");
131} 116}
132 117
133/* Callback for Proximity Domain -> LAPIC mapping */ 118/* Callback for Proximity Domain -> LAPIC mapping */
@@ -326,7 +311,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
326 pxmram = 0; 311 pxmram = 0;
327 } 312 }
328 313
329 e820ram = end_pfn - absent_pages_in_range(0, end_pfn); 314 e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
330 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ 315 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
331 if ((long)(e820ram - pxmram) >= 1*1024*1024) { 316 if ((long)(e820ram - pxmram) >= 1*1024*1024) {
332 printk(KERN_ERR 317 printk(KERN_ERR
@@ -403,7 +388,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
403 if (node == NUMA_NO_NODE) 388 if (node == NUMA_NO_NODE)
404 continue; 389 continue;
405 if (!node_isset(node, node_possible_map)) 390 if (!node_isset(node, node_possible_map))
406 numa_set_node(i, NUMA_NO_NODE); 391 numa_clear_node(i);
407 } 392 }
408 numa_init_array(); 393 numa_init_array();
409 return 0; 394 return 0;
@@ -522,6 +507,7 @@ int __node_distance(int a, int b)
522 507
523EXPORT_SYMBOL(__node_distance); 508EXPORT_SYMBOL(__node_distance);
524 509
510#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
525int memory_add_physaddr_to_nid(u64 start) 511int memory_add_physaddr_to_nid(u64 start)
526{ 512{
527 int i, ret = 0; 513 int i, ret = 0;
@@ -533,4 +519,4 @@ int memory_add_physaddr_to_nid(u64 start)
533 return ret; 519 return ret;
534} 520}
535EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 521EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
536 522#endif
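
For reference, the slit_valid() check removed above treated slit->entry[] as a d-by-d distance matrix, with entry[d*i + j] holding the distance from locality i to locality j: the diagonal must be the local distance and every off-diagonal entry strictly larger. Restated as a standalone sketch (LOCAL_DISTANCE assumed to be 10, the value the kernel uses):

#define LOCAL_DISTANCE	10	/* assumed value of the kernel constant */

/* Sketch of the matrix check that the hunk above removes from
 * acpi_numa_slit_init(); d corresponds to slit->locality_count. */
int slit_matrix_valid(const unsigned char *entry, int d)
{
	int i, j;

	for (i = 0; i < d; i++) {
		for (j = 0; j < d; j++) {
			unsigned char val = entry[d * i + j];

			if (i == j) {
				if (val != LOCAL_DISTANCE)
					return 0;	/* self-distance must be exactly local */
			} else if (val <= LOCAL_DISTANCE) {
				return 0;		/* remote distance must be larger */
			}
		}
	}
	return 1;
}
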
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
new file mode 100644
index 000000000000..d877c5b423ef
--- /dev/null
+++ b/arch/x86/mm/testmmiotrace.c
@@ -0,0 +1,71 @@
1/*
2 * Written by Pekka Paalanen, 2008 <pq@iki.fi>
3 */
4#include <linux/module.h>
5#include <linux/io.h>
6
7#define MODULE_NAME "testmmiotrace"
8
9static unsigned long mmio_address;
10module_param(mmio_address, ulong, 0);
11MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
12
13static void do_write_test(void __iomem *p)
14{
15 unsigned int i;
16 for (i = 0; i < 256; i++)
17 iowrite8(i, p + i);
18 for (i = 1024; i < (5 * 1024); i += 2)
19 iowrite16(i * 12 + 7, p + i);
20 for (i = (5 * 1024); i < (16 * 1024); i += 4)
21 iowrite32(i * 212371 + 13, p + i);
22}
23
24static void do_read_test(void __iomem *p)
25{
26 unsigned int i;
27 for (i = 0; i < 256; i++)
28 ioread8(p + i);
29 for (i = 1024; i < (5 * 1024); i += 2)
30 ioread16(p + i);
31 for (i = (5 * 1024); i < (16 * 1024); i += 4)
32 ioread32(p + i);
33}
34
35static void do_test(void)
36{
37 void __iomem *p = ioremap_nocache(mmio_address, 0x4000);
38 if (!p) {
39 pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
40 return;
41 }
42 do_write_test(p);
43 do_read_test(p);
44 iounmap(p);
45}
46
47static int __init init(void)
48{
49 if (mmio_address == 0) {
50 pr_err(MODULE_NAME ": you have to use the module argument "
51 "mmio_address.\n");
52 pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
53 " YOU REALLY KNOW WHAT YOU ARE DOING!\n");
54 return -ENXIO;
55 }
56
57 pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx "
58 "in PCI address space, and writing "
59 "rubbish in there.\n", mmio_address);
60 do_test();
61 return 0;
62}
63
64static void __exit cleanup(void)
65{
66 pr_debug(MODULE_NAME ": unloaded.\n");
67}
68
69module_init(init);
70module_exit(cleanup);
71MODULE_LICENSE("GPL");
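
Usage note: the test module is driven entirely by its mmio_address parameter; loading it with, say, "insmod testmmiotrace.ko mmio_address=0x<address>" (address and path are placeholders here) maps 16 kB at that bus address and writes the patterns above into it, so the address must point at a region that is genuinely safe to overwrite.
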