Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile           |  11
-rw-r--r--  arch/x86/mm/discontig_32.c     | 288
-rw-r--r--  arch/x86/mm/dump_pagetables.c  |  12
-rw-r--r--  arch/x86/mm/fault.c            | 123
-rw-r--r--  arch/x86/mm/gup.c              | 298
-rw-r--r--  arch/x86/mm/hugetlbpage.c      |  78
-rw-r--r--  arch/x86/mm/init_32.c          | 600
-rw-r--r--  arch/x86/mm/init_64.c          | 817
-rw-r--r--  arch/x86/mm/ioremap.c          | 103
-rw-r--r--  arch/x86/mm/k8topology_64.c    |  21
-rw-r--r--  arch/x86/mm/kmmio.c            | 510
-rw-r--r--  arch/x86/mm/memtest.c          | 123
-rw-r--r--  arch/x86/mm/mmio-mod.c         | 517
-rw-r--r--  arch/x86/mm/numa_64.c          | 109
-rw-r--r--  arch/x86/mm/pageattr-test.c    |  27
-rw-r--r--  arch/x86/mm/pageattr.c         | 541
-rw-r--r--  arch/x86/mm/pat.c              | 612
-rw-r--r--  arch/x86/mm/pf_in.c            | 489
-rw-r--r--  arch/x86/mm/pf_in.h            |  39
-rw-r--r--  arch/x86/mm/pgtable.c          | 199
-rw-r--r--  arch/x86/mm/pgtable_32.c       | 104
-rw-r--r--  arch/x86/mm/srat_32.c          | 283
-rw-r--r--  arch/x86/mm/srat_64.c          |  21
-rw-r--r--  arch/x86/mm/testmmiotrace.c    |  71
24 files changed, 4612 insertions, 1384 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index b7b3e4c7cfc9..dfb932dcf136 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,5 @@
 obj-y    := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
-            pat.o pgtable.o
+            pat.o pgtable.o gup.o
 
 obj-$(CONFIG_X86_32)            += pgtable_32.o
 
@@ -8,10 +8,17 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
 
 obj-$(CONFIG_HIGHMEM)           += highmem_32.o
 
+obj-$(CONFIG_MMIOTRACE_HOOKS)   += kmmio.o
+obj-$(CONFIG_MMIOTRACE)         += mmiotrace.o
+mmiotrace-y                     := pf_in.o mmio-mod.o
+obj-$(CONFIG_MMIOTRACE_TEST)    += testmmiotrace.o
+
 ifeq ($(CONFIG_X86_32),y)
 obj-$(CONFIG_NUMA)              += discontig_32.o
 else
 obj-$(CONFIG_NUMA)              += numa_64.o
 obj-$(CONFIG_K8_NUMA)           += k8topology_64.o
-obj-$(CONFIG_ACPI_NUMA)         += srat_64.o
 endif
+obj-$(CONFIG_ACPI_NUMA)         += srat_$(BITS).o
+
+obj-$(CONFIG_MEMTEST)           += memtest.o
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 914ccf983687..847c164725f4 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -38,10 +38,10 @@
 #include <asm/setup.h>
 #include <asm/mmzone.h>
 #include <asm/bios_ebda.h>
+#include <asm/proto.h>
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
-static bootmem_data_t node0_bdata;
 
 /*
  * numa interface - we expect the numa architecture specific code to have
@@ -59,14 +59,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
 /*
  * 4) physnode_map     - the mapping between a pfn and owning node
  * physnode_map keeps track of the physical memory layout of a generic
- * numa node on a 256Mb break (each element of the array will
- * represent 256Mb of memory and will be marked by the node id.  so,
+ * numa node on a 64Mb break (each element of the array will
+ * represent 64Mb of memory and will be marked by the node id.  so,
  * if the first gig is on node 0, and the second gig is on node 1
  * physnode_map will contain:
  *
- *     physnode_map[0-3] = 0;
- *     physnode_map[4-7] = 1;
- *     physnode_map[8- ] = -1;
+ *     physnode_map[0-15] = 0;
+ *     physnode_map[16-31] = 1;
+ *     physnode_map[32- ] = -1;
  */
 s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
 EXPORT_SYMBOL(physnode_map);
@@ -75,15 +75,15 @@ void memory_present(int nid, unsigned long start, unsigned long end)
 {
     unsigned long pfn;
 
-    printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n",
+    printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
            nid, start, end);
     printk(KERN_DEBUG "  Setting physnode_map array to node %d for pfns:\n", nid);
     printk(KERN_DEBUG "  ");
     for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
         physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
-        printk("%ld ", pfn);
+        printk(KERN_CONT "%lx ", pfn);
     }
-    printk("\n");
+    printk(KERN_CONT "\n");
 }
 
 unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
@@ -99,7 +99,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
 #endif
 
 extern unsigned long find_max_low_pfn(void);
-extern void add_one_highpage_init(struct page *, int, int);
 extern unsigned long highend_pfn, highstart_pfn;
 
 #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
@@ -117,13 +116,13 @@ static unsigned long kva_pages;
  */
 int __init get_memcfg_numa_flat(void)
 {
-    printk("NUMA - single node, flat memory mode\n");
+    printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
 
-    /* Run the memory configuration and find the top of memory. */
-    propagate_e820_map();
     node_start_pfn[0] = 0;
     node_end_pfn[0] = max_pfn;
+    e820_register_active_regions(0, 0, max_pfn);
     memory_present(0, 0, max_pfn);
+    node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
 
     /* Indicate there is one node available. */
     nodes_clear(node_online_map);
@@ -156,24 +155,32 @@ static void __init propagate_e820_map_node(int nid)
  */
 static void __init allocate_pgdat(int nid)
 {
-    if (nid && node_has_online_mem(nid))
+    char buf[16];
+
+    if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
         NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
     else {
-        NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
-        min_low_pfn += PFN_UP(sizeof(pg_data_t));
+        unsigned long pgdat_phys;
+        pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+                     max_pfn_mapped<<PAGE_SHIFT,
+                     sizeof(pg_data_t),
+                     PAGE_SIZE);
+        NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
+        memset(buf, 0, sizeof(buf));
+        sprintf(buf, "NODE_DATA %d", nid);
+        reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
     }
+    printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
+        nid, (unsigned long)NODE_DATA(nid));
 }
 
-#ifdef CONFIG_DISCONTIGMEM
 /*
- * In the discontig memory model, a portion of the kernel virtual area (KVA)
- * is reserved and portions of nodes are mapped using it. This is to allow
- * node-local memory to be allocated for structures that would normally require
- * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers
- * should be prepared to allocate from the bootmem allocator instead. This KVA
- * mechanism is incompatible with SPARSEMEM as it makes assumptions about the
- * layout of memory that are broken if alloc_remap() succeeds for some of the
- * map and fails for others
+ * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
+ * virtual address space (KVA) is reserved and portions of nodes are mapped
+ * using it. This is to allow node-local memory to be allocated for
+ * structures that would normally require ZONE_NORMAL. The memory is
+ * allocated with alloc_remap() and callers should be prepared to allocate
+ * from the bootmem allocator instead.
  */
 static unsigned long node_remap_start_pfn[MAX_NUMNODES];
 static void *node_remap_end_vaddr[MAX_NUMNODES];
@@ -195,15 +202,19 @@ void *alloc_remap(int nid, unsigned long size)
     return allocation;
 }
 
-void __init remap_numa_kva(void)
+static void __init remap_numa_kva(void)
 {
     void *vaddr;
     unsigned long pfn;
     int node;
 
     for_each_online_node(node) {
+        printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
         for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
             vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
+            printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
+                (unsigned long)vaddr,
+                node_remap_start_pfn[node] + pfn);
             set_pmd_pfn((ulong) vaddr,
                 node_remap_start_pfn[node] + pfn,
                 PAGE_KERNEL_LARGE);
@@ -215,17 +226,21 @@ static unsigned long calculate_numa_remap_pages(void)
 {
     int nid;
     unsigned long size, reserve_pages = 0;
-    unsigned long pfn;
 
     for_each_online_node(nid) {
-        unsigned old_end_pfn = node_end_pfn[nid];
+        u64 node_kva_target;
+        u64 node_kva_final;
 
         /*
          * The acpi/srat node info can show hot-add memroy zones
          * where memory could be added but not currently present.
          */
+        printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
+            nid, node_start_pfn[nid], node_end_pfn[nid]);
         if (node_start_pfn[nid] > max_pfn)
             continue;
+        if (!node_end_pfn[nid])
+            continue;
         if (node_end_pfn[nid] > max_pfn)
             node_end_pfn[nid] = max_pfn;
 
@@ -237,41 +252,48 @@ static unsigned long calculate_numa_remap_pages(void)
         /* now the roundup is correct, convert to PAGE_SIZE pages */
         size = size * PTRS_PER_PTE;
 
-        /*
-         * Validate the region we are allocating only contains valid
-         * pages.
-         */
-        for (pfn = node_end_pfn[nid] - size;
-             pfn < node_end_pfn[nid]; pfn++)
-            if (!page_is_ram(pfn))
-                break;
-
-        if (pfn != node_end_pfn[nid])
-            size = 0;
+        node_kva_target = round_down(node_end_pfn[nid] - size,
+                        PTRS_PER_PTE);
+        node_kva_target <<= PAGE_SHIFT;
+        do {
+            node_kva_final = find_e820_area(node_kva_target,
+                    ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
+                    ((u64)size)<<PAGE_SHIFT,
+                    LARGE_PAGE_BYTES);
+            node_kva_target -= LARGE_PAGE_BYTES;
+        } while (node_kva_final == -1ULL &&
+             (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
+
+        if (node_kva_final == -1ULL)
+            panic("Can not get kva ram\n");
 
-        printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
-                size, nid);
         node_remap_size[nid] = size;
         node_remap_offset[nid] = reserve_pages;
         reserve_pages += size;
-        printk("Shrinking node %d from %ld pages to %ld pages\n",
-            nid, node_end_pfn[nid], node_end_pfn[nid] - size);
-
-        if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
-            /*
-             * Align node_end_pfn[] and node_remap_start_pfn[] to
-             * pmd boundary. remap_numa_kva will barf otherwise.
-             */
-            printk("Shrinking node %d further by %ld pages for proper alignment\n",
-                nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
-            size += node_end_pfn[nid] & (PTRS_PER_PTE-1);
-        }
+        printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
+                  " node %d at %llx\n",
+                size, nid, node_kva_final>>PAGE_SHIFT);
+
+        /*
+         *  prevent kva address below max_low_pfn want it on system
+         *  with less memory later.
+         *  layout will be: KVA address , KVA RAM
+         *
+         *  we are supposed to only record the one less then max_low_pfn
+         *  but we could have some hole in high memory, and it will only
+         *  check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
+         *  to use it as free.
+         *  So reserve_early here, hope we don't run out of that array
+         */
+        reserve_early(node_kva_final,
+                  node_kva_final+(((u64)size)<<PAGE_SHIFT),
+                  "KVA RAM");
 
-        node_end_pfn[nid] -= size;
-        node_remap_start_pfn[nid] = node_end_pfn[nid];
-        shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]);
+        node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
+        remove_active_range(nid, node_remap_start_pfn[nid],
+                     node_remap_start_pfn[nid] + size);
     }
-    printk("Reserving total of %ld pages for numa KVA remap\n",
+    printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
             reserve_pages);
     return reserve_pages;
 }
@@ -285,37 +307,16 @@ static void init_remap_allocator(int nid)
     node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
         ALIGN(sizeof(pg_data_t), PAGE_SIZE);
 
-    printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
+    printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
         (ulong) node_remap_start_vaddr[nid],
-        (ulong) pfn_to_kaddr(highstart_pfn
-           + node_remap_offset[nid] + node_remap_size[nid]));
-}
-#else
-void *alloc_remap(int nid, unsigned long size)
-{
-    return NULL;
-}
-
-static unsigned long calculate_numa_remap_pages(void)
-{
-    return 0;
-}
-
-static void init_remap_allocator(int nid)
-{
-}
-
-void __init remap_numa_kva(void)
-{
+        (ulong) node_remap_end_vaddr[nid]);
 }
-#endif /* CONFIG_DISCONTIGMEM */
 
-extern void setup_bootmem_allocator(void);
-unsigned long __init setup_memory(void)
+void __init initmem_init(unsigned long start_pfn,
+                  unsigned long end_pfn)
 {
     int nid;
-    unsigned long system_start_pfn, system_max_low_pfn;
-    unsigned long wasted_pages;
+    long kva_target_pfn;
 
     /*
      * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -324,109 +325,77 @@ unsigned long __init setup_memory(void)
      * this space and use it to adjust the boundary between ZONE_NORMAL
      * and ZONE_HIGHMEM.
      */
-    get_memcfg_numa();
 
-    kva_pages = calculate_numa_remap_pages();
+    get_memcfg_numa();
 
-    /* partially used pages are not usable - thus round upwards */
-    system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
+    kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
 
-    kva_start_pfn = find_max_low_pfn() - kva_pages;
-
-#ifdef CONFIG_BLK_DEV_INITRD
-    /* Numa kva area is below the initrd */
-    if (initrd_start)
-        kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
-            - kva_pages;
-#endif
-
-    /*
-     * We waste pages past at the end of the KVA for no good reason other
-     * than how it is located. This is bad.
-     */
-    wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
-    kva_start_pfn -= wasted_pages;
-    kva_pages += wasted_pages;
-
-    system_max_low_pfn = max_low_pfn = find_max_low_pfn();
-    printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
+    kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
+    do {
+        kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
+                    max_low_pfn<<PAGE_SHIFT,
+                    kva_pages<<PAGE_SHIFT,
+                    PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
+        kva_target_pfn -= PTRS_PER_PTE;
+    } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
+
+    if (kva_start_pfn == -1UL)
+        panic("Can not get kva space\n");
+
+    printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
         kva_start_pfn, max_low_pfn);
-    printk("max_pfn = %ld\n", max_pfn);
+    printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
+
+    /* avoid clash with initrd */
+    reserve_early(kva_start_pfn<<PAGE_SHIFT,
+              (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
+             "KVA PG");
 #ifdef CONFIG_HIGHMEM
     highstart_pfn = highend_pfn = max_pfn;
-    if (max_pfn > system_max_low_pfn)
-        highstart_pfn = system_max_low_pfn;
+    if (max_pfn > max_low_pfn)
+        highstart_pfn = max_low_pfn;
     printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
            pages_to_mb(highend_pfn - highstart_pfn));
     num_physpages = highend_pfn;
     high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
 #else
-    num_physpages = system_max_low_pfn;
-    high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1;
+    num_physpages = max_low_pfn;
+    high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
 #endif
     printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
-            pages_to_mb(system_max_low_pfn));
-    printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
-            min_low_pfn, max_low_pfn, highstart_pfn);
+            pages_to_mb(max_low_pfn));
+    printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
+            max_low_pfn, highstart_pfn);
 
-    printk("Low memory ends at vaddr %08lx\n",
+    printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
             (ulong) pfn_to_kaddr(max_low_pfn));
     for_each_online_node(nid) {
         init_remap_allocator(nid);
 
         allocate_pgdat(nid);
     }
-    printk("High memory starts at vaddr %08lx\n",
+    remap_numa_kva();
+
+    printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
             (ulong) pfn_to_kaddr(highstart_pfn));
     for_each_online_node(nid)
         propagate_e820_map_node(nid);
 
-    memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
-    NODE_DATA(0)->bdata = &node0_bdata;
-    setup_bootmem_allocator();
-    return max_low_pfn;
-}
-
-void __init numa_kva_reserve(void)
-{
-    if (kva_pages)
-        reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
-                BOOTMEM_DEFAULT);
-}
-
-void __init zone_sizes_init(void)
-{
-    int nid;
-    unsigned long max_zone_pfns[MAX_NR_ZONES];
-    memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-    max_zone_pfns[ZONE_DMA] =
-        virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-    max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
-    max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
-#endif
-
-    /* If SRAT has not registered memory, register it now */
-    if (find_max_pfn_with_active_regions() == 0) {
-        for_each_online_node(nid) {
-            if (node_has_online_mem(nid))
-                add_active_range(nid, node_start_pfn[nid],
-                        node_end_pfn[nid]);
-        }
-    }
+    for_each_online_node(nid)
+        memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
 
-    free_area_init_nodes(max_zone_pfns);
-    return;
+    NODE_DATA(0)->bdata = &bootmem_node_data[0];
+    setup_bootmem_allocator();
 }
 
-void __init set_highmem_pages_init(int bad_ppro)
+void __init set_highmem_pages_init(void)
 {
 #ifdef CONFIG_HIGHMEM
     struct zone *zone;
-    struct page *page;
+    int nid;
 
     for_each_zone(zone) {
-        unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
+        unsigned long zone_start_pfn, zone_end_pfn;
 
         if (!is_highmem(zone))
             continue;
@@ -434,16 +403,12 @@ void __init set_highmem_pages_init(int bad_ppro)
         zone_start_pfn = zone->zone_start_pfn;
         zone_end_pfn = zone_start_pfn + zone->spanned_pages;
 
-        printk("Initializing %s for node %d (%08lx:%08lx)\n",
-                zone->name, zone_to_nid(zone),
-                zone_start_pfn, zone_end_pfn);
+        nid = zone_to_nid(zone);
+        printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
+                zone->name, nid, zone_start_pfn, zone_end_pfn);
 
-        for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
-            if (!pfn_valid(node_pfn))
-                continue;
-            page = pfn_to_page(node_pfn);
-            add_one_highpage_init(page, node_pfn, bad_ppro);
-        }
+        add_highpages_with_active_regions(nid, zone_start_pfn,
+                         zone_end_pfn);
     }
     totalram_pages += totalhigh_pages;
 #endif
@@ -476,3 +441,4 @@ int memory_add_physaddr_to_nid(u64 addr)
 
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
+
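A note on the placement pattern above: both calculate_numa_remap_pages() and the new initmem_init() place their remap areas the same way, starting from a target just below the region they must stay under, asking find_e820_area() for a free block of the required size and alignment, and stepping the target down one large page at a time until the lookup succeeds (panicking if it never does). A minimal stand-alone sketch of that downward search follows; region_is_free() and every constant are hypothetical stand-ins for the e820 lookup, not kernel APIs.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define LARGE_PAGE_BYTES (4UL << 20)   /* assumed 4 MB step, mirroring the PMD-sized stride */

/* Hypothetical stand-in for find_e820_area(): is [start, start+size) free? */
static bool region_is_free(uint64_t start, uint64_t size)
{
    /* pretend everything below 0x30000000 is already reserved */
    return start >= 0x30000000ULL;
}

/* Walk downward from just below 'end' until a free, aligned block of 'size' is found. */
static uint64_t find_block_below(uint64_t end, uint64_t bottom, uint64_t size)
{
    uint64_t target = (end - size) & ~(LARGE_PAGE_BYTES - 1);

    while (target > bottom) {
        if (region_is_free(target, size))
            return target;               /* same role as find_e820_area() succeeding */
        target -= LARGE_PAGE_BYTES;      /* step down one large page and retry */
    }
    return (uint64_t)-1;                 /* mirrors the patch's -1ULL "not found" result */
}

int main(void)
{
    uint64_t kva = find_block_below(0x38000000ULL, 0x10000000ULL, 16UL << 20);
    printf("placed KVA remap block at %#llx\n", (unsigned long long)kva);
    return 0;
}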
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 2c24bea92c66..e7277cbcfb40 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -42,7 +42,7 @@ static struct addr_marker address_markers[] = {
 	{ 0, "User Space" },
 #ifdef CONFIG_X86_64
 	{ 0x8000000000000000UL, "Kernel Space" },
-	{ 0xffff810000000000UL, "Low Kernel Mapping" },
+	{ PAGE_OFFSET,          "Low Kernel Mapping" },
 	{ VMALLOC_START,        "vmalloc() Area" },
 	{ VMEMMAP_START,        "Vmemmap" },
 	{ __START_KERNEL_map,   "High Kernel Mapping" },
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 	 * we have now. "break" is either changing perms, levels or
 	 * address space marker.
 	 */
-	prot = pgprot_val(new_prot) & ~(PTE_MASK);
-	cur = pgprot_val(st->current_prot) & ~(PTE_MASK);
+	prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
+	cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
 
 	if (!st->level) {
 		/* First entry */
@@ -221,7 +221,7 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
 	for (i = 0; i < PTRS_PER_PMD; i++) {
 		st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
 		if (!pmd_none(*start)) {
-			pgprotval_t prot = pmd_val(*start) & ~PTE_MASK;
+			pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK;
 
 			if (pmd_large(*start) || !pmd_present(*start))
 				note_page(m, st, __pgprot(prot), 3);
@@ -253,7 +253,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
 	for (i = 0; i < PTRS_PER_PUD; i++) {
 		st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
 		if (!pud_none(*start)) {
-			pgprotval_t prot = pud_val(*start) & ~PTE_MASK;
+			pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK;
 
 			if (pud_large(*start) || !pud_present(*start))
 				note_page(m, st, __pgprot(prot), 2);
@@ -288,7 +288,7 @@ static void walk_pgd_level(struct seq_file *m)
 	for (i = 0; i < PTRS_PER_PGD; i++) {
 		st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
 		if (!pgd_none(*start)) {
-			pgprotval_t prot = pgd_val(*start) & ~PTE_MASK;
+			pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK;
 
 			if (pgd_large(*start) || !pgd_present(*start))
 				note_page(m, &st, __pgprot(prot), 1);
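The dump_pagetables.c change above only affects how the attribute bits of an entry are isolated: instead of masking with the complement of PTE_MASK, the walker now uses PTE_FLAGS_MASK directly. A rough illustration of the frame/flags split is below; the constants are simplified assumptions for the sketch, not the kernel's real definitions.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT     12
#define PTE_PFN_MASK   (~((1UL << PAGE_SHIFT) - 1))  /* assumed: bits holding the page frame */
#define PTE_FLAGS_MASK (~PTE_PFN_MASK)               /* assumed: everything else is attribute flags */

int main(void)
{
    unsigned long pte = 0x1234a067UL;  /* example entry: pfn 0x1234a, flags 0x067 */

    printf("pfn: %#lx  flags: %#lx\n",
           (pte & PTE_PFN_MASK) >> PAGE_SHIFT, pte & PTE_FLAGS_MASK);
    return 0;
}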
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8bcb6f40ccb6..a742d753d5b0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -10,6 +10,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/ptrace.h>
+#include <linux/mmiotrace.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
@@ -34,6 +35,7 @@
 #include <asm/tlbflush.h>
 #include <asm/proto.h>
 #include <asm-generic/sections.h>
+#include <asm/traps.h>
 
 /*
  * Page fault error code bits
@@ -49,17 +51,23 @@
 #define PF_RSVD (1<<3)
 #define PF_INSTR (1<<4)
 
+static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
+{
+#ifdef CONFIG_MMIOTRACE_HOOKS
+    if (unlikely(is_kmmio_active()))
+        if (kmmio_handler(regs, addr) == 1)
+            return -1;
+#endif
+    return 0;
+}
+
 static inline int notify_page_fault(struct pt_regs *regs)
 {
 #ifdef CONFIG_KPROBES
     int ret = 0;
 
     /* kprobe_running() needs smp_processor_id() */
-#ifdef CONFIG_X86_32
     if (!user_mode_vm(regs)) {
-#else
-    if (!user_mode(regs)) {
-#endif
         preempt_disable();
         if (kprobe_running() && kprobe_fault_handler(regs, 14))
             ret = 1;
@@ -350,8 +358,6 @@ static int is_errata100(struct pt_regs *regs, unsigned long address)
     return 0;
 }
 
-void do_invalid_op(struct pt_regs *, unsigned long);
-
 static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
 {
 #ifdef CONFIG_X86_F00F_BUG
@@ -396,11 +402,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
         printk(KERN_CONT "NULL pointer dereference");
     else
         printk(KERN_CONT "paging request");
-#ifdef CONFIG_X86_32
-    printk(KERN_CONT " at %08lx\n", address);
-#else
-    printk(KERN_CONT " at %016lx\n", address);
-#endif
+    printk(KERN_CONT " at %p\n", (void *) address);
     printk(KERN_ALERT "IP:");
     printk_address(regs->ip, 1);
     dump_pagetable(address);
@@ -606,6 +608,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 
     if (notify_page_fault(regs))
         return;
+    if (unlikely(kmmio_fault(regs, address)))
+        return;
 
     /*
      * We fault-in kernel-space virtual memory on-demand. The
@@ -800,14 +804,10 @@ bad_area_nosemaphore:
     if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
         printk_ratelimit()) {
         printk(
-#ifdef CONFIG_X86_32
-        "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
-#else
-        "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
-#endif
+        "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
         task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
-        tsk->comm, task_pid_nr(tsk), address, regs->ip,
-        regs->sp, error_code);
+        tsk->comm, task_pid_nr(tsk), address,
+        (void *) regs->ip, (void *) regs->sp, error_code);
         print_vma_addr(" in ", regs->ip);
         printk("\n");
     }
@@ -914,72 +914,45 @@ LIST_HEAD(pgd_list);
 
 void vmalloc_sync_all(void)
 {
-#ifdef CONFIG_X86_32
-    /*
-     * Note that races in the updates of insync and start aren't
-     * problematic: insync can only get set bits added, and updates to
-     * start are only improving performance (without affecting correctness
-     * if undone).
-     */
-    static DECLARE_BITMAP(insync, PTRS_PER_PGD);
-    static unsigned long start = TASK_SIZE;
     unsigned long address;
 
+#ifdef CONFIG_X86_32
     if (SHARED_KERNEL_PMD)
         return;
 
-    BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
-    for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
-        if (!test_bit(pgd_index(address), insync)) {
-            unsigned long flags;
-            struct page *page;
-
-            spin_lock_irqsave(&pgd_lock, flags);
-            list_for_each_entry(page, &pgd_list, lru) {
-                if (!vmalloc_sync_one(page_address(page),
-                              address))
-                    break;
-            }
-            spin_unlock_irqrestore(&pgd_lock, flags);
-            if (!page)
-                set_bit(pgd_index(address), insync);
+    for (address = VMALLOC_START & PMD_MASK;
+         address >= TASK_SIZE && address < FIXADDR_TOP;
+         address += PMD_SIZE) {
+        unsigned long flags;
+        struct page *page;
+
+        spin_lock_irqsave(&pgd_lock, flags);
+        list_for_each_entry(page, &pgd_list, lru) {
+            if (!vmalloc_sync_one(page_address(page),
+                          address))
+                break;
         }
-        if (address == start && test_bit(pgd_index(address), insync))
-            start = address + PGDIR_SIZE;
+        spin_unlock_irqrestore(&pgd_lock, flags);
     }
 #else /* CONFIG_X86_64 */
-    /*
-     * Note that races in the updates of insync and start aren't
-     * problematic: insync can only get set bits added, and updates to
-     * start are only improving performance (without affecting correctness
-     * if undone).
-     */
-    static DECLARE_BITMAP(insync, PTRS_PER_PGD);
-    static unsigned long start = VMALLOC_START & PGDIR_MASK;
-    unsigned long address;
-
-    for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
-        if (!test_bit(pgd_index(address), insync)) {
-            const pgd_t *pgd_ref = pgd_offset_k(address);
-            unsigned long flags;
-            struct page *page;
-
-            if (pgd_none(*pgd_ref))
-                continue;
-            spin_lock_irqsave(&pgd_lock, flags);
-            list_for_each_entry(page, &pgd_list, lru) {
-                pgd_t *pgd;
-                pgd = (pgd_t *)page_address(page) + pgd_index(address);
-                if (pgd_none(*pgd))
-                    set_pgd(pgd, *pgd_ref);
-                else
-                    BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-            }
-            spin_unlock_irqrestore(&pgd_lock, flags);
-            set_bit(pgd_index(address), insync);
+    for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
+         address += PGDIR_SIZE) {
+        const pgd_t *pgd_ref = pgd_offset_k(address);
+        unsigned long flags;
+        struct page *page;
+
+        if (pgd_none(*pgd_ref))
+            continue;
+        spin_lock_irqsave(&pgd_lock, flags);
+        list_for_each_entry(page, &pgd_list, lru) {
+            pgd_t *pgd;
+            pgd = (pgd_t *)page_address(page) + pgd_index(address);
+            if (pgd_none(*pgd))
+                set_pgd(pgd, *pgd_ref);
+            else
+                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
         }
-        if (address == start)
-            start = address + PGDIR_SIZE;
+        spin_unlock_irqrestore(&pgd_lock, flags);
     }
 #endif
 }
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 000000000000..007bb06c7504
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,298 @@
+/*
+ * Lockless get_user_pages_fast for x86
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/highmem.h>
+
+#include <asm/pgtable.h>
+
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+#ifndef CONFIG_X86_PAE
+    return *ptep;
+#else
+    /*
+     * With get_user_pages_fast, we walk down the pagetables without taking
+     * any locks. For this we would like to load the pointers atoimcally,
+     * but that is not possible (without expensive cmpxchg8b) on PAE. What
+     * we do have is the guarantee that a pte will only either go from not
+     * present to present, or present to not present or both -- it will not
+     * switch to a completely different present page without a TLB flush in
+     * between; something that we are blocking by holding interrupts off.
+     *
+     * Setting ptes from not present to present goes:
+     * ptep->pte_high = h;
+     * smp_wmb();
+     * ptep->pte_low = l;
+     *
+     * And present to not present goes:
+     * ptep->pte_low = 0;
+     * smp_wmb();
+     * ptep->pte_high = 0;
+     *
+     * We must ensure here that the load of pte_low sees l iff pte_high
+     * sees h. We load pte_high *after* loading pte_low, which ensures we
+     * don't see an older value of pte_high. *Then* we recheck pte_low,
+     * which ensures that we haven't picked up a changed pte high. We might
+     * have got rubbish values from pte_low and pte_high, but we are
+     * guaranteed that pte_low will not have the present bit set *unless*
+     * it is 'l'. And get_user_pages_fast only operates on present ptes, so
+     * we're safe.
+     *
+     * gup_get_pte should not be used or copied outside gup.c without being
+     * very careful -- it does not atomically load the pte or anything that
+     * is likely to be useful for you.
+     */
+    pte_t pte;
+
+retry:
+    pte.pte_low = ptep->pte_low;
+    smp_rmb();
+    pte.pte_high = ptep->pte_high;
+    smp_rmb();
+    if (unlikely(pte.pte_low != ptep->pte_low))
+        goto retry;
+
+    return pte;
+#endif
+}
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+        unsigned long end, int write, struct page **pages, int *nr)
+{
+    unsigned long mask;
+    pte_t *ptep;
+
+    mask = _PAGE_PRESENT|_PAGE_USER;
+    if (write)
+        mask |= _PAGE_RW;
+
+    ptep = pte_offset_map(&pmd, addr);
+    do {
+        pte_t pte = gup_get_pte(ptep);
+        struct page *page;
+
+        if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+            pte_unmap(ptep);
+            return 0;
+        }
+        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+        page = pte_page(pte);
+        get_page(page);
+        pages[*nr] = page;
+        (*nr)++;
+
+    } while (ptep++, addr += PAGE_SIZE, addr != end);
+    pte_unmap(ptep - 1);
+
+    return 1;
+}
+
+static inline void get_head_page_multiple(struct page *page, int nr)
+{
+    VM_BUG_ON(page != compound_head(page));
+    VM_BUG_ON(page_count(page) == 0);
+    atomic_add(nr, &page->_count);
+}
+
+static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
+        unsigned long end, int write, struct page **pages, int *nr)
+{
+    unsigned long mask;
+    pte_t pte = *(pte_t *)&pmd;
+    struct page *head, *page;
+    int refs;
+
+    mask = _PAGE_PRESENT|_PAGE_USER;
+    if (write)
+        mask |= _PAGE_RW;
+    if ((pte_val(pte) & mask) != mask)
+        return 0;
+    /* hugepages are never "special" */
+    VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+    VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+    refs = 0;
+    head = pte_page(pte);
+    page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+    do {
+        VM_BUG_ON(compound_head(page) != head);
+        pages[*nr] = page;
+        (*nr)++;
+        page++;
+        refs++;
+    } while (addr += PAGE_SIZE, addr != end);
+    get_head_page_multiple(head, refs);
+
+    return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+        int write, struct page **pages, int *nr)
+{
+    unsigned long next;
+    pmd_t *pmdp;
+
+    pmdp = pmd_offset(&pud, addr);
+    do {
+        pmd_t pmd = *pmdp;
+
+        next = pmd_addr_end(addr, end);
+        if (pmd_none(pmd))
+            return 0;
+        if (unlikely(pmd_large(pmd))) {
+            if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+                return 0;
+        } else {
+            if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+                return 0;
+        }
+    } while (pmdp++, addr = next, addr != end);
+
+    return 1;
+}
+
+static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
+        unsigned long end, int write, struct page **pages, int *nr)
+{
+    unsigned long mask;
+    pte_t pte = *(pte_t *)&pud;
+    struct page *head, *page;
+    int refs;
+
+    mask = _PAGE_PRESENT|_PAGE_USER;
+    if (write)
+        mask |= _PAGE_RW;
+    if ((pte_val(pte) & mask) != mask)
+        return 0;
+    /* hugepages are never "special" */
+    VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+    VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+    refs = 0;
+    head = pte_page(pte);
+    page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+    do {
+        VM_BUG_ON(compound_head(page) != head);
+        pages[*nr] = page;
+        (*nr)++;
+        page++;
+        refs++;
+    } while (addr += PAGE_SIZE, addr != end);
+    get_head_page_multiple(head, refs);
+
+    return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+            int write, struct page **pages, int *nr)
+{
+    unsigned long next;
+    pud_t *pudp;
+
+    pudp = pud_offset(&pgd, addr);
+    do {
+        pud_t pud = *pudp;
+
+        next = pud_addr_end(addr, end);
+        if (pud_none(pud))
+            return 0;
+        if (unlikely(pud_large(pud))) {
+            if (!gup_huge_pud(pud, addr, next, write, pages, nr))
+                return 0;
+        } else {
+            if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+                return 0;
+        }
+    } while (pudp++, addr = next, addr != end);
+
+    return 1;
+}
+
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+            struct page **pages)
+{
+    struct mm_struct *mm = current->mm;
+    unsigned long addr, len, end;
+    unsigned long next;
+    pgd_t *pgdp;
+    int nr = 0;
+
+    start &= PAGE_MASK;
+    addr = start;
+    len = (unsigned long) nr_pages << PAGE_SHIFT;
+    end = start + len;
+    if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+                    start, len)))
+        goto slow_irqon;
+
+    /*
+     * XXX: batch / limit 'nr', to avoid large irq off latency
+     * needs some instrumenting to determine the common sizes used by
+     * important workloads (eg. DB2), and whether limiting the batch size
+     * will decrease performance.
+     *
+     * It seems like we're in the clear for the moment. Direct-IO is
+     * the main guy that batches up lots of get_user_pages, and even
+     * they are limited to 64-at-a-time which is not so many.
+     */
+    /*
+     * This doesn't prevent pagetable teardown, but does prevent
+     * the pagetables and pages from being freed on x86.
+     *
+     * So long as we atomically load page table pointers versus teardown
+     * (which we do on x86, with the above PAE exception), we can follow the
+     * address down to the the page and take a ref on it.
+     */
+    local_irq_disable();
+    pgdp = pgd_offset(mm, addr);
+    do {
+        pgd_t pgd = *pgdp;
+
+        next = pgd_addr_end(addr, end);
+        if (pgd_none(pgd))
+            goto slow;
+        if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+            goto slow;
+    } while (pgdp++, addr = next, addr != end);
+    local_irq_enable();
+
+    VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+    return nr;
+
+    {
+        int ret;
+
+slow:
+        local_irq_enable();
+slow_irqon:
+        /* Try to get the remaining pages with get_user_pages */
+        start += nr << PAGE_SHIFT;
+        pages += nr;
+
+        down_read(&mm->mmap_sem);
+        ret = get_user_pages(current, mm, start,
+            (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+        up_read(&mm->mmap_sem);
+
+        /* Have to be a bit careful with return values */
+        if (nr > 0) {
+            if (ret < 0)
+                ret = nr;
+            else
+                ret += nr;
+        }
+
+        return ret;
+    }
+}
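For callers, the useful part of the new get_user_pages_fast() is its contract: it returns the number of pages actually pinned (possibly after falling back internally to the slow get_user_pages() path), and every returned page carries an extra reference that the caller must drop with put_page(). A hedged sketch of a typical caller follows; pin_user_buffer() and its policy of bailing out on a partial pin are made up for illustration, while get_user_pages_fast() and put_page() are real kernel interfaces.

#include <linux/mm.h>
#include <linux/errno.h>

/* Illustration only: pin the pages backing a user buffer before using it. */
static int pin_user_buffer(unsigned long uaddr, int nr_pages, struct page **pages)
{
    int i, got;

    got = get_user_pages_fast(uaddr, nr_pages, 1 /* write */, pages);
    if (got < 0)
        return got;              /* nothing was pinned */
    if (got < nr_pages) {
        /* partial pin: drop what we got and let the caller decide */
        for (i = 0; i < got; i++)
            put_page(pages[i]);
        return -EFAULT;
    }
    return 0;                    /* caller must put_page() each page when done */
}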
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 0b3d567e686d..8f307d914c2e 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -124,7 +124,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
     return 1;
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+            unsigned long addr, unsigned long sz)
 {
     pgd_t *pgd;
     pud_t *pud;
@@ -133,9 +134,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
     pgd = pgd_offset(mm, addr);
     pud = pud_alloc(mm, pgd, addr);
     if (pud) {
-        if (pud_none(*pud))
-            huge_pmd_share(mm, addr, pud);
-        pte = (pte_t *) pmd_alloc(mm, pud, addr);
+        if (sz == PUD_SIZE) {
+            pte = (pte_t *)pud;
+        } else {
+            BUG_ON(sz != PMD_SIZE);
+            if (pud_none(*pud))
+                huge_pmd_share(mm, addr, pud);
+            pte = (pte_t *) pmd_alloc(mm, pud, addr);
+        }
     }
     BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
 
@@ -151,8 +157,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
     pgd = pgd_offset(mm, addr);
     if (pgd_present(*pgd)) {
         pud = pud_offset(pgd, addr);
-        if (pud_present(*pud))
+        if (pud_present(*pud)) {
+            if (pud_large(*pud))
+                return (pte_t *)pud;
             pmd = pmd_offset(pud, addr);
+        }
     }
     return (pte_t *) pmd;
 }
@@ -188,6 +197,11 @@ int pmd_huge(pmd_t pmd)
     return 0;
 }
 
+int pud_huge(pud_t pud)
+{
+    return 0;
+}
+
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
         pmd_t *pmd, int write)
@@ -208,6 +222,11 @@ int pmd_huge(pmd_t pmd)
     return !!(pmd_val(pmd) & _PAGE_PSE);
 }
 
+int pud_huge(pud_t pud)
+{
+    return !!(pud_val(pud) & _PAGE_PSE);
+}
+
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
         pmd_t *pmd, int write)
@@ -216,9 +235,22 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 
     page = pte_page(*(pte_t *)pmd);
     if (page)
-        page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
+        page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
+    return page;
+}
+
+struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+    pud_t *pud, int write)
+{
+    struct page *page;
+
+    page = pte_page(*(pte_t *)pud);
+    if (page)
+        page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
     return page;
 }
+
 #endif
 
 /* x86_64 also uses this file */
@@ -228,6 +260,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
         unsigned long addr, unsigned long len,
         unsigned long pgoff, unsigned long flags)
 {
+    struct hstate *h = hstate_file(file);
     struct mm_struct *mm = current->mm;
     struct vm_area_struct *vma;
     unsigned long start_addr;
@@ -240,7 +273,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
     }
 
 full_search:
-    addr = ALIGN(start_addr, HPAGE_SIZE);
+    addr = ALIGN(start_addr, huge_page_size(h));
 
     for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
         /* At this point:  (!vma || addr < vma->vm_end). */
@@ -262,7 +295,7 @@ full_search:
         }
         if (addr + mm->cached_hole_size < vma->vm_start)
             mm->cached_hole_size = vma->vm_start - addr;
-        addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+        addr = ALIGN(vma->vm_end, huge_page_size(h));
     }
 }
 
@@ -270,6 +303,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
         unsigned long addr0, unsigned long len,
         unsigned long pgoff, unsigned long flags)
 {
+    struct hstate *h = hstate_file(file);
     struct mm_struct *mm = current->mm;
     struct vm_area_struct *vma, *prev_vma;
     unsigned long base = mm->mmap_base, addr = addr0;
@@ -290,7 +324,7 @@ try_again:
     goto fail;
 
     /* either no address requested or cant fit in requested address hole */
-    addr = (mm->free_area_cache - len) & HPAGE_MASK;
+    addr = (mm->free_area_cache - len) & huge_page_mask(h);
     do {
         /*
          * Lookup failure means no vma is above this address,
@@ -321,7 +355,7 @@ try_again:
             largest_hole = vma->vm_start - addr;
 
         /* try just below the current vma->vm_start */
-        addr = (vma->vm_start - len) & HPAGE_MASK;
+        addr = (vma->vm_start - len) & huge_page_mask(h);
     } while (len <= vma->vm_start);
 
 fail:
@@ -359,22 +393,23 @@ unsigned long
 hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
         unsigned long len, unsigned long pgoff, unsigned long flags)
 {
+    struct hstate *h = hstate_file(file);
     struct mm_struct *mm = current->mm;
     struct vm_area_struct *vma;
 
-    if (len & ~HPAGE_MASK)
+    if (len & ~huge_page_mask(h))
         return -EINVAL;
     if (len > TASK_SIZE)
         return -ENOMEM;
 
     if (flags & MAP_FIXED) {
-        if (prepare_hugepage_range(addr, len))
+        if (prepare_hugepage_range(file, addr, len))
             return -EINVAL;
         return addr;
     }
 
     if (addr) {
-        addr = ALIGN(addr, HPAGE_SIZE);
+        addr = ALIGN(addr, huge_page_size(h));
         vma = find_vma(mm, addr);
         if (TASK_SIZE - len >= addr &&
             (!vma || addr + len <= vma->vm_start))
@@ -390,3 +425,20 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 
 #endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
 
+#ifdef CONFIG_X86_64
+static __init int setup_hugepagesz(char *opt)
+{
+    unsigned long ps = memparse(opt, &opt);
+    if (ps == PMD_SIZE) {
+        hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
+    } else if (ps == PUD_SIZE && cpu_has_gbpages) {
+        hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+    } else {
+        printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
+            ps >> 20);
+        return 0;
+    }
+    return 1;
+}
+__setup("hugepagesz=", setup_hugepagesz);
+#endif
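The setup_hugepagesz() handler added at the end of hugetlbpage.c is what makes the hugepagesz= boot parameter work on x86_64: 2 MB (PMD_SIZE) pages are always accepted, while 1 GB (PUD_SIZE) pages are registered only when the CPU advertises gbpages. As an illustrative kernel command line (the sizes and counts are made up for the example):

    hugepagesz=1G hugepages=4 hugepagesz=2M hugepages=512

Each hugepagesz= selects the huge page pool that the following hugepages= count applies to; an unsupported size is rejected with the KERN_ERR message shown in the hunk.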
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ec30d10154b6..bbe044dbe014 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -31,6 +31,7 @@
 #include <linux/cpumask.h>
 
 #include <asm/asm.h>
+#include <asm/bios_ebda.h>
 #include <asm/processor.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -47,9 +48,11 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 #include <asm/cacheflush.h>
+#include <asm/smp.h>
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
+unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -57,6 +60,27 @@ unsigned long highstart_pfn, highend_pfn;
 
 static noinline int do_test_wp_bit(void);
 
+
+static unsigned long __initdata table_start;
+static unsigned long __meminitdata table_end;
+static unsigned long __meminitdata table_top;
+
+static int __initdata after_init_bootmem;
+
+static __init void *alloc_low_page(unsigned long *phys)
+{
+    unsigned long pfn = table_end++;
+    void *adr;
+
+    if (pfn >= table_top)
+        panic("alloc_low_page: ran out of memory");
+
+    adr = __va(pfn * PAGE_SIZE);
+    memset(adr, 0, PAGE_SIZE);
+    *phys = pfn * PAGE_SIZE;
+    return adr;
+}
+
 /*
  * Creates a middle page table and puts a pointer to it in the
  * given global directory entry. This only returns the gd entry
@@ -68,9 +92,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
     pmd_t *pmd_table;
 
 #ifdef CONFIG_X86_PAE
+    unsigned long phys;
     if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
-        pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
-
+        if (after_init_bootmem)
+            pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+        else
+            pmd_table = (pmd_t *)alloc_low_page(&phys);
         paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
         set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
         pud = pud_offset(pgd, 0);
@@ -92,12 +119,16 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
     if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
         pte_t *page_table = NULL;
 
+        if (after_init_bootmem) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
         page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
 #endif
-        if (!page_table) {
-            page_table =
-                (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+            if (!page_table)
+                page_table =
+                    (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+        } else {
+            unsigned long phys;
+            page_table = (pte_t *)alloc_low_page(&phys);
         }
 
         paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
@@ -155,40 +186,72 @@ static inline int is_kernel_text(unsigned long addr)
  * of max_low_pfn pages, by creating page tables starting from address
  * PAGE_OFFSET:
  */
-static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
+static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
+                        unsigned long start_pfn,
+                        unsigned long end_pfn,
+                        int use_pse)
 {
     int pgd_idx, pmd_idx, pte_ofs;
     unsigned long pfn;
     pgd_t *pgd;
     pmd_t *pmd;
     pte_t *pte;
+    unsigned pages_2m, pages_4k;
+    int mapping_iter;
 
-    pgd_idx = pgd_index(PAGE_OFFSET);
-    pgd = pgd_base + pgd_idx;
-    pfn = 0;
+    /*
+     * First iteration will setup identity mapping using large/small pages
+     * based on use_pse, with other attributes same as set by
+     * the early code in head_32.S
+     *
+     * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
+     * as desired for the kernel identity mapping.
+     *
+     * This two pass mechanism conforms to the TLB app note which says:
+     *
+     *     "Software should not write to a paging-structure entry in a way
+     *      that would change, for any linear address, both the page size
+     *      and either the page frame or attributes."
+     */
+    mapping_iter = 1;
 
+    if (!cpu_has_pse)
+        use_pse = 0;
+
+repeat:
+    pages_2m = pages_4k = 0;
+    pfn = start_pfn;
+    pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+    pgd = pgd_base + pgd_idx;
     for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
         pmd = one_md_table_init(pgd);
-        if (pfn >= max_low_pfn)
-            continue;
 
-        for (pmd_idx = 0;
-             pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
+        if (pfn >= end_pfn)
+            continue;
+#ifdef CONFIG_X86_PAE
+        pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+        pmd += pmd_idx;
+#else
+        pmd_idx = 0;
+#endif
+        for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
              pmd++, pmd_idx++) {
             unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
 
             /*
              * Map with big pages if possible, otherwise
              * create normal page tables:
-             *
-             * Don't use a large page for the first 2/4MB of memory
-             * because there are often fixed size MTRRs in there
-             * and overlapping MTRRs into large pages can cause
-             * slowdowns.
              */
-            if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
+            if (use_pse) {
                 unsigned int addr2;
                 pgprot_t prot = PAGE_KERNEL_LARGE;
+                /*
+                 * first pass will use the same initial
+                 * identity mapping attribute + _PAGE_PSE.
+                 */
+                pgprot_t init_prot =
+                    __pgprot(PTE_IDENT_ATTR |
+                         _PAGE_PSE);
 
                 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
                     PAGE_OFFSET + PAGE_SIZE-1;
@@ -197,34 +260,59 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
197 is_kernel_text(addr2)) 260 is_kernel_text(addr2))
198 prot = PAGE_KERNEL_LARGE_EXEC; 261 prot = PAGE_KERNEL_LARGE_EXEC;
199 262
200 set_pmd(pmd, pfn_pmd(pfn, prot)); 263 pages_2m++;
264 if (mapping_iter == 1)
265 set_pmd(pmd, pfn_pmd(pfn, init_prot));
266 else
267 set_pmd(pmd, pfn_pmd(pfn, prot));
201 268
202 pfn += PTRS_PER_PTE; 269 pfn += PTRS_PER_PTE;
203 max_pfn_mapped = pfn;
204 continue; 270 continue;
205 } 271 }
206 pte = one_page_table_init(pmd); 272 pte = one_page_table_init(pmd);
207 273
208 for (pte_ofs = 0; 274 pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
209 pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; 275 pte += pte_ofs;
276 for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
210 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { 277 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
211 pgprot_t prot = PAGE_KERNEL; 278 pgprot_t prot = PAGE_KERNEL;
279 /*
280 * first pass will use the same initial
281 * identity mapping attribute.
282 */
283 pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
212 284
213 if (is_kernel_text(addr)) 285 if (is_kernel_text(addr))
214 prot = PAGE_KERNEL_EXEC; 286 prot = PAGE_KERNEL_EXEC;
215 287
216 set_pte(pte, pfn_pte(pfn, prot)); 288 pages_4k++;
289 if (mapping_iter == 1)
290 set_pte(pte, pfn_pte(pfn, init_prot));
291 else
292 set_pte(pte, pfn_pte(pfn, prot));
217 } 293 }
218 max_pfn_mapped = pfn;
219 } 294 }
220 } 295 }
221} 296 if (mapping_iter == 1) {
297 /*
298 * update direct mapping page count only in the first
299 * iteration.
300 */
301 update_page_count(PG_LEVEL_2M, pages_2m);
302 update_page_count(PG_LEVEL_4K, pages_4k);
222 303
223static inline int page_kills_ppro(unsigned long pagenr) 304 /*
224{ 305 * global TLB flush on this CPU, which flushes the previous
225 if (pagenr >= 0x70000 && pagenr <= 0x7003F) 306 * mappings present in both the small and the large page TLBs.
226 return 1; 307 */
227 return 0; 308 __flush_tlb_all();
309
310 /*
311 * Second iteration will set the actual desired PTE attributes.
312 */
313 mapping_iter = 2;
314 goto repeat;
315 }
228} 316}
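
For illustration only, a minimal stand-alone sketch of the two-pass idea described in the comment above (plain user-space C; the attribute values and names are assumptions, not the kernel's): pass one writes every entry with the early attributes, everything is flushed once, then pass two rewrites the same entries with the final attributes, so no single store changes both the page size and the frame/attributes of a live translation.

/* Stand-alone sketch of the two-pass attribute update (hypothetical values). */
#include <stdio.h>

#define NENTRIES   4
#define EARLY_ATTR 0x063UL	/* assumed head_32.S-style early bits */
#define FINAL_ATTR 0x163UL	/* assumed final bits (e.g. GLOBAL added) */

static unsigned long table[NENTRIES];

static void flush_all(void)
{
	/* stands in for __flush_tlb_all() between the two passes */
	printf("-- flush --\n");
}

int main(void)
{
	int pass, i;

	for (pass = 1; pass <= 2; pass++) {
		unsigned long attr = (pass == 1) ? EARLY_ATTR : FINAL_ATTR;

		for (i = 0; i < NENTRIES; i++)
			table[i] = ((unsigned long)i << 12) | attr;
		printf("pass %d: entry0=%#lx\n", pass, table[0]);

		if (pass == 1)
			flush_all();	/* old translations gone before attributes change */
	}
	return 0;
}
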
229 317
230/* 318/*
@@ -287,29 +375,62 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
287 pkmap_page_table = pte; 375 pkmap_page_table = pte;
288} 376}
289 377
290void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) 378static void __init add_one_highpage_init(struct page *page, int pfn)
291{ 379{
292 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { 380 ClearPageReserved(page);
293 ClearPageReserved(page); 381 init_page_count(page);
294 init_page_count(page); 382 __free_page(page);
295 __free_page(page); 383 totalhigh_pages++;
296 totalhigh_pages++;
297 } else
298 SetPageReserved(page);
299} 384}
300 385
301#ifndef CONFIG_NUMA 386struct add_highpages_data {
302static void __init set_highmem_pages_init(int bad_ppro) 387 unsigned long start_pfn;
388 unsigned long end_pfn;
389};
390
391static int __init add_highpages_work_fn(unsigned long start_pfn,
392 unsigned long end_pfn, void *datax)
303{ 393{
304 int pfn; 394 int node_pfn;
395 struct page *page;
396 unsigned long final_start_pfn, final_end_pfn;
397 struct add_highpages_data *data;
305 398
306 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { 399 data = (struct add_highpages_data *)datax;
307 /* 400
308 * Holes under sparsemem might not have no mem_map[]: 401 final_start_pfn = max(start_pfn, data->start_pfn);
309 */ 402 final_end_pfn = min(end_pfn, data->end_pfn);
310 if (pfn_valid(pfn)) 403 if (final_start_pfn >= final_end_pfn)
311 add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); 404 return 0;
405
406 for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
407 node_pfn++) {
408 if (!pfn_valid(node_pfn))
409 continue;
410 page = pfn_to_page(node_pfn);
411 add_one_highpage_init(page, node_pfn);
312 } 412 }
413
414 return 0;
415
416}
417
418void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
419 unsigned long end_pfn)
420{
421 struct add_highpages_data data;
422
423 data.start_pfn = start_pfn;
424 data.end_pfn = end_pfn;
425
426 work_with_active_regions(nid, add_highpages_work_fn, &data);
427}
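
The work function above only touches the part of each active region that overlaps the requested pfn window. A minimal sketch of that clamping pattern (user-space C, hypothetical names):

/* Sketch of clamping an incoming range to a requested pfn window (hypothetical). */
#include <stdio.h>

struct window {
	unsigned long start_pfn;
	unsigned long end_pfn;
};

static int visit_range(unsigned long start, unsigned long end, void *datax)
{
	struct window *w = datax;
	unsigned long s = start > w->start_pfn ? start : w->start_pfn;
	unsigned long e = end < w->end_pfn ? end : w->end_pfn;

	if (s >= e)
		return 0;		/* no overlap with the window */
	printf("would init pfns %lu-%lu\n", s, e);
	return 0;
}

int main(void)
{
	struct window w = { 100, 200 };

	visit_range(50, 150, &w);	/* clipped to 100-150 */
	visit_range(300, 400, &w);	/* skipped entirely */
	return 0;
}
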
428
429#ifndef CONFIG_NUMA
430static void __init set_highmem_pages_init(void)
431{
432 add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
433
313 totalram_pages += totalhigh_pages; 434 totalram_pages += totalhigh_pages;
314} 435}
315#endif /* !CONFIG_NUMA */ 436#endif /* !CONFIG_NUMA */
@@ -317,14 +438,9 @@ static void __init set_highmem_pages_init(int bad_ppro)
317#else 438#else
318# define kmap_init() do { } while (0) 439# define kmap_init() do { } while (0)
319# define permanent_kmaps_init(pgd_base) do { } while (0) 440# define permanent_kmaps_init(pgd_base) do { } while (0)
320# define set_highmem_pages_init(bad_ppro) do { } while (0) 441# define set_highmem_pages_init() do { } while (0)
321#endif /* CONFIG_HIGHMEM */ 442#endif /* CONFIG_HIGHMEM */
322 443
323pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
324EXPORT_SYMBOL(__PAGE_KERNEL);
325
326pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
327
328void __init native_pagetable_setup_start(pgd_t *base) 444void __init native_pagetable_setup_start(pgd_t *base)
329{ 445{
330 unsigned long pfn, va; 446 unsigned long pfn, va;
@@ -380,27 +496,10 @@ void __init native_pagetable_setup_done(pgd_t *base)
380 * be partially populated, and so it avoids stomping on any existing 496 * be partially populated, and so it avoids stomping on any existing
381 * mappings. 497 * mappings.
382 */ 498 */
383static void __init pagetable_init(void) 499static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
384{ 500{
385 pgd_t *pgd_base = swapper_pg_dir;
386 unsigned long vaddr, end; 501 unsigned long vaddr, end;
387 502
388 paravirt_pagetable_setup_start(pgd_base);
389
390 /* Enable PSE if available */
391 if (cpu_has_pse)
392 set_in_cr4(X86_CR4_PSE);
393
394 /* Enable PGE if available */
395 if (cpu_has_pge) {
396 set_in_cr4(X86_CR4_PGE);
397 __PAGE_KERNEL |= _PAGE_GLOBAL;
398 __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
399 }
400
401 kernel_physical_mapping_init(pgd_base);
402 remap_numa_kva();
403
404 /* 503 /*
405 * Fixed mappings, only the page table structure has to be 504 * Fixed mappings, only the page table structure has to be
406 * created - mappings will be set by set_fixmap(): 505 * created - mappings will be set by set_fixmap():
@@ -410,10 +509,13 @@ static void __init pagetable_init(void)
410 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; 509 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
411 page_table_range_init(vaddr, end, pgd_base); 510 page_table_range_init(vaddr, end, pgd_base);
412 early_ioremap_reset(); 511 early_ioremap_reset();
512}
413 513
414 permanent_kmaps_init(pgd_base); 514static void __init pagetable_init(void)
515{
516 pgd_t *pgd_base = swapper_pg_dir;
415 517
416 paravirt_pagetable_setup_done(pgd_base); 518 permanent_kmaps_init(pgd_base);
417} 519}
418 520
419#ifdef CONFIG_ACPI_SLEEP 521#ifdef CONFIG_ACPI_SLEEP
@@ -456,7 +558,7 @@ void zap_low_mappings(void)
456 558
457int nx_enabled; 559int nx_enabled;
458 560
459pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX; 561pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
460EXPORT_SYMBOL_GPL(__supported_pte_mask); 562EXPORT_SYMBOL_GPL(__supported_pte_mask);
461 563
462#ifdef CONFIG_X86_PAE 564#ifdef CONFIG_X86_PAE
@@ -509,27 +611,329 @@ static void __init set_nx(void)
509} 611}
510#endif 612#endif
511 613
614/* user-defined highmem size */
615static unsigned int highmem_pages = -1;
616
512/* 617/*
513 * paging_init() sets up the page tables - note that the first 8MB are 618 * highmem=size forces highmem to be exactly 'size' bytes.
514 * already mapped by head.S. 619 * This works even on boxes that have no highmem otherwise.
515 * 620 * This also works to reduce highmem size on bigger boxes.
516 * This routines also unmaps the page at virtual kernel address 0, so
517 * that we can trap those pesky NULL-reference errors in the kernel.
518 */ 621 */
519void __init paging_init(void) 622static int __init parse_highmem(char *arg)
623{
624 if (!arg)
625 return -EINVAL;
626
627 highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
628 return 0;
629}
630early_param("highmem", parse_highmem);
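
For illustration, a user-space sketch of how a highmem=512M style argument becomes a page count; the suffix handling below is an assumption standing in for memparse():

/* User-space sketch of memparse-style size parsing (assumed behaviour). */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12

static unsigned long parse_size(const char *s)
{
	char *end;
	unsigned long val = strtoul(s, &end, 0);

	switch (*end) {
	case 'G': case 'g': val <<= 30; break;
	case 'M': case 'm': val <<= 20; break;
	case 'K': case 'k': val <<= 10; break;
	}
	return val;
}

int main(void)
{
	unsigned long highmem_pages = parse_size("512M") >> PAGE_SHIFT;

	printf("highmem=512M -> %lu pages\n", highmem_pages);	/* 131072 */
	return 0;
}
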
631
632/*
633 * Determine low and high memory ranges:
634 */
635void __init find_low_pfn_range(void)
520{ 636{
637 /* it could update max_pfn */
638
639 /* max_low_pfn is 0, we already have early_res support */
640
641 max_low_pfn = max_pfn;
642 if (max_low_pfn > MAXMEM_PFN) {
643 if (highmem_pages == -1)
644 highmem_pages = max_pfn - MAXMEM_PFN;
645 if (highmem_pages + MAXMEM_PFN < max_pfn)
646 max_pfn = MAXMEM_PFN + highmem_pages;
647 if (highmem_pages + MAXMEM_PFN > max_pfn) {
648 printk(KERN_WARNING "only %luMB highmem pages "
649 "available, ignoring highmem size of %uMB.\n",
650 pages_to_mb(max_pfn - MAXMEM_PFN),
651 pages_to_mb(highmem_pages));
652 highmem_pages = 0;
653 }
654 max_low_pfn = MAXMEM_PFN;
655#ifndef CONFIG_HIGHMEM
656 /* Maximum memory usable is what is directly addressable */
657 printk(KERN_WARNING "Warning only %ldMB will be used.\n",
658 MAXMEM>>20);
659 if (max_pfn > MAX_NONPAE_PFN)
660 printk(KERN_WARNING
661 "Use a HIGHMEM64G enabled kernel.\n");
662 else
663 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
664 max_pfn = MAXMEM_PFN;
665#else /* !CONFIG_HIGHMEM */
666#ifndef CONFIG_HIGHMEM64G
667 if (max_pfn > MAX_NONPAE_PFN) {
668 max_pfn = MAX_NONPAE_PFN;
669 printk(KERN_WARNING "Warning only 4GB will be used."
670 "Use a HIGHMEM64G enabled kernel.\n");
671 }
672#endif /* !CONFIG_HIGHMEM64G */
673#endif /* !CONFIG_HIGHMEM */
674 } else {
675 if (highmem_pages == -1)
676 highmem_pages = 0;
677#ifdef CONFIG_HIGHMEM
678 if (highmem_pages >= max_pfn) {
679 printk(KERN_ERR "highmem size specified (%uMB) is "
680 "bigger than pages available (%luMB)!.\n",
681 pages_to_mb(highmem_pages),
682 pages_to_mb(max_pfn));
683 highmem_pages = 0;
684 }
685 if (highmem_pages) {
686 if (max_low_pfn - highmem_pages <
687 64*1024*1024/PAGE_SIZE){
688 printk(KERN_ERR "highmem size %uMB results in "
689 "smaller than 64MB lowmem, ignoring it.\n"
690 , pages_to_mb(highmem_pages));
691 highmem_pages = 0;
692 }
693 max_low_pfn -= highmem_pages;
694 }
695#else
696 if (highmem_pages)
697 printk(KERN_ERR "ignoring highmem size on non-highmem"
698 " kernel!\n");
699#endif
700 }
701}
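
A rough stand-alone sketch of the split computed above, with MAXMEM_PFN assumed to correspond to about 896 MB of lowmem (the real value depends on the configuration):

/* Sketch of the lowmem/highmem split (assumed constants, not kernel values). */
#include <stdio.h>

#define PAGE_SHIFT 12
#define MAXMEM_PFN (896UL << (20 - PAGE_SHIFT))	/* assume ~896 MB of lowmem */

int main(void)
{
	unsigned long max_pfn = 2048UL << (20 - PAGE_SHIFT);	/* a 2 GB box */
	unsigned long highmem_pages = (unsigned long)-1;	/* "not given on the command line" */
	unsigned long max_low_pfn = max_pfn;

	if (max_low_pfn > MAXMEM_PFN) {
		if (highmem_pages == (unsigned long)-1)
			highmem_pages = max_pfn - MAXMEM_PFN;
		max_low_pfn = MAXMEM_PFN;
	}
	printf("lowmem %lu MB, highmem %lu MB\n",
	       max_low_pfn >> (20 - PAGE_SHIFT),
	       highmem_pages >> (20 - PAGE_SHIFT));	/* 896 MB / 1152 MB */
	return 0;
}
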
702
703#ifndef CONFIG_NEED_MULTIPLE_NODES
704void __init initmem_init(unsigned long start_pfn,
705 unsigned long end_pfn)
706{
707#ifdef CONFIG_HIGHMEM
708 highstart_pfn = highend_pfn = max_pfn;
709 if (max_pfn > max_low_pfn)
710 highstart_pfn = max_low_pfn;
711 memory_present(0, 0, highend_pfn);
712 e820_register_active_regions(0, 0, highend_pfn);
713 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
714 pages_to_mb(highend_pfn - highstart_pfn));
715 num_physpages = highend_pfn;
716 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
717#else
718 memory_present(0, 0, max_low_pfn);
719 e820_register_active_regions(0, 0, max_low_pfn);
720 num_physpages = max_low_pfn;
721 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
722#endif
723#ifdef CONFIG_FLATMEM
724 max_mapnr = num_physpages;
725#endif
726 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
727 pages_to_mb(max_low_pfn));
728
729 setup_bootmem_allocator();
730}
731#endif /* !CONFIG_NEED_MULTIPLE_NODES */
732
733static void __init zone_sizes_init(void)
734{
735 unsigned long max_zone_pfns[MAX_NR_ZONES];
736 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
737 max_zone_pfns[ZONE_DMA] =
738 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
739 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
740#ifdef CONFIG_HIGHMEM
741 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
742#endif
743
744 free_area_init_nodes(max_zone_pfns);
745}
746
747void __init setup_bootmem_allocator(void)
748{
749 int i;
750 unsigned long bootmap_size, bootmap;
751 /*
752 * Initialize the boot-time allocator (with low memory only):
753 */
754 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
755 bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
756 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
757 PAGE_SIZE);
758 if (bootmap == -1L)
759 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
760 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
761
762 /* don't touch min_low_pfn */
763 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
764 min_low_pfn, max_low_pfn);
765 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
766 max_pfn_mapped<<PAGE_SHIFT);
767 printk(KERN_INFO " low ram: %08lx - %08lx\n",
768 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
769 printk(KERN_INFO " bootmap %08lx - %08lx\n",
770 bootmap, bootmap + bootmap_size);
771 for_each_online_node(i)
772 free_bootmem_with_active_regions(i, max_low_pfn);
773 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
774
775 after_init_bootmem = 1;
776}
777
778static void __init find_early_table_space(unsigned long end, int use_pse)
779{
780 unsigned long puds, pmds, ptes, tables, start;
781
782 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
783 tables = PAGE_ALIGN(puds * sizeof(pud_t));
784
785 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
786 tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
787
788 if (use_pse) {
789 unsigned long extra;
790
791 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
792 extra += PMD_SIZE;
793 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
794 } else
795 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
796
797 tables += PAGE_ALIGN(ptes * sizeof(pte_t));
798
799 /* for fixmap */
800 tables += PAGE_SIZE * 2;
801
802 /*
803 * RED-PEN putting page tables only on node 0 could
804 * cause a hotspot and fill up ZONE_DMA. The page tables
805 * need roughly 0.5KB per GB.
806 */
807 start = 0x7000;
808 table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
809 tables, PAGE_SIZE);
810 if (table_start == -1UL)
811 panic("Cannot find space for the kernel page tables");
812
813 table_start >>= PAGE_SHIFT;
814 table_end = table_start;
815 table_top = table_start + (tables>>PAGE_SHIFT);
816
817 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
818 end, table_start << PAGE_SHIFT,
819 (table_start << PAGE_SHIFT) + tables);
820}
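
The estimate above can be reproduced with a short sketch; the entry size and page sizes below are assumptions chosen only to show the arithmetic:

/* Sketch of the early page-table space estimate (assumed 4 KB pages, 8-byte entries). */
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PMD_SIZE  (2UL << 20)			/* assume 2 MB per pmd entry */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long end = 768UL << 20;	/* map the first 768 MB */
	int use_pse = 0;			/* worst case: 4 KB mappings only */
	unsigned long pmds, ptes, tables;

	pmds = (end + PMD_SIZE - 1) / PMD_SIZE;
	ptes = use_pse ? (PMD_SIZE / PAGE_SIZE)	/* roughly: only the unaligned tail */
		       : (end + PAGE_SIZE - 1) / PAGE_SIZE;

	tables  = ALIGN_UP(pmds * 8, PAGE_SIZE);	/* pmd entries */
	tables += ALIGN_UP(ptes * 8, PAGE_SIZE);	/* pte entries */
	tables += PAGE_SIZE * 2;			/* fixmap, as above */

	printf("need about %lu KB of early page tables\n", tables >> 10);
	return 0;
}
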
821
822unsigned long __init_refok init_memory_mapping(unsigned long start,
823 unsigned long end)
824{
825 pgd_t *pgd_base = swapper_pg_dir;
826 unsigned long start_pfn, end_pfn;
827 unsigned long big_page_start;
828#ifdef CONFIG_DEBUG_PAGEALLOC
829 /*
830 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
831 * This will simplify cpa(), which otherwise needs to support splitting
832 * large pages into small in interrupt context, etc.
833 */
834 int use_pse = 0;
835#else
836 int use_pse = cpu_has_pse;
837#endif
838
839 /*
840 * Find space for the kernel direct mapping tables.
841 */
842 if (!after_init_bootmem)
843 find_early_table_space(end, use_pse);
844
521#ifdef CONFIG_X86_PAE 845#ifdef CONFIG_X86_PAE
522 set_nx(); 846 set_nx();
523 if (nx_enabled) 847 if (nx_enabled)
524 printk(KERN_INFO "NX (Execute Disable) protection: active\n"); 848 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
525#endif 849#endif
526 pagetable_init(); 850
851 /* Enable PSE if available */
852 if (cpu_has_pse)
853 set_in_cr4(X86_CR4_PSE);
854
855 /* Enable PGE if available */
856 if (cpu_has_pge) {
857 set_in_cr4(X86_CR4_PGE);
858 __supported_pte_mask |= _PAGE_GLOBAL;
859 }
860
861 /*
862 * Don't use a large page for the first 2/4MB of memory
863 * because there are often fixed size MTRRs in there
864 * and overlapping MTRRs into large pages can cause
865 * slowdowns.
866 */
867 big_page_start = PMD_SIZE;
868
869 if (start < big_page_start) {
870 start_pfn = start >> PAGE_SHIFT;
871 end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
872 } else {
 873 /* head is not big-page aligned? */
874 start_pfn = start >> PAGE_SHIFT;
875 end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
876 << (PMD_SHIFT - PAGE_SHIFT);
877 }
878 if (start_pfn < end_pfn)
879 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
880
881 /* big page range */
882 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
883 << (PMD_SHIFT - PAGE_SHIFT);
884 if (start_pfn < (big_page_start >> PAGE_SHIFT))
885 start_pfn = big_page_start >> PAGE_SHIFT;
886 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
887 if (start_pfn < end_pfn)
888 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
889 use_pse);
890
 891 /* tail is not big-page aligned? */
892 start_pfn = end_pfn;
893 if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
894 end_pfn = end >> PAGE_SHIFT;
895 if (start_pfn < end_pfn)
896 kernel_physical_mapping_init(pgd_base, start_pfn,
897 end_pfn, 0);
898 }
899
900 early_ioremap_page_table_range_init(pgd_base);
527 901
528 load_cr3(swapper_pg_dir); 902 load_cr3(swapper_pg_dir);
529 903
530 __flush_tlb_all(); 904 __flush_tlb_all();
531 905
906 if (!after_init_bootmem)
907 reserve_early(table_start << PAGE_SHIFT,
908 table_end << PAGE_SHIFT, "PGTABLE");
909
910 if (!after_init_bootmem)
911 early_memtest(start, end);
912
913 return end >> PAGE_SHIFT;
914}
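
The carve-up performed above (4 KB pages up to the first large-page boundary, large pages for the aligned middle, 4 KB pages again for the unaligned tail) can be shown with a tiny sketch, assuming 2 MB large pages and hypothetical addresses:

/* Sketch of splitting [start, end) into head / large-page middle / tail. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21				/* assume 2 MB large pages */
#define PMD_SIZE   (1UL << PMD_SHIFT)

static void map(unsigned long s_pfn, unsigned long e_pfn, int big)
{
	if (s_pfn < e_pfn)
		printf("%s pages: pfn %lu - %lu\n", big ? "2M" : "4k", s_pfn, e_pfn);
}

int main(void)
{
	unsigned long start = 0x00100000UL;	/* 1 MB, not 2 MB aligned */
	unsigned long end   = 0x1fff0000UL;	/* ~512 MB, unaligned tail */
	unsigned long head_end, big_end;

	head_end = ((start + PMD_SIZE - 1) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	big_end  = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);

	map(start >> PAGE_SHIFT, head_end, 0);	/* head, small pages */
	map(head_end, big_end, 1);		/* aligned middle, large pages */
	map(big_end, end >> PAGE_SHIFT, 0);	/* tail, small pages */
	return 0;
}
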
915
916
917/*
918 * paging_init() sets up the page tables - note that the first 8MB are
919 * already mapped by head.S.
920 *
 921 * This routine also unmaps the page at virtual kernel address 0, so
922 * that we can trap those pesky NULL-reference errors in the kernel.
923 */
924void __init paging_init(void)
925{
926 pagetable_init();
927
928 __flush_tlb_all();
929
532 kmap_init(); 930 kmap_init();
931
932 /*
933 * NOTE: at this point the bootmem allocator is fully available.
934 */
935 sparse_init();
936 zone_sizes_init();
533} 937}
534 938
535/* 939/*
@@ -564,24 +968,13 @@ static struct kcore_list kcore_mem, kcore_vmalloc;
564void __init mem_init(void) 968void __init mem_init(void)
565{ 969{
566 int codesize, reservedpages, datasize, initsize; 970 int codesize, reservedpages, datasize, initsize;
567 int tmp, bad_ppro; 971 int tmp;
972
973 start_periodic_check_for_corruption();
568 974
569#ifdef CONFIG_FLATMEM 975#ifdef CONFIG_FLATMEM
570 BUG_ON(!mem_map); 976 BUG_ON(!mem_map);
571#endif 977#endif
572 bad_ppro = ppro_with_ram_bug();
573
574#ifdef CONFIG_HIGHMEM
575 /* check that fixmap and pkmap do not overlap */
576 if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
577 printk(KERN_ERR
578 "fixmap and kmap areas overlap - this will crash\n");
579 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
580 PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
581 FIXADDR_START);
582 BUG();
583 }
584#endif
585 /* this will put all low memory onto the freelists */ 978 /* this will put all low memory onto the freelists */
586 totalram_pages += free_all_bootmem(); 979 totalram_pages += free_all_bootmem();
587 980
@@ -593,7 +986,7 @@ void __init mem_init(void)
593 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) 986 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
594 reservedpages++; 987 reservedpages++;
595 988
596 set_highmem_pages_init(bad_ppro); 989 set_highmem_pages_init();
597 990
598 codesize = (unsigned long) &_etext - (unsigned long) &_text; 991 codesize = (unsigned long) &_etext - (unsigned long) &_text;
599 datasize = (unsigned long) &_edata - (unsigned long) &_etext; 992 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
@@ -614,7 +1007,6 @@ void __init mem_init(void)
614 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) 1007 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
615 ); 1008 );
616 1009
617#if 1 /* double-sanity-check paranoia */
618 printk(KERN_INFO "virtual kernel memory layout:\n" 1010 printk(KERN_INFO "virtual kernel memory layout:\n"
619 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 1011 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
620#ifdef CONFIG_HIGHMEM 1012#ifdef CONFIG_HIGHMEM
@@ -655,12 +1047,10 @@ void __init mem_init(void)
655#endif 1047#endif
656 BUG_ON(VMALLOC_START > VMALLOC_END); 1048 BUG_ON(VMALLOC_START > VMALLOC_END);
657 BUG_ON((unsigned long)high_memory > VMALLOC_START); 1049 BUG_ON((unsigned long)high_memory > VMALLOC_START);
658#endif /* double-sanity-check paranoia */
659 1050
660 if (boot_cpu_data.wp_works_ok < 0) 1051 if (boot_cpu_data.wp_works_ok < 0)
661 test_wp_bit(); 1052 test_wp_bit();
662 1053
663 cpa_init();
664 save_pg_dir(); 1054 save_pg_dir();
665 zap_low_mappings(); 1055 zap_low_mappings();
666} 1056}
@@ -710,6 +1100,8 @@ void mark_rodata_ro(void)
710 unsigned long start = PFN_ALIGN(_text); 1100 unsigned long start = PFN_ALIGN(_text);
711 unsigned long size = PFN_ALIGN(_etext) - start; 1101 unsigned long size = PFN_ALIGN(_etext) - start;
712 1102
1103#ifndef CONFIG_DYNAMIC_FTRACE
1104 /* Dynamic tracing modifies the kernel text section */
713 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 1105 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
714 printk(KERN_INFO "Write protecting the kernel text: %luk\n", 1106 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
715 size >> 10); 1107 size >> 10);
@@ -722,6 +1114,8 @@ void mark_rodata_ro(void)
722 printk(KERN_INFO "Testing CPA: write protecting again\n"); 1114 printk(KERN_INFO "Testing CPA: write protecting again\n");
723 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); 1115 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
724#endif 1116#endif
1117#endif /* CONFIG_DYNAMIC_FTRACE */
1118
725 start += size; 1119 start += size;
726 size = (unsigned long)__end_rodata - start; 1120 size = (unsigned long)__end_rodata - start;
727 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 1121 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
@@ -784,3 +1178,9 @@ void free_initrd_mem(unsigned long start, unsigned long end)
784 free_init_pages("initrd memory", start, end); 1178 free_init_pages("initrd memory", start, end);
785} 1179}
786#endif 1180#endif
1181
1182int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1183 int flags)
1184{
1185 return reserve_bootmem(phys, len, flags);
1186}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 156e6d7b0e32..3e10054c5731 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -18,6 +18,7 @@
18#include <linux/swap.h> 18#include <linux/swap.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/initrd.h>
21#include <linux/pagemap.h> 22#include <linux/pagemap.h>
22#include <linux/bootmem.h> 23#include <linux/bootmem.h>
23#include <linux/proc_fs.h> 24#include <linux/proc_fs.h>
@@ -30,6 +31,7 @@
30#include <linux/nmi.h> 31#include <linux/nmi.h>
31 32
32#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/bios_ebda.h>
33#include <asm/system.h> 35#include <asm/system.h>
34#include <asm/uaccess.h> 36#include <asm/uaccess.h>
35#include <asm/pgtable.h> 37#include <asm/pgtable.h>
@@ -47,11 +49,19 @@
47#include <asm/numa.h> 49#include <asm/numa.h>
48#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
49 51
52/*
53 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
54 * The direct mapping extends to max_pfn_mapped, so that we can directly access
55 * apertures, ACPI and other tables without having to play with fixmaps.
56 */
57unsigned long max_low_pfn_mapped;
58unsigned long max_pfn_mapped;
59
50static unsigned long dma_reserve __initdata; 60static unsigned long dma_reserve __initdata;
51 61
52DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 62DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
53 63
54int direct_gbpages __meminitdata 64int direct_gbpages
55#ifdef CONFIG_DIRECT_GBPAGES 65#ifdef CONFIG_DIRECT_GBPAGES
56 = 1 66 = 1
57#endif 67#endif
@@ -77,46 +87,69 @@ early_param("gbpages", parse_direct_gbpages_on);
77 * around without checking the pgd every time. 87 * around without checking the pgd every time.
78 */ 88 */
79 89
80void show_mem(void) 90int after_bootmem;
81{
82 long i, total = 0, reserved = 0;
83 long shared = 0, cached = 0;
84 struct page *page;
85 pg_data_t *pgdat;
86 91
87 printk(KERN_INFO "Mem-info:\n"); 92unsigned long __supported_pte_mask __read_mostly = ~0UL;
88 show_free_areas(); 93EXPORT_SYMBOL_GPL(__supported_pte_mask);
89 for_each_online_pgdat(pgdat) {
90 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
91 /*
92 * This loop can take a while with 256 GB and
93 * 4k pages so defer the NMI watchdog:
94 */
95 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
96 touch_nmi_watchdog();
97 94
98 if (!pfn_valid(pgdat->node_start_pfn + i)) 95static int do_not_nx __cpuinitdata;
99 continue;
100 96
101 page = pfn_to_page(pgdat->node_start_pfn + i); 97/*
102 total++; 98 * noexec=on|off
103 if (PageReserved(page)) 99 * Control non-executable mappings for 64-bit processes.
104 reserved++; 100 *
105 else if (PageSwapCache(page)) 101 * on Enable (default)
106 cached++; 102 * off Disable
107 else if (page_count(page)) 103 */
108 shared += page_count(page) - 1; 104static int __init nonx_setup(char *str)
109 } 105{
106 if (!str)
107 return -EINVAL;
108 if (!strncmp(str, "on", 2)) {
109 __supported_pte_mask |= _PAGE_NX;
110 do_not_nx = 0;
111 } else if (!strncmp(str, "off", 3)) {
112 do_not_nx = 1;
113 __supported_pte_mask &= ~_PAGE_NX;
110 } 114 }
111 printk(KERN_INFO "%lu pages of RAM\n", total); 115 return 0;
112 printk(KERN_INFO "%lu reserved pages\n", reserved);
113 printk(KERN_INFO "%lu pages shared\n", shared);
114 printk(KERN_INFO "%lu pages swap cached\n", cached);
115} 116}
117early_param("noexec", nonx_setup);
116 118
117int after_bootmem; 119void __cpuinit check_efer(void)
120{
121 unsigned long efer;
122
123 rdmsrl(MSR_EFER, efer);
124 if (!(efer & EFER_NX) || do_not_nx)
125 __supported_pte_mask &= ~_PAGE_NX;
126}
118 127
119static __init void *spp_getpage(void) 128int force_personality32;
129
130/*
131 * noexec32=on|off
 132 * Control non-executable heap for 32-bit processes.
133 * To control the stack too use noexec=off
134 *
135 * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
136 * off PROT_READ implies PROT_EXEC
137 */
138static int __init nonx32_setup(char *str)
139{
140 if (!strcmp(str, "on"))
141 force_personality32 &= ~READ_IMPLIES_EXEC;
142 else if (!strcmp(str, "off"))
143 force_personality32 |= READ_IMPLIES_EXEC;
144 return 1;
145}
146__setup("noexec32=", nonx32_setup);
147
148/*
149 * NOTE: This function is marked __ref because it calls __init function
150 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
151 */
152static __ref void *spp_getpage(void)
120{ 153{
121 void *ptr; 154 void *ptr;
122 155
@@ -135,26 +168,17 @@ static __init void *spp_getpage(void)
135 return ptr; 168 return ptr;
136} 169}
137 170
138static void 171void
139set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) 172set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
140{ 173{
141 pgd_t *pgd;
142 pud_t *pud; 174 pud_t *pud;
143 pmd_t *pmd; 175 pmd_t *pmd;
144 pte_t *pte, new_pte; 176 pte_t *pte;
145
146 pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
147 177
148 pgd = pgd_offset_k(vaddr); 178 pud = pud_page + pud_index(vaddr);
149 if (pgd_none(*pgd)) {
150 printk(KERN_ERR
151 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
152 return;
153 }
154 pud = pud_offset(pgd, vaddr);
155 if (pud_none(*pud)) { 179 if (pud_none(*pud)) {
156 pmd = (pmd_t *) spp_getpage(); 180 pmd = (pmd_t *) spp_getpage();
157 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); 181 pud_populate(&init_mm, pud, pmd);
158 if (pmd != pmd_offset(pud, 0)) { 182 if (pmd != pmd_offset(pud, 0)) {
159 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", 183 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
160 pmd, pmd_offset(pud, 0)); 184 pmd, pmd_offset(pud, 0));
@@ -164,13 +188,12 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
164 pmd = pmd_offset(pud, vaddr); 188 pmd = pmd_offset(pud, vaddr);
165 if (pmd_none(*pmd)) { 189 if (pmd_none(*pmd)) {
166 pte = (pte_t *) spp_getpage(); 190 pte = (pte_t *) spp_getpage();
167 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); 191 pmd_populate_kernel(&init_mm, pmd, pte);
168 if (pte != pte_offset_kernel(pmd, 0)) { 192 if (pte != pte_offset_kernel(pmd, 0)) {
169 printk(KERN_ERR "PAGETABLE BUG #02!\n"); 193 printk(KERN_ERR "PAGETABLE BUG #02!\n");
170 return; 194 return;
171 } 195 }
172 } 196 }
173 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
174 197
175 pte = pte_offset_kernel(pmd, vaddr); 198 pte = pte_offset_kernel(pmd, vaddr);
176 if (!pte_none(*pte) && pte_val(new_pte) && 199 if (!pte_none(*pte) && pte_val(new_pte) &&
@@ -185,6 +208,64 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
185 __flush_tlb_one(vaddr); 208 __flush_tlb_one(vaddr);
186} 209}
187 210
211void
212set_pte_vaddr(unsigned long vaddr, pte_t pteval)
213{
214 pgd_t *pgd;
215 pud_t *pud_page;
216
217 pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
218
219 pgd = pgd_offset_k(vaddr);
220 if (pgd_none(*pgd)) {
221 printk(KERN_ERR
222 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
223 return;
224 }
225 pud_page = (pud_t*)pgd_page_vaddr(*pgd);
226 set_pte_vaddr_pud(pud_page, vaddr, pteval);
227}
228
229/*
230 * Create large page table mappings for a range of physical addresses.
231 */
232static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
233 pgprot_t prot)
234{
235 pgd_t *pgd;
236 pud_t *pud;
237 pmd_t *pmd;
238
239 BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
240 for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
241 pgd = pgd_offset_k((unsigned long)__va(phys));
242 if (pgd_none(*pgd)) {
243 pud = (pud_t *) spp_getpage();
244 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
245 _PAGE_USER));
246 }
247 pud = pud_offset(pgd, (unsigned long)__va(phys));
248 if (pud_none(*pud)) {
249 pmd = (pmd_t *) spp_getpage();
250 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
251 _PAGE_USER));
252 }
253 pmd = pmd_offset(pud, phys);
254 BUG_ON(!pmd_none(*pmd));
255 set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
256 }
257}
258
259void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
260{
261 __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
262}
263
264void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
265{
266 __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
267}
268
188/* 269/*
189 * The head.S code sets up the kernel high mapping: 270 * The head.S code sets up the kernel high mapping:
190 * 271 *
@@ -201,7 +282,7 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
201void __init cleanup_highmap(void) 282void __init cleanup_highmap(void)
202{ 283{
203 unsigned long vaddr = __START_KERNEL_map; 284 unsigned long vaddr = __START_KERNEL_map;
204 unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; 285 unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
205 pmd_t *pmd = level2_kernel_pgt; 286 pmd_t *pmd = level2_kernel_pgt;
206 pmd_t *last_pmd = pmd + PTRS_PER_PMD; 287 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
207 288
@@ -213,22 +294,11 @@ void __init cleanup_highmap(void)
213 } 294 }
214} 295}
215 296
216/* NOTE: this is meant to be run only at boot */
217void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
218{
219 unsigned long address = __fix_to_virt(idx);
220
221 if (idx >= __end_of_fixed_addresses) {
222 printk(KERN_ERR "Invalid __set_fixmap\n");
223 return;
224 }
225 set_pte_phys(address, phys, prot);
226}
227
228static unsigned long __initdata table_start; 297static unsigned long __initdata table_start;
229static unsigned long __meminitdata table_end; 298static unsigned long __meminitdata table_end;
299static unsigned long __meminitdata table_top;
230 300
231static __meminit void *alloc_low_page(unsigned long *phys) 301static __ref void *alloc_low_page(unsigned long *phys)
232{ 302{
233 unsigned long pfn = table_end++; 303 unsigned long pfn = table_end++;
234 void *adr; 304 void *adr;
@@ -240,7 +310,7 @@ static __meminit void *alloc_low_page(unsigned long *phys)
240 return adr; 310 return adr;
241 } 311 }
242 312
243 if (pfn >= end_pfn) 313 if (pfn >= table_top)
244 panic("alloc_low_page: ran out of memory"); 314 panic("alloc_low_page: ran out of memory");
245 315
246 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); 316 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -249,7 +319,7 @@ static __meminit void *alloc_low_page(unsigned long *phys)
249 return adr; 319 return adr;
250} 320}
251 321
252static __meminit void unmap_low_page(void *adr) 322static __ref void unmap_low_page(void *adr)
253{ 323{
254 if (after_bootmem) 324 if (after_bootmem)
255 return; 325 return;
@@ -257,65 +327,71 @@ static __meminit void unmap_low_page(void *adr)
257 early_iounmap(adr, PAGE_SIZE); 327 early_iounmap(adr, PAGE_SIZE);
258} 328}
259 329
260/* Must run before zap_low_mappings */ 330static unsigned long __meminit
261__meminit void *early_ioremap(unsigned long addr, unsigned long size) 331phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
332 pgprot_t prot)
262{ 333{
263 pmd_t *pmd, *last_pmd; 334 unsigned pages = 0;
264 unsigned long vaddr; 335 unsigned long last_map_addr = end;
265 int i, pmds; 336 int i;
337
338 pte_t *pte = pte_page + pte_index(addr);
266 339
267 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; 340 for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
268 vaddr = __START_KERNEL_map;
269 pmd = level2_kernel_pgt;
270 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
271 341
272 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { 342 if (addr >= end) {
273 for (i = 0; i < pmds; i++) { 343 if (!after_bootmem) {
274 if (pmd_present(pmd[i])) 344 for(; i < PTRS_PER_PTE; i++, pte++)
275 goto continue_outer_loop; 345 set_pte(pte, __pte(0));
346 }
347 break;
276 } 348 }
277 vaddr += addr & ~PMD_MASK;
278 addr &= PMD_MASK;
279 349
280 for (i = 0; i < pmds; i++, addr += PMD_SIZE) 350 /*
281 set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); 351 * We will re-use the existing mapping.
282 __flush_tlb_all(); 352 * Xen for example has some special requirements, like mapping
353 * pagetable pages as RO. So assume someone who pre-setup
354 * these mappings are more intelligent.
355 */
356 if (pte_val(*pte))
357 continue;
283 358
284 return (void *)vaddr; 359 if (0)
285continue_outer_loop: 360 printk(" pte=%p addr=%lx pte=%016lx\n",
286 ; 361 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
362 pages++;
363 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
364 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
287 } 365 }
288 printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size);
289 366
290 return NULL; 367 update_page_count(PG_LEVEL_4K, pages);
368
369 return last_map_addr;
291} 370}
292 371
293/* 372static unsigned long __meminit
294 * To avoid virtual aliases later: 373phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
295 */ 374 pgprot_t prot)
296__meminit void early_iounmap(void *addr, unsigned long size)
297{ 375{
298 unsigned long vaddr; 376 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
299 pmd_t *pmd;
300 int i, pmds;
301 377
302 vaddr = (unsigned long)addr; 378 return phys_pte_init(pte, address, end, prot);
303 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
304 pmd = level2_kernel_pgt + pmd_index(vaddr);
305
306 for (i = 0; i < pmds; i++)
307 pmd_clear(pmd + i);
308
309 __flush_tlb_all();
310} 379}
311 380
312static unsigned long __meminit 381static unsigned long __meminit
313phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) 382phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
383 unsigned long page_size_mask, pgprot_t prot)
314{ 384{
385 unsigned long pages = 0;
386 unsigned long last_map_addr = end;
387
315 int i = pmd_index(address); 388 int i = pmd_index(address);
316 389
317 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { 390 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
391 unsigned long pte_phys;
318 pmd_t *pmd = pmd_page + pmd_index(address); 392 pmd_t *pmd = pmd_page + pmd_index(address);
393 pte_t *pte;
394 pgprot_t new_prot = prot;
319 395
320 if (address >= end) { 396 if (address >= end) {
321 if (!after_bootmem) { 397 if (!after_bootmem) {
@@ -325,31 +401,71 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
325 break; 401 break;
326 } 402 }
327 403
328 if (pmd_val(*pmd)) 404 if (pmd_val(*pmd)) {
405 if (!pmd_large(*pmd)) {
406 spin_lock(&init_mm.page_table_lock);
407 last_map_addr = phys_pte_update(pmd, address,
408 end, prot);
409 spin_unlock(&init_mm.page_table_lock);
410 continue;
411 }
412 /*
413 * If we are ok with PG_LEVEL_2M mapping, then we will
414 * use the existing mapping,
415 *
416 * Otherwise, we will split the large page mapping but
417 * use the same existing protection bits except for
418 * large page, so that we don't violate Intel's TLB
419 * Application note (317080) which says, while changing
420 * the page sizes, new and old translations should
421 * not differ with respect to page frame and
422 * attributes.
423 */
424 if (page_size_mask & (1 << PG_LEVEL_2M))
425 continue;
426 new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
427 }
428
429 if (page_size_mask & (1<<PG_LEVEL_2M)) {
430 pages++;
431 spin_lock(&init_mm.page_table_lock);
432 set_pte((pte_t *)pmd,
433 pfn_pte(address >> PAGE_SHIFT,
434 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
435 spin_unlock(&init_mm.page_table_lock);
436 last_map_addr = (address & PMD_MASK) + PMD_SIZE;
329 continue; 437 continue;
438 }
330 439
331 set_pte((pte_t *)pmd, 440 pte = alloc_low_page(&pte_phys);
332 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 441 last_map_addr = phys_pte_init(pte, address, end, new_prot);
442 unmap_low_page(pte);
443
444 spin_lock(&init_mm.page_table_lock);
445 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
446 spin_unlock(&init_mm.page_table_lock);
333 } 447 }
334 return address; 448 update_page_count(PG_LEVEL_2M, pages);
449 return last_map_addr;
335} 450}
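
When an existing large entry is found, the code above either keeps it (if 2 MB mappings are acceptable for this range) or re-maps the range with 4 KB pages that inherit the old attributes minus the PSE bit. A bit-level sketch of that decision, using made-up bit values rather than the real pgprot layout:

/* Sketch of reuse-vs-split for an existing large mapping (made-up bits). */
#include <stdio.h>

#define PG_LEVEL_2M	2
#define PSE_BIT		0x080UL		/* assumed "large page" bit */
#define ATTR_MASK	0xfffUL		/* assumed low attribute bits */

int main(void)
{
	unsigned long existing = 0x40000000UL | 0x163UL | PSE_BIT;	/* a 2M entry */
	unsigned long page_size_mask = 0;	/* caller does NOT allow 2M here */

	if (page_size_mask & (1 << PG_LEVEL_2M)) {
		printf("keep the existing 2M mapping as is\n");
	} else {
		/* split: new 4k entries reuse the old attributes minus PSE */
		unsigned long new_prot = (existing & ATTR_MASK) & ~PSE_BIT;

		printf("re-map with 4k pages, prot=%#lx\n", new_prot);
	}
	return 0;
}
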
336 451
337static unsigned long __meminit 452static unsigned long __meminit
338phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) 453phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
454 unsigned long page_size_mask, pgprot_t prot)
339{ 455{
340 pmd_t *pmd = pmd_offset(pud, 0); 456 pmd_t *pmd = pmd_offset(pud, 0);
341 unsigned long last_map_addr; 457 unsigned long last_map_addr;
342 458
343 spin_lock(&init_mm.page_table_lock); 459 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
344 last_map_addr = phys_pmd_init(pmd, address, end);
345 spin_unlock(&init_mm.page_table_lock);
346 __flush_tlb_all(); 460 __flush_tlb_all();
347 return last_map_addr; 461 return last_map_addr;
348} 462}
349 463
350static unsigned long __meminit 464static unsigned long __meminit
351phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) 465phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
466 unsigned long page_size_mask)
352{ 467{
468 unsigned long pages = 0;
353 unsigned long last_map_addr = end; 469 unsigned long last_map_addr = end;
354 int i = pud_index(addr); 470 int i = pud_index(addr);
355 471
@@ -357,6 +473,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
357 unsigned long pmd_phys; 473 unsigned long pmd_phys;
358 pud_t *pud = pud_page + pud_index(addr); 474 pud_t *pud = pud_page + pud_index(addr);
359 pmd_t *pmd; 475 pmd_t *pmd;
476 pgprot_t prot = PAGE_KERNEL;
360 477
361 if (addr >= end) 478 if (addr >= end)
362 break; 479 break;
@@ -368,42 +485,87 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
368 } 485 }
369 486
370 if (pud_val(*pud)) { 487 if (pud_val(*pud)) {
371 if (!pud_large(*pud)) 488 if (!pud_large(*pud)) {
372 last_map_addr = phys_pmd_update(pud, addr, end); 489 last_map_addr = phys_pmd_update(pud, addr, end,
373 continue; 490 page_size_mask, prot);
491 continue;
492 }
493 /*
494 * If we are ok with PG_LEVEL_1G mapping, then we will
495 * use the existing mapping.
496 *
497 * Otherwise, we will split the gbpage mapping but use
498 * the same existing protection bits except for large
499 * page, so that we don't violate Intel's TLB
500 * Application note (317080) which says, while changing
501 * the page sizes, new and old translations should
502 * not differ with respect to page frame and
503 * attributes.
504 */
505 if (page_size_mask & (1 << PG_LEVEL_1G))
506 continue;
507 prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
374 } 508 }
375 509
376 if (direct_gbpages) { 510 if (page_size_mask & (1<<PG_LEVEL_1G)) {
511 pages++;
512 spin_lock(&init_mm.page_table_lock);
377 set_pte((pte_t *)pud, 513 set_pte((pte_t *)pud,
378 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 514 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
515 spin_unlock(&init_mm.page_table_lock);
379 last_map_addr = (addr & PUD_MASK) + PUD_SIZE; 516 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
380 continue; 517 continue;
381 } 518 }
382 519
383 pmd = alloc_low_page(&pmd_phys); 520 pmd = alloc_low_page(&pmd_phys);
521 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
522 prot);
523 unmap_low_page(pmd);
384 524
385 spin_lock(&init_mm.page_table_lock); 525 spin_lock(&init_mm.page_table_lock);
386 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); 526 pud_populate(&init_mm, pud, __va(pmd_phys));
387 last_map_addr = phys_pmd_init(pmd, addr, end);
388 spin_unlock(&init_mm.page_table_lock); 527 spin_unlock(&init_mm.page_table_lock);
389
390 unmap_low_page(pmd);
391 } 528 }
392 __flush_tlb_all(); 529 __flush_tlb_all();
393 530
394 return last_map_addr >> PAGE_SHIFT; 531 update_page_count(PG_LEVEL_1G, pages);
532
533 return last_map_addr;
395} 534}
396 535
397static void __init find_early_table_space(unsigned long end) 536static unsigned long __meminit
537phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
538 unsigned long page_size_mask)
398{ 539{
399 unsigned long puds, pmds, tables, start; 540 pud_t *pud;
541
542 pud = (pud_t *)pgd_page_vaddr(*pgd);
543
544 return phys_pud_init(pud, addr, end, page_size_mask);
545}
546
547static void __init find_early_table_space(unsigned long end, int use_pse,
548 int use_gbpages)
549{
550 unsigned long puds, pmds, ptes, tables, start;
400 551
401 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 552 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
402 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); 553 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
403 if (!direct_gbpages) { 554 if (use_gbpages) {
555 unsigned long extra;
556 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
557 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
558 } else
404 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 559 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
405 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); 560 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
406 } 561
562 if (use_pse) {
563 unsigned long extra;
564 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
565 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
566 } else
567 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
568 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
407 569
408 /* 570 /*
409 * RED-PEN putting page tables only on node 0 could 571 * RED-PEN putting page tables only on node 0 could
@@ -417,10 +579,10 @@ static void __init find_early_table_space(unsigned long end)
417 579
418 table_start >>= PAGE_SHIFT; 580 table_start >>= PAGE_SHIFT;
419 table_end = table_start; 581 table_end = table_start;
582 table_top = table_start + (tables >> PAGE_SHIFT);
420 583
421 early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", 584 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
422 end, table_start << PAGE_SHIFT, 585 end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
423 (table_start << PAGE_SHIFT) + tables);
424} 586}
425 587
426static void __init init_gbpages(void) 588static void __init init_gbpages(void)
@@ -431,125 +593,85 @@ static void __init init_gbpages(void)
431 direct_gbpages = 0; 593 direct_gbpages = 0;
432} 594}
433 595
434#ifdef CONFIG_MEMTEST_BOOTPARAM 596static unsigned long __init kernel_physical_mapping_init(unsigned long start,
435 597 unsigned long end,
436static void __init memtest(unsigned long start_phys, unsigned long size, 598 unsigned long page_size_mask)
437 unsigned pattern) 599{
438{
439 unsigned long i;
440 unsigned long *start;
441 unsigned long start_bad;
442 unsigned long last_bad;
443 unsigned long val;
444 unsigned long start_phys_aligned;
445 unsigned long count;
446 unsigned long incr;
447
448 switch (pattern) {
449 case 0:
450 val = 0UL;
451 break;
452 case 1:
453 val = -1UL;
454 break;
455 case 2:
456 val = 0x5555555555555555UL;
457 break;
458 case 3:
459 val = 0xaaaaaaaaaaaaaaaaUL;
460 break;
461 default:
462 return;
463 }
464 600
465 incr = sizeof(unsigned long); 601 unsigned long next, last_map_addr = end;
466 start_phys_aligned = ALIGN(start_phys, incr);
467 count = (size - (start_phys_aligned - start_phys))/incr;
468 start = __va(start_phys_aligned);
469 start_bad = 0;
470 last_bad = 0;
471
472 for (i = 0; i < count; i++)
473 start[i] = val;
474 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
475 if (*start != val) {
476 if (start_phys_aligned == last_bad + incr) {
477 last_bad += incr;
478 } else {
479 if (start_bad) {
480 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
481 val, start_bad, last_bad + incr);
482 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
483 }
484 start_bad = last_bad = start_phys_aligned;
485 }
486 }
487 }
488 if (start_bad) {
489 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
490 val, start_bad, last_bad + incr);
491 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
492 }
493 602
494} 603 start = (unsigned long)__va(start);
604 end = (unsigned long)__va(end);
495 605
496static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE; 606 for (; start < end; start = next) {
607 pgd_t *pgd = pgd_offset_k(start);
608 unsigned long pud_phys;
609 pud_t *pud;
497 610
498static int __init parse_memtest(char *arg) 611 next = (start + PGDIR_SIZE) & PGDIR_MASK;
499{ 612 if (next > end)
500 if (arg) 613 next = end;
501 memtest_pattern = simple_strtoul(arg, NULL, 0);
502 return 0;
503}
504 614
505early_param("memtest", parse_memtest); 615 if (pgd_val(*pgd)) {
616 last_map_addr = phys_pud_update(pgd, __pa(start),
617 __pa(end), page_size_mask);
618 continue;
619 }
506 620
507static void __init early_memtest(unsigned long start, unsigned long end) 621 pud = alloc_low_page(&pud_phys);
508{ 622 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
509 u64 t_start, t_size; 623 page_size_mask);
510 unsigned pattern; 624 unmap_low_page(pud);
511 625
512 if (!memtest_pattern) 626 spin_lock(&init_mm.page_table_lock);
513 return; 627 pgd_populate(&init_mm, pgd, __va(pud_phys));
628 spin_unlock(&init_mm.page_table_lock);
629 }
630 __flush_tlb_all();
514 631
515 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); 632 return last_map_addr;
516 for (pattern = 0; pattern < memtest_pattern; pattern++) { 633}
517 t_start = start;
518 t_size = 0;
519 while (t_start < end) {
520 t_start = find_e820_area_size(t_start, &t_size, 1);
521 634
522 /* done ? */ 635struct map_range {
523 if (t_start >= end) 636 unsigned long start;
524 break; 637 unsigned long end;
525 if (t_start + t_size > end) 638 unsigned page_size_mask;
526 t_size = end - t_start; 639};
527 640
528 printk(KERN_CONT "\n %016llx - %016llx pattern %d", 641#define NR_RANGE_MR 5
529 t_start, t_start + t_size, pattern);
530 642
531 memtest(t_start, t_size, pattern); 643static int save_mr(struct map_range *mr, int nr_range,
644 unsigned long start_pfn, unsigned long end_pfn,
645 unsigned long page_size_mask)
646{
532 647
533 t_start += t_size; 648 if (start_pfn < end_pfn) {
534 } 649 if (nr_range >= NR_RANGE_MR)
650 panic("run out of range for init_memory_mapping\n");
651 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
652 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
653 mr[nr_range].page_size_mask = page_size_mask;
654 nr_range++;
535 } 655 }
536 printk(KERN_CONT "\n"); 656
537} 657 return nr_range;
538#else
539static void __init early_memtest(unsigned long start, unsigned long end)
540{
541} 658}
542#endif
543 659
544/* 660/*
545 * Setup the direct mapping of the physical memory at PAGE_OFFSET. 661 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
546 * This runs before bootmem is initialized and gets pages directly from 662 * This runs before bootmem is initialized and gets pages directly from
547 * the physical memory. To access them they are temporarily mapped. 663 * the physical memory. To access them they are temporarily mapped.
548 */ 664 */
549unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) 665unsigned long __init_refok init_memory_mapping(unsigned long start,
666 unsigned long end)
550{ 667{
551 unsigned long next, last_map_addr = end; 668 unsigned long last_map_addr = 0;
552 unsigned long start_phys = start, end_phys = end; 669 unsigned long page_size_mask = 0;
670 unsigned long start_pfn, end_pfn;
671
672 struct map_range mr[NR_RANGE_MR];
673 int nr_range, i;
674 int use_pse, use_gbpages;
553 675
554 printk(KERN_INFO "init_memory_mapping\n"); 676 printk(KERN_INFO "init_memory_mapping\n");
555 677
@@ -560,48 +682,127 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon
560 * memory mapped. Unfortunately this is done currently before the 682 * memory mapped. Unfortunately this is done currently before the
561 * nodes are discovered. 683 * nodes are discovered.
562 */ 684 */
563 if (!after_bootmem) { 685 if (!after_bootmem)
564 init_gbpages(); 686 init_gbpages();
565 find_early_table_space(end);
566 }
567 687
568 start = (unsigned long)__va(start); 688#ifdef CONFIG_DEBUG_PAGEALLOC
569 end = (unsigned long)__va(end); 689 /*
690 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
691 * This will simplify cpa(), which otherwise needs to support splitting
692 * large pages into small in interrupt context, etc.
693 */
694 use_pse = use_gbpages = 0;
695#else
696 use_pse = cpu_has_pse;
697 use_gbpages = direct_gbpages;
698#endif
570 699
571 for (; start < end; start = next) { 700 if (use_gbpages)
572 pgd_t *pgd = pgd_offset_k(start); 701 page_size_mask |= 1 << PG_LEVEL_1G;
573 unsigned long pud_phys; 702 if (use_pse)
574 pud_t *pud; 703 page_size_mask |= 1 << PG_LEVEL_2M;
704
705 memset(mr, 0, sizeof(mr));
706 nr_range = 0;
707
 708 /* head is not big-page aligned? */
709 start_pfn = start >> PAGE_SHIFT;
710 end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
711 << (PMD_SHIFT - PAGE_SHIFT);
712 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
713
 714 /* big page (2M) range */
715 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
716 << (PMD_SHIFT - PAGE_SHIFT);
717 end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
718 << (PUD_SHIFT - PAGE_SHIFT);
719 if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
720 end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
721 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
722 page_size_mask & (1<<PG_LEVEL_2M));
723
724 /* big page (1G) range */
725 start_pfn = end_pfn;
726 end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
727 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
728 page_size_mask &
729 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
730
 731 /* tail is not big-page (1G) aligned */
732 start_pfn = end_pfn;
733 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
734 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
735 page_size_mask & (1<<PG_LEVEL_2M));
736
 737 /* tail is not big-page (2M) aligned */
738 start_pfn = end_pfn;
739 end_pfn = end>>PAGE_SHIFT;
740 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
741
 742 /* try to merge adjacent ranges with the same page size */
743 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
744 unsigned long old_start;
745 if (mr[i].end != mr[i+1].start ||
746 mr[i].page_size_mask != mr[i+1].page_size_mask)
747 continue;
748 /* move it */
749 old_start = mr[i].start;
750 memmove(&mr[i], &mr[i+1],
751 (nr_range - 1 - i) * sizeof (struct map_range));
752 mr[i].start = old_start;
753 nr_range--;
754 }
575 755
576 if (after_bootmem) 756 for (i = 0; i < nr_range; i++)
577 pud = pud_offset(pgd, start & PGDIR_MASK); 757 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
578 else 758 mr[i].start, mr[i].end,
579 pud = alloc_low_page(&pud_phys); 759 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
760 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
580 761
581 next = start + PGDIR_SIZE; 762 if (!after_bootmem)
582 if (next > end) 763 find_early_table_space(end, use_pse, use_gbpages);
583 next = end; 764
584 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); 765 for (i = 0; i < nr_range; i++)
585 if (!after_bootmem) 766 last_map_addr = kernel_physical_mapping_init(
586 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); 767 mr[i].start, mr[i].end,
587 unmap_low_page(pud); 768 mr[i].page_size_mask);
588 }
589 769
590 if (!after_bootmem) 770 if (!after_bootmem)
591 mmu_cr4_features = read_cr4(); 771 mmu_cr4_features = read_cr4();
592 __flush_tlb_all(); 772 __flush_tlb_all();
593 773
594 if (!after_bootmem) 774 if (!after_bootmem && table_end > table_start)
595 reserve_early(table_start << PAGE_SHIFT, 775 reserve_early(table_start << PAGE_SHIFT,
596 table_end << PAGE_SHIFT, "PGTABLE"); 776 table_end << PAGE_SHIFT, "PGTABLE");
597 777
778 printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
779 last_map_addr, end);
780
598 if (!after_bootmem) 781 if (!after_bootmem)
599 early_memtest(start_phys, end_phys); 782 early_memtest(start, end);
600 783
601 return last_map_addr; 784 return last_map_addr >> PAGE_SHIFT;
602} 785}
603 786
604#ifndef CONFIG_NUMA 787#ifndef CONFIG_NUMA
788void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
789{
790 unsigned long bootmap_size, bootmap;
791
792 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
793 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
794 PAGE_SIZE);
795 if (bootmap == -1L)
796 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
797 /* don't touch min_low_pfn */
798 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
799 0, end_pfn);
800 e820_register_active_regions(0, start_pfn, end_pfn);
801 free_bootmem_with_active_regions(0, end_pfn);
802 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
803 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
804}
805
605void __init paging_init(void) 806void __init paging_init(void)
606{ 807{
607 unsigned long max_zone_pfns[MAX_NR_ZONES]; 808 unsigned long max_zone_pfns[MAX_NR_ZONES];
@@ -609,9 +810,9 @@ void __init paging_init(void)
609 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 810 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
610 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 811 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
611 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 812 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
612 max_zone_pfns[ZONE_NORMAL] = end_pfn; 813 max_zone_pfns[ZONE_NORMAL] = max_pfn;
613 814
614 memory_present(0, 0, end_pfn); 815 memory_present(0, 0, max_pfn);
615 sparse_init(); 816 sparse_init();
616 free_area_init_nodes(max_zone_pfns); 817 free_area_init_nodes(max_zone_pfns);
617} 818}
@@ -681,6 +882,8 @@ void __init mem_init(void)
681{ 882{
682 long codesize, reservedpages, datasize, initsize; 883 long codesize, reservedpages, datasize, initsize;
683 884
885 start_periodic_check_for_corruption();
886
684 pci_iommu_alloc(); 887 pci_iommu_alloc();
685 888
686 /* clear_bss() already clear the empty_zero_page */ 889 /* clear_bss() already clear the empty_zero_page */
@@ -693,8 +896,8 @@ void __init mem_init(void)
693#else 896#else
694 totalram_pages = free_all_bootmem(); 897 totalram_pages = free_all_bootmem();
695#endif 898#endif
696 reservedpages = end_pfn - totalram_pages - 899 reservedpages = max_pfn - totalram_pages -
697 absent_pages_in_range(0, end_pfn); 900 absent_pages_in_range(0, max_pfn);
698 after_bootmem = 1; 901 after_bootmem = 1;
699 902
700 codesize = (unsigned long) &_etext - (unsigned long) &_text; 903 codesize = (unsigned long) &_etext - (unsigned long) &_text;
@@ -713,13 +916,11 @@ void __init mem_init(void)
713 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " 916 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
714 "%ldk reserved, %ldk data, %ldk init)\n", 917 "%ldk reserved, %ldk data, %ldk init)\n",
715 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 918 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
716 end_pfn << (PAGE_SHIFT-10), 919 max_pfn << (PAGE_SHIFT-10),
717 codesize >> 10, 920 codesize >> 10,
718 reservedpages << (PAGE_SHIFT-10), 921 reservedpages << (PAGE_SHIFT-10),
719 datasize >> 10, 922 datasize >> 10,
720 initsize >> 10); 923 initsize >> 10);
721
722 cpa_init();
723} 924}
724 925
725void free_init_pages(char *what, unsigned long begin, unsigned long end) 926void free_init_pages(char *what, unsigned long begin, unsigned long end)
@@ -766,6 +967,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
766void mark_rodata_ro(void) 967void mark_rodata_ro(void)
767{ 968{
768 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); 969 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
970 unsigned long rodata_start =
971 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
972
973#ifdef CONFIG_DYNAMIC_FTRACE
974 /* Dynamic tracing modifies the kernel text section */
975 start = rodata_start;
976#endif
769 977
770 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 978 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
771 (end - start) >> 10); 979 (end - start) >> 10);
@@ -775,8 +983,7 @@ void mark_rodata_ro(void)
775 * The rodata section (but not the kernel text!) should also be 983 * The rodata section (but not the kernel text!) should also be
776 * not-executable. 984 * not-executable.
777 */ 985 */
778 start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; 986 set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
779 set_memory_nx(start, (end - start) >> PAGE_SHIFT);
780 987
781 rodata_test(); 988 rodata_test();
782 989
@@ -798,24 +1005,26 @@ void free_initrd_mem(unsigned long start, unsigned long end)
798} 1005}
799#endif 1006#endif
800 1007
801void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 1008int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1009 int flags)
802{ 1010{
803#ifdef CONFIG_NUMA 1011#ifdef CONFIG_NUMA
804 int nid, next_nid; 1012 int nid, next_nid;
1013 int ret;
805#endif 1014#endif
806 unsigned long pfn = phys >> PAGE_SHIFT; 1015 unsigned long pfn = phys >> PAGE_SHIFT;
807 1016
808 if (pfn >= end_pfn) { 1017 if (pfn >= max_pfn) {
809 /* 1018 /*
810 * This can happen with kdump kernels when accessing 1019 * This can happen with kdump kernels when accessing
811 * firmware tables: 1020 * firmware tables:
812 */ 1021 */
813 if (pfn < max_pfn_mapped) 1022 if (pfn < max_pfn_mapped)
814 return; 1023 return -EFAULT;
815 1024
816 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", 1025 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
817 phys, len); 1026 phys, len);
818 return; 1027 return -EFAULT;
819 } 1028 }
820 1029
821 /* Should check here against the e820 map to avoid double free */ 1030 /* Should check here against the e820 map to avoid double free */
@@ -823,9 +1032,13 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
823 nid = phys_to_nid(phys); 1032 nid = phys_to_nid(phys);
824 next_nid = phys_to_nid(phys + len - 1); 1033 next_nid = phys_to_nid(phys + len - 1);
825 if (nid == next_nid) 1034 if (nid == next_nid)
826 reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); 1035 ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
827 else 1036 else
828 reserve_bootmem(phys, len, BOOTMEM_DEFAULT); 1037 ret = reserve_bootmem(phys, len, flags);
1038
1039 if (ret != 0)
1040 return ret;
1041
829#else 1042#else
830 reserve_bootmem(phys, len, BOOTMEM_DEFAULT); 1043 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
831#endif 1044#endif
@@ -834,6 +1047,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
834 dma_reserve += len / PAGE_SIZE; 1047 dma_reserve += len / PAGE_SIZE;
835 set_dma_reserve(dma_reserve); 1048 set_dma_reserve(dma_reserve);
836 } 1049 }
1050
1051 return 0;
837} 1052}
838 1053
839int kern_addr_valid(unsigned long addr) 1054int kern_addr_valid(unsigned long addr)
@@ -938,7 +1153,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
938 pmd_t *pmd; 1153 pmd_t *pmd;
939 1154
940 for (; addr < end; addr = next) { 1155 for (; addr < end; addr = next) {
941 next = pmd_addr_end(addr, end); 1156 void *p = NULL;
942 1157
943 pgd = vmemmap_pgd_populate(addr, node); 1158 pgd = vmemmap_pgd_populate(addr, node);
944 if (!pgd) 1159 if (!pgd)
@@ -948,33 +1163,51 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
948 if (!pud) 1163 if (!pud)
949 return -ENOMEM; 1164 return -ENOMEM;
950 1165
951 pmd = pmd_offset(pud, addr); 1166 if (!cpu_has_pse) {
952 if (pmd_none(*pmd)) { 1167 next = (addr + PAGE_SIZE) & PAGE_MASK;
953 pte_t entry; 1168 pmd = vmemmap_pmd_populate(pud, addr, node);
954 void *p; 1169
1170 if (!pmd)
1171 return -ENOMEM;
1172
1173 p = vmemmap_pte_populate(pmd, addr, node);
955 1174
956 p = vmemmap_alloc_block(PMD_SIZE, node);
957 if (!p) 1175 if (!p)
958 return -ENOMEM; 1176 return -ENOMEM;
959 1177
960 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, 1178 addr_end = addr + PAGE_SIZE;
961 PAGE_KERNEL_LARGE); 1179 p_end = p + PAGE_SIZE;
962 set_pmd(pmd, __pmd(pte_val(entry)));
963
964 /* check to see if we have contiguous blocks */
965 if (p_end != p || node_start != node) {
966 if (p_start)
967 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
968 addr_start, addr_end-1, p_start, p_end-1, node_start);
969 addr_start = addr;
970 node_start = node;
971 p_start = p;
972 }
973 addr_end = addr + PMD_SIZE;
974 p_end = p + PMD_SIZE;
975 } else { 1180 } else {
976 vmemmap_verify((pte_t *)pmd, node, addr, next); 1181 next = pmd_addr_end(addr, end);
1182
1183 pmd = pmd_offset(pud, addr);
1184 if (pmd_none(*pmd)) {
1185 pte_t entry;
1186
1187 p = vmemmap_alloc_block(PMD_SIZE, node);
1188 if (!p)
1189 return -ENOMEM;
1190
1191 entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
1192 PAGE_KERNEL_LARGE);
1193 set_pmd(pmd, __pmd(pte_val(entry)));
1194
1195 /* check to see if we have contiguous blocks */
1196 if (p_end != p || node_start != node) {
1197 if (p_start)
1198 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1199 addr_start, addr_end-1, p_start, p_end-1, node_start);
1200 addr_start = addr;
1201 node_start = node;
1202 p_start = p;
1203 }
1204
1205 addr_end = addr + PMD_SIZE;
1206 p_end = p + PMD_SIZE;
1207 } else
1208 vmemmap_verify((pte_t *)pmd, node, addr, next);
977 } 1209 }
1210
978 } 1211 }
979 return 0; 1212 return 0;
980} 1213}
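The reserve_bootmem_generic() hunk above changes the function from returning void to returning an int and adds a flags argument. A minimal caller sketch under that new signature follows; the table address, length, helper name and warning text are hypothetical and only illustrate checking the new return value:

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/bootmem.h>

int reserve_bootmem_generic(unsigned long phys, unsigned long len, int flags);

/* Hypothetical early reservation of a firmware table during boot. */
static void __init reserve_fw_table_sketch(unsigned long table_phys,
					   unsigned long table_len)
{
	int ret;

	/* BOOTMEM_EXCLUSIVE turns an already-reserved overlap into an error. */
	ret = reserve_bootmem_generic(table_phys, table_len, BOOTMEM_EXCLUSIVE);
	if (ret)
		printk(KERN_WARNING "sketch: cannot reserve %lx+%lx: %d\n",
		       table_phys, table_len, ret);
}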
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 9dd3cb905971..8cbeda15cd29 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -12,6 +12,7 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/mmiotrace.h>
15 16
16#include <asm/cacheflush.h> 17#include <asm/cacheflush.h>
17#include <asm/e820.h> 18#include <asm/e820.h>
@@ -101,6 +102,25 @@ int page_is_ram(unsigned long pagenr)
101 return 0; 102 return 0;
102} 103}
103 104
105int pagerange_is_ram(unsigned long start, unsigned long end)
106{
107 int ram_page = 0, not_rampage = 0;
108 unsigned long page_nr;
109
110 for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
111 ++page_nr) {
112 if (page_is_ram(page_nr))
113 ram_page = 1;
114 else
115 not_rampage = 1;
116
117 if (ram_page == not_rampage)
118 return -1;
119 }
120
121 return ram_page;
122}
123
104/* 124/*
105 * Fix up the linear direct mapping of the kernel to avoid cache attribute 125 * Fix up the linear direct mapping of the kernel to avoid cache attribute
106 * conflicts. 126 * conflicts.
@@ -141,10 +161,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
141{ 161{
142 unsigned long pfn, offset, vaddr; 162 unsigned long pfn, offset, vaddr;
143 resource_size_t last_addr; 163 resource_size_t last_addr;
164 const resource_size_t unaligned_phys_addr = phys_addr;
165 const unsigned long unaligned_size = size;
144 struct vm_struct *area; 166 struct vm_struct *area;
145 unsigned long new_prot_val; 167 unsigned long new_prot_val;
146 pgprot_t prot; 168 pgprot_t prot;
147 int retval; 169 int retval;
170 void __iomem *ret_addr;
148 171
149 /* Don't allow wraparound or zero size */ 172 /* Don't allow wraparound or zero size */
150 last_addr = phys_addr + size - 1; 173 last_addr = phys_addr + size - 1;
@@ -161,7 +184,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
161 /* 184 /*
162 * Don't remap the low PCI/ISA area, it's always mapped.. 185 * Don't remap the low PCI/ISA area, it's always mapped..
163 */ 186 */
164 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) 187 if (is_ISA_range(phys_addr, last_addr))
165 return (__force void __iomem *)phys_to_virt(phys_addr); 188 return (__force void __iomem *)phys_to_virt(phys_addr);
166 189
167 /* 190 /*
@@ -185,7 +208,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
185 phys_addr &= PAGE_MASK; 208 phys_addr &= PAGE_MASK;
186 size = PAGE_ALIGN(last_addr+1) - phys_addr; 209 size = PAGE_ALIGN(last_addr+1) - phys_addr;
187 210
188 retval = reserve_memtype(phys_addr, phys_addr + size, 211 retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
189 prot_val, &new_prot_val); 212 prot_val, &new_prot_val);
190 if (retval) { 213 if (retval) {
191 pr_debug("Warning: reserve_memtype returned %d\n", retval); 214 pr_debug("Warning: reserve_memtype returned %d\n", retval);
@@ -252,7 +275,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
252 return NULL; 275 return NULL;
253 } 276 }
254 277
255 return (void __iomem *) (vaddr + offset); 278 ret_addr = (void __iomem *) (vaddr + offset);
279 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
280
281 return ret_addr;
256} 282}
257 283
258/** 284/**
@@ -280,7 +306,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
280{ 306{
281 /* 307 /*
282 * Ideally, this should be: 308 * Ideally, this should be:
283 * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS; 309 * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
284 * 310 *
285 * Till we fix all X drivers to use ioremap_wc(), we will use 311 * Till we fix all X drivers to use ioremap_wc(), we will use
286 * UC MINUS. 312 * UC MINUS.
@@ -304,7 +330,7 @@ EXPORT_SYMBOL(ioremap_nocache);
304 */ 330 */
305void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) 331void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
306{ 332{
307 if (pat_wc_enabled) 333 if (pat_enabled)
308 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, 334 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
309 __builtin_return_address(0)); 335 __builtin_return_address(0));
310 else 336 else
@@ -319,6 +345,37 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
319} 345}
320EXPORT_SYMBOL(ioremap_cache); 346EXPORT_SYMBOL(ioremap_cache);
321 347
348static void __iomem *ioremap_default(resource_size_t phys_addr,
349 unsigned long size)
350{
351 unsigned long flags;
352 void *ret;
353 int err;
354
355 /*
356 * - WB for WB-able memory and no other conflicting mappings
357 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
 358 * - Inherit from conflicting mappings otherwise
359 */
360 err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
361 if (err < 0)
362 return NULL;
363
364 ret = (void *) __ioremap_caller(phys_addr, size, flags,
365 __builtin_return_address(0));
366
367 free_memtype(phys_addr, phys_addr + size);
368 return (void __iomem *)ret;
369}
370
371void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
372 unsigned long prot_val)
373{
374 return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
375 __builtin_return_address(0));
376}
377EXPORT_SYMBOL(ioremap_prot);
378
322/** 379/**
323 * iounmap - Free a IO remapping 380 * iounmap - Free a IO remapping
324 * @addr: virtual address from ioremap_* 381 * @addr: virtual address from ioremap_*
@@ -337,13 +394,15 @@ void iounmap(volatile void __iomem *addr)
337 * vm_area and by simply returning an address into the kernel mapping 394 * vm_area and by simply returning an address into the kernel mapping
338 * of ISA space. So handle that here. 395 * of ISA space. So handle that here.
339 */ 396 */
340 if (addr >= phys_to_virt(ISA_START_ADDRESS) && 397 if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
341 addr < phys_to_virt(ISA_END_ADDRESS)) 398 (void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
342 return; 399 return;
343 400
344 addr = (volatile void __iomem *) 401 addr = (volatile void __iomem *)
345 (PAGE_MASK & (unsigned long __force)addr); 402 (PAGE_MASK & (unsigned long __force)addr);
346 403
404 mmiotrace_iounmap(addr);
405
347 /* Use the vm area unlocked, assuming the caller 406 /* Use the vm area unlocked, assuming the caller
348 ensures there isn't another iounmap for the same address 407 ensures there isn't another iounmap for the same address
349 in parallel. Reuse of the virtual address is prevented by 408 in parallel. Reuse of the virtual address is prevented by
@@ -351,7 +410,7 @@ void iounmap(volatile void __iomem *addr)
351 cpa takes care of the direct mappings. */ 410 cpa takes care of the direct mappings. */
352 read_lock(&vmlist_lock); 411 read_lock(&vmlist_lock);
353 for (p = vmlist; p; p = p->next) { 412 for (p = vmlist; p; p = p->next) {
354 if (p->addr == addr) 413 if (p->addr == (void __force *)addr)
355 break; 414 break;
356 } 415 }
357 read_unlock(&vmlist_lock); 416 read_unlock(&vmlist_lock);
@@ -365,7 +424,7 @@ void iounmap(volatile void __iomem *addr)
365 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); 424 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
366 425
367 /* Finally remove it */ 426 /* Finally remove it */
368 o = remove_vm_area((void *)addr); 427 o = remove_vm_area((void __force *)addr);
369 BUG_ON(p != o || o == NULL); 428 BUG_ON(p != o || o == NULL);
370 kfree(p); 429 kfree(p);
371} 430}
@@ -384,7 +443,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
384 if (page_is_ram(start >> PAGE_SHIFT)) 443 if (page_is_ram(start >> PAGE_SHIFT))
385 return __va(phys); 444 return __va(phys);
386 445
387 addr = (void *)ioremap(start, PAGE_SIZE); 446 addr = (void __force *)ioremap_default(start, PAGE_SIZE);
388 if (addr) 447 if (addr)
389 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); 448 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
390 449
@@ -400,9 +459,7 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
400 return; 459 return;
401} 460}
402 461
403#ifdef CONFIG_X86_32 462static int __initdata early_ioremap_debug;
404
405int __initdata early_ioremap_debug;
406 463
407static int __init early_ioremap_debug_setup(char *str) 464static int __init early_ioremap_debug_setup(char *str)
408{ 465{
@@ -413,8 +470,7 @@ static int __init early_ioremap_debug_setup(char *str)
413early_param("early_ioremap_debug", early_ioremap_debug_setup); 470early_param("early_ioremap_debug", early_ioremap_debug_setup);
414 471
415static __initdata int after_paging_init; 472static __initdata int after_paging_init;
416static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] 473static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
417 __section(.bss.page_aligned);
418 474
419static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) 475static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
420{ 476{
@@ -503,10 +559,11 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
503 return; 559 return;
504 } 560 }
505 pte = early_ioremap_pte(addr); 561 pte = early_ioremap_pte(addr);
562
506 if (pgprot_val(flags)) 563 if (pgprot_val(flags))
507 set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); 564 set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
508 else 565 else
509 pte_clear(NULL, addr, pte); 566 pte_clear(&init_mm, addr, pte);
510 __flush_tlb_one(addr); 567 __flush_tlb_one(addr);
511} 568}
512 569
@@ -528,19 +585,17 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
528} 585}
529 586
530 587
531int __initdata early_ioremap_nested; 588static int __initdata early_ioremap_nested;
532 589
533static int __init check_early_ioremap_leak(void) 590static int __init check_early_ioremap_leak(void)
534{ 591{
535 if (!early_ioremap_nested) 592 if (!early_ioremap_nested)
536 return 0; 593 return 0;
537 594 WARN(1, KERN_WARNING
538 printk(KERN_WARNING
539 "Debug warning: early ioremap leak of %d areas detected.\n", 595 "Debug warning: early ioremap leak of %d areas detected.\n",
540 early_ioremap_nested); 596 early_ioremap_nested);
541 printk(KERN_WARNING 597 printk(KERN_WARNING
542 "please boot with early_ioremap_debug and report the dmesg.\n"); 598 "please boot with early_ioremap_debug and report the dmesg.\n");
543 WARN_ON(1);
544 599
545 return 1; 600 return 1;
546} 601}
@@ -578,7 +633,7 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
578 */ 633 */
579 offset = phys_addr & ~PAGE_MASK; 634 offset = phys_addr & ~PAGE_MASK;
580 phys_addr &= PAGE_MASK; 635 phys_addr &= PAGE_MASK;
581 size = PAGE_ALIGN(last_addr) - phys_addr; 636 size = PAGE_ALIGN(last_addr + 1) - phys_addr;
582 637
583 /* 638 /*
584 * Mappings have to fit in the FIX_BTMAP area. 639 * Mappings have to fit in the FIX_BTMAP area.
@@ -644,5 +699,3 @@ void __this_fixmap_does_not_exist(void)
644{ 699{
645 WARN_ON(1); 700 WARN_ON(1);
646} 701}
647
648#endif /* CONFIG_X86_32 */
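The ioremap.c changes above add pagerange_is_ram(), which returns 1 when every page in [start, end) is RAM, 0 when none is, and -1 for a mixed range. A minimal sketch of interpreting that tri-state result, assuming only the prototype shown in the hunk (the caller name is hypothetical):

#include <linux/errno.h>

int pagerange_is_ram(unsigned long start, unsigned long end);

/* Hypothetical: refuse to change attributes on a mixed RAM/non-RAM range. */
static int check_range_sketch(unsigned long start, unsigned long end)
{
	int is_ram = pagerange_is_ram(start, end);

	if (is_ram < 0)
		return -EINVAL;		/* mixed range: do not guess */

	return is_ram;			/* 1: all RAM, 0: no RAM at all */
}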
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 1f476e477844..41f1b5c00a1d 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -22,6 +22,7 @@
22#include <asm/numa.h> 22#include <asm/numa.h>
23#include <asm/mpspec.h> 23#include <asm/mpspec.h>
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/k8.h>
25 26
26static __init int find_northbridge(void) 27static __init int find_northbridge(void)
27{ 28{
@@ -56,34 +57,33 @@ static __init void early_get_boot_cpu_id(void)
56 /* 57 /*
57 * Find possible boot-time SMP configuration: 58 * Find possible boot-time SMP configuration:
58 */ 59 */
60#ifdef CONFIG_X86_MPPARSE
59 early_find_smp_config(); 61 early_find_smp_config();
62#endif
60#ifdef CONFIG_ACPI 63#ifdef CONFIG_ACPI
61 /* 64 /*
62 * Read APIC information from ACPI tables. 65 * Read APIC information from ACPI tables.
63 */ 66 */
64 early_acpi_boot_init(); 67 early_acpi_boot_init();
65#endif 68#endif
69#ifdef CONFIG_X86_MPPARSE
66 /* 70 /*
67 * get boot-time SMP configuration: 71 * get boot-time SMP configuration:
68 */ 72 */
69 if (smp_found_config) 73 if (smp_found_config)
70 early_get_smp_config(); 74 early_get_smp_config();
75#endif
71 early_init_lapic_mapping(); 76 early_init_lapic_mapping();
72} 77}
73 78
74int __init k8_scan_nodes(unsigned long start, unsigned long end) 79int __init k8_scan_nodes(unsigned long start, unsigned long end)
75{ 80{
81 unsigned numnodes, cores, bits, apicid_base;
76 unsigned long prevbase; 82 unsigned long prevbase;
77 struct bootnode nodes[8]; 83 struct bootnode nodes[8];
78 int nodeid, i, nb;
79 unsigned char nodeids[8]; 84 unsigned char nodeids[8];
80 int found = 0; 85 int i, j, nb, found = 0;
81 u32 reg; 86 u32 nodeid, reg;
82 unsigned numnodes;
83 unsigned cores;
84 unsigned bits;
85 int j;
86 unsigned apicid_base;
87 87
88 if (!early_pci_allowed()) 88 if (!early_pci_allowed())
89 return -1; 89 return -1;
@@ -105,7 +105,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
105 prevbase = 0; 105 prevbase = 0;
106 for (i = 0; i < 8; i++) { 106 for (i = 0; i < 8; i++) {
107 unsigned long base, limit; 107 unsigned long base, limit;
108 u32 nodeid;
109 108
110 base = read_pci_config(0, nb, 1, 0x40 + i*8); 109 base = read_pci_config(0, nb, 1, 0x40 + i*8);
111 limit = read_pci_config(0, nb, 1, 0x44 + i*8); 110 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
@@ -144,8 +143,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
144 limit |= (1<<24)-1; 143 limit |= (1<<24)-1;
145 limit++; 144 limit++;
146 145
147 if (limit > end_pfn << PAGE_SHIFT) 146 if (limit > max_pfn << PAGE_SHIFT)
148 limit = end_pfn << PAGE_SHIFT; 147 limit = max_pfn << PAGE_SHIFT;
149 if (limit <= base) 148 if (limit <= base)
150 continue; 149 continue;
151 150
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
new file mode 100644
index 000000000000..93d82038af4b
--- /dev/null
+++ b/arch/x86/mm/kmmio.c
@@ -0,0 +1,510 @@
1/* Support for MMIO probes.
 2 * Benefits from much code borrowed from kprobes
3 * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
4 * 2007 Alexander Eichner
5 * 2008 Pekka Paalanen <pq@iki.fi>
6 */
7
8#include <linux/list.h>
9#include <linux/rculist.h>
10#include <linux/spinlock.h>
11#include <linux/hash.h>
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/kernel.h>
15#include <linux/uaccess.h>
16#include <linux/ptrace.h>
17#include <linux/preempt.h>
18#include <linux/percpu.h>
19#include <linux/kdebug.h>
20#include <linux/mutex.h>
21#include <linux/io.h>
22#include <asm/cacheflush.h>
23#include <asm/tlbflush.h>
24#include <linux/errno.h>
25#include <asm/debugreg.h>
26#include <linux/mmiotrace.h>
27
28#define KMMIO_PAGE_HASH_BITS 4
29#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
30
31struct kmmio_fault_page {
32 struct list_head list;
33 struct kmmio_fault_page *release_next;
34 unsigned long page; /* location of the fault page */
35
36 /*
37 * Number of times this page has been registered as a part
38 * of a probe. If zero, page is disarmed and this may be freed.
39 * Used only by writers (RCU).
40 */
41 int count;
42};
43
44struct kmmio_delayed_release {
45 struct rcu_head rcu;
46 struct kmmio_fault_page *release_list;
47};
48
49struct kmmio_context {
50 struct kmmio_fault_page *fpage;
51 struct kmmio_probe *probe;
52 unsigned long saved_flags;
53 unsigned long addr;
54 int active;
55};
56
57static DEFINE_SPINLOCK(kmmio_lock);
58
59/* Protected by kmmio_lock */
60unsigned int kmmio_count;
61
62/* Read-protected by RCU, write-protected by kmmio_lock. */
63static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
64static LIST_HEAD(kmmio_probes);
65
66static struct list_head *kmmio_page_list(unsigned long page)
67{
68 return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
69}
70
71/* Accessed per-cpu */
72static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
73
74/*
75 * this is basically a dynamic stabbing problem:
76 * Could use the existing prio tree code or
77 * Possible better implementations:
78 * The Interval Skip List: A Data Structure for Finding All Intervals That
79 * Overlap a Point (might be simple)
80 * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
81 */
82/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
83static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
84{
85 struct kmmio_probe *p;
86 list_for_each_entry_rcu(p, &kmmio_probes, list) {
87 if (addr >= p->addr && addr <= (p->addr + p->len))
88 return p;
89 }
90 return NULL;
91}
92
93/* You must be holding RCU read lock. */
94static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
95{
96 struct list_head *head;
97 struct kmmio_fault_page *p;
98
99 page &= PAGE_MASK;
100 head = kmmio_page_list(page);
101 list_for_each_entry_rcu(p, head, list) {
102 if (p->page == page)
103 return p;
104 }
105 return NULL;
106}
107
108static void set_page_present(unsigned long addr, bool present,
109 unsigned int *pglevel)
110{
111 pteval_t pteval;
112 pmdval_t pmdval;
113 unsigned int level;
114 pmd_t *pmd;
115 pte_t *pte = lookup_address(addr, &level);
116
117 if (!pte) {
118 pr_err("kmmio: no pte for page 0x%08lx\n", addr);
119 return;
120 }
121
122 if (pglevel)
123 *pglevel = level;
124
125 switch (level) {
126 case PG_LEVEL_2M:
127 pmd = (pmd_t *)pte;
128 pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT;
129 if (present)
130 pmdval |= _PAGE_PRESENT;
131 set_pmd(pmd, __pmd(pmdval));
132 break;
133
134 case PG_LEVEL_4K:
135 pteval = pte_val(*pte) & ~_PAGE_PRESENT;
136 if (present)
137 pteval |= _PAGE_PRESENT;
138 set_pte_atomic(pte, __pte(pteval));
139 break;
140
141 default:
142 pr_err("kmmio: unexpected page level 0x%x.\n", level);
143 return;
144 }
145
146 __flush_tlb_one(addr);
147}
148
149/** Mark the given page as not present. Access to it will trigger a fault. */
150static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
151{
152 set_page_present(page & PAGE_MASK, false, pglevel);
153}
154
155/** Mark the given page as present. */
156static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
157{
158 set_page_present(page & PAGE_MASK, true, pglevel);
159}
160
161/*
162 * This is being called from do_page_fault().
163 *
 164 * We may be in an interrupt or a critical section. Also prefetching may
 165 * trigger a page fault. We may be in the middle of a process switch.
 166 * We cannot take any locks, because we could already be executing
 167 * within a kmmio critical section.
168 *
169 * Local interrupts are disabled, so preemption cannot happen.
170 * Do not enable interrupts, do not sleep, and watch out for other CPUs.
171 */
172/*
 173 * Interrupts are disabled on entry, as the page fault handler uses an
 174 * interrupt gate, and they remain disabled throughout this function.
175 */
176int kmmio_handler(struct pt_regs *regs, unsigned long addr)
177{
178 struct kmmio_context *ctx;
179 struct kmmio_fault_page *faultpage;
180 int ret = 0; /* default to fault not handled */
181
182 /*
183 * Preemption is now disabled to prevent process switch during
184 * single stepping. We can only handle one active kmmio trace
185 * per cpu, so ensure that we finish it before something else
186 * gets to run. We also hold the RCU read lock over single
187 * stepping to avoid looking up the probe and kmmio_fault_page
188 * again.
189 */
190 preempt_disable();
191 rcu_read_lock();
192
193 faultpage = get_kmmio_fault_page(addr);
194 if (!faultpage) {
195 /*
196 * Either this page fault is not caused by kmmio, or
197 * another CPU just pulled the kmmio probe from under
198 * our feet. The latter case should not be possible.
199 */
200 goto no_kmmio;
201 }
202
203 ctx = &get_cpu_var(kmmio_ctx);
204 if (ctx->active) {
205 disarm_kmmio_fault_page(faultpage->page, NULL);
206 if (addr == ctx->addr) {
207 /*
208 * On SMP we sometimes get recursive probe hits on the
209 * same address. Context is already saved, fall out.
210 */
211 pr_debug("kmmio: duplicate probe hit on CPU %d, for "
212 "address 0x%08lx.\n",
213 smp_processor_id(), addr);
214 ret = 1;
215 goto no_kmmio_ctx;
216 }
217 /*
218 * Prevent overwriting already in-flight context.
219 * This should not happen, let's hope disarming at least
220 * prevents a panic.
221 */
222 pr_emerg("kmmio: recursive probe hit on CPU %d, "
223 "for address 0x%08lx. Ignoring.\n",
224 smp_processor_id(), addr);
225 pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
226 ctx->addr);
227 goto no_kmmio_ctx;
228 }
229 ctx->active++;
230
231 ctx->fpage = faultpage;
232 ctx->probe = get_kmmio_probe(addr);
233 ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
234 ctx->addr = addr;
235
236 if (ctx->probe && ctx->probe->pre_handler)
237 ctx->probe->pre_handler(ctx->probe, regs, addr);
238
239 /*
240 * Enable single-stepping and disable interrupts for the faulting
241 * context. Local interrupts must not get enabled during stepping.
242 */
243 regs->flags |= X86_EFLAGS_TF;
244 regs->flags &= ~X86_EFLAGS_IF;
245
246 /* Now we set present bit in PTE and single step. */
247 disarm_kmmio_fault_page(ctx->fpage->page, NULL);
248
249 /*
250 * If another cpu accesses the same page while we are stepping,
251 * the access will not be caught. It will simply succeed and the
252 * only downside is we lose the event. If this becomes a problem,
253 * the user should drop to single cpu before tracing.
254 */
255
256 put_cpu_var(kmmio_ctx);
257 return 1; /* fault handled */
258
259no_kmmio_ctx:
260 put_cpu_var(kmmio_ctx);
261no_kmmio:
262 rcu_read_unlock();
263 preempt_enable_no_resched();
264 return ret;
265}
266
267/*
268 * Interrupts are disabled on entry as trap1 is an interrupt gate
 269 * and they remain disabled throughout this function.
270 * This must always get called as the pair to kmmio_handler().
271 */
272static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
273{
274 int ret = 0;
275 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
276
277 if (!ctx->active) {
278 pr_debug("kmmio: spurious debug trap on CPU %d.\n",
279 smp_processor_id());
280 goto out;
281 }
282
283 if (ctx->probe && ctx->probe->post_handler)
284 ctx->probe->post_handler(ctx->probe, condition, regs);
285
286 arm_kmmio_fault_page(ctx->fpage->page, NULL);
287
288 regs->flags &= ~X86_EFLAGS_TF;
289 regs->flags |= ctx->saved_flags;
290
291 /* These were acquired in kmmio_handler(). */
292 ctx->active--;
293 BUG_ON(ctx->active);
294 rcu_read_unlock();
295 preempt_enable_no_resched();
296
297 /*
298 * if somebody else is singlestepping across a probe point, flags
299 * will have TF set, in which case, continue the remaining processing
300 * of do_debug, as if this is not a probe hit.
301 */
302 if (!(regs->flags & X86_EFLAGS_TF))
303 ret = 1;
304out:
305 put_cpu_var(kmmio_ctx);
306 return ret;
307}
308
309/* You must be holding kmmio_lock. */
310static int add_kmmio_fault_page(unsigned long page)
311{
312 struct kmmio_fault_page *f;
313
314 page &= PAGE_MASK;
315 f = get_kmmio_fault_page(page);
316 if (f) {
317 if (!f->count)
318 arm_kmmio_fault_page(f->page, NULL);
319 f->count++;
320 return 0;
321 }
322
323 f = kmalloc(sizeof(*f), GFP_ATOMIC);
324 if (!f)
325 return -1;
326
327 f->count = 1;
328 f->page = page;
329 list_add_rcu(&f->list, kmmio_page_list(f->page));
330
331 arm_kmmio_fault_page(f->page, NULL);
332
333 return 0;
334}
335
336/* You must be holding kmmio_lock. */
337static void release_kmmio_fault_page(unsigned long page,
338 struct kmmio_fault_page **release_list)
339{
340 struct kmmio_fault_page *f;
341
342 page &= PAGE_MASK;
343 f = get_kmmio_fault_page(page);
344 if (!f)
345 return;
346
347 f->count--;
348 BUG_ON(f->count < 0);
349 if (!f->count) {
350 disarm_kmmio_fault_page(f->page, NULL);
351 f->release_next = *release_list;
352 *release_list = f;
353 }
354}
355
356/*
357 * With page-unaligned ioremaps, one or two armed pages may contain
358 * addresses from outside the intended mapping. Events for these addresses
359 * are currently silently dropped. The events may result only from programming
360 * mistakes by accessing addresses before the beginning or past the end of a
361 * mapping.
362 */
363int register_kmmio_probe(struct kmmio_probe *p)
364{
365 unsigned long flags;
366 int ret = 0;
367 unsigned long size = 0;
368 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
369
370 spin_lock_irqsave(&kmmio_lock, flags);
371 if (get_kmmio_probe(p->addr)) {
372 ret = -EEXIST;
373 goto out;
374 }
375 kmmio_count++;
376 list_add_rcu(&p->list, &kmmio_probes);
377 while (size < size_lim) {
378 if (add_kmmio_fault_page(p->addr + size))
379 pr_err("kmmio: Unable to set page fault.\n");
380 size += PAGE_SIZE;
381 }
382out:
383 spin_unlock_irqrestore(&kmmio_lock, flags);
384 /*
385 * XXX: What should I do here?
386 * Here was a call to global_flush_tlb(), but it does not exist
387 * anymore. It seems it's not needed after all.
388 */
389 return ret;
390}
391EXPORT_SYMBOL(register_kmmio_probe);
392
393static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
394{
395 struct kmmio_delayed_release *dr = container_of(
396 head,
397 struct kmmio_delayed_release,
398 rcu);
399 struct kmmio_fault_page *p = dr->release_list;
400 while (p) {
401 struct kmmio_fault_page *next = p->release_next;
402 BUG_ON(p->count);
403 kfree(p);
404 p = next;
405 }
406 kfree(dr);
407}
408
409static void remove_kmmio_fault_pages(struct rcu_head *head)
410{
411 struct kmmio_delayed_release *dr = container_of(
412 head,
413 struct kmmio_delayed_release,
414 rcu);
415 struct kmmio_fault_page *p = dr->release_list;
416 struct kmmio_fault_page **prevp = &dr->release_list;
417 unsigned long flags;
418 spin_lock_irqsave(&kmmio_lock, flags);
419 while (p) {
420 if (!p->count)
421 list_del_rcu(&p->list);
422 else
423 *prevp = p->release_next;
424 prevp = &p->release_next;
425 p = p->release_next;
426 }
427 spin_unlock_irqrestore(&kmmio_lock, flags);
428 /* This is the real RCU destroy call. */
429 call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
430}
431
432/*
433 * Remove a kmmio probe. You have to synchronize_rcu() before you can be
434 * sure that the callbacks will not be called anymore. Only after that
435 * you may actually release your struct kmmio_probe.
436 *
437 * Unregistering a kmmio fault page has three steps:
438 * 1. release_kmmio_fault_page()
439 * Disarm the page, wait a grace period to let all faults finish.
440 * 2. remove_kmmio_fault_pages()
441 * Remove the pages from kmmio_page_table.
442 * 3. rcu_free_kmmio_fault_pages()
 443 *    Actually free the kmmio_fault_page structs via RCU.
444 */
445void unregister_kmmio_probe(struct kmmio_probe *p)
446{
447 unsigned long flags;
448 unsigned long size = 0;
449 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
450 struct kmmio_fault_page *release_list = NULL;
451 struct kmmio_delayed_release *drelease;
452
453 spin_lock_irqsave(&kmmio_lock, flags);
454 while (size < size_lim) {
455 release_kmmio_fault_page(p->addr + size, &release_list);
456 size += PAGE_SIZE;
457 }
458 list_del_rcu(&p->list);
459 kmmio_count--;
460 spin_unlock_irqrestore(&kmmio_lock, flags);
461
462 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
463 if (!drelease) {
464 pr_crit("kmmio: leaking kmmio_fault_page objects.\n");
465 return;
466 }
467 drelease->release_list = release_list;
468
469 /*
470 * This is not really RCU here. We have just disarmed a set of
471 * pages so that they cannot trigger page faults anymore. However,
472 * we cannot remove the pages from kmmio_page_table,
473 * because a probe hit might be in flight on another CPU. The
474 * pages are collected into a list, and they will be removed from
475 * kmmio_page_table when it is certain that no probe hit related to
476 * these pages can be in flight. RCU grace period sounds like a
477 * good choice.
478 *
479 * If we removed the pages too early, kmmio page fault handler might
480 * not find the respective kmmio_fault_page and determine it's not
481 * a kmmio fault, when it actually is. This would lead to madness.
482 */
483 call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
484}
485EXPORT_SYMBOL(unregister_kmmio_probe);
486
487static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
488 void *args)
489{
490 struct die_args *arg = args;
491
492 if (val == DIE_DEBUG && (arg->err & DR_STEP))
493 if (post_kmmio_handler(arg->err, arg->regs) == 1)
494 return NOTIFY_STOP;
495
496 return NOTIFY_DONE;
497}
498
499static struct notifier_block nb_die = {
500 .notifier_call = kmmio_die_notifier
501};
502
503static int __init init_kmmio(void)
504{
505 int i;
506 for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
507 INIT_LIST_HEAD(&kmmio_page_table[i]);
508 return register_die_notifier(&nb_die);
509}
510fs_initcall(init_kmmio); /* should be before device_initcall() */
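The new kmmio.c exposes register_kmmio_probe()/unregister_kmmio_probe(), and the comment block before unregister_kmmio_probe() requires a synchronize_rcu() before the struct kmmio_probe may be freed. A minimal, hypothetical client sketch of that protocol; the handler bodies, helper names and the traced region are placeholders:

#include <linux/mmiotrace.h>
#include <linux/ptrace.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/errno.h>

static void example_pre(struct kmmio_probe *p, struct pt_regs *regs,
			unsigned long addr)
{
	/* runs with the faulting page disarmed, before single-stepping */
}

static void example_post(struct kmmio_probe *p, unsigned long cond,
			 struct pt_regs *regs)
{
	/* runs after the faulting instruction has been single-stepped */
}

static struct kmmio_probe *example_trace_region(unsigned long mmio_addr,
						unsigned long len)
{
	struct kmmio_probe *p = kzalloc(sizeof(*p), GFP_KERNEL);

	if (!p)
		return NULL;
	p->addr = mmio_addr;
	p->len = len;
	p->pre_handler = example_pre;
	p->post_handler = example_post;
	if (register_kmmio_probe(p)) {	/* -EEXIST if already probed */
		kfree(p);
		return NULL;
	}
	return p;
}

static void example_untrace(struct kmmio_probe *p)
{
	unregister_kmmio_probe(p);
	synchronize_rcu();	/* callbacks are guaranteed finished only now */
	kfree(p);
}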
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
new file mode 100644
index 000000000000..672e17f8262a
--- /dev/null
+++ b/arch/x86/mm/memtest.c
@@ -0,0 +1,123 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/string.h>
4#include <linux/types.h>
5#include <linux/mm.h>
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/pfn.h>
9
10#include <asm/e820.h>
11
12static void __init memtest(unsigned long start_phys, unsigned long size,
13 unsigned pattern)
14{
15 unsigned long i;
16 unsigned long *start;
17 unsigned long start_bad;
18 unsigned long last_bad;
19 unsigned long val;
20 unsigned long start_phys_aligned;
21 unsigned long count;
22 unsigned long incr;
23
24 switch (pattern) {
25 case 0:
26 val = 0UL;
27 break;
28 case 1:
29 val = -1UL;
30 break;
31 case 2:
32#ifdef CONFIG_X86_64
33 val = 0x5555555555555555UL;
34#else
35 val = 0x55555555UL;
36#endif
37 break;
38 case 3:
39#ifdef CONFIG_X86_64
40 val = 0xaaaaaaaaaaaaaaaaUL;
41#else
42 val = 0xaaaaaaaaUL;
43#endif
44 break;
45 default:
46 return;
47 }
48
49 incr = sizeof(unsigned long);
50 start_phys_aligned = ALIGN(start_phys, incr);
51 count = (size - (start_phys_aligned - start_phys))/incr;
52 start = __va(start_phys_aligned);
53 start_bad = 0;
54 last_bad = 0;
55
56 for (i = 0; i < count; i++)
57 start[i] = val;
58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
59 if (*start != val) {
60 if (start_phys_aligned == last_bad + incr) {
61 last_bad += incr;
62 } else {
63 if (start_bad) {
64 printk(KERN_CONT "\n %010lx bad mem addr %010lx - %010lx reserved",
65 val, start_bad, last_bad + incr);
66 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
67 }
68 start_bad = last_bad = start_phys_aligned;
69 }
70 }
71 }
72 if (start_bad) {
73 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
74 val, start_bad, last_bad + incr);
75 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
76 }
77
78}
79
80/* default is disabled */
81static int memtest_pattern __initdata;
82
83static int __init parse_memtest(char *arg)
84{
85 if (arg)
86 memtest_pattern = simple_strtoul(arg, NULL, 0);
87 return 0;
88}
89
90early_param("memtest", parse_memtest);
91
92void __init early_memtest(unsigned long start, unsigned long end)
93{
94 u64 t_start, t_size;
95 unsigned pattern;
96
97 if (!memtest_pattern)
98 return;
99
100 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
101 for (pattern = 0; pattern < memtest_pattern; pattern++) {
102 t_start = start;
103 t_size = 0;
104 while (t_start < end) {
105 t_start = find_e820_area_size(t_start, &t_size, 1);
106
107 /* done ? */
108 if (t_start >= end)
109 break;
110 if (t_start + t_size > end)
111 t_size = end - t_start;
112
113 printk(KERN_CONT "\n %010llx - %010llx pattern %d",
114 (unsigned long long)t_start,
115 (unsigned long long)t_start + t_size, pattern);
116
117 memtest(t_start, t_size, pattern);
118
119 t_start += t_size;
120 }
121 }
122 printk(KERN_CONT "\n");
123}
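The new memtest.c is driven by the memtest=N early parameter: N selects how many of the four patterns (0x0, all ones, 0x5555..., 0xaaaa...) are written and verified over free e820 RAM, and bad ranges are reserved as "BAD RAM". The only caller-facing entry point is early_memtest(), invoked right after a physical range has been mapped (see the init_64.c hunk at the top of this diff). A trivial, hypothetical call-site sketch:

#include <linux/init.h>

void early_memtest(unsigned long start, unsigned long end);

/* Hypothetical: verify a freshly mapped physical range before bootmem is up. */
static void __init memtest_mapped_range_sketch(unsigned long start_phys,
					       unsigned long end_phys)
{
	/* No-op unless the user booted with memtest=N (N > 0 patterns). */
	early_memtest(start_phys, end_phys);
}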
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
new file mode 100644
index 000000000000..635b50e85581
--- /dev/null
+++ b/arch/x86/mm/mmio-mod.c
@@ -0,0 +1,517 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) IBM Corporation, 2005
17 * Jeff Muizelaar, 2006, 2007
18 * Pekka Paalanen, 2008 <pq@iki.fi>
19 *
20 * Derived from the read-mod example from relay-examples by Tom Zanussi.
21 */
22#define DEBUG 1
23
24#include <linux/module.h>
25#include <linux/debugfs.h>
26#include <linux/uaccess.h>
27#include <linux/io.h>
28#include <linux/version.h>
29#include <linux/kallsyms.h>
30#include <asm/pgtable.h>
31#include <linux/mmiotrace.h>
32#include <asm/e820.h> /* for ISA_START_ADDRESS */
33#include <asm/atomic.h>
34#include <linux/percpu.h>
35#include <linux/cpu.h>
36
37#include "pf_in.h"
38
39#define NAME "mmiotrace: "
40
41struct trap_reason {
42 unsigned long addr;
43 unsigned long ip;
44 enum reason_type type;
45 int active_traces;
46};
47
48struct remap_trace {
49 struct list_head list;
50 struct kmmio_probe probe;
51 resource_size_t phys;
52 unsigned long id;
53};
54
55/* Accessed per-cpu. */
56static DEFINE_PER_CPU(struct trap_reason, pf_reason);
57static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
58
59#if 0 /* XXX: no way to gather this info anymore */
60/* Access to this is not per-cpu. */
61static DEFINE_PER_CPU(atomic_t, dropped);
62#endif
63
64static struct dentry *marker_file;
65
66static DEFINE_MUTEX(mmiotrace_mutex);
67static DEFINE_SPINLOCK(trace_lock);
68static atomic_t mmiotrace_enabled;
69static LIST_HEAD(trace_list); /* struct remap_trace */
70
71/*
72 * Locking in this file:
73 * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections.
74 * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex
75 * and trace_lock.
76 * - Routines depending on is_enabled() must take trace_lock.
77 * - trace_list users must hold trace_lock.
78 * - is_enabled() guarantees that mmio_trace_record is allowed.
79 * - pre/post callbacks assume the effect of is_enabled() being true.
80 */
81
82/* module parameters */
83static unsigned long filter_offset;
84static int nommiotrace;
85static int trace_pc;
86
87module_param(filter_offset, ulong, 0);
88module_param(nommiotrace, bool, 0);
89module_param(trace_pc, bool, 0);
90
91MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
92MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
93MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions.");
94
95static bool is_enabled(void)
96{
97 return atomic_read(&mmiotrace_enabled);
98}
99
100#if 0 /* XXX: needs rewrite */
101/*
102 * Write callback for the debugfs entry:
103 * Read a marker and write it to the mmio trace log
104 */
105static ssize_t write_marker(struct file *file, const char __user *buffer,
106 size_t count, loff_t *ppos)
107{
108 char *event = NULL;
109 struct mm_io_header *headp;
110 ssize_t len = (count > 65535) ? 65535 : count;
111
112 event = kzalloc(sizeof(*headp) + len, GFP_KERNEL);
113 if (!event)
114 return -ENOMEM;
115
116 headp = (struct mm_io_header *)event;
117 headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT);
118 headp->data_len = len;
119
120 if (copy_from_user(event + sizeof(*headp), buffer, len)) {
121 kfree(event);
122 return -EFAULT;
123 }
124
125 spin_lock_irq(&trace_lock);
126#if 0 /* XXX: convert this to use tracing */
127 if (is_enabled())
128 relay_write(chan, event, sizeof(*headp) + len);
129 else
130#endif
131 len = -EINVAL;
132 spin_unlock_irq(&trace_lock);
133 kfree(event);
134 return len;
135}
136#endif
137
138static void print_pte(unsigned long address)
139{
140 unsigned int level;
141 pte_t *pte = lookup_address(address, &level);
142
143 if (!pte) {
144 pr_err(NAME "Error in %s: no pte for page 0x%08lx\n",
145 __func__, address);
146 return;
147 }
148
149 if (level == PG_LEVEL_2M) {
 150 pr_emerg(NAME "2M/4M pages are not currently supported: "
151 "0x%08lx\n", address);
152 BUG();
153 }
154 pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address,
155 (unsigned long long)pte_val(*pte),
156 (unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
157}
158
159/*
160 * For some reason the pre/post pairs have been called in an
161 * unmatched order. Report and die.
162 */
163static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
164{
165 const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
166 pr_emerg(NAME "unexpected fault for address: 0x%08lx, "
167 "last fault for address: 0x%08lx\n",
168 addr, my_reason->addr);
169 print_pte(addr);
170 print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
171 print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip);
172#ifdef __i386__
173 pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
174 regs->ax, regs->bx, regs->cx, regs->dx);
175 pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
176 regs->si, regs->di, regs->bp, regs->sp);
177#else
178 pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n",
179 regs->ax, regs->cx, regs->dx);
180 pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n",
181 regs->si, regs->di, regs->bp, regs->sp);
182#endif
183 put_cpu_var(pf_reason);
184 BUG();
185}
186
187static void pre(struct kmmio_probe *p, struct pt_regs *regs,
188 unsigned long addr)
189{
190 struct trap_reason *my_reason = &get_cpu_var(pf_reason);
191 struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
192 const unsigned long instptr = instruction_pointer(regs);
193 const enum reason_type type = get_ins_type(instptr);
194 struct remap_trace *trace = p->private;
195
196 /* it doesn't make sense to have more than one active trace per cpu */
197 if (my_reason->active_traces)
198 die_kmmio_nesting_error(regs, addr);
199 else
200 my_reason->active_traces++;
201
202 my_reason->type = type;
203 my_reason->addr = addr;
204 my_reason->ip = instptr;
205
206 my_trace->phys = addr - trace->probe.addr + trace->phys;
207 my_trace->map_id = trace->id;
208
209 /*
210 * Only record the program counter when requested.
211 * It may taint clean-room reverse engineering.
212 */
213 if (trace_pc)
214 my_trace->pc = instptr;
215 else
216 my_trace->pc = 0;
217
218 /*
219 * XXX: the timestamp recorded will be *after* the tracing has been
220 * done, not at the time we hit the instruction. SMP implications
221 * on event ordering?
222 */
223
224 switch (type) {
225 case REG_READ:
226 my_trace->opcode = MMIO_READ;
227 my_trace->width = get_ins_mem_width(instptr);
228 break;
229 case REG_WRITE:
230 my_trace->opcode = MMIO_WRITE;
231 my_trace->width = get_ins_mem_width(instptr);
232 my_trace->value = get_ins_reg_val(instptr, regs);
233 break;
234 case IMM_WRITE:
235 my_trace->opcode = MMIO_WRITE;
236 my_trace->width = get_ins_mem_width(instptr);
237 my_trace->value = get_ins_imm_val(instptr);
238 break;
239 default:
240 {
241 unsigned char *ip = (unsigned char *)instptr;
242 my_trace->opcode = MMIO_UNKNOWN_OP;
243 my_trace->width = 0;
244 my_trace->value = (*ip) << 16 | *(ip + 1) << 8 |
245 *(ip + 2);
246 }
247 }
248 put_cpu_var(cpu_trace);
249 put_cpu_var(pf_reason);
250}
251
252static void post(struct kmmio_probe *p, unsigned long condition,
253 struct pt_regs *regs)
254{
255 struct trap_reason *my_reason = &get_cpu_var(pf_reason);
256 struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
257
258 /* this should always return the active_trace count to 0 */
259 my_reason->active_traces--;
260 if (my_reason->active_traces) {
261 pr_emerg(NAME "unexpected post handler");
262 BUG();
263 }
264
265 switch (my_reason->type) {
266 case REG_READ:
267 my_trace->value = get_ins_reg_val(my_reason->ip, regs);
268 break;
269 default:
270 break;
271 }
272
273 mmio_trace_rw(my_trace);
274 put_cpu_var(cpu_trace);
275 put_cpu_var(pf_reason);
276}
277
278static void ioremap_trace_core(resource_size_t offset, unsigned long size,
279 void __iomem *addr)
280{
281 static atomic_t next_id;
282 struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
283 /* These are page-unaligned. */
284 struct mmiotrace_map map = {
285 .phys = offset,
286 .virt = (unsigned long)addr,
287 .len = size,
288 .opcode = MMIO_PROBE
289 };
290
291 if (!trace) {
292 pr_err(NAME "kmalloc failed in ioremap\n");
293 return;
294 }
295
296 *trace = (struct remap_trace) {
297 .probe = {
298 .addr = (unsigned long)addr,
299 .len = size,
300 .pre_handler = pre,
301 .post_handler = post,
302 .private = trace
303 },
304 .phys = offset,
305 .id = atomic_inc_return(&next_id)
306 };
307 map.map_id = trace->id;
308
309 spin_lock_irq(&trace_lock);
310 if (!is_enabled())
311 goto not_enabled;
312
313 mmio_trace_mapping(&map);
314 list_add_tail(&trace->list, &trace_list);
315 if (!nommiotrace)
316 register_kmmio_probe(&trace->probe);
317
318not_enabled:
319 spin_unlock_irq(&trace_lock);
320}
321
322void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
323 void __iomem *addr)
324{
325 if (!is_enabled()) /* recheck and proper locking in *_core() */
326 return;
327
328 pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n",
329 (unsigned long long)offset, size, addr);
330 if ((filter_offset) && (offset != filter_offset))
331 return;
332 ioremap_trace_core(offset, size, addr);
333}
334
335static void iounmap_trace_core(volatile void __iomem *addr)
336{
337 struct mmiotrace_map map = {
338 .phys = 0,
339 .virt = (unsigned long)addr,
340 .len = 0,
341 .opcode = MMIO_UNPROBE
342 };
343 struct remap_trace *trace;
344 struct remap_trace *tmp;
345 struct remap_trace *found_trace = NULL;
346
347 pr_debug(NAME "Unmapping %p.\n", addr);
348
349 spin_lock_irq(&trace_lock);
350 if (!is_enabled())
351 goto not_enabled;
352
353 list_for_each_entry_safe(trace, tmp, &trace_list, list) {
354 if ((unsigned long)addr == trace->probe.addr) {
355 if (!nommiotrace)
356 unregister_kmmio_probe(&trace->probe);
357 list_del(&trace->list);
358 found_trace = trace;
359 break;
360 }
361 }
362 map.map_id = (found_trace) ? found_trace->id : -1;
363 mmio_trace_mapping(&map);
364
365not_enabled:
366 spin_unlock_irq(&trace_lock);
367 if (found_trace) {
368 synchronize_rcu(); /* unregister_kmmio_probe() requirement */
369 kfree(found_trace);
370 }
371}
372
373void mmiotrace_iounmap(volatile void __iomem *addr)
374{
375 might_sleep();
376 if (is_enabled()) /* recheck and proper locking in *_core() */
377 iounmap_trace_core(addr);
378}
379
380static void clear_trace_list(void)
381{
382 struct remap_trace *trace;
383 struct remap_trace *tmp;
384
385 /*
386 * No locking required, because the caller ensures we are in a
387 * critical section via mutex, and is_enabled() is false,
388 * i.e. nothing can traverse or modify this list.
389 * Caller also ensures is_enabled() cannot change.
390 */
391 list_for_each_entry(trace, &trace_list, list) {
392 pr_notice(NAME "purging non-iounmapped "
393 "trace @0x%08lx, size 0x%lx.\n",
394 trace->probe.addr, trace->probe.len);
395 if (!nommiotrace)
396 unregister_kmmio_probe(&trace->probe);
397 }
398 synchronize_rcu(); /* unregister_kmmio_probe() requirement */
399
400 list_for_each_entry_safe(trace, tmp, &trace_list, list) {
401 list_del(&trace->list);
402 kfree(trace);
403 }
404}
405
406#ifdef CONFIG_HOTPLUG_CPU
407static cpumask_t downed_cpus;
408
409static void enter_uniprocessor(void)
410{
411 int cpu;
412 int err;
413
414 get_online_cpus();
415 downed_cpus = cpu_online_map;
416 cpu_clear(first_cpu(cpu_online_map), downed_cpus);
417 if (num_online_cpus() > 1)
418 pr_notice(NAME "Disabling non-boot CPUs...\n");
419 put_online_cpus();
420
421 for_each_cpu_mask(cpu, downed_cpus) {
422 err = cpu_down(cpu);
423 if (!err)
424 pr_info(NAME "CPU%d is down.\n", cpu);
425 else
426 pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err);
427 }
428 if (num_online_cpus() > 1)
429 pr_warning(NAME "multiple CPUs still online, "
430 "may miss events.\n");
431}
432
433/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit,
 434 but this whole function is under #ifdef CONFIG_HOTPLUG_CPU */
435static void __ref leave_uniprocessor(void)
436{
437 int cpu;
438 int err;
439
440 if (cpus_weight(downed_cpus) == 0)
441 return;
442 pr_notice(NAME "Re-enabling CPUs...\n");
443 for_each_cpu_mask(cpu, downed_cpus) {
444 err = cpu_up(cpu);
445 if (!err)
446 pr_info(NAME "enabled CPU%d.\n", cpu);
447 else
448 pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err);
449 }
450}
451
452#else /* !CONFIG_HOTPLUG_CPU */
453static void enter_uniprocessor(void)
454{
455 if (num_online_cpus() > 1)
456 pr_warning(NAME "multiple CPUs are online, may miss events. "
457 "Suggest booting with maxcpus=1 kernel argument.\n");
458}
459
460static void leave_uniprocessor(void)
461{
462}
463#endif
464
465#if 0 /* XXX: out of order */
466static struct file_operations fops_marker = {
467 .owner = THIS_MODULE,
468 .write = write_marker
469};
470#endif
471
472void enable_mmiotrace(void)
473{
474 mutex_lock(&mmiotrace_mutex);
475 if (is_enabled())
476 goto out;
477
478#if 0 /* XXX: tracing does not support text entries */
479 marker_file = debugfs_create_file("marker", 0660, dir, NULL,
480 &fops_marker);
481 if (!marker_file)
482 pr_err(NAME "marker file creation failed.\n");
483#endif
484
485 if (nommiotrace)
486 pr_info(NAME "MMIO tracing disabled.\n");
487 enter_uniprocessor();
488 spin_lock_irq(&trace_lock);
489 atomic_inc(&mmiotrace_enabled);
490 spin_unlock_irq(&trace_lock);
491 pr_info(NAME "enabled.\n");
492out:
493 mutex_unlock(&mmiotrace_mutex);
494}
495
496void disable_mmiotrace(void)
497{
498 mutex_lock(&mmiotrace_mutex);
499 if (!is_enabled())
500 goto out;
501
502 spin_lock_irq(&trace_lock);
503 atomic_dec(&mmiotrace_enabled);
504 BUG_ON(is_enabled());
505 spin_unlock_irq(&trace_lock);
506
507 clear_trace_list(); /* guarantees: no more kmmio callbacks */
508 leave_uniprocessor();
509 if (marker_file) {
510 debugfs_remove(marker_file);
511 marker_file = NULL;
512 }
513
514 pr_info(NAME "disabled.\n");
515out:
516 mutex_unlock(&mmiotrace_mutex);
517}
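The "Locking in this file" comment near the top of mmio-mod.c prescribes the pattern that ioremap_trace_core() and iounmap_trace_core() follow: take trace_lock, re-check is_enabled() under the lock, and only then emit trace data or touch trace_list. A condensed sketch of that pattern, written as a hypothetical new routine inside this same file (trace_lock, is_enabled() and mmio_trace_mapping() are the existing statics and APIs shown above):

/* Hypothetical addition to mmio-mod.c: emit one mapping record safely. */
static void trace_one_mapping_sketch(struct mmiotrace_map *map)
{
	spin_lock_irq(&trace_lock);
	if (!is_enabled())	/* may have raced with disable_mmiotrace() */
		goto out;
	mmio_trace_mapping(map);	/* safe: enabled, trace_lock held */
out:
	spin_unlock_irq(&trace_lock);
}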
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index c5066d519e5d..cebcbf152d46 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,37 +20,18 @@
20#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h> 21#include <asm/k8.h>
22 22
23#ifndef Dprintk
24#define Dprintk(x...)
25#endif
26
27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
28EXPORT_SYMBOL(node_data); 24EXPORT_SYMBOL(node_data);
29 25
30bootmem_data_t plat_node_bdata[MAX_NUMNODES];
31
32struct memnode memnode; 26struct memnode memnode;
33 27
34#ifdef CONFIG_SMP
35int x86_cpu_to_node_map_init[NR_CPUS] = {
36 [0 ... NR_CPUS-1] = NUMA_NO_NODE
37};
38void *x86_cpu_to_node_map_early_ptr;
39EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
40#endif
41DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
42EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
43
44s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 28s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
45 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 29 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
46}; 30};
47 31
48cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
49EXPORT_SYMBOL(node_to_cpumask_map);
50
51int numa_off __initdata; 32int numa_off __initdata;
52unsigned long __initdata nodemap_addr; 33static unsigned long __initdata nodemap_addr;
53unsigned long __initdata nodemap_size; 34static unsigned long __initdata nodemap_size;
54 35
55/* 36/*
56 * Given a shift value, try to populate memnodemap[] 37 * Given a shift value, try to populate memnodemap[]
@@ -98,8 +79,8 @@ static int __init allocate_cachealigned_memnodemap(void)
98 return 0; 79 return 0;
99 80
100 addr = 0x8000; 81 addr = 0x8000;
101 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 82 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
102 nodemap_addr = find_e820_area(addr, end_pfn<<PAGE_SHIFT, 83 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
103 nodemap_size, L1_CACHE_BYTES); 84 nodemap_size, L1_CACHE_BYTES);
104 if (nodemap_addr == -1UL) { 85 if (nodemap_addr == -1UL) {
105 printk(KERN_ERR 86 printk(KERN_ERR
@@ -192,19 +173,19 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
192void __init setup_node_bootmem(int nodeid, unsigned long start, 173void __init setup_node_bootmem(int nodeid, unsigned long start,
193 unsigned long end) 174 unsigned long end)
194{ 175{
195 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size; 176 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
196 unsigned long bootmap_start, nodedata_phys; 177 unsigned long bootmap_start, nodedata_phys;
197 void *bootmap; 178 void *bootmap;
198 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 179 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
199 int nid; 180 int nid;
200 181
201 start = round_up(start, ZONE_ALIGN); 182 start = roundup(start, ZONE_ALIGN);
202 183
203 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, 184 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
204 start, end); 185 start, end);
205 186
206 start_pfn = start >> PAGE_SHIFT; 187 start_pfn = start >> PAGE_SHIFT;
207 end_pfn = end >> PAGE_SHIFT; 188 last_pfn = end >> PAGE_SHIFT;
208 189
209 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, 190 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
210 SMP_CACHE_BYTES); 191 SMP_CACHE_BYTES);
@@ -215,9 +196,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
215 nodedata_phys + pgdat_size - 1); 196 nodedata_phys + pgdat_size - 1);
216 197
217 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 198 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
218 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 199 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
219 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 200 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
220 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; 201 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
221 202
222 /* 203 /*
223 * Find a place for the bootmem map 204 * Find a place for the bootmem map
@@ -226,14 +207,14 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
226 * early_node_mem will get that with find_e820_area instead 207 * early_node_mem will get that with find_e820_area instead
227 * of alloc_bootmem, that could clash with reserved range 208 * of alloc_bootmem, that could clash with reserved range
228 */ 209 */
229 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 210 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
230 nid = phys_to_nid(nodedata_phys); 211 nid = phys_to_nid(nodedata_phys);
231 if (nid == nodeid) 212 if (nid == nodeid)
232 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 213 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
233 else 214 else
234 bootmap_start = round_up(start, PAGE_SIZE); 215 bootmap_start = roundup(start, PAGE_SIZE);
235 /* 216 /*
236 * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like 217 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
237 * to use that to align to PAGE_SIZE 218 * to use that to align to PAGE_SIZE
238 */ 219 */
239 bootmap = early_node_mem(nodeid, bootmap_start, end, 220 bootmap = early_node_mem(nodeid, bootmap_start, end,
@@ -248,7 +229,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
248 229
249 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 230 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
250 bootmap_start >> PAGE_SHIFT, 231 bootmap_start >> PAGE_SHIFT,
251 start_pfn, end_pfn); 232 start_pfn, last_pfn);
252 233
253 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", 234 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
254 bootmap_start, bootmap_start + bootmap_size - 1, 235 bootmap_start, bootmap_start + bootmap_size - 1,
@@ -309,7 +290,7 @@ void __init numa_init_array(void)
309 290
310#ifdef CONFIG_NUMA_EMU 291#ifdef CONFIG_NUMA_EMU
311/* Numa emulation */ 292/* Numa emulation */
312char *cmdline __initdata; 293static char *cmdline __initdata;
313 294
314/* 295/*
315 * Sets up nid to range from addr to addr + size. If the end 296
@@ -413,15 +394,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
413} 394}
414 395
415/* 396/*
416 * Sets up the system RAM area from start_pfn to end_pfn according to the 397 * Sets up the system RAM area from start_pfn to last_pfn according to the
417 * numa=fake command-line option. 398 * numa=fake command-line option.
418 */ 399 */
419static struct bootnode nodes[MAX_NUMNODES] __initdata; 400static struct bootnode nodes[MAX_NUMNODES] __initdata;
420 401
421static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 402static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
422{ 403{
423 u64 size, addr = start_pfn << PAGE_SHIFT; 404 u64 size, addr = start_pfn << PAGE_SHIFT;
424 u64 max_addr = end_pfn << PAGE_SHIFT; 405 u64 max_addr = last_pfn << PAGE_SHIFT;
425 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; 406 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
426 407
427 memset(&nodes, 0, sizeof(nodes)); 408 memset(&nodes, 0, sizeof(nodes));
@@ -527,7 +508,7 @@ out:
527} 508}
528#endif /* CONFIG_NUMA_EMU */ 509#endif /* CONFIG_NUMA_EMU */
529 510
530void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 511void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
531{ 512{
532 int i; 513 int i;
533 514
@@ -535,7 +516,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
535 nodes_clear(node_online_map); 516 nodes_clear(node_online_map);
536 517
537#ifdef CONFIG_NUMA_EMU 518#ifdef CONFIG_NUMA_EMU
538 if (cmdline && !numa_emulation(start_pfn, end_pfn)) 519 if (cmdline && !numa_emulation(start_pfn, last_pfn))
539 return; 520 return;
540 nodes_clear(node_possible_map); 521 nodes_clear(node_possible_map);
541 nodes_clear(node_online_map); 522 nodes_clear(node_online_map);
@@ -543,7 +524,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
543 524
544#ifdef CONFIG_ACPI_NUMA 525#ifdef CONFIG_ACPI_NUMA
545 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 526 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
546 end_pfn << PAGE_SHIFT)) 527 last_pfn << PAGE_SHIFT))
547 return; 528 return;
548 nodes_clear(node_possible_map); 529 nodes_clear(node_possible_map);
549 nodes_clear(node_online_map); 530 nodes_clear(node_online_map);
@@ -551,7 +532,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
551 532
552#ifdef CONFIG_K8_NUMA 533#ifdef CONFIG_K8_NUMA
553 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, 534 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
554 end_pfn<<PAGE_SHIFT)) 535 last_pfn<<PAGE_SHIFT))
555 return; 536 return;
556 nodes_clear(node_possible_map); 537 nodes_clear(node_possible_map);
557 nodes_clear(node_online_map); 538 nodes_clear(node_online_map);
@@ -561,7 +542,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
561 542
562 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 543 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
563 start_pfn << PAGE_SHIFT, 544 start_pfn << PAGE_SHIFT,
564 end_pfn << PAGE_SHIFT); 545 last_pfn << PAGE_SHIFT);
565 /* setup dummy node covering all memory */ 546 /* setup dummy node covering all memory */
566 memnode_shift = 63; 547 memnode_shift = 63;
567 memnodemap = memnode.embedded_map; 548 memnodemap = memnode.embedded_map;
@@ -570,29 +551,8 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
570 node_set(0, node_possible_map); 551 node_set(0, node_possible_map);
571 for (i = 0; i < NR_CPUS; i++) 552 for (i = 0; i < NR_CPUS; i++)
572 numa_set_node(i, 0); 553 numa_set_node(i, 0);
573 /* cpumask_of_cpu() may not be available during early startup */ 554 e820_register_active_regions(0, start_pfn, last_pfn);
574 memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0])); 555 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
575 cpu_set(0, node_to_cpumask_map[0]);
576 e820_register_active_regions(0, start_pfn, end_pfn);
577 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
578}
579
580__cpuinit void numa_add_cpu(int cpu)
581{
582 set_bit(cpu,
583 (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
584}
585
586void __cpuinit numa_set_node(int cpu, int node)
587{
588 int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
589
590 if(cpu_to_node_map)
591 cpu_to_node_map[cpu] = node;
592 else if(per_cpu_offset(cpu))
593 per_cpu(x86_cpu_to_node_map, cpu) = node;
594 else
595 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
596} 556}
597 557
598unsigned long __init numa_free_all_bootmem(void) 558unsigned long __init numa_free_all_bootmem(void)
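
The fallback path above relies on a small arithmetic trick: with memnode_shift set to 63, every sane physical address shifts down to index 0, so the single embedded map entry (node 0) answers all lookups. A tiny sketch of just that arithmetic, nothing kernel-specific:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Sketch: with a shift of 63, every physical address below 2^63
         * indexes slot 0 of the map, i.e. the single fake node. */
        uint64_t addrs[] = { 0x1000ULL, 0xffffffffULL, 0x123456789abcULL };

        for (int i = 0; i < 3; i++)
                printf("%#llx >> 63 = %llu\n",
                       (unsigned long long)addrs[i],
                       (unsigned long long)(addrs[i] >> 63));
        return 0;
}
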
@@ -613,7 +573,7 @@ void __init paging_init(void)
613 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 573 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
614 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 574 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
615 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 575 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
616 max_zone_pfns[ZONE_NORMAL] = end_pfn; 576 max_zone_pfns[ZONE_NORMAL] = max_pfn;
617 577
618 sparse_memory_present_with_active_regions(MAX_NUMNODES); 578 sparse_memory_present_with_active_regions(MAX_NUMNODES);
619 sparse_init(); 579 sparse_init();
@@ -641,6 +601,7 @@ static __init int numa_setup(char *opt)
641} 601}
642early_param("numa", numa_setup); 602early_param("numa", numa_setup);
643 603
604#ifdef CONFIG_NUMA
644/* 605/*
645 * Setup early cpu_to_node. 606 * Setup early cpu_to_node.
646 * 607 *
@@ -652,14 +613,19 @@ early_param("numa", numa_setup);
652 * is already initialized in a round robin manner at numa_init_array, 613 * is already initialized in a round robin manner at numa_init_array,
653 * prior to this call, and this initialization is good enough 614 * prior to this call, and this initialization is good enough
654 * for the fake NUMA cases. 615 * for the fake NUMA cases.
616 *
617 * Called before the per_cpu areas are set up.
655 */ 618 */
656void __init init_cpu_to_node(void) 619void __init init_cpu_to_node(void)
657{ 620{
658 int i; 621 int cpu;
622 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
659 623
660 for (i = 0; i < NR_CPUS; i++) { 624 BUG_ON(cpu_to_apicid == NULL);
625
626 for_each_possible_cpu(cpu) {
661 int node; 627 int node;
662 u16 apicid = x86_cpu_to_apicid_init[i]; 628 u16 apicid = cpu_to_apicid[cpu];
663 629
664 if (apicid == BAD_APICID) 630 if (apicid == BAD_APICID)
665 continue; 631 continue;
@@ -668,8 +634,9 @@ void __init init_cpu_to_node(void)
668 continue; 634 continue;
669 if (!node_online(node)) 635 if (!node_online(node))
670 continue; 636 continue;
671 numa_set_node(i, node); 637 numa_set_node(cpu, node);
672 } 638 }
673} 639}
640#endif
674 641
675 642
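
The new init_cpu_to_node() walks cpu -> apicid -> node and skips entries that are not yet known. A self-contained sketch of that walk with invented table sizes and contents (the constants below are illustrative, not the kernel's):

#include <stdio.h>
#include <stdint.h>

#define SK_NR_CPUS      4
#define SK_MAX_APIC     8
#define SK_BAD_APICID   0xffffU
#define SK_NO_NODE      (-1)

int main(void)
{
        uint16_t cpu_to_apicid[SK_NR_CPUS] = { 0, 2, 4, SK_BAD_APICID };
        int16_t  apicid_to_node[SK_MAX_APIC] =
                { 0, SK_NO_NODE, 0, SK_NO_NODE, 1, SK_NO_NODE, 1, SK_NO_NODE };

        for (int cpu = 0; cpu < SK_NR_CPUS; cpu++) {
                uint16_t apicid = cpu_to_apicid[cpu];

                if (apicid == SK_BAD_APICID)
                        continue;               /* cpu not present yet */
                if (apicid_to_node[apicid] == SK_NO_NODE)
                        continue;               /* no node info for it */
                printf("cpu %d -> node %d\n", cpu, apicid_to_node[apicid]);
        }
        return 0;
}
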
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 75f1b109aae8..e1d106909218 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * self test for change_page_attr. 2 * self test for change_page_attr.
3 * 3 *
4 * Clears the global bit on random pages in the direct mapping, then reverts 4 * Clears a test pte bit on random pages in the direct mapping,
5 * and compares page tables forwards and afterwards. 5 * then reverts and compares page tables forwards and afterwards.
6 */ 6 */
7#include <linux/bootmem.h> 7#include <linux/bootmem.h>
8#include <linux/kthread.h> 8#include <linux/kthread.h>
@@ -32,6 +32,13 @@ enum {
32 GPS = (1<<30) 32 GPS = (1<<30)
33}; 33};
34 34
35#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST)
36
37static int pte_testbit(pte_t pte)
38{
39 return pte_flags(pte) & _PAGE_UNUSED1;
40}
41
35struct split_state { 42struct split_state {
36 long lpg, gpg, spg, exec; 43 long lpg, gpg, spg, exec;
37 long min_exec, max_exec; 44 long min_exec, max_exec;
@@ -111,6 +118,7 @@ static int pageattr_test(void)
111 unsigned int level; 118 unsigned int level;
112 int i, k; 119 int i, k;
113 int err; 120 int err;
121 unsigned long test_addr;
114 122
115 if (print) 123 if (print)
116 printk(KERN_INFO "CPA self-test:\n"); 124 printk(KERN_INFO "CPA self-test:\n");
@@ -165,15 +173,15 @@ static int pageattr_test(void)
165 continue; 173 continue;
166 } 174 }
167 175
168 err = change_page_attr_clear(addr[i], len[i], 176 test_addr = addr[i];
169 __pgprot(_PAGE_GLOBAL)); 177 err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0);
170 if (err < 0) { 178 if (err < 0) {
171 printk(KERN_ERR "CPA %d failed %d\n", i, err); 179 printk(KERN_ERR "CPA %d failed %d\n", i, err);
172 failed++; 180 failed++;
173 } 181 }
174 182
175 pte = lookup_address(addr[i], &level); 183 pte = lookup_address(addr[i], &level);
176 if (!pte || pte_global(*pte) || pte_huge(*pte)) { 184 if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) {
177 printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i], 185 printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
178 pte ? (u64)pte_val(*pte) : 0ULL); 186 pte ? (u64)pte_val(*pte) : 0ULL);
179 failed++; 187 failed++;
@@ -198,14 +206,14 @@ static int pageattr_test(void)
198 failed++; 206 failed++;
199 continue; 207 continue;
200 } 208 }
201 err = change_page_attr_set(addr[i], len[i], 209 test_addr = addr[i];
202 __pgprot(_PAGE_GLOBAL)); 210 err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
203 if (err < 0) { 211 if (err < 0) {
204 printk(KERN_ERR "CPA reverting failed: %d\n", err); 212 printk(KERN_ERR "CPA reverting failed: %d\n", err);
205 failed++; 213 failed++;
206 } 214 }
207 pte = lookup_address(addr[i], &level); 215 pte = lookup_address(addr[i], &level);
208 if (!pte || !pte_global(*pte)) { 216 if (!pte || pte_testbit(*pte)) {
209 printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n", 217 printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
210 addr[i], pte ? (u64)pte_val(*pte) : 0ULL); 218 addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
211 failed++; 219 failed++;
@@ -216,8 +224,7 @@ static int pageattr_test(void)
216 failed += print_split(&sc); 224 failed += print_split(&sc);
217 225
218 if (failed) { 226 if (failed) {
219 printk(KERN_ERR "NOT PASSED. Please report.\n"); 227 WARN(1, KERN_ERR "NOT PASSED. Please report.\n");
220 WARN_ON(1);
221 return -EINVAL; 228 return -EINVAL;
222 } else { 229 } else {
223 if (print) 230 if (print)
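
The reworked self-test follows a set/verify/revert/verify cycle on a spare pte bit. A simplified model of that cycle, using a plain flag word in place of a pte and bit 9 standing in for _PAGE_UNUSED1:

#include <stdio.h>
#include <stdint.h>

#define TEST_BIT (1u << 9)
#define NPAGES   8

int main(void)
{
        uint32_t pte[NPAGES] = { 0 };
        int failed = 0;

        for (int i = 0; i < NPAGES; i++) {
                pte[i] |= TEST_BIT;             /* "change_page_attr_set" */
                if (!(pte[i] & TEST_BIT))
                        failed++;               /* bad pte after set */

                pte[i] &= ~TEST_BIT;            /* "change_page_attr_clear" */
                if (pte[i] & TEST_BIT)
                        failed++;               /* bad pte after revert */
        }
        printf(failed ? "NOT PASSED\n" : "ok\n");
        return 0;
}
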
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 60bcb5b6a37e..a9ec89c3fbca 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -25,15 +25,68 @@
25 * The current flushing context - we pass it instead of 5 arguments: 25 * The current flushing context - we pass it instead of 5 arguments:
26 */ 26 */
27struct cpa_data { 27struct cpa_data {
28 unsigned long vaddr; 28 unsigned long *vaddr;
29 pgprot_t mask_set; 29 pgprot_t mask_set;
30 pgprot_t mask_clr; 30 pgprot_t mask_clr;
31 int numpages; 31 int numpages;
32 int flushtlb; 32 int flags;
33 unsigned long pfn; 33 unsigned long pfn;
34 unsigned force_split : 1; 34 unsigned force_split : 1;
35 int curpage;
35}; 36};
36 37
38/*
39 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
40 * using cpa_lock, so that we don't allow any other cpu with stale large tlb
41 * entries to change the page attribute in parallel while some other cpu is
42 * splitting a large page entry and changing the attribute.
43 */
44static DEFINE_SPINLOCK(cpa_lock);
45
46#define CPA_FLUSHTLB 1
47#define CPA_ARRAY 2
48
49#ifdef CONFIG_PROC_FS
50static unsigned long direct_pages_count[PG_LEVEL_NUM];
51
52void update_page_count(int level, unsigned long pages)
53{
54 unsigned long flags;
55
56 /* Protect against CPA */
57 spin_lock_irqsave(&pgd_lock, flags);
58 direct_pages_count[level] += pages;
59 spin_unlock_irqrestore(&pgd_lock, flags);
60}
61
62static void split_page_count(int level)
63{
64 direct_pages_count[level]--;
65 direct_pages_count[level - 1] += PTRS_PER_PTE;
66}
67
68int arch_report_meminfo(char *page)
69{
70 int n = sprintf(page, "DirectMap4k: %8lu kB\n",
71 direct_pages_count[PG_LEVEL_4K] << 2);
72#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
73 n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
74 direct_pages_count[PG_LEVEL_2M] << 11);
75#else
76 n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
77 direct_pages_count[PG_LEVEL_2M] << 12);
78#endif
79#ifdef CONFIG_X86_64
80 if (direct_gbpages)
81 n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
82 direct_pages_count[PG_LEVEL_1G] << 20);
83#endif
84 return n;
85}
86#else
87static inline void split_page_count(int level) { }
88#endif
89
37#ifdef CONFIG_X86_64 90#ifdef CONFIG_X86_64
38 91
39static inline unsigned long highmap_start_pfn(void) 92static inline unsigned long highmap_start_pfn(void)
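
The shifts in arch_report_meminfo() convert page counts into kB for the given mapping size: 4k pages << 2, 2M pages << 11 (4M pages << 12 on non-PAE 32-bit), 1G pages << 20. A quick standalone check of that arithmetic with made-up counts:

#include <stdio.h>

int main(void)
{
        unsigned long n4k = 1000, n2m = 12, n1g = 1;

        /* pages -> kB: 4kB = 1<<2 kB, 2MB = 1<<11 kB, 1GB = 1<<20 kB */
        printf("DirectMap4k: %8lu kB\n", n4k << 2);
        printf("DirectMap2M: %8lu kB\n", n2m << 11);
        printf("DirectMap1G: %8lu kB\n", n1g << 20);
        return 0;
}
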
@@ -43,7 +96,7 @@ static inline unsigned long highmap_start_pfn(void)
43 96
44static inline unsigned long highmap_end_pfn(void) 97static inline unsigned long highmap_end_pfn(void)
45{ 98{
46 return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; 99 return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
47} 100}
48 101
49#endif 102#endif
@@ -106,7 +159,7 @@ static void cpa_flush_all(unsigned long cache)
106{ 159{
107 BUG_ON(irqs_disabled()); 160 BUG_ON(irqs_disabled());
108 161
109 on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); 162 on_each_cpu(__cpa_flush_all, (void *) cache, 1);
110} 163}
111 164
112static void __cpa_flush_range(void *arg) 165static void __cpa_flush_range(void *arg)
@@ -127,7 +180,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
127 BUG_ON(irqs_disabled()); 180 BUG_ON(irqs_disabled());
128 WARN_ON(PAGE_ALIGN(start) != start); 181 WARN_ON(PAGE_ALIGN(start) != start);
129 182
130 on_each_cpu(__cpa_flush_range, NULL, 1, 1); 183 on_each_cpu(__cpa_flush_range, NULL, 1);
131 184
132 if (!cache) 185 if (!cache)
133 return; 186 return;
@@ -149,6 +202,41 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
149 } 202 }
150} 203}
151 204
205static void cpa_flush_array(unsigned long *start, int numpages, int cache)
206{
207 unsigned int i, level;
208 unsigned long *addr;
209
210 BUG_ON(irqs_disabled());
211
212 on_each_cpu(__cpa_flush_range, NULL, 1);
213
214 if (!cache)
215 return;
216
217 /* 4M threshold */
218 if (numpages >= 1024) {
219 if (boot_cpu_data.x86_model >= 4)
220 wbinvd();
221 return;
222 }
223 /*
224 * We only need to flush on one CPU,
225 * clflush is a MESI-coherent instruction that
226 * will cause all other CPUs to flush the same
227 * cachelines:
228 */
229 for (i = 0, addr = start; i < numpages; i++, addr++) {
230 pte_t *pte = lookup_address(*addr, &level);
231
232 /*
233 * Only flush present addresses:
234 */
235 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
236 clflush_cache_range((void *) *addr, PAGE_SIZE);
237 }
238}
239
152/* 240/*
153 * Certain areas of memory on x86 require very specific protection flags, 241 * Certain areas of memory on x86 require very specific protection flags,
154 * for example the BIOS area or kernel text. Callers don't always get this 242 * for example the BIOS area or kernel text. Callers don't always get this
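
cpa_flush_array() above picks between per-page clflush and a full wbinvd once the array reaches 1024 pages (4MB of 4k pages). A sketch of just that threshold decision, with the actual flush reduced to a label:

#include <stdio.h>

/* Per-line clflush for small ranges, full cache writeback once
 * the range reaches the 4M threshold (sketch only). */
static const char *flush_strategy(int numpages)
{
        return (numpages >= 1024) ? "wbinvd" : "clflush each page";
}

int main(void)
{
        int sizes[] = { 1, 256, 1023, 1024, 4096 };

        for (int i = 0; i < 5; i++)
                printf("%4d pages -> %s\n", sizes[i], flush_strategy(sizes[i]));
        return 0;
}
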
@@ -227,6 +315,7 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
227 315
228 return pte_offset_kernel(pmd, address); 316 return pte_offset_kernel(pmd, address);
229} 317}
318EXPORT_SYMBOL_GPL(lookup_address);
230 319
231/* 320/*
232 * Set the new pmd in all the pgds we know about: 321 * Set the new pmd in all the pgds we know about:
@@ -356,7 +445,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
356 */ 445 */
357 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); 446 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
358 __set_pmd_pte(kpte, address, new_pte); 447 __set_pmd_pte(kpte, address, new_pte);
359 cpa->flushtlb = 1; 448 cpa->flags |= CPA_FLUSHTLB;
360 do_split = 0; 449 do_split = 0;
361 } 450 }
362 451
@@ -366,84 +455,6 @@ out_unlock:
366 return do_split; 455 return do_split;
367} 456}
368 457
369static LIST_HEAD(page_pool);
370static unsigned long pool_size, pool_pages, pool_low;
371static unsigned long pool_used, pool_failed;
372
373static void cpa_fill_pool(struct page **ret)
374{
375 gfp_t gfp = GFP_KERNEL;
376 unsigned long flags;
377 struct page *p;
378
379 /*
380 * Avoid recursion (on debug-pagealloc) and also signal
381 * our priority to get to these pagetables:
382 */
383 if (current->flags & PF_MEMALLOC)
384 return;
385 current->flags |= PF_MEMALLOC;
386
387 /*
388 * Allocate atomically from atomic contexts:
389 */
390 if (in_atomic() || irqs_disabled() || debug_pagealloc)
391 gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
392
393 while (pool_pages < pool_size || (ret && !*ret)) {
394 p = alloc_pages(gfp, 0);
395 if (!p) {
396 pool_failed++;
397 break;
398 }
399 /*
400 * If the call site needs a page right now, provide it:
401 */
402 if (ret && !*ret) {
403 *ret = p;
404 continue;
405 }
406 spin_lock_irqsave(&pgd_lock, flags);
407 list_add(&p->lru, &page_pool);
408 pool_pages++;
409 spin_unlock_irqrestore(&pgd_lock, flags);
410 }
411
412 current->flags &= ~PF_MEMALLOC;
413}
414
415#define SHIFT_MB (20 - PAGE_SHIFT)
416#define ROUND_MB_GB ((1 << 10) - 1)
417#define SHIFT_MB_GB 10
418#define POOL_PAGES_PER_GB 16
419
420void __init cpa_init(void)
421{
422 struct sysinfo si;
423 unsigned long gb;
424
425 si_meminfo(&si);
426 /*
427 * Calculate the number of pool pages:
428 *
429 * Convert totalram (nr of pages) to MiB and round to the next
430 * GiB. Shift MiB to Gib and multiply the result by
431 * POOL_PAGES_PER_GB:
432 */
433 if (debug_pagealloc) {
434 gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
435 pool_size = POOL_PAGES_PER_GB * gb;
436 } else {
437 pool_size = 1;
438 }
439 pool_low = pool_size;
440
441 cpa_fill_pool(NULL);
442 printk(KERN_DEBUG
443 "CPA: page pool initialized %lu of %lu pages preallocated\n",
444 pool_pages, pool_size);
445}
446
447static int split_large_page(pte_t *kpte, unsigned long address) 458static int split_large_page(pte_t *kpte, unsigned long address)
448{ 459{
449 unsigned long flags, pfn, pfninc = 1; 460 unsigned long flags, pfn, pfninc = 1;
@@ -452,28 +463,15 @@ static int split_large_page(pte_t *kpte, unsigned long address)
452 pgprot_t ref_prot; 463 pgprot_t ref_prot;
453 struct page *base; 464 struct page *base;
454 465
455 /* 466 if (!debug_pagealloc)
456 * Get a page from the pool. The pool list is protected by the 467 spin_unlock(&cpa_lock);
457 * pgd_lock, which we have to take anyway for the split 468 base = alloc_pages(GFP_KERNEL, 0);
458 * operation: 469 if (!debug_pagealloc)
459 */ 470 spin_lock(&cpa_lock);
460 spin_lock_irqsave(&pgd_lock, flags); 471 if (!base)
461 if (list_empty(&page_pool)) { 472 return -ENOMEM;
462 spin_unlock_irqrestore(&pgd_lock, flags);
463 base = NULL;
464 cpa_fill_pool(&base);
465 if (!base)
466 return -ENOMEM;
467 spin_lock_irqsave(&pgd_lock, flags);
468 } else {
469 base = list_first_entry(&page_pool, struct page, lru);
470 list_del(&base->lru);
471 pool_pages--;
472
473 if (pool_pages < pool_low)
474 pool_low = pool_pages;
475 }
476 473
474 spin_lock_irqsave(&pgd_lock, flags);
477 /* 475 /*
478 * Check for races, another CPU might have split this page 476 * Check for races, another CPU might have split this page
479 * up for us already: 477 * up for us already:
@@ -500,6 +498,16 @@ static int split_large_page(pte_t *kpte, unsigned long address)
500 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) 498 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
501 set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); 499 set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
502 500
501 if (address >= (unsigned long)__va(0) &&
502 address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
503 split_page_count(level);
504
505#ifdef CONFIG_X86_64
506 if (address >= (unsigned long)__va(1UL<<32) &&
507 address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
508 split_page_count(level);
509#endif
510
503 /* 511 /*
504 * Install the new, split up pagetable. Important details here: 512 * Install the new, split up pagetable. Important details here:
505 * 513 *
@@ -520,11 +528,8 @@ out_unlock:
520 * If we dropped out via the lookup_address check under 528 * If we dropped out via the lookup_address check under
521 * pgd_lock then stick the page back into the pool: 529 * pgd_lock then stick the page back into the pool:
522 */ 530 */
523 if (base) { 531 if (base)
524 list_add(&base->lru, &page_pool); 532 __free_page(base);
525 pool_pages++;
526 } else
527 pool_used++;
528 spin_unlock_irqrestore(&pgd_lock, flags); 533 spin_unlock_irqrestore(&pgd_lock, flags);
529 534
530 return 0; 535 return 0;
@@ -532,11 +537,16 @@ out_unlock:
532 537
533static int __change_page_attr(struct cpa_data *cpa, int primary) 538static int __change_page_attr(struct cpa_data *cpa, int primary)
534{ 539{
535 unsigned long address = cpa->vaddr; 540 unsigned long address;
536 int do_split, err; 541 int do_split, err;
537 unsigned int level; 542 unsigned int level;
538 pte_t *kpte, old_pte; 543 pte_t *kpte, old_pte;
539 544
545 if (cpa->flags & CPA_ARRAY)
546 address = cpa->vaddr[cpa->curpage];
547 else
548 address = *cpa->vaddr;
549
540repeat: 550repeat:
541 kpte = lookup_address(address, &level); 551 kpte = lookup_address(address, &level);
542 if (!kpte) 552 if (!kpte)
@@ -546,10 +556,9 @@ repeat:
546 if (!pte_val(old_pte)) { 556 if (!pte_val(old_pte)) {
547 if (!primary) 557 if (!primary)
548 return 0; 558 return 0;
549 printk(KERN_WARNING "CPA: called for zero pte. " 559 WARN(1, KERN_WARNING "CPA: called for zero pte. "
550 "vaddr = %lx cpa->vaddr = %lx\n", address, 560 "vaddr = %lx cpa->vaddr = %lx\n", address,
551 cpa->vaddr); 561 *cpa->vaddr);
552 WARN_ON(1);
553 return -EINVAL; 562 return -EINVAL;
554 } 563 }
555 564
@@ -575,7 +584,7 @@ repeat:
575 */ 584 */
576 if (pte_val(old_pte) != pte_val(new_pte)) { 585 if (pte_val(old_pte) != pte_val(new_pte)) {
577 set_pte_atomic(kpte, new_pte); 586 set_pte_atomic(kpte, new_pte);
578 cpa->flushtlb = 1; 587 cpa->flags |= CPA_FLUSHTLB;
579 } 588 }
580 cpa->numpages = 1; 589 cpa->numpages = 1;
581 return 0; 590 return 0;
@@ -599,7 +608,25 @@ repeat:
599 */ 608 */
600 err = split_large_page(kpte, address); 609 err = split_large_page(kpte, address);
601 if (!err) { 610 if (!err) {
602 cpa->flushtlb = 1; 611 /*
612 * Do a global flush tlb after splitting the large page
613 * and before we do the actual change page attribute in the PTE.
614 *
615 * Without this, we violate the TLB application note, which says
616 * "The TLBs may contain both ordinary and large-page
617 * translations for a 4-KByte range of linear addresses. This
618 * may occur if software modifies the paging structures so that
619 * the page size used for the address range changes. If the two
620 * translations differ with respect to page frame or attributes
621 * (e.g., permissions), processor behavior is undefined and may
622 * be implementation-specific."
623 *
624 * We do this global tlb flush inside the cpa_lock, so that we
625 * don't allow any other cpu with stale tlb entries to change the
626 * page attribute in parallel for an address that also falls into the
627 * just-split large page entry.
628 */
629 flush_tlb_all();
603 goto repeat; 630 goto repeat;
604 } 631 }
605 632
@@ -612,19 +639,37 @@ static int cpa_process_alias(struct cpa_data *cpa)
612{ 639{
613 struct cpa_data alias_cpa; 640 struct cpa_data alias_cpa;
614 int ret = 0; 641 int ret = 0;
642 unsigned long temp_cpa_vaddr, vaddr;
615 643
616 if (cpa->pfn > max_pfn_mapped) 644 if (cpa->pfn >= max_pfn_mapped)
617 return 0; 645 return 0;
618 646
647#ifdef CONFIG_X86_64
648 if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
649 return 0;
650#endif
619 /* 651 /*
620 * No need to redo, when the primary call touched the direct 652 * No need to redo, when the primary call touched the direct
621 * mapping already: 653 * mapping already:
622 */ 654 */
623 if (!within(cpa->vaddr, PAGE_OFFSET, 655 if (cpa->flags & CPA_ARRAY)
624 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { 656 vaddr = cpa->vaddr[cpa->curpage];
657 else
658 vaddr = *cpa->vaddr;
659
660 if (!(within(vaddr, PAGE_OFFSET,
661 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
662#ifdef CONFIG_X86_64
663 || within(vaddr, PAGE_OFFSET + (1UL<<32),
664 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
665#endif
666 )) {
625 667
626 alias_cpa = *cpa; 668 alias_cpa = *cpa;
627 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 669 temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
670 alias_cpa.vaddr = &temp_cpa_vaddr;
671 alias_cpa.flags &= ~CPA_ARRAY;
672
628 673
629 ret = __change_page_attr_set_clr(&alias_cpa, 0); 674 ret = __change_page_attr_set_clr(&alias_cpa, 0);
630 } 675 }
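
The alias check above asks whether the primary vaddr already lies in the kernel direct mapping; on 64-bit there are two candidate windows, below max_low_pfn_mapped and above 4G up to max_pfn_mapped. A sketch of the within() test with invented window bounds (the constants are examples, not real layout values):

#include <stdio.h>
#include <stdint.h>

/* within() as used above: start inclusive, end exclusive. */
static int within(uint64_t addr, uint64_t start, uint64_t end)
{
        return addr >= start && addr < end;
}

int main(void)
{
        uint64_t page_offset = 0xffff880000000000ULL;   /* illustrative */
        uint64_t low_end  = page_offset + (1ULL << 30); /* say 1GB mapped low */
        uint64_t high_beg = page_offset + (1ULL << 32);
        uint64_t high_end = page_offset + (6ULL << 30); /* say 6GB total */

        uint64_t vaddr = page_offset + (5ULL << 30);
        int direct = within(vaddr, page_offset, low_end) ||
                     within(vaddr, high_beg, high_end);

        printf("vaddr in direct mapping: %s\n", direct ? "yes" : "no");
        return 0;
}
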
@@ -636,7 +681,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
636 * No need to redo, when the primary call touched the high 681 * No need to redo, when the primary call touched the high
637 * mapping already: 682 * mapping already:
638 */ 683 */
639 if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) 684 if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
640 return 0; 685 return 0;
641 686
642 /* 687 /*
@@ -647,8 +692,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
647 return 0; 692 return 0;
648 693
649 alias_cpa = *cpa; 694 alias_cpa = *cpa;
650 alias_cpa.vaddr = 695 temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
651 (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; 696 alias_cpa.vaddr = &temp_cpa_vaddr;
697 alias_cpa.flags &= ~CPA_ARRAY;
652 698
653 /* 699 /*
654 * The high mapping range is imprecise, so ignore the return value. 700 * The high mapping range is imprecise, so ignore the return value.
@@ -668,8 +714,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
668 * preservation check. 714 * preservation check.
669 */ 715 */
670 cpa->numpages = numpages; 716 cpa->numpages = numpages;
717 /* for array changes, we can't use large pages */
718 if (cpa->flags & CPA_ARRAY)
719 cpa->numpages = 1;
671 720
721 if (!debug_pagealloc)
722 spin_lock(&cpa_lock);
672 ret = __change_page_attr(cpa, checkalias); 723 ret = __change_page_attr(cpa, checkalias);
724 if (!debug_pagealloc)
725 spin_unlock(&cpa_lock);
673 if (ret) 726 if (ret)
674 return ret; 727 return ret;
675 728
@@ -686,7 +739,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
686 */ 739 */
687 BUG_ON(cpa->numpages > numpages); 740 BUG_ON(cpa->numpages > numpages);
688 numpages -= cpa->numpages; 741 numpages -= cpa->numpages;
689 cpa->vaddr += cpa->numpages * PAGE_SIZE; 742 if (cpa->flags & CPA_ARRAY)
743 cpa->curpage++;
744 else
745 *cpa->vaddr += cpa->numpages * PAGE_SIZE;
746
690 } 747 }
691 return 0; 748 return 0;
692} 749}
@@ -697,9 +754,9 @@ static inline int cache_attr(pgprot_t attr)
697 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); 754 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
698} 755}
699 756
700static int change_page_attr_set_clr(unsigned long addr, int numpages, 757static int change_page_attr_set_clr(unsigned long *addr, int numpages,
701 pgprot_t mask_set, pgprot_t mask_clr, 758 pgprot_t mask_set, pgprot_t mask_clr,
702 int force_split) 759 int force_split, int array)
703{ 760{
704 struct cpa_data cpa; 761 struct cpa_data cpa;
705 int ret, cache, checkalias; 762 int ret, cache, checkalias;
@@ -714,21 +771,38 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
714 return 0; 771 return 0;
715 772
716 /* Ensure we are PAGE_SIZE aligned */ 773 /* Ensure we are PAGE_SIZE aligned */
717 if (addr & ~PAGE_MASK) { 774 if (!array) {
718 addr &= PAGE_MASK; 775 if (*addr & ~PAGE_MASK) {
719 /* 776 *addr &= PAGE_MASK;
720 * People should not be passing in unaligned addresses: 777 /*
721 */ 778 * People should not be passing in unaligned addresses:
722 WARN_ON_ONCE(1); 779 */
780 WARN_ON_ONCE(1);
781 }
782 } else {
783 int i;
784 for (i = 0; i < numpages; i++) {
785 if (addr[i] & ~PAGE_MASK) {
786 addr[i] &= PAGE_MASK;
787 WARN_ON_ONCE(1);
788 }
789 }
723 } 790 }
724 791
792 /* Must avoid aliasing mappings in the highmem code */
793 kmap_flush_unused();
794
725 cpa.vaddr = addr; 795 cpa.vaddr = addr;
726 cpa.numpages = numpages; 796 cpa.numpages = numpages;
727 cpa.mask_set = mask_set; 797 cpa.mask_set = mask_set;
728 cpa.mask_clr = mask_clr; 798 cpa.mask_clr = mask_clr;
729 cpa.flushtlb = 0; 799 cpa.flags = 0;
800 cpa.curpage = 0;
730 cpa.force_split = force_split; 801 cpa.force_split = force_split;
731 802
803 if (array)
804 cpa.flags |= CPA_ARRAY;
805
732 /* No alias checking for _NX bit modifications */ 806 /* No alias checking for _NX bit modifications */
733 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; 807 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
734 808
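
Both the single-address and the array path above mask each address down to a page boundary and warn once about unaligned callers. The masking itself, sketched with a 4k page size:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long addr[] = { 0x1000, 0x1234, 0x7fff };

        for (int i = 0; i < 3; i++) {
                if (addr[i] & ~PAGE_MASK) {
                        printf("warn: unaligned %#lx -> %#lx\n",
                               addr[i], addr[i] & PAGE_MASK);
                        addr[i] &= PAGE_MASK;
                }
        }
        return 0;
}
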
@@ -737,7 +811,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
737 /* 811 /*
738 * Check whether we really changed something: 812 * Check whether we really changed something:
739 */ 813 */
740 if (!cpa.flushtlb) 814 if (!(cpa.flags & CPA_FLUSHTLB))
741 goto out; 815 goto out;
742 816
743 /* 817 /*
@@ -752,27 +826,30 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
752 * error case we fall back to cpa_flush_all (which uses 826 * error case we fall back to cpa_flush_all (which uses
753 * wbinvd): 827
754 */ 828 */
755 if (!ret && cpu_has_clflush) 829 if (!ret && cpu_has_clflush) {
756 cpa_flush_range(addr, numpages, cache); 830 if (cpa.flags & CPA_ARRAY)
757 else 831 cpa_flush_array(addr, numpages, cache);
832 else
833 cpa_flush_range(*addr, numpages, cache);
834 } else
758 cpa_flush_all(cache); 835 cpa_flush_all(cache);
759 836
760out: 837out:
761 cpa_fill_pool(NULL);
762
763 return ret; 838 return ret;
764} 839}
765 840
766static inline int change_page_attr_set(unsigned long addr, int numpages, 841static inline int change_page_attr_set(unsigned long *addr, int numpages,
767 pgprot_t mask) 842 pgprot_t mask, int array)
768{ 843{
769 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); 844 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
845 array);
770} 846}
771 847
772static inline int change_page_attr_clear(unsigned long addr, int numpages, 848static inline int change_page_attr_clear(unsigned long *addr, int numpages,
773 pgprot_t mask) 849 pgprot_t mask, int array)
774{ 850{
775 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); 851 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
852 array);
776} 853}
777 854
778int _set_memory_uc(unsigned long addr, int numpages) 855int _set_memory_uc(unsigned long addr, int numpages)
@@ -780,8 +857,8 @@ int _set_memory_uc(unsigned long addr, int numpages)
780 /* 857 /*
781 * for now UC MINUS. see comments in ioremap_nocache() 858 * for now UC MINUS. see comments in ioremap_nocache()
782 */ 859 */
783 return change_page_attr_set(addr, numpages, 860 return change_page_attr_set(&addr, numpages,
784 __pgprot(_PAGE_CACHE_UC_MINUS)); 861 __pgprot(_PAGE_CACHE_UC_MINUS), 0);
785} 862}
786 863
787int set_memory_uc(unsigned long addr, int numpages) 864int set_memory_uc(unsigned long addr, int numpages)
@@ -789,7 +866,7 @@ int set_memory_uc(unsigned long addr, int numpages)
789 /* 866 /*
790 * for now UC MINUS. see comments in ioremap_nocache() 867 * for now UC MINUS. see comments in ioremap_nocache()
791 */ 868 */
792 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 869 if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
793 _PAGE_CACHE_UC_MINUS, NULL)) 870 _PAGE_CACHE_UC_MINUS, NULL))
794 return -EINVAL; 871 return -EINVAL;
795 872
@@ -797,18 +874,56 @@ int set_memory_uc(unsigned long addr, int numpages)
797} 874}
798EXPORT_SYMBOL(set_memory_uc); 875EXPORT_SYMBOL(set_memory_uc);
799 876
877int set_memory_array_uc(unsigned long *addr, int addrinarray)
878{
879 unsigned long start;
880 unsigned long end;
881 int i;
882 /*
883 * for now UC MINUS. see comments in ioremap_nocache()
884 */
885 for (i = 0; i < addrinarray; i++) {
886 start = __pa(addr[i]);
887 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
888 if (end != __pa(addr[i + 1]))
889 break;
890 i++;
891 }
892 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
893 goto out;
894 }
895
896 return change_page_attr_set(addr, addrinarray,
897 __pgprot(_PAGE_CACHE_UC_MINUS), 1);
898out:
899 for (i = 0; i < addrinarray; i++) {
900 unsigned long tmp = __pa(addr[i]);
901
902 if (tmp == start)
903 break;
904 for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
905 if (end != __pa(addr[i + 1]))
906 break;
907 i++;
908 }
909 free_memtype(tmp, end);
910 }
911 return -EINVAL;
912}
913EXPORT_SYMBOL(set_memory_array_uc);
914
800int _set_memory_wc(unsigned long addr, int numpages) 915int _set_memory_wc(unsigned long addr, int numpages)
801{ 916{
802 return change_page_attr_set(addr, numpages, 917 return change_page_attr_set(&addr, numpages,
803 __pgprot(_PAGE_CACHE_WC)); 918 __pgprot(_PAGE_CACHE_WC), 0);
804} 919}
805 920
806int set_memory_wc(unsigned long addr, int numpages) 921int set_memory_wc(unsigned long addr, int numpages)
807{ 922{
808 if (!pat_wc_enabled) 923 if (!pat_enabled)
809 return set_memory_uc(addr, numpages); 924 return set_memory_uc(addr, numpages);
810 925
811 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 926 if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
812 _PAGE_CACHE_WC, NULL)) 927 _PAGE_CACHE_WC, NULL))
813 return -EINVAL; 928 return -EINVAL;
814 929
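
set_memory_array_uc() above merges physically contiguous pages into one [start, end) range per reserve_memtype() call. The coalescing loop, sketched over plain physical addresses (the __pa() translation is assumed to have happened already):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL

int main(void)
{
        /* three contiguous pages, then a gap, then one more page */
        uint64_t pa[] = { 0x10000, 0x11000, 0x12000, 0x40000 };
        int n = 4;

        for (int i = 0; i < n; i++) {
                uint64_t start = pa[i];
                uint64_t end = start + PAGE_SIZE;

                /* extend the range while the next page is adjacent */
                while (i < n - 1 && end == pa[i + 1]) {
                        end += PAGE_SIZE;
                        i++;
                }
                printf("reserve [%#llx, %#llx)\n",
                       (unsigned long long)start, (unsigned long long)end);
        }
        return 0;
}
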
@@ -818,49 +933,71 @@ EXPORT_SYMBOL(set_memory_wc);
818 933
819int _set_memory_wb(unsigned long addr, int numpages) 934int _set_memory_wb(unsigned long addr, int numpages)
820{ 935{
821 return change_page_attr_clear(addr, numpages, 936 return change_page_attr_clear(&addr, numpages,
822 __pgprot(_PAGE_CACHE_MASK)); 937 __pgprot(_PAGE_CACHE_MASK), 0);
823} 938}
824 939
825int set_memory_wb(unsigned long addr, int numpages) 940int set_memory_wb(unsigned long addr, int numpages)
826{ 941{
827 free_memtype(addr, addr + numpages * PAGE_SIZE); 942 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
828 943
829 return _set_memory_wb(addr, numpages); 944 return _set_memory_wb(addr, numpages);
830} 945}
831EXPORT_SYMBOL(set_memory_wb); 946EXPORT_SYMBOL(set_memory_wb);
832 947
948int set_memory_array_wb(unsigned long *addr, int addrinarray)
949{
950 int i;
951
952 for (i = 0; i < addrinarray; i++) {
953 unsigned long start = __pa(addr[i]);
954 unsigned long end;
955
956 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
957 if (end != __pa(addr[i + 1]))
958 break;
959 i++;
960 }
961 free_memtype(start, end);
962 }
963 return change_page_attr_clear(addr, addrinarray,
964 __pgprot(_PAGE_CACHE_MASK), 1);
965}
966EXPORT_SYMBOL(set_memory_array_wb);
967
833int set_memory_x(unsigned long addr, int numpages) 968int set_memory_x(unsigned long addr, int numpages)
834{ 969{
835 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); 970 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
836} 971}
837EXPORT_SYMBOL(set_memory_x); 972EXPORT_SYMBOL(set_memory_x);
838 973
839int set_memory_nx(unsigned long addr, int numpages) 974int set_memory_nx(unsigned long addr, int numpages)
840{ 975{
841 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); 976 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
842} 977}
843EXPORT_SYMBOL(set_memory_nx); 978EXPORT_SYMBOL(set_memory_nx);
844 979
845int set_memory_ro(unsigned long addr, int numpages) 980int set_memory_ro(unsigned long addr, int numpages)
846{ 981{
847 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); 982 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
848} 983}
984EXPORT_SYMBOL_GPL(set_memory_ro);
849 985
850int set_memory_rw(unsigned long addr, int numpages) 986int set_memory_rw(unsigned long addr, int numpages)
851{ 987{
852 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); 988 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
853} 989}
990EXPORT_SYMBOL_GPL(set_memory_rw);
854 991
855int set_memory_np(unsigned long addr, int numpages) 992int set_memory_np(unsigned long addr, int numpages)
856{ 993{
857 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); 994 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
858} 995}
859 996
860int set_memory_4k(unsigned long addr, int numpages) 997int set_memory_4k(unsigned long addr, int numpages)
861{ 998{
862 return change_page_attr_set_clr(addr, numpages, __pgprot(0), 999 return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
863 __pgprot(0), 1); 1000 __pgprot(0), 1, 0);
864} 1001}
865 1002
866int set_pages_uc(struct page *page, int numpages) 1003int set_pages_uc(struct page *page, int numpages)
@@ -913,22 +1050,38 @@ int set_pages_rw(struct page *page, int numpages)
913 1050
914static int __set_pages_p(struct page *page, int numpages) 1051static int __set_pages_p(struct page *page, int numpages)
915{ 1052{
916 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), 1053 unsigned long tempaddr = (unsigned long) page_address(page);
1054 struct cpa_data cpa = { .vaddr = &tempaddr,
917 .numpages = numpages, 1055 .numpages = numpages,
918 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1056 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
919 .mask_clr = __pgprot(0)}; 1057 .mask_clr = __pgprot(0),
1058 .flags = 0};
920 1059
921 return __change_page_attr_set_clr(&cpa, 1); 1060 /*
1061 * No alias checking needed for setting the present flag. Otherwise,
1062 * we may need to break large pages for 64-bit kernel text
1063 * mappings (this adds to complexity if we want to do this from
1064 * atomic context especially). Let's keep it simple!
1065 */
1066 return __change_page_attr_set_clr(&cpa, 0);
922} 1067}
923 1068
924static int __set_pages_np(struct page *page, int numpages) 1069static int __set_pages_np(struct page *page, int numpages)
925{ 1070{
926 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), 1071 unsigned long tempaddr = (unsigned long) page_address(page);
1072 struct cpa_data cpa = { .vaddr = &tempaddr,
927 .numpages = numpages, 1073 .numpages = numpages,
928 .mask_set = __pgprot(0), 1074 .mask_set = __pgprot(0),
929 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; 1075 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1076 .flags = 0};
930 1077
931 return __change_page_attr_set_clr(&cpa, 1); 1078 /*
1079 * No alias checking needed for setting the not-present flag. Otherwise,
1080 * we may need to break large pages for 64-bit kernel text
1081 * mappings (this adds to complexity if we want to do this from
1082 * atomic context especially). Let's keep it simple!
1083 */
1084 return __change_page_attr_set_clr(&cpa, 0);
932} 1085}
933 1086
934void kernel_map_pages(struct page *page, int numpages, int enable) 1087void kernel_map_pages(struct page *page, int numpages, int enable)
@@ -948,11 +1101,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
948 1101
949 /* 1102 /*
950 * The return value is ignored as the calls cannot fail. 1103 * The return value is ignored as the calls cannot fail.
951 * Large pages are kept enabled at boot time, and are 1104 * Large pages for identity mappings are not used at boot time
952 * split up quickly with DEBUG_PAGEALLOC. If a splitup 1105 * and hence no memory allocations during large page split.
953 * fails here (due to temporary memory shortage) no damage
954 * is done because we just keep the largepage intact up
955 * to the next attempt when it will likely be split up:
956 */ 1106 */
957 if (enable) 1107 if (enable)
958 __set_pages_p(page, numpages); 1108 __set_pages_p(page, numpages);
@@ -964,53 +1114,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
964 * but that can deadlock->flush only current cpu: 1114 * but that can deadlock->flush only current cpu:
965 */ 1115 */
966 __flush_tlb_all(); 1116 __flush_tlb_all();
967
968 /*
969 * Try to refill the page pool here. We can do this only after
970 * the tlb flush.
971 */
972 cpa_fill_pool(NULL);
973}
974
975#ifdef CONFIG_DEBUG_FS
976static int dpa_show(struct seq_file *m, void *v)
977{
978 seq_puts(m, "DEBUG_PAGEALLOC\n");
979 seq_printf(m, "pool_size : %lu\n", pool_size);
980 seq_printf(m, "pool_pages : %lu\n", pool_pages);
981 seq_printf(m, "pool_low : %lu\n", pool_low);
982 seq_printf(m, "pool_used : %lu\n", pool_used);
983 seq_printf(m, "pool_failed : %lu\n", pool_failed);
984
985 return 0;
986}
987
988static int dpa_open(struct inode *inode, struct file *filp)
989{
990 return single_open(filp, dpa_show, NULL);
991} 1117}
992 1118
993static const struct file_operations dpa_fops = {
994 .open = dpa_open,
995 .read = seq_read,
996 .llseek = seq_lseek,
997 .release = single_release,
998};
999
1000static int __init debug_pagealloc_proc_init(void)
1001{
1002 struct dentry *de;
1003
1004 de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
1005 &dpa_fops);
1006 if (!de)
1007 return -ENOMEM;
1008
1009 return 0;
1010}
1011__initcall(debug_pagealloc_proc_init);
1012#endif
1013
1014#ifdef CONFIG_HIBERNATION 1119#ifdef CONFIG_HIBERNATION
1015 1120
1016bool kernel_page_present(struct page *page) 1121bool kernel_page_present(struct page *page)
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 06b7a1c90fb8..738fd0f24958 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -7,30 +7,32 @@
7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. 7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
8 */ 8 */
9 9
10#include <linux/mm.h> 10#include <linux/seq_file.h>
11#include <linux/bootmem.h>
12#include <linux/debugfs.h>
11#include <linux/kernel.h> 13#include <linux/kernel.h>
12#include <linux/gfp.h> 14#include <linux/gfp.h>
15#include <linux/mm.h>
13#include <linux/fs.h> 16#include <linux/fs.h>
14#include <linux/bootmem.h>
15 17
16#include <asm/msr.h> 18#include <asm/cacheflush.h>
17#include <asm/tlbflush.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
19#include <asm/page.h> 20#include <asm/tlbflush.h>
20#include <asm/pgtable.h> 21#include <asm/pgtable.h>
21#include <asm/pat.h>
22#include <asm/e820.h>
23#include <asm/cacheflush.h>
24#include <asm/fcntl.h> 22#include <asm/fcntl.h>
23#include <asm/e820.h>
25#include <asm/mtrr.h> 24#include <asm/mtrr.h>
25#include <asm/page.h>
26#include <asm/msr.h>
27#include <asm/pat.h>
26#include <asm/io.h> 28#include <asm/io.h>
27 29
28#ifdef CONFIG_X86_PAT 30#ifdef CONFIG_X86_PAT
29int __read_mostly pat_wc_enabled = 1; 31int __read_mostly pat_enabled = 1;
30 32
31void __cpuinit pat_disable(char *reason) 33void __cpuinit pat_disable(char *reason)
32{ 34{
33 pat_wc_enabled = 0; 35 pat_enabled = 0;
34 printk(KERN_INFO "%s\n", reason); 36 printk(KERN_INFO "%s\n", reason);
35} 37}
36 38
@@ -42,6 +44,20 @@ static int __init nopat(char *str)
42early_param("nopat", nopat); 44early_param("nopat", nopat);
43#endif 45#endif
44 46
47
48static int debug_enable;
49
50static int __init pat_debug_setup(char *str)
51{
52 debug_enable = 1;
53 return 0;
54}
55__setup("debugpat", pat_debug_setup);
56
57#define dprintk(fmt, arg...) \
58 do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
59
60
45static u64 __read_mostly boot_pat_state; 61static u64 __read_mostly boot_pat_state;
46 62
47enum { 63enum {
@@ -53,24 +69,25 @@ enum {
53 PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ 69
54}; 70};
55 71
56#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8)) 72#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
57 73
58void pat_init(void) 74void pat_init(void)
59{ 75{
60 u64 pat; 76 u64 pat;
61 77
62 if (!pat_wc_enabled) 78 if (!pat_enabled)
63 return; 79 return;
64 80
65 /* Paranoia check. */ 81 /* Paranoia check. */
66 if (!cpu_has_pat) { 82 if (!cpu_has_pat && boot_pat_state) {
67 printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
68 /* 83 /*
69 * Panic if this happens on the secondary CPU, and we 84 * If this happens we are on a secondary CPU, but
70 * switched to PAT on the boot CPU. We have no way to 85 * switched to PAT on the boot CPU. We have no way to
71 * undo PAT. 86 * undo PAT.
72 */ 87 */
73 BUG_ON(boot_pat_state); 88 printk(KERN_ERR "PAT enabled, "
89 "but not supported by secondary CPU\n");
90 BUG();
74 } 91 }
75 92
76 /* Set PWT to Write-Combining. All other bits stay the same */ 93 /* Set PWT to Write-Combining. All other bits stay the same */
@@ -86,8 +103,8 @@ void pat_init(void)
86 * 011 UC _PAGE_CACHE_UC 103 * 011 UC _PAGE_CACHE_UC
87 * PAT bit unused 104 * PAT bit unused
88 */ 105 */
89 pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) | 106 pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
90 PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC); 107 PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
91 108
92 /* Boot CPU check */ 109 /* Boot CPU check */
93 if (!boot_pat_state) 110 if (!boot_pat_state)
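
The PAT(x, y) macro places an 8-bit encoding into slot x of the 64-bit PAT MSR, and the assignment above programs WB/WC/UC-/UC into slots 0-3 and again into slots 4-7. A standalone computation of the resulting value, reusing the encodings from the enum in the hunk above:

#include <stdio.h>
#include <stdint.h>

enum { PAT_UC = 0, PAT_WC = 1, PAT_WB = 6, PAT_UC_MINUS = 7 };

#define PAT(x, y) ((uint64_t)PAT_ ## y << ((x) * 8))

int main(void)
{
        uint64_t pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
                       PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);

        /* one byte per PAT entry, entry 0 in the low byte:
         * works out to 0x0007010600070106 */
        printf("PAT MSR value: %#018llx\n", (unsigned long long)pat);
        return 0;
}
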
@@ -103,11 +120,11 @@ void pat_init(void)
103static char *cattr_name(unsigned long flags) 120static char *cattr_name(unsigned long flags)
104{ 121{
105 switch (flags & _PAGE_CACHE_MASK) { 122 switch (flags & _PAGE_CACHE_MASK) {
106 case _PAGE_CACHE_UC: return "uncached"; 123 case _PAGE_CACHE_UC: return "uncached";
107 case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; 124 case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
108 case _PAGE_CACHE_WB: return "write-back"; 125 case _PAGE_CACHE_WB: return "write-back";
109 case _PAGE_CACHE_WC: return "write-combining"; 126 case _PAGE_CACHE_WC: return "write-combining";
110 default: return "broken"; 127 default: return "broken";
111 } 128 }
112} 129}
113 130
@@ -129,14 +146,14 @@ static char *cattr_name(unsigned long flags)
129 */ 146 */
130 147
131struct memtype { 148struct memtype {
132 u64 start; 149 u64 start;
133 u64 end; 150 u64 end;
134 unsigned long type; 151 unsigned long type;
135 struct list_head nd; 152 struct list_head nd;
136}; 153};
137 154
138static LIST_HEAD(memtype_list); 155static LIST_HEAD(memtype_list);
139static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ 156static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
140 157
141/* 158/*
142 * Does intersection of PAT memory type and MTRR memory type and returns 159 * Does intersection of PAT memory type and MTRR memory type and returns
@@ -145,47 +162,113 @@ static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
145 * The intersection is based on "Effective Memory Type" tables in IA-32 162 * The intersection is based on "Effective Memory Type" tables in IA-32
146 * SDM vol 3a 163 * SDM vol 3a
147 */ 164 */
148static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, 165static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
149 unsigned long *ret_prot)
150{ 166{
151 unsigned long pat_type;
152 u8 mtrr_type;
153
154 pat_type = prot & _PAGE_CACHE_MASK;
155 prot &= (~_PAGE_CACHE_MASK);
156
157 /*
158 * We return the PAT request directly for types where PAT takes
159 * precedence with respect to MTRR and for UC_MINUS.
160 * Consistency checks with other PAT requests is done later
161 * while going through memtype list.
162 */
163 if (pat_type == _PAGE_CACHE_WC) {
164 *ret_prot = prot | _PAGE_CACHE_WC;
165 return 0;
166 } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
167 *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
168 return 0;
169 } else if (pat_type == _PAGE_CACHE_UC) {
170 *ret_prot = prot | _PAGE_CACHE_UC;
171 return 0;
172 }
173
174 /* 167 /*
175 * Look for MTRR hint to get the effective type in case where PAT 168 * Look for MTRR hint to get the effective type in case where PAT
176 * request is for WB. 169 * request is for WB.
177 */ 170 */
178 mtrr_type = mtrr_type_lookup(start, end); 171 if (req_type == _PAGE_CACHE_WB) {
172 u8 mtrr_type;
173
174 mtrr_type = mtrr_type_lookup(start, end);
175 if (mtrr_type == MTRR_TYPE_UNCACHABLE)
176 return _PAGE_CACHE_UC;
177 if (mtrr_type == MTRR_TYPE_WRCOMB)
178 return _PAGE_CACHE_WC;
179 }
179 180
180 if (mtrr_type == MTRR_TYPE_UNCACHABLE) { 181 return req_type;
181 *ret_prot = prot | _PAGE_CACHE_UC; 182}
182 } else if (mtrr_type == MTRR_TYPE_WRCOMB) { 183
183 *ret_prot = prot | _PAGE_CACHE_WC; 184static int
184 } else { 185chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
185 *ret_prot = prot | _PAGE_CACHE_WB; 186{
187 if (new->type != entry->type) {
188 if (type) {
189 new->type = entry->type;
190 *type = entry->type;
191 } else
192 goto conflict;
186 } 193 }
187 194
195 /* check overlaps with more than one entry in the list */
196 list_for_each_entry_continue(entry, &memtype_list, nd) {
197 if (new->end <= entry->start)
198 break;
199 else if (new->type != entry->type)
200 goto conflict;
201 }
188 return 0; 202 return 0;
203
204 conflict:
205 printk(KERN_INFO "%s:%d conflicting memory types "
206 "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
207 new->end, cattr_name(new->type), cattr_name(entry->type));
208 return -EBUSY;
209}
210
211static struct memtype *cached_entry;
212static u64 cached_start;
213
214/*
215 * For RAM pages, mark the pages as non WB memory type using
216 * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
217 * set_memory_wc() on a RAM page at a time before marking it as WB again.
218 * This is ok, because only one driver will be owning the page and
219 * doing set_memory_*() calls.
220 *
221 * For now, we use PageNonWB to track that the RAM page is being mapped
222 * as non WB. In future, we will have to use one more flag
223 * (or some other mechanism in page_struct) to distinguish between
224 * UC and WC mapping.
225 */
226static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
227 unsigned long *new_type)
228{
229 struct page *page;
230 u64 pfn, end_pfn;
231
232 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
233 page = pfn_to_page(pfn);
234 if (page_mapped(page) || PageNonWB(page))
235 goto out;
236
237 SetPageNonWB(page);
238 }
239 return 0;
240
241out:
242 end_pfn = pfn;
243 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
244 page = pfn_to_page(pfn);
245 ClearPageNonWB(page);
246 }
247
248 return -EINVAL;
249}
250
251static int free_ram_pages_type(u64 start, u64 end)
252{
253 struct page *page;
254 u64 pfn, end_pfn;
255
256 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
257 page = pfn_to_page(pfn);
258 if (page_mapped(page) || !PageNonWB(page))
259 goto out;
260
261 ClearPageNonWB(page);
262 }
263 return 0;
264
265out:
266 end_pfn = pfn;
267 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
268 page = pfn_to_page(pfn);
269 SetPageNonWB(page);
270 }
271 return -EINVAL;
189} 272}
190 273
191/* 274/*
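
The rewritten pat_x_mtrr_type() only consults the MTRRs when the request is WB, demoting it to UC or WC as the MTRR dictates, and passes every other request through. A sketch of that decision with stand-in constants (not the kernel's encodings):

#include <stdio.h>

enum cache { WB, WC, UC_MINUS, UC };
enum mtrr  { MTRR_WRBACK, MTRR_WRCOMB, MTRR_UNCACHABLE };

static const char *cname[] = { "WB", "WC", "UC-", "UC" };

static enum cache pat_x_mtrr_type(enum cache req, enum mtrr hint)
{
        if (req == WB) {
                if (hint == MTRR_UNCACHABLE)
                        return UC;
                if (hint == MTRR_WRCOMB)
                        return WC;
        }
        return req;     /* WC, UC- and UC requests take precedence over MTRR */
}

int main(void)
{
        printf("WB over UC MTRR -> %s\n", cname[pat_x_mtrr_type(WB, MTRR_UNCACHABLE)]);
        printf("WB over WC MTRR -> %s\n", cname[pat_x_mtrr_type(WB, MTRR_WRCOMB)]);
        printf("WC request      -> %s\n", cname[pat_x_mtrr_type(WC, MTRR_UNCACHABLE)]);
        return 0;
}
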
@@ -198,37 +281,37 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
198 * req_type will have a special case value '-1', when requester want to inherit 281 * req_type will have a special case value '-1', when requester want to inherit
199 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. 282 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
200 * 283 *
201 * If ret_type is NULL, function will return an error if it cannot reserve the 284 * If new_type is NULL, function will return an error if it cannot reserve the
202 * region with req_type. If ret_type is non-null, function will return 285 * region with req_type. If new_type is non-NULL, function will return
203 * available type in ret_type in case of no error. In case of any error 286 * available type in new_type in case of no error. In case of any error
204 * it will return a negative return value. 287 * it will return a negative return value.
205 */ 288 */
206int reserve_memtype(u64 start, u64 end, unsigned long req_type, 289int reserve_memtype(u64 start, u64 end, unsigned long req_type,
207 unsigned long *ret_type) 290 unsigned long *new_type)
208{ 291{
209 struct memtype *new_entry = NULL; 292 struct memtype *new, *entry;
210 struct memtype *parse;
211 unsigned long actual_type; 293 unsigned long actual_type;
294 struct list_head *where;
295 int is_range_ram;
212 int err = 0; 296 int err = 0;
213 297
214 /* Only track when pat_wc_enabled */ 298 BUG_ON(start >= end); /* end is exclusive */
215 if (!pat_wc_enabled) { 299
300 if (!pat_enabled) {
216 /* This is identical to page table setting without PAT */ 301 /* This is identical to page table setting without PAT */
217 if (ret_type) { 302 if (new_type) {
218 if (req_type == -1) { 303 if (req_type == -1)
219 *ret_type = _PAGE_CACHE_WB; 304 *new_type = _PAGE_CACHE_WB;
220 } else { 305 else
221 *ret_type = req_type; 306 *new_type = req_type & _PAGE_CACHE_MASK;
222 }
223 } 307 }
224 return 0; 308 return 0;
225 } 309 }
226 310
227 /* Low ISA region is always mapped WB in page table. No need to track */ 311 /* Low ISA region is always mapped WB in page table. No need to track */
228 if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) { 312 if (is_ISA_range(start, end - 1)) {
229 if (ret_type) 313 if (new_type)
230 *ret_type = _PAGE_CACHE_WB; 314 *new_type = _PAGE_CACHE_WB;
231
232 return 0; 315 return 0;
233 } 316 }
234 317
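
Before touching the memtype list, reserve_memtype() takes two fast paths shown above: PAT disabled (hand back WB or the masked request) and the low ISA region (always WB, never tracked). A hedged sketch of those shortcuts with illustrative constants:

#include <stdio.h>
#include <stdint.h>

/* Illustrative constants only, not the kernel's */
#define SK_WB           0x0UL
#define SK_CACHE_MASK   0x18UL
#define SK_ISA_END      0xfffffULL             /* 1MB - 1 */

static int fast_path(int pat_enabled, uint64_t start, uint64_t end,
                     long req_type, unsigned long *new_type)
{
        if (!pat_enabled) {
                /* identical to page table setting without PAT */
                *new_type = (req_type == -1) ? SK_WB
                        : ((unsigned long)req_type & SK_CACHE_MASK);
                return 1;
        }
        /* low ISA region is always mapped WB, never tracked */
        if (start <= SK_ISA_END && end - 1 <= SK_ISA_END) {
                *new_type = SK_WB;
                return 1;
        }
        return 0;               /* fall through to the memtype list walk */
}

int main(void)
{
        unsigned long type;

        if (fast_path(1, 0xa0000, 0xc0000, -1, &type))
                printf("ISA range handled, type %lu (WB)\n", type);
        return 0;
}
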
@@ -241,206 +324,133 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
241 */ 324 */
242 u8 mtrr_type = mtrr_type_lookup(start, end); 325 u8 mtrr_type = mtrr_type_lookup(start, end);
243 326
244 if (mtrr_type == MTRR_TYPE_WRBACK) { 327 if (mtrr_type == MTRR_TYPE_WRBACK)
245 req_type = _PAGE_CACHE_WB;
246 actual_type = _PAGE_CACHE_WB; 328 actual_type = _PAGE_CACHE_WB;
247 } else { 329 else
248 req_type = _PAGE_CACHE_UC_MINUS;
249 actual_type = _PAGE_CACHE_UC_MINUS; 330 actual_type = _PAGE_CACHE_UC_MINUS;
250 }
251 } else { 331 } else {
252 req_type &= _PAGE_CACHE_MASK; 332 actual_type = pat_x_mtrr_type(start, end,
253 err = pat_x_mtrr_type(start, end, req_type, &actual_type); 333 req_type & _PAGE_CACHE_MASK);
254 } 334 }
255 335
256 if (err) { 336 is_range_ram = pagerange_is_ram(start, end);
257 if (ret_type) 337 if (is_range_ram == 1)
258 *ret_type = actual_type; 338 return reserve_ram_pages_type(start, end, req_type, new_type);
259 339 else if (is_range_ram < 0)
260 return -EINVAL; 340 return -EINVAL;
261 }
262 341
263 new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); 342 new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
264 if (!new_entry) 343 if (!new)
265 return -ENOMEM; 344 return -ENOMEM;
266 345
267 new_entry->start = start; 346 new->start = start;
268 new_entry->end = end; 347 new->end = end;
269 new_entry->type = actual_type; 348 new->type = actual_type;
270 349
271 if (ret_type) 350 if (new_type)
272 *ret_type = actual_type; 351 *new_type = actual_type;
273 352
274 spin_lock(&memtype_lock); 353 spin_lock(&memtype_lock);
275 354
276 /* Search for existing mapping that overlaps the current range */ 355 if (cached_entry && start >= cached_start)
277 list_for_each_entry(parse, &memtype_list, nd) { 356 entry = cached_entry;
278 struct memtype *saved_ptr; 357 else
358 entry = list_entry(&memtype_list, struct memtype, nd);
279 359
280 if (parse->start >= end) { 360 /* Search for existing mapping that overlaps the current range */
281 pr_debug("New Entry\n"); 361 where = NULL;
282 list_add(&new_entry->nd, parse->nd.prev); 362 list_for_each_entry_continue(entry, &memtype_list, nd) {
283 new_entry = NULL; 363 if (end <= entry->start) {
364 where = entry->nd.prev;
365 cached_entry = list_entry(where, struct memtype, nd);
284 break; 366 break;
285 } 367 } else if (start <= entry->start) { /* end > entry->start */
286 368 err = chk_conflict(new, entry, new_type);
287 if (start <= parse->start && end >= parse->start) { 369 if (!err) {
288 if (actual_type != parse->type && ret_type) { 370 dprintk("Overlap at 0x%Lx-0x%Lx\n",
289 actual_type = parse->type; 371 entry->start, entry->end);
290 *ret_type = actual_type; 372 where = entry->nd.prev;
291 new_entry->type = actual_type; 373 cached_entry = list_entry(where,
292 } 374 struct memtype, nd);
293
294 if (actual_type != parse->type) {
295 printk(
296 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
297 current->comm, current->pid,
298 start, end,
299 cattr_name(actual_type),
300 cattr_name(parse->type));
301 err = -EBUSY;
302 break;
303 } 375 }
304
305 saved_ptr = parse;
306 /*
307 * Check to see whether the request overlaps more
308 * than one entry in the list
309 */
310 list_for_each_entry_continue(parse, &memtype_list, nd) {
311 if (end <= parse->start) {
312 break;
313 }
314
315 if (actual_type != parse->type) {
316 printk(
317 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
318 current->comm, current->pid,
319 start, end,
320 cattr_name(actual_type),
321 cattr_name(parse->type));
322 err = -EBUSY;
323 break;
324 }
325 }
326
327 if (err) {
328 break;
329 }
330
331 pr_debug("Overlap at 0x%Lx-0x%Lx\n",
332 saved_ptr->start, saved_ptr->end);
333 /* No conflict. Go ahead and add this new entry */
334 list_add(&new_entry->nd, saved_ptr->nd.prev);
335 new_entry = NULL;
336 break; 376 break;
337 } 377 } else if (start < entry->end) { /* start > entry->start */
338 378 err = chk_conflict(new, entry, new_type);
339 if (start < parse->end) { 379 if (!err) {
340 if (actual_type != parse->type && ret_type) { 380 dprintk("Overlap at 0x%Lx-0x%Lx\n",
341 actual_type = parse->type; 381 entry->start, entry->end);
342 *ret_type = actual_type; 382 cached_entry = list_entry(entry->nd.prev,
343 new_entry->type = actual_type; 383 struct memtype, nd);
344 } 384
345 385 /*
346 if (actual_type != parse->type) { 386 * Move to right position in the linked
347 printk( 387 * list to add this new entry
348 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", 388 */
349 current->comm, current->pid, 389 list_for_each_entry_continue(entry,
350 start, end, 390 &memtype_list, nd) {
351 cattr_name(actual_type), 391 if (start <= entry->start) {
352 cattr_name(parse->type)); 392 where = entry->nd.prev;
353 err = -EBUSY; 393 break;
354 break; 394 }
355 }
356
357 saved_ptr = parse;
358 /*
359 * Check to see whether the request overlaps more
360 * than one entry in the list
361 */
362 list_for_each_entry_continue(parse, &memtype_list, nd) {
363 if (end <= parse->start) {
364 break;
365 }
366
367 if (actual_type != parse->type) {
368 printk(
369 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
370 current->comm, current->pid,
371 start, end,
372 cattr_name(actual_type),
373 cattr_name(parse->type));
374 err = -EBUSY;
375 break;
376 } 395 }
377 } 396 }
378
379 if (err) {
380 break;
381 }
382
383 pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
384 saved_ptr->start, saved_ptr->end);
385 /* No conflict. Go ahead and add this new entry */
386 list_add(&new_entry->nd, &saved_ptr->nd);
387 new_entry = NULL;
388 break; 397 break;
389 } 398 }
390 } 399 }
391 400
392 if (err) { 401 if (err) {
393 printk(KERN_INFO 402 printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
394 "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", 403 "track %s, req %s\n",
395 start, end, cattr_name(new_entry->type), 404 start, end, cattr_name(new->type), cattr_name(req_type));
396 cattr_name(req_type)); 405 kfree(new);
397 kfree(new_entry);
398 spin_unlock(&memtype_lock); 406 spin_unlock(&memtype_lock);
407
399 return err; 408 return err;
400 } 409 }
401 410
402 if (new_entry) { 411 cached_start = start;
403 /* No conflict. Not yet added to the list. Add to the tail */
404 list_add_tail(&new_entry->nd, &memtype_list);
405 pr_debug("New Entry\n");
406 }
407 412
408 if (ret_type) { 413 if (where)
409 pr_debug( 414 list_add(&new->nd, where);
410 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", 415 else
411 start, end, cattr_name(actual_type), 416 list_add_tail(&new->nd, &memtype_list);
412 cattr_name(req_type), cattr_name(*ret_type));
413 } else {
414 pr_debug(
415 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
416 start, end, cattr_name(actual_type),
417 cattr_name(req_type));
418 }
419 417
420 spin_unlock(&memtype_lock); 418 spin_unlock(&memtype_lock);
419
420 dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
421 start, end, cattr_name(new->type), cattr_name(req_type),
422 new_type ? cattr_name(*new_type) : "-");
423
421 return err; 424 return err;
422} 425}
423 426
424int free_memtype(u64 start, u64 end) 427int free_memtype(u64 start, u64 end)
425{ 428{
426 struct memtype *ml; 429 struct memtype *entry;
427 int err = -EINVAL; 430 int err = -EINVAL;
431 int is_range_ram;
428 432
429 /* Only track when pat_wc_enabled */ 433 if (!pat_enabled)
430 if (!pat_wc_enabled) {
431 return 0; 434 return 0;
432 }
433 435
434 /* Low ISA region is always mapped WB. No need to track */ 436 /* Low ISA region is always mapped WB. No need to track */
435 if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) { 437 if (is_ISA_range(start, end - 1))
436 return 0; 438 return 0;
437 } 439
440 is_range_ram = pagerange_is_ram(start, end);
441 if (is_range_ram == 1)
442 return free_ram_pages_type(start, end);
443 else if (is_range_ram < 0)
444 return -EINVAL;
438 445
439 spin_lock(&memtype_lock); 446 spin_lock(&memtype_lock);
440 list_for_each_entry(ml, &memtype_list, nd) { 447 list_for_each_entry(entry, &memtype_list, nd) {
441 if (ml->start == start && ml->end == end) { 448 if (entry->start == start && entry->end == end) {
442 list_del(&ml->nd); 449 if (cached_entry == entry || cached_start == start)
443 kfree(ml); 450 cached_entry = NULL;
451
452 list_del(&entry->nd);
453 kfree(entry);
444 err = 0; 454 err = 0;
445 break; 455 break;
446 } 456 }
@@ -452,27 +462,20 @@ int free_memtype(u64 start, u64 end)
452 current->comm, current->pid, start, end); 462 current->comm, current->pid, start, end);
453 } 463 }
454 464
455 pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end); 465 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
466
456 return err; 467 return err;
457} 468}
458 469
459 470
460/*
461 * /dev/mem mmap interface. The memtype used for mapping varies:
462 * - Use UC for mappings with O_SYNC flag
463 * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
464 * inherit the memtype from existing mapping.
465 * - Else use UC_MINUS memtype (for backward compatibility with existing
466 * X drivers.
467 */
468pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 471pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
469 unsigned long size, pgprot_t vma_prot) 472 unsigned long size, pgprot_t vma_prot)
470{ 473{
471 return vma_prot; 474 return vma_prot;
472} 475}
473 476
474#ifdef CONFIG_NONPROMISC_DEVMEM 477#ifdef CONFIG_STRICT_DEVMEM
475/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/ 478/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
476static inline int range_is_allowed(unsigned long pfn, unsigned long size) 479static inline int range_is_allowed(unsigned long pfn, unsigned long size)
477{ 480{
478 return 1; 481 return 1;
@@ -496,20 +499,20 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
496 } 499 }
497 return 1; 500 return 1;
498} 501}
499#endif /* CONFIG_NONPROMISC_DEVMEM */ 502#endif /* CONFIG_STRICT_DEVMEM */
500 503
501int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, 504int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
502 unsigned long size, pgprot_t *vma_prot) 505 unsigned long size, pgprot_t *vma_prot)
503{ 506{
504 u64 offset = ((u64) pfn) << PAGE_SHIFT; 507 u64 offset = ((u64) pfn) << PAGE_SHIFT;
505 unsigned long flags = _PAGE_CACHE_UC_MINUS; 508 unsigned long flags = -1;
506 int retval; 509 int retval;
507 510
508 if (!range_is_allowed(pfn, size)) 511 if (!range_is_allowed(pfn, size))
509 return 0; 512 return 0;
510 513
511 if (file->f_flags & O_SYNC) { 514 if (file->f_flags & O_SYNC) {
512 flags = _PAGE_CACHE_UC; 515 flags = _PAGE_CACHE_UC_MINUS;
513 } 516 }
514 517
515#ifdef CONFIG_X86_32 518#ifdef CONFIG_X86_32
@@ -521,24 +524,25 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
521 * caching for the high addresses through the KEN pin, but 524 * caching for the high addresses through the KEN pin, but
522 * we maintain the tradition of paranoia in this code. 525 * we maintain the tradition of paranoia in this code.
523 */ 526 */
524 if (!pat_wc_enabled && 527 if (!pat_enabled &&
525 ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || 528 !(boot_cpu_has(X86_FEATURE_MTRR) ||
526 test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || 529 boot_cpu_has(X86_FEATURE_K6_MTRR) ||
527 test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || 530 boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
528 test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) && 531 boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
529 (pfn << PAGE_SHIFT) >= __pa(high_memory)) { 532 (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
530 flags = _PAGE_CACHE_UC; 533 flags = _PAGE_CACHE_UC;
531 } 534 }
532#endif 535#endif
533 536
534 /* 537 /*
535 * With O_SYNC, we can only take UC mapping. Fail if we cannot. 538 * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
539 *
536 * Without O_SYNC, we want to get 540 * Without O_SYNC, we want to get
537 * - WB for WB-able memory and no other conflicting mappings 541 * - WB for WB-able memory and no other conflicting mappings
538 * - UC_MINUS for non-WB-able memory with no other conflicting mappings 542 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
539 * - Inherit from conflicting mappings otherwise 543 * - Inherit from conflicting mappings otherwise
540 */ 544 */
541 if (flags != _PAGE_CACHE_UC_MINUS) { 545 if (flags != -1) {
542 retval = reserve_memtype(offset, offset + size, flags, NULL); 546 retval = reserve_memtype(offset, offset + size, flags, NULL);
543 } else { 547 } else {
544 retval = reserve_memtype(offset, offset + size, -1, &flags); 548 retval = reserve_memtype(offset, offset + size, -1, &flags);
@@ -547,8 +551,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
547 if (retval < 0) 551 if (retval < 0)
548 return 0; 552 return 0;
549 553
550 if (pfn <= max_pfn_mapped && 554 if (((pfn < max_low_pfn_mapped) ||
551 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { 555 (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) &&
556 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
552 free_memtype(offset, offset + size); 557 free_memtype(offset, offset + size);
553 printk(KERN_INFO 558 printk(KERN_INFO
554 "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", 559 "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
@@ -565,9 +570,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
565 570
566void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) 571void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
567{ 572{
573 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
568 u64 addr = (u64)pfn << PAGE_SHIFT; 574 u64 addr = (u64)pfn << PAGE_SHIFT;
569 unsigned long flags; 575 unsigned long flags;
570 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
571 576
572 reserve_memtype(addr, addr + size, want_flags, &flags); 577 reserve_memtype(addr, addr + size, want_flags, &flags);
573 if (flags != want_flags) { 578 if (flags != want_flags) {
@@ -587,3 +592,90 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
587 free_memtype(addr, addr + size); 592 free_memtype(addr, addr + size);
588} 593}
589 594
595#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
596
597/* get Nth element of the linked list */
598static struct memtype *memtype_get_idx(loff_t pos)
599{
600 struct memtype *list_node, *print_entry;
601 int i = 1;
602
603 print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
604 if (!print_entry)
605 return NULL;
606
607 spin_lock(&memtype_lock);
608 list_for_each_entry(list_node, &memtype_list, nd) {
609 if (pos == i) {
610 *print_entry = *list_node;
611 spin_unlock(&memtype_lock);
612 return print_entry;
613 }
614 ++i;
615 }
616 spin_unlock(&memtype_lock);
617 kfree(print_entry);
618
619 return NULL;
620}
621
622static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
623{
624 if (*pos == 0) {
625 ++*pos;
626 seq_printf(seq, "PAT memtype list:\n");
627 }
628
629 return memtype_get_idx(*pos);
630}
631
632static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
633{
634 ++*pos;
635 return memtype_get_idx(*pos);
636}
637
638static void memtype_seq_stop(struct seq_file *seq, void *v)
639{
640}
641
642static int memtype_seq_show(struct seq_file *seq, void *v)
643{
644 struct memtype *print_entry = (struct memtype *)v;
645
646 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
647 print_entry->start, print_entry->end);
648 kfree(print_entry);
649
650 return 0;
651}
652
653static struct seq_operations memtype_seq_ops = {
654 .start = memtype_seq_start,
655 .next = memtype_seq_next,
656 .stop = memtype_seq_stop,
657 .show = memtype_seq_show,
658};
659
660static int memtype_seq_open(struct inode *inode, struct file *file)
661{
662 return seq_open(file, &memtype_seq_ops);
663}
664
665static const struct file_operations memtype_fops = {
666 .open = memtype_seq_open,
667 .read = seq_read,
668 .llseek = seq_lseek,
669 .release = seq_release,
670};
671
672static int __init pat_memtype_list_init(void)
673{
674 debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
675 NULL, &memtype_fops);
676 return 0;
677}
678
679late_initcall(pat_memtype_list_init);
680
681#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
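The reworked reserve_memtype()/free_memtype() pair above is consumed by callers such as ioremap and the /dev/mem paths later in this file, and the entries it tracks can now be inspected through the new pat_memtype_list debugfs file (normally under the x86 directory of a mounted debugfs). A minimal, hypothetical caller is sketched below; only the tracker API and the _PAGE_CACHE_* values come from the patch, the function and its flow are invented for illustration.

/*
 * Hypothetical caller of the memtype tracker reworked above.  Only
 * reserve_memtype()/free_memtype() and the _PAGE_CACHE_* values are
 * from the patch; this function and its flow are invented.
 */
static int example_track_range(u64 start, u64 end)
{
	unsigned long want = _PAGE_CACHE_UC_MINUS;
	unsigned long got;		/* filled in via the new_type argument */
	int err;

	err = reserve_memtype(start, end, want, &got);
	if (err)
		return err;		/* an incompatible reservation already exists */

	/*
	 * 'got' may differ from 'want' (e.g. WB if the MTRRs already make
	 * the range write-back); the mapping must use 'got', not 'want'.
	 */

	/* ... set up and use the mapping with type 'got' here ... */

	free_memtype(start, end);
	return 0;
}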
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
new file mode 100644
index 000000000000..efa1911e20ca
--- /dev/null
+++ b/arch/x86/mm/pf_in.c
@@ -0,0 +1,489 @@
1/*
2 * Fault Injection Test harness (FI)
3 * Copyright (C) Intel Corp.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
18 * USA.
19 *
20 */
21
22/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp
23 * Copyright by Intel Corp., 2002
24 * Louis Zhuang (louis.zhuang@intel.com)
25 *
26 * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007
27 */
28
29#include <linux/module.h>
30#include <linux/ptrace.h> /* struct pt_regs */
31#include "pf_in.h"
32
33#ifdef __i386__
34/* IA32 Manual 3, 2-1 */
35static unsigned char prefix_codes[] = {
36 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
37 0x65, 0x2E, 0x3E, 0x66, 0x67
38};
39/* IA32 Manual 3, 3-432*/
40static unsigned int reg_rop[] = {
41 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
42};
43static unsigned int reg_wop[] = { 0x88, 0x89 };
44static unsigned int imm_wop[] = { 0xC6, 0xC7 };
45/* IA32 Manual 3, 3-432*/
46static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 };
47static unsigned int rw32[] = {
48 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
49};
50static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F };
51static unsigned int mw16[] = { 0xB70F, 0xBF0F };
52static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 };
53static unsigned int mw64[] = {};
54#else /* not __i386__ */
55static unsigned char prefix_codes[] = {
56 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36,
57 0xF0, 0xF3, 0xF2,
58 /* REX Prefixes */
59 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
60 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f
61};
62/* AMD64 Manual 3, Appendix A*/
63static unsigned int reg_rop[] = {
64 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
65};
66static unsigned int reg_wop[] = { 0x88, 0x89 };
67static unsigned int imm_wop[] = { 0xC6, 0xC7 };
68static unsigned int rw8[] = { 0xC6, 0x88, 0x8A };
69static unsigned int rw32[] = {
70 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
71};
72/* 8 bit only */
73static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F };
74/* 16 bit only */
75static unsigned int mw16[] = { 0xB70F, 0xBF0F };
76/* 16 or 32 bit */
77static unsigned int mw32[] = { 0xC7 };
78/* 16, 32 or 64 bit */
79static unsigned int mw64[] = { 0x89, 0x8B };
80#endif /* not __i386__ */
81
82static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged,
83 int *rexr)
84{
85 int i;
86 unsigned char *p = addr;
87 *shorted = 0;
88 *enlarged = 0;
89 *rexr = 0;
90
91restart:
92 for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
93 if (*p == prefix_codes[i]) {
94 if (*p == 0x66)
95 *shorted = 1;
96#ifdef __amd64__
97 if ((*p & 0xf8) == 0x48)
98 *enlarged = 1;
99 if ((*p & 0xf4) == 0x44)
100 *rexr = 1;
101#endif
102 p++;
103 goto restart;
104 }
105 }
106
107 return (p - addr);
108}
109
110static int get_opcode(unsigned char *addr, unsigned int *opcode)
111{
112 int len;
113
114 if (*addr == 0x0F) {
115 /* 0x0F is extension instruction */
116 *opcode = *(unsigned short *)addr;
117 len = 2;
118 } else {
119 *opcode = *addr;
120 len = 1;
121 }
122
123 return len;
124}
125
126#define CHECK_OP_TYPE(opcode, array, type) \
127 for (i = 0; i < ARRAY_SIZE(array); i++) { \
128 if (array[i] == opcode) { \
129 rv = type; \
130 goto exit; \
131 } \
132 }
133
134enum reason_type get_ins_type(unsigned long ins_addr)
135{
136 unsigned int opcode;
137 unsigned char *p;
138 int shorted, enlarged, rexr;
139 int i;
140 enum reason_type rv = OTHERS;
141
142 p = (unsigned char *)ins_addr;
143 p += skip_prefix(p, &shorted, &enlarged, &rexr);
144 p += get_opcode(p, &opcode);
145
146 CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
147 CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE);
148 CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE);
149
150exit:
151 return rv;
152}
153#undef CHECK_OP_TYPE
154
155static unsigned int get_ins_reg_width(unsigned long ins_addr)
156{
157 unsigned int opcode;
158 unsigned char *p;
159 int i, shorted, enlarged, rexr;
160
161 p = (unsigned char *)ins_addr;
162 p += skip_prefix(p, &shorted, &enlarged, &rexr);
163 p += get_opcode(p, &opcode);
164
165 for (i = 0; i < ARRAY_SIZE(rw8); i++)
166 if (rw8[i] == opcode)
167 return 1;
168
169 for (i = 0; i < ARRAY_SIZE(rw32); i++)
170 if (rw32[i] == opcode)
171 return (shorted ? 2 : (enlarged ? 8 : 4));
172
173 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
174 return 0;
175}
176
177unsigned int get_ins_mem_width(unsigned long ins_addr)
178{
179 unsigned int opcode;
180 unsigned char *p;
181 int i, shorted, enlarged, rexr;
182
183 p = (unsigned char *)ins_addr;
184 p += skip_prefix(p, &shorted, &enlarged, &rexr);
185 p += get_opcode(p, &opcode);
186
187 for (i = 0; i < ARRAY_SIZE(mw8); i++)
188 if (mw8[i] == opcode)
189 return 1;
190
191 for (i = 0; i < ARRAY_SIZE(mw16); i++)
192 if (mw16[i] == opcode)
193 return 2;
194
195 for (i = 0; i < ARRAY_SIZE(mw32); i++)
196 if (mw32[i] == opcode)
197 return shorted ? 2 : 4;
198
199 for (i = 0; i < ARRAY_SIZE(mw64); i++)
200 if (mw64[i] == opcode)
201 return shorted ? 2 : (enlarged ? 8 : 4);
202
203 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
204 return 0;
205}
206
207/*
208 * Define register ident in mod/rm byte.
209 * Note: these are NOT the same as in ptrace-abi.h.
210 */
211enum {
212 arg_AL = 0,
213 arg_CL = 1,
214 arg_DL = 2,
215 arg_BL = 3,
216 arg_AH = 4,
217 arg_CH = 5,
218 arg_DH = 6,
219 arg_BH = 7,
220
221 arg_AX = 0,
222 arg_CX = 1,
223 arg_DX = 2,
224 arg_BX = 3,
225 arg_SP = 4,
226 arg_BP = 5,
227 arg_SI = 6,
228 arg_DI = 7,
229#ifdef __amd64__
230 arg_R8 = 8,
231 arg_R9 = 9,
232 arg_R10 = 10,
233 arg_R11 = 11,
234 arg_R12 = 12,
235 arg_R13 = 13,
236 arg_R14 = 14,
237 arg_R15 = 15
238#endif
239};
240
241static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
242{
243 unsigned char *rv = NULL;
244
245 switch (no) {
246 case arg_AL:
247 rv = (unsigned char *)&regs->ax;
248 break;
249 case arg_BL:
250 rv = (unsigned char *)&regs->bx;
251 break;
252 case arg_CL:
253 rv = (unsigned char *)&regs->cx;
254 break;
255 case arg_DL:
256 rv = (unsigned char *)&regs->dx;
257 break;
258 case arg_AH:
259 rv = 1 + (unsigned char *)&regs->ax;
260 break;
261 case arg_BH:
262 rv = 1 + (unsigned char *)&regs->bx;
263 break;
264 case arg_CH:
265 rv = 1 + (unsigned char *)&regs->cx;
266 break;
267 case arg_DH:
268 rv = 1 + (unsigned char *)&regs->dx;
269 break;
270#ifdef __amd64__
271 case arg_R8:
272 rv = (unsigned char *)&regs->r8;
273 break;
274 case arg_R9:
275 rv = (unsigned char *)&regs->r9;
276 break;
277 case arg_R10:
278 rv = (unsigned char *)&regs->r10;
279 break;
280 case arg_R11:
281 rv = (unsigned char *)&regs->r11;
282 break;
283 case arg_R12:
284 rv = (unsigned char *)&regs->r12;
285 break;
286 case arg_R13:
287 rv = (unsigned char *)&regs->r13;
288 break;
289 case arg_R14:
290 rv = (unsigned char *)&regs->r14;
291 break;
292 case arg_R15:
293 rv = (unsigned char *)&regs->r15;
294 break;
295#endif
296 default:
297 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
298 break;
299 }
300 return rv;
301}
302
303static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
304{
305 unsigned long *rv = NULL;
306
307 switch (no) {
308 case arg_AX:
309 rv = &regs->ax;
310 break;
311 case arg_BX:
312 rv = &regs->bx;
313 break;
314 case arg_CX:
315 rv = &regs->cx;
316 break;
317 case arg_DX:
318 rv = &regs->dx;
319 break;
320 case arg_SP:
321 rv = &regs->sp;
322 break;
323 case arg_BP:
324 rv = &regs->bp;
325 break;
326 case arg_SI:
327 rv = &regs->si;
328 break;
329 case arg_DI:
330 rv = &regs->di;
331 break;
332#ifdef __amd64__
333 case arg_R8:
334 rv = &regs->r8;
335 break;
336 case arg_R9:
337 rv = &regs->r9;
338 break;
339 case arg_R10:
340 rv = &regs->r10;
341 break;
342 case arg_R11:
343 rv = &regs->r11;
344 break;
345 case arg_R12:
346 rv = &regs->r12;
347 break;
348 case arg_R13:
349 rv = &regs->r13;
350 break;
351 case arg_R14:
352 rv = &regs->r14;
353 break;
354 case arg_R15:
355 rv = &regs->r15;
356 break;
357#endif
358 default:
359 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
360 }
361
362 return rv;
363}
364
365unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
366{
367 unsigned int opcode;
368 unsigned char mod_rm;
369 int reg;
370 unsigned char *p;
371 int i, shorted, enlarged, rexr;
372 unsigned long rv;
373
374 p = (unsigned char *)ins_addr;
375 p += skip_prefix(p, &shorted, &enlarged, &rexr);
376 p += get_opcode(p, &opcode);
377 for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
378 if (reg_rop[i] == opcode) {
379 rv = REG_READ;
380 goto do_work;
381 }
382
383 for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
384 if (reg_wop[i] == opcode) {
385 rv = REG_WRITE;
386 goto do_work;
387 }
388
389 printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
390 "0x%02x\n", opcode);
391 goto err;
392
393do_work:
394 mod_rm = *p;
395 reg = ((mod_rm >> 3) & 0x7) | (rexr << 3);
396 switch (get_ins_reg_width(ins_addr)) {
397 case 1:
398 return *get_reg_w8(reg, regs);
399
400 case 2:
401 return *(unsigned short *)get_reg_w32(reg, regs);
402
403 case 4:
404 return *(unsigned int *)get_reg_w32(reg, regs);
405
406#ifdef __amd64__
407 case 8:
408 return *(unsigned long *)get_reg_w32(reg, regs);
409#endif
410
411 default:
412 printk(KERN_ERR "mmiotrace: Error width# %d\n", reg);
413 }
414
415err:
416 return 0;
417}
418
419unsigned long get_ins_imm_val(unsigned long ins_addr)
420{
421 unsigned int opcode;
422 unsigned char mod_rm;
423 unsigned char mod;
424 unsigned char *p;
425 int i, shorted, enlarged, rexr;
426 unsigned long rv;
427
428 p = (unsigned char *)ins_addr;
429 p += skip_prefix(p, &shorted, &enlarged, &rexr);
430 p += get_opcode(p, &opcode);
431 for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
432 if (imm_wop[i] == opcode) {
433 rv = IMM_WRITE;
434 goto do_work;
435 }
436
437 printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
438 "0x%02x\n", opcode);
439 goto err;
440
441do_work:
442 mod_rm = *p;
443 mod = mod_rm >> 6;
444 p++;
445 switch (mod) {
446 case 0:
447 /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */
448 /* AMD64: XXX Check for address size prefix? */
449 if ((mod_rm & 0x7) == 0x5)
450 p += 4;
451 break;
452
453 case 1:
454 p += 1;
455 break;
456
457 case 2:
458 p += 4;
459 break;
460
461 case 3:
462 default:
463 printk(KERN_ERR "mmiotrace: not a memory access instruction "
464 "at 0x%lx, rm_mod=0x%02x\n",
465 ins_addr, mod_rm);
466 }
467
468 switch (get_ins_reg_width(ins_addr)) {
469 case 1:
470 return *(unsigned char *)p;
471
472 case 2:
473 return *(unsigned short *)p;
474
475 case 4:
476 return *(unsigned int *)p;
477
478#ifdef __amd64__
479 case 8:
480 return *(unsigned long *)p;
481#endif
482
483 default:
484 printk(KERN_ERR "mmiotrace: Error: width.\n");
485 }
486
487err:
488 return 0;
489}
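The decoder above boils down to three steps: skip legacy/REX prefixes, read a one- or two-byte opcode, then take the register or immediate out of the ModRM byte. The stand-alone user-space sketch below (not part of the patch) walks those steps for one hard-coded instruction to show where the reg field comes from.

/*
 * Stand-alone user-space illustration (not from the patch) of the steps
 * get_ins_reg_val() performs: skip prefixes, read the opcode, then take
 * the register number from the reg field of the ModRM byte.
 */
#include <stdio.h>

int main(void)
{
	/* mov %esi,(%rdi): opcode 0x89, ModRM 0x37 (mod=00 reg=110 rm=111) */
	unsigned char ins[] = { 0x89, 0x37 };
	unsigned char *p = ins;
	unsigned int opcode;
	int reg;

	/* no 0x66/REX/segment prefixes here, so skip_prefix() would skip 0 */
	if (*p == 0x0F) {		/* 0x0F-escaped two-byte opcode */
		opcode = *(unsigned short *)p;
		p += 2;
	} else {
		opcode = *p;
		p += 1;
	}

	reg = (*p >> 3) & 0x7;		/* reg field; REX.R would add bit 3 */

	printf("opcode 0x%02x moves register #%d (SI) to memory\n",
	       opcode, reg);
	return 0;
}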
diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h
new file mode 100644
index 000000000000..e05341a51a27
--- /dev/null
+++ b/arch/x86/mm/pf_in.h
@@ -0,0 +1,39 @@
1/*
2 * Fault Injection Test harness (FI)
3 * Copyright (C) Intel Corp.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
18 * USA.
19 *
20 */
21
22#ifndef __PF_H_
23#define __PF_H_
24
25enum reason_type {
26 NOT_ME, /* page fault is not in regions */
27 NOTHING, /* access other points in regions */
28 REG_READ, /* read from addr to reg */
29 REG_WRITE, /* write from reg to addr */
30 IMM_WRITE, /* write from imm to addr */
31 OTHERS /* other instructions we cannot intercept */
32};
33
34enum reason_type get_ins_type(unsigned long ins_addr);
35unsigned int get_ins_mem_width(unsigned long ins_addr);
36unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs);
37unsigned long get_ins_imm_val(unsigned long ins_addr);
38
39#endif /* __PF_H_ */
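A fault handler that includes pf_in.h is expected to classify the faulting instruction first and only then decide how to obtain the value being transferred. The fragment below is a hedged sketch of such a consumer (the real user, mmio-mod.c, is not shown in this hunk); the handler context and the log messages are invented, only the four pf_in.h calls come from the header.

/*
 * Hedged sketch of a pf_in.h consumer; the handler context and messages
 * are invented, only the four pf_in.h calls are from the header above.
 */
static void example_log_access(unsigned long ip, struct pt_regs *regs)
{
	switch (get_ins_type(ip)) {
	case REG_WRITE:
		/* the written value sits in the CPU register named by ModRM */
		printk(KERN_INFO "write %u bytes, val 0x%lx\n",
		       get_ins_mem_width(ip), get_ins_reg_val(ip, regs));
		break;
	case IMM_WRITE:
		/* the written value is an immediate inside the instruction */
		printk(KERN_INFO "write %u bytes, imm 0x%lx\n",
		       get_ins_mem_width(ip), get_ins_imm_val(ip));
		break;
	case REG_READ:
		/* the read value only exists once the access is re-executed */
		printk(KERN_INFO "read %u bytes\n", get_ins_mem_width(ip));
		break;
	default:
		break;			/* NOT_ME/NOTHING/OTHERS: ignore */
	}
}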
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 50159764f694..86f2ffc43c3d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -2,6 +2,7 @@
2#include <asm/pgalloc.h> 2#include <asm/pgalloc.h>
3#include <asm/pgtable.h> 3#include <asm/pgtable.h>
4#include <asm/tlb.h> 4#include <asm/tlb.h>
5#include <asm/fixmap.h>
5 6
6pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 7pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
7{ 8{
@@ -62,16 +63,8 @@ static inline void pgd_list_del(pgd_t *pgd)
62#define UNSHARED_PTRS_PER_PGD \ 63#define UNSHARED_PTRS_PER_PGD \
63 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) 64 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
64 65
65static void pgd_ctor(void *p) 66static void pgd_ctor(pgd_t *pgd)
66{ 67{
67 pgd_t *pgd = p;
68 unsigned long flags;
69
70 /* Clear usermode parts of PGD */
71 memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
72
73 spin_lock_irqsave(&pgd_lock, flags);
74
75 /* If the pgd points to a shared pagetable level (either the 68 /* If the pgd points to a shared pagetable level (either the
76 ptes in non-PAE, or shared PMD in PAE), then just copy the 69 ptes in non-PAE, or shared PMD in PAE), then just copy the
77 references from swapper_pg_dir. */ 70 references from swapper_pg_dir. */
@@ -90,11 +83,9 @@ static void pgd_ctor(void *p)
90 /* list required to sync kernel mapping updates */ 83 /* list required to sync kernel mapping updates */
91 if (!SHARED_KERNEL_PMD) 84 if (!SHARED_KERNEL_PMD)
92 pgd_list_add(pgd); 85 pgd_list_add(pgd);
93
94 spin_unlock_irqrestore(&pgd_lock, flags);
95} 86}
96 87
97static void pgd_dtor(void *pgd) 88static void pgd_dtor(pgd_t *pgd)
98{ 89{
99 unsigned long flags; /* can be called from interrupt context */ 90 unsigned long flags; /* can be called from interrupt context */
100 91
@@ -119,6 +110,72 @@ static void pgd_dtor(void *pgd)
119 110
120#ifdef CONFIG_X86_PAE 111#ifdef CONFIG_X86_PAE
121/* 112/*
113 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
114 * updating the top-level pagetable entries to guarantee the
115 * processor notices the update. Since this is expensive, and
116 * all 4 top-level entries are used almost immediately in a
117 * new process's life, we just pre-populate them here.
118 *
119 * Also, if we're in a paravirt environment where the kernel pmd is
120 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
121 * and initialize the kernel pmds here.
122 */
123#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
124
125void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
126{
127 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
128
129 /* Note: almost everything apart from _PAGE_PRESENT is
130 reserved at the pmd (PDPT) level. */
131 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
132
133 /*
134 * According to Intel App note "TLBs, Paging-Structure Caches,
135 * and Their Invalidation", April 2007, document 317080-001,
136 * section 8.1: in PAE mode we explicitly have to flush the
137 * TLB via cr3 if the top-level pgd is changed...
138 */
139 if (mm == current->active_mm)
140 write_cr3(read_cr3());
141}
142#else /* !CONFIG_X86_PAE */
143
144/* No need to prepopulate any pagetable entries in non-PAE modes. */
145#define PREALLOCATED_PMDS 0
146
147#endif /* CONFIG_X86_PAE */
148
149static void free_pmds(pmd_t *pmds[])
150{
151 int i;
152
153 for(i = 0; i < PREALLOCATED_PMDS; i++)
154 if (pmds[i])
155 free_page((unsigned long)pmds[i]);
156}
157
158static int preallocate_pmds(pmd_t *pmds[])
159{
160 int i;
161 bool failed = false;
162
163 for(i = 0; i < PREALLOCATED_PMDS; i++) {
164 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
165 if (pmd == NULL)
166 failed = true;
167 pmds[i] = pmd;
168 }
169
170 if (failed) {
171 free_pmds(pmds);
172 return -ENOMEM;
173 }
174
175 return 0;
176}
177
178/*
122 * Mop up any pmd pages which may still be attached to the pgd. 179 * Mop up any pmd pages which may still be attached to the pgd.
123 * Normally they will be freed by munmap/exit_mmap, but any pmd we 180 * Normally they will be freed by munmap/exit_mmap, but any pmd we
124 * preallocate which never got a corresponding vma will need to be 181 * preallocate which never got a corresponding vma will need to be
@@ -128,7 +185,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
128{ 185{
129 int i; 186 int i;
130 187
131 for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { 188 for(i = 0; i < PREALLOCATED_PMDS; i++) {
132 pgd_t pgd = pgdp[i]; 189 pgd_t pgd = pgdp[i];
133 190
134 if (pgd_val(pgd) != 0) { 191 if (pgd_val(pgd) != 0) {
@@ -142,32 +199,20 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
142 } 199 }
143} 200}
144 201
145/* 202static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
146 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
147 * updating the top-level pagetable entries to guarantee the
148 * processor notices the update. Since this is expensive, and
149 * all 4 top-level entries are used almost immediately in a
150 * new process's life, we just pre-populate them here.
151 *
152 * Also, if we're in a paravirt environment where the kernel pmd is
153 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
154 * and initialize the kernel pmds here.
155 */
156static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
157{ 203{
158 pud_t *pud; 204 pud_t *pud;
159 unsigned long addr; 205 unsigned long addr;
160 int i; 206 int i;
161 207
208 if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
209 return;
210
162 pud = pud_offset(pgd, 0); 211 pud = pud_offset(pgd, 0);
163 for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
164 i++, pud++, addr += PUD_SIZE) {
165 pmd_t *pmd = pmd_alloc_one(mm, addr);
166 212
167 if (!pmd) { 213 for (addr = i = 0; i < PREALLOCATED_PMDS;
168 pgd_mop_up_pmds(mm, pgd); 214 i++, pud++, addr += PUD_SIZE) {
169 return 0; 215 pmd_t *pmd = pmds[i];
170 }
171 216
172 if (i >= KERNEL_PGD_BOUNDARY) 217 if (i >= KERNEL_PGD_BOUNDARY)
173 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), 218 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
@@ -175,61 +220,54 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
175 220
176 pud_populate(mm, pud, pmd); 221 pud_populate(mm, pud, pmd);
177 } 222 }
178
179 return 1;
180} 223}
181 224
182void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) 225pgd_t *pgd_alloc(struct mm_struct *mm)
183{ 226{
184 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); 227 pgd_t *pgd;
228 pmd_t *pmds[PREALLOCATED_PMDS];
229 unsigned long flags;
185 230
186 /* Note: almost everything apart from _PAGE_PRESENT is 231 pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
187 reserved at the pmd (PDPT) level. */
188 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
189 232
190 /* 233 if (pgd == NULL)
191 * According to Intel App note "TLBs, Paging-Structure Caches, 234 goto out;
192 * and Their Invalidation", April 2007, document 317080-001,
193 * section 8.1: in PAE mode we explicitly have to flush the
194 * TLB via cr3 if the top-level pgd is changed...
195 */
196 if (mm == current->active_mm)
197 write_cr3(read_cr3());
198}
199#else /* !CONFIG_X86_PAE */
200/* No need to prepopulate any pagetable entries in non-PAE modes. */
201static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
202{
203 return 1;
204}
205 235
206static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd) 236 mm->pgd = pgd;
207{
208}
209#endif /* CONFIG_X86_PAE */
210 237
211pgd_t *pgd_alloc(struct mm_struct *mm) 238 if (preallocate_pmds(pmds) != 0)
212{ 239 goto out_free_pgd;
213 pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
214 240
215 /* so that alloc_pmd can use it */ 241 if (paravirt_pgd_alloc(mm) != 0)
216 mm->pgd = pgd; 242 goto out_free_pmds;
217 if (pgd)
218 pgd_ctor(pgd);
219 243
220 if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { 244 /*
221 pgd_dtor(pgd); 245 * Make sure that pre-populating the pmds is atomic with
222 free_page((unsigned long)pgd); 246 * respect to anything walking the pgd_list, so that they
223 pgd = NULL; 247 * never see a partially populated pgd.
224 } 248 */
249 spin_lock_irqsave(&pgd_lock, flags);
250
251 pgd_ctor(pgd);
252 pgd_prepopulate_pmd(mm, pgd, pmds);
253
254 spin_unlock_irqrestore(&pgd_lock, flags);
225 255
226 return pgd; 256 return pgd;
257
258out_free_pmds:
259 free_pmds(pmds);
260out_free_pgd:
261 free_page((unsigned long)pgd);
262out:
263 return NULL;
227} 264}
228 265
229void pgd_free(struct mm_struct *mm, pgd_t *pgd) 266void pgd_free(struct mm_struct *mm, pgd_t *pgd)
230{ 267{
231 pgd_mop_up_pmds(mm, pgd); 268 pgd_mop_up_pmds(mm, pgd);
232 pgd_dtor(pgd); 269 pgd_dtor(pgd);
270 paravirt_pgd_free(mm, pgd);
233 free_page((unsigned long)pgd); 271 free_page((unsigned long)pgd);
234} 272}
235 273
@@ -255,7 +293,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
255 293
256 if (pte_young(*ptep)) 294 if (pte_young(*ptep))
257 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, 295 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
258 &ptep->pte); 296 (unsigned long *) &ptep->pte);
259 297
260 if (ret) 298 if (ret)
261 pte_update(vma->vm_mm, addr, ptep); 299 pte_update(vma->vm_mm, addr, ptep);
@@ -274,3 +312,22 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
274 312
275 return young; 313 return young;
276} 314}
315
316int fixmaps_set;
317
318void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
319{
320 unsigned long address = __fix_to_virt(idx);
321
322 if (idx >= __end_of_fixed_addresses) {
323 BUG();
324 return;
325 }
326 set_pte_vaddr(address, pte);
327 fixmaps_set++;
328}
329
330void native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
331{
332 __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
333}
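The rewritten pgd_alloc() above follows an allocate-then-commit shape: every allocation that can fail happens before pgd_lock is taken, the pgd is published in one short locked section, and failures unwind through reverse-order goto labels. The user-space toy below illustrates only that control-flow shape; none of it is kernel code.

/*
 * User-space toy (not kernel code) showing the allocate-then-commit
 * shape pgd_alloc() now has: allocate everything that can fail first,
 * publish under the lock, unwind with gotos in reverse order on error.
 */
#include <stdlib.h>

struct pair { void *a; void *b; };

static struct pair *example_alloc(void)
{
	struct pair *p = malloc(sizeof(*p));

	if (!p)
		goto out;
	p->a = malloc(64);
	if (!p->a)
		goto out_free_p;
	p->b = malloc(64);
	if (!p->b)
		goto out_free_a;

	/* lock(); make the fully built pair visible to others; unlock(); */
	return p;

out_free_a:
	free(p->a);
out_free_p:
	free(p);
out:
	return NULL;
}

int main(void)
{
	struct pair *p = example_alloc();

	if (p) {
		free(p->a);
		free(p->b);
		free(p);
	}
	return 0;
}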
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 369cf065b6a4..0951db9ee519 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,58 +20,11 @@
20#include <asm/tlb.h> 20#include <asm/tlb.h>
21#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
22 22
23void show_mem(void)
24{
25 int total = 0, reserved = 0;
26 int shared = 0, cached = 0;
27 int highmem = 0;
28 struct page *page;
29 pg_data_t *pgdat;
30 unsigned long i;
31 unsigned long flags;
32
33 printk(KERN_INFO "Mem-info:\n");
34 show_free_areas();
35 for_each_online_pgdat(pgdat) {
36 pgdat_resize_lock(pgdat, &flags);
37 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
38 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
39 touch_nmi_watchdog();
40 page = pgdat_page_nr(pgdat, i);
41 total++;
42 if (PageHighMem(page))
43 highmem++;
44 if (PageReserved(page))
45 reserved++;
46 else if (PageSwapCache(page))
47 cached++;
48 else if (page_count(page))
49 shared += page_count(page) - 1;
50 }
51 pgdat_resize_unlock(pgdat, &flags);
52 }
53 printk(KERN_INFO "%d pages of RAM\n", total);
54 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
55 printk(KERN_INFO "%d reserved pages\n", reserved);
56 printk(KERN_INFO "%d pages shared\n", shared);
57 printk(KERN_INFO "%d pages swap cached\n", cached);
58
59 printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
60 printk(KERN_INFO "%lu pages writeback\n",
61 global_page_state(NR_WRITEBACK));
62 printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
63 printk(KERN_INFO "%lu pages slab\n",
64 global_page_state(NR_SLAB_RECLAIMABLE) +
65 global_page_state(NR_SLAB_UNRECLAIMABLE));
66 printk(KERN_INFO "%lu pages pagetables\n",
67 global_page_state(NR_PAGETABLE));
68}
69
70/* 23/*
71 * Associate a virtual page frame with a given physical page frame 24 * Associate a virtual page frame with a given physical page frame
72 * and protection flags for that frame. 25 * and protection flags for that frame.
73 */ 26 */
74static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) 27void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
75{ 28{
76 pgd_t *pgd; 29 pgd_t *pgd;
77 pud_t *pud; 30 pud_t *pud;
@@ -94,8 +47,8 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
94 return; 47 return;
95 } 48 }
96 pte = pte_offset_kernel(pmd, vaddr); 49 pte = pte_offset_kernel(pmd, vaddr);
97 if (pgprot_val(flags)) 50 if (pte_val(pteval))
98 set_pte_present(&init_mm, vaddr, pte, pfn_pte(pfn, flags)); 51 set_pte_present(&init_mm, vaddr, pte, pteval);
99 else 52 else
100 pte_clear(&init_mm, vaddr, pte); 53 pte_clear(&init_mm, vaddr, pte);
101 54
@@ -141,22 +94,9 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
141 __flush_tlb_one(vaddr); 94 __flush_tlb_one(vaddr);
142} 95}
143 96
144static int fixmaps;
145unsigned long __FIXADDR_TOP = 0xfffff000; 97unsigned long __FIXADDR_TOP = 0xfffff000;
146EXPORT_SYMBOL(__FIXADDR_TOP); 98EXPORT_SYMBOL(__FIXADDR_TOP);
147 99
148void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
149{
150 unsigned long address = __fix_to_virt(idx);
151
152 if (idx >= __end_of_fixed_addresses) {
153 BUG();
154 return;
155 }
156 set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
157 fixmaps++;
158}
159
160/** 100/**
161 * reserve_top_address - reserves a hole in the top of kernel address space 101 * reserve_top_address - reserves a hole in the top of kernel address space
162 * @reserve - size of hole to reserve 102 * @reserve - size of hole to reserve
@@ -164,11 +104,45 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
164 * Can be used to relocate the fixmap area and poke a hole in the top 104 * Can be used to relocate the fixmap area and poke a hole in the top
165 * of kernel address space to make room for a hypervisor. 105 * of kernel address space to make room for a hypervisor.
166 */ 106 */
167void reserve_top_address(unsigned long reserve) 107void __init reserve_top_address(unsigned long reserve)
168{ 108{
169 BUG_ON(fixmaps > 0); 109 BUG_ON(fixmaps_set > 0);
170 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", 110 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
171 (int)-reserve); 111 (int)-reserve);
172 __FIXADDR_TOP = -reserve - PAGE_SIZE; 112 __FIXADDR_TOP = -reserve - PAGE_SIZE;
173 __VMALLOC_RESERVE += reserve; 113 __VMALLOC_RESERVE += reserve;
174} 114}
115
116/*
117 * vmalloc=size forces the vmalloc area to be exactly 'size'
118 * bytes. This can be used to increase (or decrease) the
119 * vmalloc area - the default is 128m.
120 */
121static int __init parse_vmalloc(char *arg)
122{
123 if (!arg)
124 return -EINVAL;
125
126 /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/
127 __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET;
128 return 0;
129}
130early_param("vmalloc", parse_vmalloc);
131
132/*
133 * reservetop=size reserves a hole at the top of the kernel address space which
134 * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
135 * so relocating the fixmap can be done before paging initialization.
136 */
137static int __init parse_reservetop(char *arg)
138{
139 unsigned long address;
140
141 if (!arg)
142 return -EINVAL;
143
144 address = memparse(arg, &arg);
145 reserve_top_address(address);
146 return 0;
147}
148early_param("reservetop", parse_reservetop);
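Both new early parameters lean on memparse() to accept k/m/g suffixes, so vmalloc=128m ends up as 128 << 20 bytes plus VMALLOC_OFFSET. The user-space sketch below approximates that suffix handling for illustration; it is not the kernel's memparse() implementation.

/*
 * User-space approximation (not the kernel's memparse()) of the k/m/g
 * suffix handling behind "vmalloc=" and "reservetop=".
 */
#include <stdio.h>
#include <stdlib.h>

static unsigned long parse_size(const char *s)
{
	char *end;
	unsigned long val = strtoul(s, &end, 0);

	switch (*end) {
	case 'g': case 'G':
		val <<= 10;		/* fall through */
	case 'm': case 'M':
		val <<= 10;		/* fall through */
	case 'k': case 'K':
		val <<= 10;
	}
	return val;
}

int main(void)
{
	/* "vmalloc=128m" ends up as 128 << 20 bytes (plus VMALLOC_OFFSET) */
	printf("%lu\n", parse_size("128m"));
	return 0;
}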
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
new file mode 100644
index 000000000000..16ae70fc57e7
--- /dev/null
+++ b/arch/x86/mm/srat_32.c
@@ -0,0 +1,283 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/mmzone.h>
29#include <linux/acpi.h>
30#include <linux/nodemask.h>
31#include <asm/srat.h>
32#include <asm/topology.h>
33#include <asm/smp.h>
34#include <asm/e820.h>
35
36/*
37 * proximity macros and definitions
38 */
39#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
40#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
41#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
42#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
43/* bitmap length; _PXM is at most 255 */
44#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
45static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
46
47#define MAX_CHUNKS_PER_NODE 3
48#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
49struct node_memory_chunk_s {
50 unsigned long start_pfn;
51 unsigned long end_pfn;
52 u8 pxm; // proximity domain of node
53 u8 nid; // which cnode contains this chunk?
54 u8 bank; // which mem bank on this node
55};
56static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
57
58static int __initdata num_memory_chunks; /* total number of memory chunks */
59static u8 __initdata apicid_to_pxm[MAX_APICID];
60
61int numa_off __initdata;
62int acpi_numa __initdata;
63
64static __init void bad_srat(void)
65{
66 printk(KERN_ERR "SRAT: SRAT not used.\n");
67 acpi_numa = -1;
68 num_memory_chunks = 0;
69}
70
71static __init inline int srat_disabled(void)
72{
73 return numa_off || acpi_numa < 0;
74}
75
76/* Identify CPU proximity domains */
77void __init
78acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
79{
80 if (srat_disabled())
81 return;
82 if (cpu_affinity->header.length !=
83 sizeof(struct acpi_srat_cpu_affinity)) {
84 bad_srat();
85 return;
86 }
87
88 if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
89 return; /* empty entry */
90
91 /* mark this node as "seen" in node bitmap */
92 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
93
94 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
95
96 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
97 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
98}
99
100/*
101 * Identify memory proximity domains and hot-remove capabilities.
102 * Fill node memory chunk list structure.
103 */
104void __init
105acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
106{
107 unsigned long long paddr, size;
108 unsigned long start_pfn, end_pfn;
109 u8 pxm;
110 struct node_memory_chunk_s *p, *q, *pend;
111
112 if (srat_disabled())
113 return;
114 if (memory_affinity->header.length !=
115 sizeof(struct acpi_srat_mem_affinity)) {
116 bad_srat();
117 return;
118 }
119
120 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
121 return; /* empty entry */
122
123 pxm = memory_affinity->proximity_domain & 0xff;
124
125 /* mark this node as "seen" in node bitmap */
126 BMAP_SET(pxm_bitmap, pxm);
127
128 /* calculate info for memory chunk structure */
129 paddr = memory_affinity->base_address;
130 size = memory_affinity->length;
131
132 start_pfn = paddr >> PAGE_SHIFT;
133 end_pfn = (paddr + size) >> PAGE_SHIFT;
134
135
136 if (num_memory_chunks >= MAXCHUNKS) {
137 printk(KERN_WARNING "Too many mem chunks in SRAT."
138 " Ignoring %lld MBytes at %llx\n",
139 size/(1024*1024), paddr);
140 return;
141 }
142
143 /* Insertion sort based on base address */
144 pend = &node_memory_chunk[num_memory_chunks];
145 for (p = &node_memory_chunk[0]; p < pend; p++) {
146 if (start_pfn < p->start_pfn)
147 break;
148 }
149 if (p < pend) {
150 for (q = pend; q >= p; q--)
151 *(q + 1) = *q;
152 }
153 p->start_pfn = start_pfn;
154 p->end_pfn = end_pfn;
155 p->pxm = pxm;
156
157 num_memory_chunks++;
158
159 printk(KERN_DEBUG "Memory range %08lx to %08lx"
160 " in proximity domain %02x %s\n",
161 start_pfn, end_pfn,
162 pxm,
163 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
164 "enabled and removable" : "enabled" ) );
165}
166
167/* Callback for SLIT parsing */
168void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
169{
170}
171
172void acpi_numa_arch_fixup(void)
173{
174}
175/*
176 * The SRAT table always lists ascending addresses, so can always
177 * assume that the first "start" address that you see is the real
178 * start of the node, and that the current "end" address is after
179 * the previous one.
180 */
181static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
182{
183 /*
184 * Only add present memory as told by the e820.
185 * There is no guarantee from the SRAT that the memory it
186 * enumerates is present at boot time because it represents
187 * *possible* memory hotplug areas the same as normal RAM.
188 */
189 if (memory_chunk->start_pfn >= max_pfn) {
190 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
191 memory_chunk->start_pfn, memory_chunk->end_pfn);
192 return -1;
193 }
194 if (memory_chunk->nid != nid)
195 return -1;
196
197 if (!node_has_online_mem(nid))
198 node_start_pfn[nid] = memory_chunk->start_pfn;
199
200 if (node_start_pfn[nid] > memory_chunk->start_pfn)
201 node_start_pfn[nid] = memory_chunk->start_pfn;
202
203 if (node_end_pfn[nid] < memory_chunk->end_pfn)
204 node_end_pfn[nid] = memory_chunk->end_pfn;
205
206 return 0;
207}
208
209int __init get_memcfg_from_srat(void)
210{
211 int i, j, nid;
212
213
214 if (srat_disabled())
215 goto out_fail;
216
217 if (num_memory_chunks == 0) {
218 printk(KERN_WARNING
219 "could not finy any ACPI SRAT memory areas.\n");
220 goto out_fail;
221 }
222
223 /* Calculate total number of nodes in system from PXM bitmap and create
224 * a set of sequential node IDs starting at zero. (ACPI doesn't seem
225 * to specify the range of _PXM values.)
226 */
227 /*
228 * MCD - we no longer HAVE to number nodes sequentially. PXM domain
229 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
230 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
231 * approaches MAX_PXM_DOMAINS for i386.
232 */
233 nodes_clear(node_online_map);
234 for (i = 0; i < MAX_PXM_DOMAINS; i++) {
235 if (BMAP_TEST(pxm_bitmap, i)) {
236 int nid = acpi_map_pxm_to_node(i);
237 node_set_online(nid);
238 }
239 }
240 BUG_ON(num_online_nodes() == 0);
241
242 /* set cnode id in memory chunk structure */
243 for (i = 0; i < num_memory_chunks; i++)
244 node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
245
246 printk(KERN_DEBUG "pxm bitmap: ");
247 for (i = 0; i < sizeof(pxm_bitmap); i++) {
248 printk(KERN_CONT "%02x ", pxm_bitmap[i]);
249 }
250 printk(KERN_CONT "\n");
251 printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
252 num_online_nodes());
253 printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
254 num_memory_chunks);
255
256 for (i = 0; i < MAX_APICID; i++)
257 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
258
259 for (j = 0; j < num_memory_chunks; j++){
260 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
261 printk(KERN_DEBUG
262 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
263 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
264 if (node_read_chunk(chunk->nid, chunk))
265 continue;
266
267 e820_register_active_regions(chunk->nid, chunk->start_pfn,
268 min(chunk->end_pfn, max_pfn));
269 }
270
271 for_each_online_node(nid) {
272 unsigned long start = node_start_pfn[nid];
273 unsigned long end = min(node_end_pfn[nid], max_pfn);
274
275 memory_present(nid, start, end);
276 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
277 }
278 return 1;
279out_fail:
280 printk(KERN_ERR "failed to get NUMA memory information from SRAT"
281 " table\n");
282 return 0;
283}
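The SRAT parser above records which proximity domains it has seen in a packed bitmap, one bit per _PXM value. The small stand-alone program below reuses the same BMAP_SET()/BMAP_TEST() macros to show how that bitmap behaves; the surrounding main() is purely illustrative.

/*
 * Stand-alone demo (not kernel code) of the proximity-domain bitmap
 * used by the 32-bit SRAT parser: one bit per _PXM, eight to a byte.
 */
#include <stdio.h>

#define NODE_ARRAY_INDEX(x)	((x) / 8)	/* 8 bits/char */
#define NODE_ARRAY_OFFSET(x)	((x) % 8)	/* 8 bits/char */
#define BMAP_SET(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
#define BMAP_TEST(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))

int main(void)
{
	unsigned char pxm_bitmap[256 / 8] = { 0 };	/* MAX_PXM_DOMAINS is 256 */

	BMAP_SET(pxm_bitmap, 0);	/* SRAT mentioned proximity domain 0 */
	BMAP_SET(pxm_bitmap, 17);	/* ... and proximity domain 17 */

	printf("pxm 17 seen: %d, pxm 3 seen: %d\n",
	       !!BMAP_TEST(pxm_bitmap, 17), !!BMAP_TEST(pxm_bitmap, 3));
	return 0;
}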
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 99649dccad28..1b4763e26ea9 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -100,7 +100,19 @@ static __init inline int srat_disabled(void)
100/* Callback for SLIT parsing */ 100/* Callback for SLIT parsing */
101void __init acpi_numa_slit_init(struct acpi_table_slit *slit) 101void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
102{ 102{
103 acpi_slit = slit; 103 unsigned length;
104 unsigned long phys;
105
106 length = slit->header.length;
107 phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
108 PAGE_SIZE);
109
110 if (phys == -1L)
111 panic("Cannot save SLIT!\n");
112
113 acpi_slit = __va(phys);
114 memcpy(acpi_slit, slit, length);
115 reserve_early(phys, phys + length, "ACPI SLIT");
104} 116}
105 117
106/* Callback for Proximity Domain -> LAPIC mapping */ 118/* Callback for Proximity Domain -> LAPIC mapping */
@@ -299,7 +311,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
299 pxmram = 0; 311 pxmram = 0;
300 } 312 }
301 313
302 e820ram = end_pfn - absent_pages_in_range(0, end_pfn); 314 e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
303 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ 315 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
304 if ((long)(e820ram - pxmram) >= 1*1024*1024) { 316 if ((long)(e820ram - pxmram) >= 1*1024*1024) {
305 printk(KERN_ERR 317 printk(KERN_ERR
@@ -376,7 +388,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
376 if (node == NUMA_NO_NODE) 388 if (node == NUMA_NO_NODE)
377 continue; 389 continue;
378 if (!node_isset(node, node_possible_map)) 390 if (!node_isset(node, node_possible_map))
379 numa_set_node(i, NUMA_NO_NODE); 391 numa_clear_node(i);
380 } 392 }
381 numa_init_array(); 393 numa_init_array();
382 return 0; 394 return 0;
@@ -495,6 +507,7 @@ int __node_distance(int a, int b)
495 507
496EXPORT_SYMBOL(__node_distance); 508EXPORT_SYMBOL(__node_distance);
497 509
510#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
498int memory_add_physaddr_to_nid(u64 start) 511int memory_add_physaddr_to_nid(u64 start)
499{ 512{
500 int i, ret = 0; 513 int i, ret = 0;
@@ -506,4 +519,4 @@ int memory_add_physaddr_to_nid(u64 start)
506 return ret; 519 return ret;
507} 520}
508EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 521EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
509 522#endif
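The reason the SLIT is copied into memory reserved via find_e820_area()/reserve_early() is that the firmware table mapping is only valid during early boot, while __node_distance() (exported above) keeps reading the saved copy for the lifetime of the system. The fragment below is an invented example of such a late consumer; only __node_distance() and for_each_online_node() are existing interfaces, the rest is a sketch.

/*
 * Invented late consumer of the saved SLIT; only __node_distance() and
 * for_each_online_node() are existing interfaces, the rest is a sketch.
 */
static int example_nearest_other_node(int from)
{
	int nid, best = from, best_dist = INT_MAX;

	for_each_online_node(nid) {
		int d = __node_distance(from, nid);

		if (nid != from && d < best_dist) {
			best_dist = d;
			best = nid;
		}
	}
	return best;
}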
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
new file mode 100644
index 000000000000..d877c5b423ef
--- /dev/null
+++ b/arch/x86/mm/testmmiotrace.c
@@ -0,0 +1,71 @@
1/*
2 * Written by Pekka Paalanen, 2008 <pq@iki.fi>
3 */
4#include <linux/module.h>
5#include <linux/io.h>
6
7#define MODULE_NAME "testmmiotrace"
8
9static unsigned long mmio_address;
10module_param(mmio_address, ulong, 0);
11MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
12
13static void do_write_test(void __iomem *p)
14{
15 unsigned int i;
16 for (i = 0; i < 256; i++)
17 iowrite8(i, p + i);
18 for (i = 1024; i < (5 * 1024); i += 2)
19 iowrite16(i * 12 + 7, p + i);
20 for (i = (5 * 1024); i < (16 * 1024); i += 4)
21 iowrite32(i * 212371 + 13, p + i);
22}
23
24static void do_read_test(void __iomem *p)
25{
26 unsigned int i;
27 for (i = 0; i < 256; i++)
28 ioread8(p + i);
29 for (i = 1024; i < (5 * 1024); i += 2)
30 ioread16(p + i);
31 for (i = (5 * 1024); i < (16 * 1024); i += 4)
32 ioread32(p + i);
33}
34
35static void do_test(void)
36{
37 void __iomem *p = ioremap_nocache(mmio_address, 0x4000);
38 if (!p) {
39 pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
40 return;
41 }
42 do_write_test(p);
43 do_read_test(p);
44 iounmap(p);
45}
46
47static int __init init(void)
48{
49 if (mmio_address == 0) {
50 pr_err(MODULE_NAME ": you have to use the module argument "
51 "mmio_address.\n");
52 pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
53 " YOU REALLY KNOW WHAT YOU ARE DOING!\n");
54 return -ENXIO;
55 }
56
57 pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx "
58 "in PCI address space, and writing "
59 "rubbish in there.\n", mmio_address);
60 do_test();
61 return 0;
62}
63
64static void __exit cleanup(void)
65{
66 pr_debug(MODULE_NAME ": unloaded.\n");
67}
68
69module_init(init);
70module_exit(cleanup);
71MODULE_LICENSE("GPL");
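For reference, the test module above is exercised by loading it with an explicit, otherwise-unused MMIO window, e.g. insmod testmmiotrace.ko mmio_address=0xfbc00000 (the address is purely a placeholder), after mmiotrace itself has been enabled in the kernel's tracing framework; the exact enable step depends on the tracing setup of the running kernel. Since do_test() really writes into that physical window, the module's own warning about using it only on expendable hardware should be taken literally.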