path: root/arch/x86/mm
author     Linus Torvalds <torvalds@linux-foundation.org>  2008-02-04 12:16:03 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2008-02-04 12:16:03 -0500
commit     d2fc0bacd5c438cb459fdf531eff00ab18422a00 (patch)
tree       d0ea52e4d2ad2fac12e19eaf6891c6af98353cfc /arch/x86/mm
parent     93890b71a34f9490673a6edd56b61c2124215e46 (diff)
parent     795d45b22c079946332bf3825afefe5a981a97b6 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86
* git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86: (78 commits)
      x86: fix RTC lockdep warning: potential hardirq recursion
      x86: cpa, micro-optimization
      x86: cpa, clean up code flow
      x86: cpa, eliminate CPA_ enum
      x86: cpa, cleanups
      x86: implement gbpages support in change_page_attr()
      x86: support gbpages in pagetable dump
      x86: add gbpages support to lookup_address
      x86: add pgtable accessor functions for gbpages
      x86: add PUD_PAGE_SIZE
      x86: add feature macros for the gbpages cpuid bit
      x86: switch direct mapping setup over to set_pte
      x86: fix page-present check in cpa_flush_range
      x86: remove cpa warning
      x86: remove now unused clear_kernel_mapping
      x86: switch pci-gart over to using set_memory_np() instead of clear_kernel_mapping()
      x86: cpa selftest, skip non present entries
      x86: CPA fix pagetable split
      x86: rename LARGE_PAGE_SIZE to PMD_PAGE_SIZE
      x86: cpa, fix lookup_address
      ...
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--   arch/x86/mm/fault.c          34
-rw-r--r--   arch/x86/mm/init_32.c         6
-rw-r--r--   arch/x86/mm/init_64.c        49
-rw-r--r--   arch/x86/mm/ioremap.c        41
-rw-r--r--   arch/x86/mm/numa_64.c         7
-rw-r--r--   arch/x86/mm/pageattr-test.c   3
-rw-r--r--   arch/x86/mm/pageattr.c      400
-rw-r--r--   arch/x86/mm/pgtable_32.c     61
8 files changed, 372 insertions, 229 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e4440d0abf8..ad8b9733d6b 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -240,7 +240,8 @@ void dump_pagetable(unsigned long address)
240 pud = pud_offset(pgd, address); 240 pud = pud_offset(pgd, address);
241 if (bad_address(pud)) goto bad; 241 if (bad_address(pud)) goto bad;
242 printk("PUD %lx ", pud_val(*pud)); 242 printk("PUD %lx ", pud_val(*pud));
243 if (!pud_present(*pud)) goto ret; 243 if (!pud_present(*pud) || pud_large(*pud))
244 goto ret;
244 245
245 pmd = pmd_offset(pud, address); 246 pmd = pmd_offset(pud, address);
246 if (bad_address(pmd)) goto bad; 247 if (bad_address(pmd)) goto bad;
@@ -508,6 +509,10 @@ static int vmalloc_fault(unsigned long address)
508 pmd_t *pmd, *pmd_ref; 509 pmd_t *pmd, *pmd_ref;
509 pte_t *pte, *pte_ref; 510 pte_t *pte, *pte_ref;
510 511
512 /* Make sure we are in vmalloc area */
513 if (!(address >= VMALLOC_START && address < VMALLOC_END))
514 return -1;
515
511 /* Copy kernel mappings over when needed. This can also 516 /* Copy kernel mappings over when needed. This can also
512 happen within a race in page table update. In the later 517 happen within a race in page table update. In the later
513 case just flush. */ 518 case just flush. */
@@ -603,6 +608,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
603 */ 608 */
604#ifdef CONFIG_X86_32 609#ifdef CONFIG_X86_32
605 if (unlikely(address >= TASK_SIZE)) { 610 if (unlikely(address >= TASK_SIZE)) {
611#else
612 if (unlikely(address >= TASK_SIZE64)) {
613#endif
606 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && 614 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
607 vmalloc_fault(address) >= 0) 615 vmalloc_fault(address) >= 0)
608 return; 616 return;
@@ -618,6 +626,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
618 goto bad_area_nosemaphore; 626 goto bad_area_nosemaphore;
619 } 627 }
620 628
629
630#ifdef CONFIG_X86_32
621 /* It's safe to allow irq's after cr2 has been saved and the vmalloc 631 /* It's safe to allow irq's after cr2 has been saved and the vmalloc
622 fault has been handled. */ 632 fault has been handled. */
623 if (regs->flags & (X86_EFLAGS_IF|VM_MASK)) 633 if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
@@ -630,28 +640,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
630 if (in_atomic() || !mm) 640 if (in_atomic() || !mm)
631 goto bad_area_nosemaphore; 641 goto bad_area_nosemaphore;
632#else /* CONFIG_X86_64 */ 642#else /* CONFIG_X86_64 */
633 if (unlikely(address >= TASK_SIZE64)) {
634 /*
635 * Don't check for the module range here: its PML4
636 * is always initialized because it's shared with the main
637 * kernel text. Only vmalloc may need PML4 syncups.
638 */
639 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
640 ((address >= VMALLOC_START && address < VMALLOC_END))) {
641 if (vmalloc_fault(address) >= 0)
642 return;
643 }
644
645 /* Can handle a stale RO->RW TLB */
646 if (spurious_fault(address, error_code))
647 return;
648
649 /*
650 * Don't take the mm semaphore here. If we fixup a prefetch
651 * fault we could otherwise deadlock.
652 */
653 goto bad_area_nosemaphore;
654 }
655 if (likely(regs->flags & X86_EFLAGS_IF)) 643 if (likely(regs->flags & X86_EFLAGS_IF))
656 local_irq_enable(); 644 local_irq_enable();
657 645
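Taken together, the fault.c hunks above fold the 64-bit kernel-address fast path into the
shared TASK_SIZE check, and vmalloc_fault() now verifies the vmalloc range itself. A
simplified reconstruction of the resulting flow (pieced together from the hunks, not
copied verbatim from fault.c):

#ifdef CONFIG_X86_32
	if (unlikely(address >= TASK_SIZE)) {
#else
	if (unlikely(address >= TASK_SIZE64)) {
#endif
		if (!(error_code & (PF_RSVD | PF_USER | PF_PROT)) &&
		    vmalloc_fault(address) >= 0)
			return;

		/*
		 * ... spurious-fault check and the bad_area_nosemaphore
		 * exit follow here, now shared by 32-bit and 64-bit ...
		 */
		goto bad_area_nosemaphore;
	}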
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index f2f36f8dae5..d1bc04006d1 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -31,6 +31,7 @@
31#include <linux/initrd.h> 31#include <linux/initrd.h>
32#include <linux/cpumask.h> 32#include <linux/cpumask.h>
33 33
34#include <asm/asm.h>
34#include <asm/processor.h> 35#include <asm/processor.h>
35#include <asm/system.h> 36#include <asm/system.h>
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
@@ -718,10 +719,7 @@ static noinline int do_test_wp_bit(void)
718 "1: movb %1, %0 \n" 719 "1: movb %1, %0 \n"
719 " xorl %2, %2 \n" 720 " xorl %2, %2 \n"
720 "2: \n" 721 "2: \n"
721 ".section __ex_table, \"a\"\n" 722 _ASM_EXTABLE(1b,2b)
722 " .align 4 \n"
723 " .long 1b, 2b \n"
724 ".previous \n"
725 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), 723 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
726 "=q" (tmp_reg), 724 "=q" (tmp_reg),
727 "=r" (flag) 725 "=r" (flag)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index eabcaed76c2..3a98d6f724a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -273,7 +273,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
273 int i = pmd_index(address); 273 int i = pmd_index(address);
274 274
275 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { 275 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
276 unsigned long entry;
277 pmd_t *pmd = pmd_page + pmd_index(address); 276 pmd_t *pmd = pmd_page + pmd_index(address);
278 277
279 if (address >= end) { 278 if (address >= end) {
@@ -287,9 +286,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
287 if (pmd_val(*pmd)) 286 if (pmd_val(*pmd))
288 continue; 287 continue;
289 288
290 entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address; 289 set_pte((pte_t *)pmd,
291 entry &= __supported_pte_mask; 290 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
292 set_pmd(pmd, __pmd(entry));
293 } 291 }
294} 292}
295 293
@@ -435,49 +433,6 @@ void __init paging_init(void)
435#endif 433#endif
436 434
437/* 435/*
438 * Unmap a kernel mapping if it exists. This is useful to avoid
439 * prefetches from the CPU leading to inconsistent cache lines.
440 * address and size must be aligned to 2MB boundaries.
441 * Does nothing when the mapping doesn't exist.
442 */
443void __init clear_kernel_mapping(unsigned long address, unsigned long size)
444{
445 unsigned long end = address + size;
446
447 BUG_ON(address & ~LARGE_PAGE_MASK);
448 BUG_ON(size & ~LARGE_PAGE_MASK);
449
450 for (; address < end; address += LARGE_PAGE_SIZE) {
451 pgd_t *pgd = pgd_offset_k(address);
452 pud_t *pud;
453 pmd_t *pmd;
454
455 if (pgd_none(*pgd))
456 continue;
457
458 pud = pud_offset(pgd, address);
459 if (pud_none(*pud))
460 continue;
461
462 pmd = pmd_offset(pud, address);
463 if (!pmd || pmd_none(*pmd))
464 continue;
465
466 if (!(pmd_val(*pmd) & _PAGE_PSE)) {
467 /*
468 * Could handle this, but it should not happen
469 * currently:
470 */
471 printk(KERN_ERR "clear_kernel_mapping: "
472 "mapping has been split. will leak memory\n");
473 pmd_ERROR(*pmd);
474 }
475 set_pmd(pmd, __pmd(0));
476 }
477 __flush_tlb_all();
478}
479
480/*
481 * Memory hotplug specific functions 436 * Memory hotplug specific functions
482 */ 437 */
483void online_page(struct page *page) 438void online_page(struct page *page)
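clear_kernel_mapping() can go away because its only caller (the GART aperture code) now
uses the CPA API instead, per the "switch pci-gart over to using set_memory_np()" commit
in the summary above. A hedged sketch of the replacement call; 'aper_base' and 'aper_size'
are illustrative names, not taken from this diff:

	/*
	 * Unmap the aperture from the kernel direct mapping via CPA
	 * instead of clear_kernel_mapping():
	 */
	set_memory_np((unsigned long)__va(aper_base), aper_size >> PAGE_SHIFT);

Unlike clear_kernel_mapping(), set_memory_np() copes with mappings that have already been
split, which is why the "mapping has been split. will leak memory" warning above is no
longer needed.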
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c004d94608f..ee6648fe6b1 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -70,25 +70,12 @@ int page_is_ram(unsigned long pagenr)
70 * Fix up the linear direct mapping of the kernel to avoid cache attribute 70 * Fix up the linear direct mapping of the kernel to avoid cache attribute
71 * conflicts. 71 * conflicts.
72 */ 72 */
73static int ioremap_change_attr(unsigned long paddr, unsigned long size, 73static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
74 enum ioremap_mode mode) 74 enum ioremap_mode mode)
75{ 75{
76 unsigned long vaddr = (unsigned long)__va(paddr);
77 unsigned long nrpages = size >> PAGE_SHIFT; 76 unsigned long nrpages = size >> PAGE_SHIFT;
78 unsigned int level;
79 int err; 77 int err;
80 78
81 /* No change for pages after the last mapping */
82 if ((paddr + size - 1) >= (max_pfn_mapped << PAGE_SHIFT))
83 return 0;
84
85 /*
86 * If there is no identity map for this address,
87 * change_page_attr_addr is unnecessary
88 */
89 if (!lookup_address(vaddr, &level))
90 return 0;
91
92 switch (mode) { 79 switch (mode) {
93 case IOR_MODE_UNCACHED: 80 case IOR_MODE_UNCACHED:
94 default: 81 default:
@@ -114,9 +101,8 @@ static int ioremap_change_attr(unsigned long paddr, unsigned long size,
114static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, 101static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
115 enum ioremap_mode mode) 102 enum ioremap_mode mode)
116{ 103{
117 void __iomem *addr; 104 unsigned long pfn, offset, last_addr, vaddr;
118 struct vm_struct *area; 105 struct vm_struct *area;
119 unsigned long offset, last_addr;
120 pgprot_t prot; 106 pgprot_t prot;
121 107
122 /* Don't allow wraparound or zero size */ 108 /* Don't allow wraparound or zero size */
@@ -133,9 +119,10 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
133 /* 119 /*
134 * Don't allow anybody to remap normal RAM that we're using.. 120 * Don't allow anybody to remap normal RAM that we're using..
135 */ 121 */
136 for (offset = phys_addr >> PAGE_SHIFT; offset < max_pfn_mapped && 122 for (pfn = phys_addr >> PAGE_SHIFT; pfn < max_pfn_mapped &&
137 (offset << PAGE_SHIFT) < last_addr; offset++) { 123 (pfn << PAGE_SHIFT) < last_addr; pfn++) {
138 if (page_is_ram(offset)) 124 if (page_is_ram(pfn) && pfn_valid(pfn) &&
125 !PageReserved(pfn_to_page(pfn)))
139 return NULL; 126 return NULL;
140 } 127 }
141 128
@@ -163,19 +150,18 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
163 if (!area) 150 if (!area)
164 return NULL; 151 return NULL;
165 area->phys_addr = phys_addr; 152 area->phys_addr = phys_addr;
166 addr = (void __iomem *) area->addr; 153 vaddr = (unsigned long) area->addr;
167 if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, 154 if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
168 phys_addr, prot)) { 155 remove_vm_area((void *)(vaddr & PAGE_MASK));
169 remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
170 return NULL; 156 return NULL;
171 } 157 }
172 158
173 if (ioremap_change_attr(phys_addr, size, mode) < 0) { 159 if (ioremap_change_attr(vaddr, size, mode) < 0) {
174 vunmap(addr); 160 vunmap(area->addr);
175 return NULL; 161 return NULL;
176 } 162 }
177 163
178 return (void __iomem *) (offset + (char __iomem *)addr); 164 return (void __iomem *) (vaddr + offset);
179} 165}
180 166
181/** 167/**
@@ -254,9 +240,6 @@ void iounmap(volatile void __iomem *addr)
254 return; 240 return;
255 } 241 }
256 242
257 /* Reset the direct mapping. Can block */
258 ioremap_change_attr(p->phys_addr, p->size, IOR_MODE_CACHED);
259
260 /* Finally remove it */ 243 /* Finally remove it */
261 o = remove_vm_area((void *)addr); 244 o = remove_vm_area((void *)addr);
262 BUG_ON(p != o || o == NULL); 245 BUG_ON(p != o || o == NULL);
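After this change, ioremap_change_attr() works on the ioremap'd virtual address itself
rather than on the direct mapping of the physical range, and iounmap() no longer flips the
direct mapping back to cached. The caller-visible API is unchanged; a hedged usage sketch
(device addresses and register offsets below are made up for illustration):

	void __iomem *regs;
	u32 status;

	regs = ioremap_nocache(dev_phys_addr, dev_region_len);	/* IOR_MODE_UNCACHED path */
	if (!regs)
		return -ENOMEM;

	writel(1, regs + CTRL_REG);		/* hypothetical control register offset */
	status = readl(regs + STATUS_REG);	/* hypothetical status register offset */

	iounmap(regs);				/* now only tears down the vmalloc-area mapping */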
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a920d09b919..5a02bf4c91e 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -202,6 +202,8 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
202 if (node_data[nodeid] == NULL) 202 if (node_data[nodeid] == NULL)
203 return; 203 return;
204 nodedata_phys = __pa(node_data[nodeid]); 204 nodedata_phys = __pa(node_data[nodeid]);
205 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
206 nodedata_phys + pgdat_size - 1);
205 207
206 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 208 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
207 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 209 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
@@ -225,12 +227,15 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
225 return; 227 return;
226 } 228 }
227 bootmap_start = __pa(bootmap); 229 bootmap_start = __pa(bootmap);
228 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
229 230
230 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 231 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
231 bootmap_start >> PAGE_SHIFT, 232 bootmap_start >> PAGE_SHIFT,
232 start_pfn, end_pfn); 233 start_pfn, end_pfn);
233 234
235 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
236 bootmap_start, bootmap_start + bootmap_size - 1,
237 bootmap_pages);
238
234 free_bootmem_with_active_regions(nodeid, end); 239 free_bootmem_with_active_regions(nodeid, end);
235 240
236 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 241 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 7573e786d2f..398f3a578dd 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -137,7 +137,8 @@ static __init int exercise_pageattr(void)
137 137
138 for (k = 0; k < len[i]; k++) { 138 for (k = 0; k < len[i]; k++) {
139 pte = lookup_address(addr[i] + k*PAGE_SIZE, &level); 139 pte = lookup_address(addr[i] + k*PAGE_SIZE, &level);
140 if (!pte || pgprot_val(pte_pgprot(*pte)) == 0) { 140 if (!pte || pgprot_val(pte_pgprot(*pte)) == 0 ||
141 !(pte_val(*pte) & _PAGE_PRESENT)) {
141 addr[i] = 0; 142 addr[i] = 0;
142 break; 143 break;
143 } 144 }
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e297bd65e51..bb55a78dcd6 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -16,6 +16,17 @@
16#include <asm/uaccess.h> 16#include <asm/uaccess.h>
17#include <asm/pgalloc.h> 17#include <asm/pgalloc.h>
18 18
19/*
20 * The current flushing context - we pass it instead of 5 arguments:
21 */
22struct cpa_data {
23 unsigned long vaddr;
24 pgprot_t mask_set;
25 pgprot_t mask_clr;
26 int numpages;
27 int flushtlb;
28};
29
19static inline int 30static inline int
20within(unsigned long addr, unsigned long start, unsigned long end) 31within(unsigned long addr, unsigned long start, unsigned long end)
21{ 32{
@@ -52,21 +63,23 @@ void clflush_cache_range(void *vaddr, unsigned int size)
52 63
53static void __cpa_flush_all(void *arg) 64static void __cpa_flush_all(void *arg)
54{ 65{
66 unsigned long cache = (unsigned long)arg;
67
55 /* 68 /*
56 * Flush all to work around Errata in early athlons regarding 69 * Flush all to work around Errata in early athlons regarding
57 * large page flushing. 70 * large page flushing.
58 */ 71 */
59 __flush_tlb_all(); 72 __flush_tlb_all();
60 73
61 if (boot_cpu_data.x86_model >= 4) 74 if (cache && boot_cpu_data.x86_model >= 4)
62 wbinvd(); 75 wbinvd();
63} 76}
64 77
65static void cpa_flush_all(void) 78static void cpa_flush_all(unsigned long cache)
66{ 79{
67 BUG_ON(irqs_disabled()); 80 BUG_ON(irqs_disabled());
68 81
69 on_each_cpu(__cpa_flush_all, NULL, 1, 1); 82 on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
70} 83}
71 84
72static void __cpa_flush_range(void *arg) 85static void __cpa_flush_range(void *arg)
@@ -79,7 +92,7 @@ static void __cpa_flush_range(void *arg)
79 __flush_tlb_all(); 92 __flush_tlb_all();
80} 93}
81 94
82static void cpa_flush_range(unsigned long start, int numpages) 95static void cpa_flush_range(unsigned long start, int numpages, int cache)
83{ 96{
84 unsigned int i, level; 97 unsigned int i, level;
85 unsigned long addr; 98 unsigned long addr;
@@ -89,6 +102,9 @@ static void cpa_flush_range(unsigned long start, int numpages)
89 102
90 on_each_cpu(__cpa_flush_range, NULL, 1, 1); 103 on_each_cpu(__cpa_flush_range, NULL, 1, 1);
91 104
105 if (!cache)
106 return;
107
92 /* 108 /*
93 * We only need to flush on one CPU, 109 * We only need to flush on one CPU,
94 * clflush is a MESI-coherent instruction that 110 * clflush is a MESI-coherent instruction that
@@ -101,11 +117,27 @@ static void cpa_flush_range(unsigned long start, int numpages)
101 /* 117 /*
102 * Only flush present addresses: 118 * Only flush present addresses:
103 */ 119 */
104 if (pte && pte_present(*pte)) 120 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
105 clflush_cache_range((void *) addr, PAGE_SIZE); 121 clflush_cache_range((void *) addr, PAGE_SIZE);
106 } 122 }
107} 123}
108 124
125#define HIGH_MAP_START __START_KERNEL_map
126#define HIGH_MAP_END (__START_KERNEL_map + KERNEL_TEXT_SIZE)
127
128
129/*
130 * Converts a virtual address to a X86-64 highmap address
131 */
132static unsigned long virt_to_highmap(void *address)
133{
134#ifdef CONFIG_X86_64
135 return __pa((unsigned long)address) + HIGH_MAP_START - phys_base;
136#else
137 return (unsigned long)address;
138#endif
139}
140
109/* 141/*
110 * Certain areas of memory on x86 require very specific protection flags, 142 * Certain areas of memory on x86 require very specific protection flags,
111 * for example the BIOS area or kernel text. Callers don't always get this 143 * for example the BIOS area or kernel text. Callers don't always get this
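On x86-64 the kernel text is reachable through two virtual aliases: the direct mapping and
the high mapping at __START_KERNEL_map. virt_to_highmap() computes the latter so that the
static_protections() checks below can be applied to both aliases. A standalone demo of the
arithmetic, with assumed example values (__START_KERNEL_map = 0xffffffff80000000,
phys_base = 0 for a non-relocated kernel, 0x200000 standing in for __pa(_text)):

#include <stdio.h>

#define HIGH_MAP_START	0xffffffff80000000UL	/* __START_KERNEL_map (assumed) */

int main(void)
{
	unsigned long phys_base = 0;		/* kernel not relocated */
	unsigned long pa_text   = 0x200000;	/* example __pa(_text) */
	unsigned long highmap   = pa_text + HIGH_MAP_START - phys_base;

	/* prints 0xffffffff80200000 for these example values */
	printf("high-mapping alias of _text: %#lx\n", highmap);
	return 0;
}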
@@ -129,12 +161,24 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
129 */ 161 */
130 if (within(address, (unsigned long)_text, (unsigned long)_etext)) 162 if (within(address, (unsigned long)_text, (unsigned long)_etext))
131 pgprot_val(forbidden) |= _PAGE_NX; 163 pgprot_val(forbidden) |= _PAGE_NX;
164 /*
165 * Do the same for the x86-64 high kernel mapping
166 */
167 if (within(address, virt_to_highmap(_text), virt_to_highmap(_etext)))
168 pgprot_val(forbidden) |= _PAGE_NX;
169
132 170
133#ifdef CONFIG_DEBUG_RODATA 171#ifdef CONFIG_DEBUG_RODATA
134 /* The .rodata section needs to be read-only */ 172 /* The .rodata section needs to be read-only */
135 if (within(address, (unsigned long)__start_rodata, 173 if (within(address, (unsigned long)__start_rodata,
136 (unsigned long)__end_rodata)) 174 (unsigned long)__end_rodata))
137 pgprot_val(forbidden) |= _PAGE_RW; 175 pgprot_val(forbidden) |= _PAGE_RW;
176 /*
177 * Do the same for the x86-64 high kernel mapping
178 */
179 if (within(address, virt_to_highmap(__start_rodata),
180 virt_to_highmap(__end_rodata)))
181 pgprot_val(forbidden) |= _PAGE_RW;
138#endif 182#endif
139 183
140 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); 184 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
@@ -142,6 +186,14 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
142 return prot; 186 return prot;
143} 187}
144 188
189/*
190 * Lookup the page table entry for a virtual address. Return a pointer
191 * to the entry and the level of the mapping.
192 *
193 * Note: We return pud and pmd either when the entry is marked large
194 * or when the present bit is not set. Otherwise we would return a
195 * pointer to a nonexisting mapping.
196 */
145pte_t *lookup_address(unsigned long address, int *level) 197pte_t *lookup_address(unsigned long address, int *level)
146{ 198{
147 pgd_t *pgd = pgd_offset_k(address); 199 pgd_t *pgd = pgd_offset_k(address);
@@ -152,21 +204,31 @@ pte_t *lookup_address(unsigned long address, int *level)
152 204
153 if (pgd_none(*pgd)) 205 if (pgd_none(*pgd))
154 return NULL; 206 return NULL;
207
155 pud = pud_offset(pgd, address); 208 pud = pud_offset(pgd, address);
156 if (pud_none(*pud)) 209 if (pud_none(*pud))
157 return NULL; 210 return NULL;
211
212 *level = PG_LEVEL_1G;
213 if (pud_large(*pud) || !pud_present(*pud))
214 return (pte_t *)pud;
215
158 pmd = pmd_offset(pud, address); 216 pmd = pmd_offset(pud, address);
159 if (pmd_none(*pmd)) 217 if (pmd_none(*pmd))
160 return NULL; 218 return NULL;
161 219
162 *level = PG_LEVEL_2M; 220 *level = PG_LEVEL_2M;
163 if (pmd_large(*pmd)) 221 if (pmd_large(*pmd) || !pmd_present(*pmd))
164 return (pte_t *)pmd; 222 return (pte_t *)pmd;
165 223
166 *level = PG_LEVEL_4K; 224 *level = PG_LEVEL_4K;
225
167 return pte_offset_kernel(pmd, address); 226 return pte_offset_kernel(pmd, address);
168} 227}
169 228
229/*
230 * Set the new pmd in all the pgds we know about:
231 */
170static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) 232static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
171{ 233{
172 /* change init_mm */ 234 /* change init_mm */
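Because lookup_address() now returns pud/pmd entries for large or non-present mappings, a
caller has to check both the level and the present bit before dereferencing any further. A
hedged sketch of a caller (illustrative helper, not part of this patch):

	static int addr_is_mapped(unsigned long addr)
	{
		int level;
		pte_t *pte = lookup_address(addr, &level);

		if (!pte)
			return 0;			/* no table entry at all */
		if (!(pte_val(*pte) & _PAGE_PRESENT))
			return 0;			/* non-present pud/pmd/pte */
		/*
		 * Present: level gives the mapping size. For PG_LEVEL_2M and
		 * PG_LEVEL_1G the returned pointer is really the pmd/pud entry.
		 */
		return 1;
	}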
@@ -175,6 +237,7 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
175 if (!SHARED_KERNEL_PMD) { 237 if (!SHARED_KERNEL_PMD) {
176 struct page *page; 238 struct page *page;
177 239
240 address = __pa(address);
178 list_for_each_entry(page, &pgd_list, lru) { 241 list_for_each_entry(page, &pgd_list, lru) {
179 pgd_t *pgd; 242 pgd_t *pgd;
180 pud_t *pud; 243 pud_t *pud;
@@ -189,18 +252,114 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
189#endif 252#endif
190} 253}
191 254
255static int
256try_preserve_large_page(pte_t *kpte, unsigned long address,
257 struct cpa_data *cpa)
258{
259 unsigned long nextpage_addr, numpages, pmask, psize, flags;
260 pte_t new_pte, old_pte, *tmp;
261 pgprot_t old_prot, new_prot;
262 int level, do_split = 1;
263
264 /*
265 * An Athlon 64 X2 showed hard hangs if we tried to preserve
266 * largepages and changed the PSE entry from RW to RO.
267 *
268 * As AMD CPUs have a long series of erratas in this area,
269 * (and none of the known ones seem to explain this hang),
270 * disable this code until the hang can be debugged:
271 */
272 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
273 return 1;
274
275 spin_lock_irqsave(&pgd_lock, flags);
276 /*
277 * Check for races, another CPU might have split this page
278 * up already:
279 */
280 tmp = lookup_address(address, &level);
281 if (tmp != kpte)
282 goto out_unlock;
283
284 switch (level) {
285 case PG_LEVEL_2M:
286 psize = PMD_PAGE_SIZE;
287 pmask = PMD_PAGE_MASK;
288 break;
289#ifdef CONFIG_X86_64
290 case PG_LEVEL_1G:
291 psize = PMD_PAGE_SIZE;
292 pmask = PMD_PAGE_MASK;
293 break;
294#endif
295 default:
296 do_split = -EINVAL;
297 goto out_unlock;
298 }
299
300 /*
301 * Calculate the number of pages, which fit into this large
302 * page starting at address:
303 */
304 nextpage_addr = (address + psize) & pmask;
305 numpages = (nextpage_addr - address) >> PAGE_SHIFT;
306 if (numpages < cpa->numpages)
307 cpa->numpages = numpages;
308
309 /*
310 * We are safe now. Check whether the new pgprot is the same:
311 */
312 old_pte = *kpte;
313 old_prot = new_prot = pte_pgprot(old_pte);
314
315 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
316 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
317 new_prot = static_protections(new_prot, address);
318
319 /*
320 * If there are no changes, return. maxpages has been updated
321 * above:
322 */
323 if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
324 do_split = 0;
325 goto out_unlock;
326 }
327
328 /*
329 * We need to change the attributes. Check, whether we can
330 * change the large page in one go. We request a split, when
331 * the address is not aligned and the number of pages is
332 * smaller than the number of pages in the large page. Note
333 * that we limited the number of possible pages already to
334 * the number of pages in the large page.
335 */
336 if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
337 /*
338 * The address is aligned and the number of pages
339 * covers the full page.
340 */
341 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
342 __set_pmd_pte(kpte, address, new_pte);
343 cpa->flushtlb = 1;
344 do_split = 0;
345 }
346
347out_unlock:
348 spin_unlock_irqrestore(&pgd_lock, flags);
349
350 return do_split;
351}
352
192static int split_large_page(pte_t *kpte, unsigned long address) 353static int split_large_page(pte_t *kpte, unsigned long address)
193{ 354{
194 pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte)); 355 unsigned long flags, pfn, pfninc = 1;
195 gfp_t gfp_flags = GFP_KERNEL; 356 gfp_t gfp_flags = GFP_KERNEL;
196 unsigned long flags; 357 unsigned int i, level;
197 unsigned long addr;
198 pte_t *pbase, *tmp; 358 pte_t *pbase, *tmp;
359 pgprot_t ref_prot;
199 struct page *base; 360 struct page *base;
200 unsigned int i, level;
201 361
202#ifdef CONFIG_DEBUG_PAGEALLOC 362#ifdef CONFIG_DEBUG_PAGEALLOC
203 gfp_flags = __GFP_HIGH | __GFP_NOFAIL | __GFP_NOWARN;
204 gfp_flags = GFP_ATOMIC | __GFP_NOWARN; 363 gfp_flags = GFP_ATOMIC | __GFP_NOWARN;
205#endif 364#endif
206 base = alloc_pages(gfp_flags, 0); 365 base = alloc_pages(gfp_flags, 0);
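The "number of pages which fit into this large page" computation in
try_preserve_large_page() above is worth seeing with concrete numbers. A standalone demo
for the 2MB case (example address chosen arbitrarily; PMD_PAGE_SIZE/PMD_PAGE_MASK values
assumed from their names):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_PAGE_SIZE	(1UL << 21)		/* 2MB */
#define PMD_PAGE_MASK	(~(PMD_PAGE_SIZE - 1))

int main(void)
{
	unsigned long address = 0xffffffff80532000UL;	/* somewhere inside a 2MB page */
	unsigned long psize = PMD_PAGE_SIZE, pmask = PMD_PAGE_MASK;

	unsigned long nextpage_addr = (address + psize) & pmask;
	unsigned long numpages = (nextpage_addr - address) >> PAGE_SHIFT;

	/* prints 0xffffffff80600000 and 206 for this example address */
	printf("next 2MB boundary: %#lx\n", nextpage_addr);
	printf("4k pages up to that boundary: %lu\n", numpages);
	return 0;
}

cpa->numpages is clamped to at most that value, and when the new protections turn out to
equal the old ones the 2MB mapping is left intact instead of being split.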
@@ -213,30 +372,41 @@ static int split_large_page(pte_t *kpte, unsigned long address)
213 * up for us already: 372 * up for us already:
214 */ 373 */
215 tmp = lookup_address(address, &level); 374 tmp = lookup_address(address, &level);
216 if (tmp != kpte) { 375 if (tmp != kpte)
217 WARN_ON_ONCE(1);
218 goto out_unlock; 376 goto out_unlock;
219 }
220 377
221 address = __pa(address);
222 addr = address & LARGE_PAGE_MASK;
223 pbase = (pte_t *)page_address(base); 378 pbase = (pte_t *)page_address(base);
224#ifdef CONFIG_X86_32 379#ifdef CONFIG_X86_32
225 paravirt_alloc_pt(&init_mm, page_to_pfn(base)); 380 paravirt_alloc_pt(&init_mm, page_to_pfn(base));
226#endif 381#endif
382 ref_prot = pte_pgprot(pte_clrhuge(*kpte));
383
384#ifdef CONFIG_X86_64
385 if (level == PG_LEVEL_1G) {
386 pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
387 pgprot_val(ref_prot) |= _PAGE_PSE;
388 }
389#endif
227 390
228 pgprot_val(ref_prot) &= ~_PAGE_NX; 391 /*
229 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) 392 * Get the target pfn from the original entry:
230 set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot)); 393 */
394 pfn = pte_pfn(*kpte);
395 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
396 set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
231 397
232 /* 398 /*
233 * Install the new, split up pagetable. Important detail here: 399 * Install the new, split up pagetable. Important details here:
234 * 400 *
235 * On Intel the NX bit of all levels must be cleared to make a 401 * On Intel the NX bit of all levels must be cleared to make a
236 * page executable. See section 4.13.2 of Intel 64 and IA-32 402 * page executable. See section 4.13.2 of Intel 64 and IA-32
237 * Architectures Software Developer's Manual). 403 * Architectures Software Developer's Manual).
404 *
405 * Mark the entry present. The current mapping might be
406 * set to not present, which we preserved above.
238 */ 407 */
239 ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte))); 408 ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
409 pgprot_val(ref_prot) |= _PAGE_PRESENT;
240 __set_pmd_pte(kpte, address, mk_pte(base, ref_prot)); 410 __set_pmd_pte(kpte, address, mk_pte(base, ref_prot));
241 base = NULL; 411 base = NULL;
242 412
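The pfn stepping above is what makes the split work for both sizes: a 2MB page splits into
512 4k ptes with consecutive pfns (pfninc = 1), while a 1GB page splits into 512 2MB pmds
stepping by 512 pfns and keeping _PAGE_PSE set. A standalone demo of the two stride values
(the base pfn is an arbitrary example):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_PAGE_SIZE	(1UL << 21)
#define PTRS_PER_PTE	512

int main(void)
{
	unsigned long base_pfn  = 0x40000;			/* example pfn of the old mapping */
	unsigned long pfninc_2m = 1;				/* 2MB -> 4k split */
	unsigned long pfninc_1g = PMD_PAGE_SIZE >> PAGE_SHIFT;	/* 1GB -> 2MB split: 512 */
	unsigned long pfn;
	int i;

	for (i = 0, pfn = base_pfn; i < 3; i++, pfn += pfninc_2m)
		printf("2M split, entry %d -> pfn %#lx\n", i, pfn);

	for (i = 0, pfn = base_pfn; i < 3; i++, pfn += pfninc_1g)
		printf("1G split, entry %d -> pfn %#lx\n", i, pfn);

	printf("entries per split table: %d\n", PTRS_PER_PTE);
	return 0;
}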
@@ -249,18 +419,12 @@ out_unlock:
249 return 0; 419 return 0;
250} 420}
251 421
252static int 422static int __change_page_attr(unsigned long address, struct cpa_data *cpa)
253__change_page_attr(unsigned long address, unsigned long pfn,
254 pgprot_t mask_set, pgprot_t mask_clr)
255{ 423{
424 int level, do_split, err;
256 struct page *kpte_page; 425 struct page *kpte_page;
257 int level, err = 0;
258 pte_t *kpte; 426 pte_t *kpte;
259 427
260#ifdef CONFIG_X86_32
261 BUG_ON(pfn > max_low_pfn);
262#endif
263
264repeat: 428repeat:
265 kpte = lookup_address(address, &level); 429 kpte = lookup_address(address, &level);
266 if (!kpte) 430 if (!kpte)
@@ -271,23 +435,62 @@ repeat:
271 BUG_ON(PageCompound(kpte_page)); 435 BUG_ON(PageCompound(kpte_page));
272 436
273 if (level == PG_LEVEL_4K) { 437 if (level == PG_LEVEL_4K) {
274 pgprot_t new_prot = pte_pgprot(*kpte);
275 pte_t new_pte, old_pte = *kpte; 438 pte_t new_pte, old_pte = *kpte;
439 pgprot_t new_prot = pte_pgprot(old_pte);
440
441 if(!pte_val(old_pte)) {
442 printk(KERN_WARNING "CPA: called for zero pte. "
443 "vaddr = %lx cpa->vaddr = %lx\n", address,
444 cpa->vaddr);
445 WARN_ON(1);
446 return -EINVAL;
447 }
276 448
277 pgprot_val(new_prot) &= ~pgprot_val(mask_clr); 449 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
278 pgprot_val(new_prot) |= pgprot_val(mask_set); 450 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
279 451
280 new_prot = static_protections(new_prot, address); 452 new_prot = static_protections(new_prot, address);
281 453
282 new_pte = pfn_pte(pfn, canon_pgprot(new_prot)); 454 /*
283 BUG_ON(pte_pfn(new_pte) != pte_pfn(old_pte)); 455 * We need to keep the pfn from the existing PTE,
456 * after all we're only going to change it's attributes
457 * not the memory it points to
458 */
459 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
460
461 /*
462 * Do we really change anything ?
463 */
464 if (pte_val(old_pte) != pte_val(new_pte)) {
465 set_pte_atomic(kpte, new_pte);
466 cpa->flushtlb = 1;
467 }
468 cpa->numpages = 1;
469 return 0;
470 }
471
472 /*
473 * Check, whether we can keep the large page intact
474 * and just change the pte:
475 */
476 do_split = try_preserve_large_page(kpte, address, cpa);
477 /*
478 * When the range fits into the existing large page,
479 * return. cp->numpages and cpa->tlbflush have been updated in
480 * try_large_page:
481 */
482 if (do_split <= 0)
483 return do_split;
284 484
285 set_pte_atomic(kpte, new_pte); 485 /*
286 } else { 486 * We have to split the large page:
287 err = split_large_page(kpte, address); 487 */
288 if (!err) 488 err = split_large_page(kpte, address);
289 goto repeat; 489 if (!err) {
490 cpa->flushtlb = 1;
491 goto repeat;
290 } 492 }
493
291 return err; 494 return err;
292} 495}
293 496
@@ -304,19 +507,14 @@ repeat:
304 * 507 *
305 * Modules and drivers should use the set_memory_* APIs instead. 508 * Modules and drivers should use the set_memory_* APIs instead.
306 */ 509 */
307 510static int change_page_attr_addr(struct cpa_data *cpa)
308#define HIGH_MAP_START __START_KERNEL_map
309#define HIGH_MAP_END (__START_KERNEL_map + KERNEL_TEXT_SIZE)
310
311static int
312change_page_attr_addr(unsigned long address, pgprot_t mask_set,
313 pgprot_t mask_clr)
314{ 511{
315 unsigned long phys_addr = __pa(address);
316 unsigned long pfn = phys_addr >> PAGE_SHIFT;
317 int err; 512 int err;
513 unsigned long address = cpa->vaddr;
318 514
319#ifdef CONFIG_X86_64 515#ifdef CONFIG_X86_64
516 unsigned long phys_addr = __pa(address);
517
320 /* 518 /*
321 * If we are inside the high mapped kernel range, then we 519 * If we are inside the high mapped kernel range, then we
322 * fixup the low mapping first. __va() returns the virtual 520 * fixup the low mapping first. __va() returns the virtual
@@ -326,7 +524,7 @@ change_page_attr_addr(unsigned long address, pgprot_t mask_set,
326 address = (unsigned long) __va(phys_addr); 524 address = (unsigned long) __va(phys_addr);
327#endif 525#endif
328 526
329 err = __change_page_attr(address, pfn, mask_set, mask_clr); 527 err = __change_page_attr(address, cpa);
330 if (err) 528 if (err)
331 return err; 529 return err;
332 530
@@ -339,42 +537,89 @@ change_page_attr_addr(unsigned long address, pgprot_t mask_set,
339 /* 537 /*
340 * Calc the high mapping address. See __phys_addr() 538 * Calc the high mapping address. See __phys_addr()
341 * for the non obvious details. 539 * for the non obvious details.
540 *
541 * Note that NX and other required permissions are
542 * checked in static_protections().
342 */ 543 */
343 address = phys_addr + HIGH_MAP_START - phys_base; 544 address = phys_addr + HIGH_MAP_START - phys_base;
344 /* Make sure the kernel mappings stay executable */
345 pgprot_val(mask_clr) |= _PAGE_NX;
346 545
347 /* 546 /*
348 * Our high aliases are imprecise, because we check 547 * Our high aliases are imprecise, because we check
349 * everything between 0 and KERNEL_TEXT_SIZE, so do 548 * everything between 0 and KERNEL_TEXT_SIZE, so do
350 * not propagate lookup failures back to users: 549 * not propagate lookup failures back to users:
351 */ 550 */
352 __change_page_attr(address, pfn, mask_set, mask_clr); 551 __change_page_attr(address, cpa);
353 } 552 }
354#endif 553#endif
355 return err; 554 return err;
356} 555}
357 556
358static int __change_page_attr_set_clr(unsigned long addr, int numpages, 557static int __change_page_attr_set_clr(struct cpa_data *cpa)
359 pgprot_t mask_set, pgprot_t mask_clr)
360{ 558{
361 unsigned int i; 559 int ret, numpages = cpa->numpages;
362 int ret;
363 560
364 for (i = 0; i < numpages ; i++, addr += PAGE_SIZE) { 561 while (numpages) {
365 ret = change_page_attr_addr(addr, mask_set, mask_clr); 562 /*
563 * Store the remaining nr of pages for the large page
564 * preservation check.
565 */
566 cpa->numpages = numpages;
567 ret = change_page_attr_addr(cpa);
366 if (ret) 568 if (ret)
367 return ret; 569 return ret;
368 }
369 570
571 /*
572 * Adjust the number of pages with the result of the
573 * CPA operation. Either a large page has been
574 * preserved or a single page update happened.
575 */
576 BUG_ON(cpa->numpages > numpages);
577 numpages -= cpa->numpages;
578 cpa->vaddr += cpa->numpages * PAGE_SIZE;
579 }
370 return 0; 580 return 0;
371} 581}
372 582
583static inline int cache_attr(pgprot_t attr)
584{
585 return pgprot_val(attr) &
586 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
587}
588
373static int change_page_attr_set_clr(unsigned long addr, int numpages, 589static int change_page_attr_set_clr(unsigned long addr, int numpages,
374 pgprot_t mask_set, pgprot_t mask_clr) 590 pgprot_t mask_set, pgprot_t mask_clr)
375{ 591{
376 int ret = __change_page_attr_set_clr(addr, numpages, mask_set, 592 struct cpa_data cpa;
377 mask_clr); 593 int ret, cache;
594
595 /*
596 * Check, if we are requested to change a not supported
597 * feature:
598 */
599 mask_set = canon_pgprot(mask_set);
600 mask_clr = canon_pgprot(mask_clr);
601 if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
602 return 0;
603
604 cpa.vaddr = addr;
605 cpa.numpages = numpages;
606 cpa.mask_set = mask_set;
607 cpa.mask_clr = mask_clr;
608 cpa.flushtlb = 0;
609
610 ret = __change_page_attr_set_clr(&cpa);
611
612 /*
613 * Check whether we really changed something:
614 */
615 if (!cpa.flushtlb)
616 return ret;
617
618 /*
619 * No need to flush, when we did not set any of the caching
620 * attributes:
621 */
622 cache = cache_attr(mask_set);
378 623
379 /* 624 /*
380 * On success we use clflush, when the CPU supports it to 625 * On success we use clflush, when the CPU supports it to
@@ -383,9 +628,9 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
383 * wbindv): 628 * wbindv):
384 */ 629 */
385 if (!ret && cpu_has_clflush) 630 if (!ret && cpu_has_clflush)
386 cpa_flush_range(addr, numpages); 631 cpa_flush_range(addr, numpages, cache);
387 else 632 else
388 cpa_flush_all(); 633 cpa_flush_all(cache);
389 634
390 return ret; 635 return ret;
391} 636}
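change_page_attr_set_clr() now skips the expensive cache flush entirely unless the
requested change touches a caching attribute; a pure permission change (RO/NX) only needs
a TLB flush. A standalone demo of the cache_attr() test, using the usual x86 PTE bit
positions as an assumption for illustration:

#include <stdio.h>

#define _PAGE_RW	(1ULL << 1)
#define _PAGE_PWT	(1ULL << 3)
#define _PAGE_PCD	(1ULL << 4)
#define _PAGE_PAT	(1ULL << 7)
#define _PAGE_PAT_LARGE	(1ULL << 12)
#define _PAGE_NX	(1ULL << 63)

static int cache_attr(unsigned long long mask_set)
{
	return !!(mask_set & (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD));
}

int main(void)
{
	/* set_memory_uc-style request: sets PCD|PWT -> needs cache flush */
	printf("uncached request:  cache flush = %d\n", cache_attr(_PAGE_PCD | _PAGE_PWT));
	/* set_memory_ro-style request: only clears RW, so mask_set is empty */
	printf("read-only request: cache flush = %d\n", cache_attr(0));
	/* set_memory_nx-style request: sets NX, which is not a caching bit */
	printf("no-exec request:   cache flush = %d\n", cache_attr(_PAGE_NX));
	return 0;
}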
@@ -489,37 +734,26 @@ int set_pages_rw(struct page *page, int numpages)
489 return set_memory_rw(addr, numpages); 734 return set_memory_rw(addr, numpages);
490} 735}
491 736
492
493#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_CPA_DEBUG)
494static inline int __change_page_attr_set(unsigned long addr, int numpages,
495 pgprot_t mask)
496{
497 return __change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
498}
499
500static inline int __change_page_attr_clear(unsigned long addr, int numpages,
501 pgprot_t mask)
502{
503 return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
504}
505#endif
506
507#ifdef CONFIG_DEBUG_PAGEALLOC 737#ifdef CONFIG_DEBUG_PAGEALLOC
508 738
509static int __set_pages_p(struct page *page, int numpages) 739static int __set_pages_p(struct page *page, int numpages)
510{ 740{
511 unsigned long addr = (unsigned long)page_address(page); 741 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
742 .numpages = numpages,
743 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
744 .mask_clr = __pgprot(0)};
512 745
513 return __change_page_attr_set(addr, numpages, 746 return __change_page_attr_set_clr(&cpa);
514 __pgprot(_PAGE_PRESENT | _PAGE_RW));
515} 747}
516 748
517static int __set_pages_np(struct page *page, int numpages) 749static int __set_pages_np(struct page *page, int numpages)
518{ 750{
519 unsigned long addr = (unsigned long)page_address(page); 751 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
752 .numpages = numpages,
753 .mask_set = __pgprot(0),
754 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
520 755
521 return __change_page_attr_clear(addr, numpages, 756 return __change_page_attr_set_clr(&cpa);
522 __pgprot(_PAGE_PRESENT));
523} 757}
524 758
525void kernel_map_pages(struct page *page, int numpages, int enable) 759void kernel_map_pages(struct page *page, int numpages, int enable)
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index cb3aa470249..c7db504be1e 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -219,50 +219,39 @@ static inline void pgd_list_del(pgd_t *pgd)
219 list_del(&page->lru); 219 list_del(&page->lru);
220} 220}
221 221
222#define UNSHARED_PTRS_PER_PGD \
223 (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
222 224
223 225static void pgd_ctor(void *p)
224#if (PTRS_PER_PMD == 1)
225/* Non-PAE pgd constructor */
226static void pgd_ctor(void *pgd)
227{ 226{
227 pgd_t *pgd = p;
228 unsigned long flags; 228 unsigned long flags;
229 229
230 /* !PAE, no pagetable sharing */ 230 /* Clear usermode parts of PGD */
231 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); 231 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
232 232
233 spin_lock_irqsave(&pgd_lock, flags); 233 spin_lock_irqsave(&pgd_lock, flags);
234 234
235 /* must happen under lock */ 235 /* If the pgd points to a shared pagetable level (either the
236 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, 236 ptes in non-PAE, or shared PMD in PAE), then just copy the
237 swapper_pg_dir + USER_PTRS_PER_PGD, 237 references from swapper_pg_dir. */
238 KERNEL_PGD_PTRS); 238 if (PAGETABLE_LEVELS == 2 ||
239 paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, 239 (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
240 __pa(swapper_pg_dir) >> PAGE_SHIFT, 240 clone_pgd_range(pgd + USER_PTRS_PER_PGD,
241 USER_PTRS_PER_PGD,
242 KERNEL_PGD_PTRS);
243 pgd_list_add(pgd);
244 spin_unlock_irqrestore(&pgd_lock, flags);
245}
246#else /* PTRS_PER_PMD > 1 */
247/* PAE pgd constructor */
248static void pgd_ctor(void *pgd)
249{
250 /* PAE, kernel PMD may be shared */
251
252 if (SHARED_KERNEL_PMD) {
253 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
254 swapper_pg_dir + USER_PTRS_PER_PGD, 241 swapper_pg_dir + USER_PTRS_PER_PGD,
255 KERNEL_PGD_PTRS); 242 KERNEL_PGD_PTRS);
256 } else { 243 paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
257 unsigned long flags; 244 __pa(swapper_pg_dir) >> PAGE_SHIFT,
245 USER_PTRS_PER_PGD,
246 KERNEL_PGD_PTRS);
247 }
258 248
259 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); 249 /* list required to sync kernel mapping updates */
260 spin_lock_irqsave(&pgd_lock, flags); 250 if (!SHARED_KERNEL_PMD)
261 pgd_list_add(pgd); 251 pgd_list_add(pgd);
262 spin_unlock_irqrestore(&pgd_lock, flags); 252
263 } 253 spin_unlock_irqrestore(&pgd_lock, flags);
264} 254}
265#endif /* PTRS_PER_PMD */
266 255
267static void pgd_dtor(void *pgd) 256static void pgd_dtor(void *pgd)
268{ 257{
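The two pgd constructors collapse into one because the only real decision is whether the
kernel half of a new pgd can simply be cloned from swapper_pg_dir (non-PAE, or PAE with a
shared kernel pmd) or whether each pgd carries its own kernel pmds. A standalone demo of
that decision (PAGETABLE_LEVELS and SHARED_KERNEL_PMD are modeled as plain parameters
here, purely for illustration):

#include <stdio.h>

static const char *pgd_ctor_path(int pagetable_levels, int shared_kernel_pmd)
{
	if (pagetable_levels == 2 ||
	    (pagetable_levels == 3 && shared_kernel_pmd))
		return "clone kernel range from swapper_pg_dir";
	return "kernel pmds are per-pgd, populated separately";
}

int main(void)
{
	printf("!PAE (2 levels)          : %s\n", pgd_ctor_path(2, 0));
	printf("PAE, shared kernel pmd   : %s\n", pgd_ctor_path(3, 1));
	printf("PAE, unshared kernel pmd : %s\n", pgd_ctor_path(3, 0));
	return 0;
}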
@@ -276,9 +265,6 @@ static void pgd_dtor(void *pgd)
276 spin_unlock_irqrestore(&pgd_lock, flags); 265 spin_unlock_irqrestore(&pgd_lock, flags);
277} 266}
278 267
279#define UNSHARED_PTRS_PER_PGD \
280 (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
281
282#ifdef CONFIG_X86_PAE 268#ifdef CONFIG_X86_PAE
283/* 269/*
284 * Mop up any pmd pages which may still be attached to the pgd. 270 * Mop up any pmd pages which may still be attached to the pgd.
@@ -387,13 +373,6 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
387 373
388void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) 374void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
389{ 375{
390 /* This is called just after the pmd has been detached from
391 the pgd, which requires a full tlb flush to be recognized
392 by the CPU. Rather than incurring multiple tlb flushes
393 while the address space is being pulled down, make the tlb
394 gathering machinery do a full flush when we're done. */
395 tlb->fullmm = 1;
396
397 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); 376 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
398 tlb_remove_page(tlb, virt_to_page(pmd)); 377 tlb_remove_page(tlb, virt_to_page(pmd));
399} 378}