author     Linus Torvalds <torvalds@linux-foundation.org>	2008-02-04 12:16:03 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>	2008-02-04 12:16:03 -0500
commit     d2fc0bacd5c438cb459fdf531eff00ab18422a00 (patch)
tree       d0ea52e4d2ad2fac12e19eaf6891c6af98353cfc /arch/x86/mm
parent     93890b71a34f9490673a6edd56b61c2124215e46 (diff)
parent     795d45b22c079946332bf3825afefe5a981a97b6 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86
* git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86: (78 commits)
x86: fix RTC lockdep warning: potential hardirq recursion
x86: cpa, micro-optimization
x86: cpa, clean up code flow
x86: cpa, eliminate CPA_ enum
x86: cpa, cleanups
x86: implement gbpages support in change_page_attr()
x86: support gbpages in pagetable dump
x86: add gbpages support to lookup_address
x86: add pgtable accessor functions for gbpages
x86: add PUD_PAGE_SIZE
x86: add feature macros for the gbpages cpuid bit
x86: switch direct mapping setup over to set_pte
x86: fix page-present check in cpa_flush_range
x86: remove cpa warning
x86: remove now unused clear_kernel_mapping
x86: switch pci-gart over to using set_memory_np() instead of clear_kernel_mapping()
x86: cpa selftest, skip non present entries
x86: CPA fix pagetable split
x86: rename LARGE_PAGE_SIZE to PMD_PAGE_SIZE
x86: cpa, fix lookup_address
...
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/fault.c           34
-rw-r--r--  arch/x86/mm/init_32.c          6
-rw-r--r--  arch/x86/mm/init_64.c         49
-rw-r--r--  arch/x86/mm/ioremap.c         41
-rw-r--r--  arch/x86/mm/numa_64.c          7
-rw-r--r--  arch/x86/mm/pageattr-test.c    3
-rw-r--r--  arch/x86/mm/pageattr.c       400
-rw-r--r--  arch/x86/mm/pgtable_32.c      61
8 files changed, 372 insertions, 229 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e4440d0abf8..ad8b9733d6b 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -240,7 +240,8 @@ void dump_pagetable(unsigned long address)
 	pud = pud_offset(pgd, address);
 	if (bad_address(pud)) goto bad;
 	printk("PUD %lx ", pud_val(*pud));
-	if (!pud_present(*pud)) goto ret;
+	if (!pud_present(*pud) || pud_large(*pud))
+		goto ret;
 
 	pmd = pmd_offset(pud, address);
 	if (bad_address(pmd)) goto bad;
@@ -508,6 +509,10 @@ static int vmalloc_fault(unsigned long address)
 	pmd_t *pmd, *pmd_ref;
 	pte_t *pte, *pte_ref;
 
+	/* Make sure we are in vmalloc area */
+	if (!(address >= VMALLOC_START && address < VMALLOC_END))
+		return -1;
+
 	/* Copy kernel mappings over when needed. This can also
 	   happen within a race in page table update. In the later
 	   case just flush. */
@@ -603,6 +608,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	 */
 #ifdef CONFIG_X86_32
 	if (unlikely(address >= TASK_SIZE)) {
+#else
+	if (unlikely(address >= TASK_SIZE64)) {
+#endif
 		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
 		    vmalloc_fault(address) >= 0)
 			return;
@@ -618,6 +626,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 		goto bad_area_nosemaphore;
 	}
 
+
+#ifdef CONFIG_X86_32
 	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
 	   fault has been handled. */
 	if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
@@ -630,28 +640,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (in_atomic() || !mm)
 		goto bad_area_nosemaphore;
 #else /* CONFIG_X86_64 */
-	if (unlikely(address >= TASK_SIZE64)) {
-		/*
-		 * Don't check for the module range here: its PML4
-		 * is always initialized because it's shared with the main
-		 * kernel text. Only vmalloc may need PML4 syncups.
-		 */
-		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
-		      ((address >= VMALLOC_START && address < VMALLOC_END))) {
-			if (vmalloc_fault(address) >= 0)
-				return;
-		}
-
-		/* Can handle a stale RO->RW TLB */
-		if (spurious_fault(address, error_code))
-			return;
-
-		/*
-		 * Don't take the mm semaphore here. If we fixup a prefetch
-		 * fault we could otherwise deadlock.
-		 */
-		goto bad_area_nosemaphore;
-	}
 	if (likely(regs->flags & X86_EFLAGS_IF))
 		local_irq_enable();
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index f2f36f8dae5..d1bc04006d1 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -31,6 +31,7 @@
 #include <linux/initrd.h>
 #include <linux/cpumask.h>
 
+#include <asm/asm.h>
 #include <asm/processor.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -718,10 +719,7 @@ static noinline int do_test_wp_bit(void)
 		"1: movb %1, %0 \n"
 		" xorl %2, %2 \n"
 		"2: \n"
-		".section __ex_table, \"a\"\n"
-		" .align 4 \n"
-		" .long 1b, 2b \n"
-		".previous \n"
+		_ASM_EXTABLE(1b,2b)
 		:"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
 		 "=q" (tmp_reg),
 		 "=r" (flag)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index eabcaed76c2..3a98d6f724a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -273,7 +273,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
 	int i = pmd_index(address);
 
 	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
-		unsigned long entry;
 		pmd_t *pmd = pmd_page + pmd_index(address);
 
 		if (address >= end) {
@@ -287,9 +286,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
 		if (pmd_val(*pmd))
 			continue;
 
-		entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address;
-		entry &= __supported_pte_mask;
-		set_pmd(pmd, __pmd(entry));
+		set_pte((pte_t *)pmd,
+			pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
 	}
 }
 
@@ -435,49 +433,6 @@ void __init paging_init(void)
 #endif
 
 /*
- * Unmap a kernel mapping if it exists. This is useful to avoid
- * prefetches from the CPU leading to inconsistent cache lines.
- * address and size must be aligned to 2MB boundaries.
- * Does nothing when the mapping doesn't exist.
- */
-void __init clear_kernel_mapping(unsigned long address, unsigned long size)
-{
-	unsigned long end = address + size;
-
-	BUG_ON(address & ~LARGE_PAGE_MASK);
-	BUG_ON(size & ~LARGE_PAGE_MASK);
-
-	for (; address < end; address += LARGE_PAGE_SIZE) {
-		pgd_t *pgd = pgd_offset_k(address);
-		pud_t *pud;
-		pmd_t *pmd;
-
-		if (pgd_none(*pgd))
-			continue;
-
-		pud = pud_offset(pgd, address);
-		if (pud_none(*pud))
-			continue;
-
-		pmd = pmd_offset(pud, address);
-		if (!pmd || pmd_none(*pmd))
-			continue;
-
-		if (!(pmd_val(*pmd) & _PAGE_PSE)) {
-			/*
-			 * Could handle this, but it should not happen
-			 * currently:
-			 */
-			printk(KERN_ERR "clear_kernel_mapping: "
-				"mapping has been split. will leak memory\n");
-			pmd_ERROR(*pmd);
-		}
-		set_pmd(pmd, __pmd(0));
-	}
-	__flush_tlb_all();
-}
-
-/*
  * Memory hotplug specific functions
  */
 void online_page(struct page *page)
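The hunk above removes clear_kernel_mapping() entirely; per the shortlog, its only caller (pci-gart) now uses set_memory_np() instead. As a rough illustrative sketch only — not code from this merge, with hypothetical names for the aperture variables — the replacement pattern looks roughly like this, assuming the set_memory_np(unsigned long addr, int numpages) prototype added by this series:

/* Sketch, not part of this diff: punching a 2MB-aligned hole in the
 * direct mapping. aper_base/aper_size are hypothetical placeholders. */
static void gart_unmap_aperture_sketch(unsigned long aper_base,
				       unsigned long aper_size)
{
	/* Previously: clear_kernel_mapping((unsigned long)__va(aper_base),
	 *                                  aper_size); */
	set_memory_np((unsigned long)__va(aper_base),
		      aper_size >> PAGE_SHIFT);
}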
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c004d94608f..ee6648fe6b1 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -70,25 +70,12 @@ int page_is_ram(unsigned long pagenr)
  * Fix up the linear direct mapping of the kernel to avoid cache attribute
  * conflicts.
  */
-static int ioremap_change_attr(unsigned long paddr, unsigned long size,
+static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
 			       enum ioremap_mode mode)
 {
-	unsigned long vaddr = (unsigned long)__va(paddr);
 	unsigned long nrpages = size >> PAGE_SHIFT;
-	unsigned int level;
 	int err;
 
-	/* No change for pages after the last mapping */
-	if ((paddr + size - 1) >= (max_pfn_mapped << PAGE_SHIFT))
-		return 0;
-
-	/*
-	 * If there is no identity map for this address,
-	 * change_page_attr_addr is unnecessary
-	 */
-	if (!lookup_address(vaddr, &level))
-		return 0;
-
 	switch (mode) {
 	case IOR_MODE_UNCACHED:
 	default:
@@ -114,9 +101,8 @@ static int ioremap_change_attr(unsigned long paddr, unsigned long size,
 static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
 			       enum ioremap_mode mode)
 {
-	void __iomem *addr;
+	unsigned long pfn, offset, last_addr, vaddr;
 	struct vm_struct *area;
-	unsigned long offset, last_addr;
 	pgprot_t prot;
 
 	/* Don't allow wraparound or zero size */
@@ -133,9 +119,10 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
 	/*
 	 * Don't allow anybody to remap normal RAM that we're using..
 	 */
-	for (offset = phys_addr >> PAGE_SHIFT; offset < max_pfn_mapped &&
-	     (offset << PAGE_SHIFT) < last_addr; offset++) {
-		if (page_is_ram(offset))
+	for (pfn = phys_addr >> PAGE_SHIFT; pfn < max_pfn_mapped &&
+	     (pfn << PAGE_SHIFT) < last_addr; pfn++) {
+		if (page_is_ram(pfn) && pfn_valid(pfn) &&
+		    !PageReserved(pfn_to_page(pfn)))
 			return NULL;
 	}
 
@@ -163,19 +150,18 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
 	if (!area)
 		return NULL;
 	area->phys_addr = phys_addr;
-	addr = (void __iomem *) area->addr;
-	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
-			       phys_addr, prot)) {
-		remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
+	vaddr = (unsigned long) area->addr;
+	if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
+		remove_vm_area((void *)(vaddr & PAGE_MASK));
 		return NULL;
 	}
 
-	if (ioremap_change_attr(phys_addr, size, mode) < 0) {
-		vunmap(addr);
+	if (ioremap_change_attr(vaddr, size, mode) < 0) {
+		vunmap(area->addr);
 		return NULL;
 	}
 
-	return (void __iomem *) (offset + (char __iomem *)addr);
+	return (void __iomem *) (vaddr + offset);
 }
 
 /**
@@ -254,9 +240,6 @@ void iounmap(volatile void __iomem *addr)
 		return;
 	}
 
-	/* Reset the direct mapping. Can block */
-	ioremap_change_attr(p->phys_addr, p->size, IOR_MODE_CACHED);
-
 	/* Finally remove it */
 	o = remove_vm_area((void *)addr);
 	BUG_ON(p != o || o == NULL);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a920d09b919..5a02bf4c91e 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -202,6 +202,8 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
 	if (node_data[nodeid] == NULL)
 		return;
 	nodedata_phys = __pa(node_data[nodeid]);
+	printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
+		nodedata_phys + pgdat_size - 1);
 
 	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
 	NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
@@ -225,12 +227,15 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
 		return;
 	}
 	bootmap_start = __pa(bootmap);
-	Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
 
 	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
 					 bootmap_start >> PAGE_SHIFT,
 					 start_pfn, end_pfn);
 
+	printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
+		 bootmap_start, bootmap_start + bootmap_size - 1,
+		 bootmap_pages);
+
 	free_bootmem_with_active_regions(nodeid, end);
 
 	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 7573e786d2f..398f3a578dd 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -137,7 +137,8 @@ static __init int exercise_pageattr(void)
 
 		for (k = 0; k < len[i]; k++) {
 			pte = lookup_address(addr[i] + k*PAGE_SIZE, &level);
-			if (!pte || pgprot_val(pte_pgprot(*pte)) == 0) {
+			if (!pte || pgprot_val(pte_pgprot(*pte)) == 0 ||
+			    !(pte_val(*pte) & _PAGE_PRESENT)) {
 				addr[i] = 0;
 				break;
 			}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e297bd65e51..bb55a78dcd6 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -16,6 +16,17 @@
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 
+/*
+ * The current flushing context - we pass it instead of 5 arguments:
+ */
+struct cpa_data {
+	unsigned long vaddr;
+	pgprot_t mask_set;
+	pgprot_t mask_clr;
+	int numpages;
+	int flushtlb;
+};
+
 static inline int
 within(unsigned long addr, unsigned long start, unsigned long end)
 {
@@ -52,21 +63,23 @@ void clflush_cache_range(void *vaddr, unsigned int size)
 
 static void __cpa_flush_all(void *arg)
 {
+	unsigned long cache = (unsigned long)arg;
+
 	/*
 	 * Flush all to work around Errata in early athlons regarding
 	 * large page flushing.
 	 */
 	__flush_tlb_all();
 
-	if (boot_cpu_data.x86_model >= 4)
+	if (cache && boot_cpu_data.x86_model >= 4)
 		wbinvd();
 }
 
-static void cpa_flush_all(void)
+static void cpa_flush_all(unsigned long cache)
 {
 	BUG_ON(irqs_disabled());
 
-	on_each_cpu(__cpa_flush_all, NULL, 1, 1);
+	on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
 }
 
 static void __cpa_flush_range(void *arg)
@@ -79,7 +92,7 @@ static void __cpa_flush_range(void *arg)
 	__flush_tlb_all();
 }
 
-static void cpa_flush_range(unsigned long start, int numpages)
+static void cpa_flush_range(unsigned long start, int numpages, int cache)
 {
 	unsigned int i, level;
 	unsigned long addr;
@@ -89,6 +102,9 @@ static void cpa_flush_range(unsigned long start, int numpages)
 
 	on_each_cpu(__cpa_flush_range, NULL, 1, 1);
 
+	if (!cache)
+		return;
+
 	/*
 	 * We only need to flush on one CPU,
 	 * clflush is a MESI-coherent instruction that
@@ -101,11 +117,27 @@ static void cpa_flush_range(unsigned long start, int numpages)
 		/*
 		 * Only flush present addresses:
 		 */
-		if (pte && pte_present(*pte))
+		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
 			clflush_cache_range((void *) addr, PAGE_SIZE);
 	}
 }
 
+#define HIGH_MAP_START	__START_KERNEL_map
+#define HIGH_MAP_END	(__START_KERNEL_map + KERNEL_TEXT_SIZE)
+
+
+/*
+ * Converts a virtual address to a X86-64 highmap address
+ */
+static unsigned long virt_to_highmap(void *address)
+{
+#ifdef CONFIG_X86_64
+	return __pa((unsigned long)address) + HIGH_MAP_START - phys_base;
+#else
+	return (unsigned long)address;
+#endif
+}
+
 /*
  * Certain areas of memory on x86 require very specific protection flags,
  * for example the BIOS area or kernel text. Callers don't always get this
@@ -129,12 +161,24 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
 	 */
 	if (within(address, (unsigned long)_text, (unsigned long)_etext))
 		pgprot_val(forbidden) |= _PAGE_NX;
+	/*
+	 * Do the same for the x86-64 high kernel mapping
+	 */
+	if (within(address, virt_to_highmap(_text), virt_to_highmap(_etext)))
+		pgprot_val(forbidden) |= _PAGE_NX;
+
 
 #ifdef CONFIG_DEBUG_RODATA
 	/* The .rodata section needs to be read-only */
 	if (within(address, (unsigned long)__start_rodata,
 		   (unsigned long)__end_rodata))
 		pgprot_val(forbidden) |= _PAGE_RW;
+	/*
+	 * Do the same for the x86-64 high kernel mapping
+	 */
+	if (within(address, virt_to_highmap(__start_rodata),
+		   virt_to_highmap(__end_rodata)))
+		pgprot_val(forbidden) |= _PAGE_RW;
 #endif
 
 	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
@@ -142,6 +186,14 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
 	return prot;
 }
 
+/*
+ * Lookup the page table entry for a virtual address. Return a pointer
+ * to the entry and the level of the mapping.
+ *
+ * Note: We return pud and pmd either when the entry is marked large
+ * or when the present bit is not set. Otherwise we would return a
+ * pointer to a nonexisting mapping.
+ */
 pte_t *lookup_address(unsigned long address, int *level)
 {
 	pgd_t *pgd = pgd_offset_k(address);
@@ -152,21 +204,31 @@ pte_t *lookup_address(unsigned long address, int *level)
 
 	if (pgd_none(*pgd))
 		return NULL;
+
 	pud = pud_offset(pgd, address);
 	if (pud_none(*pud))
 		return NULL;
+
+	*level = PG_LEVEL_1G;
+	if (pud_large(*pud) || !pud_present(*pud))
+		return (pte_t *)pud;
+
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd))
 		return NULL;
 
 	*level = PG_LEVEL_2M;
-	if (pmd_large(*pmd))
+	if (pmd_large(*pmd) || !pmd_present(*pmd))
 		return (pte_t *)pmd;
 
 	*level = PG_LEVEL_4K;
+
 	return pte_offset_kernel(pmd, address);
 }
 
+/*
+ * Set the new pmd in all the pgds we know about:
+ */
 static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 {
 	/* change init_mm */
@@ -175,6 +237,7 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 	if (!SHARED_KERNEL_PMD) {
 		struct page *page;
 
+		address = __pa(address);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
 			pud_t *pud;
@@ -189,18 +252,114 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 #endif
 }
 
+static int
+try_preserve_large_page(pte_t *kpte, unsigned long address,
+			struct cpa_data *cpa)
+{
+	unsigned long nextpage_addr, numpages, pmask, psize, flags;
+	pte_t new_pte, old_pte, *tmp;
+	pgprot_t old_prot, new_prot;
+	int level, do_split = 1;
+
+	/*
+	 * An Athlon 64 X2 showed hard hangs if we tried to preserve
+	 * largepages and changed the PSE entry from RW to RO.
+	 *
+	 * As AMD CPUs have a long series of erratas in this area,
+	 * (and none of the known ones seem to explain this hang),
+	 * disable this code until the hang can be debugged:
+	 */
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		return 1;
+
+	spin_lock_irqsave(&pgd_lock, flags);
+	/*
+	 * Check for races, another CPU might have split this page
+	 * up already:
+	 */
+	tmp = lookup_address(address, &level);
+	if (tmp != kpte)
+		goto out_unlock;
+
+	switch (level) {
+	case PG_LEVEL_2M:
+		psize = PMD_PAGE_SIZE;
+		pmask = PMD_PAGE_MASK;
+		break;
+#ifdef CONFIG_X86_64
+	case PG_LEVEL_1G:
+		psize = PMD_PAGE_SIZE;
+		pmask = PMD_PAGE_MASK;
+		break;
+#endif
+	default:
+		do_split = -EINVAL;
+		goto out_unlock;
+	}
+
+	/*
+	 * Calculate the number of pages, which fit into this large
+	 * page starting at address:
+	 */
+	nextpage_addr = (address + psize) & pmask;
+	numpages = (nextpage_addr - address) >> PAGE_SHIFT;
+	if (numpages < cpa->numpages)
+		cpa->numpages = numpages;
+
+	/*
+	 * We are safe now. Check whether the new pgprot is the same:
+	 */
+	old_pte = *kpte;
+	old_prot = new_prot = pte_pgprot(old_pte);
+
+	pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
+	pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+	new_prot = static_protections(new_prot, address);
+
+	/*
+	 * If there are no changes, return. maxpages has been updated
+	 * above:
+	 */
+	if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
+		do_split = 0;
+		goto out_unlock;
+	}
+
+	/*
+	 * We need to change the attributes. Check, whether we can
+	 * change the large page in one go. We request a split, when
+	 * the address is not aligned and the number of pages is
+	 * smaller than the number of pages in the large page. Note
+	 * that we limited the number of possible pages already to
+	 * the number of pages in the large page.
+	 */
+	if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
+		/*
+		 * The address is aligned and the number of pages
+		 * covers the full page.
+		 */
+		new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
+		__set_pmd_pte(kpte, address, new_pte);
+		cpa->flushtlb = 1;
+		do_split = 0;
+	}
+
+out_unlock:
+	spin_unlock_irqrestore(&pgd_lock, flags);
+
+	return do_split;
+}
+
 static int split_large_page(pte_t *kpte, unsigned long address)
 {
-	pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte));
+	unsigned long flags, pfn, pfninc = 1;
 	gfp_t gfp_flags = GFP_KERNEL;
-	unsigned long flags;
-	unsigned long addr;
+	unsigned int i, level;
 	pte_t *pbase, *tmp;
+	pgprot_t ref_prot;
 	struct page *base;
-	unsigned int i, level;
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
-	gfp_flags = __GFP_HIGH | __GFP_NOFAIL | __GFP_NOWARN;
 	gfp_flags = GFP_ATOMIC | __GFP_NOWARN;
 #endif
 	base = alloc_pages(gfp_flags, 0);
@@ -213,30 +372,41 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 	 * up for us already:
 	 */
 	tmp = lookup_address(address, &level);
-	if (tmp != kpte) {
-		WARN_ON_ONCE(1);
+	if (tmp != kpte)
 		goto out_unlock;
-	}
 
-	address = __pa(address);
-	addr = address & LARGE_PAGE_MASK;
 	pbase = (pte_t *)page_address(base);
 #ifdef CONFIG_X86_32
 	paravirt_alloc_pt(&init_mm, page_to_pfn(base));
 #endif
+	ref_prot = pte_pgprot(pte_clrhuge(*kpte));
+
+#ifdef CONFIG_X86_64
+	if (level == PG_LEVEL_1G) {
+		pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
+		pgprot_val(ref_prot) |= _PAGE_PSE;
+	}
+#endif
 
-	pgprot_val(ref_prot) &= ~_PAGE_NX;
-	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE)
-		set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot));
+	/*
+	 * Get the target pfn from the original entry:
+	 */
+	pfn = pte_pfn(*kpte);
+	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
+		set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
 
 	/*
-	 * Install the new, split up pagetable. Important detail here:
+	 * Install the new, split up pagetable. Important details here:
 	 *
 	 * On Intel the NX bit of all levels must be cleared to make a
 	 * page executable. See section 4.13.2 of Intel 64 and IA-32
 	 * Architectures Software Developer's Manual).
+	 *
+	 * Mark the entry present. The current mapping might be
+	 * set to not present, which we preserved above.
 	 */
 	ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
+	pgprot_val(ref_prot) |= _PAGE_PRESENT;
 	__set_pmd_pte(kpte, address, mk_pte(base, ref_prot));
 	base = NULL;
 
@@ -249,18 +419,12 @@ out_unlock:
 	return 0;
 }
 
-static int
-__change_page_attr(unsigned long address, unsigned long pfn,
-		   pgprot_t mask_set, pgprot_t mask_clr)
+static int __change_page_attr(unsigned long address, struct cpa_data *cpa)
 {
+	int level, do_split, err;
 	struct page *kpte_page;
-	int level, err = 0;
 	pte_t *kpte;
 
-#ifdef CONFIG_X86_32
-	BUG_ON(pfn > max_low_pfn);
-#endif
-
 repeat:
 	kpte = lookup_address(address, &level);
 	if (!kpte)
@@ -271,23 +435,62 @@ repeat:
 	BUG_ON(PageCompound(kpte_page));
 
 	if (level == PG_LEVEL_4K) {
-		pgprot_t new_prot = pte_pgprot(*kpte);
 		pte_t new_pte, old_pte = *kpte;
+		pgprot_t new_prot = pte_pgprot(old_pte);
+
+		if(!pte_val(old_pte)) {
+			printk(KERN_WARNING "CPA: called for zero pte. "
+			       "vaddr = %lx cpa->vaddr = %lx\n", address,
+				cpa->vaddr);
+			WARN_ON(1);
+			return -EINVAL;
+		}
 
-		pgprot_val(new_prot) &= ~pgprot_val(mask_clr);
-		pgprot_val(new_prot) |= pgprot_val(mask_set);
+		pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
+		pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
 
 		new_prot = static_protections(new_prot, address);
 
-		new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
-		BUG_ON(pte_pfn(new_pte) != pte_pfn(old_pte));
+		/*
+		 * We need to keep the pfn from the existing PTE,
+		 * after all we're only going to change it's attributes
+		 * not the memory it points to
+		 */
+		new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
+
+		/*
+		 * Do we really change anything ?
+		 */
+		if (pte_val(old_pte) != pte_val(new_pte)) {
+			set_pte_atomic(kpte, new_pte);
+			cpa->flushtlb = 1;
+		}
+		cpa->numpages = 1;
+		return 0;
+	}
+
+	/*
+	 * Check, whether we can keep the large page intact
+	 * and just change the pte:
+	 */
+	do_split = try_preserve_large_page(kpte, address, cpa);
+	/*
+	 * When the range fits into the existing large page,
+	 * return. cp->numpages and cpa->tlbflush have been updated in
+	 * try_large_page:
+	 */
+	if (do_split <= 0)
+		return do_split;
 
-		set_pte_atomic(kpte, new_pte);
-	} else {
-		err = split_large_page(kpte, address);
-		if (!err)
-			goto repeat;
+	/*
+	 * We have to split the large page:
+	 */
+	err = split_large_page(kpte, address);
+	if (!err) {
+		cpa->flushtlb = 1;
+		goto repeat;
 	}
+
 	return err;
 }
 
@@ -304,19 +507,14 @@ repeat:
  *
 * Modules and drivers should use the set_memory_* APIs instead.
 */
-
-#define HIGH_MAP_START	__START_KERNEL_map
-#define HIGH_MAP_END	(__START_KERNEL_map + KERNEL_TEXT_SIZE)
-
-static int
-change_page_attr_addr(unsigned long address, pgprot_t mask_set,
-		      pgprot_t mask_clr)
+static int change_page_attr_addr(struct cpa_data *cpa)
 {
-	unsigned long phys_addr = __pa(address);
-	unsigned long pfn = phys_addr >> PAGE_SHIFT;
 	int err;
+	unsigned long address = cpa->vaddr;
 
 #ifdef CONFIG_X86_64
+	unsigned long phys_addr = __pa(address);
+
 	/*
 	 * If we are inside the high mapped kernel range, then we
 	 * fixup the low mapping first. __va() returns the virtual
@@ -326,7 +524,7 @@ change_page_attr_addr(unsigned long address, pgprot_t mask_set,
 	address = (unsigned long) __va(phys_addr);
 #endif
 
-	err = __change_page_attr(address, pfn, mask_set, mask_clr);
+	err = __change_page_attr(address, cpa);
 	if (err)
 		return err;
 
@@ -339,42 +537,89 @@ change_page_attr_addr(unsigned long address, pgprot_t mask_set,
 		/*
 		 * Calc the high mapping address. See __phys_addr()
 		 * for the non obvious details.
+		 *
+		 * Note that NX and other required permissions are
+		 * checked in static_protections().
 		 */
 		address = phys_addr + HIGH_MAP_START - phys_base;
-		/* Make sure the kernel mappings stay executable */
-		pgprot_val(mask_clr) |= _PAGE_NX;
 
 		/*
 		 * Our high aliases are imprecise, because we check
 		 * everything between 0 and KERNEL_TEXT_SIZE, so do
 		 * not propagate lookup failures back to users:
 		 */
-		__change_page_attr(address, pfn, mask_set, mask_clr);
+		__change_page_attr(address, cpa);
 	}
 #endif
 	return err;
 }
 
-static int __change_page_attr_set_clr(unsigned long addr, int numpages,
-				      pgprot_t mask_set, pgprot_t mask_clr)
+static int __change_page_attr_set_clr(struct cpa_data *cpa)
 {
-	unsigned int i;
-	int ret;
+	int ret, numpages = cpa->numpages;
 
-	for (i = 0; i < numpages ; i++, addr += PAGE_SIZE) {
-		ret = change_page_attr_addr(addr, mask_set, mask_clr);
+	while (numpages) {
+		/*
+		 * Store the remaining nr of pages for the large page
+		 * preservation check.
+		 */
+		cpa->numpages = numpages;
+		ret = change_page_attr_addr(cpa);
 		if (ret)
 			return ret;
-	}
 
+		/*
+		 * Adjust the number of pages with the result of the
+		 * CPA operation. Either a large page has been
+		 * preserved or a single page update happened.
+		 */
+		BUG_ON(cpa->numpages > numpages);
+		numpages -= cpa->numpages;
+		cpa->vaddr += cpa->numpages * PAGE_SIZE;
+	}
 	return 0;
 }
 
+static inline int cache_attr(pgprot_t attr)
+{
+	return pgprot_val(attr) &
+		(_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
+}
+
 static int change_page_attr_set_clr(unsigned long addr, int numpages,
 				    pgprot_t mask_set, pgprot_t mask_clr)
 {
-	int ret = __change_page_attr_set_clr(addr, numpages, mask_set,
-					     mask_clr);
+	struct cpa_data cpa;
+	int ret, cache;
+
+	/*
+	 * Check, if we are requested to change a not supported
+	 * feature:
+	 */
+	mask_set = canon_pgprot(mask_set);
+	mask_clr = canon_pgprot(mask_clr);
+	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
+		return 0;
+
+	cpa.vaddr = addr;
+	cpa.numpages = numpages;
+	cpa.mask_set = mask_set;
+	cpa.mask_clr = mask_clr;
+	cpa.flushtlb = 0;
+
+	ret = __change_page_attr_set_clr(&cpa);
+
+	/*
+	 * Check whether we really changed something:
+	 */
+	if (!cpa.flushtlb)
+		return ret;
+
+	/*
+	 * No need to flush, when we did not set any of the caching
+	 * attributes:
+	 */
+	cache = cache_attr(mask_set);
 
 	/*
 	 * On success we use clflush, when the CPU supports it to
@@ -383,9 +628,9 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
 	 * wbindv):
 	 */
 	if (!ret && cpu_has_clflush)
-		cpa_flush_range(addr, numpages);
+		cpa_flush_range(addr, numpages, cache);
 	else
-		cpa_flush_all();
+		cpa_flush_all(cache);
 
 	return ret;
 }
@@ -489,37 +734,26 @@ int set_pages_rw(struct page *page, int numpages)
 	return set_memory_rw(addr, numpages);
 }
 
-
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_CPA_DEBUG)
-static inline int __change_page_attr_set(unsigned long addr, int numpages,
-					 pgprot_t mask)
-{
-	return __change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
-}
-
-static inline int __change_page_attr_clear(unsigned long addr, int numpages,
-					   pgprot_t mask)
-{
-	return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
-}
-#endif
-
 #ifdef CONFIG_DEBUG_PAGEALLOC
 
 static int __set_pages_p(struct page *page, int numpages)
 {
-	unsigned long addr = (unsigned long)page_address(page);
+	struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+				.numpages = numpages,
+				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+				.mask_clr = __pgprot(0)};
 
-	return __change_page_attr_set(addr, numpages,
-				      __pgprot(_PAGE_PRESENT | _PAGE_RW));
+	return __change_page_attr_set_clr(&cpa);
 }
 
 static int __set_pages_np(struct page *page, int numpages)
 {
-	unsigned long addr = (unsigned long)page_address(page);
+	struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+				.numpages = numpages,
+				.mask_set = __pgprot(0),
+				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
 
-	return __change_page_attr_clear(addr, numpages,
-					__pgprot(_PAGE_PRESENT));
+	return __change_page_attr_set_clr(&cpa);
 }
 
 void kernel_map_pages(struct page *page, int numpages, int enable)
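For orientation, the set_memory_*()/set_pages_*() wrappers earlier in pageattr.c (not shown in these hunks) all funnel into the reworked change_page_attr_set_clr() path above. A minimal sketch of that calling convention, assuming only the signatures visible in this diff; the wrapper name below is illustrative, not the kernel's:

/* Sketch only: how a set_memory_* style helper drives the new
 * cpa_data-based path. change_page_attr_set_clr() is static to
 * pageattr.c, so a real wrapper lives in the same file. */
static int set_memory_ro_sketch(unsigned long addr, int numpages)
{
	/* Clear _PAGE_RW on numpages pages starting at addr; set nothing. */
	return change_page_attr_set_clr(addr, numpages,
					__pgprot(0), __pgprot(_PAGE_RW));
}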
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index cb3aa470249..c7db504be1e 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -219,50 +219,39 @@ static inline void pgd_list_del(pgd_t *pgd)
 	list_del(&page->lru);
 }
 
+#define UNSHARED_PTRS_PER_PGD \
+	(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
 
-
-#if (PTRS_PER_PMD == 1)
-/* Non-PAE pgd constructor */
-static void pgd_ctor(void *pgd)
+static void pgd_ctor(void *p)
 {
+	pgd_t *pgd = p;
 	unsigned long flags;
 
-	/* !PAE, no pagetable sharing */
+	/* Clear usermode parts of PGD */
 	memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
 
 	spin_lock_irqsave(&pgd_lock, flags);
 
-	/* must happen under lock */
-	clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
-			swapper_pg_dir + USER_PTRS_PER_PGD,
-			KERNEL_PGD_PTRS);
-	paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
-				__pa(swapper_pg_dir) >> PAGE_SHIFT,
-				USER_PTRS_PER_PGD,
-				KERNEL_PGD_PTRS);
-	pgd_list_add(pgd);
-	spin_unlock_irqrestore(&pgd_lock, flags);
-}
-#else /* PTRS_PER_PMD > 1 */
-/* PAE pgd constructor */
-static void pgd_ctor(void *pgd)
-{
-	/* PAE, kernel PMD may be shared */
-
-	if (SHARED_KERNEL_PMD) {
-		clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
+	/* If the pgd points to a shared pagetable level (either the
+	   ptes in non-PAE, or shared PMD in PAE), then just copy the
+	   references from swapper_pg_dir. */
+	if (PAGETABLE_LEVELS == 2 ||
+	    (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
+		clone_pgd_range(pgd + USER_PTRS_PER_PGD,
 				swapper_pg_dir + USER_PTRS_PER_PGD,
 				KERNEL_PGD_PTRS);
-	} else {
-		unsigned long flags;
+		paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
+					__pa(swapper_pg_dir) >> PAGE_SHIFT,
+					USER_PTRS_PER_PGD,
+					KERNEL_PGD_PTRS);
+	}
 
-		memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
-		spin_lock_irqsave(&pgd_lock, flags);
+	/* list required to sync kernel mapping updates */
+	if (!SHARED_KERNEL_PMD)
 		pgd_list_add(pgd);
-		spin_unlock_irqrestore(&pgd_lock, flags);
-	}
+
+	spin_unlock_irqrestore(&pgd_lock, flags);
 }
-#endif /* PTRS_PER_PMD */
 
 static void pgd_dtor(void *pgd)
 {
@@ -276,9 +265,6 @@ static void pgd_dtor(void *pgd)
 	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
-#define UNSHARED_PTRS_PER_PGD \
-	(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
-
 #ifdef CONFIG_X86_PAE
 /*
  * Mop up any pmd pages which may still be attached to the pgd.
@@ -387,13 +373,6 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 
 void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 {
-	/* This is called just after the pmd has been detached from
-	   the pgd, which requires a full tlb flush to be recognized
-	   by the CPU. Rather than incurring multiple tlb flushes
-	   while the address space is being pulled down, make the tlb
-	   gathering machinery do a full flush when we're done. */
-	tlb->fullmm = 1;
-
 	paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
 	tlb_remove_page(tlb, virt_to_page(pmd));
 }