8 files changed, 372 insertions, 229 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e4440d0abf81..ad8b9733d6b3 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -240,7 +240,8 @@ void dump_pagetable(unsigned long address)
        pud = pud_offset(pgd, address);
        if (bad_address(pud)) goto bad;
        printk("PUD %lx ", pud_val(*pud));
-        if (!pud_present(*pud)) goto ret;
+        if (!pud_present(*pud) || pud_large(*pud))
+                goto ret;
        pmd = pmd_offset(pud, address);
        if (bad_address(pmd)) goto bad;
@@ -508,6 +509,10 @@ static int vmalloc_fault(unsigned long address)
        pmd_t *pmd, *pmd_ref;
        pte_t *pte, *pte_ref;
+        /* Make sure we are in vmalloc area */
+        if (!(address >= VMALLOC_START && address < VMALLOC_END))
+                return -1;
        /* Copy kernel mappings over when needed. This can also
           happen within a race in page table update. In the later
           case just flush. */
@@ -603,6 +608,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
         */
 #ifdef CONFIG_X86_32
        if (unlikely(address >= TASK_SIZE)) {
+#else
+        if (unlikely(address >= TASK_SIZE64)) {
+#endif
                if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
                    vmalloc_fault(address) >= 0)
                        return;
@@ -618,6 +626,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
                goto bad_area_nosemaphore;
        }
+#ifdef CONFIG_X86_32
        /* It's safe to allow irq's after cr2 has been saved and the vmalloc
           fault has been handled. */
        if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
@@ -630,28 +640,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
        if (in_atomic() || !mm)
                goto bad_area_nosemaphore;
 #else /* CONFIG_X86_64 */
-        if (unlikely(address >= TASK_SIZE64)) {
-                /*
-                 * Don't check for the module range here: its PML4
-                 * is always initialized because it's shared with the main
-                 * kernel text. Only vmalloc may need PML4 syncups.
-                 */
-                if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
-                      ((address >= VMALLOC_START && address < VMALLOC_END))) {
-                        if (vmalloc_fault(address) >= 0)
-                                return;
-                }
-                /* Can handle a stale RO->RW TLB */
-                if (spurious_fault(address, error_code))
-                        return;
-                /*
-                 * Don't take the mm semaphore here. If we fixup a prefetch
-                 * fault we could otherwise deadlock.
-                 */
-                goto bad_area_nosemaphore;
-        }
        if (likely(regs->flags & X86_EFLAGS_IF))
                local_irq_enable();
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index f2f36f8dae52..d1bc04006d16 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -31,6 +31,7 @@
 #include <linux/initrd.h>
 #include <linux/cpumask.h>
+#include <asm/asm.h>
 #include <asm/processor.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -718,10 +719,7 @@ static noinline int do_test_wp_bit(void)
                "1:     movb %1, %0     \n"
                "       xorl %2, %2     \n"
                "2:                     \n"
-                ".section __ex_table, \"a\"\n"
+                _ASM_EXTABLE(1b,2b)
-                "       .align 4        \n"
-                "       .long 1b, 2b    \n"
-                ".previous              \n"
                :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
                 "=q" (tmp_reg),
                 "=r" (flag)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index eabcaed76c28..3a98d6f724ab 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -273,7 +273,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
        int i = pmd_index(address);
        for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
-                unsigned long entry;
                pmd_t *pmd = pmd_page + pmd_index(address);
                if (address >= end) {
@@ -287,9 +286,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
                if (pmd_val(*pmd))
                        continue;
-                entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address;
+                set_pte((pte_t *)pmd,
-                entry &= __supported_pte_mask;
+                        pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
-                set_pmd(pmd, __pmd(entry));
        }
 }
@@ -435,49 +433,6 @@ void __init paging_init(void)
 #endif
 /*
- * Unmap a kernel mapping if it exists. This is useful to avoid
- * prefetches from the CPU leading to inconsistent cache lines.
- * address and size must be aligned to 2MB boundaries.
- * Does nothing when the mapping doesn't exist.
- */
-void __init clear_kernel_mapping(unsigned long address, unsigned long size)
-{
-        unsigned long end = address + size;
-        BUG_ON(address & ~LARGE_PAGE_MASK);
-        BUG_ON(size & ~LARGE_PAGE_MASK);
-        for (; address < end; address += LARGE_PAGE_SIZE) {
-                pgd_t *pgd = pgd_offset_k(address);
-                pud_t *pud;
-                pmd_t *pmd;
-                if (pgd_none(*pgd))
-                        continue;
-                pud = pud_offset(pgd, address);
-                if (pud_none(*pud))
-                        continue;
-                pmd = pmd_offset(pud, address);
-                if (!pmd || pmd_none(*pmd))
-                        continue;
-                if (!(pmd_val(*pmd) & _PAGE_PSE)) {
-                        /*
-                         * Could handle this, but it should not happen
-                         * currently:
-                         */
-                        printk(KERN_ERR "clear_kernel_mapping: "
-                                "mapping has been split. will leak memory\n");
-                        pmd_ERROR(*pmd);
-                }
-                set_pmd(pmd, __pmd(0));
-        }
-        __flush_tlb_all();
-}
-/*
 * Memory hotplug specific functions
 */
 void online_page(struct page *page)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c004d94608fd..ee6648fe6b15 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -70,25 +70,12 @@ int page_is_ram(unsigned long pagenr)
 * Fix up the linear direct mapping of the kernel to avoid cache attribute
 * conflicts.
 */
-static int ioremap_change_attr(unsigned long paddr, unsigned long size,
+static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
                               enum ioremap_mode mode)
 {
-        unsigned long vaddr = (unsigned long)__va(paddr);
        unsigned long nrpages = size >> PAGE_SHIFT;
-        unsigned int level;
        int err;
-        /* No change for pages after the last mapping */
-        if ((paddr + size - 1) >= (max_pfn_mapped << PAGE_SHIFT))
-                return 0;
-        /*
-         * If there is no identity map for this address,
-         * change_page_attr_addr is unnecessary
-         */
-        if (!lookup_address(vaddr, &level))
-                return 0;
        switch (mode) {
        case IOR_MODE_UNCACHED:
        default:
@@ -114,9 +101,8 @@ static int ioremap_change_attr(unsigned long paddr, unsigned long size,
 static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
                               enum ioremap_mode mode)
 {
-        void __iomem *addr;
+        unsigned long pfn, offset, last_addr, vaddr;
        struct vm_struct *area;
-        unsigned long offset, last_addr;
        pgprot_t prot;
        /* Don't allow wraparound or zero size */
@@ -133,9 +119,10 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
        /*
         * Don't allow anybody to remap normal RAM that we're using..
         */
-        for (offset = phys_addr >> PAGE_SHIFT; offset < max_pfn_mapped &&
+        for (pfn = phys_addr >> PAGE_SHIFT; pfn < max_pfn_mapped &&
-             (offset << PAGE_SHIFT) < last_addr; offset++) {
+             (pfn << PAGE_SHIFT) < last_addr; pfn++) {
-                if (page_is_ram(offset))
+                if (page_is_ram(pfn) && pfn_valid(pfn) &&
+                    !PageReserved(pfn_to_page(pfn)))
                        return NULL;
        }
@@ -163,19 +150,18 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
        if (!area)
                return NULL;
        area->phys_addr = phys_addr;
-        addr = (void __iomem *) area->addr;
+        vaddr = (unsigned long) area->addr;
-        if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
+        if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
-                               phys_addr, prot)) {
+                remove_vm_area((void *)(vaddr & PAGE_MASK));
-                remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
                return NULL;
        }
-        if (ioremap_change_attr(phys_addr, size, mode) < 0) {
+        if (ioremap_change_attr(vaddr, size, mode) < 0) {
-                vunmap(addr);
+                vunmap(area->addr);
                return NULL;
        }
-        return (void __iomem *) (offset + (char __iomem *)addr);
+        return (void __iomem *) (vaddr + offset);
 }
 /**
@@ -254,9 +240,6 @@ void iounmap(volatile void __iomem *addr)
                return;
        }
-        /* Reset the direct mapping. Can block */
-        ioremap_change_attr(p->phys_addr, p->size, IOR_MODE_CACHED);
        /* Finally remove it */
        o = remove_vm_area((void *)addr);
        BUG_ON(p != o || o == NULL);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a920d09b9194..5a02bf4c91ec 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -202,6 +202,8 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
        if (node_data[nodeid] == NULL)
                return;
        nodedata_phys = __pa(node_data[nodeid]);
+        printk(KERN_INFO "  NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
+                nodedata_phys + pgdat_size - 1);
        memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
        NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
@@ -225,12 +227,15 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
                return;
        }
        bootmap_start = __pa(bootmap);
-        Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
        bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
                                         bootmap_start >> PAGE_SHIFT,
                                         start_pfn, end_pfn);
+        printk(KERN_INFO "  bootmap [%016lx -  %016lx] pages %lx\n",
+                 bootmap_start, bootmap_start + bootmap_size - 1,
+                 bootmap_pages);
        free_bootmem_with_active_regions(nodeid, end);
        reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 7573e786d2f2..398f3a578dde 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -137,7 +137,8 @@ static __init int exercise_pageattr(void)
                for (k = 0; k < len[i]; k++) {
                        pte = lookup_address(addr[i] + k*PAGE_SIZE, &level);
-                        if (!pte || pgprot_val(pte_pgprot(*pte)) == 0) {
+                        if (!pte || pgprot_val(pte_pgprot(*pte)) == 0 ||
+                            !(pte_val(*pte) & _PAGE_PRESENT)) {
                                addr[i] = 0;
                                break;
                        }
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e297bd65e513..bb55a78dcd62 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -16,6 +16,17 @@
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
+/*
+ * The current flushing context - we pass it instead of 5 arguments:
+ */
+struct cpa_data {
+        unsigned long   vaddr;          /* next virtual address to process */
+        pgprot_t        mask_set;       /* protection bits to set */
+        pgprot_t        mask_clr;       /* protection bits to clear */
+        int             numpages;       /* in: pages left, out: pages handled */
+        int             flushtlb;       /* set to 1 once an entry was changed or split */
+};
 static inline int
 within(unsigned long addr, unsigned long start, unsigned long end)
 {
@@ -52,21 +63,23 @@ void clflush_cache_range(void *vaddr, unsigned int size)
 static void __cpa_flush_all(void *arg)
 {
+        unsigned long cache = (unsigned long)arg;
        /*
         * Flush all to work around Errata in early athlons regarding
         * large page flushing.
         */
        __flush_tlb_all();
-        if (boot_cpu_data.x86_model >= 4)
+        if (cache && boot_cpu_data.x86_model >= 4)
                wbinvd();
 }
-static void cpa_flush_all(void)
+static void cpa_flush_all(unsigned long cache)
 {
        BUG_ON(irqs_disabled());
-        on_each_cpu(__cpa_flush_all, NULL, 1, 1);
+        on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
 }
 static void __cpa_flush_range(void *arg)
@@ -79,7 +92,7 @@ static void __cpa_flush_range(void *arg)
        __flush_tlb_all();
 }
-static void cpa_flush_range(unsigned long start, int numpages)
+static void cpa_flush_range(unsigned long start, int numpages, int cache)
 {
        unsigned int i, level;
        unsigned long addr;
@@ -89,6 +102,9 @@ static void cpa_flush_range(unsigned long start, int numpages)
        on_each_cpu(__cpa_flush_range, NULL, 1, 1);
+        if (!cache)
+                return;
        /*
         * We only need to flush on one CPU,
         * clflush is a MESI-coherent instruction that
@@ -101,11 +117,27 @@ static void cpa_flush_range(unsigned long start, int numpages)
                /*
                 * Only flush present addresses:
                 */
-                if (pte && pte_present(*pte))
+                if (pte && (pte_val(*pte) & _PAGE_PRESENT))
                        clflush_cache_range((void *) addr, PAGE_SIZE);
        }
 }
+#define HIGH_MAP_START  __START_KERNEL_map
+#define HIGH_MAP_END    (__START_KERNEL_map + KERNEL_TEXT_SIZE)
+/*
+ * Translate a kernel virtual address into its alias inside the x86-64
+ * high kernel mapping. On 32-bit the address is returned unchanged.
+ */
+static unsigned long virt_to_highmap(void *address)
+{
+        unsigned long vaddr = (unsigned long)address;
+#ifdef CONFIG_X86_64
+        vaddr = __pa(vaddr) + HIGH_MAP_START - phys_base;
+#endif
+        return vaddr;
+}
 /*
 * Certain areas of memory on x86 require very specific protection flags,
 * for example the BIOS area or kernel text. Callers don't always get this
@@ -129,12 +161,24 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
         */
        if (within(address, (unsigned long)_text, (unsigned long)_etext))
                pgprot_val(forbidden) |= _PAGE_NX;
+        /*
+         * Do the same for the x86-64 high kernel mapping
+         */
+        if (within(address, virt_to_highmap(_text), virt_to_highmap(_etext)))
+                pgprot_val(forbidden) |= _PAGE_NX;
 #ifdef CONFIG_DEBUG_RODATA
        /* The .rodata section needs to be read-only */
        if (within(address, (unsigned long)__start_rodata,
                                (unsigned long)__end_rodata))
                pgprot_val(forbidden) |= _PAGE_RW;
+        /*
+         * Do the same for the x86-64 high kernel mapping
+         */
+        if (within(address, virt_to_highmap(__start_rodata),
+                                virt_to_highmap(__end_rodata)))
+                pgprot_val(forbidden) |= _PAGE_RW;
 #endif
        prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
@@ -142,6 +186,14 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
        return prot;
 }
+/*
+ * Lookup the page table entry for a virtual address. Return a pointer
+ * to the entry and the level of the mapping.
+ *
+ * Note: We return pud and pmd either when the entry is marked large
+ * or when the present bit is not set. Otherwise we would return a
+ * pointer to a nonexisting mapping.
+ */
 pte_t *lookup_address(unsigned long address, int *level)
 {
        pgd_t *pgd = pgd_offset_k(address);
@@ -152,21 +204,31 @@ pte_t *lookup_address(unsigned long address, int *level)
        if (pgd_none(*pgd))
                return NULL;
        pud = pud_offset(pgd, address);
        if (pud_none(*pud))
                return NULL;
+        *level = PG_LEVEL_1G;
+        if (pud_large(*pud) || !pud_present(*pud))
+                return (pte_t *)pud;
        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd))
                return NULL;
        *level = PG_LEVEL_2M;
-        if (pmd_large(*pmd))
+        if (pmd_large(*pmd) || !pmd_present(*pmd))
                return (pte_t *)pmd;
        *level = PG_LEVEL_4K;
        return pte_offset_kernel(pmd, address);
 }
+/*
+ * Set the new pmd in all the pgds we know about:
+ */
 static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 {
        /* change init_mm */
@@ -175,6 +237,7 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
        if (!SHARED_KERNEL_PMD) {
                struct page *page;
+                address = __pa(address);
                list_for_each_entry(page, &pgd_list, lru) {
                        pgd_t *pgd;
                        pud_t *pud;
@@ -189,18 +252,114 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 #endif
 }
+/*
+ * Try to apply the attribute change in @cpa to the large page mapped by
+ * @kpte without splitting it.
+ *
+ * cpa->numpages is clamped to the number of 4k pages between @address
+ * and the end of the large page. cpa->flushtlb is set when the large
+ * page entry was rewritten.
+ *
+ * Returns 0 when nothing more needs to be done (attributes unchanged or
+ * the large page was updated in one go), 1 when the caller has to split
+ * the large page, and -EINVAL when the mapping level is not a large
+ * page level.
+ */
+static int
+try_preserve_large_page(pte_t *kpte, unsigned long address,
+                        struct cpa_data *cpa)
+{
+        unsigned long nextpage_addr, numpages, pmask, psize, flags;
+        pte_t new_pte, old_pte, *tmp;
+        pgprot_t old_prot, new_prot;
+        int level, do_split = 1;
+        /*
+         * An Athlon 64 X2 showed hard hangs if we tried to preserve
+         * largepages and changed the PSE entry from RW to RO.
+         *
+         * As AMD CPUs have a long series of errata in this area,
+         * (and none of the known ones seem to explain this hang),
+         * disable this code until the hang can be debugged:
+         */
+        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+                return 1;
+        spin_lock_irqsave(&pgd_lock, flags);
+        /*
+         * Check for races, another CPU might have split this page
+         * up already:
+         */
+        tmp = lookup_address(address, &level);
+        if (tmp != kpte)
+                goto out_unlock;
+        switch (level) {
+        case PG_LEVEL_2M:
+                psize = PMD_PAGE_SIZE;
+                pmask = PMD_PAGE_MASK;
+                break;
+#ifdef CONFIG_X86_64
+        case PG_LEVEL_1G:
+                /* A 1G mapping is a pud entry, so use the pud geometry */
+                psize = PUD_SIZE;
+                pmask = PUD_MASK;
+                break;
+#endif
+        default:
+                do_split = -EINVAL;
+                goto out_unlock;
+        }
+        /*
+         * Calculate the number of pages, which fit into this large
+         * page starting at address:
+         */
+        nextpage_addr = (address + psize) & pmask;
+        numpages = (nextpage_addr - address) >> PAGE_SHIFT;
+        if (numpages < cpa->numpages)
+                cpa->numpages = numpages;
+        /*
+         * We are safe now. Check whether the new pgprot is the same:
+         */
+        old_pte = *kpte;
+        old_prot = new_prot = pte_pgprot(old_pte);
+        pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
+        pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+        new_prot = static_protections(new_prot, address);
+        /*
+         * If there are no changes, return. cpa->numpages has been
+         * updated above:
+         */
+        if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
+                do_split = 0;
+                goto out_unlock;
+        }
+        /*
+         * We need to change the attributes. Check, whether we can
+         * change the large page in one go. We request a split, when
+         * the address is not aligned or the number of pages is
+         * smaller than the number of pages in the large page. Note
+         * that we limited the number of possible pages already to
+         * the number of pages in the large page.
+         */
+        if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
+                /*
+                 * The address is aligned and the number of pages
+                 * covers the full page.
+                 */
+                new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
+                __set_pmd_pte(kpte, address, new_pte);
+                cpa->flushtlb = 1;
+                do_split = 0;
+        }
+out_unlock:
+        spin_unlock_irqrestore(&pgd_lock, flags);
+        return do_split;
+}
 static int split_large_page(pte_t *kpte, unsigned long address)
 {
-        pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte));
+        unsigned long flags, pfn, pfninc = 1;
        gfp_t gfp_flags = GFP_KERNEL;
-        unsigned long flags;
+        unsigned int i, level;
-        unsigned long addr;
        pte_t *pbase, *tmp;
+        pgprot_t ref_prot;
        struct page *base;
-        unsigned int i, level;
 #ifdef CONFIG_DEBUG_PAGEALLOC
-        gfp_flags = __GFP_HIGH | __GFP_NOFAIL | __GFP_NOWARN;
        gfp_flags = GFP_ATOMIC | __GFP_NOWARN;
 #endif
        base = alloc_pages(gfp_flags, 0);
@@ -213,30 +372,41 @@ static int split_large_page(pte_t *kpte, unsigned long address)
         * up for us already:
         */
        tmp = lookup_address(address, &level);
-        if (tmp != kpte) {
+        if (tmp != kpte)
-                WARN_ON_ONCE(1);
                goto out_unlock;
-        }
-        address = __pa(address);
-        addr = address & LARGE_PAGE_MASK;
        pbase = (pte_t *)page_address(base);
 #ifdef CONFIG_X86_32
        paravirt_alloc_pt(&init_mm, page_to_pfn(base));
 #endif
+        ref_prot = pte_pgprot(pte_clrhuge(*kpte));
+#ifdef CONFIG_X86_64
+        if (level == PG_LEVEL_1G) {
+                pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
+                pgprot_val(ref_prot) |= _PAGE_PSE;
+        }
+#endif
-        pgprot_val(ref_prot) &= ~_PAGE_NX;
+        /*
-        for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE)
+         * Get the target pfn from the original entry:
-                set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot));
+         */
+        pfn = pte_pfn(*kpte);
+        for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
+                set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
        /*
-         * Install the new, split up pagetable. Important detail here:
+         * Install the new, split up pagetable. Important details here:
         *
         * On Intel the NX bit of all levels must be cleared to make a
         * page executable. See section 4.13.2 of Intel 64 and IA-32
         * Architectures Software Developer's Manual).
+         *
+         * Mark the entry present. The current mapping might be
+         * set to not present, which we preserved above.
         */
        ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
+        pgprot_val(ref_prot) |= _PAGE_PRESENT;
        __set_pmd_pte(kpte, address, mk_pte(base, ref_prot));
        base = NULL;
@@ -249,18 +419,12 @@ out_unlock:
        return 0;
 }
-static int
+static int __change_page_attr(unsigned long address, struct cpa_data *cpa)
-__change_page_attr(unsigned long address, unsigned long pfn,
-                   pgprot_t mask_set, pgprot_t mask_clr)
 {
+        int level, do_split, err;
        struct page *kpte_page;
-        int level, err = 0;
        pte_t *kpte;
-#ifdef CONFIG_X86_32
-        BUG_ON(pfn > max_low_pfn);
-#endif
 repeat:
        kpte = lookup_address(address, &level);
        if (!kpte)
@@ -271,23 +435,62 @@ repeat:
        BUG_ON(PageCompound(kpte_page));
        if (level == PG_LEVEL_4K) {
-                pgprot_t new_prot = pte_pgprot(*kpte);
                pte_t new_pte, old_pte = *kpte;
+                pgprot_t new_prot = pte_pgprot(old_pte);
+                if(!pte_val(old_pte)) {
+                        printk(KERN_WARNING "CPA: called for zero pte. "
+                               "vaddr = %lx cpa->vaddr = %lx\n", address,
+                                cpa->vaddr);
+                        WARN_ON(1);
+                        return -EINVAL;
+                }
-                pgprot_val(new_prot) &= ~pgprot_val(mask_clr);
+                pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
-                pgprot_val(new_prot) |= pgprot_val(mask_set);
+                pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
                new_prot = static_protections(new_prot, address);
-                new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
+                /*
-                BUG_ON(pte_pfn(new_pte) != pte_pfn(old_pte));
+                 * We need to keep the pfn from the existing PTE,
+                 * after all we're only going to change its attributes
+                 * not the memory it points to
+                 */
+                new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
+                /*
+                 * Do we really change anything ?
+                 */
+                if (pte_val(old_pte) != pte_val(new_pte)) {
+                        set_pte_atomic(kpte, new_pte);
+                        cpa->flushtlb = 1;
+                }
+                cpa->numpages = 1;
+                return 0;
+        }
+        /*
+         * Check, whether we can keep the large page intact
+         * and just change the pte:
+         */
+        do_split = try_preserve_large_page(kpte, address, cpa);
+        /*
+         * When the range fits into the existing large page,
+         * return. cpa->numpages and cpa->flushtlb have been updated in
+         * try_preserve_large_page:
+         */
+        if (do_split <= 0)
+                return do_split;
-                set_pte_atomic(kpte, new_pte);
+        /*
-        } else {
+         * We have to split the large page:
-                err = split_large_page(kpte, address);
+         */
-                if (!err)
+        err = split_large_page(kpte, address);
-                        goto repeat;
+        if (!err) {
+                cpa->flushtlb = 1;
+                goto repeat;
        }
        return err;
 }
@@ -304,19 +507,14 @@ repeat:
 *
 * Modules and drivers should use the set_memory_* APIs instead.
 */
+static int change_page_attr_addr(struct cpa_data *cpa)
-#define HIGH_MAP_START  __START_KERNEL_map
-#define HIGH_MAP_END    (__START_KERNEL_map + KERNEL_TEXT_SIZE)
-static int
-change_page_attr_addr(unsigned long address, pgprot_t mask_set,
-                      pgprot_t mask_clr)
 {
-        unsigned long phys_addr = __pa(address);
-        unsigned long pfn = phys_addr >> PAGE_SHIFT;
        int err;
+        unsigned long address = cpa->vaddr;
 #ifdef CONFIG_X86_64
+        unsigned long phys_addr = __pa(address);
        /*
         * If we are inside the high mapped kernel range, then we
         * fixup the low mapping first. __va() returns the virtual
@@ -326,7 +524,7 @@ change_page_attr_addr(unsigned long address, pgprot_t mask_set,
                address = (unsigned long) __va(phys_addr);
 #endif
-        err = __change_page_attr(address, pfn, mask_set, mask_clr);
+        err = __change_page_attr(address, cpa);
        if (err)
                return err;
@@ -339,42 +537,89 @@ change_page_attr_addr(unsigned long address, pgprot_t mask_set,
                /*
                 * Calc the high mapping address. See __phys_addr()
                 * for the non obvious details.
+                 *
+                 * Note that NX and other required permissions are
+                 * checked in static_protections().
                 */
                address = phys_addr + HIGH_MAP_START - phys_base;
-                /* Make sure the kernel mappings stay executable */
-                pgprot_val(mask_clr) |= _PAGE_NX;
                /*
                 * Our high aliases are imprecise, because we check
                 * everything between 0 and KERNEL_TEXT_SIZE, so do
                 * not propagate lookup failures back to users:
                 */
-                __change_page_attr(address, pfn, mask_set, mask_clr);
+                __change_page_attr(address, cpa);
        }
 #endif
        return err;
 }
-static int __change_page_attr_set_clr(unsigned long addr, int numpages,
+static int __change_page_attr_set_clr(struct cpa_data *cpa)
-                                      pgprot_t mask_set, pgprot_t mask_clr)
 {
-        unsigned int i;
+        int ret, numpages = cpa->numpages;
-        int ret;
-        for (i = 0; i < numpages ; i++, addr += PAGE_SIZE) {
+        while (numpages) {
-                ret = change_page_attr_addr(addr, mask_set, mask_clr);
+                /*
+                 * Store the remaining nr of pages for the large page
+                 * preservation check.
+                 */
+                cpa->numpages = numpages;
+                ret = change_page_attr_addr(cpa);
                if (ret)
                        return ret;
-        }
+                /*
+                 * Adjust the number of pages with the result of the
+                 * CPA operation. Either a large page has been
+                 * preserved or a single page update happened.
+                 */
+                BUG_ON(cpa->numpages > numpages);
+                numpages -= cpa->numpages;
+                cpa->vaddr += cpa->numpages * PAGE_SIZE;
+        }
        return 0;
 }
+/* Return the cache-control bits (PAT, PWT, PCD) contained in @attr */
+static inline int cache_attr(pgprot_t attr)
+{
+        const unsigned long caching_bits =
+                _PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD;
+        return pgprot_val(attr) & caching_bits;
+}
 static int change_page_attr_set_clr(unsigned long addr, int numpages,
                                    pgprot_t mask_set, pgprot_t mask_clr)
 {
-        int ret = __change_page_attr_set_clr(addr, numpages, mask_set,
+        struct cpa_data cpa;
-                                             mask_clr);
+        int ret, cache;
+        /*
+         * Check, if we are requested to change a not supported
+         * feature:
+         */
+        mask_set = canon_pgprot(mask_set);
+        mask_clr = canon_pgprot(mask_clr);
+        if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
+                return 0;
+        cpa.vaddr = addr;
+        cpa.numpages = numpages;
+        cpa.mask_set = mask_set;
+        cpa.mask_clr = mask_clr;
+        cpa.flushtlb = 0;
+        ret = __change_page_attr_set_clr(&cpa);
+        /*
+         * Check whether we really changed something:
+         */
+        if (!cpa.flushtlb)
+                return ret;
+        /*
+         * No need to flush, when we did not set any of the caching
+         * attributes:
+         */
+        cache = cache_attr(mask_set);
        /*
         * On success we use clflush, when the CPU supports it to
@@ -383,9 +628,9 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
         * wbindv):
         */
        if (!ret && cpu_has_clflush)
-                cpa_flush_range(addr, numpages);
+                cpa_flush_range(addr, numpages, cache);
        else
-                cpa_flush_all();
+                cpa_flush_all(cache);
        return ret;
 }
@@ -489,37 +734,26 @@ int set_pages_rw(struct page *page, int numpages)
        return set_memory_rw(addr, numpages);
 }
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_CPA_DEBUG)
-static inline int __change_page_attr_set(unsigned long addr, int numpages,
-                                         pgprot_t mask)
-{
-        return __change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
-}
-static inline int __change_page_attr_clear(unsigned long addr, int numpages,
-                                           pgprot_t mask)
-{
-        return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
-}
-#endif
 #ifdef CONFIG_DEBUG_PAGEALLOC
 static int __set_pages_p(struct page *page, int numpages)
 {
-        unsigned long addr = (unsigned long)page_address(page);
+        struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+                                .numpages = numpages,
+                                .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+                                .mask_clr = __pgprot(0)};
-        return __change_page_attr_set(addr, numpages,
+        return __change_page_attr_set_clr(&cpa);
-                                      __pgprot(_PAGE_PRESENT | _PAGE_RW));
 }
 static int __set_pages_np(struct page *page, int numpages)
 {
-        unsigned long addr = (unsigned long)page_address(page);
+        struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+                                .numpages = numpages,
+                                .mask_set = __pgprot(0),
+                                .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
-        return __change_page_attr_clear(addr, numpages,
+        return __change_page_attr_set_clr(&cpa);
-                                        __pgprot(_PAGE_PRESENT));
 }
 void kernel_map_pages(struct page *page, int numpages, int enable)
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index cb3aa470249b..c7db504be1ea 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -219,50 +219,39 @@ static inline void pgd_list_del(pgd_t *pgd)
        list_del(&page->lru);
 }
+#define UNSHARED_PTRS_PER_PGD                           \
+        (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
+static void pgd_ctor(void *p)
-#if (PTRS_PER_PMD == 1)
-/* Non-PAE pgd constructor */
-static void pgd_ctor(void *pgd)
 {
+        pgd_t *pgd = p;
        unsigned long flags;
-        /* !PAE, no pagetable sharing */
+        /* Clear usermode parts of PGD */
        memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
        spin_lock_irqsave(&pgd_lock, flags);
-        /* must happen under lock */
+        /* If the pgd points to a shared pagetable level (either the
-        clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
+           ptes in non-PAE, or shared PMD in PAE), then just copy the
-                        swapper_pg_dir + USER_PTRS_PER_PGD,
+           references from swapper_pg_dir. */
-                        KERNEL_PGD_PTRS);
+        if (PAGETABLE_LEVELS == 2 ||
-        paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
+            (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
-                                __pa(swapper_pg_dir) >> PAGE_SHIFT,
+                clone_pgd_range(pgd + USER_PTRS_PER_PGD,
-                                USER_PTRS_PER_PGD,
-                                KERNEL_PGD_PTRS);
-        pgd_list_add(pgd);
-        spin_unlock_irqrestore(&pgd_lock, flags);
-}
-#else  /* PTRS_PER_PMD > 1 */
-/* PAE pgd constructor */
-static void pgd_ctor(void *pgd)
-{
-        /* PAE, kernel PMD may be shared */
-        if (SHARED_KERNEL_PMD) {
-                clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
                                swapper_pg_dir + USER_PTRS_PER_PGD,
                                KERNEL_PGD_PTRS);
-        } else {
+                paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
-                unsigned long flags;
+                                        __pa(swapper_pg_dir) >> PAGE_SHIFT,
+                                        USER_PTRS_PER_PGD,
+                                        KERNEL_PGD_PTRS);
+        }
-                memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+        /* list required to sync kernel mapping updates */
-                spin_lock_irqsave(&pgd_lock, flags);
+        if (!SHARED_KERNEL_PMD)
                pgd_list_add(pgd);
-                spin_unlock_irqrestore(&pgd_lock, flags);
-        }
+        spin_unlock_irqrestore(&pgd_lock, flags);
 }
-#endif  /* PTRS_PER_PMD */
 static void pgd_dtor(void *pgd)
 {
@@ -276,9 +265,6 @@ static void pgd_dtor(void *pgd)
        spin_unlock_irqrestore(&pgd_lock, flags);
 }
-#define UNSHARED_PTRS_PER_PGD                           \
-        (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
 #ifdef CONFIG_X86_PAE
 /*
 * Mop up any pmd pages which may still be attached to the pgd.
@@ -387,13 +373,6 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 {
-        /* This is called just after the pmd has been detached from
-           the pgd, which requires a full tlb flush to be recognized
-           by the CPU.  Rather than incurring multiple tlb flushes
-           while the address space is being pulled down, make the tlb
-           gathering machinery do a full flush when we're done. */
-        tlb->fullmm = 1;
        paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
        tlb_remove_page(tlb, virt_to_page(pmd));
 }