Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile        |   2
-rw-r--r--  arch/x86/mm/discontig_32.c  |   3
-rw-r--r--  arch/x86/mm/gup.c           | 298
-rw-r--r--  arch/x86/mm/hugetlbpage.c   |  78
-rw-r--r--  arch/x86/mm/init_32.c       |   4
-rw-r--r--  arch/x86/mm/init_64.c       |  85
-rw-r--r--  arch/x86/mm/ioremap.c       |  18
-rw-r--r--  arch/x86/mm/mmio-mod.c      |   4
-rw-r--r--  arch/x86/mm/numa_64.c       |   4
-rw-r--r--  arch/x86/mm/pageattr-test.c |   3
-rw-r--r--  arch/x86/mm/pageattr.c      |  27
-rw-r--r--  arch/x86/mm/pat.c           |  50
-rw-r--r--  arch/x86/mm/pgtable.c       |   3
-rw-r--r--  arch/x86/mm/pgtable_32.c    |  47
-rw-r--r--  arch/x86/mm/srat_32.c       |  12
15 files changed, 475 insertions(+), 163 deletions(-)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 1fbb844c3d7a..dfb932dcf136 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,5 @@
 obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
-        pat.o pgtable.o
+        pat.o pgtable.o gup.o
 
 obj-$(CONFIG_X86_32) += pgtable_32.o
 
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 5dfef9fa061a..62fa440678d8 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -42,7 +42,6 @@
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
-static bootmem_data_t node0_bdata;
 
 /*
  * numa interface - we expect the numa architecture specific code to have
@@ -385,7 +384,7 @@ void __init initmem_init(unsigned long start_pfn,
        for_each_online_node(nid)
                memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
 
-       NODE_DATA(0)->bdata = &node0_bdata;
+       NODE_DATA(0)->bdata = &bootmem_node_data[0];
        setup_bootmem_allocator();
 }
 
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 000000000000..007bb06c7504
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,298 @@
+/*
+ * Lockless get_user_pages_fast for x86
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/highmem.h>
+
+#include <asm/pgtable.h>
+
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+#ifndef CONFIG_X86_PAE
+       return *ptep;
+#else
+       /*
+        * With get_user_pages_fast, we walk down the pagetables without taking
+        * any locks. For this we would like to load the pointers atomically,
+        * but that is not possible (without expensive cmpxchg8b) on PAE. What
+        * we do have is the guarantee that a pte will only either go from not
+        * present to present, or present to not present or both -- it will not
+        * switch to a completely different present page without a TLB flush in
+        * between; something that we are blocking by holding interrupts off.
+        *
+        * Setting ptes from not present to present goes:
+        * ptep->pte_high = h;
+        * smp_wmb();
+        * ptep->pte_low = l;
+        *
+        * And present to not present goes:
+        * ptep->pte_low = 0;
+        * smp_wmb();
+        * ptep->pte_high = 0;
+        *
+        * We must ensure here that the load of pte_low sees l iff pte_high
+        * sees h. We load pte_high *after* loading pte_low, which ensures we
+        * don't see an older value of pte_high. *Then* we recheck pte_low,
+        * which ensures that we haven't picked up a changed pte high. We might
+        * have got rubbish values from pte_low and pte_high, but we are
+        * guaranteed that pte_low will not have the present bit set *unless*
+        * it is 'l'. And get_user_pages_fast only operates on present ptes, so
+        * we're safe.
+        *
+        * gup_get_pte should not be used or copied outside gup.c without being
+        * very careful -- it does not atomically load the pte or anything that
+        * is likely to be useful for you.
+        */
+       pte_t pte;
+
+retry:
+       pte.pte_low = ptep->pte_low;
+       smp_rmb();
+       pte.pte_high = ptep->pte_high;
+       smp_rmb();
+       if (unlikely(pte.pte_low != ptep->pte_low))
+               goto retry;
+
+       return pte;
+#endif
+}
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+               unsigned long end, int write, struct page **pages, int *nr)
+{
+       unsigned long mask;
+       pte_t *ptep;
+
+       mask = _PAGE_PRESENT|_PAGE_USER;
+       if (write)
+               mask |= _PAGE_RW;
+
+       ptep = pte_offset_map(&pmd, addr);
+       do {
+               pte_t pte = gup_get_pte(ptep);
+               struct page *page;
+
+               if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+                       pte_unmap(ptep);
+                       return 0;
+               }
+               VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+               page = pte_page(pte);
+               get_page(page);
+               pages[*nr] = page;
+               (*nr)++;
+
+       } while (ptep++, addr += PAGE_SIZE, addr != end);
+       pte_unmap(ptep - 1);
+
+       return 1;
+}
+
+static inline void get_head_page_multiple(struct page *page, int nr)
+{
+       VM_BUG_ON(page != compound_head(page));
+       VM_BUG_ON(page_count(page) == 0);
+       atomic_add(nr, &page->_count);
+}
+
+static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
+               unsigned long end, int write, struct page **pages, int *nr)
+{
+       unsigned long mask;
+       pte_t pte = *(pte_t *)&pmd;
+       struct page *head, *page;
+       int refs;
+
+       mask = _PAGE_PRESENT|_PAGE_USER;
+       if (write)
+               mask |= _PAGE_RW;
+       if ((pte_val(pte) & mask) != mask)
+               return 0;
+       /* hugepages are never "special" */
+       VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+       VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+       refs = 0;
+       head = pte_page(pte);
+       page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+       do {
+               VM_BUG_ON(compound_head(page) != head);
+               pages[*nr] = page;
+               (*nr)++;
+               page++;
+               refs++;
+       } while (addr += PAGE_SIZE, addr != end);
+       get_head_page_multiple(head, refs);
+
+       return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+               int write, struct page **pages, int *nr)
+{
+       unsigned long next;
+       pmd_t *pmdp;
+
+       pmdp = pmd_offset(&pud, addr);
+       do {
+               pmd_t pmd = *pmdp;
+
+               next = pmd_addr_end(addr, end);
+               if (pmd_none(pmd))
+                       return 0;
+               if (unlikely(pmd_large(pmd))) {
+                       if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+                               return 0;
+               } else {
+                       if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+                               return 0;
+               }
+       } while (pmdp++, addr = next, addr != end);
+
+       return 1;
+}
+
+static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
+               unsigned long end, int write, struct page **pages, int *nr)
+{
+       unsigned long mask;
+       pte_t pte = *(pte_t *)&pud;
+       struct page *head, *page;
+       int refs;
+
+       mask = _PAGE_PRESENT|_PAGE_USER;
+       if (write)
+               mask |= _PAGE_RW;
+       if ((pte_val(pte) & mask) != mask)
+               return 0;
+       /* hugepages are never "special" */
+       VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+       VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+       refs = 0;
+       head = pte_page(pte);
+       page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+       do {
+               VM_BUG_ON(compound_head(page) != head);
+               pages[*nr] = page;
+               (*nr)++;
+               page++;
+               refs++;
+       } while (addr += PAGE_SIZE, addr != end);
+       get_head_page_multiple(head, refs);
+
+       return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+                       int write, struct page **pages, int *nr)
+{
+       unsigned long next;
+       pud_t *pudp;
+
+       pudp = pud_offset(&pgd, addr);
+       do {
+               pud_t pud = *pudp;
+
+               next = pud_addr_end(addr, end);
+               if (pud_none(pud))
+                       return 0;
+               if (unlikely(pud_large(pud))) {
+                       if (!gup_huge_pud(pud, addr, next, write, pages, nr))
+                               return 0;
+               } else {
+                       if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+                               return 0;
+               }
+       } while (pudp++, addr = next, addr != end);
+
+       return 1;
+}
+
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+                       struct page **pages)
+{
+       struct mm_struct *mm = current->mm;
+       unsigned long addr, len, end;
+       unsigned long next;
+       pgd_t *pgdp;
+       int nr = 0;
+
+       start &= PAGE_MASK;
+       addr = start;
+       len = (unsigned long) nr_pages << PAGE_SHIFT;
+       end = start + len;
+       if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+                                       start, len)))
+               goto slow_irqon;
+
+       /*
+        * XXX: batch / limit 'nr', to avoid large irq off latency
+        * needs some instrumenting to determine the common sizes used by
+        * important workloads (eg. DB2), and whether limiting the batch size
+        * will decrease performance.
+        *
+        * It seems like we're in the clear for the moment. Direct-IO is
+        * the main guy that batches up lots of get_user_pages, and even
+        * they are limited to 64-at-a-time which is not so many.
+        */
+       /*
+        * This doesn't prevent pagetable teardown, but does prevent
+        * the pagetables and pages from being freed on x86.
+        *
+        * So long as we atomically load page table pointers versus teardown
+        * (which we do on x86, with the above PAE exception), we can follow the
+        * address down to the page and take a ref on it.
+        */
+       local_irq_disable();
+       pgdp = pgd_offset(mm, addr);
+       do {
+               pgd_t pgd = *pgdp;
+
+               next = pgd_addr_end(addr, end);
+               if (pgd_none(pgd))
+                       goto slow;
+               if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+                       goto slow;
+       } while (pgdp++, addr = next, addr != end);
+       local_irq_enable();
+
+       VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+       return nr;
+
+       {
+               int ret;
+
+slow:
+               local_irq_enable();
+slow_irqon:
+               /* Try to get the remaining pages with get_user_pages */
+               start += nr << PAGE_SHIFT;
+               pages += nr;
+
+               down_read(&mm->mmap_sem);
+               ret = get_user_pages(current, mm, start,
+                       (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+               up_read(&mm->mmap_sem);
+
+               /* Have to be a bit careful with return values */
+               if (nr > 0) {
+                       if (ret < 0)
+                               ret = nr;
+                       else
+                               ret += nr;
+               }
+
+               return ret;
+       }
+}
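
get_user_pages_fast() above is the whole public surface of this file: it pins nr_pages user pages starting at start, walking the page tables with interrupts off and punting to the classic get_user_pages() slow path on any snag. A minimal caller sketch, assuming nothing beyond the signature added in this patch; pin_user_buffer() and its all-or-nothing policy are hypothetical, not part of the patch:

#include <linux/mm.h>

/* Hypothetical helper, for illustration only: pin a whole user buffer or
 * pin nothing. get_user_pages_fast() may pin fewer pages than requested,
 * so a caller must release any partial pin itself. */
static int pin_user_buffer(unsigned long uaddr, int nr_pages,
                           struct page **pages)
{
        /* write=1: the kernel intends to write into these pages */
        int got = get_user_pages_fast(uaddr, nr_pages, 1, pages);

        if (got == nr_pages)
                return 0;
        while (got > 0)                 /* partial pin: undo and fail */
                put_page(pages[--got]);
        return -EFAULT;
}

The return value is the number of pages pinned, which can be fewer than requested when the irq-off walk hits a non-present, non-user, or (for a write) read-only pte and the slow path cannot finish the job.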
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 0b3d567e686d..8f307d914c2e 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -124,7 +124,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
        return 1;
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+                       unsigned long addr, unsigned long sz)
 {
        pgd_t *pgd;
        pud_t *pud;
@@ -133,9 +134,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
        pgd = pgd_offset(mm, addr);
        pud = pud_alloc(mm, pgd, addr);
        if (pud) {
-               if (pud_none(*pud))
-                       huge_pmd_share(mm, addr, pud);
-               pte = (pte_t *) pmd_alloc(mm, pud, addr);
+               if (sz == PUD_SIZE) {
+                       pte = (pte_t *)pud;
+               } else {
+                       BUG_ON(sz != PMD_SIZE);
+                       if (pud_none(*pud))
+                               huge_pmd_share(mm, addr, pud);
+                       pte = (pte_t *) pmd_alloc(mm, pud, addr);
+               }
        }
        BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
 
@@ -151,8 +157,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
        pgd = pgd_offset(mm, addr);
        if (pgd_present(*pgd)) {
                pud = pud_offset(pgd, addr);
-               if (pud_present(*pud))
+               if (pud_present(*pud)) {
+                       if (pud_large(*pud))
+                               return (pte_t *)pud;
                        pmd = pmd_offset(pud, addr);
+               }
        }
        return (pte_t *) pmd;
 }
@@ -188,6 +197,11 @@ int pmd_huge(pmd_t pmd)
        return 0;
 }
 
+int pud_huge(pud_t pud)
+{
+       return 0;
+}
+
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
@@ -208,6 +222,11 @@ int pmd_huge(pmd_t pmd)
        return !!(pmd_val(pmd) & _PAGE_PSE);
 }
 
+int pud_huge(pud_t pud)
+{
+       return !!(pud_val(pud) & _PAGE_PSE);
+}
+
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
@@ -216,9 +235,22 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 
        page = pte_page(*(pte_t *)pmd);
        if (page)
-               page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
+               page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
+       return page;
+}
+
+struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+       pud_t *pud, int write)
+{
+       struct page *page;
+
+       page = pte_page(*(pte_t *)pud);
+       if (page)
+               page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
        return page;
 }
+
 #endif
 
 /* x86_64 also uses this file */
@@ -228,6 +260,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
                unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags)
 {
+       struct hstate *h = hstate_file(file);
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long start_addr;
@@ -240,7 +273,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
        }
 
 full_search:
-       addr = ALIGN(start_addr, HPAGE_SIZE);
+       addr = ALIGN(start_addr, huge_page_size(h));
 
        for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
                /* At this point:  (!vma || addr < vma->vm_end). */
@@ -262,7 +295,7 @@ full_search:
                }
                if (addr + mm->cached_hole_size < vma->vm_start)
                        mm->cached_hole_size = vma->vm_start - addr;
-               addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+               addr = ALIGN(vma->vm_end, huge_page_size(h));
        }
 }
 
@@ -270,6 +303,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
                unsigned long addr0, unsigned long len,
                unsigned long pgoff, unsigned long flags)
 {
+       struct hstate *h = hstate_file(file);
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev_vma;
        unsigned long base = mm->mmap_base, addr = addr0;
@@ -290,7 +324,7 @@ try_again:
                goto fail;
 
        /* either no address requested or can't fit in requested address hole */
-       addr = (mm->free_area_cache - len) & HPAGE_MASK;
+       addr = (mm->free_area_cache - len) & huge_page_mask(h);
        do {
                /*
                 * Lookup failure means no vma is above this address,
@@ -321,7 +355,7 @@ try_again:
                        largest_hole = vma->vm_start - addr;
 
                /* try just below the current vma->vm_start */
-               addr = (vma->vm_start - len) & HPAGE_MASK;
+               addr = (vma->vm_start - len) & huge_page_mask(h);
        } while (len <= vma->vm_start);
 
 fail:
@@ -359,22 +393,23 @@ unsigned long
 hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags)
 {
+       struct hstate *h = hstate_file(file);
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
 
-       if (len & ~HPAGE_MASK)
+       if (len & ~huge_page_mask(h))
                return -EINVAL;
        if (len > TASK_SIZE)
                return -ENOMEM;
 
        if (flags & MAP_FIXED) {
-               if (prepare_hugepage_range(addr, len))
+               if (prepare_hugepage_range(file, addr, len))
                        return -EINVAL;
                return addr;
        }
 
        if (addr) {
-               addr = ALIGN(addr, HPAGE_SIZE);
+               addr = ALIGN(addr, huge_page_size(h));
                vma = find_vma(mm, addr);
                if (TASK_SIZE - len >= addr &&
                    (!vma || addr + len <= vma->vm_start))
@@ -390,3 +425,20 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 
 #endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
 
+#ifdef CONFIG_X86_64
+static __init int setup_hugepagesz(char *opt)
+{
+       unsigned long ps = memparse(opt, &opt);
+       if (ps == PMD_SIZE) {
+               hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
+       } else if (ps == PUD_SIZE && cpu_has_gbpages) {
+               hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+       } else {
+               printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
+                       ps >> 20);
+               return 0;
+       }
+       return 1;
+}
+__setup("hugepagesz=", setup_hugepagesz);
+#endif
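
setup_hugepagesz() makes the supported huge page sizes a boot-time choice instead of the single compile-time HPAGE_SIZE. A hypothetical command line (counts illustrative; the 1G size needs a CPU with gbpages support, per the cpu_has_gbpages check above):

        hugepagesz=2M hugepages=512 hugepagesz=1G hugepages=4

Each accepted size registers an hstate through hugetlb_add_hstate(), which is what lets the hstate_file()/huge_page_size(h) calls earlier in this file replace the old HPAGE_SIZE/HPAGE_MASK constants.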
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index d37f29376b0c..60ec1d08ff24 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -458,11 +458,7 @@ static void __init pagetable_init(void)
 {
        pgd_t *pgd_base = swapper_pg_dir;
 
-       paravirt_pagetable_setup_start(pgd_base);
-
        permanent_kmaps_init(pgd_base);
-
-       paravirt_pagetable_setup_done(pgd_base);
 }
 
 #ifdef CONFIG_ACPI_SLEEP
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ec37121f6709..d3746efb060d 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -60,7 +60,7 @@ static unsigned long dma_reserve __initdata;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
-int direct_gbpages __meminitdata
+int direct_gbpages
 #ifdef CONFIG_DIRECT_GBPAGES
                                = 1
 #endif
@@ -86,46 +86,13 @@ early_param("gbpages", parse_direct_gbpages_on);
  * around without checking the pgd every time.
  */
 
-void show_mem(void)
-{
-       long i, total = 0, reserved = 0;
-       long shared = 0, cached = 0;
-       struct page *page;
-       pg_data_t *pgdat;
-
-       printk(KERN_INFO "Mem-info:\n");
-       show_free_areas();
-       for_each_online_pgdat(pgdat) {
-               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
-                       /*
-                        * This loop can take a while with 256 GB and
-                        * 4k pages so defer the NMI watchdog:
-                        */
-                       if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
-                               touch_nmi_watchdog();
-
-                       if (!pfn_valid(pgdat->node_start_pfn + i))
-                               continue;
-
-                       page = pfn_to_page(pgdat->node_start_pfn + i);
-                       total++;
-                       if (PageReserved(page))
-                               reserved++;
-                       else if (PageSwapCache(page))
-                               cached++;
-                       else if (page_count(page))
-                               shared += page_count(page) - 1;
-               }
-       }
-       printk(KERN_INFO "%lu pages of RAM\n", total);
-       printk(KERN_INFO "%lu reserved pages\n", reserved);
-       printk(KERN_INFO "%lu pages shared\n", shared);
-       printk(KERN_INFO "%lu pages swap cached\n", cached);
-}
-
 int after_bootmem;
 
-static __init void *spp_getpage(void)
+/*
+ * NOTE: This function is marked __ref because it calls __init function
+ * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
+ */
+static __ref void *spp_getpage(void)
 {
        void *ptr;
 
@@ -274,7 +241,7 @@ static unsigned long __initdata table_start;
 static unsigned long __meminitdata table_end;
 static unsigned long __meminitdata table_top;
 
-static __meminit void *alloc_low_page(unsigned long *phys)
+static __ref void *alloc_low_page(unsigned long *phys)
 {
        unsigned long pfn = table_end++;
        void *adr;
@@ -295,7 +262,7 @@ static __meminit void *alloc_low_page(unsigned long *phys)
        return adr;
 }
 
-static __meminit void unmap_low_page(void *adr)
+static __ref void unmap_low_page(void *adr)
 {
        if (after_bootmem)
                return;
@@ -351,6 +318,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 {
        unsigned long pages = 0;
        unsigned long last_map_addr = end;
+       unsigned long start = address;
 
        int i = pmd_index(address);
 
@@ -368,16 +336,24 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
                }
 
                if (pmd_val(*pmd)) {
-                       if (!pmd_large(*pmd))
+                       if (!pmd_large(*pmd)) {
+                               spin_lock(&init_mm.page_table_lock);
                                last_map_addr = phys_pte_update(pmd, address,
                                                                end);
+                               spin_unlock(&init_mm.page_table_lock);
+                       }
+                       /* Count entries we're using from level2_ident_pgt */
+                       if (start == 0)
+                               pages++;
                        continue;
                }
 
                if (page_size_mask & (1<<PG_LEVEL_2M)) {
                        pages++;
+                       spin_lock(&init_mm.page_table_lock);
                        set_pte((pte_t *)pmd,
                                pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+                       spin_unlock(&init_mm.page_table_lock);
                        last_map_addr = (address & PMD_MASK) + PMD_SIZE;
                        continue;
                }
@@ -386,7 +362,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
                last_map_addr = phys_pte_init(pte, address, end);
                unmap_low_page(pte);
 
+               spin_lock(&init_mm.page_table_lock);
                pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
+               spin_unlock(&init_mm.page_table_lock);
        }
        update_page_count(PG_LEVEL_2M, pages);
        return last_map_addr;
@@ -399,9 +377,7 @@ phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
        pmd_t *pmd = pmd_offset(pud, 0);
        unsigned long last_map_addr;
 
-       spin_lock(&init_mm.page_table_lock);
        last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
-       spin_unlock(&init_mm.page_table_lock);
        __flush_tlb_all();
        return last_map_addr;
 }
@@ -437,20 +413,21 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 
                if (page_size_mask & (1<<PG_LEVEL_1G)) {
                        pages++;
+                       spin_lock(&init_mm.page_table_lock);
                        set_pte((pte_t *)pud,
                                pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+                       spin_unlock(&init_mm.page_table_lock);
                        last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
                        continue;
                }
 
                pmd = alloc_low_page(&pmd_phys);
-
-               spin_lock(&init_mm.page_table_lock);
                last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
                unmap_low_page(pmd);
+
+               spin_lock(&init_mm.page_table_lock);
                pud_populate(&init_mm, pud, __va(pmd_phys));
                spin_unlock(&init_mm.page_table_lock);
-
        }
        __flush_tlb_all();
        update_page_count(PG_LEVEL_1G, pages);
@@ -542,16 +519,14 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start,
                        continue;
                }
 
-               if (after_bootmem)
-                       pud = pud_offset(pgd, start & PGDIR_MASK);
-               else
-                       pud = alloc_low_page(&pud_phys);
-
+               pud = alloc_low_page(&pud_phys);
                last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
                                                 page_size_mask);
                unmap_low_page(pud);
-               pgd_populate(&init_mm, pgd_offset_k(start),
-                            __va(pud_phys));
+
+               spin_lock(&init_mm.page_table_lock);
+               pgd_populate(&init_mm, pgd, __va(pud_phys));
+               spin_unlock(&init_mm.page_table_lock);
        }
 
        return last_map_addr;
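
Taken together, the init_64.c hunks apply one locking pattern throughout: build a new page-table page with no lock held, then take init_mm.page_table_lock only around the store that publishes it. In schematic form, using the exact calls from the hunks above:

        pmd = alloc_low_page(&pmd_phys);        /* no lock held */
        last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
        unmap_low_page(pmd);

        spin_lock(&init_mm.page_table_lock);    /* short critical section */
        pud_populate(&init_mm, pud, __va(pmd_phys));
        spin_unlock(&init_mm.page_table_lock);

The lock now covers only the set_pte()/populate calls rather than whole phys_pmd_init() loops plus allocation, as the spin_lock removed from phys_pmd_update() shows.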
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 24c1d3c30186..d4b6e6a29ae3 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -170,7 +170,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
        phys_addr &= PAGE_MASK;
        size = PAGE_ALIGN(last_addr+1) - phys_addr;
 
-       retval = reserve_memtype(phys_addr, phys_addr + size,
+       retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
                                                prot_val, &new_prot_val);
        if (retval) {
                pr_debug("Warning: reserve_memtype returned %d\n", retval);
@@ -330,6 +330,14 @@ static void __iomem *ioremap_default(resource_size_t phys_addr,
        return (void __iomem *)ret;
 }
 
+void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
+                               unsigned long prot_val)
+{
+       return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
+                               __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_prot);
+
 /**
  * iounmap - Free an IO remapping
  * @addr: virtual address from ioremap_*
@@ -545,13 +553,11 @@ static int __init check_early_ioremap_leak(void)
 {
        if (!early_ioremap_nested)
                return 0;
-
-       printk(KERN_WARNING
+       WARN(1, KERN_WARNING
               "Debug warning: early ioremap leak of %d areas detected.\n",
               early_ioremap_nested);
        printk(KERN_WARNING
              "please boot with early_ioremap_debug and report the dmesg.\n");
-       WARN_ON(1);
 
        return 1;
 }
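
ioremap_prot() exposes __ioremap_caller() with a caller-supplied protection value, masked down to its cache-attribute bits. A hypothetical driver fragment (dev_phys_base, dev_region_size, and the choice of _PAGE_CACHE_UC_MINUS are illustrative, not from this patch):

        void __iomem *regs;

        regs = ioremap_prot(dev_phys_base, dev_region_size,
                            _PAGE_CACHE_UC_MINUS);
        if (!regs)
                return -ENOMEM;

        writel(1, regs);        /* hypothetical device register at offset 0 */
        iounmap(regs);
        return 0;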
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index e7397e108beb..635b50e85581 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -430,7 +430,9 @@ static void enter_uniprocessor(void)
                    "may miss events.\n");
 }
 
-static void leave_uniprocessor(void)
+/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit,
+   but this whole function is ifdefed CONFIG_HOTPLUG_CPU */
+static void __ref leave_uniprocessor(void)
 {
        int cpu;
        int err;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 9782f42dd319..a4dd793d6003 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -23,8 +23,6 @@
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
-static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
-
 struct memnode memnode;
 
 s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
@@ -198,7 +196,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
                nodedata_phys + pgdat_size - 1);
 
        memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
-       NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
+       NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
        NODE_DATA(nodeid)->node_start_pfn = start_pfn;
        NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
 
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 0dcd42eb94e6..d4aa503caaa2 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -221,8 +221,7 @@ static int pageattr_test(void)
        failed += print_split(&sc);
 
        if (failed) {
-               printk(KERN_ERR "NOT PASSED. Please report.\n");
-               WARN_ON(1);
+               WARN(1, KERN_ERR "NOT PASSED. Please report.\n");
                return -EINVAL;
        } else {
                if (print)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 65c6e46bf059..43e2f8483e4f 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -55,13 +55,19 @@ static void split_page_count(int level)
 
 int arch_report_meminfo(char *page)
 {
-       int n = sprintf(page, "DirectMap4k: %8lu\n"
-                       "DirectMap2M: %8lu\n",
-                       direct_pages_count[PG_LEVEL_4K],
-                       direct_pages_count[PG_LEVEL_2M]);
+       int n = sprintf(page, "DirectMap4k: %8lu kB\n",
+                       direct_pages_count[PG_LEVEL_4K] << 2);
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+       n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
+                       direct_pages_count[PG_LEVEL_2M] << 11);
+#else
+       n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
+                       direct_pages_count[PG_LEVEL_2M] << 12);
+#endif
 #ifdef CONFIG_X86_64
-       n += sprintf(page + n, "DirectMap1G: %8lu\n",
-                       direct_pages_count[PG_LEVEL_1G]);
+       if (direct_gbpages)
+               n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
+                       direct_pages_count[PG_LEVEL_1G] << 20);
 #endif
        return n;
 }
@@ -592,10 +598,9 @@ repeat:
        if (!pte_val(old_pte)) {
                if (!primary)
                        return 0;
-               printk(KERN_WARNING "CPA: called for zero pte. "
+               WARN(1, KERN_WARNING "CPA: called for zero pte. "
                       "vaddr = %lx cpa->vaddr = %lx\n", address,
                       cpa->vaddr);
-               WARN_ON(1);
                return -EINVAL;
        }
 
@@ -844,7 +849,7 @@ int set_memory_uc(unsigned long addr, int numpages)
        /*
         * for now UC MINUS. see comments in ioremap_nocache()
         */
-       if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
+       if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
                            _PAGE_CACHE_UC_MINUS, NULL))
                return -EINVAL;
 
@@ -863,7 +868,7 @@ int set_memory_wc(unsigned long addr, int numpages)
        if (!pat_enabled)
                return set_memory_uc(addr, numpages);
 
-       if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
+       if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
                _PAGE_CACHE_WC, NULL))
                return -EINVAL;
 
@@ -879,7 +884,7 @@ int _set_memory_wb(unsigned long addr, int numpages)
 
 int set_memory_wb(unsigned long addr, int numpages)
 {
-       free_memtype(addr, addr + numpages * PAGE_SIZE);
+       free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
 
        return _set_memory_wb(addr, numpages);
 }
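
The shifts in the new arch_report_meminfo() turn page counts into kB, so each shift is the page-size shift minus 10: 4 kB pages use << 2, 2 MB pages << 11 (2048 kB each), 4 MB pages << 12 (4096 kB), and 1 GB pages << 20 (1048576 kB). For example, three 2 MB mappings report as 3 << 11 = 6144 kB.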
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 2fe30916d4b6..2a50e0fa64a5 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -207,6 +207,9 @@ static int chk_conflict(struct memtype *new, struct memtype *entry,
        return -EBUSY;
 }
 
+static struct memtype *cached_entry;
+static u64 cached_start;
+
 /*
  * req_type typically has one of the:
  * - _PAGE_CACHE_WB
@@ -280,11 +283,17 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 
        spin_lock(&memtype_lock);
 
+       if (cached_entry && start >= cached_start)
+               entry = cached_entry;
+       else
+               entry = list_entry(&memtype_list, struct memtype, nd);
+
        /* Search for existing mapping that overlaps the current range */
        where = NULL;
-       list_for_each_entry(entry, &memtype_list, nd) {
+       list_for_each_entry_continue(entry, &memtype_list, nd) {
                if (end <= entry->start) {
                        where = entry->nd.prev;
+                       cached_entry = list_entry(where, struct memtype, nd);
                        break;
                } else if (start <= entry->start) { /* end > entry->start */
                        err = chk_conflict(new, entry, new_type);
@@ -292,6 +301,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
                                dprintk("Overlap at 0x%Lx-0x%Lx\n",
                                        entry->start, entry->end);
                                where = entry->nd.prev;
+                               cached_entry = list_entry(where,
+                                                       struct memtype, nd);
                        }
                        break;
                } else if (start < entry->end) { /* start > entry->start */
@@ -299,7 +310,20 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
                        if (!err) {
                                dprintk("Overlap at 0x%Lx-0x%Lx\n",
                                        entry->start, entry->end);
-                               where = &entry->nd;
+                               cached_entry = list_entry(entry->nd.prev,
+                                                       struct memtype, nd);
+
+                               /*
+                                * Move to right position in the linked
+                                * list to add this new entry
+                                */
+                               list_for_each_entry_continue(entry,
+                                                       &memtype_list, nd) {
+                                       if (start <= entry->start) {
+                                               where = entry->nd.prev;
+                                               break;
+                                       }
+                               }
                        }
                        break;
                }
@@ -314,6 +338,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
                return err;
        }
 
+       cached_start = start;
+
        if (where)
                list_add(&new->nd, where);
        else
@@ -343,6 +369,9 @@ int free_memtype(u64 start, u64 end)
        spin_lock(&memtype_lock);
        list_for_each_entry(entry, &memtype_list, nd) {
                if (entry->start == start && entry->end == end) {
+                       if (cached_entry == entry || cached_start == start)
+                               cached_entry = NULL;
+
                        list_del(&entry->nd);
                        kfree(entry);
                        err = 0;
@@ -361,14 +390,6 @@ int free_memtype(u64 start, u64 end)
 }
 
 
-/*
- * /dev/mem mmap interface. The memtype used for mapping varies:
- * - Use UC for mappings with O_SYNC flag
- * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
- *   inherit the memtype from existing mapping.
- * - Else use UC_MINUS memtype (for backward compatibility with existing
- *   X drivers.
- */
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                                unsigned long size, pgprot_t vma_prot)
 {
@@ -406,14 +427,14 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
                                unsigned long size, pgprot_t *vma_prot)
 {
        u64 offset = ((u64) pfn) << PAGE_SHIFT;
-       unsigned long flags = _PAGE_CACHE_UC_MINUS;
+       unsigned long flags = -1;
        int retval;
 
        if (!range_is_allowed(pfn, size))
                return 0;
 
        if (file->f_flags & O_SYNC) {
-               flags = _PAGE_CACHE_UC;
+               flags = _PAGE_CACHE_UC_MINUS;
        }
 
 #ifdef CONFIG_X86_32
@@ -436,13 +457,14 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 #endif
 
        /*
-        * With O_SYNC, we can only take UC mapping. Fail if we cannot.
+        * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
+        *
         * Without O_SYNC, we want to get
         * - WB for WB-able memory and no other conflicting mappings
         * - UC_MINUS for non-WB-able memory with no other conflicting mappings
         * - Inherit from conflicting mappings otherwise
         */
-       if (flags != _PAGE_CACHE_UC_MINUS) {
+       if (flags != -1) {
                retval = reserve_memtype(offset, offset + size, flags, NULL);
        } else {
                retval = reserve_memtype(offset, offset + size, -1, &flags);
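
The cached_entry/cached_start pair is a one-entry memo over the sorted memtype_list: when reservations arrive in ascending address order, the linear scan resumes from the last hit instead of the list head, and free_memtype() invalidates the memo when the cached node goes away (the new check in the removal path above). A minimal user-space sketch of the same idea, with hypothetical types, not the kernel's struct memtype or list_head:

struct range {
        unsigned long long start;
        struct range *next;
};

static struct range *cached_pos;        /* where the last search ended */
static unsigned long long cached_start;

/* Return the node after which a range starting at 'start' belongs.
 * 'head' is a sentinel node, mirroring the kernel's list head. */
static struct range *find_pos(struct range *head, unsigned long long start)
{
        struct range *r = head;

        if (cached_pos && start >= cached_start)        /* resume the scan */
                r = cached_pos;
        while (r->next && r->next->start < start)
                r = r->next;

        cached_pos = r;
        cached_start = start;
        return r;
}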
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 557b2abceef8..d50302774fe2 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -207,6 +207,9 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
        unsigned long addr;
        int i;
 
+       if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
+               return;
+
        pud = pud_offset(pgd, 0);
 
        for (addr = i = 0; i < PREALLOCATED_PMDS;
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index b4becbf8c570..cab0abbd1ebe 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,53 +20,6 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
-void show_mem(void)
-{
-       int total = 0, reserved = 0;
-       int shared = 0, cached = 0;
-       int highmem = 0;
-       struct page *page;
-       pg_data_t *pgdat;
-       unsigned long i;
-       unsigned long flags;
-
-       printk(KERN_INFO "Mem-info:\n");
-       show_free_areas();
-       for_each_online_pgdat(pgdat) {
-               pgdat_resize_lock(pgdat, &flags);
-               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
-                       if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
-                               touch_nmi_watchdog();
-                       page = pgdat_page_nr(pgdat, i);
-                       total++;
-                       if (PageHighMem(page))
-                               highmem++;
-                       if (PageReserved(page))
-                               reserved++;
-                       else if (PageSwapCache(page))
-                               cached++;
-                       else if (page_count(page))
-                               shared += page_count(page) - 1;
-               }
-               pgdat_resize_unlock(pgdat, &flags);
-       }
-       printk(KERN_INFO "%d pages of RAM\n", total);
-       printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
-       printk(KERN_INFO "%d reserved pages\n", reserved);
-       printk(KERN_INFO "%d pages shared\n", shared);
-       printk(KERN_INFO "%d pages swap cached\n", cached);
-
-       printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
-       printk(KERN_INFO "%lu pages writeback\n",
-                                       global_page_state(NR_WRITEBACK));
-       printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
-       printk(KERN_INFO "%lu pages slab\n",
-               global_page_state(NR_SLAB_RECLAIMABLE) +
-               global_page_state(NR_SLAB_UNRECLAIMABLE));
-       printk(KERN_INFO "%lu pages pagetables\n",
-                                       global_page_state(NR_PAGETABLE));
-}
-
 /*
  * Associate a virtual page frame with a given physical page frame
  * and protection flags for that frame.
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 1eb2973a301c..16ae70fc57e7 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -178,7 +178,7 @@ void acpi_numa_arch_fixup(void)
  * start of the node, and that the current "end" address is after
  * the previous one.
  */
-static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
+static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
 {
        /*
         * Only add present memory as told by the e820.
@@ -189,10 +189,10 @@ static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
        if (memory_chunk->start_pfn >= max_pfn) {
                printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
                        memory_chunk->start_pfn, memory_chunk->end_pfn);
-               return;
+               return -1;
        }
        if (memory_chunk->nid != nid)
-               return;
+               return -1;
 
        if (!node_has_online_mem(nid))
                node_start_pfn[nid] = memory_chunk->start_pfn;
@@ -202,6 +202,8 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
 
        if (node_end_pfn[nid] < memory_chunk->end_pfn)
                node_end_pfn[nid] = memory_chunk->end_pfn;
+
+       return 0;
 }
 
 int __init get_memcfg_from_srat(void)
@@ -259,7 +261,9 @@ int __init get_memcfg_from_srat(void)
                printk(KERN_DEBUG
                        "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
                        j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
-               node_read_chunk(chunk->nid, chunk);
+               if (node_read_chunk(chunk->nid, chunk))
+                       continue;
+
                e820_register_active_regions(chunk->nid, chunk->start_pfn,
                                        min(chunk->end_pfn, max_pfn));
        }