path: root/mm
Diffstat (limited to 'mm')
-rw-r--r--  mm/allocpercpu.c     |   2
-rw-r--r--  mm/backing-dev.c     |  12
-rw-r--r--  mm/bootmem.c         |   6
-rw-r--r--  mm/filemap.c         |   5
-rw-r--r--  mm/hugetlb.c         |   2
-rw-r--r--  mm/memory.c          | 106
-rw-r--r--  mm/memory_hotplug.c  |  86
-rw-r--r--  mm/mempolicy.c       |   6
-rw-r--r--  mm/migrate.c         |  12
-rw-r--r--  mm/mmap.c            |  12
-rw-r--r--  mm/mprotect.c        |  21
-rw-r--r--  mm/nommu.c           |  21
-rw-r--r--  mm/page_alloc.c      |  46
-rw-r--r--  mm/pagewalk.c        |  42
-rw-r--r--  mm/pdflush.c         |   4
-rw-r--r--  mm/slab.c            |   5
-rw-r--r--  mm/slob.c            |   5
-rw-r--r--  mm/slub.c            |  19
-rw-r--r--  mm/sparse-vmemmap.c  |   2
-rw-r--r--  mm/swap.c            |   4
-rw-r--r--  mm/vmscan.c          |   2
-rw-r--r--  mm/vmstat.c          |   2
22 files changed, 269 insertions(+), 153 deletions(-)
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index f4026bae6eed..05f2b4009ccc 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -1,7 +1,7 @@
 /*
  * linux/mm/allocpercpu.c
  *
- * Separated from slab.c August 11, 2006 Christoph Lameter <clameter@sgi.com>
+ * Separated from slab.c August 11, 2006 Christoph Lameter
  */
 #include <linux/mm.h>
 #include <linux/module.h>
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7c4f9e097095..f2e574dbc300 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -172,30 +172,22 @@ postcore_initcall(bdi_class_init);
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                 const char *fmt, ...)
 {
-        char *name;
         va_list args;
         int ret = 0;
         struct device *dev;
 
         va_start(args, fmt);
-        name = kvasprintf(GFP_KERNEL, fmt, args);
+        dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
         va_end(args);
-
-        if (!name)
-                return -ENOMEM;
-
-        dev = device_create(bdi_class, parent, MKDEV(0, 0), name);
         if (IS_ERR(dev)) {
                 ret = PTR_ERR(dev);
                 goto exit;
         }
 
         bdi->dev = dev;
-        dev_set_drvdata(bdi->dev, bdi);
-        bdi_debug_register(bdi, name);
+        bdi_debug_register(bdi, dev_name(dev));
 
 exit:
-        kfree(name);
         return ret;
 }
 EXPORT_SYMBOL(bdi_register);
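The rework above folds name formatting, device allocation and drvdata assignment into one device_create_vargs() call. A minimal sketch of the same pattern in a hypothetical varargs registration helper (foo_class and the foo pointer are illustrative, not kernel API):

#include <stdarg.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/kdev_t.h>

static struct class *foo_class;	/* assumed to be created with class_create() elsewhere */

static int foo_register(void *foo, struct device *parent, const char *fmt, ...)
{
	struct device *dev;
	va_list args;

	va_start(args, fmt);
	/* formats the name, creates the device and stores foo as drvdata */
	dev = device_create_vargs(foo_class, parent, MKDEV(0, 0), foo, fmt, args);
	va_end(args);

	return IS_ERR(dev) ? PTR_ERR(dev) : 0;
}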
diff --git a/mm/bootmem.c b/mm/bootmem.c
index e8fb927392b9..8d9f60e06f62 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -442,15 +442,17 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
         return init_bootmem_core(pgdat, freepfn, startpfn, endpfn);
 }
 
-void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
                                  unsigned long size, int flags)
 {
         int ret;
 
         ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
         if (ret < 0)
-                return;
+                return -ENOMEM;
         reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
+
+        return 0;
 }
 
 void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
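With the int return type, architecture setup code can see when a node-local reservation fails instead of continuing with a silently unreserved range. A hedged sketch of a caller checking the new return value (the helper name and values are placeholders):

#include <linux/kernel.h>
#include <linux/bootmem.h>

static int __init reserve_node_local_region(pg_data_t *pgdat,
					    unsigned long physaddr,
					    unsigned long size)
{
	int ret;

	/* BOOTMEM_DEFAULT: overlapping an existing reservation is an error */
	ret = reserve_bootmem_node(pgdat, physaddr, size, BOOTMEM_DEFAULT);
	if (ret)
		printk(KERN_WARNING "could not reserve %lu bytes at %#lx\n",
		       size, physaddr);
	return ret;
}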
diff --git a/mm/filemap.c b/mm/filemap.c
index 2dead9adf8b7..1e6a7d34874f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1461,6 +1461,11 @@ page_not_uptodate:
          */
         ClearPageError(page);
         error = mapping->a_ops->readpage(file, page);
+        if (!error) {
+                wait_on_page_locked(page);
+                if (!PageUptodate(page))
+                        error = -EIO;
+        }
         page_cache_release(page);
 
         if (!error || error == AOP_TRUNCATED_PAGE)
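The added block closes a window where a synchronous ->readpage that returned 0 could still leave the page not uptodate; the caller now waits for the page lock to drop and converts that case into -EIO. The same issue-then-wait shape, as a standalone hedged sketch:

#include <linux/fs.h>
#include <linux/pagemap.h>

/* Illustrative helper: start ->readpage on a locked page and wait for the result. */
static int read_page_and_wait(struct file *file, struct page *page)
{
	int error = page->mapping->a_ops->readpage(file, page);

	if (!error) {
		/* ->readpage unlocks the page when the I/O completes */
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			error = -EIO;
	}
	return error;
}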
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bbf953eeb58b..ab171274ef21 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -785,7 +785,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                         continue;
 
                 spin_lock(&dst->page_table_lock);
-                spin_lock(&src->page_table_lock);
+                spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
                 if (!huge_pte_none(huge_ptep_get(src_pte))) {
                         if (cow)
                                 huge_ptep_set_wrprotect(src, addr, src_pte);
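spin_lock_nested() with SINGLE_DEPTH_NESTING tells lockdep that taking a second page_table_lock (same lock class, different mm) is an intentional, ordered nesting rather than a self-deadlock. The general shape of the annotation, as a hedged sketch:

#include <linux/sched.h>
#include <linux/spinlock.h>

static void copy_under_both_locks(struct mm_struct *dst, struct mm_struct *src)
{
	spin_lock(&dst->page_table_lock);
	/* same lock class as above - annotate the inner acquisition for lockdep */
	spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);

	/* ... copy entries from src to dst ... */

	spin_unlock(&src->page_table_lock);
	spin_unlock(&dst->page_table_lock);
}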
diff --git a/mm/memory.c b/mm/memory.c
index 48c122d42ed7..2302d228fe04 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -311,6 +311,21 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
         if (!new)
                 return -ENOMEM;
 
+        /*
+         * Ensure all pte setup (eg. pte page lock and page clearing) are
+         * visible before the pte is made visible to other CPUs by being
+         * put into page tables.
+         *
+         * The other side of the story is the pointer chasing in the page
+         * table walking code (when walking the page table without locking;
+         * ie. most of the time). Fortunately, these data accesses consist
+         * of a chain of data-dependent loads, meaning most CPUs (alpha
+         * being the notable exception) will already guarantee loads are
+         * seen in-order. See the alpha page table accessors for the
+         * smp_read_barrier_depends() barriers in page table walking code.
+         */
+        smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
+
         spin_lock(&mm->page_table_lock);
         if (!pmd_present(*pmd)) {       /* Has another populated it ? */
                 mm->nr_ptes++;
@@ -329,6 +344,8 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
         if (!new)
                 return -ENOMEM;
 
+        smp_wmb(); /* See comment in __pte_alloc */
+
         spin_lock(&init_mm.page_table_lock);
         if (!pmd_present(*pmd)) {       /* Has another populated it ? */
                 pmd_populate_kernel(&init_mm, pmd, new);
@@ -982,17 +999,15 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                 goto no_page_table;
 
         ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
-        if (!ptep)
-                goto out;
 
         pte = *ptep;
         if (!pte_present(pte))
-                goto unlock;
+                goto no_page;
         if ((flags & FOLL_WRITE) && !pte_write(pte))
                 goto unlock;
         page = vm_normal_page(vma, address, pte);
         if (unlikely(!page))
-                goto unlock;
+                goto bad_page;
 
         if (flags & FOLL_GET)
                 get_page(page);
@@ -1007,6 +1022,15 @@ unlock:
 out:
         return page;
 
+bad_page:
+        pte_unmap_unlock(ptep, ptl);
+        return ERR_PTR(-EFAULT);
+
+no_page:
+        pte_unmap_unlock(ptep, ptl);
+        if (!pte_none(pte))
+                return page;
+        /* Fall through to ZERO_PAGE handling */
 no_page_table:
         /*
          * When core dumping an enormous anonymous area that nobody
@@ -1021,6 +1045,26 @@ no_page_table:
         return page;
 }
 
+/* Can we do the FOLL_ANON optimization? */
+static inline int use_zero_page(struct vm_area_struct *vma)
+{
+        /*
+         * We don't want to optimize FOLL_ANON for make_pages_present()
+         * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
+         * we want to get the page from the page tables to make sure
+         * that we serialize and update with any other user of that
+         * mapping.
+         */
+        if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
+                return 0;
+        /*
+         * And if we have a fault or a nopfn routine, it's not an
+         * anonymous region.
+         */
+        return !vma->vm_ops ||
+                (!vma->vm_ops->fault && !vma->vm_ops->nopfn);
+}
+
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                 unsigned long start, int len, int write, int force,
                 struct page **pages, struct vm_area_struct **vmas)
@@ -1095,8 +1139,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                 foll_flags = FOLL_TOUCH;
                 if (pages)
                         foll_flags |= FOLL_GET;
-                if (!write && !(vma->vm_flags & VM_LOCKED) &&
-                    (!vma->vm_ops || !vma->vm_ops->fault))
+                if (!write && use_zero_page(vma))
                         foll_flags |= FOLL_ANON;
 
                 do {
@@ -1108,7 +1151,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                          * be processed until returning to user space.
                          */
                         if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
-                                return -ENOMEM;
+                                return i ? i : -ENOMEM;
 
                         if (write)
                                 foll_flags |= FOLL_WRITE;
@@ -1142,6 +1185,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
                                 cond_resched();
                         }
+                        if (IS_ERR(page))
+                                return i ? i : PTR_ERR(page);
                         if (pages) {
                                 pages[i] = page;
 
@@ -1652,8 +1697,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
         struct page *dirty_page = NULL;
 
         old_page = vm_normal_page(vma, address, orig_pte);
-        if (!old_page)
+        if (!old_page) {
+                /*
+                 * VM_MIXEDMAP !pfn_valid() case
+                 *
+                 * We should not cow pages in a shared writeable mapping.
+                 * Just mark the pages writable as we can't do any dirty
+                 * accounting on raw pfn maps.
+                 */
+                if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+                                     (VM_WRITE|VM_SHARED))
+                        goto reuse;
                 goto gotten;
+        }
 
         /*
          * Take out anonymous pages first, anonymous shared vmas are
@@ -1706,6 +1762,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
         }
 
         if (reuse) {
+reuse:
                 flush_cache_page(vma, address, pte_pfn(orig_pte));
                 entry = pte_mkyoung(orig_pte);
                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -1740,7 +1797,6 @@ gotten:
         page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
         if (likely(pte_same(*page_table, orig_pte))) {
                 if (old_page) {
-                        page_remove_rmap(old_page, vma);
                         if (!PageAnon(old_page)) {
                                 dec_mm_counter(mm, file_rss);
                                 inc_mm_counter(mm, anon_rss);
@@ -1762,6 +1818,32 @@ gotten:
                 lru_cache_add_active(new_page);
                 page_add_new_anon_rmap(new_page, vma, address);
 
+                if (old_page) {
+                        /*
+                         * Only after switching the pte to the new page may
+                         * we remove the mapcount here. Otherwise another
+                         * process may come and find the rmap count decremented
+                         * before the pte is switched to the new page, and
+                         * "reuse" the old page writing into it while our pte
+                         * here still points into it and can be read by other
+                         * threads.
+                         *
+                         * The critical issue is to order this
+                         * page_remove_rmap with the ptp_clear_flush above.
+                         * Those stores are ordered by (if nothing else,)
+                         * the barrier present in the atomic_add_negative
+                         * in page_remove_rmap.
+                         *
+                         * Then the TLB flush in ptep_clear_flush ensures that
+                         * no process can access the old page before the
+                         * decremented mapcount is visible. And the old page
+                         * cannot be reused until after the decremented
+                         * mapcount is visible. So transitively, TLBs to
+                         * old page will be flushed before it can be reused.
+                         */
+                        page_remove_rmap(old_page, vma);
+                }
+
                 /* Free the old page.. */
                 new_page = old_page;
                 ret |= VM_FAULT_WRITE;
@@ -2278,8 +2360,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         vmf.flags = flags;
         vmf.page = NULL;
 
-        BUG_ON(vma->vm_flags & VM_PFNMAP);
-
         ret = vma->vm_ops->fault(vma, &vmf);
         if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
                 return ret;
@@ -2619,6 +2699,8 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
         if (!new)
                 return -ENOMEM;
 
+        smp_wmb(); /* See comment in __pte_alloc */
+
         spin_lock(&mm->page_table_lock);
         if (pgd_present(*pgd))          /* Another has populated it */
                 pud_free(mm, new);
@@ -2640,6 +2722,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
         if (!new)
                 return -ENOMEM;
 
+        smp_wmb(); /* See comment in __pte_alloc */
+
         spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_4LEVEL_HACK
         if (pud_present(*pud))          /* Another has populated it */
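All of the smp_wmb() additions follow the initialise-then-publish rule spelled out in the new __pte_alloc comment: finish setting up the new page-table page, order those stores, and only then make the pmd/pud/pgd entry visible to lockless walkers, which rely on data-dependent loads (plus smp_read_barrier_depends() on alpha). A generic sketch of that publication pattern, with illustrative names rather than kernel API:

#include <asm/system.h>	/* smp_wmb(); the barrier header varies by kernel version */

struct table_page {
	int initialised;
};

static struct table_page *published;	/* read without locks elsewhere */

static void publish_table_page(struct table_page *new)
{
	new->initialised = 1;	/* 1. fully initialise the new structure      */
	smp_wmb();		/* 2. order the init stores ...               */
	published = new;	/* 3. ... before the pointer becomes visible  */
}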
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b17dca7249f8..833f854eabe5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -159,21 +159,58 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
 }
 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
+static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
+                           unsigned long end_pfn)
+{
+        unsigned long old_zone_end_pfn;
+
+        zone_span_writelock(zone);
+
+        old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+        if (start_pfn < zone->zone_start_pfn)
+                zone->zone_start_pfn = start_pfn;
+
+        zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
+                                zone->zone_start_pfn;
+
+        zone_span_writeunlock(zone);
+}
+
+static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
+                            unsigned long end_pfn)
+{
+        unsigned long old_pgdat_end_pfn =
+                pgdat->node_start_pfn + pgdat->node_spanned_pages;
+
+        if (start_pfn < pgdat->node_start_pfn)
+                pgdat->node_start_pfn = start_pfn;
+
+        pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
+                                        pgdat->node_start_pfn;
+}
+
 static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 {
         struct pglist_data *pgdat = zone->zone_pgdat;
         int nr_pages = PAGES_PER_SECTION;
         int nid = pgdat->node_id;
         int zone_type;
+        unsigned long flags;
 
         zone_type = zone - pgdat->node_zones;
         if (!zone->wait_table) {
-                int ret = 0;
+                int ret;
+
                 ret = init_currently_empty_zone(zone, phys_start_pfn,
                                                 nr_pages, MEMMAP_HOTPLUG);
-                if (ret < 0)
+                if (ret)
                         return ret;
         }
+        pgdat_resize_lock(zone->zone_pgdat, &flags);
+        grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
+        grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
+                        phys_start_pfn + nr_pages);
+        pgdat_resize_unlock(zone->zone_pgdat, &flags);
         memmap_init_zone(nr_pages, nid, zone_type,
                          phys_start_pfn, MEMMAP_HOTPLUG);
         return 0;
@@ -299,36 +336,6 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
 }
 EXPORT_SYMBOL_GPL(__remove_pages);
 
-static void grow_zone_span(struct zone *zone,
-                unsigned long start_pfn, unsigned long end_pfn)
-{
-        unsigned long old_zone_end_pfn;
-
-        zone_span_writelock(zone);
-
-        old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
-        if (start_pfn < zone->zone_start_pfn)
-                zone->zone_start_pfn = start_pfn;
-
-        zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
-                                zone->zone_start_pfn;
-
-        zone_span_writeunlock(zone);
-}
-
-static void grow_pgdat_span(struct pglist_data *pgdat,
-                unsigned long start_pfn, unsigned long end_pfn)
-{
-        unsigned long old_pgdat_end_pfn =
-                pgdat->node_start_pfn + pgdat->node_spanned_pages;
-
-        if (start_pfn < pgdat->node_start_pfn)
-                pgdat->node_start_pfn = start_pfn;
-
-        pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
-                                        pgdat->node_start_pfn;
-}
-
 void online_page(struct page *page)
 {
         totalram_pages++;
@@ -367,7 +374,6 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
 
 int online_pages(unsigned long pfn, unsigned long nr_pages)
 {
-        unsigned long flags;
         unsigned long onlined_pages = 0;
         struct zone *zone;
         int need_zonelists_rebuild = 0;
@@ -395,11 +401,6 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
          * memory_block->state_mutex.
          */
         zone = page_zone(pfn_to_page(pfn));
-        pgdat_resize_lock(zone->zone_pgdat, &flags);
-        grow_zone_span(zone, pfn, pfn + nr_pages);
-        grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
-        pgdat_resize_unlock(zone->zone_pgdat, &flags);
-
         /*
          * If this zone is not populated, then it is not in zonelist.
          * This means the page allocator ignores this zone.
@@ -408,8 +409,15 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
         if (!populated_zone(zone))
                 need_zonelists_rebuild = 1;
 
-        walk_memory_resource(pfn, nr_pages, &onlined_pages,
-                online_pages_range);
+        ret = walk_memory_resource(pfn, nr_pages, &onlined_pages,
+                online_pages_range);
+        if (ret) {
+                printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
+                        nr_pages, pfn);
+                memory_notify(MEM_CANCEL_ONLINE, &arg);
+                return ret;
+        }
+
         zone->present_pages += onlined_pages;
         zone->zone_pgdat->node_present_pages += onlined_pages;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a37a5034f63d..c94e58b192c3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -729,7 +729,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
         } else {
                 *policy = pol == &default_policy ? MPOL_DEFAULT :
                         pol->mode;
-                *policy |= pol->flags;
+                /*
+                 * Internal mempolicy flags must be masked off before exposing
+                 * the policy to userspace.
+                 */
+                *policy |= (pol->flags & MPOL_MODE_FLAGS);
         }
 
         if (vma) {
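The fix exposes only the user-visible mode modifier bits when reporting a policy; MPOL_MODE_FLAGS is assumed here to be the mask of those modifiers, and the internal MPOL_F_* bookkeeping flags must never reach userspace. A tiny hedged sketch of the masking idea:

#include <linux/mempolicy.h>

/* Illustrative: combine a mode with only its user-visible modifier flags. */
static inline int user_visible_policy(unsigned short mode, unsigned short flags)
{
	return mode | (flags & MPOL_MODE_FLAGS);
}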
diff --git a/mm/migrate.c b/mm/migrate.c
index 449d77d409f5..55bd355d170d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -9,7 +9,7 @@
  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  * Hirokazu Takahashi <taka@valinux.co.jp>
  * Dave Hansen <haveblue@us.ibm.com>
- * Christoph Lameter <clameter@sgi.com>
+ * Christoph Lameter
  */
 
 #include <linux/migrate.h>
@@ -865,6 +865,11 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
                         goto set_status;
 
                 page = follow_page(vma, pp->addr, FOLL_GET);
+
+                err = PTR_ERR(page);
+                if (IS_ERR(page))
+                        goto set_status;
+
                 err = -ENOENT;
                 if (!page)
                         goto set_status;
@@ -928,6 +933,11 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
                         goto set_status;
 
                 page = follow_page(vma, pm->addr, 0);
+
+                err = PTR_ERR(page);
+                if (IS_ERR(page))
+                        goto set_status;
+
                 err = -ENOENT;
                 /* Use PageReserved to check for zero page */
                 if (!page || PageReserved(page))
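Because follow_page() can now hand back ERR_PTR(-EFAULT), both loops above must test for an error pointer before the existing NULL check; checking in the wrong order would treat a fault as a missing page. The required ordering, as a hedged standalone sketch:

#include <linux/err.h>
#include <linux/mm.h>

static int node_of_mapped_page(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page = follow_page(vma, addr, 0);

	if (IS_ERR(page))		/* hard failure, e.g. -EFAULT */
		return PTR_ERR(page);
	if (!page)			/* nothing mapped (or zero page) */
		return -ENOENT;
	return page_to_nid(page);	/* safe to inspect the page now */
}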
diff --git a/mm/mmap.c b/mm/mmap.c
index fac66337da2a..3354fdd83d4b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
 int sysctl_overcommit_ratio = 50; /* default is 50% */
 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-atomic_t vm_committed_space = ATOMIC_INIT(0);
+atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
 
 /*
  * Check that a process has enough memory to allocate a new virtual
@@ -177,7 +177,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
          * cast `allowed' as a signed long because vm_committed_space
          * sometimes has a negative value
          */
-        if (atomic_read(&vm_committed_space) < (long)allowed)
+        if (atomic_long_read(&vm_committed_space) < (long)allowed)
                 return 0;
 error:
         vm_unacct_memory(pages);
@@ -245,10 +245,16 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
         unsigned long rlim, retval;
         unsigned long newbrk, oldbrk;
         struct mm_struct *mm = current->mm;
+        unsigned long min_brk;
 
         down_write(&mm->mmap_sem);
 
-        if (brk < mm->start_brk)
+#ifdef CONFIG_COMPAT_BRK
+        min_brk = mm->end_code;
+#else
+        min_brk = mm->start_brk;
+#endif
+        if (brk < min_brk)
                 goto out;
 
         /*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4de546899dc1..acfe7c8d72fc 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -26,6 +26,13 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#ifndef pgprot_modify
+static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
+{
+        return newprot;
+}
+#endif
+
 static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
                 unsigned long addr, unsigned long end, pgprot_t newprot,
                 int dirty_accountable)
@@ -40,19 +47,17 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
                 if (pte_present(oldpte)) {
                         pte_t ptent;
 
-                        /* Avoid an SMP race with hardware updated dirty/clean
-                         * bits by wiping the pte and then setting the new pte
-                         * into place.
-                         */
-                        ptent = ptep_get_and_clear(mm, addr, pte);
+                        ptent = ptep_modify_prot_start(mm, addr, pte);
                         ptent = pte_modify(ptent, newprot);
+
                         /*
                          * Avoid taking write faults for pages we know to be
                          * dirty.
                          */
                         if (dirty_accountable && pte_dirty(ptent))
                                 ptent = pte_mkwrite(ptent);
-                        set_pte_at(mm, addr, pte, ptent);
+
+                        ptep_modify_prot_commit(mm, addr, pte, ptent);
 #ifdef CONFIG_MIGRATION
                 } else if (!pte_file(oldpte)) {
                         swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -192,7 +197,9 @@ success:
          * held in write mode.
          */
         vma->vm_flags = newflags;
-        vma->vm_page_prot = vm_get_page_prot(newflags);
+        vma->vm_page_prot = pgprot_modify(vma->vm_page_prot,
+                                          vm_get_page_prot(newflags));
+
         if (vma_wants_writenotify(vma)) {
                 vma->vm_page_prot = vm_get_page_prot(newflags & ~VM_SHARED);
                 dirty_accountable = 1;
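ptep_modify_prot_start()/ptep_modify_prot_commit() turn the old clear-then-set sequence into an explicit transaction that paravirtualised pte code can intercept or batch. On architectures that do not override them, the generic fallback is expected to behave just like the code being replaced; a hedged sketch of what such a fallback amounts to (the real definitions live in the pgtable headers, these names are illustrative):

#include <linux/mm.h>
#include <asm/pgtable.h>

/* Roughly the assumed generic (non-paravirt) behaviour of the new pair. */
static inline pte_t example_modify_prot_start(struct mm_struct *mm,
					      unsigned long addr, pte_t *ptep)
{
	/* wipe the pte atomically so hardware dirty/accessed updates are not lost */
	return ptep_get_and_clear(mm, addr, ptep);
}

static inline void example_modify_prot_commit(struct mm_struct *mm,
					      unsigned long addr,
					      pte_t *ptep, pte_t pte)
{
	set_pte_at(mm, addr, ptep, pte);	/* install the modified pte */
}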
diff --git a/mm/nommu.c b/mm/nommu.c
index ef8c62cec697..4462b6a3fcb9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -39,7 +39,7 @@ struct page *mem_map;
 unsigned long max_mapnr;
 unsigned long num_physpages;
 unsigned long askedalloc, realalloc;
-atomic_t vm_committed_space = ATOMIC_INIT(0);
+atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
 int sysctl_overcommit_ratio = 50; /* default is 50% */
 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
@@ -109,16 +109,23 @@ unsigned int kobjsize(const void *objp)
          * If the object we have should not have ksize performed on it,
          * return size of 0
          */
-        if (!objp || (unsigned long)objp >= memory_end || !((page = virt_to_page(objp))))
+        if (!objp || !virt_addr_valid(objp))
                 return 0;
 
+        page = virt_to_head_page(objp);
+
+        /*
+         * If the allocator sets PageSlab, we know the pointer came from
+         * kmalloc().
+         */
         if (PageSlab(page))
                 return ksize(objp);
 
-        BUG_ON(page->index < 0);
-        BUG_ON(page->index >= MAX_ORDER);
-
-        return (PAGE_SIZE << page->index);
+        /*
+         * The ksize() function is only guaranteed to work for pointers
+         * returned by kmalloc(). So handle arbitrary pointers here.
+         */
+        return PAGE_SIZE << compound_order(page);
 }
 
 /*
@@ -1410,7 +1417,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
          * cast `allowed' as a signed long because vm_committed_space
          * sometimes has a negative value
          */
-        if (atomic_read(&vm_committed_space) < (long)allowed)
+        if (atomic_long_read(&vm_committed_space) < (long)allowed)
                 return 0;
 error:
         vm_unacct_memory(pages);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bdd5c432c426..f32fae3121f0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -237,16 +237,7 @@ static void bad_page(struct page *page)
         printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
                 KERN_EMERG "Backtrace:\n");
         dump_stack();
-        page->flags &= ~(1 << PG_lru |
-                        1 << PG_private |
-                        1 << PG_locked |
-                        1 << PG_active |
-                        1 << PG_dirty |
-                        1 << PG_reclaim |
-                        1 << PG_slab |
-                        1 << PG_swapcache |
-                        1 << PG_writeback |
-                        1 << PG_buddy );
+        page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD;
         set_page_count(page, 0);
         reset_page_mapcount(page);
         page->mapping = NULL;
@@ -463,16 +454,7 @@ static inline int free_pages_check(struct page *page)
                 (page->mapping != NULL) |
                 (page_get_page_cgroup(page) != NULL) |
                 (page_count(page) != 0) |
-                (page->flags & (
-                        1 << PG_lru |
-                        1 << PG_private |
-                        1 << PG_locked |
-                        1 << PG_active |
-                        1 << PG_slab |
-                        1 << PG_swapcache |
-                        1 << PG_writeback |
-                        1 << PG_reserved |
-                        1 << PG_buddy ))))
+                (page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
                 bad_page(page);
         if (PageDirty(page))
                 __ClearPageDirty(page);
@@ -616,17 +598,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
                 (page->mapping != NULL) |
                 (page_get_page_cgroup(page) != NULL) |
                 (page_count(page) != 0) |
-                (page->flags & (
-                        1 << PG_lru |
-                        1 << PG_private |
-                        1 << PG_locked |
-                        1 << PG_active |
-                        1 << PG_dirty |
-                        1 << PG_slab |
-                        1 << PG_swapcache |
-                        1 << PG_writeback |
-                        1 << PG_reserved |
-                        1 << PG_buddy ))))
+                (page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
                 bad_page(page);
 
         /*
@@ -1396,6 +1368,9 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 
         (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
                                                         &preferred_zone);
+        if (!preferred_zone)
+                return NULL;
+
         classzone_idx = zone_idx(preferred_zone);
 
 zonelist_scan:
@@ -2353,7 +2328,6 @@ static void build_zonelists(pg_data_t *pgdat)
 static void build_zonelist_cache(pg_data_t *pgdat)
 {
         pgdat->node_zonelists[0].zlcache_ptr = NULL;
-        pgdat->node_zonelists[1].zlcache_ptr = NULL;
 }
 
 #endif /* CONFIG_NUMA */
@@ -2804,7 +2778,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
         alloc_size = zone->wait_table_hash_nr_entries
                                         * sizeof(wait_queue_head_t);
 
-        if (system_state == SYSTEM_BOOTING) {
+        if (!slab_is_available()) {
                 zone->wait_table = (wait_queue_head_t *)
                         alloc_bootmem_node(pgdat, alloc_size);
         } else {
@@ -2862,8 +2836,6 @@ __meminit int init_currently_empty_zone(struct zone *zone,
 
         zone->zone_start_pfn = zone_start_pfn;
 
-        memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
-
         zone_init_free_lists(zone);
 
         return 0;
@@ -3380,7 +3352,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                  * is used by this zone for memmap. This affects the watermark
                  * and per-cpu initialisations
                  */
-                memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
+                memmap_pages =
+                        PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
                 if (realsize >= memmap_pages) {
                         realsize -= memmap_pages;
                         printk(KERN_DEBUG
@@ -3433,6 +3406,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                 ret = init_currently_empty_zone(zone, zone_start_pfn,
                                                 size, MEMMAP_EARLY);
                 BUG_ON(ret);
+                memmap_init(size, nid, j, zone_start_pfn);
                 zone_start_pfn += size;
         }
 }
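The three hand-maintained flag lists collapse into shared masks (PAGE_FLAGS_CLEAR_WHEN_BAD, PAGE_FLAGS_CHECK_AT_FREE, PAGE_FLAGS_CHECK_AT_PREP) kept next to the PG_* definitions, so the free and prep checks can no longer drift apart. A hedged sketch of how such a mask is composed, using only the bits visible in the removed lines (the exact upstream definitions live in linux/page-flags.h):

#include <linux/page-flags.h>

/* Illustrative only - mirrors the bits the old free_pages_check() rejected. */
#define EXAMPLE_FLAGS_CHECK_AT_FREE				\
	(1 << PG_lru | 1 << PG_private | 1 << PG_locked |	\
	 1 << PG_active | 1 << PG_slab | 1 << PG_swapcache |	\
	 1 << PG_writeback | 1 << PG_reserved | 1 << PG_buddy)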
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 0afd2387e507..d5878bed7841 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -3,14 +3,14 @@
 #include <linux/sched.h>
 
 static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
-                          const struct mm_walk *walk, void *private)
+                          struct mm_walk *walk)
 {
         pte_t *pte;
         int err = 0;
 
         pte = pte_offset_map(pmd, addr);
         for (;;) {
-                err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private);
+                err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
                 if (err)
                         break;
                 addr += PAGE_SIZE;
@@ -24,7 +24,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 }
 
 static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
-                          const struct mm_walk *walk, void *private)
+                          struct mm_walk *walk)
 {
         pmd_t *pmd;
         unsigned long next;
@@ -35,15 +35,15 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                 next = pmd_addr_end(addr, end);
                 if (pmd_none_or_clear_bad(pmd)) {
                         if (walk->pte_hole)
-                                err = walk->pte_hole(addr, next, private);
+                                err = walk->pte_hole(addr, next, walk);
                         if (err)
                                 break;
                         continue;
                 }
                 if (walk->pmd_entry)
-                        err = walk->pmd_entry(pmd, addr, next, private);
+                        err = walk->pmd_entry(pmd, addr, next, walk);
                 if (!err && walk->pte_entry)
-                        err = walk_pte_range(pmd, addr, next, walk, private);
+                        err = walk_pte_range(pmd, addr, next, walk);
                 if (err)
                         break;
         } while (pmd++, addr = next, addr != end);
@@ -52,7 +52,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 }
 
 static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
-                          const struct mm_walk *walk, void *private)
+                          struct mm_walk *walk)
 {
         pud_t *pud;
         unsigned long next;
@@ -63,15 +63,15 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                 next = pud_addr_end(addr, end);
                 if (pud_none_or_clear_bad(pud)) {
                         if (walk->pte_hole)
-                                err = walk->pte_hole(addr, next, private);
+                                err = walk->pte_hole(addr, next, walk);
                         if (err)
                                 break;
                         continue;
                 }
                 if (walk->pud_entry)
-                        err = walk->pud_entry(pud, addr, next, private);
+                        err = walk->pud_entry(pud, addr, next, walk);
                 if (!err && (walk->pmd_entry || walk->pte_entry))
-                        err = walk_pmd_range(pud, addr, next, walk, private);
+                        err = walk_pmd_range(pud, addr, next, walk);
                 if (err)
                         break;
         } while (pud++, addr = next, addr != end);
@@ -85,15 +85,15 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
  * @addr: starting address
  * @end: ending address
  * @walk: set of callbacks to invoke for each level of the tree
- * @private: private data passed to the callback function
  *
  * Recursively walk the page table for the memory area in a VMA,
  * calling supplied callbacks. Callbacks are called in-order (first
  * PGD, first PUD, first PMD, first PTE, second PTE... second PMD,
  * etc.). If lower-level callbacks are omitted, walking depth is reduced.
  *
- * Each callback receives an entry pointer, the start and end of the
- * associated range, and a caller-supplied private data pointer.
+ * Each callback receives an entry pointer and the start and end of the
+ * associated range, and a copy of the original mm_walk for access to
+ * the ->private or ->mm fields.
  *
  * No locks are taken, but the bottom level iterator will map PTE
  * directories from highmem if necessary.
@@ -101,9 +101,8 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
  * If any callback returns a non-zero value, the walk is aborted and
  * the return value is propagated back to the caller. Otherwise 0 is returned.
  */
-int walk_page_range(const struct mm_struct *mm,
-                    unsigned long addr, unsigned long end,
-                    const struct mm_walk *walk, void *private)
+int walk_page_range(unsigned long addr, unsigned long end,
+                    struct mm_walk *walk)
 {
         pgd_t *pgd;
         unsigned long next;
@@ -112,21 +111,24 @@ int walk_page_range(const struct mm_struct *mm,
         if (addr >= end)
                 return err;
 
-        pgd = pgd_offset(mm, addr);
+        if (!walk->mm)
+                return -EINVAL;
+
+        pgd = pgd_offset(walk->mm, addr);
         do {
                 next = pgd_addr_end(addr, end);
                 if (pgd_none_or_clear_bad(pgd)) {
                         if (walk->pte_hole)
-                                err = walk->pte_hole(addr, next, private);
+                                err = walk->pte_hole(addr, next, walk);
                         if (err)
                                 break;
                         continue;
                 }
                 if (walk->pgd_entry)
-                        err = walk->pgd_entry(pgd, addr, next, private);
+                        err = walk->pgd_entry(pgd, addr, next, walk);
                 if (!err &&
                     (walk->pud_entry || walk->pmd_entry || walk->pte_entry))
-                        err = walk_pud_range(pgd, addr, next, walk, private);
+                        err = walk_pud_range(pgd, addr, next, walk);
                 if (err)
                         break;
         } while (pgd++, addr = next, addr != end);
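After this change the walker state travels in struct mm_walk itself: a caller fills in the callbacks plus ->mm (now mandatory, hence the -EINVAL check) and ->private, and every callback receives the mm_walk back in place of the old bare private pointer. A minimal sketch of a caller under the new interface (the counting callback is hypothetical):

#include <linux/mm.h>

static int count_present_pte(pte_t *pte, unsigned long addr,
			     unsigned long end, struct mm_walk *walk)
{
	unsigned long *count = walk->private;	/* private data now lives in mm_walk */

	if (pte_present(*pte))
		(*count)++;
	return 0;			/* non-zero would abort the walk */
}

static unsigned long count_present_range(struct mm_struct *mm,
					 unsigned long start, unsigned long end)
{
	unsigned long count = 0;
	struct mm_walk walk = {
		.pte_entry	= count_present_pte,
		.mm		= mm,		/* required, or -EINVAL */
		.private	= &count,
	};

	walk_page_range(start, end, &walk);
	return count;
}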
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 1c96cfc9e040..9d834aa4b979 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -207,7 +207,6 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
 
         spin_lock_irqsave(&pdflush_lock, flags);
         if (list_empty(&pdflush_list)) {
-                spin_unlock_irqrestore(&pdflush_lock, flags);
                 ret = -1;
         } else {
                 struct pdflush_work *pdf;
@@ -219,8 +218,9 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
                 pdf->fn = fn;
                 pdf->arg0 = arg0;
                 wake_up_process(pdf->who);
-                spin_unlock_irqrestore(&pdflush_lock, flags);
         }
+        spin_unlock_irqrestore(&pdflush_lock, flags);
+
         return ret;
 }
 
diff --git a/mm/slab.c b/mm/slab.c
index 06236e4ddc1b..046607f05f3e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3263,9 +3263,12 @@ retry:
 
                 if (cpuset_zone_allowed_hardwall(zone, flags) &&
                         cache->nodelists[nid] &&
-                        cache->nodelists[nid]->free_objects)
+                        cache->nodelists[nid]->free_objects) {
                                 obj = ____cache_alloc_node(cache,
                                         flags | GFP_THISNODE, nid);
+                                if (obj)
+                                        break;
+                }
         }
 
         if (!obj) {
diff --git a/mm/slob.c b/mm/slob.c
index 6038cbadf796..a3ad6671adf1 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -469,8 +469,9 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
                         return ZERO_SIZE_PTR;
 
                 m = slob_alloc(size + align, gfp, align, node);
-                if (m)
-                        *m = size;
+                if (!m)
+                        return NULL;
+                *m = size;
                 return (void *)m + align;
         } else {
                 void *ret;
diff --git a/mm/slub.c b/mm/slub.c
index a505a828ef41..1a427c0ae83b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5,7 +5,7 @@
  * The allocator synchronizes using per slab locks and only
  * uses a centralized lock to manage a pool of partial slabs.
  *
- * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
+ * (C) 2007 SGI, Christoph Lameter
  */
 
 #include <linux/mm.h>
@@ -2726,9 +2726,10 @@ size_t ksize(const void *object)
 
         page = virt_to_head_page(object);
 
-        if (unlikely(!PageSlab(page)))
+        if (unlikely(!PageSlab(page))) {
+                WARN_ON(!PageCompound(page));
                 return PAGE_SIZE << compound_order(page);
-
+        }
         s = page->slab;
 
 #ifdef CONFIG_SLUB_DEBUG
@@ -2994,8 +2995,6 @@ void __init kmem_cache_init(void)
                 create_kmalloc_cache(&kmalloc_caches[1],
                                 "kmalloc-96", 96, GFP_KERNEL);
                 caches++;
-        }
-        if (KMALLOC_MIN_SIZE <= 128) {
                 create_kmalloc_cache(&kmalloc_caches[2],
                                 "kmalloc-192", 192, GFP_KERNEL);
                 caches++;
@@ -3025,6 +3024,16 @@ void __init kmem_cache_init(void)
         for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
                 size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
 
+        if (KMALLOC_MIN_SIZE == 128) {
+                /*
+                 * The 192 byte sized cache is not used if the alignment
+                 * is 128 byte. Redirect kmalloc to use the 256 byte cache
+                 * instead.
+                 */
+                for (i = 128 + 8; i <= 192; i += 8)
+                        size_index[(i - 1) / 8] = 8;
+        }
+
         slab_state = UP;
 
         /* Provide the correct kmalloc names now that the caches are up */
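size_index[] maps a kmalloc request, rounded up to 8 bytes, to a cache index via size_index[(size - 1) / 8]. With 128-byte minimum alignment the 192-byte cache would break that alignment, so the new loop points every slot for 136..192 bytes at index 8, i.e. the 256-byte cache. A worked example of the indexing, as an illustrative comment:

/*
 * Example: kmalloc(160, ...) with KMALLOC_MIN_SIZE == 128
 *
 *   slot           = (160 - 1) / 8 = 19       (covered by the 136..192 loop)
 *   size_index[19] = 8                        (set by the loop above)
 *   cache          = kmalloc_caches[8], object size 256 bytes
 *
 * so the 160-byte request stays 128-byte aligned at the cost of some slack.
 */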
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 99c4f36eb8a3..a91b5f8fcaf6 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -1,7 +1,7 @@
 /*
  * Virtual Memory Map support
  *
- * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>.
+ * (C) 2007 sgi. Christoph Lameter.
  *
  * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
  * virt_to_page, page_address() to be implemented as a base offset
diff --git a/mm/swap.c b/mm/swap.c
index 91e194445a5e..45c9f25a8a3b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -503,7 +503,7 @@ void vm_acct_memory(long pages)
         local = &__get_cpu_var(committed_space);
         *local += pages;
         if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
-                atomic_add(*local, &vm_committed_space);
+                atomic_long_add(*local, &vm_committed_space);
                 *local = 0;
         }
         preempt_enable();
@@ -520,7 +520,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
 
         committed = &per_cpu(committed_space, (long)hcpu);
         if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
-                atomic_add(*committed, &vm_committed_space);
+                atomic_long_add(*committed, &vm_committed_space);
                 *committed = 0;
                 drain_cpu_pagevecs((long)hcpu);
         }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9a29901ad3b3..967d30ccd92b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1307,7 +1307,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                                         struct scan_control *sc)
 {
         int priority;
-        int ret = 0;
+        unsigned long ret = 0;
         unsigned long total_scanned = 0;
         unsigned long nr_reclaimed = 0;
         struct reclaim_state *reclaim_state = current->reclaim_state;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1a32130b958c..db9eabb2c5b3 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -41,7 +41,9 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
  */
 void all_vm_events(unsigned long *ret)
 {
+        get_online_cpus();
         sum_vm_events(ret, &cpu_online_map);
+        put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(all_vm_events);
 
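get_online_cpus()/put_online_cpus() pin the CPU hotplug state so cpu_online_map cannot change underneath the summation. The same bracketing applies to any walk over per-cpu data; a short hedged sketch:

#include <linux/cpu.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, example_events);

static unsigned long sum_example_events(void)
{
	unsigned long sum = 0;
	int cpu;

	get_online_cpus();		/* block CPU hotplug while we iterate */
	for_each_online_cpu(cpu)
		sum += per_cpu(example_events, cpu);
	put_online_cpus();

	return sum;
}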