Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              3
-rw-r--r--  mm/Makefile             1
-rw-r--r--  mm/allocpercpu.c       20
-rw-r--r--  mm/bootmem.c           37
-rw-r--r--  mm/bounce.c             2
-rw-r--r--  mm/filemap.c          265
-rw-r--r--  mm/filemap_xip.c       70
-rw-r--r--  mm/fremap.c             3
-rw-r--r--  mm/highmem.c            5
-rw-r--r--  mm/hugetlb.c           97
-rw-r--r--  mm/madvise.c            4
-rw-r--r--  mm/memcontrol.c        23
-rw-r--r--  mm/memory.c            79
-rw-r--r--  mm/mempolicy.c          1
-rw-r--r--  mm/migrate.c           33
-rw-r--r--  mm/mlock.c              2
-rw-r--r--  mm/mm_init.c           10
-rw-r--r--  mm/mmap.c             172
-rw-r--r--  mm/mmu_notifier.c     277
-rw-r--r--  mm/mmzone.c             2
-rw-r--r--  mm/mprotect.c           3
-rw-r--r--  mm/mremap.c             6
-rw-r--r--  mm/nommu.c             25
-rw-r--r--  mm/oom_kill.c           6
-rw-r--r--  mm/page-writeback.c    12
-rw-r--r--  mm/page_alloc.c        45
-rw-r--r--  mm/page_isolation.c    13
-rw-r--r--  mm/quicklist.c          9
-rw-r--r--  mm/readahead.c          6
-rw-r--r--  mm/rmap.c              55
-rw-r--r--  mm/shmem.c             15
-rw-r--r--  mm/shmem_acl.c          2
-rw-r--r--  mm/slab.c              12
-rw-r--r--  mm/slob.c              16
-rw-r--r--  mm/slub.c              45
-rw-r--r--  mm/sparse.c             3
-rw-r--r--  mm/swap.c               9
-rw-r--r--  mm/swap_state.c        40
-rw-r--r--  mm/swapfile.c          16
-rw-r--r--  mm/tiny-shmem.c        26
-rw-r--r--  mm/truncate.c          16
-rw-r--r--  mm/util.c              70
-rw-r--r--  mm/vmalloc.c           13
-rw-r--r--  mm/vmscan.c            88
-rw-r--r--  mm/vmstat.c            19
45 files changed, 1304 insertions, 372 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index aa799007a11b..0bd9c2dbb2a0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -205,3 +205,6 @@ config NR_QUICK
 config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
+
+config MMU_NOTIFIER
+	bool
diff --git a/mm/Makefile b/mm/Makefile
index 06ca2381fef1..da4ccf015aea 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 843364594e23..4297bc41bfd2 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -18,27 +18,28 @@
  * Depopulating per-cpu data for a cpu going offline would be a typical
  * use case. You need to register a cpu hotplug handler for that purpose.
  */
-void percpu_depopulate(void *__pdata, int cpu)
+static void percpu_depopulate(void *__pdata, int cpu)
 {
 	struct percpu_data *pdata = __percpu_disguise(__pdata);
 
 	kfree(pdata->ptrs[cpu]);
 	pdata->ptrs[cpu] = NULL;
 }
-EXPORT_SYMBOL_GPL(percpu_depopulate);
 
 /**
  * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
  * @__pdata: per-cpu data to depopulate
  * @mask: depopulate per-cpu data for cpu's selected through mask bits
  */
-void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
+static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
 {
 	int cpu;
 	for_each_cpu_mask_nr(cpu, *mask)
 		percpu_depopulate(__pdata, cpu);
 }
-EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
+
+#define percpu_depopulate_mask(__pdata, mask) \
+	__percpu_depopulate_mask((__pdata), &(mask))
 
 /**
  * percpu_populate - populate per-cpu data for given cpu
@@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
  * use case. You need to register a cpu hotplug handler for that purpose.
  * Per-cpu object is populated with zeroed buffer.
  */
-void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
+static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
 {
 	struct percpu_data *pdata = __percpu_disguise(__pdata);
 	int node = cpu_to_node(cpu);
@@ -68,7 +69,6 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
 		pdata->ptrs[cpu] = kzalloc(size, gfp);
 	return pdata->ptrs[cpu];
 }
-EXPORT_SYMBOL_GPL(percpu_populate);
 
 /**
  * percpu_populate_mask - populate per-cpu data for more cpu's
@@ -79,8 +79,8 @@ EXPORT_SYMBOL_GPL(percpu_populate);
  *
  * Per-cpu objects are populated with zeroed buffers.
  */
-int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
+static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
 			cpumask_t *mask)
 {
 	cpumask_t populated;
 	int cpu;
@@ -94,7 +94,9 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
 		cpu_set(cpu, populated);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(__percpu_populate_mask);
+
+#define percpu_populate_mask(__pdata, size, gfp, mask) \
+	__percpu_populate_mask((__pdata), (size), (gfp), &(mask))
 
 /**
  * percpu_alloc_mask - initial setup of per-cpu data
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 4af15d0340ad..ad8eec6e44a8 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -405,6 +405,29 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
 }
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
+static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
+			unsigned long step)
+{
+	unsigned long base = bdata->node_min_pfn;
+
+	/*
+	 * Align the index with respect to the node start so that the
+	 * combination of both satisfies the requested alignment.
+	 */
+
+	return ALIGN(base + idx, step) - base;
+}
+
+static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
+			unsigned long align)
+{
+	unsigned long base = PFN_PHYS(bdata->node_min_pfn);
+
+	/* Same as align_idx for byte offsets */
+
+	return ALIGN(base + off, align) - base;
+}
+
 static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 				unsigned long size, unsigned long align,
 				unsigned long goal, unsigned long limit)
@@ -441,7 +464,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 	else
 		start = ALIGN(min, step);
 
-	sidx = start - bdata->node_min_pfn;;
+	sidx = start - bdata->node_min_pfn;
 	midx = max - bdata->node_min_pfn;
 
 	if (bdata->hint_idx > sidx) {
@@ -450,7 +473,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 		 * catch the fallback below.
 		 */
 		fallback = sidx + 1;
-		sidx = ALIGN(bdata->hint_idx, step);
+		sidx = align_idx(bdata, bdata->hint_idx, step);
 	}
 
 	while (1) {
@@ -459,7 +482,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 		unsigned long eidx, i, start_off, end_off;
 find_block:
 		sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
-		sidx = ALIGN(sidx, step);
+		sidx = align_idx(bdata, sidx, step);
 		eidx = sidx + PFN_UP(size);
 
 		if (sidx >= midx || eidx > midx)
@@ -467,15 +490,15 @@ find_block:
 
 		for (i = sidx; i < eidx; i++)
 			if (test_bit(i, bdata->node_bootmem_map)) {
-				sidx = ALIGN(i, step);
+				sidx = align_idx(bdata, i, step);
 				if (sidx == i)
 					sidx += step;
 				goto find_block;
 			}
 
-		if (bdata->last_end_off &&
+		if (bdata->last_end_off & (PAGE_SIZE - 1) &&
 				PFN_DOWN(bdata->last_end_off) + 1 == sidx)
-			start_off = ALIGN(bdata->last_end_off, align);
+			start_off = align_off(bdata, bdata->last_end_off, align);
 		else
 			start_off = PFN_PHYS(sidx);
 
@@ -499,7 +522,7 @@ find_block:
 	}
 
 	if (fallback) {
-		sidx = ALIGN(fallback - 1, step);
+		sidx = align_idx(bdata, fallback - 1, step);
 		fallback = 0;
 		goto find_block;
 	}
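
A quick note on the two helpers this file gains: ALIGN() on a node-relative index only produces an aligned PFN if the node itself starts at an aligned PFN, which is exactly what the hunks above fix by routing every ALIGN() call through align_idx()/align_off(). A minimal, compilable userspace sketch of the arithmetic (illustrative only; ALIGN is reproduced here and assumes a power-of-two step):

#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

static unsigned long align_idx(unsigned long base, unsigned long idx,
			unsigned long step)
{
	return ALIGN(base + idx, step) - base;
}

int main(void)
{
	unsigned long base = 5;	/* node_min_pfn: node starts at PFN 5 */
	unsigned long idx = 2;	/* bitmap index, i.e. PFN 7 */
	unsigned long step = 4;	/* allocation wants 4-page alignment */

	/* naive: ALIGN(2, 4) = 4, i.e. PFN 9 - not 4-page aligned */
	printf("naive : PFN %lu\n", base + ALIGN(idx, step));
	/* biased: align_idx() = 3, i.e. PFN 8 - properly aligned */
	printf("biased: PFN %lu\n", base + align_idx(base, idx, step));
	return 0;
}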
diff --git a/mm/bounce.c b/mm/bounce.c
index b6d2d0f1019b..06722c403058 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	/*
 	 * Data-less bio, nothing to bounce
 	 */
-	if (bio_empty_barrier(*bio_orig))
+	if (!bio_has_data(*bio_orig))
 		return;
 
 	/*
diff --git a/mm/filemap.c b/mm/filemap.c
index 2d3ec1ffc66e..876bc595d0f8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -109,7 +109,7 @@
 /*
  * Remove a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold a write_lock on the mapping's tree_lock.
+ * is safe. The caller must hold the mapping's tree_lock.
  */
 void __remove_from_page_cache(struct page *page)
 {
@@ -141,9 +141,9 @@ void remove_from_page_cache(struct page *page)
 
 	BUG_ON(!PageLocked(page));
 
-	write_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	__remove_from_page_cache(page);
-	write_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 }
 
 static int sync_page(void *word)
@@ -442,48 +442,52 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 }
 
 /**
- * add_to_page_cache - add newly allocated pagecache pages
+ * add_to_page_cache_locked - add a locked page to the pagecache
  * @page: page to add
  * @mapping: the page's address_space
  * @offset: page index
  * @gfp_mask: page allocation mode
  *
- * This function is used to add newly allocated pagecache pages;
- * the page is new, so we can just run SetPageLocked() against it.
- * The other page state flags were set by rmqueue().
- *
+ * This function is used to add a page to the pagecache. It must be locked.
  * This function does not add the page to the LRU. The caller must do that.
  */
-int add_to_page_cache(struct page *page, struct address_space *mapping,
+int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 		pgoff_t offset, gfp_t gfp_mask)
 {
-	int error = mem_cgroup_cache_charge(page, current->mm,
+	int error;
+
+	VM_BUG_ON(!PageLocked(page));
+
+	error = mem_cgroup_cache_charge(page, current->mm,
 					gfp_mask & ~__GFP_HIGHMEM);
 	if (error)
 		goto out;
 
 	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 	if (error == 0) {
-		write_lock_irq(&mapping->tree_lock);
+		page_cache_get(page);
+		page->mapping = mapping;
+		page->index = offset;
+
+		spin_lock_irq(&mapping->tree_lock);
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
-		if (!error) {
-			page_cache_get(page);
-			SetPageLocked(page);
-			page->mapping = mapping;
-			page->index = offset;
+		if (likely(!error)) {
 			mapping->nrpages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
-		} else
+		} else {
+			page->mapping = NULL;
 			mem_cgroup_uncharge_cache_page(page);
+			page_cache_release(page);
+		}
 
-		write_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		radix_tree_preload_end();
 	} else
 		mem_cgroup_uncharge_cache_page(page);
 out:
 	return error;
 }
-EXPORT_SYMBOL(add_to_page_cache);
+EXPORT_SYMBOL(add_to_page_cache_locked);
 
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 		pgoff_t offset, gfp_t gfp_mask)
@@ -554,14 +558,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
  *
  * The first mb is necessary to safely close the critical section opened by the
- * TestSetPageLocked(), the second mb is necessary to enforce ordering between
- * the clear_bit and the read of the waitqueue (to avoid SMP races with a
- * parallel wait_on_page_locked()).
+ * test_and_set_bit() to lock the page; the second mb is necessary to enforce
+ * ordering between the clear_bit and the read of the waitqueue (to avoid SMP
+ * races with a parallel wait_on_page_locked()).
  */
 void unlock_page(struct page *page)
 {
 	smp_mb__before_clear_bit();
-	if (!TestClearPageLocked(page))
+	if (!test_and_clear_bit(PG_locked, &page->flags))
 		BUG();
 	smp_mb__after_clear_bit();
 	wake_up_page(page, PG_locked);
@@ -633,15 +637,35 @@ void __lock_page_nosync(struct page *page)
  * Is there a pagecache struct page at the given (mapping, offset) tuple?
  * If yes, increment its refcount and return it; if no, return NULL.
  */
-struct page * find_get_page(struct address_space *mapping, pgoff_t offset)
+struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
 {
+	void **pagep;
 	struct page *page;
 
-	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page)
-		page_cache_get(page);
-	read_unlock_irq(&mapping->tree_lock);
+	rcu_read_lock();
+repeat:
+	page = NULL;
+	pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
+	if (pagep) {
+		page = radix_tree_deref_slot(pagep);
+		if (unlikely(!page || page == RADIX_TREE_RETRY))
+			goto repeat;
+
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/*
+		 * Has the page moved?
+		 * This is part of the lockless pagecache protocol. See
+		 * include/linux/pagemap.h for details.
+		 */
+		if (unlikely(page != *pagep)) {
+			page_cache_release(page);
+			goto repeat;
+		}
+	}
+	rcu_read_unlock();
+
 	return page;
 }
 EXPORT_SYMBOL(find_get_page);
@@ -656,32 +680,22 @@ EXPORT_SYMBOL(find_get_page);
  *
  * Returns zero if the page was not present. find_lock_page() may sleep.
  */
-struct page *find_lock_page(struct address_space *mapping,
-				pgoff_t offset)
+struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
 {
 	struct page *page;
 
 repeat:
-	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = find_get_page(mapping, offset);
 	if (page) {
-		page_cache_get(page);
-		if (TestSetPageLocked(page)) {
-			read_unlock_irq(&mapping->tree_lock);
-			__lock_page(page);
-
-			/* Has the page been truncated while we slept? */
-			if (unlikely(page->mapping != mapping)) {
-				unlock_page(page);
-				page_cache_release(page);
-				goto repeat;
-			}
-			VM_BUG_ON(page->index != offset);
-			goto out;
+		lock_page(page);
+		/* Has the page been truncated? */
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto repeat;
 		}
+		VM_BUG_ON(page->index != offset);
 	}
-	read_unlock_irq(&mapping->tree_lock);
-out:
 	return page;
 }
 EXPORT_SYMBOL(find_lock_page);
@@ -747,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 {
 	unsigned int i;
 	unsigned int ret;
+	unsigned int nr_found;
+
+	rcu_read_lock();
+restart:
+	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+				(void ***)pages, start, nr_pages);
+	ret = 0;
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot((void **)pages[i]);
+		if (unlikely(!page))
+			continue;
+		/*
+		 * this can only trigger if nr_found == 1, making livelock
+		 * a non issue.
+		 */
+		if (unlikely(page == RADIX_TREE_RETRY))
+			goto restart;
+
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *((void **)pages[i]))) {
+			page_cache_release(page);
+			goto repeat;
+		}
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup(&mapping->page_tree,
-				(void **)pages, start, nr_pages);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
-	read_unlock_irq(&mapping->tree_lock);
+		pages[ret] = page;
+		ret++;
+	}
+	rcu_read_unlock();
 	return ret;
 }
 
@@ -774,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 {
 	unsigned int i;
 	unsigned int ret;
+	unsigned int nr_found;
+
+	rcu_read_lock();
+restart:
+	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+				(void ***)pages, index, nr_pages);
+	ret = 0;
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot((void **)pages[i]);
+		if (unlikely(!page))
+			continue;
+		/*
+		 * this can only trigger if nr_found == 1, making livelock
+		 * a non issue.
+		 */
+		if (unlikely(page == RADIX_TREE_RETRY))
+			goto restart;
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup(&mapping->page_tree,
-			(void **)pages, index, nr_pages);
-	for (i = 0; i < ret; i++) {
-		if (pages[i]->mapping == NULL || pages[i]->index != index)
+		if (page->mapping == NULL || page->index != index)
 			break;
 
-		page_cache_get(pages[i]);
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *((void **)pages[i]))) {
+			page_cache_release(page);
+			goto repeat;
+		}
+
+		pages[ret] = page;
+		ret++;
 		index++;
 	}
-	read_unlock_irq(&mapping->tree_lock);
-	return i;
+	rcu_read_unlock();
+	return ret;
 }
 EXPORT_SYMBOL(find_get_pages_contig);
 
@@ -806,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 {
 	unsigned int i;
 	unsigned int ret;
+	unsigned int nr_found;
+
+	rcu_read_lock();
+restart:
+	nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
+				(void ***)pages, *index, nr_pages, tag);
+	ret = 0;
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot((void **)pages[i]);
+		if (unlikely(!page))
+			continue;
+		/*
+		 * this can only trigger if nr_found == 1, making livelock
+		 * a non issue.
+		 */
+		if (unlikely(page == RADIX_TREE_RETRY))
+			goto restart;
+
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *((void **)pages[i]))) {
+			page_cache_release(page);
+			goto repeat;
+		}
+
+		pages[ret] = page;
+		ret++;
+	}
+	rcu_read_unlock();
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
-				(void **)pages, *index, nr_pages, tag);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
 	if (ret)
 		*index = pages[ret - 1]->index + 1;
-	read_unlock_irq(&mapping->tree_lock);
+
 	return ret;
 }
 EXPORT_SYMBOL(find_get_pages_tag);
@@ -838,7 +931,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
 	struct page *page = find_get_page(mapping, index);
 
 	if (page) {
-		if (!TestSetPageLocked(page))
+		if (trylock_page(page))
 			return page;
 		page_cache_release(page);
 		return NULL;
@@ -930,8 +1023,17 @@ find_page:
 					ra, filp, page,
 					index, last_index - index);
 		}
-		if (!PageUptodate(page))
-			goto page_not_up_to_date;
+		if (!PageUptodate(page)) {
+			if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
+					!mapping->a_ops->is_partially_uptodate)
+				goto page_not_up_to_date;
+			if (!trylock_page(page))
+				goto page_not_up_to_date;
+			if (!mapping->a_ops->is_partially_uptodate(page,
+								desc, offset))
+				goto page_not_up_to_date_locked;
+			unlock_page(page);
+		}
 page_ok:
 		/*
 		 * i_size must be checked after we know the page is Uptodate.
@@ -1001,6 +1103,7 @@ page_not_up_to_date:
 		if (lock_page_killable(page))
 			goto readpage_eio;
 
+page_not_up_to_date_locked:
 		/* Did it get truncated before we got the lock? */
 		if (!page->mapping) {
 			unlock_page(page);
@@ -1665,8 +1768,9 @@ static int __remove_suid(struct dentry *dentry, int kill)
 	return notify_change(dentry, &newattrs);
 }
 
-int remove_suid(struct dentry *dentry)
+int file_remove_suid(struct file *file)
 {
+	struct dentry *dentry = file->f_path.dentry;
 	int killsuid = should_remove_suid(dentry);
 	int killpriv = security_inode_need_killpriv(dentry);
 	int error = 0;
@@ -1680,7 +1784,7 @@ int remove_suid(struct dentry *dentry)
 
 	return error;
 }
-EXPORT_SYMBOL(remove_suid);
+EXPORT_SYMBOL(file_remove_suid);
 
 static size_t __iovec_copy_from_user_inatomic(char *vaddr,
 			const struct iovec *iov, size_t base, size_t bytes)
@@ -1775,7 +1879,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
 	 * The !iov->iov_len check ensures we skip over unlikely
 	 * zero-length segments (without overruning the iovec).
 	 */
-	while (bytes || unlikely(!iov->iov_len && i->count)) {
+	while (bytes || unlikely(i->count && !iov->iov_len)) {
 		int copy;
 
 		copy = min(bytes, iov->iov_len - base);
@@ -2025,13 +2129,20 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	 * After a write we want buffered reads to be sure to go to disk to get
 	 * the new data. We invalidate clean cached page from the region we're
 	 * about to write. We do this *before* the write so that we can return
-	 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
+	 * without clobbering -EIOCBQUEUED from ->direct_IO().
 	 */
 	if (mapping->nrpages) {
 		written = invalidate_inode_pages2_range(mapping,
 					pos >> PAGE_CACHE_SHIFT, end);
-		if (written)
+		/*
+		 * If a page can not be invalidated, return 0 to fall back
+		 * to buffered write.
+		 */
+		if (written) {
+			if (written == -EBUSY)
+				return 0;
 			goto out;
+		}
 	}
 
 	written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
@@ -2436,7 +2547,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 	if (count == 0)
 		goto out;
 
-	err = remove_suid(file->f_path.dentry);
+	err = file_remove_suid(file);
 	if (err)
 		goto out;
 
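
All of the find_get_page()/find_get_pages*() rewrites above share one shape, the lockless pagecache protocol: a racy RCU read of the tree slot, a speculative refcount grab that may fail against a concurrent free, then a re-check that the slot still holds the same page. A compilable userspace C11 analogue, cut down to a single slot (try_get() and lookup() are illustrative names standing in for page_cache_get_speculative() and the radix-tree lookup; they are not kernel API):

#include <stdatomic.h>
#include <stddef.h>

struct obj {
	atomic_int refcount;	/* 0 means the object is being freed */
};

/* stand-in for page_cache_get_speculative(): take a ref only if live */
static int try_get(struct obj *o)
{
	int c = atomic_load(&o->refcount);

	while (c > 0)
		if (atomic_compare_exchange_weak(&o->refcount, &c, c + 1))
			return 1;
	return 0;	/* lost the race with the final put */
}

struct obj *lookup(struct obj *_Atomic *slot)
{
	struct obj *o;

repeat:
	o = atomic_load(slot);		/* racy, RCU-style read */
	if (!o)
		return NULL;
	if (!try_get(o))
		goto repeat;	/* terminates: the protocol clears the slot
				 * before the final reference is dropped */
	if (o != atomic_load(slot)) {	/* "has the page moved?" re-check */
		atomic_fetch_sub(&o->refcount, 1);
		goto repeat;
	}
	return o;	/* caller now holds a stable reference */
}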
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 3e744abcce9d..b5167dfb2f2d 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -13,7 +13,10 @@
 #include <linux/module.h>
 #include <linux/uio.h>
 #include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
 #include <linux/sched.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 
@@ -21,22 +24,18 @@
  * We do use our own empty page to avoid interference with other users
  * of ZERO_PAGE(), such as /dev/zero
  */
+static DEFINE_MUTEX(xip_sparse_mutex);
+static seqcount_t xip_sparse_seq = SEQCNT_ZERO;
 static struct page *__xip_sparse_page;
 
+/* called under xip_sparse_mutex */
 static struct page *xip_sparse_page(void)
 {
 	if (!__xip_sparse_page) {
 		struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
 
-		if (page) {
-			static DEFINE_SPINLOCK(xip_alloc_lock);
-			spin_lock(&xip_alloc_lock);
-			if (!__xip_sparse_page)
-				__xip_sparse_page = page;
-			else
-				__free_page(page);
-			spin_unlock(&xip_alloc_lock);
-		}
+		if (page)
+			__xip_sparse_page = page;
 	}
 	return __xip_sparse_page;
 }
@@ -173,22 +172,27 @@ __xip_unmap (struct address_space * mapping,
 	pte_t pteval;
 	spinlock_t *ptl;
 	struct page *page;
+	unsigned count;
+	int locked = 0;
+
+	count = read_seqcount_begin(&xip_sparse_seq);
 
 	page = __xip_sparse_page;
 	if (!page)
 		return;
 
+retry:
 	spin_lock(&mapping->i_mmap_lock);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		mm = vma->vm_mm;
 		address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-		pte = page_check_address(page, mm, address, &ptl);
+		pte = page_check_address(page, mm, address, &ptl, 1);
 		if (pte) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
-			pteval = ptep_clear_flush(vma, address, pte);
+			pteval = ptep_clear_flush_notify(vma, address, pte);
 			page_remove_rmap(page, vma);
 			dec_mm_counter(mm, file_rss);
 			BUG_ON(pte_dirty(pteval));
@@ -197,6 +201,14 @@ __xip_unmap (struct address_space * mapping,
 		}
 	}
 	spin_unlock(&mapping->i_mmap_lock);
+
+	if (locked) {
+		mutex_unlock(&xip_sparse_mutex);
+	} else if (read_seqcount_retry(&xip_sparse_seq, count)) {
+		mutex_lock(&xip_sparse_mutex);
+		locked = 1;
+		goto retry;
+	}
 }
 
 /*
@@ -217,7 +229,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	int error;
 
 	/* XXX: are VM_FAULT_ codes OK? */
-
+again:
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (vmf->pgoff >= size)
 		return VM_FAULT_SIGBUS;
@@ -236,8 +248,10 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		int err;
 
 		/* maybe shared writable, allocate new block */
+		mutex_lock(&xip_sparse_mutex);
 		error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
 							&xip_mem, &xip_pfn);
+		mutex_unlock(&xip_sparse_mutex);
 		if (error)
 			return VM_FAULT_SIGBUS;
 		/* unmap sparse mappings at pgoff from all other vmas */
@@ -251,14 +265,34 @@ found:
 		BUG_ON(err);
 		return VM_FAULT_NOPAGE;
 	} else {
+		int err, ret = VM_FAULT_OOM;
+
+		mutex_lock(&xip_sparse_mutex);
+		write_seqcount_begin(&xip_sparse_seq);
+		error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
+							&xip_mem, &xip_pfn);
+		if (unlikely(!error)) {
+			write_seqcount_end(&xip_sparse_seq);
+			mutex_unlock(&xip_sparse_mutex);
+			goto again;
+		}
+		if (error != -ENODATA)
+			goto out;
 		/* not shared and writable, use xip_sparse_page() */
 		page = xip_sparse_page();
 		if (!page)
-			return VM_FAULT_OOM;
+			goto out;
+		err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
+							page);
+		if (err == -ENOMEM)
+			goto out;
 
-		page_cache_get(page);
-		vmf->page = page;
-		return 0;
+		ret = VM_FAULT_NOPAGE;
+out:
+		write_seqcount_end(&xip_sparse_seq);
+		mutex_unlock(&xip_sparse_mutex);
+
+		return ret;
 	}
 }
 
@@ -307,8 +341,10 @@ __xip_file_write(struct file *filp, const char __user *buf,
 						&xip_mem, &xip_pfn);
 		if (status == -ENODATA) {
 			/* we allocate a new page unmap it */
+			mutex_lock(&xip_sparse_mutex);
 			status = a_ops->get_xip_mem(mapping, index, 1,
 							&xip_mem, &xip_pfn);
+			mutex_unlock(&xip_sparse_mutex);
 			if (!status)
 				/* unmap page at pgoff from all other vmas */
 				__xip_unmap(mapping, index);
@@ -380,7 +416,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
 	if (count == 0)
 		goto out_backing;
 
-	ret = remove_suid(filp->f_path.dentry);
+	ret = file_remove_suid(filp);
 	if (ret)
 		goto out_backing;
 
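
__xip_unmap() above is an instance of an optimistic-read pattern: the common case walks the mappings unlocked under a sequence counter, and only a detected race with a writer pays for the mutex and a second, locked pass. A stripped-down userspace sketch of that control flow (illustrative only; the write side, which increments the counter around each update, is omitted here just as it lives in the fault path above):

#include <pthread.h>
#include <stdatomic.h>

static atomic_uint seq;		/* even: quiescent, odd: writer active */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static unsigned read_begin(void)
{
	unsigned s;

	while ((s = atomic_load(&seq)) & 1)
		;		/* spin while a writer is mid-update */
	return s;
}

static int read_retry(unsigned s)
{
	atomic_thread_fence(memory_order_acquire);
	return atomic_load(&seq) != s;	/* changed => a writer raced us */
}

void unmap_all(void)
{
	unsigned s = read_begin();
	int locked = 0;

retry:
	/* ... walk and clear the mappings, as __xip_unmap() does ... */

	if (locked) {
		pthread_mutex_unlock(&lock);
	} else if (read_retry(s)) {
		pthread_mutex_lock(&lock);	/* race seen: redo it locked */
		locked = 1;
		goto retry;
	}
}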
diff --git a/mm/fremap.c b/mm/fremap.c
index 07a9c82ce1a3..7881638e4a12 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -15,6 +15,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
@@ -214,7 +215,9 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 		spin_unlock(&mapping->i_mmap_lock);
 	}
 
+	mmu_notifier_invalidate_range_start(mm, start, start + size);
 	err = populate_range(mm, vma, start, size, pgoff);
+	mmu_notifier_invalidate_range_end(mm, start, start + size);
 	if (!err && !(flags & MAP_NONBLOCK)) {
 		if (unlikely(has_write_lock)) {
 			downgrade_write(&mm->mmap_sem);
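
This is the first of several hunks in the series that bracket pte changes with mmu notifier calls; hugetlb.c and memory.c below repeat it. The convention, condensed (kernel-style fragment, illustrative rather than complete):

	/*
	 * Before touching ptes in [start, end), tell any secondary MMUs
	 * (e.g. a hypervisor's shadow page tables registered through
	 * mmu_notifier_register()) so they can drop their cached
	 * translations, and tell them again once the primary page
	 * tables are done:
	 */
	mmu_notifier_invalidate_range_start(mm, start, end);
	/* ... modify the page tables for [start, end) ... */
	mmu_notifier_invalidate_range_end(mm, start, end);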
diff --git a/mm/highmem.c b/mm/highmem.c
index e16e1523b688..b36b83b920ff 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -70,6 +70,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
 static void flush_all_zero_pkmaps(void)
 {
 	int i;
+	int need_flush = 0;
 
 	flush_cache_kmaps();
 
@@ -101,8 +102,10 @@ static void flush_all_zero_pkmaps(void)
 			  &pkmap_page_table[i]);
 
 		set_page_address(page, NULL);
+		need_flush = 1;
 	}
-	flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
+	if (need_flush)
+		flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
 }
 
 /**
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a8bf4ab01f86..67a71191136e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -9,6 +9,7 @@
 #include <linux/mm.h>
 #include <linux/sysctl.h>
 #include <linux/highmem.h>
+#include <linux/mmu_notifier.h>
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
 #include <linux/mempolicy.h>
@@ -19,6 +20,7 @@
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
+#include <asm/io.h>
 
 #include <linux/hugetlb.h>
 #include "internal.h"
@@ -563,7 +565,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 					huge_page_order(h));
 	if (page) {
 		if (arch_prepare_hugepage(page)) {
-			__free_pages(page, HUGETLB_PAGE_ORDER);
+			__free_pages(page, huge_page_order(h));
 			return NULL;
 		}
 		prep_new_huge_page(h, page, nid);
@@ -663,6 +665,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 					__GFP_REPEAT|__GFP_NOWARN,
 					huge_page_order(h));
 
+	if (page && arch_prepare_hugepage(page)) {
+		__free_pages(page, huge_page_order(h));
+		return NULL;
+	}
+
 	spin_lock(&hugetlb_lock);
 	if (page) {
 		/*
@@ -1026,18 +1033,6 @@ static void __init report_hugepages(void)
 	}
 }
 
-static unsigned int cpuset_mems_nr(unsigned int *array)
-{
-	int node;
-	unsigned int nr = 0;
-
-	for_each_node_mask(node, cpuset_current_mems_allowed)
-		nr += array[node];
-
-	return nr;
-}
-
-#ifdef CONFIG_SYSCTL
 #ifdef CONFIG_HIGHMEM
 static void try_to_free_low(struct hstate *h, unsigned long count)
 {
@@ -1293,7 +1288,12 @@ module_exit(hugetlb_exit);
 
 static int __init hugetlb_init(void)
 {
-	BUILD_BUG_ON(HPAGE_SHIFT == 0);
+	/* Some platform decide whether they support huge pages at boot
+	 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
+	 * there is no such support
+	 */
+	if (HPAGE_SHIFT == 0)
+		return 0;
 
 	if (!size_to_hstate(default_hstate_size)) {
 		default_hstate_size = HPAGE_SIZE;
@@ -1386,6 +1386,18 @@ static int __init hugetlb_default_setup(char *s)
 }
 __setup("default_hugepagesz=", hugetlb_default_setup);
 
+static unsigned int cpuset_mems_nr(unsigned int *array)
+{
+	int node;
+	unsigned int nr = 0;
+
+	for_each_node_mask(node, cpuset_current_mems_allowed)
+		nr += array[node];
+
+	return nr;
+}
+
+#ifdef CONFIG_SYSCTL
 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 			   struct file *file, void __user *buffer,
 			   size_t *length, loff_t *ppos)
@@ -1672,6 +1684,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	BUG_ON(start & ~huge_page_mask(h));
 	BUG_ON(end & ~huge_page_mask(h));
 
+	mmu_notifier_invalidate_range_start(mm, start, end);
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address);
@@ -1713,6 +1726,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	}
 	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
+	mmu_notifier_invalidate_range_end(mm, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
 		list_del(&page->lru);
 		put_page(page);
@@ -1928,6 +1942,18 @@ retry:
 		lock_page(page);
 	}
 
+	/*
+	 * If we are going to COW a private mapping later, we examine the
+	 * pending reservations for this page now. This will ensure that
+	 * any allocations necessary to record that reservation occur outside
+	 * the spinlock.
+	 */
+	if (write_access && !(vma->vm_flags & VM_SHARED))
+		if (vma_needs_reservation(h, vma, address) < 0) {
+			ret = VM_FAULT_OOM;
+			goto backout_unlocked;
+		}
+
 	spin_lock(&mm->page_table_lock);
 	size = i_size_read(mapping->host) >> huge_page_shift(h);
 	if (idx >= size)
@@ -1953,6 +1979,7 @@ out:
 
 backout:
 	spin_unlock(&mm->page_table_lock);
+backout_unlocked:
 	unlock_page(page);
 	put_page(page);
 	goto out;
@@ -1964,6 +1991,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t *ptep;
 	pte_t entry;
 	int ret;
+	struct page *pagecache_page = NULL;
 	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 	struct hstate *h = hstate_vma(vma);
 
@@ -1980,25 +2008,44 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	entry = huge_ptep_get(ptep);
 	if (huge_pte_none(entry)) {
 		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
-		mutex_unlock(&hugetlb_instantiation_mutex);
-		return ret;
+		goto out_unlock;
 	}
 
 	ret = 0;
 
+	/*
+	 * If we are going to COW the mapping later, we examine the pending
+	 * reservations for this page now. This will ensure that any
+	 * allocations necessary to record that reservation occur outside the
+	 * spinlock. For private mappings, we also lookup the pagecache
+	 * page now as it is used to determine if a reservation has been
+	 * consumed.
+	 */
+	if (write_access && !pte_write(entry)) {
+		if (vma_needs_reservation(h, vma, address) < 0) {
+			ret = VM_FAULT_OOM;
+			goto out_unlock;
+		}
+
+		if (!(vma->vm_flags & VM_SHARED))
+			pagecache_page = hugetlbfs_pagecache_page(h,
+								vma, address);
+	}
+
 	spin_lock(&mm->page_table_lock);
 	/* Check for a racing update before calling hugetlb_cow */
 	if (likely(pte_same(entry, huge_ptep_get(ptep))))
-		if (write_access && !pte_write(entry)) {
-			struct page *page;
-			page = hugetlbfs_pagecache_page(h, vma, address);
-			ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
-			if (page) {
-				unlock_page(page);
-				put_page(page);
-			}
-		}
+		if (write_access && !pte_write(entry))
+			ret = hugetlb_cow(mm, vma, address, ptep, entry,
+							pagecache_page);
 	spin_unlock(&mm->page_table_lock);
+
+	if (pagecache_page) {
+		unlock_page(pagecache_page);
+		put_page(pagecache_page);
+	}
+
+out_unlock:
 	mutex_unlock(&hugetlb_instantiation_mutex);
 
 	return ret;
index 23a0ec3e0ea0..f9349c18a1b5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -132,10 +132,10 @@ static long madvise_willneed(struct vm_area_struct * vma,
  * Application no longer needs these pages. If the pages are dirty,
  * it's OK to just throw them away. The app will be more careful about
  * data it wants to keep. Be sure to free swap resources too. The
- * zap_page_range call sets things up for refill_inactive to actually free
+ * zap_page_range call sets things up for shrink_active_list to actually free
  * these pages later if no one else has touched them in the meantime,
  * although we could add these pages to a global reuse list for
- * refill_inactive to pick up before reclaiming other pages.
+ * shrink_active_list to pick up before reclaiming other pages.
  *
  * NB: This interface discards data rather than pushes it out to swap,
  * as some implementations do. This has performance implications for
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fba566c51322..36896f3eb7f5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -250,6 +250,14 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 {
+	/*
+	 * mm_update_next_owner() may clear mm->owner to NULL
+	 * if it races with swapoff, page migration, etc.
+	 * So this can be called with p == NULL.
+	 */
+	if (unlikely(!p))
+		return NULL;
+
 	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
 				struct mem_cgroup, css);
 }
@@ -549,6 +557,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 	if (likely(!memcg)) {
 		rcu_read_lock();
 		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+		if (unlikely(!mem)) {
+			rcu_read_unlock();
+			kmem_cache_free(page_cgroup_cache, pc);
+			return 0;
+		}
 		/*
 		 * For every charge from the cgroup, increment reference count
 		 */
@@ -796,14 +809,21 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 
 	if (mem_cgroup_subsys.disabled)
 		return 0;
+	if (!mm)
+		return 0;
 
 	rcu_read_lock();
 	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+	if (unlikely(!mem)) {
+		rcu_read_unlock();
+		return 0;
+	}
 	css_get(&mem->css);
 	rcu_read_unlock();
 
 	do {
 		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
+		progress += res_counter_check_under_limit(&mem->res);
 	} while (!progress && --retry);
 
 	css_put(&mem->css);
@@ -1168,9 +1188,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 	mem = mem_cgroup_from_cont(cont);
 	old_mem = mem_cgroup_from_cont(old_cont);
 
-	if (mem == old_mem)
-		goto out;
-
 	/*
 	 * Only thread group leaders are allowed to migrate, the mm_struct is
 	 * in effect owned by the leader
diff --git a/mm/memory.c b/mm/memory.c
index 262e3eb6601a..1002f473f497 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -51,6 +51,7 @@
 #include <linux/init.h>
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -374,7 +375,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
  *
  * The calling function must still handle the error.
  */
-void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
+static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
+			  unsigned long vaddr)
 {
 	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
 		"vm_flags = %lx, vaddr = %lx\n",
@@ -651,6 +653,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	unsigned long next;
 	unsigned long addr = vma->vm_start;
 	unsigned long end = vma->vm_end;
+	int ret;
 
 	/*
 	 * Don't copy ptes where a page fault will fill them correctly.
@@ -666,17 +669,33 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
+	/*
+	 * We need to invalidate the secondary MMU mappings only when
+	 * there could be a permission downgrade on the ptes of the
+	 * parent mm. And a permission downgrade will only happen if
+	 * is_cow_mapping() returns true.
+	 */
+	if (is_cow_mapping(vma->vm_flags))
+		mmu_notifier_invalidate_range_start(src_mm, addr, end);
+
+	ret = 0;
 	dst_pgd = pgd_offset(dst_mm, addr);
 	src_pgd = pgd_offset(src_mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(src_pgd))
 			continue;
-		if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
-				vma, addr, next))
-			return -ENOMEM;
+		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
+					    vma, addr, next))) {
+			ret = -ENOMEM;
+			break;
+		}
 	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
-	return 0;
+
+	if (is_cow_mapping(vma->vm_flags))
+		mmu_notifier_invalidate_range_end(src_mm,
+						  vma->vm_start, end);
+	return ret;
 }
 
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -880,7 +899,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
 	unsigned long start = start_addr;
 	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
 	int fullmm = (*tlbp)->fullmm;
+	struct mm_struct *mm = vma->vm_mm;
 
+	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
 		unsigned long end;
 
@@ -945,6 +966,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
 		}
 	}
 out:
+	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
 	return start;	/* which is now the end (or restart) address */
 }
 
@@ -972,6 +994,29 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 	return end;
 }
 
+/**
+ * zap_vma_ptes - remove ptes mapping the vma
+ * @vma: vm_area_struct holding ptes to be zapped
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ *
+ * This function only unmaps ptes assigned to VM_PFNMAP vmas.
+ *
+ * The entire address range must be fully contained within the vma.
+ *
+ * Returns 0 if successful.
+ */
+int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
+		unsigned long size)
+{
+	if (address < vma->vm_start || address + size > vma->vm_end ||
+			!(vma->vm_flags & VM_PFNMAP))
+		return -1;
+	zap_page_range(vma, address, size, NULL);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(zap_vma_ptes);
+
 /*
  * Do a quick page-table lookup for a single page.
  */
@@ -1615,10 +1660,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 {
 	pgd_t *pgd;
 	unsigned long next;
-	unsigned long end = addr + size;
+	unsigned long start = addr, end = addr + size;
 	int err;
 
 	BUG_ON(addr >= end);
+	mmu_notifier_invalidate_range_start(mm, start, end);
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
@@ -1626,6 +1672,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
+	mmu_notifier_invalidate_range_end(mm, start, end);
 	return err;
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1742,7 +1789,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * not dirty accountable.
 	 */
 	if (PageAnon(old_page)) {
-		if (!TestSetPageLocked(old_page)) {
+		if (trylock_page(old_page)) {
 			reuse = can_share_swap_page(old_page);
 			unlock_page(old_page);
 		}
@@ -1838,7 +1885,7 @@ gotten:
 	 * seen in the presence of one thread doing SMC and another
 	 * thread doing COW.
 	 */
-	ptep_clear_flush(vma, address, page_table);
+	ptep_clear_flush_notify(vma, address, page_table);
 	set_pte_at(mm, address, page_table, entry);
 	update_mmu_cache(vma, address, entry);
 	lru_cache_add_active(new_page);
@@ -2718,16 +2765,26 @@ int make_pages_present(unsigned long addr, unsigned long end)
2718 2765
2719 vma = find_vma(current->mm, addr); 2766 vma = find_vma(current->mm, addr);
2720 if (!vma) 2767 if (!vma)
2721 return -1; 2768 return -ENOMEM;
2722 write = (vma->vm_flags & VM_WRITE) != 0; 2769 write = (vma->vm_flags & VM_WRITE) != 0;
2723 BUG_ON(addr >= end); 2770 BUG_ON(addr >= end);
2724 BUG_ON(end > vma->vm_end); 2771 BUG_ON(end > vma->vm_end);
2725 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; 2772 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
2726 ret = get_user_pages(current, current->mm, addr, 2773 ret = get_user_pages(current, current->mm, addr,
2727 len, write, 0, NULL, NULL); 2774 len, write, 0, NULL, NULL);
2728 if (ret < 0) 2775 if (ret < 0) {
2776 /*
 2777 SUS requires strange return values for mlock:
 2778 - an invalid address generates ENOMEM.
 2779 - being out of memory should generate EAGAIN.
2780 */
2781 if (ret == -EFAULT)
2782 ret = -ENOMEM;
2783 else if (ret == -ENOMEM)
2784 ret = -EAGAIN;
2729 return ret; 2785 return ret;
2730 return ret == len ? 0 : -1; 2786 }
2787 return ret == len ? 0 : -ENOMEM;
2731} 2788}
2732 2789
2733#if !defined(__HAVE_ARCH_GATE_AREA) 2790#if !defined(__HAVE_ARCH_GATE_AREA)
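
The new zap_vma_ptes() above is a thin exported wrapper that lets a driver tear down the ptes of a VM_PFNMAP mapping it owns. A hedged usage sketch follows; the device structure and its vma bookkeeping are hypothetical, and only zap_vma_ptes() itself comes from this patch:

	/* Revoke the pages a device previously mapped into userspace.
	 * 'dev->user_vma' is an assumed field recorded at mmap time. */
	static void mydev_revoke_user_mapping(struct mydev *dev)
	{
		struct vm_area_struct *vma = dev->user_vma;

		if (!vma)
			return;
		/* Legal only on VM_PFNMAP vmas; the range must lie inside the vma. */
		if (zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start))
			printk(KERN_WARNING "mydev: zap_vma_ptes failed\n");
	}

Subsequent user accesses to the range then fault back into the driver's fault handler, or take SIGBUS if there is none.
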
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e550bec20582..83369058ec13 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -803,7 +803,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
803int do_migrate_pages(struct mm_struct *mm, 803int do_migrate_pages(struct mm_struct *mm,
804 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 804 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
805{ 805{
806 LIST_HEAD(pagelist);
807 int busy = 0; 806 int busy = 0;
808 int err = 0; 807 int err = 0;
809 nodemask_t tmp; 808 nodemask_t tmp;
diff --git a/mm/migrate.c b/mm/migrate.c
index d8c65a65c61d..2a80136b23bb 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -285,7 +285,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
285 285
286 page = migration_entry_to_page(entry); 286 page = migration_entry_to_page(entry);
287 287
288 get_page(page); 288 /*
 289 * Once the radix-tree replacement step of page migration has
 290 * started, page_count *must* be zero. And we don't want to call
 291 * wait_on_page_locked() against a page we hold no reference to.
 292 * So we use get_page_unless_zero() here. Even if it fails, the
 293 * page fault will simply occur again.
294 */
295 if (!get_page_unless_zero(page))
296 goto out;
289 pte_unmap_unlock(ptep, ptl); 297 pte_unmap_unlock(ptep, ptl);
290 wait_on_page_locked(page); 298 wait_on_page_locked(page);
291 put_page(page); 299 put_page(page);
@@ -305,6 +313,7 @@ out:
305static int migrate_page_move_mapping(struct address_space *mapping, 313static int migrate_page_move_mapping(struct address_space *mapping,
306 struct page *newpage, struct page *page) 314 struct page *newpage, struct page *page)
307{ 315{
316 int expected_count;
308 void **pslot; 317 void **pslot;
309 318
310 if (!mapping) { 319 if (!mapping) {
@@ -314,14 +323,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
314 return 0; 323 return 0;
315 } 324 }
316 325
317 write_lock_irq(&mapping->tree_lock); 326 spin_lock_irq(&mapping->tree_lock);
318 327
319 pslot = radix_tree_lookup_slot(&mapping->page_tree, 328 pslot = radix_tree_lookup_slot(&mapping->page_tree,
320 page_index(page)); 329 page_index(page));
321 330
322 if (page_count(page) != 2 + !!PagePrivate(page) || 331 expected_count = 2 + !!PagePrivate(page);
332 if (page_count(page) != expected_count ||
323 (struct page *)radix_tree_deref_slot(pslot) != page) { 333 (struct page *)radix_tree_deref_slot(pslot) != page) {
324 write_unlock_irq(&mapping->tree_lock); 334 spin_unlock_irq(&mapping->tree_lock);
335 return -EAGAIN;
336 }
337
338 if (!page_freeze_refs(page, expected_count)) {
339 spin_unlock_irq(&mapping->tree_lock);
325 return -EAGAIN; 340 return -EAGAIN;
326 } 341 }
327 342
@@ -338,6 +353,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
338 353
339 radix_tree_replace_slot(pslot, newpage); 354 radix_tree_replace_slot(pslot, newpage);
340 355
356 page_unfreeze_refs(page, expected_count);
341 /* 357 /*
342 * Drop cache reference from old page. 358 * Drop cache reference from old page.
343 * We know this isn't the last reference. 359 * We know this isn't the last reference.
@@ -357,10 +373,9 @@ static int migrate_page_move_mapping(struct address_space *mapping,
357 __dec_zone_page_state(page, NR_FILE_PAGES); 373 __dec_zone_page_state(page, NR_FILE_PAGES);
358 __inc_zone_page_state(newpage, NR_FILE_PAGES); 374 __inc_zone_page_state(newpage, NR_FILE_PAGES);
359 375
360 write_unlock_irq(&mapping->tree_lock); 376 spin_unlock_irq(&mapping->tree_lock);
361 if (!PageSwapCache(newpage)) { 377 if (!PageSwapCache(newpage))
362 mem_cgroup_uncharge_cache_page(page); 378 mem_cgroup_uncharge_cache_page(page);
363 }
364 379
365 return 0; 380 return 0;
366} 381}
@@ -590,7 +605,7 @@ static int move_to_new_page(struct page *newpage, struct page *page)
590 * establishing additional references. We are the only one 605 * establishing additional references. We are the only one
591 * holding a reference to the new page at this point. 606 * holding a reference to the new page at this point.
592 */ 607 */
593 if (TestSetPageLocked(newpage)) 608 if (!trylock_page(newpage))
594 BUG(); 609 BUG();
595 610
596 /* Prepare mapping for the new page.*/ 611 /* Prepare mapping for the new page.*/
@@ -652,7 +667,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
652 BUG_ON(charge); 667 BUG_ON(charge);
653 668
654 rc = -EAGAIN; 669 rc = -EAGAIN;
655 if (TestSetPageLocked(page)) { 670 if (!trylock_page(page)) {
656 if (!force) 671 if (!force)
657 goto move_newpage; 672 goto move_newpage;
658 lock_page(page); 673 lock_page(page);
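
The page_freeze_refs()/page_unfreeze_refs() pair introduced above is what makes the radix-tree slot replacement safe against lockless pagecache lookups: the reference count is atomically switched to zero only if it still equals expected_count, so a concurrent speculative get_page_unless_zero() either completed beforehand (making the cmpxchg fail, hence the -EAGAIN) or sees zero and backs off. A standalone sketch of the semantics, using a bare C11 atomic instead of the real page->_count machinery:

	#include <stdatomic.h>

	/* Succeeds, returning nonzero, only if nobody holds an extra
	 * (speculative) reference beyond the expected ones. */
	static int freeze_refs(atomic_int *count, int expected)
	{
		int old = expected;
		return atomic_compare_exchange_strong(count, &old, 0);
	}

	static void unfreeze_refs(atomic_int *count, int expected)
	{
		atomic_store(count, expected);	/* refs now belong to the new page */
	}
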
diff --git a/mm/mlock.c b/mm/mlock.c
index 7b2656055d6a..01fbe93eff5c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -78,8 +78,6 @@ success:
78 78
79 mm->locked_vm -= pages; 79 mm->locked_vm -= pages;
80out: 80out:
81 if (ret == -ENOMEM)
82 ret = -EAGAIN;
83 return ret; 81 return ret;
84} 82}
85 83
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c6af41ea9994..4e0e26591dfa 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -12,7 +12,11 @@
12#include "internal.h" 12#include "internal.h"
13 13
14#ifdef CONFIG_DEBUG_MEMORY_INIT 14#ifdef CONFIG_DEBUG_MEMORY_INIT
15int __meminitdata mminit_loglevel; 15int mminit_loglevel;
16
17#ifndef SECTIONS_SHIFT
18#define SECTIONS_SHIFT 0
19#endif
16 20
17/* The zonelists are simply reported, validation is manual. */ 21/* The zonelists are simply reported, validation is manual. */
18void mminit_verify_zonelist(void) 22void mminit_verify_zonelist(void)
@@ -74,11 +78,7 @@ void __init mminit_verify_pageflags_layout(void)
74 NR_PAGEFLAGS); 78 NR_PAGEFLAGS);
75 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", 79 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
76 "Section %d Node %d Zone %d\n", 80 "Section %d Node %d Zone %d\n",
77#ifdef SECTIONS_SHIFT
78 SECTIONS_SHIFT, 81 SECTIONS_SHIFT,
79#else
80 0,
81#endif
82 NODES_SHIFT, 82 NODES_SHIFT,
83 ZONES_SHIFT); 83 ZONES_SHIFT);
84 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", 84 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets",
diff --git a/mm/mmap.c b/mm/mmap.c
index 5e0cc99e9cd5..e7a5a68a9c2e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -26,6 +26,7 @@
26#include <linux/mount.h> 26#include <linux/mount.h>
27#include <linux/mempolicy.h> 27#include <linux/mempolicy.h>
28#include <linux/rmap.h> 28#include <linux/rmap.h>
29#include <linux/mmu_notifier.h>
29 30
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/cacheflush.h> 32#include <asm/cacheflush.h>
@@ -369,7 +370,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
369 if (vma_tmp->vm_end > addr) { 370 if (vma_tmp->vm_end > addr) {
370 vma = vma_tmp; 371 vma = vma_tmp;
371 if (vma_tmp->vm_start <= addr) 372 if (vma_tmp->vm_start <= addr)
372 return vma; 373 break;
373 __rb_link = &__rb_parent->rb_left; 374 __rb_link = &__rb_parent->rb_left;
374 } else { 375 } else {
375 rb_prev = __rb_parent; 376 rb_prev = __rb_parent;
@@ -1029,6 +1030,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
1029 } else { 1030 } else {
1030 switch (flags & MAP_TYPE) { 1031 switch (flags & MAP_TYPE) {
1031 case MAP_SHARED: 1032 case MAP_SHARED:
1033 /*
1034 * Ignore pgoff.
1035 */
1036 pgoff = 0;
1032 vm_flags |= VM_SHARED | VM_MAYSHARE; 1037 vm_flags |= VM_SHARED | VM_MAYSHARE;
1033 break; 1038 break;
1034 case MAP_PRIVATE: 1039 case MAP_PRIVATE:
@@ -2061,6 +2066,7 @@ void exit_mmap(struct mm_struct *mm)
2061 2066
2062 /* mm's last user has gone, and its about to be pulled down */ 2067 /* mm's last user has gone, and its about to be pulled down */
2063 arch_exit_mmap(mm); 2068 arch_exit_mmap(mm);
2069 mmu_notifier_release(mm);
2064 2070
2065 lru_add_drain(); 2071 lru_add_drain();
2066 flush_cache_mm(mm); 2072 flush_cache_mm(mm);
@@ -2268,3 +2274,167 @@ int install_special_mapping(struct mm_struct *mm,
2268 2274
2269 return 0; 2275 return 0;
2270} 2276}
2277
2278static DEFINE_MUTEX(mm_all_locks_mutex);
2279
2280static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2281{
2282 if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
2283 /*
2284 * The LSB of head.next can't change from under us
2285 * because we hold the mm_all_locks_mutex.
2286 */
2287 spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
2288 /*
2289 * We can safely modify head.next after taking the
2290 * anon_vma->lock. If some other vma in this mm shares
2291 * the same anon_vma we won't take it again.
2292 *
2293 * No need of atomic instructions here, head.next
2294 * can't change from under us thanks to the
2295 * anon_vma->lock.
2296 */
2297 if (__test_and_set_bit(0, (unsigned long *)
2298 &anon_vma->head.next))
2299 BUG();
2300 }
2301}
2302
2303static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2304{
2305 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2306 /*
2307 * AS_MM_ALL_LOCKS can't change from under us because
2308 * we hold the mm_all_locks_mutex.
2309 *
2310 * Operations on ->flags have to be atomic because
2311 * even if AS_MM_ALL_LOCKS is stable thanks to the
2312 * mm_all_locks_mutex, there may be other cpus
2313 * changing other bitflags in parallel to us.
2314 */
2315 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
2316 BUG();
2317 spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
2318 }
2319}
2320
2321/*
2322 * This operation locks against the VM for all pte/vma/mm related
2323 * operations that could ever happen on a certain mm. This includes
2324 * vmtruncate, try_to_unmap, and all page faults.
2325 *
2326 * The caller must take the mmap_sem in write mode before calling
2327 * mm_take_all_locks(). The caller isn't allowed to release the
2328 * mmap_sem until mm_drop_all_locks() returns.
2329 *
2330 * mmap_sem in write mode is required in order to block all operations
2331 * that could modify pagetables and free pages without need of
2332 * altering the vma layout (for example populate_range() with
2334 * nonlinear vmas). It's also needed in write mode to prevent new
2335 * anon_vmas from being associated with existing vmas.
2335 *
2336 * A single task can't take more than one mm_take_all_locks() in a row
2337 * or it would deadlock.
2338 *
2339 * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
2340 * mapping->flags avoid taking the same lock twice, if more than one
2341 * vma in this mm is backed by the same anon_vma or address_space.
2342 *
2343 * We can take all the locks in random order because the VM code
2344 * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
2345 * takes more than one of them in a row. Secondly we're protected
2346 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
2347 *
2348 * mm_take_all_locks() and mm_drop_all_locks are expensive operations
2349 * that may have to take thousands of locks.
2350 *
2351 * mm_take_all_locks() can fail if it's interrupted by signals.
2352 */
2353int mm_take_all_locks(struct mm_struct *mm)
2354{
2355 struct vm_area_struct *vma;
2356 int ret = -EINTR;
2357
2358 BUG_ON(down_read_trylock(&mm->mmap_sem));
2359
2360 mutex_lock(&mm_all_locks_mutex);
2361
2362 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2363 if (signal_pending(current))
2364 goto out_unlock;
2365 if (vma->vm_file && vma->vm_file->f_mapping)
2366 vm_lock_mapping(mm, vma->vm_file->f_mapping);
2367 }
2368
2369 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2370 if (signal_pending(current))
2371 goto out_unlock;
2372 if (vma->anon_vma)
2373 vm_lock_anon_vma(mm, vma->anon_vma);
2374 }
2375
2376 ret = 0;
2377
2378out_unlock:
2379 if (ret)
2380 mm_drop_all_locks(mm);
2381
2382 return ret;
2383}
2384
2385static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2386{
2387 if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
2388 /*
2389 * The LSB of head.next can't change to 0 from under
2390 * us because we hold the mm_all_locks_mutex.
2391 *
2392 * We must however clear the bitflag before unlocking
2393 * the vma so the users using the anon_vma->head will
2394 * never see our bitflag.
2395 *
2396 * No need of atomic instructions here, head.next
2397 * can't change from under us until we release the
2398 * anon_vma->lock.
2399 */
2400 if (!__test_and_clear_bit(0, (unsigned long *)
2401 &anon_vma->head.next))
2402 BUG();
2403 spin_unlock(&anon_vma->lock);
2404 }
2405}
2406
2407static void vm_unlock_mapping(struct address_space *mapping)
2408{
2409 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2410 /*
2411 * AS_MM_ALL_LOCKS can't change to 0 from under us
2412 * because we hold the mm_all_locks_mutex.
2413 */
2414 spin_unlock(&mapping->i_mmap_lock);
2415 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
2416 &mapping->flags))
2417 BUG();
2418 }
2419}
2420
2421/*
2422 * The mmap_sem cannot be released by the caller until
2423 * mm_drop_all_locks() returns.
2424 */
2425void mm_drop_all_locks(struct mm_struct *mm)
2426{
2427 struct vm_area_struct *vma;
2428
2429 BUG_ON(down_read_trylock(&mm->mmap_sem));
2430 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
2431
2432 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2433 if (vma->anon_vma)
2434 vm_unlock_anon_vma(vma->anon_vma);
2435 if (vma->vm_file && vma->vm_file->f_mapping)
2436 vm_unlock_mapping(vma->vm_file->f_mapping);
2437 }
2438
2439 mutex_unlock(&mm_all_locks_mutex);
2440}
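
mm_take_all_locks() exists so a subsystem can quiesce every pte/vma/mm operation on an mm while it publishes new state; the mmu notifier registration in the new file below is its first caller. The calling convention, restated as a sketch (this mirrors do_mmu_notifier_register() below rather than adding any new API):

	down_write(&mm->mmap_sem);
	ret = mm_take_all_locks(mm);	/* -EINTR if a signal is pending */
	if (!ret) {
		/* ... publish state that faults and rmap walks must not race with ... */
		mm_drop_all_locks(mm);
	}
	up_write(&mm->mmap_sem);
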
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
index 000000000000..5f4ef0250bee
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,277 @@
1/*
2 * linux/mm/mmu_notifier.c
3 *
4 * Copyright (C) 2008 Qumranet, Inc.
5 * Copyright (C) 2008 SGI
6 * Christoph Lameter <clameter@sgi.com>
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2. See
9 * the COPYING file in the top-level directory.
10 */
11
12#include <linux/rculist.h>
13#include <linux/mmu_notifier.h>
14#include <linux/module.h>
15#include <linux/mm.h>
16#include <linux/err.h>
17#include <linux/rcupdate.h>
18#include <linux/sched.h>
19
20/*
21 * This function can't run concurrently with mmu_notifier_register
22 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
23 * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
24 * in parallel despite there being no task using this mm any more,
25 * through the vmas outside of the exit_mmap context, such as with
26 * vmtruncate. This serializes against mmu_notifier_unregister with
27 * the mmu_notifier_mm->lock in addition to RCU, and it serializes
28 * against the other mmu notifiers with RCU. struct mmu_notifier_mm
29 * can't go away from under us as exit_mmap holds an mm_count pin
30 * itself.
31 */
32void __mmu_notifier_release(struct mm_struct *mm)
33{
34 struct mmu_notifier *mn;
35
36 spin_lock(&mm->mmu_notifier_mm->lock);
37 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
38 mn = hlist_entry(mm->mmu_notifier_mm->list.first,
39 struct mmu_notifier,
40 hlist);
41 /*
42 * We arrived before mmu_notifier_unregister, so
43 * mmu_notifier_unregister will do nothing other than
44 * wait for ->release to finish and then
45 * return.
46 */
47 hlist_del_init_rcu(&mn->hlist);
48 /*
49 * RCU here will block mmu_notifier_unregister until
50 * ->release returns.
51 */
52 rcu_read_lock();
53 spin_unlock(&mm->mmu_notifier_mm->lock);
54 /*
55 * If ->release runs before mmu_notifier_unregister, it
56 * must be handled here, as it's the only way for the
57 * driver to flush all existing sptes and to stop
58 * establishing any more sptes before all the pages in
59 * the mm are freed.
60 */
61 if (mn->ops->release)
62 mn->ops->release(mn, mm);
63 rcu_read_unlock();
64 spin_lock(&mm->mmu_notifier_mm->lock);
65 }
66 spin_unlock(&mm->mmu_notifier_mm->lock);
67
68 /*
69 * synchronize_rcu here prevents mmu_notifier_release from
70 * returning to exit_mmap (which would proceed to free all pages
71 * in the mm) until the ->release method returns, if it was
72 * invoked by mmu_notifier_unregister.
73 *
74 * The mmu_notifier_mm can't go away from under us because one
75 * mm_count reference is held by exit_mmap.
76 */
77 synchronize_rcu();
78}
79
80/*
81 * If the hardware does not support a young bitflag, ->clear_flush_young can
82 * unmap the address and return 1 or 0 depending on whether the mapping
83 * previously existed or not.
84 */
85int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
86 unsigned long address)
87{
88 struct mmu_notifier *mn;
89 struct hlist_node *n;
90 int young = 0;
91
92 rcu_read_lock();
93 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
94 if (mn->ops->clear_flush_young)
95 young |= mn->ops->clear_flush_young(mn, mm, address);
96 }
97 rcu_read_unlock();
98
99 return young;
100}
101
102void __mmu_notifier_invalidate_page(struct mm_struct *mm,
103 unsigned long address)
104{
105 struct mmu_notifier *mn;
106 struct hlist_node *n;
107
108 rcu_read_lock();
109 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
110 if (mn->ops->invalidate_page)
111 mn->ops->invalidate_page(mn, mm, address);
112 }
113 rcu_read_unlock();
114}
115
116void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
117 unsigned long start, unsigned long end)
118{
119 struct mmu_notifier *mn;
120 struct hlist_node *n;
121
122 rcu_read_lock();
123 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
124 if (mn->ops->invalidate_range_start)
125 mn->ops->invalidate_range_start(mn, mm, start, end);
126 }
127 rcu_read_unlock();
128}
129
130void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
131 unsigned long start, unsigned long end)
132{
133 struct mmu_notifier *mn;
134 struct hlist_node *n;
135
136 rcu_read_lock();
137 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
138 if (mn->ops->invalidate_range_end)
139 mn->ops->invalidate_range_end(mn, mm, start, end);
140 }
141 rcu_read_unlock();
142}
143
144static int do_mmu_notifier_register(struct mmu_notifier *mn,
145 struct mm_struct *mm,
146 int take_mmap_sem)
147{
148 struct mmu_notifier_mm *mmu_notifier_mm;
149 int ret;
150
151 BUG_ON(atomic_read(&mm->mm_users) <= 0);
152
153 ret = -ENOMEM;
154 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
155 if (unlikely(!mmu_notifier_mm))
156 goto out;
157
158 if (take_mmap_sem)
159 down_write(&mm->mmap_sem);
160 ret = mm_take_all_locks(mm);
161 if (unlikely(ret))
162 goto out_cleanup;
163
164 if (!mm_has_notifiers(mm)) {
165 INIT_HLIST_HEAD(&mmu_notifier_mm->list);
166 spin_lock_init(&mmu_notifier_mm->lock);
167 mm->mmu_notifier_mm = mmu_notifier_mm;
168 mmu_notifier_mm = NULL;
169 }
170 atomic_inc(&mm->mm_count);
171
172 /*
173 * Serialize the update against mmu_notifier_unregister. A
174 * side note: mmu_notifier_release can't run concurrently with
175 * us because we hold the mm_users pin (either implicitly as
176 * current->mm or explicitly with get_task_mm() or similar).
177 * We can't race against any other mmu notifier method either
178 * thanks to mm_take_all_locks().
179 */
180 spin_lock(&mm->mmu_notifier_mm->lock);
181 hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
182 spin_unlock(&mm->mmu_notifier_mm->lock);
183
184 mm_drop_all_locks(mm);
185out_cleanup:
186 if (take_mmap_sem)
187 up_write(&mm->mmap_sem);
188 /* kfree() does nothing if mmu_notifier_mm is NULL */
189 kfree(mmu_notifier_mm);
190out:
191 BUG_ON(atomic_read(&mm->mm_users) <= 0);
192 return ret;
193}
194
195/*
196 * Must not hold mmap_sem nor any other VM related lock when calling
197 * this registration function. Must also ensure mm_users can't go down
198 * to zero while this runs to avoid races with mmu_notifier_release,
199 * so mm has to be current->mm or the mm should be pinned safely such
200 * as with get_task_mm(). If the mm is not current->mm, the mm_users
201 * pin should be released by calling mmput after mmu_notifier_register
202 * returns. mmu_notifier_unregister must always be called to
203 * unregister the notifier. mm_count is automatically pinned to allow
204 * mmu_notifier_unregister to safely run at any time later, before or
205 * after exit_mmap. ->release will always be called before exit_mmap
206 * frees the pages.
207 */
208int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
209{
210 return do_mmu_notifier_register(mn, mm, 1);
211}
212EXPORT_SYMBOL_GPL(mmu_notifier_register);
213
214/*
215 * Same as mmu_notifier_register but here the caller must hold the
216 * mmap_sem in write mode.
217 */
218int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
219{
220 return do_mmu_notifier_register(mn, mm, 0);
221}
222EXPORT_SYMBOL_GPL(__mmu_notifier_register);
223
224/* this is called after the last mmu_notifier_unregister() returned */
225void __mmu_notifier_mm_destroy(struct mm_struct *mm)
226{
227 BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list));
228 kfree(mm->mmu_notifier_mm);
229 mm->mmu_notifier_mm = LIST_POISON1; /* debug */
230}
231
232/*
233 * This releases the mm_count pin automatically and frees the mm
234 * structure if it was the last user of it. It serializes against
235 * running mmu notifiers with RCU and against mmu_notifier_unregister
236 * with the unregister lock + RCU. All sptes must be dropped before
237 * calling mmu_notifier_unregister. ->release or any other notifier
238 * method may be invoked concurrently with mmu_notifier_unregister,
239 * and only after mmu_notifier_unregister has returned are we
240 * guaranteed that ->release or any other method can't run anymore.
241 */
242void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
243{
244 BUG_ON(atomic_read(&mm->mm_count) <= 0);
245
246 spin_lock(&mm->mmu_notifier_mm->lock);
247 if (!hlist_unhashed(&mn->hlist)) {
248 hlist_del_rcu(&mn->hlist);
249
250 /*
251 * RCU here will force exit_mmap to wait for ->release to finish
252 * before freeing the pages.
253 */
254 rcu_read_lock();
255 spin_unlock(&mm->mmu_notifier_mm->lock);
256 /*
257 * exit_mmap will block in mmu_notifier_release to
258 * guarantee ->release is called before freeing the
259 * pages.
260 */
261 if (mn->ops->release)
262 mn->ops->release(mn, mm);
263 rcu_read_unlock();
264 } else
265 spin_unlock(&mm->mmu_notifier_mm->lock);
266
267 /*
268 * Wait for any running method to finish, of course including
269 * ->release if it was run by mmu_notifier_release instead of us.
270 */
271 synchronize_rcu();
272
273 BUG_ON(atomic_read(&mm->mm_count) <= 0);
274
275 mmdrop(mm);
276}
277EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
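
For context, a client of this new file supplies a struct mmu_notifier whose ops the hooks above invoke. A minimal, hypothetical registration; the ops structure and the register call match this patch, while my_flush_secondary_tlb() stands in for whatever the driver uses to drop its sptes:

	static void my_invalidate_page(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
	{
		my_flush_secondary_tlb(mn, address);	/* hypothetical helper */
	}

	static const struct mmu_notifier_ops my_ops = {
		.invalidate_page = my_invalidate_page,
	};

	static struct mmu_notifier my_mn = { .ops = &my_ops };

	/* current->mm is implicitly pinned; no VM locks may be held here. */
	err = mmu_notifier_register(&my_mn, current->mm);

The notifier must later be torn down with mmu_notifier_unregister(), which also drops the mm_count pin taken at registration.
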
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 486ed595ee6f..16ce8b955dcf 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -69,6 +69,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
69 (z->zone && !zref_in_nodemask(z, nodes))) 69 (z->zone && !zref_in_nodemask(z, nodes)))
70 z++; 70 z++;
71 71
72 *zone = zonelist_zone(z++); 72 *zone = zonelist_zone(z);
73 return z; 73 return z;
74} 74}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index abd645a3b0a0..fded06f923f4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -21,6 +21,7 @@
21#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/swap.h> 22#include <linux/swap.h>
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h>
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25#include <asm/pgtable.h> 26#include <asm/pgtable.h>
26#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
@@ -203,10 +204,12 @@ success:
203 dirty_accountable = 1; 204 dirty_accountable = 1;
204 } 205 }
205 206
207 mmu_notifier_invalidate_range_start(mm, start, end);
206 if (is_vm_hugetlb_page(vma)) 208 if (is_vm_hugetlb_page(vma))
207 hugetlb_change_protection(vma, start, end, vma->vm_page_prot); 209 hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
208 else 210 else
209 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); 211 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
212 mmu_notifier_invalidate_range_end(mm, start, end);
210 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 213 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
211 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 214 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
212 return 0; 215 return 0;
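
This is the same bracketing pattern the patch applies in memory.c above and mremap.c below: every batched pte modification is wrapped as

	mmu_notifier_invalidate_range_start(mm, start, end);
	/* ... change or remove ptes in [start, end) ... */
	mmu_notifier_invalidate_range_end(mm, start, end);

so a secondary MMU must drop its mappings for the range at _start and may not re-establish them until _end, which keeps batched teardown correct without a notifier call per pte.
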
diff --git a/mm/mremap.c b/mm/mremap.c
index 08e3c7f2bd15..1a7743923c8c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -18,6 +18,7 @@
18#include <linux/highmem.h> 18#include <linux/highmem.h>
19#include <linux/security.h> 19#include <linux/security.h>
20#include <linux/syscalls.h> 20#include <linux/syscalls.h>
21#include <linux/mmu_notifier.h>
21 22
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23#include <asm/cacheflush.h> 24#include <asm/cacheflush.h>
@@ -74,7 +75,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
74 struct mm_struct *mm = vma->vm_mm; 75 struct mm_struct *mm = vma->vm_mm;
75 pte_t *old_pte, *new_pte, pte; 76 pte_t *old_pte, *new_pte, pte;
76 spinlock_t *old_ptl, *new_ptl; 77 spinlock_t *old_ptl, *new_ptl;
78 unsigned long old_start;
77 79
80 old_start = old_addr;
81 mmu_notifier_invalidate_range_start(vma->vm_mm,
82 old_start, old_end);
78 if (vma->vm_file) { 83 if (vma->vm_file) {
79 /* 84 /*
80 * Subtle point from Rajesh Venkatasubramanian: before 85 * Subtle point from Rajesh Venkatasubramanian: before
@@ -116,6 +121,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
116 pte_unmap_unlock(old_pte - 1, old_ptl); 121 pte_unmap_unlock(old_pte - 1, old_ptl);
117 if (mapping) 122 if (mapping)
118 spin_unlock(&mapping->i_mmap_lock); 123 spin_unlock(&mapping->i_mmap_lock);
124 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
119} 125}
120 126
121#define LATENCY_LIMIT (64 * PAGE_SIZE) 127#define LATENCY_LIMIT (64 * PAGE_SIZE)
diff --git a/mm/nommu.c b/mm/nommu.c
index 4462b6a3fcb9..ed75bc962fbe 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,7 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
25#include <linux/ptrace.h> 25#include <linux/tracehook.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/mount.h> 28#include <linux/mount.h>
@@ -266,6 +266,27 @@ void *vmalloc_node(unsigned long size, int node)
266} 266}
267EXPORT_SYMBOL(vmalloc_node); 267EXPORT_SYMBOL(vmalloc_node);
268 268
269#ifndef PAGE_KERNEL_EXEC
270# define PAGE_KERNEL_EXEC PAGE_KERNEL
271#endif
272
273/**
274 * vmalloc_exec - allocate virtually contiguous, executable memory
275 * @size: allocation size
276 *
277 * Kernel-internal function to allocate enough pages to cover @size
278 * from the page level allocator and map them into contiguous and
279 * executable kernel virtual space.
280 *
281 * For tight control over page level allocator and protection flags
282 * use __vmalloc() instead.
283 */
284
285void *vmalloc_exec(unsigned long size)
286{
287 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
288}
289
269/** 290/**
270 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 291 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
271 * @size: allocation size 292 * @size: allocation size
@@ -745,7 +766,7 @@ static unsigned long determine_vm_flags(struct file *file,
745 * it's being traced - otherwise breakpoints set in it may interfere 766 * it's being traced - otherwise breakpoints set in it may interfere
746 * with another untraced process 767 * with another untraced process
747 */ 768 */
748 if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED)) 769 if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
749 vm_flags &= ~VM_MAYSHARE; 770 vm_flags &= ~VM_MAYSHARE;
750 771
751 return vm_flags; 772 return vm_flags;
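
The nommu vmalloc_exec() is simply __vmalloc() with PAGE_KERNEL_EXEC protections, mirroring the MMU version in mm/vmalloc.c. An illustrative caller (the image and len inputs are assumptions) would be a loader placing generated or relocated code:

	void *code = vmalloc_exec(len);
	if (!code)
		return -ENOMEM;
	memcpy(code, image, len);
	flush_icache_range((unsigned long)code, (unsigned long)code + len);
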
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8a5467ee6265..64e5b4bcd964 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,6 +26,7 @@
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/notifier.h> 27#include <linux/notifier.h>
28#include <linux/memcontrol.h> 28#include <linux/memcontrol.h>
29#include <linux/security.h>
29 30
30int sysctl_panic_on_oom; 31int sysctl_panic_on_oom;
31int sysctl_oom_kill_allocating_task; 32int sysctl_oom_kill_allocating_task;
@@ -128,7 +129,8 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
128 * Superuser processes are usually more important, so we make it 129 * Superuser processes are usually more important, so we make it
129 * less likely that we kill those. 130 * less likely that we kill those.
130 */ 131 */
131 if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE)) 132 if (has_capability(p, CAP_SYS_ADMIN) ||
133 has_capability(p, CAP_SYS_RESOURCE))
132 points /= 4; 134 points /= 4;
133 135
134 /* 136 /*
@@ -137,7 +139,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
137 * tend to only have this flag set on applications they think 139 * tend to only have this flag set on applications they think
138 * of as important. 140 * of as important.
139 */ 141 */
140 if (__capable(p, CAP_SYS_RAWIO)) 142 if (has_capability(p, CAP_SYS_RAWIO))
141 points /= 4; 143 points /= 4;
142 144
143 /* 145 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 94c6d8988ab3..24de8b65fdbd 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1088,7 +1088,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1088 if (!mapping) 1088 if (!mapping)
1089 return 1; 1089 return 1;
1090 1090
1091 write_lock_irq(&mapping->tree_lock); 1091 spin_lock_irq(&mapping->tree_lock);
1092 mapping2 = page_mapping(page); 1092 mapping2 = page_mapping(page);
1093 if (mapping2) { /* Race with truncate? */ 1093 if (mapping2) { /* Race with truncate? */
1094 BUG_ON(mapping2 != mapping); 1094 BUG_ON(mapping2 != mapping);
@@ -1102,7 +1102,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1102 radix_tree_tag_set(&mapping->page_tree, 1102 radix_tree_tag_set(&mapping->page_tree,
1103 page_index(page), PAGECACHE_TAG_DIRTY); 1103 page_index(page), PAGECACHE_TAG_DIRTY);
1104 } 1104 }
1105 write_unlock_irq(&mapping->tree_lock); 1105 spin_unlock_irq(&mapping->tree_lock);
1106 if (mapping->host) { 1106 if (mapping->host) {
1107 /* !PageAnon && !swapper_space */ 1107 /* !PageAnon && !swapper_space */
1108 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1108 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1258,7 +1258,7 @@ int test_clear_page_writeback(struct page *page)
1258 struct backing_dev_info *bdi = mapping->backing_dev_info; 1258 struct backing_dev_info *bdi = mapping->backing_dev_info;
1259 unsigned long flags; 1259 unsigned long flags;
1260 1260
1261 write_lock_irqsave(&mapping->tree_lock, flags); 1261 spin_lock_irqsave(&mapping->tree_lock, flags);
1262 ret = TestClearPageWriteback(page); 1262 ret = TestClearPageWriteback(page);
1263 if (ret) { 1263 if (ret) {
1264 radix_tree_tag_clear(&mapping->page_tree, 1264 radix_tree_tag_clear(&mapping->page_tree,
@@ -1269,7 +1269,7 @@ int test_clear_page_writeback(struct page *page)
1269 __bdi_writeout_inc(bdi); 1269 __bdi_writeout_inc(bdi);
1270 } 1270 }
1271 } 1271 }
1272 write_unlock_irqrestore(&mapping->tree_lock, flags); 1272 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1273 } else { 1273 } else {
1274 ret = TestClearPageWriteback(page); 1274 ret = TestClearPageWriteback(page);
1275 } 1275 }
@@ -1287,7 +1287,7 @@ int test_set_page_writeback(struct page *page)
1287 struct backing_dev_info *bdi = mapping->backing_dev_info; 1287 struct backing_dev_info *bdi = mapping->backing_dev_info;
1288 unsigned long flags; 1288 unsigned long flags;
1289 1289
1290 write_lock_irqsave(&mapping->tree_lock, flags); 1290 spin_lock_irqsave(&mapping->tree_lock, flags);
1291 ret = TestSetPageWriteback(page); 1291 ret = TestSetPageWriteback(page);
1292 if (!ret) { 1292 if (!ret) {
1293 radix_tree_tag_set(&mapping->page_tree, 1293 radix_tree_tag_set(&mapping->page_tree,
@@ -1300,7 +1300,7 @@ int test_set_page_writeback(struct page *page)
1300 radix_tree_tag_clear(&mapping->page_tree, 1300 radix_tree_tag_clear(&mapping->page_tree,
1301 page_index(page), 1301 page_index(page),
1302 PAGECACHE_TAG_DIRTY); 1302 PAGECACHE_TAG_DIRTY);
1303 write_unlock_irqrestore(&mapping->tree_lock, flags); 1303 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1304 } else { 1304 } else {
1305 ret = TestSetPageWriteback(page); 1305 ret = TestSetPageWriteback(page);
1306 } 1306 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6da667274df5..27b8681139fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -268,13 +268,14 @@ void prep_compound_page(struct page *page, unsigned long order)
268{ 268{
269 int i; 269 int i;
270 int nr_pages = 1 << order; 270 int nr_pages = 1 << order;
271 struct page *p = page + 1;
271 272
272 set_compound_page_dtor(page, free_compound_page); 273 set_compound_page_dtor(page, free_compound_page);
273 set_compound_order(page, order); 274 set_compound_order(page, order);
274 __SetPageHead(page); 275 __SetPageHead(page);
275 for (i = 1; i < nr_pages; i++) { 276 for (i = 1; i < nr_pages; i++, p++) {
276 struct page *p = page + i; 277 if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
277 278 p = pfn_to_page(page_to_pfn(page) + i);
278 __SetPageTail(p); 279 __SetPageTail(p);
279 p->first_page = page; 280 p->first_page = page;
280 } 281 }
@@ -284,6 +285,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
284{ 285{
285 int i; 286 int i;
286 int nr_pages = 1 << order; 287 int nr_pages = 1 << order;
288 struct page *p = page + 1;
287 289
288 if (unlikely(compound_order(page) != order)) 290 if (unlikely(compound_order(page) != order))
289 bad_page(page); 291 bad_page(page);
@@ -291,8 +293,9 @@ static void destroy_compound_page(struct page *page, unsigned long order)
291 if (unlikely(!PageHead(page))) 293 if (unlikely(!PageHead(page)))
292 bad_page(page); 294 bad_page(page);
293 __ClearPageHead(page); 295 __ClearPageHead(page);
294 for (i = 1; i < nr_pages; i++) { 296 for (i = 1; i < nr_pages; i++, p++) {
295 struct page *p = page + i; 297 if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
298 p = pfn_to_page(page_to_pfn(page) + i);
296 299
297 if (unlikely(!PageTail(p) | 300 if (unlikely(!PageTail(p) |
298 (p->first_page != page))) 301 (p->first_page != page)))
@@ -694,6 +697,9 @@ static int move_freepages(struct zone *zone,
694#endif 697#endif
695 698
696 for (page = start_page; page <= end_page;) { 699 for (page = start_page; page <= end_page;) {
700 /* Make sure we are not inadvertently changing nodes */
701 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
702
697 if (!pfn_valid_within(page_to_pfn(page))) { 703 if (!pfn_valid_within(page_to_pfn(page))) {
698 page++; 704 page++;
699 continue; 705 continue;
@@ -2372,7 +2378,7 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2372 2378
2373#endif /* CONFIG_NUMA */ 2379#endif /* CONFIG_NUMA */
2374 2380
2375/* return values int ....just for stop_machine_run() */ 2381/* the return value is int just for the sake of stop_machine() */
2376static int __build_all_zonelists(void *dummy) 2382static int __build_all_zonelists(void *dummy)
2377{ 2383{
2378 int nid; 2384 int nid;
@@ -2397,7 +2403,7 @@ void build_all_zonelists(void)
2397 } else { 2403 } else {
2398 /* we have to stop all cpus to guarantee there is no user 2404 /* we have to stop all cpus to guarantee there is no user
2399 of zonelist */ 2405 of zonelist */
2400 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); 2406 stop_machine(__build_all_zonelists, NULL, NULL);
2401 /* cpuset refresh routine should be here */ 2407 /* cpuset refresh routine should be here */
2402 } 2408 }
2403 vm_total_pages = nr_free_pagecache_pages(); 2409 vm_total_pages = nr_free_pagecache_pages();
@@ -2516,6 +2522,10 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2516 continue; 2522 continue;
2517 page = pfn_to_page(pfn); 2523 page = pfn_to_page(pfn);
2518 2524
2525 /* Watch out for overlapping nodes */
2526 if (page_to_nid(page) != zone_to_nid(zone))
2527 continue;
2528
2519 /* Blocks with reserved pages will never free, skip them. */ 2529 /* Blocks with reserved pages will never free, skip them. */
2520 if (PageReserved(page)) 2530 if (PageReserved(page))
2521 continue; 2531 continue;
@@ -3753,23 +3763,6 @@ unsigned long __init find_min_pfn_with_active_regions(void)
3753 return find_min_pfn_for_node(MAX_NUMNODES); 3763 return find_min_pfn_for_node(MAX_NUMNODES);
3754} 3764}
3755 3765
3756/**
3757 * find_max_pfn_with_active_regions - Find the maximum PFN registered
3758 *
3759 * It returns the maximum PFN based on information provided via
3760 * add_active_range().
3761 */
3762unsigned long __init find_max_pfn_with_active_regions(void)
3763{
3764 int i;
3765 unsigned long max_pfn = 0;
3766
3767 for (i = 0; i < nr_nodemap_entries; i++)
3768 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
3769
3770 return max_pfn;
3771}
3772
3773/* 3766/*
3774 * early_calculate_totalpages() 3767 * early_calculate_totalpages()
3775 * Sum pages in active regions for movable zone. 3768 * Sum pages in active regions for movable zone.
@@ -4081,7 +4074,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4081} 4074}
4082 4075
4083#ifndef CONFIG_NEED_MULTIPLE_NODES 4076#ifndef CONFIG_NEED_MULTIPLE_NODES
4084struct pglist_data contig_page_data = { .bdata = &bootmem_node_data[0] }; 4077struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] };
4085EXPORT_SYMBOL(contig_page_data); 4078EXPORT_SYMBOL(contig_page_data);
4086#endif 4079#endif
4087 4080
@@ -4454,7 +4447,7 @@ void *__init alloc_large_system_hash(const char *tablename,
4454 do { 4447 do {
4455 size = bucketsize << log2qty; 4448 size = bucketsize << log2qty;
4456 if (flags & HASH_EARLY) 4449 if (flags & HASH_EARLY)
4457 table = alloc_bootmem(size); 4450 table = alloc_bootmem_nopanic(size);
4458 else if (hashdist) 4451 else if (hashdist)
4459 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4452 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4460 else { 4453 else {
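
The prep_compound_page()/destroy_compound_page() hunks fix a subtle pointer-arithmetic assumption: with SPARSEMEM (and no VMEMMAP) the memmap is only virtually contiguous within one MAX_ORDER block, so the tail-page pointer must be re-derived from the pfn whenever the loop index crosses a MAX_ORDER_NR_PAGES boundary:

	/* Wrong once 'i' leaves the head page's MAX_ORDER block:
	 *	struct page *p = page + i;
	 * Safe, because pfns stay linear even when the memmap does not:
	 */
	struct page *p = pfn_to_page(page_to_pfn(page) + i);

The loops above only pay the pfn_to_page() cost at block boundaries and keep the plain p++ everywhere else.
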
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 3444b58033c8..b70a7fec1ff6 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -2,7 +2,6 @@
2 * linux/mm/page_isolation.c 2 * linux/mm/page_isolation.c
3 */ 3 */
4 4
5#include <stddef.h>
6#include <linux/mm.h> 5#include <linux/mm.h>
7#include <linux/page-isolation.h> 6#include <linux/page-isolation.h>
8#include <linux/pageblock-flags.h> 7#include <linux/pageblock-flags.h>
@@ -115,8 +114,10 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
115 114
116int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 115int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
117{ 116{
118 unsigned long pfn; 117 unsigned long pfn, flags;
119 struct page *page; 118 struct page *page;
119 struct zone *zone;
120 int ret;
120 121
121 pfn = start_pfn; 122 pfn = start_pfn;
122 /* 123 /*
@@ -132,7 +133,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
132 if (pfn < end_pfn) 133 if (pfn < end_pfn)
133 return -EBUSY; 134 return -EBUSY;
134 /* Check all pages are free or Marked as ISOLATED */ 135 /* Check all pages are free or Marked as ISOLATED */
135 if (__test_page_isolated_in_pageblock(start_pfn, end_pfn)) 136 zone = page_zone(pfn_to_page(pfn));
136 return 0; 137 spin_lock_irqsave(&zone->lock, flags);
137 return -EBUSY; 138 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn);
139 spin_unlock_irqrestore(&zone->lock, flags);
140 return ret ? 0 : -EBUSY;
138} 141}
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 3f703f7cb398..8dbb6805ef35 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -26,7 +26,10 @@ DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
26static unsigned long max_pages(unsigned long min_pages) 26static unsigned long max_pages(unsigned long min_pages)
27{ 27{
28 unsigned long node_free_pages, max; 28 unsigned long node_free_pages, max;
29 struct zone *zones = NODE_DATA(numa_node_id())->node_zones; 29 int node = numa_node_id();
30 struct zone *zones = NODE_DATA(node)->node_zones;
31 int num_cpus_on_node;
32 node_to_cpumask_ptr(cpumask_on_node, node);
30 33
31 node_free_pages = 34 node_free_pages =
32#ifdef CONFIG_ZONE_DMA 35#ifdef CONFIG_ZONE_DMA
@@ -38,6 +41,10 @@ static unsigned long max_pages(unsigned long min_pages)
38 zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); 41 zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);
39 42
40 max = node_free_pages / FRACTION_OF_NODE_MEM; 43 max = node_free_pages / FRACTION_OF_NODE_MEM;
44
45 num_cpus_on_node = cpus_weight_nr(*cpumask_on_node);
46 max /= num_cpus_on_node;
47
41 return max(max, min_pages); 48 return max(max, min_pages);
42} 49}
43 50
diff --git a/mm/readahead.c b/mm/readahead.c
index d8723a5f6496..77e8ddf945e9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -382,9 +382,9 @@ ondemand_readahead(struct address_space *mapping,
382 if (hit_readahead_marker) { 382 if (hit_readahead_marker) {
383 pgoff_t start; 383 pgoff_t start;
384 384
385 read_lock_irq(&mapping->tree_lock); 385 rcu_read_lock();
 386 start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); 386 start = radix_tree_next_hole(&mapping->page_tree, offset, max+1);
387 read_unlock_irq(&mapping->tree_lock); 387 rcu_read_unlock();
388 388
389 if (!start || start - offset > max) 389 if (!start || start - offset > max)
390 return 0; 390 return 0;
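
Dropping tree_lock in favour of rcu_read_lock() is safe here because radix_tree_next_hole() only performs RCU-protected lookups and never dereferences the pages it finds; a result made stale by a concurrent insertion merely yields a slightly off readahead decision. The same shape as a sketch, with the caveat spelled out:

	rcu_read_lock();
	start = radix_tree_next_hole(&mapping->page_tree, offset, max + 1);
	rcu_read_unlock();
	/* 'start' may already be stale - harmless, since it only tunes
	 * the readahead window and no page is ever touched through it. */
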
diff --git a/mm/rmap.c b/mm/rmap.c
index abbd29f7c43f..0383acfcb068 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,6 +49,7 @@
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kallsyms.h> 50#include <linux/kallsyms.h>
51#include <linux/memcontrol.h> 51#include <linux/memcontrol.h>
52#include <linux/mmu_notifier.h>
52 53
53#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
54 55
@@ -138,7 +139,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
138 anon_vma_free(anon_vma); 139 anon_vma_free(anon_vma);
139} 140}
140 141
141static void anon_vma_ctor(struct kmem_cache *cachep, void *data) 142static void anon_vma_ctor(void *data)
142{ 143{
143 struct anon_vma *anon_vma = data; 144 struct anon_vma *anon_vma = data;
144 145
@@ -223,10 +224,14 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
223/* 224/*
224 * Check that @page is mapped at @address into @mm. 225 * Check that @page is mapped at @address into @mm.
225 * 226 *
227 * If @sync is false, page_check_address may perform a racy check to avoid
228 * the page table lock when the pte is not present (helpful when reclaiming
229 * highly shared pages).
230 *
226 * On success returns with pte mapped and locked. 231 * On success returns with pte mapped and locked.
227 */ 232 */
228pte_t *page_check_address(struct page *page, struct mm_struct *mm, 233pte_t *page_check_address(struct page *page, struct mm_struct *mm,
229 unsigned long address, spinlock_t **ptlp) 234 unsigned long address, spinlock_t **ptlp, int sync)
230{ 235{
231 pgd_t *pgd; 236 pgd_t *pgd;
232 pud_t *pud; 237 pud_t *pud;
@@ -248,7 +253,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
248 253
249 pte = pte_offset_map(pmd, address); 254 pte = pte_offset_map(pmd, address);
250 /* Make a quick check before getting the lock */ 255 /* Make a quick check before getting the lock */
251 if (!pte_present(*pte)) { 256 if (!sync && !pte_present(*pte)) {
252 pte_unmap(pte); 257 pte_unmap(pte);
253 return NULL; 258 return NULL;
254 } 259 }
@@ -280,14 +285,14 @@ static int page_referenced_one(struct page *page,
280 if (address == -EFAULT) 285 if (address == -EFAULT)
281 goto out; 286 goto out;
282 287
283 pte = page_check_address(page, mm, address, &ptl); 288 pte = page_check_address(page, mm, address, &ptl, 0);
284 if (!pte) 289 if (!pte)
285 goto out; 290 goto out;
286 291
287 if (vma->vm_flags & VM_LOCKED) { 292 if (vma->vm_flags & VM_LOCKED) {
288 referenced++; 293 referenced++;
289 *mapcount = 1; /* break early from loop */ 294 *mapcount = 1; /* break early from loop */
290 } else if (ptep_clear_flush_young(vma, address, pte)) 295 } else if (ptep_clear_flush_young_notify(vma, address, pte))
291 referenced++; 296 referenced++;
292 297
293 /* Pretend the page is referenced if the task has the 298 /* Pretend the page is referenced if the task has the
@@ -421,7 +426,7 @@ int page_referenced(struct page *page, int is_locked,
421 referenced += page_referenced_anon(page, mem_cont); 426 referenced += page_referenced_anon(page, mem_cont);
422 else if (is_locked) 427 else if (is_locked)
423 referenced += page_referenced_file(page, mem_cont); 428 referenced += page_referenced_file(page, mem_cont);
424 else if (TestSetPageLocked(page)) 429 else if (!trylock_page(page))
425 referenced++; 430 referenced++;
426 else { 431 else {
427 if (page->mapping) 432 if (page->mapping)
@@ -449,7 +454,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
449 if (address == -EFAULT) 454 if (address == -EFAULT)
450 goto out; 455 goto out;
451 456
452 pte = page_check_address(page, mm, address, &ptl); 457 pte = page_check_address(page, mm, address, &ptl, 1);
453 if (!pte) 458 if (!pte)
454 goto out; 459 goto out;
455 460
@@ -457,7 +462,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
457 pte_t entry; 462 pte_t entry;
458 463
459 flush_cache_page(vma, address, pte_pfn(*pte)); 464 flush_cache_page(vma, address, pte_pfn(*pte));
460 entry = ptep_clear_flush(vma, address, pte); 465 entry = ptep_clear_flush_notify(vma, address, pte);
461 entry = pte_wrprotect(entry); 466 entry = pte_wrprotect(entry);
462 entry = pte_mkclean(entry); 467 entry = pte_mkclean(entry);
463 set_pte_at(mm, address, pte, entry); 468 set_pte_at(mm, address, pte, entry);
@@ -658,6 +663,22 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
658 } 663 }
659 664
660 /* 665 /*
666 * Now that the last pte has gone, s390 must transfer dirty
 667 * flag from storage key to struct page. We can usually skip
 668 * this if the page is anon and so about to be freed; but perhaps
 669 * not if it's in swapcache - there might be another pte slot
 670 * containing the swap entry, but the page not yet written to swap.
671 */
672 if ((!PageAnon(page) || PageSwapCache(page)) &&
673 page_test_dirty(page)) {
674 page_clear_dirty(page);
675 set_page_dirty(page);
676 }
677
678 mem_cgroup_uncharge_page(page);
679 __dec_zone_page_state(page,
680 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
681 /*
661 * It would be tidy to reset the PageAnon mapping here, 682 * It would be tidy to reset the PageAnon mapping here,
662 * but that might overwrite a racing page_add_anon_rmap 683 * but that might overwrite a racing page_add_anon_rmap
663 * which increments mapcount after us but sets mapping 684 * which increments mapcount after us but sets mapping
@@ -666,14 +687,6 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
666 * Leaving it set also helps swapoff to reinstate ptes 687 * Leaving it set also helps swapoff to reinstate ptes
667 * faster for those pages still in swapcache. 688 * faster for those pages still in swapcache.
668 */ 689 */
669 if (page_test_dirty(page)) {
670 page_clear_dirty(page);
671 set_page_dirty(page);
672 }
673 mem_cgroup_uncharge_page(page);
674
675 __dec_zone_page_state(page,
676 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
677 } 690 }
678} 691}
679 692
@@ -695,7 +708,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
695 if (address == -EFAULT) 708 if (address == -EFAULT)
696 goto out; 709 goto out;
697 710
698 pte = page_check_address(page, mm, address, &ptl); 711 pte = page_check_address(page, mm, address, &ptl, 0);
699 if (!pte) 712 if (!pte)
700 goto out; 713 goto out;
701 714
@@ -705,14 +718,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
705 * skipped over this mm) then we should reactivate it. 718 * skipped over this mm) then we should reactivate it.
706 */ 719 */
707 if (!migration && ((vma->vm_flags & VM_LOCKED) || 720 if (!migration && ((vma->vm_flags & VM_LOCKED) ||
708 (ptep_clear_flush_young(vma, address, pte)))) { 721 (ptep_clear_flush_young_notify(vma, address, pte)))) {
709 ret = SWAP_FAIL; 722 ret = SWAP_FAIL;
710 goto out_unmap; 723 goto out_unmap;
711 } 724 }
712 725
713 /* Nuke the page table entry. */ 726 /* Nuke the page table entry. */
714 flush_cache_page(vma, address, page_to_pfn(page)); 727 flush_cache_page(vma, address, page_to_pfn(page));
715 pteval = ptep_clear_flush(vma, address, pte); 728 pteval = ptep_clear_flush_notify(vma, address, pte);
716 729
717 /* Move the dirty bit to the physical page now the pte is gone. */ 730 /* Move the dirty bit to the physical page now the pte is gone. */
718 if (pte_dirty(pteval)) 731 if (pte_dirty(pteval))
@@ -837,12 +850,12 @@ static void try_to_unmap_cluster(unsigned long cursor,
837 page = vm_normal_page(vma, address, *pte); 850 page = vm_normal_page(vma, address, *pte);
838 BUG_ON(!page || PageAnon(page)); 851 BUG_ON(!page || PageAnon(page));
839 852
840 if (ptep_clear_flush_young(vma, address, pte)) 853 if (ptep_clear_flush_young_notify(vma, address, pte))
841 continue; 854 continue;
842 855
843 /* Nuke the page table entry. */ 856 /* Nuke the page table entry. */
844 flush_cache_page(vma, address, pte_pfn(*pte)); 857 flush_cache_page(vma, address, pte_pfn(*pte));
845 pteval = ptep_clear_flush(vma, address, pte); 858 pteval = ptep_clear_flush_notify(vma, address, pte);
846 859
847 /* If nonlinear, store the file page offset in the pte. */ 860 /* If nonlinear, store the file page offset in the pte. */
848 if (page->index != linear_page_index(vma, address)) 861 if (page->index != linear_page_index(vma, address))
diff --git a/mm/shmem.c b/mm/shmem.c
index f92fea94d037..04fb4f1ab88e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -936,7 +936,7 @@ found:
936 spin_lock(&info->lock); 936 spin_lock(&info->lock);
937 ptr = shmem_swp_entry(info, idx, NULL); 937 ptr = shmem_swp_entry(info, idx, NULL);
938 if (ptr && ptr->val == entry.val) { 938 if (ptr && ptr->val == entry.val) {
939 error = add_to_page_cache(page, inode->i_mapping, 939 error = add_to_page_cache_locked(page, inode->i_mapping,
940 idx, GFP_NOWAIT); 940 idx, GFP_NOWAIT);
941 /* does mem_cgroup_uncharge_cache_page on error */ 941 /* does mem_cgroup_uncharge_cache_page on error */
942 } else /* we must compensate for our precharge above */ 942 } else /* we must compensate for our precharge above */
@@ -1265,7 +1265,7 @@ repeat:
1265 } 1265 }
1266 1266
1267 /* We have to do this with page locked to prevent races */ 1267 /* We have to do this with page locked to prevent races */
1268 if (TestSetPageLocked(swappage)) { 1268 if (!trylock_page(swappage)) {
1269 shmem_swp_unmap(entry); 1269 shmem_swp_unmap(entry);
1270 spin_unlock(&info->lock); 1270 spin_unlock(&info->lock);
1271 wait_on_page_locked(swappage); 1271 wait_on_page_locked(swappage);
@@ -1301,8 +1301,8 @@ repeat:
1301 SetPageUptodate(filepage); 1301 SetPageUptodate(filepage);
1302 set_page_dirty(filepage); 1302 set_page_dirty(filepage);
1303 swap_free(swap); 1303 swap_free(swap);
1304 } else if (!(error = add_to_page_cache( 1304 } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1305 swappage, mapping, idx, GFP_NOWAIT))) { 1305 idx, GFP_NOWAIT))) {
1306 info->flags |= SHMEM_PAGEIN; 1306 info->flags |= SHMEM_PAGEIN;
1307 shmem_swp_set(info, entry, 0); 1307 shmem_swp_set(info, entry, 0);
1308 shmem_swp_unmap(entry); 1308 shmem_swp_unmap(entry);
@@ -1329,7 +1329,7 @@ repeat:
1329 shmem_swp_unmap(entry); 1329 shmem_swp_unmap(entry);
1330 filepage = find_get_page(mapping, idx); 1330 filepage = find_get_page(mapping, idx);
1331 if (filepage && 1331 if (filepage &&
1332 (!PageUptodate(filepage) || TestSetPageLocked(filepage))) { 1332 (!PageUptodate(filepage) || !trylock_page(filepage))) {
1333 spin_unlock(&info->lock); 1333 spin_unlock(&info->lock);
1334 wait_on_page_locked(filepage); 1334 wait_on_page_locked(filepage);
1335 page_cache_release(filepage); 1335 page_cache_release(filepage);
@@ -1513,7 +1513,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1513 inode->i_uid = current->fsuid; 1513 inode->i_uid = current->fsuid;
1514 inode->i_gid = current->fsgid; 1514 inode->i_gid = current->fsgid;
1515 inode->i_blocks = 0; 1515 inode->i_blocks = 0;
1516 inode->i_mapping->a_ops = &shmem_aops;
1517 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1516 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1518 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1517 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1519 inode->i_generation = get_seconds(); 1518 inode->i_generation = get_seconds();
@@ -1528,6 +1527,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1528 init_special_inode(inode, mode, dev); 1527 init_special_inode(inode, mode, dev);
1529 break; 1528 break;
1530 case S_IFREG: 1529 case S_IFREG:
1530 inode->i_mapping->a_ops = &shmem_aops;
1531 inode->i_op = &shmem_inode_operations; 1531 inode->i_op = &shmem_inode_operations;
1532 inode->i_fop = &shmem_file_operations; 1532 inode->i_fop = &shmem_file_operations;
1533 mpol_shared_policy_init(&info->policy, 1533 mpol_shared_policy_init(&info->policy,
@@ -1929,6 +1929,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1929 return error; 1929 return error;
1930 } 1930 }
1931 unlock_page(page); 1931 unlock_page(page);
1932 inode->i_mapping->a_ops = &shmem_aops;
1932 inode->i_op = &shmem_symlink_inode_operations; 1933 inode->i_op = &shmem_symlink_inode_operations;
1933 kaddr = kmap_atomic(page, KM_USER0); 1934 kaddr = kmap_atomic(page, KM_USER0);
1934 memcpy(kaddr, symname, len); 1935 memcpy(kaddr, symname, len);
@@ -2352,7 +2353,7 @@ static void shmem_destroy_inode(struct inode *inode)
2352 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2353 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2353} 2354}
2354 2355
2355static void init_once(struct kmem_cache *cachep, void *foo) 2356static void init_once(void *foo)
2356{ 2357{
2357 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2358 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2358 2359
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index f5664c5b9eb1..8e5aadd7dcd6 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -191,7 +191,7 @@ shmem_check_acl(struct inode *inode, int mask)
191 * shmem_permission - permission() inode operation 191 * shmem_permission - permission() inode operation
192 */ 192 */
193int 193int
194shmem_permission(struct inode *inode, int mask, struct nameidata *nd) 194shmem_permission(struct inode *inode, int mask)
195{ 195{
196 return generic_permission(inode, mask, shmem_check_acl); 196 return generic_permission(inode, mask, shmem_check_acl);
197} 197}
diff --git a/mm/slab.c b/mm/slab.c
index 052e7d64537e..e76eee466886 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -406,7 +406,7 @@ struct kmem_cache {
406 unsigned int dflags; /* dynamic flags */ 406 unsigned int dflags; /* dynamic flags */
407 407
408 /* constructor func */ 408 /* constructor func */
409 void (*ctor)(struct kmem_cache *, void *); 409 void (*ctor)(void *obj);
410 410
411/* 5) cache creation/removal */ 411/* 5) cache creation/removal */
412 const char *name; 412 const char *name;
@@ -2137,8 +2137,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2137 */ 2137 */
2138struct kmem_cache * 2138struct kmem_cache *
2139kmem_cache_create (const char *name, size_t size, size_t align, 2139kmem_cache_create (const char *name, size_t size, size_t align,
2140 unsigned long flags, 2140 unsigned long flags, void (*ctor)(void *))
2141 void (*ctor)(struct kmem_cache *, void *))
2142{ 2141{
2143 size_t left_over, slab_size, ralign; 2142 size_t left_over, slab_size, ralign;
2144 struct kmem_cache *cachep = NULL, *pc; 2143 struct kmem_cache *cachep = NULL, *pc;
@@ -2653,7 +2652,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2653 * They must also be threaded. 2652 * They must also be threaded.
2654 */ 2653 */
2655 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2654 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2656 cachep->ctor(cachep, objp + obj_offset(cachep)); 2655 cachep->ctor(objp + obj_offset(cachep));
2657 2656
2658 if (cachep->flags & SLAB_RED_ZONE) { 2657 if (cachep->flags & SLAB_RED_ZONE) {
2659 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2658 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2669,7 +2668,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2669 cachep->buffer_size / PAGE_SIZE, 0); 2668 cachep->buffer_size / PAGE_SIZE, 0);
2670#else 2669#else
2671 if (cachep->ctor) 2670 if (cachep->ctor)
2672 cachep->ctor(cachep, objp); 2671 cachep->ctor(objp);
2673#endif 2672#endif
2674 slab_bufctl(slabp)[i] = i + 1; 2673 slab_bufctl(slabp)[i] = i + 1;
2675 } 2674 }
@@ -3093,7 +3092,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3093#endif 3092#endif
3094 objp += obj_offset(cachep); 3093 objp += obj_offset(cachep);
3095 if (cachep->ctor && cachep->flags & SLAB_POISON) 3094 if (cachep->ctor && cachep->flags & SLAB_POISON)
3096 cachep->ctor(cachep, objp); 3095 cachep->ctor(objp);
3097#if ARCH_SLAB_MINALIGN 3096#if ARCH_SLAB_MINALIGN
3098 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3097 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3099 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3098 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
@@ -4473,4 +4472,3 @@ size_t ksize(const void *objp)
4473 4472
4474 return obj_size(virt_to_cache(objp)); 4473 return obj_size(virt_to_cache(objp));
4475} 4474}
4476EXPORT_SYMBOL(ksize);
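With this series, slab constructors receive only the object pointer; the struct kmem_cache * argument is gone from every ctor in this diff. A minimal sketch of a cache set up against the new prototype (the struct, names, and flags below are illustrative, not from this patch):

	#include <linux/init.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>

	struct foo {
		spinlock_t lock;
		int refcnt;
	};

	static void foo_ctor(void *obj)		/* one argument only now */
	{
		struct foo *f = obj;

		spin_lock_init(&f->lock);
		f->refcnt = 0;
	}

	static struct kmem_cache *foo_cachep;

	static int __init foo_cache_init(void)
	{
		/* signature as changed by this patch */
		foo_cachep = kmem_cache_create("foo", sizeof(struct foo), 0,
					       SLAB_HWCACHE_ALIGN, foo_ctor);
		return foo_cachep ? 0 : -ENOMEM;
	}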
diff --git a/mm/slob.c b/mm/slob.c
index de268eb7ac70..cb675d126791 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -514,23 +514,23 @@ size_t ksize(const void *block)
514 return 0; 514 return 0;
515 515
516 sp = (struct slob_page *)virt_to_page(block); 516 sp = (struct slob_page *)virt_to_page(block);
517 if (slob_page(sp)) 517 if (slob_page(sp)) {
518 return ((slob_t *)block - 1)->units + SLOB_UNIT; 518 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
519 else 519 unsigned int *m = (unsigned int *)(block - align);
520 return SLOB_UNITS(*m) * SLOB_UNIT;
521 } else
520 return sp->page.private; 522 return sp->page.private;
521} 523}
522EXPORT_SYMBOL(ksize);
523 524
524struct kmem_cache { 525struct kmem_cache {
525 unsigned int size, align; 526 unsigned int size, align;
526 unsigned long flags; 527 unsigned long flags;
527 const char *name; 528 const char *name;
528 void (*ctor)(struct kmem_cache *, void *); 529 void (*ctor)(void *);
529}; 530};
530 531
531struct kmem_cache *kmem_cache_create(const char *name, size_t size, 532struct kmem_cache *kmem_cache_create(const char *name, size_t size,
532 size_t align, unsigned long flags, 533 size_t align, unsigned long flags, void (*ctor)(void *))
533 void (*ctor)(struct kmem_cache *, void *))
534{ 534{
535 struct kmem_cache *c; 535 struct kmem_cache *c;
536 536
@@ -575,7 +575,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
575 b = slob_new_page(flags, get_order(c->size), node); 575 b = slob_new_page(flags, get_order(c->size), node);
576 576
577 if (c->ctor) 577 if (c->ctor)
578 c->ctor(c, b); 578 c->ctor(b);
579 579
580 return b; 580 return b;
581} 581}
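The new SLOB ksize() works because slob's kmalloc() stores the requested size in a word just before the pointer it hands out, 'align' bytes back. A toy userspace model of that layout (illustrative only; real SLOB stores the byte size and ksize() rounds it up through SLOB_UNITS):

	#include <stdlib.h>
	#include <string.h>

	#define ALIGN_SZ 8	/* stands in for max(ARCH_KMALLOC_MINALIGN,
				 * ARCH_SLAB_MINALIGN) */

	static void *toy_kmalloc(size_t size)
	{
		char *m = malloc(size + ALIGN_SZ);

		if (!m)
			return NULL;
		memcpy(m, &size, sizeof(size));	/* size prefix, as SLOB does */
		return m + ALIGN_SZ;
	}

	static size_t toy_ksize(const void *block)
	{
		size_t size;

		memcpy(&size, (const char *)block - ALIGN_SZ, sizeof(size));
		return size;
	}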
diff --git a/mm/slub.c b/mm/slub.c
index 77c21cf53ff9..0c83e6afe7b2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1012,7 +1012,7 @@ __setup("slub_debug", setup_slub_debug);
1012 1012
1013static unsigned long kmem_cache_flags(unsigned long objsize, 1013static unsigned long kmem_cache_flags(unsigned long objsize,
1014 unsigned long flags, const char *name, 1014 unsigned long flags, const char *name,
1015 void (*ctor)(struct kmem_cache *, void *)) 1015 void (*ctor)(void *))
1016{ 1016{
1017 /* 1017 /*
1018 * Enable debugging if selected on the kernel commandline. 1018 * Enable debugging if selected on the kernel commandline.
@@ -1040,7 +1040,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1040static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1040static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1041static inline unsigned long kmem_cache_flags(unsigned long objsize, 1041static inline unsigned long kmem_cache_flags(unsigned long objsize,
1042 unsigned long flags, const char *name, 1042 unsigned long flags, const char *name,
1043 void (*ctor)(struct kmem_cache *, void *)) 1043 void (*ctor)(void *))
1044{ 1044{
1045 return flags; 1045 return flags;
1046} 1046}
@@ -1103,7 +1103,7 @@ static void setup_object(struct kmem_cache *s, struct page *page,
1103{ 1103{
1104 setup_object_debug(s, page, object); 1104 setup_object_debug(s, page, object);
1105 if (unlikely(s->ctor)) 1105 if (unlikely(s->ctor))
1106 s->ctor(s, object); 1106 s->ctor(object);
1107} 1107}
1108 1108
1109static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1109static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1329,7 +1329,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1329 n = get_node(s, zone_to_nid(zone)); 1329 n = get_node(s, zone_to_nid(zone));
1330 1330
1331 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1331 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1332 n->nr_partial > MIN_PARTIAL) { 1332 n->nr_partial > n->min_partial) {
1333 page = get_partial_node(n); 1333 page = get_partial_node(n);
1334 if (page) 1334 if (page)
1335 return page; 1335 return page;
@@ -1381,7 +1381,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1381 slab_unlock(page); 1381 slab_unlock(page);
1382 } else { 1382 } else {
1383 stat(c, DEACTIVATE_EMPTY); 1383 stat(c, DEACTIVATE_EMPTY);
1384 if (n->nr_partial < MIN_PARTIAL) { 1384 if (n->nr_partial < n->min_partial) {
1385 /* 1385 /*
1386 * Adding an empty slab to the partial slabs in order 1386 * Adding an empty slab to the partial slabs in order
1387 * to avoid page allocator overhead. This slab needs 1387 * to avoid page allocator overhead. This slab needs
@@ -1913,13 +1913,26 @@ static void init_kmem_cache_cpu(struct kmem_cache *s,
1913#endif 1913#endif
1914} 1914}
1915 1915
1916static void init_kmem_cache_node(struct kmem_cache_node *n) 1916static void
1917init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
1917{ 1918{
1918 n->nr_partial = 0; 1919 n->nr_partial = 0;
1920
1921 /*
1922 * The larger the object size is, the more pages we want on the partial
1923 * list to avoid pounding the page allocator excessively.
1924 */
1925 n->min_partial = ilog2(s->size);
1926 if (n->min_partial < MIN_PARTIAL)
1927 n->min_partial = MIN_PARTIAL;
1928 else if (n->min_partial > MAX_PARTIAL)
1929 n->min_partial = MAX_PARTIAL;
1930
1919 spin_lock_init(&n->list_lock); 1931 spin_lock_init(&n->list_lock);
1920 INIT_LIST_HEAD(&n->partial); 1932 INIT_LIST_HEAD(&n->partial);
1921#ifdef CONFIG_SLUB_DEBUG 1933#ifdef CONFIG_SLUB_DEBUG
1922 atomic_long_set(&n->nr_slabs, 0); 1934 atomic_long_set(&n->nr_slabs, 0);
1935 atomic_long_set(&n->total_objects, 0);
1923 INIT_LIST_HEAD(&n->full); 1936 INIT_LIST_HEAD(&n->full);
1924#endif 1937#endif
1925} 1938}
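The partial-list floor stops being one global constant here: it scales as ilog2() of the object size, clamped between MIN_PARTIAL and MAX_PARTIAL (5 and 10 in slub.c of this vintage; treat those values as an assumption). The same arithmetic as a standalone helper, with the clamp worked through:

	#include <linux/log2.h>

	#define MIN_PARTIAL 5	/* assumed, per contemporary slub.c */
	#define MAX_PARTIAL 10

	static unsigned long min_partial_for(unsigned long object_size)
	{
		unsigned long n = ilog2(object_size);

		if (n < MIN_PARTIAL)
			n = MIN_PARTIAL;	/* 16-byte objects: ilog2 = 4, floored to 5 */
		else if (n > MAX_PARTIAL)
			n = MAX_PARTIAL;	/* 4096-byte objects: ilog2 = 12, capped at 10 */
		return n;			/* 256-byte objects: ilog2 = 8, used as-is */
	}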
@@ -2087,7 +2100,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
2087 init_object(kmalloc_caches, n, 1); 2100 init_object(kmalloc_caches, n, 1);
2088 init_tracking(kmalloc_caches, n); 2101 init_tracking(kmalloc_caches, n);
2089#endif 2102#endif
2090 init_kmem_cache_node(n); 2103 init_kmem_cache_node(n, kmalloc_caches);
2091 inc_slabs_node(kmalloc_caches, node, page->objects); 2104 inc_slabs_node(kmalloc_caches, node, page->objects);
2092 2105
2093 /* 2106 /*
@@ -2144,7 +2157,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2144 2157
2145 } 2158 }
2146 s->node[node] = n; 2159 s->node[node] = n;
2147 init_kmem_cache_node(n); 2160 init_kmem_cache_node(n, s);
2148 } 2161 }
2149 return 1; 2162 return 1;
2150} 2163}
@@ -2155,7 +2168,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
2155 2168
2156static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 2169static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2157{ 2170{
2158 init_kmem_cache_node(&s->local_node); 2171 init_kmem_cache_node(&s->local_node, s);
2159 return 1; 2172 return 1;
2160} 2173}
2161#endif 2174#endif
@@ -2286,7 +2299,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2286static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2299static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2287 const char *name, size_t size, 2300 const char *name, size_t size,
2288 size_t align, unsigned long flags, 2301 size_t align, unsigned long flags,
2289 void (*ctor)(struct kmem_cache *, void *)) 2302 void (*ctor)(void *))
2290{ 2303{
2291 memset(s, 0, kmem_size); 2304 memset(s, 0, kmem_size);
2292 s->name = name; 2305 s->name = name;
@@ -2300,7 +2313,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2300 2313
2301 s->refcount = 1; 2314 s->refcount = 1;
2302#ifdef CONFIG_NUMA 2315#ifdef CONFIG_NUMA
2303 s->remote_node_defrag_ratio = 100; 2316 s->remote_node_defrag_ratio = 1000;
2304#endif 2317#endif
2305 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 2318 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
2306 goto error; 2319 goto error;
@@ -2715,7 +2728,6 @@ size_t ksize(const void *object)
2715 */ 2728 */
2716 return s->size; 2729 return s->size;
2717} 2730}
2718EXPORT_SYMBOL(ksize);
2719 2731
2720void kfree(const void *x) 2732void kfree(const void *x)
2721{ 2733{
@@ -2890,7 +2902,7 @@ static int slab_mem_going_online_callback(void *arg)
2890 ret = -ENOMEM; 2902 ret = -ENOMEM;
2891 goto out; 2903 goto out;
2892 } 2904 }
2893 init_kmem_cache_node(n); 2905 init_kmem_cache_node(n, s);
2894 s->node[nid] = n; 2906 s->node[nid] = n;
2895 } 2907 }
2896out: 2908out:
@@ -3042,7 +3054,7 @@ static int slab_unmergeable(struct kmem_cache *s)
3042 3054
3043static struct kmem_cache *find_mergeable(size_t size, 3055static struct kmem_cache *find_mergeable(size_t size,
3044 size_t align, unsigned long flags, const char *name, 3056 size_t align, unsigned long flags, const char *name,
3045 void (*ctor)(struct kmem_cache *, void *)) 3057 void (*ctor)(void *))
3046{ 3058{
3047 struct kmem_cache *s; 3059 struct kmem_cache *s;
3048 3060
@@ -3082,8 +3094,7 @@ static struct kmem_cache *find_mergeable(size_t size,
3082} 3094}
3083 3095
3084struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3096struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3085 size_t align, unsigned long flags, 3097 size_t align, unsigned long flags, void (*ctor)(void *))
3086 void (*ctor)(struct kmem_cache *, void *))
3087{ 3098{
3088 struct kmem_cache *s; 3099 struct kmem_cache *s;
3089 3100
@@ -4048,7 +4059,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
4048 if (err) 4059 if (err)
4049 return err; 4060 return err;
4050 4061
4051 if (ratio < 100) 4062 if (ratio <= 100)
4052 s->remote_node_defrag_ratio = ratio * 10; 4063 s->remote_node_defrag_ratio = ratio * 10;
4053 4064
4054 return length; 4065 return length;
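Two changes in this file move remote_node_defrag_ratio from percent to tenths of a percent: the default becomes 1000 (always try remote nodes) and the sysfs store now accepts 0..100 and scales by 10. A hedged model of how the stored value gates cross-node partial-slab stealing, patterned on the existing check in get_any_partial() (not the verbatim kernel code):

	#include <asm/timex.h>	/* get_cycles() */

	/* ratio_tenths is 0..1000 after this patch; comparing it against a
	 * roughly uniform 0..1023 sample approximates probability
	 * ratio_tenths/1000 of allowing a remote-node allocation */
	static int may_defrag_remote(unsigned int ratio_tenths)
	{
		return ratio_tenths && get_cycles() % 1024 <= ratio_tenths;
	}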
diff --git a/mm/sparse.c b/mm/sparse.c
index 8ffc08990008..39db301b920d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -12,7 +12,6 @@
12#include <asm/dma.h> 12#include <asm/dma.h>
13#include <asm/pgalloc.h> 13#include <asm/pgalloc.h>
14#include <asm/pgtable.h> 14#include <asm/pgtable.h>
15#include "internal.h"
16 15
17/* 16/*
18 * Permanent SPARSEMEM data: 17 * Permanent SPARSEMEM data:
@@ -377,7 +376,7 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
377} 376}
378#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 377#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
379 378
380struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 379static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
381{ 380{
382 struct page *map; 381 struct page *map;
383 struct mem_section *ms = __nr_to_section(pnum); 382 struct mem_section *ms = __nr_to_section(pnum);
diff --git a/mm/swap.c b/mm/swap.c
index dd89234ee51f..9e0cb3118079 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -278,9 +278,10 @@ int lru_add_drain_all(void)
278 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it 278 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
279 * for the remainder of the operation. 279 * for the remainder of the operation.
280 * 280 *
281 * The locking in this function is against shrink_cache(): we recheck the 281 * The locking in this function is against shrink_inactive_list(): we recheck
282 * page count inside the lock to see whether shrink_cache grabbed the page 282 * the page count inside the lock to see whether shrink_inactive_list()
283 * via the LRU. If it did, give up: shrink_cache will free it. 283 * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
284 * will free it.
284 */ 285 */
285void release_pages(struct page **pages, int nr, int cold) 286void release_pages(struct page **pages, int nr, int cold)
286{ 287{
@@ -443,7 +444,7 @@ void pagevec_strip(struct pagevec *pvec)
443 for (i = 0; i < pagevec_count(pvec); i++) { 444 for (i = 0; i < pagevec_count(pvec); i++) {
444 struct page *page = pvec->pages[i]; 445 struct page *page = pvec->pages[i];
445 446
446 if (PagePrivate(page) && !TestSetPageLocked(page)) { 447 if (PagePrivate(page) && trylock_page(page)) {
447 if (PagePrivate(page)) 448 if (PagePrivate(page))
448 try_to_release_page(page, 0); 449 try_to_release_page(page, 0);
449 unlock_page(page); 450 unlock_page(page);
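trylock_page() returns non-zero when the lock was acquired, the exact inverse of the old TestSetPageLocked(), so every converted call site in this series also flips its test. The lock-or-skip idiom after the conversion (sketch):

	#include <linux/pagemap.h>

	static int lock_or_skip(struct page *page)
	{
		if (!trylock_page(page))	/* old: if (TestSetPageLocked(page)) */
			return 0;		/* somebody else holds the lock */
		/* ... operate on the locked page ... */
		unlock_page(page);
		return 1;
	}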
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d8aadaf2a0ba..797c3831cbec 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -39,7 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = {
39 39
40struct address_space swapper_space = { 40struct address_space swapper_space = {
41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
42 .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), 42 .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
43 .a_ops = &swap_aops, 43 .a_ops = &swap_aops,
44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), 44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
45 .backing_dev_info = &swap_backing_dev_info, 45 .backing_dev_info = &swap_backing_dev_info,
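With the lockless pagecache, lookups stop taking the mapping lock altogether (they use rcu_read_lock() plus a speculative page reference, added by the companion patches), so tree_lock no longer needs a reader side and becomes a plain spinlock; every write_lock_irq(&...tree_lock) in this series turns into spin_lock_irq(). Code that still needs the lock now takes it the same way whether it reads or writes the tree (sketch, not from this patch):

	#include <linux/pagemap.h>
	#include <linux/radix-tree.h>

	static struct page *lookup_get(struct address_space *mapping, pgoff_t idx)
	{
		struct page *page;

		spin_lock_irq(&mapping->tree_lock);	/* was write_lock_irq() */
		page = radix_tree_lookup(&mapping->page_tree, idx);
		if (page)
			page_cache_get(page);
		spin_unlock_irq(&mapping->tree_lock);
		return page;
	}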
@@ -56,15 +56,16 @@ static struct {
56 56
57void show_swap_cache_info(void) 57void show_swap_cache_info(void)
58{ 58{
59 printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n", 59 printk("%lu pages in swap cache\n", total_swapcache_pages);
60 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
60 swap_cache_info.add_total, swap_cache_info.del_total, 61 swap_cache_info.add_total, swap_cache_info.del_total,
61 swap_cache_info.find_success, swap_cache_info.find_total); 62 swap_cache_info.find_success, swap_cache_info.find_total);
62 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); 63 printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10));
63 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); 64 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
64} 65}
65 66
66/* 67/*
67 * add_to_swap_cache resembles add_to_page_cache on swapper_space, 68 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
68 * but sets SwapCache flag and private instead of mapping and index. 69 * but sets SwapCache flag and private instead of mapping and index.
69 */ 70 */
70int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) 71int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
@@ -76,19 +77,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
76 BUG_ON(PagePrivate(page)); 77 BUG_ON(PagePrivate(page));
77 error = radix_tree_preload(gfp_mask); 78 error = radix_tree_preload(gfp_mask);
78 if (!error) { 79 if (!error) {
79 write_lock_irq(&swapper_space.tree_lock); 80 page_cache_get(page);
81 SetPageSwapCache(page);
82 set_page_private(page, entry.val);
83
84 spin_lock_irq(&swapper_space.tree_lock);
80 error = radix_tree_insert(&swapper_space.page_tree, 85 error = radix_tree_insert(&swapper_space.page_tree,
81 entry.val, page); 86 entry.val, page);
82 if (!error) { 87 if (likely(!error)) {
83 page_cache_get(page);
84 SetPageSwapCache(page);
85 set_page_private(page, entry.val);
86 total_swapcache_pages++; 88 total_swapcache_pages++;
87 __inc_zone_page_state(page, NR_FILE_PAGES); 89 __inc_zone_page_state(page, NR_FILE_PAGES);
88 INC_CACHE_INFO(add_total); 90 INC_CACHE_INFO(add_total);
89 } 91 }
90 write_unlock_irq(&swapper_space.tree_lock); 92 spin_unlock_irq(&swapper_space.tree_lock);
91 radix_tree_preload_end(); 93 radix_tree_preload_end();
94
95 if (unlikely(error)) {
96 set_page_private(page, 0UL);
97 ClearPageSwapCache(page);
98 page_cache_release(page);
99 }
92 } 100 }
93 return error; 101 return error;
94} 102}
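The reordering matters once lookups are lockless: the instant the radix-tree insert succeeds, a concurrent lookup may find the page without taking tree_lock, so its extra reference, SwapCache flag, and private field must already be in place; on failure they are unwound outside the lock. The shape of that publish-with-rollback pattern (sketch; accounting elided, helper names hypothetical):

	#include <linux/pagemap.h>
	#include <linux/swap.h>

	static void prepare(struct page *page, unsigned long val)
	{
		page_cache_get(page);		/* pin before the page is visible */
		SetPageSwapCache(page);
		set_page_private(page, val);
	}

	static void unwind(struct page *page)
	{
		set_page_private(page, 0UL);
		ClearPageSwapCache(page);
		page_cache_release(page);
	}

	static int publish(struct page *page, swp_entry_t entry)
	{
		int error;

		prepare(page, entry.val);
		spin_lock_irq(&swapper_space.tree_lock);
		error = radix_tree_insert(&swapper_space.page_tree,
					  entry.val, page);
		spin_unlock_irq(&swapper_space.tree_lock);
		if (unlikely(error))
			unwind(page);		/* never became visible */
		return error;
	}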
@@ -175,9 +183,9 @@ void delete_from_swap_cache(struct page *page)
175 183
176 entry.val = page_private(page); 184 entry.val = page_private(page);
177 185
178 write_lock_irq(&swapper_space.tree_lock); 186 spin_lock_irq(&swapper_space.tree_lock);
179 __delete_from_swap_cache(page); 187 __delete_from_swap_cache(page);
180 write_unlock_irq(&swapper_space.tree_lock); 188 spin_unlock_irq(&swapper_space.tree_lock);
181 189
182 swap_free(entry); 190 swap_free(entry);
183 page_cache_release(page); 191 page_cache_release(page);
@@ -193,7 +201,7 @@ void delete_from_swap_cache(struct page *page)
193 */ 201 */
194static inline void free_swap_cache(struct page *page) 202static inline void free_swap_cache(struct page *page)
195{ 203{
196 if (PageSwapCache(page) && !TestSetPageLocked(page)) { 204 if (PageSwapCache(page) && trylock_page(page)) {
197 remove_exclusive_swap_page(page); 205 remove_exclusive_swap_page(page);
198 unlock_page(page); 206 unlock_page(page);
199 } 207 }
@@ -294,9 +302,9 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
294 * re-using the just freed swap entry for an existing page. 302 * re-using the just freed swap entry for an existing page.
295 * May fail (-ENOMEM) if radix-tree node allocation failed. 303 * May fail (-ENOMEM) if radix-tree node allocation failed.
296 */ 304 */
297 SetPageLocked(new_page); 305 set_page_locked(new_page);
298 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); 306 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
299 if (!err) { 307 if (likely(!err)) {
300 /* 308 /*
301 * Initiate read into locked page and return. 309 * Initiate read into locked page and return.
302 */ 310 */
@@ -304,7 +312,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
304 swap_readpage(NULL, new_page); 312 swap_readpage(NULL, new_page);
305 return new_page; 313 return new_page;
306 } 314 }
307 ClearPageLocked(new_page); 315 clear_page_locked(new_page);
308 swap_free(entry); 316 swap_free(entry);
309 } while (err != -ENOMEM); 317 } while (err != -ENOMEM);
310 318
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2f33edb8bee9..1e330f2998fa 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,8 +33,8 @@
33#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
34#include <linux/swapops.h> 34#include <linux/swapops.h>
35 35
36DEFINE_SPINLOCK(swap_lock); 36static DEFINE_SPINLOCK(swap_lock);
37unsigned int nr_swapfiles; 37static unsigned int nr_swapfiles;
38long total_swap_pages; 38long total_swap_pages;
39static int swap_overflow; 39static int swap_overflow;
40static int least_priority; 40static int least_priority;
@@ -44,7 +44,7 @@ static const char Unused_file[] = "Unused swap file entry ";
44static const char Bad_offset[] = "Bad swap offset entry "; 44static const char Bad_offset[] = "Bad swap offset entry ";
45static const char Unused_offset[] = "Unused swap offset entry "; 45static const char Unused_offset[] = "Unused swap offset entry ";
46 46
47struct swap_list_t swap_list = {-1, -1}; 47static struct swap_list_t swap_list = {-1, -1};
48 48
49static struct swap_info_struct swap_info[MAX_SWAPFILES]; 49static struct swap_info_struct swap_info[MAX_SWAPFILES];
50 50
@@ -369,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page)
369 retval = 0; 369 retval = 0;
370 if (p->swap_map[swp_offset(entry)] == 1) { 370 if (p->swap_map[swp_offset(entry)] == 1) {
371 /* Recheck the page count with the swapcache lock held.. */ 371 /* Recheck the page count with the swapcache lock held.. */
372 write_lock_irq(&swapper_space.tree_lock); 372 spin_lock_irq(&swapper_space.tree_lock);
373 if ((page_count(page) == 2) && !PageWriteback(page)) { 373 if ((page_count(page) == 2) && !PageWriteback(page)) {
374 __delete_from_swap_cache(page); 374 __delete_from_swap_cache(page);
375 SetPageDirty(page); 375 SetPageDirty(page);
376 retval = 1; 376 retval = 1;
377 } 377 }
378 write_unlock_irq(&swapper_space.tree_lock); 378 spin_unlock_irq(&swapper_space.tree_lock);
379 } 379 }
380 spin_unlock(&swap_lock); 380 spin_unlock(&swap_lock);
381 381
@@ -403,7 +403,7 @@ void free_swap_and_cache(swp_entry_t entry)
403 if (p) { 403 if (p) {
404 if (swap_entry_free(p, swp_offset(entry)) == 1) { 404 if (swap_entry_free(p, swp_offset(entry)) == 1) {
405 page = find_get_page(&swapper_space, entry.val); 405 page = find_get_page(&swapper_space, entry.val);
406 if (page && unlikely(TestSetPageLocked(page))) { 406 if (page && unlikely(!trylock_page(page))) {
407 page_cache_release(page); 407 page_cache_release(page);
408 page = NULL; 408 page = NULL;
409 } 409 }
@@ -656,8 +656,8 @@ static int unuse_mm(struct mm_struct *mm,
656 656
657 if (!down_read_trylock(&mm->mmap_sem)) { 657 if (!down_read_trylock(&mm->mmap_sem)) {
658 /* 658 /*
659 * Activate page so shrink_cache is unlikely to unmap its 659 * Activate page so shrink_inactive_list is unlikely to unmap
660 * ptes while lock is dropped, so swapoff can make progress. 660 * its ptes while lock is dropped, so swapoff can make progress.
661 */ 661 */
662 activate_page(page); 662 activate_page(page);
663 unlock_page(page); 663 unlock_page(page);
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index ae532f501943..8d7a27a6335c 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -65,31 +65,31 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
65 if (!dentry) 65 if (!dentry)
66 goto put_memory; 66 goto put_memory;
67 67
68 error = -ENFILE;
69 file = get_empty_filp();
70 if (!file)
71 goto put_dentry;
72
68 error = -ENOSPC; 73 error = -ENOSPC;
69 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); 74 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
70 if (!inode) 75 if (!inode)
71 goto put_dentry; 76 goto close_file;
72 77
73 d_instantiate(dentry, inode); 78 d_instantiate(dentry, inode);
74 error = -ENFILE; 79 inode->i_size = size;
75 file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
76 &ramfs_file_operations);
77 if (!file)
78 goto put_dentry;
79
80 inode->i_nlink = 0; /* It is unlinked */ 80 inode->i_nlink = 0; /* It is unlinked */
81 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
82 &ramfs_file_operations);
81 83
82 /* notify everyone as to the change of file size */ 84#ifndef CONFIG_MMU
83 error = do_truncate(dentry, size, 0, file); 85 error = ramfs_nommu_expand_for_mapping(inode, size);
84 if (error < 0) 86 if (error)
85 goto close_file; 87 goto close_file;
86 88#endif
87 return file; 89 return file;
88 90
89close_file: 91close_file:
90 put_filp(file); 92 put_filp(file);
91 return ERR_PTR(error);
92
93put_dentry: 93put_dentry:
94 dput(dentry); 94 dput(dentry);
95put_memory: 95put_memory:
diff --git a/mm/truncate.c b/mm/truncate.c
index b8961cb63414..6650c1d878b4 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -104,7 +104,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
104 cancel_dirty_page(page, PAGE_CACHE_SIZE); 104 cancel_dirty_page(page, PAGE_CACHE_SIZE);
105 105
106 remove_from_page_cache(page); 106 remove_from_page_cache(page);
107 ClearPageUptodate(page);
108 ClearPageMappedToDisk(page); 107 ClearPageMappedToDisk(page);
109 page_cache_release(page); /* pagecache ref */ 108 page_cache_release(page); /* pagecache ref */
110} 109}
@@ -188,7 +187,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
188 if (page_index > next) 187 if (page_index > next)
189 next = page_index; 188 next = page_index;
190 next++; 189 next++;
191 if (TestSetPageLocked(page)) 190 if (!trylock_page(page))
192 continue; 191 continue;
193 if (PageWriteback(page)) { 192 if (PageWriteback(page)) {
194 unlock_page(page); 193 unlock_page(page);
@@ -281,7 +280,7 @@ unsigned long __invalidate_mapping_pages(struct address_space *mapping,
281 pgoff_t index; 280 pgoff_t index;
282 int lock_failed; 281 int lock_failed;
283 282
284 lock_failed = TestSetPageLocked(page); 283 lock_failed = !trylock_page(page);
285 284
286 /* 285 /*
287 * We really shouldn't be looking at the ->index of an 286 * We really shouldn't be looking at the ->index of an
@@ -349,18 +348,17 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
349 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) 348 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
350 return 0; 349 return 0;
351 350
352 write_lock_irq(&mapping->tree_lock); 351 spin_lock_irq(&mapping->tree_lock);
353 if (PageDirty(page)) 352 if (PageDirty(page))
354 goto failed; 353 goto failed;
355 354
356 BUG_ON(PagePrivate(page)); 355 BUG_ON(PagePrivate(page));
357 __remove_from_page_cache(page); 356 __remove_from_page_cache(page);
358 write_unlock_irq(&mapping->tree_lock); 357 spin_unlock_irq(&mapping->tree_lock);
359 ClearPageUptodate(page);
360 page_cache_release(page); /* pagecache ref */ 358 page_cache_release(page); /* pagecache ref */
361 return 1; 359 return 1;
362failed: 360failed:
363 write_unlock_irq(&mapping->tree_lock); 361 spin_unlock_irq(&mapping->tree_lock);
364 return 0; 362 return 0;
365} 363}
366 364
@@ -382,7 +380,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
382 * Any pages which are found to be mapped into pagetables are unmapped prior to 380 * Any pages which are found to be mapped into pagetables are unmapped prior to
383 * invalidation. 381 * invalidation.
384 * 382 *
385 * Returns -EIO if any pages could not be invalidated. 383 * Returns -EBUSY if any pages could not be invalidated.
386 */ 384 */
387int invalidate_inode_pages2_range(struct address_space *mapping, 385int invalidate_inode_pages2_range(struct address_space *mapping,
388 pgoff_t start, pgoff_t end) 386 pgoff_t start, pgoff_t end)
@@ -442,7 +440,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
442 ret2 = do_launder_page(mapping, page); 440 ret2 = do_launder_page(mapping, page);
443 if (ret2 == 0) { 441 if (ret2 == 0) {
444 if (!invalidate_complete_page2(mapping, page)) 442 if (!invalidate_complete_page2(mapping, page))
445 ret2 = -EIO; 443 ret2 = -EBUSY;
446 } 444 }
447 if (ret2 < 0) 445 if (ret2 < 0)
448 ret = ret2; 446 ret = ret2;
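Besides dropping the ClearPageUptodate() calls (the data in these pages is still valid), the failure code for pages that resist invalidation changes from -EIO to -EBUSY, which is the honest signal: nothing is wrong with the data, the page is simply still in use. A caller-side sketch of treating the two cases differently (illustrative, not from this patch):

	#include <linux/fs.h>
	#include <linux/pagemap.h>

	static int drop_cached_range(struct address_space *mapping,
				     pgoff_t start, pgoff_t end)
	{
		int err = invalidate_inode_pages2_range(mapping, start, end);

		if (err == -EBUSY)
			return 0;	/* page still mapped/dirty: retry later */
		return err;		/* anything else is a real failure */
	}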
diff --git a/mm/util.c b/mm/util.c
index 8f18683825bc..cb00b748ce47 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,7 +1,9 @@
1#include <linux/mm.h>
1#include <linux/slab.h> 2#include <linux/slab.h>
2#include <linux/string.h> 3#include <linux/string.h>
3#include <linux/module.h> 4#include <linux/module.h>
4#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h>
5#include <asm/uaccess.h> 7#include <asm/uaccess.h>
6 8
7/** 9/**
@@ -68,25 +70,22 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
68EXPORT_SYMBOL(kmemdup); 70EXPORT_SYMBOL(kmemdup);
69 71
70/** 72/**
71 * krealloc - reallocate memory. The contents will remain unchanged. 73 * __krealloc - like krealloc() but don't free @p.
72 * @p: object to reallocate memory for. 74 * @p: object to reallocate memory for.
73 * @new_size: how many bytes of memory are required. 75 * @new_size: how many bytes of memory are required.
74 * @flags: the type of memory to allocate. 76 * @flags: the type of memory to allocate.
75 * 77 *
76 * The contents of the object pointed to are preserved up to the 78 * This function is like krealloc() except it never frees the originally
77 * lesser of the new and old sizes. If @p is %NULL, krealloc() 79 * allocated buffer. Use this if you don't want to free the buffer immediately
78 * behaves exactly like kmalloc(). If @size is 0 and @p is not a 80 * like, for example, with RCU.
79 * %NULL pointer, the object pointed to is freed.
80 */ 81 */
81void *krealloc(const void *p, size_t new_size, gfp_t flags) 82void *__krealloc(const void *p, size_t new_size, gfp_t flags)
82{ 83{
83 void *ret; 84 void *ret;
84 size_t ks = 0; 85 size_t ks = 0;
85 86
86 if (unlikely(!new_size)) { 87 if (unlikely(!new_size))
87 kfree(p);
88 return ZERO_SIZE_PTR; 88 return ZERO_SIZE_PTR;
89 }
90 89
91 if (p) 90 if (p)
92 ks = ksize(p); 91 ks = ksize(p);
@@ -95,10 +94,37 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
95 return (void *)p; 94 return (void *)p;
96 95
97 ret = kmalloc_track_caller(new_size, flags); 96 ret = kmalloc_track_caller(new_size, flags);
98 if (ret && p) { 97 if (ret && p)
99 memcpy(ret, p, ks); 98 memcpy(ret, p, ks);
99
100 return ret;
101}
102EXPORT_SYMBOL(__krealloc);
103
104/**
105 * krealloc - reallocate memory. The contents will remain unchanged.
106 * @p: object to reallocate memory for.
107 * @new_size: how many bytes of memory are required.
108 * @flags: the type of memory to allocate.
109 *
110 * The contents of the object pointed to are preserved up to the
111 * lesser of the new and old sizes. If @p is %NULL, krealloc()
112 * behaves exactly like kmalloc(). If @size is 0 and @p is not a
113 * %NULL pointer, the object pointed to is freed.
114 */
115void *krealloc(const void *p, size_t new_size, gfp_t flags)
116{
117 void *ret;
118
119 if (unlikely(!new_size)) {
100 kfree(p); 120 kfree(p);
121 return ZERO_SIZE_PTR;
101 } 122 }
123
124 ret = __krealloc(p, new_size, flags);
125 if (ret && p != ret)
126 kfree(p);
127
102 return ret; 128 return ret;
103} 129}
104EXPORT_SYMBOL(krealloc); 130EXPORT_SYMBOL(krealloc);
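krealloc() is now a thin wrapper over __krealloc(), which never frees the original buffer; the RCU case named in the new comment is the motivation, since the old copy must outlive a grace period. A hedged sketch of that use (the table pointer and its update discipline are illustrative, not from this patch):

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	static void *table;	/* readers access it under rcu_read_lock() */

	static int grow_table(size_t new_bytes)
	{
		void *old = table;
		void *new = __krealloc(old, new_bytes, GFP_KERNEL);

		if (!new)
			return -ENOMEM;
		rcu_assign_pointer(table, new);
		if (old && old != new) {
			synchronize_rcu();	/* readers may still hold 'old' */
			kfree(old);
		}
		return 0;
	}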
@@ -136,3 +162,27 @@ char *strndup_user(const char __user *s, long n)
136 return p; 162 return p;
137} 163}
138EXPORT_SYMBOL(strndup_user); 164EXPORT_SYMBOL(strndup_user);
165
166#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT
167void arch_pick_mmap_layout(struct mm_struct *mm)
168{
169 mm->mmap_base = TASK_UNMAPPED_BASE;
170 mm->get_unmapped_area = arch_get_unmapped_area;
171 mm->unmap_area = arch_unmap_area;
172}
173#endif
174
175int __attribute__((weak)) get_user_pages_fast(unsigned long start,
176 int nr_pages, int write, struct page **pages)
177{
178 struct mm_struct *mm = current->mm;
179 int ret;
180
181 down_read(&mm->mmap_sem);
182 ret = get_user_pages(current, mm, start, nr_pages,
183 write, 0, pages, NULL);
184 up_read(&mm->mmap_sem);
185
186 return ret;
187}
188EXPORT_SYMBOL_GPL(get_user_pages_fast);
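The weak default simply brackets get_user_pages() with mmap_sem; architectures with a suitable lockless walker (x86 at this point) override it. Caller's-eye sketch (the buffer and page count are illustrative):

	#include <linux/mm.h>

	static int pin_user_buffer(unsigned long uaddr, int nr_pages,
				   struct page **pages)
	{
		/* the caller takes no mmap_sem itself */
		int got = get_user_pages_fast(uaddr & PAGE_MASK, nr_pages,
					      1 /* write */, pages);

		if (got < 0)
			return got;	/* e.g. -EFAULT */
		/* ... use pages[0..got-1], then release each reference ... */
		return got;
	}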
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 35f293816294..bba06c41fc59 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -180,6 +180,13 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
180 pmd_t *pmd; 180 pmd_t *pmd;
181 pte_t *ptep, pte; 181 pte_t *ptep, pte;
182 182
183 /*
184 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
185 * architectures that do not vmalloc module space
186 */
187 VIRTUAL_BUG_ON(!is_vmalloc_addr(vmalloc_addr) &&
188 !is_module_address(addr));
189
183 if (!pgd_none(*pgd)) { 190 if (!pgd_none(*pgd)) {
184 pud = pud_offset(pgd, addr); 191 pud = pud_offset(pgd, addr);
185 if (!pud_none(*pud)) { 192 if (!pud_none(*pud)) {
@@ -381,16 +388,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
381 return; 388 return;
382 389
383 if ((PAGE_SIZE-1) & (unsigned long)addr) { 390 if ((PAGE_SIZE-1) & (unsigned long)addr) {
384 printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); 391 WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
385 WARN_ON(1);
386 return; 392 return;
387 } 393 }
388 394
389 area = remove_vm_area(addr); 395 area = remove_vm_area(addr);
390 if (unlikely(!area)) { 396 if (unlikely(!area)) {
391 printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 397 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
392 addr); 398 addr);
393 WARN_ON(1);
394 return; 399 return;
395 } 400 }
396 401
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 26672c6cd3ce..1ff1a58e7c10 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -391,17 +391,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
391} 391}
392 392
393/* 393/*
394 * Attempt to detach a locked page from its ->mapping. If it is dirty or if 394 * Same as remove_mapping, but if the page is removed from the mapping, it
395 * someone else has a ref on the page, abort and return 0. If it was 395 * gets returned with a refcount of 0.
396 * successfully detached, return 1. Assumes the caller has a single ref on
397 * this page.
398 */ 396 */
399int remove_mapping(struct address_space *mapping, struct page *page) 397static int __remove_mapping(struct address_space *mapping, struct page *page)
400{ 398{
401 BUG_ON(!PageLocked(page)); 399 BUG_ON(!PageLocked(page));
402 BUG_ON(mapping != page_mapping(page)); 400 BUG_ON(mapping != page_mapping(page));
403 401
404 write_lock_irq(&mapping->tree_lock); 402 spin_lock_irq(&mapping->tree_lock);
405 /* 403 /*
406 * The non racy check for a busy page. 404 * The non racy check for a busy page.
407 * 405 *
@@ -427,28 +425,48 @@ int remove_mapping(struct address_space *mapping, struct page *page)
427 * Note that if SetPageDirty is always performed via set_page_dirty, 425 * Note that if SetPageDirty is always performed via set_page_dirty,
428 * and thus under tree_lock, then this ordering is not required. 426 * and thus under tree_lock, then this ordering is not required.
429 */ 427 */
430 if (unlikely(page_count(page) != 2)) 428 if (!page_freeze_refs(page, 2))
431 goto cannot_free; 429 goto cannot_free;
432 smp_rmb(); 430 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
433 if (unlikely(PageDirty(page))) 431 if (unlikely(PageDirty(page))) {
432 page_unfreeze_refs(page, 2);
434 goto cannot_free; 433 goto cannot_free;
434 }
435 435
436 if (PageSwapCache(page)) { 436 if (PageSwapCache(page)) {
437 swp_entry_t swap = { .val = page_private(page) }; 437 swp_entry_t swap = { .val = page_private(page) };
438 __delete_from_swap_cache(page); 438 __delete_from_swap_cache(page);
439 write_unlock_irq(&mapping->tree_lock); 439 spin_unlock_irq(&mapping->tree_lock);
440 swap_free(swap); 440 swap_free(swap);
441 __put_page(page); /* The pagecache ref */ 441 } else {
442 return 1; 442 __remove_from_page_cache(page);
443 spin_unlock_irq(&mapping->tree_lock);
443 } 444 }
444 445
445 __remove_from_page_cache(page);
446 write_unlock_irq(&mapping->tree_lock);
447 __put_page(page);
448 return 1; 446 return 1;
449 447
450cannot_free: 448cannot_free:
451 write_unlock_irq(&mapping->tree_lock); 449 spin_unlock_irq(&mapping->tree_lock);
450 return 0;
451}
452
453/*
454 * Attempt to detach a locked page from its ->mapping. If it is dirty or if
455 * someone else has a ref on the page, abort and return 0. If it was
456 * successfully detached, return 1. Assumes the caller has a single ref on
457 * this page.
458 */
459int remove_mapping(struct address_space *mapping, struct page *page)
460{
461 if (__remove_mapping(mapping, page)) {
462 /*
463 * Unfreezing the refcount with 1 rather than 2 effectively
464 * drops the pagecache ref for us without requiring another
465 * atomic operation.
466 */
467 page_unfreeze_refs(page, 1);
468 return 1;
469 }
452 return 0; 470 return 0;
453} 471}
454 472
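page_freeze_refs() is what replaces the old page_count() test plus smp_rmb(): it cmpxchgs the refcount from the expected value straight to zero, so a single atomic both verifies that the caller holds the only references and prevents new speculative references from succeeding. Its definition is roughly the following (from the companion lockless-pagecache patch to include/linux/pagemap.h, quoted from memory, so treat it as a sketch):

	static inline int page_freeze_refs(struct page *page, int count)
	{
		return likely(atomic_cmpxchg(&page->_count, count, 0) == count);
	}

	static inline void page_unfreeze_refs(struct page *page, int count)
	{
		VM_BUG_ON(page_count(page) != 0);
		atomic_set(&page->_count, count);
	}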
@@ -478,7 +496,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
478 page = lru_to_page(page_list); 496 page = lru_to_page(page_list);
479 list_del(&page->lru); 497 list_del(&page->lru);
480 498
481 if (TestSetPageLocked(page)) 499 if (!trylock_page(page))
482 goto keep; 500 goto keep;
483 501
484 VM_BUG_ON(PageActive(page)); 502 VM_BUG_ON(PageActive(page));
@@ -564,7 +582,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
564 * A synchronous write - probably a ramdisk. Go 582 * A synchronous write - probably a ramdisk. Go
565 * ahead and try to reclaim the page. 583 * ahead and try to reclaim the page.
566 */ 584 */
567 if (TestSetPageLocked(page)) 585 if (!trylock_page(page))
568 goto keep; 586 goto keep;
569 if (PageDirty(page) || PageWriteback(page)) 587 if (PageDirty(page) || PageWriteback(page))
570 goto keep_locked; 588 goto keep_locked;
@@ -598,18 +616,34 @@ static unsigned long shrink_page_list(struct list_head *page_list,
598 if (PagePrivate(page)) { 616 if (PagePrivate(page)) {
599 if (!try_to_release_page(page, sc->gfp_mask)) 617 if (!try_to_release_page(page, sc->gfp_mask))
600 goto activate_locked; 618 goto activate_locked;
601 if (!mapping && page_count(page) == 1) 619 if (!mapping && page_count(page) == 1) {
602 goto free_it; 620 unlock_page(page);
621 if (put_page_testzero(page))
622 goto free_it;
623 else {
624 /*
625 * rare race with speculative reference.
626 * the speculative reference will free
627 * this page shortly, so we may
628 * increment nr_reclaimed here (and
629 * leave it off the LRU).
630 */
631 nr_reclaimed++;
632 continue;
633 }
634 }
603 } 635 }
604 636
605 if (!mapping || !remove_mapping(mapping, page)) 637 if (!mapping || !__remove_mapping(mapping, page))
606 goto keep_locked; 638 goto keep_locked;
607 639
608free_it:
609 unlock_page(page); 640 unlock_page(page);
641free_it:
610 nr_reclaimed++; 642 nr_reclaimed++;
611 if (!pagevec_add(&freed_pvec, page)) 643 if (!pagevec_add(&freed_pvec, page)) {
612 __pagevec_release_nonlru(&freed_pvec); 644 __pagevec_free(&freed_pvec);
645 pagevec_reinit(&freed_pvec);
646 }
613 continue; 647 continue;
614 648
615activate_locked: 649activate_locked:
@@ -623,7 +657,7 @@ keep:
623 } 657 }
624 list_splice(&ret_pages, page_list); 658 list_splice(&ret_pages, page_list);
625 if (pagevec_count(&freed_pvec)) 659 if (pagevec_count(&freed_pvec))
626 __pagevec_release_nonlru(&freed_pvec); 660 __pagevec_free(&freed_pvec);
627 count_vm_events(PGACTIVATE, pgactivate); 661 count_vm_events(PGACTIVATE, pgactivate);
628 return nr_reclaimed; 662 return nr_reclaimed;
629} 663}
@@ -1374,7 +1408,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1374 if (sc->nr_scanned && priority < DEF_PRIORITY - 2) 1408 if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
1375 congestion_wait(WRITE, HZ/10); 1409 congestion_wait(WRITE, HZ/10);
1376 } 1410 }
1377 /* top priority shrink_caches still had more to do? don't OOM, then */ 1411 /* top priority shrink_zones still had more to do? don't OOM, then */
1378 if (!sc->all_unreclaimable && scan_global_lru(sc)) 1412 if (!sc->all_unreclaimable && scan_global_lru(sc))
1379 ret = nr_reclaimed; 1413 ret = nr_reclaimed;
1380out: 1414out:
@@ -1945,7 +1979,7 @@ module_init(kswapd_init)
1945int zone_reclaim_mode __read_mostly; 1979int zone_reclaim_mode __read_mostly;
1946 1980
1947#define RECLAIM_OFF 0 1981#define RECLAIM_OFF 0
1948#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ 1982#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
1949#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 1983#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
1950#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 1984#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
1951 1985
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b0d08e667ece..d7826af2fb07 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -516,9 +516,26 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
516 continue; 516 continue;
517 517
518 page = pfn_to_page(pfn); 518 page = pfn_to_page(pfn);
519#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES
520 /*
521 * Ordinarily, memory holes in flatmem still have a valid
522 * memmap for the PFN range. However, an architecture for
523 * embedded systems (e.g. ARM) can free up the memmap backing
524 * holes to save memory on the assumption the memmap is
525 * never used. The page_zone linkages are then broken even
526 * though pfn_valid() returns true. Skip the page if the
527 * linkages are broken. Even if this test passed, the impact
528 * is that the counters for the movable type are off but
529 * fragmentation monitoring is likely meaningless on small
530 * systems.
531 */
532 if (page_zone(page) != zone)
533 continue;
534#endif
519 mtype = get_pageblock_migratetype(page); 535 mtype = get_pageblock_migratetype(page);
520 536
521 count[mtype]++; 537 if (mtype < MIGRATE_TYPES)
538 count[mtype]++;
522 } 539 }
523 540
524 /* Print counts */ 541 /* Print counts */
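Both vmstat guards are defensive reads of possibly-stale memmap: the flatmem check discards pages whose zone linkage went away with the freed hole, and the MIGRATE_TYPES bound keeps a garbage pageblock value from indexing past the counters array. Folded into one predicate (sketch, not from this patch):

	#include <linux/mmzone.h>

	static int pfn_counts_for(unsigned long pfn, struct zone *zone)
	{
		if (!pfn_valid(pfn))
			return 0;
	#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES
		/* memmap hole: the page_zone() linkage may be garbage */
		if (page_zone(pfn_to_page(pfn)) != zone)
			return 0;
	#endif
		return 1;
	}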