Merge commit 'v2.6.27-rc1' into x86/core

Conflicts: include/asm-x86/dma-mapping.h include/asm-x86/namei.h include/asm-x86/uaccess.h Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Ingo Molnar <mingo@elte.hu> 2008-07-30 13:33:48 -0400
committer: Ingo Molnar <mingo@elte.hu> 2008-07-30 13:33:48 -0400
commit: 15dd859cacf312f606f54502d1f66537a1e5c78c (patch)
tree: e50e125eaa6da83fa715704e53c1bde013d1ef8e /mm
parent: b2d9d33412b9d13a40cd314d93ab517950fc5950 (diff)
parent: 6e86841d05f371b5b9b86ce76c02aaee83352298 (diff)
30 files changed, 886 insertions, 206 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index aa799007a11b..446c6588c753 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -77,6 +77,9 @@ config FLAT_NODE_MEM_MAP
        def_bool y
        depends on !SPARSEMEM
+config HAVE_GET_USER_PAGES_FAST
+        bool
 #
 # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
 # to represent different areas of memory.  This variable allows
@@ -205,3 +208,6 @@ config NR_QUICK
 config VIRT_TO_BUS
        def_bool y
        depends on !ARCH_NO_VIRT_TO_BUS
+config MMU_NOTIFIER
+        bool
diff --git a/mm/Makefile b/mm/Makefile
index 06ca2381fef1..da4ccf015aea 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 843364594e23..4297bc41bfd2 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -18,27 +18,28 @@
 * Depopulating per-cpu data for a cpu going offline would be a typical
 * use case. You need to register a cpu hotplug handler for that purpose.
 */
-void percpu_depopulate(void *__pdata, int cpu)
+static void percpu_depopulate(void *__pdata, int cpu)
 {
        struct percpu_data *pdata = __percpu_disguise(__pdata);
        kfree(pdata->ptrs[cpu]);
        pdata->ptrs[cpu] = NULL;
 }
-EXPORT_SYMBOL_GPL(percpu_depopulate);
 /**
 * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
 * @__pdata: per-cpu data to depopulate
 * @mask: depopulate per-cpu data for cpu's selected through mask bits
 */
-void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
+static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
 {
        int cpu;
        for_each_cpu_mask_nr(cpu, *mask)
                percpu_depopulate(__pdata, cpu);
 }
-EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
+#define percpu_depopulate_mask(__pdata, mask) \
+        __percpu_depopulate_mask((__pdata), &(mask))
 /**
 * percpu_populate - populate per-cpu data for given cpu
@@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
 * use case. You need to register a cpu hotplug handler for that purpose.
 * Per-cpu object is populated with zeroed buffer.
 */
-void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
+static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
 {
        struct percpu_data *pdata = __percpu_disguise(__pdata);
        int node = cpu_to_node(cpu);
@@ -68,7 +69,6 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
                pdata->ptrs[cpu] = kzalloc(size, gfp);
        return pdata->ptrs[cpu];
 }
-EXPORT_SYMBOL_GPL(percpu_populate);
 /**
 * percpu_populate_mask - populate per-cpu data for more cpu's
@@ -79,8 +79,8 @@ EXPORT_SYMBOL_GPL(percpu_populate);
 *
 * Per-cpu objects are populated with zeroed buffers.
 */
-int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
+static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
-                           cpumask_t *mask)
+                                  cpumask_t *mask)
 {
        cpumask_t populated;
        int cpu;
@@ -94,7 +94,9 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
                        cpu_set(cpu, populated);
        return 0;
 }
-EXPORT_SYMBOL_GPL(__percpu_populate_mask);
+#define percpu_populate_mask(__pdata, size, gfp, mask) \
+        __percpu_populate_mask((__pdata), (size), (gfp), &(mask))
 /**
 * percpu_alloc_mask - initial setup of per-cpu data
diff --git a/mm/filemap.c b/mm/filemap.c
index 2d3ec1ffc66e..42bbc6909ba4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -109,7 +109,7 @@
 /*
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
- * is safe.  The caller must hold a write_lock on the mapping's tree_lock.
+ * is safe.  The caller must hold the mapping's tree_lock.
 */
 void __remove_from_page_cache(struct page *page)
 {
@@ -141,9 +141,9 @@ void remove_from_page_cache(struct page *page)
        BUG_ON(!PageLocked(page));
-        write_lock_irq(&mapping->tree_lock);
+        spin_lock_irq(&mapping->tree_lock);
        __remove_from_page_cache(page);
-        write_unlock_irq(&mapping->tree_lock);
+        spin_unlock_irq(&mapping->tree_lock);
 }
 static int sync_page(void *word)
@@ -442,48 +442,52 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 }
 /**
- * add_to_page_cache - add newly allocated pagecache pages
+ * add_to_page_cache_locked - add a locked page to the pagecache
 * @page:       page to add
 * @mapping:    the page's address_space
 * @offset:     page index
 * @gfp_mask:   page allocation mode
 *
- * This function is used to add newly allocated pagecache pages;
+ * This function is used to add a page to the pagecache. It must be locked.
- * the page is new, so we can just run SetPageLocked() against it.
- * The other page state flags were set by rmqueue().
- *
 * This function does not add the page to the LRU.  The caller must do that.
 */
-int add_to_page_cache(struct page *page, struct address_space *mapping,
+int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
                pgoff_t offset, gfp_t gfp_mask)
 {
-        int error = mem_cgroup_cache_charge(page, current->mm,
+        int error;
+        VM_BUG_ON(!PageLocked(page));
+        error = mem_cgroup_cache_charge(page, current->mm,
                                        gfp_mask & ~__GFP_HIGHMEM);
        if (error)
                goto out;
        error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
        if (error == 0) {
-                write_lock_irq(&mapping->tree_lock);
+                page_cache_get(page);
+                page->mapping = mapping;
+                page->index = offset;
+                spin_lock_irq(&mapping->tree_lock);
                error = radix_tree_insert(&mapping->page_tree, offset, page);
-                if (!error) {
+                if (likely(!error)) {
-                        page_cache_get(page);
-                        SetPageLocked(page);
-                        page->mapping = mapping;
-                        page->index = offset;
                        mapping->nrpages++;
                        __inc_zone_page_state(page, NR_FILE_PAGES);
-                } else
+                } else {
+                        page->mapping = NULL;
                        mem_cgroup_uncharge_cache_page(page);
+                        page_cache_release(page);
+                }
-                write_unlock_irq(&mapping->tree_lock);
+                spin_unlock_irq(&mapping->tree_lock);
                radix_tree_preload_end();
        } else
                mem_cgroup_uncharge_cache_page(page);
 out:
        return error;
 }
-EXPORT_SYMBOL(add_to_page_cache);
+EXPORT_SYMBOL(add_to_page_cache_locked);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                                pgoff_t offset, gfp_t gfp_mask)
@@ -633,15 +637,35 @@ void __lock_page_nosync(struct page *page)
 * Is there a pagecache struct page at the given (mapping, offset) tuple?
 * If yes, increment its refcount and return it; if no, return NULL.
 */
-struct page * find_get_page(struct address_space *mapping, pgoff_t offset)
+struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
 {
+        void **pagep;
        struct page *page;
-        read_lock_irq(&mapping->tree_lock);
+        rcu_read_lock();
-        page = radix_tree_lookup(&mapping->page_tree, offset);
+repeat:
-        if (page)
+        page = NULL;
-                page_cache_get(page);
+        pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
-        read_unlock_irq(&mapping->tree_lock);
+        if (pagep) {
+                page = radix_tree_deref_slot(pagep);
+                if (unlikely(!page || page == RADIX_TREE_RETRY))
+                        goto repeat;
+                if (!page_cache_get_speculative(page))
+                        goto repeat;
+                /*
+                 * Has the page moved?
+                 * This is part of the lockless pagecache protocol. See
+                 * include/linux/pagemap.h for details.
+                 */
+                if (unlikely(page != *pagep)) {
+                        page_cache_release(page);
+                        goto repeat;
+                }
+        }
+        rcu_read_unlock();
        return page;
 }
 EXPORT_SYMBOL(find_get_page);
@@ -656,32 +680,22 @@ EXPORT_SYMBOL(find_get_page);
 *
 * Returns zero if the page was not present. find_lock_page() may sleep.
 */
-struct page *find_lock_page(struct address_space *mapping,
+struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
-                                pgoff_t offset)
 {
        struct page *page;
 repeat:
-        read_lock_irq(&mapping->tree_lock);
+        page = find_get_page(mapping, offset);
-        page = radix_tree_lookup(&mapping->page_tree, offset);
        if (page) {
-                page_cache_get(page);
+                lock_page(page);
-                if (TestSetPageLocked(page)) {
+                /* Has the page been truncated? */
-                        read_unlock_irq(&mapping->tree_lock);
+                if (unlikely(page->mapping != mapping)) {
-                        __lock_page(page);
+                        unlock_page(page);
+                        page_cache_release(page);
-                        /* Has the page been truncated while we slept? */
+                        goto repeat;
-                        if (unlikely(page->mapping != mapping)) {
-                                unlock_page(page);
-                                page_cache_release(page);
-                                goto repeat;
-                        }
-                        VM_BUG_ON(page->index != offset);
-                        goto out;
                }
+                VM_BUG_ON(page->index != offset);
        }
-        read_unlock_irq(&mapping->tree_lock);
-out:
        return page;
 }
 EXPORT_SYMBOL(find_lock_page);
@@ -747,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 {
        unsigned int i;
        unsigned int ret;
+        unsigned int nr_found;
+        rcu_read_lock();
+restart:
+        nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+                                (void ***)pages, start, nr_pages);
+        ret = 0;
+        for (i = 0; i < nr_found; i++) {
+                struct page *page;
+repeat:
+                page = radix_tree_deref_slot((void **)pages[i]);
+                if (unlikely(!page))
+                        continue;
+                /*
+                 * this can only trigger if nr_found == 1, making livelock
+                 * a non issue.
+                 */
+                if (unlikely(page == RADIX_TREE_RETRY))
+                        goto restart;
-        read_lock_irq(&mapping->tree_lock);
+                if (!page_cache_get_speculative(page))
-        ret = radix_tree_gang_lookup(&mapping->page_tree,
+                        goto repeat;
-                                (void **)pages, start, nr_pages);
-        for (i = 0; i < ret; i++)
+                /* Has the page moved? */
-                page_cache_get(pages[i]);
+                if (unlikely(page != *((void **)pages[i]))) {
-        read_unlock_irq(&mapping->tree_lock);
+                        page_cache_release(page);
+                        goto repeat;
+                }
+                pages[ret] = page;
+                ret++;
+        }
+        rcu_read_unlock();
        return ret;
 }
@@ -774,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 {
        unsigned int i;
        unsigned int ret;
+        unsigned int nr_found;
+        rcu_read_lock();
+restart:
+        nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+                                (void ***)pages, index, nr_pages);
+        ret = 0;
+        for (i = 0; i < nr_found; i++) {
+                struct page *page;
+repeat:
+                page = radix_tree_deref_slot((void **)pages[i]);
+                if (unlikely(!page))
+                        continue;
+                /*
+                 * this can only trigger if nr_found == 1, making livelock
+                 * a non issue.
+                 */
+                if (unlikely(page == RADIX_TREE_RETRY))
+                        goto restart;
-        read_lock_irq(&mapping->tree_lock);
+                if (page->mapping == NULL || page->index != index)
-        ret = radix_tree_gang_lookup(&mapping->page_tree,
-                                (void **)pages, index, nr_pages);
-        for (i = 0; i < ret; i++) {
-                if (pages[i]->mapping == NULL || pages[i]->index != index)
                        break;
-                page_cache_get(pages[i]);
+                if (!page_cache_get_speculative(page))
+                        goto repeat;
+                /* Has the page moved? */
+                if (unlikely(page != *((void **)pages[i]))) {
+                        page_cache_release(page);
+                        goto repeat;
+                }
+                pages[ret] = page;
+                ret++;
                index++;
        }
-        read_unlock_irq(&mapping->tree_lock);
+        rcu_read_unlock();
-        return i;
+        return ret;
 }
 EXPORT_SYMBOL(find_get_pages_contig);
@@ -806,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 {
        unsigned int i;
        unsigned int ret;
+        unsigned int nr_found;
+        rcu_read_lock();
+restart:
+        nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
+                                (void ***)pages, *index, nr_pages, tag);
+        ret = 0;
+        for (i = 0; i < nr_found; i++) {
+                struct page *page;
+repeat:
+                page = radix_tree_deref_slot((void **)pages[i]);
+                if (unlikely(!page))
+                        continue;
+                /*
+                 * this can only trigger if nr_found == 1, making livelock
+                 * a non issue.
+                 */
+                if (unlikely(page == RADIX_TREE_RETRY))
+                        goto restart;
+                if (!page_cache_get_speculative(page))
+                        goto repeat;
+                /* Has the page moved? */
+                if (unlikely(page != *((void **)pages[i]))) {
+                        page_cache_release(page);
+                        goto repeat;
+                }
+                pages[ret] = page;
+                ret++;
+        }
+        rcu_read_unlock();
-        read_lock_irq(&mapping->tree_lock);
-        ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
-                                (void **)pages, *index, nr_pages, tag);
-        for (i = 0; i < ret; i++)
-                page_cache_get(pages[i]);
        if (ret)
                *index = pages[ret - 1]->index + 1;
-        read_unlock_irq(&mapping->tree_lock);
        return ret;
 }
 EXPORT_SYMBOL(find_get_pages_tag);
@@ -930,8 +1023,17 @@ find_page:
                                        ra, filp, page,
                                        index, last_index - index);
                }
-                if (!PageUptodate(page))
+                if (!PageUptodate(page)) {
-                        goto page_not_up_to_date;
+                        if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
+                                        !mapping->a_ops->is_partially_uptodate)
+                                goto page_not_up_to_date;
+                        if (TestSetPageLocked(page))
+                                goto page_not_up_to_date;
+                        if (!mapping->a_ops->is_partially_uptodate(page,
+                                                                desc, offset))
+                                goto page_not_up_to_date_locked;
+                        unlock_page(page);
+                }
 page_ok:
                /*
                 * i_size must be checked after we know the page is Uptodate.
@@ -1001,6 +1103,7 @@ page_not_up_to_date:
                if (lock_page_killable(page))
                        goto readpage_eio;
+page_not_up_to_date_locked:
                /* Did it get truncated before we got the lock? */
                if (!page->mapping) {
                        unlock_page(page);
@@ -1665,8 +1768,9 @@ static int __remove_suid(struct dentry *dentry, int kill)
        return notify_change(dentry, &newattrs);
 }
-int remove_suid(struct dentry *dentry)
+int file_remove_suid(struct file *file)
 {
+        struct dentry *dentry = file->f_path.dentry;
        int killsuid = should_remove_suid(dentry);
        int killpriv = security_inode_need_killpriv(dentry);
        int error = 0;
@@ -1680,7 +1784,7 @@ int remove_suid(struct dentry *dentry)
        return error;
 }
-EXPORT_SYMBOL(remove_suid);
+EXPORT_SYMBOL(file_remove_suid);
 static size_t __iovec_copy_from_user_inatomic(char *vaddr,
                        const struct iovec *iov, size_t base, size_t bytes)
@@ -2436,7 +2540,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
        if (count == 0)
                goto out;
-        err = remove_suid(file->f_path.dentry);
+        err = file_remove_suid(file);
        if (err)
                goto out;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 3e744abcce9d..380ab402d711 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/uio.h>
 #include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
 #include <linux/sched.h>
 #include <asm/tlbflush.h>
 #include <asm/io.h>
@@ -188,7 +189,7 @@ __xip_unmap (struct address_space * mapping,
                if (pte) {
                        /* Nuke the page table entry. */
                        flush_cache_page(vma, address, pte_pfn(*pte));
-                        pteval = ptep_clear_flush(vma, address, pte);
+                        pteval = ptep_clear_flush_notify(vma, address, pte);
                        page_remove_rmap(page, vma);
                        dec_mm_counter(mm, file_rss);
                        BUG_ON(pte_dirty(pteval));
@@ -380,7 +381,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
        if (count == 0)
                goto out_backing;
-        ret = remove_suid(filp->f_path.dentry);
+        ret = file_remove_suid(filp);
        if (ret)
                goto out_backing;
diff --git a/mm/fremap.c b/mm/fremap.c
index 07a9c82ce1a3..7881638e4a12 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -15,6 +15,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
@@ -214,7 +215,9 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
                spin_unlock(&mapping->i_mmap_lock);
        }
+        mmu_notifier_invalidate_range_start(mm, start, start + size);
        err = populate_range(mm, vma, start, size, pgoff);
+        mmu_notifier_invalidate_range_end(mm, start, start + size);
        if (!err && !(flags & MAP_NONBLOCK)) {
                if (unlikely(has_write_lock)) {
                        downgrade_write(&mm->mmap_sem);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a8bf4ab01f86..254ce2b90158 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -9,6 +9,7 @@
 #include <linux/mm.h>
 #include <linux/sysctl.h>
 #include <linux/highmem.h>
+#include <linux/mmu_notifier.h>
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
 #include <linux/mempolicy.h>
@@ -19,6 +20,7 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
+#include <asm/io.h>
 #include <linux/hugetlb.h>
 #include "internal.h"
@@ -1026,18 +1028,6 @@ static void __init report_hugepages(void)
        }
 }
-static unsigned int cpuset_mems_nr(unsigned int *array)
-{
-        int node;
-        unsigned int nr = 0;
-        for_each_node_mask(node, cpuset_current_mems_allowed)
-                nr += array[node];
-        return nr;
-}
-#ifdef CONFIG_SYSCTL
 #ifdef CONFIG_HIGHMEM
 static void try_to_free_low(struct hstate *h, unsigned long count)
 {
@@ -1386,6 +1376,18 @@ static int __init hugetlb_default_setup(char *s)
 }
 __setup("default_hugepagesz=", hugetlb_default_setup);
+static unsigned int cpuset_mems_nr(unsigned int *array)
+{
+        int node;
+        unsigned int nr = 0;
+        for_each_node_mask(node, cpuset_current_mems_allowed)
+                nr += array[node];
+        return nr;
+}
+#ifdef CONFIG_SYSCTL
 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
                           struct file *file, void __user *buffer,
                           size_t *length, loff_t *ppos)
@@ -1672,6 +1674,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
        BUG_ON(start & ~huge_page_mask(h));
        BUG_ON(end & ~huge_page_mask(h));
+        mmu_notifier_invalidate_range_start(mm, start, end);
        spin_lock(&mm->page_table_lock);
        for (address = start; address < end; address += sz) {
                ptep = huge_pte_offset(mm, address);
@@ -1713,6 +1716,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
        }
        spin_unlock(&mm->page_table_lock);
        flush_tlb_range(vma, start, end);
+        mmu_notifier_invalidate_range_end(mm, start, end);
        list_for_each_entry_safe(page, tmp, &page_list, lru) {
                list_del(&page->lru);
                put_page(page);
diff --git a/mm/memory.c b/mm/memory.c
index 262e3eb6601a..67f0ab9077d9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -51,6 +51,7 @@
 #include <linux/init.h>
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -374,7 +375,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 *
 * The calling function must still handle the error.
 */
-void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
+static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
+                          unsigned long vaddr)
 {
        printk(KERN_ERR "Bad pte = %08llx, process = %s, "
                        "vm_flags = %lx, vaddr = %lx\n",
@@ -651,6 +653,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        unsigned long next;
        unsigned long addr = vma->vm_start;
        unsigned long end = vma->vm_end;
+        int ret;
        /*
         * Don't copy ptes where a page fault will fill them correctly.
@@ -666,17 +669,33 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        if (is_vm_hugetlb_page(vma))
                return copy_hugetlb_page_range(dst_mm, src_mm, vma);
+        /*
+         * We need to invalidate the secondary MMU mappings only when
+         * there could be a permission downgrade on the ptes of the
+         * parent mm. And a permission downgrade will only happen if
+         * is_cow_mapping() returns true.
+         */
+        if (is_cow_mapping(vma->vm_flags))
+                mmu_notifier_invalidate_range_start(src_mm, addr, end);
+        ret = 0;
        dst_pgd = pgd_offset(dst_mm, addr);
        src_pgd = pgd_offset(src_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(src_pgd))
                        continue;
-                if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
+                if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
-                                                vma, addr, next))
+                                            vma, addr, next))) {
-                        return -ENOMEM;
+                        ret = -ENOMEM;
+                        break;
+                }
        } while (dst_pgd++, src_pgd++, addr = next, addr != end);
-        return 0;
+        if (is_cow_mapping(vma->vm_flags))
+                mmu_notifier_invalidate_range_end(src_mm,
+                                                  vma->vm_start, end);
+        return ret;
 }
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -880,7 +899,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
        unsigned long start = start_addr;
        spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
        int fullmm = (*tlbp)->fullmm;
+        struct mm_struct *mm = vma->vm_mm;
+        mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
        for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
                unsigned long end;
@@ -945,6 +966,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
                }
        }
 out:
+        mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
        return start;   /* which is now the end (or restart) address */
 }
@@ -1615,10 +1637,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 {
        pgd_t *pgd;
        unsigned long next;
-        unsigned long end = addr + size;
+        unsigned long start = addr, end = addr + size;
        int err;
        BUG_ON(addr >= end);
+        mmu_notifier_invalidate_range_start(mm, start, end);
        pgd = pgd_offset(mm, addr);
        do {
                next = pgd_addr_end(addr, end);
@@ -1626,6 +1649,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);
+        mmu_notifier_invalidate_range_end(mm, start, end);
        return err;
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1838,7 +1862,7 @@ gotten:
                 * seen in the presence of one thread doing SMC and another
                 * thread doing COW.
                 */
-                ptep_clear_flush(vma, address, page_table);
+                ptep_clear_flush_notify(vma, address, page_table);
                set_pte_at(mm, address, page_table, entry);
                update_mmu_cache(vma, address, entry);
                lru_cache_add_active(new_page);
diff --git a/mm/migrate.c b/mm/migrate.c
index d8c65a65c61d..153572fb60b8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -285,7 +285,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
        page = migration_entry_to_page(entry);
-        get_page(page);
+        /*
+         * Once radix-tree replacement of page migration started, page_count
+         * *must* be zero. And, we don't want to call wait_on_page_locked()
+         * against a page without get_page().
+         * So, we use get_page_unless_zero(), here. Even failed, page fault
+         * will occur again.
+         */
+        if (!get_page_unless_zero(page))
+                goto out;
        pte_unmap_unlock(ptep, ptl);
        wait_on_page_locked(page);
        put_page(page);
@@ -305,6 +313,7 @@ out:
 static int migrate_page_move_mapping(struct address_space *mapping,
                struct page *newpage, struct page *page)
 {
+        int expected_count;
        void **pslot;
        if (!mapping) {
@@ -314,14 +323,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
                return 0;
        }
-        write_lock_irq(&mapping->tree_lock);
+        spin_lock_irq(&mapping->tree_lock);
        pslot = radix_tree_lookup_slot(&mapping->page_tree,
                                        page_index(page));
-        if (page_count(page) != 2 + !!PagePrivate(page) ||
+        expected_count = 2 + !!PagePrivate(page);
+        if (page_count(page) != expected_count ||
                        (struct page *)radix_tree_deref_slot(pslot) != page) {
-                write_unlock_irq(&mapping->tree_lock);
+                spin_unlock_irq(&mapping->tree_lock);
+                return -EAGAIN;
+        }
+        if (!page_freeze_refs(page, expected_count)) {
+                spin_unlock_irq(&mapping->tree_lock);
                return -EAGAIN;
        }
@@ -338,6 +353,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
        radix_tree_replace_slot(pslot, newpage);
+        page_unfreeze_refs(page, expected_count);
        /*
         * Drop cache reference from old page.
         * We know this isn't the last reference.
@@ -357,10 +373,9 @@ static int migrate_page_move_mapping(struct address_space *mapping,
        __dec_zone_page_state(page, NR_FILE_PAGES);
        __inc_zone_page_state(newpage, NR_FILE_PAGES);
-        write_unlock_irq(&mapping->tree_lock);
+        spin_unlock_irq(&mapping->tree_lock);
-        if (!PageSwapCache(newpage)) {
+        if (!PageSwapCache(newpage))
                mem_cgroup_uncharge_cache_page(page);
-        }
        return 0;
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 5e0cc99e9cd5..245c3d69067b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -26,6 +26,7 @@
 #include <linux/mount.h>
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -2061,6 +2062,7 @@ void exit_mmap(struct mm_struct *mm)
        /* mm's last user has gone, and its about to be pulled down */
        arch_exit_mmap(mm);
+        mmu_notifier_release(mm);
        lru_add_drain();
        flush_cache_mm(mm);
@@ -2268,3 +2270,161 @@ int install_special_mapping(struct mm_struct *mm,
        return 0;
 }
+static DEFINE_MUTEX(mm_all_locks_mutex);
+static void vm_lock_anon_vma(struct anon_vma *anon_vma)
+{
+        if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+                /*
+                 * The LSB of head.next can't change from under us
+                 * because we hold the mm_all_locks_mutex.
+                 */
+                spin_lock(&anon_vma->lock);
+                /*
+                 * We can safely modify head.next after taking the
+                 * anon_vma->lock. If some other vma in this mm shares
+                 * the same anon_vma we won't take it again.
+                 *
+                 * No need of atomic instructions here, head.next
+                 * can't change from under us thanks to the
+                 * anon_vma->lock.
+                 */
+                if (__test_and_set_bit(0, (unsigned long *)
+                                       &anon_vma->head.next))
+                        BUG();
+        }
+}
+static void vm_lock_mapping(struct address_space *mapping)
+{
+        if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+                /*
+                 * AS_MM_ALL_LOCKS can't change from under us because
+                 * we hold the mm_all_locks_mutex.
+                 *
+                 * Operations on ->flags have to be atomic because
+                 * even if AS_MM_ALL_LOCKS is stable thanks to the
+                 * mm_all_locks_mutex, there may be other cpus
+                 * changing other bitflags in parallel to us.
+                 */
+                if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
+                        BUG();
+                spin_lock(&mapping->i_mmap_lock);
+        }
+}
+/*
+ * This operation locks against the VM for all pte/vma/mm related
+ * operations that could ever happen on a certain mm. This includes
+ * vmtruncate, try_to_unmap, and all page faults.
+ *
+ * The caller must take the mmap_sem in write mode before calling
+ * mm_take_all_locks(). The caller isn't allowed to release the
+ * mmap_sem until mm_drop_all_locks() returns.
+ *
+ * mmap_sem in write mode is required in order to block all operations
+ * that could modify pagetables and free pages without need of
+ * altering the vma layout (for example populate_range() with
+ * nonlinear vmas). It's also needed in write mode to avoid new
+ * anon_vmas to be associated with existing vmas.
+ *
+ * A single task can't take more than one mm_take_all_locks() in a row
+ * or it would deadlock.
+ *
+ * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
+ * mapping->flags avoid to take the same lock twice, if more than one
+ * vma in this mm is backed by the same anon_vma or address_space.
+ *
+ * We can take all the locks in random order because the VM code
+ * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
+ * takes more than one of them in a row. Secondly we're protected
+ * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
+ *
+ * mm_take_all_locks() and mm_drop_all_locks are expensive operations
+ * that may have to take thousand of locks.
+ *
+ * mm_take_all_locks() can fail if it's interrupted by signals.
+ */
+int mm_take_all_locks(struct mm_struct *mm)
+{
+        struct vm_area_struct *vma;
+        int ret = -EINTR;
+        BUG_ON(down_read_trylock(&mm->mmap_sem));
+        mutex_lock(&mm_all_locks_mutex);
+        for (vma = mm->mmap; vma; vma = vma->vm_next) {
+                if (signal_pending(current))
+                        goto out_unlock;
+                if (vma->anon_vma)
+                        vm_lock_anon_vma(vma->anon_vma);
+                if (vma->vm_file && vma->vm_file->f_mapping)
+                        vm_lock_mapping(vma->vm_file->f_mapping);
+        }
+        ret = 0;
+out_unlock:
+        if (ret)
+                mm_drop_all_locks(mm);
+        return ret;
+}
+static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
+{
+        if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+                /*
+                 * The LSB of head.next can't change to 0 from under
+                 * us because we hold the mm_all_locks_mutex.
+                 *
+                 * We must however clear the bitflag before unlocking
+                 * the vma so the users using the anon_vma->head will
+                 * never see our bitflag.
+                 *
+                 * No need of atomic instructions here, head.next
+                 * can't change from under us until we release the
+                 * anon_vma->lock.
+                 */
+                if (!__test_and_clear_bit(0, (unsigned long *)
+                                          &anon_vma->head.next))
+                        BUG();
+                spin_unlock(&anon_vma->lock);
+        }
+}
+static void vm_unlock_mapping(struct address_space *mapping)
+{
+        if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+                /*
+                 * AS_MM_ALL_LOCKS can't change to 0 from under us
+                 * because we hold the mm_all_locks_mutex.
+                 */
+                spin_unlock(&mapping->i_mmap_lock);
+                if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
+                                        &mapping->flags))
+                        BUG();
+        }
+}
+/*
+ * The mmap_sem cannot be released by the caller until
+ * mm_drop_all_locks() returns.
+ */
+void mm_drop_all_locks(struct mm_struct *mm)
+{
+        struct vm_area_struct *vma;
+        BUG_ON(down_read_trylock(&mm->mmap_sem));
+        BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
+        for (vma = mm->mmap; vma; vma = vma->vm_next) {
+                if (vma->anon_vma)
+                        vm_unlock_anon_vma(vma->anon_vma);
+                if (vma->vm_file && vma->vm_file->f_mapping)
+                        vm_unlock_mapping(vma->vm_file->f_mapping);
+        }
+        mutex_unlock(&mm_all_locks_mutex);
+}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
index 000000000000..5f4ef0250bee
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,277 @@
+/*
+ *  linux/mm/mmu_notifier.c
+ *
+ *  Copyright (C) 2008  Qumranet, Inc.
+ *  Copyright (C) 2008  SGI
+ *             Christoph Lameter <clameter@sgi.com>
+ *
+ *  This work is licensed under the terms of the GNU GPL, version 2. See
+ *  the COPYING file in the top-level directory.
+ */
+#include <linux/rculist.h>
+#include <linux/mmu_notifier.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+/*
+ * This function can't run concurrently against mmu_notifier_register
+ * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
+ * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
+ * in parallel despite there being no task using this mm any more,
+ * through the vmas outside of the exit_mmap context, such as with
+ * vmtruncate. This serializes against mmu_notifier_unregister with
+ * the mmu_notifier_mm->lock in addition to RCU and it serializes
+ * against the other mmu notifiers with RCU. struct mmu_notifier_mm
+ * can't go away from under us as exit_mmap holds an mm_count pin
+ * itself.
+ */
+void __mmu_notifier_release(struct mm_struct *mm)
+{
+        struct mmu_notifier *mn;
+        spin_lock(&mm->mmu_notifier_mm->lock);
+        while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
+                mn = hlist_entry(mm->mmu_notifier_mm->list.first,
+                                 struct mmu_notifier,
+                                 hlist);
+                /*
+                 * We arrived before mmu_notifier_unregister so
+                 * mmu_notifier_unregister will do nothing other than
+                 * to wait ->release to finish and
+                 * mmu_notifier_unregister to return.
+                 */
+                hlist_del_init_rcu(&mn->hlist);
+                /*
+                 * RCU here will block mmu_notifier_unregister until
+                 * ->release returns.
+                 */
+                rcu_read_lock();
+                spin_unlock(&mm->mmu_notifier_mm->lock);
+                /*
+                 * if ->release runs before mmu_notifier_unregister it
+                 * must be handled as it's the only way for the driver
+                 * to flush all existing sptes and stop the driver
+                 * from establishing any more sptes before all the
+                 * pages in the mm are freed.
+                 */
+                if (mn->ops->release)
+                        mn->ops->release(mn, mm);
+                rcu_read_unlock();
+                spin_lock(&mm->mmu_notifier_mm->lock);
+        }
+        spin_unlock(&mm->mmu_notifier_mm->lock);
+        /*
+         * synchronize_rcu here prevents mmu_notifier_release to
+         * return to exit_mmap (which would proceed freeing all pages
+         * in the mm) until the ->release method returns, if it was
+         * invoked by mmu_notifier_unregister.
+         *
+         * The mmu_notifier_mm can't go away from under us because one
+         * mm_count is hold by exit_mmap.
+         */
+        synchronize_rcu();
+}
+/*
+ * If no young bitflag is supported by the hardware, ->clear_flush_young can
+ * unmap the address and return 1 or 0 depending if the mapping previously
+ * existed or not.
+ */
+int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
+                                        unsigned long address)
+{
+        struct mmu_notifier *mn;
+        struct hlist_node *n;
+        int young = 0;
+        rcu_read_lock();
+        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+                if (mn->ops->clear_flush_young)
+                        young |= mn->ops->clear_flush_young(mn, mm, address);
+        }
+        rcu_read_unlock();
+        return young;
+}
+void __mmu_notifier_invalidate_page(struct mm_struct *mm,
+                                          unsigned long address)
+{
+        struct mmu_notifier *mn;
+        struct hlist_node *n;
+        rcu_read_lock();
+        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+                if (mn->ops->invalidate_page)
+                        mn->ops->invalidate_page(mn, mm, address);
+        }
+        rcu_read_unlock();
+}
+void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+                                  unsigned long start, unsigned long end)
+{
+        struct mmu_notifier *mn;
+        struct hlist_node *n;
+        rcu_read_lock();
+        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+                if (mn->ops->invalidate_range_start)
+                        mn->ops->invalidate_range_start(mn, mm, start, end);
+        }
+        rcu_read_unlock();
+}
+void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+                                  unsigned long start, unsigned long end)
+{
+        struct mmu_notifier *mn;
+        struct hlist_node *n;
+        rcu_read_lock();
+        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+                if (mn->ops->invalidate_range_end)
+                        mn->ops->invalidate_range_end(mn, mm, start, end);
+        }
+        rcu_read_unlock();
+}
+static int do_mmu_notifier_register(struct mmu_notifier *mn,
+                                    struct mm_struct *mm,
+                                    int take_mmap_sem)
+{
+        struct mmu_notifier_mm *mmu_notifier_mm;
+        int ret;
+        BUG_ON(atomic_read(&mm->mm_users) <= 0);
+        ret = -ENOMEM;
+        mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
+        if (unlikely(!mmu_notifier_mm))
+                goto out;
+        if (take_mmap_sem)
+                down_write(&mm->mmap_sem);
+        ret = mm_take_all_locks(mm);
+        if (unlikely(ret))
+                goto out_cleanup;
+        if (!mm_has_notifiers(mm)) {
+                INIT_HLIST_HEAD(&mmu_notifier_mm->list);
+                spin_lock_init(&mmu_notifier_mm->lock);
+                mm->mmu_notifier_mm = mmu_notifier_mm;
+                mmu_notifier_mm = NULL;
+        }
+        atomic_inc(&mm->mm_count);
+        /*
+         * Serialize the update against mmu_notifier_unregister. A
+         * side note: mmu_notifier_release can't run concurrently with
+         * us because we hold the mm_users pin (either implicitly as
+         * current->mm or explicitly with get_task_mm() or similar).
+         * We can't race against any other mmu notifier method either
+         * thanks to mm_take_all_locks().
+         */
+        spin_lock(&mm->mmu_notifier_mm->lock);
+        hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
+        spin_unlock(&mm->mmu_notifier_mm->lock);
+        mm_drop_all_locks(mm);
+out_cleanup:
+        if (take_mmap_sem)
+                up_write(&mm->mmap_sem);
+        /* kfree() does nothing if mmu_notifier_mm is NULL */
+        kfree(mmu_notifier_mm);
+out:
+        BUG_ON(atomic_read(&mm->mm_users) <= 0);
+        return ret;
+}
+/*
+ * Must not hold mmap_sem nor any other VM related lock when calling
+ * this registration function. Must also ensure mm_users can't go down
+ * to zero while this runs to avoid races with mmu_notifier_release,
+ * so mm has to be current->mm or the mm should be pinned safely such
+ * as with get_task_mm(). If the mm is not current->mm, the mm_users
+ * pin should be released by calling mmput after mmu_notifier_register
+ * returns. mmu_notifier_unregister must be always called to
+ * unregister the notifier. mm_count is automatically pinned to allow
+ * mmu_notifier_unregister to safely run at any time later, before or
+ * after exit_mmap. ->release will always be called before exit_mmap
+ * frees the pages.
+ */
+int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+        return do_mmu_notifier_register(mn, mm, 1);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_register);
+/*
+ * Same as mmu_notifier_register but here the caller must hold the
+ * mmap_sem in write mode.
+ */
+int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+        return do_mmu_notifier_register(mn, mm, 0);
+}
+EXPORT_SYMBOL_GPL(__mmu_notifier_register);
+/* this is called after the last mmu_notifier_unregister() returned */
+void __mmu_notifier_mm_destroy(struct mm_struct *mm)
+{
+        BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list));
+        kfree(mm->mmu_notifier_mm);
+        mm->mmu_notifier_mm = LIST_POISON1; /* debug */
+}
+/*
+ * This releases the mm_count pin automatically and frees the mm
+ * structure if it was the last user of it. It serializes against
+ * running mmu notifiers with RCU and against mmu_notifier_unregister
+ * with the unregister lock + RCU. All sptes must be dropped before
+ * calling mmu_notifier_unregister. ->release or any other notifier
+ * method may be invoked concurrently with mmu_notifier_unregister,
+ * and only after mmu_notifier_unregister returned we're guaranteed
+ * that ->release or any other method can't run anymore.
+ */
+void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+        BUG_ON(atomic_read(&mm->mm_count) <= 0);
+        spin_lock(&mm->mmu_notifier_mm->lock);
+        if (!hlist_unhashed(&mn->hlist)) {
+                hlist_del_rcu(&mn->hlist);
+                /*
+                 * RCU here will force exit_mmap to wait ->release to finish
+                 * before freeing the pages.
+                 */
+                rcu_read_lock();
+                spin_unlock(&mm->mmu_notifier_mm->lock);
+                /*
+                 * exit_mmap will block in mmu_notifier_release to
+                 * guarantee ->release is called before freeing the
+                 * pages.
+                 */
+                if (mn->ops->release)
+                        mn->ops->release(mn, mm);
+                rcu_read_unlock();
+        } else
+                spin_unlock(&mm->mmu_notifier_mm->lock);
+        /*
+         * Wait any running method to finish, of course including
+         * ->release if it was run by mmu_notifier_relase instead of us.
+         */
+        synchronize_rcu();
+        BUG_ON(atomic_read(&mm->mm_count) <= 0);
+        mmdrop(mm);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index abd645a3b0a0..fded06f923f4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -21,6 +21,7 @@
 #include <linux/syscalls.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -203,10 +204,12 @@ success:
                dirty_accountable = 1;
        }
+        mmu_notifier_invalidate_range_start(mm, start, end);
        if (is_vm_hugetlb_page(vma))
                hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
        else
                change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
+        mmu_notifier_invalidate_range_end(mm, start, end);
        vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
        vm_stat_account(mm, newflags, vma->vm_file, nrpages);
        return 0;
diff --git a/mm/mremap.c b/mm/mremap.c
index 08e3c7f2bd15..1a7743923c8c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -18,6 +18,7 @@
 #include <linux/highmem.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -74,7 +75,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
        struct mm_struct *mm = vma->vm_mm;
        pte_t *old_pte, *new_pte, pte;
        spinlock_t *old_ptl, *new_ptl;
+        unsigned long old_start;
+        old_start = old_addr;
+        mmu_notifier_invalidate_range_start(vma->vm_mm,
+                                            old_start, old_end);
        if (vma->vm_file) {
                /*
                 * Subtle point from Rajesh Venkatasubramanian: before
@@ -116,6 +121,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
        pte_unmap_unlock(old_pte - 1, old_ptl);
        if (mapping)
                spin_unlock(&mapping->i_mmap_lock);
+        mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
 }
 #define LATENCY_LIMIT   (64 * PAGE_SIZE)
diff --git a/mm/nommu.c b/mm/nommu.c
index 4462b6a3fcb9..5edccd9c9218 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,7 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/ptrace.h>
+#include <linux/tracehook.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/mount.h>
@@ -745,7 +745,7 @@ static unsigned long determine_vm_flags(struct file *file,
         * it's being traced - otherwise breakpoints set in it may interfere
         * with another untraced process
         */
-        if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED))
+        if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
                vm_flags &= ~VM_MAYSHARE;
        return vm_flags;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 94c6d8988ab3..24de8b65fdbd 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1088,7 +1088,7 @@ int __set_page_dirty_nobuffers(struct page *page)
                if (!mapping)
                        return 1;
-                write_lock_irq(&mapping->tree_lock);
+                spin_lock_irq(&mapping->tree_lock);
                mapping2 = page_mapping(page);
                if (mapping2) { /* Race with truncate? */
                        BUG_ON(mapping2 != mapping);
@@ -1102,7 +1102,7 @@ int __set_page_dirty_nobuffers(struct page *page)
                        radix_tree_tag_set(&mapping->page_tree,
                                page_index(page), PAGECACHE_TAG_DIRTY);
                }
-                write_unlock_irq(&mapping->tree_lock);
+                spin_unlock_irq(&mapping->tree_lock);
                if (mapping->host) {
                        /* !PageAnon && !swapper_space */
                        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1258,7 +1258,7 @@ int test_clear_page_writeback(struct page *page)
                struct backing_dev_info *bdi = mapping->backing_dev_info;
                unsigned long flags;
-                write_lock_irqsave(&mapping->tree_lock, flags);
+                spin_lock_irqsave(&mapping->tree_lock, flags);
                ret = TestClearPageWriteback(page);
                if (ret) {
                        radix_tree_tag_clear(&mapping->page_tree,
@@ -1269,7 +1269,7 @@ int test_clear_page_writeback(struct page *page)
                                __bdi_writeout_inc(bdi);
                        }
                }
-                write_unlock_irqrestore(&mapping->tree_lock, flags);
+                spin_unlock_irqrestore(&mapping->tree_lock, flags);
        } else {
                ret = TestClearPageWriteback(page);
        }
@@ -1287,7 +1287,7 @@ int test_set_page_writeback(struct page *page)
                struct backing_dev_info *bdi = mapping->backing_dev_info;
                unsigned long flags;
-                write_lock_irqsave(&mapping->tree_lock, flags);
+                spin_lock_irqsave(&mapping->tree_lock, flags);
                ret = TestSetPageWriteback(page);
                if (!ret) {
                        radix_tree_tag_set(&mapping->page_tree,
@@ -1300,7 +1300,7 @@ int test_set_page_writeback(struct page *page)
                        radix_tree_tag_clear(&mapping->page_tree,
                                                page_index(page),
                                                PAGECACHE_TAG_DIRTY);
-                write_unlock_irqrestore(&mapping->tree_lock, flags);
+                spin_unlock_irqrestore(&mapping->tree_lock, flags);
        } else {
                ret = TestSetPageWriteback(page);
        }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6da667274df5..3cf3d05b6bd4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2372,7 +2372,7 @@ static void build_zonelist_cache(pg_data_t *pgdat)
 #endif  /* CONFIG_NUMA */
-/* return values int ....just for stop_machine_run() */
+/* return values int ....just for stop_machine() */
 static int __build_all_zonelists(void *dummy)
 {
        int nid;
@@ -2397,7 +2397,7 @@ void build_all_zonelists(void)
        } else {
                /* we have to stop all cpus to guarantee there is no user
                   of zonelist */
-                stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
+                stop_machine(__build_all_zonelists, NULL, NULL);
                /* cpuset refresh routine should be here */
        }
        vm_total_pages = nr_free_pagecache_pages();
diff --git a/mm/readahead.c b/mm/readahead.c
index d8723a5f6496..77e8ddf945e9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -382,9 +382,9 @@ ondemand_readahead(struct address_space *mapping,
        if (hit_readahead_marker) {
                pgoff_t start;
-                read_lock_irq(&mapping->tree_lock);
+                rcu_read_lock();
-                start = radix_tree_next_hole(&mapping->page_tree, offset, max+1);
+                start = radix_tree_next_hole(&mapping->page_tree, offset,max+1);
-                read_unlock_irq(&mapping->tree_lock);
+                rcu_read_unlock();
                if (!start || start - offset > max)
                        return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index abbd29f7c43f..99bc3f9cd796 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,6 +49,7 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
 #include <asm/tlbflush.h>
@@ -138,7 +139,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
                anon_vma_free(anon_vma);
 }
-static void anon_vma_ctor(struct kmem_cache *cachep, void *data)
+static void anon_vma_ctor(void *data)
 {
        struct anon_vma *anon_vma = data;
@@ -287,7 +288,7 @@ static int page_referenced_one(struct page *page,
        if (vma->vm_flags & VM_LOCKED) {
                referenced++;
                *mapcount = 1;  /* break early from loop */
-        } else if (ptep_clear_flush_young(vma, address, pte))
+        } else if (ptep_clear_flush_young_notify(vma, address, pte))
                referenced++;
        /* Pretend the page is referenced if the task has the
@@ -457,7 +458,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
                pte_t entry;
                flush_cache_page(vma, address, pte_pfn(*pte));
-                entry = ptep_clear_flush(vma, address, pte);
+                entry = ptep_clear_flush_notify(vma, address, pte);
                entry = pte_wrprotect(entry);
                entry = pte_mkclean(entry);
                set_pte_at(mm, address, pte, entry);
@@ -705,14 +706,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
         * skipped over this mm) then we should reactivate it.
         */
        if (!migration && ((vma->vm_flags & VM_LOCKED) ||
-                        (ptep_clear_flush_young(vma, address, pte)))) {
+                        (ptep_clear_flush_young_notify(vma, address, pte)))) {
                ret = SWAP_FAIL;
                goto out_unmap;
        }
        /* Nuke the page table entry. */
        flush_cache_page(vma, address, page_to_pfn(page));
-        pteval = ptep_clear_flush(vma, address, pte);
+        pteval = ptep_clear_flush_notify(vma, address, pte);
        /* Move the dirty bit to the physical page now the pte is gone. */
        if (pte_dirty(pteval))
@@ -837,12 +838,12 @@ static void try_to_unmap_cluster(unsigned long cursor,
                page = vm_normal_page(vma, address, *pte);
                BUG_ON(!page || PageAnon(page));
-                if (ptep_clear_flush_young(vma, address, pte))
+                if (ptep_clear_flush_young_notify(vma, address, pte))
                        continue;
                /* Nuke the page table entry. */
                flush_cache_page(vma, address, pte_pfn(*pte));
-                pteval = ptep_clear_flush(vma, address, pte);
+                pteval = ptep_clear_flush_notify(vma, address, pte);
                /* If nonlinear, store the file page offset in the pte. */
                if (page->index != linear_page_index(vma, address))
diff --git a/mm/shmem.c b/mm/shmem.c
index f92fea94d037..c1e5a3b4f758 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -936,7 +936,7 @@ found:
        spin_lock(&info->lock);
        ptr = shmem_swp_entry(info, idx, NULL);
        if (ptr && ptr->val == entry.val) {
-                error = add_to_page_cache(page, inode->i_mapping,
+                error = add_to_page_cache_locked(page, inode->i_mapping,
                                                idx, GFP_NOWAIT);
                /* does mem_cgroup_uncharge_cache_page on error */
        } else  /* we must compensate for our precharge above */
@@ -1301,8 +1301,8 @@ repeat:
                        SetPageUptodate(filepage);
                        set_page_dirty(filepage);
                        swap_free(swap);
-                } else if (!(error = add_to_page_cache(
+                } else if (!(error = add_to_page_cache_locked(swappage, mapping,
-                                swappage, mapping, idx, GFP_NOWAIT))) {
+                                        idx, GFP_NOWAIT))) {
                        info->flags |= SHMEM_PAGEIN;
                        shmem_swp_set(info, entry, 0);
                        shmem_swp_unmap(entry);
@@ -1513,7 +1513,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
                inode->i_uid = current->fsuid;
                inode->i_gid = current->fsgid;
                inode->i_blocks = 0;
-                inode->i_mapping->a_ops = &shmem_aops;
                inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                inode->i_generation = get_seconds();
@@ -1528,6 +1527,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
                        init_special_inode(inode, mode, dev);
                        break;
                case S_IFREG:
+                        inode->i_mapping->a_ops = &shmem_aops;
                        inode->i_op = &shmem_inode_operations;
                        inode->i_fop = &shmem_file_operations;
                        mpol_shared_policy_init(&info->policy,
@@ -1929,6 +1929,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
                        return error;
                }
                unlock_page(page);
+                inode->i_mapping->a_ops = &shmem_aops;
                inode->i_op = &shmem_symlink_inode_operations;
                kaddr = kmap_atomic(page, KM_USER0);
                memcpy(kaddr, symname, len);
@@ -2352,7 +2353,7 @@ static void shmem_destroy_inode(struct inode *inode)
        kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
        struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index f5664c5b9eb1..8e5aadd7dcd6 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -191,7 +191,7 @@ shmem_check_acl(struct inode *inode, int mask)
 * shmem_permission  -  permission() inode operation
 */
 int
-shmem_permission(struct inode *inode, int mask, struct nameidata *nd)
+shmem_permission(struct inode *inode, int mask)
 {
        return generic_permission(inode, mask, shmem_check_acl);
 }
diff --git a/mm/slab.c b/mm/slab.c
index 052e7d64537e..918f04f7fef1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -406,7 +406,7 @@ struct kmem_cache {
        unsigned int dflags;            /* dynamic flags */
        /* constructor func */
-        void (*ctor)(struct kmem_cache *, void *);
+        void (*ctor)(void *obj);
 /* 5) cache creation/removal */
        const char *name;
@@ -2137,8 +2137,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
 */
 struct kmem_cache *
 kmem_cache_create (const char *name, size_t size, size_t align,
-        unsigned long flags,
+        unsigned long flags, void (*ctor)(void *))
-        void (*ctor)(struct kmem_cache *, void *))
 {
        size_t left_over, slab_size, ralign;
        struct kmem_cache *cachep = NULL, *pc;
@@ -2653,7 +2652,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
                 * They must also be threaded.
                 */
                if (cachep->ctor && !(cachep->flags & SLAB_POISON))
-                        cachep->ctor(cachep, objp + obj_offset(cachep));
+                        cachep->ctor(objp + obj_offset(cachep));
                if (cachep->flags & SLAB_RED_ZONE) {
                        if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2669,7 +2668,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
                                         cachep->buffer_size / PAGE_SIZE, 0);
 #else
                if (cachep->ctor)
-                        cachep->ctor(cachep, objp);
+                        cachep->ctor(objp);
 #endif
                slab_bufctl(slabp)[i] = i + 1;
        }
@@ -3093,7 +3092,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 #endif
        objp += obj_offset(cachep);
        if (cachep->ctor && cachep->flags & SLAB_POISON)
-                cachep->ctor(cachep, objp);
+                cachep->ctor(objp);
 #if ARCH_SLAB_MINALIGN
        if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
                printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
diff --git a/mm/slob.c b/mm/slob.c
index de268eb7ac70..d8fbd4d1bfa7 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -525,12 +525,11 @@ struct kmem_cache {
        unsigned int size, align;
        unsigned long flags;
        const char *name;
-        void (*ctor)(struct kmem_cache *, void *);
+        void (*ctor)(void *);
 };
 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
-        size_t align, unsigned long flags,
+        size_t align, unsigned long flags, void (*ctor)(void *))
-        void (*ctor)(struct kmem_cache *, void *))
 {
        struct kmem_cache *c;
@@ -575,7 +574,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
                b = slob_new_page(flags, get_order(c->size), node);
        if (c->ctor)
-                c->ctor(c, b);
+                c->ctor(b);
        return b;
 }
diff --git a/mm/slub.c b/mm/slub.c
index 77c21cf53ff9..b7e2cd5d82db 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1012,7 +1012,7 @@ __setup("slub_debug", setup_slub_debug);
 static unsigned long kmem_cache_flags(unsigned long objsize,
        unsigned long flags, const char *name,
-        void (*ctor)(struct kmem_cache *, void *))
+        void (*ctor)(void *))
 {
        /*
         * Enable debugging if selected on the kernel commandline.
@@ -1040,7 +1040,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
 static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
 static inline unsigned long kmem_cache_flags(unsigned long objsize,
        unsigned long flags, const char *name,
-        void (*ctor)(struct kmem_cache *, void *))
+        void (*ctor)(void *))
 {
        return flags;
 }
@@ -1103,7 +1103,7 @@ static void setup_object(struct kmem_cache *s, struct page *page,
 {
        setup_object_debug(s, page, object);
        if (unlikely(s->ctor))
-                s->ctor(s, object);
+                s->ctor(object);
 }
 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -2286,7 +2286,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
                const char *name, size_t size,
                size_t align, unsigned long flags,
-                void (*ctor)(struct kmem_cache *, void *))
+                void (*ctor)(void *))
 {
        memset(s, 0, kmem_size);
        s->name = name;
@@ -3042,7 +3042,7 @@ static int slab_unmergeable(struct kmem_cache *s)
 static struct kmem_cache *find_mergeable(size_t size,
                size_t align, unsigned long flags, const char *name,
-                void (*ctor)(struct kmem_cache *, void *))
+                void (*ctor)(void *))
 {
        struct kmem_cache *s;
@@ -3082,8 +3082,7 @@ static struct kmem_cache *find_mergeable(size_t size,
 }
 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
-                size_t align, unsigned long flags,
+                size_t align, unsigned long flags, void (*ctor)(void *))
-                void (*ctor)(struct kmem_cache *, void *))
 {
        struct kmem_cache *s;
diff --git a/mm/sparse.c b/mm/sparse.c
index 8ffc08990008..5d9dbbb9d39e 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -377,7 +377,7 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
 }
 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
-struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
+static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
 {
        struct page *map;
        struct mem_section *ms = __nr_to_section(pnum);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d8aadaf2a0ba..b8035b055129 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -39,7 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = {
 struct address_space swapper_space = {
        .page_tree      = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
-        .tree_lock      = __RW_LOCK_UNLOCKED(swapper_space.tree_lock),
+        .tree_lock      = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
        .a_ops          = &swap_aops,
        .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
        .backing_dev_info = &swap_backing_dev_info,
@@ -56,7 +56,8 @@ static struct {
 void show_swap_cache_info(void)
 {
-        printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n",
+        printk("%lu pages in swap cache\n", total_swapcache_pages);
+        printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
                swap_cache_info.add_total, swap_cache_info.del_total,
                swap_cache_info.find_success, swap_cache_info.find_total);
        printk("Free swap  = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
@@ -64,7 +65,7 @@ void show_swap_cache_info(void)
 }
 /*
- * add_to_swap_cache resembles add_to_page_cache on swapper_space,
+ * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
 int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
@@ -76,19 +77,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
        BUG_ON(PagePrivate(page));
        error = radix_tree_preload(gfp_mask);
        if (!error) {
-                write_lock_irq(&swapper_space.tree_lock);
+                page_cache_get(page);
+                SetPageSwapCache(page);
+                set_page_private(page, entry.val);
+                spin_lock_irq(&swapper_space.tree_lock);
                error = radix_tree_insert(&swapper_space.page_tree,
                                                entry.val, page);
-                if (!error) {
+                if (likely(!error)) {
-                        page_cache_get(page);
-                        SetPageSwapCache(page);
-                        set_page_private(page, entry.val);
                        total_swapcache_pages++;
                        __inc_zone_page_state(page, NR_FILE_PAGES);
                        INC_CACHE_INFO(add_total);
                }
-                write_unlock_irq(&swapper_space.tree_lock);
+                spin_unlock_irq(&swapper_space.tree_lock);
                radix_tree_preload_end();
+                if (unlikely(error)) {
+                        set_page_private(page, 0UL);
+                        ClearPageSwapCache(page);
+                        page_cache_release(page);
+                }
        }
        return error;
 }
@@ -175,9 +183,9 @@ void delete_from_swap_cache(struct page *page)
        entry.val = page_private(page);
-        write_lock_irq(&swapper_space.tree_lock);
+        spin_lock_irq(&swapper_space.tree_lock);
        __delete_from_swap_cache(page);
-        write_unlock_irq(&swapper_space.tree_lock);
+        spin_unlock_irq(&swapper_space.tree_lock);
        swap_free(entry);
        page_cache_release(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2f33edb8bee9..6beb6251e99d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,8 +33,8 @@
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
-DEFINE_SPINLOCK(swap_lock);
+static DEFINE_SPINLOCK(swap_lock);
-unsigned int nr_swapfiles;
+static unsigned int nr_swapfiles;
 long total_swap_pages;
 static int swap_overflow;
 static int least_priority;
@@ -44,7 +44,7 @@ static const char Unused_file[] = "Unused swap file entry ";
 static const char Bad_offset[] = "Bad swap offset entry ";
 static const char Unused_offset[] = "Unused swap offset entry ";
-struct swap_list_t swap_list = {-1, -1};
+static struct swap_list_t swap_list = {-1, -1};
 static struct swap_info_struct swap_info[MAX_SWAPFILES];
@@ -369,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page)
        retval = 0;
        if (p->swap_map[swp_offset(entry)] == 1) {
                /* Recheck the page count with the swapcache lock held.. */
-                write_lock_irq(&swapper_space.tree_lock);
+                spin_lock_irq(&swapper_space.tree_lock);
                if ((page_count(page) == 2) && !PageWriteback(page)) {
                        __delete_from_swap_cache(page);
                        SetPageDirty(page);
                        retval = 1;
                }
-                write_unlock_irq(&swapper_space.tree_lock);
+                spin_unlock_irq(&swapper_space.tree_lock);
        }
        spin_unlock(&swap_lock);
diff --git a/mm/truncate.c b/mm/truncate.c
index b8961cb63414..e68443d74567 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -349,18 +349,18 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
        if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
                return 0;
-        write_lock_irq(&mapping->tree_lock);
+        spin_lock_irq(&mapping->tree_lock);
        if (PageDirty(page))
                goto failed;
        BUG_ON(PagePrivate(page));
        __remove_from_page_cache(page);
-        write_unlock_irq(&mapping->tree_lock);
+        spin_unlock_irq(&mapping->tree_lock);
        ClearPageUptodate(page);
        page_cache_release(page);       /* pagecache ref */
        return 1;
 failed:
-        write_unlock_irq(&mapping->tree_lock);
+        spin_unlock_irq(&mapping->tree_lock);
        return 0;
 }
diff --git a/mm/util.c b/mm/util.c
index 8f18683825bc..9341ca77bd88 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,7 +1,9 @@
+#include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/module.h>
 #include <linux/err.h>
+#include <linux/sched.h>
 #include <asm/uaccess.h>
 /**
@@ -68,25 +70,22 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
 EXPORT_SYMBOL(kmemdup);
 /**
- * krealloc - reallocate memory. The contents will remain unchanged.
+ * __krealloc - like krealloc() but don't free @p.
 * @p: object to reallocate memory for.
 * @new_size: how many bytes of memory are required.
 * @flags: the type of memory to allocate.
 *
- * The contents of the object pointed to are preserved up to the
+ * This function is like krealloc() except it never frees the originally
- * lesser of the new and old sizes.  If @p is %NULL, krealloc()
+ * allocated buffer. Use this if you don't want to free the buffer immediately
- * behaves exactly like kmalloc().  If @size is 0 and @p is not a
+ * like, for example, with RCU.
- * %NULL pointer, the object pointed to is freed.
 */
-void *krealloc(const void *p, size_t new_size, gfp_t flags)
+void *__krealloc(const void *p, size_t new_size, gfp_t flags)
 {
        void *ret;
        size_t ks = 0;
-        if (unlikely(!new_size)) {
+        if (unlikely(!new_size))
-                kfree(p);
                return ZERO_SIZE_PTR;
-        }
        if (p)
                ks = ksize(p);
@@ -95,10 +94,37 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
                return (void *)p;
        ret = kmalloc_track_caller(new_size, flags);
-        if (ret && p) {
+        if (ret && p)
                memcpy(ret, p, ks);
+        return ret;
+}
+EXPORT_SYMBOL(__krealloc);
+/**
+ * krealloc - reallocate memory. The contents will remain unchanged.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * The contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes.  If @p is %NULL, krealloc()
+ * behaves exactly like kmalloc().  If @size is 0 and @p is not a
+ * %NULL pointer, the object pointed to is freed.
+ */
+void *krealloc(const void *p, size_t new_size, gfp_t flags)
+{
+        void *ret;
+        if (unlikely(!new_size)) {
                kfree(p);
+                return ZERO_SIZE_PTR;
        }
+        ret = __krealloc(p, new_size, flags);
+        if (ret && p != ret)
+                kfree(p);
        return ret;
 }
 EXPORT_SYMBOL(krealloc);
@@ -136,3 +162,12 @@ char *strndup_user(const char __user *s, long n)
        return p;
 }
 EXPORT_SYMBOL(strndup_user);
+#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+        mm->mmap_base = TASK_UNMAPPED_BASE;
+        mm->get_unmapped_area = arch_get_unmapped_area;
+        mm->unmap_area = arch_unmap_area;
+}
+#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 35f293816294..85b9a0d2c877 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -381,16 +381,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
                return;
        if ((PAGE_SIZE-1) & (unsigned long)addr) {
-                printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
+                WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
-                WARN_ON(1);
                return;
        }
        area = remove_vm_area(addr);
        if (unlikely(!area)) {
-                printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
+                WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
                                addr);
-                WARN_ON(1);
                return;
        }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 26672c6cd3ce..8f71761bc4b7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -391,17 +391,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 }
 /*
- * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
+ * Same as remove_mapping, but if the page is removed from the mapping, it
- * someone else has a ref on the page, abort and return 0.  If it was
+ * gets returned with a refcount of 0.
- * successfully detached, return 1.  Assumes the caller has a single ref on
- * this page.
 */
-int remove_mapping(struct address_space *mapping, struct page *page)
+static int __remove_mapping(struct address_space *mapping, struct page *page)
 {
        BUG_ON(!PageLocked(page));
        BUG_ON(mapping != page_mapping(page));
-        write_lock_irq(&mapping->tree_lock);
+        spin_lock_irq(&mapping->tree_lock);
        /*
         * The non racy check for a busy page.
         *
@@ -427,28 +425,48 @@ int remove_mapping(struct address_space *mapping, struct page *page)
         * Note that if SetPageDirty is always performed via set_page_dirty,
         * and thus under tree_lock, then this ordering is not required.
         */
-        if (unlikely(page_count(page) != 2))
+        if (!page_freeze_refs(page, 2))
                goto cannot_free;
-        smp_rmb();
+        /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
-        if (unlikely(PageDirty(page)))
+        if (unlikely(PageDirty(page))) {
+                page_unfreeze_refs(page, 2);
                goto cannot_free;
+        }
        if (PageSwapCache(page)) {
                swp_entry_t swap = { .val = page_private(page) };
                __delete_from_swap_cache(page);
-                write_unlock_irq(&mapping->tree_lock);
+                spin_unlock_irq(&mapping->tree_lock);
                swap_free(swap);
-                __put_page(page);       /* The pagecache ref */
+        } else {
-                return 1;
+                __remove_from_page_cache(page);
+                spin_unlock_irq(&mapping->tree_lock);
        }
-        __remove_from_page_cache(page);
-        write_unlock_irq(&mapping->tree_lock);
-        __put_page(page);
        return 1;
 cannot_free:
-        write_unlock_irq(&mapping->tree_lock);
+        spin_unlock_irq(&mapping->tree_lock);
+        return 0;
+}
+/*
+ * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
+ * someone else has a ref on the page, abort and return 0.  If it was
+ * successfully detached, return 1.  Assumes the caller has a single ref on
+ * this page.
+ */
+int remove_mapping(struct address_space *mapping, struct page *page)
+{
+        if (__remove_mapping(mapping, page)) {
+                /*
+                 * Unfreezing the refcount with 1 rather than 2 effectively
+                 * drops the pagecache ref for us without requiring another
+                 * atomic operation.
+                 */
+                page_unfreeze_refs(page, 1);
+                return 1;
+        }
        return 0;
 }
@@ -598,18 +616,34 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                if (PagePrivate(page)) {
                        if (!try_to_release_page(page, sc->gfp_mask))
                                goto activate_locked;
-                        if (!mapping && page_count(page) == 1)
+                        if (!mapping && page_count(page) == 1) {
-                                goto free_it;
+                                unlock_page(page);
+                                if (put_page_testzero(page))
+                                        goto free_it;
+                                else {
+                                        /*
+                                         * rare race with speculative reference.
+                                         * the speculative reference will free
+                                         * this page shortly, so we may
+                                         * increment nr_reclaimed here (and
+                                         * leave it off the LRU).
+                                         */
+                                        nr_reclaimed++;
+                                        continue;
+                                }
+                        }
                }
-                if (!mapping || !remove_mapping(mapping, page))
+                if (!mapping || !__remove_mapping(mapping, page))
                        goto keep_locked;
-free_it:
                unlock_page(page);
+free_it:
                nr_reclaimed++;
-                if (!pagevec_add(&freed_pvec, page))
+                if (!pagevec_add(&freed_pvec, page)) {
-                        __pagevec_release_nonlru(&freed_pvec);
+                        __pagevec_free(&freed_pvec);
+                        pagevec_reinit(&freed_pvec);
+                }
                continue;
 activate_locked:
@@ -623,7 +657,7 @@ keep:
        }
        list_splice(&ret_pages, page_list);
        if (pagevec_count(&freed_pvec))
-                __pagevec_release_nonlru(&freed_pvec);
+                __pagevec_free(&freed_pvec);
        count_vm_events(PGACTIVATE, pgactivate);
        return nr_reclaimed;
 }
author	Ingo Molnar <mingo@elte.hu>	2008-07-30 13:33:48 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-07-30 13:33:48 -0400
commit	15dd859cacf312f606f54502d1f66537a1e5c78c (patch)
tree	e50e125eaa6da83fa715704e53c1bde013d1ef8e /mm
parent	b2d9d33412b9d13a40cd314d93ab517950fc5950 (diff)
parent	6e86841d05f371b5b9b86ce76c02aaee83352298 (diff)