Diffstat (limited to 'mm')
-rw-r--r--   mm/Kconfig             5
-rw-r--r--   mm/backing-dev.c      11
-rw-r--r--   mm/bootmem.c          24
-rw-r--r--   mm/filemap.c          51
-rw-r--r--   mm/highmem.c          17
-rw-r--r--   mm/kmemleak.c          4
-rw-r--r--   mm/ksm.c               1
-rw-r--r--   mm/memcontrol.c        6
-rw-r--r--   mm/memory-failure.c    2
-rw-r--r--   mm/memory_hotplug.c   24
-rw-r--r--   mm/migrate.c           2
-rw-r--r--   mm/mmap.c             46
-rw-r--r--   mm/mremap.c          241
-rw-r--r--   mm/page-writeback.c   12
-rw-r--r--   mm/page_alloc.c        4
-rw-r--r--   mm/percpu.c          121
-rw-r--r--   mm/slab.c            118
-rw-r--r--   mm/slub.c             18
-rw-r--r--   mm/truncate.c          2
-rw-r--r--   mm/util.c             44
20 files changed, 468 insertions, 285 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index fd3386242cf0..44cf6f0a3a6d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -128,12 +128,9 @@ config SPARSEMEM_VMEMMAP
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
 	depends on SPARSEMEM || X86_64_ACPI_NUMA
-	depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG
+	depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
 	depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
 
-comment "Memory hotplug is currently incompatible with Software Suspend"
-	depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390
-
 config MEMORY_HOTPLUG_SPARSE
 	def_bool y
 	depends on SPARSEMEM && MEMORY_HOTPLUG
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1065b715ef64..0e8ca0347707 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -604,10 +604,14 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 
 	/*
 	 * Finally, kill the kernel threads. We don't need to be RCU
-	 * safe anymore, since the bdi is gone from visibility.
+	 * safe anymore, since the bdi is gone from visibility. Force
+	 * unfreeze of the thread before calling kthread_stop(), otherwise
+	 * it would never exet if it is currently stuck in the refrigerator.
 	 */
-	list_for_each_entry(wb, &bdi->wb_list, list)
+	list_for_each_entry(wb, &bdi->wb_list, list) {
+		thaw_process(wb->task);
 		kthread_stop(wb->task);
+	}
 }
 
 /*
@@ -628,6 +632,8 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
 void bdi_unregister(struct backing_dev_info *bdi)
 {
 	if (bdi->dev) {
+		bdi_prune_sb(bdi);
+
 		if (!bdi_cap_flush_forker(bdi))
 			bdi_wb_shutdown(bdi);
 		bdi_debug_unregister(bdi);
@@ -697,7 +703,6 @@ void bdi_destroy(struct backing_dev_info *bdi)
 		spin_unlock(&inode_lock);
 	}
 
-	bdi_prune_sb(bdi);
 	bdi_unregister(bdi);
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 555d5d2731c6..d1dc23cc7f10 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -143,6 +143,30 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
 	return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
 }
 
+/*
+ * free_bootmem_late - free bootmem pages directly to page allocator
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system.  Pages are given directly
+ * to the page allocator, no bootmem metadata is updated because it is gone.
+ */
+void __init free_bootmem_late(unsigned long addr, unsigned long size)
+{
+	unsigned long cursor, end;
+
+	kmemleak_free_part(__va(addr), size);
+
+	cursor = PFN_UP(addr);
+	end = PFN_DOWN(addr + size);
+
+	for (; cursor < end; cursor++) {
+		__free_pages_bootmem(pfn_to_page(cursor), 0);
+		totalram_pages++;
+	}
+}
+
 static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 {
 	int aligned;
diff --git a/mm/filemap.c b/mm/filemap.c
index ef169f37156d..8b4d88f9249e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -260,27 +260,27 @@ int filemap_flush(struct address_space *mapping)
 EXPORT_SYMBOL(filemap_flush);
 
 /**
- * wait_on_page_writeback_range - wait for writeback to complete
- * @mapping:	target address_space
- * @start:	beginning page index
- * @end:	ending page index
+ * filemap_fdatawait_range - wait for writeback to complete
+ * @mapping:		address space structure to wait for
+ * @start_byte:		offset in bytes where the range starts
+ * @end_byte:		offset in bytes where the range ends (inclusive)
 *
- * Wait for writeback to complete against pages indexed by start->end
- * inclusive
+ * Walk the list of under-writeback pages of the given address space
+ * in the given range and wait for all of them.
 */
-int wait_on_page_writeback_range(struct address_space *mapping,
-				pgoff_t start, pgoff_t end)
+int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
+			    loff_t end_byte)
 {
+	pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
+	pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
 	struct pagevec pvec;
 	int nr_pages;
 	int ret = 0;
-	pgoff_t index;
 
-	if (end < start)
+	if (end_byte < start_byte)
 		return 0;
 
 	pagevec_init(&pvec, 0);
-	index = start;
 	while ((index <= end) &&
 			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 			PAGECACHE_TAG_WRITEBACK,
@@ -310,25 +310,6 @@ int wait_on_page_writeback_range(struct address_space *mapping,
 
 	return ret;
 }
-
-/**
- * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range
- * @mapping:		address space structure to wait for
- * @start:		offset in bytes where the range starts
- * @end:		offset in bytes where the range ends (inclusive)
- *
- * Walk the list of under-writeback pages of the given address space
- * in the given range and wait for all of them.
- *
- * This is just a simple wrapper so that callers don't have to convert offsets
- * to page indexes themselves
- */
-int filemap_fdatawait_range(struct address_space *mapping, loff_t start,
-			    loff_t end)
-{
-	return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT,
-					    end >> PAGE_CACHE_SHIFT);
-}
 EXPORT_SYMBOL(filemap_fdatawait_range);
 
 /**
@@ -345,8 +326,7 @@ int filemap_fdatawait(struct address_space *mapping)
 	if (i_size == 0)
 		return 0;
 
-	return wait_on_page_writeback_range(mapping, 0,
-				(i_size - 1) >> PAGE_CACHE_SHIFT);
+	return filemap_fdatawait_range(mapping, 0, i_size - 1);
 }
 EXPORT_SYMBOL(filemap_fdatawait);
 
@@ -393,9 +373,8 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 						 WB_SYNC_ALL);
 		/* See comment of filemap_write_and_wait() */
 		if (err != -EIO) {
-			int err2 = wait_on_page_writeback_range(mapping,
-						lstart >> PAGE_CACHE_SHIFT,
-						lend >> PAGE_CACHE_SHIFT);
+			int err2 = filemap_fdatawait_range(mapping,
+						lstart, lend);
 			if (!err)
 				err = err2;
 		}
@@ -1844,7 +1823,7 @@ static size_t __iovec_copy_from_user_inatomic(char *vaddr,
 
 /*
 * Copy as much as we can into the page and return the number of bytes which
- * were sucessfully copied.  If a fault is encountered then return the number of
+ * were successfully copied.  If a fault is encountered then return the number of
 * bytes which were copied.
 */
 size_t iov_iter_copy_from_user_atomic(struct page *page,
diff --git a/mm/highmem.c b/mm/highmem.c
index 25878cc49daa..9c1e627f282e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -426,16 +426,21 @@ void __init page_address_init(void)
 
 void debug_kmap_atomic(enum km_type type)
 {
-	static unsigned warn_count = 10;
+	static int warn_count = 10;
 
-	if (unlikely(warn_count == 0))
+	if (unlikely(warn_count < 0))
 		return;
 
 	if (unlikely(in_interrupt())) {
-		if (in_irq()) {
+		if (in_nmi()) {
+			if (type != KM_NMI && type != KM_NMI_PTE) {
+				WARN_ON(1);
+				warn_count--;
+			}
+		} else if (in_irq()) {
 			if (type != KM_IRQ0 && type != KM_IRQ1 &&
 			    type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
-			    type != KM_BOUNCE_READ) {
+			    type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
 				WARN_ON(1);
 				warn_count--;
 			}
@@ -452,7 +457,9 @@ void debug_kmap_atomic(enum km_type type)
 	}
 
 	if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
-	    type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
+	    type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
+	    type == KM_IRQ_PTE || type == KM_NMI ||
+	    type == KM_NMI_PTE ) {
 		if (!irqs_disabled()) {
 			WARN_ON(1);
 			warn_count--;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 8bf765c4f58d..13f33b3081ec 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1050,8 +1050,8 @@ static void scan_object(struct kmemleak_object *object)
 	unsigned long flags;
 
 	/*
-	 * Once the object->lock is aquired, the corresponding memory block
-	 * cannot be freed (the same lock is aquired in delete_object).
+	 * Once the object->lock is acquired, the corresponding memory block
+	 * cannot be freed (the same lock is acquired in delete_object).
 	 */
 	spin_lock_irqsave(&object->lock, flags);
 	if (object->flags & OBJECT_NO_SCAN)
diff --git a/mm/ksm.c b/mm/ksm.c
index bef1af4f77e3..5575f8628fef 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1012,6 +1012,7 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page,
 		struct rmap_item *tree_rmap_item;
 		int ret;
 
+		cond_resched();
 		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
 		page2[0] = get_mergeable_page(tree_rmap_item);
 		if (!page2[0])
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f99f5991d6bb..c31a310aa146 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -209,7 +209,7 @@ struct mem_cgroup {
 	int	prev_priority;	/* for recording reclaim priority */
 
 	/*
-	 * While reclaiming in a hiearchy, we cache the last child we
+	 * While reclaiming in a hierarchy, we cache the last child we
 	 * reclaimed from.
 	 */
 	int last_scanned_child;
@@ -1720,7 +1720,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 /*
 * While swap-in, try_charge -> commit or cancel, the page is locked.
 * And when try_charge() successfully returns, one refcnt to memcg without
- * struct page_cgroup is aquired. This refcnt will be cumsumed by
+ * struct page_cgroup is acquired. This refcnt will be consumed by
 * "commit()" or removed by "cancel()"
 */
 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
@@ -2466,7 +2466,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
 
 	cgroup_lock();
 	/*
-	 * If parent's use_hiearchy is set, we can't make any modifications
+	 * If parent's use_hierarchy is set, we can't make any modifications
 	 * in the child subtrees. If it is unset, then the change can
 	 * occur, provided the current cgroup has no children.
 	 *
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index dacc64183874..1ac49fef95ab 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -174,7 +174,7 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
 	list_for_each_entry_safe (tk, next, to_kill, nd) {
 		if (doit) {
 			/*
-			 * In case something went wrong with munmaping
+			 * In case something went wrong with munmapping
 			 * make sure the process doesn't catch the
 			 * signal and then access the memory. Just kill it.
 			 * the signal handlers
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 821dee596377..2047465cd27c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,6 +26,7 @@
 #include <linux/migrate.h>
 #include <linux/page-isolation.h>
 #include <linux/pfn.h>
+#include <linux/suspend.h>
 
 #include <asm/tlbflush.h>
 
@@ -447,7 +448,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 }
 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
 
-static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
+/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
+static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 {
 	struct pglist_data *pgdat;
 	unsigned long zones_size[MAX_NR_ZONES] = {0};
@@ -484,14 +486,18 @@ int __ref add_memory(int nid, u64 start, u64 size)
 	struct resource *res;
 	int ret;
 
+	lock_system_sleep();
+
 	res = register_memory_resource(start, size);
+	ret = -EEXIST;
 	if (!res)
-		return -EEXIST;
+		goto out;
 
 	if (!node_online(nid)) {
 		pgdat = hotadd_new_pgdat(nid, start);
+		ret = -ENOMEM;
 		if (!pgdat)
-			return -ENOMEM;
+			goto out;
 		new_pgdat = 1;
 	}
 
@@ -514,7 +520,8 @@ int __ref add_memory(int nid, u64 start, u64 size)
 		BUG_ON(ret);
 	}
 
-	return ret;
+	goto out;
+
 error:
 	/* rollback pgdat allocation and others */
 	if (new_pgdat)
@@ -522,6 +529,8 @@ error:
 	if (res)
 		release_memory_resource(res);
 
+out:
+	unlock_system_sleep();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(add_memory);
@@ -758,6 +767,8 @@ int offline_pages(unsigned long start_pfn,
 	if (!test_pages_in_a_zone(start_pfn, end_pfn))
 		return -EINVAL;
 
+	lock_system_sleep();
+
 	zone = page_zone(pfn_to_page(start_pfn));
 	node = zone_to_nid(zone);
 	nr_pages = end_pfn - start_pfn;
@@ -765,7 +776,7 @@ int offline_pages(unsigned long start_pfn,
 	/* set above range as isolated */
 	ret = start_isolate_page_range(start_pfn, end_pfn);
 	if (ret)
-		return ret;
+		goto out;
 
 	arg.start_pfn = start_pfn;
 	arg.nr_pages = nr_pages;
@@ -843,6 +854,7 @@ repeat:
 	writeback_set_ratelimit();
 
 	memory_notify(MEM_OFFLINE, &arg);
+	unlock_system_sleep();
 	return 0;
 
 failed_removal:
@@ -852,6 +864,8 @@ failed_removal:
 	/* pushback to free area */
 	undo_isolate_page_range(start_pfn, end_pfn);
 
+out:
+	unlock_system_sleep();
 	return ret;
 }
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 1a4bf4813780..7dbcb22316d2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -602,7 +602,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	struct page *newpage = get_new_page(page, private, &result);
 	int rcu_locked = 0;
 	int charge = 0;
-	struct mem_cgroup *mem;
+	struct mem_cgroup *mem = NULL;
 
 	if (!newpage)
 		return -ENOMEM;
diff --git a/mm/mmap.c b/mm/mmap.c
index 73f5e4b64010..ed70a68e882a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -20,7 +20,6 @@
 #include <linux/fs.h>
 #include <linux/personality.h>
 #include <linux/security.h>
-#include <linux/ima.h>
 #include <linux/hugetlb.h>
 #include <linux/profile.h>
 #include <linux/module.h>
@@ -932,13 +931,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	if (!(flags & MAP_FIXED))
 		addr = round_hint_to_min(addr);
 
-	error = arch_mmap_check(addr, len, flags);
-	if (error)
-		return error;
-
 	/* Careful about overflows.. */
 	len = PAGE_ALIGN(len);
-	if (!len || len > TASK_SIZE)
+	if (!len)
 		return -ENOMEM;
 
 	/* offset overflow? */
@@ -949,24 +944,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	if (mm->map_count > sysctl_max_map_count)
 		return -ENOMEM;
 
-	if (flags & MAP_HUGETLB) {
-		struct user_struct *user = NULL;
-		if (file)
-			return -EINVAL;
-
-		/*
-		 * VM_NORESERVE is used because the reservations will be
-		 * taken when vm_ops->mmap() is called
-		 * A dummy user value is used because we are not locking
-		 * memory so no accounting is necessary
-		 */
-		len = ALIGN(len, huge_page_size(&default_hstate));
-		file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
-						&user, HUGETLB_ANONHUGE_INODE);
-		if (IS_ERR(file))
-			return PTR_ERR(file);
-	}
-
 	/* Obtain the address to map to. we verify (or select) it and ensure
 	 * that it represents a valid section of the address space.
 	 */
@@ -1061,9 +1038,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
 	if (error)
 		return error;
-	error = ima_file_mmap(file, prot);
-	if (error)
-		return error;
 
 	return mmap_region(file, addr, len, flags, vm_flags, pgoff);
 }
@@ -1459,6 +1433,14 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 	unsigned long (*get_area)(struct file *, unsigned long,
 				  unsigned long, unsigned long, unsigned long);
 
+	unsigned long error = arch_mmap_check(addr, len, flags);
+	if (error)
+		return error;
+
+	/* Careful about overflows.. */
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
 	get_area = current->mm->get_unmapped_area;
 	if (file && file->f_op && file->f_op->get_unmapped_area)
 		get_area = file->f_op->get_unmapped_area;
@@ -2003,20 +1985,14 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
 	if (!len)
 		return addr;
 
-	if ((addr + len) > TASK_SIZE || (addr + len) < addr)
-		return -EINVAL;
-
-	if (is_hugepage_only_range(mm, addr, len))
-		return -EINVAL;
-
 	error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
 	if (error)
 		return error;
 
 	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
 
-	error = arch_mmap_check(addr, len, flags);
-	if (error)
+	error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+	if (error & ~PAGE_MASK)
 		return error;
 
 	/*
diff --git a/mm/mremap.c b/mm/mremap.c
index 97bff2547719..845190898d59 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -261,6 +261,137 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	return new_addr;
 }
 
+static struct vm_area_struct *vma_to_resize(unsigned long addr,
+	unsigned long old_len, unsigned long new_len, unsigned long *p)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma = find_vma(mm, addr);
+
+	if (!vma || vma->vm_start > addr)
+		goto Efault;
+
+	if (is_vm_hugetlb_page(vma))
+		goto Einval;
+
+	/* We can't remap across vm area boundaries */
+	if (old_len > vma->vm_end - addr)
+		goto Efault;
+
+	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
+		if (new_len > old_len)
+			goto Efault;
+	}
+
+	if (vma->vm_flags & VM_LOCKED) {
+		unsigned long locked, lock_limit;
+		locked = mm->locked_vm << PAGE_SHIFT;
+		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+		locked += new_len - old_len;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+			goto Eagain;
+	}
+
+	if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
+		goto Enomem;
+
+	if (vma->vm_flags & VM_ACCOUNT) {
+		unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
+		if (security_vm_enough_memory(charged))
+			goto Efault;
+		*p = charged;
+	}
+
+	return vma;
+
+Efault:	/* very odd choice for most of the cases, but... */
+	return ERR_PTR(-EFAULT);
+Einval:
+	return ERR_PTR(-EINVAL);
+Enomem:
+	return ERR_PTR(-ENOMEM);
+Eagain:
+	return ERR_PTR(-EAGAIN);
+}
+
+static unsigned long mremap_to(unsigned long addr,
+	unsigned long old_len, unsigned long new_addr,
+	unsigned long new_len)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long ret = -EINVAL;
+	unsigned long charged = 0;
+	unsigned long map_flags;
+
+	if (new_addr & ~PAGE_MASK)
+		goto out;
+
+	if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
+		goto out;
+
+	/* Check if the location we're moving into overlaps the
+	 * old location at all, and fail if it does.
+	 */
+	if ((new_addr <= addr) && (new_addr+new_len) > addr)
+		goto out;
+
+	if ((addr <= new_addr) && (addr+old_len) > new_addr)
+		goto out;
+
+	ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
+	if (ret)
+		goto out;
+
+	ret = do_munmap(mm, new_addr, new_len);
+	if (ret)
+		goto out;
+
+	if (old_len >= new_len) {
+		ret = do_munmap(mm, addr+new_len, old_len - new_len);
+		if (ret && old_len != new_len)
+			goto out;
+		old_len = new_len;
+	}
+
+	vma = vma_to_resize(addr, old_len, new_len, &charged);
+	if (IS_ERR(vma)) {
+		ret = PTR_ERR(vma);
+		goto out;
+	}
+
+	map_flags = MAP_FIXED;
+	if (vma->vm_flags & VM_MAYSHARE)
+		map_flags |= MAP_SHARED;
+
+	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
+				((addr - vma->vm_start) >> PAGE_SHIFT),
+				map_flags);
+	if (ret & ~PAGE_MASK)
+		goto out1;
+
+	ret = move_vma(vma, addr, old_len, new_len, new_addr);
+	if (!(ret & ~PAGE_MASK))
+		goto out;
+out1:
+	vm_unacct_memory(charged);
+
+out:
+	return ret;
+}
+
+static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
+{
+	unsigned long end = vma->vm_end + delta;
+	if (end < vma->vm_end) /* overflow */
+		return 0;
+	if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
+		return 0;
+	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
+			      0, MAP_FIXED) & ~PAGE_MASK)
+		return 0;
+	return 1;
+}
+
 /*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
@@ -294,32 +425,10 @@ unsigned long do_mremap(unsigned long addr,
 	if (!new_len)
 		goto out;
 
-	/* new_addr is only valid if MREMAP_FIXED is specified */
 	if (flags & MREMAP_FIXED) {
-		if (new_addr & ~PAGE_MASK)
-			goto out;
-		if (!(flags & MREMAP_MAYMOVE))
-			goto out;
-
-		if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
-			goto out;
-
-		/* Check if the location we're moving into overlaps the
-		 * old location at all, and fail if it does.
-		 */
-		if ((new_addr <= addr) && (new_addr+new_len) > addr)
-			goto out;
-
-		if ((addr <= new_addr) && (addr+old_len) > new_addr)
-			goto out;
-
-		ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
-		if (ret)
-			goto out;
-
-		ret = do_munmap(mm, new_addr, new_len);
-		if (ret)
-			goto out;
+		if (flags & MREMAP_MAYMOVE)
+			ret = mremap_to(addr, old_len, new_addr, new_len);
+		goto out;
 	}
 
 	/*
@@ -332,60 +441,23 @@ unsigned long do_mremap(unsigned long addr,
 		if (ret && old_len != new_len)
 			goto out;
 		ret = addr;
-		if (!(flags & MREMAP_FIXED) || (new_addr == addr))
-			goto out;
-		old_len = new_len;
+		goto out;
 	}
 
 	/*
-	 * Ok, we need to grow..  or relocate.
+	 * Ok, we need to grow..
 	 */
-	ret = -EFAULT;
-	vma = find_vma(mm, addr);
-	if (!vma || vma->vm_start > addr)
-		goto out;
-	if (is_vm_hugetlb_page(vma)) {
-		ret = -EINVAL;
-		goto out;
-	}
-	/* We can't remap across vm area boundaries */
-	if (old_len > vma->vm_end - addr)
-		goto out;
-	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
-		if (new_len > old_len)
-			goto out;
-	}
-	if (vma->vm_flags & VM_LOCKED) {
-		unsigned long locked, lock_limit;
-		locked = mm->locked_vm << PAGE_SHIFT;
-		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
-		locked += new_len - old_len;
-		ret = -EAGAIN;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			goto out;
-	}
-	if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
-		ret = -ENOMEM;
+	vma = vma_to_resize(addr, old_len, new_len, &charged);
+	if (IS_ERR(vma)) {
+		ret = PTR_ERR(vma);
 		goto out;
 	}
 
-	if (vma->vm_flags & VM_ACCOUNT) {
-		charged = (new_len - old_len) >> PAGE_SHIFT;
-		if (security_vm_enough_memory(charged))
-			goto out_nc;
-	}
-
 	/* old_len exactly to the end of the area..
-	 * And we're not relocating the area.
 	 */
-	if (old_len == vma->vm_end - addr &&
-	    !((flags & MREMAP_FIXED) && (addr != new_addr)) &&
-	    (old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
-		unsigned long max_addr = TASK_SIZE;
-		if (vma->vm_next)
-			max_addr = vma->vm_next->vm_start;
+	if (old_len == vma->vm_end - addr) {
 		/* can we just expand the current mapping? */
-		if (max_addr - addr >= new_len) {
+		if (vma_expandable(vma, new_len - old_len)) {
 			int pages = (new_len - old_len) >> PAGE_SHIFT;
 
 			vma_adjust(vma, vma->vm_start,
@@ -409,28 +481,27 @@ unsigned long do_mremap(unsigned long addr,
 	 */
 	ret = -ENOMEM;
 	if (flags & MREMAP_MAYMOVE) {
-		if (!(flags & MREMAP_FIXED)) {
-			unsigned long map_flags = 0;
-			if (vma->vm_flags & VM_MAYSHARE)
-				map_flags |= MAP_SHARED;
-
-			new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
-						vma->vm_pgoff, map_flags);
-			if (new_addr & ~PAGE_MASK) {
-				ret = new_addr;
-				goto out;
-			}
-
-			ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
-			if (ret)
-				goto out;
+		unsigned long map_flags = 0;
+		if (vma->vm_flags & VM_MAYSHARE)
+			map_flags |= MAP_SHARED;
+
+		new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
+					vma->vm_pgoff +
+					((addr - vma->vm_start) >> PAGE_SHIFT),
+					map_flags);
+		if (new_addr & ~PAGE_MASK) {
+			ret = new_addr;
+			goto out;
 		}
+
+		ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
+		if (ret)
+			goto out;
 		ret = move_vma(vma, addr, old_len, new_len, new_addr);
 	}
 out:
 	if (ret & ~PAGE_MASK)
 		vm_unacct_memory(charged);
-out_nc:
 	return ret;
 }
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2c5d79236ead..0b19943ecf8b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -821,7 +821,6 @@ int write_cache_pages(struct address_space *mapping,
 		      struct writeback_control *wbc, writepage_t writepage,
 		      void *data)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	int ret = 0;
 	int done = 0;
 	struct pagevec pvec;
@@ -834,11 +833,6 @@ int write_cache_pages(struct address_space *mapping,
 	int range_whole = 0;
 	long nr_to_write = wbc->nr_to_write;
 
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
-		return 0;
-	}
-
 	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
 		writeback_index = mapping->writeback_index; /* prev offset */
@@ -957,12 +951,6 @@ continue_unlock:
 				break;
 			}
 		}
-
-		if (wbc->nonblocking && bdi_write_congested(bdi)) {
-			wbc->encountered_congestion = 1;
-			done = 1;
-			break;
-		}
 	}
 	pagevec_release(&pvec);
 	cond_resched();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cdcedf661616..2bc2ac63f41e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1769,7 +1769,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 		 */
 		alloc_flags &= ~ALLOC_CPUSET;
-	} else if (unlikely(rt_task(p)))
+	} else if (unlikely(rt_task(p)) && !in_interrupt())
 		alloc_flags |= ALLOC_HARDER;
 
 	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
@@ -1817,9 +1817,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
 		goto nopage;
 
+restart:
 	wake_all_kswapd(order, zonelist, high_zoneidx);
 
-restart:
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
 	 * reclaim. Now things get more complex, so set up alloc_flags according
diff --git a/mm/percpu.c b/mm/percpu.c
index d90797160c2a..5adfc268b408 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -355,62 +355,86 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 }
 
 /**
- * pcpu_extend_area_map - extend area map for allocation
- * @chunk: target chunk
+ * pcpu_need_to_extend - determine whether chunk area map needs to be extended
+ * @chunk: chunk of interest
 *
- * Extend area map of @chunk so that it can accomodate an allocation.
- * A single allocation can split an area into three areas, so this
- * function makes sure that @chunk->map has at least two extra slots.
+ * Determine whether area map of @chunk needs to be extended to
+ * accomodate a new allocation.
 *
 * CONTEXT:
- * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
- * if area map is extended.
+ * pcpu_lock.
 *
 * RETURNS:
- * 0 if noop, 1 if successfully extended, -errno on failure.
+ * New target map allocation length if extension is necessary, 0
+ * otherwise.
 */
-static int pcpu_extend_area_map(struct pcpu_chunk *chunk, unsigned long *flags)
+static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
 {
 	int new_alloc;
-	int *new;
-	size_t size;
 
-	/* has enough? */
 	if (chunk->map_alloc >= chunk->map_used + 2)
 		return 0;
 
-	spin_unlock_irqrestore(&pcpu_lock, *flags);
-
 	new_alloc = PCPU_DFL_MAP_ALLOC;
 	while (new_alloc < chunk->map_used + 2)
 		new_alloc *= 2;
 
-	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-	if (!new) {
-		spin_lock_irqsave(&pcpu_lock, *flags);
+	return new_alloc;
+}
+
+/**
+ * pcpu_extend_area_map - extend area map of a chunk
+ * @chunk: chunk of interest
+ * @new_alloc: new target allocation length of the area map
+ *
+ * Extend area map of @chunk to have @new_alloc entries.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.  Grabs and releases pcpu_lock.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
+{
+	int *old = NULL, *new = NULL;
+	size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
+	unsigned long flags;
+
+	new = pcpu_mem_alloc(new_size);
+	if (!new)
 		return -ENOMEM;
-	}
 
-	/*
-	 * Acquire pcpu_lock and switch to new area map.  Only free
-	 * could have happened inbetween, so map_used couldn't have
-	 * grown.
-	 */
-	spin_lock_irqsave(&pcpu_lock, *flags);
-	BUG_ON(new_alloc < chunk->map_used + 2);
+	/* acquire pcpu_lock and switch to new area map */
+	spin_lock_irqsave(&pcpu_lock, flags);
+
+	if (new_alloc <= chunk->map_alloc)
+		goto out_unlock;
 
-	size = chunk->map_alloc * sizeof(chunk->map[0]);
-	memcpy(new, chunk->map, size);
+	old_size = chunk->map_alloc * sizeof(chunk->map[0]);
+	memcpy(new, chunk->map, old_size);
 
 	/*
 	 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
 	 * one of the first chunks and still using static map.
 	 */
 	if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
-		pcpu_mem_free(chunk->map, size);
+		old = chunk->map;
 
 	chunk->map_alloc = new_alloc;
 	chunk->map = new;
+	new = NULL;
+
+out_unlock:
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+
+	/*
+	 * pcpu_mem_free() might end up calling vfree() which uses
+	 * IRQ-unsafe lock and thus can't be called under pcpu_lock.
+	 */
+	pcpu_mem_free(old, old_size);
+	pcpu_mem_free(new, new_size);
+
 	return 0;
 }
 
@@ -1049,7 +1073,7 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 	static int warn_limit = 10;
 	struct pcpu_chunk *chunk;
 	const char *err;
-	int slot, off;
+	int slot, off, new_alloc;
 	unsigned long flags;
 
 	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
@@ -1064,14 +1088,25 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 	/* serve reserved allocations from the reserved chunk if available */
 	if (reserved && pcpu_reserved_chunk) {
 		chunk = pcpu_reserved_chunk;
-		if (size > chunk->contig_hint ||
-		    pcpu_extend_area_map(chunk, &flags) < 0) {
-			err = "failed to extend area map of reserved chunk";
+
+		if (size > chunk->contig_hint) {
+			err = "alloc from reserved chunk failed";
 			goto fail_unlock;
 		}
+
+		while ((new_alloc = pcpu_need_to_extend(chunk))) {
+			spin_unlock_irqrestore(&pcpu_lock, flags);
+			if (pcpu_extend_area_map(chunk, new_alloc) < 0) {
+				err = "failed to extend area map of reserved chunk";
+				goto fail_unlock_mutex;
+			}
+			spin_lock_irqsave(&pcpu_lock, flags);
+		}
+
 		off = pcpu_alloc_area(chunk, size, align);
 		if (off >= 0)
 			goto area_found;
+
 		err = "alloc from reserved chunk failed";
 		goto fail_unlock;
 	}
@@ -1083,14 +1118,20 @@ restart:
 		if (size > chunk->contig_hint)
 			continue;
 
-		switch (pcpu_extend_area_map(chunk, &flags)) {
-		case 0:
-			break;
-		case 1:
-			goto restart;	/* pcpu_lock dropped, restart */
-		default:
-			err = "failed to extend area map";
-			goto fail_unlock;
+		new_alloc = pcpu_need_to_extend(chunk);
+		if (new_alloc) {
+			spin_unlock_irqrestore(&pcpu_lock, flags);
+			if (pcpu_extend_area_map(chunk,
+						 new_alloc) < 0) {
+				err = "failed to extend area map";
+				goto fail_unlock_mutex;
+			}
+			spin_lock_irqsave(&pcpu_lock, flags);
+			/*
+			 * pcpu_lock has been dropped, need to
+			 * restart cpu_slot list walking.
+			 */
+			goto restart;
 		}
 
 		off = pcpu_alloc_area(chunk, size, align);
diff --git a/mm/slab.c b/mm/slab.c
index 7dfa481c96ba..a6c9166996a9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -604,6 +604,26 @@ static struct kmem_cache cache_cache = {
 
 #define BAD_ALIEN_MAGIC 0x01020304ul
 
+/*
+ * chicken and egg problem: delay the per-cpu array allocation
+ * until the general caches are up.
+ */
+static enum {
+	NONE,
+	PARTIAL_AC,
+	PARTIAL_L3,
+	EARLY,
+	FULL
+} g_cpucache_up;
+
+/*
+ * used by boot code to determine if it can use slab based allocator
+ */
+int slab_is_available(void)
+{
+	return g_cpucache_up >= EARLY;
+}
+
 #ifdef CONFIG_LOCKDEP
 
 /*
@@ -620,40 +640,52 @@ static struct kmem_cache cache_cache = {
 static struct lock_class_key on_slab_l3_key;
 static struct lock_class_key on_slab_alc_key;
 
-static inline void init_lock_keys(void)
-
+static void init_node_lock_keys(int q)
 {
-	int q;
 	struct cache_sizes *s = malloc_sizes;
 
-	while (s->cs_size != ULONG_MAX) {
-		for_each_node(q) {
-			struct array_cache **alc;
-			int r;
-			struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
-			if (!l3 || OFF_SLAB(s->cs_cachep))
-				continue;
-			lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
-			alc = l3->alien;
-			/*
-			 * FIXME: This check for BAD_ALIEN_MAGIC
-			 * should go away when common slab code is taught to
-			 * work even without alien caches.
-			 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
-			 * for alloc_alien_cache,
-			 */
-			if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
-				continue;
-			for_each_node(r) {
-				if (alc[r])
-					lockdep_set_class(&alc[r]->lock,
-						&on_slab_alc_key);
-			}
+	if (g_cpucache_up != FULL)
+		return;
+
+	for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
+		struct array_cache **alc;
+		struct kmem_list3 *l3;
+		int r;
+
+		l3 = s->cs_cachep->nodelists[q];
+		if (!l3 || OFF_SLAB(s->cs_cachep))
+			return;
+		lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
+		alc = l3->alien;
+		/*
+		 * FIXME: This check for BAD_ALIEN_MAGIC
+		 * should go away when common slab code is taught to
+		 * work even without alien caches.
+		 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
+		 * for alloc_alien_cache,
+		 */
+		if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
+			return;
+		for_each_node(r) {
+			if (alc[r])
+				lockdep_set_class(&alc[r]->lock,
+						  &on_slab_alc_key);
 		}
-		s++;
 	}
 }
+
+static inline void init_lock_keys(void)
+{
+	int node;
+
+	for_each_node(node)
+		init_node_lock_keys(node);
+}
 #else
+static void init_node_lock_keys(int q)
+{
+}
+
 static inline void init_lock_keys(void)
 {
 }
@@ -665,26 +697,6 @@ static inline void init_lock_keys(void)
 static DEFINE_MUTEX(cache_chain_mutex);
 static struct list_head cache_chain;
 
-/*
- * chicken and egg problem: delay the per-cpu array allocation
- * until the general caches are up.
- */
-static enum {
-	NONE,
-	PARTIAL_AC,
-	PARTIAL_L3,
-	EARLY,
-	FULL
-} g_cpucache_up;
-
-/*
- * used by boot code to determine if it can use slab based allocator
- */
-int slab_is_available(void)
-{
-	return g_cpucache_up >= EARLY;
-}
-
 static DEFINE_PER_CPU(struct delayed_work, reap_work);
 
 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
@@ -1254,6 +1266,8 @@ static int __cpuinit cpuup_prepare(long cpu)
 		kfree(shared);
 		free_alien_cache(alien);
 	}
+	init_node_lock_keys(node);
+
 	return 0;
 bad:
 	cpuup_canceled(cpu);
@@ -3103,13 +3117,19 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 	} else {
 		STATS_INC_ALLOCMISS(cachep);
 		objp = cache_alloc_refill(cachep, flags);
+		/*
+		 * the 'ac' may be updated by cache_alloc_refill(),
+		 * and kmemleak_erase() requires its correct value.
+		 */
+		ac = cpu_cache_get(cachep);
 	}
 	/*
 	 * To avoid a false negative, if an object that is in one of the
 	 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
 	 * treat the array pointers as a reference to the object.
 	 */
-	kmemleak_erase(&ac->entry[ac->avail]);
+	if (objp)
+		kmemleak_erase(&ac->entry[ac->avail]);
 	return objp;
 }
 
@@ -3306,7 +3326,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	cache_alloc_debugcheck_before(cachep, flags);
 	local_irq_save(save_flags);
 
-	if (unlikely(nodeid == -1))
+	if (nodeid == -1)
 		nodeid = numa_node_id();
 
 	if (unlikely(!cachep->nodelists[nodeid])) {
diff --git a/mm/slub.c b/mm/slub.c
index 0956396faed1..da0ce55965dc 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4371,12 +4371,28 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
 	return len + sprintf(buf + len, "\n");
 }
 
+static void clear_stat(struct kmem_cache *s, enum stat_item si)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		get_cpu_slab(s, cpu)->stat[si] = 0;
+}
+
 #define STAT_ATTR(si, text) 					\
 static ssize_t text##_show(struct kmem_cache *s, char *buf)	\
 {								\
 	return show_stat(s, buf, si);				\
 }								\
-SLAB_ATTR_RO(text);						\
+static ssize_t text##_store(struct kmem_cache *s,		\
+				const char *buf, size_t length)	\
+{								\
+	if (buf[0] != '0')					\
+		return -EINVAL;					\
+	clear_stat(s, si);					\
+	return length;						\
+}								\
+SLAB_ATTR(text);						\
 
 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
diff --git a/mm/truncate.c b/mm/truncate.c
index 450cebdabfc0..2c147a7e5f2c 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -490,7 +490,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
- * Returns -EIO if any pages could not be invalidated.
+ * Returns -EBUSY if any pages could not be invalidated.
 */
 int invalidate_inode_pages2(struct address_space *mapping)
 {
diff --git a/mm/util.c b/mm/util.c
index 7c35ad95f927..b377ce430803 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,6 +4,10 @@
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/sched.h>
+#include <linux/hugetlb.h>
+#include <linux/syscalls.h>
+#include <linux/mman.h>
+#include <linux/file.h>
 #include <asm/uaccess.h>
 
 #define CREATE_TRACE_POINTS
@@ -268,6 +272,46 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
 }
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
 
+SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
+		unsigned long, prot, unsigned long, flags,
+		unsigned long, fd, unsigned long, pgoff)
+{
+	struct file * file = NULL;
+	unsigned long retval = -EBADF;
+
+	if (!(flags & MAP_ANONYMOUS)) {
+		if (unlikely(flags & MAP_HUGETLB))
+			return -EINVAL;
+		file = fget(fd);
+		if (!file)
+			goto out;
+	} else if (flags & MAP_HUGETLB) {
+		struct user_struct *user = NULL;
+		/*
+		 * VM_NORESERVE is used because the reservations will be
+		 * taken when vm_ops->mmap() is called
+		 * A dummy user value is used because we are not locking
+		 * memory so no accounting is necessary
+		 */
+		len = ALIGN(len, huge_page_size(&default_hstate));
+		file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
+						&user, HUGETLB_ANONHUGE_INODE);
+		if (IS_ERR(file))
+			return PTR_ERR(file);
+	}
+
+	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+
+	down_write(&current->mm->mmap_sem);
+	retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+	up_write(&current->mm->mmap_sem);
+
+	if (file)
+		fput(file);
+out:
+	return retval;
+}
+
 /* Tracepoints definitions. */
 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);