Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |    6
-rw-r--r--  mm/Makefile           |    1
-rw-r--r--  mm/filemap.c          |  118
-rw-r--r--  mm/hugetlb.c          |    9
-rw-r--r--  mm/hwpoison-inject.c  |  113
-rw-r--r--  mm/internal.h         |   12
-rw-r--r--  mm/kmemleak.c         |  188
-rw-r--r--  mm/maccess.c          |   11
-rw-r--r--  mm/madvise.c          |   21
-rw-r--r--  mm/memcontrol.c       |   27
-rw-r--r--  mm/memory-failure.c   |  569
-rw-r--r--  mm/memory.c           |    4
-rw-r--r--  mm/migrate.c          |   39
-rw-r--r--  mm/mmap.c             |   40
-rw-r--r--  mm/nommu.c            |  144
-rw-r--r--  mm/oom_kill.c         |    2
-rw-r--r--  mm/page_alloc.c       |  102
-rw-r--r--  mm/percpu.c           |    4
-rw-r--r--  mm/readahead.c        |   12
-rw-r--r--  mm/shmem.c            |   73
-rw-r--r--  mm/shmem_acl.c        |  171
-rw-r--r--  mm/slab.c             |   16
-rw-r--r--  mm/truncate.c         |   30
-rw-r--r--  mm/util.c             |   46
-rw-r--r--  mm/vmalloc.c          |  114
-rw-r--r--  mm/vmscan.c           |    3
26 files changed, 1273 insertions, 602 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 2310984591ed..17b8947aa7da 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -221,6 +221,7 @@ config KSM
221 221
222config DEFAULT_MMAP_MIN_ADDR 222config DEFAULT_MMAP_MIN_ADDR
223 int "Low address space to protect from user allocation" 223 int "Low address space to protect from user allocation"
224 depends on MMU
224 default 4096 225 default 4096
225 help 226 help
226 This is the portion of low virtual memory which should be protected 227 This is the portion of low virtual memory which should be protected
@@ -251,8 +252,9 @@ config MEMORY_FAILURE
251 special hardware support and typically ECC memory. 252 special hardware support and typically ECC memory.
252 253
253config HWPOISON_INJECT 254config HWPOISON_INJECT
254 tristate "Poison pages injector" 255 tristate "HWPoison pages injector"
255 depends on MEMORY_FAILURE && DEBUG_KERNEL 256 depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS
257 select PROC_PAGE_MONITOR
256 258
257config NOMMU_INITIAL_TRIM_EXCESS 259config NOMMU_INITIAL_TRIM_EXCESS
258 int "Turn on mmap() excess space trimming before booting" 260 int "Turn on mmap() excess space trimming before booting"
diff --git a/mm/Makefile b/mm/Makefile
index 82131d0f8d85..7a68d2ab5560 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -22,7 +22,6 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
22obj-$(CONFIG_NUMA) += mempolicy.o 22obj-$(CONFIG_NUMA) += mempolicy.o
23obj-$(CONFIG_SPARSEMEM) += sparse.o 23obj-$(CONFIG_SPARSEMEM) += sparse.o
24obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o 24obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
26obj-$(CONFIG_SLOB) += slob.o 25obj-$(CONFIG_SLOB) += slob.o
27obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 26obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
28obj-$(CONFIG_KSM) += ksm.o 27obj-$(CONFIG_KSM) += ksm.o
diff --git a/mm/filemap.c b/mm/filemap.c
index 8b4d88f9249e..698ea80f2102 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1634,14 +1634,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
1634static struct page *__read_cache_page(struct address_space *mapping, 1634static struct page *__read_cache_page(struct address_space *mapping,
1635 pgoff_t index, 1635 pgoff_t index,
1636 int (*filler)(void *,struct page*), 1636 int (*filler)(void *,struct page*),
1637 void *data) 1637 void *data,
1638 gfp_t gfp)
1638{ 1639{
1639 struct page *page; 1640 struct page *page;
1640 int err; 1641 int err;
1641repeat: 1642repeat:
1642 page = find_get_page(mapping, index); 1643 page = find_get_page(mapping, index);
1643 if (!page) { 1644 if (!page) {
1644 page = page_cache_alloc_cold(mapping); 1645 page = __page_cache_alloc(gfp | __GFP_COLD);
1645 if (!page) 1646 if (!page)
1646 return ERR_PTR(-ENOMEM); 1647 return ERR_PTR(-ENOMEM);
1647 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); 1648 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
@@ -1661,31 +1662,18 @@ repeat:
1661 return page; 1662 return page;
1662} 1663}
1663 1664
1664/** 1665static struct page *do_read_cache_page(struct address_space *mapping,
1665 * read_cache_page_async - read into page cache, fill it if needed
1666 * @mapping: the page's address_space
1667 * @index: the page index
1668 * @filler: function to perform the read
1669 * @data: destination for read data
1670 *
1671 * Same as read_cache_page, but don't wait for page to become unlocked
1672 * after submitting it to the filler.
1673 *
1674 * Read into the page cache. If a page already exists, and PageUptodate() is
1675 * not set, try to fill the page but don't wait for it to become unlocked.
1676 *
1677 * If the page does not get brought uptodate, return -EIO.
1678 */
1679struct page *read_cache_page_async(struct address_space *mapping,
1680 pgoff_t index, 1666 pgoff_t index,
1681 int (*filler)(void *,struct page*), 1667 int (*filler)(void *,struct page*),
1682 void *data) 1668 void *data,
1669 gfp_t gfp)
1670
1683{ 1671{
1684 struct page *page; 1672 struct page *page;
1685 int err; 1673 int err;
1686 1674
1687retry: 1675retry:
1688 page = __read_cache_page(mapping, index, filler, data); 1676 page = __read_cache_page(mapping, index, filler, data, gfp);
1689 if (IS_ERR(page)) 1677 if (IS_ERR(page))
1690 return page; 1678 return page;
1691 if (PageUptodate(page)) 1679 if (PageUptodate(page))
@@ -1710,8 +1698,67 @@ out:
1710 mark_page_accessed(page); 1698 mark_page_accessed(page);
1711 return page; 1699 return page;
1712} 1700}
1701
1702/**
1703 * read_cache_page_async - read into page cache, fill it if needed
1704 * @mapping: the page's address_space
1705 * @index: the page index
1706 * @filler: function to perform the read
1707 * @data: destination for read data
1708 *
1709 * Same as read_cache_page, but don't wait for page to become unlocked
1710 * after submitting it to the filler.
1711 *
1712 * Read into the page cache. If a page already exists, and PageUptodate() is
1713 * not set, try to fill the page but don't wait for it to become unlocked.
1714 *
1715 * If the page does not get brought uptodate, return -EIO.
1716 */
1717struct page *read_cache_page_async(struct address_space *mapping,
1718 pgoff_t index,
1719 int (*filler)(void *,struct page*),
1720 void *data)
1721{
1722 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
1723}
1713EXPORT_SYMBOL(read_cache_page_async); 1724EXPORT_SYMBOL(read_cache_page_async);
1714 1725
1726static struct page *wait_on_page_read(struct page *page)
1727{
1728 if (!IS_ERR(page)) {
1729 wait_on_page_locked(page);
1730 if (!PageUptodate(page)) {
1731 page_cache_release(page);
1732 page = ERR_PTR(-EIO);
1733 }
1734 }
1735 return page;
1736}
1737
1738/**
1739 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
1740 * @mapping: the page's address_space
1741 * @index: the page index
1742 * @gfp: the page allocator flags to use if allocating
1743 *
1744 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
1745 * any new page allocations done using the specified allocation flags. Note
1746 * that the Radix tree operations will still use GFP_KERNEL, so you can't
1747 * expect to do this atomically or anything like that - but you can pass in
1748 * other page requirements.
1749 *
1750 * If the page does not get brought uptodate, return -EIO.
1751 */
1752struct page *read_cache_page_gfp(struct address_space *mapping,
1753 pgoff_t index,
1754 gfp_t gfp)
1755{
1756 filler_t *filler = (filler_t *)mapping->a_ops->readpage;
1757
1758 return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
1759}
1760EXPORT_SYMBOL(read_cache_page_gfp);
1761
1715/** 1762/**
1716 * read_cache_page - read into page cache, fill it if needed 1763 * read_cache_page - read into page cache, fill it if needed
1717 * @mapping: the page's address_space 1764 * @mapping: the page's address_space
@@ -1729,18 +1776,7 @@ struct page *read_cache_page(struct address_space *mapping,
1729 int (*filler)(void *,struct page*), 1776 int (*filler)(void *,struct page*),
1730 void *data) 1777 void *data)
1731{ 1778{
1732 struct page *page; 1779 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
1733
1734 page = read_cache_page_async(mapping, index, filler, data);
1735 if (IS_ERR(page))
1736 goto out;
1737 wait_on_page_locked(page);
1738 if (!PageUptodate(page)) {
1739 page_cache_release(page);
1740 page = ERR_PTR(-EIO);
1741 }
1742 out:
1743 return page;
1744} 1780}
1745EXPORT_SYMBOL(read_cache_page); 1781EXPORT_SYMBOL(read_cache_page);
1746 1782
@@ -2196,6 +2232,9 @@ again:
2196 if (unlikely(status)) 2232 if (unlikely(status))
2197 break; 2233 break;
2198 2234
2235 if (mapping_writably_mapped(mapping))
2236 flush_dcache_page(page);
2237
2199 pagefault_disable(); 2238 pagefault_disable();
2200 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 2239 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2201 pagefault_enable(); 2240 pagefault_enable();
@@ -2240,7 +2279,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2240 size_t count, ssize_t written) 2279 size_t count, ssize_t written)
2241{ 2280{
2242 struct file *file = iocb->ki_filp; 2281 struct file *file = iocb->ki_filp;
2243 struct address_space *mapping = file->f_mapping;
2244 ssize_t status; 2282 ssize_t status;
2245 struct iov_iter i; 2283 struct iov_iter i;
2246 2284
@@ -2252,15 +2290,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2252 *ppos = pos + status; 2290 *ppos = pos + status;
2253 } 2291 }
2254 2292
2255 /*
2256 * If we get here for O_DIRECT writes then we must have fallen through
2257 * to buffered writes (block instantiation inside i_size). So we sync
2258 * the file data here, to try to honour O_DIRECT expectations.
2259 */
2260 if (unlikely(file->f_flags & O_DIRECT) && written)
2261 status = filemap_write_and_wait_range(mapping,
2262 pos, pos + written - 1);
2263
2264 return written ? written : status; 2293 return written ? written : status;
2265} 2294}
2266EXPORT_SYMBOL(generic_file_buffered_write); 2295EXPORT_SYMBOL(generic_file_buffered_write);
@@ -2359,10 +2388,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2359 * semantics. 2388 * semantics.
2360 */ 2389 */
2361 endbyte = pos + written_buffered - written - 1; 2390 endbyte = pos + written_buffered - written - 1;
2362 err = do_sync_mapping_range(file->f_mapping, pos, endbyte, 2391 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
2363 SYNC_FILE_RANGE_WAIT_BEFORE|
2364 SYNC_FILE_RANGE_WRITE|
2365 SYNC_FILE_RANGE_WAIT_AFTER);
2366 if (err == 0) { 2392 if (err == 0) {
2367 written = written_buffered; 2393 written = written_buffered;
2368 invalidate_mapping_pages(mapping, 2394 invalidate_mapping_pages(mapping,
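The filemap.c hunks above add read_cache_page_gfp(), which behaves like read_mapping_page() but lets the caller choose the allocation flags for any newly allocated page-cache page. A minimal sketch of a hypothetical caller, assuming <linux/pagemap.h>; the function name and the GFP_NOFS choice are illustrative, not part of this patch:

/* Sketch: read page 'index' from 'mapping' without allowing the page
 * allocation to recurse back into the filesystem (GFP_NOFS). */
static int example_read_block(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

	page = read_cache_page_gfp(mapping, index, GFP_NOFS);
	if (IS_ERR(page))
		return PTR_ERR(page);	/* -ENOMEM or -EIO */

	/* The page is uptodate here with an elevated refcount. */
	page_cache_release(page);
	return 0;
}

As the comment in the patch notes, the radix-tree insertion still uses GFP_KERNEL; the gfp argument only governs the page allocation itself.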
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 65f38c218207..2d16fa6b8c2d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -402,7 +402,7 @@ static void clear_huge_page(struct page *page,
402{ 402{
403 int i; 403 int i;
404 404
405 if (unlikely(sz > MAX_ORDER_NR_PAGES)) { 405 if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
406 clear_gigantic_page(page, addr, sz); 406 clear_gigantic_page(page, addr, sz);
407 return; 407 return;
408 } 408 }
@@ -1515,10 +1515,9 @@ static struct attribute_group hstate_attr_group = {
1515 .attrs = hstate_attrs, 1515 .attrs = hstate_attrs,
1516}; 1516};
1517 1517
1518static int __init hugetlb_sysfs_add_hstate(struct hstate *h, 1518static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
1519 struct kobject *parent, 1519 struct kobject **hstate_kobjs,
1520 struct kobject **hstate_kobjs, 1520 struct attribute_group *hstate_attr_group)
1521 struct attribute_group *hstate_attr_group)
1522{ 1521{
1523 int retval; 1522 int retval;
1524 int hi = h - hstates; 1523 int hi = h - hstates;
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index e1d85137f086..10ea71905c1f 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -3,18 +3,68 @@
3#include <linux/debugfs.h> 3#include <linux/debugfs.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/swap.h>
7#include <linux/pagemap.h>
8#include "internal.h"
6 9
7static struct dentry *hwpoison_dir, *corrupt_pfn; 10static struct dentry *hwpoison_dir;
8 11
9static int hwpoison_inject(void *data, u64 val) 12static int hwpoison_inject(void *data, u64 val)
10{ 13{
14 unsigned long pfn = val;
15 struct page *p;
16 int err;
17
18 if (!capable(CAP_SYS_ADMIN))
19 return -EPERM;
20
21 if (!hwpoison_filter_enable)
22 goto inject;
23 if (!pfn_valid(pfn))
24 return -ENXIO;
25
26 p = pfn_to_page(pfn);
27 /*
28 * This implies unable to support free buddy pages.
29 */
30 if (!get_page_unless_zero(p))
31 return 0;
32
33 if (!PageLRU(p))
34 shake_page(p, 0);
35 /*
36 * This implies unable to support non-LRU pages.
37 */
38 if (!PageLRU(p))
39 return 0;
40
41 /*
42 * do a racy check with elevated page count, to make sure PG_hwpoison
43 * will only be set for the targeted owner (or on a free page).
44 * We temporarily take page lock for try_get_mem_cgroup_from_page().
45 * __memory_failure() will redo the check reliably inside page lock.
46 */
47 lock_page(p);
48 err = hwpoison_filter(p);
49 unlock_page(p);
50 if (err)
51 return 0;
52
53inject:
54 printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
55 return __memory_failure(pfn, 18, MF_COUNT_INCREASED);
56}
57
58static int hwpoison_unpoison(void *data, u64 val)
59{
11 if (!capable(CAP_SYS_ADMIN)) 60 if (!capable(CAP_SYS_ADMIN))
12 return -EPERM; 61 return -EPERM;
13 printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); 62
14 return __memory_failure(val, 18, 0); 63 return unpoison_memory(val);
15} 64}
16 65
17DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); 66DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
67DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
18 68
19static void pfn_inject_exit(void) 69static void pfn_inject_exit(void)
20{ 70{
@@ -24,16 +74,63 @@ static void pfn_inject_exit(void)
24 74
25static int pfn_inject_init(void) 75static int pfn_inject_init(void)
26{ 76{
77 struct dentry *dentry;
78
27 hwpoison_dir = debugfs_create_dir("hwpoison", NULL); 79 hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
28 if (hwpoison_dir == NULL) 80 if (hwpoison_dir == NULL)
29 return -ENOMEM; 81 return -ENOMEM;
30 corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, 82
83 /*
84 * Note that the below poison/unpoison interfaces do not involve
85 * hardware status change, hence do not require hardware support.
86 * They are mainly for testing hwpoison in software level.
87 */
88 dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
31 NULL, &hwpoison_fops); 89 NULL, &hwpoison_fops);
32 if (corrupt_pfn == NULL) { 90 if (!dentry)
33 pfn_inject_exit(); 91 goto fail;
34 return -ENOMEM; 92
35 } 93 dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir,
94 NULL, &unpoison_fops);
95 if (!dentry)
96 goto fail;
97
98 dentry = debugfs_create_u32("corrupt-filter-enable", 0600,
99 hwpoison_dir, &hwpoison_filter_enable);
100 if (!dentry)
101 goto fail;
102
103 dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600,
104 hwpoison_dir, &hwpoison_filter_dev_major);
105 if (!dentry)
106 goto fail;
107
108 dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600,
109 hwpoison_dir, &hwpoison_filter_dev_minor);
110 if (!dentry)
111 goto fail;
112
113 dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600,
114 hwpoison_dir, &hwpoison_filter_flags_mask);
115 if (!dentry)
116 goto fail;
117
118 dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600,
119 hwpoison_dir, &hwpoison_filter_flags_value);
120 if (!dentry)
121 goto fail;
122
123#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
124 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
125 hwpoison_dir, &hwpoison_filter_memcg);
126 if (!dentry)
127 goto fail;
128#endif
129
36 return 0; 130 return 0;
131fail:
132 pfn_inject_exit();
133 return -ENOMEM;
37} 134}
38 135
39module_init(pfn_inject_init); 136module_init(pfn_inject_init);
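With the extended injector above, poisoning and unpoisoning can be driven entirely from user space through debugfs. A user-space sketch, assuming debugfs is mounted at /sys/kernel/debug and the caller has CAP_SYS_ADMIN; the pfn value is a placeholder:

/* Inject a software poison at 'pfn' via corrupt-pfn, then undo it
 * via unpoison-pfn. */
#include <stdio.h>

static int write_pfn(const char *path, unsigned long pfn)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%lu\n", pfn);
	return fclose(f);
}

int main(void)
{
	unsigned long pfn = 0x12345;	/* placeholder pfn */

	write_pfn("/sys/kernel/debug/hwpoison/corrupt-pfn", pfn);
	write_pfn("/sys/kernel/debug/hwpoison/unpoison-pfn", pfn);
	return 0;
}

The corrupt-filter-* files only take effect when corrupt-filter-enable is set, matching the hwpoison_filter_enable check in hwpoison_inject() above.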
diff --git a/mm/internal.h b/mm/internal.h
index 4fe67a162cb4..6a697bb97fc5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -50,6 +50,9 @@ extern void putback_lru_page(struct page *page);
50 */ 50 */
51extern void __free_pages_bootmem(struct page *page, unsigned int order); 51extern void __free_pages_bootmem(struct page *page, unsigned int order);
52extern void prep_compound_page(struct page *page, unsigned long order); 52extern void prep_compound_page(struct page *page, unsigned long order);
53#ifdef CONFIG_MEMORY_FAILURE
54extern bool is_free_buddy_page(struct page *page);
55#endif
53 56
54 57
55/* 58/*
@@ -247,3 +250,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
247#define ZONE_RECLAIM_SOME 0 250#define ZONE_RECLAIM_SOME 0
248#define ZONE_RECLAIM_SUCCESS 1 251#define ZONE_RECLAIM_SUCCESS 1
249#endif 252#endif
253
254extern int hwpoison_filter(struct page *p);
255
256extern u32 hwpoison_filter_dev_major;
257extern u32 hwpoison_filter_dev_minor;
258extern u64 hwpoison_filter_flags_mask;
259extern u64 hwpoison_filter_flags_value;
260extern u64 hwpoison_filter_memcg;
261extern u32 hwpoison_filter_enable;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 13f33b3081ec..5b069e4f5e48 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -93,6 +93,7 @@
93#include <linux/nodemask.h> 93#include <linux/nodemask.h>
94#include <linux/mm.h> 94#include <linux/mm.h>
95#include <linux/workqueue.h> 95#include <linux/workqueue.h>
96#include <linux/crc32.h>
96 97
97#include <asm/sections.h> 98#include <asm/sections.h>
98#include <asm/processor.h> 99#include <asm/processor.h>
@@ -108,7 +109,6 @@
108#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ 109#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
109#define SECS_FIRST_SCAN 60 /* delay before the first scan */ 110#define SECS_FIRST_SCAN 60 /* delay before the first scan */
110#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ 111#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
111#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */
112#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ 112#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */
113 113
114#define BYTES_PER_POINTER sizeof(void *) 114#define BYTES_PER_POINTER sizeof(void *)
@@ -119,8 +119,8 @@
119/* scanning area inside a memory block */ 119/* scanning area inside a memory block */
120struct kmemleak_scan_area { 120struct kmemleak_scan_area {
121 struct hlist_node node; 121 struct hlist_node node;
122 unsigned long offset; 122 unsigned long start;
123 size_t length; 123 size_t size;
124}; 124};
125 125
126#define KMEMLEAK_GREY 0 126#define KMEMLEAK_GREY 0
@@ -149,6 +149,8 @@ struct kmemleak_object {
149 int min_count; 149 int min_count;
150 /* the total number of pointers found pointing to this object */ 150 /* the total number of pointers found pointing to this object */
151 int count; 151 int count;
152 /* checksum for detecting modified objects */
153 u32 checksum;
152 /* memory ranges to be scanned inside an object (empty for all) */ 154 /* memory ranges to be scanned inside an object (empty for all) */
153 struct hlist_head area_list; 155 struct hlist_head area_list;
154 unsigned long trace[MAX_TRACE]; 156 unsigned long trace[MAX_TRACE];
@@ -164,8 +166,6 @@ struct kmemleak_object {
164#define OBJECT_REPORTED (1 << 1) 166#define OBJECT_REPORTED (1 << 1)
165/* flag set to not scan the object */ 167/* flag set to not scan the object */
166#define OBJECT_NO_SCAN (1 << 2) 168#define OBJECT_NO_SCAN (1 << 2)
167/* flag set on newly allocated objects */
168#define OBJECT_NEW (1 << 3)
169 169
170/* number of bytes to print per line; must be 16 or 32 */ 170/* number of bytes to print per line; must be 16 or 32 */
171#define HEX_ROW_SIZE 16 171#define HEX_ROW_SIZE 16
@@ -241,8 +241,6 @@ struct early_log {
241 const void *ptr; /* allocated/freed memory block */ 241 const void *ptr; /* allocated/freed memory block */
242 size_t size; /* memory block size */ 242 size_t size; /* memory block size */
243 int min_count; /* minimum reference count */ 243 int min_count; /* minimum reference count */
244 unsigned long offset; /* scan area offset */
245 size_t length; /* scan area length */
246 unsigned long trace[MAX_TRACE]; /* stack trace */ 244 unsigned long trace[MAX_TRACE]; /* stack trace */
247 unsigned int trace_len; /* stack trace length */ 245 unsigned int trace_len; /* stack trace length */
248}; 246};
@@ -323,11 +321,6 @@ static bool color_gray(const struct kmemleak_object *object)
323 object->count >= object->min_count; 321 object->count >= object->min_count;
324} 322}
325 323
326static bool color_black(const struct kmemleak_object *object)
327{
328 return object->min_count == KMEMLEAK_BLACK;
329}
330
331/* 324/*
332 * Objects are considered unreferenced only if their color is white, they have 325 * Objects are considered unreferenced only if their color is white, they have
333 * not be deleted and have a minimum age to avoid false positives caused by 326 * not be deleted and have a minimum age to avoid false positives caused by
@@ -335,7 +328,7 @@ static bool color_black(const struct kmemleak_object *object)
335 */ 328 */
336static bool unreferenced_object(struct kmemleak_object *object) 329static bool unreferenced_object(struct kmemleak_object *object)
337{ 330{
338 return (object->flags & OBJECT_ALLOCATED) && color_white(object) && 331 return (color_white(object) && object->flags & OBJECT_ALLOCATED) &&
339 time_before_eq(object->jiffies + jiffies_min_age, 332 time_before_eq(object->jiffies + jiffies_min_age,
340 jiffies_last_scan); 333 jiffies_last_scan);
341} 334}
@@ -348,11 +341,13 @@ static void print_unreferenced(struct seq_file *seq,
348 struct kmemleak_object *object) 341 struct kmemleak_object *object)
349{ 342{
350 int i; 343 int i;
344 unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);
351 345
352 seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", 346 seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
353 object->pointer, object->size); 347 object->pointer, object->size);
354 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", 348 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
355 object->comm, object->pid, object->jiffies); 349 object->comm, object->pid, object->jiffies,
350 msecs_age / 1000, msecs_age % 1000);
356 hex_dump_object(seq, object); 351 hex_dump_object(seq, object);
357 seq_printf(seq, " backtrace:\n"); 352 seq_printf(seq, " backtrace:\n");
358 353
@@ -381,6 +376,7 @@ static void dump_object_info(struct kmemleak_object *object)
381 pr_notice(" min_count = %d\n", object->min_count); 376 pr_notice(" min_count = %d\n", object->min_count);
382 pr_notice(" count = %d\n", object->count); 377 pr_notice(" count = %d\n", object->count);
383 pr_notice(" flags = 0x%lx\n", object->flags); 378 pr_notice(" flags = 0x%lx\n", object->flags);
379 pr_notice(" checksum = %d\n", object->checksum);
384 pr_notice(" backtrace:\n"); 380 pr_notice(" backtrace:\n");
385 print_stack_trace(&trace, 4); 381 print_stack_trace(&trace, 4);
386} 382}
@@ -522,12 +518,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
522 INIT_HLIST_HEAD(&object->area_list); 518 INIT_HLIST_HEAD(&object->area_list);
523 spin_lock_init(&object->lock); 519 spin_lock_init(&object->lock);
524 atomic_set(&object->use_count, 1); 520 atomic_set(&object->use_count, 1);
525 object->flags = OBJECT_ALLOCATED | OBJECT_NEW; 521 object->flags = OBJECT_ALLOCATED;
526 object->pointer = ptr; 522 object->pointer = ptr;
527 object->size = size; 523 object->size = size;
528 object->min_count = min_count; 524 object->min_count = min_count;
529 object->count = -1; /* no color initially */ 525 object->count = 0; /* white color initially */
530 object->jiffies = jiffies; 526 object->jiffies = jiffies;
527 object->checksum = 0;
531 528
532 /* task information */ 529 /* task information */
533 if (in_irq()) { 530 if (in_irq()) {
@@ -720,14 +717,13 @@ static void make_black_object(unsigned long ptr)
720 * Add a scanning area to the object. If at least one such area is added, 717 * Add a scanning area to the object. If at least one such area is added,
721 * kmemleak will only scan these ranges rather than the whole memory block. 718 * kmemleak will only scan these ranges rather than the whole memory block.
722 */ 719 */
723static void add_scan_area(unsigned long ptr, unsigned long offset, 720static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
724 size_t length, gfp_t gfp)
725{ 721{
726 unsigned long flags; 722 unsigned long flags;
727 struct kmemleak_object *object; 723 struct kmemleak_object *object;
728 struct kmemleak_scan_area *area; 724 struct kmemleak_scan_area *area;
729 725
730 object = find_and_get_object(ptr, 0); 726 object = find_and_get_object(ptr, 1);
731 if (!object) { 727 if (!object) {
732 kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", 728 kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n",
733 ptr); 729 ptr);
@@ -741,7 +737,7 @@ static void add_scan_area(unsigned long ptr, unsigned long offset,
741 } 737 }
742 738
743 spin_lock_irqsave(&object->lock, flags); 739 spin_lock_irqsave(&object->lock, flags);
744 if (offset + length > object->size) { 740 if (ptr + size > object->pointer + object->size) {
745 kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); 741 kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
746 dump_object_info(object); 742 dump_object_info(object);
747 kmem_cache_free(scan_area_cache, area); 743 kmem_cache_free(scan_area_cache, area);
@@ -749,8 +745,8 @@ static void add_scan_area(unsigned long ptr, unsigned long offset,
749 } 745 }
750 746
751 INIT_HLIST_NODE(&area->node); 747 INIT_HLIST_NODE(&area->node);
752 area->offset = offset; 748 area->start = ptr;
753 area->length = length; 749 area->size = size;
754 750
755 hlist_add_head(&area->node, &object->area_list); 751 hlist_add_head(&area->node, &object->area_list);
756out_unlock: 752out_unlock:
@@ -786,7 +782,7 @@ static void object_no_scan(unsigned long ptr)
786 * processed later once kmemleak is fully initialized. 782 * processed later once kmemleak is fully initialized.
787 */ 783 */
788static void __init log_early(int op_type, const void *ptr, size_t size, 784static void __init log_early(int op_type, const void *ptr, size_t size,
789 int min_count, unsigned long offset, size_t length) 785 int min_count)
790{ 786{
791 unsigned long flags; 787 unsigned long flags;
792 struct early_log *log; 788 struct early_log *log;
@@ -808,8 +804,6 @@ static void __init log_early(int op_type, const void *ptr, size_t size,
808 log->ptr = ptr; 804 log->ptr = ptr;
809 log->size = size; 805 log->size = size;
810 log->min_count = min_count; 806 log->min_count = min_count;
811 log->offset = offset;
812 log->length = length;
813 if (op_type == KMEMLEAK_ALLOC) 807 if (op_type == KMEMLEAK_ALLOC)
814 log->trace_len = __save_stack_trace(log->trace); 808 log->trace_len = __save_stack_trace(log->trace);
815 crt_early_log++; 809 crt_early_log++;
@@ -858,7 +852,7 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
858 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 852 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
859 create_object((unsigned long)ptr, size, min_count, gfp); 853 create_object((unsigned long)ptr, size, min_count, gfp);
860 else if (atomic_read(&kmemleak_early_log)) 854 else if (atomic_read(&kmemleak_early_log))
861 log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); 855 log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
862} 856}
863EXPORT_SYMBOL_GPL(kmemleak_alloc); 857EXPORT_SYMBOL_GPL(kmemleak_alloc);
864 858
@@ -873,7 +867,7 @@ void __ref kmemleak_free(const void *ptr)
873 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 867 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
874 delete_object_full((unsigned long)ptr); 868 delete_object_full((unsigned long)ptr);
875 else if (atomic_read(&kmemleak_early_log)) 869 else if (atomic_read(&kmemleak_early_log))
876 log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); 870 log_early(KMEMLEAK_FREE, ptr, 0, 0);
877} 871}
878EXPORT_SYMBOL_GPL(kmemleak_free); 872EXPORT_SYMBOL_GPL(kmemleak_free);
879 873
@@ -888,7 +882,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
888 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 882 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
889 delete_object_part((unsigned long)ptr, size); 883 delete_object_part((unsigned long)ptr, size);
890 else if (atomic_read(&kmemleak_early_log)) 884 else if (atomic_read(&kmemleak_early_log))
891 log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); 885 log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
892} 886}
893EXPORT_SYMBOL_GPL(kmemleak_free_part); 887EXPORT_SYMBOL_GPL(kmemleak_free_part);
894 888
@@ -903,7 +897,7 @@ void __ref kmemleak_not_leak(const void *ptr)
903 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 897 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
904 make_gray_object((unsigned long)ptr); 898 make_gray_object((unsigned long)ptr);
905 else if (atomic_read(&kmemleak_early_log)) 899 else if (atomic_read(&kmemleak_early_log))
906 log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); 900 log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
907} 901}
908EXPORT_SYMBOL(kmemleak_not_leak); 902EXPORT_SYMBOL(kmemleak_not_leak);
909 903
@@ -919,22 +913,21 @@ void __ref kmemleak_ignore(const void *ptr)
919 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 913 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
920 make_black_object((unsigned long)ptr); 914 make_black_object((unsigned long)ptr);
921 else if (atomic_read(&kmemleak_early_log)) 915 else if (atomic_read(&kmemleak_early_log))
922 log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); 916 log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
923} 917}
924EXPORT_SYMBOL(kmemleak_ignore); 918EXPORT_SYMBOL(kmemleak_ignore);
925 919
926/* 920/*
927 * Limit the range to be scanned in an allocated memory block. 921 * Limit the range to be scanned in an allocated memory block.
928 */ 922 */
929void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, 923void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
930 size_t length, gfp_t gfp)
931{ 924{
932 pr_debug("%s(0x%p)\n", __func__, ptr); 925 pr_debug("%s(0x%p)\n", __func__, ptr);
933 926
934 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 927 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
935 add_scan_area((unsigned long)ptr, offset, length, gfp); 928 add_scan_area((unsigned long)ptr, size, gfp);
936 else if (atomic_read(&kmemleak_early_log)) 929 else if (atomic_read(&kmemleak_early_log))
937 log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); 930 log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
938} 931}
939EXPORT_SYMBOL(kmemleak_scan_area); 932EXPORT_SYMBOL(kmemleak_scan_area);
940 933
@@ -948,11 +941,25 @@ void __ref kmemleak_no_scan(const void *ptr)
948 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 941 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
949 object_no_scan((unsigned long)ptr); 942 object_no_scan((unsigned long)ptr);
950 else if (atomic_read(&kmemleak_early_log)) 943 else if (atomic_read(&kmemleak_early_log))
951 log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); 944 log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
952} 945}
953EXPORT_SYMBOL(kmemleak_no_scan); 946EXPORT_SYMBOL(kmemleak_no_scan);
954 947
955/* 948/*
949 * Update an object's checksum and return true if it was modified.
950 */
951static bool update_checksum(struct kmemleak_object *object)
952{
953 u32 old_csum = object->checksum;
954
955 if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
956 return false;
957
958 object->checksum = crc32(0, (void *)object->pointer, object->size);
959 return object->checksum != old_csum;
960}
961
962/*
956 * Memory scanning is a long process and it needs to be interruptable. This 963 * Memory scanning is a long process and it needs to be interruptable. This
957 * function checks whether such interrupt condition occured. 964 * function checks whether such interrupt condition occured.
958 */ 965 */
@@ -1031,11 +1038,14 @@ static void scan_block(void *_start, void *_end,
1031 * added to the gray_list. 1038 * added to the gray_list.
1032 */ 1039 */
1033 object->count++; 1040 object->count++;
1034 if (color_gray(object)) 1041 if (color_gray(object)) {
1035 list_add_tail(&object->gray_list, &gray_list); 1042 list_add_tail(&object->gray_list, &gray_list);
1036 else 1043 spin_unlock_irqrestore(&object->lock, flags);
1037 put_object(object); 1044 continue;
1045 }
1046
1038 spin_unlock_irqrestore(&object->lock, flags); 1047 spin_unlock_irqrestore(&object->lock, flags);
1048 put_object(object);
1039 } 1049 }
1040} 1050}
1041 1051
@@ -1075,14 +1085,47 @@ static void scan_object(struct kmemleak_object *object)
1075 } 1085 }
1076 } else 1086 } else
1077 hlist_for_each_entry(area, elem, &object->area_list, node) 1087 hlist_for_each_entry(area, elem, &object->area_list, node)
1078 scan_block((void *)(object->pointer + area->offset), 1088 scan_block((void *)area->start,
1079 (void *)(object->pointer + area->offset 1089 (void *)(area->start + area->size),
1080 + area->length), object, 0); 1090 object, 0);
1081out: 1091out:
1082 spin_unlock_irqrestore(&object->lock, flags); 1092 spin_unlock_irqrestore(&object->lock, flags);
1083} 1093}
1084 1094
1085/* 1095/*
1096 * Scan the objects already referenced (gray objects). More objects will be
1097 * referenced and, if there are no memory leaks, all the objects are scanned.
1098 */
1099static void scan_gray_list(void)
1100{
1101 struct kmemleak_object *object, *tmp;
1102
1103 /*
1104 * The list traversal is safe for both tail additions and removals
1105 * from inside the loop. The kmemleak objects cannot be freed from
1106 * outside the loop because their use_count was incremented.
1107 */
1108 object = list_entry(gray_list.next, typeof(*object), gray_list);
1109 while (&object->gray_list != &gray_list) {
1110 cond_resched();
1111
1112 /* may add new objects to the list */
1113 if (!scan_should_stop())
1114 scan_object(object);
1115
1116 tmp = list_entry(object->gray_list.next, typeof(*object),
1117 gray_list);
1118
1119 /* remove the object from the list and release it */
1120 list_del(&object->gray_list);
1121 put_object(object);
1122
1123 object = tmp;
1124 }
1125 WARN_ON(!list_empty(&gray_list));
1126}
1127
1128/*
1086 * Scan data sections and all the referenced memory blocks allocated via the 1129 * Scan data sections and all the referenced memory blocks allocated via the
1087 * kernel's standard allocators. This function must be called with the 1130 * kernel's standard allocators. This function must be called with the
1088 * scan_mutex held. 1131 * scan_mutex held.
@@ -1090,10 +1133,9 @@ out:
1090static void kmemleak_scan(void) 1133static void kmemleak_scan(void)
1091{ 1134{
1092 unsigned long flags; 1135 unsigned long flags;
1093 struct kmemleak_object *object, *tmp; 1136 struct kmemleak_object *object;
1094 int i; 1137 int i;
1095 int new_leaks = 0; 1138 int new_leaks = 0;
1096 int gray_list_pass = 0;
1097 1139
1098 jiffies_last_scan = jiffies; 1140 jiffies_last_scan = jiffies;
1099 1141
@@ -1114,7 +1156,6 @@ static void kmemleak_scan(void)
1114#endif 1156#endif
1115 /* reset the reference count (whiten the object) */ 1157 /* reset the reference count (whiten the object) */
1116 object->count = 0; 1158 object->count = 0;
1117 object->flags &= ~OBJECT_NEW;
1118 if (color_gray(object) && get_object(object)) 1159 if (color_gray(object) && get_object(object))
1119 list_add_tail(&object->gray_list, &gray_list); 1160 list_add_tail(&object->gray_list, &gray_list);
1120 1161
@@ -1172,62 +1213,36 @@ static void kmemleak_scan(void)
1172 1213
1173 /* 1214 /*
1174 * Scan the objects already referenced from the sections scanned 1215 * Scan the objects already referenced from the sections scanned
1175 * above. More objects will be referenced and, if there are no memory 1216 * above.
1176 * leaks, all the objects will be scanned. The list traversal is safe
1177 * for both tail additions and removals from inside the loop. The
1178 * kmemleak objects cannot be freed from outside the loop because their
1179 * use_count was increased.
1180 */ 1217 */
1181repeat: 1218 scan_gray_list();
1182 object = list_entry(gray_list.next, typeof(*object), gray_list);
1183 while (&object->gray_list != &gray_list) {
1184 cond_resched();
1185
1186 /* may add new objects to the list */
1187 if (!scan_should_stop())
1188 scan_object(object);
1189
1190 tmp = list_entry(object->gray_list.next, typeof(*object),
1191 gray_list);
1192
1193 /* remove the object from the list and release it */
1194 list_del(&object->gray_list);
1195 put_object(object);
1196
1197 object = tmp;
1198 }
1199
1200 if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES)
1201 goto scan_end;
1202 1219
1203 /* 1220 /*
1204 * Check for new objects allocated during this scanning and add them 1221 * Check for new or unreferenced objects modified since the previous
1205 * to the gray list. 1222 * scan and color them gray until the next scan.
1206 */ 1223 */
1207 rcu_read_lock(); 1224 rcu_read_lock();
1208 list_for_each_entry_rcu(object, &object_list, object_list) { 1225 list_for_each_entry_rcu(object, &object_list, object_list) {
1209 spin_lock_irqsave(&object->lock, flags); 1226 spin_lock_irqsave(&object->lock, flags);
1210 if ((object->flags & OBJECT_NEW) && !color_black(object) && 1227 if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
1211 get_object(object)) { 1228 && update_checksum(object) && get_object(object)) {
1212 object->flags &= ~OBJECT_NEW; 1229 /* color it gray temporarily */
1230 object->count = object->min_count;
1213 list_add_tail(&object->gray_list, &gray_list); 1231 list_add_tail(&object->gray_list, &gray_list);
1214 } 1232 }
1215 spin_unlock_irqrestore(&object->lock, flags); 1233 spin_unlock_irqrestore(&object->lock, flags);
1216 } 1234 }
1217 rcu_read_unlock(); 1235 rcu_read_unlock();
1218 1236
1219 if (!list_empty(&gray_list)) 1237 /*
1220 goto repeat; 1238 * Re-scan the gray list for modified unreferenced objects.
1221 1239 */
1222scan_end: 1240 scan_gray_list();
1223 WARN_ON(!list_empty(&gray_list));
1224 1241
1225 /* 1242 /*
1226 * If scanning was stopped or new objects were being allocated at a 1243 * If scanning was stopped do not report any new unreferenced objects.
1227 * higher rate than gray list scanning, do not report any new
1228 * unreferenced objects.
1229 */ 1244 */
1230 if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES) 1245 if (scan_should_stop())
1231 return; 1246 return;
1232 1247
1233 /* 1248 /*
@@ -1642,8 +1657,7 @@ void __init kmemleak_init(void)
1642 kmemleak_ignore(log->ptr); 1657 kmemleak_ignore(log->ptr);
1643 break; 1658 break;
1644 case KMEMLEAK_SCAN_AREA: 1659 case KMEMLEAK_SCAN_AREA:
1645 kmemleak_scan_area(log->ptr, log->offset, log->length, 1660 kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
1646 GFP_KERNEL);
1647 break; 1661 break;
1648 case KMEMLEAK_NO_SCAN: 1662 case KMEMLEAK_NO_SCAN:
1649 kmemleak_no_scan(log->ptr); 1663 kmemleak_no_scan(log->ptr);
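The kmemleak_scan_area() prototype above changes from an (offset, length) pair relative to the object start to a pointer inside the object plus a size. A sketch of how a caller adapts; struct foo and foo_alloc() are made up for illustration:

/* Only f->buf can hold references to other allocations, so restrict
 * kmemleak scanning to that member. */
struct foo {
	unsigned long cookie;		/* no pointers, not worth scanning */
	void *buf[16];			/* the only part holding references */
};

static struct foo *foo_alloc(gfp_t gfp)
{
	struct foo *f = kmalloc(sizeof(*f), gfp);

	if (f)
		/* old call: kmemleak_scan_area(f, offsetof(struct foo, buf),
		 *                              sizeof(f->buf), gfp); */
		kmemleak_scan_area(&f->buf, sizeof(f->buf), gfp);
	return f;
}

Because add_scan_area() now looks the object up by the interior pointer, the range must lie entirely inside the tracked allocation, as the size check in the hunk above enforces.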
diff --git a/mm/maccess.c b/mm/maccess.c
index 9073695ff25f..4e348dbaecd7 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -14,7 +14,11 @@
14 * Safely read from address @src to the buffer at @dst. If a kernel fault 14 * Safely read from address @src to the buffer at @dst. If a kernel fault
15 * happens, handle that and return -EFAULT. 15 * happens, handle that and return -EFAULT.
16 */ 16 */
17long probe_kernel_read(void *dst, void *src, size_t size) 17
18long __weak probe_kernel_read(void *dst, void *src, size_t size)
19 __attribute__((alias("__probe_kernel_read")));
20
21long __probe_kernel_read(void *dst, void *src, size_t size)
18{ 22{
19 long ret; 23 long ret;
20 mm_segment_t old_fs = get_fs(); 24 mm_segment_t old_fs = get_fs();
@@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
39 * Safely write to address @dst from the buffer at @src. If a kernel fault 43 * Safely write to address @dst from the buffer at @src. If a kernel fault
40 * happens, handle that and return -EFAULT. 44 * happens, handle that and return -EFAULT.
41 */ 45 */
42long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) 46long __weak probe_kernel_write(void *dst, void *src, size_t size)
47 __attribute__((alias("__probe_kernel_write")));
48
49long __probe_kernel_write(void *dst, void *src, size_t size)
43{ 50{
44 long ret; 51 long ret;
45 mm_segment_t old_fs = get_fs(); 52 mm_segment_t old_fs = get_fs();
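The maccess.c change above turns probe_kernel_read() and probe_kernel_write() into weak aliases of __probe_kernel_read()/__probe_kernel_write(), so an architecture can supply its own strong definition and still fall back to the generic copy. A sketch of such an override; example_addr_is_special() and example_special_read() are hypothetical helpers, not real kernel functions:

/* Hypothetical arch override: handle special address ranges locally,
 * defer everything else to the generic implementation. */
long probe_kernel_read(void *dst, void *src, size_t size)
{
	if (example_addr_is_special(src))
		return example_special_read(dst, src, size);

	return __probe_kernel_read(dst, src, size);
}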
diff --git a/mm/madvise.c b/mm/madvise.c
index 35b1479b7c9d..319528b8db74 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -9,6 +9,7 @@
9#include <linux/pagemap.h> 9#include <linux/pagemap.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
12#include <linux/page-isolation.h>
12#include <linux/hugetlb.h> 13#include <linux/hugetlb.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/ksm.h> 15#include <linux/ksm.h>
@@ -222,7 +223,7 @@ static long madvise_remove(struct vm_area_struct *vma,
222/* 223/*
223 * Error injection support for memory error handling. 224 * Error injection support for memory error handling.
224 */ 225 */
225static int madvise_hwpoison(unsigned long start, unsigned long end) 226static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
226{ 227{
227 int ret = 0; 228 int ret = 0;
228 229
@@ -230,15 +231,21 @@ static int madvise_hwpoison(unsigned long start, unsigned long end)
230 return -EPERM; 231 return -EPERM;
231 for (; start < end; start += PAGE_SIZE) { 232 for (; start < end; start += PAGE_SIZE) {
232 struct page *p; 233 struct page *p;
233 int ret = get_user_pages(current, current->mm, start, 1, 234 int ret = get_user_pages_fast(start, 1, 0, &p);
234 0, 0, &p, NULL);
235 if (ret != 1) 235 if (ret != 1)
236 return ret; 236 return ret;
237 if (bhv == MADV_SOFT_OFFLINE) {
238 printk(KERN_INFO "Soft offlining page %lx at %lx\n",
239 page_to_pfn(p), start);
240 ret = soft_offline_page(p, MF_COUNT_INCREASED);
241 if (ret)
242 break;
243 continue;
244 }
237 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", 245 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
238 page_to_pfn(p), start); 246 page_to_pfn(p), start);
239 /* Ignore return value for now */ 247 /* Ignore return value for now */
240 __memory_failure(page_to_pfn(p), 0, 1); 248 __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
241 put_page(p);
242 } 249 }
243 return ret; 250 return ret;
244} 251}
@@ -335,8 +342,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
335 size_t len; 342 size_t len;
336 343
337#ifdef CONFIG_MEMORY_FAILURE 344#ifdef CONFIG_MEMORY_FAILURE
338 if (behavior == MADV_HWPOISON) 345 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
339 return madvise_hwpoison(start, start+len_in); 346 return madvise_hwpoison(behavior, start, start+len_in);
340#endif 347#endif
341 if (!madvise_behavior_valid(behavior)) 348 if (!madvise_behavior_valid(behavior))
342 return error; 349 return error;
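madvise_hwpoison() above now also handles MADV_SOFT_OFFLINE, which migrates the page contents away and retires the physical page rather than forcibly poisoning it. A user-space sketch, assuming a kernel built with CONFIG_MEMORY_FAILURE and a caller with CAP_SYS_ADMIN; the fallback MADV_SOFT_OFFLINE value matches asm-generic/mman-common.h from this series, but the system headers should be preferred when they define it:

#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* asm-generic value at the time of this series */
#endif

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	char *buf;

	if (posix_memalign((void **)&buf, pagesize, pagesize))
		return 1;
	buf[0] = 1;		/* make sure the page is populated */

	/* Contents are preserved; only the backing physical page is retired. */
	return madvise(buf, pagesize, MADV_SOFT_OFFLINE);
}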
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 878808c4fcbe..954032b80bed 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -283,6 +283,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
283 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 283 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
284} 284}
285 285
286struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
287{
288 return &mem->css;
289}
290
286static struct mem_cgroup_per_zone * 291static struct mem_cgroup_per_zone *
287page_cgroup_zoneinfo(struct page_cgroup *pc) 292page_cgroup_zoneinfo(struct page_cgroup *pc)
288{ 293{
@@ -1536,25 +1541,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1536 return container_of(css, struct mem_cgroup, css); 1541 return container_of(css, struct mem_cgroup, css);
1537} 1542}
1538 1543
1539static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) 1544struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
1540{ 1545{
1541 struct mem_cgroup *mem; 1546 struct mem_cgroup *mem = NULL;
1542 struct page_cgroup *pc; 1547 struct page_cgroup *pc;
1543 unsigned short id; 1548 unsigned short id;
1544 swp_entry_t ent; 1549 swp_entry_t ent;
1545 1550
1546 VM_BUG_ON(!PageLocked(page)); 1551 VM_BUG_ON(!PageLocked(page));
1547 1552
1548 if (!PageSwapCache(page))
1549 return NULL;
1550
1551 pc = lookup_page_cgroup(page); 1553 pc = lookup_page_cgroup(page);
1552 lock_page_cgroup(pc); 1554 lock_page_cgroup(pc);
1553 if (PageCgroupUsed(pc)) { 1555 if (PageCgroupUsed(pc)) {
1554 mem = pc->mem_cgroup; 1556 mem = pc->mem_cgroup;
1555 if (mem && !css_tryget(&mem->css)) 1557 if (mem && !css_tryget(&mem->css))
1556 mem = NULL; 1558 mem = NULL;
1557 } else { 1559 } else if (PageSwapCache(page)) {
1558 ent.val = page_private(page); 1560 ent.val = page_private(page);
1559 id = lookup_swap_cgroup(ent); 1561 id = lookup_swap_cgroup(ent);
1560 rcu_read_lock(); 1562 rcu_read_lock();
@@ -1874,7 +1876,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1874 */ 1876 */
1875 if (!PageSwapCache(page)) 1877 if (!PageSwapCache(page))
1876 goto charge_cur_mm; 1878 goto charge_cur_mm;
1877 mem = try_get_mem_cgroup_from_swapcache(page); 1879 mem = try_get_mem_cgroup_from_page(page);
1878 if (!mem) 1880 if (!mem)
1879 goto charge_cur_mm; 1881 goto charge_cur_mm;
1880 *ptr = mem; 1882 *ptr = mem;
@@ -2584,7 +2586,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
2584 if (free_all) 2586 if (free_all)
2585 goto try_to_free; 2587 goto try_to_free;
2586move_account: 2588move_account:
2587 while (mem->res.usage > 0) { 2589 do {
2588 ret = -EBUSY; 2590 ret = -EBUSY;
2589 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 2591 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
2590 goto out; 2592 goto out;
@@ -2612,8 +2614,8 @@ move_account:
2612 if (ret == -ENOMEM) 2614 if (ret == -ENOMEM)
2613 goto try_to_free; 2615 goto try_to_free;
2614 cond_resched(); 2616 cond_resched();
2615 } 2617 /* "ret" should also be checked to ensure all lists are empty. */
2616 ret = 0; 2618 } while (mem->res.usage > 0 || ret);
2617out: 2619out:
2618 css_put(&mem->css); 2620 css_put(&mem->css);
2619 return ret; 2621 return ret;
@@ -2646,10 +2648,7 @@ try_to_free:
2646 } 2648 }
2647 lru_add_drain(); 2649 lru_add_drain();
2648 /* try move_account...there may be some *locked* pages. */ 2650 /* try move_account...there may be some *locked* pages. */
2649 if (mem->res.usage) 2651 goto move_account;
2650 goto move_account;
2651 ret = 0;
2652 goto out;
2653} 2652}
2654 2653
2655int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 2654int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 50d4f8d7024a..17299fd4577c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -34,12 +34,16 @@
34#include <linux/kernel.h> 34#include <linux/kernel.h>
35#include <linux/mm.h> 35#include <linux/mm.h>
36#include <linux/page-flags.h> 36#include <linux/page-flags.h>
37#include <linux/kernel-page-flags.h>
37#include <linux/sched.h> 38#include <linux/sched.h>
38#include <linux/ksm.h> 39#include <linux/ksm.h>
39#include <linux/rmap.h> 40#include <linux/rmap.h>
40#include <linux/pagemap.h> 41#include <linux/pagemap.h>
41#include <linux/swap.h> 42#include <linux/swap.h>
42#include <linux/backing-dev.h> 43#include <linux/backing-dev.h>
44#include <linux/migrate.h>
45#include <linux/page-isolation.h>
46#include <linux/suspend.h>
43#include "internal.h" 47#include "internal.h"
44 48
45int sysctl_memory_failure_early_kill __read_mostly = 0; 49int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -48,6 +52,129 @@ int sysctl_memory_failure_recovery __read_mostly = 1;
48 52
49atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); 53atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
50 54
55#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
56
57u32 hwpoison_filter_enable = 0;
58u32 hwpoison_filter_dev_major = ~0U;
59u32 hwpoison_filter_dev_minor = ~0U;
60u64 hwpoison_filter_flags_mask;
61u64 hwpoison_filter_flags_value;
62EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
63EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
64EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
65EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
66EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
67
68static int hwpoison_filter_dev(struct page *p)
69{
70 struct address_space *mapping;
71 dev_t dev;
72
73 if (hwpoison_filter_dev_major == ~0U &&
74 hwpoison_filter_dev_minor == ~0U)
75 return 0;
76
77 /*
78 * page_mapping() does not accept slab page
79 */
80 if (PageSlab(p))
81 return -EINVAL;
82
83 mapping = page_mapping(p);
84 if (mapping == NULL || mapping->host == NULL)
85 return -EINVAL;
86
87 dev = mapping->host->i_sb->s_dev;
88 if (hwpoison_filter_dev_major != ~0U &&
89 hwpoison_filter_dev_major != MAJOR(dev))
90 return -EINVAL;
91 if (hwpoison_filter_dev_minor != ~0U &&
92 hwpoison_filter_dev_minor != MINOR(dev))
93 return -EINVAL;
94
95 return 0;
96}
97
98static int hwpoison_filter_flags(struct page *p)
99{
100 if (!hwpoison_filter_flags_mask)
101 return 0;
102
103 if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
104 hwpoison_filter_flags_value)
105 return 0;
106 else
107 return -EINVAL;
108}
109
110/*
111 * This allows stress tests to limit test scope to a collection of tasks
112 * by putting them under some memcg. This prevents killing unrelated/important
113 * processes such as /sbin/init. Note that the target task may share clean
114 * pages with init (eg. libc text), which is harmless. If the target task
115 * share _dirty_ pages with another task B, the test scheme must make sure B
116 * is also included in the memcg. At last, due to race conditions this filter
117 * can only guarantee that the page either belongs to the memcg tasks, or is
118 * a freed page.
119 */
120#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
121u64 hwpoison_filter_memcg;
122EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
123static int hwpoison_filter_task(struct page *p)
124{
125 struct mem_cgroup *mem;
126 struct cgroup_subsys_state *css;
127 unsigned long ino;
128
129 if (!hwpoison_filter_memcg)
130 return 0;
131
132 mem = try_get_mem_cgroup_from_page(p);
133 if (!mem)
134 return -EINVAL;
135
136 css = mem_cgroup_css(mem);
137 /* root_mem_cgroup has NULL dentries */
138 if (!css->cgroup->dentry)
139 return -EINVAL;
140
141 ino = css->cgroup->dentry->d_inode->i_ino;
142 css_put(css);
143
144 if (ino != hwpoison_filter_memcg)
145 return -EINVAL;
146
147 return 0;
148}
149#else
150static int hwpoison_filter_task(struct page *p) { return 0; }
151#endif
152
153int hwpoison_filter(struct page *p)
154{
155 if (!hwpoison_filter_enable)
156 return 0;
157
158 if (hwpoison_filter_dev(p))
159 return -EINVAL;
160
161 if (hwpoison_filter_flags(p))
162 return -EINVAL;
163
164 if (hwpoison_filter_task(p))
165 return -EINVAL;
166
167 return 0;
168}
169#else
170int hwpoison_filter(struct page *p)
171{
172 return 0;
173}
174#endif
175
176EXPORT_SYMBOL_GPL(hwpoison_filter);
177
51/* 178/*
52 * Send all the processes who have the page mapped an ``action optional'' 179 * Send all the processes who have the page mapped an ``action optional''
53 * signal. 180 * signal.
@@ -83,6 +210,36 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
83} 210}
84 211
85/* 212/*
213 * When a unknown page type is encountered drain as many buffers as possible
214 * in the hope to turn the page into a LRU or free page, which we can handle.
215 */
216void shake_page(struct page *p, int access)
217{
218 if (!PageSlab(p)) {
219 lru_add_drain_all();
220 if (PageLRU(p))
221 return;
222 drain_all_pages();
223 if (PageLRU(p) || is_free_buddy_page(p))
224 return;
225 }
226
227 /*
228 * Only all shrink_slab here (which would also
229 * shrink other caches) if access is not potentially fatal.
230 */
231 if (access) {
232 int nr;
233 do {
234 nr = shrink_slab(1000, GFP_KERNEL, 1000);
235 if (page_count(p) == 0)
236 break;
237 } while (nr > 10);
238 }
239}
240EXPORT_SYMBOL_GPL(shake_page);
241
242/*
86 * Kill all processes that have a poisoned page mapped and then isolate 243 * Kill all processes that have a poisoned page mapped and then isolate
87 * the page. 244 * the page.
88 * 245 *
@@ -177,7 +334,6 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
177 * In case something went wrong with munmapping 334 * In case something went wrong with munmapping
178 * make sure the process doesn't catch the 335 * make sure the process doesn't catch the
179 * signal and then access the memory. Just kill it. 336 * signal and then access the memory. Just kill it.
180 * the signal handlers
181 */ 337 */
182 if (fail || tk->addr_valid == 0) { 338 if (fail || tk->addr_valid == 0) {
183 printk(KERN_ERR 339 printk(KERN_ERR
@@ -314,33 +470,49 @@ static void collect_procs(struct page *page, struct list_head *tokill)
314 */ 470 */
315 471
316enum outcome { 472enum outcome {
317 FAILED, /* Error handling failed */ 473 IGNORED, /* Error: cannot be handled */
474 FAILED, /* Error: handling failed */
318 DELAYED, /* Will be handled later */ 475 DELAYED, /* Will be handled later */
319 IGNORED, /* Error safely ignored */
320 RECOVERED, /* Successfully recovered */ 476 RECOVERED, /* Successfully recovered */
321}; 477};
322 478
323static const char *action_name[] = { 479static const char *action_name[] = {
480 [IGNORED] = "Ignored",
324 [FAILED] = "Failed", 481 [FAILED] = "Failed",
325 [DELAYED] = "Delayed", 482 [DELAYED] = "Delayed",
326 [IGNORED] = "Ignored",
327 [RECOVERED] = "Recovered", 483 [RECOVERED] = "Recovered",
328}; 484};
329 485
330/* 486/*
331 * Error hit kernel page. 487 * XXX: It is possible that a page is isolated from LRU cache,
332 * Do nothing, try to be lucky and not touch this instead. For a few cases we 488 * and then kept in swap cache or failed to remove from page cache.
333 * could be more sophisticated. 489 * The page count will stop it from being freed by unpoison.
490 * Stress tests should be aware of this memory leak problem.
334 */ 491 */
335static int me_kernel(struct page *p, unsigned long pfn) 492static int delete_from_lru_cache(struct page *p)
336{ 493{
337 return DELAYED; 494 if (!isolate_lru_page(p)) {
495 /*
496 * Clear sensible page flags, so that the buddy system won't
497 * complain when the page is unpoison-and-freed.
498 */
499 ClearPageActive(p);
500 ClearPageUnevictable(p);
501 /*
502 * drop the page count elevated by isolate_lru_page()
503 */
504 page_cache_release(p);
505 return 0;
506 }
507 return -EIO;
338} 508}
339 509
340/* 510/*
341 * Already poisoned page. 511 * Error hit kernel page.
512 * Do nothing, try to be lucky and not touch this instead. For a few cases we
513 * could be more sophisticated.
342 */ 514 */
343static int me_ignore(struct page *p, unsigned long pfn) 515static int me_kernel(struct page *p, unsigned long pfn)
344{ 516{
345 return IGNORED; 517 return IGNORED;
346} 518}
@@ -355,14 +527,6 @@ static int me_unknown(struct page *p, unsigned long pfn)
355} 527}
356 528
357/* 529/*
358 * Free memory
359 */
360static int me_free(struct page *p, unsigned long pfn)
361{
362 return DELAYED;
363}
364
365/*
366 * Clean (or cleaned) page cache page. 530 * Clean (or cleaned) page cache page.
367 */ 531 */
368static int me_pagecache_clean(struct page *p, unsigned long pfn) 532static int me_pagecache_clean(struct page *p, unsigned long pfn)
@@ -371,6 +535,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
371 int ret = FAILED; 535 int ret = FAILED;
372 struct address_space *mapping; 536 struct address_space *mapping;
373 537
538 delete_from_lru_cache(p);
539
374 /* 540 /*
375 * For anonymous pages we're done the only reference left 541 * For anonymous pages we're done the only reference left
376 * should be the one m_f() holds. 542 * should be the one m_f() holds.
@@ -500,14 +666,20 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
500 /* Trigger EIO in shmem: */ 666 /* Trigger EIO in shmem: */
501 ClearPageUptodate(p); 667 ClearPageUptodate(p);
502 668
503 return DELAYED; 669 if (!delete_from_lru_cache(p))
670 return DELAYED;
671 else
672 return FAILED;
504} 673}
505 674
506static int me_swapcache_clean(struct page *p, unsigned long pfn) 675static int me_swapcache_clean(struct page *p, unsigned long pfn)
507{ 676{
508 delete_from_swap_cache(p); 677 delete_from_swap_cache(p);
509 678
510 return RECOVERED; 679 if (!delete_from_lru_cache(p))
680 return RECOVERED;
681 else
682 return FAILED;
511} 683}
512 684
513/* 685/*
@@ -550,7 +722,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
550#define tail (1UL << PG_tail) 722#define tail (1UL << PG_tail)
551#define compound (1UL << PG_compound) 723#define compound (1UL << PG_compound)
552#define slab (1UL << PG_slab) 724#define slab (1UL << PG_slab)
553#define buddy (1UL << PG_buddy)
554#define reserved (1UL << PG_reserved) 725#define reserved (1UL << PG_reserved)
555 726
556static struct page_state { 727static struct page_state {
@@ -559,8 +730,11 @@ static struct page_state {
559 char *msg; 730 char *msg;
560 int (*action)(struct page *p, unsigned long pfn); 731 int (*action)(struct page *p, unsigned long pfn);
561} error_states[] = { 732} error_states[] = {
562 { reserved, reserved, "reserved kernel", me_ignore }, 733 { reserved, reserved, "reserved kernel", me_kernel },
563 { buddy, buddy, "free kernel", me_free }, 734 /*
735 * free pages are specially detected outside this table:
736 * PG_buddy pages only make up a small fraction of all free pages.
737 */
564 738
565 /* 739 /*
566 * Could in theory check if slab page is free or if we can drop 740 * Could in theory check if slab page is free or if we can drop
@@ -587,7 +761,6 @@ static struct page_state {
587 761
588 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, 762 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
589 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 763 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
590 { swapbacked, swapbacked, "anonymous", me_pagecache_clean },
591 764
592 /* 765 /*
593 * Catchall entry: must be at end. 766 * Catchall entry: must be at end.
@@ -595,20 +768,31 @@ static struct page_state {
595 { 0, 0, "unknown page state", me_unknown }, 768 { 0, 0, "unknown page state", me_unknown },
596}; 769};
597 770
771#undef dirty
772#undef sc
773#undef unevict
774#undef mlock
775#undef writeback
776#undef lru
777#undef swapbacked
778#undef head
779#undef tail
780#undef compound
781#undef slab
782#undef reserved
783
598static void action_result(unsigned long pfn, char *msg, int result) 784static void action_result(unsigned long pfn, char *msg, int result)
599{ 785{
600 struct page *page = NULL; 786 struct page *page = pfn_to_page(pfn);
601 if (pfn_valid(pfn))
602 page = pfn_to_page(pfn);
603 787
604 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", 788 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
605 pfn, 789 pfn,
606 page && PageDirty(page) ? "dirty " : "", 790 PageDirty(page) ? "dirty " : "",
607 msg, action_name[result]); 791 msg, action_name[result]);
608} 792}
609 793
610static int page_action(struct page_state *ps, struct page *p, 794static int page_action(struct page_state *ps, struct page *p,
611 unsigned long pfn, int ref) 795 unsigned long pfn)
612{ 796{
613 int result; 797 int result;
614 int count; 798 int count;
@@ -616,18 +800,22 @@ static int page_action(struct page_state *ps, struct page *p,
616 result = ps->action(p, pfn); 800 result = ps->action(p, pfn);
617 action_result(pfn, ps->msg, result); 801 action_result(pfn, ps->msg, result);
618 802
619 count = page_count(p) - 1 - ref; 803 count = page_count(p) - 1;
620 if (count != 0) 804 if (ps->action == me_swapcache_dirty && result == DELAYED)
805 count--;
806 if (count != 0) {
621 printk(KERN_ERR 807 printk(KERN_ERR
622 "MCE %#lx: %s page still referenced by %d users\n", 808 "MCE %#lx: %s page still referenced by %d users\n",
623 pfn, ps->msg, count); 809 pfn, ps->msg, count);
810 result = FAILED;
811 }
624 812
625 /* Could do more checks here if page looks ok */ 813 /* Could do more checks here if page looks ok */
626 /* 814 /*
627 * Could adjust zone counters here to correct for the missing page. 815 * Could adjust zone counters here to correct for the missing page.
628 */ 816 */
629 817
630 return result == RECOVERED ? 0 : -EBUSY; 818 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
631} 819}
632 820
633#define N_UNMAP_TRIES 5 821#define N_UNMAP_TRIES 5
@@ -636,7 +824,7 @@ static int page_action(struct page_state *ps, struct page *p,
636 * Do all that is necessary to remove user space mappings. Unmap 824 * Do all that is necessary to remove user space mappings. Unmap
637 * the pages and send SIGBUS to the processes if the data was dirty. 825 * the pages and send SIGBUS to the processes if the data was dirty.
638 */ 826 */
639static void hwpoison_user_mappings(struct page *p, unsigned long pfn, 827static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
640 int trapno) 828 int trapno)
641{ 829{
642 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 830 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
@@ -646,15 +834,18 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
646 int i; 834 int i;
647 int kill = 1; 835 int kill = 1;
648 836
649 if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) 837 if (PageReserved(p) || PageSlab(p))
650 return; 838 return SWAP_SUCCESS;
651 839
652 /* 840 /*
653 * This check implies we don't kill processes if their pages 841 * This check implies we don't kill processes if their pages
654 * are in the swap cache early. Those are always late kills. 842 * are in the swap cache early. Those are always late kills.
655 */ 843 */
656 if (!page_mapped(p)) 844 if (!page_mapped(p))
657 return; 845 return SWAP_SUCCESS;
846
847 if (PageCompound(p) || PageKsm(p))
848 return SWAP_FAIL;
658 849
659 if (PageSwapCache(p)) { 850 if (PageSwapCache(p)) {
660 printk(KERN_ERR 851 printk(KERN_ERR
@@ -665,6 +856,8 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
665 /* 856 /*
666 * Propagate the dirty bit from PTEs to struct page first, because we 857 * Propagate the dirty bit from PTEs to struct page first, because we
667 * need this to decide if we should kill or just drop the page. 858 * need this to decide if we should kill or just drop the page.
859 * XXX: the dirty test could be racy: set_page_dirty() may not always
860 * be called inside page lock (it's recommended but not enforced).
668 */ 861 */
669 mapping = page_mapping(p); 862 mapping = page_mapping(p);
670 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { 863 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
@@ -716,11 +909,12 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
716 */ 909 */
717 kill_procs_ao(&tokill, !!PageDirty(p), trapno, 910 kill_procs_ao(&tokill, !!PageDirty(p), trapno,
718 ret != SWAP_SUCCESS, pfn); 911 ret != SWAP_SUCCESS, pfn);
912
913 return ret;
719} 914}
720 915
721int __memory_failure(unsigned long pfn, int trapno, int ref) 916int __memory_failure(unsigned long pfn, int trapno, int flags)
722{ 917{
723 unsigned long lru_flag;
724 struct page_state *ps; 918 struct page_state *ps;
725 struct page *p; 919 struct page *p;
726 int res; 920 int res;
@@ -729,13 +923,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
729 panic("Memory failure from trap %d on page %lx", trapno, pfn); 923 panic("Memory failure from trap %d on page %lx", trapno, pfn);
730 924
731 if (!pfn_valid(pfn)) { 925 if (!pfn_valid(pfn)) {
732 action_result(pfn, "memory outside kernel control", IGNORED); 926 printk(KERN_ERR
733 return -EIO; 927 "MCE %#lx: memory outside kernel control\n",
928 pfn);
929 return -ENXIO;
734 } 930 }
735 931
736 p = pfn_to_page(pfn); 932 p = pfn_to_page(pfn);
737 if (TestSetPageHWPoison(p)) { 933 if (TestSetPageHWPoison(p)) {
738 action_result(pfn, "already hardware poisoned", IGNORED); 934 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
739 return 0; 935 return 0;
740 } 936 }
741 937
@@ -752,9 +948,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
752 * In fact it's dangerous to directly bump up page count from 0, 948 * In fact it's dangerous to directly bump up page count from 0,
753 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 949 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
754 */ 950 */
755 if (!get_page_unless_zero(compound_head(p))) { 951 if (!(flags & MF_COUNT_INCREASED) &&
756 action_result(pfn, "free or high order kernel", IGNORED); 952 !get_page_unless_zero(compound_head(p))) {
757 return PageBuddy(compound_head(p)) ? 0 : -EBUSY; 953 if (is_free_buddy_page(p)) {
954 action_result(pfn, "free buddy", DELAYED);
955 return 0;
956 } else {
957 action_result(pfn, "high order kernel", IGNORED);
958 return -EBUSY;
959 }
758 } 960 }
759 961
760 /* 962 /*
@@ -766,14 +968,19 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
766 * walked by the page reclaim code, however that's not a big loss. 968 * walked by the page reclaim code, however that's not a big loss.
767 */ 969 */
768 if (!PageLRU(p)) 970 if (!PageLRU(p))
769 lru_add_drain_all(); 971 shake_page(p, 0);
770 lru_flag = p->flags & lru; 972 if (!PageLRU(p)) {
771 if (isolate_lru_page(p)) { 973 /*
974 * shake_page could have turned it free.
975 */
976 if (is_free_buddy_page(p)) {
977 action_result(pfn, "free buddy, 2nd try", DELAYED);
978 return 0;
979 }
772 action_result(pfn, "non LRU", IGNORED); 980 action_result(pfn, "non LRU", IGNORED);
773 put_page(p); 981 put_page(p);
774 return -EBUSY; 982 return -EBUSY;
775 } 983 }
776 page_cache_release(p);
777 984
778 /* 985 /*
779 * Lock the page and wait for writeback to finish. 986 * Lock the page and wait for writeback to finish.
@@ -781,26 +988,48 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
781 * and in many cases impossible, so we just avoid it here. 988 * and in many cases impossible, so we just avoid it here.
782 */ 989 */
783 lock_page_nosync(p); 990 lock_page_nosync(p);
991
992 /*
993 * unpoison always clears PG_hwpoison inside the page lock
994 */
995 if (!PageHWPoison(p)) {
996 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
997 res = 0;
998 goto out;
999 }
1000 if (hwpoison_filter(p)) {
1001 if (TestClearPageHWPoison(p))
1002 atomic_long_dec(&mce_bad_pages);
1003 unlock_page(p);
1004 put_page(p);
1005 return 0;
1006 }
1007
784 wait_on_page_writeback(p); 1008 wait_on_page_writeback(p);
785 1009
786 /* 1010 /*
787 * Now take care of user space mappings. 1011 * Now take care of user space mappings.
1012 * Abort on fail: __remove_from_page_cache() assumes unmapped page.
788 */ 1013 */
789 hwpoison_user_mappings(p, pfn, trapno); 1014 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
1015 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1016 res = -EBUSY;
1017 goto out;
1018 }
790 1019
791 /* 1020 /*
792 * Torn down by someone else? 1021 * Torn down by someone else?
793 */ 1022 */
794 if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) { 1023 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
795 action_result(pfn, "already truncated LRU", IGNORED); 1024 action_result(pfn, "already truncated LRU", IGNORED);
796 res = 0; 1025 res = -EBUSY;
797 goto out; 1026 goto out;
798 } 1027 }
799 1028
800 res = -EBUSY; 1029 res = -EBUSY;
801 for (ps = error_states;; ps++) { 1030 for (ps = error_states;; ps++) {
802 if (((p->flags | lru_flag)& ps->mask) == ps->res) { 1031 if ((p->flags & ps->mask) == ps->res) {
803 res = page_action(ps, p, pfn, ref); 1032 res = page_action(ps, p, pfn);
804 break; 1033 break;
805 } 1034 }
806 } 1035 }
@@ -831,3 +1060,235 @@ void memory_failure(unsigned long pfn, int trapno)
831{ 1060{
832 __memory_failure(pfn, trapno, 0); 1061 __memory_failure(pfn, trapno, 0);
833} 1062}
1063
1064/**
1065 * unpoison_memory - Unpoison a previously poisoned page
1066 * @pfn: Page number of the to be unpoisoned page
1067 *
1068 * Software-unpoison a page that has been poisoned by
1069 * memory_failure() earlier.
1070 *
1071 * This is only done at the software level, so it only works
1072 * for Linux-injected failures, not for real hardware failures.
1073 *
1074 * Returns 0 for success, otherwise -errno.
1075 */
1076int unpoison_memory(unsigned long pfn)
1077{
1078 struct page *page;
1079 struct page *p;
1080 int freeit = 0;
1081
1082 if (!pfn_valid(pfn))
1083 return -ENXIO;
1084
1085 p = pfn_to_page(pfn);
1086 page = compound_head(p);
1087
1088 if (!PageHWPoison(p)) {
1089 pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
1090 return 0;
1091 }
1092
1093 if (!get_page_unless_zero(page)) {
1094 if (TestClearPageHWPoison(p))
1095 atomic_long_dec(&mce_bad_pages);
1096 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
1097 return 0;
1098 }
1099
1100 lock_page_nosync(page);
1101 /*
1102 * This test is racy because PG_hwpoison is set outside of page lock.
1103 * That's acceptable because that won't trigger kernel panic. Instead,
1104 * the PG_hwpoison page will be caught and isolated on the entrance to
1105 * the free buddy page pool.
1106 */
1107 if (TestClearPageHWPoison(p)) {
1108 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
1109 atomic_long_dec(&mce_bad_pages);
1110 freeit = 1;
1111 }
1112 unlock_page(page);
1113
1114 put_page(page);
1115 if (freeit)
1116 put_page(page);
1117
1118 return 0;
1119}
1120EXPORT_SYMBOL(unpoison_memory);
1121
1122static struct page *new_page(struct page *p, unsigned long private, int **x)
1123{
1124 int nid = page_to_nid(p);
1125 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1126}
1127
1128/*
1129 * Safely get reference count of an arbitrary page.
1130 * Returns 0 for a free page, -EIO for a zero refcount page
1131 * that is not free, and 1 for any other page type.
1132 * For 1 the page is returned with increased page count, otherwise not.
1133 */
1134static int get_any_page(struct page *p, unsigned long pfn, int flags)
1135{
1136 int ret;
1137
1138 if (flags & MF_COUNT_INCREASED)
1139 return 1;
1140
1141 /*
1142 * The lock_system_sleep prevents a race with memory hotplug,
1143 * because the isolation assumes there's only a single user.
1144 * This is a big hammer; a finer-grained mechanism would be nicer.
1145 */
1146 lock_system_sleep();
1147
1148 /*
1149 * Isolate the page, so that it doesn't get reallocated if it
1150 * was free.
1151 */
1152 set_migratetype_isolate(p);
1153 if (!get_page_unless_zero(compound_head(p))) {
1154 if (is_free_buddy_page(p)) {
1155 pr_debug("get_any_page: %#lx free buddy page\n", pfn);
1156 /* Set hwpoison bit while page is still isolated */
1157 SetPageHWPoison(p);
1158 ret = 0;
1159 } else {
1160 pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1161 pfn, p->flags);
1162 ret = -EIO;
1163 }
1164 } else {
1165 /* Not a free page */
1166 ret = 1;
1167 }
1168 unset_migratetype_isolate(p);
1169 unlock_system_sleep();
1170 return ret;
1171}
1172
1173/**
1174 * soft_offline_page - Soft offline a page.
1175 * @page: page to offline
1176 * @flags: flags. Same as memory_failure().
1177 *
1178 * Returns 0 on success, otherwise negated errno.
1179 *
1180 * Soft offline a page, by migration or invalidation,
1181 * without killing anything. This is for the case when
1182 * a page is not corrupted yet (so it's still valid to access),
1183 * but has had a number of corrected errors and is better taken
1184 * out.
1185 *
1186 * The actual policy on when to do that is maintained by
1187 * user space.
1188 *
1189 * This should never impact any application or cause data loss;
1190 * however, it might take some time.
1191 *
1192 * This is not a 100% solution for all memory, but tries to be
1193 * ``good enough'' for the majority of memory.
1194 */
1195int soft_offline_page(struct page *page, int flags)
1196{
1197 int ret;
1198 unsigned long pfn = page_to_pfn(page);
1199
1200 ret = get_any_page(page, pfn, flags);
1201 if (ret < 0)
1202 return ret;
1203 if (ret == 0)
1204 goto done;
1205
1206 /*
1207 * Page cache page we can handle?
1208 */
1209 if (!PageLRU(page)) {
1210 /*
1211 * Try to free it.
1212 */
1213 put_page(page);
1214 shake_page(page, 1);
1215
1216 /*
1217 * Did it turn free?
1218 */
1219 ret = get_any_page(page, pfn, 0);
1220 if (ret < 0)
1221 return ret;
1222 if (ret == 0)
1223 goto done;
1224 }
1225 if (!PageLRU(page)) {
1226 pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
1227 pfn, page->flags);
1228 return -EIO;
1229 }
1230
1231 lock_page(page);
1232 wait_on_page_writeback(page);
1233
1234 /*
1235 * Synchronized using the page lock with memory_failure()
1236 */
1237 if (PageHWPoison(page)) {
1238 unlock_page(page);
1239 put_page(page);
1240 pr_debug("soft offline: %#lx page already poisoned\n", pfn);
1241 return -EBUSY;
1242 }
1243
1244 /*
1245 * Try to invalidate first. This should work for
1246 * non dirty unmapped page cache pages.
1247 */
1248 ret = invalidate_inode_page(page);
1249 unlock_page(page);
1250
1251 /*
1252 * Drop count because page migration doesn't like raised
1253 * counts. The page could get re-allocated, but if it becomes
1254 * LRU the isolation will just fail.
1255 * RED-PEN would be better to keep it isolated here, but we
1256 * would need to fix isolation locking first.
1257 */
1258 put_page(page);
1259 if (ret == 1) {
1260 ret = 0;
1261 pr_debug("soft_offline: %#lx: invalidated\n", pfn);
1262 goto done;
1263 }
1264
1265 /*
1266 * Simple invalidation didn't work.
1267 * Try to migrate to a new page instead. migrate.c
1268 * handles a large number of cases for us.
1269 */
1270 ret = isolate_lru_page(page);
1271 if (!ret) {
1272 LIST_HEAD(pagelist);
1273
1274 list_add(&page->lru, &pagelist);
1275 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1276 if (ret) {
1277 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1278 pfn, ret, page->flags);
1279 if (ret > 0)
1280 ret = -EIO;
1281 }
1282 } else {
1283 pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1284 pfn, ret, page_count(page), page->flags);
1285 }
1286 if (ret)
1287 return ret;
1288
1289done:
1290 atomic_long_add(1, &mce_bad_pages);
1291 SetPageHWPoison(page);
1292 /* keep elevated page count for bad page */
1293 return ret;
1294}
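
For reference, a minimal sketch (not part of the patch above) of how a corrected-error handler might feed a suspect pfn into the new soft-offline path; report_corrected_pfn() is a hypothetical helper, while soft_offline_page(), pfn_valid() and pfn_to_page() are the interfaces shown above. Testing normally goes through the hwpoison-inject debugfs module rather than code like this.

/*
 * Illustrative sketch only: hand a page with repeated corrected errors
 * to soft_offline_page(). report_corrected_pfn() is hypothetical.
 */
static int report_corrected_pfn(unsigned long pfn)
{
	if (!pfn_valid(pfn))
		return -ENXIO;
	/* flags == 0: soft_offline_page() takes its own page reference */
	return soft_offline_page(pfn_to_page(pfn), 0);
}
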
diff --git a/mm/memory.c b/mm/memory.c
index aed45eaf8ac9..09e4b1be7b67 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2555,6 +2555,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2555 ret = VM_FAULT_MAJOR; 2555 ret = VM_FAULT_MAJOR;
2556 count_vm_event(PGMAJFAULT); 2556 count_vm_event(PGMAJFAULT);
2557 } else if (PageHWPoison(page)) { 2557 } else if (PageHWPoison(page)) {
2558 /*
2559 * hwpoisoned dirty swapcache pages are kept for killing
2560 * owner processes (which may be unknown at hwpoison time)
2561 */
2558 ret = VM_FAULT_HWPOISON; 2562 ret = VM_FAULT_HWPOISON;
2559 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2563 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2560 goto out_release; 2564 goto out_release;
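
The VM_FAULT_HWPOISON path above ultimately delivers SIGBUS to the faulting task. A hedged user-space sketch of inspecting that signal follows; it assumes the installed kernel headers expose BUS_MCEERR_AR/BUS_MCEERR_AO.

/* Illustrative only: minimal SIGBUS inspection for hwpoison signals. */
#include <signal.h>
#include <unistd.h>

static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
	/* BUS_MCEERR_AR: poison hit on access; BUS_MCEERR_AO: async report */
	if (si->si_code == BUS_MCEERR_AR || si->si_code == BUS_MCEERR_AO)
		write(2, "hwpoison SIGBUS\n", 16);
	_exit(1);
}

static void install_sigbus_handler(void)
{
	struct sigaction sa = { .sa_flags = SA_SIGINFO };

	sa.sa_sigaction = sigbus_handler;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGBUS, &sa, NULL);
}
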
diff --git a/mm/migrate.c b/mm/migrate.c
index efddbf0926b2..880bd592d38e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -912,6 +912,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
912 goto out_pm; 912 goto out_pm;
913 913
914 err = -ENODEV; 914 err = -ENODEV;
915 if (node < 0 || node >= MAX_NUMNODES)
916 goto out_pm;
917
915 if (!node_state(node, N_HIGH_MEMORY)) 918 if (!node_state(node, N_HIGH_MEMORY))
916 goto out_pm; 919 goto out_pm;
917 920
@@ -999,33 +1002,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
999#define DO_PAGES_STAT_CHUNK_NR 16 1002#define DO_PAGES_STAT_CHUNK_NR 16
1000 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; 1003 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1001 int chunk_status[DO_PAGES_STAT_CHUNK_NR]; 1004 int chunk_status[DO_PAGES_STAT_CHUNK_NR];
1002 unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1003 int err;
1004 1005
1005 for (i = 0; i < nr_pages; i += chunk_nr) { 1006 while (nr_pages) {
1006 if (chunk_nr > nr_pages - i) 1007 unsigned long chunk_nr;
1007 chunk_nr = nr_pages - i;
1008 1008
1009 err = copy_from_user(chunk_pages, &pages[i], 1009 chunk_nr = nr_pages;
1010 chunk_nr * sizeof(*chunk_pages)); 1010 if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
1011 if (err) { 1011 chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1012 err = -EFAULT; 1012
1013 goto out; 1013 if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
1014 } 1014 break;
1015 1015
1016 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); 1016 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1017 1017
1018 err = copy_to_user(&status[i], chunk_status, 1018 if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
1019 chunk_nr * sizeof(*chunk_status)); 1019 break;
1020 if (err) {
1021 err = -EFAULT;
1022 goto out;
1023 }
1024 }
1025 err = 0;
1026 1020
1027out: 1021 pages += chunk_nr;
1028 return err; 1022 status += chunk_nr;
1023 nr_pages -= chunk_nr;
1024 }
1025 return nr_pages ? -EFAULT : 0;
1029} 1026}
1030 1027
1031/* 1028/*
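
The new bounds check in do_pages_move() above maps directly onto the move_pages(2) syscall. A hedged user-space sketch (assuming the libnuma <numaif.h> wrapper) of how an out-of-range node is now rejected with ENODEV:

/* Illustrative only: exercise the node validation in sys_move_pages(). */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	void *buf = malloc(4096);
	void *pages[1] = { buf };
	int nodes[1] = { 1 << 20 };	/* deliberately out of range */
	int status[1];

	/* With the check above this fails with errno == ENODEV. */
	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) < 0)
		perror("move_pages");

	free(buf);
	return 0;
}
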
diff --git a/mm/mmap.c b/mm/mmap.c
index d9c77b2dbe9d..ee2298936fe6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1043,6 +1043,46 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1043} 1043}
1044EXPORT_SYMBOL(do_mmap_pgoff); 1044EXPORT_SYMBOL(do_mmap_pgoff);
1045 1045
1046SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1047 unsigned long, prot, unsigned long, flags,
1048 unsigned long, fd, unsigned long, pgoff)
1049{
1050 struct file *file = NULL;
1051 unsigned long retval = -EBADF;
1052
1053 if (!(flags & MAP_ANONYMOUS)) {
1054 if (unlikely(flags & MAP_HUGETLB))
1055 return -EINVAL;
1056 file = fget(fd);
1057 if (!file)
1058 goto out;
1059 } else if (flags & MAP_HUGETLB) {
1060 struct user_struct *user = NULL;
1061 /*
1062 * VM_NORESERVE is used because the reservations will be
1063 * taken when vm_ops->mmap() is called
1064 * A dummy user value is used because we are not locking
1065 * memory so no accounting is necessary
1066 */
1067 len = ALIGN(len, huge_page_size(&default_hstate));
1068 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
1069 &user, HUGETLB_ANONHUGE_INODE);
1070 if (IS_ERR(file))
1071 return PTR_ERR(file);
1072 }
1073
1074 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1075
1076 down_write(&current->mm->mmap_sem);
1077 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1078 up_write(&current->mm->mmap_sem);
1079
1080 if (file)
1081 fput(file);
1082out:
1083 return retval;
1084}
1085
1046/* 1086/*
1047 * Some shared mappings will want the pages marked read-only 1087 * Some shared mappings will want the pages marked read-only
1048 * to track write events. If so, we'll downgrade vm_page_prot 1088 * to track write events. If so, we'll downgrade vm_page_prot
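
From user space, the MAP_HUGETLB branch of the new sys_mmap_pgoff() above boils down to an ordinary mmap() call. A hedged sketch follows; it assumes the libc headers expose MAP_HUGETLB, a 2 MB default huge page size, and a pre-reserved hugetlb pool.

/* Illustrative only: anonymous huge pages without mounting hugetlbfs. */
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	size_t len = 2UL << 20;		/* assumed huge page size */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	munmap(p, len);
	return 0;
}
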
diff --git a/mm/nommu.c b/mm/nommu.c
index 8687973462bb..48a2ecfaf059 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
432 /* 432 /*
433 * Ok, looks good - let it rip. 433 * Ok, looks good - let it rip.
434 */ 434 */
435 flush_icache_range(mm->brk, brk);
435 return mm->brk = brk; 436 return mm->brk = brk;
436} 437}
437 438
@@ -551,11 +552,11 @@ static void free_page_series(unsigned long from, unsigned long to)
551static void __put_nommu_region(struct vm_region *region) 552static void __put_nommu_region(struct vm_region *region)
552 __releases(nommu_region_sem) 553 __releases(nommu_region_sem)
553{ 554{
554 kenter("%p{%d}", region, atomic_read(&region->vm_usage)); 555 kenter("%p{%d}", region, region->vm_usage);
555 556
556 BUG_ON(!nommu_region_tree.rb_node); 557 BUG_ON(!nommu_region_tree.rb_node);
557 558
558 if (atomic_dec_and_test(&region->vm_usage)) { 559 if (--region->vm_usage == 0) {
559 if (region->vm_top > region->vm_start) 560 if (region->vm_top > region->vm_start)
560 delete_nommu_region(region); 561 delete_nommu_region(region);
561 up_write(&nommu_region_sem); 562 up_write(&nommu_region_sem);
@@ -1204,7 +1205,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1204 if (!vma) 1205 if (!vma)
1205 goto error_getting_vma; 1206 goto error_getting_vma;
1206 1207
1207 atomic_set(&region->vm_usage, 1); 1208 region->vm_usage = 1;
1208 region->vm_flags = vm_flags; 1209 region->vm_flags = vm_flags;
1209 region->vm_pgoff = pgoff; 1210 region->vm_pgoff = pgoff;
1210 1211
@@ -1271,7 +1272,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1271 } 1272 }
1272 1273
1273 /* we've found a region we can share */ 1274 /* we've found a region we can share */
1274 atomic_inc(&pregion->vm_usage); 1275 pregion->vm_usage++;
1275 vma->vm_region = pregion; 1276 vma->vm_region = pregion;
1276 start = pregion->vm_start; 1277 start = pregion->vm_start;
1277 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; 1278 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
@@ -1288,7 +1289,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1288 vma->vm_region = NULL; 1289 vma->vm_region = NULL;
1289 vma->vm_start = 0; 1290 vma->vm_start = 0;
1290 vma->vm_end = 0; 1291 vma->vm_end = 0;
1291 atomic_dec(&pregion->vm_usage); 1292 pregion->vm_usage--;
1292 pregion = NULL; 1293 pregion = NULL;
1293 goto error_just_free; 1294 goto error_just_free;
1294 } 1295 }
@@ -1353,10 +1354,14 @@ unsigned long do_mmap_pgoff(struct file *file,
1353share: 1354share:
1354 add_vma_to_mm(current->mm, vma); 1355 add_vma_to_mm(current->mm, vma);
1355 1356
1356 up_write(&nommu_region_sem); 1357 /* we flush the region from the icache only when the first executable
1358 * mapping of it is made */
1359 if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
1360 flush_icache_range(region->vm_start, region->vm_end);
1361 region->vm_icache_flushed = true;
1362 }
1357 1363
1358 if (prot & PROT_EXEC) 1364 up_write(&nommu_region_sem);
1359 flush_icache_range(result, result + len);
1360 1365
1361 kleave(" = %lx", result); 1366 kleave(" = %lx", result);
1362 return result; 1367 return result;
@@ -1398,6 +1403,31 @@ error_getting_region:
1398} 1403}
1399EXPORT_SYMBOL(do_mmap_pgoff); 1404EXPORT_SYMBOL(do_mmap_pgoff);
1400 1405
1406SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1407 unsigned long, prot, unsigned long, flags,
1408 unsigned long, fd, unsigned long, pgoff)
1409{
1410 struct file *file = NULL;
1411 unsigned long retval = -EBADF;
1412
1413 if (!(flags & MAP_ANONYMOUS)) {
1414 file = fget(fd);
1415 if (!file)
1416 goto out;
1417 }
1418
1419 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1420
1421 down_write(&current->mm->mmap_sem);
1422 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1423 up_write(&current->mm->mmap_sem);
1424
1425 if (file)
1426 fput(file);
1427out:
1428 return retval;
1429}
1430
1401/* 1431/*
1402 * split a vma into two pieces at address 'addr', a new vma is allocated either 1432 * split a vma into two pieces at address 'addr', a new vma is allocated either
1403 * for the first part or the tail. 1433 * for the first part or the tail.
@@ -1411,10 +1441,9 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1411 1441
1412 kenter(""); 1442 kenter("");
1413 1443
1414 /* we're only permitted to split anonymous regions that have a single 1444 /* we're only permitted to split anonymous regions (these should have
1415 * owner */ 1445 * only a single usage on the region) */
1416 if (vma->vm_file || 1446 if (vma->vm_file)
1417 atomic_read(&vma->vm_region->vm_usage) != 1)
1418 return -ENOMEM; 1447 return -ENOMEM;
1419 1448
1420 if (mm->map_count >= sysctl_max_map_count) 1449 if (mm->map_count >= sysctl_max_map_count)
@@ -1488,7 +1517,7 @@ static int shrink_vma(struct mm_struct *mm,
1488 1517
1489 /* cut the backing region down to size */ 1518 /* cut the backing region down to size */
1490 region = vma->vm_region; 1519 region = vma->vm_region;
1491 BUG_ON(atomic_read(&region->vm_usage) != 1); 1520 BUG_ON(region->vm_usage != 1);
1492 1521
1493 down_write(&nommu_region_sem); 1522 down_write(&nommu_region_sem);
1494 delete_nommu_region(region); 1523 delete_nommu_region(region);
@@ -1732,27 +1761,6 @@ void unmap_mapping_range(struct address_space *mapping,
1732EXPORT_SYMBOL(unmap_mapping_range); 1761EXPORT_SYMBOL(unmap_mapping_range);
1733 1762
1734/* 1763/*
1735 * ask for an unmapped area at which to create a mapping on a file
1736 */
1737unsigned long get_unmapped_area(struct file *file, unsigned long addr,
1738 unsigned long len, unsigned long pgoff,
1739 unsigned long flags)
1740{
1741 unsigned long (*get_area)(struct file *, unsigned long, unsigned long,
1742 unsigned long, unsigned long);
1743
1744 get_area = current->mm->get_unmapped_area;
1745 if (file && file->f_op && file->f_op->get_unmapped_area)
1746 get_area = file->f_op->get_unmapped_area;
1747
1748 if (!get_area)
1749 return -ENOSYS;
1750
1751 return get_area(file, addr, len, pgoff, flags);
1752}
1753EXPORT_SYMBOL(get_unmapped_area);
1754
1755/*
1756 * Check that a process has enough memory to allocate a new virtual 1764 * Check that a process has enough memory to allocate a new virtual
1757 * mapping. 0 means there is enough memory for the allocation to 1765 * mapping. 0 means there is enough memory for the allocation to
1758 * succeed and -ENOMEM implies there is not. 1766 * succeed and -ENOMEM implies there is not.
@@ -1891,9 +1899,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
1891 1899
1892 /* only read or write mappings where it is permitted */ 1900 /* only read or write mappings where it is permitted */
1893 if (write && vma->vm_flags & VM_MAYWRITE) 1901 if (write && vma->vm_flags & VM_MAYWRITE)
1894 len -= copy_to_user((void *) addr, buf, len); 1902 copy_to_user_page(vma, NULL, addr,
1903 (void *) addr, buf, len);
1895 else if (!write && vma->vm_flags & VM_MAYREAD) 1904 else if (!write && vma->vm_flags & VM_MAYREAD)
1896 len -= copy_from_user(buf, (void *) addr, len); 1905 copy_from_user_page(vma, NULL, addr,
1906 buf, (void *) addr, len);
1897 else 1907 else
1898 len = 0; 1908 len = 0;
1899 } else { 1909 } else {
@@ -1904,3 +1914,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
1904 mmput(mm); 1914 mmput(mm);
1905 return len; 1915 return len;
1906} 1916}
1917
1918/**
1919 * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
1920 * @inode: The inode to check
1921 * @size: The current filesize of the inode
1922 * @newsize: The proposed filesize of the inode
1923 *
1924 * Check the shared mappings on an inode on behalf of a shrinking truncate to
1925 * make sure that any outstanding VMAs aren't broken and then shrink the
1926 * vm_regions that extend beyond it so that do_mmap_pgoff() doesn't
1927 * automatically grant mappings that are too large.
1928 */
1929int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
1930 size_t newsize)
1931{
1932 struct vm_area_struct *vma;
1933 struct prio_tree_iter iter;
1934 struct vm_region *region;
1935 pgoff_t low, high;
1936 size_t r_size, r_top;
1937
1938 low = newsize >> PAGE_SHIFT;
1939 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
1940
1941 down_write(&nommu_region_sem);
1942
1943 /* search for VMAs that fall within the dead zone */
1944 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
1945 low, high) {
1946 /* found one - only interested if it's shared out of the page
1947 * cache */
1948 if (vma->vm_flags & VM_SHARED) {
1949 up_write(&nommu_region_sem);
1950 return -ETXTBSY; /* not quite true, but near enough */
1951 }
1952 }
1953
1954 /* reduce any regions that overlap the dead zone - if in existence,
1955 * these will be pointed to by VMAs that don't overlap the dead zone
1956 *
1957 * we don't check for any regions that start beyond the EOF as there
1958 * shouldn't be any
1959 */
1960 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
1961 0, ULONG_MAX) {
1962 if (!(vma->vm_flags & VM_SHARED))
1963 continue;
1964
1965 region = vma->vm_region;
1966 r_size = region->vm_top - region->vm_start;
1967 r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
1968
1969 if (r_top > newsize) {
1970 region->vm_top -= r_top - newsize;
1971 if (region->vm_end > region->vm_top)
1972 region->vm_end = region->vm_top;
1973 }
1974 }
1975
1976 up_write(&nommu_region_sem);
1977 return 0;
1978}
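
A rough caller-side sketch of the nommu_shrink_inode_mappings() contract above: reject the shrink with -ETXTBSY while shared mappings cover the dead zone, otherwise trim the backing regions. example_setsize() and the elided truncation steps are hypothetical.

/* Illustrative only: a nommu filesystem's shrinking-truncate path. */
static int example_setsize(struct inode *inode, loff_t newsize)
{
	int ret;

	if (newsize < inode->i_size) {
		ret = nommu_shrink_inode_mappings(inode, inode->i_size,
						  newsize);
		if (ret < 0)
			return ret;	/* e.g. -ETXTBSY */
	}
	/* ...filesystem-specific i_size update and page cache truncation... */
	return 0;
}
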
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f52481b1c1e5..237050478f28 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -459,6 +459,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
459 list_for_each_entry(c, &p->children, sibling) { 459 list_for_each_entry(c, &p->children, sibling) {
460 if (c->mm == p->mm) 460 if (c->mm == p->mm)
461 continue; 461 continue;
462 if (mem && !task_in_mem_cgroup(c, mem))
463 continue;
462 if (!oom_kill_task(c)) 464 if (!oom_kill_task(c))
463 return 0; 465 return 0;
464 } 466 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 850c4a7e2fe5..8deb9d0fd5b1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
48#include <linux/page_cgroup.h> 48#include <linux/page_cgroup.h>
49#include <linux/debugobjects.h> 49#include <linux/debugobjects.h>
50#include <linux/kmemleak.h> 50#include <linux/kmemleak.h>
51#include <linux/memory.h>
51#include <trace/events/kmem.h> 52#include <trace/events/kmem.h>
52 53
53#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
@@ -555,8 +556,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
555 page = list_entry(list->prev, struct page, lru); 556 page = list_entry(list->prev, struct page, lru);
556 /* must delete as __free_one_page list manipulates */ 557 /* must delete as __free_one_page list manipulates */
557 list_del(&page->lru); 558 list_del(&page->lru);
558 __free_one_page(page, zone, 0, migratetype); 559 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
559 trace_mm_page_pcpu_drain(page, 0, migratetype); 560 __free_one_page(page, zone, 0, page_private(page));
561 trace_mm_page_pcpu_drain(page, 0, page_private(page));
560 } while (--count && --batch_free && !list_empty(list)); 562 } while (--count && --batch_free && !list_empty(list));
561 } 563 }
562 spin_unlock(&zone->lock); 564 spin_unlock(&zone->lock);
@@ -1221,10 +1223,10 @@ again:
1221 } 1223 }
1222 spin_lock_irqsave(&zone->lock, flags); 1224 spin_lock_irqsave(&zone->lock, flags);
1223 page = __rmqueue(zone, order, migratetype); 1225 page = __rmqueue(zone, order, migratetype);
1224 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1225 spin_unlock(&zone->lock); 1226 spin_unlock(&zone->lock);
1226 if (!page) 1227 if (!page)
1227 goto failed; 1228 goto failed;
1229 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1228 } 1230 }
1229 1231
1230 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1232 __count_zone_vm_events(PGALLOC, zone, 1 << order);
@@ -2401,13 +2403,14 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2401{ 2403{
2402 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 2404 char saved_string[NUMA_ZONELIST_ORDER_LEN];
2403 int ret; 2405 int ret;
2406 static DEFINE_MUTEX(zl_order_mutex);
2404 2407
2408 mutex_lock(&zl_order_mutex);
2405 if (write) 2409 if (write)
2406 strncpy(saved_string, (char*)table->data, 2410 strcpy(saved_string, (char*)table->data);
2407 NUMA_ZONELIST_ORDER_LEN);
2408 ret = proc_dostring(table, write, buffer, length, ppos); 2411 ret = proc_dostring(table, write, buffer, length, ppos);
2409 if (ret) 2412 if (ret)
2410 return ret; 2413 goto out;
2411 if (write) { 2414 if (write) {
2412 int oldval = user_zonelist_order; 2415 int oldval = user_zonelist_order;
2413 if (__parse_numa_zonelist_order((char*)table->data)) { 2416 if (__parse_numa_zonelist_order((char*)table->data)) {
@@ -2420,7 +2423,9 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2420 } else if (oldval != user_zonelist_order) 2423 } else if (oldval != user_zonelist_order)
2421 build_all_zonelists(); 2424 build_all_zonelists();
2422 } 2425 }
2423 return 0; 2426out:
2427 mutex_unlock(&zl_order_mutex);
2428 return ret;
2424} 2429}
2425 2430
2426 2431
@@ -3579,7 +3584,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3579 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3584 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3580 * then all holes in the requested range will be accounted for. 3585 * then all holes in the requested range will be accounted for.
3581 */ 3586 */
3582static unsigned long __meminit __absent_pages_in_range(int nid, 3587unsigned long __meminit __absent_pages_in_range(int nid,
3583 unsigned long range_start_pfn, 3588 unsigned long range_start_pfn,
3584 unsigned long range_end_pfn) 3589 unsigned long range_end_pfn)
3585{ 3590{
@@ -3994,7 +3999,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3994 } 3999 }
3995 4000
3996 /* Merge backward if suitable */ 4001 /* Merge backward if suitable */
3997 if (start_pfn < early_node_map[i].end_pfn && 4002 if (start_pfn < early_node_map[i].start_pfn &&
3998 end_pfn >= early_node_map[i].start_pfn) { 4003 end_pfn >= early_node_map[i].start_pfn) {
3999 early_node_map[i].start_pfn = start_pfn; 4004 early_node_map[i].start_pfn = start_pfn;
4000 return; 4005 return;
@@ -4108,7 +4113,7 @@ static int __init cmp_node_active_region(const void *a, const void *b)
4108} 4113}
4109 4114
4110/* sort the node_map by start_pfn */ 4115/* sort the node_map by start_pfn */
4111static void __init sort_node_map(void) 4116void __init sort_node_map(void)
4112{ 4117{
4113 sort(early_node_map, (size_t)nr_nodemap_entries, 4118 sort(early_node_map, (size_t)nr_nodemap_entries,
4114 sizeof(struct node_active_region), 4119 sizeof(struct node_active_region),
@@ -5008,23 +5013,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5008int set_migratetype_isolate(struct page *page) 5013int set_migratetype_isolate(struct page *page)
5009{ 5014{
5010 struct zone *zone; 5015 struct zone *zone;
5011 unsigned long flags; 5016 struct page *curr_page;
5017 unsigned long flags, pfn, iter;
5018 unsigned long immobile = 0;
5019 struct memory_isolate_notify arg;
5020 int notifier_ret;
5012 int ret = -EBUSY; 5021 int ret = -EBUSY;
5013 int zone_idx; 5022 int zone_idx;
5014 5023
5015 zone = page_zone(page); 5024 zone = page_zone(page);
5016 zone_idx = zone_idx(zone); 5025 zone_idx = zone_idx(zone);
5026
5017 spin_lock_irqsave(&zone->lock, flags); 5027 spin_lock_irqsave(&zone->lock, flags);
5028 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
5029 zone_idx == ZONE_MOVABLE) {
5030 ret = 0;
5031 goto out;
5032 }
5033
5034 pfn = page_to_pfn(page);
5035 arg.start_pfn = pfn;
5036 arg.nr_pages = pageblock_nr_pages;
5037 arg.pages_found = 0;
5038
5018 /* 5039 /*
5019 * In future, more migrate types will be able to be isolation target. 5040 * It may be possible to isolate a pageblock even if the
5041 * migratetype is not MIGRATE_MOVABLE. The memory isolation
5042 * notifier chain is used by balloon drivers to return the
5043 * number of pages in a range that are held by the balloon
5044 * driver to shrink memory. If all the pages are accounted for
5045 * by balloons, are free, or on the LRU, isolation can continue.
5046 * Later, for example, when the memory hotplug notifier runs, these
5047 * pages reported as "can be isolated" should be isolated (freed)
5048 * by the balloon driver through the memory notifier chain.
5020 */ 5049 */
5021 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && 5050 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5022 zone_idx != ZONE_MOVABLE) 5051 notifier_ret = notifier_to_errno(notifier_ret);
5052 if (notifier_ret || !arg.pages_found)
5023 goto out; 5053 goto out;
5024 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5054
5025 move_freepages_block(zone, page, MIGRATE_ISOLATE); 5055 for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
5026 ret = 0; 5056 if (!pfn_valid_within(iter))
5057 continue;
5058
5059 curr_page = pfn_to_page(iter);
5060 if (!page_count(curr_page) || PageLRU(curr_page))
5061 continue;
5062
5063 immobile++;
5064 }
5065
5066 if (arg.pages_found == immobile)
5067 ret = 0;
5068
5027out: 5069out:
5070 if (!ret) {
5071 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
5072 move_freepages_block(zone, page, MIGRATE_ISOLATE);
5073 }
5074
5028 spin_unlock_irqrestore(&zone->lock, flags); 5075 spin_unlock_irqrestore(&zone->lock, flags);
5029 if (!ret) 5076 if (!ret)
5030 drain_all_pages(); 5077 drain_all_pages();
@@ -5091,3 +5138,24 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5091 spin_unlock_irqrestore(&zone->lock, flags); 5138 spin_unlock_irqrestore(&zone->lock, flags);
5092} 5139}
5093#endif 5140#endif
5141
5142#ifdef CONFIG_MEMORY_FAILURE
5143bool is_free_buddy_page(struct page *page)
5144{
5145 struct zone *zone = page_zone(page);
5146 unsigned long pfn = page_to_pfn(page);
5147 unsigned long flags;
5148 int order;
5149
5150 spin_lock_irqsave(&zone->lock, flags);
5151 for (order = 0; order < MAX_ORDER; order++) {
5152 struct page *page_head = page - (pfn & ((1 << order) - 1));
5153
5154 if (PageBuddy(page_head) && page_order(page_head) >= order)
5155 break;
5156 }
5157 spin_unlock_irqrestore(&zone->lock, flags);
5158
5159 return order < MAX_ORDER;
5160}
5161#endif
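
To make the MEM_ISOLATE_COUNT notifier above concrete, a hedged sketch of the balloon-driver side; my_balloon_owns_pfn() is hypothetical, and register_memory_isolate_notifier() is assumed to be the registration hook that accompanies memory_isolate_notify() in <linux/memory.h>.

/* Illustrative only: report balloon-owned pages during isolation. */
#include <linux/memory.h>
#include <linux/notifier.h>

static bool my_balloon_owns_pfn(unsigned long pfn);	/* hypothetical */

static int balloon_isolate_notify(struct notifier_block *nb,
				  unsigned long action, void *data)
{
	struct memory_isolate_notify *arg = data;
	unsigned long pfn;

	if (action != MEM_ISOLATE_COUNT)
		return NOTIFY_OK;

	for (pfn = arg->start_pfn; pfn < arg->start_pfn + arg->nr_pages; pfn++)
		if (my_balloon_owns_pfn(pfn))
			arg->pages_found++;

	return NOTIFY_OK;
}

static struct notifier_block balloon_isolate_nb = {
	.notifier_call	= balloon_isolate_notify,
};

/* at module init: register_memory_isolate_notifier(&balloon_isolate_nb); */
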
diff --git a/mm/percpu.c b/mm/percpu.c
index 442010cc91c6..083e7c91e5f6 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1271,7 +1271,7 @@ static void pcpu_reclaim(struct work_struct *work)
1271 */ 1271 */
1272void free_percpu(void *ptr) 1272void free_percpu(void *ptr)
1273{ 1273{
1274 void *addr = __pcpu_ptr_to_addr(ptr); 1274 void *addr;
1275 struct pcpu_chunk *chunk; 1275 struct pcpu_chunk *chunk;
1276 unsigned long flags; 1276 unsigned long flags;
1277 int off; 1277 int off;
@@ -1279,6 +1279,8 @@ void free_percpu(void *ptr)
1279 if (!ptr) 1279 if (!ptr)
1280 return; 1280 return;
1281 1281
1282 addr = __pcpu_ptr_to_addr(ptr);
1283
1282 spin_lock_irqsave(&pcpu_lock, flags); 1284 spin_lock_irqsave(&pcpu_lock, flags);
1283 1285
1284 chunk = pcpu_chunk_addr_search(addr); 1286 chunk = pcpu_chunk_addr_search(addr);
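
A small usage note on the reordering above: computing the chunk address only after the NULL check lets callers treat free_percpu() like kfree(). A sketch, with my_stats hypothetical:

/* Illustrative only. */
struct my_stats {			/* hypothetical */
	unsigned long hits;
};

static struct my_stats *stats;		/* from alloc_percpu(struct my_stats) */

static void example_teardown(void)
{
	/* safe even if alloc_percpu() failed and stats is still NULL */
	free_percpu(stats);
	stats = NULL;
}
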
diff --git a/mm/readahead.c b/mm/readahead.c
index aa1aa2345235..033bc135a41f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -547,5 +547,17 @@ page_cache_async_readahead(struct address_space *mapping,
547 547
548 /* do read-ahead */ 548 /* do read-ahead */
549 ondemand_readahead(mapping, ra, filp, true, offset, req_size); 549 ondemand_readahead(mapping, ra, filp, true, offset, req_size);
550
551#ifdef CONFIG_BLOCK
552 /*
553 * Normally the current page is !uptodate and lock_page() will be
554 * immediately called to implicitly unplug the device. However this
555 * is not always true for RAID conifgurations, where data arrives
556 * not strictly in their submission order. In this case we need to
557 * explicitly kick off the IO.
558 */
559 if (PageUptodate(page))
560 blk_run_backing_dev(mapping->backing_dev_info, NULL);
561#endif
550} 562}
551EXPORT_SYMBOL_GPL(page_cache_async_readahead); 563EXPORT_SYMBOL_GPL(page_cache_async_readahead);
diff --git a/mm/shmem.c b/mm/shmem.c
index 4fb41c83daca..eef4ebea5158 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -29,7 +29,6 @@
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/swap.h> 31#include <linux/swap.h>
32#include <linux/ima.h>
33 32
34static struct vfsmount *shm_mnt; 33static struct vfsmount *shm_mnt;
35 34
@@ -42,6 +41,7 @@ static struct vfsmount *shm_mnt;
42 41
43#include <linux/xattr.h> 42#include <linux/xattr.h>
44#include <linux/exportfs.h> 43#include <linux/exportfs.h>
44#include <linux/posix_acl.h>
45#include <linux/generic_acl.h> 45#include <linux/generic_acl.h>
46#include <linux/mman.h> 46#include <linux/mman.h>
47#include <linux/string.h> 47#include <linux/string.h>
@@ -810,7 +810,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
810 error = inode_setattr(inode, attr); 810 error = inode_setattr(inode, attr);
811#ifdef CONFIG_TMPFS_POSIX_ACL 811#ifdef CONFIG_TMPFS_POSIX_ACL
812 if (!error && (attr->ia_valid & ATTR_MODE)) 812 if (!error && (attr->ia_valid & ATTR_MODE))
813 error = generic_acl_chmod(inode, &shmem_acl_ops); 813 error = generic_acl_chmod(inode);
814#endif 814#endif
815 if (page) 815 if (page)
816 page_cache_release(page); 816 page_cache_release(page);
@@ -1824,11 +1824,15 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1824 return error; 1824 return error;
1825 } 1825 }
1826 } 1826 }
1827 error = shmem_acl_init(inode, dir); 1827#ifdef CONFIG_TMPFS_POSIX_ACL
1828 error = generic_acl_init(inode, dir);
1828 if (error) { 1829 if (error) {
1829 iput(inode); 1830 iput(inode);
1830 return error; 1831 return error;
1831 } 1832 }
1833#else
1834 error = 0;
1835#endif
1832 if (dir->i_mode & S_ISGID) { 1836 if (dir->i_mode & S_ISGID) {
1833 inode->i_gid = dir->i_gid; 1837 inode->i_gid = dir->i_gid;
1834 if (S_ISDIR(mode)) 1838 if (S_ISDIR(mode))
@@ -2043,27 +2047,28 @@ static const struct inode_operations shmem_symlink_inode_operations = {
2043 * filesystem level, though. 2047 * filesystem level, though.
2044 */ 2048 */
2045 2049
2046static size_t shmem_xattr_security_list(struct inode *inode, char *list, 2050static size_t shmem_xattr_security_list(struct dentry *dentry, char *list,
2047 size_t list_len, const char *name, 2051 size_t list_len, const char *name,
2048 size_t name_len) 2052 size_t name_len, int handler_flags)
2049{ 2053{
2050 return security_inode_listsecurity(inode, list, list_len); 2054 return security_inode_listsecurity(dentry->d_inode, list, list_len);
2051} 2055}
2052 2056
2053static int shmem_xattr_security_get(struct inode *inode, const char *name, 2057static int shmem_xattr_security_get(struct dentry *dentry, const char *name,
2054 void *buffer, size_t size) 2058 void *buffer, size_t size, int handler_flags)
2055{ 2059{
2056 if (strcmp(name, "") == 0) 2060 if (strcmp(name, "") == 0)
2057 return -EINVAL; 2061 return -EINVAL;
2058 return xattr_getsecurity(inode, name, buffer, size); 2062 return xattr_getsecurity(dentry->d_inode, name, buffer, size);
2059} 2063}
2060 2064
2061static int shmem_xattr_security_set(struct inode *inode, const char *name, 2065static int shmem_xattr_security_set(struct dentry *dentry, const char *name,
2062 const void *value, size_t size, int flags) 2066 const void *value, size_t size, int flags, int handler_flags)
2063{ 2067{
2064 if (strcmp(name, "") == 0) 2068 if (strcmp(name, "") == 0)
2065 return -EINVAL; 2069 return -EINVAL;
2066 return security_inode_setsecurity(inode, name, value, size, flags); 2070 return security_inode_setsecurity(dentry->d_inode, name, value,
2071 size, flags);
2067} 2072}
2068 2073
2069static struct xattr_handler shmem_xattr_security_handler = { 2074static struct xattr_handler shmem_xattr_security_handler = {
@@ -2074,8 +2079,8 @@ static struct xattr_handler shmem_xattr_security_handler = {
2074}; 2079};
2075 2080
2076static struct xattr_handler *shmem_xattr_handlers[] = { 2081static struct xattr_handler *shmem_xattr_handlers[] = {
2077 &shmem_xattr_acl_access_handler, 2082 &generic_acl_access_handler,
2078 &shmem_xattr_acl_default_handler, 2083 &generic_acl_default_handler,
2079 &shmem_xattr_security_handler, 2084 &shmem_xattr_security_handler,
2080 NULL 2085 NULL
2081}; 2086};
@@ -2454,7 +2459,7 @@ static const struct inode_operations shmem_inode_operations = {
2454 .getxattr = generic_getxattr, 2459 .getxattr = generic_getxattr,
2455 .listxattr = generic_listxattr, 2460 .listxattr = generic_listxattr,
2456 .removexattr = generic_removexattr, 2461 .removexattr = generic_removexattr,
2457 .check_acl = shmem_check_acl, 2462 .check_acl = generic_check_acl,
2458#endif 2463#endif
2459 2464
2460}; 2465};
@@ -2477,7 +2482,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
2477 .getxattr = generic_getxattr, 2482 .getxattr = generic_getxattr,
2478 .listxattr = generic_listxattr, 2483 .listxattr = generic_listxattr,
2479 .removexattr = generic_removexattr, 2484 .removexattr = generic_removexattr,
2480 .check_acl = shmem_check_acl, 2485 .check_acl = generic_check_acl,
2481#endif 2486#endif
2482}; 2487};
2483 2488
@@ -2488,7 +2493,7 @@ static const struct inode_operations shmem_special_inode_operations = {
2488 .getxattr = generic_getxattr, 2493 .getxattr = generic_getxattr,
2489 .listxattr = generic_listxattr, 2494 .listxattr = generic_listxattr,
2490 .removexattr = generic_removexattr, 2495 .removexattr = generic_removexattr,
2491 .check_acl = shmem_check_acl, 2496 .check_acl = generic_check_acl,
2492#endif 2497#endif
2493}; 2498};
2494 2499
@@ -2626,7 +2631,8 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2626 int error; 2631 int error;
2627 struct file *file; 2632 struct file *file;
2628 struct inode *inode; 2633 struct inode *inode;
2629 struct dentry *dentry, *root; 2634 struct path path;
2635 struct dentry *root;
2630 struct qstr this; 2636 struct qstr this;
2631 2637
2632 if (IS_ERR(shm_mnt)) 2638 if (IS_ERR(shm_mnt))
@@ -2643,38 +2649,35 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2643 this.len = strlen(name); 2649 this.len = strlen(name);
2644 this.hash = 0; /* will go */ 2650 this.hash = 0; /* will go */
2645 root = shm_mnt->mnt_root; 2651 root = shm_mnt->mnt_root;
2646 dentry = d_alloc(root, &this); 2652 path.dentry = d_alloc(root, &this);
2647 if (!dentry) 2653 if (!path.dentry)
2648 goto put_memory; 2654 goto put_memory;
2649 2655 path.mnt = mntget(shm_mnt);
2650 error = -ENFILE;
2651 file = get_empty_filp();
2652 if (!file)
2653 goto put_dentry;
2654 2656
2655 error = -ENOSPC; 2657 error = -ENOSPC;
2656 inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); 2658 inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags);
2657 if (!inode) 2659 if (!inode)
2658 goto close_file; 2660 goto put_dentry;
2659 2661
2660 d_instantiate(dentry, inode); 2662 d_instantiate(path.dentry, inode);
2661 inode->i_size = size; 2663 inode->i_size = size;
2662 inode->i_nlink = 0; /* It is unlinked */ 2664 inode->i_nlink = 0; /* It is unlinked */
2663 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
2664 &shmem_file_operations);
2665
2666#ifndef CONFIG_MMU 2665#ifndef CONFIG_MMU
2667 error = ramfs_nommu_expand_for_mapping(inode, size); 2666 error = ramfs_nommu_expand_for_mapping(inode, size);
2668 if (error) 2667 if (error)
2669 goto close_file; 2668 goto put_dentry;
2670#endif 2669#endif
2671 ima_counts_get(file); 2670
2671 error = -ENFILE;
2672 file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
2673 &shmem_file_operations);
2674 if (!file)
2675 goto put_dentry;
2676
2672 return file; 2677 return file;
2673 2678
2674close_file:
2675 put_filp(file);
2676put_dentry: 2679put_dentry:
2677 dput(dentry); 2680 path_put(&path);
2678put_memory: 2681put_memory:
2679 shmem_unacct_size(flags, size); 2682 shmem_unacct_size(flags, size);
2680 return ERR_PTR(error); 2683 return ERR_PTR(error);
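
The alloc_file() conversion above leaves the calling convention of shmem_file_setup() untouched; for reference, a hedged sketch of a typical caller (the name and size are arbitrary):

/* Illustrative only. */
static struct file *example_shmem_file(void)
{
	/* 64 KB unlinked tmpfs-backed file; flags as in other callers */
	return shmem_file_setup("example-shmem", 64 << 10, VM_NORESERVE);
}

The returned file is used like any other struct file and released with fput() when no longer needed.
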
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
deleted file mode 100644
index df2c87fdae50..000000000000
--- a/mm/shmem_acl.c
+++ /dev/null
@@ -1,171 +0,0 @@
1/*
2 * mm/shmem_acl.c
3 *
4 * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
5 *
6 * This file is released under the GPL.
7 */
8
9#include <linux/fs.h>
10#include <linux/shmem_fs.h>
11#include <linux/xattr.h>
12#include <linux/generic_acl.h>
13
14/**
15 * shmem_get_acl - generic_acl_operations->getacl() operation
16 */
17static struct posix_acl *
18shmem_get_acl(struct inode *inode, int type)
19{
20 struct posix_acl *acl = NULL;
21
22 spin_lock(&inode->i_lock);
23 switch(type) {
24 case ACL_TYPE_ACCESS:
25 acl = posix_acl_dup(inode->i_acl);
26 break;
27
28 case ACL_TYPE_DEFAULT:
29 acl = posix_acl_dup(inode->i_default_acl);
30 break;
31 }
32 spin_unlock(&inode->i_lock);
33
34 return acl;
35}
36
37/**
38 * shmem_set_acl - generic_acl_operations->setacl() operation
39 */
40static void
41shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
42{
43 struct posix_acl *free = NULL;
44
45 spin_lock(&inode->i_lock);
46 switch(type) {
47 case ACL_TYPE_ACCESS:
48 free = inode->i_acl;
49 inode->i_acl = posix_acl_dup(acl);
50 break;
51
52 case ACL_TYPE_DEFAULT:
53 free = inode->i_default_acl;
54 inode->i_default_acl = posix_acl_dup(acl);
55 break;
56 }
57 spin_unlock(&inode->i_lock);
58 posix_acl_release(free);
59}
60
61struct generic_acl_operations shmem_acl_ops = {
62 .getacl = shmem_get_acl,
63 .setacl = shmem_set_acl,
64};
65
66/**
67 * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access,
68 * shmem_xattr_acl_access_handler - plumbing code to implement the
69 * system.posix_acl_access xattr using the generic acl functions.
70 */
71
72static size_t
73shmem_list_acl_access(struct inode *inode, char *list, size_t list_size,
74 const char *name, size_t name_len)
75{
76 return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS,
77 list, list_size);
78}
79
80static int
81shmem_get_acl_access(struct inode *inode, const char *name, void *buffer,
82 size_t size)
83{
84 if (strcmp(name, "") != 0)
85 return -EINVAL;
86 return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer,
87 size);
88}
89
90static int
91shmem_set_acl_access(struct inode *inode, const char *name, const void *value,
92 size_t size, int flags)
93{
94 if (strcmp(name, "") != 0)
95 return -EINVAL;
96 return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value,
97 size);
98}
99
100struct xattr_handler shmem_xattr_acl_access_handler = {
101 .prefix = POSIX_ACL_XATTR_ACCESS,
102 .list = shmem_list_acl_access,
103 .get = shmem_get_acl_access,
104 .set = shmem_set_acl_access,
105};
106
107/**
108 * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default,
109 * shmem_xattr_acl_default_handler - plumbing code to implement the
110 * system.posix_acl_default xattr using the generic acl functions.
111 */
112
113static size_t
114shmem_list_acl_default(struct inode *inode, char *list, size_t list_size,
115 const char *name, size_t name_len)
116{
117 return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT,
118 list, list_size);
119}
120
121static int
122shmem_get_acl_default(struct inode *inode, const char *name, void *buffer,
123 size_t size)
124{
125 if (strcmp(name, "") != 0)
126 return -EINVAL;
127 return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer,
128 size);
129}
130
131static int
132shmem_set_acl_default(struct inode *inode, const char *name, const void *value,
133 size_t size, int flags)
134{
135 if (strcmp(name, "") != 0)
136 return -EINVAL;
137 return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value,
138 size);
139}
140
141struct xattr_handler shmem_xattr_acl_default_handler = {
142 .prefix = POSIX_ACL_XATTR_DEFAULT,
143 .list = shmem_list_acl_default,
144 .get = shmem_get_acl_default,
145 .set = shmem_set_acl_default,
146};
147
148/**
149 * shmem_acl_init - Inizialize the acl(s) of a new inode
150 */
151int
152shmem_acl_init(struct inode *inode, struct inode *dir)
153{
154 return generic_acl_init(inode, dir, &shmem_acl_ops);
155}
156
157/**
158 * shmem_check_acl - check_acl() callback for generic_permission()
159 */
160int
161shmem_check_acl(struct inode *inode, int mask)
162{
163 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
164
165 if (acl) {
166 int error = posix_acl_permission(inode, acl, mask);
167 posix_acl_release(acl);
168 return error;
169 }
170 return -EAGAIN;
171}
diff --git a/mm/slab.c b/mm/slab.c
index 3f4822938f46..7451bdacaf18 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -654,7 +654,7 @@ static void init_node_lock_keys(int q)
654 654
655 l3 = s->cs_cachep->nodelists[q]; 655 l3 = s->cs_cachep->nodelists[q];
656 if (!l3 || OFF_SLAB(s->cs_cachep)) 656 if (!l3 || OFF_SLAB(s->cs_cachep))
657 return; 657 continue;
658 lockdep_set_class(&l3->list_lock, &on_slab_l3_key); 658 lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
659 alc = l3->alien; 659 alc = l3->alien;
660 /* 660 /*
@@ -665,7 +665,7 @@ static void init_node_lock_keys(int q)
665 * for alloc_alien_cache, 665 * for alloc_alien_cache,
666 */ 666 */
667 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) 667 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
668 return; 668 continue;
669 for_each_node(r) { 669 for_each_node(r) {
670 if (alc[r]) 670 if (alc[r])
671 lockdep_set_class(&alc[r]->lock, 671 lockdep_set_class(&alc[r]->lock,
@@ -1132,7 +1132,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1132 if (nc) 1132 if (nc)
1133 free_block(cachep, nc->entry, nc->avail, node); 1133 free_block(cachep, nc->entry, nc->avail, node);
1134 1134
1135 if (!cpus_empty(*mask)) { 1135 if (!cpumask_empty(mask)) {
1136 spin_unlock_irq(&l3->list_lock); 1136 spin_unlock_irq(&l3->list_lock);
1137 goto free_array_cache; 1137 goto free_array_cache;
1138 } 1138 }
@@ -2275,9 +2275,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2275 /* 2275 /*
2276 * Determine if the slab management is 'on' or 'off' slab. 2276 * Determine if the slab management is 'on' or 'off' slab.
2277 * (bootstrapping cannot cope with offslab caches so don't do 2277 * (bootstrapping cannot cope with offslab caches so don't do
2278 * it too early on.) 2278 * it too early on. Always use on-slab management when
2279 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
2279 */ 2280 */
2280 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) 2281 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
2282 !(flags & SLAB_NOLEAKTRACE))
2281 /* 2283 /*
2282 * Size is large, assume best to place the slab management obj 2284 * Size is large, assume best to place the slab management obj
2283 * off-slab (should allow better packing of objs). 2285 * off-slab (should allow better packing of objs).
@@ -2596,8 +2598,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2596 * kmemleak does not treat the ->s_mem pointer as a reference 2598 * kmemleak does not treat the ->s_mem pointer as a reference
2597 * to the object. Otherwise we will not report the leak. 2599 * to the object. Otherwise we will not report the leak.
2598 */ 2600 */
2599 kmemleak_scan_area(slabp, offsetof(struct slab, list), 2601 kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
2600 sizeof(struct list_head), local_flags); 2602 local_flags);
2601 if (!slabp) 2603 if (!slabp)
2602 return NULL; 2604 return NULL;
2603 } else { 2605 } else {
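
The first two slab.c hunks are a loop-control fix: init_node_lock_keys() walks all kmalloc caches, and returning on the first cache that is off-slab or has no nodelist on that node silently left every later cache without its lockdep class. A standalone sketch of the pattern, with a made-up skip_cache() predicate standing in for the real checks:

        #include <stdio.h>

        /* Made-up predicate standing in for the "no nodelist / off-slab" checks. */
        static int skip_cache(int i)
        {
                return i == 1;
        }

        static void annotate_buggy(int ncaches)
        {
                for (int i = 0; i < ncaches; i++) {
                        if (skip_cache(i))
                                return;         /* old code: everything after i is skipped */
                        printf("  annotated cache %d\n", i);
                }
        }

        static void annotate_fixed(int ncaches)
        {
                for (int i = 0; i < ncaches; i++) {
                        if (skip_cache(i))
                                continue;       /* new code: only this cache is skipped */
                        printf("  annotated cache %d\n", i);
                }
        }

        int main(void)
        {
                puts("buggy:");
                annotate_buggy(4);              /* annotates cache 0 only */
                puts("fixed:");
                annotate_fixed(4);              /* annotates caches 0, 2 and 3 */
                return 0;
        }
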
diff --git a/mm/truncate.c b/mm/truncate.c
index 342deee22684..e87e37244829 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -522,22 +522,20 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
522 */ 522 */
523void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) 523void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
524{ 524{
525 if (new < old) { 525 struct address_space *mapping = inode->i_mapping;
526 struct address_space *mapping = inode->i_mapping; 526
527 527 /*
528 /* 528 * unmap_mapping_range is called twice, first simply for
529 * unmap_mapping_range is called twice, first simply for 529 * efficiency so that truncate_inode_pages does fewer
530 * efficiency so that truncate_inode_pages does fewer 530 * single-page unmaps. However after this first call, and
531 * single-page unmaps. However after this first call, and 531 * before truncate_inode_pages finishes, it is possible for
532 * before truncate_inode_pages finishes, it is possible for 532 * private pages to be COWed, which remain after
533 * private pages to be COWed, which remain after 533 * truncate_inode_pages finishes, hence the second
534 * truncate_inode_pages finishes, hence the second 534 * unmap_mapping_range call must be made for correctness.
535 * unmap_mapping_range call must be made for correctness. 535 */
536 */ 536 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
537 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); 537 truncate_inode_pages(mapping, new);
538 truncate_inode_pages(mapping, new); 538 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
539 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
540 }
541} 539}
542EXPORT_SYMBOL(truncate_pagecache); 540EXPORT_SYMBOL(truncate_pagecache);
543 541
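
With the "new < old" guard gone, truncate_pagecache() performs the unmap/truncate/unmap sequence unconditionally, so callers no longer need to special-case shrinking versus extending truncates. A hedged sketch of the expected caller convention (demo_fs_setsize is a hypothetical filesystem helper, not a function in this diff; locking and error handling are elided):

        #include <linux/fs.h>
        #include <linux/mm.h>

        /* Hypothetical caller: update i_size first, then let
         * truncate_pagecache() drop the stale cache and mappings. */
        static int demo_fs_setsize(struct inode *inode, loff_t newsize)
        {
                loff_t oldsize = inode->i_size;

                i_size_write(inode, newsize);
                truncate_pagecache(inode, oldsize, newsize);
                return 0;
        }
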
diff --git a/mm/util.c b/mm/util.c
index b377ce430803..834db7be240f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,10 +4,6 @@
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/hugetlb.h>
8#include <linux/syscalls.h>
9#include <linux/mman.h>
10#include <linux/file.h>
11#include <asm/uaccess.h> 7#include <asm/uaccess.h>
12 8
13#define CREATE_TRACE_POINTS 9#define CREATE_TRACE_POINTS
@@ -224,7 +220,7 @@ char *strndup_user(const char __user *s, long n)
224} 220}
225EXPORT_SYMBOL(strndup_user); 221EXPORT_SYMBOL(strndup_user);
226 222
227#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT 223#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
228void arch_pick_mmap_layout(struct mm_struct *mm) 224void arch_pick_mmap_layout(struct mm_struct *mm)
229{ 225{
230 mm->mmap_base = TASK_UNMAPPED_BASE; 226 mm->mmap_base = TASK_UNMAPPED_BASE;
@@ -272,46 +268,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
272} 268}
273EXPORT_SYMBOL_GPL(get_user_pages_fast); 269EXPORT_SYMBOL_GPL(get_user_pages_fast);
274 270
275SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
276 unsigned long, prot, unsigned long, flags,
277 unsigned long, fd, unsigned long, pgoff)
278{
279 struct file * file = NULL;
280 unsigned long retval = -EBADF;
281
282 if (!(flags & MAP_ANONYMOUS)) {
283 if (unlikely(flags & MAP_HUGETLB))
284 return -EINVAL;
285 file = fget(fd);
286 if (!file)
287 goto out;
288 } else if (flags & MAP_HUGETLB) {
289 struct user_struct *user = NULL;
290 /*
291 * VM_NORESERVE is used because the reservations will be
292 * taken when vm_ops->mmap() is called
293 * A dummy user value is used because we are not locking
294 * memory so no accounting is necessary
295 */
296 len = ALIGN(len, huge_page_size(&default_hstate));
297 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
298 &user, HUGETLB_ANONHUGE_INODE);
299 if (IS_ERR(file))
300 return PTR_ERR(file);
301 }
302
303 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
304
305 down_write(&current->mm->mmap_sem);
306 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
307 up_write(&current->mm->mmap_sem);
308
309 if (file)
310 fput(file);
311out:
312 return retval;
313}
314
315/* Tracepoints definitions. */ 271/* Tracepoints definitions. */
316EXPORT_TRACEPOINT_SYMBOL(kmalloc); 272EXPORT_TRACEPOINT_SYMBOL(kmalloc);
317EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); 273EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
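
The sys_mmap_pgoff() body leaves util.c rather than the kernel; elsewhere in this series it is carried by the mm/mmap.c and mm/nommu.c additions, which is also why the hugetlb, syscall, mman and file includes at the top of util.c are no longer needed. The anonymous MAP_HUGETLB branch it contains is reachable from userspace with a plain mmap() call; a minimal sketch, assuming the x86 MAP_HUGETLB value and that huge pages have been reserved via /proc/sys/vm/nr_hugepages:

        #define _GNU_SOURCE
        #include <stdio.h>
        #include <sys/mman.h>

        #ifndef MAP_HUGETLB
        #define MAP_HUGETLB 0x40000     /* assumed x86 value, for illustration */
        #endif

        int main(void)
        {
                size_t len = 2 * 1024 * 1024;   /* assumes 2 MiB huge pages */
                void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                               MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

                if (p == MAP_FAILED) {
                        perror("mmap(MAP_HUGETLB)");    /* e.g. no huge pages reserved */
                        return 1;
                }
                printf("huge anonymous mapping at %p\n", p);
                munmap(p, len);
                return 0;
        }

If no huge pages are reserved, the mmap() fails with ENOMEM instead of falling back to normal pages.
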
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 37e69295f250..ae007462b7f6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -509,6 +509,9 @@ static unsigned long lazy_max_pages(void)
509 509
510static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); 510static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
511 511
512/* for per-CPU blocks */
513static void purge_fragmented_blocks_allcpus(void);
514
512/* 515/*
513 * Purges all lazily-freed vmap areas. 516 * Purges all lazily-freed vmap areas.
514 * 517 *
@@ -539,6 +542,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
539 } else 542 } else
540 spin_lock(&purge_lock); 543 spin_lock(&purge_lock);
541 544
545 if (sync)
546 purge_fragmented_blocks_allcpus();
547
542 rcu_read_lock(); 548 rcu_read_lock();
543 list_for_each_entry_rcu(va, &vmap_area_list, list) { 549 list_for_each_entry_rcu(va, &vmap_area_list, list) {
544 if (va->flags & VM_LAZY_FREE) { 550 if (va->flags & VM_LAZY_FREE) {
@@ -555,10 +561,8 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
555 } 561 }
556 rcu_read_unlock(); 562 rcu_read_unlock();
557 563
558 if (nr) { 564 if (nr)
559 BUG_ON(nr > atomic_read(&vmap_lazy_nr));
560 atomic_sub(nr, &vmap_lazy_nr); 565 atomic_sub(nr, &vmap_lazy_nr);
561 }
562 566
563 if (nr || force_flush) 567 if (nr || force_flush)
564 flush_tlb_kernel_range(*start, *end); 568 flush_tlb_kernel_range(*start, *end);
@@ -669,8 +673,6 @@ static bool vmap_initialized __read_mostly = false;
669struct vmap_block_queue { 673struct vmap_block_queue {
670 spinlock_t lock; 674 spinlock_t lock;
671 struct list_head free; 675 struct list_head free;
672 struct list_head dirty;
673 unsigned int nr_dirty;
674}; 676};
675 677
676struct vmap_block { 678struct vmap_block {
@@ -680,10 +682,9 @@ struct vmap_block {
680 unsigned long free, dirty; 682 unsigned long free, dirty;
681 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); 683 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
682 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 684 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
683 union { 685 struct list_head free_list;
684 struct list_head free_list; 686 struct rcu_head rcu_head;
685 struct rcu_head rcu_head; 687 struct list_head purge;
686 };
687}; 688};
688 689
689/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ 690/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
@@ -759,7 +760,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
759 vbq = &get_cpu_var(vmap_block_queue); 760 vbq = &get_cpu_var(vmap_block_queue);
760 vb->vbq = vbq; 761 vb->vbq = vbq;
761 spin_lock(&vbq->lock); 762 spin_lock(&vbq->lock);
762 list_add(&vb->free_list, &vbq->free); 763 list_add_rcu(&vb->free_list, &vbq->free);
763 spin_unlock(&vbq->lock); 764 spin_unlock(&vbq->lock);
764 put_cpu_var(vmap_block_queue); 765 put_cpu_var(vmap_block_queue);
765 766
@@ -778,8 +779,6 @@ static void free_vmap_block(struct vmap_block *vb)
778 struct vmap_block *tmp; 779 struct vmap_block *tmp;
779 unsigned long vb_idx; 780 unsigned long vb_idx;
780 781
781 BUG_ON(!list_empty(&vb->free_list));
782
783 vb_idx = addr_to_vb_idx(vb->va->va_start); 782 vb_idx = addr_to_vb_idx(vb->va->va_start);
784 spin_lock(&vmap_block_tree_lock); 783 spin_lock(&vmap_block_tree_lock);
785 tmp = radix_tree_delete(&vmap_block_tree, vb_idx); 784 tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
@@ -790,12 +789,61 @@ static void free_vmap_block(struct vmap_block *vb)
790 call_rcu(&vb->rcu_head, rcu_free_vb); 789 call_rcu(&vb->rcu_head, rcu_free_vb);
791} 790}
792 791
792static void purge_fragmented_blocks(int cpu)
793{
794 LIST_HEAD(purge);
795 struct vmap_block *vb;
796 struct vmap_block *n_vb;
797 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
798
799 rcu_read_lock();
800 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
801
802 if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
803 continue;
804
805 spin_lock(&vb->lock);
806 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
807 vb->free = 0; /* prevent further allocs after releasing lock */
808 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
809 bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
810 bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
811 spin_lock(&vbq->lock);
812 list_del_rcu(&vb->free_list);
813 spin_unlock(&vbq->lock);
814 spin_unlock(&vb->lock);
815 list_add_tail(&vb->purge, &purge);
816 } else
817 spin_unlock(&vb->lock);
818 }
819 rcu_read_unlock();
820
821 list_for_each_entry_safe(vb, n_vb, &purge, purge) {
822 list_del(&vb->purge);
823 free_vmap_block(vb);
824 }
825}
826
827static void purge_fragmented_blocks_thiscpu(void)
828{
829 purge_fragmented_blocks(smp_processor_id());
830}
831
832static void purge_fragmented_blocks_allcpus(void)
833{
834 int cpu;
835
836 for_each_possible_cpu(cpu)
837 purge_fragmented_blocks(cpu);
838}
839
793static void *vb_alloc(unsigned long size, gfp_t gfp_mask) 840static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
794{ 841{
795 struct vmap_block_queue *vbq; 842 struct vmap_block_queue *vbq;
796 struct vmap_block *vb; 843 struct vmap_block *vb;
797 unsigned long addr = 0; 844 unsigned long addr = 0;
798 unsigned int order; 845 unsigned int order;
846 int purge = 0;
799 847
800 BUG_ON(size & ~PAGE_MASK); 848 BUG_ON(size & ~PAGE_MASK);
801 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 849 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -808,24 +856,38 @@ again:
808 int i; 856 int i;
809 857
810 spin_lock(&vb->lock); 858 spin_lock(&vb->lock);
859 if (vb->free < 1UL << order)
860 goto next;
861
811 i = bitmap_find_free_region(vb->alloc_map, 862 i = bitmap_find_free_region(vb->alloc_map,
812 VMAP_BBMAP_BITS, order); 863 VMAP_BBMAP_BITS, order);
813 864
814 if (i >= 0) { 865 if (i < 0) {
815 addr = vb->va->va_start + (i << PAGE_SHIFT); 866 if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
816 BUG_ON(addr_to_vb_idx(addr) != 867 /* fragmented and no outstanding allocations */
817 addr_to_vb_idx(vb->va->va_start)); 868 BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
818 vb->free -= 1UL << order; 869 purge = 1;
819 if (vb->free == 0) {
820 spin_lock(&vbq->lock);
821 list_del_init(&vb->free_list);
822 spin_unlock(&vbq->lock);
823 } 870 }
824 spin_unlock(&vb->lock); 871 goto next;
825 break; 872 }
873 addr = vb->va->va_start + (i << PAGE_SHIFT);
874 BUG_ON(addr_to_vb_idx(addr) !=
875 addr_to_vb_idx(vb->va->va_start));
876 vb->free -= 1UL << order;
877 if (vb->free == 0) {
878 spin_lock(&vbq->lock);
879 list_del_rcu(&vb->free_list);
880 spin_unlock(&vbq->lock);
826 } 881 }
827 spin_unlock(&vb->lock); 882 spin_unlock(&vb->lock);
883 break;
884next:
885 spin_unlock(&vb->lock);
828 } 886 }
887
888 if (purge)
889 purge_fragmented_blocks_thiscpu();
890
829 put_cpu_var(vmap_block_queue); 891 put_cpu_var(vmap_block_queue);
830 rcu_read_unlock(); 892 rcu_read_unlock();
831 893
@@ -862,11 +924,11 @@ static void vb_free(const void *addr, unsigned long size)
862 BUG_ON(!vb); 924 BUG_ON(!vb);
863 925
864 spin_lock(&vb->lock); 926 spin_lock(&vb->lock);
865 bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); 927 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
866 928
867 vb->dirty += 1UL << order; 929 vb->dirty += 1UL << order;
868 if (vb->dirty == VMAP_BBMAP_BITS) { 930 if (vb->dirty == VMAP_BBMAP_BITS) {
869 BUG_ON(vb->free || !list_empty(&vb->free_list)); 931 BUG_ON(vb->free);
870 spin_unlock(&vb->lock); 932 spin_unlock(&vb->lock);
871 free_vmap_block(vb); 933 free_vmap_block(vb);
872 } else 934 } else
@@ -1035,8 +1097,6 @@ void __init vmalloc_init(void)
1035 vbq = &per_cpu(vmap_block_queue, i); 1097 vbq = &per_cpu(vmap_block_queue, i);
1036 spin_lock_init(&vbq->lock); 1098 spin_lock_init(&vbq->lock);
1037 INIT_LIST_HEAD(&vbq->free); 1099 INIT_LIST_HEAD(&vbq->free);
1038 INIT_LIST_HEAD(&vbq->dirty);
1039 vbq->nr_dirty = 0;
1040 } 1100 }
1041 1101
1042 /* Import existing vmlist entries. */ 1102 /* Import existing vmlist entries. */
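
The heart of the new vmalloc purging code is the predicate in purge_fragmented_blocks(): a block is reclaimable when every bit is either free or dirty (so no allocation is still live in it) but not every bit is dirty (a fully dirty block is already handed to free_vmap_block() by vb_free()). A standalone restatement, with VMAP_BBMAP_BITS fixed to an assumed value purely for illustration:

        #include <stdio.h>
        #include <stdbool.h>

        #define VMAP_BBMAP_BITS 1024    /* assumed value, for illustration only */

        /* Purgeable: nothing live in the block (free + dirty covers every
         * bit), yet it is not entirely dirty. */
        static bool purgeable(unsigned long free, unsigned long dirty)
        {
                return free + dirty == VMAP_BBMAP_BITS && dirty != VMAP_BBMAP_BITS;
        }

        int main(void)
        {
                printf("%d\n", purgeable(0, 1024));     /* 0: fully dirty, freed elsewhere */
                printf("%d\n", purgeable(256, 768));    /* 1: fragmented, nothing live */
                printf("%d\n", purgeable(200, 700));    /* 0: 124 bits still allocated */
                return 0;
        }

Re-checking the same condition under vb->lock, as the kernel code does after the lockless RCU walk, is what keeps the purge safe against concurrent vb_alloc()/vb_free().
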
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 885207a6b6b7..c26986c85ce0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1922,6 +1922,9 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
1922 if (!populated_zone(zone)) 1922 if (!populated_zone(zone))
1923 continue; 1923 continue;
1924 1924
1925 if (zone_is_all_unreclaimable(zone))
1926 continue;
1927
1925 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 1928 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
1926 0, 0)) 1929 0, 0))
1927 return 1; 1930 return 1;
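
The vmscan.c hunk keeps kswapd from being held awake by zones it has already declared hopeless: an all-unreclaimable zone is skipped before the watermark test, just like an unpopulated one. A simplified, illustrative restatement of the per-zone decision (demo_zone and keeps_kswapd_awake are made-up names, not kernel code):

        #include <stdio.h>
        #include <stdbool.h>

        /* Made-up structure; only the decision logic mirrors the hunk. */
        struct demo_zone {
                bool populated;
                bool all_unreclaimable;
                bool watermark_ok;      /* stand-in for zone_watermark_ok() */
        };

        static bool keeps_kswapd_awake(const struct demo_zone *z)
        {
                if (!z->populated)
                        return false;
                if (z->all_unreclaimable)
                        return false;           /* the new check in this hunk */
                return !z->watermark_ok;        /* below high watermark: keep working */
        }

        int main(void)
        {
                struct demo_zone dead = { true, true,  false };
                struct demo_zone low  = { true, false, false };

                /* prints "0 1": the dead zone no longer keeps kswapd awake */
                printf("%d %d\n", keeps_kswapd_awake(&dead), keeps_kswapd_awake(&low));
                return 0;
        }
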