Diffstat (limited to 'mm')
-rw-r--r--   mm/filemap.c       3
-rw-r--r--   mm/madvise.c       2
-rw-r--r--   mm/memcontrol.c  140
-rw-r--r--   mm/memory.c        1
-rw-r--r--   mm/nommu.c        12
-rw-r--r--   mm/oom_kill.c      3
-rw-r--r--   mm/page_alloc.c   54
-rw-r--r--   mm/rmap.c          7
-rw-r--r--   mm/slab.c         17
-rw-r--r--   mm/slob.c          6
-rw-r--r--   mm/slub.c        105
-rw-r--r--   mm/swapfile.c     29
-rw-r--r--   mm/truncate.c      3
-rw-r--r--   mm/vmalloc.c      18
-rw-r--r--   mm/vmscan.c      133
15 files changed, 392 insertions, 141 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index a8251a8d3457..f820e600f1ad 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -78,9 +78,6 @@
  * ->i_mutex (generic_file_buffered_write)
  * ->mmap_sem (fault_in_pages_readable->do_page_fault)
  *
- * ->i_mutex
- *   ->i_alloc_sem (various)
- *
  * inode_wb_list_lock
  *   sb_lock (fs/fs-writeback.c)
  *   ->mapping->tree_lock (__sync_single_inode)
diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491ed503..74bf193eff04 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma,
 	endoff = (loff_t)(end - vma->vm_start - 1)
 			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
-	/* vmtruncate_range needs to take i_mutex and i_alloc_sem */
+	/* vmtruncate_range needs to take i_mutex */
 	up_read(&current->mm->mmap_sem);
 	error = vmtruncate_range(mapping->host, offset, endoff);
 	down_read(&current->mm->mmap_sem);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ddffc74cdebe..e013b8e57d25 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -108,10 +108,12 @@ enum mem_cgroup_events_index {
 enum mem_cgroup_events_target {
 	MEM_CGROUP_TARGET_THRESH,
 	MEM_CGROUP_TARGET_SOFTLIMIT,
+	MEM_CGROUP_TARGET_NUMAINFO,
 	MEM_CGROUP_NTARGETS,
 };
 #define THRESHOLDS_EVENTS_TARGET (128)
 #define SOFTLIMIT_EVENTS_TARGET (1024)
+#define NUMAINFO_EVENTS_TARGET (1024)
 
 struct mem_cgroup_stat_cpu {
 	long count[MEM_CGROUP_STAT_NSTATS];
@@ -237,7 +239,8 @@ struct mem_cgroup {
 	int last_scanned_node;
 #if MAX_NUMNODES > 1
 	nodemask_t scan_nodes;
-	unsigned long next_scan_node_update;
+	atomic_t numainfo_events;
+	atomic_t numainfo_updating;
 #endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
@@ -577,15 +580,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
 	return val;
 }
 
-static long mem_cgroup_local_usage(struct mem_cgroup *mem)
-{
-	long ret;
-
-	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
-	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
-	return ret;
-}
-
 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
 					 bool charge)
 {
@@ -689,6 +683,9 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
 	case MEM_CGROUP_TARGET_SOFTLIMIT:
 		next = val + SOFTLIMIT_EVENTS_TARGET;
 		break;
+	case MEM_CGROUP_TARGET_NUMAINFO:
+		next = val + NUMAINFO_EVENTS_TARGET;
+		break;
 	default:
 		return;
 	}
@@ -707,11 +704,19 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
 		mem_cgroup_threshold(mem);
 		__mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
 		if (unlikely(__memcg_event_check(mem,
-			MEM_CGROUP_TARGET_SOFTLIMIT))){
+			MEM_CGROUP_TARGET_SOFTLIMIT))) {
 			mem_cgroup_update_tree(mem, page);
 			__mem_cgroup_target_update(mem,
 				MEM_CGROUP_TARGET_SOFTLIMIT);
+		}
+#if MAX_NUMNODES > 1
+		if (unlikely(__memcg_event_check(mem,
+			MEM_CGROUP_TARGET_NUMAINFO))) {
+			atomic_inc(&mem->numainfo_events);
+			__mem_cgroup_target_update(mem,
+				MEM_CGROUP_TARGET_NUMAINFO);
 		}
+#endif
 	}
 }
 
@@ -1129,7 +1134,6 @@ unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
 	return MEM_CGROUP_ZSTAT(mz, lru);
 }
 
-#ifdef CONFIG_NUMA
 static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
 							int nid)
 {
@@ -1141,6 +1145,17 @@ static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
 	return ret;
 }
 
+static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
+							int nid)
+{
+	unsigned long ret;
+
+	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
+		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
+	return ret;
+}
+
+#if MAX_NUMNODES > 1
 static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
 {
 	u64 total = 0;
@@ -1152,17 +1167,6 @@ static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
 	return total;
 }
 
-static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
-							int nid)
-{
-	unsigned long ret;
-
-	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
-		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
-
-	return ret;
-}
-
 static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
 {
 	u64 total = 0;
@@ -1559,6 +1563,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 	return ret;
 }
 
+/**
+ * test_mem_cgroup_node_reclaimable
+ * @mem: the target memcg
+ * @nid: the node ID to be checked.
+ * @noswap : specify true here if the user wants flle only information.
+ *
+ * This function returns whether the specified memcg contains any
+ * reclaimable pages on a node. Returns true if there are any reclaimable
+ * pages in the node.
+ */
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
+		int nid, bool noswap)
+{
+	if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
+		return true;
+	if (noswap || !total_swap_pages)
+		return false;
+	if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
+		return true;
+	return false;
+
+}
 #if MAX_NUMNODES > 1
 
 /*
@@ -1570,26 +1596,26 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
 {
 	int nid;
-
-	if (time_after(mem->next_scan_node_update, jiffies))
+	/*
+	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
+	 * pagein/pageout changes since the last update.
+	 */
+	if (!atomic_read(&mem->numainfo_events))
+		return;
+	if (atomic_inc_return(&mem->numainfo_updating) > 1)
 		return;
 
-	mem->next_scan_node_update = jiffies + 10*HZ;
 	/* make a nodemask where this memcg uses memory from */
 	mem->scan_nodes = node_states[N_HIGH_MEMORY];
 
 	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
 
-		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
-		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
-			continue;
-
-		if (total_swap_pages &&
-		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
-		     mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
-			continue;
-		node_clear(nid, mem->scan_nodes);
+		if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
+			node_clear(nid, mem->scan_nodes);
 	}
+
+	atomic_set(&mem->numainfo_events, 0);
+	atomic_set(&mem->numainfo_updating, 0);
 }
 
 /*
@@ -1627,11 +1653,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 	return node;
 }
 
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not cotain
+ * enough new information. We need to do double check.
+ */
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+	int nid;
+
+	/*
+	 * quick check...making use of scan_node.
+	 * We can skip unused nodes.
+	 */
+	if (!nodes_empty(mem->scan_nodes)) {
+		for (nid = first_node(mem->scan_nodes);
+		     nid < MAX_NUMNODES;
+		     nid = next_node(nid, mem->scan_nodes)) {
+
+			if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+				return true;
+		}
+	}
+	/*
+	 * Check rest of nodes.
+	 */
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		if (node_isset(nid, mem->scan_nodes))
+			continue;
+		if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+			return true;
+	}
+	return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 {
 	return 0;
 }
+
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+	return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
+}
 #endif
 
 /*
@@ -1702,7 +1768,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 				}
 			}
 		}
-		if (!mem_cgroup_local_usage(victim)) {
+		if (!mem_cgroup_reclaimable(victim, noswap)) {
 			/* this cgroup's local usage == 0 */
 			css_put(&victim->css);
 			continue;
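
The memcontrol.c hunks above replace the jiffies-based throttle (next_scan_node_update) with an event counter (numainfo_events) plus a single-updater flag (numainfo_updating), so the per-memcg scan nodemask is only rebuilt after enough page events and by at most one CPU at a time. Below is a minimal userspace sketch of that pattern using C11 atomics; apart from the two counter names, everything (the recompute function and the main() harness) is invented for illustration and is not kernel code.

/* Sketch only: event counter + single-updater guard, as in the patch above. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int numainfo_events;	/* bumped when an event target fires */
static atomic_int numainfo_updating;	/* non-zero while someone recomputes */

static void expensive_recompute(void)
{
	/* stands in for rebuilding mem->scan_nodes */
	puts("recomputing node mask");
}

static void may_update(void)
{
	/* nothing happened since the last rebuild: skip */
	if (!atomic_load(&numainfo_events))
		return;
	/* someone else is already rebuilding: skip */
	if (atomic_fetch_add(&numainfo_updating, 1) > 0)
		return;

	expensive_recompute();

	atomic_store(&numainfo_events, 0);
	atomic_store(&numainfo_updating, 0);
}

int main(void)
{
	may_update();				/* no events yet: skipped */
	atomic_fetch_add(&numainfo_events, 1);	/* memcg_check_events() analog */
	may_update();				/* recomputes once */
	may_update();				/* counter cleared: skipped */
	return 0;
}

atomic_fetch_add() returning the previous value plays the role of the kernel's atomic_inc_return(...) > 1 test: only the first caller to raise the flag does the rebuild, and the two stores at the end re-arm the mechanism.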
diff --git a/mm/memory.c b/mm/memory.c
index 40b7531ee8ba..9b8a01d941cb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -305,6 +305,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 	if (batch->nr == batch->max) {
 		if (!tlb_next_batch(tlb))
 			return 0;
+		batch = tlb->active;
 	}
 	VM_BUG_ON(batch->nr > batch->max);
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 1fd0c51b10a6..5c5c2d4b1807 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,6 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/tracehook.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/mount.h>
@@ -1087,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file,
 	 * it's being traced - otherwise breakpoints set in it may interfere
 	 * with another untraced process
 	 */
-	if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
+	if ((flags & MAP_PRIVATE) && current->ptrace)
 		vm_flags &= ~VM_MAYSHARE;
 
 	return vm_flags;
@@ -1813,10 +1812,13 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 	return NULL;
 }
 
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
-		unsigned long to, unsigned long size, pgprot_t prot)
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot)
 {
-	vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
+	if (addr != (pfn << PAGE_SHIFT))
+		return -EINVAL;
+
+	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
 	return 0;
 }
 EXPORT_SYMBOL(remap_pfn_range);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e4b0991ca351..b0be989d4365 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -339,8 +339,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 			 * then wait for it to finish before killing
 			 * some other task unnecessarily.
 			 */
-			if (!(task_ptrace(p->group_leader) &
-						PT_TRACE_EXIT))
+			if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
 				return ERR_PTR(-1UL);
 		}
 	}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e8985acdab8..9119faae6e6a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4585,6 +4585,60 @@ void __init sort_node_map(void)
 			cmp_node_active_region, NULL);
 }
 
+/**
+ * node_map_pfn_alignment - determine the maximum internode alignment
+ *
+ * This function should be called after node map is populated and sorted.
+ * It calculates the maximum power of two alignment which can distinguish
+ * all the nodes.
+ *
+ * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
+ * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
+ * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is
+ * shifted, 1GiB is enough and this function will indicate so.
+ *
+ * This is used to test whether pfn -> nid mapping of the chosen memory
+ * model has fine enough granularity to avoid incorrect mapping for the
+ * populated node map.
+ *
+ * Returns the determined alignment in pfn's.  0 if there is no alignment
+ * requirement (single node).
+ */
+unsigned long __init node_map_pfn_alignment(void)
+{
+	unsigned long accl_mask = 0, last_end = 0;
+	int last_nid = -1;
+	int i;
+
+	for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
+		int nid = early_node_map[i].nid;
+		unsigned long start = early_node_map[i].start_pfn;
+		unsigned long end = early_node_map[i].end_pfn;
+		unsigned long mask;
+
+		if (!start || last_nid < 0 || last_nid == nid) {
+			last_nid = nid;
+			last_end = end;
+			continue;
+		}
+
+		/*
+		 * Start with a mask granular enough to pin-point to the
+		 * start pfn and tick off bits one-by-one until it becomes
+		 * too coarse to separate the current node from the last.
+		 */
+		mask = ~((1 << __ffs(start)) - 1);
+		while (mask && last_end <= (start & (mask << 1)))
+			mask <<= 1;
+
+		/* accumulate all internode masks */
+		accl_mask |= mask;
+	}
+
+	/* convert mask to number of pages */
+	return ~accl_mask + 1;
+}
+
 /* Find the lowest pfn for a node */
 static unsigned long __init find_min_pfn_for_node(int nid)
 {
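
node_map_pfn_alignment(), added above, derives the coarsest power-of-two pfn alignment that still distinguishes every pair of adjacent nodes. The standalone sketch below reproduces the same mask-accumulation loop over a made-up two-node region table; the struct region type, the node_map[] contents and main() are assumptions for the example, and __builtin_ctzl() stands in for the kernel's __ffs().

/* Sketch only: internode alignment over a hard-coded region table. */
#include <stdio.h>

struct region { int nid; unsigned long start_pfn, end_pfn; };

/* two fake 1GiB nodes with 4KiB pages, the second starting at 1GiB */
static const struct region node_map[] = {
	{ 0, 0x00000, 0x40000 },
	{ 1, 0x40000, 0x80000 },
};

static unsigned long node_map_pfn_alignment(const struct region *map, int n)
{
	unsigned long accl_mask = 0, last_end = 0;
	int last_nid = -1;

	for (int i = 0; i < n; i++) {
		unsigned long start = map[i].start_pfn;
		unsigned long mask;

		if (!start || last_nid < 0 || last_nid == map[i].nid) {
			last_nid = map[i].nid;
			last_end = map[i].end_pfn;
			continue;
		}

		/* widen the mask until it no longer separates the nodes */
		mask = ~((1UL << __builtin_ctzl(start)) - 1);
		while (mask && last_end <= (start & (mask << 1)))
			mask <<= 1;

		/* accumulate all internode masks */
		accl_mask |= mask;
	}

	/* convert mask to number of pages; 0 means no requirement */
	return ~accl_mask + 1;
}

int main(void)
{
	printf("alignment = %#lx pfns\n",
	       node_map_pfn_alignment(node_map, 2));
	return 0;
}

With the two 1GiB nodes above it prints alignment = 0x40000 pfns, which with 4KiB pages is the 1GiB alignment the kernel-doc example describes.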
diff --git a/mm/rmap.c b/mm/rmap.c
index 23295f65ae43..9701574bb67a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,7 +21,6 @@
  * Lock ordering in mm:
  *
  * inode->i_mutex (while writing or truncating, not reading or faulting)
- *   inode->i_alloc_sem (vmtruncate_range)
  *   mm->mmap_sem
  *     page->flags PG_locked (lock_page)
  *       mapping->i_mmap_mutex
@@ -870,11 +869,11 @@ int page_referenced(struct page *page,
 						vm_flags);
 		if (we_locked)
 			unlock_page(page);
+
+		if (page_test_and_clear_young(page_to_pfn(page)))
+			referenced++;
 	}
 out:
-	if (page_test_and_clear_young(page_to_pfn(page)))
-		referenced++;
-
 	return referenced;
 }
 
diff --git a/mm/slab.c b/mm/slab.c
index d96e223de775..1e523ed47c61 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -574,7 +574,9 @@ static struct arraycache_init initarray_generic =
     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 
 /* internal cache of cache description objs */
+static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
 static struct kmem_cache cache_cache = {
+	.nodelists = cache_cache_nodelists,
 	.batchcount = 1,
 	.limit = BOOT_CPUCACHE_ENTRIES,
 	.shared = 1,
@@ -1492,11 +1494,10 @@ void __init kmem_cache_init(void)
 		cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
 
 	/*
-	 * struct kmem_cache size depends on nr_node_ids, which
-	 * can be less than MAX_NUMNODES.
+	 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
 	 */
-	cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
+	cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
 				  nr_node_ids * sizeof(struct kmem_list3 *);
 #if DEBUG
 	cache_cache.obj_size = cache_cache.buffer_size;
 #endif
@@ -2308,6 +2309,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	if (!cachep)
 		goto oops;
 
+	cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
 #if DEBUG
 	cachep->obj_size = size;
 
@@ -3153,12 +3155,11 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 	objp += obj_offset(cachep);
 	if (cachep->ctor && cachep->flags & SLAB_POISON)
 		cachep->ctor(objp);
-#if ARCH_SLAB_MINALIGN
-	if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
+	if (ARCH_SLAB_MINALIGN &&
+	    ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
 		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
-		       objp, ARCH_SLAB_MINALIGN);
+		       objp, (int)ARCH_SLAB_MINALIGN);
 	}
-#endif
 	return objp;
 }
 #else
diff --git a/mm/slob.c b/mm/slob.c
index 46e0aee33a23..0ae881831ae2 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -482,6 +482,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 	int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
 	void *ret;
 
+	gfp &= gfp_allowed_mask;
+
 	lockdep_trace_alloc(gfp);
 
 	if (size < PAGE_SIZE - align) {
@@ -608,6 +610,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 {
 	void *b;
 
+	flags &= gfp_allowed_mask;
+
+	lockdep_trace_alloc(flags);
+
 	if (c->size < PAGE_SIZE) {
 		b = slob_alloc(c->size, flags, c->align, node);
 		trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
diff --git a/mm/slub.c b/mm/slub.c
index 35f351f26193..ba83f3fd0757 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -27,6 +27,7 @@
 #include <linux/memory.h>
 #include <linux/math64.h>
 #include <linux/fault-inject.h>
+#include <linux/stacktrace.h>
 
 #include <trace/events/kmem.h>
 
@@ -191,8 +192,12 @@ static LIST_HEAD(slab_caches);
 /*
  * Tracking user of a slab.
  */
+#define TRACK_ADDRS_COUNT 16
 struct track {
 	unsigned long addr;	/* Called from address */
+#ifdef CONFIG_STACKTRACE
+	unsigned long addrs[TRACK_ADDRS_COUNT];	/* Called from address */
+#endif
 	int cpu;		/* Was running on cpu */
 	int pid;		/* Pid context */
 	unsigned long when;	/* When did the operation occur */
@@ -420,6 +425,24 @@ static void set_track(struct kmem_cache *s, void *object,
 	struct track *p = get_track(s, object, alloc);
 
 	if (addr) {
+#ifdef CONFIG_STACKTRACE
+		struct stack_trace trace;
+		int i;
+
+		trace.nr_entries = 0;
+		trace.max_entries = TRACK_ADDRS_COUNT;
+		trace.entries = p->addrs;
+		trace.skip = 3;
+		save_stack_trace(&trace);
+
+		/* See rant in lockdep.c */
+		if (trace.nr_entries != 0 &&
+		    trace.entries[trace.nr_entries - 1] == ULONG_MAX)
+			trace.nr_entries--;
+
+		for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
+			p->addrs[i] = 0;
+#endif
 		p->addr = addr;
 		p->cpu = smp_processor_id();
 		p->pid = current->pid;
@@ -444,6 +467,16 @@ static void print_track(const char *s, struct track *t)
 
 	printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
 		s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
+#ifdef CONFIG_STACKTRACE
+	{
+		int i;
+		for (i = 0; i < TRACK_ADDRS_COUNT; i++)
+			if (t->addrs[i])
+				printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]);
+			else
+				break;
+	}
+#endif
 }
 
 static void print_tracking(struct kmem_cache *s, void *object)
@@ -557,10 +590,10 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
 		memset(p + s->objsize, val, s->inuse - s->objsize);
 }
 
-static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
+static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes)
 {
 	while (bytes) {
-		if (*start != (u8)value)
+		if (*start != value)
 			return start;
 		start++;
 		bytes--;
@@ -568,6 +601,38 @@ static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
 	return NULL;
 }
 
+static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes)
+{
+	u64 value64;
+	unsigned int words, prefix;
+
+	if (bytes <= 16)
+		return check_bytes8(start, value, bytes);
+
+	value64 = value | value << 8 | value << 16 | value << 24;
+	value64 = value64 | value64 << 32;
+	prefix = 8 - ((unsigned long)start) % 8;
+
+	if (prefix) {
+		u8 *r = check_bytes8(start, value, prefix);
+		if (r)
+			return r;
+		start += prefix;
+		bytes -= prefix;
+	}
+
+	words = bytes / 8;
+
+	while (words) {
+		if (*(u64 *)start != value64)
+			return check_bytes8(start, value, 8);
+		start += 8;
+		words--;
+	}
+
+	return check_bytes8(start, value, bytes % 8);
+}
+
 static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
 				void *from, void *to)
 {
@@ -2928,6 +2993,42 @@ size_t ksize(const void *object)
 }
 EXPORT_SYMBOL(ksize);
 
+#ifdef CONFIG_SLUB_DEBUG
+bool verify_mem_not_deleted(const void *x)
+{
+	struct page *page;
+	void *object = (void *)x;
+	unsigned long flags;
+	bool rv;
+
+	if (unlikely(ZERO_OR_NULL_PTR(x)))
+		return false;
+
+	local_irq_save(flags);
+
+	page = virt_to_head_page(x);
+	if (unlikely(!PageSlab(page))) {
+		/* maybe it was from stack? */
+		rv = true;
+		goto out_unlock;
+	}
+
+	slab_lock(page);
+	if (on_freelist(page->slab, page, object)) {
+		object_err(page->slab, page, object, "Object is on free-list");
+		rv = false;
+	} else {
+		rv = true;
+	}
+	slab_unlock(page);
+
+out_unlock:
+	local_irq_restore(flags);
+	return rv;
+}
+EXPORT_SYMBOL(verify_mem_not_deleted);
+#endif
+
 void kfree(const void *x)
 {
 	struct page *page;
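
The slub.c hunks above split the old byte-at-a-time check_bytes() into check_bytes8() plus a word-at-a-time wrapper that compares eight poison bytes per iteration once the pointer is 8-byte aligned. Here is a self-contained userspace rewrite of that idea which can be compiled and run directly; it is not the kernel function itself (a uint64_t multiplication replaces the shift-based byte replication, and the buffer in main() is invented for the example).

/* Sketch only: word-at-a-time scan for the first non-matching byte. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t *check_bytes8(uint8_t *start, uint8_t value, unsigned int bytes)
{
	while (bytes) {
		if (*start != value)
			return start;	/* first mismatching byte */
		start++;
		bytes--;
	}
	return NULL;
}

static uint8_t *check_bytes(uint8_t *start, uint8_t value, unsigned int bytes)
{
	uint64_t value64;
	unsigned int words, prefix;

	if (bytes <= 16)
		return check_bytes8(start, value, bytes);

	/* replicate the byte into all eight lanes of a 64-bit word */
	value64 = value * 0x0101010101010101ULL;

	/* byte-check the unaligned prefix, then compare whole words */
	prefix = (unsigned int)(8 - (uintptr_t)start % 8) % 8;
	if (prefix) {
		uint8_t *r = check_bytes8(start, value, prefix);
		if (r)
			return r;
		start += prefix;
		bytes -= prefix;
	}

	words = bytes / 8;
	while (words) {
		if (*(uint64_t *)start != value64)
			return check_bytes8(start, value, 8);
		start += 8;
		words--;
	}

	return check_bytes8(start, value, bytes % 8);
}

int main(void)
{
	uint8_t buf[64];

	memset(buf, 0x5a, sizeof(buf));
	buf[41] = 0x00;		/* simulate a corrupted poison byte */
	printf("mismatch at offset %td\n",
	       check_bytes(buf, 0x5a, sizeof(buf)) - buf);
	return 0;
}

Running it reports mismatch at offset 41, the byte deliberately corrupted in main().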
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ff8dc1a18cb4..1b8c33907242 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1681,19 +1681,14 @@ out:
 }
 
 #ifdef CONFIG_PROC_FS
-struct proc_swaps {
-	struct seq_file seq;
-	int event;
-};
-
 static unsigned swaps_poll(struct file *file, poll_table *wait)
 {
-	struct proc_swaps *s = file->private_data;
+	struct seq_file *seq = file->private_data;
 
 	poll_wait(file, &proc_poll_wait, wait);
 
-	if (s->event != atomic_read(&proc_poll_event)) {
-		s->event = atomic_read(&proc_poll_event);
+	if (seq->poll_event != atomic_read(&proc_poll_event)) {
+		seq->poll_event = atomic_read(&proc_poll_event);
 		return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
 	}
 
@@ -1783,24 +1778,16 @@ static const struct seq_operations swaps_op = {
 
 static int swaps_open(struct inode *inode, struct file *file)
 {
-	struct proc_swaps *s;
+	struct seq_file *seq;
 	int ret;
 
-	s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
-	if (!s)
-		return -ENOMEM;
-
-	file->private_data = s;
-
 	ret = seq_open(file, &swaps_op);
-	if (ret) {
-		kfree(s);
+	if (ret)
 		return ret;
-	}
 
-	s->seq.private = s;
-	s->event = atomic_read(&proc_poll_event);
-	return ret;
+	seq = file->private_data;
+	seq->poll_event = atomic_read(&proc_poll_event);
+	return 0;
 }
 
 static const struct file_operations proc_swaps_operations = {
diff --git a/mm/truncate.c b/mm/truncate.c
index e13f22efaad7..003c6c685fc8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -622,12 +622,11 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 		return -ENOSYS;
 
 	mutex_lock(&inode->i_mutex);
-	down_write(&inode->i_alloc_sem);
+	inode_dio_wait(inode);
 	unmap_mapping_range(mapping, offset, (end - offset), 1);
 	inode->i_op->truncate_range(inode, offset, end);
 	/* unmap again to remove racily COWed private pages */
 	unmap_mapping_range(mapping, offset, (end - offset), 1);
-	up_write(&inode->i_alloc_sem);
 	mutex_unlock(&inode->i_mutex);
 
 	return 0;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1d34d75366a7..ab8494cde007 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -452,13 +452,6 @@ overflow:
 	return ERR_PTR(-EBUSY);
 }
 
-static void rcu_free_va(struct rcu_head *head)
-{
-	struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
-
-	kfree(va);
-}
-
 static void __free_vmap_area(struct vmap_area *va)
 {
 	BUG_ON(RB_EMPTY_NODE(&va->rb_node));
@@ -491,7 +484,7 @@ static void __free_vmap_area(struct vmap_area *va)
 	if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
 		vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
 
-	call_rcu(&va->rcu_head, rcu_free_va);
+	kfree_rcu(va, rcu_head);
 }
 
 /*
@@ -837,13 +830,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
 	return vb;
 }
 
-static void rcu_free_vb(struct rcu_head *head)
-{
-	struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
-
-	kfree(vb);
-}
-
 static void free_vmap_block(struct vmap_block *vb)
 {
 	struct vmap_block *tmp;
@@ -856,7 +842,7 @@ static void free_vmap_block(struct vmap_block *vb)
 	BUG_ON(tmp != vb);
 
 	free_vmap_area_noflush(vb->va);
-	call_rcu(&vb->rcu_head, rcu_free_vb);
+	kfree_rcu(vb, rcu_head);
 }
 
 static void purge_fragmented_blocks(int cpu)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4f49535d4cd3..febbc044e792 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -250,49 +250,90 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 		unsigned long long delta;
 		unsigned long total_scan;
 		unsigned long max_pass;
+		int shrink_ret = 0;
+		long nr;
+		long new_nr;
+		long batch_size = shrinker->batch ? shrinker->batch
+						  : SHRINK_BATCH;
 
+		/*
+		 * copy the current shrinker scan count into a local variable
+		 * and zero it so that other concurrent shrinker invocations
+		 * don't also do this scanning work.
+		 */
+		do {
+			nr = shrinker->nr;
+		} while (cmpxchg(&shrinker->nr, nr, 0) != nr);
+
+		total_scan = nr;
 		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
 		delta = (4 * nr_pages_scanned) / shrinker->seeks;
 		delta *= max_pass;
 		do_div(delta, lru_pages + 1);
-		shrinker->nr += delta;
-		if (shrinker->nr < 0) {
+		total_scan += delta;
+		if (total_scan < 0) {
 			printk(KERN_ERR "shrink_slab: %pF negative objects to "
 			       "delete nr=%ld\n",
-			       shrinker->shrink, shrinker->nr);
-			shrinker->nr = max_pass;
+			       shrinker->shrink, total_scan);
+			total_scan = max_pass;
 		}
 
 		/*
+		 * We need to avoid excessive windup on filesystem shrinkers
+		 * due to large numbers of GFP_NOFS allocations causing the
+		 * shrinkers to return -1 all the time. This results in a large
+		 * nr being built up so when a shrink that can do some work
+		 * comes along it empties the entire cache due to nr >>>
+		 * max_pass.  This is bad for sustaining a working set in
+		 * memory.
+		 *
+		 * Hence only allow the shrinker to scan the entire cache when
+		 * a large delta change is calculated directly.
+		 */
+		if (delta < max_pass / 4)
+			total_scan = min(total_scan, max_pass / 2);
+
+		/*
 		 * Avoid risking looping forever due to too large nr value:
 		 * never try to free more than twice the estimate number of
 		 * freeable entries.
 		 */
-		if (shrinker->nr > max_pass * 2)
-			shrinker->nr = max_pass * 2;
+		if (total_scan > max_pass * 2)
+			total_scan = max_pass * 2;
 
-		total_scan = shrinker->nr;
-		shrinker->nr = 0;
+		trace_mm_shrink_slab_start(shrinker, shrink, nr,
+					nr_pages_scanned, lru_pages,
+					max_pass, delta, total_scan);
 
-		while (total_scan >= SHRINK_BATCH) {
-			long this_scan = SHRINK_BATCH;
-			int shrink_ret;
+		while (total_scan >= batch_size) {
 			int nr_before;
 
 			nr_before = do_shrinker_shrink(shrinker, shrink, 0);
 			shrink_ret = do_shrinker_shrink(shrinker, shrink,
-							this_scan);
+							batch_size);
 			if (shrink_ret == -1)
 				break;
 			if (shrink_ret < nr_before)
 				ret += nr_before - shrink_ret;
-			count_vm_events(SLABS_SCANNED, this_scan);
-			total_scan -= this_scan;
+			count_vm_events(SLABS_SCANNED, batch_size);
+			total_scan -= batch_size;
 
 			cond_resched();
 		}
 
-		shrinker->nr += total_scan;
+		/*
+		 * move the unused scan count back into the shrinker in a
+		 * manner that handles concurrent updates. If we exhausted the
+		 * scan, there is no need to do an update.
+		 */
+		do {
+			nr = shrinker->nr;
+			new_nr = total_scan + nr;
+			if (total_scan <= 0)
+				break;
+		} while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+
+		trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
 	}
 	up_read(&shrinker_rwsem);
 out:
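
The shrink_slab() rework above stops mutating shrinker->nr in place: the deferred count is claimed with a cmpxchg() loop, worked off in batch_size chunks, and whatever remains is pushed back with a second cmpxchg() loop, so concurrent reclaimers neither lose nor double-count work. The sketch below models just that bookkeeping in plain C11; the 200-object cache, the shrink() callback and the delta/batch numbers in main() are invented, and the windup clamping from the patch is left out.

/* Sketch only: claim-and-return of a shared deferred scan count. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_long deferred_nr;		/* analogous to shrinker->nr */

static long shrink(long nr_to_scan, long *cache_objects)
{
	long freed = nr_to_scan < *cache_objects ? nr_to_scan : *cache_objects;

	*cache_objects -= freed;
	return freed;
}

static long shrink_slab_once(long batch_size, long delta, long *cache_objects)
{
	long nr, new_nr, total_scan, freed = 0;

	/* take ownership of the deferred count: swap it to zero */
	do {
		nr = atomic_load(&deferred_nr);
	} while (!atomic_compare_exchange_weak(&deferred_nr, &nr, 0));

	total_scan = nr + delta;

	while (total_scan >= batch_size) {
		freed += shrink(batch_size, cache_objects);
		total_scan -= batch_size;
	}

	/* return the unscanned remainder for the next caller */
	do {
		nr = atomic_load(&deferred_nr);
		new_nr = nr + total_scan;
		if (total_scan <= 0)
			break;
	} while (!atomic_compare_exchange_weak(&deferred_nr, &nr, new_nr));

	return freed;
}

int main(void)
{
	long cache_objects = 200;
	long freed = shrink_slab_once(128, 300, &cache_objects);

	printf("freed %ld, deferred %ld, cache left %ld\n",
	       freed, atomic_load(&deferred_nr), cache_objects);
	return 0;
}

For a delta of 300 scannable objects and a batch of 128 it frees the whole 200-object cache and leaves the 44 unscanned objects deferred for the next call.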
@@ -2310,7 +2351,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
 	for (i = 0; i <= classzone_idx; i++)
 		present_pages += pgdat->node_zones[i].present_pages;
 
-	return balanced_pages > (present_pages >> 2);
+	/* A special case here: if zone has no page, we think it's balanced */
+	return balanced_pages >= (present_pages >> 2);
 }
 
 /* is kswapd sleeping prematurely? */
@@ -2326,7 +2368,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 		return true;
 
 	/* Check the watermark levels */
-	for (i = 0; i < pgdat->nr_zones; i++) {
+	for (i = 0; i <= classzone_idx; i++) {
 		struct zone *zone = pgdat->node_zones + i;
 
 		if (!populated_zone(zone))
@@ -2344,7 +2386,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 		}
 
 		if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
-							classzone_idx, 0))
+							i, 0))
 			all_zones_ok = false;
 		else
 			balanced += zone->present_pages;
@@ -2451,7 +2493,6 @@ loop_again:
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), 0, 0)) {
 				end_zone = i;
-				*classzone_idx = i;
 				break;
 			}
 		}
@@ -2510,18 +2551,18 @@ loop_again:
 					KSWAPD_ZONE_BALANCE_GAP_RATIO);
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone) + balance_gap,
-					end_zone, 0))
+					end_zone, 0)) {
 				shrink_zone(priority, zone, &sc);
-			reclaim_state->reclaimed_slab = 0;
-			nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
-			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
-			total_scanned += sc.nr_scanned;
 
-			if (zone->all_unreclaimable)
-				continue;
-			if (nr_slab == 0 &&
-			    !zone_reclaimable(zone))
-				zone->all_unreclaimable = 1;
+				reclaim_state->reclaimed_slab = 0;
+				nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
+				sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+				total_scanned += sc.nr_scanned;
+
+				if (nr_slab == 0 && !zone_reclaimable(zone))
+					zone->all_unreclaimable = 1;
+			}
+
 			/*
 			 * If we've done a decent amount of scanning and
 			 * the reclaim ratio is low, start doing writepage
@@ -2531,6 +2572,12 @@ loop_again:
 			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
 				sc.may_writepage = 1;
 
+			if (zone->all_unreclaimable) {
+				if (end_zone && end_zone == i)
+					end_zone--;
+				continue;
+			}
+
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), end_zone, 0)) {
 				all_zones_ok = 0;
@@ -2709,8 +2756,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
  */
 static int kswapd(void *p)
 {
-	unsigned long order;
-	int classzone_idx;
+	unsigned long order, new_order;
+	int classzone_idx, new_classzone_idx;
 	pg_data_t *pgdat = (pg_data_t*)p;
 	struct task_struct *tsk = current;
 
@@ -2740,17 +2787,23 @@ static int kswapd(void *p)
 	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
 	set_freezable();
 
-	order = 0;
-	classzone_idx = MAX_NR_ZONES - 1;
+	order = new_order = 0;
+	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
 	for ( ; ; ) {
-		unsigned long new_order;
-		int new_classzone_idx;
 		int ret;
 
-		new_order = pgdat->kswapd_max_order;
-		new_classzone_idx = pgdat->classzone_idx;
-		pgdat->kswapd_max_order = 0;
-		pgdat->classzone_idx = MAX_NR_ZONES - 1;
+		/*
+		 * If the last balance_pgdat was unsuccessful it's unlikely a
+		 * new request of a similar or harder type will succeed soon
+		 * so consider going to sleep on the basis we reclaimed at
+		 */
+		if (classzone_idx >= new_classzone_idx && order == new_order) {
+			new_order = pgdat->kswapd_max_order;
+			new_classzone_idx = pgdat->classzone_idx;
+			pgdat->kswapd_max_order = 0;
+			pgdat->classzone_idx = pgdat->nr_zones - 1;
+		}
+
 		if (order < new_order || classzone_idx > new_classzone_idx) {
 			/*
 			 * Don't sleep if someone wants a larger 'order'
@@ -2763,7 +2816,7 @@ static int kswapd(void *p)
 			order = pgdat->kswapd_max_order;
 			classzone_idx = pgdat->classzone_idx;
 			pgdat->kswapd_max_order = 0;
-			pgdat->classzone_idx = MAX_NR_ZONES - 1;
+			pgdat->classzone_idx = pgdat->nr_zones - 1;
 		}
 
 		ret = try_to_freeze();