16 files changed, 165 insertions, 151 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 6b4718e2ee34..b41823cc05e6 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -39,12 +39,6 @@ DEFINE_SPINLOCK(bdi_lock);
 LIST_HEAD(bdi_list);
 LIST_HEAD(bdi_pending_list);
-static struct task_struct *sync_supers_tsk;
-static struct timer_list sync_supers_timer;
-static int bdi_sync_supers(void *);
-static void sync_supers_timer_fn(unsigned long);
 void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
 {
        if (wb1 < wb2) {
@@ -250,12 +244,6 @@ static int __init default_bdi_init(void)
 {
        int err;
-        sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
-        BUG_ON(IS_ERR(sync_supers_tsk));
-        setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
-        bdi_arm_supers_timer();
        err = bdi_init(&default_backing_dev_info);
        if (!err)
                bdi_register(&default_backing_dev_info, NULL, "default");
@@ -270,46 +258,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
        return wb_has_dirty_io(&bdi->wb);
 }
-/*
- * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
- * or we risk deadlocking on ->s_umount. The longer term solution would be
- * to implement sync_supers_bdi() or similar and simply do it from the
- * bdi writeback thread individually.
- */
-static int bdi_sync_supers(void *unused)
-{
-        set_user_nice(current, 0);
-        while (!kthread_should_stop()) {
-                set_current_state(TASK_INTERRUPTIBLE);
-                schedule();
-                /*
-                 * Do this periodically, like kupdated() did before.
-                 */
-                sync_supers();
-        }
-        return 0;
-}
-void bdi_arm_supers_timer(void)
-{
-        unsigned long next;
-        if (!dirty_writeback_interval)
-                return;
-        next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
-        mod_timer(&sync_supers_timer, round_jiffies_up(next));
-}
-static void sync_supers_timer_fn(unsigned long unused)
-{
-        wake_up_process(sync_supers_tsk);
-        bdi_arm_supers_timer();
-}
 static void wakeup_timer_fn(unsigned long data)
 {
        struct backing_dev_info *bdi = (struct backing_dev_info *)data;
diff --git a/mm/bootmem.c b/mm/bootmem.c
index bcb63ac48cc5..f468185b3b28 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -419,7 +419,7 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 }
 /**
- * reserve_bootmem - mark a page range as usable
+ * reserve_bootmem - mark a page range as reserved
 * @addr: starting address of the range
 * @size: size of the range in bytes
 * @flags: reservation flags (see linux/bootmem.h)
diff --git a/mm/compaction.c b/mm/compaction.c
index e78cb9688421..7fcd3a52e68d 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -51,6 +51,47 @@ static inline bool migrate_async_suitable(int migratetype)
 }
 /*
+ * Compaction requires the taking of some coarse locks that are potentially
+ * very heavily contended. Check if the process needs to be scheduled or
+ * if the lock is contended. For async compaction, back out if
+ * contention is severe. For sync compaction, schedule.
+ *
+ * Returns true if the lock is held.
+ * Returns false if the lock is released and compaction should abort
+ */
+static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
+                                      bool locked, struct compact_control *cc)
+{
+        if (need_resched() || spin_is_contended(lock)) {
+                if (locked) {
+                        spin_unlock_irqrestore(lock, *flags);
+                        locked = false;
+                }
+                /* async aborts if taking too long or contended */
+                if (!cc->sync) {
+                        if (cc->contended)
+                                *cc->contended = true;
+                        return false;
+                }
+                cond_resched();
+                if (fatal_signal_pending(current))
+                        return false;
+        }
+        if (!locked)
+                spin_lock_irqsave(lock, *flags);
+        return true;
+}
+static inline bool compact_trylock_irqsave(spinlock_t *lock,
+                        unsigned long *flags, struct compact_control *cc)
+{
+        return compact_checklock_irqsave(lock, flags, false, cc);
+}
+/*
 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
 * pages inside of the pageblock (even though it may still end up isolating
@@ -173,7 +214,7 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
 }
 /* Update the number of anon and file isolated pages in the zone */
-static void acct_isolated(struct zone *zone, struct compact_control *cc)
+static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
 {
        struct page *page;
        unsigned int count[2] = { 0, };
@@ -181,8 +222,14 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
        list_for_each_entry(page, &cc->migratepages, lru)
                count[!!page_is_file_cache(page)]++;
-        __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+        /* If locked we can use the interrupt unsafe versions */
-        __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+        if (locked) {
+                __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+                __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+        } else {
+                mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+                mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+        }
 }
 /* Similar to reclaim, but different enough that they don't share logic */
@@ -228,6 +275,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
        struct list_head *migratelist = &cc->migratepages;
        isolate_mode_t mode = 0;
        struct lruvec *lruvec;
+        unsigned long flags;
+        bool locked;
        /*
         * Ensure that there are not too many pages isolated from the LRU
@@ -247,25 +296,22 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
        /* Time to isolate some pages for migration */
        cond_resched();
-        spin_lock_irq(&zone->lru_lock);
+        spin_lock_irqsave(&zone->lru_lock, flags);
+        locked = true;
        for (; low_pfn < end_pfn; low_pfn++) {
                struct page *page;
-                bool locked = true;
                /* give a chance to irqs before checking need_resched() */
                if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
-                        spin_unlock_irq(&zone->lru_lock);
+                        spin_unlock_irqrestore(&zone->lru_lock, flags);
                        locked = false;
                }
-                if (need_resched() || spin_is_contended(&zone->lru_lock)) {
-                        if (locked)
+                /* Check if it is ok to still hold the lock */
-                                spin_unlock_irq(&zone->lru_lock);
+                locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
-                        cond_resched();
+                                                                locked, cc);
-                        spin_lock_irq(&zone->lru_lock);
+                if (!locked)
-                        if (fatal_signal_pending(current))
+                        break;
-                                break;
-                } else if (!locked)
-                        spin_lock_irq(&zone->lru_lock);
                /*
                 * migrate_pfn does not necessarily start aligned to a
@@ -349,9 +395,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
                }
        }
-        acct_isolated(zone, cc);
+        acct_isolated(zone, locked, cc);
-        spin_unlock_irq(&zone->lru_lock);
+        if (locked)
+                spin_unlock_irqrestore(&zone->lru_lock, flags);
        trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
@@ -384,6 +431,20 @@ static bool suitable_migration_target(struct page *page)
 }
 /*
+ * Returns the start pfn of the last page block in a zone.  This is the starting
+ * point for full compaction of a zone.  Compaction searches for free pages from
+ * the end of each zone, while isolate_freepages_block scans forward inside each
+ * page block.
+ */
+static unsigned long start_free_pfn(struct zone *zone)
+{
+        unsigned long free_pfn;
+        free_pfn = zone->zone_start_pfn + zone->spanned_pages;
+        free_pfn &= ~(pageblock_nr_pages-1);
+        return free_pfn;
+}
+/*
 * Based on information in the current compact_control, find blocks
 * suitable for isolating free pages from and then isolate them.
 */
@@ -422,17 +483,6 @@ static void isolate_freepages(struct zone *zone,
                                        pfn -= pageblock_nr_pages) {
                unsigned long isolated;
-                /*
-                 * Skip ahead if another thread is compacting in the area
-                 * simultaneously. If we wrapped around, we can only skip
-                 * ahead if zone->compact_cached_free_pfn also wrapped to
-                 * above our starting point.
-                 */
-                if (cc->order > 0 && (!cc->wrapped ||
-                                      zone->compact_cached_free_pfn >
-                                      cc->start_free_pfn))
-                        pfn = min(pfn, zone->compact_cached_free_pfn);
                if (!pfn_valid(pfn))
                        continue;
@@ -458,7 +508,16 @@ static void isolate_freepages(struct zone *zone,
                 * are disabled
                 */
                isolated = 0;
-                spin_lock_irqsave(&zone->lock, flags);
+                /*
+                 * The zone lock must be held to isolate freepages.
+                 * Unfortunately this is a very coarse lock and can be
+                 * heavily contended if there are parallel allocations
+                 * or parallel compactions. For async compaction do not
+                 * spin on the lock.
+                 */
+                if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
+                        break;
                if (suitable_migration_target(page)) {
                        end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
                        isolated = isolate_freepages_block(pfn, end_pfn,
@@ -474,7 +533,15 @@ static void isolate_freepages(struct zone *zone,
                 */
                if (isolated) {
                        high_pfn = max(high_pfn, pfn);
-                        if (cc->order > 0)
+                        /*
+                         * If the free scanner has wrapped, update
+                         * compact_cached_free_pfn to point to the highest
+                         * pageblock with free pages. This reduces excessive
+                         * scanning of full pageblocks near the end of the
+                         * zone
+                         */
+                        if (cc->order > 0 && cc->wrapped)
                                zone->compact_cached_free_pfn = high_pfn;
                }
        }
@@ -484,6 +551,11 @@ static void isolate_freepages(struct zone *zone,
        cc->free_pfn = high_pfn;
        cc->nr_freepages = nr_freepages;
+        /* If compact_cached_free_pfn is reset then set it now */
+        if (cc->order > 0 && !cc->wrapped &&
+                        zone->compact_cached_free_pfn == start_free_pfn(zone))
+                zone->compact_cached_free_pfn = high_pfn;
 }
 /*
@@ -570,20 +642,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
        return ISOLATE_SUCCESS;
 }
-/*
- * Returns the start pfn of the last page block in a zone.  This is the starting
- * point for full compaction of a zone.  Compaction searches for free pages from
- * the end of each zone, while isolate_freepages_block scans forward inside each
- * page block.
- */
-static unsigned long start_free_pfn(struct zone *zone)
-{
-        unsigned long free_pfn;
-        free_pfn = zone->zone_start_pfn + zone->spanned_pages;
-        free_pfn &= ~(pageblock_nr_pages-1);
-        return free_pfn;
-}
 static int compact_finished(struct zone *zone,
                            struct compact_control *cc)
 {
@@ -771,7 +829,7 @@ out:
 static unsigned long compact_zone_order(struct zone *zone,
                                 int order, gfp_t gfp_mask,
-                                 bool sync)
+                                 bool sync, bool *contended)
 {
        struct compact_control cc = {
                .nr_freepages = 0,
@@ -780,6 +838,7 @@ static unsigned long compact_zone_order(struct zone *zone,
                .migratetype = allocflags_to_migratetype(gfp_mask),
                .zone = zone,
                .sync = sync,
+                .contended = contended,
        };
        INIT_LIST_HEAD(&cc.freepages);
        INIT_LIST_HEAD(&cc.migratepages);
@@ -801,7 +860,7 @@ int sysctl_extfrag_threshold = 500;
 */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
                        int order, gfp_t gfp_mask, nodemask_t *nodemask,
-                        bool sync)
+                        bool sync, bool *contended)
 {
        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
        int may_enter_fs = gfp_mask & __GFP_FS;
@@ -825,7 +884,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
                                                                nodemask) {
                int status;
-                status = compact_zone_order(zone, order, gfp_mask, sync);
+                status = compact_zone_order(zone, order, gfp_mask, sync,
+                                                contended);
                rc = max(status, rc);
                /* If a normal allocation would succeed, stop compacting */
@@ -861,7 +921,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
                if (cc->order > 0) {
                        int ok = zone_watermark_ok(zone, cc->order,
                                                low_wmark_pages(zone), 0, 0);
-                        if (ok && cc->order > zone->compact_order_failed)
+                        if (ok && cc->order >= zone->compact_order_failed)
                                zone->compact_order_failed = cc->order + 1;
                        /* Currently async compaction is never deferred. */
                        else if (!ok && cc->sync)
diff --git a/mm/filemap.c b/mm/filemap.c
index fa5ca304148e..384344575c37 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1412,12 +1412,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                        retval = filemap_write_and_wait_range(mapping, pos,
                                        pos + iov_length(iov, nr_segs) - 1);
                        if (!retval) {
-                                struct blk_plug plug;
-                                blk_start_plug(&plug);
                                retval = mapping->a_ops->direct_IO(READ, iocb,
                                                        iov, pos, nr_segs);
-                                blk_finish_plug(&plug);
                        }
                        if (retval > 0) {
                                *ppos = pos + retval;
@@ -2527,14 +2523,12 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
-        struct blk_plug plug;
        ssize_t ret;
        BUG_ON(iocb->ki_pos != pos);
        sb_start_write(inode->i_sb);
        mutex_lock(&inode->i_mutex);
-        blk_start_plug(&plug);
        ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
        mutex_unlock(&inode->i_mutex);
@@ -2545,7 +2539,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                if (err < 0 && ret > 0)
                        ret = err;
        }
-        blk_finish_plug(&plug);
        sb_end_write(inode->i_sb);
        return ret;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 57c4b9309015..141dbb695097 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1811,7 +1811,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
                        src_page = pte_page(pteval);
                        copy_user_highpage(page, src_page, address, vma);
                        VM_BUG_ON(page_mapcount(src_page) != 1);
-                        VM_BUG_ON(page_count(src_page) != 2);
                        release_pte_page(src_page);
                        /*
                         * ptl mostly unnecessary, but preempt has to
diff --git a/mm/internal.h b/mm/internal.h
index 3314f79d775a..b8c91b342e24 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -130,6 +130,7 @@ struct compact_control {
        int order;                      /* order a direct compactor needs */
        int migratetype;                /* MOVABLE, RECLAIMABLE etc */
        struct zone *zone;
+        bool *contended;                /* True if a lock was contended */
 };
 unsigned long
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 45eb6217bf38..0de83b4541e9 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1483,13 +1483,11 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
        struct kmemleak_object *prev_obj = v;
        struct kmemleak_object *next_obj = NULL;
-        struct list_head *n = &prev_obj->object_list;
+        struct kmemleak_object *obj = prev_obj;
        ++(*pos);
-        list_for_each_continue_rcu(n, &object_list) {
+        list_for_each_entry_continue_rcu(obj, &object_list, object_list) {
-                struct kmemleak_object *obj =
-                        list_entry(n, struct kmemleak_object, object_list);
                if (get_object(obj)) {
                        next_obj = obj;
                        break;
diff --git a/mm/memblock.c b/mm/memblock.c
index 4d9393c7edc9..82aa349d2f7a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -246,7 +246,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
                                min(new_area_start, memblock.current_limit),
                                new_alloc_size, PAGE_SIZE);
-                new_array = addr ? __va(addr) : 0;
+                new_array = addr ? __va(addr) : NULL;
        }
        if (!addr) {
                pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3ad25f9d1fc1..6a5b90d0cfd7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -126,9 +126,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
        struct mem_section *ms;
        struct page *page, *memmap;
-        if (!pfn_valid(start_pfn))
-                return;
        section_nr = pfn_to_section_nr(start_pfn);
        ms = __nr_to_section(section_nr);
@@ -187,9 +184,16 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
        end_pfn = pfn + pgdat->node_spanned_pages;
        /* register_section info */
-        for (; pfn < end_pfn; pfn += PAGES_PER_SECTION)
+        for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
-                register_page_bootmem_info_section(pfn);
+                /*
+                 * Some platforms can assign the same pfn to multiple nodes - on
+                 * node0 as well as nodeN.  To avoid registering a pfn against
+                 * multiple nodes we check that this pfn does not already
+                 * reside in some other node.
+                 */
+                if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
+                        register_page_bootmem_info_section(pfn);
+        }
 }
 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index bd92431d4c49..4ada3be6e252 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2562,7 +2562,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
                break;
        default:
-                BUG();
+                return -EINVAL;
        }
        l = strlen(policy_modes[mode]);
diff --git a/mm/mmap.c b/mm/mmap.c
index e3e86914f11a..ae18a48e7e4e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1356,9 +1356,8 @@ out:
        } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
                make_pages_present(addr, addr + len);
-        if (file && uprobe_mmap(vma))
+        if (file)
-                /* matching probes but cannot insert */
+                uprobe_mmap(vma);
-                goto unmap_and_free_vma;
        return addr;
@@ -2309,7 +2308,7 @@ void exit_mmap(struct mm_struct *mm)
        }
        vm_unacct_memory(nr_accounted);
-        BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
+        WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
 }
 /* Insert vm structure into process list sorted by address
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e5363f34e025..5ad5ce23c1e0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1532,7 +1532,6 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
        void __user *buffer, size_t *length, loff_t *ppos)
 {
        proc_dointvec(table, write, buffer, length, ppos);
-        bdi_arm_supers_timer();
        return 0;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 009ac285fea7..c13ea7538891 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -584,7 +584,7 @@ static inline void __free_one_page(struct page *page,
                combined_idx = buddy_idx & page_idx;
                higher_page = page + (combined_idx - page_idx);
                buddy_idx = __find_buddy_index(combined_idx, order + 1);
-                higher_buddy = page + (buddy_idx - combined_idx);
+                higher_buddy = higher_page + (buddy_idx - combined_idx);
                if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
                        list_add_tail(&page->lru,
                                &zone->free_area[order].free_list[migratetype]);
@@ -1928,6 +1928,17 @@ this_zone_full:
                zlc_active = 0;
                goto zonelist_scan;
        }
+        if (page)
+                /*
+                 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
+                 * necessary to allocate the page. The expectation is
+                 * that the caller is taking steps that will free more
+                 * memory. The caller should avoid the page being used
+                 * for !PFMEMALLOC purposes.
+                 */
+                page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
        return page;
 }
@@ -2091,7 +2102,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
        int migratetype, bool sync_migration,
-        bool *deferred_compaction,
+        bool *contended_compaction, bool *deferred_compaction,
        unsigned long *did_some_progress)
 {
        struct page *page;
@@ -2106,7 +2117,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        current->flags |= PF_MEMALLOC;
        *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
-                                                nodemask, sync_migration);
+                                                nodemask, sync_migration,
+                                                contended_compaction);
        current->flags &= ~PF_MEMALLOC;
        if (*did_some_progress != COMPACT_SKIPPED) {
@@ -2152,7 +2164,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
        int migratetype, bool sync_migration,
-        bool *deferred_compaction,
+        bool *contended_compaction, bool *deferred_compaction,
        unsigned long *did_some_progress)
 {
        return NULL;
@@ -2325,6 +2337,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        unsigned long did_some_progress;
        bool sync_migration = false;
        bool deferred_compaction = false;
+        bool contended_compaction = false;
        /*
         * In the slowpath, we sanity check order to avoid ever trying to
@@ -2389,14 +2402,6 @@ rebalance:
                                zonelist, high_zoneidx, nodemask,
                                preferred_zone, migratetype);
                if (page) {
-                        /*
-                         * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
-                         * necessary to allocate the page. The expectation is
-                         * that the caller is taking steps that will free more
-                         * memory. The caller should avoid the page being used
-                         * for !PFMEMALLOC purposes.
-                         */
-                        page->pfmemalloc = true;
                        goto got_pg;
                }
        }
@@ -2422,6 +2427,7 @@ rebalance:
                                        nodemask,
                                        alloc_flags, preferred_zone,
                                        migratetype, sync_migration,
+                                        &contended_compaction,
                                        &deferred_compaction,
                                        &did_some_progress);
        if (page)
@@ -2431,10 +2437,11 @@ rebalance:
        /*
         * If compaction is deferred for high-order allocations, it is because
         * sync compaction recently failed. In this is the case and the caller
-         * has requested the system not be heavily disrupted, fail the
+         * requested a movable allocation that does not heavily disrupt the
-         * allocation now instead of entering direct reclaim
+         * system then fail the allocation instead of entering direct reclaim.
         */
-        if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+        if ((deferred_compaction || contended_compaction) &&
+                                                (gfp_mask & __GFP_NO_KSWAPD))
                goto nopage;
        /* Try direct reclaim and then allocating */
@@ -2505,6 +2512,7 @@ rebalance:
                                        nodemask,
                                        alloc_flags, preferred_zone,
                                        migratetype, sync_migration,
+                                        &contended_compaction,
                                        &deferred_compaction,
                                        &did_some_progress);
                if (page)
@@ -2569,8 +2577,6 @@ retry_cpuset:
                page = __alloc_pages_slowpath(gfp_mask, order,
                                zonelist, high_zoneidx, nodemask,
                                preferred_zone, migratetype);
-        else
-                page->pfmemalloc = false;
        trace_mm_page_alloc(page, order, gfp_mask, migratetype);
diff --git a/mm/slab.c b/mm/slab.c
index 35b5cb0da554..11339110271e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -983,7 +983,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
                }
                /* The caller cannot use PFMEMALLOC objects, find another one */
-                for (i = 1; i < ac->avail; i++) {
+                for (i = 0; i < ac->avail; i++) {
                        /* If a !PFMEMALLOC object is found, swap them */
                        if (!is_obj_pfmemalloc(ac->entry[i])) {
                                objp = ac->entry[i];
@@ -1000,7 +1000,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
                l3 = cachep->nodelists[numa_mem_id()];
                if (!list_empty(&l3->slabs_free) && force_refill) {
                        struct slab *slabp = virt_to_slab(objp);
-                        ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem));
+                        ClearPageSlabPfmemalloc(virt_to_head_page(slabp->s_mem));
                        clear_obj_pfmemalloc(&objp);
                        recheck_pfmemalloc_active(cachep, ac);
                        return objp;
@@ -1032,7 +1032,7 @@ static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
 {
        if (unlikely(pfmemalloc_active)) {
                /* Some pfmemalloc slabs exist, check if this is one */
-                struct page *page = virt_to_page(objp);
+                struct page *page = virt_to_head_page(objp);
                if (PageSlabPfmemalloc(page))
                        set_obj_pfmemalloc(&objp);
        }
@@ -3260,6 +3260,7 @@ force_grow:
                /* cache_grow can reenable interrupts, then ac could change. */
                ac = cpu_cache_get(cachep);
+                node = numa_mem_id();
                /* no objects in sight? abort */
                if (!x && (ac->avail == 0 || force_refill))
diff --git a/mm/slub.c b/mm/slub.c
index 8f78e2577031..2fdd96f9e998 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1524,12 +1524,13 @@ static inline void *acquire_slab(struct kmem_cache *s,
 }
 static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
+static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
 /*
 * Try to allocate a partial slab from a specific node.
 */
-static void *get_partial_node(struct kmem_cache *s,
+static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
-                struct kmem_cache_node *n, struct kmem_cache_cpu *c)
+                                struct kmem_cache_cpu *c, gfp_t flags)
 {
        struct page *page, *page2;
        void *object = NULL;
@@ -1545,9 +1546,13 @@ static void *get_partial_node(struct kmem_cache *s,
        spin_lock(&n->list_lock);
        list_for_each_entry_safe(page, page2, &n->partial, lru) {
-                void *t = acquire_slab(s, n, page, object == NULL);
+                void *t;
                int available;
+                if (!pfmemalloc_match(page, flags))
+                        continue;
+                t = acquire_slab(s, n, page, object == NULL);
                if (!t)
                        break;
@@ -1614,7 +1619,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
                        if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
                                        n->nr_partial > s->min_partial) {
-                                object = get_partial_node(s, n, c);
+                                object = get_partial_node(s, n, c, flags);
                                if (object) {
                                        /*
                                         * Return the object even if
@@ -1643,7 +1648,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
        void *object;
        int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
-        object = get_partial_node(s, get_node(s, searchnode), c);
+        object = get_partial_node(s, get_node(s, searchnode), c, flags);
        if (object || node != NUMA_NO_NODE)
                return object;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8d01243d9560..99b434b674c0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3102,6 +3102,7 @@ int kswapd_run(int nid)
                /* failure at boot is fatal */
                BUG_ON(system_state == SYSTEM_BOOTING);
                printk("Failed to start kswapd on node %d\n",nid);
+                pgdat->kswapd = NULL;
                ret = -1;
        }
        return ret;