author     David S. Miller <davem@davemloft.net>   2018-03-23 11:24:57 -0400
committer  David S. Miller <davem@davemloft.net>   2018-03-23 11:31:58 -0400
commit     03fe2debbb2771fb90881e4ce8109b09cf772a5c (patch)
tree       fbaf8738296b2e9dcba81c6daef2d515b6c4948c /mm
parent     6686c459e1449a3ee5f3fd313b0a559ace7a700e (diff)
parent     f36b7534b83357cf52e747905de6d65b4f7c2512 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net

Fun set of conflict resolutions here...

For the mac80211 stuff, these were fortunately just parallel
adds. Trivially resolved.

In drivers/net/phy/phy.c we had a bug fix in 'net' that moved the
function phy_disable_interrupts() earlier in the file, whilst in
'net-next' the phy_error() call from this function was removed.

In net/ipv4/xfrm4_policy.c, David Ahern's changes to remove the
'rt_table_id' member of rtable collided with a bug fix in 'net' that
added a new struct member "rt_mtu_locked" which needs to be copied
over here.
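
For reference, that copy presumably lands in xfrm4_fill_dst(), where the
rtable fields are mirrored into the xfrm_dst. The fragment below is only
a sketch of the shape of the resolution (surrounding code elided, field
list abridged), keeping the 'net' copy of the new member while the
'rt_table_id' copy goes away together with the member itself:

    static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
                              const struct flowi *fl)
    {
            struct rtable *rt = (struct rtable *)xdst->route;
            /* ... existing field copies ... */
            xdst->u.rt.rt_pmtu = rt->rt_pmtu;
            xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked; /* kept from 'net' */
            /* the rt_table_id copy is dropped along with the member */
            /* ... */
            return 0;
    }
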
The mlxsw driver conflict consisted of net-next separating
the span code and definitions into separate files, whilst
a 'net' bug fix made some changes to that moved code.

The mlx5 infiniband conflict resolution was quite non-trivial;
the RDMA tree's merge commit was used as a guide here, and
here are their notes:

====================
Due to bug fixes found by the syzkaller bot and taken into the for-rc
branch after development for the 4.17 merge window had already started
being taken into the for-next branch, there were fairly non-trivial
merge issues that would need to be resolved between the for-rc branch
and the for-next branch. This merge resolves those conflicts and
provides a unified base upon which ongoing development for 4.17 can
be based.

Conflicts:
drivers/infiniband/hw/mlx5/main.c - Commit 42cea83f9524
(IB/mlx5: Fix cleanup order on unload) added to for-rc and
commit b5ca15ad7e61 (IB/mlx5: Add proper representors support)
added as part of the devel cycle both needed to modify the
init/de-init functions used by mlx5. To support the new
representors, the new functions added by the cleanup patch
needed to be made non-static, and the init/de-init list
added by the representors patch needed to be modified to
match the init/de-init list changes made by the cleanup
patch.

Updates:
drivers/infiniband/hw/mlx5/mlx5_ib.h - Update function
prototypes added by representors patch to reflect new function
names as changed by cleanup patch
drivers/infiniband/hw/mlx5/ib_rep.c - Update init/de-init
stage list to match new order from cleanup patch
====================
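
For context, the "init/de-init list" those notes refer to is mlx5's table
of paired init/cleanup stages. A rough sketch of its shape, based on the
in-tree STAGE_CREATE() pattern (the exact stage set and entry names differ
between main.c and ib_rep.c, so treat this as illustrative only):

    static const struct mlx5_ib_profile pf_profile = {
            STAGE_CREATE(MLX5_IB_STAGE_INIT,
                         mlx5_ib_stage_init_init,
                         mlx5_ib_stage_init_cleanup),
            /* ... further stages ... */
    };

The representors profile in ib_rep.c reuses stage functions from main.c,
which is why the cleanup patch's functions had to become non-static and
both lists had to agree on ordering.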

Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'mm')

 -rw-r--r--  mm/gup.c          |  7
 -rw-r--r--  mm/huge_memory.c  |  9
 -rw-r--r--  mm/hugetlb.c      |  9
 -rw-r--r--  mm/khugepaged.c   | 15
 -rw-r--r--  mm/memblock.c     | 28
 -rw-r--r--  mm/mempolicy.c    |  3
 -rw-r--r--  mm/page_alloc.c   | 17
 -rw-r--r--  mm/percpu-km.c    |  8
 -rw-r--r--  mm/percpu-vm.c    | 18
 -rw-r--r--  mm/percpu.c       | 67
 -rw-r--r--  mm/shmem.c        | 31
 -rw-r--r--  mm/vmscan.c       | 31

12 files changed, 134 insertions, 109 deletions
diff --git a/mm/gup.c b/mm/gup.c
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -516,7 +516,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
         }
 
         if (ret & VM_FAULT_RETRY) {
-                if (nonblocking)
+                if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
                         *nonblocking = 0;
                 return -EBUSY;
         }
@@ -890,7 +890,10 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
                         break;
                 }
                 if (*locked) {
-                        /* VM_FAULT_RETRY didn't trigger */
+                        /*
+                         * VM_FAULT_RETRY didn't trigger or it was a
+                         * FOLL_NOWAIT.
+                         */
                         if (!pages_done)
                                 pages_done = ret;
                         break;
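
The gup change above matters for callers that pass FOLL_NOWAIT and must
not be told that mmap_sem was dropped when it was not. A minimal sketch
of such a caller follows; the helper itself is hypothetical, only
get_user_pages_unlocked(), FOLL_WRITE and FOLL_NOWAIT are kernel API:

    #include <linux/mm.h>

    /* Try to grab one writable user page without sleeping on a fault.
     * With FOLL_NOWAIT the fault path returns instead of waiting, and
     * with the fix above gup's internal "locked" bookkeeping stays in
     * sync with that, so mmap_sem is released exactly once. */
    static long grab_user_page_nowait(unsigned long addr, struct page **page)
    {
            return get_user_pages_unlocked(addr, 1, page,
                                           FOLL_WRITE | FOLL_NOWAIT);
    }
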
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 87ab9b8f56b5..5a68730eebd6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -555,7 +555,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 
         VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-        if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
+        if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg,
+                                  true)) {
                 put_page(page);
                 count_vm_event(THP_FAULT_FALLBACK);
                 return VM_FAULT_FALLBACK;
@@ -1316,7 +1317,7 @@ alloc:
         }
 
         if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
-                                        huge_gfp, &memcg, true))) {
+                                huge_gfp | __GFP_NORETRY, &memcg, true))) {
                 put_page(new_page);
                 split_huge_pmd(vma, vmf->pmd, vmf->address);
                 if (page)
@@ -2783,11 +2784,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 
         list_for_each_safe(pos, next, &list) {
                 page = list_entry((void *)pos, struct page, mapping);
-                lock_page(page);
+                if (!trylock_page(page))
+                        goto next;
                 /* split_huge_page() removes page from list on success */
                 if (!split_huge_page(page))
                         split++;
                 unlock_page(page);
+next:
                 put_page(page);
         }
 
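
The deferred_split_scan() change above, like the mm/shmem.c change further
down, applies the same rule: reclaim-side scanners never sleep on a page
lock, they skip the page instead. A minimal illustration of the pattern
(the helper name is hypothetical; trylock_page()/unlock_page() are real):

    static void scan_one_page(struct page *page)
    {
            if (!trylock_page(page))
                    return;         /* contended: leave it for a later pass */

            /* ... work that requires the page lock ... */

            unlock_page(page);
    }
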
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7c204e3d132b..976bbc5646fe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,7 @@
 #include <linux/bootmem.h>
 #include <linux/sysfs.h>
 #include <linux/slab.h>
+#include <linux/mmdebug.h>
 #include <linux/sched/signal.h>
 #include <linux/rmap.h>
 #include <linux/string_helpers.h>
@@ -1583,7 +1584,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
                 page = NULL;
         } else {
                 h->surplus_huge_pages++;
-                h->nr_huge_pages_node[page_to_nid(page)]++;
+                h->surplus_huge_pages_node[page_to_nid(page)]++;
         }
 
 out_unlock:
@@ -4374,6 +4375,12 @@ int hugetlb_reserve_pages(struct inode *inode,
         struct resv_map *resv_map;
         long gbl_reserve;
 
+        /* This should never happen */
+        if (from > to) {
+                VM_WARN(1, "%s called with a negative range\n", __func__);
+                return -EINVAL;
+        }
+
         /*
          * Only apply hugepage reservation if asked. At fault time, an
          * attempt will be made for VM_NORESERVE to allocate a page
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b7e2268dfc9a..e42568284e06 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -530,7 +530,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                         goto out;
                 }
 
-                VM_BUG_ON_PAGE(PageCompound(page), page);
+                /* TODO: teach khugepaged to collapse THP mapped with pte */
+                if (PageCompound(page)) {
+                        result = SCAN_PAGE_COMPOUND;
+                        goto out;
+                }
+
                 VM_BUG_ON_PAGE(!PageAnon(page), page);
 
                 /*
@@ -960,7 +965,9 @@ static void collapse_huge_page(struct mm_struct *mm,
                 goto out_nolock;
         }
 
-        if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+        /* Do not oom kill for khugepaged charges */
+        if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY,
+                                           &memcg, true))) {
                 result = SCAN_CGROUP_CHARGE_FAIL;
                 goto out_nolock;
         }
@@ -1319,7 +1326,9 @@ static void collapse_shmem(struct mm_struct *mm,
                 goto out;
         }
 
-        if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+        /* Do not oom kill for khugepaged charges */
+        if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY,
+                                           &memcg, true))) {
                 result = SCAN_CGROUP_CHARGE_FAIL;
                 goto out;
         }
diff --git a/mm/memblock.c b/mm/memblock.c
index 5a9ca2a1751b..48376bd33274 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1101,34 +1101,6 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
                 *out_nid = r->nid;
 }
 
-unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
-                                                      unsigned long max_pfn)
-{
-        struct memblock_type *type = &memblock.memory;
-        unsigned int right = type->cnt;
-        unsigned int mid, left = 0;
-        phys_addr_t addr = PFN_PHYS(pfn + 1);
-
-        do {
-                mid = (right + left) / 2;
-
-                if (addr < type->regions[mid].base)
-                        right = mid;
-                else if (addr >= (type->regions[mid].base +
-                                  type->regions[mid].size))
-                        left = mid + 1;
-                else {
-                        /* addr is within the region, so pfn + 1 is valid */
-                        return min(pfn + 1, max_pfn);
-                }
-        } while (left < right);
-
-        if (right == type->cnt)
-                return max_pfn;
-        else
-                return min(PHYS_PFN(type->regions[right].base), max_pfn);
-}
-
 /**
  * memblock_set_node - set node ID on memblock regions
  * @base: base of area to set node ID for
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d879f1d8a44a..32cba0332787 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2124,6 +2124,9 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
         case MPOL_INTERLEAVE:
                 return !!nodes_equal(a->v.nodes, b->v.nodes);
         case MPOL_PREFERRED:
+                /* a's ->flags is the same as b's */
+                if (a->flags & MPOL_F_LOCAL)
+                        return true;
                 return a->v.preferred_node == b->v.preferred_node;
         default:
                 BUG();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cb416723538f..1741dd23e7c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1910,7 +1910,9 @@ static int move_freepages(struct zone *zone,
          * Remove at a later date when no bug reports exist related to
          * grouping pages by mobility
          */
-        VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
+        VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
+                  pfn_valid(page_to_pfn(end_page)) &&
+                  page_zone(start_page) != page_zone(end_page));
 #endif
 
         if (num_movable)
@@ -3594,7 +3596,7 @@ static bool __need_fs_reclaim(gfp_t gfp_mask)
                 return false;
 
         /* this guy won't enter reclaim */
-        if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+        if (current->flags & PF_MEMALLOC)
                 return false;
 
         /* We're only interested __GFP_FS allocations for now */
@@ -5354,17 +5356,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                 if (context != MEMMAP_EARLY)
                         goto not_early;
 
-                if (!early_pfn_valid(pfn)) {
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-                        /*
-                         * Skip to the pfn preceding the next valid one (or
-                         * end_pfn), such that we hit a valid pfn (or end_pfn)
-                         * on our next iteration of the loop.
-                         */
-                        pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
-#endif
+                if (!early_pfn_valid(pfn))
                         continue;
-                }
                 if (!early_pfn_in_nid(pfn, nid))
                         continue;
                 if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index d2a76642c4ae..38de70ab1a0d 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -34,7 +34,7 @@
 #include <linux/log2.h>
 
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
-                               int page_start, int page_end)
+                               int page_start, int page_end, gfp_t gfp)
 {
         return 0;
 }
@@ -45,18 +45,18 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
         /* nada */
 }
 
-static struct pcpu_chunk *pcpu_create_chunk(void)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
 {
         const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
         struct pcpu_chunk *chunk;
         struct page *pages;
         int i;
 
-        chunk = pcpu_alloc_chunk();
+        chunk = pcpu_alloc_chunk(gfp);
         if (!chunk)
                 return NULL;
 
-        pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages));
+        pages = alloc_pages(gfp, order_base_2(nr_pages));
         if (!pages) {
                 pcpu_free_chunk(chunk);
                 return NULL;
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 9158e5a81391..d8078de912de 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void)
         lockdep_assert_held(&pcpu_alloc_mutex);
 
         if (!pages)
-                pages = pcpu_mem_zalloc(pages_size);
+                pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL);
         return pages;
 }
 
@@ -73,18 +73,21 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
  * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
  * @page_start: page index of the first page to be allocated
  * @page_end: page index of the last page to be allocated + 1
+ * @gfp: allocation flags passed to the underlying allocator
  *
  * Allocate pages [@page_start,@page_end) into @pages for all units.
  * The allocation is for @chunk. Percpu core doesn't care about the
  * content of @pages and will pass it verbatim to pcpu_map_pages().
  */
 static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
-                            struct page **pages, int page_start, int page_end)
+                            struct page **pages, int page_start, int page_end,
+                            gfp_t gfp)
 {
-        const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM;
         unsigned int cpu, tcpu;
         int i;
 
+        gfp |= __GFP_HIGHMEM;
+
         for_each_possible_cpu(cpu) {
                 for (i = page_start; i < page_end; i++) {
                         struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
@@ -262,6 +265,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
  * @chunk: chunk of interest
  * @page_start: the start page
  * @page_end: the end page
+ * @gfp: allocation flags passed to the underlying memory allocator
  *
  * For each cpu, populate and map pages [@page_start,@page_end) into
 * @chunk.
@@ -270,7 +274,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
 */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
-                               int page_start, int page_end)
+                               int page_start, int page_end, gfp_t gfp)
 {
         struct page **pages;
 
@@ -278,7 +282,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
         if (!pages)
                 return -ENOMEM;
 
-        if (pcpu_alloc_pages(chunk, pages, page_start, page_end))
+        if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp))
                 return -ENOMEM;
 
         if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
@@ -325,12 +329,12 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
         pcpu_free_pages(chunk, pages, page_start, page_end);
 }
 
-static struct pcpu_chunk *pcpu_create_chunk(void)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
 {
         struct pcpu_chunk *chunk;
         struct vm_struct **vms;
 
-        chunk = pcpu_alloc_chunk();
+        chunk = pcpu_alloc_chunk(gfp);
         if (!chunk)
                 return NULL;
 
diff --git a/mm/percpu.c b/mm/percpu.c
index 50e7fdf84055..9297098519a6 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -80,6 +80,7 @@
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 #include <linux/kmemleak.h>
+#include <linux/sched.h>
 
 #include <asm/cacheflush.h>
 #include <asm/sections.h>
@@ -447,26 +448,25 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
 /**
  * pcpu_mem_zalloc - allocate memory
  * @size: bytes to allocate
+ * @gfp: allocation flags
  *
  * Allocate @size bytes. If @size is smaller than PAGE_SIZE,
- * kzalloc() is used; otherwise, vzalloc() is used. The returned
- * memory is always zeroed.
- *
- * CONTEXT:
- * Does GFP_KERNEL allocation.
+ * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
+ * This is to facilitate passing through whitelisted flags. The
+ * returned memory is always zeroed.
  *
  * RETURNS:
  * Pointer to the allocated area on success, NULL on failure.
  */
-static void *pcpu_mem_zalloc(size_t size)
+static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
 {
         if (WARN_ON_ONCE(!slab_is_available()))
                 return NULL;
 
         if (size <= PAGE_SIZE)
-                return kzalloc(size, GFP_KERNEL);
+                return kzalloc(size, gfp);
         else
-                return vzalloc(size);
+                return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL);
 }
 
 /**
@@ -1154,12 +1154,12 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
         return chunk;
 }
 
-static struct pcpu_chunk *pcpu_alloc_chunk(void)
+static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
 {
         struct pcpu_chunk *chunk;
         int region_bits;
 
-        chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
+        chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
         if (!chunk)
                 return NULL;
 
@@ -1168,17 +1168,17 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
         region_bits = pcpu_chunk_map_bits(chunk);
 
         chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
-                                           sizeof(chunk->alloc_map[0]));
+                                           sizeof(chunk->alloc_map[0]), gfp);
         if (!chunk->alloc_map)
                 goto alloc_map_fail;
 
         chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
-                                           sizeof(chunk->bound_map[0]));
+                                           sizeof(chunk->bound_map[0]), gfp);
         if (!chunk->bound_map)
                 goto bound_map_fail;
 
         chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
-                                           sizeof(chunk->md_blocks[0]));
+                                           sizeof(chunk->md_blocks[0]), gfp);
         if (!chunk->md_blocks)
                 goto md_blocks_fail;
 
@@ -1277,9 +1277,11 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
 * pcpu_addr_to_page - translate address to physical address
 * pcpu_verify_alloc_info - check alloc_info is acceptable during init
 */
-static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
-static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
-static struct pcpu_chunk *pcpu_create_chunk(void);
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
+                               int page_start, int page_end, gfp_t gfp);
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
+                                  int page_start, int page_end);
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
 static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
 static struct page *pcpu_addr_to_page(void *addr);
 static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
@@ -1339,6 +1341,8 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
                                  gfp_t gfp)
 {
+        /* whitelisted flags that can be passed to the backing allocators */
+        gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
         bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
         bool do_warn = !(gfp & __GFP_NOWARN);
         static int warn_limit = 10;
@@ -1369,8 +1373,17 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
                 return NULL;
         }
 
-        if (!is_atomic)
-                mutex_lock(&pcpu_alloc_mutex);
+        if (!is_atomic) {
+                /*
+                 * pcpu_balance_workfn() allocates memory under this mutex,
+                 * and it may wait for memory reclaim. Allow current task
+                 * to become OOM victim, in case of memory pressure.
+                 */
+                if (gfp & __GFP_NOFAIL)
+                        mutex_lock(&pcpu_alloc_mutex);
+                else if (mutex_lock_killable(&pcpu_alloc_mutex))
+                        return NULL;
+        }
 
         spin_lock_irqsave(&pcpu_lock, flags);
 
@@ -1421,7 +1434,7 @@ restart:
         }
 
         if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
-                chunk = pcpu_create_chunk();
+                chunk = pcpu_create_chunk(pcpu_gfp);
                 if (!chunk) {
                         err = "failed to allocate new chunk";
                         goto fail;
@@ -1450,7 +1463,7 @@ area_found:
                                            page_start, page_end) {
                         WARN_ON(chunk->immutable);
 
-                        ret = pcpu_populate_chunk(chunk, rs, re);
+                        ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
 
                         spin_lock_irqsave(&pcpu_lock, flags);
                         if (ret) {
@@ -1561,10 +1574,17 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @work: unused
 *
- * Reclaim all fully free chunks except for the first one.
+ * Reclaim all fully free chunks except for the first one. This is also
+ * responsible for maintaining the pool of empty populated pages. However,
+ * it is possible that this is called when physical memory is scarce causing
+ * OOM killer to be triggered. We should avoid doing so until an actual
+ * allocation causes the failure as it is possible that requests can be
+ * serviced from already backed regions.
 */
 static void pcpu_balance_workfn(struct work_struct *work)
 {
+        /* gfp flags passed to underlying allocators */
+        const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
         LIST_HEAD(to_free);
         struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
         struct pcpu_chunk *chunk, *next;
@@ -1600,6 +1620,7 @@ static void pcpu_balance_workfn(struct work_struct *work)
                         spin_unlock_irq(&pcpu_lock);
                 }
                 pcpu_destroy_chunk(chunk);
+                cond_resched();
         }
 
         /*
@@ -1645,7 +1666,7 @@ retry_pop:
                                              chunk->nr_pages) {
                         int nr = min(re - rs, nr_to_pop);
 
-                        ret = pcpu_populate_chunk(chunk, rs, rs + nr);
+                        ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
                         if (!ret) {
                                 nr_to_pop -= nr;
                                 spin_lock_irq(&pcpu_lock);
@@ -1662,7 +1683,7 @@ retry_pop:
 
         if (nr_to_pop) {
                 /* ran out of chunks to populate, create a new one and retry */
-                chunk = pcpu_create_chunk();
+                chunk = pcpu_create_chunk(gfp);
                 if (chunk) {
                         spin_lock_irq(&pcpu_lock);
                         pcpu_chunk_relocate(chunk, -1);
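
With the whitelisting above, percpu callers can have __GFP_NORETRY and
__GFP_NOWARN honoured by the backing chunk and page allocations instead
of having those hints dropped. A small usage sketch follows; the struct
and function names are illustrative, alloc_percpu_gfp() is the real API:

    #include <linux/types.h>
    #include <linux/percpu.h>

    struct foo_stats {                      /* illustrative only */
            u64 rx_packets;
            u64 tx_packets;
    };

    static struct foo_stats __percpu *foo_stats_alloc(void)
    {
            /* These hints now reach pcpu_mem_zalloc() and the chunk
             * allocations rather than being silently ignored. */
            return alloc_percpu_gfp(struct foo_stats,
                                    GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
    }
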
diff --git a/mm/shmem.c b/mm/shmem.c
index 1907688b75ee..b85919243399 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -493,36 +493,45 @@ next:
                 info = list_entry(pos, struct shmem_inode_info, shrinklist);
                 inode = &info->vfs_inode;
 
-                if (nr_to_split && split >= nr_to_split) {
-                        iput(inode);
-                        continue;
-                }
+                if (nr_to_split && split >= nr_to_split)
+                        goto leave;
 
-                page = find_lock_page(inode->i_mapping,
+                page = find_get_page(inode->i_mapping,
                                 (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
                 if (!page)
                         goto drop;
 
+                /* No huge page at the end of the file: nothing to split */
                 if (!PageTransHuge(page)) {
-                        unlock_page(page);
                         put_page(page);
                         goto drop;
                 }
 
+                /*
+                 * Leave the inode on the list if we failed to lock
+                 * the page at this time.
+                 *
+                 * Waiting for the lock may lead to deadlock in the
+                 * reclaim path.
+                 */
+                if (!trylock_page(page)) {
+                        put_page(page);
+                        goto leave;
+                }
+
                 ret = split_huge_page(page);
                 unlock_page(page);
                 put_page(page);
 
-                if (ret) {
-                        /* split failed: leave it on the list */
-                        iput(inode);
-                        continue;
-                }
+                /* If split failed leave the inode on the list */
+                if (ret)
+                        goto leave;
 
                 split++;
 drop:
                 list_del_init(&info->shrinklist);
                 removed++;
+leave:
                 iput(inode);
         }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bee53495a829..cd5dc3faaa57 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1780,6 +1780,20 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
                 set_bit(PGDAT_WRITEBACK, &pgdat->flags);
 
         /*
+         * If dirty pages are scanned that are not queued for IO, it
+         * implies that flushers are not doing their job. This can
+         * happen when memory pressure pushes dirty pages to the end of
+         * the LRU before the dirty limits are breached and the dirty
+         * data has expired. It can also happen when the proportion of
+         * dirty pages grows not through writes but through memory
+         * pressure reclaiming all the clean cache. And in some cases,
+         * the flushers simply cannot keep up with the allocation
+         * rate. Nudge the flusher threads in case they are asleep.
+         */
+        if (stat.nr_unqueued_dirty == nr_taken)
+                wakeup_flusher_threads(WB_REASON_VMSCAN);
+
+        /*
          * Legacy memcg will stall in page writeback so avoid forcibly
          * stalling here.
          */
@@ -1791,22 +1805,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
         if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
                 set_bit(PGDAT_CONGESTED, &pgdat->flags);
 
-        /*
-         * If dirty pages are scanned that are not queued for IO, it
-         * implies that flushers are not doing their job. This can
-         * happen when memory pressure pushes dirty pages to the end of
-         * the LRU before the dirty limits are breached and the dirty
-         * data has expired. It can also happen when the proportion of
-         * dirty pages grows not through writes but through memory
-         * pressure reclaiming all the clean cache. And in some cases,
-         * the flushers simply cannot keep up with the allocation
-         * rate. Nudge the flusher threads in case they are asleep, but
-         * also allow kswapd to start writing pages during reclaim.
-         */
-        if (stat.nr_unqueued_dirty == nr_taken) {
-                wakeup_flusher_threads(WB_REASON_VMSCAN);
+        /* Allow kswapd to start writing pages during reclaim. */
+        if (stat.nr_unqueued_dirty == nr_taken)
                 set_bit(PGDAT_DIRTY, &pgdat->flags);
-        }
 
         /*
          * If kswapd scans pages marked marked for immediate