author    Linus Torvalds <torvalds@linux-foundation.org>    2018-03-22 21:48:43 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2018-03-22 21:48:43 -0400
commit    f36b7534b83357cf52e747905de6d65b4f7c2512
tree      ca52ebdc4aaa738bd464b22a06ed034e41c46acb
parent    8401c72c593d2be8607d2a0a4551ee5c867d6f2f
parent    9d3c3354bb85bab4d865fe95039443f09a4c8394
Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton:
 "13 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm, thp: do not cause memcg oom for thp
  mm/vmscan: wake up flushers for legacy cgroups too
  Revert "mm: page_alloc: skip over regions of invalid pfns where possible"
  mm/shmem: do not wait for lock_page() in shmem_unused_huge_shrink()
  mm/thp: do not wait for lock_page() in deferred_split_scan()
  mm/khugepaged.c: convert VM_BUG_ON() to collapse fail
  x86/mm: implement free pmd/pte page interfaces
  mm/vmalloc: add interfaces to free unmapped page table
  h8300: remove extraneous __BIG_ENDIAN definition
  hugetlbfs: check for pgoff value overflow
  lockdep: fix fs_reclaim warning
  MAINTAINERS: update Mark Fasheh's e-mail
  mm/mempolicy.c: avoid use uninitialized preferred_node
 MAINTAINERS                          |  2
 arch/arm64/mm/mmu.c                  | 10
 arch/h8300/include/asm/byteorder.h   |  1
 arch/x86/mm/pgtable.c                | 48
 fs/hugetlbfs/inode.c                 | 17
 include/asm-generic/pgtable.h        | 10
 include/linux/memblock.h             |  1
 lib/ioremap.c                        |  6
 mm/huge_memory.c                     |  9
 mm/hugetlb.c                         |  7
 mm/khugepaged.c                      | 15
 mm/memblock.c                        | 28
 mm/mempolicy.c                       |  3
 mm/page_alloc.c                      | 13
 mm/shmem.c                           | 31
 mm/vmscan.c                          | 31
 16 files changed, 153 insertions(+), 79 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index 4e62756936fa..73c0cdabf755 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10334,7 +10334,7 @@ F: drivers/oprofile/
 F:      include/linux/oprofile.h
 
 ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
-M:      Mark Fasheh <mfasheh@versity.com>
+M:      Mark Fasheh <mark@fasheh.com>
 M:      Joel Becker <jlbec@evilplan.org>
 L:      ocfs2-devel@oss.oracle.com (moderated for non-subscribers)
 W:      http://ocfs2.wiki.kernel.org
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 8c704f1e53c2..2dbb2c9f1ec1 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -972,3 +972,13 @@ int pmd_clear_huge(pmd_t *pmdp)
         pmd_clear(pmdp);
         return 1;
 }
+
+int pud_free_pmd_page(pud_t *pud)
+{
+        return pud_none(*pud);
+}
+
+int pmd_free_pte_page(pmd_t *pmd)
+{
+        return pmd_none(*pmd);
+}
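
The arm64 side is deliberately minimal: these helpers free nothing and only report whether the entry is already empty, so a huge mapping is refused whenever a table page is still installed. A standalone sketch of that contract (illustrative names, not kernel code):

#include <stdio.h>

/*
 * Toy model: like arm64's pud_free_pmd_page()/pmd_free_pte_page(),
 * report "safe to map huge" only when no lower-level table exists.
 * Nothing is freed here; that is the point of the arm64 stubs.
 */
static int entry_free_ok(void *table_page)
{
        return table_page == NULL;      /* 1 = proceed, 0 = refuse huge map */
}

int main(void)
{
        void *empty = NULL;
        void *populated = &empty;       /* pretend a table page is installed */

        printf("empty: %d, populated: %d\n",
               entry_free_ok(empty), entry_free_ok(populated));
        return 0;
}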
diff --git a/arch/h8300/include/asm/byteorder.h b/arch/h8300/include/asm/byteorder.h
index ecff2d1ca5a3..6eaa7ad5fc2c 100644
--- a/arch/h8300/include/asm/byteorder.h
+++ b/arch/h8300/include/asm/byteorder.h
@@ -2,7 +2,6 @@
 #ifndef __H8300_BYTEORDER_H__
 #define __H8300_BYTEORDER_H__
 
-#define __BIG_ENDIAN __ORDER_BIG_ENDIAN__
 #include <linux/byteorder/big_endian.h>
 
 #endif
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 004abf9ebf12..34cda7e0551b 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -702,4 +702,52 @@ int pmd_clear_huge(pmd_t *pmd)
 
         return 0;
 }
+
+/**
+ * pud_free_pmd_page - Clear pud entry and free pmd page.
+ * @pud: Pointer to a PUD.
+ *
+ * Context: The pud range has been unmapped and TLB purged.
+ * Return: 1 if clearing the entry succeeded. 0 otherwise.
+ */
+int pud_free_pmd_page(pud_t *pud)
+{
+        pmd_t *pmd;
+        int i;
+
+        if (pud_none(*pud))
+                return 1;
+
+        pmd = (pmd_t *)pud_page_vaddr(*pud);
+
+        for (i = 0; i < PTRS_PER_PMD; i++)
+                if (!pmd_free_pte_page(&pmd[i]))
+                        return 0;
+
+        pud_clear(pud);
+        free_page((unsigned long)pmd);
+
+        return 1;
+}
+
+/**
+ * pmd_free_pte_page - Clear pmd entry and free pte page.
+ * @pmd: Pointer to a PMD.
+ *
+ * Context: The pmd range has been unmapped and TLB purged.
+ * Return: 1 if clearing the entry succeeded. 0 otherwise.
+ */
+int pmd_free_pte_page(pmd_t *pmd)
+{
+        pte_t *pte;
+
+        if (pmd_none(*pmd))
+                return 1;
+
+        pte = (pte_t *)pmd_page_vaddr(*pmd);
+        pmd_clear(pmd);
+        free_page((unsigned long)pte);
+
+        return 1;
+}
 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
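
On x86 the teardown is real: every pte page hanging off the pmd level is released before the pud entry is cleared. A userspace model of that ordering, with toy stand-ins for the page-table types (hypothetical, for illustration only):

#include <stdio.h>
#include <stdlib.h>

#define PTRS_PER_PMD 512

/* Toy page-table entries: NULL means pXd_none(). */
typedef struct { void *pte_page; } pmd_entry;
typedef struct { pmd_entry *pmd_page; } pud_entry;

/* Mirrors pmd_free_pte_page(): clear the entry, free the pte page. */
static int model_pmd_free_pte_page(pmd_entry *pmd)
{
        if (!pmd->pte_page)
                return 1;               /* nothing mapped: trivially OK */
        free(pmd->pte_page);
        pmd->pte_page = NULL;
        return 1;
}

/* Mirrors pud_free_pmd_page(): free each pte page, then the pmd page. */
static int model_pud_free_pmd_page(pud_entry *pud)
{
        int i;

        if (!pud->pmd_page)
                return 1;
        for (i = 0; i < PTRS_PER_PMD; i++)
                if (!model_pmd_free_pte_page(&pud->pmd_page[i]))
                        return 0;       /* leave the hierarchy intact on failure */
        free(pud->pmd_page);
        pud->pmd_page = NULL;
        return 1;
}

int main(void)
{
        pud_entry pud = { calloc(PTRS_PER_PMD, sizeof(pmd_entry)) };

        pud.pmd_page[3].pte_page = malloc(4096);  /* one populated pte page */
        printf("freed: %d\n", model_pud_free_pmd_page(&pud));
        return 0;
}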
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8fe1b0aa2896..b9a254dcc0e7 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -108,6 +108,16 @@ static void huge_pagevec_release(struct pagevec *pvec)
         pagevec_reinit(pvec);
 }
 
+/*
+ * Mask used when checking the page offset value passed in via system
+ * calls. This value will be converted to a loff_t which is signed.
+ * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the
+ * value. The extra bit (- 1 in the shift value) is to take the sign
+ * bit into account.
+ */
+#define PGOFF_LOFFT_MAX \
+        (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1)))
+
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
         struct inode *inode = file_inode(file);
@@ -127,12 +137,13 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
         vma->vm_ops = &hugetlb_vm_ops;
 
         /*
-         * Offset passed to mmap (before page shift) could have been
-         * negative when represented as a (l)off_t.
+         * page based offset in vm_pgoff could be sufficiently large to
+         * overflow a (l)off_t when converted to byte offset.
          */
-        if (((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0)
+        if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
                 return -EINVAL;
 
+        /* must be huge page aligned */
         if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
                 return -EINVAL;
 
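
The PGOFF_LOFFT_MAX arithmetic is easy to check in isolation. A minimal sketch, assuming a 64-bit long and 4 KiB pages; note how a pgoff past the sign bit entirely (e.g. bit 52) wraps to a bogus small offset under the old shift-based check, while the mask still catches it:

#include <stdio.h>

#define PAGE_SHIFT      12
#define BITS_PER_LONG   64

/* Same expression as the patch: the top PAGE_SHIFT + 1 bits of a long. */
#define PGOFF_LOFFT_MAX \
        (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1)))

int main(void)
{
        /* Largest pgoff whose byte offset still fits in a positive loff_t. */
        unsigned long ok   = (1UL << (BITS_PER_LONG - PAGE_SHIFT - 1)) - 1;
        unsigned long sign = ok + 1;            /* byte offset sets the sign bit */
        unsigned long wrap = 1UL << 52;         /* byte offset wraps to 0 */

        printf("old check: sign=%d wrap=%d\n",
               (long long)(sign << PAGE_SHIFT) < 0,    /* caught */
               (long long)(wrap << PAGE_SHIFT) < 0);   /* missed! */
        printf("new check: ok=%d sign=%d wrap=%d\n",
               !!(ok & PGOFF_LOFFT_MAX),               /* 0: accepted */
               !!(sign & PGOFF_LOFFT_MAX),             /* 1: rejected */
               !!(wrap & PGOFF_LOFFT_MAX));            /* 1: rejected */
        return 0;
}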
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 2cfa3075d148..bfbb44a5ad38 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -983,6 +983,8 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
 int pud_clear_huge(pud_t *pud);
 int pmd_clear_huge(pmd_t *pmd);
+int pud_free_pmd_page(pud_t *pud);
+int pmd_free_pte_page(pmd_t *pmd);
 #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
 static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
 {
@@ -1008,6 +1010,14 @@ static inline int pmd_clear_huge(pmd_t *pmd)
 {
         return 0;
 }
+static inline int pud_free_pmd_page(pud_t *pud)
+{
+        return 0;
+}
+static inline int pmd_free_pte_page(pmd_t *pmd)
+{
+        return 0;
+}
 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
 
 #ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 8be5077efb5f..f92ea7783652 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -187,7 +187,6 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
                             unsigned long *end_pfn);
 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
                           unsigned long *out_end_pfn, int *out_nid);
-unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn);
 
 /**
  * for_each_mem_pfn_range - early memory pfn range iterator
diff --git a/lib/ioremap.c b/lib/ioremap.c
index b808a390e4c3..54e5bbaa3200 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -91,7 +91,8 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 
                 if (ioremap_pmd_enabled() &&
                     ((next - addr) == PMD_SIZE) &&
-                    IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
+                    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
+                    pmd_free_pte_page(pmd)) {
                         if (pmd_set_huge(pmd, phys_addr + addr, prot))
                                 continue;
                 }
@@ -117,7 +118,8 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
 
                 if (ioremap_pud_enabled() &&
                     ((next - addr) == PUD_SIZE) &&
-                    IS_ALIGNED(phys_addr + addr, PUD_SIZE)) {
+                    IS_ALIGNED(phys_addr + addr, PUD_SIZE) &&
+                    pud_free_pmd_page(pud)) {
                         if (pud_set_huge(pud, phys_addr + addr, prot))
                                 continue;
                 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 87ab9b8f56b5..5a68730eebd6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -555,7 +555,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 
         VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-        if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
+        if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg,
+                                  true)) {
                 put_page(page);
                 count_vm_event(THP_FAULT_FALLBACK);
                 return VM_FAULT_FALLBACK;
@@ -1316,7 +1317,7 @@ alloc:
         }
 
         if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
-                                        huge_gfp, &memcg, true))) {
+                                huge_gfp | __GFP_NORETRY, &memcg, true))) {
                 put_page(new_page);
                 split_huge_pmd(vma, vmf->pmd, vmf->address);
                 if (page)
@@ -2783,11 +2784,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 
         list_for_each_safe(pos, next, &list) {
                 page = list_entry((void *)pos, struct page, mapping);
-                lock_page(page);
+                if (!trylock_page(page))
+                        goto next;
                 /* split_huge_page() removes page from list on success */
                 if (!split_huge_page(page))
                         split++;
                 unlock_page(page);
+next:
                 put_page(page);
         }
 
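
The deferred_split_scan() change follows a common shrinker rule: never sleep on a contended lock while in reclaim context; skip the entry and let a later pass handle it. A userspace analogue of the trylock-and-skip loop, with pthread mutexes standing in for page locks (illustrative only; build with -pthread):

#include <pthread.h>
#include <stdio.h>

struct item {
        pthread_mutex_t lock;
        int split;
};

static void scan(struct item *items, int n)
{
        for (int i = 0; i < n; i++) {
                if (pthread_mutex_trylock(&items[i].lock) != 0)
                        continue;       /* contended: skip, do not block */
                items[i].split = 1;     /* stand-in for split_huge_page() */
                pthread_mutex_unlock(&items[i].lock);
        }
}

int main(void)
{
        struct item items[3] = {
                { PTHREAD_MUTEX_INITIALIZER, 0 },
                { PTHREAD_MUTEX_INITIALIZER, 0 },
                { PTHREAD_MUTEX_INITIALIZER, 0 },
        };

        pthread_mutex_lock(&items[1].lock);     /* simulate a contended page */
        scan(items, 3);
        printf("%d %d %d\n", items[0].split, items[1].split, items[2].split);
        return 0;
}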
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a963f2034dfc..976bbc5646fe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,7 @@
 #include <linux/bootmem.h>
 #include <linux/sysfs.h>
 #include <linux/slab.h>
+#include <linux/mmdebug.h>
 #include <linux/sched/signal.h>
 #include <linux/rmap.h>
 #include <linux/string_helpers.h>
@@ -4374,6 +4375,12 @@ int hugetlb_reserve_pages(struct inode *inode,
         struct resv_map *resv_map;
         long gbl_reserve;
 
+        /* This should never happen */
+        if (from > to) {
+                VM_WARN(1, "%s called with a negative range\n", __func__);
+                return -EINVAL;
+        }
+
         /*
          * Only apply hugepage reservation if asked. At fault time, an
          * attempt will be made for VM_NORESERVE to allocate a page
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b7e2268dfc9a..e42568284e06 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -530,7 +530,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                 goto out;
         }
 
-        VM_BUG_ON_PAGE(PageCompound(page), page);
+        /* TODO: teach khugepaged to collapse THP mapped with pte */
+        if (PageCompound(page)) {
+                result = SCAN_PAGE_COMPOUND;
+                goto out;
+        }
+
         VM_BUG_ON_PAGE(!PageAnon(page), page);
 
         /*
@@ -960,7 +965,9 @@ static void collapse_huge_page(struct mm_struct *mm,
                 goto out_nolock;
         }
 
-        if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+        /* Do not oom kill for khugepaged charges */
+        if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY,
+                                           &memcg, true))) {
                 result = SCAN_CGROUP_CHARGE_FAIL;
                 goto out_nolock;
         }
@@ -1319,7 +1326,9 @@ static void collapse_shmem(struct mm_struct *mm,
                 goto out;
         }
 
-        if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+        /* Do not oom kill for khugepaged charges */
+        if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY,
+                                           &memcg, true))) {
                 result = SCAN_CGROUP_CHARGE_FAIL;
                 goto out;
         }
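
The __GFP_NORETRY additions here and in mm/huge_memory.c encode one policy: THP charges are opportunistic, so a memcg over its limit should fail the charge instead of invoking the OOM killer; khugepaged records the failure and moves on. A hedged sketch of that decision flow (names illustrative, not kernel API):

#include <stdio.h>

enum scan_result { SCAN_SUCCEED, SCAN_CGROUP_CHARGE_FAIL };

/* Stand-in for mem_cgroup_try_charge(): with "noretry" the charge
 * reports failure instead of escalating to the OOM killer. */
static int try_charge(int over_limit, int noretry)
{
        if (!over_limit)
                return 0;
        return noretry ? -1 : 0;        /* without noretry: "wins" via OOM kill */
}

int main(void)
{
        /* New behaviour: memcg at its limit, charge backs off. */
        enum scan_result r = try_charge(1, 1) ?
                        SCAN_CGROUP_CHARGE_FAIL : SCAN_SUCCEED;

        printf("%s\n", r ? "SCAN_CGROUP_CHARGE_FAIL" : "SCAN_SUCCEED");
        return 0;
}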
diff --git a/mm/memblock.c b/mm/memblock.c
index b6ba6b7adadc..48376bd33274 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1101,34 +1101,6 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
         *out_nid = r->nid;
 }
 
-unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
-                                                      unsigned long max_pfn)
-{
-        struct memblock_type *type = &memblock.memory;
-        unsigned int right = type->cnt;
-        unsigned int mid, left = 0;
-        phys_addr_t addr = PFN_PHYS(++pfn);
-
-        do {
-                mid = (right + left) / 2;
-
-                if (addr < type->regions[mid].base)
-                        right = mid;
-                else if (addr >= (type->regions[mid].base +
-                                  type->regions[mid].size))
-                        left = mid + 1;
-                else {
-                        /* addr is within the region, so pfn is valid */
-                        return pfn;
-                }
-        } while (left < right);
-
-        if (right == type->cnt)
-                return -1UL;
-        else
-                return PHYS_PFN(type->regions[right].base);
-}
-
 /**
  * memblock_set_node - set node ID on memblock regions
  * @base: base of area to set node ID for
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d879f1d8a44a..32cba0332787 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2124,6 +2124,9 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
         case MPOL_INTERLEAVE:
                 return !!nodes_equal(a->v.nodes, b->v.nodes);
         case MPOL_PREFERRED:
+                /* a's ->flags is the same as b's */
+                if (a->flags & MPOL_F_LOCAL)
+                        return true;
                 return a->v.preferred_node == b->v.preferred_node;
         default:
                 BUG();
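
The underlying bug is a classic flag-gated field: for MPOL_PREFERRED with MPOL_F_LOCAL set, v.preferred_node is never initialized, so comparing it reads garbage. A reduced standalone model of the corrected comparison (hypothetical types):

#include <stdbool.h>
#include <stdio.h>

#define F_LOCAL 0x1

/*
 * When a flag says a field is not in use, equality must be decided by
 * the flag alone; otherwise the comparison reads a field nobody ever
 * initialized.
 */
struct policy {
        unsigned flags;
        int preferred_node;     /* only meaningful when !(flags & F_LOCAL) */
};

static bool policy_equal(const struct policy *a, const struct policy *b)
{
        if (a->flags != b->flags)
                return false;
        if (a->flags & F_LOCAL)         /* preferred_node unused: ignore it */
                return true;
        return a->preferred_node == b->preferred_node;
}

int main(void)
{
        struct policy a = { .flags = F_LOCAL, .preferred_node = 7 };
        struct policy b = { .flags = F_LOCAL, .preferred_node = 42 };

        printf("%d\n", policy_equal(&a, &b));   /* 1: same policy */
        return 0;
}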
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 635d7dd29d7f..1741dd23e7c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3596,7 +3596,7 @@ static bool __need_fs_reclaim(gfp_t gfp_mask)
                 return false;
 
         /* this guy won't enter reclaim */
-        if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+        if (current->flags & PF_MEMALLOC)
                 return false;
 
         /* We're only interested __GFP_FS allocations for now */
@@ -5356,17 +5356,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
         if (context != MEMMAP_EARLY)
                 goto not_early;
 
-        if (!early_pfn_valid(pfn)) {
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-                /*
-                 * Skip to the pfn preceding the next valid one (or
-                 * end_pfn), such that we hit a valid pfn (or end_pfn)
-                 * on our next iteration of the loop.
-                 */
-                pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
-#endif
+        if (!early_pfn_valid(pfn))
                 continue;
-        }
         if (!early_pfn_in_nid(pfn, nid))
                 continue;
         if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
diff --git a/mm/shmem.c b/mm/shmem.c
index 1907688b75ee..b85919243399 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -493,36 +493,45 @@ next:
                 info = list_entry(pos, struct shmem_inode_info, shrinklist);
                 inode = &info->vfs_inode;
 
-                if (nr_to_split && split >= nr_to_split) {
-                        iput(inode);
-                        continue;
-                }
+                if (nr_to_split && split >= nr_to_split)
+                        goto leave;
 
-                page = find_lock_page(inode->i_mapping,
+                page = find_get_page(inode->i_mapping,
                                 (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
                 if (!page)
                         goto drop;
 
+                /* No huge page at the end of the file: nothing to split */
                 if (!PageTransHuge(page)) {
-                        unlock_page(page);
                         put_page(page);
                         goto drop;
                 }
 
+                /*
+                 * Leave the inode on the list if we failed to lock
+                 * the page at this time.
+                 *
+                 * Waiting for the lock may lead to deadlock in the
+                 * reclaim path.
+                 */
+                if (!trylock_page(page)) {
+                        put_page(page);
+                        goto leave;
+                }
+
                 ret = split_huge_page(page);
                 unlock_page(page);
                 put_page(page);
 
-                if (ret) {
-                        /* split failed: leave it on the list */
-                        iput(inode);
-                        continue;
-                }
+                /* If split failed leave the inode on the list */
+                if (ret)
+                        goto leave;
 
                 split++;
 drop:
                 list_del_init(&info->shrinklist);
                 removed++;
+leave:
                 iput(inode);
         }
 
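
After the rework, each inode on the shrink list gets one of three dispositions per pass, and only "drop" removes it; lock contention or a failed split leaves it for a later pass instead of stalling reclaim. A compact model of the new control flow (illustrative, not the kernel function):

#include <stdio.h>

enum outcome { DROP, SPLIT, LEAVE };

static const char *shrink_one(int has_huge_tail, int lock_free, int split_ok)
{
        if (!has_huge_tail)
                return "drop";          /* nothing to split: off the list */
        if (!lock_free)
                return "leave";         /* contended: retry on a later pass */
        if (!split_ok)
                return "leave";         /* split failed: retry later */
        return "drop";                  /* split done: off the list */
}

int main(void)
{
        printf("%s %s %s %s\n",
               shrink_one(0, 1, 1),     /* no huge page at EOF */
               shrink_one(1, 0, 1),     /* page lock contended */
               shrink_one(1, 1, 0),     /* split_huge_page() failed */
               shrink_one(1, 1, 1));    /* split succeeded */
        return 0;
}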
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bee53495a829..cd5dc3faaa57 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1780,6 +1780,20 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
                 set_bit(PGDAT_WRITEBACK, &pgdat->flags);
 
         /*
+         * If dirty pages are scanned that are not queued for IO, it
+         * implies that flushers are not doing their job. This can
+         * happen when memory pressure pushes dirty pages to the end of
+         * the LRU before the dirty limits are breached and the dirty
+         * data has expired. It can also happen when the proportion of
+         * dirty pages grows not through writes but through memory
+         * pressure reclaiming all the clean cache. And in some cases,
+         * the flushers simply cannot keep up with the allocation
+         * rate. Nudge the flusher threads in case they are asleep.
+         */
+        if (stat.nr_unqueued_dirty == nr_taken)
+                wakeup_flusher_threads(WB_REASON_VMSCAN);
+
+        /*
          * Legacy memcg will stall in page writeback so avoid forcibly
          * stalling here.
          */
@@ -1791,22 +1805,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
         if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
                 set_bit(PGDAT_CONGESTED, &pgdat->flags);
 
-        /*
-         * If dirty pages are scanned that are not queued for IO, it
-         * implies that flushers are not doing their job. This can
-         * happen when memory pressure pushes dirty pages to the end of
-         * the LRU before the dirty limits are breached and the dirty
-         * data has expired. It can also happen when the proportion of
-         * dirty pages grows not through writes but through memory
-         * pressure reclaiming all the clean cache. And in some cases,
-         * the flushers simply cannot keep up with the allocation
-         * rate. Nudge the flusher threads in case they are asleep, but
-         * also allow kswapd to start writing pages during reclaim.
-         */
-        if (stat.nr_unqueued_dirty == nr_taken) {
-                wakeup_flusher_threads(WB_REASON_VMSCAN);
+        /* Allow kswapd to start writing pages during reclaim. */
+        if (stat.nr_unqueued_dirty == nr_taken)
                 set_bit(PGDAT_DIRTY, &pgdat->flags);
-        }
 
         /*
          * If kswapd scans pages marked for immediate
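
The effect of the move is subtle: the old wakeup sat inside a block that legacy (cgroup v1) memcg reclaim never reached, so its dirty pages piled up with no flusher ever nudged. Hoisting the wakeup above that guard makes every reclaim context nudge the flushers, while the PGDAT_DIRTY bit stays behind the guard. A reduced model (hypothetical flags and names, standing in for the sane_reclaim() split):

#include <stdio.h>

static int woke_flushers, kswapd_may_write;

/* Reduced model: "sane" is false for legacy (cgroup v1) memcg reclaim. */
static void shrink(int nr_unqueued_dirty, int nr_taken, int sane)
{
        woke_flushers = kswapd_may_write = 0;

        /* Hoisted wakeup: runs for every reclaim context now. */
        if (nr_unqueued_dirty == nr_taken)
                woke_flushers = 1;

        if (sane && nr_unqueued_dirty == nr_taken)
                kswapd_may_write = 1;   /* stand-in for set_bit(PGDAT_DIRTY) */
}

int main(void)
{
        shrink(32, 32, 0);      /* legacy memcg: previously woke nothing */
        printf("legacy: woke=%d dirty_bit=%d\n", woke_flushers, kswapd_may_write);
        shrink(32, 32, 1);      /* global/kswapd reclaim */
        printf("sane:   woke=%d dirty_bit=%d\n", woke_flushers, kswapd_may_write);
        return 0;
}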