author    David Rientjes <rientjes@google.com>  2018-12-07 17:50:16 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2018-12-08 13:26:20 -0500
commit    356ff8a9a78fb35d6482584d260c3754dcbdf669 (patch)
tree      1893db13840f8d37c1562d417b93827f5425170c
parent    5f179793f0a73965681db6a3203fa1baabd9b3c3 (diff)
Revert "mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask"
This reverts commit 89c83fb539f95491be80cdd5158e6f0ce329e317.

This should have been done as part of 2f0799a0ffc0 ("mm, thp: restore node-local hugepage allocations"). The movement of the thp allocation policy from alloc_pages_vma() to alloc_hugepage_direct_gfpmask() was intended to only set __GFP_THISNODE for mempolicies that are not MPOL_BIND whereas the revert could set this regardless of mempolicy.

While the check for MPOL_BIND between alloc_hugepage_direct_gfpmask() and alloc_pages_vma() was racy, that has since been removed since the revert. What is left is the possibility to use __GFP_THISNODE in policy_node() when it is unexpected because the special handling for hugepages in alloc_pages_vma() was removed as part of the consolidation.

Secondly, prior to 89c83fb539f9, alloc_pages_vma() implemented a somewhat different policy for hugepage allocations, which were allocated through alloc_hugepage_vma(). For hugepage allocations, if the allocating process's node is in the set of allowed nodes, allocate with __GFP_THISNODE for that node (for MPOL_PREFERRED, use that node with __GFP_THISNODE instead). This was changed for shmem_alloc_hugepage() to allow fallback to other nodes in 89c83fb539f9 as it did for new_page() in mm/mempolicy.c, which is functionally different behavior and removes the requirement to only allocate hugepages locally.

So this commit does a full revert of 89c83fb539f9 instead of the partial revert that was done in 2f0799a0ffc0. The result is the same thp allocation policy for 4.20 that was in 4.19.

Fixes: 89c83fb539f9 ("mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask")
Fixes: 2f0799a0ffc0 ("mm, thp: restore node-local hugepage allocations")
Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
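For reference, a minimal sketch of the node-local hugepage decision that this revert restores in alloc_pages_vma() (condensed from the mm/mempolicy.c hunk below; the unlikely() annotation, interleave handling, and surrounding allocation path are elided):

	/* Sketch only: restored hugepage branch of alloc_pages_vma(). */
	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage) {
		int hpage_node = node;		/* default: the local node */

		/* An explicit MPOL_PREFERRED node overrides the local node. */
		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
			hpage_node = pol->v.preferred_node;

		nmask = policy_nodemask(gfp, pol);
		if (!nmask || node_isset(hpage_node, *nmask)) {
			mpol_cond_put(pol);
			/* __GFP_THISNODE: no fallback to remote nodes for THP. */
			page = __alloc_pages_node(hpage_node,
						  gfp | __GFP_THISNODE, order);
			goto out;
		}
	}
	/* Otherwise fall through to the normal mempolicy-based allocation. */

This matches the pre-89c83fb539f9 behavior described above: THP is allocated only on the local (or explicitly preferred) node, and interleave or disallowed-node policies use the standard allocation path.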
-rw-r--r--  include/linux/gfp.h   12
-rw-r--r--  mm/huge_memory.c      27
-rw-r--r--  mm/mempolicy.c        32
-rw-r--r--  mm/shmem.c             2
4 files changed, 51 insertions, 22 deletions
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 76f8db0b0e71..0705164f928c 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -510,18 +510,22 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
 }
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
 			struct vm_area_struct *vma, unsigned long addr,
-			int node);
+			int node, bool hugepage);
+#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
+	alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
 #else
 #define alloc_pages(gfp_mask, order) \
 		alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\
+#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
+	alloc_pages(gfp_mask, order)
+#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
 	alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 #define alloc_page_vma(gfp_mask, vma, addr) \
-	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
+	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
 #define alloc_page_vma_node(gfp_mask, vma, addr, node) \
-	alloc_pages_vma(gfp_mask, 0, vma, addr, node)
+	alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f2d19e4fe854..5da55b38b1b7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -629,30 +629,30 @@ release:
  *	    available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
 {
 	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
-	const gfp_t gfp_mask = GFP_TRANSHUGE_LIGHT | __GFP_THISNODE;
 
 	/* Always do synchronous compaction */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
-		return GFP_TRANSHUGE | __GFP_THISNODE |
-			(vma_madvised ? 0 : __GFP_NORETRY);
+		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
 
 	/* Kick kcompactd and fail quickly */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-		return gfp_mask | __GFP_KSWAPD_RECLAIM;
+		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
 
 	/* Synchronous compaction if madvised, otherwise kick kcompactd */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
-		return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-						  __GFP_KSWAPD_RECLAIM);
+		return GFP_TRANSHUGE_LIGHT |
+			(vma_madvised ? __GFP_DIRECT_RECLAIM :
+					__GFP_KSWAPD_RECLAIM);
 
 	/* Only do synchronous compaction if madvised */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
-		return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+		return GFP_TRANSHUGE_LIGHT |
+			(vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
 
-	return gfp_mask;
+	return GFP_TRANSHUGE_LIGHT;
 }
 
 /* Caller must hold page table lock. */
@@ -724,8 +724,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 		pte_free(vma->vm_mm, pgtable);
 		return ret;
 	}
-	gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
-	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
+	gfp = alloc_hugepage_direct_gfpmask(vma);
+	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
 	if (unlikely(!page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -1295,9 +1295,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 alloc:
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow()) {
-		huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
-		new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
-				haddr, numa_node_id());
+		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
+		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
 	} else
 		new_page = NULL;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 69e278b469ef..d4496d9d34f5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1116,8 +1116,8 @@ static struct page *new_page(struct page *page, unsigned long start)
 	} else if (PageTransHuge(page)) {
 		struct page *thp;
 
-		thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma,
-				address, numa_node_id());
+		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
+					 HPAGE_PMD_ORDER);
 		if (!thp)
 			return NULL;
 		prep_transhuge_page(thp);
@@ -2011,6 +2011,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  * @vma:  Pointer to VMA or NULL if not available.
  * @addr: Virtual Address of the allocation. Must be inside the VMA.
  * @node: Which node to prefer for allocation (modulo policy).
+ * @hugepage: for hugepages try only the preferred node if possible
  *
  * This function allocates a page from the kernel page pool and applies
  * a NUMA policy associated with the VMA or the current process.
@@ -2021,7 +2022,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr, int node)
+		unsigned long addr, int node, bool hugepage)
 {
 	struct mempolicy *pol;
 	struct page *page;
@@ -2039,6 +2040,31 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		goto out;
 	}
 
+	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
+		int hpage_node = node;
+
+		/*
+		 * For hugepage allocation and non-interleave policy which
+		 * allows the current node (or other explicitly preferred
+		 * node) we only try to allocate from the current/preferred
+		 * node and don't fall back to other nodes, as the cost of
+		 * remote accesses would likely offset THP benefits.
+		 *
+		 * If the policy is interleave, or does not allow the current
+		 * node in its nodemask, we allocate the standard way.
+		 */
+		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
+			hpage_node = pol->v.preferred_node;
+
+		nmask = policy_nodemask(gfp, pol);
+		if (!nmask || node_isset(hpage_node, *nmask)) {
+			mpol_cond_put(pol);
+			page = __alloc_pages_node(hpage_node,
+						gfp | __GFP_THISNODE, order);
+			goto out;
+		}
+	}
+
 	nmask = policy_nodemask(gfp, pol);
 	preferred_nid = policy_node(gfp, pol, node);
 	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
diff --git a/mm/shmem.c b/mm/shmem.c
index cddc72ac44d8..921f80488bb3 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1439,7 +1439,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
 
 	shmem_pseudo_vma_init(&pvma, info, hindex);
 	page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
-			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id());
+			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
 	shmem_pseudo_vma_destroy(&pvma);
 	if (page)
 		prep_transhuge_page(page);