 include/linux/gfp.h       | 12
 include/linux/mempolicy.h |  2
 mm/huge_memory.c          | 38
 mm/mempolicy.c            | 63
 mm/shmem.c                |  2
 5 files changed, 40 insertions(+), 77 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 24bcc5eec6b4..76f8db0b0e71 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -510,22 +510,18 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
 }
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
                        struct vm_area_struct *vma, unsigned long addr,
-                       int node, bool hugepage);
-#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
-       alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
+                       int node);
 #else
 #define alloc_pages(gfp_mask, order) \
                alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
-       alloc_pages(gfp_mask, order)
-#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
+#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\
        alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 #define alloc_page_vma(gfp_mask, vma, addr)                    \
-       alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
+       alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
 #define alloc_page_vma_node(gfp_mask, vma, addr, node)         \
-       alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
+       alloc_pages_vma(gfp_mask, 0, vma, addr, node)
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
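
Note: with alloc_hugepage_vma() gone, a THP caller that previously relied on the helper to pin the allocation to the local node now passes the preferred node explicitly through alloc_pages_vma(). A minimal before/after sketch of the call-site change (illustrative only; the real call sites are updated in the mm/huge_memory.c and mm/mempolicy.c hunks below):

        /* before: the helper hard-coded numa_node_id() and hugepage=true */
        page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);

        /* after: plain alloc_pages_vma() with an explicit preferred node */
        page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());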
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 5228c62af416..bac395f1d00a 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -139,6 +139,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 struct mempolicy *get_task_policy(struct task_struct *p);
 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
                unsigned long addr);
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+               unsigned long addr);
 bool vma_policy_mof(struct vm_area_struct *vma);
 
 extern void numa_default_policy(void);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4e4ef8fa479d..55478ab3c83b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -629,21 +629,40 @@ release:
  *          available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
 {
        const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+       gfp_t this_node = 0;
+
+#ifdef CONFIG_NUMA
+       struct mempolicy *pol;
+       /*
+        * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
+        * specified, to express a general desire to stay on the current
+        * node for optimistic allocation attempts. If the defrag mode
+        * and/or madvise hint requires the direct reclaim then we prefer
+        * to fallback to other node rather than node reclaim because that
+        * can lead to excessive reclaim even though there is free memory
+        * on other nodes. We expect that NUMA preferences are specified
+        * by memory policies.
+        */
+       pol = get_vma_policy(vma, addr);
+       if (pol->mode != MPOL_BIND)
+               this_node = __GFP_THISNODE;
+       mpol_cond_put(pol);
+#endif
 
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
                return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-               return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+               return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
                return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-                                                            __GFP_KSWAPD_RECLAIM);
+                                                            __GFP_KSWAPD_RECLAIM | this_node);
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
                return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-                                                            0);
-       return GFP_TRANSHUGE_LIGHT;
+                                                            this_node);
+       return GFP_TRANSHUGE_LIGHT | this_node;
 }
 
 /* Caller must hold page table lock. */
@@ -715,8 +734,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
                pte_free(vma->vm_mm, pgtable);
                return ret;
        }
-       gfp = alloc_hugepage_direct_gfpmask(vma);
-       page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+       gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
+       page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
        if (unlikely(!page)) {
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
@@ -1286,8 +1305,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 alloc:
        if (transparent_hugepage_enabled(vma) &&
            !transparent_hugepage_debug_cow()) {
-               huge_gfp = alloc_hugepage_direct_gfpmask(vma);
-               new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
+               huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
+               new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
+                                          haddr, numa_node_id());
        } else
                new_page = NULL;
 
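
Note: the NUMA placement decision now lives entirely in the gfp mask returned by alloc_hugepage_direct_gfpmask(). A condensed sketch of the resulting behaviour, assuming CONFIG_NUMA (illustrative only, not the literal kernel code):

        /* Unless the VMA policy is an explicit node binding, prefer the
         * local node on the optimistic (no direct reclaim) paths. */
        gfp_t this_node = (pol->mode != MPOL_BIND) ? __GFP_THISNODE : 0;

        /* Paths that may direct-reclaim (defrag=always, or a madvised VMA
         * under defrag=madvise/defer+madvise) leave __GFP_THISNODE out, so
         * a stalling allocation can fall back to other nodes rather than
         * reclaim heavily on the local one. */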
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 58fb833fce0c..5837a067124d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1116,8 +1116,8 @@ static struct page *new_page(struct page *page, unsigned long start)
        } else if (PageTransHuge(page)) {
                struct page *thp;
 
-               thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
-                                        HPAGE_PMD_ORDER);
+               thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma,
+                                     address, numa_node_id());
                if (!thp)
                        return NULL;
                prep_transhuge_page(thp);
@@ -1662,7 +1662,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
  * freeing by another task. It is the caller's responsibility to free the
  * extra reference for shared policies.
  */
-static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                                                unsigned long addr)
 {
        struct mempolicy *pol = __get_vma_policy(vma, addr);
@@ -2011,7 +2011,6 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  * @vma: Pointer to VMA or NULL if not available.
  * @addr: Virtual Address of the allocation. Must be inside the VMA.
  * @node: Which node to prefer for allocation (modulo policy).
- * @hugepage: for hugepages try only the preferred node if possible
  *
  * This function allocates a page from the kernel page pool and applies
  * a NUMA policy associated with the VMA or the current process.
@@ -2022,7 +2021,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-               unsigned long addr, int node, bool hugepage)
+               unsigned long addr, int node)
 {
        struct mempolicy *pol;
        struct page *page;
@@ -2040,60 +2039,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
                goto out;
        }
 
-       if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
-               int hpage_node = node;
-
-               /*
-                * For hugepage allocation and non-interleave policy which
-                * allows the current node (or other explicitly preferred
-                * node) we only try to allocate from the current/preferred
-                * node and don't fall back to other nodes, as the cost of
-                * remote accesses would likely offset THP benefits.
-                *
-                * If the policy is interleave, or does not allow the current
-                * node in its nodemask, we allocate the standard way.
-                */
-               if (pol->mode == MPOL_PREFERRED &&
-                                               !(pol->flags & MPOL_F_LOCAL))
-                       hpage_node = pol->v.preferred_node;
-
-               nmask = policy_nodemask(gfp, pol);
-               if (!nmask || node_isset(hpage_node, *nmask)) {
-                       mpol_cond_put(pol);
-                       /*
-                        * We cannot invoke reclaim if __GFP_THISNODE
-                        * is set. Invoking reclaim with
-                        * __GFP_THISNODE set, would cause THP
-                        * allocations to trigger heavy swapping
-                        * despite there may be tons of free memory
-                        * (including potentially plenty of THP
-                        * already available in the buddy) on all the
-                        * other NUMA nodes.
-                        *
-                        * At most we could invoke compaction when
-                        * __GFP_THISNODE is set (but we would need to
-                        * refrain from invoking reclaim even if
-                        * compaction returned COMPACT_SKIPPED because
-                        * there wasn't not enough memory to succeed
-                        * compaction). For now just avoid
-                        * __GFP_THISNODE instead of limiting the
-                        * allocation path to a strict and single
-                        * compaction invocation.
-                        *
-                        * Supposedly if direct reclaim was enabled by
-                        * the caller, the app prefers THP regardless
-                        * of the node it comes from so this would be
-                        * more desiderable behavior than only
-                        * providing THP originated from the local
-                        * node in such case.
-                        */
-                       if (!(gfp & __GFP_DIRECT_RECLAIM))
-                               gfp |= __GFP_THISNODE;
-                       page = __alloc_pages_node(hpage_node, gfp, order);
-                       goto out;
-               }
-       }
-
        nmask = policy_nodemask(gfp, pol);
        preferred_nid = policy_node(gfp, pol, node);
        page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
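
Note: with the THP special case removed, every alloc_pages_vma() caller goes through the same policy-driven path (policy_nodemask() / policy_node() / __alloc_pages_nodemask()). For a huge page fault the end-to-end flow is now roughly as follows (names as in the mm/huge_memory.c hunks above):

        gfp = alloc_hugepage_direct_gfpmask(vma, haddr);   /* may carry __GFP_THISNODE */
        page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());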
diff --git a/mm/shmem.c b/mm/shmem.c
index 56bf122e0bb4..ea26d7a0342d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1435,7 +1435,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
 
        shmem_pseudo_vma_init(&pvma, info, hindex);
        page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
-                       HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+                       HPAGE_PMD_ORDER, &pvma, 0, numa_node_id());
        shmem_pseudo_vma_destroy(&pvma);
        if (page)
                prep_transhuge_page(page);