 include/linux/gfp.h       | 12
 include/linux/mempolicy.h |  2
 mm/huge_memory.c          | 38
 mm/mempolicy.c            | 63
 mm/shmem.c                |  2
 5 files changed, 40 insertions(+), 77 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 24bcc5eec6b4..76f8db0b0e71 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -510,22 +510,18 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
 }
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
                        struct vm_area_struct *vma, unsigned long addr,
-                       int node, bool hugepage);
-#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
-       alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
+                       int node);
 #else
 #define alloc_pages(gfp_mask, order) \
                alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
-       alloc_pages(gfp_mask, order)
-#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
+#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\
        alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 #define alloc_page_vma(gfp_mask, vma, addr)                    \
-       alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
+       alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
 #define alloc_page_vma_node(gfp_mask, vma, addr, node)         \
-       alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
+       alloc_pages_vma(gfp_mask, 0, vma, addr, node)
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
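
Note: with alloc_hugepage_vma() gone, a THP caller that previously relied on the helper to pin the allocation to the local node now passes the preferred node explicitly through alloc_pages_vma(). A minimal before/after sketch of the call-site change (illustrative only; the real call sites are updated in the mm/huge_memory.c and mm/mempolicy.c hunks below):

        /* before: the helper hard-coded numa_node_id() and hugepage=true */
        page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);

        /* after: plain alloc_pages_vma() with an explicit preferred node */
        page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());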
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 5228c62af416..bac395f1d00a 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -139,6 +139,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 struct mempolicy *get_task_policy(struct task_struct *p);
 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
                unsigned long addr);
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+               unsigned long addr);
 bool vma_policy_mof(struct vm_area_struct *vma);
 
 extern void numa_default_policy(void);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4e4ef8fa479d..55478ab3c83b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -629,21 +629,40 @@ release:
  *          available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
 {
        const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+       gfp_t this_node = 0;
+
+#ifdef CONFIG_NUMA
+       struct mempolicy *pol;
+       /*
+        * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
+        * specified, to express a general desire to stay on the current
+        * node for optimistic allocation attempts. If the defrag mode
+        * and/or madvise hint requires the direct reclaim then we prefer
+        * to fallback to other node rather than node reclaim because that
+        * can lead to excessive reclaim even though there is free memory
+        * on other nodes. We expect that NUMA preferences are specified
+        * by memory policies.
+        */
+       pol = get_vma_policy(vma, addr);
+       if (pol->mode != MPOL_BIND)
+               this_node = __GFP_THISNODE;
+       mpol_cond_put(pol);
+#endif
 
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
                return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-               return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+               return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
                return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-                                                            __GFP_KSWAPD_RECLAIM);
+                                                            __GFP_KSWAPD_RECLAIM | this_node);
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
                return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-                                                            0);
-       return GFP_TRANSHUGE_LIGHT;
+                                                            this_node);
+       return GFP_TRANSHUGE_LIGHT | this_node;
 }
 
 /* Caller must hold page table lock. */
@@ -715,8 +734,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
                pte_free(vma->vm_mm, pgtable);
                return ret;
        }
-       gfp = alloc_hugepage_direct_gfpmask(vma);
-       page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+       gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
+       page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
        if (unlikely(!page)) {
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
@@ -1286,8 +1305,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 alloc:
        if (transparent_hugepage_enabled(vma) &&
            !transparent_hugepage_debug_cow()) {
-               huge_gfp = alloc_hugepage_direct_gfpmask(vma);
-               new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
+               huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
+               new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
+                                          haddr, numa_node_id());
        } else
                new_page = NULL;
 
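
Note: the NUMA placement decision now lives entirely in the gfp mask returned by alloc_hugepage_direct_gfpmask(). A condensed sketch of the resulting behaviour, assuming CONFIG_NUMA (illustrative only, not the literal kernel code):

        /* Unless the VMA policy is an explicit node binding, prefer the
         * local node on the optimistic (no direct reclaim) paths. */
        gfp_t this_node = (pol->mode != MPOL_BIND) ? __GFP_THISNODE : 0;

        /* Paths that may direct-reclaim (defrag=always, or a madvised VMA
         * under defrag=madvise/defer+madvise) leave __GFP_THISNODE out, so
         * a stalling allocation can fall back to other nodes rather than
         * reclaim heavily on the local one. */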
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 58fb833fce0c..5837a067124d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1116,8 +1116,8 @@ static struct page *new_page(struct page *page, unsigned long start)
        } else if (PageTransHuge(page)) {
                struct page *thp;
 
-               thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
-                                        HPAGE_PMD_ORDER);
+               thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma,
+                                     address, numa_node_id());
                if (!thp)
                        return NULL;
                prep_transhuge_page(thp);
@@ -1662,7 +1662,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
  * freeing by another task. It is the caller's responsibility to free the
  * extra reference for shared policies.
  */
-static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                                                unsigned long addr)
 {
        struct mempolicy *pol = __get_vma_policy(vma, addr);
@@ -2011,7 +2011,6 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  * @vma: Pointer to VMA or NULL if not available.
  * @addr: Virtual Address of the allocation. Must be inside the VMA.
  * @node: Which node to prefer for allocation (modulo policy).
- * @hugepage: for hugepages try only the preferred node if possible
  *
  * This function allocates a page from the kernel page pool and applies
  * a NUMA policy associated with the VMA or the current process.
@@ -2022,7 +2021,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-               unsigned long addr, int node, bool hugepage)
+               unsigned long addr, int node)
 {
        struct mempolicy *pol;
        struct page *page;
@@ -2040,60 +2039,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
                goto out;
        }
 
-       if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
-               int hpage_node = node;
-
-               /*
-                * For hugepage allocation and non-interleave policy which
-                * allows the current node (or other explicitly preferred
-                * node) we only try to allocate from the current/preferred
-                * node and don't fall back to other nodes, as the cost of
-                * remote accesses would likely offset THP benefits.
-                *
-                * If the policy is interleave, or does not allow the current
-                * node in its nodemask, we allocate the standard way.
-                */
-               if (pol->mode == MPOL_PREFERRED &&
-                                               !(pol->flags & MPOL_F_LOCAL))
-                       hpage_node = pol->v.preferred_node;
-
-               nmask = policy_nodemask(gfp, pol);
-               if (!nmask || node_isset(hpage_node, *nmask)) {
-                       mpol_cond_put(pol);
-                       /*
-                        * We cannot invoke reclaim if __GFP_THISNODE
-                        * is set. Invoking reclaim with
-                        * __GFP_THISNODE set, would cause THP
-                        * allocations to trigger heavy swapping
-                        * despite there may be tons of free memory
-                        * (including potentially plenty of THP
-                        * already available in the buddy) on all the
-                        * other NUMA nodes.
-                        *
-                        * At most we could invoke compaction when
-                        * __GFP_THISNODE is set (but we would need to
-                        * refrain from invoking reclaim even if
-                        * compaction returned COMPACT_SKIPPED because
-                        * there wasn't not enough memory to succeed
-                        * compaction). For now just avoid
-                        * __GFP_THISNODE instead of limiting the
-                        * allocation path to a strict and single
-                        * compaction invocation.
-                        *
-                        * Supposedly if direct reclaim was enabled by
-                        * the caller, the app prefers THP regardless
-                        * of the node it comes from so this would be
-                        * more desiderable behavior than only
-                        * providing THP originated from the local
-                        * node in such case.
-                        */
-                       if (!(gfp & __GFP_DIRECT_RECLAIM))
-                               gfp |= __GFP_THISNODE;
-                       page = __alloc_pages_node(hpage_node, gfp, order);
-                       goto out;
-               }
-       }
-
        nmask = policy_nodemask(gfp, pol);
        preferred_nid = policy_node(gfp, pol, node);
        page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
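
Note: with the THP special case removed, every alloc_pages_vma() caller goes through the same policy-driven path (policy_nodemask() / policy_node() / __alloc_pages_nodemask()). For a huge page fault the end-to-end flow is now roughly as follows (names as in the mm/huge_memory.c hunks above):

        gfp = alloc_hugepage_direct_gfpmask(vma, haddr);   /* may carry __GFP_THISNODE */
        page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());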
diff --git a/mm/shmem.c b/mm/shmem.c
index 56bf122e0bb4..ea26d7a0342d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1435,7 +1435,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
 
        shmem_pseudo_vma_init(&pvma, info, hindex);
        page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
-                       HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+                       HPAGE_PMD_ORDER, &pvma, 0, numa_node_id());
        shmem_pseudo_vma_destroy(&pvma);
        if (page)
                prep_transhuge_page(page);