author     Michal Hocko <mhocko@suse.com>                  2018-11-02 18:48:31 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2018-11-03 13:09:37 -0400
commit     89c83fb539f95491be80cdd5158e6f0ce329e317 (patch)
tree       e6234f49d51e065ddefec34f677b220a68150139
parent     6194ae4242dec0c9d604bc05df83aa9260a899e4 (diff)
mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask
THP allocation mode is quite complex and it depends on the defrag mode. This
complexity is currently hidden, for the most part, in
alloc_hugepage_direct_gfpmask. The NUMA special casing (namely __GFP_THISNODE)
is however independent and currently lives in alloc_pages_vma. This adds an
unnecessary branch to all vma based page allocation requests and makes the
code needlessly more complex. Not to mention that e.g. shmem THP used to do
the node reclaiming unconditionally, regardless of the defrag mode, until
recently. That was not only unexpected behavior, it was also hardly a good
default, and I strongly suspect it was just a side effect of the code sharing
rather than a deliberate decision, which suggests that such a layering is
wrong.

Get rid of the THP special casing in alloc_pages_vma and move the logic to
alloc_hugepage_direct_gfpmask. To preserve the current logic, __GFP_THISNODE
is applied to the resulting gfp mask only when direct reclaim is not requested
and when there is no explicit numa binding.

Please note that there is also a slight difference wrt MPOL_BIND now. The
previous code would avoid using __GFP_THISNODE if the local node was outside
of policy_nodemask(). After this patch, __GFP_THISNODE is avoided for all
MPOL_BIND policies. In other words, if the local node is actually allowed by
the bind policy's nodemask, __GFP_THISNODE used to be added but now it is not.
From the behavior POV this is still correct because the policy nodemask is
used either way.

Link: http://lkml.kernel.org/r/20180925120326.24392-3-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Stefan Priebe - Profihost AG <s.priebe@profihost.ag>
Cc: Zi Yan <zi.yan@cs.rutgers.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
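For illustration only, the consolidated decision can be sketched as a small
standalone C program. This is not kernel code: thp_gfp, pol_is_bind, the enum
and the flag values are made-up stand-ins, and the GFP_TRANSHUGE /
GFP_TRANSHUGE_LIGHT base bits as well as __GFP_NORETRY are left out for
brevity. It only models the rule "add __GFP_THISNODE unless direct reclaim is
requested or the policy is MPOL_BIND".

#include <stdbool.h>
#include <stdio.h>

/* Stand-in bits; the real kernel flags have different values and members. */
#define GFP_DIRECT_RECLAIM	(1u << 0)	/* ~ __GFP_DIRECT_RECLAIM */
#define GFP_KSWAPD_RECLAIM	(1u << 1)	/* ~ __GFP_KSWAPD_RECLAIM */
#define GFP_THISNODE		(1u << 2)	/* ~ __GFP_THISNODE */

enum defrag_mode {
	DEFRAG_ALWAYS,		/* always stall for a THP */
	DEFRAG_DEFER,		/* wake kswapd, never stall */
	DEFRAG_DEFER_MADVISE,	/* stall only for madvised VMAs, else defer */
	DEFRAG_MADVISE,		/* stall only for madvised VMAs */
	DEFRAG_NEVER,		/* never stall */
};

/*
 * pol_is_bind models "the VMA policy is MPOL_BIND"; the kernel derives this
 * from get_vma_policy(vma, addr) inside alloc_hugepage_direct_gfpmask().
 */
static unsigned int thp_gfp(enum defrag_mode mode, bool vma_madvised,
			    bool pol_is_bind)
{
	unsigned int this_node = pol_is_bind ? 0 : GFP_THISNODE;

	switch (mode) {
	case DEFRAG_ALWAYS:
		return GFP_DIRECT_RECLAIM;	/* stalls, so no __GFP_THISNODE */
	case DEFRAG_DEFER:
		return GFP_KSWAPD_RECLAIM | this_node;
	case DEFRAG_DEFER_MADVISE:
		return vma_madvised ? GFP_DIRECT_RECLAIM
				    : GFP_KSWAPD_RECLAIM | this_node;
	case DEFRAG_MADVISE:
		return vma_madvised ? GFP_DIRECT_RECLAIM : this_node;
	case DEFRAG_NEVER:
	default:
		return this_node;
	}
}

int main(void)
{
	/* defrag=madvise, no madvise hint, no MPOL_BIND: optimistic local-node try */
	printf("gfp mask = %#x\n", thp_gfp(DEFRAG_MADVISE, false, false));
	return 0;
}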
-rw-r--r--  include/linux/gfp.h        | 12
-rw-r--r--  include/linux/mempolicy.h  |  2
-rw-r--r--  mm/huge_memory.c           | 38
-rw-r--r--  mm/mempolicy.c             | 63
-rw-r--r--  mm/shmem.c                 |  2
5 files changed, 40 insertions(+), 77 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 24bcc5eec6b4..76f8db0b0e71 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -510,22 +510,18 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
 }
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
 			struct vm_area_struct *vma, unsigned long addr,
-			int node, bool hugepage);
-#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
-	alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
+			int node);
 #else
 #define alloc_pages(gfp_mask, order) \
 		alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
-	alloc_pages(gfp_mask, order)
-#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
+#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\
 	alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 #define alloc_page_vma(gfp_mask, vma, addr)			\
-	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
+	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
 #define alloc_page_vma_node(gfp_mask, vma, addr, node)		\
-	alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
+	alloc_pages_vma(gfp_mask, 0, vma, addr, node)
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 5228c62af416..bac395f1d00a 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -139,6 +139,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 struct mempolicy *get_task_policy(struct task_struct *p);
 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
 		unsigned long addr);
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+		unsigned long addr);
 bool vma_policy_mof(struct vm_area_struct *vma);
 
 extern void numa_default_policy(void);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4e4ef8fa479d..55478ab3c83b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -629,21 +629,40 @@ release:
  *	    available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
 {
 	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+	gfp_t this_node = 0;
+
+#ifdef CONFIG_NUMA
+	struct mempolicy *pol;
+	/*
+	 * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
+	 * specified, to express a general desire to stay on the current
+	 * node for optimistic allocation attempts. If the defrag mode
+	 * and/or madvise hint requires the direct reclaim then we prefer
+	 * to fallback to other node rather than node reclaim because that
+	 * can lead to excessive reclaim even though there is free memory
+	 * on other nodes. We expect that NUMA preferences are specified
+	 * by memory policies.
+	 */
+	pol = get_vma_policy(vma, addr);
+	if (pol->mode != MPOL_BIND)
+		this_node = __GFP_THISNODE;
+	mpol_cond_put(pol);
+#endif
 
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
 		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
 		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-							     __GFP_KSWAPD_RECLAIM);
+							     __GFP_KSWAPD_RECLAIM | this_node);
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
 		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-							     0);
-	return GFP_TRANSHUGE_LIGHT;
+							     this_node);
+	return GFP_TRANSHUGE_LIGHT | this_node;
 }
 
 /* Caller must hold page table lock. */
@@ -715,8 +734,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 		pte_free(vma->vm_mm, pgtable);
 		return ret;
 	}
-	gfp = alloc_hugepage_direct_gfpmask(vma);
-	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+	gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
+	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
 	if (unlikely(!page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -1286,8 +1305,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 alloc:
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow()) {
-		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
-		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
+		huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
+		new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
+				haddr, numa_node_id());
 	} else
 		new_page = NULL;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 58fb833fce0c..5837a067124d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1116,8 +1116,8 @@ static struct page *new_page(struct page *page, unsigned long start)
 	} else if (PageTransHuge(page)) {
 		struct page *thp;
 
-		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
-					 HPAGE_PMD_ORDER);
+		thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma,
+				address, numa_node_id());
 		if (!thp)
 			return NULL;
 		prep_transhuge_page(thp);
@@ -1662,7 +1662,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
  * freeing by another task. It is the caller's responsibility to free the
  * extra reference for shared policies.
  */
-static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
 						unsigned long addr)
 {
 	struct mempolicy *pol = __get_vma_policy(vma, addr);
@@ -2011,7 +2011,6 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  *	@vma:  Pointer to VMA or NULL if not available.
  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
  *	@node: Which node to prefer for allocation (modulo policy).
- *	@hugepage: for hugepages try only the preferred node if possible
  *
  *	This function allocates a page from the kernel page pool and applies
  *	a NUMA policy associated with the VMA or the current process.
@@ -2022,7 +2021,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr, int node, bool hugepage)
+		unsigned long addr, int node)
 {
 	struct mempolicy *pol;
 	struct page *page;
@@ -2040,60 +2039,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		goto out;
 	}
 
-	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
-		int hpage_node = node;
-
-		/*
-		 * For hugepage allocation and non-interleave policy which
-		 * allows the current node (or other explicitly preferred
-		 * node) we only try to allocate from the current/preferred
-		 * node and don't fall back to other nodes, as the cost of
-		 * remote accesses would likely offset THP benefits.
-		 *
-		 * If the policy is interleave, or does not allow the current
-		 * node in its nodemask, we allocate the standard way.
-		 */
-		if (pol->mode == MPOL_PREFERRED &&
-						!(pol->flags & MPOL_F_LOCAL))
-			hpage_node = pol->v.preferred_node;
-
-		nmask = policy_nodemask(gfp, pol);
-		if (!nmask || node_isset(hpage_node, *nmask)) {
-			mpol_cond_put(pol);
-			/*
-			 * We cannot invoke reclaim if __GFP_THISNODE
-			 * is set. Invoking reclaim with
-			 * __GFP_THISNODE set, would cause THP
-			 * allocations to trigger heavy swapping
-			 * despite there may be tons of free memory
-			 * (including potentially plenty of THP
-			 * already available in the buddy) on all the
-			 * other NUMA nodes.
-			 *
-			 * At most we could invoke compaction when
-			 * __GFP_THISNODE is set (but we would need to
-			 * refrain from invoking reclaim even if
-			 * compaction returned COMPACT_SKIPPED because
-			 * there wasn't not enough memory to succeed
-			 * compaction). For now just avoid
-			 * __GFP_THISNODE instead of limiting the
-			 * allocation path to a strict and single
-			 * compaction invocation.
-			 *
-			 * Supposedly if direct reclaim was enabled by
-			 * the caller, the app prefers THP regardless
-			 * of the node it comes from so this would be
-			 * more desiderable behavior than only
-			 * providing THP originated from the local
-			 * node in such case.
-			 */
-			if (!(gfp & __GFP_DIRECT_RECLAIM))
-				gfp |= __GFP_THISNODE;
-			page = __alloc_pages_node(hpage_node, gfp, order);
-			goto out;
-		}
-	}
-
 	nmask = policy_nodemask(gfp, pol);
 	preferred_nid = policy_node(gfp, pol, node);
 	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
diff --git a/mm/shmem.c b/mm/shmem.c
index 56bf122e0bb4..ea26d7a0342d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1435,7 +1435,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
 
 	shmem_pseudo_vma_init(&pvma, info, hindex);
 	page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
-			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id());
 	shmem_pseudo_vma_destroy(&pvma);
 	if (page)
 		prep_transhuge_page(page);