From 3b3636924dfe1e80f9bef2c0dc1207e16f3b078a Mon Sep 17 00:00:00 2001
From: Michal Hocko
Date: Wed, 15 Apr 2015 16:13:29 -0700
Subject: mm, memcg: sync allocation and memcg charge gfp flags for THP

memcg currently uses hardcoded GFP_TRANSHUGE gfp flags for all THP
charges.  THP allocations, however, might be using different flags
depending on /sys/kernel/mm/transparent_hugepage/{,khugepaged/}defrag
and the current allocation context.

The primary difference is that defrag configured to "madvise" will clear
__GFP_WAIT from the core gfp mask to make the allocation lighter for all
mappings which are not backed by VM_HUGEPAGE vmas.  If the memcg charge
path ignores this fact, we get a light allocation but a potential memcg
reclaim would defeat the whole point of the configuration.

Fix the mismatch by providing the same gfp mask used for the allocation
to the charge functions.  This is quite easy for all paths except for
the khugepaged kernel thread with !CONFIG_NUMA, which does a
pre-allocation long before the allocated page is used in
collapse_huge_page via khugepaged_alloc_page.  To avoid cluttering the
whole code path from khugepaged_do_scan, we simply return the current
flags as per the khugepaged_defrag() value, which might have changed
since the preallocation.  If somebody changed the value of the knob we
would charge differently, but this shouldn't happen often and it is
definitely not critical because it would only lead to a reduced success
rate of one-off THP promotion.

[akpm@linux-foundation.org: fix weird code layout while we're there]
[rientjes@google.com: clean up around alloc_hugepage_gfpmask()]
Signed-off-by: Michal Hocko
Acked-by: Vlastimil Babka
Cc: Johannes Weiner
Acked-by: David Rientjes
Signed-off-by: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/huge_memory.c | 42 ++++++++++++++++++++----------------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3afb5cbe1312..4914e1b29fdb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -708,7 +708,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long haddr, pmd_t *pmd,
-					struct page *page)
+					struct page *page, gfp_t gfp)
 {
 	struct mem_cgroup *memcg;
 	pgtable_t pgtable;
@@ -716,7 +716,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-	if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg))
+	if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
 		return VM_FAULT_OOM;
 
 	pgtable = pte_alloc_one(mm, haddr);
@@ -822,7 +822,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
 	}
-	if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
+	if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
 		put_page(page);
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -1080,6 +1080,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long haddr;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
+	gfp_t huge_gfp;			/* for allocation and charge */
 
 	ptl = pmd_lockptr(mm, pmd);
 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1106,10 +1107,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 alloc:
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow()) {
-		gfp_t gfp;
-
-		gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
-		new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+		huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
 	} else
 		new_page = NULL;
 
@@ -1130,8 +1129,7 @@
 		goto out;
 	}
 
-	if (unlikely(mem_cgroup_try_charge(new_page, mm,
-					   GFP_TRANSHUGE, &memcg))) {
+	if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
 		put_page(new_page);
 		if (page) {
 			split_huge_page(page);
@@ -2323,19 +2321,13 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 	return true;
 }
 
-static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
 		       struct vm_area_struct *vma, unsigned long address,
 		       int node)
 {
-	gfp_t flags;
-
 	VM_BUG_ON_PAGE(*hpage, *hpage);
 
-	/* Only allocate from the target node */
-	flags = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
-		__GFP_THISNODE;
-
 	/*
 	 * Before allocating the hugepage, release the mmap_sem read lock.
 	 * The allocation can take potentially a long time if it involves
@@ -2344,7 +2336,7 @@ static struct page
 	 */
 	up_read(&mm->mmap_sem);
 
-	*hpage = alloc_pages_exact_node(node, flags, HPAGE_PMD_ORDER);
+	*hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
 	if (unlikely(!*hpage)) {
 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
 		*hpage = ERR_PTR(-ENOMEM);
@@ -2397,13 +2389,14 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 	return true;
 }
 
-static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
 		       struct vm_area_struct *vma, unsigned long address,
 		       int node)
 {
 	up_read(&mm->mmap_sem);
 	VM_BUG_ON(!*hpage);
+
 	return *hpage;
 }
 #endif
@@ -2438,16 +2431,21 @@ static void collapse_huge_page(struct mm_struct *mm,
 	struct mem_cgroup *memcg;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
+	gfp_t gfp;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
+	/* Only allocate from the target node */
+	gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
+		__GFP_THISNODE;
+
 	/* release the mmap_sem read lock. */
-	new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
+	new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
 	if (!new_page)
 		return;
 
 	if (unlikely(mem_cgroup_try_charge(new_page, mm,
-					   GFP_TRANSHUGE, &memcg)))
+					   gfp, &memcg)))
 		return;
 
 	/*
--
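[editor's note: a minimal, stand-alone user-space C sketch of the invariant this patch establishes -- compute the gfp mask once, then feed the same mask to both the allocation and the memcg charge, so a "light" (no-wait) allocation is never paired with a heavyweight charge-side reclaim. Every demo_-prefixed name is a hypothetical stand-in, not kernel API:]

#include <stdio.h>
#include <stdbool.h>

#define DEMO_GFP_WAIT 0x1	/* stand-in for __GFP_WAIT: may sleep/reclaim */
#define DEMO_GFP_BASE 0x2

static unsigned int demo_alloc_gfpmask(bool defrag)
{
	/* mirrors the idea of alloc_hugepage_gfpmask(): drop the wait
	 * bit when defrag is not wanted for this mapping */
	return defrag ? (DEMO_GFP_BASE | DEMO_GFP_WAIT) : DEMO_GFP_BASE;
}

static int demo_try_charge(unsigned int gfp)
{
	/* a charge path that honours the caller's mask instead of a
	 * hardcoded heavy one: reclaim only if the allocation could */
	if (gfp & DEMO_GFP_WAIT)
		printf("charge: reclaim allowed\n");
	else
		printf("charge: fail fast, no reclaim\n");
	return 0;
}

int main(void)
{
	/* compute the mask once, then use it for BOTH the allocation
	 * and the charge -- the point of the patch above */
	unsigned int gfp = demo_alloc_gfpmask(false);	/* "madvise" defrag, plain vma */

	demo_try_charge(gfp);
	return 0;
}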
From 4db0c3c2983cc6b7a08a33542af5e14de8a9258c Mon Sep 17 00:00:00 2001
From: Jason Low
Date: Wed, 15 Apr 2015 16:14:08 -0700
Subject: mm: remove rest of ACCESS_ONCE() usages

We converted some of the usages of ACCESS_ONCE to READ_ONCE in the mm/
tree since it doesn't work reliably on non-scalar types.

This patch removes the rest of the usages of ACCESS_ONCE and uses the
new READ_ONCE API for the read accesses.  This makes things cleaner:
one set of APIs instead of separate/multiple sets.

Signed-off-by: Jason Low
Acked-by: Michal Hocko
Acked-by: Davidlohr Bueso
Acked-by: Rik van Riel
Reviewed-by: Christian Borntraeger
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/huge_memory.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4914e1b29fdb..1db93fbda06a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -183,7 +183,7 @@ static struct page *get_huge_zero_page(void)
 	struct page *zero_page;
 retry:
 	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
-		return ACCESS_ONCE(huge_zero_page);
+		return READ_ONCE(huge_zero_page);
 
 	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
 			HPAGE_PMD_ORDER);
@@ -202,7 +202,7 @@ retry:
 	/* We take additional reference here. It will be put back by shrinker */
 	atomic_set(&huge_zero_refcount, 2);
 	preempt_enable();
-	return ACCESS_ONCE(huge_zero_page);
+	return READ_ONCE(huge_zero_page);
 }
 
 static void put_huge_zero_page(void)
--
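[editor's note: a simplified GNU C sketch of what READ_ONCE() does for scalar types. The real kernel macro additionally copes with non-scalar sizes (roughly, by falling back to a byte-wise copy), which is exactly why the cast-only ACCESS_ONCE had to go. DEMO_READ_ONCE is a hypothetical stand-in, not the kernel implementation verbatim:]

#include <stdio.h>

/* the volatile cast forces the compiler to emit exactly one real load;
 * it may neither reuse a cached earlier value nor re-read it later */
#define DEMO_READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static int shared_flag;

int main(void)
{
	shared_flag = 1;

	int v = DEMO_READ_ONCE(shared_flag);

	printf("read %d\n", v);
	return 0;
}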
From 65ebb64f4d2ce8eba4d0ec82d6cf65022e70e4a1 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Wed, 15 Apr 2015 16:14:20 -0700
Subject: thp: handle errors in hugepage_init() properly

We are missing error handling in a few places in hugepage_init().  Let's
fix that.

Signed-off-by: Kirill A. Shutemov
Cc: Andrea Arcangeli
Acked-by: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/huge_memory.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1db93fbda06a..c257006749bb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -67,6 +67,7 @@ static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
 
 static int khugepaged(void *none);
 static int khugepaged_slab_init(void);
+static void khugepaged_slab_exit(void);
 
 #define MM_SLOTS_HASH_BITS 10
 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -634,13 +635,15 @@ static int __init hugepage_init(void)
 
 	err = hugepage_init_sysfs(&hugepage_kobj);
 	if (err)
-		return err;
+		goto err_sysfs;
 
 	err = khugepaged_slab_init();
 	if (err)
-		goto out;
+		goto err_slab;
 
-	register_shrinker(&huge_zero_page_shrinker);
+	err = register_shrinker(&huge_zero_page_shrinker);
+	if (err)
+		goto err_hzp_shrinker;
 
 	/*
 	 * By default disable transparent hugepages on smaller systems,
@@ -650,11 +653,18 @@ static int __init hugepage_init(void)
 	if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
 		transparent_hugepage_flags = 0;
 
-	start_khugepaged();
+	err = start_khugepaged();
+	if (err)
+		goto err_khugepaged;
 
 	return 0;
-out:
+err_khugepaged:
+	unregister_shrinker(&huge_zero_page_shrinker);
+err_hzp_shrinker:
+	khugepaged_slab_exit();
+err_slab:
 	hugepage_exit_sysfs(hugepage_kobj);
+err_sysfs:
 	return err;
 }
 subsys_initcall(hugepage_init);
@@ -1974,6 +1984,11 @@ static int __init khugepaged_slab_init(void)
 	return 0;
 }
 
+static void __init khugepaged_slab_exit(void)
+{
+	kmem_cache_destroy(mm_slot_cache);
+}
+
 static inline struct mm_slot *alloc_mm_slot(void)
 {
 	if (!mm_slot_cache)	/* initialization failed */
--
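[editor's note: the rewritten error path above is the standard kernel goto-unwind idiom: each failure label undoes, in reverse order, exactly the initialization steps that had already succeeded. A self-contained user-space C sketch of the idiom, with every function below a hypothetical stand-in:]

#include <stdio.h>

static int step_a(void) { return 0; }	/* 0 == success, kernel-style */
static int step_b(void) { return 0; }
static int step_c(void) { return -1; }	/* pretend this one fails */
static void undo_b(void) { puts("undo b"); }
static void undo_a(void) { puts("undo a"); }

static int demo_init(void)
{
	int err;

	err = step_a();
	if (err)
		goto err_a;		/* nothing to undo yet */
	err = step_b();
	if (err)
		goto err_b;		/* undo a only */
	err = step_c();
	if (err)
		goto err_c;		/* undo b, then a */
	return 0;

err_c:
	undo_b();
err_b:
	undo_a();
err_a:
	return err;
}

int main(void)
{
	return demo_init() ? 1 : 0;
}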
From ae7efa507dee4813ec4bcdadebeec191e247d0c9 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Wed, 15 Apr 2015 16:14:23 -0700
Subject: thp: do not adjust zone water marks if khugepaged is not started

set_recommended_min_free_kbytes() adjusts zone water marks to be suitable
for khugepaged.  We avoid doing this if khugepaged is disabled, but don't
catch the case when khugepaged fails to start.

Let's address this by checking khugepaged_thread instead of
khugepaged_enabled() in set_recommended_min_free_kbytes().  It's NULL if
the kernel thread is stopped or failed to start.

Signed-off-by: Kirill A. Shutemov
Cc: David Rientjes
Cc: Andrea Arcangeli
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/huge_memory.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c257006749bb..0af19fffe1df 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -110,7 +110,8 @@ static int set_recommended_min_free_kbytes(void)
 	int nr_zones = 0;
 	unsigned long recommended_min;
 
-	if (!khugepaged_enabled())
+	/* khugepaged thread has stopped or failed to start */
+	if (!khugepaged_thread)
 		return 0;
 
 	for_each_populated_zone(zone)
--

From 79553da293d38d63097278de13e28a3b371f43c1 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Wed, 15 Apr 2015 16:14:56 -0700
Subject: thp: cleanup khugepaged startup

A few trivial cleanups:

 - no need to call set_recommended_min_free_kbytes() from
   late_initcall() -- start_khugepaged() calls it;

 - no need to call set_recommended_min_free_kbytes() from
   start_khugepaged() if khugepaged is not started;

 - there isn't much point in running start_khugepaged() if we've just
   set transparent_hugepage_flags to zero;

 - start_khugepaged() is misnamed -- it is also used to stop the thread.

Signed-off-by: Kirill A. Shutemov
Cc: David Rientjes
Cc: Andrea Arcangeli
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/huge_memory.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0af19fffe1df..078832cf3636 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -110,10 +110,6 @@ static int set_recommended_min_free_kbytes(void)
 	int nr_zones = 0;
 	unsigned long recommended_min;
 
-	/* khugepaged thread has stopped or failed to start */
-	if (!khugepaged_thread)
-		return 0;
-
 	for_each_populated_zone(zone)
 		nr_zones++;
 
@@ -145,9 +141,8 @@ static int set_recommended_min_free_kbytes(void)
 	setup_per_zone_wmarks();
 	return 0;
 }
-late_initcall(set_recommended_min_free_kbytes);
 
-static int start_khugepaged(void)
+static int start_stop_khugepaged(void)
 {
 	int err = 0;
 	if (khugepaged_enabled()) {
@@ -158,6 +153,7 @@ static int start_khugepaged(void)
 			pr_err("khugepaged: kthread_run(khugepaged) failed\n");
 			err = PTR_ERR(khugepaged_thread);
 			khugepaged_thread = NULL;
+			goto fail;
 		}
 
 		if (!list_empty(&khugepaged_scan.mm_head))
@@ -168,7 +164,7 @@ static int start_khugepaged(void)
 		kthread_stop(khugepaged_thread);
 		khugepaged_thread = NULL;
 	}
-
+fail:
 	return err;
 }
 
@@ -302,7 +298,7 @@ static ssize_t enabled_store(struct kobject *kobj,
 		int err;
 
 		mutex_lock(&khugepaged_mutex);
-		err = start_khugepaged();
+		err = start_stop_khugepaged();
 		mutex_unlock(&khugepaged_mutex);
 
 		if (err)
@@ -651,10 +647,12 @@ static int __init hugepage_init(void)
 	 * where the extra memory used could hurt more than TLB overhead
 	 * is likely to save.  The admin can still enable it through /sys.
 	 */
-	if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
+	if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
 		transparent_hugepage_flags = 0;
+		return 0;
+	}
 
-	err = start_khugepaged();
+	err = start_stop_khugepaged();
 	if (err)
 		goto err_khugepaged;
 
--
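[editor's note: a user-space C sketch of the start/stop reconciliation pattern that start_stop_khugepaged() implements -- one entry point compares the desired state (enabled?) with the actual state (thread handle non-NULL) and starts or stops accordingly, with the thread handle itself doubling as the "is running" signal, as in the two patches above. All names are hypothetical stand-ins:]

#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>

static void *worker_thread;	/* NULL <=> stopped or never started */
static bool worker_enabled;

static void *demo_kthread_run(void)
{
	return malloc(1);	/* stand-in for kthread_run(); may fail */
}

static int demo_start_stop_worker(void)
{
	int err = 0;

	if (worker_enabled) {
		if (!worker_thread) {
			worker_thread = demo_kthread_run();
			if (!worker_thread) {
				err = -1;
				goto fail;	/* mirrors the new goto fail */
			}
		}
	} else if (worker_thread) {
		free(worker_thread);		/* stand-in for kthread_stop() */
		worker_thread = NULL;
	}
fail:
	return err;
}

int main(void)
{
	worker_enabled = true;
	printf("start: %d\n", demo_start_stop_worker());
	worker_enabled = false;
	printf("stop:  %d\n", demo_start_stop_worker());
	return 0;
}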