author     Kirill A. Shutemov <kirill.shutemov@linux.intel.com>    2012-12-12 16:51:06 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>          2012-12-12 20:38:31 -0500
commit     97ae17497e996ff09bf97b6db3b33f7fd4029092
tree       688ef847ee8114475d2da141a2ab054c5f13f52a /mm/huge_memory.c
parent     78ca0e679203bbf74f8febd9725a1c8dd083d073
thp: implement refcounting for huge zero page
H. Peter Anvin doesn't like a huge zero page that sticks in memory forever after the first allocation. Here's an implementation of lockless refcounting for the huge zero page.

We have two basic primitives: {get,put}_huge_zero_page(). They manipulate a reference counter. If the counter is 0, get_huge_zero_page() allocates a new huge page and takes two references: one for the caller and one for the shrinker. We free the page only from the shrinker callback, and only if the counter is 1 (i.e. only the shrinker holds a reference). put_huge_zero_page() merely decrements the counter; the counter never reaches zero there, since the shrinker holds its own reference. Freeing the huge zero page from the shrinker callback helps avoid frequent allocate/free cycles.

Refcounting has a cost. On a 4-socket machine I observe a ~1% slowdown on parallel (40-process) read page faulting compared to lazy huge page allocation. I think that is reasonable for a synthetic benchmark.

[lliubbo@gmail.com: fix mismerge]
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@linux.intel.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Bob Liu <lliubbo@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
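For readers who want to see the scheme in isolation, here is a minimal userspace C sketch of the same lockless refcounting pattern. It is not the kernel code from the diff below: C11 atomics stand in for the kernel's atomic_t and cmpxchg(), calloc() stands in for alloc_pages(__GFP_ZERO), and get_zero_page()/put_zero_page()/shrink_zero_page() are illustrative stand-ins for get_huge_zero_page()/put_huge_zero_page()/shrink_huge_zero_page().

/* Sketch only: userspace analogue of the huge zero page refcounting. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096

static atomic_uintptr_t zero_page;	/* 0 means "not allocated yet" */
static atomic_int refcount;		/* reclaim callback owns one extra ref */

/* Take a reference; allocate the shared zeroed page on first use. */
static void *get_zero_page(void)
{
	void *page;
	uintptr_t expected;
	int old;

retry:
	/* Lockless "inc if not zero": only bump an already-live counter. */
	old = atomic_load(&refcount);
	while (old != 0) {
		if (atomic_compare_exchange_weak(&refcount, &old, old + 1))
			return (void *)atomic_load(&zero_page);
	}

	page = calloc(1, PAGE_SIZE);	/* stands in for alloc_pages(__GFP_ZERO) */
	if (!page)
		return NULL;
	expected = 0;
	if (!atomic_compare_exchange_strong(&zero_page, &expected,
					    (uintptr_t)page)) {
		free(page);		/* lost the race; someone else installed it */
		goto retry;
	}
	/* Two references: one for the caller, one kept for the reclaimer. */
	atomic_store(&refcount, 2);
	return page;
}

static void put_zero_page(void)
{
	/* Never reaches zero here; only shrink_zero_page() drops the last ref. */
	atomic_fetch_sub(&refcount, 1);
}

/* Called under "memory pressure": free the page iff only our ref remains. */
static void shrink_zero_page(void)
{
	int expected = 1;

	if (atomic_compare_exchange_strong(&refcount, &expected, 0))
		free((void *)atomic_exchange(&zero_page, 0));
}

int main(void)
{
	char *p = get_zero_page();

	if (!p)
		return 1;
	printf("byte 123 of shared zero page: %d\n", p[123]);
	put_zero_page();	/* back to 1: only the reclaimer's ref remains */
	shrink_zero_page();	/* now the page really gets freed */
	return 0;
}

As in the patch, the allocator leaves the count at 2 (caller plus reclaim callback), put() only decrements, and the page is actually freed only when reclaim finds the count at 1.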
Diffstat (limited to 'mm/huge_memory.c')
 mm/huge_memory.c | 113
 1 file changed, 88 insertions(+), 25 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9539d6654bb9..d89220cb1d9f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -12,12 +12,14 @@
 #include <linux/mmu_notifier.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include <linux/shrinker.h>
 #include <linux/mm_inline.h>
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
+
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
@@ -47,7 +49,6 @@ static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
 /* during fragmentation poll the hugepage allocator once every minute */
 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
 static struct task_struct *khugepaged_thread __read_mostly;
-static unsigned long huge_zero_pfn __read_mostly;
 static DEFINE_MUTEX(khugepaged_mutex);
 static DEFINE_SPINLOCK(khugepaged_mm_lock);
 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
@@ -160,31 +161,74 @@ static int start_khugepaged(void)
 	return err;
 }
 
-static int init_huge_zero_pfn(void)
+static atomic_t huge_zero_refcount;
+static unsigned long huge_zero_pfn __read_mostly;
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
 {
-	struct page *hpage;
-	unsigned long pfn;
+	unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+	return zero_pfn && pfn == zero_pfn;
+}
 
-	hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+	return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
+static unsigned long get_huge_zero_page(void)
+{
+	struct page *zero_page;
+retry:
+	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+		return ACCESS_ONCE(huge_zero_pfn);
+
+	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
 			HPAGE_PMD_ORDER);
-	if (!hpage)
-		return -ENOMEM;
-	pfn = page_to_pfn(hpage);
-	if (cmpxchg(&huge_zero_pfn, 0, pfn))
-		__free_page(hpage);
-	return 0;
+	if (!zero_page)
+		return 0;
+	preempt_disable();
+	if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+		preempt_enable();
+		__free_page(zero_page);
+		goto retry;
+	}
+
+	/* We take additional reference here. It will be put back by shrinker */
+	atomic_set(&huge_zero_refcount, 2);
+	preempt_enable();
+	return ACCESS_ONCE(huge_zero_pfn);
 }
 
-static inline bool is_huge_zero_pfn(unsigned long pfn)
+static void put_huge_zero_page(void)
 {
-	return huge_zero_pfn && pfn == huge_zero_pfn;
+	/*
+	 * Counter should never go to zero here. Only shrinker can put
+	 * last reference.
+	 */
+	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
-static inline bool is_huge_zero_pmd(pmd_t pmd)
+static int shrink_huge_zero_page(struct shrinker *shrink,
+		struct shrink_control *sc)
 {
-	return is_huge_zero_pfn(pmd_pfn(pmd));
+	if (!sc->nr_to_scan)
+		/* we can free zero page only if last reference remains */
+		return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+
+	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+		unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+		BUG_ON(zero_pfn == 0);
+		__free_page(__pfn_to_page(zero_pfn));
+	}
+
+	return 0;
 }
 
+static struct shrinker huge_zero_page_shrinker = {
+	.shrink = shrink_huge_zero_page,
+	.seeks = DEFAULT_SEEKS,
+};
+
 #ifdef CONFIG_SYSFS
 
 static ssize_t double_flag_show(struct kobject *kobj,
@@ -576,6 +620,8 @@ static int __init hugepage_init(void)
 		goto out;
 	}
 
+	register_shrinker(&huge_zero_page_shrinker);
+
 	/*
 	 * By default disable transparent hugepages on smaller systems,
 	 * where the extra memory used could hurt more than TLB overhead
@@ -705,10 +751,11 @@ static inline struct page *alloc_hugepage(int defrag)
 #endif
 
 static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
-		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd)
+		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+		unsigned long zero_pfn)
 {
 	pmd_t entry;
-	entry = pfn_pmd(huge_zero_pfn, vma->vm_page_prot);
+	entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
 	entry = pmd_wrprotect(entry);
 	entry = pmd_mkhuge(entry);
 	set_pmd_at(mm, haddr, pmd, entry);
@@ -731,15 +778,19 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			return VM_FAULT_OOM;
 		if (!(flags & FAULT_FLAG_WRITE)) {
 			pgtable_t pgtable;
-			if (unlikely(!huge_zero_pfn && init_huge_zero_pfn())) {
-				count_vm_event(THP_FAULT_FALLBACK);
-				goto out;
-			}
+			unsigned long zero_pfn;
 			pgtable = pte_alloc_one(mm, haddr);
 			if (unlikely(!pgtable))
 				return VM_FAULT_OOM;
+			zero_pfn = get_huge_zero_page();
+			if (unlikely(!zero_pfn)) {
+				pte_free(mm, pgtable);
+				count_vm_event(THP_FAULT_FALLBACK);
+				goto out;
+			}
 			spin_lock(&mm->page_table_lock);
-			set_huge_zero_page(pgtable, mm, vma, haddr, pmd);
+			set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+					zero_pfn);
 			spin_unlock(&mm->page_table_lock);
 			return 0;
 		}
@@ -813,7 +864,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * a page table.
 	 */
 	if (is_huge_zero_pmd(pmd)) {
-		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd);
+		unsigned long zero_pfn;
+		/*
+		 * get_huge_zero_page() will never allocate a new page here,
+		 * since we already have a zero page to copy. It just takes a
+		 * reference.
+		 */
+		zero_pfn = get_huge_zero_page();
+		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+				zero_pfn);
 		ret = 0;
 		goto out_unlock;
 	}
@@ -923,6 +982,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
 	smp_wmb(); /* make pte visible before pmd */
 	pmd_populate(mm, pmd, pgtable);
 	spin_unlock(&mm->page_table_lock);
+	put_huge_zero_page();
 	inc_mm_counter(mm, MM_ANONPAGES);
 
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1123,9 +1183,10 @@ alloc:
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
 		update_mmu_cache_pmd(vma, address, pmd);
-		if (is_huge_zero_pmd(orig_pmd))
+		if (is_huge_zero_pmd(orig_pmd)) {
 			add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
-		else {
+			put_huge_zero_page();
+		} else {
 			VM_BUG_ON(!PageHead(page));
 			page_remove_rmap(page);
 			put_page(page);
@@ -1202,6 +1263,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		if (is_huge_zero_pmd(orig_pmd)) {
 			tlb->mm->nr_ptes--;
 			spin_unlock(&tlb->mm->page_table_lock);
+			put_huge_zero_page();
 		} else {
 			page = pmd_page(orig_pmd);
 			page_remove_rmap(page);
@@ -2511,6 +2573,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	}
 	smp_wmb(); /* make pte visible before pmd */
 	pmd_populate(mm, pmd, pgtable);
+	put_huge_zero_page();
 }
 
 void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,