-rw-r--r--	mm/huge_memory.c	113
1 file changed, 88 insertions, 25 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9539d6654bb9..d89220cb1d9f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -12,12 +12,14 @@
 #include <linux/mmu_notifier.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include <linux/shrinker.h>
 #include <linux/mm_inline.h>
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
+
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
@@ -47,7 +49,6 @@ static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
 /* during fragmentation poll the hugepage allocator once every minute */
 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
 static struct task_struct *khugepaged_thread __read_mostly;
-static unsigned long huge_zero_pfn __read_mostly;
 static DEFINE_MUTEX(khugepaged_mutex);
 static DEFINE_SPINLOCK(khugepaged_mm_lock);
 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
@@ -160,31 +161,74 @@ static int start_khugepaged(void)
 	return err;
 }
 
-static int init_huge_zero_pfn(void)
+static atomic_t huge_zero_refcount;
+static unsigned long huge_zero_pfn __read_mostly;
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
 {
-	struct page *hpage;
-	unsigned long pfn;
+	unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+	return zero_pfn && pfn == zero_pfn;
+}
 
-	hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+	return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
+static unsigned long get_huge_zero_page(void)
+{
+	struct page *zero_page;
+retry:
+	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+		return ACCESS_ONCE(huge_zero_pfn);
+
+	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
 			HPAGE_PMD_ORDER);
-	if (!hpage)
-		return -ENOMEM;
-	pfn = page_to_pfn(hpage);
-	if (cmpxchg(&huge_zero_pfn, 0, pfn))
-		__free_page(hpage);
-	return 0;
+	if (!zero_page)
+		return 0;
+	preempt_disable();
+	if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+		preempt_enable();
+		__free_page(zero_page);
+		goto retry;
+	}
+
+	/* We take additional reference here. It will be put back by shrinker */
+	atomic_set(&huge_zero_refcount, 2);
+	preempt_enable();
+	return ACCESS_ONCE(huge_zero_pfn);
 }
 
-static inline bool is_huge_zero_pfn(unsigned long pfn)
+static void put_huge_zero_page(void)
 {
-	return huge_zero_pfn && pfn == huge_zero_pfn;
+	/*
+	 * Counter should never go to zero here. Only shrinker can put
+	 * last reference.
+	 */
+	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
-static inline bool is_huge_zero_pmd(pmd_t pmd)
+static int shrink_huge_zero_page(struct shrinker *shrink,
+		struct shrink_control *sc)
 {
-	return is_huge_zero_pfn(pmd_pfn(pmd));
+	if (!sc->nr_to_scan)
+		/* we can free zero page only if last reference remains */
+		return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+
+	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+		unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+		BUG_ON(zero_pfn == 0);
+		__free_page(__pfn_to_page(zero_pfn));
+	}
+
+	return 0;
 }
 
+static struct shrinker huge_zero_page_shrinker = {
+	.shrink = shrink_huge_zero_page,
+	.seeks = DEFAULT_SEEKS,
+};
+
 #ifdef CONFIG_SYSFS
 
 static ssize_t double_flag_show(struct kobject *kobj,
@@ -576,6 +620,8 @@ static int __init hugepage_init(void)
 		goto out;
 	}
 
+	register_shrinker(&huge_zero_page_shrinker);
+
 	/*
 	 * By default disable transparent hugepages on smaller systems,
 	 * where the extra memory used could hurt more than TLB overhead
@@ -705,10 +751,11 @@ static inline struct page *alloc_hugepage(int defrag)
 #endif
 
 static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
-		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd)
+		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+		unsigned long zero_pfn)
 {
 	pmd_t entry;
-	entry = pfn_pmd(huge_zero_pfn, vma->vm_page_prot);
+	entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
 	entry = pmd_wrprotect(entry);
 	entry = pmd_mkhuge(entry);
 	set_pmd_at(mm, haddr, pmd, entry);
@@ -731,15 +778,19 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 	if (!(flags & FAULT_FLAG_WRITE)) {
 		pgtable_t pgtable;
-		if (unlikely(!huge_zero_pfn && init_huge_zero_pfn())) {
-			count_vm_event(THP_FAULT_FALLBACK);
-			goto out;
-		}
+		unsigned long zero_pfn;
 		pgtable = pte_alloc_one(mm, haddr);
 		if (unlikely(!pgtable))
 			return VM_FAULT_OOM;
+		zero_pfn = get_huge_zero_page();
+		if (unlikely(!zero_pfn)) {
+			pte_free(mm, pgtable);
+			count_vm_event(THP_FAULT_FALLBACK);
+			goto out;
+		}
 		spin_lock(&mm->page_table_lock);
-		set_huge_zero_page(pgtable, mm, vma, haddr, pmd);
+		set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+				zero_pfn);
 		spin_unlock(&mm->page_table_lock);
 		return 0;
 	}
@@ -813,7 +864,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * a page table.
 	 */
 	if (is_huge_zero_pmd(pmd)) {
-		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd);
+		unsigned long zero_pfn;
+		/*
+		 * get_huge_zero_page() will never allocate a new page here,
+		 * since we already have a zero page to copy. It just takes a
+		 * reference.
+		 */
+		zero_pfn = get_huge_zero_page();
+		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+				zero_pfn);
 		ret = 0;
 		goto out_unlock;
 	}
@@ -923,6 +982,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
 	smp_wmb(); /* make pte visible before pmd */
 	pmd_populate(mm, pmd, pgtable);
 	spin_unlock(&mm->page_table_lock);
+	put_huge_zero_page();
 	inc_mm_counter(mm, MM_ANONPAGES);
 
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1123,9 +1183,10 @@ alloc:
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
 		update_mmu_cache_pmd(vma, address, pmd);
-		if (is_huge_zero_pmd(orig_pmd))
+		if (is_huge_zero_pmd(orig_pmd)) {
 			add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
-		else {
+			put_huge_zero_page();
+		} else {
 			VM_BUG_ON(!PageHead(page));
 			page_remove_rmap(page);
 			put_page(page);
@@ -1202,6 +1263,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		if (is_huge_zero_pmd(orig_pmd)) {
 			tlb->mm->nr_ptes--;
 			spin_unlock(&tlb->mm->page_table_lock);
+			put_huge_zero_page();
 		} else {
 			page = pmd_page(orig_pmd);
 			page_remove_rmap(page);
@@ -2511,6 +2573,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	}
 	smp_wmb(); /* make pte visible before pmd */
 	pmd_populate(mm, pmd, pgtable);
+	put_huge_zero_page();
 }
 
 void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
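
For readers unfamiliar with the pattern the patch introduces, the sketch below restates its core idea in standalone userspace C: the shared zero page is allocated lazily and installed with a compare-and-swap, callers take references with an "increment unless zero" operation, and a shrinker-style path frees the page only when the extra reference held on its behalf is the last one left. This is an illustrative sketch, not kernel code; the names (get_zero_buf, put_zero_buf, shrink_zero_buf) and the C11 atomics are stand-ins for the kernel primitives in the diff above, and the kernel's preempt_disable() interlock against the shrinker is deliberately omitted.

/* Userspace sketch of the huge-zero-page refcounting scheme (illustrative only). */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

static atomic_long refcount;		/* plays the role of huge_zero_refcount */
static _Atomic(void *) zero_buf;	/* plays the role of huge_zero_pfn */

/* Take a reference, allocating the shared buffer on first use. */
static void *get_zero_buf(void)
{
	void *buf;

	for (;;) {
		/* Fast path: increment the refcount unless it is zero. */
		long old = atomic_load(&refcount);
		while (old != 0) {
			if (atomic_compare_exchange_weak(&refcount, &old, old + 1))
				return atomic_load(&zero_buf);
		}

		/* Slow path: allocate and race to install our buffer. */
		buf = calloc(1, 4096);
		if (!buf)
			return NULL;
		void *expected = NULL;
		if (!atomic_compare_exchange_strong(&zero_buf, &expected, buf)) {
			free(buf);	/* somebody else won the race; retry */
			continue;
		}
		/* One reference for the caller, one extra for the shrinker. */
		atomic_store(&refcount, 2);
		return buf;
	}
}

/* Drop a caller reference; the extra reference keeps the buffer alive. */
static void put_zero_buf(void)
{
	if (atomic_fetch_sub(&refcount, 1) == 1)
		abort();	/* counter must never reach zero here */
}

/* Shrinker-style release: free only if the extra reference is the last one. */
static void shrink_zero_buf(void)
{
	long expected = 1;
	if (atomic_compare_exchange_strong(&refcount, &expected, 0))
		free(atomic_exchange(&zero_buf, NULL));
}

int main(void)
{
	void *a = get_zero_buf();	/* allocates, refcount == 2 */
	void *b = get_zero_buf();	/* fast path, refcount == 3 */
	printf("same buffer: %d\n", a == b);
	put_zero_buf();
	put_zero_buf();			/* refcount back to 1 */
	shrink_zero_buf();		/* last reference: buffer freed */
	return 0;
}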