 mm/huge_memory.c | 113 +++++++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 88 insertions(+), 25 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9539d6654bb9..d89220cb1d9f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -12,12 +12,14 @@
 #include <linux/mmu_notifier.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include <linux/shrinker.h>
 #include <linux/mm_inline.h>
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
+
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
@@ -47,7 +49,6 @@ static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
 /* during fragmentation poll the hugepage allocator once every minute */
 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
 static struct task_struct *khugepaged_thread __read_mostly;
-static unsigned long huge_zero_pfn __read_mostly;
 static DEFINE_MUTEX(khugepaged_mutex);
 static DEFINE_SPINLOCK(khugepaged_mm_lock);
 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
@@ -160,31 +161,74 @@ static int start_khugepaged(void)
         return err;
 }
 
-static int init_huge_zero_pfn(void)
+static atomic_t huge_zero_refcount;
+static unsigned long huge_zero_pfn __read_mostly;
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
 {
-        struct page *hpage;
-        unsigned long pfn;
+        unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+        return zero_pfn && pfn == zero_pfn;
+}
 
-        hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+        return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
+static unsigned long get_huge_zero_page(void)
+{
+        struct page *zero_page;
+retry:
+        if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+                return ACCESS_ONCE(huge_zero_pfn);
+
+        zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
                         HPAGE_PMD_ORDER);
-        if (!hpage)
-                return -ENOMEM;
-        pfn = page_to_pfn(hpage);
-        if (cmpxchg(&huge_zero_pfn, 0, pfn))
-                __free_page(hpage);
-        return 0;
+        if (!zero_page)
+                return 0;
+        preempt_disable();
+        if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+                preempt_enable();
+                __free_page(zero_page);
+                goto retry;
+        }
+
+        /* We take additional reference here. It will be put back by shrinker */
+        atomic_set(&huge_zero_refcount, 2);
+        preempt_enable();
+        return ACCESS_ONCE(huge_zero_pfn);
 }
 
-static inline bool is_huge_zero_pfn(unsigned long pfn)
+static void put_huge_zero_page(void)
 {
-        return huge_zero_pfn && pfn == huge_zero_pfn;
+        /*
+         * Counter should never go to zero here. Only shrinker can put
+         * last reference.
+         */
+        BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
-static inline bool is_huge_zero_pmd(pmd_t pmd)
+static int shrink_huge_zero_page(struct shrinker *shrink,
+                struct shrink_control *sc)
 {
-        return is_huge_zero_pfn(pmd_pfn(pmd));
+        if (!sc->nr_to_scan)
+                /* we can free zero page only if last reference remains */
+                return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+
+        if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+                unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+                BUG_ON(zero_pfn == 0);
+                __free_page(__pfn_to_page(zero_pfn));
+        }
+
+        return 0;
 }
 
+static struct shrinker huge_zero_page_shrinker = {
+        .shrink = shrink_huge_zero_page,
+        .seeks = DEFAULT_SEEKS,
+};
+
 #ifdef CONFIG_SYSFS
 
 static ssize_t double_flag_show(struct kobject *kobj,
@@ -576,6 +620,8 @@ static int __init hugepage_init(void)
                 goto out;
         }
 
+        register_shrinker(&huge_zero_page_shrinker);
+
         /*
          * By default disable transparent hugepages on smaller systems,
          * where the extra memory used could hurt more than TLB overhead
@@ -705,10 +751,11 @@ static inline struct page *alloc_hugepage(int defrag)
 #endif
 
 static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
-                struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd)
+                struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+                unsigned long zero_pfn)
 {
         pmd_t entry;
-        entry = pfn_pmd(huge_zero_pfn, vma->vm_page_prot);
+        entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
         entry = pmd_wrprotect(entry);
         entry = pmd_mkhuge(entry);
         set_pmd_at(mm, haddr, pmd, entry);
@@ -731,15 +778,19 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                         return VM_FAULT_OOM;
         if (!(flags & FAULT_FLAG_WRITE)) {
                 pgtable_t pgtable;
-                if (unlikely(!huge_zero_pfn && init_huge_zero_pfn())) {
-                        count_vm_event(THP_FAULT_FALLBACK);
-                        goto out;
-                }
+                unsigned long zero_pfn;
                 pgtable = pte_alloc_one(mm, haddr);
                 if (unlikely(!pgtable))
                         return VM_FAULT_OOM;
+                zero_pfn = get_huge_zero_page();
+                if (unlikely(!zero_pfn)) {
+                        pte_free(mm, pgtable);
+                        count_vm_event(THP_FAULT_FALLBACK);
+                        goto out;
+                }
                 spin_lock(&mm->page_table_lock);
-                set_huge_zero_page(pgtable, mm, vma, haddr, pmd);
+                set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+                                zero_pfn);
                 spin_unlock(&mm->page_table_lock);
                 return 0;
         }
@@ -813,7 +864,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
          * a page table.
          */
         if (is_huge_zero_pmd(pmd)) {
-                set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd);
+                unsigned long zero_pfn;
+                /*
+                 * get_huge_zero_page() will never allocate a new page here,
+                 * since we already have a zero page to copy. It just takes a
+                 * reference.
+                 */
+                zero_pfn = get_huge_zero_page();
+                set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+                                zero_pfn);
                 ret = 0;
                 goto out_unlock;
         }
@@ -923,6 +982,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
         smp_wmb(); /* make pte visible before pmd */
         pmd_populate(mm, pmd, pgtable);
         spin_unlock(&mm->page_table_lock);
+        put_huge_zero_page();
         inc_mm_counter(mm, MM_ANONPAGES);
 
         mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1123,9 +1183,10 @@ alloc:
                 page_add_new_anon_rmap(new_page, vma, haddr);
                 set_pmd_at(mm, haddr, pmd, entry);
                 update_mmu_cache_pmd(vma, address, pmd);
-                if (is_huge_zero_pmd(orig_pmd))
+                if (is_huge_zero_pmd(orig_pmd)) {
                         add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
-                else {
+                        put_huge_zero_page();
+                } else {
                         VM_BUG_ON(!PageHead(page));
                         page_remove_rmap(page);
                         put_page(page);
@@ -1202,6 +1263,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 if (is_huge_zero_pmd(orig_pmd)) {
                         tlb->mm->nr_ptes--;
                         spin_unlock(&tlb->mm->page_table_lock);
+                        put_huge_zero_page();
                 } else {
                         page = pmd_page(orig_pmd);
                         page_remove_rmap(page);
@@ -2511,6 +2573,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
         }
         smp_wmb(); /* make pte visible before pmd */
         pmd_populate(mm, pmd, pgtable);
+        put_huge_zero_page();
 }
 
 void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
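
The new code above amounts to a refcounted, lazily allocated singleton: get_huge_zero_page() publishes the page with cmpxchg() and starts the counter at 2 (one reference for the caller, one parked for the shrinker), put_huge_zero_page() must never drop the last reference, and shrink_huge_zero_page() frees the page only once the parked reference is the last one left. Below is a minimal user-space sketch of that lifecycle, assuming GCC/Clang __atomic builtins in place of atomic_t/cmpxchg; the helper names (get_zero_buf, put_zero_buf, shrink_zero_buf) are illustrative only and are not kernel APIs, and the kernel's preempt_disable() window between publishing the pfn and setting the counter is omitted.

/* User-space sketch only; not kernel code. */
#include <stdio.h>
#include <stdlib.h>

#define BUF_SIZE 4096

static int refcount;            /* 0 means "no buffer allocated" */
static void *zero_buf;          /* lazily allocated shared zero buffer */

/* Fast path helper: bump the count only if it is already non-zero. */
static int inc_not_zero(int *cnt)
{
        int old = __atomic_load_n(cnt, __ATOMIC_RELAXED);

        while (old != 0) {
                if (__atomic_compare_exchange_n(cnt, &old, old + 1, 0,
                                __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
                        return 1;
        }
        return 0;
}

static void *get_zero_buf(void)
{
        void *buf, *expected;

retry:
        if (inc_not_zero(&refcount))
                return __atomic_load_n(&zero_buf, __ATOMIC_ACQUIRE);

        buf = calloc(1, BUF_SIZE);
        if (!buf)
                return NULL;

        expected = NULL;
        if (!__atomic_compare_exchange_n(&zero_buf, &expected, buf, 0,
                        __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
                /* Lost the allocation race; free ours and retry the fast path. */
                free(buf);
                goto retry;
        }

        /* One reference for the caller, one parked for the "shrinker". */
        __atomic_store_n(&refcount, 2, __ATOMIC_RELEASE);
        return buf;
}

static void put_zero_buf(void)
{
        /* Users never drop the last reference; only the shrinker may. */
        if (__atomic_sub_fetch(&refcount, 1, __ATOMIC_RELEASE) == 0)
                abort();        /* mirrors the BUG_ON() in the patch */
}

/* "Memory pressure" path: free the buffer iff only the parked reference remains. */
static void shrink_zero_buf(void)
{
        int expected = 1;

        if (__atomic_compare_exchange_n(&refcount, &expected, 0, 0,
                        __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
                void *buf = __atomic_exchange_n(&zero_buf, NULL, __ATOMIC_ACQ_REL);
                free(buf);
        }
}

int main(void)
{
        void *a = get_zero_buf();       /* allocates; refcount becomes 2 */
        void *b = get_zero_buf();       /* fast path; refcount becomes 3 */

        printf("same buffer: %s\n", a == b ? "yes" : "no");
        put_zero_buf();                 /* refcount 2 */
        put_zero_buf();                 /* refcount 1: only the parked reference */
        shrink_zero_buf();              /* last reference dropped, buffer freed */
        return 0;
}

Starting the counter at 2 is what lets ordinary users pair get/put freely while guaranteeing the buffer can only disappear through the shrinker path under memory pressure.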