Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--	mm/huge_memory.c | 359
1 file changed, 333 insertions, 26 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5f902e20e8c0..827d9c813051 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -12,12 +12,14 @@
 #include <linux/mmu_notifier.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include <linux/shrinker.h>
 #include <linux/mm_inline.h>
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
+
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
@@ -37,7 +39,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
 	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
 #endif
 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
-	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
+	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 
 /* default scan 8*512 pte (or vmas) every 30 second */
 static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
@@ -159,6 +162,77 @@ static int start_khugepaged(void)
 	return err;
 }
 
+static atomic_t huge_zero_refcount;
+static unsigned long huge_zero_pfn __read_mostly;
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
+{
+	unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+	return zero_pfn && pfn == zero_pfn;
+}
+
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+	return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
+static unsigned long get_huge_zero_page(void)
+{
+	struct page *zero_page;
+retry:
+	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+		return ACCESS_ONCE(huge_zero_pfn);
+
+	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+			HPAGE_PMD_ORDER);
+	if (!zero_page) {
+		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
+		return 0;
+	}
+	count_vm_event(THP_ZERO_PAGE_ALLOC);
+	preempt_disable();
+	if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+		preempt_enable();
+		__free_page(zero_page);
+		goto retry;
+	}
+
+	/* We take additional reference here. It will be put back by shrinker */
+	atomic_set(&huge_zero_refcount, 2);
+	preempt_enable();
+	return ACCESS_ONCE(huge_zero_pfn);
+}
+
+static void put_huge_zero_page(void)
+{
+	/*
+	 * Counter should never go to zero here. Only shrinker can put
+	 * last reference.
+	 */
+	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
+}
+
+static int shrink_huge_zero_page(struct shrinker *shrink,
+		struct shrink_control *sc)
+{
+	if (!sc->nr_to_scan)
+		/* we can free zero page only if last reference remains */
+		return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+
+	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+		unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+		BUG_ON(zero_pfn == 0);
+		__free_page(__pfn_to_page(zero_pfn));
+	}
+
+	return 0;
+}
+
+static struct shrinker huge_zero_page_shrinker = {
+	.shrink = shrink_huge_zero_page,
+	.seeks = DEFAULT_SEEKS,
+};
+
 #ifdef CONFIG_SYSFS
 
 static ssize_t double_flag_show(struct kobject *kobj,
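The hunk above implements lazy allocation plus shrinker-driven reclaim for the huge zero page: the cached page always carries one extra reference (the counter is set to 2 on allocation), so ordinary users can only move the count between non-zero values, and only the shrinker may take it from 1 to 0 and free the page. For illustration only, outside the patch, here is a stand-alone sketch of that protocol using C11 atomics in place of the kernel's atomic_t/cmpxchg primitives; the names cache_get/cache_put/cache_shrink are made up for this sketch.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

static atomic_int refcount;            /* plays the role of huge_zero_refcount */
static _Atomic(void *) cached_page;    /* plays the role of huge_zero_pfn */

static void *cache_get(void)           /* ~ get_huge_zero_page() */
{
	void *page, *expected;
	int old;
retry:
	/* Fast path: like atomic_inc_not_zero() -- bump only if already live. */
	old = atomic_load(&refcount);
	while (old != 0) {
		if (atomic_compare_exchange_weak(&refcount, &old, old + 1))
			return atomic_load(&cached_page);
	}

	/* Slow path: allocate and try to install our copy. */
	page = calloc(1, 1 << 21);
	if (!page)
		return NULL;
	expected = NULL;
	if (!atomic_compare_exchange_strong(&cached_page, &expected, page)) {
		free(page);            /* lost the race, someone else installed it */
		goto retry;
	}
	/* 2 = one reference for the caller + one held by the cache itself. */
	atomic_store(&refcount, 2);
	return page;
}

static void cache_put(void)            /* ~ put_huge_zero_page() */
{
	/* Users can never drop the last (cache-owned) reference. */
	atomic_fetch_sub(&refcount, 1);
}

static bool cache_shrink(void)         /* ~ shrink_huge_zero_page() */
{
	int one = 1;
	/* Free only when the cache's own reference is the last one left. */
	if (atomic_compare_exchange_strong(&refcount, &one, 0)) {
		free(atomic_exchange(&cached_page, NULL));
		return true;
	}
	return false;
}

The extra cache-owned reference is what makes put_huge_zero_page() cheap: callers never race with freeing, and memory pressure alone decides when the page actually goes away.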
@@ -284,6 +358,20 @@ static ssize_t defrag_store(struct kobject *kobj,
 static struct kobj_attribute defrag_attr =
 	__ATTR(defrag, 0644, defrag_show, defrag_store);
 
+static ssize_t use_zero_page_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return single_flag_show(kobj, attr, buf,
+				TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static ssize_t use_zero_page_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	return single_flag_store(kobj, attr, buf, count,
+				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static struct kobj_attribute use_zero_page_attr =
+	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
 #ifdef CONFIG_DEBUG_VM
 static ssize_t debug_cow_show(struct kobject *kobj,
 				struct kobj_attribute *attr, char *buf)
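The new attribute sits next to enabled and defrag in the transparent_hugepage sysfs group, so with this patch applied the feature can be flipped from userspace; assuming the existing THP sysfs layout, the file is /sys/kernel/mm/transparent_hugepage/use_zero_page and accepts 0 or 1. A minimal sketch (not part of the patch) of reading and clearing the knob:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define KNOB "/sys/kernel/mm/transparent_hugepage/use_zero_page"

int main(void)
{
	char buf[8] = "";
	int fd;

	/* Read the current setting; single_flag_show() prints "0" or "1". */
	fd = open(KNOB, O_RDONLY);
	if (fd < 0 || read(fd, buf, sizeof(buf) - 1) < 0) {
		perror(KNOB);
		return 1;
	}
	close(fd);
	printf("use_zero_page = %s", buf);

	/* Disable the huge zero page (needs root). */
	fd = open(KNOB, O_WRONLY);
	if (fd < 0 || write(fd, "0", 1) != 1) {
		perror("write " KNOB);
		return 1;
	}
	close(fd);
	return 0;
}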
@@ -305,6 +393,7 @@ static struct kobj_attribute debug_cow_attr =
 static struct attribute *hugepage_attr[] = {
 	&enabled_attr.attr,
 	&defrag_attr.attr,
+	&use_zero_page_attr.attr,
 #ifdef CONFIG_DEBUG_VM
 	&debug_cow_attr.attr,
 #endif
@@ -550,6 +639,8 @@ static int __init hugepage_init(void)
 		goto out;
 	}
 
+	register_shrinker(&huge_zero_page_shrinker);
+
 	/*
 	 * By default disable transparent hugepages on smaller systems,
 	 * where the extra memory used could hurt more than TLB overhead
@@ -678,6 +769,22 @@ static inline struct page *alloc_hugepage(int defrag)
 }
 #endif
 
+static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+		unsigned long zero_pfn)
+{
+	pmd_t entry;
+	if (!pmd_none(*pmd))
+		return false;
+	entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
+	entry = pmd_wrprotect(entry);
+	entry = pmd_mkhuge(entry);
+	set_pmd_at(mm, haddr, pmd, entry);
+	pgtable_trans_huge_deposit(mm, pgtable);
+	mm->nr_ptes++;
+	return true;
+}
+
 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			       unsigned long address, pmd_t *pmd,
 			       unsigned int flags)
@@ -691,6 +798,30 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			return VM_FAULT_OOM;
 		if (unlikely(khugepaged_enter(vma)))
 			return VM_FAULT_OOM;
+		if (!(flags & FAULT_FLAG_WRITE) &&
+				transparent_hugepage_use_zero_page()) {
+			pgtable_t pgtable;
+			unsigned long zero_pfn;
+			bool set;
+			pgtable = pte_alloc_one(mm, haddr);
+			if (unlikely(!pgtable))
+				return VM_FAULT_OOM;
+			zero_pfn = get_huge_zero_page();
+			if (unlikely(!zero_pfn)) {
+				pte_free(mm, pgtable);
+				count_vm_event(THP_FAULT_FALLBACK);
+				goto out;
+			}
+			spin_lock(&mm->page_table_lock);
+			set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+					zero_pfn);
+			spin_unlock(&mm->page_table_lock);
+			if (!set) {
+				pte_free(mm, pgtable);
+				put_huge_zero_page();
+			}
+			return 0;
+		}
 		page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
 					  vma, haddr, numa_node_id(), 0);
 		if (unlikely(!page)) {
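With the hunk above, an anonymous VMA that is first touched by a read no longer gets a private huge page full of zeroes; the PMD is mapped read-only to the shared huge zero page, and a real THP is only allocated on the first write via the copy-on-write path changed later in this patch. Purely as an illustration, not part of the patch, a userspace access pattern that exercises this, assuming a 2MB PMD size and that both THP and use_zero_page are enabled:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define HPAGE_SIZE (2UL << 20)	/* assumed PMD size, e.g. x86-64 */
#define NR_HPAGES  64

int main(void)
{
	size_t len = NR_HPAGES * HPAGE_SIZE;
	unsigned char *p, sum = 0;
	size_t i;

	/* MAP_ANONYMOUS memory is zero-filled and THP-eligible. */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	madvise(p, len, MADV_HUGEPAGE);

	/*
	 * Read faults: each huge-page-sized chunk can be backed by the one
	 * shared huge zero page, so no 2M of real memory is consumed here
	 * (the THP_ZERO_PAGE_ALLOC event above is the counter to watch).
	 */
	for (i = 0; i < len; i += HPAGE_SIZE)
		sum += p[i];

	/* The first write triggers COW: a real THP, or a fallback. */
	memset(p, 1, HPAGE_SIZE);

	printf("sum=%d (expected 0)\n", sum);
	munmap(p, len);
	return 0;
}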
@@ -755,6 +886,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_free(dst_mm, pgtable);
 		goto out_unlock;
 	}
+	/*
+	 * mm->page_table_lock is enough to be sure that huge zero pmd is not
+	 * under splitting since we don't split the page itself, only pmd to
+	 * a page table.
+	 */
+	if (is_huge_zero_pmd(pmd)) {
+		unsigned long zero_pfn;
+		bool set;
+		/*
+		 * get_huge_zero_page() will never allocate a new page here,
+		 * since we already have a zero page to copy. It just takes a
+		 * reference.
+		 */
+		zero_pfn = get_huge_zero_page();
+		set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+				zero_pfn);
+		BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
+		ret = 0;
+		goto out_unlock;
+	}
 	if (unlikely(pmd_trans_splitting(pmd))) {
 		/* split huge page running from under us */
 		spin_unlock(&src_mm->page_table_lock);
@@ -806,6 +957,80 @@ unlock:
 	spin_unlock(&mm->page_table_lock);
 }
 
+static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
+		struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
+{
+	pgtable_t pgtable;
+	pmd_t _pmd;
+	struct page *page;
+	int i, ret = 0;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;	/* For mmu_notifiers */
+
+	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+	if (!page) {
+		ret |= VM_FAULT_OOM;
+		goto out;
+	}
+
+	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
+		put_page(page);
+		ret |= VM_FAULT_OOM;
+		goto out;
+	}
+
+	clear_user_highpage(page, address);
+	__SetPageUptodate(page);
+
+	mmun_start = haddr;
+	mmun_end   = haddr + HPAGE_PMD_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(*pmd, orig_pmd)))
+		goto out_free_page;
+
+	pmdp_clear_flush(vma, haddr, pmd);
+	/* leave pmd empty until pte is filled */
+
+	pgtable = pgtable_trans_huge_withdraw(mm);
+	pmd_populate(mm, &_pmd, pgtable);
+
+	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+		pte_t *pte, entry;
+		if (haddr == (address & PAGE_MASK)) {
+			entry = mk_pte(page, vma->vm_page_prot);
+			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+			page_add_new_anon_rmap(page, vma, haddr);
+		} else {
+			entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+			entry = pte_mkspecial(entry);
+		}
+		pte = pte_offset_map(&_pmd, haddr);
+		VM_BUG_ON(!pte_none(*pte));
+		set_pte_at(mm, haddr, pte, entry);
+		pte_unmap(pte);
+	}
+	smp_wmb(); /* make pte visible before pmd */
+	pmd_populate(mm, pmd, pgtable);
+	spin_unlock(&mm->page_table_lock);
+	put_huge_zero_page();
+	inc_mm_counter(mm, MM_ANONPAGES);
+
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+	ret |= VM_FAULT_WRITE;
+out:
+	return ret;
+out_free_page:
+	spin_unlock(&mm->page_table_lock);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	mem_cgroup_uncharge_page(page);
+	put_page(page);
+	goto out;
+}
+
 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long address,
@@ -912,19 +1137,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
 {
 	int ret = 0;
-	struct page *page, *new_page;
+	struct page *page = NULL, *new_page;
 	unsigned long haddr;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;	/* For mmu_notifiers */
 
 	VM_BUG_ON(!vma->anon_vma);
+	haddr = address & HPAGE_PMD_MASK;
+	if (is_huge_zero_pmd(orig_pmd))
+		goto alloc;
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_same(*pmd, orig_pmd)))
 		goto out_unlock;
 
 	page = pmd_page(orig_pmd);
 	VM_BUG_ON(!PageCompound(page) || !PageHead(page));
-	haddr = address & HPAGE_PMD_MASK;
 	if (page_mapcount(page) == 1) {
 		pmd_t entry;
 		entry = pmd_mkyoung(orig_pmd);
@@ -936,7 +1163,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 	get_page(page);
 	spin_unlock(&mm->page_table_lock);
-
+alloc:
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow())
 		new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
@@ -946,24 +1173,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (unlikely(!new_page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
-		ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
-						   pmd, orig_pmd, page, haddr);
-		if (ret & VM_FAULT_OOM)
-			split_huge_page(page);
-		put_page(page);
+		if (is_huge_zero_pmd(orig_pmd)) {
+			ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
+					address, pmd, orig_pmd, haddr);
+		} else {
+			ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
+					pmd, orig_pmd, page, haddr);
+			if (ret & VM_FAULT_OOM)
+				split_huge_page(page);
+			put_page(page);
+		}
 		goto out;
 	}
 	count_vm_event(THP_FAULT_ALLOC);
 
 	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
 		put_page(new_page);
-		split_huge_page(page);
-		put_page(page);
+		if (page) {
+			split_huge_page(page);
+			put_page(page);
+		}
 		ret |= VM_FAULT_OOM;
 		goto out;
 	}
 
-	copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
+	if (is_huge_zero_pmd(orig_pmd))
+		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
+	else
+		copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
 	__SetPageUptodate(new_page);
 
 	mmun_start = haddr;
@@ -971,7 +1208,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 
 	spin_lock(&mm->page_table_lock);
-	put_page(page);
+	if (page)
+		put_page(page);
 	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
 		spin_unlock(&mm->page_table_lock);
 		mem_cgroup_uncharge_page(new_page);
@@ -979,14 +1217,19 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_mn;
 	} else {
 		pmd_t entry;
-		VM_BUG_ON(!PageHead(page));
 		entry = mk_huge_pmd(new_page, vma);
 		pmdp_clear_flush(vma, haddr, pmd);
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
 		update_mmu_cache_pmd(vma, address, pmd);
-		page_remove_rmap(page);
-		put_page(page);
+		if (is_huge_zero_pmd(orig_pmd)) {
+			add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+			put_huge_zero_page();
+		} else {
+			VM_BUG_ON(!PageHead(page));
+			page_remove_rmap(page);
+			put_page(page);
+		}
 		ret |= VM_FAULT_WRITE;
 	}
 	spin_unlock(&mm->page_table_lock);
@@ -1055,15 +1298,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		pmd_t orig_pmd;
 		pgtable = pgtable_trans_huge_withdraw(tlb->mm);
 		orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
-		page = pmd_page(orig_pmd);
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
-		page_remove_rmap(page);
-		VM_BUG_ON(page_mapcount(page) < 0);
-		add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
-		VM_BUG_ON(!PageHead(page));
-		tlb->mm->nr_ptes--;
-		spin_unlock(&tlb->mm->page_table_lock);
-		tlb_remove_page(tlb, page);
+		if (is_huge_zero_pmd(orig_pmd)) {
+			tlb->mm->nr_ptes--;
+			spin_unlock(&tlb->mm->page_table_lock);
+			put_huge_zero_page();
+		} else {
+			page = pmd_page(orig_pmd);
+			page_remove_rmap(page);
+			VM_BUG_ON(page_mapcount(page) < 0);
+			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+			VM_BUG_ON(!PageHead(page));
+			tlb->mm->nr_ptes--;
+			spin_unlock(&tlb->mm->page_table_lock);
+			tlb_remove_page(tlb, page);
+		}
 		pte_free(tlb->mm, pgtable);
 		ret = 1;
 	}
@@ -1135,6 +1384,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		pmd_t entry;
 		entry = pmdp_get_and_clear(mm, addr, pmd);
 		entry = pmd_modify(entry, newprot);
+		BUG_ON(pmd_write(entry));
 		set_pmd_at(mm, addr, pmd, entry);
 		spin_unlock(&vma->vm_mm->page_table_lock);
 		ret = 1;
@@ -1477,6 +1727,7 @@ int split_huge_page(struct page *page)
 	struct anon_vma *anon_vma;
 	int ret = 1;
 
+	BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
 	BUG_ON(!PageAnon(page));
 	anon_vma = page_lock_anon_vma(page);
 	if (!anon_vma)
@@ -2336,19 +2587,65 @@ static int khugepaged(void *none)
 	return 0;
 }
 
-void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
+static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
+		unsigned long haddr, pmd_t *pmd)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgtable_t pgtable;
+	pmd_t _pmd;
+	int i;
+
+	pmdp_clear_flush(vma, haddr, pmd);
+	/* leave pmd empty until pte is filled */
+
+	pgtable = pgtable_trans_huge_withdraw(mm);
+	pmd_populate(mm, &_pmd, pgtable);
+
+	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+		pte_t *pte, entry;
+		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+		entry = pte_mkspecial(entry);
+		pte = pte_offset_map(&_pmd, haddr);
+		VM_BUG_ON(!pte_none(*pte));
+		set_pte_at(mm, haddr, pte, entry);
+		pte_unmap(pte);
+	}
+	smp_wmb(); /* make pte visible before pmd */
+	pmd_populate(mm, pmd, pgtable);
+	put_huge_zero_page();
+}
+
+void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd)
 {
 	struct page *page;
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long haddr = address & HPAGE_PMD_MASK;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;	/* For mmu_notifiers */
+
+	BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
 
+	mmun_start = haddr;
+	mmun_end   = haddr + HPAGE_PMD_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_trans_huge(*pmd))) {
 		spin_unlock(&mm->page_table_lock);
+		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+		return;
+	}
+	if (is_huge_zero_pmd(*pmd)) {
+		__split_huge_zero_page_pmd(vma, haddr, pmd);
+		spin_unlock(&mm->page_table_lock);
+		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 		return;
 	}
 	page = pmd_page(*pmd);
 	VM_BUG_ON(!page_count(page));
 	get_page(page);
 	spin_unlock(&mm->page_table_lock);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
 	split_huge_page(page);
 
@@ -2356,6 +2653,16 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
 	BUG_ON(pmd_trans_huge(*pmd));
 }
 
+void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
+		pmd_t *pmd)
+{
+	struct vm_area_struct *vma;
+
+	vma = find_vma(mm, address);
+	BUG_ON(vma == NULL);
+	split_huge_page_pmd(vma, address, pmd);
+}
+
 static void split_huge_page_address(struct mm_struct *mm,
 				    unsigned long address)
 {
@@ -2370,7 +2677,7 @@ static void split_huge_page_address(struct mm_struct *mm,
 	 * Caller holds the mmap_sem write mode, so a huge pmd cannot
 	 * materialize from under us.
	 */
-	split_huge_page_pmd(mm, pmd);
+	split_huge_page_pmd_mm(mm, address, pmd);
 }
 
 void __vma_adjust_trans_huge(struct vm_area_struct *vma,