Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--  mm/huge_memory.c  359
1 file changed, 333 insertions, 26 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5f902e20e8c0..827d9c813051 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -12,12 +12,14 @@
 #include <linux/mmu_notifier.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include <linux/shrinker.h>
 #include <linux/mm_inline.h>
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
+
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
@@ -37,7 +39,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
 	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
 #endif
 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
-	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
+	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 
 /* default scan 8*512 pte (or vmas) every 30 second */
 static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
@@ -159,6 +162,77 @@ static int start_khugepaged(void)
 	return err;
 }
 
+static atomic_t huge_zero_refcount;
+static unsigned long huge_zero_pfn __read_mostly;
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
+{
+	unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+	return zero_pfn && pfn == zero_pfn;
+}
+
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+	return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
+static unsigned long get_huge_zero_page(void)
+{
+	struct page *zero_page;
+retry:
+	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+		return ACCESS_ONCE(huge_zero_pfn);
+
+	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+			HPAGE_PMD_ORDER);
+	if (!zero_page) {
+		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
+		return 0;
+	}
+	count_vm_event(THP_ZERO_PAGE_ALLOC);
+	preempt_disable();
+	if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+		preempt_enable();
+		__free_page(zero_page);
+		goto retry;
+	}
+
+	/* We take additional reference here. It will be put back by shrinker */
+	atomic_set(&huge_zero_refcount, 2);
+	preempt_enable();
+	return ACCESS_ONCE(huge_zero_pfn);
+}
+
+static void put_huge_zero_page(void)
+{
+	/*
+	 * Counter should never go to zero here. Only shrinker can put
+	 * last reference.
+	 */
+	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
+}
+
+static int shrink_huge_zero_page(struct shrinker *shrink,
+		struct shrink_control *sc)
+{
+	if (!sc->nr_to_scan)
+		/* we can free zero page only if last reference remains */
+		return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+
+	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+		unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+		BUG_ON(zero_pfn == 0);
+		__free_page(__pfn_to_page(zero_pfn));
+	}
+
+	return 0;
+}
+
+static struct shrinker huge_zero_page_shrinker = {
+	.shrink = shrink_huge_zero_page,
+	.seeks = DEFAULT_SEEKS,
+};
+
 #ifdef CONFIG_SYSFS
 
 static ssize_t double_flag_show(struct kobject *kobj,
@@ -284,6 +358,20 @@ static ssize_t defrag_store(struct kobject *kobj,
 static struct kobj_attribute defrag_attr =
 	__ATTR(defrag, 0644, defrag_show, defrag_store);
 
+static ssize_t use_zero_page_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return single_flag_show(kobj, attr, buf,
+			TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static ssize_t use_zero_page_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	return single_flag_store(kobj, attr, buf, count,
+			TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static struct kobj_attribute use_zero_page_attr =
+	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
 #ifdef CONFIG_DEBUG_VM
 static ssize_t debug_cow_show(struct kobject *kobj,
 		struct kobj_attribute *attr, char *buf)
@@ -305,6 +393,7 @@ static struct kobj_attribute debug_cow_attr =
 static struct attribute *hugepage_attr[] = {
 	&enabled_attr.attr,
 	&defrag_attr.attr,
+	&use_zero_page_attr.attr,
 #ifdef CONFIG_DEBUG_VM
 	&debug_cow_attr.attr,
 #endif
@@ -550,6 +639,8 @@ static int __init hugepage_init(void)
 		goto out;
 	}
 
+	register_shrinker(&huge_zero_page_shrinker);
+
 	/*
 	 * By default disable transparent hugepages on smaller systems,
 	 * where the extra memory used could hurt more than TLB overhead
@@ -678,6 +769,22 @@ static inline struct page *alloc_hugepage(int defrag)
 }
 #endif
 
+static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+		unsigned long zero_pfn)
+{
+	pmd_t entry;
+	if (!pmd_none(*pmd))
+		return false;
+	entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
+	entry = pmd_wrprotect(entry);
+	entry = pmd_mkhuge(entry);
+	set_pmd_at(mm, haddr, pmd, entry);
+	pgtable_trans_huge_deposit(mm, pgtable);
+	mm->nr_ptes++;
+	return true;
+}
+
 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			       unsigned long address, pmd_t *pmd,
 			       unsigned int flags)
@@ -691,6 +798,30 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 	if (unlikely(khugepaged_enter(vma)))
 		return VM_FAULT_OOM;
+	if (!(flags & FAULT_FLAG_WRITE) &&
+			transparent_hugepage_use_zero_page()) {
+		pgtable_t pgtable;
+		unsigned long zero_pfn;
+		bool set;
+		pgtable = pte_alloc_one(mm, haddr);
+		if (unlikely(!pgtable))
+			return VM_FAULT_OOM;
+		zero_pfn = get_huge_zero_page();
+		if (unlikely(!zero_pfn)) {
+			pte_free(mm, pgtable);
+			count_vm_event(THP_FAULT_FALLBACK);
+			goto out;
+		}
+		spin_lock(&mm->page_table_lock);
+		set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+				zero_pfn);
+		spin_unlock(&mm->page_table_lock);
+		if (!set) {
+			pte_free(mm, pgtable);
+			put_huge_zero_page();
+		}
+		return 0;
+	}
 	page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
 				  vma, haddr, numa_node_id(), 0);
 	if (unlikely(!page)) {
@@ -755,6 +886,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_free(dst_mm, pgtable);
 		goto out_unlock;
 	}
+	/*
+	 * mm->page_table_lock is enough to be sure that huge zero pmd is not
+	 * under splitting since we don't split the page itself, only pmd to
+	 * a page table.
+	 */
+	if (is_huge_zero_pmd(pmd)) {
+		unsigned long zero_pfn;
+		bool set;
+		/*
+		 * get_huge_zero_page() will never allocate a new page here,
+		 * since we already have a zero page to copy. It just takes a
+		 * reference.
+		 */
+		zero_pfn = get_huge_zero_page();
+		set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+				zero_pfn);
+		BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
+		ret = 0;
+		goto out_unlock;
+	}
 	if (unlikely(pmd_trans_splitting(pmd))) {
 		/* split huge page running from under us */
 		spin_unlock(&src_mm->page_table_lock);
@@ -806,6 +957,80 @@ unlock:
 	spin_unlock(&mm->page_table_lock);
 }
 
+static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
+		struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
+{
+	pgtable_t pgtable;
+	pmd_t _pmd;
+	struct page *page;
+	int i, ret = 0;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+
+	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+	if (!page) {
+		ret |= VM_FAULT_OOM;
+		goto out;
+	}
+
+	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
+		put_page(page);
+		ret |= VM_FAULT_OOM;
+		goto out;
+	}
+
+	clear_user_highpage(page, address);
+	__SetPageUptodate(page);
+
+	mmun_start = haddr;
+	mmun_end = haddr + HPAGE_PMD_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(*pmd, orig_pmd)))
+		goto out_free_page;
+
+	pmdp_clear_flush(vma, haddr, pmd);
+	/* leave pmd empty until pte is filled */
+
+	pgtable = pgtable_trans_huge_withdraw(mm);
+	pmd_populate(mm, &_pmd, pgtable);
+
+	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+		pte_t *pte, entry;
+		if (haddr == (address & PAGE_MASK)) {
+			entry = mk_pte(page, vma->vm_page_prot);
+			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+			page_add_new_anon_rmap(page, vma, haddr);
+		} else {
+			entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+			entry = pte_mkspecial(entry);
+		}
+		pte = pte_offset_map(&_pmd, haddr);
+		VM_BUG_ON(!pte_none(*pte));
+		set_pte_at(mm, haddr, pte, entry);
+		pte_unmap(pte);
+	}
+	smp_wmb(); /* make pte visible before pmd */
+	pmd_populate(mm, pmd, pgtable);
+	spin_unlock(&mm->page_table_lock);
+	put_huge_zero_page();
+	inc_mm_counter(mm, MM_ANONPAGES);
+
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+	ret |= VM_FAULT_WRITE;
+out:
+	return ret;
+out_free_page:
+	spin_unlock(&mm->page_table_lock);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	mem_cgroup_uncharge_page(page);
+	put_page(page);
+	goto out;
+}
+
 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 		struct vm_area_struct *vma,
 		unsigned long address,
@@ -912,19 +1137,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
 {
 	int ret = 0;
-	struct page *page, *new_page;
+	struct page *page = NULL, *new_page;
 	unsigned long haddr;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;	/* For mmu_notifiers */
 
 	VM_BUG_ON(!vma->anon_vma);
+	haddr = address & HPAGE_PMD_MASK;
+	if (is_huge_zero_pmd(orig_pmd))
+		goto alloc;
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_same(*pmd, orig_pmd)))
 		goto out_unlock;
 
 	page = pmd_page(orig_pmd);
 	VM_BUG_ON(!PageCompound(page) || !PageHead(page));
-	haddr = address & HPAGE_PMD_MASK;
 	if (page_mapcount(page) == 1) {
 		pmd_t entry;
 		entry = pmd_mkyoung(orig_pmd);
@@ -936,7 +1163,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 	get_page(page);
 	spin_unlock(&mm->page_table_lock);
-
+alloc:
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow())
 		new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
@@ -946,24 +1173,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (unlikely(!new_page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
-		ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
-				pmd, orig_pmd, page, haddr);
-		if (ret & VM_FAULT_OOM)
-			split_huge_page(page);
-		put_page(page);
+		if (is_huge_zero_pmd(orig_pmd)) {
+			ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
+					address, pmd, orig_pmd, haddr);
+		} else {
+			ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
+					pmd, orig_pmd, page, haddr);
+			if (ret & VM_FAULT_OOM)
+				split_huge_page(page);
+			put_page(page);
+		}
 		goto out;
 	}
 	count_vm_event(THP_FAULT_ALLOC);
 
 	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
 		put_page(new_page);
-		split_huge_page(page);
-		put_page(page);
+		if (page) {
+			split_huge_page(page);
+			put_page(page);
+		}
 		ret |= VM_FAULT_OOM;
 		goto out;
 	}
 
-	copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
+	if (is_huge_zero_pmd(orig_pmd))
+		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
+	else
+		copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
 	__SetPageUptodate(new_page);
 
 	mmun_start = haddr;
@@ -971,7 +1208,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 
 	spin_lock(&mm->page_table_lock);
-	put_page(page);
+	if (page)
+		put_page(page);
 	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
 		spin_unlock(&mm->page_table_lock);
 		mem_cgroup_uncharge_page(new_page);
@@ -979,14 +1217,19 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_mn;
 	} else {
 		pmd_t entry;
-		VM_BUG_ON(!PageHead(page));
 		entry = mk_huge_pmd(new_page, vma);
 		pmdp_clear_flush(vma, haddr, pmd);
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
 		update_mmu_cache_pmd(vma, address, pmd);
-		page_remove_rmap(page);
-		put_page(page);
+		if (is_huge_zero_pmd(orig_pmd)) {
+			add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+			put_huge_zero_page();
+		} else {
+			VM_BUG_ON(!PageHead(page));
+			page_remove_rmap(page);
+			put_page(page);
+		}
 		ret |= VM_FAULT_WRITE;
 	}
 	spin_unlock(&mm->page_table_lock);
@@ -1055,15 +1298,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		pmd_t orig_pmd;
 		pgtable = pgtable_trans_huge_withdraw(tlb->mm);
 		orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
-		page = pmd_page(orig_pmd);
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
-		page_remove_rmap(page);
-		VM_BUG_ON(page_mapcount(page) < 0);
-		add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
-		VM_BUG_ON(!PageHead(page));
-		tlb->mm->nr_ptes--;
-		spin_unlock(&tlb->mm->page_table_lock);
-		tlb_remove_page(tlb, page);
+		if (is_huge_zero_pmd(orig_pmd)) {
+			tlb->mm->nr_ptes--;
+			spin_unlock(&tlb->mm->page_table_lock);
+			put_huge_zero_page();
+		} else {
+			page = pmd_page(orig_pmd);
+			page_remove_rmap(page);
+			VM_BUG_ON(page_mapcount(page) < 0);
+			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+			VM_BUG_ON(!PageHead(page));
+			tlb->mm->nr_ptes--;
+			spin_unlock(&tlb->mm->page_table_lock);
+			tlb_remove_page(tlb, page);
+		}
 		pte_free(tlb->mm, pgtable);
 		ret = 1;
 	}
@@ -1135,6 +1384,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		pmd_t entry;
 		entry = pmdp_get_and_clear(mm, addr, pmd);
 		entry = pmd_modify(entry, newprot);
+		BUG_ON(pmd_write(entry));
 		set_pmd_at(mm, addr, pmd, entry);
 		spin_unlock(&vma->vm_mm->page_table_lock);
 		ret = 1;
@@ -1477,6 +1727,7 @@ int split_huge_page(struct page *page)
 	struct anon_vma *anon_vma;
 	int ret = 1;
 
+	BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
 	BUG_ON(!PageAnon(page));
 	anon_vma = page_lock_anon_vma(page);
 	if (!anon_vma)
@@ -2336,19 +2587,65 @@ static int khugepaged(void *none)
 	return 0;
 }
 
-void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
+static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
+		unsigned long haddr, pmd_t *pmd)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgtable_t pgtable;
+	pmd_t _pmd;
+	int i;
+
+	pmdp_clear_flush(vma, haddr, pmd);
+	/* leave pmd empty until pte is filled */
+
+	pgtable = pgtable_trans_huge_withdraw(mm);
+	pmd_populate(mm, &_pmd, pgtable);
+
+	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+		pte_t *pte, entry;
+		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+		entry = pte_mkspecial(entry);
+		pte = pte_offset_map(&_pmd, haddr);
+		VM_BUG_ON(!pte_none(*pte));
+		set_pte_at(mm, haddr, pte, entry);
+		pte_unmap(pte);
+	}
+	smp_wmb(); /* make pte visible before pmd */
+	pmd_populate(mm, pmd, pgtable);
+	put_huge_zero_page();
+}
+
+void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd)
 {
 	struct page *page;
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long haddr = address & HPAGE_PMD_MASK;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+
+	BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
 
+	mmun_start = haddr;
+	mmun_end = haddr + HPAGE_PMD_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_trans_huge(*pmd))) {
 		spin_unlock(&mm->page_table_lock);
+		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+		return;
+	}
+	if (is_huge_zero_pmd(*pmd)) {
+		__split_huge_zero_page_pmd(vma, haddr, pmd);
+		spin_unlock(&mm->page_table_lock);
+		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 		return;
 	}
 	page = pmd_page(*pmd);
 	VM_BUG_ON(!page_count(page));
 	get_page(page);
 	spin_unlock(&mm->page_table_lock);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
 	split_huge_page(page);
 
@@ -2356,6 +2653,16 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
 	BUG_ON(pmd_trans_huge(*pmd));
 }
 
+void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
+		pmd_t *pmd)
+{
+	struct vm_area_struct *vma;
+
+	vma = find_vma(mm, address);
+	BUG_ON(vma == NULL);
+	split_huge_page_pmd(vma, address, pmd);
+}
+
 static void split_huge_page_address(struct mm_struct *mm,
 				    unsigned long address)
 {
@@ -2370,7 +2677,7 @@ static void split_huge_page_address(struct mm_struct *mm,
 	 * Caller holds the mmap_sem write mode, so a huge pmd cannot
 	 * materialize from under us.
 	 */
-	split_huge_page_pmd(mm, pmd);
+	split_huge_page_pmd_mm(mm, address, pmd);
 }
 
 void __vma_adjust_trans_huge(struct vm_area_struct *vma,