author	Linus Torvalds <torvalds@linux-foundation.org>	2012-12-13 16:11:15 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-12-13 16:11:15 -0500
commit	f6e858a00af788bab0fd4c0b7f5cd788000edc18 (patch)
tree	f9403ca3671be9821dbf83e726e61dbe75fbca6b /mm/huge_memory.c
parent	193c0d682525987db59ac3a24531a77e4947aa95 (diff)
parent	98870901cce098bbe94d90d2c41d8d1fa8d94392 (diff)
Merge branch 'akpm' (Andrew's patch-bomb)
Merge misc VM changes from Andrew Morton:
 "The rest of most-of-MM. The other MM bits await a slab merge.

  This patch includes the addition of a huge zero_page. Not a
  performance boost but it can save large amounts of physical memory in
  some situations.

  Also a bunch of Fujitsu engineers are working on memory hotplug.
  Which, as it turns out, was badly broken. About half of their patches
  are included here; the remainder are 3.8 material."

However, this merge disables CONFIG_MOVABLE_NODE, which was totally
broken. We don't add new features with "default y", nor do we add
Kconfig questions that are incomprehensible to most people without any
help text. Does the feature even make sense without compaction or
memory hotplug?

* akpm: (54 commits)
  mm/bootmem.c: remove unused wrapper function reserve_bootmem_generic()
  mm/memory.c: remove unused code from do_wp_page()
  asm-generic, mm: pgtable: consolidate zero page helpers
  mm/hugetlb.c: fix warning on freeing hwpoisoned hugepage
  hwpoison, hugetlbfs: fix RSS-counter warning
  hwpoison, hugetlbfs: fix "bad pmd" warning in unmapping hwpoisoned hugepage
  mm: protect against concurrent vma expansion
  memcg: do not check for mm in __mem_cgroup_count_vm_event
  tmpfs: support SEEK_DATA and SEEK_HOLE (reprise)
  mm: provide more accurate estimation of pages occupied by memmap
  fs/buffer.c: remove redundant initialization in alloc_page_buffers()
  fs/buffer.c: do not inline exported function
  writeback: fix a typo in comment
  mm: introduce new field "managed_pages" to struct zone
  mm, oom: remove statically defined arch functions of same name
  mm, oom: remove redundant sleep in pagefault oom handler
  mm, oom: cleanup pagefault oom handler
  memory_hotplug: allow online/offline memory to result movable node
  numa: add CONFIG_MOVABLE_NODE for movable-dedicated node
  mm, memcg: avoid unnecessary function call when memcg is disabled
  ...
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--	mm/huge_memory.c	359
1 file changed, 333 insertions(+), 26 deletions(-)
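
The centrepiece of the diff below is a single, lazily allocated huge zero page that read-only faults on anonymous THP areas can map instead of a freshly allocated huge page. get_huge_zero_page() takes a reference with atomic_inc_not_zero() when the page already exists, installs a newly allocated page with cmpxchg() when it does not, and primes the count to 2 so that one reference is permanently owned by a shrinker; shrink_huge_zero_page() frees the page only when that shrinker reference is the last one left. The behaviour can be toggled through the new use_zero_page attribute registered next to enabled and defrag in the transparent_hugepage sysfs group. As a rough illustration of this allocation/refcount pattern (not kernel code), here is a minimal userspace sketch using C11 atomics, with calloc()/free() standing in for alloc_pages()/__free_page() and the invented names cache_get(), cache_put() and cache_shrink() playing the roles of get_huge_zero_page(), put_huge_zero_page() and the shrinker callback:

/* Hypothetical userspace analogue of the huge zero page refcounting scheme. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define CHUNK_SIZE (2UL * 1024 * 1024)	/* stand-in for HPAGE_PMD_SIZE */

static _Atomic int zero_refcount;	/* plays the role of huge_zero_refcount */
static _Atomic(void *) zero_chunk;	/* plays the role of huge_zero_pfn */

/* Take a reference, allocating the shared zeroed chunk on first use. */
static void *cache_get(void)
{
	void *chunk, *expected;
	int old;

retry:
	/* Fast path: bump the count only if the chunk already exists. */
	old = atomic_load(&zero_refcount);
	while (old > 0) {
		if (atomic_compare_exchange_weak(&zero_refcount, &old, old + 1))
			return atomic_load(&zero_chunk);
	}

	chunk = calloc(1, CHUNK_SIZE);	/* alloc_pages(... | __GFP_ZERO) */
	if (!chunk)
		return NULL;

	expected = NULL;
	if (!atomic_compare_exchange_strong(&zero_chunk, &expected, chunk)) {
		free(chunk);		/* somebody else won the race */
		goto retry;
	}
	/* Count starts at 2: one for the caller, one kept for the "shrinker". */
	atomic_store(&zero_refcount, 2);
	return chunk;
}

/* Drop a caller reference; only the shrinker may drop the last one. */
static void cache_put(void)
{
	if (atomic_fetch_sub(&zero_refcount, 1) <= 1)
		abort();		/* mirrors the BUG_ON() in put_huge_zero_page() */
}

/* Shrinker analogue: free the chunk when only its own reference remains. */
static void cache_shrink(void)
{
	int one = 1;

	if (atomic_compare_exchange_strong(&zero_refcount, &one, 0))
		free(atomic_exchange(&zero_chunk, NULL));
}

int main(void)
{
	void *a = cache_get();
	void *b = cache_get();	/* same chunk, refcount is now 3 */

	printf("shared: %d\n", a == b);
	cache_put();
	cache_put();		/* refcount back to 1 (the shrinker's) */
	cache_shrink();		/* frees the chunk */
	return 0;
}

The demo in main() is single-threaded, but the compare-and-exchange steps are what let the kernel version tolerate concurrent first faults: losers of the installation race free their page and retry, exactly as the goto retry path in get_huge_zero_page() does below.
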
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5f902e20e8c0..827d9c813051 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -12,12 +12,14 @@
 #include <linux/mmu_notifier.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include <linux/shrinker.h>
 #include <linux/mm_inline.h>
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
+
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
@@ -37,7 +39,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
 	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
 #endif
 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
-	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
+	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 
 /* default scan 8*512 pte (or vmas) every 30 second */
 static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
@@ -159,6 +162,77 @@ static int start_khugepaged(void)
 	return err;
 }
 
+static atomic_t huge_zero_refcount;
+static unsigned long huge_zero_pfn __read_mostly;
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
+{
+	unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+	return zero_pfn && pfn == zero_pfn;
+}
+
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+	return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
+static unsigned long get_huge_zero_page(void)
+{
+	struct page *zero_page;
+retry:
+	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+		return ACCESS_ONCE(huge_zero_pfn);
+
+	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+			HPAGE_PMD_ORDER);
+	if (!zero_page) {
+		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
+		return 0;
+	}
+	count_vm_event(THP_ZERO_PAGE_ALLOC);
+	preempt_disable();
+	if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+		preempt_enable();
+		__free_page(zero_page);
+		goto retry;
+	}
+
+	/* We take additional reference here. It will be put back by shrinker */
+	atomic_set(&huge_zero_refcount, 2);
+	preempt_enable();
+	return ACCESS_ONCE(huge_zero_pfn);
+}
+
+static void put_huge_zero_page(void)
+{
+	/*
+	 * Counter should never go to zero here. Only shrinker can put
+	 * last reference.
+	 */
+	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
+}
+
+static int shrink_huge_zero_page(struct shrinker *shrink,
+		struct shrink_control *sc)
+{
+	if (!sc->nr_to_scan)
+		/* we can free zero page only if last reference remains */
+		return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+
+	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+		unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+		BUG_ON(zero_pfn == 0);
+		__free_page(__pfn_to_page(zero_pfn));
+	}
+
+	return 0;
+}
+
+static struct shrinker huge_zero_page_shrinker = {
+	.shrink = shrink_huge_zero_page,
+	.seeks = DEFAULT_SEEKS,
+};
+
 #ifdef CONFIG_SYSFS
 
 static ssize_t double_flag_show(struct kobject *kobj,
@@ -284,6 +358,20 @@ static ssize_t defrag_store(struct kobject *kobj,
 static struct kobj_attribute defrag_attr =
 	__ATTR(defrag, 0644, defrag_show, defrag_store);
 
+static ssize_t use_zero_page_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return single_flag_show(kobj, attr, buf,
+				TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static ssize_t use_zero_page_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	return single_flag_store(kobj, attr, buf, count,
+				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static struct kobj_attribute use_zero_page_attr =
+	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
 #ifdef CONFIG_DEBUG_VM
 static ssize_t debug_cow_show(struct kobject *kobj,
 			      struct kobj_attribute *attr, char *buf)
@@ -305,6 +393,7 @@ static struct kobj_attribute debug_cow_attr =
 static struct attribute *hugepage_attr[] = {
 	&enabled_attr.attr,
 	&defrag_attr.attr,
+	&use_zero_page_attr.attr,
 #ifdef CONFIG_DEBUG_VM
 	&debug_cow_attr.attr,
 #endif
@@ -550,6 +639,8 @@ static int __init hugepage_init(void)
 		goto out;
 	}
 
+	register_shrinker(&huge_zero_page_shrinker);
+
 	/*
 	 * By default disable transparent hugepages on smaller systems,
 	 * where the extra memory used could hurt more than TLB overhead
@@ -678,6 +769,22 @@ static inline struct page *alloc_hugepage(int defrag)
 }
 #endif
 
+static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+		unsigned long zero_pfn)
+{
+	pmd_t entry;
+	if (!pmd_none(*pmd))
+		return false;
+	entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
+	entry = pmd_wrprotect(entry);
+	entry = pmd_mkhuge(entry);
+	set_pmd_at(mm, haddr, pmd, entry);
+	pgtable_trans_huge_deposit(mm, pgtable);
+	mm->nr_ptes++;
+	return true;
+}
+
 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			       unsigned long address, pmd_t *pmd,
 			       unsigned int flags)
@@ -691,6 +798,30 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 	if (unlikely(khugepaged_enter(vma)))
 		return VM_FAULT_OOM;
+	if (!(flags & FAULT_FLAG_WRITE) &&
+			transparent_hugepage_use_zero_page()) {
+		pgtable_t pgtable;
+		unsigned long zero_pfn;
+		bool set;
+		pgtable = pte_alloc_one(mm, haddr);
+		if (unlikely(!pgtable))
+			return VM_FAULT_OOM;
+		zero_pfn = get_huge_zero_page();
+		if (unlikely(!zero_pfn)) {
+			pte_free(mm, pgtable);
+			count_vm_event(THP_FAULT_FALLBACK);
+			goto out;
+		}
+		spin_lock(&mm->page_table_lock);
+		set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+				zero_pfn);
+		spin_unlock(&mm->page_table_lock);
+		if (!set) {
+			pte_free(mm, pgtable);
+			put_huge_zero_page();
+		}
+		return 0;
+	}
 	page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
 				  vma, haddr, numa_node_id(), 0);
 	if (unlikely(!page)) {
@@ -755,6 +886,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_free(dst_mm, pgtable);
 		goto out_unlock;
 	}
+	/*
+	 * mm->page_table_lock is enough to be sure that huge zero pmd is not
+	 * under splitting since we don't split the page itself, only pmd to
+	 * a page table.
+	 */
+	if (is_huge_zero_pmd(pmd)) {
+		unsigned long zero_pfn;
+		bool set;
+		/*
+		 * get_huge_zero_page() will never allocate a new page here,
+		 * since we already have a zero page to copy. It just takes a
+		 * reference.
+		 */
+		zero_pfn = get_huge_zero_page();
+		set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+				zero_pfn);
+		BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
+		ret = 0;
+		goto out_unlock;
+	}
 	if (unlikely(pmd_trans_splitting(pmd))) {
 		/* split huge page running from under us */
 		spin_unlock(&src_mm->page_table_lock);
@@ -806,6 +957,80 @@ unlock:
 	spin_unlock(&mm->page_table_lock);
 }
 
+static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
+		struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
+{
+	pgtable_t pgtable;
+	pmd_t _pmd;
+	struct page *page;
+	int i, ret = 0;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+
+	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+	if (!page) {
+		ret |= VM_FAULT_OOM;
+		goto out;
+	}
+
+	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
+		put_page(page);
+		ret |= VM_FAULT_OOM;
+		goto out;
+	}
+
+	clear_user_highpage(page, address);
+	__SetPageUptodate(page);
+
+	mmun_start = haddr;
+	mmun_end   = haddr + HPAGE_PMD_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(*pmd, orig_pmd)))
+		goto out_free_page;
+
+	pmdp_clear_flush(vma, haddr, pmd);
+	/* leave pmd empty until pte is filled */
+
+	pgtable = pgtable_trans_huge_withdraw(mm);
+	pmd_populate(mm, &_pmd, pgtable);
+
+	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+		pte_t *pte, entry;
+		if (haddr == (address & PAGE_MASK)) {
+			entry = mk_pte(page, vma->vm_page_prot);
+			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+			page_add_new_anon_rmap(page, vma, haddr);
+		} else {
+			entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+			entry = pte_mkspecial(entry);
+		}
+		pte = pte_offset_map(&_pmd, haddr);
+		VM_BUG_ON(!pte_none(*pte));
+		set_pte_at(mm, haddr, pte, entry);
+		pte_unmap(pte);
+	}
+	smp_wmb(); /* make pte visible before pmd */
+	pmd_populate(mm, pmd, pgtable);
+	spin_unlock(&mm->page_table_lock);
+	put_huge_zero_page();
+	inc_mm_counter(mm, MM_ANONPAGES);
+
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+	ret |= VM_FAULT_WRITE;
+out:
+	return ret;
+out_free_page:
+	spin_unlock(&mm->page_table_lock);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	mem_cgroup_uncharge_page(page);
+	put_page(page);
+	goto out;
+}
+
 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long address,
@@ -912,19 +1137,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		       unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
 {
 	int ret = 0;
-	struct page *page, *new_page;
+	struct page *page = NULL, *new_page;
 	unsigned long haddr;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
 
 	VM_BUG_ON(!vma->anon_vma);
+	haddr = address & HPAGE_PMD_MASK;
+	if (is_huge_zero_pmd(orig_pmd))
+		goto alloc;
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_same(*pmd, orig_pmd)))
 		goto out_unlock;
 
 	page = pmd_page(orig_pmd);
 	VM_BUG_ON(!PageCompound(page) || !PageHead(page));
-	haddr = address & HPAGE_PMD_MASK;
 	if (page_mapcount(page) == 1) {
 		pmd_t entry;
 		entry = pmd_mkyoung(orig_pmd);
@@ -936,7 +1163,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 	get_page(page);
 	spin_unlock(&mm->page_table_lock);
-
+alloc:
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow())
 		new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
@@ -946,24 +1173,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (unlikely(!new_page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
-		ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
-						   pmd, orig_pmd, page, haddr);
-		if (ret & VM_FAULT_OOM)
-			split_huge_page(page);
-		put_page(page);
+		if (is_huge_zero_pmd(orig_pmd)) {
+			ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
+					address, pmd, orig_pmd, haddr);
+		} else {
+			ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
+					pmd, orig_pmd, page, haddr);
+			if (ret & VM_FAULT_OOM)
+				split_huge_page(page);
+			put_page(page);
+		}
 		goto out;
 	}
 	count_vm_event(THP_FAULT_ALLOC);
 
 	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
 		put_page(new_page);
-		split_huge_page(page);
-		put_page(page);
+		if (page) {
+			split_huge_page(page);
+			put_page(page);
+		}
 		ret |= VM_FAULT_OOM;
 		goto out;
 	}
 
-	copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
+	if (is_huge_zero_pmd(orig_pmd))
+		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
+	else
+		copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
 	__SetPageUptodate(new_page);
 
 	mmun_start = haddr;
@@ -971,7 +1208,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 
 	spin_lock(&mm->page_table_lock);
-	put_page(page);
+	if (page)
+		put_page(page);
 	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
 		spin_unlock(&mm->page_table_lock);
 		mem_cgroup_uncharge_page(new_page);
@@ -979,14 +1217,19 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_mn;
 	} else {
 		pmd_t entry;
-		VM_BUG_ON(!PageHead(page));
 		entry = mk_huge_pmd(new_page, vma);
 		pmdp_clear_flush(vma, haddr, pmd);
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
 		update_mmu_cache_pmd(vma, address, pmd);
-		page_remove_rmap(page);
-		put_page(page);
+		if (is_huge_zero_pmd(orig_pmd)) {
+			add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+			put_huge_zero_page();
+		} else {
+			VM_BUG_ON(!PageHead(page));
+			page_remove_rmap(page);
+			put_page(page);
+		}
 		ret |= VM_FAULT_WRITE;
 	}
 	spin_unlock(&mm->page_table_lock);
@@ -1055,15 +1298,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		pmd_t orig_pmd;
 		pgtable = pgtable_trans_huge_withdraw(tlb->mm);
 		orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
-		page = pmd_page(orig_pmd);
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
-		page_remove_rmap(page);
-		VM_BUG_ON(page_mapcount(page) < 0);
-		add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
-		VM_BUG_ON(!PageHead(page));
-		tlb->mm->nr_ptes--;
-		spin_unlock(&tlb->mm->page_table_lock);
-		tlb_remove_page(tlb, page);
+		if (is_huge_zero_pmd(orig_pmd)) {
+			tlb->mm->nr_ptes--;
+			spin_unlock(&tlb->mm->page_table_lock);
+			put_huge_zero_page();
+		} else {
+			page = pmd_page(orig_pmd);
+			page_remove_rmap(page);
+			VM_BUG_ON(page_mapcount(page) < 0);
+			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+			VM_BUG_ON(!PageHead(page));
+			tlb->mm->nr_ptes--;
+			spin_unlock(&tlb->mm->page_table_lock);
+			tlb_remove_page(tlb, page);
+		}
 		pte_free(tlb->mm, pgtable);
 		ret = 1;
 	}
@@ -1135,6 +1384,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		pmd_t entry;
 		entry = pmdp_get_and_clear(mm, addr, pmd);
 		entry = pmd_modify(entry, newprot);
+		BUG_ON(pmd_write(entry));
 		set_pmd_at(mm, addr, pmd, entry);
 		spin_unlock(&vma->vm_mm->page_table_lock);
 		ret = 1;
@@ -1477,6 +1727,7 @@ int split_huge_page(struct page *page)
 	struct anon_vma *anon_vma;
 	int ret = 1;
 
+	BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
 	BUG_ON(!PageAnon(page));
 	anon_vma = page_lock_anon_vma(page);
 	if (!anon_vma)
@@ -2336,19 +2587,65 @@ static int khugepaged(void *none)
 	return 0;
 }
 
-void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
+static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
+		unsigned long haddr, pmd_t *pmd)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgtable_t pgtable;
+	pmd_t _pmd;
+	int i;
+
+	pmdp_clear_flush(vma, haddr, pmd);
+	/* leave pmd empty until pte is filled */
+
+	pgtable = pgtable_trans_huge_withdraw(mm);
+	pmd_populate(mm, &_pmd, pgtable);
+
+	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+		pte_t *pte, entry;
+		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+		entry = pte_mkspecial(entry);
+		pte = pte_offset_map(&_pmd, haddr);
+		VM_BUG_ON(!pte_none(*pte));
+		set_pte_at(mm, haddr, pte, entry);
+		pte_unmap(pte);
+	}
+	smp_wmb(); /* make pte visible before pmd */
+	pmd_populate(mm, pmd, pgtable);
+	put_huge_zero_page();
+}
+
+void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd)
 {
 	struct page *page;
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long haddr = address & HPAGE_PMD_MASK;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+
+	BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
 
+	mmun_start = haddr;
+	mmun_end   = haddr + HPAGE_PMD_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_trans_huge(*pmd))) {
 		spin_unlock(&mm->page_table_lock);
+		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+		return;
+	}
+	if (is_huge_zero_pmd(*pmd)) {
+		__split_huge_zero_page_pmd(vma, haddr, pmd);
+		spin_unlock(&mm->page_table_lock);
+		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 		return;
 	}
 	page = pmd_page(*pmd);
 	VM_BUG_ON(!page_count(page));
 	get_page(page);
 	spin_unlock(&mm->page_table_lock);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
 	split_huge_page(page);
 
@@ -2356,6 +2653,16 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
 	BUG_ON(pmd_trans_huge(*pmd));
 }
 
+void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
+		pmd_t *pmd)
+{
+	struct vm_area_struct *vma;
+
+	vma = find_vma(mm, address);
+	BUG_ON(vma == NULL);
+	split_huge_page_pmd(vma, address, pmd);
+}
+
 static void split_huge_page_address(struct mm_struct *mm,
 				    unsigned long address)
 {
@@ -2370,7 +2677,7 @@ static void split_huge_page_address(struct mm_struct *mm,
 	 * Caller holds the mmap_sem write mode, so a huge pmd cannot
 	 * materialize from under us.
 	 */
-	split_huge_page_pmd(mm, pmd);
+	split_huge_page_pmd_mm(mm, address, pmd);
 }
 
 void __vma_adjust_trans_huge(struct vm_area_struct *vma,