Diffstat (limited to 'mm/huge_memory.c')
 -rw-r--r--  mm/huge_memory.c | 441
 1 file changed, 211 insertions(+), 230 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 141dbb695097..40f17c34b415 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -17,6 +17,7 @@
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
 #include <linux/mman.h>
+#include <linux/pagemap.h>
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
@@ -102,10 +103,7 @@ static int set_recommended_min_free_kbytes(void)
 	unsigned long recommended_min;
 	extern int min_free_kbytes;
 
-	if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
-		      &transparent_hugepage_flags) &&
-	    !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
-		      &transparent_hugepage_flags))
+	if (!khugepaged_enabled())
 		return 0;
 
 	for_each_populated_zone(zone)
@@ -139,12 +137,6 @@ static int start_khugepaged(void)
 {
 	int err = 0;
 	if (khugepaged_enabled()) {
-		int wakeup;
-		if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
-			err = -ENOMEM;
-			goto out;
-		}
-		mutex_lock(&khugepaged_mutex);
 		if (!khugepaged_thread)
 			khugepaged_thread = kthread_run(khugepaged, NULL,
 							"khugepaged");
@@ -154,16 +146,16 @@ static int start_khugepaged(void)
 			err = PTR_ERR(khugepaged_thread);
 			khugepaged_thread = NULL;
 		}
-		wakeup = !list_empty(&khugepaged_scan.mm_head);
-		mutex_unlock(&khugepaged_mutex);
-		if (wakeup)
+
+		if (!list_empty(&khugepaged_scan.mm_head))
 			wake_up_interruptible(&khugepaged_wait);
 
 		set_recommended_min_free_kbytes();
-	} else
-		/* wakeup to exit */
-		wake_up_interruptible(&khugepaged_wait);
-out:
+	} else if (khugepaged_thread) {
+		kthread_stop(khugepaged_thread);
+		khugepaged_thread = NULL;
+	}
+
 	return err;
 }
 
@@ -224,18 +216,16 @@ static ssize_t enabled_store(struct kobject *kobj,
 				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
 
 	if (ret > 0) {
-		int err = start_khugepaged();
+		int err;
+
+		mutex_lock(&khugepaged_mutex);
+		err = start_khugepaged();
+		mutex_unlock(&khugepaged_mutex);
+
 		if (err)
 			ret = err;
 	}
 
-	if (ret > 0 &&
-	    (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
-		      &transparent_hugepage_flags) ||
-	     test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
-		      &transparent_hugepage_flags)))
-		set_recommended_min_free_kbytes();
-
 	return ret;
 }
 static struct kobj_attribute enabled_attr =
@@ -570,8 +560,6 @@ static int __init hugepage_init(void)
 
 	start_khugepaged();
 
-	set_recommended_min_free_kbytes();
-
 	return 0;
 out:
 	hugepage_exit_sysfs(hugepage_kobj);
@@ -611,19 +599,6 @@ out:
 }
 __setup("transparent_hugepage=", setup_transparent_hugepage);
 
-static void prepare_pmd_huge_pte(pgtable_t pgtable,
-				 struct mm_struct *mm)
-{
-	assert_spin_locked(&mm->page_table_lock);
-
-	/* FIFO */
-	if (!mm->pmd_huge_pte)
-		INIT_LIST_HEAD(&pgtable->lru);
-	else
-		list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
-	mm->pmd_huge_pte = pgtable;
-}
-
 static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 {
 	if (likely(vma->vm_flags & VM_WRITE))
@@ -665,7 +640,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		 */
 		page_add_new_anon_rmap(page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
-		prepare_pmd_huge_pte(pgtable, mm);
+		pgtable_trans_huge_deposit(mm, pgtable);
 		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
 		mm->nr_ptes++;
 		spin_unlock(&mm->page_table_lock);
@@ -791,7 +766,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
 	pmd = pmd_mkold(pmd_wrprotect(pmd));
 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-	prepare_pmd_huge_pte(pgtable, dst_mm);
+	pgtable_trans_huge_deposit(dst_mm, pgtable);
 	dst_mm->nr_ptes++;
 
 	ret = 0;
@@ -802,25 +777,6 @@ out:
 	return ret;
 }
 
-/* no "address" argument so destroys page coloring of some arch */
-pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
-{
-	pgtable_t pgtable;
-
-	assert_spin_locked(&mm->page_table_lock);
-
-	/* FIFO */
-	pgtable = mm->pmd_huge_pte;
-	if (list_empty(&pgtable->lru))
-		mm->pmd_huge_pte = NULL;
-	else {
-		mm->pmd_huge_pte = list_entry(pgtable->lru.next,
-					      struct page, lru);
-		list_del(&pgtable->lru);
-	}
-	return pgtable;
-}
-
 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long address,
@@ -832,6 +788,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 	pmd_t _pmd;
 	int ret = 0, i;
 	struct page **pages;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
 
 	pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
 			GFP_KERNEL);
@@ -868,15 +826,19 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 		cond_resched();
 	}
 
+	mmun_start = haddr;
+	mmun_end   = haddr + HPAGE_PMD_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_same(*pmd, orig_pmd)))
 		goto out_free_pages;
 	VM_BUG_ON(!PageHead(page));
 
-	pmdp_clear_flush_notify(vma, haddr, pmd);
+	pmdp_clear_flush(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
-	pgtable = get_pmd_huge_pte(mm);
+	pgtable = pgtable_trans_huge_withdraw(mm);
 	pmd_populate(mm, &_pmd, pgtable);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -896,6 +858,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 	page_remove_rmap(page);
 	spin_unlock(&mm->page_table_lock);
 
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
 	ret |= VM_FAULT_WRITE;
 	put_page(page);
 
@@ -904,6 +868,7 @@ out:
 
 out_free_pages:
 	spin_unlock(&mm->page_table_lock);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	mem_cgroup_uncharge_start();
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
 		mem_cgroup_uncharge_page(pages[i]);
@@ -920,6 +885,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int ret = 0;
 	struct page *page, *new_page;
 	unsigned long haddr;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
 
 	VM_BUG_ON(!vma->anon_vma);
 	spin_lock(&mm->page_table_lock);
@@ -934,7 +901,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		entry = pmd_mkyoung(orig_pmd);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
-			update_mmu_cache(vma, address, entry);
+			update_mmu_cache_pmd(vma, address, pmd);
 		ret |= VM_FAULT_WRITE;
 		goto out_unlock;
 	}
@@ -970,38 +937,47 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
 	__SetPageUptodate(new_page);
 
+	mmun_start = haddr;
+	mmun_end   = haddr + HPAGE_PMD_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
 	spin_lock(&mm->page_table_lock);
 	put_page(page);
 	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
 		spin_unlock(&mm->page_table_lock);
 		mem_cgroup_uncharge_page(new_page);
 		put_page(new_page);
-		goto out;
+		goto out_mn;
 	} else {
 		pmd_t entry;
 		VM_BUG_ON(!PageHead(page));
 		entry = mk_pmd(new_page, vma->vm_page_prot);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		entry = pmd_mkhuge(entry);
-		pmdp_clear_flush_notify(vma, haddr, pmd);
+		pmdp_clear_flush(vma, haddr, pmd);
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
-		update_mmu_cache(vma, address, entry);
+		update_mmu_cache_pmd(vma, address, pmd);
 		page_remove_rmap(page);
 		put_page(page);
 		ret |= VM_FAULT_WRITE;
 	}
-out_unlock:
 	spin_unlock(&mm->page_table_lock);
+out_mn:
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 out:
 	return ret;
+out_unlock:
+	spin_unlock(&mm->page_table_lock);
+	return ret;
 }
 
-struct page *follow_trans_huge_pmd(struct mm_struct *mm,
+struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 				   unsigned long addr,
 				   pmd_t *pmd,
 				   unsigned int flags)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	struct page *page = NULL;
 
 	assert_spin_locked(&mm->page_table_lock);
@@ -1024,6 +1000,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
 		_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
 		set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
 	}
+	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+		if (page->mapping && trylock_page(page)) {
+			lru_add_drain();
+			if (page->mapping)
+				mlock_vma_page(page);
+			unlock_page(page);
+		}
+	}
 	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
 	VM_BUG_ON(!PageCompound(page));
 	if (flags & FOLL_GET)
@@ -1041,9 +1025,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	if (__pmd_trans_huge_lock(pmd, vma) == 1) {
 		struct page *page;
 		pgtable_t pgtable;
-		pgtable = get_pmd_huge_pte(tlb->mm);
-		page = pmd_page(*pmd);
-		pmd_clear(pmd);
+		pmd_t orig_pmd;
+		pgtable = pgtable_trans_huge_withdraw(tlb->mm);
+		orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
+		page = pmd_page(orig_pmd);
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
 		page_remove_rmap(page);
 		VM_BUG_ON(page_mapcount(page) < 0);
@@ -1207,7 +1192,11 @@ static int __split_huge_page_splitting(struct page *page,
 	struct mm_struct *mm = vma->vm_mm;
 	pmd_t *pmd;
 	int ret = 0;
+	/* For mmu_notifiers */
+	const unsigned long mmun_start = address;
+	const unsigned long mmun_end   = address + HPAGE_PMD_SIZE;
 
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	spin_lock(&mm->page_table_lock);
 	pmd = page_check_address_pmd(page, mm, address,
 				     PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
@@ -1219,10 +1208,11 @@ static int __split_huge_page_splitting(struct page *page,
 		 * and it won't wait on the anon_vma->root->mutex to
 		 * serialize against split_huge_page*.
 		 */
-		pmdp_splitting_flush_notify(vma, address, pmd);
+		pmdp_splitting_flush(vma, address, pmd);
 		ret = 1;
 	}
 	spin_unlock(&mm->page_table_lock);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
 	return ret;
 }
@@ -1358,11 +1348,11 @@ static int __split_huge_page_map(struct page *page,
 	pmd = page_check_address_pmd(page, mm, address,
 				     PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
 	if (pmd) {
-		pgtable = get_pmd_huge_pte(mm);
+		pgtable = pgtable_trans_huge_withdraw(mm);
 		pmd_populate(mm, &_pmd, pgtable);
 
-		for (i = 0, haddr = address; i < HPAGE_PMD_NR;
-		     i++, haddr += PAGE_SIZE) {
+		haddr = address;
+		for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
 			pte_t *pte, entry;
 			BUG_ON(PageCompound(page+i));
 			entry = mk_pte(page + i, vma->vm_page_prot);
@@ -1406,8 +1396,7 @@ static int __split_huge_page_map(struct page *page,
 		 * SMP TLB and finally we write the non-huge version
 		 * of the pmd entry with pmd_populate.
 		 */
-		set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
-		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+		pmdp_invalidate(vma, address, pmd);
 		pmd_populate(mm, pmd, pgtable);
 		ret = 1;
 	}
@@ -1421,18 +1410,17 @@ static void __split_huge_page(struct page *page,
 			      struct anon_vma *anon_vma)
 {
 	int mapcount, mapcount2;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	struct anon_vma_chain *avc;
 
 	BUG_ON(!PageHead(page));
 	BUG_ON(PageTail(page));
 
 	mapcount = 0;
-	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long addr = vma_address(page, vma);
 		BUG_ON(is_vma_temporary_stack(vma));
-		if (addr == -EFAULT)
-			continue;
 		mapcount += __split_huge_page_splitting(page, vma, addr);
 	}
 	/*
@@ -1453,12 +1441,10 @@ static void __split_huge_page(struct page *page,
 	__split_huge_page_refcount(page);
 
 	mapcount2 = 0;
-	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long addr = vma_address(page, vma);
 		BUG_ON(is_vma_temporary_stack(vma));
-		if (addr == -EFAULT)
-			continue;
 		mapcount2 += __split_huge_page_map(page, vma, addr);
 	}
 	if (mapcount != mapcount2)
@@ -1491,12 +1477,13 @@ out:
 	return ret;
 }
 
-#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \
-		   VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
+#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
 
 int hugepage_madvise(struct vm_area_struct *vma,
 		     unsigned long *vm_flags, int advice)
 {
+	struct mm_struct *mm = vma->vm_mm;
+
 	switch (advice) {
 	case MADV_HUGEPAGE:
 		/*
@@ -1504,6 +1491,8 @@ int hugepage_madvise(struct vm_area_struct *vma,
 		 */
 		if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
 			return -EINVAL;
+		if (mm->def_flags & VM_NOHUGEPAGE)
+			return -EINVAL;
 		*vm_flags &= ~VM_NOHUGEPAGE;
 		*vm_flags |= VM_HUGEPAGE;
 		/*
@@ -1655,11 +1644,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
 	if (vma->vm_ops)
 		/* khugepaged not yet working on file or special mappings */
 		return 0;
-	/*
-	 * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
-	 * true too, verify it here.
-	 */
-	VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
+	VM_BUG_ON(vma->vm_flags & VM_NO_THP);
 	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
 	hend = vma->vm_end & HPAGE_PMD_MASK;
 	if (hstart < hend)
@@ -1833,28 +1818,35 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 	}
 }
 
-static void collapse_huge_page(struct mm_struct *mm,
-			       unsigned long address,
-			       struct page **hpage,
-			       struct vm_area_struct *vma,
-			       int node)
+static void khugepaged_alloc_sleep(void)
 {
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd, _pmd;
-	pte_t *pte;
-	pgtable_t pgtable;
-	struct page *new_page;
-	spinlock_t *ptl;
-	int isolated;
-	unsigned long hstart, hend;
+	wait_event_freezable_timeout(khugepaged_wait, false,
+			msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+}
 
-	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-#ifndef CONFIG_NUMA
-	up_read(&mm->mmap_sem);
-	VM_BUG_ON(!*hpage);
-	new_page = *hpage;
-#else
+#ifdef CONFIG_NUMA
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+{
+	if (IS_ERR(*hpage)) {
+		if (!*wait)
+			return false;
+
+		*wait = false;
+		*hpage = NULL;
+		khugepaged_alloc_sleep();
+	} else if (*hpage) {
+		put_page(*hpage);
+		*hpage = NULL;
+	}
+
+	return true;
+}
+
+static struct page
+*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+		       struct vm_area_struct *vma, unsigned long address,
+		       int node)
+{
 	VM_BUG_ON(*hpage);
 	/*
 	 * Allocate the page while the vma is still valid and under
@@ -1866,7 +1858,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 * mmap_sem in read mode is good idea also to allow greater
 	 * scalability.
 	 */
-	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+	*hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
 				      node, __GFP_OTHER_NODE);
 
 	/*
@@ -1874,20 +1866,85 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 * preparation for taking it in write mode.
 	 */
 	up_read(&mm->mmap_sem);
-	if (unlikely(!new_page)) {
+	if (unlikely(!*hpage)) {
 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
 		*hpage = ERR_PTR(-ENOMEM);
-		return;
+		return NULL;
 	}
-#endif
 
 	count_vm_event(THP_COLLAPSE_ALLOC);
-	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
-#ifdef CONFIG_NUMA
-		put_page(new_page);
+	return *hpage;
+}
+#else
+static struct page *khugepaged_alloc_hugepage(bool *wait)
+{
+	struct page *hpage;
+
+	do {
+		hpage = alloc_hugepage(khugepaged_defrag());
+		if (!hpage) {
+			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+			if (!*wait)
+				return NULL;
+
+			*wait = false;
+			khugepaged_alloc_sleep();
+		} else
+			count_vm_event(THP_COLLAPSE_ALLOC);
+	} while (unlikely(!hpage) && likely(khugepaged_enabled()));
+
+	return hpage;
+}
+
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+{
+	if (!*hpage)
+		*hpage = khugepaged_alloc_hugepage(wait);
+
+	if (unlikely(!*hpage))
+		return false;
+
+	return true;
+}
+
+static struct page
+*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+		       struct vm_area_struct *vma, unsigned long address,
+		       int node)
+{
+	up_read(&mm->mmap_sem);
+	VM_BUG_ON(!*hpage);
+	return *hpage;
+}
 #endif
+
+static void collapse_huge_page(struct mm_struct *mm,
+			       unsigned long address,
+			       struct page **hpage,
+			       struct vm_area_struct *vma,
+			       int node)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd, _pmd;
+	pte_t *pte;
+	pgtable_t pgtable;
+	struct page *new_page;
+	spinlock_t *ptl;
+	int isolated;
+	unsigned long hstart, hend;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+	/* release the mmap_sem read lock. */
+	new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
+	if (!new_page)
+		return;
+
+	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
 		return;
-	}
 
 	/*
 	 * Prevent all access to pagetables with the exception of
@@ -1912,11 +1969,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 		goto out;
 	if (is_vma_temporary_stack(vma))
 		goto out;
-	/*
-	 * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
-	 * true too, verify it here.
-	 */
-	VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
+	VM_BUG_ON(vma->vm_flags & VM_NO_THP);
 
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
@@ -1936,6 +1989,9 @@ static void collapse_huge_page(struct mm_struct *mm,
 	pte = pte_offset_map(pmd, address);
 	ptl = pte_lockptr(mm, pmd);
 
+	mmun_start = address;
+	mmun_end   = address + HPAGE_PMD_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	spin_lock(&mm->page_table_lock); /* probably unnecessary */
 	/*
 	 * After this gup_fast can't run anymore. This also removes
@@ -1943,8 +1999,9 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 * huge and small TLB entries for the same virtual address
 	 * to avoid the risk of CPU bugs in that area.
 	 */
-	_pmd = pmdp_clear_flush_notify(vma, address, pmd);
+	_pmd = pmdp_clear_flush(vma, address, pmd);
 	spin_unlock(&mm->page_table_lock);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
 	spin_lock(ptl);
 	isolated = __collapse_huge_page_isolate(vma, address, pte);
@@ -1970,8 +2027,6 @@ static void collapse_huge_page(struct mm_struct *mm,
 	pte_unmap(pte);
 	__SetPageUptodate(new_page);
 	pgtable = pmd_pgtable(_pmd);
-	VM_BUG_ON(page_count(pgtable) != 1);
-	VM_BUG_ON(page_mapcount(pgtable) != 0);
 
 	_pmd = mk_pmd(new_page, vma->vm_page_prot);
 	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
@@ -1988,13 +2043,12 @@ static void collapse_huge_page(struct mm_struct *mm,
 	BUG_ON(!pmd_none(*pmd));
 	page_add_new_anon_rmap(new_page, vma, address);
 	set_pmd_at(mm, address, pmd, _pmd);
-	update_mmu_cache(vma, address, _pmd);
-	prepare_pmd_huge_pte(pgtable, mm);
+	update_mmu_cache_pmd(vma, address, pmd);
+	pgtable_trans_huge_deposit(mm, pgtable);
 	spin_unlock(&mm->page_table_lock);
 
-#ifndef CONFIG_NUMA
 	*hpage = NULL;
-#endif
+
 	khugepaged_pages_collapsed++;
 out_up_write:
 	up_write(&mm->mmap_sem);
@@ -2002,9 +2056,6 @@ out_up_write:
 
 out:
 	mem_cgroup_uncharge_page(new_page);
-#ifdef CONFIG_NUMA
-	put_page(new_page);
-#endif
 	goto out_up_write;
 }
 
@@ -2154,12 +2205,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 			goto skip;
 		if (is_vma_temporary_stack(vma))
 			goto skip;
-		/*
-		 * If is_pfn_mapping() is true is_learn_pfn_mapping()
-		 * must be true too, verify it here.
-		 */
-		VM_BUG_ON(is_linear_pfn_mapping(vma) ||
-			  vma->vm_flags & VM_NO_THP);
+		VM_BUG_ON(vma->vm_flags & VM_NO_THP);
 
 		hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
 		hend = vma->vm_end & HPAGE_PMD_MASK;
@@ -2234,32 +2280,23 @@ static int khugepaged_has_work(void)
 static int khugepaged_wait_event(void)
 {
 	return !list_empty(&khugepaged_scan.mm_head) ||
-		!khugepaged_enabled();
+		kthread_should_stop();
 }
 
-static void khugepaged_do_scan(struct page **hpage)
+static void khugepaged_do_scan(void)
 {
+	struct page *hpage = NULL;
 	unsigned int progress = 0, pass_through_head = 0;
 	unsigned int pages = khugepaged_pages_to_scan;
+	bool wait = true;
 
 	barrier(); /* write khugepaged_pages_to_scan to local stack */
 
 	while (progress < pages) {
-		cond_resched();
-
-#ifndef CONFIG_NUMA
-		if (!*hpage) {
-			*hpage = alloc_hugepage(khugepaged_defrag());
-			if (unlikely(!*hpage)) {
-				count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
-				break;
-			}
-			count_vm_event(THP_COLLAPSE_ALLOC);
-		}
-#else
-		if (IS_ERR(*hpage))
+		if (!khugepaged_prealloc_page(&hpage, &wait))
 			break;
-#endif
+
+		cond_resched();
 
 		if (unlikely(kthread_should_stop() || freezing(current)))
 			break;
@@ -2270,73 +2307,32 @@ static void khugepaged_do_scan(struct page **hpage)
 		if (khugepaged_has_work() &&
 		    pass_through_head < 2)
 			progress += khugepaged_scan_mm_slot(pages - progress,
-							     hpage);
+							     &hpage);
 		else
 			progress = pages;
 		spin_unlock(&khugepaged_mm_lock);
 	}
-}
 
-static void khugepaged_alloc_sleep(void)
-{
-	wait_event_freezable_timeout(khugepaged_wait, false,
-			msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+	if (!IS_ERR_OR_NULL(hpage))
+		put_page(hpage);
 }
 
-#ifndef CONFIG_NUMA
-static struct page *khugepaged_alloc_hugepage(void)
+static void khugepaged_wait_work(void)
 {
-	struct page *hpage;
-
-	do {
-		hpage = alloc_hugepage(khugepaged_defrag());
-		if (!hpage) {
-			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
-			khugepaged_alloc_sleep();
-		} else
-			count_vm_event(THP_COLLAPSE_ALLOC);
-	} while (unlikely(!hpage) &&
-		 likely(khugepaged_enabled()));
-	return hpage;
-}
-#endif
+	try_to_freeze();
 
-static void khugepaged_loop(void)
-{
-	struct page *hpage;
-
-#ifdef CONFIG_NUMA
-	hpage = NULL;
-#endif
-	while (likely(khugepaged_enabled())) {
-#ifndef CONFIG_NUMA
-		hpage = khugepaged_alloc_hugepage();
-		if (unlikely(!hpage))
-			break;
-#else
-		if (IS_ERR(hpage)) {
-			khugepaged_alloc_sleep();
-			hpage = NULL;
-		}
-#endif
-
-		khugepaged_do_scan(&hpage);
-#ifndef CONFIG_NUMA
-		if (hpage)
-			put_page(hpage);
-#endif
-		try_to_freeze();
-		if (unlikely(kthread_should_stop()))
-			break;
-		if (khugepaged_has_work()) {
-			if (!khugepaged_scan_sleep_millisecs)
-				continue;
-			wait_event_freezable_timeout(khugepaged_wait, false,
-				msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
-		} else if (khugepaged_enabled())
-			wait_event_freezable(khugepaged_wait,
-					     khugepaged_wait_event());
+	if (khugepaged_has_work()) {
+		if (!khugepaged_scan_sleep_millisecs)
+			return;
+
+		wait_event_freezable_timeout(khugepaged_wait,
+					     kthread_should_stop(),
+			msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
+		return;
 	}
+
+	if (khugepaged_enabled())
+		wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
 }
 
 static int khugepaged(void *none)
@@ -2346,20 +2342,9 @@ static int khugepaged(void *none)
 	set_freezable();
 	set_user_nice(current, 19);
 
-	/* serialize with start_khugepaged() */
-	mutex_lock(&khugepaged_mutex);
-
-	for (;;) {
-		mutex_unlock(&khugepaged_mutex);
-		VM_BUG_ON(khugepaged_thread != current);
-		khugepaged_loop();
-		VM_BUG_ON(khugepaged_thread != current);
-
-		mutex_lock(&khugepaged_mutex);
-		if (!khugepaged_enabled())
-			break;
-		if (unlikely(kthread_should_stop()))
-			break;
+	while (!kthread_should_stop()) {
+		khugepaged_do_scan();
+		khugepaged_wait_work();
 	}
 
 	spin_lock(&khugepaged_mm_lock);
@@ -2368,10 +2353,6 @@ static int khugepaged(void *none)
 	if (mm_slot)
 		collect_mm_slot(mm_slot);
 	spin_unlock(&khugepaged_mm_lock);
-
-	khugepaged_thread = NULL;
-	mutex_unlock(&khugepaged_mutex);
-
 	return 0;
 }
 
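Note on the pgtable_trans_huge_deposit()/pgtable_trans_huge_withdraw() calls that replace prepare_pmd_huge_pte()/get_pmd_huge_pte() in this diff: the new helpers are not defined in mm/huge_memory.c, so the sketch below is only an illustration of what a generic implementation could look like, assuming it keeps the same FIFO protocol as the bodies removed above (architectures may provide their own overrides; everything here other than the two function names is taken from the removed code).

/*
 * Illustrative sketch only (not part of this diff): a generic
 * deposit/withdraw pair mirroring the FIFO list protocol of the
 * helpers removed above.  Caller must hold mm->page_table_lock.
 */
void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
{
	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	if (!mm->pmd_huge_pte)
		INIT_LIST_HEAD(&pgtable->lru);
	else
		list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
	mm->pmd_huge_pte = pgtable;
}

/* no "address" argument so destroys page coloring of some arch */
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
{
	pgtable_t pgtable;

	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	pgtable = mm->pmd_huge_pte;
	if (list_empty(&pgtable->lru))
		mm->pmd_huge_pte = NULL;
	else {
		mm->pmd_huge_pte = list_entry(pgtable->lru.next,
					      struct page, lru);
		list_del(&pgtable->lru);
	}
	return pgtable;
}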