Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c         15
-rw-r--r--  mm/huge_memory.c       108
-rw-r--r--  mm/hugetlb.c            10
-rw-r--r--  mm/internal.h            7
-rw-r--r--  mm/ksm.c                 6
-rw-r--r--  mm/memcontrol.c          7
-rw-r--r--  mm/memory-failure.c      7
-rw-r--r--  mm/memory.c            198
-rw-r--r--  mm/memory_hotplug.c      3
-rw-r--r--  mm/mempolicy.c         283
-rw-r--r--  mm/migrate.c           337
-rw-r--r--  mm/mmap.c               10
-rw-r--r--  mm/mprotect.c          135
-rw-r--r--  mm/mremap.c              2
-rw-r--r--  mm/page_alloc.c         10
-rw-r--r--  mm/pgtable-generic.c     9
-rw-r--r--  mm/rmap.c               66
-rw-r--r--  mm/vmstat.c             16
18 files changed, 1098 insertions, 131 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 129791218226..5ad7f4f4d6f7 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -303,6 +303,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
303 if (blockpfn == end_pfn) 303 if (blockpfn == end_pfn)
304 update_pageblock_skip(cc, valid_page, total_isolated, false); 304 update_pageblock_skip(cc, valid_page, total_isolated, false);
305 305
306 count_vm_events(COMPACTFREE_SCANNED, nr_scanned);
307 if (total_isolated)
308 count_vm_events(COMPACTISOLATED, total_isolated);
309
306 return total_isolated; 310 return total_isolated;
307} 311}
308 312
@@ -609,6 +613,10 @@ next_pageblock:
609 613
610 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 614 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
611 615
616 count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned);
617 if (nr_isolated)
618 count_vm_events(COMPACTISOLATED, nr_isolated);
619
612 return low_pfn; 620 return low_pfn;
613} 621}
614 622
@@ -1015,14 +1023,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1015 nr_migrate = cc->nr_migratepages; 1023 nr_migrate = cc->nr_migratepages;
1016 err = migrate_pages(&cc->migratepages, compaction_alloc, 1024 err = migrate_pages(&cc->migratepages, compaction_alloc,
1017 (unsigned long)cc, false, 1025 (unsigned long)cc, false,
1018 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); 1026 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
1027 MR_COMPACTION);
1019 update_nr_listpages(cc); 1028 update_nr_listpages(cc);
1020 nr_remaining = cc->nr_migratepages; 1029 nr_remaining = cc->nr_migratepages;
1021 1030
1022 count_vm_event(COMPACTBLOCKS);
1023 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
1024 if (nr_remaining)
1025 count_vm_events(COMPACTPAGEFAILED, nr_remaining);
1026 trace_mm_compaction_migratepages(nr_migrate - nr_remaining, 1031 trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
1027 nr_remaining); 1032 nr_remaining);
1028 1033
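
The compaction hunks above drop the old COMPACTBLOCKS/COMPACTPAGES/COMPACTPAGEFAILED accounting, add scanned/isolated counters, and pass MR_COMPACTION as the new migrate_pages() reason argument. A minimal userspace sketch for watching the resulting statistics follows; the lowercase /proc/vmstat spellings it greps for (compact_migrate_scanned, compact_free_scanned, compact_isolated, pgmigrate_success, pgmigrate_fail) are assumed from the usual naming convention and are not defined by this hunk.

/* Sketch only: dump the compaction/migration counters fed by the hunks above. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	static const char *keys[] = {
		"compact_migrate_scanned", "compact_free_scanned",
		"compact_isolated", "pgmigrate_success", "pgmigrate_fail",
	};
	char line[256];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		for (size_t i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
			if (!strncmp(line, keys[i], strlen(keys[i])))
				fputs(line, stdout);	/* prints "name value" */
	}
	fclose(f);
	return 0;
}
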
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 827d9c813051..d7ee1691fd21 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -19,6 +19,7 @@
19#include <linux/freezer.h> 19#include <linux/freezer.h>
20#include <linux/mman.h> 20#include <linux/mman.h>
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/migrate.h>
22 23
23#include <asm/tlb.h> 24#include <asm/tlb.h>
24#include <asm/pgalloc.h> 25#include <asm/pgalloc.h>
@@ -690,7 +691,7 @@ out:
690} 691}
691__setup("transparent_hugepage=", setup_transparent_hugepage); 692__setup("transparent_hugepage=", setup_transparent_hugepage);
692 693
693static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) 694pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
694{ 695{
695 if (likely(vma->vm_flags & VM_WRITE)) 696 if (likely(vma->vm_flags & VM_WRITE))
696 pmd = pmd_mkwrite(pmd); 697 pmd = pmd_mkwrite(pmd);
@@ -848,7 +849,8 @@ out:
848 * run pte_offset_map on the pmd, if an huge pmd could 849 * run pte_offset_map on the pmd, if an huge pmd could
849 * materialize from under us from a different thread. 850 * materialize from under us from a different thread.
850 */ 851 */
851 if (unlikely(__pte_alloc(mm, vma, pmd, address))) 852 if (unlikely(pmd_none(*pmd)) &&
853 unlikely(__pte_alloc(mm, vma, pmd, address)))
852 return VM_FAULT_OOM; 854 return VM_FAULT_OOM;
853 /* if an huge pmd materialized from under us just retry later */ 855 /* if an huge pmd materialized from under us just retry later */
854 if (unlikely(pmd_trans_huge(*pmd))) 856 if (unlikely(pmd_trans_huge(*pmd)))
@@ -1287,6 +1289,81 @@ out:
1287 return page; 1289 return page;
1288} 1290}
1289 1291
1292/* NUMA hinting page fault entry point for trans huge pmds */
1293int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1294 unsigned long addr, pmd_t pmd, pmd_t *pmdp)
1295{
1296 struct page *page;
1297 unsigned long haddr = addr & HPAGE_PMD_MASK;
1298 int target_nid;
1299 int current_nid = -1;
1300 bool migrated;
1301 bool page_locked = false;
1302
1303 spin_lock(&mm->page_table_lock);
1304 if (unlikely(!pmd_same(pmd, *pmdp)))
1305 goto out_unlock;
1306
1307 page = pmd_page(pmd);
1308 get_page(page);
1309 current_nid = page_to_nid(page);
1310 count_vm_numa_event(NUMA_HINT_FAULTS);
1311 if (current_nid == numa_node_id())
1312 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
1313
1314 target_nid = mpol_misplaced(page, vma, haddr);
1315 if (target_nid == -1) {
1316 put_page(page);
1317 goto clear_pmdnuma;
1318 }
1319
1320 /* Acquire the page lock to serialise THP migrations */
1321 spin_unlock(&mm->page_table_lock);
1322 lock_page(page);
1323 page_locked = true;
1324
 1325 /* Confirm the PMD did not change while the page was being locked */
1326 spin_lock(&mm->page_table_lock);
1327 if (unlikely(!pmd_same(pmd, *pmdp))) {
1328 unlock_page(page);
1329 put_page(page);
1330 goto out_unlock;
1331 }
1332 spin_unlock(&mm->page_table_lock);
1333
1334 /* Migrate the THP to the requested node */
1335 migrated = migrate_misplaced_transhuge_page(mm, vma,
1336 pmdp, pmd, addr,
1337 page, target_nid);
1338 if (migrated)
1339 current_nid = target_nid;
1340 else {
1341 spin_lock(&mm->page_table_lock);
1342 if (unlikely(!pmd_same(pmd, *pmdp))) {
1343 unlock_page(page);
1344 goto out_unlock;
1345 }
1346 goto clear_pmdnuma;
1347 }
1348
1349 task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
1350 return 0;
1351
1352clear_pmdnuma:
1353 pmd = pmd_mknonnuma(pmd);
1354 set_pmd_at(mm, haddr, pmdp, pmd);
1355 VM_BUG_ON(pmd_numa(*pmdp));
1356 update_mmu_cache_pmd(vma, addr, pmdp);
1357 if (page_locked)
1358 unlock_page(page);
1359
1360out_unlock:
1361 spin_unlock(&mm->page_table_lock);
1362 if (current_nid != -1)
1363 task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
1364 return 0;
1365}
1366
1290int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1367int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1291 pmd_t *pmd, unsigned long addr) 1368 pmd_t *pmd, unsigned long addr)
1292{ 1369{
@@ -1375,7 +1452,7 @@ out:
1375} 1452}
1376 1453
1377int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1454int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1378 unsigned long addr, pgprot_t newprot) 1455 unsigned long addr, pgprot_t newprot, int prot_numa)
1379{ 1456{
1380 struct mm_struct *mm = vma->vm_mm; 1457 struct mm_struct *mm = vma->vm_mm;
1381 int ret = 0; 1458 int ret = 0;
@@ -1383,7 +1460,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1383 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1460 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1384 pmd_t entry; 1461 pmd_t entry;
1385 entry = pmdp_get_and_clear(mm, addr, pmd); 1462 entry = pmdp_get_and_clear(mm, addr, pmd);
1386 entry = pmd_modify(entry, newprot); 1463 if (!prot_numa)
1464 entry = pmd_modify(entry, newprot);
1465 else {
1466 struct page *page = pmd_page(*pmd);
1467
1468 /* only check non-shared pages */
1469 if (page_mapcount(page) == 1 &&
1470 !pmd_numa(*pmd)) {
1471 entry = pmd_mknuma(entry);
1472 }
1473 }
1387 BUG_ON(pmd_write(entry)); 1474 BUG_ON(pmd_write(entry));
1388 set_pmd_at(mm, addr, pmd, entry); 1475 set_pmd_at(mm, addr, pmd, entry);
1389 spin_unlock(&vma->vm_mm->page_table_lock); 1476 spin_unlock(&vma->vm_mm->page_table_lock);
@@ -1474,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page,
1474 * We can't temporarily set the pmd to null in order 1561 * We can't temporarily set the pmd to null in order
1475 * to split it, the pmd must remain marked huge at all 1562 * to split it, the pmd must remain marked huge at all
1476 * times or the VM won't take the pmd_trans_huge paths 1563 * times or the VM won't take the pmd_trans_huge paths
1477 * and it won't wait on the anon_vma->root->mutex to 1564 * and it won't wait on the anon_vma->root->rwsem to
1478 * serialize against split_huge_page*. 1565 * serialize against split_huge_page*.
1479 */ 1566 */
1480 pmdp_splitting_flush(vma, address, pmd); 1567 pmdp_splitting_flush(vma, address, pmd);
@@ -1565,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page)
1565 page_tail->mapping = page->mapping; 1652 page_tail->mapping = page->mapping;
1566 1653
1567 page_tail->index = page->index + i; 1654 page_tail->index = page->index + i;
1655 page_xchg_last_nid(page_tail, page_last_nid(page));
1568 1656
1569 BUG_ON(!PageAnon(page_tail)); 1657 BUG_ON(!PageAnon(page_tail));
1570 BUG_ON(!PageUptodate(page_tail)); 1658 BUG_ON(!PageUptodate(page_tail));
@@ -1632,6 +1720,8 @@ static int __split_huge_page_map(struct page *page,
1632 BUG_ON(page_mapcount(page) != 1); 1720 BUG_ON(page_mapcount(page) != 1);
1633 if (!pmd_young(*pmd)) 1721 if (!pmd_young(*pmd))
1634 entry = pte_mkold(entry); 1722 entry = pte_mkold(entry);
1723 if (pmd_numa(*pmd))
1724 entry = pte_mknuma(entry);
1635 pte = pte_offset_map(&_pmd, haddr); 1725 pte = pte_offset_map(&_pmd, haddr);
1636 BUG_ON(!pte_none(*pte)); 1726 BUG_ON(!pte_none(*pte));
1637 set_pte_at(mm, haddr, pte, entry); 1727 set_pte_at(mm, haddr, pte, entry);
@@ -1674,7 +1764,7 @@ static int __split_huge_page_map(struct page *page,
1674 return ret; 1764 return ret;
1675} 1765}
1676 1766
1677/* must be called with anon_vma->root->mutex hold */ 1767/* must be called with anon_vma->root->rwsem held */
1678static void __split_huge_page(struct page *page, 1768static void __split_huge_page(struct page *page,
1679 struct anon_vma *anon_vma) 1769 struct anon_vma *anon_vma)
1680{ 1770{
@@ -1729,7 +1819,7 @@ int split_huge_page(struct page *page)
1729 1819
1730 BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); 1820 BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
1731 BUG_ON(!PageAnon(page)); 1821 BUG_ON(!PageAnon(page));
1732 anon_vma = page_lock_anon_vma(page); 1822 anon_vma = page_lock_anon_vma_read(page);
1733 if (!anon_vma) 1823 if (!anon_vma)
1734 goto out; 1824 goto out;
1735 ret = 0; 1825 ret = 0;
@@ -1742,7 +1832,7 @@ int split_huge_page(struct page *page)
1742 1832
1743 BUG_ON(PageCompound(page)); 1833 BUG_ON(PageCompound(page));
1744out_unlock: 1834out_unlock:
1745 page_unlock_anon_vma(anon_vma); 1835 page_unlock_anon_vma_read(anon_vma);
1746out: 1836out:
1747 return ret; 1837 return ret;
1748} 1838}
@@ -2234,7 +2324,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2234 if (pmd_trans_huge(*pmd)) 2324 if (pmd_trans_huge(*pmd))
2235 goto out; 2325 goto out;
2236 2326
2237 anon_vma_lock(vma->anon_vma); 2327 anon_vma_lock_write(vma->anon_vma);
2238 2328
2239 pte = pte_offset_map(pmd, address); 2329 pte = pte_offset_map(pmd, address);
2240 ptl = pte_lockptr(mm, pmd); 2330 ptl = pte_lockptr(mm, pmd);
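
do_huge_pmd_numa_page() above is the handler that runs when a transparent huge pmd has been marked pmd_numa(): it rechecks the pmd under page_table_lock, takes the page lock to serialise THP migrations, asks mpol_misplaced() for a target node and, if needed, calls migrate_misplaced_transhuge_page(). A hedged userspace sketch that merely makes such faults likely, by creating a mapping THP can back and touching it repeatedly; whether the hinting fault actually fires depends on CONFIG_NUMA_BALANCING and on the scanner having marked the pmd, so treat this as illustrative only.

/* Sketch only: a THP-friendly mapping whose accesses may take NUMA hinting faults. */
#define _GNU_SOURCE		/* MADV_HUGEPAGE on some libcs */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define LEN (64UL << 20)	/* 64MB, comfortably larger than one huge page */

int main(void)
{
	char *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	madvise(p, LEN, MADV_HUGEPAGE);		/* a hint, not a guarantee */

	for (int pass = 0; pass < 100; pass++)
		memset(p, pass, LEN);		/* repeated accesses over the range */

	munmap(p, LEN);
	return 0;
}
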
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 88e7293b96bd..e5318c7793ae 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3016,7 +3016,7 @@ same_page:
3016 return i ? i : -EFAULT; 3016 return i ? i : -EFAULT;
3017} 3017}
3018 3018
3019void hugetlb_change_protection(struct vm_area_struct *vma, 3019unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3020 unsigned long address, unsigned long end, pgprot_t newprot) 3020 unsigned long address, unsigned long end, pgprot_t newprot)
3021{ 3021{
3022 struct mm_struct *mm = vma->vm_mm; 3022 struct mm_struct *mm = vma->vm_mm;
@@ -3024,6 +3024,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
3024 pte_t *ptep; 3024 pte_t *ptep;
3025 pte_t pte; 3025 pte_t pte;
3026 struct hstate *h = hstate_vma(vma); 3026 struct hstate *h = hstate_vma(vma);
3027 unsigned long pages = 0;
3027 3028
3028 BUG_ON(address >= end); 3029 BUG_ON(address >= end);
3029 flush_cache_range(vma, address, end); 3030 flush_cache_range(vma, address, end);
@@ -3034,12 +3035,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
3034 ptep = huge_pte_offset(mm, address); 3035 ptep = huge_pte_offset(mm, address);
3035 if (!ptep) 3036 if (!ptep)
3036 continue; 3037 continue;
3037 if (huge_pmd_unshare(mm, &address, ptep)) 3038 if (huge_pmd_unshare(mm, &address, ptep)) {
3039 pages++;
3038 continue; 3040 continue;
3041 }
3039 if (!huge_pte_none(huge_ptep_get(ptep))) { 3042 if (!huge_pte_none(huge_ptep_get(ptep))) {
3040 pte = huge_ptep_get_and_clear(mm, address, ptep); 3043 pte = huge_ptep_get_and_clear(mm, address, ptep);
3041 pte = pte_mkhuge(pte_modify(pte, newprot)); 3044 pte = pte_mkhuge(pte_modify(pte, newprot));
3042 set_huge_pte_at(mm, address, ptep, pte); 3045 set_huge_pte_at(mm, address, ptep, pte);
3046 pages++;
3043 } 3047 }
3044 } 3048 }
3045 spin_unlock(&mm->page_table_lock); 3049 spin_unlock(&mm->page_table_lock);
@@ -3051,6 +3055,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
3051 */ 3055 */
3052 flush_tlb_range(vma, start, end); 3056 flush_tlb_range(vma, start, end);
3053 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 3057 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
3058
3059 return pages << h->order;
3054} 3060}
3055 3061
3056int hugetlb_reserve_pages(struct inode *inode, 3062int hugetlb_reserve_pages(struct inode *inode,
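
hugetlb_change_protection() now reports how many base pages it changed, returning pages << h->order so that hugetlb mappings feed the same "pages updated" statistic as ordinary ptes. A tiny worked example of that conversion; the order-9 / 2MB figures are just the common x86-64 default, used here purely for illustration.

/* Sketch only: hugetlb entries converted to base pages, as pages << h->order does. */
#include <stdio.h>

int main(void)
{
	unsigned long entries = 3;	/* hugetlb ptes updated in the loop above */
	unsigned int order = 9;		/* 2MB / 4KB = 512 = 1 << 9 on x86-64 */

	printf("%lu hugetlb entries -> %lu base pages\n",
	       entries, entries << order);
	return 0;
}
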
diff --git a/mm/internal.h b/mm/internal.h
index 52d1fa957194..d597f94cc205 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -217,15 +217,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
217{ 217{
218 if (TestClearPageMlocked(page)) { 218 if (TestClearPageMlocked(page)) {
219 unsigned long flags; 219 unsigned long flags;
220 int nr_pages = hpage_nr_pages(page);
220 221
221 local_irq_save(flags); 222 local_irq_save(flags);
222 __dec_zone_page_state(page, NR_MLOCK); 223 __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
223 SetPageMlocked(newpage); 224 SetPageMlocked(newpage);
224 __inc_zone_page_state(newpage, NR_MLOCK); 225 __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
225 local_irq_restore(flags); 226 local_irq_restore(flags);
226 } 227 }
227} 228}
228 229
230extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
231
229#ifdef CONFIG_TRANSPARENT_HUGEPAGE 232#ifdef CONFIG_TRANSPARENT_HUGEPAGE
230extern unsigned long vma_address(struct page *page, 233extern unsigned long vma_address(struct page *page,
231 struct vm_area_struct *vma); 234 struct vm_area_struct *vma);
diff --git a/mm/ksm.c b/mm/ksm.c
index 382d930a0bf1..82dfb4b54321 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1624,7 +1624,7 @@ again:
1624 struct anon_vma_chain *vmac; 1624 struct anon_vma_chain *vmac;
1625 struct vm_area_struct *vma; 1625 struct vm_area_struct *vma;
1626 1626
1627 anon_vma_lock(anon_vma); 1627 anon_vma_lock_write(anon_vma);
1628 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 1628 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1629 0, ULONG_MAX) { 1629 0, ULONG_MAX) {
1630 vma = vmac->vma; 1630 vma = vmac->vma;
@@ -1678,7 +1678,7 @@ again:
1678 struct anon_vma_chain *vmac; 1678 struct anon_vma_chain *vmac;
1679 struct vm_area_struct *vma; 1679 struct vm_area_struct *vma;
1680 1680
1681 anon_vma_lock(anon_vma); 1681 anon_vma_lock_write(anon_vma);
1682 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 1682 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1683 0, ULONG_MAX) { 1683 0, ULONG_MAX) {
1684 vma = vmac->vma; 1684 vma = vmac->vma;
@@ -1731,7 +1731,7 @@ again:
1731 struct anon_vma_chain *vmac; 1731 struct anon_vma_chain *vmac;
1732 struct vm_area_struct *vma; 1732 struct vm_area_struct *vma;
1733 1733
1734 anon_vma_lock(anon_vma); 1734 anon_vma_lock_write(anon_vma);
1735 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 1735 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1736 0, ULONG_MAX) { 1736 0, ULONG_MAX) {
1737 vma = vmac->vma; 1737 vma = vmac->vma;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6c055929c8cc..bbfac5063ca8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3289,15 +3289,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3289 struct mem_cgroup **memcgp) 3289 struct mem_cgroup **memcgp)
3290{ 3290{
3291 struct mem_cgroup *memcg = NULL; 3291 struct mem_cgroup *memcg = NULL;
3292 unsigned int nr_pages = 1;
3292 struct page_cgroup *pc; 3293 struct page_cgroup *pc;
3293 enum charge_type ctype; 3294 enum charge_type ctype;
3294 3295
3295 *memcgp = NULL; 3296 *memcgp = NULL;
3296 3297
3297 VM_BUG_ON(PageTransHuge(page));
3298 if (mem_cgroup_disabled()) 3298 if (mem_cgroup_disabled())
3299 return; 3299 return;
3300 3300
3301 if (PageTransHuge(page))
3302 nr_pages <<= compound_order(page);
3303
3301 pc = lookup_page_cgroup(page); 3304 pc = lookup_page_cgroup(page);
3302 lock_page_cgroup(pc); 3305 lock_page_cgroup(pc);
3303 if (PageCgroupUsed(pc)) { 3306 if (PageCgroupUsed(pc)) {
@@ -3359,7 +3362,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3359 * charged to the res_counter since we plan on replacing the 3362 * charged to the res_counter since we plan on replacing the
3360 * old one and only one page is going to be left afterwards. 3363 * old one and only one page is going to be left afterwards.
3361 */ 3364 */
3362 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); 3365 __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
3363} 3366}
3364 3367
3365/* remove redundant charge if migration failed*/ 3368/* remove redundant charge if migration failed*/
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 108c52fa60f6..c6e4dd3e1c08 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
402 struct anon_vma *av; 402 struct anon_vma *av;
403 pgoff_t pgoff; 403 pgoff_t pgoff;
404 404
405 av = page_lock_anon_vma(page); 405 av = page_lock_anon_vma_read(page);
406 if (av == NULL) /* Not actually mapped anymore */ 406 if (av == NULL) /* Not actually mapped anymore */
407 return; 407 return;
408 408
@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
423 } 423 }
424 } 424 }
425 read_unlock(&tasklist_lock); 425 read_unlock(&tasklist_lock);
426 page_unlock_anon_vma(av); 426 page_unlock_anon_vma_read(av);
427} 427}
428 428
429/* 429/*
@@ -1566,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags)
1566 page_is_file_cache(page)); 1566 page_is_file_cache(page));
1567 list_add(&page->lru, &pagelist); 1567 list_add(&page->lru, &pagelist);
1568 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1568 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1569 false, MIGRATE_SYNC); 1569 false, MIGRATE_SYNC,
1570 MR_MEMORY_FAILURE);
1570 if (ret) { 1571 if (ret) {
1571 putback_lru_pages(&pagelist); 1572 putback_lru_pages(&pagelist);
1572 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1573 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory.c b/mm/memory.c
index db2e9e797a05..e6a3b933517e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,7 @@
57#include <linux/swapops.h> 57#include <linux/swapops.h>
58#include <linux/elf.h> 58#include <linux/elf.h>
59#include <linux/gfp.h> 59#include <linux/gfp.h>
60#include <linux/migrate.h>
60 61
61#include <asm/io.h> 62#include <asm/io.h>
62#include <asm/pgalloc.h> 63#include <asm/pgalloc.h>
@@ -1503,6 +1504,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1503 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1504 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1504 goto out; 1505 goto out;
1505 } 1506 }
1507 if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
1508 goto no_page_table;
1506 if (pmd_trans_huge(*pmd)) { 1509 if (pmd_trans_huge(*pmd)) {
1507 if (flags & FOLL_SPLIT) { 1510 if (flags & FOLL_SPLIT) {
1508 split_huge_page_pmd(vma, address, pmd); 1511 split_huge_page_pmd(vma, address, pmd);
@@ -1532,6 +1535,8 @@ split_fallthrough:
1532 pte = *ptep; 1535 pte = *ptep;
1533 if (!pte_present(pte)) 1536 if (!pte_present(pte))
1534 goto no_page; 1537 goto no_page;
1538 if ((flags & FOLL_NUMA) && pte_numa(pte))
1539 goto no_page;
1535 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1540 if ((flags & FOLL_WRITE) && !pte_write(pte))
1536 goto unlock; 1541 goto unlock;
1537 1542
@@ -1683,6 +1688,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1683 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 1688 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1684 vm_flags &= (gup_flags & FOLL_FORCE) ? 1689 vm_flags &= (gup_flags & FOLL_FORCE) ?
1685 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 1690 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1691
1692 /*
1693 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
1694 * would be called on PROT_NONE ranges. We must never invoke
1695 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
1696 * page faults would unprotect the PROT_NONE ranges if
1697 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
1698 * bitflag. So to avoid that, don't set FOLL_NUMA if
1699 * FOLL_FORCE is set.
1700 */
1701 if (!(gup_flags & FOLL_FORCE))
1702 gup_flags |= FOLL_NUMA;
1703
1686 i = 0; 1704 i = 0;
1687 1705
1688 do { 1706 do {
@@ -3412,6 +3430,169 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3412 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 3430 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3413} 3431}
3414 3432
3433int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3434 unsigned long addr, int current_nid)
3435{
3436 get_page(page);
3437
3438 count_vm_numa_event(NUMA_HINT_FAULTS);
3439 if (current_nid == numa_node_id())
3440 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3441
3442 return mpol_misplaced(page, vma, addr);
3443}
3444
3445int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3446 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
3447{
3448 struct page *page = NULL;
3449 spinlock_t *ptl;
3450 int current_nid = -1;
3451 int target_nid;
3452 bool migrated = false;
3453
3454 /*
3455 * The "pte" at this point cannot be used safely without
3456 * validation through pte_unmap_same(). It's of NUMA type but
3457 * the pfn may be screwed if the read is non atomic.
3458 *
3459 * ptep_modify_prot_start is not called as this is clearing
3460 * the _PAGE_NUMA bit and it is not really expected that there
3461 * would be concurrent hardware modifications to the PTE.
3462 */
3463 ptl = pte_lockptr(mm, pmd);
3464 spin_lock(ptl);
3465 if (unlikely(!pte_same(*ptep, pte))) {
3466 pte_unmap_unlock(ptep, ptl);
3467 goto out;
3468 }
3469
3470 pte = pte_mknonnuma(pte);
3471 set_pte_at(mm, addr, ptep, pte);
3472 update_mmu_cache(vma, addr, ptep);
3473
3474 page = vm_normal_page(vma, addr, pte);
3475 if (!page) {
3476 pte_unmap_unlock(ptep, ptl);
3477 return 0;
3478 }
3479
3480 current_nid = page_to_nid(page);
3481 target_nid = numa_migrate_prep(page, vma, addr, current_nid);
3482 pte_unmap_unlock(ptep, ptl);
3483 if (target_nid == -1) {
3484 /*
3485 * Account for the fault against the current node if it not
3486 * being replaced regardless of where the page is located.
3487 */
3488 current_nid = numa_node_id();
3489 put_page(page);
3490 goto out;
3491 }
3492
3493 /* Migrate to the requested node */
3494 migrated = migrate_misplaced_page(page, target_nid);
3495 if (migrated)
3496 current_nid = target_nid;
3497
3498out:
3499 if (current_nid != -1)
3500 task_numa_fault(current_nid, 1, migrated);
3501 return 0;
3502}
3503
3504/* NUMA hinting page fault entry point for regular pmds */
3505#ifdef CONFIG_NUMA_BALANCING
3506static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3507 unsigned long addr, pmd_t *pmdp)
3508{
3509 pmd_t pmd;
3510 pte_t *pte, *orig_pte;
3511 unsigned long _addr = addr & PMD_MASK;
3512 unsigned long offset;
3513 spinlock_t *ptl;
3514 bool numa = false;
3515 int local_nid = numa_node_id();
3516
3517 spin_lock(&mm->page_table_lock);
3518 pmd = *pmdp;
3519 if (pmd_numa(pmd)) {
3520 set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
3521 numa = true;
3522 }
3523 spin_unlock(&mm->page_table_lock);
3524
3525 if (!numa)
3526 return 0;
3527
3528 /* we're in a page fault so some vma must be in the range */
3529 BUG_ON(!vma);
3530 BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
3531 offset = max(_addr, vma->vm_start) & ~PMD_MASK;
3532 VM_BUG_ON(offset >= PMD_SIZE);
3533 orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
3534 pte += offset >> PAGE_SHIFT;
3535 for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
3536 pte_t pteval = *pte;
3537 struct page *page;
3538 int curr_nid = local_nid;
3539 int target_nid;
3540 bool migrated;
3541 if (!pte_present(pteval))
3542 continue;
3543 if (!pte_numa(pteval))
3544 continue;
3545 if (addr >= vma->vm_end) {
3546 vma = find_vma(mm, addr);
3547 /* there's a pte present so there must be a vma */
3548 BUG_ON(!vma);
3549 BUG_ON(addr < vma->vm_start);
3550 }
3551 if (pte_numa(pteval)) {
3552 pteval = pte_mknonnuma(pteval);
3553 set_pte_at(mm, addr, pte, pteval);
3554 }
3555 page = vm_normal_page(vma, addr, pteval);
3556 if (unlikely(!page))
3557 continue;
3558 /* only check non-shared pages */
3559 if (unlikely(page_mapcount(page) != 1))
3560 continue;
3561
3562 /*
3563 * Note that the NUMA fault is later accounted to either
3564 * the node that is currently running or where the page is
3565 * migrated to.
3566 */
3567 curr_nid = local_nid;
3568 target_nid = numa_migrate_prep(page, vma, addr,
3569 page_to_nid(page));
3570 if (target_nid == -1) {
3571 put_page(page);
3572 continue;
3573 }
3574
3575 /* Migrate to the requested node */
3576 pte_unmap_unlock(pte, ptl);
3577 migrated = migrate_misplaced_page(page, target_nid);
3578 if (migrated)
3579 curr_nid = target_nid;
3580 task_numa_fault(curr_nid, 1, migrated);
3581
3582 pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
3583 }
3584 pte_unmap_unlock(orig_pte, ptl);
3585
3586 return 0;
3587}
3588#else
3589static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3590 unsigned long addr, pmd_t *pmdp)
3591{
3592 BUG();
3593}
3594#endif /* CONFIG_NUMA_BALANCING */
3595
3415/* 3596/*
3416 * These routines also need to handle stuff like marking pages dirty 3597 * These routines also need to handle stuff like marking pages dirty
3417 * and/or accessed for architectures that don't do it in hardware (most 3598 * and/or accessed for architectures that don't do it in hardware (most
@@ -3450,6 +3631,9 @@ int handle_pte_fault(struct mm_struct *mm,
3450 pte, pmd, flags, entry); 3631 pte, pmd, flags, entry);
3451 } 3632 }
3452 3633
3634 if (pte_numa(entry))
3635 return do_numa_page(mm, vma, address, entry, pte, pmd);
3636
3453 ptl = pte_lockptr(mm, pmd); 3637 ptl = pte_lockptr(mm, pmd);
3454 spin_lock(ptl); 3638 spin_lock(ptl);
3455 if (unlikely(!pte_same(*pte, entry))) 3639 if (unlikely(!pte_same(*pte, entry)))
@@ -3520,8 +3704,11 @@ retry:
3520 if (pmd_trans_huge(orig_pmd)) { 3704 if (pmd_trans_huge(orig_pmd)) {
3521 unsigned int dirty = flags & FAULT_FLAG_WRITE; 3705 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3522 3706
3523 if (dirty && !pmd_write(orig_pmd) && 3707 if (pmd_numa(orig_pmd))
3524 !pmd_trans_splitting(orig_pmd)) { 3708 return do_huge_pmd_numa_page(mm, vma, address,
3709 orig_pmd, pmd);
3710
3711 if (dirty && !pmd_write(orig_pmd)) {
3525 ret = do_huge_pmd_wp_page(mm, vma, address, pmd, 3712 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3526 orig_pmd); 3713 orig_pmd);
3527 /* 3714 /*
@@ -3536,16 +3723,21 @@ retry:
3536 huge_pmd_set_accessed(mm, vma, address, pmd, 3723 huge_pmd_set_accessed(mm, vma, address, pmd,
3537 orig_pmd, dirty); 3724 orig_pmd, dirty);
3538 } 3725 }
3726
3539 return 0; 3727 return 0;
3540 } 3728 }
3541 } 3729 }
3542 3730
3731 if (pmd_numa(*pmd))
3732 return do_pmd_numa_page(mm, vma, address, pmd);
3733
3543 /* 3734 /*
3544 * Use __pte_alloc instead of pte_alloc_map, because we can't 3735 * Use __pte_alloc instead of pte_alloc_map, because we can't
3545 * run pte_offset_map on the pmd, if an huge pmd could 3736 * run pte_offset_map on the pmd, if an huge pmd could
3546 * materialize from under us from a different thread. 3737 * materialize from under us from a different thread.
3547 */ 3738 */
3548 if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) 3739 if (unlikely(pmd_none(*pmd)) &&
3740 unlikely(__pte_alloc(mm, vma, pmd, address)))
3549 return VM_FAULT_OOM; 3741 return VM_FAULT_OOM;
3550 /* if an huge pmd materialized from under us just retry later */ 3742 /* if an huge pmd materialized from under us just retry later */
3551 if (unlikely(pmd_trans_huge(*pmd))) 3743 if (unlikely(pmd_trans_huge(*pmd)))
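
The memory.c hunks route pte_numa()/pmd_numa() entries to do_numa_page() and do_pmd_numa_page(), which clear the NUMA bit, ask mpol_misplaced() for a better node, optionally migrate the page, and account the fault through task_numa_fault(); FOLL_NUMA keeps get_user_pages() from short-circuiting those faults unless FOLL_FORCE is set. One hedged way to observe the machinery from userspace is to watch the hinting-fault counters around a memory-touching loop; the /proc/vmstat names below (numa_pte_updates, numa_hint_faults, numa_hint_faults_local, numa_pages_migrated) are assumed spellings, not taken from this hunk.

/* Sketch only: print the deltas of the assumed NUMA-balancing counters. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

static long read_counter(const char *name)
{
	char line[256];
	long val = -1;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		size_t n = strlen(name);
		if (!strncmp(line, name, n) && line[n] == ' ') {
			val = atol(line + n + 1);
			break;
		}
	}
	fclose(f);
	return val;
}

int main(void)
{
	const char *names[] = { "numa_pte_updates", "numa_hint_faults",
				"numa_hint_faults_local", "numa_pages_migrated" };
	long before[4], after[4];
	size_t len = 256UL << 20;	/* 256MB of anonymous memory */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	for (int i = 0; i < 4; i++)
		before[i] = read_counter(names[i]);

	for (int pass = 0; pass < 50; pass++)	/* give the scanner time to mark ptes */
		memset(p, pass, len);

	for (int i = 0; i < 4; i++) {
		after[i] = read_counter(names[i]);
		printf("%-24s %ld\n", names[i], after[i] - before[i]);
	}
	munmap(p, len);
	return 0;
}
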
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 518baa896e83..962e353aa86f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1055,7 +1055,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1055 * migrate_pages returns # of failed pages. 1055 * migrate_pages returns # of failed pages.
1056 */ 1056 */
1057 ret = migrate_pages(&source, alloc_migrate_target, 0, 1057 ret = migrate_pages(&source, alloc_migrate_target, 0,
1058 true, MIGRATE_SYNC); 1058 true, MIGRATE_SYNC,
1059 MR_MEMORY_HOTPLUG);
1059 if (ret) 1060 if (ret)
1060 putback_lru_pages(&source); 1061 putback_lru_pages(&source);
1061 } 1062 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index aaf54566cb6b..d1b315e98627 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -90,6 +90,7 @@
90#include <linux/syscalls.h> 90#include <linux/syscalls.h>
91#include <linux/ctype.h> 91#include <linux/ctype.h>
92#include <linux/mm_inline.h> 92#include <linux/mm_inline.h>
93#include <linux/mmu_notifier.h>
93 94
94#include <asm/tlbflush.h> 95#include <asm/tlbflush.h>
95#include <asm/uaccess.h> 96#include <asm/uaccess.h>
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = {
117 .flags = MPOL_F_LOCAL, 118 .flags = MPOL_F_LOCAL,
118}; 119};
119 120
121static struct mempolicy preferred_node_policy[MAX_NUMNODES];
122
123static struct mempolicy *get_task_policy(struct task_struct *p)
124{
125 struct mempolicy *pol = p->mempolicy;
126 int node;
127
128 if (!pol) {
129 node = numa_node_id();
130 if (node != -1)
131 pol = &preferred_node_policy[node];
132
133 /* preferred_node_policy is not initialised early in boot */
134 if (!pol->mode)
135 pol = NULL;
136 }
137
138 return pol;
139}
140
120static const struct mempolicy_operations { 141static const struct mempolicy_operations {
121 int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 142 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
122 /* 143 /*
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
254 if (mode == MPOL_DEFAULT) { 275 if (mode == MPOL_DEFAULT) {
255 if (nodes && !nodes_empty(*nodes)) 276 if (nodes && !nodes_empty(*nodes))
256 return ERR_PTR(-EINVAL); 277 return ERR_PTR(-EINVAL);
257 return NULL; /* simply delete any existing policy */ 278 return NULL;
258 } 279 }
259 VM_BUG_ON(!nodes); 280 VM_BUG_ON(!nodes);
260 281
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
269 (flags & MPOL_F_RELATIVE_NODES))) 290 (flags & MPOL_F_RELATIVE_NODES)))
270 return ERR_PTR(-EINVAL); 291 return ERR_PTR(-EINVAL);
271 } 292 }
293 } else if (mode == MPOL_LOCAL) {
294 if (!nodes_empty(*nodes))
295 return ERR_PTR(-EINVAL);
296 mode = MPOL_PREFERRED;
272 } else if (nodes_empty(*nodes)) 297 } else if (nodes_empty(*nodes))
273 return ERR_PTR(-EINVAL); 298 return ERR_PTR(-EINVAL);
274 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 299 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
561 return 0; 586 return 0;
562} 587}
563 588
589#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
590/*
591 * This is used to mark a range of virtual addresses to be inaccessible.
592 * These are later cleared by a NUMA hinting fault. Depending on these
593 * faults, pages may be migrated for better NUMA placement.
594 *
595 * This is assuming that NUMA faults are handled using PROT_NONE. If
596 * an architecture makes a different choice, it will need further
597 * changes to the core.
598 */
599unsigned long change_prot_numa(struct vm_area_struct *vma,
600 unsigned long addr, unsigned long end)
601{
602 int nr_updated;
603 BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
604
605 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
606 if (nr_updated)
607 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
608
609 return nr_updated;
610}
611#else
612static unsigned long change_prot_numa(struct vm_area_struct *vma,
613 unsigned long addr, unsigned long end)
614{
615 return 0;
616}
617#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
618
564/* 619/*
565 * Check if all pages in a range are on a set of nodes. 620 * Check if all pages in a range are on a set of nodes.
566 * If pagelist != NULL then isolate pages from the LRU and 621 * If pagelist != NULL then isolate pages from the LRU and
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
579 return ERR_PTR(-EFAULT); 634 return ERR_PTR(-EFAULT);
580 prev = NULL; 635 prev = NULL;
581 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 636 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
637 unsigned long endvma = vma->vm_end;
638
639 if (endvma > end)
640 endvma = end;
641 if (vma->vm_start > start)
642 start = vma->vm_start;
643
582 if (!(flags & MPOL_MF_DISCONTIG_OK)) { 644 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
583 if (!vma->vm_next && vma->vm_end < end) 645 if (!vma->vm_next && vma->vm_end < end)
584 return ERR_PTR(-EFAULT); 646 return ERR_PTR(-EFAULT);
585 if (prev && prev->vm_end < vma->vm_start) 647 if (prev && prev->vm_end < vma->vm_start)
586 return ERR_PTR(-EFAULT); 648 return ERR_PTR(-EFAULT);
587 } 649 }
588 if (!is_vm_hugetlb_page(vma) && 650
589 ((flags & MPOL_MF_STRICT) || 651 if (is_vm_hugetlb_page(vma))
652 goto next;
653
654 if (flags & MPOL_MF_LAZY) {
655 change_prot_numa(vma, start, endvma);
656 goto next;
657 }
658
659 if ((flags & MPOL_MF_STRICT) ||
590 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && 660 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
591 vma_migratable(vma)))) { 661 vma_migratable(vma))) {
592 unsigned long endvma = vma->vm_end;
593 662
594 if (endvma > end)
595 endvma = end;
596 if (vma->vm_start > start)
597 start = vma->vm_start;
598 err = check_pgd_range(vma, start, endvma, nodes, 663 err = check_pgd_range(vma, start, endvma, nodes,
599 flags, private); 664 flags, private);
600 if (err) { 665 if (err) {
@@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
602 break; 667 break;
603 } 668 }
604 } 669 }
670next:
605 prev = vma; 671 prev = vma;
606 } 672 }
607 return first; 673 return first;
@@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
961 1027
962 if (!list_empty(&pagelist)) { 1028 if (!list_empty(&pagelist)) {
963 err = migrate_pages(&pagelist, new_node_page, dest, 1029 err = migrate_pages(&pagelist, new_node_page, dest,
964 false, MIGRATE_SYNC); 1030 false, MIGRATE_SYNC,
1031 MR_SYSCALL);
965 if (err) 1032 if (err)
966 putback_lru_pages(&pagelist); 1033 putback_lru_pages(&pagelist);
967 } 1034 }
@@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1133 int err; 1200 int err;
1134 LIST_HEAD(pagelist); 1201 LIST_HEAD(pagelist);
1135 1202
1136 if (flags & ~(unsigned long)(MPOL_MF_STRICT | 1203 if (flags & ~(unsigned long)MPOL_MF_VALID)
1137 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1138 return -EINVAL; 1204 return -EINVAL;
1139 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 1205 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1140 return -EPERM; 1206 return -EPERM;
@@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len,
1157 if (IS_ERR(new)) 1223 if (IS_ERR(new))
1158 return PTR_ERR(new); 1224 return PTR_ERR(new);
1159 1225
1226 if (flags & MPOL_MF_LAZY)
1227 new->flags |= MPOL_F_MOF;
1228
1160 /* 1229 /*
1161 * If we are using the default policy then operation 1230 * If we are using the default policy then operation
1162 * on discontinuous address spaces is okay after all 1231 * on discontinuous address spaces is okay after all
@@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len,
1193 vma = check_range(mm, start, end, nmask, 1262 vma = check_range(mm, start, end, nmask,
1194 flags | MPOL_MF_INVERT, &pagelist); 1263 flags | MPOL_MF_INVERT, &pagelist);
1195 1264
1196 err = PTR_ERR(vma); 1265 err = PTR_ERR(vma); /* maybe ... */
1197 if (!IS_ERR(vma)) { 1266 if (!IS_ERR(vma))
1198 int nr_failed = 0;
1199
1200 err = mbind_range(mm, start, end, new); 1267 err = mbind_range(mm, start, end, new);
1201 1268
1269 if (!err) {
1270 int nr_failed = 0;
1271
1202 if (!list_empty(&pagelist)) { 1272 if (!list_empty(&pagelist)) {
1273 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1203 nr_failed = migrate_pages(&pagelist, new_vma_page, 1274 nr_failed = migrate_pages(&pagelist, new_vma_page,
1204 (unsigned long)vma, 1275 (unsigned long)vma,
1205 false, MIGRATE_SYNC); 1276 false, MIGRATE_SYNC,
1277 MR_MEMPOLICY_MBIND);
1206 if (nr_failed) 1278 if (nr_failed)
1207 putback_lru_pages(&pagelist); 1279 putback_lru_pages(&pagelist);
1208 } 1280 }
1209 1281
1210 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 1282 if (nr_failed && (flags & MPOL_MF_STRICT))
1211 err = -EIO; 1283 err = -EIO;
1212 } else 1284 } else
1213 putback_lru_pages(&pagelist); 1285 putback_lru_pages(&pagelist);
@@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1546struct mempolicy *get_vma_policy(struct task_struct *task, 1618struct mempolicy *get_vma_policy(struct task_struct *task,
1547 struct vm_area_struct *vma, unsigned long addr) 1619 struct vm_area_struct *vma, unsigned long addr)
1548{ 1620{
1549 struct mempolicy *pol = task->mempolicy; 1621 struct mempolicy *pol = get_task_policy(task);
1550 1622
1551 if (vma) { 1623 if (vma) {
1552 if (vma->vm_ops && vma->vm_ops->get_policy) { 1624 if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -1956,7 +2028,7 @@ retry_cpuset:
1956 */ 2028 */
1957struct page *alloc_pages_current(gfp_t gfp, unsigned order) 2029struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1958{ 2030{
1959 struct mempolicy *pol = current->mempolicy; 2031 struct mempolicy *pol = get_task_policy(current);
1960 struct page *page; 2032 struct page *page;
1961 unsigned int cpuset_mems_cookie; 2033 unsigned int cpuset_mems_cookie;
1962 2034
@@ -2140,6 +2212,115 @@ static void sp_free(struct sp_node *n)
2140 kmem_cache_free(sn_cache, n); 2212 kmem_cache_free(sn_cache, n);
2141} 2213}
2142 2214
2215/**
2216 * mpol_misplaced - check whether current page node is valid in policy
2217 *
2218 * @page - page to be checked
2219 * @vma - vm area where page mapped
2220 * @addr - virtual address where page mapped
2221 *
2222 * Lookup current policy node id for vma,addr and "compare to" page's
2223 * node id.
2224 *
2225 * Returns:
2226 * -1 - not misplaced, page is in the right node
2227 * node - node id where the page should be
2228 *
2229 * Policy determination "mimics" alloc_page_vma().
2230 * Called from fault path where we know the vma and faulting address.
2231 */
2232int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2233{
2234 struct mempolicy *pol;
2235 struct zone *zone;
2236 int curnid = page_to_nid(page);
2237 unsigned long pgoff;
2238 int polnid = -1;
2239 int ret = -1;
2240
2241 BUG_ON(!vma);
2242
2243 pol = get_vma_policy(current, vma, addr);
2244 if (!(pol->flags & MPOL_F_MOF))
2245 goto out;
2246
2247 switch (pol->mode) {
2248 case MPOL_INTERLEAVE:
2249 BUG_ON(addr >= vma->vm_end);
2250 BUG_ON(addr < vma->vm_start);
2251
2252 pgoff = vma->vm_pgoff;
2253 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2254 polnid = offset_il_node(pol, vma, pgoff);
2255 break;
2256
2257 case MPOL_PREFERRED:
2258 if (pol->flags & MPOL_F_LOCAL)
2259 polnid = numa_node_id();
2260 else
2261 polnid = pol->v.preferred_node;
2262 break;
2263
2264 case MPOL_BIND:
2265 /*
2266 * allows binding to multiple nodes.
2267 * use current page if in policy nodemask,
2268 * else select nearest allowed node, if any.
2269 * If no allowed nodes, use current [!misplaced].
2270 */
2271 if (node_isset(curnid, pol->v.nodes))
2272 goto out;
2273 (void)first_zones_zonelist(
2274 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2275 gfp_zone(GFP_HIGHUSER),
2276 &pol->v.nodes, &zone);
2277 polnid = zone->node;
2278 break;
2279
2280 default:
2281 BUG();
2282 }
2283
2284 /* Migrate the page towards the node whose CPU is referencing it */
2285 if (pol->flags & MPOL_F_MORON) {
2286 int last_nid;
2287
2288 polnid = numa_node_id();
2289
2290 /*
2291 * Multi-stage node selection is used in conjunction
2292 * with a periodic migration fault to build a temporal
2293 * task<->page relation. By using a two-stage filter we
2294 * remove short/unlikely relations.
2295 *
2296 * Using P(p) ~ n_p / n_t as per frequentist
2297 * probability, we can equate a task's usage of a
2298 * particular page (n_p) per total usage of this
2299 * page (n_t) (in a given time-span) to a probability.
2300 *
2301 * Our periodic faults will sample this probability and
2302 * getting the same result twice in a row, given these
2303 * samples are fully independent, is then given by
2304 * P(n)^2, provided our sample period is sufficiently
2305 * short compared to the usage pattern.
2306 *
2307 * This quadric squishes small probabilities, making
2308 * it less likely we act on an unlikely task<->page
2309 * relation.
2310 */
2311 last_nid = page_xchg_last_nid(page, polnid);
2312 if (last_nid != polnid)
2313 goto out;
2314 }
2315
2316 if (curnid != polnid)
2317 ret = polnid;
2318out:
2319 mpol_cond_put(pol);
2320
2321 return ret;
2322}
2323
2143static void sp_delete(struct shared_policy *sp, struct sp_node *n) 2324static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2144{ 2325{
2145 pr_debug("deleting %lx-l%lx\n", n->start, n->end); 2326 pr_debug("deleting %lx-l%lx\n", n->start, n->end);
@@ -2305,6 +2486,50 @@ void mpol_free_shared_policy(struct shared_policy *p)
2305 mutex_unlock(&p->mutex); 2486 mutex_unlock(&p->mutex);
2306} 2487}
2307 2488
2489#ifdef CONFIG_NUMA_BALANCING
2490static bool __initdata numabalancing_override;
2491
2492static void __init check_numabalancing_enable(void)
2493{
2494 bool numabalancing_default = false;
2495
2496 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2497 numabalancing_default = true;
2498
2499 if (nr_node_ids > 1 && !numabalancing_override) {
2500 printk(KERN_INFO "Enabling automatic NUMA balancing. "
2501 "Configure with numa_balancing= or sysctl");
2502 set_numabalancing_state(numabalancing_default);
2503 }
2504}
2505
2506static int __init setup_numabalancing(char *str)
2507{
2508 int ret = 0;
2509 if (!str)
2510 goto out;
2511 numabalancing_override = true;
2512
2513 if (!strcmp(str, "enable")) {
2514 set_numabalancing_state(true);
2515 ret = 1;
2516 } else if (!strcmp(str, "disable")) {
2517 set_numabalancing_state(false);
2518 ret = 1;
2519 }
2520out:
2521 if (!ret)
2522 printk(KERN_WARNING "Unable to parse numa_balancing=\n");
2523
2524 return ret;
2525}
2526__setup("numa_balancing=", setup_numabalancing);
2527#else
2528static inline void __init check_numabalancing_enable(void)
2529{
2530}
2531#endif /* CONFIG_NUMA_BALANCING */
2532
2308/* assumes fs == KERNEL_DS */ 2533/* assumes fs == KERNEL_DS */
2309void __init numa_policy_init(void) 2534void __init numa_policy_init(void)
2310{ 2535{
@@ -2320,6 +2545,15 @@ void __init numa_policy_init(void)
2320 sizeof(struct sp_node), 2545 sizeof(struct sp_node),
2321 0, SLAB_PANIC, NULL); 2546 0, SLAB_PANIC, NULL);
2322 2547
2548 for_each_node(nid) {
2549 preferred_node_policy[nid] = (struct mempolicy) {
2550 .refcnt = ATOMIC_INIT(1),
2551 .mode = MPOL_PREFERRED,
2552 .flags = MPOL_F_MOF | MPOL_F_MORON,
2553 .v = { .preferred_node = nid, },
2554 };
2555 }
2556
2323 /* 2557 /*
2324 * Set interleaving policy for system init. Interleaving is only 2558 * Set interleaving policy for system init. Interleaving is only
2325 * enabled across suitably sized nodes (default is >= 16MB), or 2559 * enabled across suitably sized nodes (default is >= 16MB), or
@@ -2346,6 +2580,8 @@ void __init numa_policy_init(void)
2346 2580
2347 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) 2581 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2348 printk("numa_policy_init: interleaving failed\n"); 2582 printk("numa_policy_init: interleaving failed\n");
2583
2584 check_numabalancing_enable();
2349} 2585}
2350 2586
2351/* Reset policy of current process to default */ 2587/* Reset policy of current process to default */
@@ -2362,14 +2598,13 @@ void numa_default_policy(void)
2362 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag 2598 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
2363 * Used only for mpol_parse_str() and mpol_to_str() 2599 * Used only for mpol_parse_str() and mpol_to_str()
2364 */ 2600 */
2365#define MPOL_LOCAL MPOL_MAX
2366static const char * const policy_modes[] = 2601static const char * const policy_modes[] =
2367{ 2602{
2368 [MPOL_DEFAULT] = "default", 2603 [MPOL_DEFAULT] = "default",
2369 [MPOL_PREFERRED] = "prefer", 2604 [MPOL_PREFERRED] = "prefer",
2370 [MPOL_BIND] = "bind", 2605 [MPOL_BIND] = "bind",
2371 [MPOL_INTERLEAVE] = "interleave", 2606 [MPOL_INTERLEAVE] = "interleave",
2372 [MPOL_LOCAL] = "local" 2607 [MPOL_LOCAL] = "local",
2373}; 2608};
2374 2609
2375 2610
@@ -2415,12 +2650,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2415 if (flags) 2650 if (flags)
2416 *flags++ = '\0'; /* terminate mode string */ 2651 *flags++ = '\0'; /* terminate mode string */
2417 2652
2418 for (mode = 0; mode <= MPOL_LOCAL; mode++) { 2653 for (mode = 0; mode < MPOL_MAX; mode++) {
2419 if (!strcmp(str, policy_modes[mode])) { 2654 if (!strcmp(str, policy_modes[mode])) {
2420 break; 2655 break;
2421 } 2656 }
2422 } 2657 }
2423 if (mode > MPOL_LOCAL) 2658 if (mode >= MPOL_MAX)
2424 goto out; 2659 goto out;
2425 2660
2426 switch (mode) { 2661 switch (mode) {
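
The mempolicy hunks install a per-node MPOL_PREFERRED | MPOL_F_MOF | MPOL_F_MORON default policy, add mpol_misplaced() for the fault path, teach do_mbind()/check_range() about MPOL_MF_LAZY via change_prot_numa(), and add a numa_balancing=enable|disable boot parameter parsed by setup_numabalancing() (the printk also mentions a sysctl). A small sketch for flipping the feature at runtime follows; the /proc/sys/kernel/numa_balancing path is an assumption about where that sysctl is exposed, not something defined in this hunk.

/* Sketch only: read or toggle automatic NUMA balancing at runtime. */
#include <stdio.h>

static const char *path = "/proc/sys/kernel/numa_balancing";	/* assumed location */

int main(int argc, char **argv)
{
	FILE *f;
	int val;

	if (argc == 2) {			/* e.g. "./numab 0" or "./numab 1" */
		f = fopen(path, "w");
		if (!f || fputs(argv[1], f) == EOF) {
			perror(path);
			return 1;
		}
		fclose(f);
	}

	f = fopen(path, "r");			/* report the current state */
	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%d", &val) == 1)
		printf("numa_balancing = %d\n", val);
	fclose(f);
	return 0;
}
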
diff --git a/mm/migrate.c b/mm/migrate.c
index cae02711181d..32efd8028bc9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -39,6 +39,9 @@
39 39
40#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
41 41
42#define CREATE_TRACE_POINTS
43#include <trace/events/migrate.h>
44
42#include "internal.h" 45#include "internal.h"
43 46
44/* 47/*
@@ -293,7 +296,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
293 struct page *newpage, struct page *page, 296 struct page *newpage, struct page *page,
294 struct buffer_head *head, enum migrate_mode mode) 297 struct buffer_head *head, enum migrate_mode mode)
295{ 298{
296 int expected_count; 299 int expected_count = 0;
297 void **pslot; 300 void **pslot;
298 301
299 if (!mapping) { 302 if (!mapping) {
@@ -421,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
421 */ 424 */
422void migrate_page_copy(struct page *newpage, struct page *page) 425void migrate_page_copy(struct page *newpage, struct page *page)
423{ 426{
424 if (PageHuge(page)) 427 if (PageHuge(page) || PageTransHuge(page))
425 copy_huge_page(newpage, page); 428 copy_huge_page(newpage, page);
426 else 429 else
427 copy_highpage(newpage, page); 430 copy_highpage(newpage, page);
@@ -765,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
765 */ 768 */
766 if (PageAnon(page)) { 769 if (PageAnon(page)) {
767 /* 770 /*
768 * Only page_lock_anon_vma() understands the subtleties of 771 * Only page_lock_anon_vma_read() understands the subtleties of
769 * getting a hold on an anon_vma from outside one of its mms. 772 * getting a hold on an anon_vma from outside one of its mms.
770 */ 773 */
771 anon_vma = page_get_anon_vma(page); 774 anon_vma = page_get_anon_vma(page);
@@ -998,10 +1001,11 @@ out:
998 */ 1001 */
999int migrate_pages(struct list_head *from, 1002int migrate_pages(struct list_head *from,
1000 new_page_t get_new_page, unsigned long private, bool offlining, 1003 new_page_t get_new_page, unsigned long private, bool offlining,
1001 enum migrate_mode mode) 1004 enum migrate_mode mode, int reason)
1002{ 1005{
1003 int retry = 1; 1006 int retry = 1;
1004 int nr_failed = 0; 1007 int nr_failed = 0;
1008 int nr_succeeded = 0;
1005 int pass = 0; 1009 int pass = 0;
1006 struct page *page; 1010 struct page *page;
1007 struct page *page2; 1011 struct page *page2;
@@ -1028,6 +1032,7 @@ int migrate_pages(struct list_head *from,
1028 retry++; 1032 retry++;
1029 break; 1033 break;
1030 case MIGRATEPAGE_SUCCESS: 1034 case MIGRATEPAGE_SUCCESS:
1035 nr_succeeded++;
1031 break; 1036 break;
1032 default: 1037 default:
1033 /* Permanent failure */ 1038 /* Permanent failure */
@@ -1038,6 +1043,12 @@ int migrate_pages(struct list_head *from,
1038 } 1043 }
1039 rc = nr_failed + retry; 1044 rc = nr_failed + retry;
1040out: 1045out:
1046 if (nr_succeeded)
1047 count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1048 if (nr_failed)
1049 count_vm_events(PGMIGRATE_FAIL, nr_failed);
1050 trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
1051
1041 if (!swapwrite) 1052 if (!swapwrite)
1042 current->flags &= ~PF_SWAPWRITE; 1053 current->flags &= ~PF_SWAPWRITE;
1043 1054
@@ -1176,7 +1187,8 @@ set_status:
1176 err = 0; 1187 err = 0;
1177 if (!list_empty(&pagelist)) { 1188 if (!list_empty(&pagelist)) {
1178 err = migrate_pages(&pagelist, new_page_node, 1189 err = migrate_pages(&pagelist, new_page_node,
1179 (unsigned long)pm, 0, MIGRATE_SYNC); 1190 (unsigned long)pm, 0, MIGRATE_SYNC,
1191 MR_SYSCALL);
1180 if (err) 1192 if (err)
1181 putback_lru_pages(&pagelist); 1193 putback_lru_pages(&pagelist);
1182 } 1194 }
@@ -1440,4 +1452,317 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1440 } 1452 }
1441 return err; 1453 return err;
1442} 1454}
1443#endif 1455
1456#ifdef CONFIG_NUMA_BALANCING
1457/*
1458 * Returns true if this is a safe migration target node for misplaced NUMA
 1459 * pages. Currently it only checks the watermarks, which is crude.
1460 */
1461static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1462 int nr_migrate_pages)
1463{
1464 int z;
1465 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
1466 struct zone *zone = pgdat->node_zones + z;
1467
1468 if (!populated_zone(zone))
1469 continue;
1470
1471 if (zone->all_unreclaimable)
1472 continue;
1473
1474 /* Avoid waking kswapd by allocating pages_to_migrate pages. */
1475 if (!zone_watermark_ok(zone, 0,
1476 high_wmark_pages(zone) +
1477 nr_migrate_pages,
1478 0, 0))
1479 continue;
1480 return true;
1481 }
1482 return false;
1483}
1484
1485static struct page *alloc_misplaced_dst_page(struct page *page,
1486 unsigned long data,
1487 int **result)
1488{
1489 int nid = (int) data;
1490 struct page *newpage;
1491
1492 newpage = alloc_pages_exact_node(nid,
1493 (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
1494 __GFP_NOMEMALLOC | __GFP_NORETRY |
1495 __GFP_NOWARN) &
1496 ~GFP_IOFS, 0);
1497 if (newpage)
1498 page_xchg_last_nid(newpage, page_last_nid(page));
1499
1500 return newpage;
1501}
1502
1503/*
1504 * page migration rate limiting control.
1505 * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
1506 * window of time. Default here says do not migrate more than 1280M per second.
1507 * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
1508 * as it is faults that reset the window, pte updates will happen unconditionally
1509 * if there has not been a fault since @pteupdate_interval_millisecs after the
1510 * throttle window closed.
1511 */
1512static unsigned int migrate_interval_millisecs __read_mostly = 100;
1513static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
1514static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
1515
1516/* Returns true if NUMA migration is currently rate limited */
1517bool migrate_ratelimited(int node)
1518{
1519 pg_data_t *pgdat = NODE_DATA(node);
1520
1521 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
1522 msecs_to_jiffies(pteupdate_interval_millisecs)))
1523 return false;
1524
1525 if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
1526 return false;
1527
1528 return true;
1529}
1530
1531/* Returns true if the node is migrate rate-limited after the update */
1532bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
1533{
1534 bool rate_limited = false;
1535
1536 /*
1537 * Rate-limit the amount of data that is being migrated to a node.
1538 * Optimal placement is no good if the memory bus is saturated and
1539 * all the time is being spent migrating!
1540 */
1541 spin_lock(&pgdat->numabalancing_migrate_lock);
1542 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
1543 pgdat->numabalancing_migrate_nr_pages = 0;
1544 pgdat->numabalancing_migrate_next_window = jiffies +
1545 msecs_to_jiffies(migrate_interval_millisecs);
1546 }
1547 if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
1548 rate_limited = true;
1549 else
1550 pgdat->numabalancing_migrate_nr_pages += nr_pages;
1551 spin_unlock(&pgdat->numabalancing_migrate_lock);
1552
1553 return rate_limited;
1554}
1555
1556int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1557{
1558 int ret = 0;
1559
1560 /* Avoid migrating to a node that is nearly full */
1561 if (migrate_balanced_pgdat(pgdat, 1)) {
1562 int page_lru;
1563
1564 if (isolate_lru_page(page)) {
1565 put_page(page);
1566 return 0;
1567 }
1568
1569 /* Page is isolated */
1570 ret = 1;
1571 page_lru = page_is_file_cache(page);
1572 if (!PageTransHuge(page))
1573 inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
1574 else
1575 mod_zone_page_state(page_zone(page),
1576 NR_ISOLATED_ANON + page_lru,
1577 HPAGE_PMD_NR);
1578 }
1579
1580 /*
1581 * Page is either isolated or there is not enough space on the target
1582 * node. If isolated, then it has taken a reference count and the
1583 * callers reference can be safely dropped without the page
1584 * disappearing underneath us during migration. Otherwise the page is
1585 * not to be migrated but the callers reference should still be
1586 * dropped so it does not leak.
1587 */
1588 put_page(page);
1589
1590 return ret;
1591}
1592
1593/*
1594 * Attempt to migrate a misplaced page to the specified destination
1595 * node. Caller is expected to have an elevated reference count on
1596 * the page that will be dropped by this function before returning.
1597 */
1598int migrate_misplaced_page(struct page *page, int node)
1599{
1600 pg_data_t *pgdat = NODE_DATA(node);
1601 int isolated = 0;
1602 int nr_remaining;
1603 LIST_HEAD(migratepages);
1604
1605 /*
1606 * Don't migrate pages that are mapped in multiple processes.
1607 * TODO: Handle false sharing detection instead of this hammer
1608 */
1609 if (page_mapcount(page) != 1) {
1610 put_page(page);
1611 goto out;
1612 }
1613
1614 /*
1615 * Rate-limit the amount of data that is being migrated to a node.
1616 * Optimal placement is no good if the memory bus is saturated and
1617 * all the time is being spent migrating!
1618 */
1619 if (numamigrate_update_ratelimit(pgdat, 1)) {
1620 put_page(page);
1621 goto out;
1622 }
1623
1624 isolated = numamigrate_isolate_page(pgdat, page);
1625 if (!isolated)
1626 goto out;
1627
1628 list_add(&page->lru, &migratepages);
1629 nr_remaining = migrate_pages(&migratepages,
1630 alloc_misplaced_dst_page,
1631 node, false, MIGRATE_ASYNC,
1632 MR_NUMA_MISPLACED);
1633 if (nr_remaining) {
1634 putback_lru_pages(&migratepages);
1635 isolated = 0;
1636 } else
1637 count_vm_numa_event(NUMA_PAGE_MIGRATE);
1638 BUG_ON(!list_empty(&migratepages));
1639out:
1640 return isolated;
1641}
1642#endif /* CONFIG_NUMA_BALANCING */
1643
1644#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1645int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1646 struct vm_area_struct *vma,
1647 pmd_t *pmd, pmd_t entry,
1648 unsigned long address,
1649 struct page *page, int node)
1650{
1651 unsigned long haddr = address & HPAGE_PMD_MASK;
1652 pg_data_t *pgdat = NODE_DATA(node);
1653 int isolated = 0;
1654 struct page *new_page = NULL;
1655 struct mem_cgroup *memcg = NULL;
1656 int page_lru = page_is_file_cache(page);
1657
1658 /*
1659 * Don't migrate pages that are mapped in multiple processes.
1660 * TODO: Handle false sharing detection instead of this hammer
1661 */
1662 if (page_mapcount(page) != 1)
1663 goto out_dropref;
1664
1665 /*
1666 * Rate-limit the amount of data that is being migrated to a node.
1667 * Optimal placement is no good if the memory bus is saturated and
1668 * all the time is being spent migrating!
1669 */
1670 if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
1671 goto out_dropref;
1672
1673 new_page = alloc_pages_node(node,
1674 (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
1675 if (!new_page) {
1676 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1677 goto out_dropref;
1678 }
1679 page_xchg_last_nid(new_page, page_last_nid(page));
1680
1681 isolated = numamigrate_isolate_page(pgdat, page);
1682 if (!isolated) {
1683 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1684 put_page(new_page);
1685 goto out_keep_locked;
1686 }
1687
1688 /* Prepare a page as a migration target */
1689 __set_page_locked(new_page);
1690 SetPageSwapBacked(new_page);
1691
1692 /* anon mapping, we can simply copy page->mapping to the new page: */
1693 new_page->mapping = page->mapping;
1694 new_page->index = page->index;
1695 migrate_page_copy(new_page, page);
1696 WARN_ON(PageLRU(new_page));
1697
1698 /* Recheck the target PMD */
1699 spin_lock(&mm->page_table_lock);
1700 if (unlikely(!pmd_same(*pmd, entry))) {
1701 spin_unlock(&mm->page_table_lock);
1702
1703 /* Reverse changes made by migrate_page_copy() */
1704 if (TestClearPageActive(new_page))
1705 SetPageActive(page);
1706 if (TestClearPageUnevictable(new_page))
1707 SetPageUnevictable(page);
1708 mlock_migrate_page(page, new_page);
1709
1710 unlock_page(new_page);
1711 put_page(new_page); /* Free it */
1712
1713 unlock_page(page);
1714 putback_lru_page(page);
1715
1716 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1717 goto out;
1718 }
1719
1720 /*
1721 * Traditional migration needs to prepare the memcg charge
1722 * transaction early to prevent the old page from being
1723 * uncharged when installing migration entries. Here we can
1724 * save the potential rollback and start the charge transfer
1725 * only when migration is already known to end successfully.
1726 */
1727 mem_cgroup_prepare_migration(page, new_page, &memcg);
1728
1729 entry = mk_pmd(new_page, vma->vm_page_prot);
1730 entry = pmd_mknonnuma(entry);
1731 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1732 entry = pmd_mkhuge(entry);
1733
1734 page_add_new_anon_rmap(new_page, vma, haddr);
1735
1736 set_pmd_at(mm, haddr, pmd, entry);
1737 update_mmu_cache_pmd(vma, address, entry);
1738 page_remove_rmap(page);
1739 /*
1740 * Finish the charge transaction under the page table lock to
1741 * prevent split_huge_page() from dividing up the charge
1742 * before it's fully transferred to the new page.
1743 */
1744 mem_cgroup_end_migration(memcg, page, new_page, true);
1745 spin_unlock(&mm->page_table_lock);
1746
1747 unlock_page(new_page);
1748 unlock_page(page);
1749 put_page(page); /* Drop the rmap reference */
1750 put_page(page); /* Drop the LRU isolation reference */
1751
1752 count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
1753 count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
1754
1755out:
1756 mod_zone_page_state(page_zone(page),
1757 NR_ISOLATED_ANON + page_lru,
1758 -HPAGE_PMD_NR);
1759 return isolated;
1760
1761out_dropref:
1762 put_page(page);
1763out_keep_locked:
1764 return 0;
1765}
1766#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE */
1767
1768#endif /* CONFIG_NUMA */
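
migrate_misplaced_transhuge_page() does the expensive work, allocating and copying the huge page, without holding the page table lock, then re-validates pmd_same(*pmd, entry) under mm->page_table_lock and rolls everything back if the PMD changed underneath it. The same optimistic prepare / validate / commit-or-rollback shape, reduced to a user-space sketch with a pthread mutex and a generation counter (illustrative names only):

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

struct slot {
	pthread_mutex_t lock;      /* analogue of mm->page_table_lock */
	unsigned long generation;  /* analogue of the pmd value sampled earlier */
	char *data;
};

/*
 * Replace slot->data with an updated copy, but only if nobody changed the
 * slot since we sampled 'seen_generation' outside the lock.
 */
static bool replace_if_unchanged(struct slot *s, unsigned long seen_generation,
				 const char *new_contents)
{
	/* Expensive preparation done without holding the lock. */
	char *copy = strdup(new_contents);

	if (!copy)
		return false;

	pthread_mutex_lock(&s->lock);
	if (s->generation != seen_generation) {
		/* Somebody raced with us: roll back and report failure. */
		pthread_mutex_unlock(&s->lock);
		free(copy);
		return false;
	}
	/* Commit: publish the copy and bump the generation. */
	free(s->data);
	s->data = copy;
	s->generation++;
	pthread_mutex_unlock(&s->lock);
	return true;
}

In the kernel path the rollback is more involved (LRU flags, mlock state and the isolation reference all have to be returned to the original page), but the locking pattern is the same.
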
diff --git a/mm/mmap.c b/mm/mmap.c
index 2b7d9e78a569..f54b235f29a9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -736,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end);
736 if (anon_vma) { 736 if (anon_vma) {
737 VM_BUG_ON(adjust_next && next->anon_vma && 737 VM_BUG_ON(adjust_next && next->anon_vma &&
738 anon_vma != next->anon_vma); 738 anon_vma != next->anon_vma);
739 anon_vma_lock(anon_vma); 739 anon_vma_lock_write(anon_vma);
740 anon_vma_interval_tree_pre_update_vma(vma); 740 anon_vma_interval_tree_pre_update_vma(vma);
741 if (adjust_next) 741 if (adjust_next)
742 anon_vma_interval_tree_pre_update_vma(next); 742 anon_vma_interval_tree_pre_update_vma(next);
@@ -2886,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2886 * The LSB of head.next can't change from under us 2886 * The LSB of head.next can't change from under us
2887 * because we hold the mm_all_locks_mutex. 2887 * because we hold the mm_all_locks_mutex.
2888 */ 2888 */
2889 mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); 2889 down_write(&anon_vma->root->rwsem);
2890 /* 2890 /*
2891 * We can safely modify head.next after taking the 2891 * We can safely modify head.next after taking the
2892 * anon_vma->root->mutex. If some other vma in this mm shares 2892 * anon_vma->root->rwsem. If some other vma in this mm shares
2893 * the same anon_vma we won't take it again. 2893 * the same anon_vma we won't take it again.
2894 * 2894 *
2895 * No need of atomic instructions here, head.next 2895 * No need of atomic instructions here, head.next
2896 * can't change from under us thanks to the 2896 * can't change from under us thanks to the
2897 * anon_vma->root->mutex. 2897 * anon_vma->root->rwsem.
2898 */ 2898 */
2899 if (__test_and_set_bit(0, (unsigned long *) 2899 if (__test_and_set_bit(0, (unsigned long *)
2900 &anon_vma->root->rb_root.rb_node)) 2900 &anon_vma->root->rb_root.rb_node))
@@ -2996,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2996 * 2996 *
2997 * No need of atomic instructions here, head.next 2997 * No need of atomic instructions here, head.next
2998 * can't change from under us until we release the 2998 * can't change from under us until we release the
2999 * anon_vma->root->mutex. 2999 * anon_vma->root->rwsem.
3000 */ 3000 */
3001 if (!__test_and_clear_bit(0, (unsigned long *) 3001 if (!__test_and_clear_bit(0, (unsigned long *)
3002 &anon_vma->root->rb_root.rb_node)) 3002 &anon_vma->root->rb_root.rb_node))
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e8c3938db6fa..3dca970367db 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
35} 35}
36#endif 36#endif
37 37
38static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, 38static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
39 unsigned long addr, unsigned long end, pgprot_t newprot, 39 unsigned long addr, unsigned long end, pgprot_t newprot,
40 int dirty_accountable) 40 int dirty_accountable, int prot_numa, bool *ret_all_same_node)
41{ 41{
42 struct mm_struct *mm = vma->vm_mm;
42 pte_t *pte, oldpte; 43 pte_t *pte, oldpte;
43 spinlock_t *ptl; 44 spinlock_t *ptl;
45 unsigned long pages = 0;
46 bool all_same_node = true;
47 int last_nid = -1;
44 48
45 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 49 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
46 arch_enter_lazy_mmu_mode(); 50 arch_enter_lazy_mmu_mode();
@@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
48 oldpte = *pte; 52 oldpte = *pte;
49 if (pte_present(oldpte)) { 53 if (pte_present(oldpte)) {
50 pte_t ptent; 54 pte_t ptent;
55 bool updated = false;
51 56
52 ptent = ptep_modify_prot_start(mm, addr, pte); 57 ptent = ptep_modify_prot_start(mm, addr, pte);
53 ptent = pte_modify(ptent, newprot); 58 if (!prot_numa) {
59 ptent = pte_modify(ptent, newprot);
60 updated = true;
61 } else {
62 struct page *page;
63
64 page = vm_normal_page(vma, addr, oldpte);
65 if (page) {
66 int this_nid = page_to_nid(page);
67 if (last_nid == -1)
68 last_nid = this_nid;
69 if (last_nid != this_nid)
70 all_same_node = false;
71
72 /* only check non-shared pages */
73 if (!pte_numa(oldpte) &&
74 page_mapcount(page) == 1) {
75 ptent = pte_mknuma(ptent);
76 updated = true;
77 }
78 }
79 }
54 80
55 /* 81 /*
56 * Avoid taking write faults for pages we know to be 82 * Avoid taking write faults for pages we know to be
57 * dirty. 83 * dirty.
58 */ 84 */
59 if (dirty_accountable && pte_dirty(ptent)) 85 if (dirty_accountable && pte_dirty(ptent)) {
60 ptent = pte_mkwrite(ptent); 86 ptent = pte_mkwrite(ptent);
87 updated = true;
88 }
61 89
90 if (updated)
91 pages++;
62 ptep_modify_prot_commit(mm, addr, pte, ptent); 92 ptep_modify_prot_commit(mm, addr, pte, ptent);
63 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { 93 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
64 swp_entry_t entry = pte_to_swp_entry(oldpte); 94 swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -72,18 +102,40 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
72 set_pte_at(mm, addr, pte, 102 set_pte_at(mm, addr, pte,
73 swp_entry_to_pte(entry)); 103 swp_entry_to_pte(entry));
74 } 104 }
105 pages++;
75 } 106 }
76 } while (pte++, addr += PAGE_SIZE, addr != end); 107 } while (pte++, addr += PAGE_SIZE, addr != end);
77 arch_leave_lazy_mmu_mode(); 108 arch_leave_lazy_mmu_mode();
78 pte_unmap_unlock(pte - 1, ptl); 109 pte_unmap_unlock(pte - 1, ptl);
110
111 *ret_all_same_node = all_same_node;
112 return pages;
79} 113}
80 114
81static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, 115#ifdef CONFIG_NUMA_BALANCING
116static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
117 pmd_t *pmd)
118{
119 spin_lock(&mm->page_table_lock);
120 set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
121 spin_unlock(&mm->page_table_lock);
122}
123#else
124static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
125 pmd_t *pmd)
126{
127 BUG();
128}
129#endif /* CONFIG_NUMA_BALANCING */
130
131static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
82 unsigned long addr, unsigned long end, pgprot_t newprot, 132 unsigned long addr, unsigned long end, pgprot_t newprot,
83 int dirty_accountable) 133 int dirty_accountable, int prot_numa)
84{ 134{
85 pmd_t *pmd; 135 pmd_t *pmd;
86 unsigned long next; 136 unsigned long next;
137 unsigned long pages = 0;
138 bool all_same_node;
87 139
88 pmd = pmd_offset(pud, addr); 140 pmd = pmd_offset(pud, addr);
89 do { 141 do {
@@ -91,42 +143,59 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
91 if (pmd_trans_huge(*pmd)) { 143 if (pmd_trans_huge(*pmd)) {
92 if (next - addr != HPAGE_PMD_SIZE) 144 if (next - addr != HPAGE_PMD_SIZE)
93 split_huge_page_pmd(vma, addr, pmd); 145 split_huge_page_pmd(vma, addr, pmd);
94 else if (change_huge_pmd(vma, pmd, addr, newprot)) 146 else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) {
147 pages += HPAGE_PMD_NR;
95 continue; 148 continue;
149 }
96 /* fall through */ 150 /* fall through */
97 } 151 }
98 if (pmd_none_or_clear_bad(pmd)) 152 if (pmd_none_or_clear_bad(pmd))
99 continue; 153 continue;
100 change_pte_range(vma->vm_mm, pmd, addr, next, newprot, 154 pages += change_pte_range(vma, pmd, addr, next, newprot,
101 dirty_accountable); 155 dirty_accountable, prot_numa, &all_same_node);
156
157 /*
158 * If we are changing protections for NUMA hinting faults then
159 * set pmd_numa if the examined pages were all on the same
160 * node. This allows a regular PMD to be handled as one fault
161 * and effectively batches the taking of the PTL
162 */
163 if (prot_numa && all_same_node)
164 change_pmd_protnuma(vma->vm_mm, addr, pmd);
102 } while (pmd++, addr = next, addr != end); 165 } while (pmd++, addr = next, addr != end);
166
167 return pages;
103} 168}
104 169
105static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 170static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
106 unsigned long addr, unsigned long end, pgprot_t newprot, 171 unsigned long addr, unsigned long end, pgprot_t newprot,
107 int dirty_accountable) 172 int dirty_accountable, int prot_numa)
108{ 173{
109 pud_t *pud; 174 pud_t *pud;
110 unsigned long next; 175 unsigned long next;
176 unsigned long pages = 0;
111 177
112 pud = pud_offset(pgd, addr); 178 pud = pud_offset(pgd, addr);
113 do { 179 do {
114 next = pud_addr_end(addr, end); 180 next = pud_addr_end(addr, end);
115 if (pud_none_or_clear_bad(pud)) 181 if (pud_none_or_clear_bad(pud))
116 continue; 182 continue;
117 change_pmd_range(vma, pud, addr, next, newprot, 183 pages += change_pmd_range(vma, pud, addr, next, newprot,
118 dirty_accountable); 184 dirty_accountable, prot_numa);
119 } while (pud++, addr = next, addr != end); 185 } while (pud++, addr = next, addr != end);
186
187 return pages;
120} 188}
121 189
122static void change_protection(struct vm_area_struct *vma, 190static unsigned long change_protection_range(struct vm_area_struct *vma,
123 unsigned long addr, unsigned long end, pgprot_t newprot, 191 unsigned long addr, unsigned long end, pgprot_t newprot,
124 int dirty_accountable) 192 int dirty_accountable, int prot_numa)
125{ 193{
126 struct mm_struct *mm = vma->vm_mm; 194 struct mm_struct *mm = vma->vm_mm;
127 pgd_t *pgd; 195 pgd_t *pgd;
128 unsigned long next; 196 unsigned long next;
129 unsigned long start = addr; 197 unsigned long start = addr;
198 unsigned long pages = 0;
130 199
131 BUG_ON(addr >= end); 200 BUG_ON(addr >= end);
132 pgd = pgd_offset(mm, addr); 201 pgd = pgd_offset(mm, addr);
@@ -135,10 +204,32 @@ static void change_protection(struct vm_area_struct *vma,
135 next = pgd_addr_end(addr, end); 204 next = pgd_addr_end(addr, end);
136 if (pgd_none_or_clear_bad(pgd)) 205 if (pgd_none_or_clear_bad(pgd))
137 continue; 206 continue;
138 change_pud_range(vma, pgd, addr, next, newprot, 207 pages += change_pud_range(vma, pgd, addr, next, newprot,
139 dirty_accountable); 208 dirty_accountable, prot_numa);
140 } while (pgd++, addr = next, addr != end); 209 } while (pgd++, addr = next, addr != end);
141 flush_tlb_range(vma, start, end); 210
211 /* Only flush the TLB if we actually modified any entries: */
212 if (pages)
213 flush_tlb_range(vma, start, end);
214
215 return pages;
216}
217
218unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
219 unsigned long end, pgprot_t newprot,
220 int dirty_accountable, int prot_numa)
221{
222 struct mm_struct *mm = vma->vm_mm;
223 unsigned long pages;
224
225 mmu_notifier_invalidate_range_start(mm, start, end);
226 if (is_vm_hugetlb_page(vma))
227 pages = hugetlb_change_protection(vma, start, end, newprot);
228 else
229 pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
230 mmu_notifier_invalidate_range_end(mm, start, end);
231
232 return pages;
142} 233}
143 234
144int 235int
@@ -213,12 +304,8 @@ success:
213 dirty_accountable = 1; 304 dirty_accountable = 1;
214 } 305 }
215 306
216 mmu_notifier_invalidate_range_start(mm, start, end); 307 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0);
217 if (is_vm_hugetlb_page(vma)) 308
218 hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
219 else
220 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
221 mmu_notifier_invalidate_range_end(mm, start, end);
222 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 309 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
223 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 310 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
224 perf_event_mmap(vma); 311 perf_event_mmap(vma);
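
Two things change at the upper levels of mprotect.c: every change_*_range() helper now returns how many entries it actually touched, and change_protection_range() only issues flush_tlb_range() when that total is non-zero, so a NUMA-hinting pass that finds nothing to mark costs no TLB shootdown. In addition, change_protection() now brackets both the hugetlb and regular paths with the mmu_notifier calls that mprotect_fixup() used to issue itself. The skip-the-flush idea in miniature, with a stubbed "flush" standing in for the real (and expensive) flush_tlb_range():

#include <stdio.h>

static unsigned long change_range(int *entries, unsigned long n, int newprot)
{
	unsigned long changed = 0;

	for (unsigned long i = 0; i < n; i++) {
		if (entries[i] != newprot) {
			entries[i] = newprot;
			changed++;
		}
	}
	return changed;
}

static void expensive_flush(void)
{
	puts("flush issued");      /* stand-in for flush_tlb_range() */
}

int main(void)
{
	int table[8] = { 0 };

	/* First pass modifies entries, so the flush is needed... */
	if (change_range(table, 8, 1))
		expensive_flush();
	/* ...the second pass changes nothing and skips it entirely. */
	if (change_range(table, 8, 1))
		expensive_flush();
	return 0;
}
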
diff --git a/mm/mremap.c b/mm/mremap.c
index eabb24da6c9e..e1031e1f6a61 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
104 } 104 }
105 if (vma->anon_vma) { 105 if (vma->anon_vma) {
106 anon_vma = vma->anon_vma; 106 anon_vma = vma->anon_vma;
107 anon_vma_lock(anon_vma); 107 anon_vma_lock_write(anon_vma);
108 } 108 }
109 } 109 }
110 110
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 83637dfba110..d037c8bc1512 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -611,6 +611,7 @@ static inline int free_pages_check(struct page *page)
611 bad_page(page); 611 bad_page(page);
612 return 1; 612 return 1;
613 } 613 }
614 reset_page_last_nid(page);
614 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 615 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
615 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 616 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
616 return 0; 617 return 0;
@@ -3883,6 +3884,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3883 mminit_verify_page_links(page, zone, nid, pfn); 3884 mminit_verify_page_links(page, zone, nid, pfn);
3884 init_page_count(page); 3885 init_page_count(page);
3885 reset_page_mapcount(page); 3886 reset_page_mapcount(page);
3887 reset_page_last_nid(page);
3886 SetPageReserved(page); 3888 SetPageReserved(page);
3887 /* 3889 /*
3888 * Mark the block movable so that blocks are reserved for 3890 * Mark the block movable so that blocks are reserved for
@@ -4526,6 +4528,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4526 int ret; 4528 int ret;
4527 4529
4528 pgdat_resize_init(pgdat); 4530 pgdat_resize_init(pgdat);
4531#ifdef CONFIG_NUMA_BALANCING
4532 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4533 pgdat->numabalancing_migrate_nr_pages = 0;
4534 pgdat->numabalancing_migrate_next_window = jiffies;
4535#endif
4529 init_waitqueue_head(&pgdat->kswapd_wait); 4536 init_waitqueue_head(&pgdat->kswapd_wait);
4530 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4537 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4531 pgdat_page_cgroup_init(pgdat); 4538 pgdat_page_cgroup_init(pgdat);
@@ -5800,7 +5807,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
5800 5807
5801 ret = migrate_pages(&cc->migratepages, 5808 ret = migrate_pages(&cc->migratepages,
5802 alloc_migrate_target, 5809 alloc_migrate_target,
5803 0, false, MIGRATE_SYNC); 5810 0, false, MIGRATE_SYNC,
5811 MR_CMA);
5804 } 5812 }
5805 5813
5806 putback_movable_pages(&cc->migratepages); 5814 putback_movable_pages(&cc->migratepages);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index e642627da6b7..0c8323fe6c8f 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -12,8 +12,8 @@
12 12
13#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS 13#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
14/* 14/*
15 * Only sets the access flags (dirty, accessed, and 15 * Only sets the access flags (dirty, accessed), as well as write
16 * writable). Furthermore, we know it always gets set to a "more 16 * permission. Furthermore, we know it always gets set to a "more
17 * permissive" setting, which allows most architectures to optimize 17 * permissive" setting, which allows most architectures to optimize
18 * this. We return whether the PTE actually changed, which in turn 18 * this. We return whether the PTE actually changed, which in turn
19 * instructs the caller to do things like update__mmu_cache. This 19 * instructs the caller to do things like update__mmu_cache. This
@@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
27 int changed = !pte_same(*ptep, entry); 27 int changed = !pte_same(*ptep, entry);
28 if (changed) { 28 if (changed) {
29 set_pte_at(vma->vm_mm, address, ptep, entry); 29 set_pte_at(vma->vm_mm, address, ptep, entry);
30 flush_tlb_page(vma, address); 30 flush_tlb_fix_spurious_fault(vma, address);
31 } 31 }
32 return changed; 32 return changed;
33} 33}
@@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
88{ 88{
89 pte_t pte; 89 pte_t pte;
90 pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); 90 pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
91 flush_tlb_page(vma, address); 91 if (pte_accessible(pte))
92 flush_tlb_page(vma, address);
92 return pte; 93 return pte;
93} 94}
94#endif 95#endif
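
Both pgtable-generic.c hunks avoid TLB maintenance that cannot matter: ptep_set_access_flags() only ever makes an entry more permissive, so fixing up a spurious fault is enough, and ptep_clear_flush() now skips the flush when pte_accessible() says the old entry could never have been loaded into a TLB in the first place. A trivial sketch of the second guard, with an invented pte stand-in:

#include <stdbool.h>
#include <stdio.h>

struct fake_pte {
	bool present;       /* could this entry have been cached by the TLB? */
	unsigned long pfn;
};

static bool pte_accessible_sketch(struct fake_pte pte)
{
	return pte.present;
}

static void flush_tlb_page_sketch(void)
{
	puts("TLB flush");  /* stand-in for the costly per-page shootdown */
}

/* Clear an entry and only flush when the hardware may have cached it. */
static struct fake_pte clear_flush_sketch(struct fake_pte *ptep)
{
	struct fake_pte old = *ptep;

	ptep->present = false;
	ptep->pfn = 0;
	if (pte_accessible_sketch(old))
		flush_tlb_page_sketch();
	return old;
}

int main(void)
{
	struct fake_pte pte = { .present = true, .pfn = 42 };

	clear_flush_sketch(&pte);   /* flushes                         */
	clear_flush_sketch(&pte);   /* already non-present: no flush   */
	return 0;
}
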
diff --git a/mm/rmap.c b/mm/rmap.c
index face808a489e..2c78f8cadc95 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
24 * mm->mmap_sem 24 * mm->mmap_sem
25 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
26 * mapping->i_mmap_mutex 26 * mapping->i_mmap_mutex
27 * anon_vma->mutex 27 * anon_vma->rwsem
28 * mm->page_table_lock or pte_lock 28 * mm->page_table_lock or pte_lock
29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
30 * swap_lock (in swap_duplicate, swap_info_get) 30 * swap_lock (in swap_duplicate, swap_info_get)
@@ -37,7 +37,7 @@
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within bdi.wb->list_lock in __sync_single_inode) 38 * within bdi.wb->list_lock in __sync_single_inode)
39 * 39 *
40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) 40 * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
41 * ->tasklist_lock 41 * ->tasklist_lock
42 * pte map lock 42 * pte map lock
43 */ 43 */
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
87 VM_BUG_ON(atomic_read(&anon_vma->refcount)); 87 VM_BUG_ON(atomic_read(&anon_vma->refcount));
88 88
89 /* 89 /*
90 * Synchronize against page_lock_anon_vma() such that 90 * Synchronize against page_lock_anon_vma_read() such that
91 * we can safely hold the lock without the anon_vma getting 91 * we can safely hold the lock without the anon_vma getting
92 * freed. 92 * freed.
93 * 93 *
94 * Relies on the full mb implied by the atomic_dec_and_test() from 94 * Relies on the full mb implied by the atomic_dec_and_test() from
95 * put_anon_vma() against the acquire barrier implied by 95 * put_anon_vma() against the acquire barrier implied by
96 * mutex_trylock() from page_lock_anon_vma(). This orders: 96 * down_read_trylock() from page_lock_anon_vma_read(). This orders:
97 * 97 *
98 * page_lock_anon_vma() VS put_anon_vma() 98 * page_lock_anon_vma_read() VS put_anon_vma()
99 * mutex_trylock() atomic_dec_and_test() 99 * down_read_trylock() atomic_dec_and_test()
100 * LOCK MB 100 * LOCK MB
101 * atomic_read() mutex_is_locked() 101 * atomic_read() rwsem_is_locked()
102 * 102 *
103 * LOCK should suffice since the actual taking of the lock must 103 * LOCK should suffice since the actual taking of the lock must
104 * happen _before_ what follows. 104 * happen _before_ what follows.
105 */ 105 */
106 if (mutex_is_locked(&anon_vma->root->mutex)) { 106 if (rwsem_is_locked(&anon_vma->root->rwsem)) {
107 anon_vma_lock(anon_vma); 107 anon_vma_lock_write(anon_vma);
108 anon_vma_unlock(anon_vma); 108 anon_vma_unlock(anon_vma);
109 } 109 }
110 110
@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
146 * allocate a new one. 146 * allocate a new one.
147 * 147 *
148 * Anon-vma allocations are very subtle, because we may have 148 * Anon-vma allocations are very subtle, because we may have
149 * optimistically looked up an anon_vma in page_lock_anon_vma() 149 * optimistically looked up an anon_vma in page_lock_anon_vma_read()
150 * and that may actually touch the spinlock even in the newly 150 * and that may actually touch the spinlock even in the newly
151 * allocated vma (it depends on RCU to make sure that the 151 * allocated vma (it depends on RCU to make sure that the
152 * anon_vma isn't actually destroyed). 152 * anon_vma isn't actually destroyed).
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
181 allocated = anon_vma; 181 allocated = anon_vma;
182 } 182 }
183 183
184 anon_vma_lock(anon_vma); 184 anon_vma_lock_write(anon_vma);
185 /* page_table_lock to protect against threads */ 185 /* page_table_lock to protect against threads */
186 spin_lock(&mm->page_table_lock); 186 spin_lock(&mm->page_table_lock);
187 if (likely(!vma->anon_vma)) { 187 if (likely(!vma->anon_vma)) {
@@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
219 struct anon_vma *new_root = anon_vma->root; 219 struct anon_vma *new_root = anon_vma->root;
220 if (new_root != root) { 220 if (new_root != root) {
221 if (WARN_ON_ONCE(root)) 221 if (WARN_ON_ONCE(root))
222 mutex_unlock(&root->mutex); 222 up_write(&root->rwsem);
223 root = new_root; 223 root = new_root;
224 mutex_lock(&root->mutex); 224 down_write(&root->rwsem);
225 } 225 }
226 return root; 226 return root;
227} 227}
@@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
229static inline void unlock_anon_vma_root(struct anon_vma *root) 229static inline void unlock_anon_vma_root(struct anon_vma *root)
230{ 230{
231 if (root) 231 if (root)
232 mutex_unlock(&root->mutex); 232 up_write(&root->rwsem);
233} 233}
234 234
235/* 235/*
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
306 get_anon_vma(anon_vma->root); 306 get_anon_vma(anon_vma->root);
307 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 307 /* Mark this anon_vma as the one where our new (COWed) pages go. */
308 vma->anon_vma = anon_vma; 308 vma->anon_vma = anon_vma;
309 anon_vma_lock(anon_vma); 309 anon_vma_lock_write(anon_vma);
310 anon_vma_chain_link(vma, avc, anon_vma); 310 anon_vma_chain_link(vma, avc, anon_vma);
311 anon_vma_unlock(anon_vma); 311 anon_vma_unlock(anon_vma);
312 312
@@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
349 /* 349 /*
350 * Iterate the list once more, it now only contains empty and unlinked 350 * Iterate the list once more, it now only contains empty and unlinked
351 * anon_vmas, destroy them. Could not do before due to __put_anon_vma() 351 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
352 * needing to acquire the anon_vma->root->mutex. 352 * needing to write-acquire the anon_vma->root->rwsem.
353 */ 353 */
354 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 354 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
355 struct anon_vma *anon_vma = avc->anon_vma; 355 struct anon_vma *anon_vma = avc->anon_vma;
@@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data)
365{ 365{
366 struct anon_vma *anon_vma = data; 366 struct anon_vma *anon_vma = data;
367 367
368 mutex_init(&anon_vma->mutex); 368 init_rwsem(&anon_vma->rwsem);
369 atomic_set(&anon_vma->refcount, 0); 369 atomic_set(&anon_vma->refcount, 0);
370 anon_vma->rb_root = RB_ROOT; 370 anon_vma->rb_root = RB_ROOT;
371} 371}
@@ -442,7 +442,7 @@ out:
442 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a 442 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
443 * reference like with page_get_anon_vma() and then block on the mutex. 443 * reference like with page_get_anon_vma() and then block on the mutex.
444 */ 444 */
445struct anon_vma *page_lock_anon_vma(struct page *page) 445struct anon_vma *page_lock_anon_vma_read(struct page *page)
446{ 446{
447 struct anon_vma *anon_vma = NULL; 447 struct anon_vma *anon_vma = NULL;
448 struct anon_vma *root_anon_vma; 448 struct anon_vma *root_anon_vma;
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
457 457
458 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 458 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
459 root_anon_vma = ACCESS_ONCE(anon_vma->root); 459 root_anon_vma = ACCESS_ONCE(anon_vma->root);
460 if (mutex_trylock(&root_anon_vma->mutex)) { 460 if (down_read_trylock(&root_anon_vma->rwsem)) {
461 /* 461 /*
462 * If the page is still mapped, then this anon_vma is still 462 * If the page is still mapped, then this anon_vma is still
463 * its anon_vma, and holding the mutex ensures that it will 463 * its anon_vma, and holding the mutex ensures that it will
464 * not go away, see anon_vma_free(). 464 * not go away, see anon_vma_free().
465 */ 465 */
466 if (!page_mapped(page)) { 466 if (!page_mapped(page)) {
467 mutex_unlock(&root_anon_vma->mutex); 467 up_read(&root_anon_vma->rwsem);
468 anon_vma = NULL; 468 anon_vma = NULL;
469 } 469 }
470 goto out; 470 goto out;
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
484 484
485 /* we pinned the anon_vma, its safe to sleep */ 485 /* we pinned the anon_vma, its safe to sleep */
486 rcu_read_unlock(); 486 rcu_read_unlock();
487 anon_vma_lock(anon_vma); 487 anon_vma_lock_read(anon_vma);
488 488
489 if (atomic_dec_and_test(&anon_vma->refcount)) { 489 if (atomic_dec_and_test(&anon_vma->refcount)) {
490 /* 490 /*
491 * Oops, we held the last refcount, release the lock 491 * Oops, we held the last refcount, release the lock
492 * and bail -- can't simply use put_anon_vma() because 492 * and bail -- can't simply use put_anon_vma() because
493 * we'll deadlock on the anon_vma_lock() recursion. 493 * we'll deadlock on the anon_vma_lock_write() recursion.
494 */ 494 */
495 anon_vma_unlock(anon_vma); 495 anon_vma_unlock_read(anon_vma);
496 __put_anon_vma(anon_vma); 496 __put_anon_vma(anon_vma);
497 anon_vma = NULL; 497 anon_vma = NULL;
498 } 498 }
@@ -504,9 +504,9 @@ out:
504 return anon_vma; 504 return anon_vma;
505} 505}
506 506
507void page_unlock_anon_vma(struct anon_vma *anon_vma) 507void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
508{ 508{
509 anon_vma_unlock(anon_vma); 509 anon_vma_unlock_read(anon_vma);
510} 510}
511 511
512/* 512/*
@@ -744,7 +744,7 @@ static int page_referenced_anon(struct page *page,
744 struct anon_vma_chain *avc; 744 struct anon_vma_chain *avc;
745 int referenced = 0; 745 int referenced = 0;
746 746
747 anon_vma = page_lock_anon_vma(page); 747 anon_vma = page_lock_anon_vma_read(page);
748 if (!anon_vma) 748 if (!anon_vma)
749 return referenced; 749 return referenced;
750 750
@@ -766,7 +766,7 @@ static int page_referenced_anon(struct page *page,
766 break; 766 break;
767 } 767 }
768 768
769 page_unlock_anon_vma(anon_vma); 769 page_unlock_anon_vma_read(anon_vma);
770 return referenced; 770 return referenced;
771} 771}
772 772
@@ -1315,7 +1315,7 @@ out_mlock:
1315 /* 1315 /*
1316 * We need mmap_sem locking, Otherwise VM_LOCKED check makes 1316 * We need mmap_sem locking, Otherwise VM_LOCKED check makes
1317 * unstable result and race. Plus, We can't wait here because 1317 * unstable result and race. Plus, We can't wait here because
1318 * we now hold anon_vma->mutex or mapping->i_mmap_mutex. 1318 * we now hold anon_vma->rwsem or mapping->i_mmap_mutex.
1319 * if trylock failed, the page remain in evictable lru and later 1319 * if trylock failed, the page remain in evictable lru and later
1320 * vmscan could retry to move the page to unevictable lru if the 1320 * vmscan could retry to move the page to unevictable lru if the
1321 * page is actually mlocked. 1321 * page is actually mlocked.
@@ -1480,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1480 struct anon_vma_chain *avc; 1480 struct anon_vma_chain *avc;
1481 int ret = SWAP_AGAIN; 1481 int ret = SWAP_AGAIN;
1482 1482
1483 anon_vma = page_lock_anon_vma(page); 1483 anon_vma = page_lock_anon_vma_read(page);
1484 if (!anon_vma) 1484 if (!anon_vma)
1485 return ret; 1485 return ret;
1486 1486
@@ -1507,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1507 break; 1507 break;
1508 } 1508 }
1509 1509
1510 page_unlock_anon_vma(anon_vma); 1510 page_unlock_anon_vma_read(anon_vma);
1511 return ret; 1511 return ret;
1512} 1512}
1513 1513
@@ -1702,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1702 int ret = SWAP_AGAIN; 1702 int ret = SWAP_AGAIN;
1703 1703
1704 /* 1704 /*
1705 * Note: remove_migration_ptes() cannot use page_lock_anon_vma() 1705 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
1706 * because that depends on page_mapped(); but not all its usages 1706 * because that depends on page_mapped(); but not all its usages
1707 * are holding mmap_sem. Users without mmap_sem are required to 1707 * are holding mmap_sem. Users without mmap_sem are required to
1708 * take a reference count to prevent the anon_vma disappearing 1708 * take a reference count to prevent the anon_vma disappearing
@@ -1710,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1710 anon_vma = page_anon_vma(page); 1710 anon_vma = page_anon_vma(page);
1711 if (!anon_vma) 1711 if (!anon_vma)
1712 return ret; 1712 return ret;
1713 anon_vma_lock(anon_vma); 1713 anon_vma_lock_read(anon_vma);
1714 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1714 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1715 struct vm_area_struct *vma = avc->vma; 1715 struct vm_area_struct *vma = avc->vma;
1716 unsigned long address = vma_address(page, vma); 1716 unsigned long address = vma_address(page, vma);
@@ -1718,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1718 if (ret != SWAP_AGAIN) 1718 if (ret != SWAP_AGAIN)
1719 break; 1719 break;
1720 } 1720 }
1721 anon_vma_unlock(anon_vma); 1721 anon_vma_unlock_read(anon_vma);
1722 return ret; 1722 return ret;
1723} 1723}
1724 1724
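
The anon_vma lock conversion replaces a mutex with an rwsem so that reverse-map walkers such as page_referenced_anon(), try_to_unmap_anon() and rmap_walk_anon() can hold the lock for reading concurrently, while writers that rewire the interval tree (anon_vma_lock_write(), lock_anon_vma_root()) still get exclusion. The user-space equivalent of that split, including the try-read-lock fast path used by page_lock_anon_vma_read(), is POSIX pthread_rwlock; the code below is an illustrative analogue, not the kernel API.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t tree_lock = PTHREAD_RWLOCK_INITIALIZER;
static int interval_tree_nodes;     /* stand-in for anon_vma->rb_root */

/* Many walkers may run this at once: read lock, like anon_vma_lock_read(). */
static int walk_tree(void)
{
	int seen;

	pthread_rwlock_rdlock(&tree_lock);
	seen = interval_tree_nodes;
	pthread_rwlock_unlock(&tree_lock);
	return seen;
}

/* Structural changes need the write lock, like anon_vma_lock_write(). */
static void link_vma(void)
{
	pthread_rwlock_wrlock(&tree_lock);
	interval_tree_nodes++;
	pthread_rwlock_unlock(&tree_lock);
}

/*
 * Opportunistic fast path, like down_read_trylock() in
 * page_lock_anon_vma_read(): fall back to a slower path on contention.
 */
static bool try_walk_tree(int *seen)
{
	if (pthread_rwlock_tryrdlock(&tree_lock) != 0)
		return false;            /* writer active: take the slow path */
	*seen = interval_tree_nodes;
	pthread_rwlock_unlock(&tree_lock);
	return true;
}

int main(void)
{
	int seen;

	link_vma();
	if (!try_walk_tree(&seen))
		seen = walk_tree();
	printf("nodes=%d\n", seen);
	return 0;
}
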
diff --git a/mm/vmstat.c b/mm/vmstat.c
index df14808f0a36..9800306c8195 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -774,10 +774,20 @@ const char * const vmstat_text[] = {
774 774
775 "pgrotated", 775 "pgrotated",
776 776
777#ifdef CONFIG_NUMA_BALANCING
778 "numa_pte_updates",
779 "numa_hint_faults",
780 "numa_hint_faults_local",
781 "numa_pages_migrated",
782#endif
783#ifdef CONFIG_MIGRATION
784 "pgmigrate_success",
785 "pgmigrate_fail",
786#endif
777#ifdef CONFIG_COMPACTION 787#ifdef CONFIG_COMPACTION
778 "compact_blocks_moved", 788 "compact_migrate_scanned",
779 "compact_pages_moved", 789 "compact_free_scanned",
780 "compact_pagemigrate_failed", 790 "compact_isolated",
781 "compact_stall", 791 "compact_stall",
782 "compact_fail", 792 "compact_fail",
783 "compact_success", 793 "compact_success",