Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c       |  15
-rw-r--r--  mm/huge_memory.c      | 108
-rw-r--r--  mm/hugetlb.c          |  10
-rw-r--r--  mm/internal.h         |   7
-rw-r--r--  mm/ksm.c              |   6
-rw-r--r--  mm/memcontrol.c       |   7
-rw-r--r--  mm/memory-failure.c   |   7
-rw-r--r--  mm/memory.c           | 198
-rw-r--r--  mm/memory_hotplug.c   |   3
-rw-r--r--  mm/mempolicy.c        | 283
-rw-r--r--  mm/migrate.c          | 337
-rw-r--r--  mm/mmap.c             |  10
-rw-r--r--  mm/mprotect.c         | 135
-rw-r--r--  mm/mremap.c           |   2
-rw-r--r--  mm/page_alloc.c       |  10
-rw-r--r--  mm/pgtable-generic.c  |   9
-rw-r--r--  mm/rmap.c             |  66
-rw-r--r--  mm/vmstat.c           |  16
18 files changed, 1098 insertions, 131 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 129791218226..5ad7f4f4d6f7 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -303,6 +303,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
303 | if (blockpfn == end_pfn) | 303 | if (blockpfn == end_pfn) |
304 | update_pageblock_skip(cc, valid_page, total_isolated, false); | 304 | update_pageblock_skip(cc, valid_page, total_isolated, false); |
305 | 305 | ||
306 | count_vm_events(COMPACTFREE_SCANNED, nr_scanned); | ||
307 | if (total_isolated) | ||
308 | count_vm_events(COMPACTISOLATED, total_isolated); | ||
309 | |||
306 | return total_isolated; | 310 | return total_isolated; |
307 | } | 311 | } |
308 | 312 | ||
@@ -609,6 +613,10 @@ next_pageblock: | |||
609 | 613 | ||
610 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 614 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
611 | 615 | ||
616 | count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned); | ||
617 | if (nr_isolated) | ||
618 | count_vm_events(COMPACTISOLATED, nr_isolated); | ||
619 | |||
612 | return low_pfn; | 620 | return low_pfn; |
613 | } | 621 | } |
614 | 622 | ||
@@ -1015,14 +1023,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1015 | nr_migrate = cc->nr_migratepages; | 1023 | nr_migrate = cc->nr_migratepages; |
1016 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 1024 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
1017 | (unsigned long)cc, false, | 1025 | (unsigned long)cc, false, |
1018 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); | 1026 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, |
1027 | MR_COMPACTION); | ||
1019 | update_nr_listpages(cc); | 1028 | update_nr_listpages(cc); |
1020 | nr_remaining = cc->nr_migratepages; | 1029 | nr_remaining = cc->nr_migratepages; |
1021 | 1030 | ||
1022 | count_vm_event(COMPACTBLOCKS); | ||
1023 | count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); | ||
1024 | if (nr_remaining) | ||
1025 | count_vm_events(COMPACTPAGEFAILED, nr_remaining); | ||
1026 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, | 1031 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, |
1027 | nr_remaining); | 1032 | nr_remaining); |
1028 | 1033 | ||
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 827d9c813051..d7ee1691fd21 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/freezer.h> | 19 | #include <linux/freezer.h> |
20 | #include <linux/mman.h> | 20 | #include <linux/mman.h> |
21 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/migrate.h> | ||
22 | 23 | ||
23 | #include <asm/tlb.h> | 24 | #include <asm/tlb.h> |
24 | #include <asm/pgalloc.h> | 25 | #include <asm/pgalloc.h> |
@@ -690,7 +691,7 @@ out: | |||
690 | } | 691 | } |
691 | __setup("transparent_hugepage=", setup_transparent_hugepage); | 692 | __setup("transparent_hugepage=", setup_transparent_hugepage); |
692 | 693 | ||
693 | static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | 694 | pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) |
694 | { | 695 | { |
695 | if (likely(vma->vm_flags & VM_WRITE)) | 696 | if (likely(vma->vm_flags & VM_WRITE)) |
696 | pmd = pmd_mkwrite(pmd); | 697 | pmd = pmd_mkwrite(pmd); |
@@ -848,7 +849,8 @@ out: | |||
848 | * run pte_offset_map on the pmd, if an huge pmd could | 849 | * run pte_offset_map on the pmd, if an huge pmd could |
849 | * materialize from under us from a different thread. | 850 | * materialize from under us from a different thread. |
850 | */ | 851 | */ |
851 | if (unlikely(__pte_alloc(mm, vma, pmd, address))) | 852 | if (unlikely(pmd_none(*pmd)) && |
853 | unlikely(__pte_alloc(mm, vma, pmd, address))) | ||
852 | return VM_FAULT_OOM; | 854 | return VM_FAULT_OOM; |
853 | /* if an huge pmd materialized from under us just retry later */ | 855 | /* if an huge pmd materialized from under us just retry later */ |
854 | if (unlikely(pmd_trans_huge(*pmd))) | 856 | if (unlikely(pmd_trans_huge(*pmd))) |
@@ -1287,6 +1289,81 @@ out: | |||
1287 | return page; | 1289 | return page; |
1288 | } | 1290 | } |
1289 | 1291 | ||
1292 | /* NUMA hinting page fault entry point for trans huge pmds */ | ||
1293 | int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
1294 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) | ||
1295 | { | ||
1296 | struct page *page; | ||
1297 | unsigned long haddr = addr & HPAGE_PMD_MASK; | ||
1298 | int target_nid; | ||
1299 | int current_nid = -1; | ||
1300 | bool migrated; | ||
1301 | bool page_locked = false; | ||
1302 | |||
1303 | spin_lock(&mm->page_table_lock); | ||
1304 | if (unlikely(!pmd_same(pmd, *pmdp))) | ||
1305 | goto out_unlock; | ||
1306 | |||
1307 | page = pmd_page(pmd); | ||
1308 | get_page(page); | ||
1309 | current_nid = page_to_nid(page); | ||
1310 | count_vm_numa_event(NUMA_HINT_FAULTS); | ||
1311 | if (current_nid == numa_node_id()) | ||
1312 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | ||
1313 | |||
1314 | target_nid = mpol_misplaced(page, vma, haddr); | ||
1315 | if (target_nid == -1) { | ||
1316 | put_page(page); | ||
1317 | goto clear_pmdnuma; | ||
1318 | } | ||
1319 | |||
1320 | /* Acquire the page lock to serialise THP migrations */ | ||
1321 | spin_unlock(&mm->page_table_lock); | ||
1322 | lock_page(page); | ||
1323 | page_locked = true; | ||
1324 | |||
1325 | /* Confirm the PTE did not change while the page was locked */ | ||
1326 | spin_lock(&mm->page_table_lock); | ||
1327 | if (unlikely(!pmd_same(pmd, *pmdp))) { | ||
1328 | unlock_page(page); | ||
1329 | put_page(page); | ||
1330 | goto out_unlock; | ||
1331 | } | ||
1332 | spin_unlock(&mm->page_table_lock); | ||
1333 | |||
1334 | /* Migrate the THP to the requested node */ | ||
1335 | migrated = migrate_misplaced_transhuge_page(mm, vma, | ||
1336 | pmdp, pmd, addr, | ||
1337 | page, target_nid); | ||
1338 | if (migrated) | ||
1339 | current_nid = target_nid; | ||
1340 | else { | ||
1341 | spin_lock(&mm->page_table_lock); | ||
1342 | if (unlikely(!pmd_same(pmd, *pmdp))) { | ||
1343 | unlock_page(page); | ||
1344 | goto out_unlock; | ||
1345 | } | ||
1346 | goto clear_pmdnuma; | ||
1347 | } | ||
1348 | |||
1349 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | ||
1350 | return 0; | ||
1351 | |||
1352 | clear_pmdnuma: | ||
1353 | pmd = pmd_mknonnuma(pmd); | ||
1354 | set_pmd_at(mm, haddr, pmdp, pmd); | ||
1355 | VM_BUG_ON(pmd_numa(*pmdp)); | ||
1356 | update_mmu_cache_pmd(vma, addr, pmdp); | ||
1357 | if (page_locked) | ||
1358 | unlock_page(page); | ||
1359 | |||
1360 | out_unlock: | ||
1361 | spin_unlock(&mm->page_table_lock); | ||
1362 | if (current_nid != -1) | ||
1363 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | ||
1364 | return 0; | ||
1365 | } | ||
1366 | |||
1290 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | 1367 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, |
1291 | pmd_t *pmd, unsigned long addr) | 1368 | pmd_t *pmd, unsigned long addr) |
1292 | { | 1369 | { |
@@ -1375,7 +1452,7 @@ out: | |||
1375 | } | 1452 | } |
1376 | 1453 | ||
1377 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | 1454 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
1378 | unsigned long addr, pgprot_t newprot) | 1455 | unsigned long addr, pgprot_t newprot, int prot_numa) |
1379 | { | 1456 | { |
1380 | struct mm_struct *mm = vma->vm_mm; | 1457 | struct mm_struct *mm = vma->vm_mm; |
1381 | int ret = 0; | 1458 | int ret = 0; |
@@ -1383,7 +1460,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1383 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1460 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1384 | pmd_t entry; | 1461 | pmd_t entry; |
1385 | entry = pmdp_get_and_clear(mm, addr, pmd); | 1462 | entry = pmdp_get_and_clear(mm, addr, pmd); |
1386 | entry = pmd_modify(entry, newprot); | 1463 | if (!prot_numa) |
1464 | entry = pmd_modify(entry, newprot); | ||
1465 | else { | ||
1466 | struct page *page = pmd_page(*pmd); | ||
1467 | |||
1468 | /* only check non-shared pages */ | ||
1469 | if (page_mapcount(page) == 1 && | ||
1470 | !pmd_numa(*pmd)) { | ||
1471 | entry = pmd_mknuma(entry); | ||
1472 | } | ||
1473 | } | ||
1387 | BUG_ON(pmd_write(entry)); | 1474 | BUG_ON(pmd_write(entry)); |
1388 | set_pmd_at(mm, addr, pmd, entry); | 1475 | set_pmd_at(mm, addr, pmd, entry); |
1389 | spin_unlock(&vma->vm_mm->page_table_lock); | 1476 | spin_unlock(&vma->vm_mm->page_table_lock); |
@@ -1474,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page, | |||
1474 | * We can't temporarily set the pmd to null in order | 1561 | * We can't temporarily set the pmd to null in order |
1475 | * to split it, the pmd must remain marked huge at all | 1562 | * to split it, the pmd must remain marked huge at all |
1476 | * times or the VM won't take the pmd_trans_huge paths | 1563 | * times or the VM won't take the pmd_trans_huge paths |
1477 | * and it won't wait on the anon_vma->root->mutex to | 1564 | * and it won't wait on the anon_vma->root->rwsem to |
1478 | * serialize against split_huge_page*. | 1565 | * serialize against split_huge_page*. |
1479 | */ | 1566 | */ |
1480 | pmdp_splitting_flush(vma, address, pmd); | 1567 | pmdp_splitting_flush(vma, address, pmd); |
@@ -1565,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page) | |||
1565 | page_tail->mapping = page->mapping; | 1652 | page_tail->mapping = page->mapping; |
1566 | 1653 | ||
1567 | page_tail->index = page->index + i; | 1654 | page_tail->index = page->index + i; |
1655 | page_xchg_last_nid(page_tail, page_last_nid(page)); | ||
1568 | 1656 | ||
1569 | BUG_ON(!PageAnon(page_tail)); | 1657 | BUG_ON(!PageAnon(page_tail)); |
1570 | BUG_ON(!PageUptodate(page_tail)); | 1658 | BUG_ON(!PageUptodate(page_tail)); |
@@ -1632,6 +1720,8 @@ static int __split_huge_page_map(struct page *page, | |||
1632 | BUG_ON(page_mapcount(page) != 1); | 1720 | BUG_ON(page_mapcount(page) != 1); |
1633 | if (!pmd_young(*pmd)) | 1721 | if (!pmd_young(*pmd)) |
1634 | entry = pte_mkold(entry); | 1722 | entry = pte_mkold(entry); |
1723 | if (pmd_numa(*pmd)) | ||
1724 | entry = pte_mknuma(entry); | ||
1635 | pte = pte_offset_map(&_pmd, haddr); | 1725 | pte = pte_offset_map(&_pmd, haddr); |
1636 | BUG_ON(!pte_none(*pte)); | 1726 | BUG_ON(!pte_none(*pte)); |
1637 | set_pte_at(mm, haddr, pte, entry); | 1727 | set_pte_at(mm, haddr, pte, entry); |
@@ -1674,7 +1764,7 @@ static int __split_huge_page_map(struct page *page, | |||
1674 | return ret; | 1764 | return ret; |
1675 | } | 1765 | } |
1676 | 1766 | ||
1677 | /* must be called with anon_vma->root->mutex hold */ | 1767 | /* must be called with anon_vma->root->rwsem held */ |
1678 | static void __split_huge_page(struct page *page, | 1768 | static void __split_huge_page(struct page *page, |
1679 | struct anon_vma *anon_vma) | 1769 | struct anon_vma *anon_vma) |
1680 | { | 1770 | { |
@@ -1729,7 +1819,7 @@ int split_huge_page(struct page *page) | |||
1729 | 1819 | ||
1730 | BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); | 1820 | BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); |
1731 | BUG_ON(!PageAnon(page)); | 1821 | BUG_ON(!PageAnon(page)); |
1732 | anon_vma = page_lock_anon_vma(page); | 1822 | anon_vma = page_lock_anon_vma_read(page); |
1733 | if (!anon_vma) | 1823 | if (!anon_vma) |
1734 | goto out; | 1824 | goto out; |
1735 | ret = 0; | 1825 | ret = 0; |
@@ -1742,7 +1832,7 @@ int split_huge_page(struct page *page) | |||
1742 | 1832 | ||
1743 | BUG_ON(PageCompound(page)); | 1833 | BUG_ON(PageCompound(page)); |
1744 | out_unlock: | 1834 | out_unlock: |
1745 | page_unlock_anon_vma(anon_vma); | 1835 | page_unlock_anon_vma_read(anon_vma); |
1746 | out: | 1836 | out: |
1747 | return ret; | 1837 | return ret; |
1748 | } | 1838 | } |
@@ -2234,7 +2324,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2234 | if (pmd_trans_huge(*pmd)) | 2324 | if (pmd_trans_huge(*pmd)) |
2235 | goto out; | 2325 | goto out; |
2236 | 2326 | ||
2237 | anon_vma_lock(vma->anon_vma); | 2327 | anon_vma_lock_write(vma->anon_vma); |
2238 | 2328 | ||
2239 | pte = pte_offset_map(pmd, address); | 2329 | pte = pte_offset_map(pmd, address); |
2240 | ptl = pte_lockptr(mm, pmd); | 2330 | ptl = pte_lockptr(mm, pmd); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 88e7293b96bd..e5318c7793ae 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3016,7 +3016,7 @@ same_page: | |||
3016 | return i ? i : -EFAULT; | 3016 | return i ? i : -EFAULT; |
3017 | } | 3017 | } |
3018 | 3018 | ||
3019 | void hugetlb_change_protection(struct vm_area_struct *vma, | 3019 | unsigned long hugetlb_change_protection(struct vm_area_struct *vma, |
3020 | unsigned long address, unsigned long end, pgprot_t newprot) | 3020 | unsigned long address, unsigned long end, pgprot_t newprot) |
3021 | { | 3021 | { |
3022 | struct mm_struct *mm = vma->vm_mm; | 3022 | struct mm_struct *mm = vma->vm_mm; |
@@ -3024,6 +3024,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
3024 | pte_t *ptep; | 3024 | pte_t *ptep; |
3025 | pte_t pte; | 3025 | pte_t pte; |
3026 | struct hstate *h = hstate_vma(vma); | 3026 | struct hstate *h = hstate_vma(vma); |
3027 | unsigned long pages = 0; | ||
3027 | 3028 | ||
3028 | BUG_ON(address >= end); | 3029 | BUG_ON(address >= end); |
3029 | flush_cache_range(vma, address, end); | 3030 | flush_cache_range(vma, address, end); |
@@ -3034,12 +3035,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
3034 | ptep = huge_pte_offset(mm, address); | 3035 | ptep = huge_pte_offset(mm, address); |
3035 | if (!ptep) | 3036 | if (!ptep) |
3036 | continue; | 3037 | continue; |
3037 | if (huge_pmd_unshare(mm, &address, ptep)) | 3038 | if (huge_pmd_unshare(mm, &address, ptep)) { |
3039 | pages++; | ||
3038 | continue; | 3040 | continue; |
3041 | } | ||
3039 | if (!huge_pte_none(huge_ptep_get(ptep))) { | 3042 | if (!huge_pte_none(huge_ptep_get(ptep))) { |
3040 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 3043 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
3041 | pte = pte_mkhuge(pte_modify(pte, newprot)); | 3044 | pte = pte_mkhuge(pte_modify(pte, newprot)); |
3042 | set_huge_pte_at(mm, address, ptep, pte); | 3045 | set_huge_pte_at(mm, address, ptep, pte); |
3046 | pages++; | ||
3043 | } | 3047 | } |
3044 | } | 3048 | } |
3045 | spin_unlock(&mm->page_table_lock); | 3049 | spin_unlock(&mm->page_table_lock); |
@@ -3051,6 +3055,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
3051 | */ | 3055 | */ |
3052 | flush_tlb_range(vma, start, end); | 3056 | flush_tlb_range(vma, start, end); |
3053 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 3057 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); |
3058 | |||
3059 | return pages << h->order; | ||
3054 | } | 3060 | } |
3055 | 3061 | ||
3056 | int hugetlb_reserve_pages(struct inode *inode, | 3062 | int hugetlb_reserve_pages(struct inode *inode, |
diff --git a/mm/internal.h b/mm/internal.h
index 52d1fa957194..d597f94cc205 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -217,15 +217,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) | |||
217 | { | 217 | { |
218 | if (TestClearPageMlocked(page)) { | 218 | if (TestClearPageMlocked(page)) { |
219 | unsigned long flags; | 219 | unsigned long flags; |
220 | int nr_pages = hpage_nr_pages(page); | ||
220 | 221 | ||
221 | local_irq_save(flags); | 222 | local_irq_save(flags); |
222 | __dec_zone_page_state(page, NR_MLOCK); | 223 | __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); |
223 | SetPageMlocked(newpage); | 224 | SetPageMlocked(newpage); |
224 | __inc_zone_page_state(newpage, NR_MLOCK); | 225 | __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); |
225 | local_irq_restore(flags); | 226 | local_irq_restore(flags); |
226 | } | 227 | } |
227 | } | 228 | } |
228 | 229 | ||
230 | extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); | ||
231 | |||
229 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 232 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
230 | extern unsigned long vma_address(struct page *page, | 233 | extern unsigned long vma_address(struct page *page, |
231 | struct vm_area_struct *vma); | 234 | struct vm_area_struct *vma); |
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1624,7 +1624,7 @@ again: | |||
1624 | struct anon_vma_chain *vmac; | 1624 | struct anon_vma_chain *vmac; |
1625 | struct vm_area_struct *vma; | 1625 | struct vm_area_struct *vma; |
1626 | 1626 | ||
1627 | anon_vma_lock(anon_vma); | 1627 | anon_vma_lock_write(anon_vma); |
1628 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1628 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1629 | 0, ULONG_MAX) { | 1629 | 0, ULONG_MAX) { |
1630 | vma = vmac->vma; | 1630 | vma = vmac->vma; |
@@ -1678,7 +1678,7 @@ again: | |||
1678 | struct anon_vma_chain *vmac; | 1678 | struct anon_vma_chain *vmac; |
1679 | struct vm_area_struct *vma; | 1679 | struct vm_area_struct *vma; |
1680 | 1680 | ||
1681 | anon_vma_lock(anon_vma); | 1681 | anon_vma_lock_write(anon_vma); |
1682 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1682 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1683 | 0, ULONG_MAX) { | 1683 | 0, ULONG_MAX) { |
1684 | vma = vmac->vma; | 1684 | vma = vmac->vma; |
@@ -1731,7 +1731,7 @@ again: | |||
1731 | struct anon_vma_chain *vmac; | 1731 | struct anon_vma_chain *vmac; |
1732 | struct vm_area_struct *vma; | 1732 | struct vm_area_struct *vma; |
1733 | 1733 | ||
1734 | anon_vma_lock(anon_vma); | 1734 | anon_vma_lock_write(anon_vma); |
1735 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1735 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1736 | 0, ULONG_MAX) { | 1736 | 0, ULONG_MAX) { |
1737 | vma = vmac->vma; | 1737 | vma = vmac->vma; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6c055929c8cc..bbfac5063ca8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3289,15 +3289,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, | |||
3289 | struct mem_cgroup **memcgp) | 3289 | struct mem_cgroup **memcgp) |
3290 | { | 3290 | { |
3291 | struct mem_cgroup *memcg = NULL; | 3291 | struct mem_cgroup *memcg = NULL; |
3292 | unsigned int nr_pages = 1; | ||
3292 | struct page_cgroup *pc; | 3293 | struct page_cgroup *pc; |
3293 | enum charge_type ctype; | 3294 | enum charge_type ctype; |
3294 | 3295 | ||
3295 | *memcgp = NULL; | 3296 | *memcgp = NULL; |
3296 | 3297 | ||
3297 | VM_BUG_ON(PageTransHuge(page)); | ||
3298 | if (mem_cgroup_disabled()) | 3298 | if (mem_cgroup_disabled()) |
3299 | return; | 3299 | return; |
3300 | 3300 | ||
3301 | if (PageTransHuge(page)) | ||
3302 | nr_pages <<= compound_order(page); | ||
3303 | |||
3301 | pc = lookup_page_cgroup(page); | 3304 | pc = lookup_page_cgroup(page); |
3302 | lock_page_cgroup(pc); | 3305 | lock_page_cgroup(pc); |
3303 | if (PageCgroupUsed(pc)) { | 3306 | if (PageCgroupUsed(pc)) { |
@@ -3359,7 +3362,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, | |||
3359 | * charged to the res_counter since we plan on replacing the | 3362 | * charged to the res_counter since we plan on replacing the |
3360 | * old one and only one page is going to be left afterwards. | 3363 | * old one and only one page is going to be left afterwards. |
3361 | */ | 3364 | */ |
3362 | __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); | 3365 | __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); |
3363 | } | 3366 | } |
3364 | 3367 | ||
3365 | /* remove redundant charge if migration failed*/ | 3368 | /* remove redundant charge if migration failed*/ |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 108c52fa60f6..c6e4dd3e1c08 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
402 | struct anon_vma *av; | 402 | struct anon_vma *av; |
403 | pgoff_t pgoff; | 403 | pgoff_t pgoff; |
404 | 404 | ||
405 | av = page_lock_anon_vma(page); | 405 | av = page_lock_anon_vma_read(page); |
406 | if (av == NULL) /* Not actually mapped anymore */ | 406 | if (av == NULL) /* Not actually mapped anymore */ |
407 | return; | 407 | return; |
408 | 408 | ||
@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
423 | } | 423 | } |
424 | } | 424 | } |
425 | read_unlock(&tasklist_lock); | 425 | read_unlock(&tasklist_lock); |
426 | page_unlock_anon_vma(av); | 426 | page_unlock_anon_vma_read(av); |
427 | } | 427 | } |
428 | 428 | ||
429 | /* | 429 | /* |
@@ -1566,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags) | |||
1566 | page_is_file_cache(page)); | 1566 | page_is_file_cache(page)); |
1567 | list_add(&page->lru, &pagelist); | 1567 | list_add(&page->lru, &pagelist); |
1568 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1568 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1569 | false, MIGRATE_SYNC); | 1569 | false, MIGRATE_SYNC, |
1570 | MR_MEMORY_FAILURE); | ||
1570 | if (ret) { | 1571 | if (ret) { |
1571 | putback_lru_pages(&pagelist); | 1572 | putback_lru_pages(&pagelist); |
1572 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1573 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
diff --git a/mm/memory.c b/mm/memory.c
index db2e9e797a05..e6a3b933517e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,7 @@ | |||
57 | #include <linux/swapops.h> | 57 | #include <linux/swapops.h> |
58 | #include <linux/elf.h> | 58 | #include <linux/elf.h> |
59 | #include <linux/gfp.h> | 59 | #include <linux/gfp.h> |
60 | #include <linux/migrate.h> | ||
60 | 61 | ||
61 | #include <asm/io.h> | 62 | #include <asm/io.h> |
62 | #include <asm/pgalloc.h> | 63 | #include <asm/pgalloc.h> |
@@ -1503,6 +1504,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1503 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 1504 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); |
1504 | goto out; | 1505 | goto out; |
1505 | } | 1506 | } |
1507 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | ||
1508 | goto no_page_table; | ||
1506 | if (pmd_trans_huge(*pmd)) { | 1509 | if (pmd_trans_huge(*pmd)) { |
1507 | if (flags & FOLL_SPLIT) { | 1510 | if (flags & FOLL_SPLIT) { |
1508 | split_huge_page_pmd(vma, address, pmd); | 1511 | split_huge_page_pmd(vma, address, pmd); |
@@ -1532,6 +1535,8 @@ split_fallthrough: | |||
1532 | pte = *ptep; | 1535 | pte = *ptep; |
1533 | if (!pte_present(pte)) | 1536 | if (!pte_present(pte)) |
1534 | goto no_page; | 1537 | goto no_page; |
1538 | if ((flags & FOLL_NUMA) && pte_numa(pte)) | ||
1539 | goto no_page; | ||
1535 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | 1540 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
1536 | goto unlock; | 1541 | goto unlock; |
1537 | 1542 | ||
@@ -1683,6 +1688,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1683 | (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); | 1688 | (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); |
1684 | vm_flags &= (gup_flags & FOLL_FORCE) ? | 1689 | vm_flags &= (gup_flags & FOLL_FORCE) ? |
1685 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | 1690 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); |
1691 | |||
1692 | /* | ||
1693 | * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault | ||
1694 | * would be called on PROT_NONE ranges. We must never invoke | ||
1695 | * handle_mm_fault on PROT_NONE ranges or the NUMA hinting | ||
1696 | * page faults would unprotect the PROT_NONE ranges if | ||
1697 | * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd | ||
1698 | * bitflag. So to avoid that, don't set FOLL_NUMA if | ||
1699 | * FOLL_FORCE is set. | ||
1700 | */ | ||
1701 | if (!(gup_flags & FOLL_FORCE)) | ||
1702 | gup_flags |= FOLL_NUMA; | ||
1703 | |||
1686 | i = 0; | 1704 | i = 0; |
1687 | 1705 | ||
1688 | do { | 1706 | do { |
@@ -3412,6 +3430,169 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3412 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 3430 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
3413 | } | 3431 | } |
3414 | 3432 | ||
3433 | int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | ||
3434 | unsigned long addr, int current_nid) | ||
3435 | { | ||
3436 | get_page(page); | ||
3437 | |||
3438 | count_vm_numa_event(NUMA_HINT_FAULTS); | ||
3439 | if (current_nid == numa_node_id()) | ||
3440 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | ||
3441 | |||
3442 | return mpol_misplaced(page, vma, addr); | ||
3443 | } | ||
3444 | |||
3445 | int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3446 | unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) | ||
3447 | { | ||
3448 | struct page *page = NULL; | ||
3449 | spinlock_t *ptl; | ||
3450 | int current_nid = -1; | ||
3451 | int target_nid; | ||
3452 | bool migrated = false; | ||
3453 | |||
3454 | /* | ||
3455 | * The "pte" at this point cannot be used safely without | ||
3456 | * validation through pte_unmap_same(). It's of NUMA type but | ||
3457 | * the pfn may be screwed if the read is non atomic. | ||
3458 | * | ||
3459 | * ptep_modify_prot_start is not called as this is clearing | ||
3460 | * the _PAGE_NUMA bit and it is not really expected that there | ||
3461 | * would be concurrent hardware modifications to the PTE. | ||
3462 | */ | ||
3463 | ptl = pte_lockptr(mm, pmd); | ||
3464 | spin_lock(ptl); | ||
3465 | if (unlikely(!pte_same(*ptep, pte))) { | ||
3466 | pte_unmap_unlock(ptep, ptl); | ||
3467 | goto out; | ||
3468 | } | ||
3469 | |||
3470 | pte = pte_mknonnuma(pte); | ||
3471 | set_pte_at(mm, addr, ptep, pte); | ||
3472 | update_mmu_cache(vma, addr, ptep); | ||
3473 | |||
3474 | page = vm_normal_page(vma, addr, pte); | ||
3475 | if (!page) { | ||
3476 | pte_unmap_unlock(ptep, ptl); | ||
3477 | return 0; | ||
3478 | } | ||
3479 | |||
3480 | current_nid = page_to_nid(page); | ||
3481 | target_nid = numa_migrate_prep(page, vma, addr, current_nid); | ||
3482 | pte_unmap_unlock(ptep, ptl); | ||
3483 | if (target_nid == -1) { | ||
3484 | /* | ||
3485 | * Account for the fault against the current node if it is not | ||
3486 | * being replaced, regardless of where the page is located. | ||
3487 | */ | ||
3488 | current_nid = numa_node_id(); | ||
3489 | put_page(page); | ||
3490 | goto out; | ||
3491 | } | ||
3492 | |||
3493 | /* Migrate to the requested node */ | ||
3494 | migrated = migrate_misplaced_page(page, target_nid); | ||
3495 | if (migrated) | ||
3496 | current_nid = target_nid; | ||
3497 | |||
3498 | out: | ||
3499 | if (current_nid != -1) | ||
3500 | task_numa_fault(current_nid, 1, migrated); | ||
3501 | return 0; | ||
3502 | } | ||
3503 | |||
3504 | /* NUMA hinting page fault entry point for regular pmds */ | ||
3505 | #ifdef CONFIG_NUMA_BALANCING | ||
3506 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3507 | unsigned long addr, pmd_t *pmdp) | ||
3508 | { | ||
3509 | pmd_t pmd; | ||
3510 | pte_t *pte, *orig_pte; | ||
3511 | unsigned long _addr = addr & PMD_MASK; | ||
3512 | unsigned long offset; | ||
3513 | spinlock_t *ptl; | ||
3514 | bool numa = false; | ||
3515 | int local_nid = numa_node_id(); | ||
3516 | |||
3517 | spin_lock(&mm->page_table_lock); | ||
3518 | pmd = *pmdp; | ||
3519 | if (pmd_numa(pmd)) { | ||
3520 | set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); | ||
3521 | numa = true; | ||
3522 | } | ||
3523 | spin_unlock(&mm->page_table_lock); | ||
3524 | |||
3525 | if (!numa) | ||
3526 | return 0; | ||
3527 | |||
3528 | /* we're in a page fault so some vma must be in the range */ | ||
3529 | BUG_ON(!vma); | ||
3530 | BUG_ON(vma->vm_start >= _addr + PMD_SIZE); | ||
3531 | offset = max(_addr, vma->vm_start) & ~PMD_MASK; | ||
3532 | VM_BUG_ON(offset >= PMD_SIZE); | ||
3533 | orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); | ||
3534 | pte += offset >> PAGE_SHIFT; | ||
3535 | for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { | ||
3536 | pte_t pteval = *pte; | ||
3537 | struct page *page; | ||
3538 | int curr_nid = local_nid; | ||
3539 | int target_nid; | ||
3540 | bool migrated; | ||
3541 | if (!pte_present(pteval)) | ||
3542 | continue; | ||
3543 | if (!pte_numa(pteval)) | ||
3544 | continue; | ||
3545 | if (addr >= vma->vm_end) { | ||
3546 | vma = find_vma(mm, addr); | ||
3547 | /* there's a pte present so there must be a vma */ | ||
3548 | BUG_ON(!vma); | ||
3549 | BUG_ON(addr < vma->vm_start); | ||
3550 | } | ||
3551 | if (pte_numa(pteval)) { | ||
3552 | pteval = pte_mknonnuma(pteval); | ||
3553 | set_pte_at(mm, addr, pte, pteval); | ||
3554 | } | ||
3555 | page = vm_normal_page(vma, addr, pteval); | ||
3556 | if (unlikely(!page)) | ||
3557 | continue; | ||
3558 | /* only check non-shared pages */ | ||
3559 | if (unlikely(page_mapcount(page) != 1)) | ||
3560 | continue; | ||
3561 | |||
3562 | /* | ||
3563 | * Note that the NUMA fault is later accounted to either | ||
3564 | * the node that is currently running or where the page is | ||
3565 | * migrated to. | ||
3566 | */ | ||
3567 | curr_nid = local_nid; | ||
3568 | target_nid = numa_migrate_prep(page, vma, addr, | ||
3569 | page_to_nid(page)); | ||
3570 | if (target_nid == -1) { | ||
3571 | put_page(page); | ||
3572 | continue; | ||
3573 | } | ||
3574 | |||
3575 | /* Migrate to the requested node */ | ||
3576 | pte_unmap_unlock(pte, ptl); | ||
3577 | migrated = migrate_misplaced_page(page, target_nid); | ||
3578 | if (migrated) | ||
3579 | curr_nid = target_nid; | ||
3580 | task_numa_fault(curr_nid, 1, migrated); | ||
3581 | |||
3582 | pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); | ||
3583 | } | ||
3584 | pte_unmap_unlock(orig_pte, ptl); | ||
3585 | |||
3586 | return 0; | ||
3587 | } | ||
3588 | #else | ||
3589 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3590 | unsigned long addr, pmd_t *pmdp) | ||
3591 | { | ||
3592 | BUG(); | ||
3593 | } | ||
3594 | #endif /* CONFIG_NUMA_BALANCING */ | ||
3595 | |||
3415 | /* | 3596 | /* |
3416 | * These routines also need to handle stuff like marking pages dirty | 3597 | * These routines also need to handle stuff like marking pages dirty |
3417 | * and/or accessed for architectures that don't do it in hardware (most | 3598 | * and/or accessed for architectures that don't do it in hardware (most |
@@ -3450,6 +3631,9 @@ int handle_pte_fault(struct mm_struct *mm, | |||
3450 | pte, pmd, flags, entry); | 3631 | pte, pmd, flags, entry); |
3451 | } | 3632 | } |
3452 | 3633 | ||
3634 | if (pte_numa(entry)) | ||
3635 | return do_numa_page(mm, vma, address, entry, pte, pmd); | ||
3636 | |||
3453 | ptl = pte_lockptr(mm, pmd); | 3637 | ptl = pte_lockptr(mm, pmd); |
3454 | spin_lock(ptl); | 3638 | spin_lock(ptl); |
3455 | if (unlikely(!pte_same(*pte, entry))) | 3639 | if (unlikely(!pte_same(*pte, entry))) |
@@ -3520,8 +3704,11 @@ retry: | |||
3520 | if (pmd_trans_huge(orig_pmd)) { | 3704 | if (pmd_trans_huge(orig_pmd)) { |
3521 | unsigned int dirty = flags & FAULT_FLAG_WRITE; | 3705 | unsigned int dirty = flags & FAULT_FLAG_WRITE; |
3522 | 3706 | ||
3523 | if (dirty && !pmd_write(orig_pmd) && | 3707 | if (pmd_numa(orig_pmd)) |
3524 | !pmd_trans_splitting(orig_pmd)) { | 3708 | return do_huge_pmd_numa_page(mm, vma, address, |
3709 | orig_pmd, pmd); | ||
3710 | |||
3711 | if (dirty && !pmd_write(orig_pmd)) { | ||
3525 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, | 3712 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, |
3526 | orig_pmd); | 3713 | orig_pmd); |
3527 | /* | 3714 | /* |
@@ -3536,16 +3723,21 @@ retry: | |||
3536 | huge_pmd_set_accessed(mm, vma, address, pmd, | 3723 | huge_pmd_set_accessed(mm, vma, address, pmd, |
3537 | orig_pmd, dirty); | 3724 | orig_pmd, dirty); |
3538 | } | 3725 | } |
3726 | |||
3539 | return 0; | 3727 | return 0; |
3540 | } | 3728 | } |
3541 | } | 3729 | } |
3542 | 3730 | ||
3731 | if (pmd_numa(*pmd)) | ||
3732 | return do_pmd_numa_page(mm, vma, address, pmd); | ||
3733 | |||
3543 | /* | 3734 | /* |
3544 | * Use __pte_alloc instead of pte_alloc_map, because we can't | 3735 | * Use __pte_alloc instead of pte_alloc_map, because we can't |
3545 | * run pte_offset_map on the pmd, if an huge pmd could | 3736 | * run pte_offset_map on the pmd, if an huge pmd could |
3546 | * materialize from under us from a different thread. | 3737 | * materialize from under us from a different thread. |
3547 | */ | 3738 | */ |
3548 | if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) | 3739 | if (unlikely(pmd_none(*pmd)) && |
3740 | unlikely(__pte_alloc(mm, vma, pmd, address))) | ||
3549 | return VM_FAULT_OOM; | 3741 | return VM_FAULT_OOM; |
3550 | /* if an huge pmd materialized from under us just retry later */ | 3742 | /* if an huge pmd materialized from under us just retry later */ |
3551 | if (unlikely(pmd_trans_huge(*pmd))) | 3743 | if (unlikely(pmd_trans_huge(*pmd))) |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 518baa896e83..962e353aa86f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1055,7 +1055,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1055 | * migrate_pages returns # of failed pages. | 1055 | * migrate_pages returns # of failed pages. |
1056 | */ | 1056 | */ |
1057 | ret = migrate_pages(&source, alloc_migrate_target, 0, | 1057 | ret = migrate_pages(&source, alloc_migrate_target, 0, |
1058 | true, MIGRATE_SYNC); | 1058 | true, MIGRATE_SYNC, |
1059 | MR_MEMORY_HOTPLUG); | ||
1059 | if (ret) | 1060 | if (ret) |
1060 | putback_lru_pages(&source); | 1061 | putback_lru_pages(&source); |
1061 | } | 1062 | } |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index aaf54566cb6b..d1b315e98627 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -90,6 +90,7 @@ | |||
90 | #include <linux/syscalls.h> | 90 | #include <linux/syscalls.h> |
91 | #include <linux/ctype.h> | 91 | #include <linux/ctype.h> |
92 | #include <linux/mm_inline.h> | 92 | #include <linux/mm_inline.h> |
93 | #include <linux/mmu_notifier.h> | ||
93 | 94 | ||
94 | #include <asm/tlbflush.h> | 95 | #include <asm/tlbflush.h> |
95 | #include <asm/uaccess.h> | 96 | #include <asm/uaccess.h> |
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = { | |||
117 | .flags = MPOL_F_LOCAL, | 118 | .flags = MPOL_F_LOCAL, |
118 | }; | 119 | }; |
119 | 120 | ||
121 | static struct mempolicy preferred_node_policy[MAX_NUMNODES]; | ||
122 | |||
123 | static struct mempolicy *get_task_policy(struct task_struct *p) | ||
124 | { | ||
125 | struct mempolicy *pol = p->mempolicy; | ||
126 | int node; | ||
127 | |||
128 | if (!pol) { | ||
129 | node = numa_node_id(); | ||
130 | if (node != -1) | ||
131 | pol = &preferred_node_policy[node]; | ||
132 | |||
133 | /* preferred_node_policy is not initialised early in boot */ | ||
134 | if (!pol->mode) | ||
135 | pol = NULL; | ||
136 | } | ||
137 | |||
138 | return pol; | ||
139 | } | ||
140 | |||
120 | static const struct mempolicy_operations { | 141 | static const struct mempolicy_operations { |
121 | int (*create)(struct mempolicy *pol, const nodemask_t *nodes); | 142 | int (*create)(struct mempolicy *pol, const nodemask_t *nodes); |
122 | /* | 143 | /* |
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
254 | if (mode == MPOL_DEFAULT) { | 275 | if (mode == MPOL_DEFAULT) { |
255 | if (nodes && !nodes_empty(*nodes)) | 276 | if (nodes && !nodes_empty(*nodes)) |
256 | return ERR_PTR(-EINVAL); | 277 | return ERR_PTR(-EINVAL); |
257 | return NULL; /* simply delete any existing policy */ | 278 | return NULL; |
258 | } | 279 | } |
259 | VM_BUG_ON(!nodes); | 280 | VM_BUG_ON(!nodes); |
260 | 281 | ||
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
269 | (flags & MPOL_F_RELATIVE_NODES))) | 290 | (flags & MPOL_F_RELATIVE_NODES))) |
270 | return ERR_PTR(-EINVAL); | 291 | return ERR_PTR(-EINVAL); |
271 | } | 292 | } |
293 | } else if (mode == MPOL_LOCAL) { | ||
294 | if (!nodes_empty(*nodes)) | ||
295 | return ERR_PTR(-EINVAL); | ||
296 | mode = MPOL_PREFERRED; | ||
272 | } else if (nodes_empty(*nodes)) | 297 | } else if (nodes_empty(*nodes)) |
273 | return ERR_PTR(-EINVAL); | 298 | return ERR_PTR(-EINVAL); |
274 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); | 299 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); |
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma, | |||
561 | return 0; | 586 | return 0; |
562 | } | 587 | } |
563 | 588 | ||
589 | #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE | ||
590 | /* | ||
591 | * This is used to mark a range of virtual addresses to be inaccessible. | ||
592 | * These are later cleared by a NUMA hinting fault. Depending on these | ||
593 | * faults, pages may be migrated for better NUMA placement. | ||
594 | * | ||
595 | * This is assuming that NUMA faults are handled using PROT_NONE. If | ||
596 | * an architecture makes a different choice, it will need further | ||
597 | * changes to the core. | ||
598 | */ | ||
599 | unsigned long change_prot_numa(struct vm_area_struct *vma, | ||
600 | unsigned long addr, unsigned long end) | ||
601 | { | ||
602 | int nr_updated; | ||
603 | BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); | ||
604 | |||
605 | nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); | ||
606 | if (nr_updated) | ||
607 | count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); | ||
608 | |||
609 | return nr_updated; | ||
610 | } | ||
611 | #else | ||
612 | static unsigned long change_prot_numa(struct vm_area_struct *vma, | ||
613 | unsigned long addr, unsigned long end) | ||
614 | { | ||
615 | return 0; | ||
616 | } | ||
617 | #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ | ||
618 | |||
564 | /* | 619 | /* |
565 | * Check if all pages in a range are on a set of nodes. | 620 | * Check if all pages in a range are on a set of nodes. |
566 | * If pagelist != NULL then isolate pages from the LRU and | 621 | * If pagelist != NULL then isolate pages from the LRU and |
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
579 | return ERR_PTR(-EFAULT); | 634 | return ERR_PTR(-EFAULT); |
580 | prev = NULL; | 635 | prev = NULL; |
581 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { | 636 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { |
637 | unsigned long endvma = vma->vm_end; | ||
638 | |||
639 | if (endvma > end) | ||
640 | endvma = end; | ||
641 | if (vma->vm_start > start) | ||
642 | start = vma->vm_start; | ||
643 | |||
582 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { | 644 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { |
583 | if (!vma->vm_next && vma->vm_end < end) | 645 | if (!vma->vm_next && vma->vm_end < end) |
584 | return ERR_PTR(-EFAULT); | 646 | return ERR_PTR(-EFAULT); |
585 | if (prev && prev->vm_end < vma->vm_start) | 647 | if (prev && prev->vm_end < vma->vm_start) |
586 | return ERR_PTR(-EFAULT); | 648 | return ERR_PTR(-EFAULT); |
587 | } | 649 | } |
588 | if (!is_vm_hugetlb_page(vma) && | 650 | |
589 | ((flags & MPOL_MF_STRICT) || | 651 | if (is_vm_hugetlb_page(vma)) |
652 | goto next; | ||
653 | |||
654 | if (flags & MPOL_MF_LAZY) { | ||
655 | change_prot_numa(vma, start, endvma); | ||
656 | goto next; | ||
657 | } | ||
658 | |||
659 | if ((flags & MPOL_MF_STRICT) || | ||
590 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | 660 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && |
591 | vma_migratable(vma)))) { | 661 | vma_migratable(vma))) { |
592 | unsigned long endvma = vma->vm_end; | ||
593 | 662 | ||
594 | if (endvma > end) | ||
595 | endvma = end; | ||
596 | if (vma->vm_start > start) | ||
597 | start = vma->vm_start; | ||
598 | err = check_pgd_range(vma, start, endvma, nodes, | 663 | err = check_pgd_range(vma, start, endvma, nodes, |
599 | flags, private); | 664 | flags, private); |
600 | if (err) { | 665 | if (err) { |
@@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
602 | break; | 667 | break; |
603 | } | 668 | } |
604 | } | 669 | } |
670 | next: | ||
605 | prev = vma; | 671 | prev = vma; |
606 | } | 672 | } |
607 | return first; | 673 | return first; |
@@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
961 | 1027 | ||
962 | if (!list_empty(&pagelist)) { | 1028 | if (!list_empty(&pagelist)) { |
963 | err = migrate_pages(&pagelist, new_node_page, dest, | 1029 | err = migrate_pages(&pagelist, new_node_page, dest, |
964 | false, MIGRATE_SYNC); | 1030 | false, MIGRATE_SYNC, |
1031 | MR_SYSCALL); | ||
965 | if (err) | 1032 | if (err) |
966 | putback_lru_pages(&pagelist); | 1033 | putback_lru_pages(&pagelist); |
967 | } | 1034 | } |
@@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1133 | int err; | 1200 | int err; |
1134 | LIST_HEAD(pagelist); | 1201 | LIST_HEAD(pagelist); |
1135 | 1202 | ||
1136 | if (flags & ~(unsigned long)(MPOL_MF_STRICT | | 1203 | if (flags & ~(unsigned long)MPOL_MF_VALID) |
1137 | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | ||
1138 | return -EINVAL; | 1204 | return -EINVAL; |
1139 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) | 1205 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) |
1140 | return -EPERM; | 1206 | return -EPERM; |
@@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1157 | if (IS_ERR(new)) | 1223 | if (IS_ERR(new)) |
1158 | return PTR_ERR(new); | 1224 | return PTR_ERR(new); |
1159 | 1225 | ||
1226 | if (flags & MPOL_MF_LAZY) | ||
1227 | new->flags |= MPOL_F_MOF; | ||
1228 | |||
1160 | /* | 1229 | /* |
1161 | * If we are using the default policy then operation | 1230 | * If we are using the default policy then operation |
1162 | * on discontinuous address spaces is okay after all | 1231 | * on discontinuous address spaces is okay after all |
@@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1193 | vma = check_range(mm, start, end, nmask, | 1262 | vma = check_range(mm, start, end, nmask, |
1194 | flags | MPOL_MF_INVERT, &pagelist); | 1263 | flags | MPOL_MF_INVERT, &pagelist); |
1195 | 1264 | ||
1196 | err = PTR_ERR(vma); | 1265 | err = PTR_ERR(vma); /* maybe ... */ |
1197 | if (!IS_ERR(vma)) { | 1266 | if (!IS_ERR(vma)) |
1198 | int nr_failed = 0; | ||
1199 | |||
1200 | err = mbind_range(mm, start, end, new); | 1267 | err = mbind_range(mm, start, end, new); |
1201 | 1268 | ||
1269 | if (!err) { | ||
1270 | int nr_failed = 0; | ||
1271 | |||
1202 | if (!list_empty(&pagelist)) { | 1272 | if (!list_empty(&pagelist)) { |
1273 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); | ||
1203 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1274 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1204 | (unsigned long)vma, | 1275 | (unsigned long)vma, |
1205 | false, MIGRATE_SYNC); | 1276 | false, MIGRATE_SYNC, |
1277 | MR_MEMPOLICY_MBIND); | ||
1206 | if (nr_failed) | 1278 | if (nr_failed) |
1207 | putback_lru_pages(&pagelist); | 1279 | putback_lru_pages(&pagelist); |
1208 | } | 1280 | } |
1209 | 1281 | ||
1210 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 1282 | if (nr_failed && (flags & MPOL_MF_STRICT)) |
1211 | err = -EIO; | 1283 | err = -EIO; |
1212 | } else | 1284 | } else |
1213 | putback_lru_pages(&pagelist); | 1285 | putback_lru_pages(&pagelist); |
@@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | |||
1546 | struct mempolicy *get_vma_policy(struct task_struct *task, | 1618 | struct mempolicy *get_vma_policy(struct task_struct *task, |
1547 | struct vm_area_struct *vma, unsigned long addr) | 1619 | struct vm_area_struct *vma, unsigned long addr) |
1548 | { | 1620 | { |
1549 | struct mempolicy *pol = task->mempolicy; | 1621 | struct mempolicy *pol = get_task_policy(task); |
1550 | 1622 | ||
1551 | if (vma) { | 1623 | if (vma) { |
1552 | if (vma->vm_ops && vma->vm_ops->get_policy) { | 1624 | if (vma->vm_ops && vma->vm_ops->get_policy) { |
@@ -1956,7 +2028,7 @@ retry_cpuset: | |||
1956 | */ | 2028 | */ |
1957 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) | 2029 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) |
1958 | { | 2030 | { |
1959 | struct mempolicy *pol = current->mempolicy; | 2031 | struct mempolicy *pol = get_task_policy(current); |
1960 | struct page *page; | 2032 | struct page *page; |
1961 | unsigned int cpuset_mems_cookie; | 2033 | unsigned int cpuset_mems_cookie; |
1962 | 2034 | ||
@@ -2140,6 +2212,115 @@ static void sp_free(struct sp_node *n) | |||
2140 | kmem_cache_free(sn_cache, n); | 2212 | kmem_cache_free(sn_cache, n); |
2141 | } | 2213 | } |
2142 | 2214 | ||
2215 | /** | ||
2216 | * mpol_misplaced - check whether current page node is valid in policy | ||
2217 | * | ||
2218 | * @page - page to be checked | ||
2219 | * @vma - vm area where page mapped | ||
2220 | * @addr - virtual address where page mapped | ||
2221 | * | ||
2222 | * Lookup current policy node id for vma,addr and "compare to" page's | ||
2223 | * node id. | ||
2224 | * | ||
2225 | * Returns: | ||
2226 | * -1 - not misplaced, page is in the right node | ||
2227 | * node - node id where the page should be | ||
2228 | * | ||
2229 | * Policy determination "mimics" alloc_page_vma(). | ||
2230 | * Called from fault path where we know the vma and faulting address. | ||
2231 | */ | ||
2232 | int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) | ||
2233 | { | ||
2234 | struct mempolicy *pol; | ||
2235 | struct zone *zone; | ||
2236 | int curnid = page_to_nid(page); | ||
2237 | unsigned long pgoff; | ||
2238 | int polnid = -1; | ||
2239 | int ret = -1; | ||
2240 | |||
2241 | BUG_ON(!vma); | ||
2242 | |||
2243 | pol = get_vma_policy(current, vma, addr); | ||
2244 | if (!(pol->flags & MPOL_F_MOF)) | ||
2245 | goto out; | ||
2246 | |||
2247 | switch (pol->mode) { | ||
2248 | case MPOL_INTERLEAVE: | ||
2249 | BUG_ON(addr >= vma->vm_end); | ||
2250 | BUG_ON(addr < vma->vm_start); | ||
2251 | |||
2252 | pgoff = vma->vm_pgoff; | ||
2253 | pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; | ||
2254 | polnid = offset_il_node(pol, vma, pgoff); | ||
2255 | break; | ||
2256 | |||
2257 | case MPOL_PREFERRED: | ||
2258 | if (pol->flags & MPOL_F_LOCAL) | ||
2259 | polnid = numa_node_id(); | ||
2260 | else | ||
2261 | polnid = pol->v.preferred_node; | ||
2262 | break; | ||
2263 | |||
2264 | case MPOL_BIND: | ||
2265 | /* | ||
2266 | * allows binding to multiple nodes. | ||
2267 | * use current page if in policy nodemask, | ||
2268 | * else select nearest allowed node, if any. | ||
2269 | * If no allowed nodes, use current [!misplaced]. | ||
2270 | */ | ||
2271 | if (node_isset(curnid, pol->v.nodes)) | ||
2272 | goto out; | ||
2273 | (void)first_zones_zonelist( | ||
2274 | node_zonelist(numa_node_id(), GFP_HIGHUSER), | ||
2275 | gfp_zone(GFP_HIGHUSER), | ||
2276 | &pol->v.nodes, &zone); | ||
2277 | polnid = zone->node; | ||
2278 | break; | ||
2279 | |||
2280 | default: | ||
2281 | BUG(); | ||
2282 | } | ||
2283 | |||
2284 | /* Migrate the page towards the node whose CPU is referencing it */ | ||
2285 | if (pol->flags & MPOL_F_MORON) { | ||
2286 | int last_nid; | ||
2287 | |||
2288 | polnid = numa_node_id(); | ||
2289 | |||
2290 | /* | ||
2291 | * Multi-stage node selection is used in conjunction | ||
2292 | * with a periodic migration fault to build a temporal | ||
2293 | * task<->page relation. By using a two-stage filter we | ||
2294 | * remove short/unlikely relations. | ||
2295 | * | ||
2296 | * Using P(p) ~ n_p / n_t as per frequentist | ||
2297 | * probability, we can equate a task's usage of a | ||
2298 | * particular page (n_p) per total usage of this | ||
2299 | * page (n_t) (in a given time-span) to a probability. | ||
2300 | * | ||
2301 | * Our periodic faults will sample this probability and | ||
2302 | * getting the same result twice in a row, given these | ||
2303 | * samples are fully independent, is then given by | ||
2304 | * P(n)^2, provided our sample period is sufficiently | ||
2305 | * short compared to the usage pattern. | ||
2306 | * | ||
2307 | * This quadratic squishes small probabilities, making | ||
2308 | * it less likely we act on an unlikely task<->page | ||
2309 | * relation. | ||
2310 | */ | ||
2311 | last_nid = page_xchg_last_nid(page, polnid); | ||
2312 | if (last_nid != polnid) | ||
2313 | goto out; | ||
2314 | } | ||
2315 | |||
2316 | if (curnid != polnid) | ||
2317 | ret = polnid; | ||
2318 | out: | ||
2319 | mpol_cond_put(pol); | ||
2320 | |||
2321 | return ret; | ||
2322 | } | ||
2323 | |||
2143 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) | 2324 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) |
2144 | { | 2325 | { |
2145 | pr_debug("deleting %lx-l%lx\n", n->start, n->end); | 2326 | pr_debug("deleting %lx-l%lx\n", n->start, n->end); |
@@ -2305,6 +2486,50 @@ void mpol_free_shared_policy(struct shared_policy *p) | |||
2305 | mutex_unlock(&p->mutex); | 2486 | mutex_unlock(&p->mutex); |
2306 | } | 2487 | } |
2307 | 2488 | ||
2489 | #ifdef CONFIG_NUMA_BALANCING | ||
2490 | static bool __initdata numabalancing_override; | ||
2491 | |||
2492 | static void __init check_numabalancing_enable(void) | ||
2493 | { | ||
2494 | bool numabalancing_default = false; | ||
2495 | |||
2496 | if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) | ||
2497 | numabalancing_default = true; | ||
2498 | |||
2499 | if (nr_node_ids > 1 && !numabalancing_override) { | ||
2500 | printk(KERN_INFO "Enabling automatic NUMA balancing. " | ||
2501 | "Configure with numa_balancing= or sysctl"); | ||
2502 | set_numabalancing_state(numabalancing_default); | ||
2503 | } | ||
2504 | } | ||
2505 | |||
2506 | static int __init setup_numabalancing(char *str) | ||
2507 | { | ||
2508 | int ret = 0; | ||
2509 | if (!str) | ||
2510 | goto out; | ||
2511 | numabalancing_override = true; | ||
2512 | |||
2513 | if (!strcmp(str, "enable")) { | ||
2514 | set_numabalancing_state(true); | ||
2515 | ret = 1; | ||
2516 | } else if (!strcmp(str, "disable")) { | ||
2517 | set_numabalancing_state(false); | ||
2518 | ret = 1; | ||
2519 | } | ||
2520 | out: | ||
2521 | if (!ret) | ||
2522 | printk(KERN_WARNING "Unable to parse numa_balancing=\n"); | ||
2523 | |||
2524 | return ret; | ||
2525 | } | ||
2526 | __setup("numa_balancing=", setup_numabalancing); | ||
2527 | #else | ||
2528 | static inline void __init check_numabalancing_enable(void) | ||
2529 | { | ||
2530 | } | ||
2531 | #endif /* CONFIG_NUMA_BALANCING */ | ||
2532 | |||
2308 | /* assumes fs == KERNEL_DS */ | 2533 | /* assumes fs == KERNEL_DS */ |
2309 | void __init numa_policy_init(void) | 2534 | void __init numa_policy_init(void) |
2310 | { | 2535 | { |
@@ -2320,6 +2545,15 @@ void __init numa_policy_init(void) | |||
2320 | sizeof(struct sp_node), | 2545 | sizeof(struct sp_node), |
2321 | 0, SLAB_PANIC, NULL); | 2546 | 0, SLAB_PANIC, NULL); |
2322 | 2547 | ||
2548 | for_each_node(nid) { | ||
2549 | preferred_node_policy[nid] = (struct mempolicy) { | ||
2550 | .refcnt = ATOMIC_INIT(1), | ||
2551 | .mode = MPOL_PREFERRED, | ||
2552 | .flags = MPOL_F_MOF | MPOL_F_MORON, | ||
2553 | .v = { .preferred_node = nid, }, | ||
2554 | }; | ||
2555 | } | ||
2556 | |||
2323 | /* | 2557 | /* |
2324 | * Set interleaving policy for system init. Interleaving is only | 2558 | * Set interleaving policy for system init. Interleaving is only |
2325 | * enabled across suitably sized nodes (default is >= 16MB), or | 2559 | * enabled across suitably sized nodes (default is >= 16MB), or |
@@ -2346,6 +2580,8 @@ void __init numa_policy_init(void) | |||
2346 | 2580 | ||
2347 | if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) | 2581 | if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) |
2348 | printk("numa_policy_init: interleaving failed\n"); | 2582 | printk("numa_policy_init: interleaving failed\n"); |
2583 | |||
2584 | check_numabalancing_enable(); | ||
2349 | } | 2585 | } |
2350 | 2586 | ||
2351 | /* Reset policy of current process to default */ | 2587 | /* Reset policy of current process to default */ |
@@ -2362,14 +2598,13 @@ void numa_default_policy(void) | |||
2362 | * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag | 2598 | * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag |
2363 | * Used only for mpol_parse_str() and mpol_to_str() | 2599 | * Used only for mpol_parse_str() and mpol_to_str() |
2364 | */ | 2600 | */ |
2365 | #define MPOL_LOCAL MPOL_MAX | ||
2366 | static const char * const policy_modes[] = | 2601 | static const char * const policy_modes[] = |
2367 | { | 2602 | { |
2368 | [MPOL_DEFAULT] = "default", | 2603 | [MPOL_DEFAULT] = "default", |
2369 | [MPOL_PREFERRED] = "prefer", | 2604 | [MPOL_PREFERRED] = "prefer", |
2370 | [MPOL_BIND] = "bind", | 2605 | [MPOL_BIND] = "bind", |
2371 | [MPOL_INTERLEAVE] = "interleave", | 2606 | [MPOL_INTERLEAVE] = "interleave", |
2372 | [MPOL_LOCAL] = "local" | 2607 | [MPOL_LOCAL] = "local", |
2373 | }; | 2608 | }; |
2374 | 2609 | ||
2375 | 2610 | ||
@@ -2415,12 +2650,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2415 | if (flags) | 2650 | if (flags) |
2416 | *flags++ = '\0'; /* terminate mode string */ | 2651 | *flags++ = '\0'; /* terminate mode string */ |
2417 | 2652 | ||
2418 | for (mode = 0; mode <= MPOL_LOCAL; mode++) { | 2653 | for (mode = 0; mode < MPOL_MAX; mode++) { |
2419 | if (!strcmp(str, policy_modes[mode])) { | 2654 | if (!strcmp(str, policy_modes[mode])) { |
2420 | break; | 2655 | break; |
2421 | } | 2656 | } |
2422 | } | 2657 | } |
2423 | if (mode > MPOL_LOCAL) | 2658 | if (mode >= MPOL_MAX) |
2424 | goto out; | 2659 | goto out; |
2425 | 2660 | ||
2426 | switch (mode) { | 2661 | switch (mode) { |
diff --git a/mm/migrate.c b/mm/migrate.c
index cae02711181d..32efd8028bc9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -39,6 +39,9 @@ | |||
39 | 39 | ||
40 | #include <asm/tlbflush.h> | 40 | #include <asm/tlbflush.h> |
41 | 41 | ||
42 | #define CREATE_TRACE_POINTS | ||
43 | #include <trace/events/migrate.h> | ||
44 | |||
42 | #include "internal.h" | 45 | #include "internal.h" |
43 | 46 | ||
44 | /* | 47 | /* |
@@ -293,7 +296,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
293 | struct page *newpage, struct page *page, | 296 | struct page *newpage, struct page *page, |
294 | struct buffer_head *head, enum migrate_mode mode) | 297 | struct buffer_head *head, enum migrate_mode mode) |
295 | { | 298 | { |
296 | int expected_count; | 299 | int expected_count = 0; |
297 | void **pslot; | 300 | void **pslot; |
298 | 301 | ||
299 | if (!mapping) { | 302 | if (!mapping) { |
@@ -421,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
421 | */ | 424 | */ |
422 | void migrate_page_copy(struct page *newpage, struct page *page) | 425 | void migrate_page_copy(struct page *newpage, struct page *page) |
423 | { | 426 | { |
424 | if (PageHuge(page)) | 427 | if (PageHuge(page) || PageTransHuge(page)) |
425 | copy_huge_page(newpage, page); | 428 | copy_huge_page(newpage, page); |
426 | else | 429 | else |
427 | copy_highpage(newpage, page); | 430 | copy_highpage(newpage, page); |
@@ -765,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
765 | */ | 768 | */ |
766 | if (PageAnon(page)) { | 769 | if (PageAnon(page)) { |
767 | /* | 770 | /* |
768 | * Only page_lock_anon_vma() understands the subtleties of | 771 | * Only page_lock_anon_vma_read() understands the subtleties of |
769 | * getting a hold on an anon_vma from outside one of its mms. | 772 | * getting a hold on an anon_vma from outside one of its mms. |
770 | */ | 773 | */ |
771 | anon_vma = page_get_anon_vma(page); | 774 | anon_vma = page_get_anon_vma(page); |
@@ -998,10 +1001,11 @@ out: | |||
998 | */ | 1001 | */ |
999 | int migrate_pages(struct list_head *from, | 1002 | int migrate_pages(struct list_head *from, |
1000 | new_page_t get_new_page, unsigned long private, bool offlining, | 1003 | new_page_t get_new_page, unsigned long private, bool offlining, |
1001 | enum migrate_mode mode) | 1004 | enum migrate_mode mode, int reason) |
1002 | { | 1005 | { |
1003 | int retry = 1; | 1006 | int retry = 1; |
1004 | int nr_failed = 0; | 1007 | int nr_failed = 0; |
1008 | int nr_succeeded = 0; | ||
1005 | int pass = 0; | 1009 | int pass = 0; |
1006 | struct page *page; | 1010 | struct page *page; |
1007 | struct page *page2; | 1011 | struct page *page2; |
@@ -1028,6 +1032,7 @@ int migrate_pages(struct list_head *from, | |||
1028 | retry++; | 1032 | retry++; |
1029 | break; | 1033 | break; |
1030 | case MIGRATEPAGE_SUCCESS: | 1034 | case MIGRATEPAGE_SUCCESS: |
1035 | nr_succeeded++; | ||
1031 | break; | 1036 | break; |
1032 | default: | 1037 | default: |
1033 | /* Permanent failure */ | 1038 | /* Permanent failure */ |
@@ -1038,6 +1043,12 @@ int migrate_pages(struct list_head *from, | |||
1038 | } | 1043 | } |
1039 | rc = nr_failed + retry; | 1044 | rc = nr_failed + retry; |
1040 | out: | 1045 | out: |
1046 | if (nr_succeeded) | ||
1047 | count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); | ||
1048 | if (nr_failed) | ||
1049 | count_vm_events(PGMIGRATE_FAIL, nr_failed); | ||
1050 | trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); | ||
1051 | |||
1041 | if (!swapwrite) | 1052 | if (!swapwrite) |
1042 | current->flags &= ~PF_SWAPWRITE; | 1053 | current->flags &= ~PF_SWAPWRITE; |
1043 | 1054 | ||
@@ -1176,7 +1187,8 @@ set_status: | |||
1176 | err = 0; | 1187 | err = 0; |
1177 | if (!list_empty(&pagelist)) { | 1188 | if (!list_empty(&pagelist)) { |
1178 | err = migrate_pages(&pagelist, new_page_node, | 1189 | err = migrate_pages(&pagelist, new_page_node, |
1179 | (unsigned long)pm, 0, MIGRATE_SYNC); | 1190 | (unsigned long)pm, 0, MIGRATE_SYNC, |
1191 | MR_SYSCALL); | ||
1180 | if (err) | 1192 | if (err) |
1181 | putback_lru_pages(&pagelist); | 1193 | putback_lru_pages(&pagelist); |
1182 | } | 1194 | } |
@@ -1440,4 +1452,317 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, | |||
1440 | } | 1452 | } |
1441 | return err; | 1453 | return err; |
1442 | } | 1454 | } |
1443 | #endif | 1455 | |
1456 | #ifdef CONFIG_NUMA_BALANCING | ||
1457 | /* | ||
1458 | * Returns true if this is a safe migration target node for misplaced NUMA | ||
1459 | * pages. Currently it only checks the watermarks, which is crude. | ||
1460 | */ | ||
1461 | static bool migrate_balanced_pgdat(struct pglist_data *pgdat, | ||
1462 | int nr_migrate_pages) | ||
1463 | { | ||
1464 | int z; | ||
1465 | for (z = pgdat->nr_zones - 1; z >= 0; z--) { | ||
1466 | struct zone *zone = pgdat->node_zones + z; | ||
1467 | |||
1468 | if (!populated_zone(zone)) | ||
1469 | continue; | ||
1470 | |||
1471 | if (zone->all_unreclaimable) | ||
1472 | continue; | ||
1473 | |||
1474 | /* Avoid waking kswapd by allocating nr_migrate_pages pages. */ | ||
1475 | if (!zone_watermark_ok(zone, 0, | ||
1476 | high_wmark_pages(zone) + | ||
1477 | nr_migrate_pages, | ||
1478 | 0, 0)) | ||
1479 | continue; | ||
1480 | return true; | ||
1481 | } | ||
1482 | return false; | ||
1483 | } | ||
1484 | |||
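
The check above amounts to: a node is an acceptable migration target only if at least one populated, reclaimable zone can absorb nr_migrate_pages while staying above its high watermark, so the migration itself never wakes kswapd. A minimal userspace sketch of that idea follows; the toy_* names and the simplified free-pages comparison are illustrative stand-ins for zone_watermark_ok(), not kernel interfaces.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for one zone on the candidate node. */
struct toy_zone {
	unsigned long free_pages;
	unsigned long high_wmark;	/* stand-in for high_wmark_pages(zone) */
	bool populated;
	bool all_unreclaimable;
};

/*
 * Toy version of the watermark test: the node is an acceptable target
 * if at least one usable zone can absorb nr_migrate_pages while staying
 * above its high watermark (so kswapd is not woken by the migration).
 */
static bool toy_balanced_node(const struct toy_zone *zones, int nr_zones,
			      unsigned long nr_migrate_pages)
{
	for (int z = nr_zones - 1; z >= 0; z--) {
		const struct toy_zone *zone = &zones[z];

		if (!zone->populated || zone->all_unreclaimable)
			continue;
		if (zone->free_pages >= zone->high_wmark + nr_migrate_pages)
			return true;
	}
	return false;
}

int main(void)
{
	struct toy_zone zones[] = {
		{ .free_pages = 1000, .high_wmark = 2000, .populated = true },
		{ .free_pages = 5000, .high_wmark = 2000, .populated = true },
	};

	printf("balanced for 512 pages: %d\n",
	       toy_balanced_node(zones, 2, 512));	/* 1: second zone fits */
	printf("balanced for 4000 pages: %d\n",
	       toy_balanced_node(zones, 2, 4000));	/* 0: neither zone fits */
	return 0;
}
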
1485 | static struct page *alloc_misplaced_dst_page(struct page *page, | ||
1486 | unsigned long data, | ||
1487 | int **result) | ||
1488 | { | ||
1489 | int nid = (int) data; | ||
1490 | struct page *newpage; | ||
1491 | |||
1492 | newpage = alloc_pages_exact_node(nid, | ||
1493 | (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | | ||
1494 | __GFP_NOMEMALLOC | __GFP_NORETRY | | ||
1495 | __GFP_NOWARN) & | ||
1496 | ~GFP_IOFS, 0); | ||
1497 | if (newpage) | ||
1498 | page_xchg_last_nid(newpage, page_last_nid(page)); | ||
1499 | |||
1500 | return newpage; | ||
1501 | } | ||
1502 | |||
1503 | /* | ||
1504 | * page migration rate limiting control. | ||
1505 | * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs | ||
1506 | * window of time. Default here says do not migrate more than 1280M per second. | ||
1507 | * If a node is rate-limited then PTE NUMA updates are also rate-limited. However | ||
1508 | * as it is faults that reset the window, pte updates will happen unconditionally | ||
1509 | * if there has not been a fault since @pteupdate_interval_millisecs after the | ||
1510 | * throttle window closed. | ||
1511 | */ | ||
1512 | static unsigned int migrate_interval_millisecs __read_mostly = 100; | ||
1513 | static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; | ||
1514 | static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); | ||
1515 | |||
1516 | /* Returns true if NUMA migration is currently rate limited */ | ||
1517 | bool migrate_ratelimited(int node) | ||
1518 | { | ||
1519 | pg_data_t *pgdat = NODE_DATA(node); | ||
1520 | |||
1521 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + | ||
1522 | msecs_to_jiffies(pteupdate_interval_millisecs))) | ||
1523 | return false; | ||
1524 | |||
1525 | if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) | ||
1526 | return false; | ||
1527 | |||
1528 | return true; | ||
1529 | } | ||
1530 | |||
1531 | /* Returns true if the node is migrate rate-limited after the update */ | ||
1532 | bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) | ||
1533 | { | ||
1534 | bool rate_limited = false; | ||
1535 | |||
1536 | /* | ||
1537 | * Rate-limit the amount of data that is being migrated to a node. | ||
1538 | * Optimal placement is no good if the memory bus is saturated and | ||
1539 | * all the time is being spent migrating! | ||
1540 | */ | ||
1541 | spin_lock(&pgdat->numabalancing_migrate_lock); | ||
1542 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { | ||
1543 | pgdat->numabalancing_migrate_nr_pages = 0; | ||
1544 | pgdat->numabalancing_migrate_next_window = jiffies + | ||
1545 | msecs_to_jiffies(migrate_interval_millisecs); | ||
1546 | } | ||
1547 | if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) | ||
1548 | rate_limited = true; | ||
1549 | else | ||
1550 | pgdat->numabalancing_migrate_nr_pages += nr_pages; | ||
1551 | spin_unlock(&pgdat->numabalancing_migrate_lock); | ||
1552 | |||
1553 | return rate_limited; | ||
1554 | } | ||
1555 | |||
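
numamigrate_update_ratelimit() above is a fixed-window limiter: the per-node counter resets once the window expires, and for the rest of a window migration is refused once the counter passes ratelimit_pages. The self-contained sketch below models that window/counter logic; the toy_* names are invented, and the pgdat spinlock is deliberately left out, so this is illustrative only.

#include <stdbool.h>
#include <stdio.h>

/* Per-node rate-limit state, modelled on the pgdat fields added above. */
struct toy_ratelimit {
	unsigned long window_end;	/* "time" at which the window closes */
	unsigned long nr_pages;		/* pages accounted in this window */
};

#define TOY_WINDOW_LEN		100	/* ~migrate_interval_millisecs */
#define TOY_RATELIMIT_PAGES	32768	/* ~ratelimit_pages (128MB of 4K pages) */

/*
 * Fixed-window rate limiter: open a fresh window when the old one has
 * expired, then refuse further migration once the per-window budget is
 * spent.  Returns true if the caller should skip migrating nr_pages now.
 */
static bool toy_update_ratelimit(struct toy_ratelimit *rl, unsigned long now,
				 unsigned long nr_pages)
{
	if (now > rl->window_end) {
		rl->nr_pages = 0;
		rl->window_end = now + TOY_WINDOW_LEN;
	}
	if (rl->nr_pages > TOY_RATELIMIT_PAGES)
		return true;		/* over budget: rate limited */

	rl->nr_pages += nr_pages;
	return false;
}

int main(void)
{
	struct toy_ratelimit rl = { 0, 0 };
	unsigned long now = 1000;
	int refused = 0;

	/* Try to account 100 bursts of 512 pages inside one window. */
	for (int i = 0; i < 100; i++)
		refused += toy_update_ratelimit(&rl, now, 512);

	printf("refused %d of 100 bursts in one window\n", refused);

	/* A later burst, after the window expired, is accepted again. */
	now += 2 * TOY_WINDOW_LEN;
	printf("rate limited after window reset: %d\n",
	       (int)toy_update_ratelimit(&rl, now, 512));
	return 0;
}
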
1556 | int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | ||
1557 | { | ||
1558 | int ret = 0; | ||
1559 | |||
1560 | /* Avoid migrating to a node that is nearly full */ | ||
1561 | if (migrate_balanced_pgdat(pgdat, 1)) { | ||
1562 | int page_lru; | ||
1563 | |||
1564 | if (isolate_lru_page(page)) { | ||
1565 | put_page(page); | ||
1566 | return 0; | ||
1567 | } | ||
1568 | |||
1569 | /* Page is isolated */ | ||
1570 | ret = 1; | ||
1571 | page_lru = page_is_file_cache(page); | ||
1572 | if (!PageTransHuge(page)) | ||
1573 | inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); | ||
1574 | else | ||
1575 | mod_zone_page_state(page_zone(page), | ||
1576 | NR_ISOLATED_ANON + page_lru, | ||
1577 | HPAGE_PMD_NR); | ||
1578 | } | ||
1579 | |||
1580 | /* | ||
1581 | * Page is either isolated or there is not enough space on the target | ||
1582 | * node. If isolated, then it has taken a reference count and the | ||
1583 | * caller's reference can be safely dropped without the page | ||
1584 | * disappearing underneath us during migration. Otherwise the page is | ||
1585 | * not to be migrated but the caller's reference should still be | ||
1586 | * dropped so it does not leak. | ||
1587 | */ | ||
1588 | put_page(page); | ||
1589 | |||
1590 | return ret; | ||
1591 | } | ||
1592 | |||
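
The comment above defines a reference-count contract: the caller hands in its own reference and numamigrate_isolate_page() always consumes it, while a successfully isolated page keeps the reference taken by LRU isolation. Below is a small runnable model of that contract, with invented toy_* names in place of real page and LRU primitives.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Minimal stand-in for a page with a reference count. */
struct toy_page {
	int refcount;
	bool on_lru;	/* still on the LRU, i.e. isolation can succeed */
};

static void toy_get(struct toy_page *p) { p->refcount++; }
static void toy_put(struct toy_page *p) { p->refcount--; }

/*
 * Models the contract documented above: the caller's reference is always
 * dropped before returning.  On success the page keeps one reference that
 * belongs to the LRU isolation, so it cannot vanish during migration.
 */
static int toy_isolate_for_migration(struct toy_page *p)
{
	int isolated = 0;

	if (p->on_lru) {		/* isolate_lru_page() succeeding */
		p->on_lru = false;
		toy_get(p);		/* isolation takes its own reference */
		isolated = 1;
	}
	toy_put(p);			/* consume the caller's reference */
	return isolated;
}

int main(void)
{
	struct toy_page page = { .refcount = 1, .on_lru = true };

	toy_get(&page);			/* caller's reference, as in the fault path */
	if (toy_isolate_for_migration(&page))
		printf("isolated, refcount now %d (isolation ref held)\n",
		       page.refcount);
	else
		printf("not isolated, refcount now %d\n", page.refcount);

	assert(page.refcount == 2);	/* base ref + isolation ref */
	return 0;
}
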
1593 | /* | ||
1594 | * Attempt to migrate a misplaced page to the specified destination | ||
1595 | * node. Caller is expected to have an elevated reference count on | ||
1596 | * the page that will be dropped by this function before returning. | ||
1597 | */ | ||
1598 | int migrate_misplaced_page(struct page *page, int node) | ||
1599 | { | ||
1600 | pg_data_t *pgdat = NODE_DATA(node); | ||
1601 | int isolated = 0; | ||
1602 | int nr_remaining; | ||
1603 | LIST_HEAD(migratepages); | ||
1604 | |||
1605 | /* | ||
1606 | * Don't migrate pages that are mapped in multiple processes. | ||
1607 | * TODO: Handle false sharing detection instead of this hammer | ||
1608 | */ | ||
1609 | if (page_mapcount(page) != 1) { | ||
1610 | put_page(page); | ||
1611 | goto out; | ||
1612 | } | ||
1613 | |||
1614 | /* | ||
1615 | * Rate-limit the amount of data that is being migrated to a node. | ||
1616 | * Optimal placement is no good if the memory bus is saturated and | ||
1617 | * all the time is being spent migrating! | ||
1618 | */ | ||
1619 | if (numamigrate_update_ratelimit(pgdat, 1)) { | ||
1620 | put_page(page); | ||
1621 | goto out; | ||
1622 | } | ||
1623 | |||
1624 | isolated = numamigrate_isolate_page(pgdat, page); | ||
1625 | if (!isolated) | ||
1626 | goto out; | ||
1627 | |||
1628 | list_add(&page->lru, &migratepages); | ||
1629 | nr_remaining = migrate_pages(&migratepages, | ||
1630 | alloc_misplaced_dst_page, | ||
1631 | node, false, MIGRATE_ASYNC, | ||
1632 | MR_NUMA_MISPLACED); | ||
1633 | if (nr_remaining) { | ||
1634 | putback_lru_pages(&migratepages); | ||
1635 | isolated = 0; | ||
1636 | } else | ||
1637 | count_vm_numa_event(NUMA_PAGE_MIGRATE); | ||
1638 | BUG_ON(!list_empty(&migratepages)); | ||
1639 | out: | ||
1640 | return isolated; | ||
1641 | } | ||
1642 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1643 | |||
1644 | #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) | ||
1645 | int migrate_misplaced_transhuge_page(struct mm_struct *mm, | ||
1646 | struct vm_area_struct *vma, | ||
1647 | pmd_t *pmd, pmd_t entry, | ||
1648 | unsigned long address, | ||
1649 | struct page *page, int node) | ||
1650 | { | ||
1651 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
1652 | pg_data_t *pgdat = NODE_DATA(node); | ||
1653 | int isolated = 0; | ||
1654 | struct page *new_page = NULL; | ||
1655 | struct mem_cgroup *memcg = NULL; | ||
1656 | int page_lru = page_is_file_cache(page); | ||
1657 | |||
1658 | /* | ||
1659 | * Don't migrate pages that are mapped in multiple processes. | ||
1660 | * TODO: Handle false sharing detection instead of this hammer | ||
1661 | */ | ||
1662 | if (page_mapcount(page) != 1) | ||
1663 | goto out_dropref; | ||
1664 | |||
1665 | /* | ||
1666 | * Rate-limit the amount of data that is being migrated to a node. | ||
1667 | * Optimal placement is no good if the memory bus is saturated and | ||
1668 | * all the time is being spent migrating! | ||
1669 | */ | ||
1670 | if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) | ||
1671 | goto out_dropref; | ||
1672 | |||
1673 | new_page = alloc_pages_node(node, | ||
1674 | (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); | ||
1675 | if (!new_page) { | ||
1676 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1677 | goto out_dropref; | ||
1678 | } | ||
1679 | page_xchg_last_nid(new_page, page_last_nid(page)); | ||
1680 | |||
1681 | isolated = numamigrate_isolate_page(pgdat, page); | ||
1682 | if (!isolated) { | ||
1683 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1684 | put_page(new_page); | ||
1685 | goto out_keep_locked; | ||
1686 | } | ||
1687 | |||
1688 | /* Prepare a page as a migration target */ | ||
1689 | __set_page_locked(new_page); | ||
1690 | SetPageSwapBacked(new_page); | ||
1691 | |||
1692 | /* anon mapping, we can simply copy page->mapping to the new page: */ | ||
1693 | new_page->mapping = page->mapping; | ||
1694 | new_page->index = page->index; | ||
1695 | migrate_page_copy(new_page, page); | ||
1696 | WARN_ON(PageLRU(new_page)); | ||
1697 | |||
1698 | /* Recheck the target PMD */ | ||
1699 | spin_lock(&mm->page_table_lock); | ||
1700 | if (unlikely(!pmd_same(*pmd, entry))) { | ||
1701 | spin_unlock(&mm->page_table_lock); | ||
1702 | |||
1703 | /* Reverse changes made by migrate_page_copy() */ | ||
1704 | if (TestClearPageActive(new_page)) | ||
1705 | SetPageActive(page); | ||
1706 | if (TestClearPageUnevictable(new_page)) | ||
1707 | SetPageUnevictable(page); | ||
1708 | mlock_migrate_page(page, new_page); | ||
1709 | |||
1710 | unlock_page(new_page); | ||
1711 | put_page(new_page); /* Free it */ | ||
1712 | |||
1713 | unlock_page(page); | ||
1714 | putback_lru_page(page); | ||
1715 | |||
1716 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1717 | goto out; | ||
1718 | } | ||
1719 | |||
1720 | /* | ||
1721 | * Traditional migration needs to prepare the memcg charge | ||
1722 | * transaction early to prevent the old page from being | ||
1723 | * uncharged when installing migration entries. Here we can | ||
1724 | * save the potential rollback and start the charge transfer | ||
1725 | * only when migration is already known to end successfully. | ||
1726 | */ | ||
1727 | mem_cgroup_prepare_migration(page, new_page, &memcg); | ||
1728 | |||
1729 | entry = mk_pmd(new_page, vma->vm_page_prot); | ||
1730 | entry = pmd_mknonnuma(entry); | ||
1731 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
1732 | entry = pmd_mkhuge(entry); | ||
1733 | |||
1734 | page_add_new_anon_rmap(new_page, vma, haddr); | ||
1735 | |||
1736 | set_pmd_at(mm, haddr, pmd, entry); | ||
1737 | update_mmu_cache_pmd(vma, address, entry); | ||
1738 | page_remove_rmap(page); | ||
1739 | /* | ||
1740 | * Finish the charge transaction under the page table lock to | ||
1741 | * prevent split_huge_page() from dividing up the charge | ||
1742 | * before it's fully transferred to the new page. | ||
1743 | */ | ||
1744 | mem_cgroup_end_migration(memcg, page, new_page, true); | ||
1745 | spin_unlock(&mm->page_table_lock); | ||
1746 | |||
1747 | unlock_page(new_page); | ||
1748 | unlock_page(page); | ||
1749 | put_page(page); /* Drop the rmap reference */ | ||
1750 | put_page(page); /* Drop the LRU isolation reference */ | ||
1751 | |||
1752 | count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); | ||
1753 | count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); | ||
1754 | |||
1755 | out: | ||
1756 | mod_zone_page_state(page_zone(page), | ||
1757 | NR_ISOLATED_ANON + page_lru, | ||
1758 | -HPAGE_PMD_NR); | ||
1759 | return isolated; | ||
1760 | |||
1761 | out_dropref: | ||
1762 | put_page(page); | ||
1763 | out_keep_locked: | ||
1764 | return 0; | ||
1765 | } | ||
1766 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1767 | |||
1768 | #endif /* CONFIG_NUMA */ | ||
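
migrate_misplaced_transhuge_page() above follows an optimistic pattern: do the expensive copy first, then retake the page table lock, re-check pmd_same(), and roll everything back if the PMD changed underneath. The standalone sketch below models that copy/re-check/rollback shape with a plain version counter standing in for the PMD comparison; the toy_* names are invented for illustration and are not kernel code.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/*
 * Stand-in for the entry the huge-page path re-checks: a payload plus a
 * version that changes whenever someone else modifies the entry (the
 * equivalent of the pmd no longer being pmd_same()).
 */
struct toy_entry {
	char data[16];
	unsigned long version;
};

/*
 * Optimistic replace: prepare the copy while "unlocked", then re-check
 * the version under the lock and only install the copy if nothing
 * changed in between; otherwise undo and report failure.
 */
static bool toy_optimistic_replace(struct toy_entry *entry, const char *newdata,
				   unsigned long seen_version)
{
	char copy[16] = { 0 };

	/* expensive work done before re-checking (cf. migrate_page_copy()) */
	snprintf(copy, sizeof(copy), "%s", newdata);

	/* "spin_lock(&mm->page_table_lock)" would go here */
	if (entry->version != seen_version)
		return false;		/* entry changed under us: roll back */

	memcpy(entry->data, copy, sizeof(entry->data));
	entry->version++;
	/* "spin_unlock(...)" would go here */
	return true;
}

int main(void)
{
	struct toy_entry e = { "old", 0 };
	unsigned long seen = e.version;

	/* Simulate a concurrent update between the copy and the re-check. */
	e.version++;
	printf("racy attempt installed: %d (data=%s)\n",
	       toy_optimistic_replace(&e, "new", seen), e.data);

	seen = e.version;
	printf("clean attempt installed: %d (data=%s)\n",
	       toy_optimistic_replace(&e, "new", seen), e.data);
	return 0;
}
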
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -736,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
736 | if (anon_vma) { | 736 | if (anon_vma) { |
737 | VM_BUG_ON(adjust_next && next->anon_vma && | 737 | VM_BUG_ON(adjust_next && next->anon_vma && |
738 | anon_vma != next->anon_vma); | 738 | anon_vma != next->anon_vma); |
739 | anon_vma_lock(anon_vma); | 739 | anon_vma_lock_write(anon_vma); |
740 | anon_vma_interval_tree_pre_update_vma(vma); | 740 | anon_vma_interval_tree_pre_update_vma(vma); |
741 | if (adjust_next) | 741 | if (adjust_next) |
742 | anon_vma_interval_tree_pre_update_vma(next); | 742 | anon_vma_interval_tree_pre_update_vma(next); |
@@ -2886,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | |||
2886 | * The LSB of head.next can't change from under us | 2886 | * The LSB of head.next can't change from under us |
2887 | * because we hold the mm_all_locks_mutex. | 2887 | * because we hold the mm_all_locks_mutex. |
2888 | */ | 2888 | */ |
2889 | mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); | 2889 | down_write(&anon_vma->root->rwsem); |
2890 | /* | 2890 | /* |
2891 | * We can safely modify head.next after taking the | 2891 | * We can safely modify head.next after taking the |
2892 | * anon_vma->root->mutex. If some other vma in this mm shares | 2892 | * anon_vma->root->rwsem. If some other vma in this mm shares |
2893 | * the same anon_vma we won't take it again. | 2893 | * the same anon_vma we won't take it again. |
2894 | * | 2894 | * |
2895 | * No need of atomic instructions here, head.next | 2895 | * No need of atomic instructions here, head.next |
2896 | * can't change from under us thanks to the | 2896 | * can't change from under us thanks to the |
2897 | * anon_vma->root->mutex. | 2897 | * anon_vma->root->rwsem. |
2898 | */ | 2898 | */ |
2899 | if (__test_and_set_bit(0, (unsigned long *) | 2899 | if (__test_and_set_bit(0, (unsigned long *) |
2900 | &anon_vma->root->rb_root.rb_node)) | 2900 | &anon_vma->root->rb_root.rb_node)) |
@@ -2996,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | |||
2996 | * | 2996 | * |
2997 | * No need of atomic instructions here, head.next | 2997 | * No need of atomic instructions here, head.next |
2998 | * can't change from under us until we release the | 2998 | * can't change from under us until we release the |
2999 | * anon_vma->root->mutex. | 2999 | * anon_vma->root->rwsem. |
3000 | */ | 3000 | */ |
3001 | if (!__test_and_clear_bit(0, (unsigned long *) | 3001 | if (!__test_and_clear_bit(0, (unsigned long *) |
3002 | &anon_vma->root->rb_root.rb_node)) | 3002 | &anon_vma->root->rb_root.rb_node)) |
diff --git a/mm/mprotect.c b/mm/mprotect.c index e8c3938db6fa..3dca970367db 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) | |||
35 | } | 35 | } |
36 | #endif | 36 | #endif |
37 | 37 | ||
38 | static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | 38 | static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
39 | unsigned long addr, unsigned long end, pgprot_t newprot, | 39 | unsigned long addr, unsigned long end, pgprot_t newprot, |
40 | int dirty_accountable) | 40 | int dirty_accountable, int prot_numa, bool *ret_all_same_node) |
41 | { | 41 | { |
42 | struct mm_struct *mm = vma->vm_mm; | ||
42 | pte_t *pte, oldpte; | 43 | pte_t *pte, oldpte; |
43 | spinlock_t *ptl; | 44 | spinlock_t *ptl; |
45 | unsigned long pages = 0; | ||
46 | bool all_same_node = true; | ||
47 | int last_nid = -1; | ||
44 | 48 | ||
45 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 49 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
46 | arch_enter_lazy_mmu_mode(); | 50 | arch_enter_lazy_mmu_mode(); |
@@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
48 | oldpte = *pte; | 52 | oldpte = *pte; |
49 | if (pte_present(oldpte)) { | 53 | if (pte_present(oldpte)) { |
50 | pte_t ptent; | 54 | pte_t ptent; |
55 | bool updated = false; | ||
51 | 56 | ||
52 | ptent = ptep_modify_prot_start(mm, addr, pte); | 57 | ptent = ptep_modify_prot_start(mm, addr, pte); |
53 | ptent = pte_modify(ptent, newprot); | 58 | if (!prot_numa) { |
59 | ptent = pte_modify(ptent, newprot); | ||
60 | updated = true; | ||
61 | } else { | ||
62 | struct page *page; | ||
63 | |||
64 | page = vm_normal_page(vma, addr, oldpte); | ||
65 | if (page) { | ||
66 | int this_nid = page_to_nid(page); | ||
67 | if (last_nid == -1) | ||
68 | last_nid = this_nid; | ||
69 | if (last_nid != this_nid) | ||
70 | all_same_node = false; | ||
71 | |||
72 | /* only check non-shared pages */ | ||
73 | if (!pte_numa(oldpte) && | ||
74 | page_mapcount(page) == 1) { | ||
75 | ptent = pte_mknuma(ptent); | ||
76 | updated = true; | ||
77 | } | ||
78 | } | ||
79 | } | ||
54 | 80 | ||
55 | /* | 81 | /* |
56 | * Avoid taking write faults for pages we know to be | 82 | * Avoid taking write faults for pages we know to be |
57 | * dirty. | 83 | * dirty. |
58 | */ | 84 | */ |
59 | if (dirty_accountable && pte_dirty(ptent)) | 85 | if (dirty_accountable && pte_dirty(ptent)) { |
60 | ptent = pte_mkwrite(ptent); | 86 | ptent = pte_mkwrite(ptent); |
87 | updated = true; | ||
88 | } | ||
61 | 89 | ||
90 | if (updated) | ||
91 | pages++; | ||
62 | ptep_modify_prot_commit(mm, addr, pte, ptent); | 92 | ptep_modify_prot_commit(mm, addr, pte, ptent); |
63 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { | 93 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { |
64 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 94 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
@@ -72,18 +102,40 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
72 | set_pte_at(mm, addr, pte, | 102 | set_pte_at(mm, addr, pte, |
73 | swp_entry_to_pte(entry)); | 103 | swp_entry_to_pte(entry)); |
74 | } | 104 | } |
105 | pages++; | ||
75 | } | 106 | } |
76 | } while (pte++, addr += PAGE_SIZE, addr != end); | 107 | } while (pte++, addr += PAGE_SIZE, addr != end); |
77 | arch_leave_lazy_mmu_mode(); | 108 | arch_leave_lazy_mmu_mode(); |
78 | pte_unmap_unlock(pte - 1, ptl); | 109 | pte_unmap_unlock(pte - 1, ptl); |
110 | |||
111 | *ret_all_same_node = all_same_node; | ||
112 | return pages; | ||
79 | } | 113 | } |
80 | 114 | ||
81 | static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 115 | #ifdef CONFIG_NUMA_BALANCING |
116 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, | ||
117 | pmd_t *pmd) | ||
118 | { | ||
119 | spin_lock(&mm->page_table_lock); | ||
120 | set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); | ||
121 | spin_unlock(&mm->page_table_lock); | ||
122 | } | ||
123 | #else | ||
124 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, | ||
125 | pmd_t *pmd) | ||
126 | { | ||
127 | BUG(); | ||
128 | } | ||
129 | #endif /* CONFIG_NUMA_BALANCING */ | ||
130 | |||
131 | static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, | ||
82 | unsigned long addr, unsigned long end, pgprot_t newprot, | 132 | unsigned long addr, unsigned long end, pgprot_t newprot, |
83 | int dirty_accountable) | 133 | int dirty_accountable, int prot_numa) |
84 | { | 134 | { |
85 | pmd_t *pmd; | 135 | pmd_t *pmd; |
86 | unsigned long next; | 136 | unsigned long next; |
137 | unsigned long pages = 0; | ||
138 | bool all_same_node; | ||
87 | 139 | ||
88 | pmd = pmd_offset(pud, addr); | 140 | pmd = pmd_offset(pud, addr); |
89 | do { | 141 | do { |
@@ -91,42 +143,59 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
91 | if (pmd_trans_huge(*pmd)) { | 143 | if (pmd_trans_huge(*pmd)) { |
92 | if (next - addr != HPAGE_PMD_SIZE) | 144 | if (next - addr != HPAGE_PMD_SIZE) |
93 | split_huge_page_pmd(vma, addr, pmd); | 145 | split_huge_page_pmd(vma, addr, pmd); |
94 | else if (change_huge_pmd(vma, pmd, addr, newprot)) | 146 | else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { |
147 | pages += HPAGE_PMD_NR; | ||
95 | continue; | 148 | continue; |
149 | } | ||
96 | /* fall through */ | 150 | /* fall through */ |
97 | } | 151 | } |
98 | if (pmd_none_or_clear_bad(pmd)) | 152 | if (pmd_none_or_clear_bad(pmd)) |
99 | continue; | 153 | continue; |
100 | change_pte_range(vma->vm_mm, pmd, addr, next, newprot, | 154 | pages += change_pte_range(vma, pmd, addr, next, newprot, |
101 | dirty_accountable); | 155 | dirty_accountable, prot_numa, &all_same_node); |
156 | |||
157 | /* | ||
158 | * If we are changing protections for NUMA hinting faults then | ||
159 | * set pmd_numa if the examined pages were all on the same | ||
160 | * node. This allows a regular PMD to be handled as one fault | ||
161 | * and effectively batches the taking of the PTL | ||
162 | */ | ||
163 | if (prot_numa && all_same_node) | ||
164 | change_pmd_protnuma(vma->vm_mm, addr, pmd); | ||
102 | } while (pmd++, addr = next, addr != end); | 165 | } while (pmd++, addr = next, addr != end); |
166 | |||
167 | return pages; | ||
103 | } | 168 | } |
104 | 169 | ||
105 | static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 170 | static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
106 | unsigned long addr, unsigned long end, pgprot_t newprot, | 171 | unsigned long addr, unsigned long end, pgprot_t newprot, |
107 | int dirty_accountable) | 172 | int dirty_accountable, int prot_numa) |
108 | { | 173 | { |
109 | pud_t *pud; | 174 | pud_t *pud; |
110 | unsigned long next; | 175 | unsigned long next; |
176 | unsigned long pages = 0; | ||
111 | 177 | ||
112 | pud = pud_offset(pgd, addr); | 178 | pud = pud_offset(pgd, addr); |
113 | do { | 179 | do { |
114 | next = pud_addr_end(addr, end); | 180 | next = pud_addr_end(addr, end); |
115 | if (pud_none_or_clear_bad(pud)) | 181 | if (pud_none_or_clear_bad(pud)) |
116 | continue; | 182 | continue; |
117 | change_pmd_range(vma, pud, addr, next, newprot, | 183 | pages += change_pmd_range(vma, pud, addr, next, newprot, |
118 | dirty_accountable); | 184 | dirty_accountable, prot_numa); |
119 | } while (pud++, addr = next, addr != end); | 185 | } while (pud++, addr = next, addr != end); |
186 | |||
187 | return pages; | ||
120 | } | 188 | } |
121 | 189 | ||
122 | static void change_protection(struct vm_area_struct *vma, | 190 | static unsigned long change_protection_range(struct vm_area_struct *vma, |
123 | unsigned long addr, unsigned long end, pgprot_t newprot, | 191 | unsigned long addr, unsigned long end, pgprot_t newprot, |
124 | int dirty_accountable) | 192 | int dirty_accountable, int prot_numa) |
125 | { | 193 | { |
126 | struct mm_struct *mm = vma->vm_mm; | 194 | struct mm_struct *mm = vma->vm_mm; |
127 | pgd_t *pgd; | 195 | pgd_t *pgd; |
128 | unsigned long next; | 196 | unsigned long next; |
129 | unsigned long start = addr; | 197 | unsigned long start = addr; |
198 | unsigned long pages = 0; | ||
130 | 199 | ||
131 | BUG_ON(addr >= end); | 200 | BUG_ON(addr >= end); |
132 | pgd = pgd_offset(mm, addr); | 201 | pgd = pgd_offset(mm, addr); |
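
For the prot_numa pass above, two decisions matter: only private (mapcount == 1), not-yet-marked ptes are given the NUMA-hinting protection, and the whole PMD is marked only when every examined page sat on one node, so a later hinting fault can be taken once per PMD instead of once per page. A self-contained model of that scan follows, with invented toy_* types standing in for real ptes and pages.

#include <stdbool.h>
#include <stdio.h>

/* One present pte in the range, reduced to the fields the scan looks at. */
struct toy_pte {
	int nid;		/* node the backing page currently lives on */
	int mapcount;		/* 1 => private to this process */
	bool numa;		/* already marked for a hinting fault */
};

/*
 * Toy version of the prot_numa pass: mark private, not-yet-marked ptes,
 * and report whether every examined page was on the same node so the
 * caller can mark the whole PMD and take only one fault per PMD.
 */
static unsigned long toy_prot_numa_scan(struct toy_pte *ptes, int nr,
					bool *all_same_node)
{
	unsigned long updated = 0;
	int last_nid = -1;

	*all_same_node = true;
	for (int i = 0; i < nr; i++) {
		if (last_nid == -1)
			last_nid = ptes[i].nid;
		if (ptes[i].nid != last_nid)
			*all_same_node = false;

		/* only mark non-shared pages, and only once */
		if (!ptes[i].numa && ptes[i].mapcount == 1) {
			ptes[i].numa = true;
			updated++;
		}
	}
	return updated;
}

int main(void)
{
	struct toy_pte same_node[] = {
		{ .nid = 1, .mapcount = 1 }, { .nid = 1, .mapcount = 1 },
		{ .nid = 1, .mapcount = 2 },	/* shared: skipped, but same node */
	};
	struct toy_pte mixed[] = {
		{ .nid = 0, .mapcount = 1 }, { .nid = 1, .mapcount = 1 },
	};
	bool all_same;
	unsigned long n;

	n = toy_prot_numa_scan(same_node, 3, &all_same);
	printf("same-node range: %lu marked, batch at PMD level: %d\n", n, all_same);

	n = toy_prot_numa_scan(mixed, 2, &all_same);
	printf("mixed range:     %lu marked, batch at PMD level: %d\n", n, all_same);
	return 0;
}
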
@@ -135,10 +204,32 @@ static void change_protection(struct vm_area_struct *vma, | |||
135 | next = pgd_addr_end(addr, end); | 204 | next = pgd_addr_end(addr, end); |
136 | if (pgd_none_or_clear_bad(pgd)) | 205 | if (pgd_none_or_clear_bad(pgd)) |
137 | continue; | 206 | continue; |
138 | change_pud_range(vma, pgd, addr, next, newprot, | 207 | pages += change_pud_range(vma, pgd, addr, next, newprot, |
139 | dirty_accountable); | 208 | dirty_accountable, prot_numa); |
140 | } while (pgd++, addr = next, addr != end); | 209 | } while (pgd++, addr = next, addr != end); |
141 | flush_tlb_range(vma, start, end); | 210 | |
211 | /* Only flush the TLB if we actually modified any entries: */ | ||
212 | if (pages) | ||
213 | flush_tlb_range(vma, start, end); | ||
214 | |||
215 | return pages; | ||
216 | } | ||
217 | |||
218 | unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, | ||
219 | unsigned long end, pgprot_t newprot, | ||
220 | int dirty_accountable, int prot_numa) | ||
221 | { | ||
222 | struct mm_struct *mm = vma->vm_mm; | ||
223 | unsigned long pages; | ||
224 | |||
225 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
226 | if (is_vm_hugetlb_page(vma)) | ||
227 | pages = hugetlb_change_protection(vma, start, end, newprot); | ||
228 | else | ||
229 | pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); | ||
230 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
231 | |||
232 | return pages; | ||
142 | } | 233 | } |
143 | 234 | ||
144 | int | 235 | int |
@@ -213,12 +304,8 @@ success: | |||
213 | dirty_accountable = 1; | 304 | dirty_accountable = 1; |
214 | } | 305 | } |
215 | 306 | ||
216 | mmu_notifier_invalidate_range_start(mm, start, end); | 307 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); |
217 | if (is_vm_hugetlb_page(vma)) | 308 | |
218 | hugetlb_change_protection(vma, start, end, vma->vm_page_prot); | ||
219 | else | ||
220 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); | ||
221 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
222 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 309 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
223 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 310 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
224 | perf_event_mmap(vma); | 311 | perf_event_mmap(vma); |
diff --git a/mm/mremap.c b/mm/mremap.c index eabb24da6c9e..e1031e1f6a61 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
104 | } | 104 | } |
105 | if (vma->anon_vma) { | 105 | if (vma->anon_vma) { |
106 | anon_vma = vma->anon_vma; | 106 | anon_vma = vma->anon_vma; |
107 | anon_vma_lock(anon_vma); | 107 | anon_vma_lock_write(anon_vma); |
108 | } | 108 | } |
109 | } | 109 | } |
110 | 110 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 83637dfba110..d037c8bc1512 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -611,6 +611,7 @@ static inline int free_pages_check(struct page *page) | |||
611 | bad_page(page); | 611 | bad_page(page); |
612 | return 1; | 612 | return 1; |
613 | } | 613 | } |
614 | reset_page_last_nid(page); | ||
614 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 615 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
615 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | 616 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
616 | return 0; | 617 | return 0; |
@@ -3883,6 +3884,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
3883 | mminit_verify_page_links(page, zone, nid, pfn); | 3884 | mminit_verify_page_links(page, zone, nid, pfn); |
3884 | init_page_count(page); | 3885 | init_page_count(page); |
3885 | reset_page_mapcount(page); | 3886 | reset_page_mapcount(page); |
3887 | reset_page_last_nid(page); | ||
3886 | SetPageReserved(page); | 3888 | SetPageReserved(page); |
3887 | /* | 3889 | /* |
3888 | * Mark the block movable so that blocks are reserved for | 3890 | * Mark the block movable so that blocks are reserved for |
@@ -4526,6 +4528,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4526 | int ret; | 4528 | int ret; |
4527 | 4529 | ||
4528 | pgdat_resize_init(pgdat); | 4530 | pgdat_resize_init(pgdat); |
4531 | #ifdef CONFIG_NUMA_BALANCING | ||
4532 | spin_lock_init(&pgdat->numabalancing_migrate_lock); | ||
4533 | pgdat->numabalancing_migrate_nr_pages = 0; | ||
4534 | pgdat->numabalancing_migrate_next_window = jiffies; | ||
4535 | #endif | ||
4529 | init_waitqueue_head(&pgdat->kswapd_wait); | 4536 | init_waitqueue_head(&pgdat->kswapd_wait); |
4530 | init_waitqueue_head(&pgdat->pfmemalloc_wait); | 4537 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
4531 | pgdat_page_cgroup_init(pgdat); | 4538 | pgdat_page_cgroup_init(pgdat); |
@@ -5800,7 +5807,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
5800 | 5807 | ||
5801 | ret = migrate_pages(&cc->migratepages, | 5808 | ret = migrate_pages(&cc->migratepages, |
5802 | alloc_migrate_target, | 5809 | alloc_migrate_target, |
5803 | 0, false, MIGRATE_SYNC); | 5810 | 0, false, MIGRATE_SYNC, |
5811 | MR_CMA); | ||
5804 | } | 5812 | } |
5805 | 5813 | ||
5806 | putback_movable_pages(&cc->migratepages); | 5814 | putback_movable_pages(&cc->migratepages); |
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e642627da6b7..0c8323fe6c8f 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -12,8 +12,8 @@ | |||
12 | 12 | ||
13 | #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS | 13 | #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS |
14 | /* | 14 | /* |
15 | * Only sets the access flags (dirty, accessed, and | 15 | * Only sets the access flags (dirty, accessed), as well as write |
16 | * writable). Furthermore, we know it always gets set to a "more | 16 | * permission. Furthermore, we know it always gets set to a "more |
17 | * permissive" setting, which allows most architectures to optimize | 17 | * permissive" setting, which allows most architectures to optimize |
18 | * this. We return whether the PTE actually changed, which in turn | 18 | * this. We return whether the PTE actually changed, which in turn |
19 | * instructs the caller to do things like update__mmu_cache. This | 19 | * instructs the caller to do things like update__mmu_cache. This |
@@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, | |||
27 | int changed = !pte_same(*ptep, entry); | 27 | int changed = !pte_same(*ptep, entry); |
28 | if (changed) { | 28 | if (changed) { |
29 | set_pte_at(vma->vm_mm, address, ptep, entry); | 29 | set_pte_at(vma->vm_mm, address, ptep, entry); |
30 | flush_tlb_page(vma, address); | 30 | flush_tlb_fix_spurious_fault(vma, address); |
31 | } | 31 | } |
32 | return changed; | 32 | return changed; |
33 | } | 33 | } |
@@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, | |||
88 | { | 88 | { |
89 | pte_t pte; | 89 | pte_t pte; |
90 | pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); | 90 | pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); |
91 | flush_tlb_page(vma, address); | 91 | if (pte_accessible(pte)) |
92 | flush_tlb_page(vma, address); | ||
92 | return pte; | 93 | return pte; |
93 | } | 94 | } |
94 | #endif | 95 | #endif |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -24,7 +24,7 @@ | |||
24 | * mm->mmap_sem | 24 | * mm->mmap_sem |
25 | * page->flags PG_locked (lock_page) | 25 | * page->flags PG_locked (lock_page) |
26 | * mapping->i_mmap_mutex | 26 | * mapping->i_mmap_mutex |
27 | * anon_vma->mutex | 27 | * anon_vma->rwsem |
28 | * mm->page_table_lock or pte_lock | 28 | * mm->page_table_lock or pte_lock |
29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) | 29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) |
30 | * swap_lock (in swap_duplicate, swap_info_get) | 30 | * swap_lock (in swap_duplicate, swap_info_get) |
@@ -37,7 +37,7 @@ | |||
37 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * within bdi.wb->list_lock in __sync_single_inode) | 38 | * within bdi.wb->list_lock in __sync_single_inode) |
39 | * | 39 | * |
40 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) | 40 | * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) |
41 | * ->tasklist_lock | 41 | * ->tasklist_lock |
42 | * pte map lock | 42 | * pte map lock |
43 | */ | 43 | */ |
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) | |||
87 | VM_BUG_ON(atomic_read(&anon_vma->refcount)); | 87 | VM_BUG_ON(atomic_read(&anon_vma->refcount)); |
88 | 88 | ||
89 | /* | 89 | /* |
90 | * Synchronize against page_lock_anon_vma() such that | 90 | * Synchronize against page_lock_anon_vma_read() such that |
91 | * we can safely hold the lock without the anon_vma getting | 91 | * we can safely hold the lock without the anon_vma getting |
92 | * freed. | 92 | * freed. |
93 | * | 93 | * |
94 | * Relies on the full mb implied by the atomic_dec_and_test() from | 94 | * Relies on the full mb implied by the atomic_dec_and_test() from |
95 | * put_anon_vma() against the acquire barrier implied by | 95 | * put_anon_vma() against the acquire barrier implied by |
96 | * mutex_trylock() from page_lock_anon_vma(). This orders: | 96 | * down_read_trylock() from page_lock_anon_vma_read(). This orders: |
97 | * | 97 | * |
98 | * page_lock_anon_vma() VS put_anon_vma() | 98 | * page_lock_anon_vma_read() VS put_anon_vma() |
99 | * mutex_trylock() atomic_dec_and_test() | 99 | * down_read_trylock() atomic_dec_and_test() |
100 | * LOCK MB | 100 | * LOCK MB |
101 | * atomic_read() mutex_is_locked() | 101 | * atomic_read() rwsem_is_locked() |
102 | * | 102 | * |
103 | * LOCK should suffice since the actual taking of the lock must | 103 | * LOCK should suffice since the actual taking of the lock must |
104 | * happen _before_ what follows. | 104 | * happen _before_ what follows. |
105 | */ | 105 | */ |
106 | if (mutex_is_locked(&anon_vma->root->mutex)) { | 106 | if (rwsem_is_locked(&anon_vma->root->rwsem)) { |
107 | anon_vma_lock(anon_vma); | 107 | anon_vma_lock_write(anon_vma); |
108 | anon_vma_unlock(anon_vma); | 108 | anon_vma_unlock(anon_vma); |
109 | } | 109 | } |
110 | 110 | ||
@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
146 | * allocate a new one. | 146 | * allocate a new one. |
147 | * | 147 | * |
148 | * Anon-vma allocations are very subtle, because we may have | 148 | * Anon-vma allocations are very subtle, because we may have |
149 | * optimistically looked up an anon_vma in page_lock_anon_vma() | 149 | * optimistically looked up an anon_vma in page_lock_anon_vma_read() |
150 | * and that may actually touch the spinlock even in the newly | 150 | * and that may actually touch the spinlock even in the newly |
151 | * allocated vma (it depends on RCU to make sure that the | 151 | * allocated vma (it depends on RCU to make sure that the |
152 | * anon_vma isn't actually destroyed). | 152 | * anon_vma isn't actually destroyed). |
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
181 | allocated = anon_vma; | 181 | allocated = anon_vma; |
182 | } | 182 | } |
183 | 183 | ||
184 | anon_vma_lock(anon_vma); | 184 | anon_vma_lock_write(anon_vma); |
185 | /* page_table_lock to protect against threads */ | 185 | /* page_table_lock to protect against threads */ |
186 | spin_lock(&mm->page_table_lock); | 186 | spin_lock(&mm->page_table_lock); |
187 | if (likely(!vma->anon_vma)) { | 187 | if (likely(!vma->anon_vma)) { |
@@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct | |||
219 | struct anon_vma *new_root = anon_vma->root; | 219 | struct anon_vma *new_root = anon_vma->root; |
220 | if (new_root != root) { | 220 | if (new_root != root) { |
221 | if (WARN_ON_ONCE(root)) | 221 | if (WARN_ON_ONCE(root)) |
222 | mutex_unlock(&root->mutex); | 222 | up_write(&root->rwsem); |
223 | root = new_root; | 223 | root = new_root; |
224 | mutex_lock(&root->mutex); | 224 | down_write(&root->rwsem); |
225 | } | 225 | } |
226 | return root; | 226 | return root; |
227 | } | 227 | } |
@@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct | |||
229 | static inline void unlock_anon_vma_root(struct anon_vma *root) | 229 | static inline void unlock_anon_vma_root(struct anon_vma *root) |
230 | { | 230 | { |
231 | if (root) | 231 | if (root) |
232 | mutex_unlock(&root->mutex); | 232 | up_write(&root->rwsem); |
233 | } | 233 | } |
234 | 234 | ||
235 | /* | 235 | /* |
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
306 | get_anon_vma(anon_vma->root); | 306 | get_anon_vma(anon_vma->root); |
307 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ | 307 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ |
308 | vma->anon_vma = anon_vma; | 308 | vma->anon_vma = anon_vma; |
309 | anon_vma_lock(anon_vma); | 309 | anon_vma_lock_write(anon_vma); |
310 | anon_vma_chain_link(vma, avc, anon_vma); | 310 | anon_vma_chain_link(vma, avc, anon_vma); |
311 | anon_vma_unlock(anon_vma); | 311 | anon_vma_unlock(anon_vma); |
312 | 312 | ||
@@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) | |||
349 | /* | 349 | /* |
350 | * Iterate the list once more, it now only contains empty and unlinked | 350 | * Iterate the list once more, it now only contains empty and unlinked |
351 | * anon_vmas, destroy them. Could not do before due to __put_anon_vma() | 351 | * anon_vmas, destroy them. Could not do before due to __put_anon_vma() |
352 | * needing to acquire the anon_vma->root->mutex. | 352 | * needing to write-acquire the anon_vma->root->rwsem. |
353 | */ | 353 | */ |
354 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | 354 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { |
355 | struct anon_vma *anon_vma = avc->anon_vma; | 355 | struct anon_vma *anon_vma = avc->anon_vma; |
@@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data) | |||
365 | { | 365 | { |
366 | struct anon_vma *anon_vma = data; | 366 | struct anon_vma *anon_vma = data; |
367 | 367 | ||
368 | mutex_init(&anon_vma->mutex); | 368 | init_rwsem(&anon_vma->rwsem); |
369 | atomic_set(&anon_vma->refcount, 0); | 369 | atomic_set(&anon_vma->refcount, 0); |
370 | anon_vma->rb_root = RB_ROOT; | 370 | anon_vma->rb_root = RB_ROOT; |
371 | } | 371 | } |
@@ -442,7 +442,7 @@ out: | |||
442 | * atomic op -- the trylock. If we fail the trylock, we fall back to getting a | 442 | * atomic op -- the trylock. If we fail the trylock, we fall back to getting a |
443 | * reference like with page_get_anon_vma() and then block on the mutex. | 443 | * reference like with page_get_anon_vma() and then block on the mutex. |
444 | */ | 444 | */ |
445 | struct anon_vma *page_lock_anon_vma(struct page *page) | 445 | struct anon_vma *page_lock_anon_vma_read(struct page *page) |
446 | { | 446 | { |
447 | struct anon_vma *anon_vma = NULL; | 447 | struct anon_vma *anon_vma = NULL; |
448 | struct anon_vma *root_anon_vma; | 448 | struct anon_vma *root_anon_vma; |
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
457 | 457 | ||
458 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); | 458 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); |
459 | root_anon_vma = ACCESS_ONCE(anon_vma->root); | 459 | root_anon_vma = ACCESS_ONCE(anon_vma->root); |
460 | if (mutex_trylock(&root_anon_vma->mutex)) { | 460 | if (down_read_trylock(&root_anon_vma->rwsem)) { |
461 | /* | 461 | /* |
462 | * If the page is still mapped, then this anon_vma is still | 462 | * If the page is still mapped, then this anon_vma is still |
463 | * its anon_vma, and holding the mutex ensures that it will | 463 | * its anon_vma, and holding the mutex ensures that it will |
464 | * not go away, see anon_vma_free(). | 464 | * not go away, see anon_vma_free(). |
465 | */ | 465 | */ |
466 | if (!page_mapped(page)) { | 466 | if (!page_mapped(page)) { |
467 | mutex_unlock(&root_anon_vma->mutex); | 467 | up_read(&root_anon_vma->rwsem); |
468 | anon_vma = NULL; | 468 | anon_vma = NULL; |
469 | } | 469 | } |
470 | goto out; | 470 | goto out; |
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
484 | 484 | ||
485 | /* we pinned the anon_vma, its safe to sleep */ | 485 | /* we pinned the anon_vma, its safe to sleep */ |
486 | rcu_read_unlock(); | 486 | rcu_read_unlock(); |
487 | anon_vma_lock(anon_vma); | 487 | anon_vma_lock_read(anon_vma); |
488 | 488 | ||
489 | if (atomic_dec_and_test(&anon_vma->refcount)) { | 489 | if (atomic_dec_and_test(&anon_vma->refcount)) { |
490 | /* | 490 | /* |
491 | * Oops, we held the last refcount, release the lock | 491 | * Oops, we held the last refcount, release the lock |
492 | * and bail -- can't simply use put_anon_vma() because | 492 | * and bail -- can't simply use put_anon_vma() because |
493 | * we'll deadlock on the anon_vma_lock() recursion. | 493 | * we'll deadlock on the anon_vma_lock_write() recursion. |
494 | */ | 494 | */ |
495 | anon_vma_unlock(anon_vma); | 495 | anon_vma_unlock_read(anon_vma); |
496 | __put_anon_vma(anon_vma); | 496 | __put_anon_vma(anon_vma); |
497 | anon_vma = NULL; | 497 | anon_vma = NULL; |
498 | } | 498 | } |
@@ -504,9 +504,9 @@ out: | |||
504 | return anon_vma; | 504 | return anon_vma; |
505 | } | 505 | } |
506 | 506 | ||
507 | void page_unlock_anon_vma(struct anon_vma *anon_vma) | 507 | void page_unlock_anon_vma_read(struct anon_vma *anon_vma) |
508 | { | 508 | { |
509 | anon_vma_unlock(anon_vma); | 509 | anon_vma_unlock_read(anon_vma); |
510 | } | 510 | } |
511 | 511 | ||
512 | /* | 512 | /* |
@@ -744,7 +744,7 @@ static int page_referenced_anon(struct page *page, | |||
744 | struct anon_vma_chain *avc; | 744 | struct anon_vma_chain *avc; |
745 | int referenced = 0; | 745 | int referenced = 0; |
746 | 746 | ||
747 | anon_vma = page_lock_anon_vma(page); | 747 | anon_vma = page_lock_anon_vma_read(page); |
748 | if (!anon_vma) | 748 | if (!anon_vma) |
749 | return referenced; | 749 | return referenced; |
750 | 750 | ||
@@ -766,7 +766,7 @@ static int page_referenced_anon(struct page *page, | |||
766 | break; | 766 | break; |
767 | } | 767 | } |
768 | 768 | ||
769 | page_unlock_anon_vma(anon_vma); | 769 | page_unlock_anon_vma_read(anon_vma); |
770 | return referenced; | 770 | return referenced; |
771 | } | 771 | } |
772 | 772 | ||
@@ -1315,7 +1315,7 @@ out_mlock: | |||
1315 | /* | 1315 | /* |
1316 | * We need mmap_sem locking, Otherwise VM_LOCKED check makes | 1316 | * We need mmap_sem locking, Otherwise VM_LOCKED check makes |
1317 | * unstable result and race. Plus, We can't wait here because | 1317 | * unstable result and race. Plus, We can't wait here because |
1318 | * we now hold anon_vma->mutex or mapping->i_mmap_mutex. | 1318 | * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. |
1319 | * if trylock failed, the page remain in evictable lru and later | 1319 | * if trylock failed, the page remain in evictable lru and later |
1320 | * vmscan could retry to move the page to unevictable lru if the | 1320 | * vmscan could retry to move the page to unevictable lru if the |
1321 | * page is actually mlocked. | 1321 | * page is actually mlocked. |
@@ -1480,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1480 | struct anon_vma_chain *avc; | 1480 | struct anon_vma_chain *avc; |
1481 | int ret = SWAP_AGAIN; | 1481 | int ret = SWAP_AGAIN; |
1482 | 1482 | ||
1483 | anon_vma = page_lock_anon_vma(page); | 1483 | anon_vma = page_lock_anon_vma_read(page); |
1484 | if (!anon_vma) | 1484 | if (!anon_vma) |
1485 | return ret; | 1485 | return ret; |
1486 | 1486 | ||
@@ -1507,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1507 | break; | 1507 | break; |
1508 | } | 1508 | } |
1509 | 1509 | ||
1510 | page_unlock_anon_vma(anon_vma); | 1510 | page_unlock_anon_vma_read(anon_vma); |
1511 | return ret; | 1511 | return ret; |
1512 | } | 1512 | } |
1513 | 1513 | ||
@@ -1702,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1702 | int ret = SWAP_AGAIN; | 1702 | int ret = SWAP_AGAIN; |
1703 | 1703 | ||
1704 | /* | 1704 | /* |
1705 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma() | 1705 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() |
1706 | * because that depends on page_mapped(); but not all its usages | 1706 | * because that depends on page_mapped(); but not all its usages |
1707 | * are holding mmap_sem. Users without mmap_sem are required to | 1707 | * are holding mmap_sem. Users without mmap_sem are required to |
1708 | * take a reference count to prevent the anon_vma disappearing | 1708 | * take a reference count to prevent the anon_vma disappearing |
@@ -1710,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1710 | anon_vma = page_anon_vma(page); | 1710 | anon_vma = page_anon_vma(page); |
1711 | if (!anon_vma) | 1711 | if (!anon_vma) |
1712 | return ret; | 1712 | return ret; |
1713 | anon_vma_lock(anon_vma); | 1713 | anon_vma_lock_read(anon_vma); |
1714 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | 1714 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1715 | struct vm_area_struct *vma = avc->vma; | 1715 | struct vm_area_struct *vma = avc->vma; |
1716 | unsigned long address = vma_address(page, vma); | 1716 | unsigned long address = vma_address(page, vma); |
@@ -1718,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1718 | if (ret != SWAP_AGAIN) | 1718 | if (ret != SWAP_AGAIN) |
1719 | break; | 1719 | break; |
1720 | } | 1720 | } |
1721 | anon_vma_unlock(anon_vma); | 1721 | anon_vma_unlock_read(anon_vma); |
1722 | return ret; | 1722 | return ret; |
1723 | } | 1723 | } |
1724 | 1724 | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index df14808f0a36..9800306c8195 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -774,10 +774,20 @@ const char * const vmstat_text[] = { | |||
774 | 774 | ||
775 | "pgrotated", | 775 | "pgrotated", |
776 | 776 | ||
777 | #ifdef CONFIG_NUMA_BALANCING | ||
778 | "numa_pte_updates", | ||
779 | "numa_hint_faults", | ||
780 | "numa_hint_faults_local", | ||
781 | "numa_pages_migrated", | ||
782 | #endif | ||
783 | #ifdef CONFIG_MIGRATION | ||
784 | "pgmigrate_success", | ||
785 | "pgmigrate_fail", | ||
786 | #endif | ||
777 | #ifdef CONFIG_COMPACTION | 787 | #ifdef CONFIG_COMPACTION |
778 | "compact_blocks_moved", | 788 | "compact_migrate_scanned", |
779 | "compact_pages_moved", | 789 | "compact_free_scanned", |
780 | "compact_pagemigrate_failed", | 790 | "compact_isolated", |
781 | "compact_stall", | 791 | "compact_stall", |
782 | "compact_fail", | 792 | "compact_fail", |
783 | "compact_success", | 793 | "compact_success", |