author	Mel Gorman <mgorman@suse.de>	2012-11-19 07:35:47 -0500
committer	Mel Gorman <mgorman@suse.de>	2012-12-11 09:42:57 -0500
commit	b32967ff101a7508f70be8de59b278d4df92fa00 (patch)
tree	b106d5eea06f97d0174f483d6a05a8b7ddd64154 /mm
parent	5bca23035391928c4c7301835accca3551b96cc2 (diff)
mm: numa: Add THP migration for the NUMA working set scanning fault case.
Note: This is very heavily based on a patch from Peter Zijlstra with
      fixes from Ingo Molnar, Hugh Dickins and Johannes Weiner. That
      patch put a lot of migration logic into mm/huge_memory.c where it
      does not belong. This version tries to share some of the migration
      logic with migrate_misplaced_page. However, it should be noted
      that now migrate.c is doing more with the pagetable manipulation
      than is preferred. The end result is barely recognisable so as
      before, the signed-offs had to be removed but will be re-added if
      the original authors are ok with it.

Add THP migration for the NUMA working set scanning fault case. It
uses the page lock to serialize. No migration pte dance is necessary
because the pte is already unmapped when we decide to migrate.

[dhillf@gmail.com: Fix memory leak on isolation failure]
[dhillf@gmail.com: Fix transfer of last_nid information]
Signed-off-by: Mel Gorman <mgorman@suse.de>
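For orientation before the diff: a condensed sketch of the serialisation
described above, as it appears in the reworked do_huge_pmd_numa_page()
below (illustrative only; statistics and some error paths are elided):

	spin_lock(&mm->page_table_lock);
	page = pmd_page(pmd);
	get_page(page);			/* pin the huge page */
	target_nid = mpol_misplaced(page, vma, haddr);
	spin_unlock(&mm->page_table_lock);

	lock_page(page);		/* page lock serialises THP migrations */
	spin_lock(&mm->page_table_lock);
	if (!pmd_same(pmd, *pmdp)) {
		/* PMD changed under us: unlock_page(), put_page(), bail */
	}
	spin_unlock(&mm->page_table_lock);

	/*
	 * No migration-pte dance is needed: the NUMA pmd is already
	 * not present, so the THP migrates directly and the fault is
	 * simply retried afterwards.
	 */
	migrated = migrate_misplaced_transhuge_page(mm, vma, pmdp, pmd,
						    addr, page, target_nid);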
Diffstat (limited to 'mm')
-rw-r--r--	mm/huge_memory.c	59
-rw-r--r--	mm/internal.h	7
-rw-r--r--	mm/memcontrol.c	7
-rw-r--r--	mm/migrate.c	231
4 files changed, 240 insertions(+), 64 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 199b261a257e..711baf84b153 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -600,7 +600,7 @@ out:
 }
 __setup("transparent_hugepage=", setup_transparent_hugepage);
 
-static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 {
 	if (likely(vma->vm_flags & VM_WRITE))
 		pmd = pmd_mkwrite(pmd);
@@ -1023,10 +1023,12 @@ out:
 int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 				unsigned long addr, pmd_t pmd, pmd_t *pmdp)
 {
-	struct page *page = NULL;
+	struct page *page;
 	unsigned long haddr = addr & HPAGE_PMD_MASK;
 	int target_nid;
 	int current_nid = -1;
+	bool migrated;
+	bool page_locked = false;
 
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_same(pmd, *pmdp)))
@@ -1034,42 +1036,61 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	page = pmd_page(pmd);
 	get_page(page);
-	spin_unlock(&mm->page_table_lock);
 	current_nid = page_to_nid(page);
 	count_vm_numa_event(NUMA_HINT_FAULTS);
 	if (current_nid == numa_node_id())
 		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
 
 	target_nid = mpol_misplaced(page, vma, haddr);
-	if (target_nid == -1)
+	if (target_nid == -1) {
+		put_page(page);
 		goto clear_pmdnuma;
+	}
 
-	/*
-	 * Due to lacking code to migrate thp pages, we'll split
-	 * (which preserves the special PROT_NONE) and re-take the
-	 * fault on the normal pages.
-	 */
-	split_huge_page(page);
-	put_page(page);
-
-	return 0;
+	/* Acquire the page lock to serialise THP migrations */
+	spin_unlock(&mm->page_table_lock);
+	lock_page(page);
+	page_locked = true;
 
-clear_pmdnuma:
+	/* Confirm the PMD did not change while locked */
 	spin_lock(&mm->page_table_lock);
-	if (unlikely(!pmd_same(pmd, *pmdp)))
+	if (unlikely(!pmd_same(pmd, *pmdp))) {
+		unlock_page(page);
+		put_page(page);
 		goto out_unlock;
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	/* Migrate the THP to the requested node */
+	migrated = migrate_misplaced_transhuge_page(mm, vma,
+				pmdp, pmd, addr,
+				page, target_nid);
+	if (migrated)
+		current_nid = target_nid;
+	else {
+		spin_lock(&mm->page_table_lock);
+		if (unlikely(!pmd_same(pmd, *pmdp))) {
+			unlock_page(page);
+			goto out_unlock;
+		}
+		goto clear_pmdnuma;
+	}
+
+	task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+	return 0;
 
+clear_pmdnuma:
 	pmd = pmd_mknonnuma(pmd);
 	set_pmd_at(mm, haddr, pmdp, pmd);
 	VM_BUG_ON(pmd_numa(*pmdp));
 	update_mmu_cache_pmd(vma, addr, pmdp);
+	if (page_locked)
+		unlock_page(page);
 
 out_unlock:
 	spin_unlock(&mm->page_table_lock);
-	if (page) {
-		put_page(page);
-		task_numa_fault(numa_node_id(), HPAGE_PMD_NR, false);
-	}
+	if (current_nid != -1)
+		task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
 	return 0;
 }
 
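A behavioural note on the hunk above: previously a NUMA hinting fault on
a THP called split_huge_page() and refaulted on the base pages; now the
huge page migrates whole, and on failure the code falls through to
clear_pmdnuma so the task does not immediately refault on the same huge
page. In essence (mirroring the lines above):

	pmd = pmd_mknonnuma(pmd);		/* drop the NUMA hint */
	set_pmd_at(mm, haddr, pmdp, pmd);	/* THP stays where it is */
	update_mmu_cache_pmd(vma, addr, pmdp);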
diff --git a/mm/internal.h b/mm/internal.h
index a4fa284f6bc2..7e60ac826f2b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -212,15 +212,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
 {
 	if (TestClearPageMlocked(page)) {
 		unsigned long flags;
+		int nr_pages = hpage_nr_pages(page);
 
 		local_irq_save(flags);
-		__dec_zone_page_state(page, NR_MLOCK);
+		__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
 		SetPageMlocked(newpage);
-		__inc_zone_page_state(newpage, NR_MLOCK);
+		__mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
 		local_irq_restore(flags);
 	}
 }
 
+extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern unsigned long vma_address(struct page *page,
 				struct vm_area_struct *vma);
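The mlock accounting above becomes THP-aware by scaling NR_MLOCK by
hpage_nr_pages() rather than assuming a single page. For context, that
helper is roughly the following at this point in the tree (a sketch of
include/linux/huge_mm.h, not part of this patch):

	static inline int hpage_nr_pages(struct page *page)
	{
		if (unlikely(PageTransHuge(page)))
			return HPAGE_PMD_NR;	/* 512 with 4KiB pages on x86-64 */
		return 1;
	}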
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dd39ba000b31..d97af9636ab2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3288,15 +3288,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
 				  struct mem_cgroup **memcgp)
 {
 	struct mem_cgroup *memcg = NULL;
+	unsigned int nr_pages = 1;
 	struct page_cgroup *pc;
 	enum charge_type ctype;
 
 	*memcgp = NULL;
 
-	VM_BUG_ON(PageTransHuge(page));
 	if (mem_cgroup_disabled())
 		return;
 
+	if (PageTransHuge(page))
+		nr_pages <<= compound_order(page);
+
 	pc = lookup_page_cgroup(page);
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
@@ -3358,7 +3361,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
 	 * charged to the res_counter since we plan on replacing the
 	 * old one and only one page is going to be left afterwards.
 	 */
-	__mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
+	__mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
 }
 
 /* remove redundant charge if migration failed*/
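To make the nr_pages arithmetic above concrete: for a 2MiB THP on x86-64,
compound_order(page) is HPAGE_PMD_ORDER == 9, so nr_pages becomes
1 << 9 == 512 and __mem_cgroup_commit_charge() transfers the charge for
all 512 base pages in one call; for a normal page nr_pages stays 1 and
behaviour is unchanged.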
diff --git a/mm/migrate.c b/mm/migrate.c
index 2a5ce135eef0..c9400960fd52 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -410,7 +410,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
  */
 void migrate_page_copy(struct page *newpage, struct page *page)
 {
-	if (PageHuge(page))
+	if (PageHuge(page) || PageTransHuge(page))
 		copy_huge_page(newpage, page);
 	else
 		copy_highpage(newpage, page);
@@ -1491,25 +1491,10 @@ bool migrate_ratelimited(int node)
 	return true;
 }
 
-/*
- * Attempt to migrate a misplaced page to the specified destination
- * node. Caller is expected to have an elevated reference count on
- * the page that will be dropped by this function before returning.
- */
-int migrate_misplaced_page(struct page *page, int node)
+/* Returns true if the node is migrate rate-limited after the update */
+bool numamigrate_update_ratelimit(pg_data_t *pgdat)
 {
-	pg_data_t *pgdat = NODE_DATA(node);
-	int isolated = 0;
-	LIST_HEAD(migratepages);
-
-	/*
-	 * Don't migrate pages that are mapped in multiple processes.
-	 * TODO: Handle false sharing detection instead of this hammer
-	 */
-	if (page_mapcount(page) != 1) {
-		put_page(page);
-		goto out;
-	}
+	bool rate_limited = false;
 
 	/*
 	 * Rate-limit the amount of data that is being migrated to a node.
@@ -1522,13 +1507,18 @@ int migrate_misplaced_page(struct page *page, int node)
 		pgdat->numabalancing_migrate_next_window = jiffies +
 			msecs_to_jiffies(migrate_interval_millisecs);
 	}
-	if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
-		spin_unlock(&pgdat->numabalancing_migrate_lock);
-		put_page(page);
-		goto out;
-	}
-	pgdat->numabalancing_migrate_nr_pages++;
+	if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
+		rate_limited = true;
+	else
+		pgdat->numabalancing_migrate_nr_pages++;
 	spin_unlock(&pgdat->numabalancing_migrate_lock);
+
+	return rate_limited;
+}
+
+int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+{
+	int ret = 0;
 
 	/* Avoid migrating to a node that is nearly full */
 	if (migrate_balanced_pgdat(pgdat, 1)) {
@@ -1536,13 +1526,18 @@ int migrate_misplaced_page(struct page *page, int node)
 
 	if (isolate_lru_page(page)) {
 		put_page(page);
-		goto out;
+		return 0;
 	}
-	isolated = 1;
 
+	/* Page is isolated */
+	ret = 1;
 	page_lru = page_is_file_cache(page);
-	inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
-	list_add(&page->lru, &migratepages);
+	if (!PageTransHuge(page))
+		inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
+	else
+		mod_zone_page_state(page_zone(page),
+				NR_ISOLATED_ANON + page_lru,
+				HPAGE_PMD_NR);
 }
 
 /*
@@ -1555,23 +1550,177 @@ int migrate_misplaced_page(struct page *page, int node)
 	 */
 	put_page(page);
 
-	if (isolated) {
-		int nr_remaining;
-
-		nr_remaining = migrate_pages(&migratepages,
-					     alloc_misplaced_dst_page,
-					     node, false, MIGRATE_ASYNC,
-					     MR_NUMA_MISPLACED);
-		if (nr_remaining) {
-			putback_lru_pages(&migratepages);
-			isolated = 0;
-		} else
-			count_vm_numa_event(NUMA_PAGE_MIGRATE);
+	return ret;
+}
+
+/*
+ * Attempt to migrate a misplaced page to the specified destination
+ * node. Caller is expected to have an elevated reference count on
+ * the page that will be dropped by this function before returning.
+ */
+int migrate_misplaced_page(struct page *page, int node)
+{
+	pg_data_t *pgdat = NODE_DATA(node);
+	int isolated = 0;
+	int nr_remaining;
+	LIST_HEAD(migratepages);
+
+	/*
+	 * Don't migrate pages that are mapped in multiple processes.
+	 * TODO: Handle false sharing detection instead of this hammer
+	 */
+	if (page_mapcount(page) != 1) {
+		put_page(page);
+		goto out;
 	}
+
+	/*
+	 * Rate-limit the amount of data that is being migrated to a node.
+	 * Optimal placement is no good if the memory bus is saturated and
+	 * all the time is being spent migrating!
+	 */
+	if (numamigrate_update_ratelimit(pgdat)) {
+		put_page(page);
+		goto out;
+	}
+
+	isolated = numamigrate_isolate_page(pgdat, page);
+	if (!isolated)
+		goto out;
+
+	list_add(&page->lru, &migratepages);
+	nr_remaining = migrate_pages(&migratepages,
+				     alloc_misplaced_dst_page,
+				     node, false, MIGRATE_ASYNC,
+				     MR_NUMA_MISPLACED);
+	if (nr_remaining) {
+		putback_lru_pages(&migratepages);
+		isolated = 0;
+	} else
+		count_vm_numa_event(NUMA_PAGE_MIGRATE);
 	BUG_ON(!list_empty(&migratepages));
 out:
 	return isolated;
 }
+
+int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+				struct vm_area_struct *vma,
+				pmd_t *pmd, pmd_t entry,
+				unsigned long address,
+				struct page *page, int node)
+{
+	unsigned long haddr = address & HPAGE_PMD_MASK;
+	pg_data_t *pgdat = NODE_DATA(node);
+	int isolated = 0;
+	struct page *new_page = NULL;
+	struct mem_cgroup *memcg = NULL;
+	int page_lru = page_is_file_cache(page);
+
+	/*
+	 * Don't migrate pages that are mapped in multiple processes.
+	 * TODO: Handle false sharing detection instead of this hammer
+	 */
+	if (page_mapcount(page) != 1)
+		goto out_dropref;
+
+	/*
+	 * Rate-limit the amount of data that is being migrated to a node.
+	 * Optimal placement is no good if the memory bus is saturated and
+	 * all the time is being spent migrating!
+	 */
+	if (numamigrate_update_ratelimit(pgdat))
+		goto out_dropref;
+
+	new_page = alloc_pages_node(node,
+		(GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
+	if (!new_page)
+		goto out_dropref;
+	page_xchg_last_nid(new_page, page_last_nid(page));
+
+	isolated = numamigrate_isolate_page(pgdat, page);
+	if (!isolated) {
+		put_page(new_page);
+		goto out_keep_locked;
+	}
+
+	/* Prepare a page as a migration target */
+	__set_page_locked(new_page);
+	SetPageSwapBacked(new_page);
+
+	/* anon mapping, we can simply copy page->mapping to the new page: */
+	new_page->mapping = page->mapping;
+	new_page->index = page->index;
+	migrate_page_copy(new_page, page);
+	WARN_ON(PageLRU(new_page));
+
+	/* Recheck the target PMD */
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(*pmd, entry))) {
+		spin_unlock(&mm->page_table_lock);
+
+		/* Reverse changes made by migrate_page_copy() */
+		if (TestClearPageActive(new_page))
+			SetPageActive(page);
+		if (TestClearPageUnevictable(new_page))
+			SetPageUnevictable(page);
+		mlock_migrate_page(page, new_page);
+
+		unlock_page(new_page);
+		put_page(new_page);		/* Free it */
+
+		unlock_page(page);
+		putback_lru_page(page);
+
+		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+		goto out;
+	}
+
+	/*
+	 * Traditional migration needs to prepare the memcg charge
+	 * transaction early to prevent the old page from being
+	 * uncharged when installing migration entries. Here we can
+	 * save the potential rollback and start the charge transfer
+	 * only when migration is already known to end successfully.
+	 */
+	mem_cgroup_prepare_migration(page, new_page, &memcg);
+
+	entry = mk_pmd(new_page, vma->vm_page_prot);
+	entry = pmd_mknonnuma(entry);
+	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+	entry = pmd_mkhuge(entry);
+
+	page_add_new_anon_rmap(new_page, vma, haddr);
+
+	set_pmd_at(mm, haddr, pmd, entry);
+	update_mmu_cache_pmd(vma, address, entry);
+	page_remove_rmap(page);
+	/*
+	 * Finish the charge transaction under the page table lock to
+	 * prevent split_huge_page() from dividing up the charge
+	 * before it's fully transferred to the new page.
+	 */
+	mem_cgroup_end_migration(memcg, page, new_page, true);
+	spin_unlock(&mm->page_table_lock);
+
+	unlock_page(new_page);
+	unlock_page(page);
+	put_page(page);			/* Drop the rmap reference */
+	put_page(page);			/* Drop the LRU isolation reference */
+
+	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
+	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
+
+out:
+	mod_zone_page_state(page_zone(page),
+			NR_ISOLATED_ANON + page_lru,
+			-HPAGE_PMD_NR);
+	return isolated;
+
+out_dropref:
+	put_page(page);
+out_keep_locked:
+	return 0;
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #endif /* CONFIG_NUMA */
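Both fault paths now funnel through the same admission checks before any
copying happens. A hypothetical condensation of that shared shape
(numamigrate_admit() is not in the patch; it only summarises the call
sequence visible above):

	static bool numamigrate_admit(pg_data_t *pgdat, struct page *page)
	{
		/* Skip pages mapped in multiple processes (false-sharing hammer) */
		if (page_mapcount(page) != 1)
			return false;
		/* Enforce the per-node migration budget per time window */
		if (numamigrate_update_ratelimit(pgdat))
			return false;
		/* Reject nearly-full nodes and isolate the page from the LRU */
		return numamigrate_isolate_page(pgdat, page) != 0;
	}

The two callers differ mainly in what happens afterwards: the base-page
path hands the isolated page to migrate_pages(), while the THP path
allocates a huge destination page, copies, and swaps the PMD in place
under page_table_lock. On either THP failure label the huge page is
returned still locked, and do_huge_pmd_numa_page() unlocks it via its
page_locked bookkeeping.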