Diffstat (limited to 'mm/migrate.c')
-rw-r--r--   mm/migrate.c   337
1 file changed, 331 insertions(+), 6 deletions(-)
diff --git a/mm/migrate.c b/mm/migrate.c
index cae02711181d..32efd8028bc9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -39,6 +39,9 @@
 
 #include <asm/tlbflush.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/migrate.h>
+
 #include "internal.h"
 
 /*
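Defining CREATE_TRACE_POINTS before including the trace header, in exactly one compilation unit, is the standard pattern that turns the event declarations in <trace/events/migrate.h> into definitions. As a rough sketch of the shape such a header would have for the trace_mm_migrate_pages() call added below (the field list and printk format here are an assumption, not the verbatim header):

TRACE_EVENT(mm_migrate_pages,

	TP_PROTO(unsigned long succeeded, unsigned long failed,
		 enum migrate_mode mode, int reason),

	TP_ARGS(succeeded, failed, mode, reason),

	TP_STRUCT__entry(
		__field(unsigned long, succeeded)
		__field(unsigned long, failed)
		__field(enum migrate_mode, mode)
		__field(int, reason)
	),

	TP_fast_assign(
		__entry->succeeded = succeeded;
		__entry->failed = failed;
		__entry->mode = mode;
		__entry->reason = reason;
	),

	TP_printk("nr_succeeded=%lu nr_failed=%lu mode=%d reason=%d",
		__entry->succeeded, __entry->failed,
		__entry->mode, __entry->reason)
);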
@@ -293,7 +296,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page,
 		struct buffer_head *head, enum migrate_mode mode)
 {
-	int expected_count;
+	int expected_count = 0;
 	void **pslot;
 
 	if (!mapping) {
@@ -421,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
  */
 void migrate_page_copy(struct page *newpage, struct page *page)
 {
-	if (PageHuge(page))
+	if (PageHuge(page) || PageTransHuge(page))
 		copy_huge_page(newpage, page);
 	else
 		copy_highpage(newpage, page);
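copy_highpage() only copies one base page, so transparent huge pages must now take the copy_huge_page() path, which walks the compound page subpage by subpage. A minimal userspace model of that loop (page sizes assumed for x86-64 with 2MB THP; the real kernel code additionally kmaps each subpage):

#include <string.h>

#define MODEL_PAGE_SIZE 4096    /* assumed base page size */
#define MODEL_HPAGE_NR  512     /* assumed 2MB huge page / 4KB base pages */

/* Model of copying a compound page one subpage at a time. */
static void copy_huge_page_model(unsigned char *dst, const unsigned char *src)
{
	int i;

	for (i = 0; i < MODEL_HPAGE_NR; i++)
		memcpy(dst + (size_t)i * MODEL_PAGE_SIZE,
		       src + (size_t)i * MODEL_PAGE_SIZE,
		       MODEL_PAGE_SIZE);
}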
@@ -765,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 	 */
 	if (PageAnon(page)) {
 		/*
-		 * Only page_lock_anon_vma() understands the subtleties of
+		 * Only page_lock_anon_vma_read() understands the subtleties of
 		 * getting a hold on an anon_vma from outside one of its mms.
 		 */
 		anon_vma = page_get_anon_vma(page);
@@ -998,10 +1001,11 @@ out:
  */
 int migrate_pages(struct list_head *from,
 		new_page_t get_new_page, unsigned long private, bool offlining,
-		enum migrate_mode mode)
+		enum migrate_mode mode, int reason)
 {
 	int retry = 1;
 	int nr_failed = 0;
+	int nr_succeeded = 0;
 	int pass = 0;
 	struct page *page;
 	struct page *page2;
@@ -1028,6 +1032,7 @@ int migrate_pages(struct list_head *from,
 			retry++;
 			break;
 		case MIGRATEPAGE_SUCCESS:
+			nr_succeeded++;
 			break;
 		default:
 			/* Permanent failure */
@@ -1038,6 +1043,12 @@
 	}
 	rc = nr_failed + retry;
 out:
+	if (nr_succeeded)
+		count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
+	if (nr_failed)
+		count_vm_events(PGMIGRATE_FAIL, nr_failed);
+	trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
+
 	if (!swapwrite)
 		current->flags &= ~PF_SWAPWRITE;
 
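The two new counters should surface in /proc/vmstat as pgmigrate_success and pgmigrate_fail, so the effect of this hunk is observable from userspace. A small reader, assuming only those two vmstat field names:

#include <stdio.h>
#include <string.h>

/* Print the pgmigrate_* counters introduced by this patch. */
int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "pgmigrate_", 10) == 0)
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}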
@@ -1176,7 +1187,8 @@ set_status:
 	err = 0;
 	if (!list_empty(&pagelist)) {
 		err = migrate_pages(&pagelist, new_page_node,
-				(unsigned long)pm, 0, MIGRATE_SYNC);
+				(unsigned long)pm, 0, MIGRATE_SYNC,
+				MR_SYSCALL);
 		if (err)
 			putback_lru_pages(&pagelist);
 	}
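MR_SYSCALL marks migrations requested from userspace; this call site is reached via the move_pages(2) system call. A minimal userspace exercise of that path (a sketch assuming libnuma's <numaif.h> wrapper, link with -lnuma, and that node 0 exists and is allowed):

#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	void *buf = aligned_alloc(page_size, page_size);
	void *pages[1] = { buf };
	int nodes[1] = { 0 };	/* destination node (assumed to exist) */
	int status[1];

	memset(buf, 0, page_size);	/* fault the page in first */

	/* pid 0 = self; this reaches the MR_SYSCALL migrate_pages() path */
	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) != 0) {
		perror("move_pages");
		return 1;
	}
	printf("page now on node %d\n", status[0]);
	free(buf);
	return 0;
}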
@@ -1440,4 +1452,317 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
 	}
 	return err;
 }
-#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Returns true if this is a safe migration target node for misplaced NUMA
+ * pages. Currently it only checks the watermarks, which is crude.
+ */
+static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
+				   int nr_migrate_pages)
+{
+	int z;
+	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+		struct zone *zone = pgdat->node_zones + z;
+
+		if (!populated_zone(zone))
+			continue;
+
+		if (zone->all_unreclaimable)
+			continue;
+
+		/* Avoid waking kswapd by allocating pages_to_migrate pages. */
+		if (!zone_watermark_ok(zone, 0,
+				       high_wmark_pages(zone) +
+				       nr_migrate_pages,
+				       0, 0))
+			continue;
+		return true;
+	}
+	return false;
+}
+
+static struct page *alloc_misplaced_dst_page(struct page *page,
+					   unsigned long data,
+					   int **result)
+{
+	int nid = (int) data;
+	struct page *newpage;
+
+	newpage = alloc_pages_exact_node(nid,
+					 (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
+					  __GFP_NOMEMALLOC | __GFP_NORETRY |
+					  __GFP_NOWARN) &
+					 ~GFP_IOFS, 0);
+	if (newpage)
+		page_xchg_last_nid(newpage, page_last_nid(page));
+
+	return newpage;
+}
+
+/*
+ * page migration rate limiting control.
+ * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
+ * window of time. Default here says do not migrate more than 1280M per second.
+ * If a node is rate-limited then PTE NUMA updates are also rate-limited.
+ * However, as it is faults that reset the window, pte updates will happen
+ * unconditionally if there has not been a fault since
+ * @pteupdate_interval_millisecs after the throttle window closed.
+ */
+static unsigned int migrate_interval_millisecs __read_mostly = 100;
+static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
+static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
+
+/* Returns true if NUMA migration is currently rate limited */
+bool migrate_ratelimited(int node)
+{
+	pg_data_t *pgdat = NODE_DATA(node);
+
+	if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
+				msecs_to_jiffies(pteupdate_interval_millisecs)))
+		return false;
+
+	if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
+		return false;
+
+	return true;
+}
+
+/* Returns true if the node is migrate rate-limited after the update */
+bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
+{
+	bool rate_limited = false;
+
+	/*
+	 * Rate-limit the amount of data that is being migrated to a node.
+	 * Optimal placement is no good if the memory bus is saturated and
+	 * all the time is being spent migrating!
+	 */
+	spin_lock(&pgdat->numabalancing_migrate_lock);
+	if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
+		pgdat->numabalancing_migrate_nr_pages = 0;
+		pgdat->numabalancing_migrate_next_window = jiffies +
+			msecs_to_jiffies(migrate_interval_millisecs);
+	}
+	if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
+		rate_limited = true;
+	else
+		pgdat->numabalancing_migrate_nr_pages += nr_pages;
+	spin_unlock(&pgdat->numabalancing_migrate_lock);
+
+	return rate_limited;
+}
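The window logic above is a simple fixed-window throttle: the first update after the window expires zeroes the counter and opens a new window, and updates then accumulate until ratelimit_pages is crossed. With the defaults (128MB of pages per 100ms window), that gives the 1280M/s ceiling the comment mentions. A self-contained model of the same logic, with jiffies replaced by a caller-supplied millisecond clock (names here are illustrative, not kernel API):

#include <stdbool.h>

struct ratelimit_window {
	unsigned long next_window_ms;	/* when the current window closes */
	unsigned long nr_pages;		/* pages migrated in this window */
};

#define WINDOW_LEN_MS	100UL			/* migrate_interval_millisecs */
#define WINDOW_PAGES	(128UL << (20 - 12))	/* 128MB in 4KB pages */

/* Returns true if the node would be rate-limited after counting nr_pages. */
static bool update_ratelimit(struct ratelimit_window *w,
			     unsigned long now_ms, unsigned long nr_pages)
{
	if (now_ms > w->next_window_ms) {
		w->nr_pages = 0;
		w->next_window_ms = now_ms + WINDOW_LEN_MS;
	}
	if (w->nr_pages > WINDOW_PAGES)
		return true;
	w->nr_pages += nr_pages;
	return false;
}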
+
+int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+{
+	int ret = 0;
+
+	/* Avoid migrating to a node that is nearly full */
+	if (migrate_balanced_pgdat(pgdat, 1)) {
+		int page_lru;
+
+		if (isolate_lru_page(page)) {
+			put_page(page);
+			return 0;
+		}
+
+		/* Page is isolated */
+		ret = 1;
+		page_lru = page_is_file_cache(page);
+		if (!PageTransHuge(page))
+			inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
+		else
+			mod_zone_page_state(page_zone(page),
+					NR_ISOLATED_ANON + page_lru,
+					HPAGE_PMD_NR);
+	}
+
+	/*
+	 * Page is either isolated or there is not enough space on the target
+	 * node. If isolated, then it has taken a reference count and the
+	 * caller's reference can be safely dropped without the page
+	 * disappearing underneath us during migration. Otherwise the page is
+	 * not to be migrated but the caller's reference should still be
+	 * dropped so it does not leak.
+	 */
+	put_page(page);
+
+	return ret;
+}
+
+/*
+ * Attempt to migrate a misplaced page to the specified destination
+ * node. Caller is expected to have an elevated reference count on
+ * the page that will be dropped by this function before returning.
+ */
+int migrate_misplaced_page(struct page *page, int node)
+{
+	pg_data_t *pgdat = NODE_DATA(node);
+	int isolated = 0;
+	int nr_remaining;
+	LIST_HEAD(migratepages);
+
+	/*
+	 * Don't migrate pages that are mapped in multiple processes.
+	 * TODO: Handle false sharing detection instead of this hammer
+	 */
+	if (page_mapcount(page) != 1) {
+		put_page(page);
+		goto out;
+	}
+
+	/*
+	 * Rate-limit the amount of data that is being migrated to a node.
+	 * Optimal placement is no good if the memory bus is saturated and
+	 * all the time is being spent migrating!
+	 */
+	if (numamigrate_update_ratelimit(pgdat, 1)) {
+		put_page(page);
+		goto out;
+	}
+
+	isolated = numamigrate_isolate_page(pgdat, page);
+	if (!isolated)
+		goto out;
+
+	list_add(&page->lru, &migratepages);
+	nr_remaining = migrate_pages(&migratepages,
+			alloc_misplaced_dst_page,
+			node, false, MIGRATE_ASYNC,
+			MR_NUMA_MISPLACED);
+	if (nr_remaining) {
+		putback_lru_pages(&migratepages);
+		isolated = 0;
+	} else
+		count_vm_numa_event(NUMA_PAGE_MIGRATE);
+	BUG_ON(!list_empty(&migratepages));
+out:
+	return isolated;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+				struct vm_area_struct *vma,
+				pmd_t *pmd, pmd_t entry,
+				unsigned long address,
+				struct page *page, int node)
+{
+	unsigned long haddr = address & HPAGE_PMD_MASK;
+	pg_data_t *pgdat = NODE_DATA(node);
+	int isolated = 0;
+	struct page *new_page = NULL;
+	struct mem_cgroup *memcg = NULL;
+	int page_lru = page_is_file_cache(page);
+
+	/*
+	 * Don't migrate pages that are mapped in multiple processes.
+	 * TODO: Handle false sharing detection instead of this hammer
+	 */
+	if (page_mapcount(page) != 1)
+		goto out_dropref;
+
+	/*
+	 * Rate-limit the amount of data that is being migrated to a node.
+	 * Optimal placement is no good if the memory bus is saturated and
+	 * all the time is being spent migrating!
+	 */
+	if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
+		goto out_dropref;
+
+	new_page = alloc_pages_node(node,
+		(GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
+	if (!new_page) {
+		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+		goto out_dropref;
+	}
+	page_xchg_last_nid(new_page, page_last_nid(page));
+
+	isolated = numamigrate_isolate_page(pgdat, page);
+	if (!isolated) {
+		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+		put_page(new_page);
+		goto out_keep_locked;
+	}
+
+	/* Prepare a page as a migration target */
+	__set_page_locked(new_page);
+	SetPageSwapBacked(new_page);
+
+	/* anon mapping, we can simply copy page->mapping to the new page: */
+	new_page->mapping = page->mapping;
+	new_page->index = page->index;
+	migrate_page_copy(new_page, page);
+	WARN_ON(PageLRU(new_page));
+
+	/* Recheck the target PMD */
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(*pmd, entry))) {
+		spin_unlock(&mm->page_table_lock);
+
+		/* Reverse changes made by migrate_page_copy() */
+		if (TestClearPageActive(new_page))
+			SetPageActive(page);
+		if (TestClearPageUnevictable(new_page))
+			SetPageUnevictable(page);
+		mlock_migrate_page(page, new_page);
+
+		unlock_page(new_page);
+		put_page(new_page);		/* Free it */
+
+		unlock_page(page);
+		putback_lru_page(page);
+
+		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+		goto out;
+	}
+
+	/*
+	 * Traditional migration needs to prepare the memcg charge
+	 * transaction early to prevent the old page from being
+	 * uncharged when installing migration entries. Here we can
+	 * save the potential rollback and start the charge transfer
+	 * only when migration is already known to end successfully.
+	 */
+	mem_cgroup_prepare_migration(page, new_page, &memcg);
+
+	entry = mk_pmd(new_page, vma->vm_page_prot);
+	entry = pmd_mknonnuma(entry);
+	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+	entry = pmd_mkhuge(entry);
+
+	page_add_new_anon_rmap(new_page, vma, haddr);
+
+	set_pmd_at(mm, haddr, pmd, entry);
+	update_mmu_cache_pmd(vma, address, entry);
+	page_remove_rmap(page);
+	/*
+	 * Finish the charge transaction under the page table lock to
+	 * prevent split_huge_page() from dividing up the charge
+	 * before it's fully transferred to the new page.
+	 */
+	mem_cgroup_end_migration(memcg, page, new_page, true);
+	spin_unlock(&mm->page_table_lock);
+
+	unlock_page(new_page);
+	unlock_page(page);
+	put_page(page);			/* Drop the rmap reference */
+	put_page(page);			/* Drop the LRU isolation reference */
+
+	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
+	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
+
+out:
+	mod_zone_page_state(page_zone(page),
+			NR_ISOLATED_ANON + page_lru,
+			-HPAGE_PMD_NR);
+	return isolated;
+
+out_dropref:
+	put_page(page);
+out_keep_locked:
+	return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#endif /* CONFIG_NUMA */
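For reference, the MR_SYSCALL and MR_NUMA_MISPLACED values used above come from the migrate_reason codes this series adds so the tracepoint can attribute migrations to their trigger. A sketch of that enum as recalled from include/linux/migrate.h; exact membership and ordering should be treated as an assumption:

/* Hypothetical reconstruction of enum migrate_reason (assumption: matches
 * the include/linux/migrate.h definition added alongside this patch). */
enum migrate_reason {
	MR_COMPACTION,
	MR_MEMORY_FAILURE,
	MR_MEMORY_HOTPLUG,
	MR_SYSCALL,		/* move_pages() and similar userspace requests */
	MR_MEMPOLICY_MBIND,
	MR_NUMA_MISPLACED,
	MR_CMA
};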