diff options
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 335 |
1 files changed, 161 insertions, 174 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index 64e438898832..885207a6b6b7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -55,6 +55,11 @@ struct scan_control { | |||
55 | /* Number of pages freed so far during a call to shrink_zones() */ | 55 | /* Number of pages freed so far during a call to shrink_zones() */ |
56 | unsigned long nr_reclaimed; | 56 | unsigned long nr_reclaimed; |
57 | 57 | ||
58 | /* How many pages shrink_list() should reclaim */ | ||
59 | unsigned long nr_to_reclaim; | ||
60 | |||
61 | unsigned long hibernation_mode; | ||
62 | |||
58 | /* This context's GFP mask */ | 63 | /* This context's GFP mask */ |
59 | gfp_t gfp_mask; | 64 | gfp_t gfp_mask; |
60 | 65 | ||
@@ -66,12 +71,6 @@ struct scan_control { | |||
66 | /* Can pages be swapped as part of reclaim? */ | 71 | /* Can pages be swapped as part of reclaim? */ |
67 | int may_swap; | 72 | int may_swap; |
68 | 73 | ||
69 | /* This context's SWAP_CLUSTER_MAX. If freeing memory for | ||
70 | * suspend, we effectively ignore SWAP_CLUSTER_MAX. | ||
71 | * In this context, it doesn't matter that we scan the | ||
72 | * whole list at once. */ | ||
73 | int swap_cluster_max; | ||
74 | |||
75 | int swappiness; | 74 | int swappiness; |
76 | 75 | ||
77 | int all_unreclaimable; | 76 | int all_unreclaimable; |
@@ -358,7 +357,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
358 | * stalls if we need to run get_block(). We could test | 357 | * stalls if we need to run get_block(). We could test |
359 | * PagePrivate for that. | 358 | * PagePrivate for that. |
360 | * | 359 | * |
361 | * If this process is currently in generic_file_write() against | 360 | * If this process is currently in __generic_file_aio_write() against |
362 | * this page's queue, we can perform writeback even if that | 361 | * this page's queue, we can perform writeback even if that |
363 | * will block. | 362 | * will block. |
364 | * | 363 | * |
@@ -544,6 +543,16 @@ redo: | |||
544 | */ | 543 | */ |
545 | lru = LRU_UNEVICTABLE; | 544 | lru = LRU_UNEVICTABLE; |
546 | add_page_to_unevictable_list(page); | 545 | add_page_to_unevictable_list(page); |
546 | /* | ||
547 | * When racing with an mlock clearing (page is | ||
548 | * unlocked), make sure that if the other thread does | ||
549 | * not observe our setting of PG_lru and fails | ||
550 | * isolation, we see PG_mlocked cleared below and move | ||
551 | * the page back to the evictable list. | ||
552 | * | ||
553 | * The other side is TestClearPageMlocked(). | ||
554 | */ | ||
555 | smp_mb(); | ||
547 | } | 556 | } |
548 | 557 | ||
549 | /* | 558 | /* |
@@ -1088,7 +1097,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1088 | int lumpy_reclaim = 0; | 1097 | int lumpy_reclaim = 0; |
1089 | 1098 | ||
1090 | while (unlikely(too_many_isolated(zone, file, sc))) { | 1099 | while (unlikely(too_many_isolated(zone, file, sc))) { |
1091 | congestion_wait(WRITE, HZ/10); | 1100 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
1092 | 1101 | ||
1093 | /* We are about to die and free our memory. Return now. */ | 1102 | /* We are about to die and free our memory. Return now. */ |
1094 | if (fatal_signal_pending(current)) | 1103 | if (fatal_signal_pending(current)) |
@@ -1122,7 +1131,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1122 | unsigned long nr_anon; | 1131 | unsigned long nr_anon; |
1123 | unsigned long nr_file; | 1132 | unsigned long nr_file; |
1124 | 1133 | ||
1125 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, | 1134 | nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX, |
1126 | &page_list, &nr_scan, sc->order, mode, | 1135 | &page_list, &nr_scan, sc->order, mode, |
1127 | zone, sc->mem_cgroup, 0, file); | 1136 | zone, sc->mem_cgroup, 0, file); |
1128 | 1137 | ||
@@ -1156,10 +1165,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1156 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); | 1165 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); |
1157 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); | 1166 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); |
1158 | 1167 | ||
1159 | reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; | 1168 | reclaim_stat->recent_scanned[0] += nr_anon; |
1160 | reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; | 1169 | reclaim_stat->recent_scanned[1] += nr_file; |
1161 | reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE]; | ||
1162 | reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE]; | ||
1163 | 1170 | ||
1164 | spin_unlock_irq(&zone->lru_lock); | 1171 | spin_unlock_irq(&zone->lru_lock); |
1165 | 1172 | ||
@@ -1356,7 +1363,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1356 | * IO, plus JVM can create lots of anon VM_EXEC pages, | 1363 | * IO, plus JVM can create lots of anon VM_EXEC pages, |
1357 | * so we ignore them here. | 1364 | * so we ignore them here. |
1358 | */ | 1365 | */ |
1359 | if ((vm_flags & VM_EXEC) && !PageAnon(page)) { | 1366 | if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { |
1360 | list_add(&page->lru, &l_active); | 1367 | list_add(&page->lru, &l_active); |
1361 | continue; | 1368 | continue; |
1362 | } | 1369 | } |
@@ -1454,20 +1461,26 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) | |||
1454 | return low; | 1461 | return low; |
1455 | } | 1462 | } |
1456 | 1463 | ||
1464 | static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, | ||
1465 | int file) | ||
1466 | { | ||
1467 | if (file) | ||
1468 | return inactive_file_is_low(zone, sc); | ||
1469 | else | ||
1470 | return inactive_anon_is_low(zone, sc); | ||
1471 | } | ||
1472 | |||
1457 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | 1473 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, |
1458 | struct zone *zone, struct scan_control *sc, int priority) | 1474 | struct zone *zone, struct scan_control *sc, int priority) |
1459 | { | 1475 | { |
1460 | int file = is_file_lru(lru); | 1476 | int file = is_file_lru(lru); |
1461 | 1477 | ||
1462 | if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { | 1478 | if (is_active_lru(lru)) { |
1463 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | 1479 | if (inactive_list_is_low(zone, sc, file)) |
1480 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | ||
1464 | return 0; | 1481 | return 0; |
1465 | } | 1482 | } |
1466 | 1483 | ||
1467 | if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) { | ||
1468 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | ||
1469 | return 0; | ||
1470 | } | ||
1471 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); | 1484 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); |
1472 | } | 1485 | } |
1473 | 1486 | ||
@@ -1557,15 +1570,14 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1557 | * until we collected @swap_cluster_max pages to scan. | 1570 | * until we collected @swap_cluster_max pages to scan. |
1558 | */ | 1571 | */ |
1559 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, | 1572 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, |
1560 | unsigned long *nr_saved_scan, | 1573 | unsigned long *nr_saved_scan) |
1561 | unsigned long swap_cluster_max) | ||
1562 | { | 1574 | { |
1563 | unsigned long nr; | 1575 | unsigned long nr; |
1564 | 1576 | ||
1565 | *nr_saved_scan += nr_to_scan; | 1577 | *nr_saved_scan += nr_to_scan; |
1566 | nr = *nr_saved_scan; | 1578 | nr = *nr_saved_scan; |
1567 | 1579 | ||
1568 | if (nr >= swap_cluster_max) | 1580 | if (nr >= SWAP_CLUSTER_MAX) |
1569 | *nr_saved_scan = 0; | 1581 | *nr_saved_scan = 0; |
1570 | else | 1582 | else |
1571 | nr = 0; | 1583 | nr = 0; |
@@ -1584,7 +1596,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1584 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ | 1596 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ |
1585 | enum lru_list l; | 1597 | enum lru_list l; |
1586 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1598 | unsigned long nr_reclaimed = sc->nr_reclaimed; |
1587 | unsigned long swap_cluster_max = sc->swap_cluster_max; | 1599 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
1588 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1600 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1589 | int noswap = 0; | 1601 | int noswap = 0; |
1590 | 1602 | ||
@@ -1606,15 +1618,15 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1606 | scan = (scan * percent[file]) / 100; | 1618 | scan = (scan * percent[file]) / 100; |
1607 | } | 1619 | } |
1608 | nr[l] = nr_scan_try_batch(scan, | 1620 | nr[l] = nr_scan_try_batch(scan, |
1609 | &reclaim_stat->nr_saved_scan[l], | 1621 | &reclaim_stat->nr_saved_scan[l]); |
1610 | swap_cluster_max); | ||
1611 | } | 1622 | } |
1612 | 1623 | ||
1613 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1624 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
1614 | nr[LRU_INACTIVE_FILE]) { | 1625 | nr[LRU_INACTIVE_FILE]) { |
1615 | for_each_evictable_lru(l) { | 1626 | for_each_evictable_lru(l) { |
1616 | if (nr[l]) { | 1627 | if (nr[l]) { |
1617 | nr_to_scan = min(nr[l], swap_cluster_max); | 1628 | nr_to_scan = min_t(unsigned long, |
1629 | nr[l], SWAP_CLUSTER_MAX); | ||
1618 | nr[l] -= nr_to_scan; | 1630 | nr[l] -= nr_to_scan; |
1619 | 1631 | ||
1620 | nr_reclaimed += shrink_list(l, nr_to_scan, | 1632 | nr_reclaimed += shrink_list(l, nr_to_scan, |
@@ -1629,8 +1641,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1629 | * with multiple processes reclaiming pages, the total | 1641 | * with multiple processes reclaiming pages, the total |
1630 | * freeing target can get unreasonably large. | 1642 | * freeing target can get unreasonably large. |
1631 | */ | 1643 | */ |
1632 | if (nr_reclaimed > swap_cluster_max && | 1644 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) |
1633 | priority < DEF_PRIORITY && !current_is_kswapd()) | ||
1634 | break; | 1645 | break; |
1635 | } | 1646 | } |
1636 | 1647 | ||
@@ -1728,6 +1739,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1728 | struct zoneref *z; | 1739 | struct zoneref *z; |
1729 | struct zone *zone; | 1740 | struct zone *zone; |
1730 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); | 1741 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); |
1742 | unsigned long writeback_threshold; | ||
1731 | 1743 | ||
1732 | delayacct_freepages_start(); | 1744 | delayacct_freepages_start(); |
1733 | 1745 | ||
@@ -1763,7 +1775,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1763 | } | 1775 | } |
1764 | } | 1776 | } |
1765 | total_scanned += sc->nr_scanned; | 1777 | total_scanned += sc->nr_scanned; |
1766 | if (sc->nr_reclaimed >= sc->swap_cluster_max) { | 1778 | if (sc->nr_reclaimed >= sc->nr_to_reclaim) { |
1767 | ret = sc->nr_reclaimed; | 1779 | ret = sc->nr_reclaimed; |
1768 | goto out; | 1780 | goto out; |
1769 | } | 1781 | } |
@@ -1775,14 +1787,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1775 | * that's undesirable in laptop mode, where we *want* lumpy | 1787 | * that's undesirable in laptop mode, where we *want* lumpy |
1776 | * writeout. So in laptop mode, write out the whole world. | 1788 | * writeout. So in laptop mode, write out the whole world. |
1777 | */ | 1789 | */ |
1778 | if (total_scanned > sc->swap_cluster_max + | 1790 | writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; |
1779 | sc->swap_cluster_max / 2) { | 1791 | if (total_scanned > writeback_threshold) { |
1780 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); | 1792 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); |
1781 | sc->may_writepage = 1; | 1793 | sc->may_writepage = 1; |
1782 | } | 1794 | } |
1783 | 1795 | ||
1784 | /* Take a nap, wait for some writeback to complete */ | 1796 | /* Take a nap, wait for some writeback to complete */ |
1785 | if (sc->nr_scanned && priority < DEF_PRIORITY - 2) | 1797 | if (!sc->hibernation_mode && sc->nr_scanned && |
1798 | priority < DEF_PRIORITY - 2) | ||
1786 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1799 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
1787 | } | 1800 | } |
1788 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 1801 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
@@ -1821,7 +1834,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
1821 | struct scan_control sc = { | 1834 | struct scan_control sc = { |
1822 | .gfp_mask = gfp_mask, | 1835 | .gfp_mask = gfp_mask, |
1823 | .may_writepage = !laptop_mode, | 1836 | .may_writepage = !laptop_mode, |
1824 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1837 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
1825 | .may_unmap = 1, | 1838 | .may_unmap = 1, |
1826 | .may_swap = 1, | 1839 | .may_swap = 1, |
1827 | .swappiness = vm_swappiness, | 1840 | .swappiness = vm_swappiness, |
@@ -1845,7 +1858,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
1845 | .may_writepage = !laptop_mode, | 1858 | .may_writepage = !laptop_mode, |
1846 | .may_unmap = 1, | 1859 | .may_unmap = 1, |
1847 | .may_swap = !noswap, | 1860 | .may_swap = !noswap, |
1848 | .swap_cluster_max = SWAP_CLUSTER_MAX, | ||
1849 | .swappiness = swappiness, | 1861 | .swappiness = swappiness, |
1850 | .order = 0, | 1862 | .order = 0, |
1851 | .mem_cgroup = mem, | 1863 | .mem_cgroup = mem, |
@@ -1879,7 +1891,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
1879 | .may_writepage = !laptop_mode, | 1891 | .may_writepage = !laptop_mode, |
1880 | .may_unmap = 1, | 1892 | .may_unmap = 1, |
1881 | .may_swap = !noswap, | 1893 | .may_swap = !noswap, |
1882 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1894 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
1883 | .swappiness = swappiness, | 1895 | .swappiness = swappiness, |
1884 | .order = 0, | 1896 | .order = 0, |
1885 | .mem_cgroup = mem_cont, | 1897 | .mem_cgroup = mem_cont, |
@@ -1894,6 +1906,30 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
1894 | } | 1906 | } |
1895 | #endif | 1907 | #endif |
1896 | 1908 | ||
1909 | /* is kswapd sleeping prematurely? */ | ||
1910 | static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | ||
1911 | { | ||
1912 | int i; | ||
1913 | |||
1914 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | ||
1915 | if (remaining) | ||
1916 | return 1; | ||
1917 | |||
1918 | /* If after HZ/10, a zone is below the high mark, it's premature */ | ||
1919 | for (i = 0; i < pgdat->nr_zones; i++) { | ||
1920 | struct zone *zone = pgdat->node_zones + i; | ||
1921 | |||
1922 | if (!populated_zone(zone)) | ||
1923 | continue; | ||
1924 | |||
1925 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), | ||
1926 | 0, 0)) | ||
1927 | return 1; | ||
1928 | } | ||
1929 | |||
1930 | return 0; | ||
1931 | } | ||
1932 | |||
1897 | /* | 1933 | /* |
1898 | * For kswapd, balance_pgdat() will work across all this node's zones until | 1934 | * For kswapd, balance_pgdat() will work across all this node's zones until |
1899 | * they are all at high_wmark_pages(zone). | 1935 | * they are all at high_wmark_pages(zone). |
@@ -1926,7 +1962,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
1926 | .gfp_mask = GFP_KERNEL, | 1962 | .gfp_mask = GFP_KERNEL, |
1927 | .may_unmap = 1, | 1963 | .may_unmap = 1, |
1928 | .may_swap = 1, | 1964 | .may_swap = 1, |
1929 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1965 | /* |
1966 | * kswapd must not bail out of reclaim early, because | |
1967 | * we want to put equal scanning pressure on each zone. | |
1968 | */ | ||
1969 | .nr_to_reclaim = ULONG_MAX, | ||
1930 | .swappiness = vm_swappiness, | 1970 | .swappiness = vm_swappiness, |
1931 | .order = order, | 1971 | .order = order, |
1932 | .mem_cgroup = NULL, | 1972 | .mem_cgroup = NULL, |
@@ -1951,6 +1991,7 @@ loop_again: | |||
1951 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 1991 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
1952 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 1992 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
1953 | unsigned long lru_pages = 0; | 1993 | unsigned long lru_pages = 0; |
1994 | int has_under_min_watermark_zone = 0; | ||
1954 | 1995 | ||
1955 | /* The swap token gets in the way of swapout... */ | 1996 | /* The swap token gets in the way of swapout... */ |
1956 | if (!priority) | 1997 | if (!priority) |
@@ -2057,6 +2098,15 @@ loop_again: | |||
2057 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && | 2098 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && |
2058 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2099 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
2059 | sc.may_writepage = 1; | 2100 | sc.may_writepage = 1; |
2101 | |||
2102 | /* | ||
2103 | * We are still under the min watermark, which means we | |
2104 | * risk GFP_ATOMIC allocation failures. Hurry up! | |
2105 | */ | ||
2106 | if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), | ||
2107 | end_zone, 0)) | ||
2108 | has_under_min_watermark_zone = 1; | ||
2109 | |||
2060 | } | 2110 | } |
2061 | if (all_zones_ok) | 2111 | if (all_zones_ok) |
2062 | break; /* kswapd: all done */ | 2112 | break; /* kswapd: all done */ |
@@ -2064,8 +2114,12 @@ loop_again: | |||
2064 | * OK, kswapd is getting into trouble. Take a nap, then take | 2114 | * OK, kswapd is getting into trouble. Take a nap, then take |
2065 | * another pass across the zones. | 2115 | * another pass across the zones. |
2066 | */ | 2116 | */ |
2067 | if (total_scanned && priority < DEF_PRIORITY - 2) | 2117 | if (total_scanned && (priority < DEF_PRIORITY - 2)) { |
2068 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 2118 | if (has_under_min_watermark_zone) |
2119 | count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); | ||
2120 | else | ||
2121 | congestion_wait(BLK_RW_ASYNC, HZ/10); | ||
2122 | } | ||
2069 | 2123 | ||
2070 | /* | 2124 | /* |
2071 | * We do this so kswapd doesn't build up large priorities for | 2125 | * We do this so kswapd doesn't build up large priorities for |
@@ -2163,6 +2217,7 @@ static int kswapd(void *p) | |||
2163 | order = 0; | 2217 | order = 0; |
2164 | for ( ; ; ) { | 2218 | for ( ; ; ) { |
2165 | unsigned long new_order; | 2219 | unsigned long new_order; |
2220 | int ret; | ||
2166 | 2221 | ||
2167 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2222 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
2168 | new_order = pgdat->kswapd_max_order; | 2223 | new_order = pgdat->kswapd_max_order; |
@@ -2174,19 +2229,45 @@ static int kswapd(void *p) | |||
2174 | */ | 2229 | */ |
2175 | order = new_order; | 2230 | order = new_order; |
2176 | } else { | 2231 | } else { |
2177 | if (!freezing(current)) | 2232 | if (!freezing(current) && !kthread_should_stop()) { |
2178 | schedule(); | 2233 | long remaining = 0; |
2234 | |||
2235 | /* Try to sleep for a short interval */ | ||
2236 | if (!sleeping_prematurely(pgdat, order, remaining)) { | ||
2237 | remaining = schedule_timeout(HZ/10); | ||
2238 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2239 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2240 | } | ||
2241 | |||
2242 | /* | ||
2243 | * After a short sleep, check if it was a | ||
2244 | * premature sleep. If not, then go fully | ||
2245 | * to sleep until explicitly woken up | ||
2246 | */ | ||
2247 | if (!sleeping_prematurely(pgdat, order, remaining)) | ||
2248 | schedule(); | ||
2249 | else { | ||
2250 | if (remaining) | ||
2251 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | ||
2252 | else | ||
2253 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); | ||
2254 | } | ||
2255 | } | ||
2179 | 2256 | ||
2180 | order = pgdat->kswapd_max_order; | 2257 | order = pgdat->kswapd_max_order; |
2181 | } | 2258 | } |
2182 | finish_wait(&pgdat->kswapd_wait, &wait); | 2259 | finish_wait(&pgdat->kswapd_wait, &wait); |
2183 | 2260 | ||
2184 | if (!try_to_freeze()) { | 2261 | ret = try_to_freeze(); |
2185 | /* We can speed up thawing tasks if we don't call | 2262 | if (kthread_should_stop()) |
2186 | * balance_pgdat after returning from the refrigerator | 2263 | break; |
2187 | */ | 2264 | |
2265 | /* | ||
2266 | * We can speed up thawing tasks if we don't call balance_pgdat | ||
2267 | * after returning from the refrigerator | ||
2268 | */ | ||
2269 | if (!ret) | ||
2188 | balance_pgdat(pgdat, order); | 2270 | balance_pgdat(pgdat, order); |
2189 | } | ||
2190 | } | 2271 | } |
2191 | return 0; | 2272 | return 0; |
2192 | } | 2273 | } |
@@ -2250,148 +2331,43 @@ unsigned long zone_reclaimable_pages(struct zone *zone) | |||
2250 | 2331 | ||
2251 | #ifdef CONFIG_HIBERNATION | 2332 | #ifdef CONFIG_HIBERNATION |
2252 | /* | 2333 | /* |
2253 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages | 2334 | * Try to free `nr_to_reclaim' pages, system-wide, and return the number of |
2254 | * from LRU lists system-wide, for given pass and priority. | ||
2255 | * | ||
2256 | * For pass > 3 we also try to shrink the LRU lists that contain a few pages | ||
2257 | */ | ||
2258 | static void shrink_all_zones(unsigned long nr_pages, int prio, | ||
2259 | int pass, struct scan_control *sc) | ||
2260 | { | ||
2261 | struct zone *zone; | ||
2262 | unsigned long nr_reclaimed = 0; | ||
2263 | struct zone_reclaim_stat *reclaim_stat; | ||
2264 | |||
2265 | for_each_populated_zone(zone) { | ||
2266 | enum lru_list l; | ||
2267 | |||
2268 | if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) | ||
2269 | continue; | ||
2270 | |||
2271 | for_each_evictable_lru(l) { | ||
2272 | enum zone_stat_item ls = NR_LRU_BASE + l; | ||
2273 | unsigned long lru_pages = zone_page_state(zone, ls); | ||
2274 | |||
2275 | /* For pass = 0, we don't shrink the active list */ | ||
2276 | if (pass == 0 && (l == LRU_ACTIVE_ANON || | ||
2277 | l == LRU_ACTIVE_FILE)) | ||
2278 | continue; | ||
2279 | |||
2280 | reclaim_stat = get_reclaim_stat(zone, sc); | ||
2281 | reclaim_stat->nr_saved_scan[l] += | ||
2282 | (lru_pages >> prio) + 1; | ||
2283 | if (reclaim_stat->nr_saved_scan[l] | ||
2284 | >= nr_pages || pass > 3) { | ||
2285 | unsigned long nr_to_scan; | ||
2286 | |||
2287 | reclaim_stat->nr_saved_scan[l] = 0; | ||
2288 | nr_to_scan = min(nr_pages, lru_pages); | ||
2289 | nr_reclaimed += shrink_list(l, nr_to_scan, zone, | ||
2290 | sc, prio); | ||
2291 | if (nr_reclaimed >= nr_pages) { | ||
2292 | sc->nr_reclaimed += nr_reclaimed; | ||
2293 | return; | ||
2294 | } | ||
2295 | } | ||
2296 | } | ||
2297 | } | ||
2298 | sc->nr_reclaimed += nr_reclaimed; | ||
2299 | } | ||
2300 | |||
2301 | /* | ||
2302 | * Try to free `nr_pages' of memory, system-wide, and return the number of | ||
2303 | * freed pages. | 2335 | * freed pages. |
2304 | * | 2336 | * |
2305 | * Rather than trying to age LRUs the aim is to preserve the overall | 2337 | * Rather than trying to age LRUs the aim is to preserve the overall |
2306 | * LRU order by reclaiming preferentially | 2338 | * LRU order by reclaiming preferentially |
2307 | * inactive > active > active referenced > active mapped | 2339 | * inactive > active > active referenced > active mapped |
2308 | */ | 2340 | */ |
2309 | unsigned long shrink_all_memory(unsigned long nr_pages) | 2341 | unsigned long shrink_all_memory(unsigned long nr_to_reclaim) |
2310 | { | 2342 | { |
2311 | unsigned long lru_pages, nr_slab; | ||
2312 | int pass; | ||
2313 | struct reclaim_state reclaim_state; | 2343 | struct reclaim_state reclaim_state; |
2314 | struct scan_control sc = { | 2344 | struct scan_control sc = { |
2315 | .gfp_mask = GFP_KERNEL, | 2345 | .gfp_mask = GFP_HIGHUSER_MOVABLE, |
2316 | .may_unmap = 0, | 2346 | .may_swap = 1, |
2347 | .may_unmap = 1, | ||
2317 | .may_writepage = 1, | 2348 | .may_writepage = 1, |
2349 | .nr_to_reclaim = nr_to_reclaim, | ||
2350 | .hibernation_mode = 1, | ||
2351 | .swappiness = vm_swappiness, | ||
2352 | .order = 0, | ||
2318 | .isolate_pages = isolate_pages_global, | 2353 | .isolate_pages = isolate_pages_global, |
2319 | .nr_reclaimed = 0, | ||
2320 | }; | 2354 | }; |
2355 | struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); | ||
2356 | struct task_struct *p = current; | ||
2357 | unsigned long nr_reclaimed; | ||
2321 | 2358 | ||
2322 | current->reclaim_state = &reclaim_state; | 2359 | p->flags |= PF_MEMALLOC; |
2323 | 2360 | lockdep_set_current_reclaim_state(sc.gfp_mask); | |
2324 | lru_pages = global_reclaimable_pages(); | 2361 | reclaim_state.reclaimed_slab = 0; |
2325 | nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); | 2362 | p->reclaim_state = &reclaim_state; |
2326 | /* If slab caches are huge, it's better to hit them first */ | ||
2327 | while (nr_slab >= lru_pages) { | ||
2328 | reclaim_state.reclaimed_slab = 0; | ||
2329 | shrink_slab(nr_pages, sc.gfp_mask, lru_pages); | ||
2330 | if (!reclaim_state.reclaimed_slab) | ||
2331 | break; | ||
2332 | |||
2333 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | ||
2334 | if (sc.nr_reclaimed >= nr_pages) | ||
2335 | goto out; | ||
2336 | |||
2337 | nr_slab -= reclaim_state.reclaimed_slab; | ||
2338 | } | ||
2339 | |||
2340 | /* | ||
2341 | * We try to shrink LRUs in 5 passes: | ||
2342 | * 0 = Reclaim from inactive_list only | ||
2343 | * 1 = Reclaim from active list but don't reclaim mapped | ||
2344 | * 2 = 2nd pass of type 1 | ||
2345 | * 3 = Reclaim mapped (normal reclaim) | ||
2346 | * 4 = 2nd pass of type 3 | ||
2347 | */ | ||
2348 | for (pass = 0; pass < 5; pass++) { | ||
2349 | int prio; | ||
2350 | |||
2351 | /* Force reclaiming mapped pages in the passes #3 and #4 */ | ||
2352 | if (pass > 2) | ||
2353 | sc.may_unmap = 1; | ||
2354 | |||
2355 | for (prio = DEF_PRIORITY; prio >= 0; prio--) { | ||
2356 | unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; | ||
2357 | |||
2358 | sc.nr_scanned = 0; | ||
2359 | sc.swap_cluster_max = nr_to_scan; | ||
2360 | shrink_all_zones(nr_to_scan, prio, pass, &sc); | ||
2361 | if (sc.nr_reclaimed >= nr_pages) | ||
2362 | goto out; | ||
2363 | |||
2364 | reclaim_state.reclaimed_slab = 0; | ||
2365 | shrink_slab(sc.nr_scanned, sc.gfp_mask, | ||
2366 | global_reclaimable_pages()); | ||
2367 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | ||
2368 | if (sc.nr_reclaimed >= nr_pages) | ||
2369 | goto out; | ||
2370 | |||
2371 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) | ||
2372 | congestion_wait(BLK_RW_ASYNC, HZ / 10); | ||
2373 | } | ||
2374 | } | ||
2375 | |||
2376 | /* | ||
2377 | * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be | ||
2378 | * something in slab caches | ||
2379 | */ | ||
2380 | if (!sc.nr_reclaimed) { | ||
2381 | do { | ||
2382 | reclaim_state.reclaimed_slab = 0; | ||
2383 | shrink_slab(nr_pages, sc.gfp_mask, | ||
2384 | global_reclaimable_pages()); | ||
2385 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | ||
2386 | } while (sc.nr_reclaimed < nr_pages && | ||
2387 | reclaim_state.reclaimed_slab > 0); | ||
2388 | } | ||
2389 | 2363 | ||
2364 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); | ||
2390 | 2365 | ||
2391 | out: | 2366 | p->reclaim_state = NULL; |
2392 | current->reclaim_state = NULL; | 2367 | lockdep_clear_current_reclaim_state(); |
2368 | p->flags &= ~PF_MEMALLOC; | ||
2393 | 2369 | ||
2394 | return sc.nr_reclaimed; | 2370 | return nr_reclaimed; |
2395 | } | 2371 | } |
2396 | #endif /* CONFIG_HIBERNATION */ | 2372 | #endif /* CONFIG_HIBERNATION */ |
2397 | 2373 | ||
@@ -2441,6 +2417,17 @@ int kswapd_run(int nid) | |||
2441 | return ret; | 2417 | return ret; |
2442 | } | 2418 | } |
2443 | 2419 | ||
2420 | /* | ||
2421 | * Called by memory hotplug when all memory in a node is offlined. | ||
2422 | */ | ||
2423 | void kswapd_stop(int nid) | ||
2424 | { | ||
2425 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; | ||
2426 | |||
2427 | if (kswapd) | ||
2428 | kthread_stop(kswapd); | ||
2429 | } | ||
2430 | |||
2444 | static int __init kswapd_init(void) | 2431 | static int __init kswapd_init(void) |
2445 | { | 2432 | { |
2446 | int nid; | 2433 | int nid; |
@@ -2543,8 +2530,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2543 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 2530 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
2544 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 2531 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
2545 | .may_swap = 1, | 2532 | .may_swap = 1, |
2546 | .swap_cluster_max = max_t(unsigned long, nr_pages, | 2533 | .nr_to_reclaim = max_t(unsigned long, nr_pages, |
2547 | SWAP_CLUSTER_MAX), | 2534 | SWAP_CLUSTER_MAX), |
2548 | .gfp_mask = gfp_mask, | 2535 | .gfp_mask = gfp_mask, |
2549 | .swappiness = vm_swappiness, | 2536 | .swappiness = vm_swappiness, |
2550 | .order = order, | 2537 | .order = order, |