Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c | 396
1 file changed, 197 insertions(+), 199 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b7719ec10dc5..a1893c050795 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -105,7 +105,6 @@ struct scan_control { | |||
105 | 105 | ||
106 | /* Which cgroup do we reclaim from */ | 106 | /* Which cgroup do we reclaim from */ |
107 | struct mem_cgroup *mem_cgroup; | 107 | struct mem_cgroup *mem_cgroup; |
108 | struct memcg_scanrecord *memcg_record; | ||
109 | 108 | ||
110 | /* | 109 | /* |
111 | * Nodemask of nodes allowed by the caller. If NULL, all nodes | 110 | * Nodemask of nodes allowed by the caller. If NULL, all nodes |
@@ -496,15 +495,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
496 | return PAGE_ACTIVATE; | 495 | return PAGE_ACTIVATE; |
497 | } | 496 | } |
498 | 497 | ||
499 | /* | ||
500 | * Wait on writeback if requested to. This happens when | ||
501 | * direct reclaiming a large contiguous area and the | ||
502 | * first attempt to free a range of pages fails. | ||
503 | */ | ||
504 | if (PageWriteback(page) && | ||
505 | (sc->reclaim_mode & RECLAIM_MODE_SYNC)) | ||
506 | wait_on_page_writeback(page); | ||
507 | |||
508 | if (!PageWriteback(page)) { | 498 | if (!PageWriteback(page)) { |
509 | /* synchronous write or broken a_ops? */ | 499 | /* synchronous write or broken a_ops? */ |
510 | ClearPageReclaim(page); | 500 | ClearPageReclaim(page); |
@@ -643,13 +633,14 @@ redo: | |||
643 | lru = LRU_UNEVICTABLE; | 633 | lru = LRU_UNEVICTABLE; |
644 | add_page_to_unevictable_list(page); | 634 | add_page_to_unevictable_list(page); |
645 | /* | 635 | /* |
646 | * When racing with an mlock clearing (page is | 636 | * When racing with an mlock or AS_UNEVICTABLE clearing |
647 | * unlocked), make sure that if the other thread does | 637 | * (page is unlocked) make sure that if the other thread |
648 | * not observe our setting of PG_lru and fails | 638 | * does not observe our setting of PG_lru and fails |
649 | * isolation, we see PG_mlocked cleared below and move | 639 | * isolation/check_move_unevictable_page, |
640 | * we see PG_mlocked/AS_UNEVICTABLE cleared below and move | ||
650 | * the page back to the evictable list. | 641 | * the page back to the evictable list. |
651 | * | 642 | * |
652 | * The other side is TestClearPageMlocked(). | 643 | * The other side is TestClearPageMlocked() or shmem_lock(). |
653 | */ | 644 | */ |
654 | smp_mb(); | 645 | smp_mb(); |
655 | } | 646 | } |
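The expanded comment above describes a store/barrier/load pairing: the putback path publishes PG_lru, issues smp_mb(), then re-checks PG_mlocked/AS_UNEVICTABLE, while the clearing side (TestClearPageMlocked() or shmem_lock()) clears its flag and then looks for PG_lru. Below is a minimal userspace model of that pairing, using C11 atomics and pthreads in place of page flags and smp_mb(); the names are invented for illustration and this is not kernel code.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for PG_lru and PG_mlocked on one "page". */
static atomic_bool page_lru;
static atomic_bool page_mlocked;
static atomic_bool rescued;     /* set when either side moves the page back */

static void *putback_side(void *arg)
{
        /* Park the page on the unevictable list, then publish PG_lru. */
        atomic_store(&page_lru, true);
        /* The smp_mb(): order the PG_lru store before the re-check. */
        atomic_thread_fence(memory_order_seq_cst);
        if (!atomic_load(&page_mlocked))
                atomic_store(&rescued, true);   /* move back to evictable list */
        return arg;
}

static void *munlock_side(void *arg)
{
        /* TestClearPageMlocked(): clear the flag; a full barrier in the kernel. */
        atomic_exchange(&page_mlocked, false);
        if (atomic_load(&page_lru))
                atomic_store(&rescued, true);   /* isolate and re-check the page */
        return arg;
}

int main(void)
{
        pthread_t a, b;

        atomic_store(&page_mlocked, true);
        pthread_create(&a, NULL, putback_side, NULL);
        pthread_create(&b, NULL, munlock_side, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        /* With both sides fully ordered, at least one of them always sees
         * the other's flag, so the page cannot be stranded unevictable. */
        printf("page rescued: %s\n", atomic_load(&rescued) ? "yes" : "no");
        return 0;
}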
@@ -760,7 +751,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) | |||
760 | */ | 751 | */ |
761 | static unsigned long shrink_page_list(struct list_head *page_list, | 752 | static unsigned long shrink_page_list(struct list_head *page_list, |
762 | struct zone *zone, | 753 | struct zone *zone, |
763 | struct scan_control *sc) | 754 | struct scan_control *sc, |
755 | int priority, | ||
756 | unsigned long *ret_nr_dirty, | ||
757 | unsigned long *ret_nr_writeback) | ||
764 | { | 758 | { |
765 | LIST_HEAD(ret_pages); | 759 | LIST_HEAD(ret_pages); |
766 | LIST_HEAD(free_pages); | 760 | LIST_HEAD(free_pages); |
@@ -768,6 +762,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
768 | unsigned long nr_dirty = 0; | 762 | unsigned long nr_dirty = 0; |
769 | unsigned long nr_congested = 0; | 763 | unsigned long nr_congested = 0; |
770 | unsigned long nr_reclaimed = 0; | 764 | unsigned long nr_reclaimed = 0; |
765 | unsigned long nr_writeback = 0; | ||
771 | 766 | ||
772 | cond_resched(); | 767 | cond_resched(); |
773 | 768 | ||
@@ -804,13 +799,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
804 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); | 799 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); |
805 | 800 | ||
806 | if (PageWriteback(page)) { | 801 | if (PageWriteback(page)) { |
802 | nr_writeback++; | ||
807 | /* | 803 | /* |
808 | * Synchronous reclaim is performed in two passes, | 804 | * Synchronous reclaim cannot queue pages for |
809 | * first an asynchronous pass over the list to | 805 | * writeback due to the possibility of stack overflow |
810 | * start parallel writeback, and a second synchronous | 806 | * but if it encounters a page under writeback, wait |
811 | * pass to wait for the IO to complete. Wait here | 807 | * for the IO to complete. |
812 | * for any page for which writeback has already | ||
813 | * started. | ||
814 | */ | 808 | */ |
815 | if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && | 809 | if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && |
816 | may_enter_fs) | 810 | may_enter_fs) |
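The rewritten comment says that lumpy/synchronous reclaim no longer starts writeback itself (to keep stack usage down) but still waits for I/O that is already in flight when the caller may enter the filesystem. Here is a condensed sketch of just that branch; the scan_ctl type, the flag value and the helper name are made up for illustration, standing in for struct scan_control and RECLAIM_MODE_SYNC.

#include <stdbool.h>
#include <stdio.h>

#define RECLAIM_MODE_SYNC 0x10u         /* illustrative value only */

struct scan_ctl {
        unsigned int reclaim_mode;
};

enum wb_action { WB_KEEP_ON_LRU, WB_WAIT_FOR_IO };

/*
 * Model of the PageWriteback() branch: reclaim never queues new writeback
 * here, but a synchronous (lumpy) reclaimer that may enter the filesystem
 * waits for I/O that is already under way.
 */
static enum wb_action handle_writeback_page(const struct scan_ctl *sc,
                                            bool may_enter_fs)
{
        if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && may_enter_fs)
                return WB_WAIT_FOR_IO;  /* wait_on_page_writeback() */
        return WB_KEEP_ON_LRU;          /* keep the page for a later pass */
}

int main(void)
{
        struct scan_ctl async_sc = { .reclaim_mode = 0 };
        struct scan_ctl sync_sc = { .reclaim_mode = RECLAIM_MODE_SYNC };

        printf("async reclaim : %s\n",
               handle_writeback_page(&async_sc, true) == WB_WAIT_FOR_IO ?
               "wait" : "skip");
        printf("sync + GFP_FS : %s\n",
               handle_writeback_page(&sync_sc, true) == WB_WAIT_FOR_IO ?
               "wait" : "skip");
        return 0;
}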
@@ -866,6 +860,25 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
866 | if (PageDirty(page)) { | 860 | if (PageDirty(page)) { |
867 | nr_dirty++; | 861 | nr_dirty++; |
868 | 862 | ||
863 | /* | ||
864 | * Only kswapd can writeback filesystem pages to | ||
865 | * avoid risk of stack overflow but do not writeback | ||
866 | * unless under significant pressure. | ||
867 | */ | ||
868 | if (page_is_file_cache(page) && | ||
869 | (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { | ||
870 | /* | ||
871 | * Immediately reclaim when written back. | ||
872 | * Similar in principal to deactivate_page() | ||
873 | * except we already have the page isolated | ||
874 | * and know it's dirty | ||
875 | */ | ||
876 | inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); | ||
877 | SetPageReclaim(page); | ||
878 | |||
879 | goto keep_locked; | ||
880 | } | ||
881 | |||
869 | if (references == PAGEREF_RECLAIM_CLEAN) | 882 | if (references == PAGEREF_RECLAIM_CLEAN) |
870 | goto keep_locked; | 883 | goto keep_locked; |
871 | if (!may_enter_fs) | 884 | if (!may_enter_fs) |
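The new block above stops reclaim from calling ->writepage on dirty file pages unless it is kswapd operating below DEF_PRIORITY - 2; otherwise the page is tagged with SetPageReclaim() and counted as NR_VMSCAN_IMMEDIATE, so it gets freed as soon as the flusher threads clean it. A small standalone sketch of that decision follows; the enum and helper name are invented, only the condition mirrors the hunk above.

#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY 12

enum dirty_action {
        DEFER_TO_FLUSHER,       /* SetPageReclaim() + keep_locked */
        WRITE_FROM_RECLAIM,     /* fall through towards pageout() */
};

static enum dirty_action handle_dirty_page(bool file_backed,
                                           bool is_kswapd, int priority)
{
        /*
         * Only kswapd may write file pages (stack depth), and even kswapd
         * holds off until priority has dropped below DEF_PRIORITY - 2.
         */
        if (file_backed && (!is_kswapd || priority >= DEF_PRIORITY - 2))
                return DEFER_TO_FLUSHER;
        return WRITE_FROM_RECLAIM;
}

static const char *name(enum dirty_action a)
{
        return a == DEFER_TO_FLUSHER ? "defer to flusher" : "writepage";
}

int main(void)
{
        printf("direct reclaim, file page : %s\n",
               name(handle_dirty_page(true, false, DEF_PRIORITY)));
        printf("kswapd at priority 9, file: %s\n",
               name(handle_dirty_page(true, true, DEF_PRIORITY - 3)));
        printf("anonymous page            : %s\n",
               name(handle_dirty_page(false, false, DEF_PRIORITY)));
        return 0;
}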
@@ -1000,6 +1013,8 @@ keep_lumpy: | |||
1000 | 1013 | ||
1001 | list_splice(&ret_pages, page_list); | 1014 | list_splice(&ret_pages, page_list); |
1002 | count_vm_events(PGACTIVATE, pgactivate); | 1015 | count_vm_events(PGACTIVATE, pgactivate); |
1016 | *ret_nr_dirty += nr_dirty; | ||
1017 | *ret_nr_writeback += nr_writeback; | ||
1003 | return nr_reclaimed; | 1018 | return nr_reclaimed; |
1004 | } | 1019 | } |
1005 | 1020 | ||
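shrink_page_list() now reports three results: the return value (pages reclaimed) plus nr_dirty and nr_writeback accumulated into caller-supplied counters, which shrink_inactive_list() later feeds into the writeback-throttling check. A minimal sketch of that out-parameter pattern, with an invented fake_page type and scan_pages() helper:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct fake_page {
        bool dirty;
        bool writeback;
        bool reclaimable;
};

/* Returns the number of "reclaimed" pages; the tallies go to the caller. */
static unsigned long scan_pages(const struct fake_page *pages, size_t n,
                                unsigned long *ret_nr_dirty,
                                unsigned long *ret_nr_writeback)
{
        unsigned long nr_dirty = 0, nr_writeback = 0, nr_reclaimed = 0;

        for (size_t i = 0; i < n; i++) {
                if (pages[i].writeback)
                        nr_writeback++;
                if (pages[i].dirty)
                        nr_dirty++;
                if (pages[i].reclaimable)
                        nr_reclaimed++;
        }
        /* Accumulate (+=) so repeated passes over the same list add up. */
        *ret_nr_dirty += nr_dirty;
        *ret_nr_writeback += nr_writeback;
        return nr_reclaimed;
}

int main(void)
{
        struct fake_page list[] = {
                { .reclaimable = true },
                { .dirty = true },
                { .dirty = true, .writeback = true },
        };
        unsigned long nr_dirty = 0, nr_writeback = 0;
        unsigned long reclaimed = scan_pages(list, 3, &nr_dirty, &nr_writeback);

        printf("reclaimed=%lu dirty=%lu writeback=%lu\n",
               reclaimed, nr_dirty, nr_writeback);
        return 0;
}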
@@ -1013,23 +1028,27 @@ keep_lumpy: | |||
1013 | * | 1028 | * |
1014 | * returns 0 on success, -ve errno on failure. | 1029 | * returns 0 on success, -ve errno on failure. |
1015 | */ | 1030 | */ |
1016 | int __isolate_lru_page(struct page *page, int mode, int file) | 1031 | int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) |
1017 | { | 1032 | { |
1033 | bool all_lru_mode; | ||
1018 | int ret = -EINVAL; | 1034 | int ret = -EINVAL; |
1019 | 1035 | ||
1020 | /* Only take pages on the LRU. */ | 1036 | /* Only take pages on the LRU. */ |
1021 | if (!PageLRU(page)) | 1037 | if (!PageLRU(page)) |
1022 | return ret; | 1038 | return ret; |
1023 | 1039 | ||
1040 | all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) == | ||
1041 | (ISOLATE_ACTIVE|ISOLATE_INACTIVE); | ||
1042 | |||
1024 | /* | 1043 | /* |
1025 | * When checking the active state, we need to be sure we are | 1044 | * When checking the active state, we need to be sure we are |
1026 | * dealing with comparible boolean values. Take the logical not | 1045 | * dealing with comparible boolean values. Take the logical not |
1027 | * of each. | 1046 | * of each. |
1028 | */ | 1047 | */ |
1029 | if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) | 1048 | if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE)) |
1030 | return ret; | 1049 | return ret; |
1031 | 1050 | ||
1032 | if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) | 1051 | if (!all_lru_mode && !!page_is_file_cache(page) != file) |
1033 | return ret; | 1052 | return ret; |
1034 | 1053 | ||
1035 | /* | 1054 | /* |
@@ -1042,6 +1061,12 @@ int __isolate_lru_page(struct page *page, int mode, int file) | |||
1042 | 1061 | ||
1043 | ret = -EBUSY; | 1062 | ret = -EBUSY; |
1044 | 1063 | ||
1064 | if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) | ||
1065 | return ret; | ||
1066 | |||
1067 | if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) | ||
1068 | return ret; | ||
1069 | |||
1045 | if (likely(get_page_unless_zero(page))) { | 1070 | if (likely(get_page_unless_zero(page))) { |
1046 | /* | 1071 | /* |
1047 | * Be careful not to clear PageLRU until after we're | 1072 | * Be careful not to clear PageLRU until after we're |
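__isolate_lru_page() now takes a bitmask (isolate_mode_t) instead of the old INACTIVE/ACTIVE/BOTH tri-state, so callers can additionally demand clean or unmapped pages. The following standalone sketch reproduces that filtering with illustrative flag values and a simplified page structure rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned int isolate_mode_t;

#define ISOLATE_INACTIVE        0x1u    /* illustrative values */
#define ISOLATE_ACTIVE          0x2u
#define ISOLATE_CLEAN           0x4u
#define ISOLATE_UNMAPPED        0x8u

struct fake_page {
        bool on_lru, active, dirty, writeback, mapped;
};

/* 0 on success, negative errno-style result otherwise. */
static int isolate_page(const struct fake_page *page, isolate_mode_t mode)
{
        bool all_lru_mode = (mode & (ISOLATE_ACTIVE | ISOLATE_INACTIVE)) ==
                            (ISOLATE_ACTIVE | ISOLATE_INACTIVE);

        if (!page->on_lru)
                return -1;              /* -EINVAL */
        if (!all_lru_mode && page->active != !!(mode & ISOLATE_ACTIVE))
                return -1;              /* wrong LRU list for this mode */
        if ((mode & ISOLATE_CLEAN) && (page->dirty || page->writeback))
                return -2;              /* -EBUSY: caller cannot write pages */
        if ((mode & ISOLATE_UNMAPPED) && page->mapped)
                return -2;              /* -EBUSY: caller cannot unmap pages */
        return 0;
}

int main(void)
{
        struct fake_page dirty_inactive = { .on_lru = true, .dirty = true };
        isolate_mode_t mode = ISOLATE_INACTIVE;

        printf("default mode:       %d\n", isolate_page(&dirty_inactive, mode));
        /* A !may_writepage scanner adds ISOLATE_CLEAN and now skips it. */
        printf("with ISOLATE_CLEAN: %d\n",
               isolate_page(&dirty_inactive, mode | ISOLATE_CLEAN));
        return 0;
}

The shrink_inactive_list() and shrink_active_list() hunks further down compose the mask from sc->may_unmap and sc->may_writepage in exactly this way.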
@@ -1077,7 +1102,8 @@ int __isolate_lru_page(struct page *page, int mode, int file) | |||
1077 | */ | 1102 | */ |
1078 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | 1103 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
1079 | struct list_head *src, struct list_head *dst, | 1104 | struct list_head *src, struct list_head *dst, |
1080 | unsigned long *scanned, int order, int mode, int file) | 1105 | unsigned long *scanned, int order, isolate_mode_t mode, |
1106 | int file) | ||
1081 | { | 1107 | { |
1082 | unsigned long nr_taken = 0; | 1108 | unsigned long nr_taken = 0; |
1083 | unsigned long nr_lumpy_taken = 0; | 1109 | unsigned long nr_lumpy_taken = 0; |
@@ -1202,8 +1228,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1202 | static unsigned long isolate_pages_global(unsigned long nr, | 1228 | static unsigned long isolate_pages_global(unsigned long nr, |
1203 | struct list_head *dst, | 1229 | struct list_head *dst, |
1204 | unsigned long *scanned, int order, | 1230 | unsigned long *scanned, int order, |
1205 | int mode, struct zone *z, | 1231 | isolate_mode_t mode, |
1206 | int active, int file) | 1232 | struct zone *z, int active, int file) |
1207 | { | 1233 | { |
1208 | int lru = LRU_BASE; | 1234 | int lru = LRU_BASE; |
1209 | if (active) | 1235 | if (active) |
@@ -1349,8 +1375,6 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, | |||
1349 | int file = is_file_lru(lru); | 1375 | int file = is_file_lru(lru); |
1350 | int numpages = hpage_nr_pages(page); | 1376 | int numpages = hpage_nr_pages(page); |
1351 | reclaim_stat->recent_rotated[file] += numpages; | 1377 | reclaim_stat->recent_rotated[file] += numpages; |
1352 | if (!scanning_global_lru(sc)) | ||
1353 | sc->memcg_record->nr_rotated[file] += numpages; | ||
1354 | } | 1378 | } |
1355 | if (!pagevec_add(&pvec, page)) { | 1379 | if (!pagevec_add(&pvec, page)) { |
1356 | spin_unlock_irq(&zone->lru_lock); | 1380 | spin_unlock_irq(&zone->lru_lock); |
@@ -1394,14 +1418,10 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, | |||
1394 | 1418 | ||
1395 | reclaim_stat->recent_scanned[0] += *nr_anon; | 1419 | reclaim_stat->recent_scanned[0] += *nr_anon; |
1396 | reclaim_stat->recent_scanned[1] += *nr_file; | 1420 | reclaim_stat->recent_scanned[1] += *nr_file; |
1397 | if (!scanning_global_lru(sc)) { | ||
1398 | sc->memcg_record->nr_scanned[0] += *nr_anon; | ||
1399 | sc->memcg_record->nr_scanned[1] += *nr_file; | ||
1400 | } | ||
1401 | } | 1421 | } |
1402 | 1422 | ||
1403 | /* | 1423 | /* |
1404 | * Returns true if the caller should wait to clean dirty/writeback pages. | 1424 | * Returns true if a direct reclaim should wait on pages under writeback. |
1405 | * | 1425 | * |
1406 | * If we are direct reclaiming for contiguous pages and we do not reclaim | 1426 | * If we are direct reclaiming for contiguous pages and we do not reclaim |
1407 | * everything in the list, try again and wait for writeback IO to complete. | 1427 | * everything in the list, try again and wait for writeback IO to complete. |
@@ -1423,7 +1443,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, | |||
1423 | if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) | 1443 | if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) |
1424 | return false; | 1444 | return false; |
1425 | 1445 | ||
1426 | /* If we have relaimed everything on the isolated list, no stall */ | 1446 | /* If we have reclaimed everything on the isolated list, no stall */ |
1427 | if (nr_freed == nr_taken) | 1447 | if (nr_freed == nr_taken) |
1428 | return false; | 1448 | return false; |
1429 | 1449 | ||
@@ -1455,6 +1475,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1455 | unsigned long nr_taken; | 1475 | unsigned long nr_taken; |
1456 | unsigned long nr_anon; | 1476 | unsigned long nr_anon; |
1457 | unsigned long nr_file; | 1477 | unsigned long nr_file; |
1478 | unsigned long nr_dirty = 0; | ||
1479 | unsigned long nr_writeback = 0; | ||
1480 | isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; | ||
1458 | 1481 | ||
1459 | while (unlikely(too_many_isolated(zone, file, sc))) { | 1482 | while (unlikely(too_many_isolated(zone, file, sc))) { |
1460 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1483 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
@@ -1465,15 +1488,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1465 | } | 1488 | } |
1466 | 1489 | ||
1467 | set_reclaim_mode(priority, sc, false); | 1490 | set_reclaim_mode(priority, sc, false); |
1491 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) | ||
1492 | reclaim_mode |= ISOLATE_ACTIVE; | ||
1493 | |||
1468 | lru_add_drain(); | 1494 | lru_add_drain(); |
1495 | |||
1496 | if (!sc->may_unmap) | ||
1497 | reclaim_mode |= ISOLATE_UNMAPPED; | ||
1498 | if (!sc->may_writepage) | ||
1499 | reclaim_mode |= ISOLATE_CLEAN; | ||
1500 | |||
1469 | spin_lock_irq(&zone->lru_lock); | 1501 | spin_lock_irq(&zone->lru_lock); |
1470 | 1502 | ||
1471 | if (scanning_global_lru(sc)) { | 1503 | if (scanning_global_lru(sc)) { |
1472 | nr_taken = isolate_pages_global(nr_to_scan, | 1504 | nr_taken = isolate_pages_global(nr_to_scan, &page_list, |
1473 | &page_list, &nr_scanned, sc->order, | 1505 | &nr_scanned, sc->order, reclaim_mode, zone, 0, file); |
1474 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? | ||
1475 | ISOLATE_BOTH : ISOLATE_INACTIVE, | ||
1476 | zone, 0, file); | ||
1477 | zone->pages_scanned += nr_scanned; | 1506 | zone->pages_scanned += nr_scanned; |
1478 | if (current_is_kswapd()) | 1507 | if (current_is_kswapd()) |
1479 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, | 1508 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, |
@@ -1482,12 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1482 | __count_zone_vm_events(PGSCAN_DIRECT, zone, | 1511 | __count_zone_vm_events(PGSCAN_DIRECT, zone, |
1483 | nr_scanned); | 1512 | nr_scanned); |
1484 | } else { | 1513 | } else { |
1485 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, | 1514 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, |
1486 | &page_list, &nr_scanned, sc->order, | 1515 | &nr_scanned, sc->order, reclaim_mode, zone, |
1487 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? | 1516 | sc->mem_cgroup, 0, file); |
1488 | ISOLATE_BOTH : ISOLATE_INACTIVE, | ||
1489 | zone, sc->mem_cgroup, | ||
1490 | 0, file); | ||
1491 | /* | 1517 | /* |
1492 | * mem_cgroup_isolate_pages() keeps track of | 1518 | * mem_cgroup_isolate_pages() keeps track of |
1493 | * scanned pages on its own. | 1519 | * scanned pages on its own. |
@@ -1503,17 +1529,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1503 | 1529 | ||
1504 | spin_unlock_irq(&zone->lru_lock); | 1530 | spin_unlock_irq(&zone->lru_lock); |
1505 | 1531 | ||
1506 | nr_reclaimed = shrink_page_list(&page_list, zone, sc); | 1532 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority, |
1533 | &nr_dirty, &nr_writeback); | ||
1507 | 1534 | ||
1508 | /* Check if we should syncronously wait for writeback */ | 1535 | /* Check if we should syncronously wait for writeback */ |
1509 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { | 1536 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { |
1510 | set_reclaim_mode(priority, sc, true); | 1537 | set_reclaim_mode(priority, sc, true); |
1511 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); | 1538 | nr_reclaimed += shrink_page_list(&page_list, zone, sc, |
1539 | priority, &nr_dirty, &nr_writeback); | ||
1512 | } | 1540 | } |
1513 | 1541 | ||
1514 | if (!scanning_global_lru(sc)) | ||
1515 | sc->memcg_record->nr_freed[file] += nr_reclaimed; | ||
1516 | |||
1517 | local_irq_disable(); | 1542 | local_irq_disable(); |
1518 | if (current_is_kswapd()) | 1543 | if (current_is_kswapd()) |
1519 | __count_vm_events(KSWAPD_STEAL, nr_reclaimed); | 1544 | __count_vm_events(KSWAPD_STEAL, nr_reclaimed); |
@@ -1521,6 +1546,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1521 | 1546 | ||
1522 | putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); | 1547 | putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); |
1523 | 1548 | ||
1549 | /* | ||
1550 | * If reclaim is isolating dirty pages under writeback, it implies | ||
1551 | * that the long-lived page allocation rate is exceeding the page | ||
1552 | * laundering rate. Either the global limits are not being effective | ||
1553 | * at throttling processes due to the page distribution throughout | ||
1554 | * zones or there is heavy usage of a slow backing device. The | ||
1555 | * only option is to throttle from reclaim context which is not ideal | ||
1556 | * as there is no guarantee the dirtying process is throttled in the | ||
1557 | * same way balance_dirty_pages() manages. | ||
1558 | * | ||
1559 | * This scales the number of dirty pages that must be under writeback | ||
1560 | * before throttling depending on priority. It is a simple backoff | ||
1561 | * function that has the most effect in the range DEF_PRIORITY to | ||
1562 | * DEF_PRIORITY-2 which is the priority reclaim is considered to be | ||
1563 | * in trouble and reclaim is considered to be in trouble. | ||
1564 | * | ||
1565 | * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle | ||
1566 | * DEF_PRIORITY-1 50% must be PageWriteback | ||
1567 | * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble | ||
1568 | * ... | ||
1569 | * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any | ||
1570 | * isolated page is PageWriteback | ||
1571 | */ | ||
1572 | if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority))) | ||
1573 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); | ||
1574 | |||
1524 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, | 1575 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, |
1525 | zone_idx(zone), | 1576 | zone_idx(zone), |
1526 | nr_scanned, nr_reclaimed, | 1577 | nr_scanned, nr_reclaimed, |
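The comment block above defines the throttling trigger as a priority-scaled share of the pages just isolated: throttle once nr_writeback >= nr_taken >> (DEF_PRIORITY - priority). Here is a worked example of that backoff, assuming the DEF_PRIORITY of 12 and SWAP_CLUSTER_MAX of 32 used by kernels of this era:

#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY            12
#define SWAP_CLUSTER_MAX        32UL

static bool should_throttle(unsigned long nr_writeback,
                            unsigned long nr_taken, int priority)
{
        return nr_writeback &&
               nr_writeback >= (nr_taken >> (DEF_PRIORITY - priority));
}

int main(void)
{
        unsigned long nr_taken = SWAP_CLUSTER_MAX;

        /*
         * Threshold as a share of the isolated pages:
         *   priority 12 -> 32 >> 0 = 32 (100% must be under writeback)
         *   priority 11 -> 32 >> 1 = 16 (50%)
         *   priority 10 -> 32 >> 2 =  8 (25%)
         *   priority  6 -> 32 >> 6 =  0 (any writeback page throttles)
         */
        for (int priority = DEF_PRIORITY; priority >= DEF_PRIORITY - 6; priority--)
                printf("priority %2d: threshold %2lu of %lu isolated pages%s\n",
                       priority, nr_taken >> (DEF_PRIORITY - priority), nr_taken,
                       should_throttle(1, nr_taken, priority) ?
                       " (a single writeback page throttles)" : "");
        return 0;
}

When the threshold is crossed, wait_iff_congested() sleeps only if the zone is actually marked congested, so the penalty is limited to systems that really are I/O bound.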
@@ -1592,19 +1643,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1592 | struct page *page; | 1643 | struct page *page; |
1593 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1644 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1594 | unsigned long nr_rotated = 0; | 1645 | unsigned long nr_rotated = 0; |
1646 | isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; | ||
1595 | 1647 | ||
1596 | lru_add_drain(); | 1648 | lru_add_drain(); |
1649 | |||
1650 | if (!sc->may_unmap) | ||
1651 | reclaim_mode |= ISOLATE_UNMAPPED; | ||
1652 | if (!sc->may_writepage) | ||
1653 | reclaim_mode |= ISOLATE_CLEAN; | ||
1654 | |||
1597 | spin_lock_irq(&zone->lru_lock); | 1655 | spin_lock_irq(&zone->lru_lock); |
1598 | if (scanning_global_lru(sc)) { | 1656 | if (scanning_global_lru(sc)) { |
1599 | nr_taken = isolate_pages_global(nr_pages, &l_hold, | 1657 | nr_taken = isolate_pages_global(nr_pages, &l_hold, |
1600 | &pgscanned, sc->order, | 1658 | &pgscanned, sc->order, |
1601 | ISOLATE_ACTIVE, zone, | 1659 | reclaim_mode, zone, |
1602 | 1, file); | 1660 | 1, file); |
1603 | zone->pages_scanned += pgscanned; | 1661 | zone->pages_scanned += pgscanned; |
1604 | } else { | 1662 | } else { |
1605 | nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, | 1663 | nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, |
1606 | &pgscanned, sc->order, | 1664 | &pgscanned, sc->order, |
1607 | ISOLATE_ACTIVE, zone, | 1665 | reclaim_mode, zone, |
1608 | sc->mem_cgroup, 1, file); | 1666 | sc->mem_cgroup, 1, file); |
1609 | /* | 1667 | /* |
1610 | * mem_cgroup_isolate_pages() keeps track of | 1668 | * mem_cgroup_isolate_pages() keeps track of |
@@ -1613,8 +1671,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1613 | } | 1671 | } |
1614 | 1672 | ||
1615 | reclaim_stat->recent_scanned[file] += nr_taken; | 1673 | reclaim_stat->recent_scanned[file] += nr_taken; |
1616 | if (!scanning_global_lru(sc)) | ||
1617 | sc->memcg_record->nr_scanned[file] += nr_taken; | ||
1618 | 1674 | ||
1619 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | 1675 | __count_zone_vm_events(PGREFILL, zone, pgscanned); |
1620 | if (file) | 1676 | if (file) |
@@ -1666,8 +1722,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1666 | * get_scan_ratio. | 1722 | * get_scan_ratio. |
1667 | */ | 1723 | */ |
1668 | reclaim_stat->recent_rotated[file] += nr_rotated; | 1724 | reclaim_stat->recent_rotated[file] += nr_rotated; |
1669 | if (!scanning_global_lru(sc)) | ||
1670 | sc->memcg_record->nr_rotated[file] += nr_rotated; | ||
1671 | 1725 | ||
1672 | move_active_pages_to_lru(zone, &l_active, | 1726 | move_active_pages_to_lru(zone, &l_active, |
1673 | LRU_ACTIVE + file * LRU_FILE); | 1727 | LRU_ACTIVE + file * LRU_FILE); |
@@ -1713,7 +1767,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) | |||
1713 | if (scanning_global_lru(sc)) | 1767 | if (scanning_global_lru(sc)) |
1714 | low = inactive_anon_is_low_global(zone); | 1768 | low = inactive_anon_is_low_global(zone); |
1715 | else | 1769 | else |
1716 | low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); | 1770 | low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone); |
1717 | return low; | 1771 | return low; |
1718 | } | 1772 | } |
1719 | #else | 1773 | #else |
@@ -1756,7 +1810,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) | |||
1756 | if (scanning_global_lru(sc)) | 1810 | if (scanning_global_lru(sc)) |
1757 | low = inactive_file_is_low_global(zone); | 1811 | low = inactive_file_is_low_global(zone); |
1758 | else | 1812 | else |
1759 | low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); | 1813 | low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone); |
1760 | return low; | 1814 | return low; |
1761 | } | 1815 | } |
1762 | 1816 | ||
@@ -1808,23 +1862,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1808 | u64 fraction[2], denominator; | 1862 | u64 fraction[2], denominator; |
1809 | enum lru_list l; | 1863 | enum lru_list l; |
1810 | int noswap = 0; | 1864 | int noswap = 0; |
1811 | int force_scan = 0; | 1865 | bool force_scan = false; |
1812 | unsigned long nr_force_scan[2]; | ||
1813 | |||
1814 | |||
1815 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | ||
1816 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | ||
1817 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | ||
1818 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
1819 | 1866 | ||
1820 | if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { | 1867 | /* |
1821 | /* kswapd does zone balancing and need to scan this zone */ | 1868 | * If the zone or memcg is small, nr[l] can be 0. This |
1822 | if (scanning_global_lru(sc) && current_is_kswapd()) | 1869 | * results in no scanning on this priority and a potential |
1823 | force_scan = 1; | 1870 | * priority drop. Global direct reclaim can go to the next |
1824 | /* memcg may have small limit and need to avoid priority drop */ | 1871 | * zone and tends to have no problems. Global kswapd is for |
1825 | if (!scanning_global_lru(sc)) | 1872 | * zone balancing and it needs to scan a minimum amount. When |
1826 | force_scan = 1; | 1873 | * reclaiming for a memcg, a priority drop can cause high |
1827 | } | 1874 | * latencies, so it's better to scan a minimum amount there as |
1875 | * well. | ||
1876 | */ | ||
1877 | if (scanning_global_lru(sc) && current_is_kswapd()) | ||
1878 | force_scan = true; | ||
1879 | if (!scanning_global_lru(sc)) | ||
1880 | force_scan = true; | ||
1828 | 1881 | ||
1829 | /* If we have no swap space, do not bother scanning anon pages. */ | 1882 | /* If we have no swap space, do not bother scanning anon pages. */ |
1830 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | 1883 | if (!sc->may_swap || (nr_swap_pages <= 0)) { |
@@ -1832,11 +1885,14 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1832 | fraction[0] = 0; | 1885 | fraction[0] = 0; |
1833 | fraction[1] = 1; | 1886 | fraction[1] = 1; |
1834 | denominator = 1; | 1887 | denominator = 1; |
1835 | nr_force_scan[0] = 0; | ||
1836 | nr_force_scan[1] = SWAP_CLUSTER_MAX; | ||
1837 | goto out; | 1888 | goto out; |
1838 | } | 1889 | } |
1839 | 1890 | ||
1891 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | ||
1892 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | ||
1893 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | ||
1894 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
1895 | |||
1840 | if (scanning_global_lru(sc)) { | 1896 | if (scanning_global_lru(sc)) { |
1841 | free = zone_page_state(zone, NR_FREE_PAGES); | 1897 | free = zone_page_state(zone, NR_FREE_PAGES); |
1842 | /* If we have very few page cache pages, | 1898 | /* If we have very few page cache pages, |
@@ -1845,8 +1901,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1845 | fraction[0] = 1; | 1901 | fraction[0] = 1; |
1846 | fraction[1] = 0; | 1902 | fraction[1] = 0; |
1847 | denominator = 1; | 1903 | denominator = 1; |
1848 | nr_force_scan[0] = SWAP_CLUSTER_MAX; | ||
1849 | nr_force_scan[1] = 0; | ||
1850 | goto out; | 1904 | goto out; |
1851 | } | 1905 | } |
1852 | } | 1906 | } |
@@ -1895,11 +1949,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1895 | fraction[0] = ap; | 1949 | fraction[0] = ap; |
1896 | fraction[1] = fp; | 1950 | fraction[1] = fp; |
1897 | denominator = ap + fp + 1; | 1951 | denominator = ap + fp + 1; |
1898 | if (force_scan) { | ||
1899 | unsigned long scan = SWAP_CLUSTER_MAX; | ||
1900 | nr_force_scan[0] = div64_u64(scan * ap, denominator); | ||
1901 | nr_force_scan[1] = div64_u64(scan * fp, denominator); | ||
1902 | } | ||
1903 | out: | 1952 | out: |
1904 | for_each_evictable_lru(l) { | 1953 | for_each_evictable_lru(l) { |
1905 | int file = is_file_lru(l); | 1954 | int file = is_file_lru(l); |
@@ -1908,20 +1957,10 @@ out: | |||
1908 | scan = zone_nr_lru_pages(zone, sc, l); | 1957 | scan = zone_nr_lru_pages(zone, sc, l); |
1909 | if (priority || noswap) { | 1958 | if (priority || noswap) { |
1910 | scan >>= priority; | 1959 | scan >>= priority; |
1960 | if (!scan && force_scan) | ||
1961 | scan = SWAP_CLUSTER_MAX; | ||
1911 | scan = div64_u64(scan * fraction[file], denominator); | 1962 | scan = div64_u64(scan * fraction[file], denominator); |
1912 | } | 1963 | } |
1913 | |||
1914 | /* | ||
1915 | * If zone is small or memcg is small, nr[l] can be 0. | ||
1916 | * This results no-scan on this priority and priority drop down. | ||
1917 | * For global direct reclaim, it can visit next zone and tend | ||
1918 | * not to have problems. For global kswapd, it's for zone | ||
1919 | * balancing and it need to scan a small amounts. When using | ||
1920 | * memcg, priority drop can cause big latency. So, it's better | ||
1921 | * to scan small amount. See may_noscan above. | ||
1922 | */ | ||
1923 | if (!scan && force_scan) | ||
1924 | scan = nr_force_scan[file]; | ||
1925 | nr[l] = scan; | 1964 | nr[l] = scan; |
1926 | } | 1965 | } |
1927 | } | 1966 | } |
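With nr_force_scan[] gone, the force_scan minimum is applied inside the final loop: the raw LRU size is shifted by priority, bumped to SWAP_CLUSTER_MAX if that rounds to zero, and only then split by fraction[]/denominator. The standalone arithmetic sketch below mirrors that loop; the LRU sizes, fraction and denominator are example numbers, not values derived from recent_scanned/recent_rotated as the real function computes them.

#include <stdbool.h>
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32ULL

enum lru { INACTIVE_ANON, ACTIVE_ANON, INACTIVE_FILE, ACTIVE_FILE, NR_LRU };

static const char *lru_name[NR_LRU] = {
        "inactive_anon", "active_anon", "inactive_file", "active_file"
};

static void split_scan_count(const unsigned long long lru_pages[NR_LRU],
                             const unsigned long long fraction[2], /* [0]=anon [1]=file */
                             unsigned long long denominator,
                             int priority, bool force_scan,
                             unsigned long long nr[NR_LRU])
{
        for (int l = 0; l < NR_LRU; l++) {
                int file = (l == INACTIVE_FILE || l == ACTIVE_FILE);
                unsigned long long scan = lru_pages[l] >> priority;

                if (!scan && force_scan)
                        scan = SWAP_CLUSTER_MAX;        /* enforce the minimum scan */
                nr[l] = scan * fraction[file] / denominator;
        }
}

int main(void)
{
        /* A small memcg: every list rounds to 0 after the priority shift. */
        unsigned long long lru_pages[NR_LRU] = { 600, 200, 900, 300 };
        unsigned long long fraction[2] = { 25, 75 }, denominator = 25 + 75 + 1;
        unsigned long long nr[NR_LRU];

        split_scan_count(lru_pages, fraction, denominator, 12, true, nr);
        for (int l = 0; l < NR_LRU; l++)
                printf("%-13s: scan %llu pages\n", lru_name[l], nr[l]);
        return 0;
}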
@@ -2000,12 +2039,14 @@ static void shrink_zone(int priority, struct zone *zone, | |||
2000 | enum lru_list l; | 2039 | enum lru_list l; |
2001 | unsigned long nr_reclaimed, nr_scanned; | 2040 | unsigned long nr_reclaimed, nr_scanned; |
2002 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 2041 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
2042 | struct blk_plug plug; | ||
2003 | 2043 | ||
2004 | restart: | 2044 | restart: |
2005 | nr_reclaimed = 0; | 2045 | nr_reclaimed = 0; |
2006 | nr_scanned = sc->nr_scanned; | 2046 | nr_scanned = sc->nr_scanned; |
2007 | get_scan_count(zone, sc, nr, priority); | 2047 | get_scan_count(zone, sc, nr, priority); |
2008 | 2048 | ||
2049 | blk_start_plug(&plug); | ||
2009 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 2050 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
2010 | nr[LRU_INACTIVE_FILE]) { | 2051 | nr[LRU_INACTIVE_FILE]) { |
2011 | for_each_evictable_lru(l) { | 2052 | for_each_evictable_lru(l) { |
@@ -2029,6 +2070,7 @@ restart: | |||
2029 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) | 2070 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) |
2030 | break; | 2071 | break; |
2031 | } | 2072 | } |
2073 | blk_finish_plug(&plug); | ||
2032 | sc->nr_reclaimed += nr_reclaimed; | 2074 | sc->nr_reclaimed += nr_reclaimed; |
2033 | 2075 | ||
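blk_start_plug()/blk_finish_plug() batch the block I/O issued while the LRU lists are being shrunk so the block layer can merge and dispatch it when the loop finishes. The following is only a toy userspace model of that plug/unplug idea; the queue, the merge rule and every name are invented, none of it is the block-layer API.

#include <stdio.h>

#define PLUG_DEPTH 16

struct toy_plug {
        unsigned long start[PLUG_DEPTH];        /* pending request start sectors */
        unsigned long len[PLUG_DEPTH];
        int depth;
};

static void toy_start_plug(struct toy_plug *plug)
{
        plug->depth = 0;
}

static void toy_finish_plug(struct toy_plug *plug)
{
        /* "Unplug": dispatch everything queued while the plug was held. */
        for (int i = 0; i < plug->depth; i++)
                printf("dispatch sector %lu, %lu sectors\n",
                       plug->start[i], plug->len[i]);
        plug->depth = 0;
}

static void toy_submit(struct toy_plug *plug, unsigned long sector,
                       unsigned long nr_sectors)
{
        if (plug->depth == PLUG_DEPTH)
                toy_finish_plug(plug);          /* queue full: unplug early */
        /* Merge with the previous request when the I/O is contiguous. */
        if (plug->depth &&
            plug->start[plug->depth - 1] + plug->len[plug->depth - 1] == sector) {
                plug->len[plug->depth - 1] += nr_sectors;
                return;
        }
        plug->start[plug->depth] = sector;
        plug->len[plug->depth] = nr_sectors;
        plug->depth++;
}

int main(void)
{
        struct toy_plug plug;

        toy_start_plug(&plug);
        toy_submit(&plug, 100, 8);      /* three contiguous writebacks ...   */
        toy_submit(&plug, 108, 8);
        toy_submit(&plug, 116, 8);      /* ... leave the plug as one request */
        toy_submit(&plug, 512, 8);
        toy_finish_plug(&plug);
        return 0;
}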
2034 | /* | 2076 | /* |
@@ -2061,14 +2103,19 @@ restart: | |||
2061 | * | 2103 | * |
2062 | * If a zone is deemed to be full of pinned pages then just give it a light | 2104 | * If a zone is deemed to be full of pinned pages then just give it a light |
2063 | * scan then give up on it. | 2105 | * scan then give up on it. |
2106 | * | ||
2107 | * This function returns true if a zone is being reclaimed for a costly | ||
2108 | * high-order allocation and compaction is either ready to begin or deferred. | ||
2109 | * This indicates to the caller that it should retry the allocation or fail. | ||
2064 | */ | 2110 | */ |
2065 | static void shrink_zones(int priority, struct zonelist *zonelist, | 2111 | static bool shrink_zones(int priority, struct zonelist *zonelist, |
2066 | struct scan_control *sc) | 2112 | struct scan_control *sc) |
2067 | { | 2113 | { |
2068 | struct zoneref *z; | 2114 | struct zoneref *z; |
2069 | struct zone *zone; | 2115 | struct zone *zone; |
2070 | unsigned long nr_soft_reclaimed; | 2116 | unsigned long nr_soft_reclaimed; |
2071 | unsigned long nr_soft_scanned; | 2117 | unsigned long nr_soft_scanned; |
2118 | bool should_abort_reclaim = false; | ||
2072 | 2119 | ||
2073 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2120 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
2074 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2121 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
@@ -2083,6 +2130,23 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
2083 | continue; | 2130 | continue; |
2084 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2131 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
2085 | continue; /* Let kswapd poll it */ | 2132 | continue; /* Let kswapd poll it */ |
2133 | if (COMPACTION_BUILD) { | ||
2134 | /* | ||
2135 | * If we already have plenty of memory free for | ||
2136 | * compaction in this zone, don't free any more. | ||
2137 | * Even though compaction is invoked for any | ||
2138 | * non-zero order, only frequent costly order | ||
2139 | * reclamation is disruptive enough to become a | ||
2140 | * noticable problem, like transparent huge page | ||
2141 | * allocations. | ||
2142 | */ | ||
2143 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER && | ||
2144 | (compaction_suitable(zone, sc->order) || | ||
2145 | compaction_deferred(zone))) { | ||
2146 | should_abort_reclaim = true; | ||
2147 | continue; | ||
2148 | } | ||
2149 | } | ||
2086 | /* | 2150 | /* |
2087 | * This steals pages from memory cgroups over softlimit | 2151 | * This steals pages from memory cgroups over softlimit |
2088 | * and returns the number of reclaimed pages and | 2152 | * and returns the number of reclaimed pages and |
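The new COMPACTION_BUILD block skips a zone once a costly high-order allocation would be better served by compaction (enough free memory for compaction, or compaction recently deferred), and the should_abort_reclaim result lets do_try_to_free_pages() bail out early. Below is a simplified standalone sketch of that decision, with boolean stubs standing in for compaction_suitable()/compaction_deferred():

#include <stdbool.h>
#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER 3       /* kernel default of this era */

struct fake_zone {
        const char *name;
        bool enough_free_for_compaction;        /* stands in for compaction_suitable() */
        bool compaction_was_deferred;           /* stands in for compaction_deferred() */
};

/* true: skip this zone and let the caller abort reclaim entirely. */
static bool defer_to_compaction(const struct fake_zone *zone, int order)
{
        if (order <= PAGE_ALLOC_COSTLY_ORDER)
                return false;   /* low orders: reclaim is cheap enough */
        return zone->enough_free_for_compaction || zone->compaction_was_deferred;
}

int main(void)
{
        struct fake_zone normal = { .name = "Normal",
                                    .enough_free_for_compaction = true };
        struct fake_zone dma32 = { .name = "DMA32" };
        int orders[] = { 1, 9 };        /* order-9 ~ a transparent huge page */

        for (int i = 0; i < 2; i++)
                printf("order-%d: Normal -> %s, DMA32 -> %s\n", orders[i],
                       defer_to_compaction(&normal, orders[i]) ?
                               "abort reclaim" : "shrink zone",
                       defer_to_compaction(&dma32, orders[i]) ?
                               "abort reclaim" : "shrink zone");
        return 0;
}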
@@ -2100,6 +2164,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
2100 | 2164 | ||
2101 | shrink_zone(priority, zone, sc); | 2165 | shrink_zone(priority, zone, sc); |
2102 | } | 2166 | } |
2167 | |||
2168 | return should_abort_reclaim; | ||
2103 | } | 2169 | } |
2104 | 2170 | ||
2105 | static bool zone_reclaimable(struct zone *zone) | 2171 | static bool zone_reclaimable(struct zone *zone) |
@@ -2164,7 +2230,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2164 | sc->nr_scanned = 0; | 2230 | sc->nr_scanned = 0; |
2165 | if (!priority) | 2231 | if (!priority) |
2166 | disable_swap_token(sc->mem_cgroup); | 2232 | disable_swap_token(sc->mem_cgroup); |
2167 | shrink_zones(priority, zonelist, sc); | 2233 | if (shrink_zones(priority, zonelist, sc)) |
2234 | break; | ||
2235 | |||
2168 | /* | 2236 | /* |
2169 | * Don't shrink slabs when reclaiming memory from | 2237 | * Don't shrink slabs when reclaiming memory from |
2170 | * over limit cgroups | 2238 | * over limit cgroups |
@@ -2198,7 +2266,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2198 | */ | 2266 | */ |
2199 | writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; | 2267 | writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; |
2200 | if (total_scanned > writeback_threshold) { | 2268 | if (total_scanned > writeback_threshold) { |
2201 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); | 2269 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned, |
2270 | WB_REASON_TRY_TO_FREE_PAGES); | ||
2202 | sc->may_writepage = 1; | 2271 | sc->may_writepage = 1; |
2203 | } | 2272 | } |
2204 | 2273 | ||
@@ -2268,10 +2337,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2268 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 2337 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
2269 | 2338 | ||
2270 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | 2339 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, |
2271 | gfp_t gfp_mask, bool noswap, | 2340 | gfp_t gfp_mask, bool noswap, |
2272 | struct zone *zone, | 2341 | struct zone *zone, |
2273 | struct memcg_scanrecord *rec, | 2342 | unsigned long *nr_scanned) |
2274 | unsigned long *scanned) | ||
2275 | { | 2343 | { |
2276 | struct scan_control sc = { | 2344 | struct scan_control sc = { |
2277 | .nr_scanned = 0, | 2345 | .nr_scanned = 0, |
@@ -2281,9 +2349,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2281 | .may_swap = !noswap, | 2349 | .may_swap = !noswap, |
2282 | .order = 0, | 2350 | .order = 0, |
2283 | .mem_cgroup = mem, | 2351 | .mem_cgroup = mem, |
2284 | .memcg_record = rec, | ||
2285 | }; | 2352 | }; |
2286 | ktime_t start, end; | ||
2287 | 2353 | ||
2288 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2354 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2289 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 2355 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
@@ -2292,7 +2358,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2292 | sc.may_writepage, | 2358 | sc.may_writepage, |
2293 | sc.gfp_mask); | 2359 | sc.gfp_mask); |
2294 | 2360 | ||
2295 | start = ktime_get(); | ||
2296 | /* | 2361 | /* |
2297 | * NOTE: Although we can get the priority field, using it | 2362 | * NOTE: Although we can get the priority field, using it |
2298 | * here is not a good idea, since it limits the pages we can scan. | 2363 | * here is not a good idea, since it limits the pages we can scan. |
@@ -2301,25 +2366,19 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2301 | * the priority and make it zero. | 2366 | * the priority and make it zero. |
2302 | */ | 2367 | */ |
2303 | shrink_zone(0, zone, &sc); | 2368 | shrink_zone(0, zone, &sc); |
2304 | end = ktime_get(); | ||
2305 | |||
2306 | if (rec) | ||
2307 | rec->elapsed += ktime_to_ns(ktime_sub(end, start)); | ||
2308 | *scanned = sc.nr_scanned; | ||
2309 | 2369 | ||
2310 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); | 2370 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); |
2311 | 2371 | ||
2372 | *nr_scanned = sc.nr_scanned; | ||
2312 | return sc.nr_reclaimed; | 2373 | return sc.nr_reclaimed; |
2313 | } | 2374 | } |
2314 | 2375 | ||
2315 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | 2376 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, |
2316 | gfp_t gfp_mask, | 2377 | gfp_t gfp_mask, |
2317 | bool noswap, | 2378 | bool noswap) |
2318 | struct memcg_scanrecord *rec) | ||
2319 | { | 2379 | { |
2320 | struct zonelist *zonelist; | 2380 | struct zonelist *zonelist; |
2321 | unsigned long nr_reclaimed; | 2381 | unsigned long nr_reclaimed; |
2322 | ktime_t start, end; | ||
2323 | int nid; | 2382 | int nid; |
2324 | struct scan_control sc = { | 2383 | struct scan_control sc = { |
2325 | .may_writepage = !laptop_mode, | 2384 | .may_writepage = !laptop_mode, |
@@ -2328,7 +2387,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2328 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2387 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2329 | .order = 0, | 2388 | .order = 0, |
2330 | .mem_cgroup = mem_cont, | 2389 | .mem_cgroup = mem_cont, |
2331 | .memcg_record = rec, | ||
2332 | .nodemask = NULL, /* we don't care the placement */ | 2390 | .nodemask = NULL, /* we don't care the placement */ |
2333 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2391 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2334 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), | 2392 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), |
@@ -2337,7 +2395,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2337 | .gfp_mask = sc.gfp_mask, | 2395 | .gfp_mask = sc.gfp_mask, |
2338 | }; | 2396 | }; |
2339 | 2397 | ||
2340 | start = ktime_get(); | ||
2341 | /* | 2398 | /* |
2342 | * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't | 2399 | * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't |
2343 | * take care of from where we get pages. So the node where we start the | 2400 | * take care of from where we get pages. So the node where we start the |
@@ -2352,9 +2409,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2352 | sc.gfp_mask); | 2409 | sc.gfp_mask); |
2353 | 2410 | ||
2354 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); | 2411 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); |
2355 | end = ktime_get(); | ||
2356 | if (rec) | ||
2357 | rec->elapsed += ktime_to_ns(ktime_sub(end, start)); | ||
2358 | 2412 | ||
2359 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); | 2413 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); |
2360 | 2414 | ||
@@ -2722,6 +2776,8 @@ out: | |||
2722 | 2776 | ||
2723 | /* If balanced, clear the congested flag */ | 2777 | /* If balanced, clear the congested flag */ |
2724 | zone_clear_flag(zone, ZONE_CONGESTED); | 2778 | zone_clear_flag(zone, ZONE_CONGESTED); |
2779 | if (i <= *classzone_idx) | ||
2780 | balanced += zone->present_pages; | ||
2725 | } | 2781 | } |
2726 | } | 2782 | } |
2727 | 2783 | ||
@@ -2795,7 +2851,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2795 | static int kswapd(void *p) | 2851 | static int kswapd(void *p) |
2796 | { | 2852 | { |
2797 | unsigned long order, new_order; | 2853 | unsigned long order, new_order; |
2854 | unsigned balanced_order; | ||
2798 | int classzone_idx, new_classzone_idx; | 2855 | int classzone_idx, new_classzone_idx; |
2856 | int balanced_classzone_idx; | ||
2799 | pg_data_t *pgdat = (pg_data_t*)p; | 2857 | pg_data_t *pgdat = (pg_data_t*)p; |
2800 | struct task_struct *tsk = current; | 2858 | struct task_struct *tsk = current; |
2801 | 2859 | ||
@@ -2826,7 +2884,9 @@ static int kswapd(void *p) | |||
2826 | set_freezable(); | 2884 | set_freezable(); |
2827 | 2885 | ||
2828 | order = new_order = 0; | 2886 | order = new_order = 0; |
2887 | balanced_order = 0; | ||
2829 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; | 2888 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; |
2889 | balanced_classzone_idx = classzone_idx; | ||
2830 | for ( ; ; ) { | 2890 | for ( ; ; ) { |
2831 | int ret; | 2891 | int ret; |
2832 | 2892 | ||
@@ -2835,7 +2895,8 @@ static int kswapd(void *p) | |||
2835 | * new request of a similar or harder type will succeed soon | 2895 | * new request of a similar or harder type will succeed soon |
2836 | * so consider going to sleep on the basis we reclaimed at | 2896 | * so consider going to sleep on the basis we reclaimed at |
2837 | */ | 2897 | */ |
2838 | if (classzone_idx >= new_classzone_idx && order == new_order) { | 2898 | if (balanced_classzone_idx >= new_classzone_idx && |
2899 | balanced_order == new_order) { | ||
2839 | new_order = pgdat->kswapd_max_order; | 2900 | new_order = pgdat->kswapd_max_order; |
2840 | new_classzone_idx = pgdat->classzone_idx; | 2901 | new_classzone_idx = pgdat->classzone_idx; |
2841 | pgdat->kswapd_max_order = 0; | 2902 | pgdat->kswapd_max_order = 0; |
@@ -2850,9 +2911,12 @@ static int kswapd(void *p) | |||
2850 | order = new_order; | 2911 | order = new_order; |
2851 | classzone_idx = new_classzone_idx; | 2912 | classzone_idx = new_classzone_idx; |
2852 | } else { | 2913 | } else { |
2853 | kswapd_try_to_sleep(pgdat, order, classzone_idx); | 2914 | kswapd_try_to_sleep(pgdat, balanced_order, |
2915 | balanced_classzone_idx); | ||
2854 | order = pgdat->kswapd_max_order; | 2916 | order = pgdat->kswapd_max_order; |
2855 | classzone_idx = pgdat->classzone_idx; | 2917 | classzone_idx = pgdat->classzone_idx; |
2918 | new_order = order; | ||
2919 | new_classzone_idx = classzone_idx; | ||
2856 | pgdat->kswapd_max_order = 0; | 2920 | pgdat->kswapd_max_order = 0; |
2857 | pgdat->classzone_idx = pgdat->nr_zones - 1; | 2921 | pgdat->classzone_idx = pgdat->nr_zones - 1; |
2858 | } | 2922 | } |
@@ -2867,7 +2931,9 @@ static int kswapd(void *p) | |||
2867 | */ | 2931 | */ |
2868 | if (!ret) { | 2932 | if (!ret) { |
2869 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); | 2933 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); |
2870 | order = balance_pgdat(pgdat, order, &classzone_idx); | 2934 | balanced_classzone_idx = classzone_idx; |
2935 | balanced_order = balance_pgdat(pgdat, order, | ||
2936 | &balanced_classzone_idx); | ||
2871 | } | 2937 | } |
2872 | } | 2938 | } |
2873 | return 0; | 2939 | return 0; |
@@ -3379,66 +3445,12 @@ void scan_mapping_unevictable_pages(struct address_space *mapping) | |||
3379 | 3445 | ||
3380 | } | 3446 | } |
3381 | 3447 | ||
3382 | /** | 3448 | static void warn_scan_unevictable_pages(void) |
3383 | * scan_zone_unevictable_pages - check unevictable list for evictable pages | ||
3384 | * @zone - zone of which to scan the unevictable list | ||
3385 | * | ||
3386 | * Scan @zone's unevictable LRU lists to check for pages that have become | ||
3387 | * evictable. Move those that have to @zone's inactive list where they | ||
3388 | * become candidates for reclaim, unless shrink_inactive_zone() decides | ||
3389 | * to reactivate them. Pages that are still unevictable are rotated | ||
3390 | * back onto @zone's unevictable list. | ||
3391 | */ | ||
3392 | #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ | ||
3393 | static void scan_zone_unevictable_pages(struct zone *zone) | ||
3394 | { | ||
3395 | struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; | ||
3396 | unsigned long scan; | ||
3397 | unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); | ||
3398 | |||
3399 | while (nr_to_scan > 0) { | ||
3400 | unsigned long batch_size = min(nr_to_scan, | ||
3401 | SCAN_UNEVICTABLE_BATCH_SIZE); | ||
3402 | |||
3403 | spin_lock_irq(&zone->lru_lock); | ||
3404 | for (scan = 0; scan < batch_size; scan++) { | ||
3405 | struct page *page = lru_to_page(l_unevictable); | ||
3406 | |||
3407 | if (!trylock_page(page)) | ||
3408 | continue; | ||
3409 | |||
3410 | prefetchw_prev_lru_page(page, l_unevictable, flags); | ||
3411 | |||
3412 | if (likely(PageLRU(page) && PageUnevictable(page))) | ||
3413 | check_move_unevictable_page(page, zone); | ||
3414 | |||
3415 | unlock_page(page); | ||
3416 | } | ||
3417 | spin_unlock_irq(&zone->lru_lock); | ||
3418 | |||
3419 | nr_to_scan -= batch_size; | ||
3420 | } | ||
3421 | } | ||
3422 | |||
3423 | |||
3424 | /** | ||
3425 | * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages | ||
3426 | * | ||
3427 | * A really big hammer: scan all zones' unevictable LRU lists to check for | ||
3428 | * pages that have become evictable. Move those back to the zones' | ||
3429 | * inactive list where they become candidates for reclaim. | ||
3430 | * This occurs when, e.g., we have unswappable pages on the unevictable lists, | ||
3431 | * and we add swap to the system. As such, it runs in the context of a task | ||
3432 | * that has possibly/probably made some previously unevictable pages | ||
3433 | * evictable. | ||
3434 | */ | ||
3435 | static void scan_all_zones_unevictable_pages(void) | ||
3436 | { | 3449 | { |
3437 | struct zone *zone; | 3450 | printk_once(KERN_WARNING |
3438 | 3451 | "The scan_unevictable_pages sysctl/node-interface has been " | |
3439 | for_each_zone(zone) { | 3452 | "disabled for lack of a legitimate use case. If you have " |
3440 | scan_zone_unevictable_pages(zone); | 3453 | "one, please send an email to linux-mm@kvack.org.\n"); |
3441 | } | ||
3442 | } | 3454 | } |
3443 | 3455 | ||
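warn_scan_unevictable_pages() relies on printk_once(), which guards each call site with a static flag so the message is emitted a single time however often the sysctl or node attribute is written. A tiny userspace sketch of the same warn-once idiom follows; the macro and the shortened message text are illustrative, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

/* Roughly what printk_once() boils down to: a static guard per call site. */
#define warn_once(...)                                  \
        do {                                            \
                static bool warned;                     \
                if (!warned) {                          \
                        warned = true;                  \
                        fprintf(stderr, __VA_ARGS__);   \
                }                                       \
        } while (0)

static void warn_scan_unevictable_pages(void)
{
        warn_once("scan_unevictable_pages is now a no-op; "
                  "mail linux-mm@kvack.org if you still need it.\n");
}

int main(void)
{
        /* Simulate three writes to the sysctl: only the first one warns. */
        for (int i = 0; i < 3; i++)
                warn_scan_unevictable_pages();
        return 0;
}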
3444 | /* | 3456 | /* |
@@ -3451,11 +3463,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write, | |||
3451 | void __user *buffer, | 3463 | void __user *buffer, |
3452 | size_t *length, loff_t *ppos) | 3464 | size_t *length, loff_t *ppos) |
3453 | { | 3465 | { |
3466 | warn_scan_unevictable_pages(); | ||
3454 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | 3467 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
3455 | |||
3456 | if (write && *(unsigned long *)table->data) | ||
3457 | scan_all_zones_unevictable_pages(); | ||
3458 | |||
3459 | scan_unevictable_pages = 0; | 3468 | scan_unevictable_pages = 0; |
3460 | return 0; | 3469 | return 0; |
3461 | } | 3470 | } |
@@ -3470,6 +3479,7 @@ static ssize_t read_scan_unevictable_node(struct sys_device *dev, | |||
3470 | struct sysdev_attribute *attr, | 3479 | struct sysdev_attribute *attr, |
3471 | char *buf) | 3480 | char *buf) |
3472 | { | 3481 | { |
3482 | warn_scan_unevictable_pages(); | ||
3473 | return sprintf(buf, "0\n"); /* always zero; should fit... */ | 3483 | return sprintf(buf, "0\n"); /* always zero; should fit... */ |
3474 | } | 3484 | } |
3475 | 3485 | ||
@@ -3477,19 +3487,7 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev, | |||
3477 | struct sysdev_attribute *attr, | 3487 | struct sysdev_attribute *attr, |
3478 | const char *buf, size_t count) | 3488 | const char *buf, size_t count) |
3479 | { | 3489 | { |
3480 | struct zone *node_zones = NODE_DATA(dev->id)->node_zones; | 3490 | warn_scan_unevictable_pages(); |
3481 | struct zone *zone; | ||
3482 | unsigned long res; | ||
3483 | unsigned long req = strict_strtoul(buf, 10, &res); | ||
3484 | |||
3485 | if (!req) | ||
3486 | return 1; /* zero is no-op */ | ||
3487 | |||
3488 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
3489 | if (!populated_zone(zone)) | ||
3490 | continue; | ||
3491 | scan_zone_unevictable_pages(zone); | ||
3492 | } | ||
3493 | return 1; | 3491 | return 1; |
3494 | } | 3492 | } |
3495 | 3493 | ||