Diffstat (limited to 'mm/vmscan.c')
 mm/vmscan.c | 140 ++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 100 insertions(+), 40 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5d4c4d02254d..eca70310adb2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/highmem.h>
+#include <linux/vmstat.h>
 #include <linux/file.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
@@ -62,6 +63,8 @@ struct scan_control {
 	int swap_cluster_max;
 
 	int swappiness;
+
+	int all_unreclaimable;
 };
 
 /*
@@ -368,7 +371,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 			/* synchronous write or broken a_ops? */
 			ClearPageReclaim(page);
 		}
-
+		inc_zone_page_state(page, NR_VMSCAN_WRITE);
 		return PAGE_SUCCESS;
 	}
 
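The added line accounts every page the reclaim scanner submits for writeback in a per-zone NR_VMSCAN_WRITE counter. A userspace sketch for sampling it, under the assumption that the counter is exported through /proc/vmstat as "nr_vmscan_write" (that wiring lives in mm/vmstat.c and is not shown in this hunk):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char line[256];
        FILE *f = fopen("/proc/vmstat", "r");   /* assumed export point */

        if (!f) {
            perror("/proc/vmstat");
            return 1;
        }
        while (fgets(line, sizeof(line), f)) {
            /* "nr_vmscan_write" is 15 characters long */
            if (strncmp(line, "nr_vmscan_write", 15) == 0)
                printf("pages written by reclaim:%s", line + 15);
        }
        fclose(f);
        return 0;
    }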
@@ -377,15 +380,34 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 
 int remove_mapping(struct address_space *mapping, struct page *page)
 {
-	if (!mapping)
-		return 0;		/* truncate got there first */
+	BUG_ON(!PageLocked(page));
+	BUG_ON(mapping != page_mapping(page));
 
 	write_lock_irq(&mapping->tree_lock);
-
 	/*
-	 * The non-racy check for busy page. It is critical to check
-	 * PageDirty _after_ making sure that the page is freeable and
-	 * not in use by anybody. (pagecache + us == 2)
+	 * The non racy check for a busy page.
+	 *
+	 * Must be careful with the order of the tests. When someone has
+	 * a ref to the page, it may be possible that they dirty it then
+	 * drop the reference. So if PageDirty is tested before page_count
+	 * here, then the following race may occur:
+	 *
+	 * get_user_pages(&page);
+	 * [user mapping goes away]
+	 * write_to(page);
+	 *				!PageDirty(page)	[good]
+	 * SetPageDirty(page);
+	 * put_page(page);
+	 *				!page_count(page)	[good, discard it]
+	 *
+	 * [oops, our write_to data is lost]
+	 *
+	 * Reversing the order of the tests ensures such a situation cannot
+	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
+	 * load is not satisfied before that of page->_count.
+	 *
+	 * Note that if SetPageDirty is always performed via set_page_dirty,
+	 * and thus under tree_lock, then this ordering is not required.
 	 */
 	if (unlikely(page_count(page) != 2))
 		goto cannot_free;
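The rewritten comment is the heart of this hunk: the refcount must be inspected before the dirty bit, with an smp_rmb() between the two loads, or a concurrent user can dirty the page and drop its reference in the window between the checks. A single-threaded toy model of that one interleaving, for illustration only (it cannot reproduce the memory-barrier aspect, and the struct and field names are invented):

    #include <stdio.h>

    /* "count" stands in for page->_count (pagecache + one temporary user),
     * "dirty" for PageDirty(). */
    struct toy_page {
        int count;
        int dirty;
    };

    int main(void)
    {
        /* buggy order: test the dirty bit first, the refcount second */
        struct toy_page a = { .count = 3, .dirty = 0 };  /* get_user_pages() holds a ref */
        int clean_seen = !a.dirty;   /* reclaim: !PageDirty(page)  [looks good] */
        a.dirty = 1;                 /* other CPU: write_to(page) + SetPageDirty(page) */
        a.count--;                   /* other CPU: put_page(page) */
        if (clean_seen && a.count == 2)
            printf("buggy order: dirty page discarded, write_to data lost\n");

        /* fixed order: test the refcount first, the dirty bit second */
        struct toy_page b = { .count = 3, .dirty = 0 };
        int busy = (b.count != 2);   /* extra ref still visible -> keep the page */
        b.dirty = 1;
        b.count--;
        if (busy || b.dirty)
            printf("fixed order: page kept, nothing lost\n");

        return 0;
    }

Run sequentially, the first branch prints the data-loss outcome the comment describes, while the second shows why checking the refcount first (or, failing that, seeing the dirty bit afterwards) keeps the page.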
@@ -440,7 +462,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (TestSetPageLocked(page))
 			goto keep;
 
-		BUG_ON(PageActive(page));
+		VM_BUG_ON(PageActive(page));
 
 		sc->nr_scanned++;
 
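This and the following hunks downgrade unconditional BUG_ON() checks to VM_BUG_ON(), which only costs anything on kernels built with CONFIG_DEBUG_VM. As a hedged sketch (the real definition lives in include/linux/mm.h and is not part of this diff), the macro is shaped roughly like:

    #ifdef CONFIG_DEBUG_VM
    #define VM_BUG_ON(cond)		BUG_ON(cond)
    #else
    #define VM_BUG_ON(cond)		do { } while (0)	/* compiled away */
    #endif

so these page-flag sanity checks disappear entirely from production builds.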
@@ -547,7 +569,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			goto free_it;
 		}
 
-		if (!remove_mapping(mapping, page))
+		if (!mapping || !remove_mapping(mapping, page))
 			goto keep_locked;
 
 free_it:
@@ -564,7 +586,7 @@ keep_locked:
 		unlock_page(page);
 keep:
 		list_add(&page->lru, &ret_pages);
-		BUG_ON(PageLRU(page));
+		VM_BUG_ON(PageLRU(page));
 	}
 	list_splice(&ret_pages, page_list);
 	if (pagevec_count(&freed_pvec))
@@ -603,7 +625,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
-		BUG_ON(!PageLRU(page));
+		VM_BUG_ON(!PageLRU(page));
 
 		list_del(&page->lru);
 		target = src;
@@ -674,7 +696,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		 */
 		while (!list_empty(&page_list)) {
 			page = lru_to_page(&page_list);
-			BUG_ON(PageLRU(page));
+			VM_BUG_ON(PageLRU(page));
 			SetPageLRU(page);
 			list_del(&page->lru);
 			if (PageActive(page))
@@ -695,6 +717,11 @@ done:
 	return nr_reclaimed;
 }
 
+static inline int zone_is_near_oom(struct zone *zone)
+{
+	return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
+}
+
 /*
  * This moves pages from the active list to the inactive list.
  *
@@ -730,6 +757,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		long distress;
 		long swap_tendency;
 
+		if (zone_is_near_oom(zone))
+			goto force_reclaim_mapped;
+
 		/*
 		 * `distress' is a measure of how much trouble we're having
 		 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
@@ -765,6 +795,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		 * memory onto the inactive list.
 		 */
 		if (swap_tendency >= 100)
+force_reclaim_mapped:
 			reclaim_mapped = 1;
 	}
 
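zone_is_near_oom() plus the force_reclaim_mapped label let shrink_active_list() skip the distress/swappiness arithmetic entirely: once a zone has scanned more than three times its LRU size, mapped pages are reclaimed unconditionally instead of waiting for swap_tendency to reach 100 (the label deliberately sits inside the if body so the goto bypasses the computation above it). A standalone sketch of the same threshold on made-up zone sizes:

    #include <stdio.h>

    /* same heuristic as zone_is_near_oom(), applied to invented numbers */
    static int near_oom(unsigned long scanned, unsigned long active,
                        unsigned long inactive)
    {
        return scanned >= (active + inactive) * 3;
    }

    int main(void)
    {
        /* hypothetical zone: 20000 active + 10000 inactive LRU pages */
        unsigned long active = 20000, inactive = 10000;

        printf("%d\n", near_oom(50000, active, inactive)); /* 0: use swap_tendency */
        printf("%d\n", near_oom(90000, active, inactive)); /* 1: force reclaim_mapped */
        return 0;
    }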
@@ -797,9 +828,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	while (!list_empty(&l_inactive)) {
 		page = lru_to_page(&l_inactive);
 		prefetchw_prev_lru_page(page, &l_inactive, flags);
-		BUG_ON(PageLRU(page));
+		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
-		BUG_ON(!PageActive(page));
+		VM_BUG_ON(!PageActive(page));
 		ClearPageActive(page);
 
 		list_move(&page->lru, &zone->inactive_list);
@@ -827,9 +858,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	while (!list_empty(&l_active)) {
 		page = lru_to_page(&l_active);
 		prefetchw_prev_lru_page(page, &l_active, flags);
-		BUG_ON(PageLRU(page));
+		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
-		BUG_ON(!PageActive(page));
+		VM_BUG_ON(!PageActive(page));
 		list_move(&page->lru, &zone->active_list);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
@@ -925,6 +956,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
 	unsigned long nr_reclaimed = 0;
 	int i;
 
+	sc->all_unreclaimable = 1;
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
 
@@ -941,6 +973,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
 		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
 
+		sc->all_unreclaimable = 0;
+
 		nr_reclaimed += shrink_zone(priority, zone, sc);
 	}
 	return nr_reclaimed;
@@ -1021,6 +1055,9 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
 			blk_congestion_wait(WRITE, HZ/10);
 	}
+	/* top priority shrink_caches still had more to do? don't OOM, then */
+	if (!sc.all_unreclaimable)
+		ret = 1;
 out:
 	for (i = 0; zones[i] != 0; i++) {
 		struct zone *zone = zones[i];
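Together with the all_unreclaimable field added to scan_control above, this lets try_to_free_pages() tell "we scanned the zones and reclaimed nothing" apart from "every zone was already flagged all_unreclaimable and skipped", and only the former is allowed to feed the OOM killer. A minimal standalone model of that flag flow (zone setup and names invented for illustration; the real code only skips flagged zones below DEF_PRIORITY):

    #include <stdio.h>

    struct toy_zone { int all_unreclaimable; };
    struct toy_scan_control { int all_unreclaimable; };

    /* mirrors shrink_zones(): assume the worst, clear the flag for any zone scanned */
    static void toy_shrink_zones(struct toy_zone *zones, int n,
                                 struct toy_scan_control *sc)
    {
        int i;

        sc->all_unreclaimable = 1;
        for (i = 0; i < n; i++) {
            if (zones[i].all_unreclaimable)
                continue;           /* left for kswapd, as in the real code */
            sc->all_unreclaimable = 0;
            /* ... shrink_zone() would run here ... */
        }
    }

    int main(void)
    {
        struct toy_zone zones[2] = {
            { .all_unreclaimable = 1 },
            { .all_unreclaimable = 0 },
        };
        struct toy_scan_control sc;

        toy_shrink_zones(zones, 2, &sc);
        /* try_to_free_pages(): any zone still scannable -> report progress (ret = 1)
         * rather than letting the caller fall through to the OOM killer */
        printf("avoid OOM: %d\n", !sc.all_unreclaimable);
        return 0;
    }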
@@ -1153,7 +1190,7 @@ scan:
 			if (zone->all_unreclaimable)
 				continue;
 			if (nr_slab == 0 && zone->pages_scanned >=
-				    (zone->nr_active + zone->nr_inactive) * 4)
+				    (zone->nr_active + zone->nr_inactive) * 6)
 				zone->all_unreclaimable = 1;
 			/*
 			 * If we've done a decent amount of scanning and
@@ -1361,7 +1398,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 	for_each_zone(zone)
 		lru_pages += zone->nr_active + zone->nr_inactive;
 
-	nr_slab = global_page_state(NR_SLAB);
+	nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
 	/* If slab caches are huge, it's better to hit them first */
 	while (nr_slab >= lru_pages) {
 		reclaim_state.reclaimed_slab = 0;
@@ -1510,7 +1547,6 @@ int zone_reclaim_mode __read_mostly;
 #define RECLAIM_ZONE (1<<0)	/* Run shrink_cache on the zone */
 #define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
 #define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */
-#define RECLAIM_SLAB (1<<3)	/* Do a global slab shrink if the zone is out of memory */
 
 /*
  * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -1526,6 +1562,12 @@ int zone_reclaim_mode __read_mostly;
 int sysctl_min_unmapped_ratio = 1;
 
 /*
+ * If the number of slab pages in a zone grows beyond this percentage then
+ * slab reclaim needs to occur.
+ */
+int sysctl_min_slab_ratio = 5;
+
+/*
  * Try to free up some pages from this zone through reclaim.
  */
 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
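The RECLAIM_SLAB mode bit is gone; slab reclaim is now driven by a threshold instead. sysctl_min_slab_ratio is a percentage, and the per-zone min_slab_pages cutoff used later in this patch is presumably derived from it outside this file (the sysctl handler is not part of this diff). A hedged sketch of that assumed conversion:

    #include <stdio.h>

    /* Assumed conversion, done outside this diff: min_slab_pages is
     * sysctl_min_slab_ratio percent of the zone's pages. */
    static unsigned long min_slab_pages(unsigned long present_pages, int ratio)
    {
        return present_pages * ratio / 100UL;
    }

    int main(void)
    {
        /* default ratio of 5 on a hypothetical 1,000,000-page zone */
        printf("%lu\n", min_slab_pages(1000000UL, 5));  /* 50000 */
        return 0;
    }

With the default ratio of 5, such a zone would trip slab reclaim once its NR_SLAB_RECLAIMABLE count passed roughly 50,000 pages.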
@@ -1544,6 +1586,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.gfp_mask = gfp_mask,
 		.swappiness = vm_swappiness,
 	};
+	unsigned long slab_reclaimable;
 
 	disable_swap_token();
 	cond_resched();
@@ -1556,29 +1599,43 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	/*
-	 * Free memory by calling shrink zone with increasing priorities
-	 * until we have enough memory freed.
-	 */
-	priority = ZONE_RECLAIM_PRIORITY;
-	do {
-		nr_reclaimed += shrink_zone(priority, zone, &sc);
-		priority--;
-	} while (priority >= 0 && nr_reclaimed < nr_pages);
+	if (zone_page_state(zone, NR_FILE_PAGES) -
+		zone_page_state(zone, NR_FILE_MAPPED) >
+		zone->min_unmapped_pages) {
+		/*
+		 * Free memory by calling shrink zone with increasing
+		 * priorities until we have enough memory freed.
+		 */
+		priority = ZONE_RECLAIM_PRIORITY;
+		do {
+			nr_reclaimed += shrink_zone(priority, zone, &sc);
+			priority--;
+		} while (priority >= 0 && nr_reclaimed < nr_pages);
+	}
 
-	if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+	slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+	if (slab_reclaimable > zone->min_slab_pages) {
 		/*
 		 * shrink_slab() does not currently allow us to determine how
-		 * many pages were freed in this zone. So we just shake the slab
-		 * a bit and then go off node for this particular allocation
-		 * despite possibly having freed enough memory to allocate in
-		 * this zone. If we freed local memory then the next
-		 * allocations will be local again.
+		 * many pages were freed in this zone. So we take the current
+		 * number of slab pages and shake the slab until it is reduced
+		 * by the same nr_pages that we used for reclaiming unmapped
+		 * pages.
 		 *
-		 * shrink_slab will free memory on all zones and may take
-		 * a long time.
+		 * Note that shrink_slab will free memory on all zones and may
+		 * take a long time.
+		 */
+		while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+			zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
+				slab_reclaimable - nr_pages)
+			;
+
+		/*
+		 * Update nr_reclaimed by the number of slab pages we
+		 * reclaimed from this zone.
 		 */
-		shrink_slab(sc.nr_scanned, gfp_mask, order);
+		nr_reclaimed += slab_reclaimable -
+			zone_page_state(zone, NR_SLAB_RECLAIMABLE);
 	}
 
 	p->reclaim_state = NULL;
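The new loop keeps calling shrink_slab() until the zone's reclaimable slab count has fallen by nr_pages or the shrinker stops making progress, then credits the measured drop to nr_reclaimed. A standalone toy model of that termination and accounting (batch size and numbers invented):

    #include <stdio.h>

    static unsigned long slab_pages = 12000;   /* stands in for NR_SLAB_RECLAIMABLE */

    /* toy shrinker: frees up to 1000 pages per call, returns how many it freed */
    static unsigned long toy_shrink_slab(void)
    {
        unsigned long freed = slab_pages >= 1000 ? 1000 : slab_pages;

        slab_pages -= freed;
        return freed;
    }

    int main(void)
    {
        unsigned long nr_pages = 32;           /* reclaim target, as in __zone_reclaim() */
        unsigned long nr_reclaimed = 0;
        unsigned long slab_reclaimable = slab_pages;

        /* same shape as the kernel loop: stop once the count has dropped by
         * nr_pages, or when the shrinker makes no more progress */
        while (toy_shrink_slab() &&
               slab_pages > slab_reclaimable - nr_pages)
            ;

        nr_reclaimed += slab_reclaimable - slab_pages;
        printf("slab pages reclaimed: %lu\n", nr_reclaimed);
        return 0;
    }

Because shrink_slab() works in batches, the loop can overshoot the nr_pages target; accounting by the before/after difference handles that naturally.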
@@ -1592,7 +1649,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	int node_id;
 
 	/*
-	 * Zone reclaim reclaims unmapped file backed pages.
+	 * Zone reclaim reclaims unmapped file backed pages and
+	 * slab pages if we are over the defined limits.
 	 *
 	 * A small portion of unmapped file backed pages is needed for
 	 * file I/O otherwise pages read by file I/O will be immediately
@@ -1601,7 +1659,9 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * unmapped file backed pages.
 	 */
 	if (zone_page_state(zone, NR_FILE_PAGES) -
-	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio)
+	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
+	    && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
+			<= zone->min_slab_pages)
 		return 0;
 
 	/*
@@ -1621,7 +1681,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * over remote processors and spread off node memory allocations
 	 * as wide as possible.
 	 */
-	node_id = zone->zone_pgdat->node_id;
+	node_id = zone_to_nid(zone);
 	mask = node_to_cpumask(node_id);
 	if (!cpus_empty(mask) && node_id != numa_node_id())
 		return 0;