Diffstat (limited to 'mm/vmscan.c')
 -rw-r--r--   mm/vmscan.c   140
 1 file changed, 100 insertions(+), 40 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5d4c4d02254d..eca70310adb2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/highmem.h>
+#include <linux/vmstat.h>
 #include <linux/file.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
@@ -62,6 +63,8 @@ struct scan_control {
         int swap_cluster_max;
 
         int swappiness;
+
+        int all_unreclaimable;
 };
 
 /*
@@ -368,7 +371,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
                         /* synchronous write or broken a_ops? */
                         ClearPageReclaim(page);
                 }
-
+                inc_zone_page_state(page, NR_VMSCAN_WRITE);
                 return PAGE_SUCCESS;
         }
 
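Note: the inc_zone_page_state() call above adds a per-zone NR_VMSCAN_WRITE event, so writeback triggered from reclaim becomes visible to userspace. A minimal userspace sketch of reading it; the /proc/vmstat field name nr_vmscan_write is an assumption based on the usual vmstat naming and is not shown in this diff:

/* Sketch: print the nr_vmscan_write counter from /proc/vmstat, if present. */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[128];
        FILE *f = fopen("/proc/vmstat", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                if (!strncmp(line, "nr_vmscan_write", 15))
                        fputs(line, stdout);    /* e.g. "nr_vmscan_write 1234" */
        fclose(f);
        return 0;
}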
@@ -377,15 +380,34 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 
 int remove_mapping(struct address_space *mapping, struct page *page)
 {
-        if (!mapping)
-                return 0;               /* truncate got there first */
+        BUG_ON(!PageLocked(page));
+        BUG_ON(mapping != page_mapping(page));
 
         write_lock_irq(&mapping->tree_lock);
-
         /*
-         * The non-racy check for busy page. It is critical to check
-         * PageDirty _after_ making sure that the page is freeable and
-         * not in use by anybody. (pagecache + us == 2)
+         * The non racy check for a busy page.
+         *
+         * Must be careful with the order of the tests. When someone has
+         * a ref to the page, it may be possible that they dirty it then
+         * drop the reference. So if PageDirty is tested before page_count
+         * here, then the following race may occur:
+         *
+         * get_user_pages(&page);
+         * [user mapping goes away]
+         * write_to(page);
+         *                              !PageDirty(page)    [good]
+         * SetPageDirty(page);
+         * put_page(page);
+         *                              !page_count(page)   [good, discard it]
+         *
+         * [oops, our write_to data is lost]
+         *
+         * Reversing the order of the tests ensures such a situation cannot
+         * escape unnoticed. The smp_rmb is needed to ensure the page->flags
+         * load is not satisfied before that of page->_count.
+         *
+         * Note that if SetPageDirty is always performed via set_page_dirty,
+         * and thus under tree_lock, then this ordering is not required.
          */
         if (unlikely(page_count(page) != 2))
                 goto cannot_free;
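Note: given the ordering rules the comment above spells out, the checks that follow in remove_mapping() have to look roughly like the fragment below. This is a sketch of the pattern the comment describes, not a verbatim copy of the full function from this kernel version:

        /* Refuse to free while anyone besides the pagecache holds a reference. */
        if (unlikely(page_count(page) != 2))
                goto cannot_free;
        /* Don't let the page->flags load be satisfied before page->_count. */
        smp_rmb();
        /* Only now is a clean page known to be safely discardable. */
        if (unlikely(PageDirty(page)))
                goto cannot_free;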
@@ -440,7 +462,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 if (TestSetPageLocked(page))
                         goto keep;
 
-                BUG_ON(PageActive(page));
+                VM_BUG_ON(PageActive(page));
 
                 sc->nr_scanned++;
 
@@ -547,7 +569,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         goto free_it;
                 }
 
-                if (!remove_mapping(mapping, page))
+                if (!mapping || !remove_mapping(mapping, page))
                         goto keep_locked;
 
 free_it:
@@ -564,7 +586,7 @@ keep_locked:
                 unlock_page(page);
 keep:
                 list_add(&page->lru, &ret_pages);
-                BUG_ON(PageLRU(page));
+                VM_BUG_ON(PageLRU(page));
         }
         list_splice(&ret_pages, page_list);
         if (pagevec_count(&freed_pvec))
@@ -603,7 +625,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                 page = lru_to_page(src);
                 prefetchw_prev_lru_page(page, src, flags);
 
-                BUG_ON(!PageLRU(page));
+                VM_BUG_ON(!PageLRU(page));
 
                 list_del(&page->lru);
                 target = src;
@@ -674,7 +696,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                  */
                 while (!list_empty(&page_list)) {
                         page = lru_to_page(&page_list);
-                        BUG_ON(PageLRU(page));
+                        VM_BUG_ON(PageLRU(page));
                         SetPageLRU(page);
                         list_del(&page->lru);
                         if (PageActive(page))
@@ -695,6 +717,11 @@ done:
         return nr_reclaimed;
 }
 
+static inline int zone_is_near_oom(struct zone *zone)
+{
+        return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
+}
+
 /*
  * This moves pages from the active list to the inactive list.
  *
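Note: the new helper treats a zone as near OOM once its pages_scanned counter reaches three times the zone's total LRU size, i.e. reclaim has swept the zone roughly three times over without enough progress. A standalone illustration with made-up numbers, not kernel code:

/* Illustration only: mirrors the zone_is_near_oom() arithmetic above. */
static int near_oom_example(void)
{
        unsigned long nr_active = 60000, nr_inactive = 40000;   /* 100000 LRU pages */
        unsigned long pages_scanned = 310000;                   /* scanned >3x over */

        return pages_scanned >= (nr_active + nr_inactive) * 3;  /* -> 1 (near OOM) */
}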
@@ -730,6 +757,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                 long distress;
                 long swap_tendency;
 
+                if (zone_is_near_oom(zone))
+                        goto force_reclaim_mapped;
+
                 /*
                  * `distress' is a measure of how much trouble we're having
                  * reclaiming pages.  0 -> no problems.  100 -> great trouble.
@@ -765,6 +795,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                  * memory onto the inactive list.
                  */
                 if (swap_tendency >= 100)
+force_reclaim_mapped:
                         reclaim_mapped = 1;
         }
 
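Note: the label lets the near-OOM shortcut from the previous hunk skip the whole distress/swap_tendency calculation and go straight to reclaiming mapped pages. A simplified, self-contained model of the resulting decision; the function and parameter names here are illustrative, not from the kernel:

/* Sketch of the decision shrink_active_list() now makes. */
static int should_reclaim_mapped(int near_oom, long swap_tendency)
{
        if (near_oom)
                return 1;       /* force_reclaim_mapped: skip the heuristics */
        return swap_tendency >= 100;
}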
@@ -797,9 +828,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
         while (!list_empty(&l_inactive)) {
                 page = lru_to_page(&l_inactive);
                 prefetchw_prev_lru_page(page, &l_inactive, flags);
-                BUG_ON(PageLRU(page));
+                VM_BUG_ON(PageLRU(page));
                 SetPageLRU(page);
-                BUG_ON(!PageActive(page));
+                VM_BUG_ON(!PageActive(page));
                 ClearPageActive(page);
 
                 list_move(&page->lru, &zone->inactive_list);
@@ -827,9 +858,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
         while (!list_empty(&l_active)) {
                 page = lru_to_page(&l_active);
                 prefetchw_prev_lru_page(page, &l_active, flags);
-                BUG_ON(PageLRU(page));
+                VM_BUG_ON(PageLRU(page));
                 SetPageLRU(page);
-                BUG_ON(!PageActive(page));
+                VM_BUG_ON(!PageActive(page));
                 list_move(&page->lru, &zone->active_list);
                 pgmoved++;
                 if (!pagevec_add(&pvec, page)) {
@@ -925,6 +956,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
         unsigned long nr_reclaimed = 0;
         int i;
 
+        sc->all_unreclaimable = 1;
         for (i = 0; zones[i] != NULL; i++) {
                 struct zone *zone = zones[i];
 
@@ -941,6 +973,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
                 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                         continue;       /* Let kswapd poll it */
 
+                sc->all_unreclaimable = 0;
+
                 nr_reclaimed += shrink_zone(priority, zone, sc);
         }
         return nr_reclaimed;
@@ -1021,6 +1055,9 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
                 if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
                         blk_congestion_wait(WRITE, HZ/10);
         }
+        /* top priority shrink_caches still had more to do? don't OOM, then */
+        if (!sc.all_unreclaimable)
+                ret = 1;
 out:
         for (i = 0; zones[i] != 0; i++) {
                 struct zone *zone = zones[i];
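Note: together with the shrink_zones() hunks above, the flag answers "did we even get to scan anything?": it starts at 1, is cleared for every zone that is actually scanned, and only if it is still set after the priority loop may the caller proceed toward the OOM killer. A compact model of the return-value logic; simplified, not the kernel function:

/* Sketch: how sc.all_unreclaimable gates the success return. */
static int try_to_free_pages_result(int reclaimed_enough, int all_unreclaimable)
{
        if (reclaimed_enough)
                return 1;       /* normal success path */
        if (!all_unreclaimable)
                return 1;       /* zones were still scannable: retry, don't OOM */
        return 0;               /* every zone was written off: allow OOM path */
}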
@@ -1153,7 +1190,7 @@ scan:
                         if (zone->all_unreclaimable)
                                 continue;
                         if (nr_slab == 0 && zone->pages_scanned >=
-                                    (zone->nr_active + zone->nr_inactive) * 4)
+                                    (zone->nr_active + zone->nr_inactive) * 6)
                                 zone->all_unreclaimable = 1;
                         /*
                          * If we've done a decent amount of scanning and
@@ -1361,7 +1398,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
         for_each_zone(zone)
                 lru_pages += zone->nr_active + zone->nr_inactive;
 
-        nr_slab = global_page_state(NR_SLAB);
+        nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
         /* If slab caches are huge, it's better to hit them first */
         while (nr_slab >= lru_pages) {
                 reclaim_state.reclaimed_slab = 0;
@@ -1510,7 +1547,6 @@ int zone_reclaim_mode __read_mostly;
 #define RECLAIM_ZONE (1<<0)     /* Run shrink_cache on the zone */
 #define RECLAIM_WRITE (1<<1)    /* Writeout pages during reclaim */
 #define RECLAIM_SWAP (1<<2)     /* Swap pages out during reclaim */
-#define RECLAIM_SLAB (1<<3)     /* Do a global slab shrink if the zone is out of memory */
 
 /*
  * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -1526,6 +1562,12 @@ int zone_reclaim_mode __read_mostly;
 int sysctl_min_unmapped_ratio = 1;
 
 /*
+ * If the number of slab pages in a zone grows beyond this percentage then
+ * slab reclaim needs to occur.
+ */
+int sysctl_min_slab_ratio = 5;
+
+/*
  * Try to free up some pages from this zone through reclaim.
  */
 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
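Note: the new sysctl is a percentage of zone memory, so somewhere outside this diff it has to be turned into the zone->min_slab_pages threshold that __zone_reclaim() compares against. A hypothetical helper showing the presumed conversion, assumed to mirror how min_unmapped_ratio is handled and not part of this patch:

/* Hypothetical: convert vm.min_slab_ratio (percent) into a per-zone page count. */
static unsigned long min_slab_pages_from_ratio(unsigned long zone_present_pages,
                                               int min_slab_ratio)
{
        return zone_present_pages * min_slab_ratio / 100;
}

With the default of 5, a zone holding 1,000,000 pages would start slab reclaim once its reclaimable slab exceeds roughly 50,000 pages.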
@@ -1544,6 +1586,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                 .gfp_mask = gfp_mask,
                 .swappiness = vm_swappiness,
         };
+        unsigned long slab_reclaimable;
 
         disable_swap_token();
         cond_resched();
@@ -1556,29 +1599,43 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
         reclaim_state.reclaimed_slab = 0;
         p->reclaim_state = &reclaim_state;
 
-        /*
-         * Free memory by calling shrink zone with increasing priorities
-         * until we have enough memory freed.
-         */
-        priority = ZONE_RECLAIM_PRIORITY;
-        do {
-                nr_reclaimed += shrink_zone(priority, zone, &sc);
-                priority--;
-        } while (priority >= 0 && nr_reclaimed < nr_pages);
+        if (zone_page_state(zone, NR_FILE_PAGES) -
+                zone_page_state(zone, NR_FILE_MAPPED) >
+                zone->min_unmapped_pages) {
+                /*
+                 * Free memory by calling shrink zone with increasing
+                 * priorities until we have enough memory freed.
+                 */
+                priority = ZONE_RECLAIM_PRIORITY;
+                do {
+                        nr_reclaimed += shrink_zone(priority, zone, &sc);
+                        priority--;
+                } while (priority >= 0 && nr_reclaimed < nr_pages);
+        }
 
-        if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+        slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+        if (slab_reclaimable > zone->min_slab_pages) {
                 /*
                  * shrink_slab() does not currently allow us to determine how
-                 * many pages were freed in this zone. So we just shake the slab
-                 * a bit and then go off node for this particular allocation
-                 * despite possibly having freed enough memory to allocate in
-                 * this zone. If we freed local memory then the next
-                 * allocations will be local again.
+                 * many pages were freed in this zone. So we take the current
+                 * number of slab pages and shake the slab until it is reduced
+                 * by the same nr_pages that we used for reclaiming unmapped
+                 * pages.
                  *
-                 * shrink_slab will free memory on all zones and may take
-                 * a long time.
+                 * Note that shrink_slab will free memory on all zones and may
+                 * take a long time.
+                 */
+                while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+                        zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
+                                slab_reclaimable - nr_pages)
+                        ;
+
+                /*
+                 * Update nr_reclaimed by the number of slab pages we
+                 * reclaimed from this zone.
                  */
-                shrink_slab(sc.nr_scanned, gfp_mask, order);
+                nr_reclaimed += slab_reclaimable -
+                        zone_page_state(zone, NR_SLAB_RECLAIMABLE);
         }
 
         p->reclaim_state = NULL;
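Note: the loop in the hunk above keeps calling shrink_slab() until it either reports no further progress or this zone's reclaimable-slab count has dropped by nr_pages, and the difference is then credited to nr_reclaimed. A self-contained model of that termination logic, with illustrative names rather than kernel code:

/* Sketch: shake the slab until enough of it is gone or nothing more frees.
 * Assumes slab_before > nr_pages, as the threshold check above makes likely. */
static unsigned long shake_slab_model(unsigned long slab_before,
                                      unsigned long nr_pages,
                                      unsigned long (*shrink_step)(void),
                                      unsigned long (*slab_now)(void))
{
        while (shrink_step() && slab_now() > slab_before - nr_pages)
                ;
        return slab_before - slab_now();        /* pages credited to nr_reclaimed */
}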
@@ -1592,7 +1649,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
         int node_id;
 
         /*
-         * Zone reclaim reclaims unmapped file backed pages.
+         * Zone reclaim reclaims unmapped file backed pages and
+         * slab pages if we are over the defined limits.
          *
          * A small portion of unmapped file backed pages is needed for
          * file I/O otherwise pages read by file I/O will be immediately
@@ -1601,7 +1659,9 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
          * unmapped file backed pages.
          */
         if (zone_page_state(zone, NR_FILE_PAGES) -
-            zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio)
+            zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
+            && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
+                        <= zone->min_slab_pages)
                 return 0;
 
         /*
@@ -1621,7 +1681,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
          * over remote processors and spread off node memory allocations
          * as wide as possible.
          */
-        node_id = zone->zone_pgdat->node_id;
+        node_id = zone_to_nid(zone);
         mask = node_to_cpumask(node_id);
         if (!cpus_empty(mask) && node_id != numa_node_id())
                 return 0;
