Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c | 250
 1 file changed, 154 insertions(+), 96 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 72babac71dea..eca70310adb2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/highmem.h>
+#include <linux/vmstat.h>
 #include <linux/file.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
@@ -34,6 +35,7 @@
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
 #include <linux/delay.h>
+#include <linux/kthread.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -46,8 +48,6 @@ struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
 
-	unsigned long nr_mapped;	/* From page_state */
-
 	/* This context's GFP mask */
 	gfp_t gfp_mask;
 
@@ -63,6 +63,8 @@ struct scan_control {
 	int swap_cluster_max;
 
 	int swappiness;
+
+	int all_unreclaimable;
 };
 
 /*
@@ -216,7 +218,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 				break;
 			if (shrink_ret < nr_before)
 				ret += nr_before - shrink_ret;
-			mod_page_state(slabs_scanned, this_scan);
+			count_vm_events(SLABS_SCANNED, this_scan);
 			total_scan -= this_scan;
 
 			cond_resched();
@@ -369,7 +371,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 			/* synchronous write or broken a_ops? */
 			ClearPageReclaim(page);
 		}
-
+		inc_zone_page_state(page, NR_VMSCAN_WRITE);
 		return PAGE_SUCCESS;
 	}
 
@@ -378,15 +380,34 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 
 int remove_mapping(struct address_space *mapping, struct page *page)
 {
-	if (!mapping)
-		return 0;		/* truncate got there first */
+	BUG_ON(!PageLocked(page));
+	BUG_ON(mapping != page_mapping(page));
 
 	write_lock_irq(&mapping->tree_lock);
-
 	/*
-	 * The non-racy check for busy page. It is critical to check
-	 * PageDirty _after_ making sure that the page is freeable and
-	 * not in use by anybody. 	(pagecache + us == 2)
+	 * The non racy check for a busy page.
+	 *
+	 * Must be careful with the order of the tests. When someone has
+	 * a ref to the page, it may be possible that they dirty it then
+	 * drop the reference. So if PageDirty is tested before page_count
+	 * here, then the following race may occur:
+	 *
+	 * get_user_pages(&page);
+	 * [user mapping goes away]
+	 * write_to(page);
+	 *				!PageDirty(page)    [good]
+	 * SetPageDirty(page);
+	 * put_page(page);
+	 *				!page_count(page)   [good, discard it]
+	 *
+	 * [oops, our write_to data is lost]
+	 *
+	 * Reversing the order of the tests ensures such a situation cannot
+	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
+	 * load is not satisfied before that of page->_count.
+	 *
+	 * Note that if SetPageDirty is always performed via set_page_dirty,
+	 * and thus under tree_lock, then this ordering is not required.
 	 */
 	if (unlikely(page_count(page) != 2))
 		goto cannot_free;
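The ordering argument in the new comment is easiest to see with a toy model. The sketch below is illustrative userspace C, not kernel code: the value 2 stands for "page cache plus us", and the extra reference plays the role of a get_user_pages() user that dirties the page and drops its reference inside the race window.

/* Toy userspace model of the ordering argument above; names and numbers
 * are illustrative only.                                                 */
#include <stdio.h>
#include <stdbool.h>

struct toy_page { int count; bool dirty; };

static void racing_user(struct toy_page *p)
{
	p->dirty = true;	/* write_to(page); SetPageDirty(page); */
	p->count--;		/* put_page(page);                     */
}

/* Dirty tested before the refcount: the race window sits between the two
 * tests, and a clean-looking page is freed after it became dirty.        */
static bool try_free_dirty_first(struct toy_page *p)
{
	bool was_clean = !p->dirty;	/* !PageDirty(page)  [looks good]   */
	racing_user(p);			/* the window the comment describes */
	return was_clean && p->count == 2;	/* [oops, data is lost]     */
}

/* Refcount tested before the dirty bit (remove_mapping()'s order): the
 * extra reference is still visible, so the page is left alone.           */
static bool try_free_count_first(struct toy_page *p)
{
	if (p->count != 2)
		return false;		/* someone still holds a reference  */
	return !p->dirty;
}

int main(void)
{
	struct toy_page a = { .count = 3, .dirty = false };
	struct toy_page b = { .count = 3, .dirty = false };

	printf("dirty-first order frees the page: %d (data lost)\n",
	       try_free_dirty_first(&a));
	printf("count-first order frees the page: %d (correctly kept)\n",
	       try_free_count_first(&b));
	return 0;
}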
@@ -441,7 +462,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (TestSetPageLocked(page))
 			goto keep;
 
-		BUG_ON(PageActive(page));
+		VM_BUG_ON(PageActive(page));
 
 		sc->nr_scanned++;
 
@@ -548,7 +569,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				goto free_it;
 		}
 
-		if (!remove_mapping(mapping, page))
+		if (!mapping || !remove_mapping(mapping, page))
 			goto keep_locked;
 
 free_it:
@@ -565,12 +586,12 @@ keep_locked:
 		unlock_page(page);
 keep:
 		list_add(&page->lru, &ret_pages);
-		BUG_ON(PageLRU(page));
+		VM_BUG_ON(PageLRU(page));
 	}
 	list_splice(&ret_pages, page_list);
 	if (pagevec_count(&freed_pvec))
 		__pagevec_release_nonlru(&freed_pvec);
-	mod_page_state(pgactivate, pgactivate);
+	count_vm_events(PGACTIVATE, pgactivate);
 	return nr_reclaimed;
 }
 
@@ -604,7 +625,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
-		BUG_ON(!PageLRU(page));
+		VM_BUG_ON(!PageLRU(page));
 
 		list_del(&page->lru);
 		target = src;
@@ -660,11 +681,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		nr_reclaimed += nr_freed;
 		local_irq_disable();
 		if (current_is_kswapd()) {
-			__mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
-			__mod_page_state(kswapd_steal, nr_freed);
+			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
+			__count_vm_events(KSWAPD_STEAL, nr_freed);
 		} else
-			__mod_page_state_zone(zone, pgscan_direct, nr_scan);
-		__mod_page_state_zone(zone, pgsteal, nr_freed);
+			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
+		__count_vm_events(PGACTIVATE, nr_freed);
 
 		if (nr_taken == 0)
 			goto done;
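The converted statistics above use the lightweight VM event counters. As far as I recall the split, the __count_*() variants are bare per-CPU increments that rely on the caller having already disabled preemption or interrupts, which holds here because of the local_irq_disable() a few lines up, while the plain count_*() variants protect themselves. A minimal userspace sketch of that pattern (names are invented; the real definitions live in include/linux/vmstat.h):

/* Toy model of the per-CPU event counter pattern; illustrative only. */
#include <stdio.h>

enum toy_event { TOY_PGSCAN, TOY_PGSTEAL, NR_TOY_EVENTS };

static unsigned long toy_events[NR_TOY_EVENTS];	/* stands in for per-CPU state */

static void toy_disable_preempt(void) { /* get_cpu_var() side */ }
static void toy_enable_preempt(void)  { /* put_cpu_var() side */ }

/* "__" variant: bare increment, the caller provides the protection. */
static void __toy_count_events(enum toy_event item, long delta)
{
	toy_events[item] += delta;
}

/* Plain variant: protects itself, usable from any context. */
static void toy_count_events(enum toy_event item, long delta)
{
	toy_disable_preempt();
	__toy_count_events(item, delta);
	toy_enable_preempt();
}

int main(void)
{
	toy_disable_preempt();			/* like local_irq_disable() above */
	__toy_count_events(TOY_PGSCAN, 32);	/* cheap: already protected       */
	toy_enable_preempt();

	toy_count_events(TOY_PGSTEAL, 32);	/* self-protected                 */
	printf("scanned=%lu stolen=%lu\n",
	       toy_events[TOY_PGSCAN], toy_events[TOY_PGSTEAL]);
	return 0;
}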
@@ -675,7 +696,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		 */
 		while (!list_empty(&page_list)) {
 			page = lru_to_page(&page_list);
-			BUG_ON(PageLRU(page));
+			VM_BUG_ON(PageLRU(page));
 			SetPageLRU(page);
 			list_del(&page->lru);
 			if (PageActive(page))
@@ -696,6 +717,11 @@ done:
 	return nr_reclaimed;
 }
 
+static inline int zone_is_near_oom(struct zone *zone)
+{
+	return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
+}
+
 /*
  * This moves pages from the active list to the inactive list.
  *
@@ -731,6 +757,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		long distress;
 		long swap_tendency;
 
+		if (zone_is_near_oom(zone))
+			goto force_reclaim_mapped;
+
 		/*
 		 * `distress' is a measure of how much trouble we're having
 		 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
@@ -743,7 +772,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		 * how much memory
 		 * is mapped.
 		 */
-		mapped_ratio = (sc->nr_mapped * 100) / vm_total_pages;
+		mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
+				global_page_state(NR_ANON_PAGES)) * 100) /
+					vm_total_pages;
 
 		/*
 		 * Now decide how much we really want to unmap some pages. The
@@ -764,6 +795,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		 * memory onto the inactive list.
 		 */
 		if (swap_tendency >= 100)
+force_reclaim_mapped:
 			reclaim_mapped = 1;
 	}
 
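For a feel of when the new force_reclaim_mapped path matters: in this kernel the value computed a few lines below this hunk (not shown here) is swap_tendency = mapped_ratio / 2 + distress + sc->swappiness, and mapped pages are only reclaimed once it reaches 100, unless zone_is_near_oom() short-circuits the calculation via the new label. A small worked example with made-up numbers:

/* Worked example of the heuristic around force_reclaim_mapped; the
 * formula matches the one used below this hunk in this kernel, the
 * concrete numbers are illustrative only.                             */
#include <stdio.h>

int main(void)
{
	long mapped_ratio = 60;	/* 60% of pages mapped (file + anon)         */
	long distress     = 0;	/* prev_priority == DEF_PRIORITY: no trouble */
	long swappiness   = 60;	/* default vm_swappiness                     */

	long swap_tendency = mapped_ratio / 2 + distress + swappiness;

	/* 30 + 0 + 60 = 90 < 100: mapped pages are left alone. */
	printf("swap_tendency=%ld reclaim_mapped=%d\n",
	       swap_tendency, swap_tendency >= 100);

	/* Under pressure distress rises (100 at prev_priority 0), pushing
	 * the sum past 100; zone_is_near_oom() skips the sum entirely and
	 * jumps straight to force_reclaim_mapped.                         */
	distress = 100;
	swap_tendency = mapped_ratio / 2 + distress + swappiness;
	printf("swap_tendency=%ld reclaim_mapped=%d\n",
	       swap_tendency, swap_tendency >= 100);
	return 0;
}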
@@ -796,9 +828,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	while (!list_empty(&l_inactive)) {
 		page = lru_to_page(&l_inactive);
 		prefetchw_prev_lru_page(page, &l_inactive, flags);
-		BUG_ON(PageLRU(page));
+		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
-		BUG_ON(!PageActive(page));
+		VM_BUG_ON(!PageActive(page));
 		ClearPageActive(page);
 
 		list_move(&page->lru, &zone->inactive_list);
@@ -826,9 +858,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	while (!list_empty(&l_active)) {
 		page = lru_to_page(&l_active);
 		prefetchw_prev_lru_page(page, &l_active, flags);
-		BUG_ON(PageLRU(page));
+		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
-		BUG_ON(!PageActive(page));
+		VM_BUG_ON(!PageActive(page));
 		list_move(&page->lru, &zone->active_list);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
@@ -840,11 +872,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		}
 	}
 	zone->nr_active += pgmoved;
-	spin_unlock(&zone->lru_lock);
 
-	__mod_page_state_zone(zone, pgrefill, pgscanned);
-	__mod_page_state(pgdeactivate, pgdeactivate);
-	local_irq_enable();
+	__count_zone_vm_events(PGREFILL, zone, pgscanned);
+	__count_vm_events(PGDEACTIVATE, pgdeactivate);
+	spin_unlock_irq(&zone->lru_lock);
 
 	pagevec_release(&pvec);
 }
@@ -925,6 +956,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
 	unsigned long nr_reclaimed = 0;
 	int i;
 
+	sc->all_unreclaimable = 1;
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
 
@@ -941,6 +973,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
 		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
 
+		sc->all_unreclaimable = 0;
+
 		nr_reclaimed += shrink_zone(priority, zone, sc);
 	}
 	return nr_reclaimed;
@@ -976,7 +1010,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		.swappiness = vm_swappiness,
 	};
 
-	inc_page_state(allocstall);
+	count_vm_event(ALLOCSTALL);
 
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
@@ -989,7 +1023,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 	}
 
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
-		sc.nr_mapped = read_page_state(nr_mapped);
 		sc.nr_scanned = 0;
 		if (!priority)
 			disable_swap_token();
@@ -1022,6 +1055,9 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
 			blk_congestion_wait(WRITE, HZ/10);
 	}
+	/* top priority shrink_caches still had more to do? don't OOM, then */
+	if (!sc.all_unreclaimable)
+		ret = 1;
 out:
 	for (i = 0; zones[i] != 0; i++) {
 		struct zone *zone = zones[i];
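The point of the new ret = 1 is that the page allocator treats any nonzero return from try_to_free_pages() as progress and keeps retrying, while a zero return can let the OOM killer run; with the new flag that only happens when every zone really was unreclaimable. A toy restatement of that decision (simplified from memory, not quoted from mm/page_alloc.c):

/* Toy model of why sc.all_unreclaimable feeds the return value. */
#include <stdio.h>
#include <stdbool.h>

/* stand-in for the tail of try_to_free_pages() in this hunk */
static int toy_try_to_free_pages(bool all_zones_unreclaimable,
				 unsigned long nr_reclaimed)
{
	int ret = 0;

	if (nr_reclaimed >= 32)		/* sc.swap_cluster_max reached      */
		ret = 1;
	/* top priority pass still had scannable zones? don't OOM, then     */
	if (!all_zones_unreclaimable)
		ret = 1;
	return ret;
}

int main(void)
{
	/* nothing reclaimed, but a zone was still scannable: caller retries */
	printf("retry=%d\n", toy_try_to_free_pages(false, 0));
	/* nothing reclaimed and every zone unreclaimable: caller may OOM    */
	printf("retry=%d\n", toy_try_to_free_pages(true, 0));
	return 0;
}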
@@ -1074,9 +1110,7 @@ loop_again:
 	total_scanned = 0;
 	nr_reclaimed = 0;
 	sc.may_writepage = !laptop_mode;
-	sc.nr_mapped = read_page_state(nr_mapped);
-
-	inc_page_state(pageoutrun);
+	count_vm_event(PAGEOUTRUN);
 
 	for (i = 0; i < pgdat->nr_zones; i++) {
 		struct zone *zone = pgdat->node_zones + i;
@@ -1156,7 +1190,7 @@ scan:
 			if (zone->all_unreclaimable)
 				continue;
 			if (nr_slab == 0 && zone->pages_scanned >=
-				    (zone->nr_active + zone->nr_inactive) * 4)
+				    (zone->nr_active + zone->nr_inactive) * 6)
 				zone->all_unreclaimable = 1;
 			/*
 			 * If we've done a decent amount of scanning and
@@ -1223,7 +1257,6 @@ static int kswapd(void *p)
 	};
 	cpumask_t cpumask;
 
-	daemonize("kswapd%d", pgdat->node_id);
 	cpumask = node_to_cpumask(pgdat->node_id);
 	if (!cpus_empty(cpumask))
 		set_cpus_allowed(tsk, cpumask);
@@ -1365,7 +1398,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 	for_each_zone(zone)
 		lru_pages += zone->nr_active + zone->nr_inactive;
 
-	nr_slab = read_page_state(nr_slab);
+	nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
 	/* If slab caches are huge, it's better to hit them first */
 	while (nr_slab >= lru_pages) {
 		reclaim_state.reclaimed_slab = 0;
@@ -1407,9 +1440,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 		for (prio = DEF_PRIORITY; prio >= 0; prio--) {
 			unsigned long nr_to_scan = nr_pages - ret;
 
-			sc.nr_mapped = read_page_state(nr_mapped);
 			sc.nr_scanned = 0;
-
 			ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
 			if (ret >= nr_pages)
 				goto out;
@@ -1450,7 +1481,7 @@ out:
    not required for correctness. So if the last cpu in a node goes
    away, we get changed to run anywhere: as the first one comes back,
    restore their cpu bindings. */
-static int cpu_callback(struct notifier_block *nfb,
+static int __devinit cpu_callback(struct notifier_block *nfb,
 				  unsigned long action, void *hcpu)
 {
 	pg_data_t *pgdat;
@@ -1468,20 +1499,35 @@ static int cpu_callback(struct notifier_block *nfb,
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
+/*
+ * This kswapd start function will be called by init and node-hot-add.
+ * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
+ */
+int kswapd_run(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+	int ret = 0;
+
+	if (pgdat->kswapd)
+		return 0;
+
+	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+	if (IS_ERR(pgdat->kswapd)) {
+		/* failure at boot is fatal */
+		BUG_ON(system_state == SYSTEM_BOOTING);
+		printk("Failed to start kswapd on node %d\n",nid);
+		ret = -1;
+	}
+	return ret;
+}
+
 static int __init kswapd_init(void)
 {
-	pg_data_t *pgdat;
+	int nid;
 
 	swap_setup();
-	for_each_online_pgdat(pgdat) {
-		pid_t pid;
-
-		pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
-		BUG_ON(pid < 0);
-		read_lock(&tasklist_lock);
-		pgdat->kswapd = find_task_by_pid(pid);
-		read_unlock(&tasklist_lock);
-	}
+	for_each_online_node(nid)
+		kswapd_run(nid);
 	hotcpu_notifier(cpu_callback, 0);
 	return 0;
 }
@@ -1494,10 +1540,6 @@ module_init(kswapd_init)
  *
  * If non-zero call zone_reclaim when the number of free pages falls below
  * the watermarks.
- *
- * In the future we may add flags to the mode. However, the page allocator
- * should only have to check that zone_reclaim_mode != 0 before calling
- * zone_reclaim().
  */
 int zone_reclaim_mode __read_mostly;
 
@@ -1505,12 +1547,6 @@ int zone_reclaim_mode __read_mostly;
 #define RECLAIM_ZONE (1<<0)	/* Run shrink_cache on the zone */
 #define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
 #define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */
-#define RECLAIM_SLAB (1<<3)	/* Do a global slab shrink if the zone is out of memory */
-
-/*
- * Mininum time between zone reclaim scans
- */
-int zone_reclaim_interval __read_mostly = 30*HZ;
 
 /*
  * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -1520,6 +1556,18 @@ int zone_reclaim_interval __read_mostly = 30*HZ;
 #define ZONE_RECLAIM_PRIORITY 4
 
 /*
+ * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * occur.
+ */
+int sysctl_min_unmapped_ratio = 1;
+
+/*
+ * If the number of slab pages in a zone grows beyond this percentage then
+ * slab reclaim needs to occur.
+ */
+int sysctl_min_slab_ratio = 5;
+
+/*
 * Try to free up some pages from this zone through reclaim.
 */
 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
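The two new sysctls are percentages; the per-zone thresholds they feed (zone->min_unmapped_pages and zone->min_slab_pages, tested in __zone_reclaim() below) are presumably derived in mm/page_alloc.c as that percentage of the zone's present pages, which is not part of this diff. Assuming that convention, the defaults work out roughly as follows:

/* Illustrative arithmetic for the default thresholds; the conversion
 * from ratio to pages is an assumption, not quoted from this patch.   */
#include <stdio.h>

int main(void)
{
	unsigned long zone_present_pages = 262144;	/* a 1GB zone of 4KB pages  */
	int sysctl_min_unmapped_ratio = 1;		/* defaults from this patch */
	int sysctl_min_slab_ratio = 5;

	unsigned long min_unmapped_pages =
		zone_present_pages * sysctl_min_unmapped_ratio / 100;
	unsigned long min_slab_pages =
		zone_present_pages * sysctl_min_slab_ratio / 100;

	/* 1% of 262144 = 2621 pages (~10MB): more unmapped pagecache than
	 * that must sit in the zone before zone_reclaim touches pagecache;
	 * 5% = 13107 pages (~51MB) of reclaimable slab before it shakes
	 * the slab.                                                        */
	printf("min_unmapped_pages=%lu min_slab_pages=%lu\n",
	       min_unmapped_pages, min_slab_pages);
	return 0;
}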
@@ -1533,12 +1581,12 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
-		.nr_mapped = read_page_state(nr_mapped),
 		.swap_cluster_max = max_t(unsigned long, nr_pages,
 					SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
 		.swappiness = vm_swappiness,
 	};
+	unsigned long slab_reclaimable;
 
 	disable_swap_token();
 	cond_resched();
@@ -1551,43 +1599,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	/*
-	 * Free memory by calling shrink zone with increasing priorities
-	 * until we have enough memory freed.
-	 */
-	priority = ZONE_RECLAIM_PRIORITY;
-	do {
-		nr_reclaimed += shrink_zone(priority, zone, &sc);
-		priority--;
-	} while (priority >= 0 && nr_reclaimed < nr_pages);
+	if (zone_page_state(zone, NR_FILE_PAGES) -
+	    zone_page_state(zone, NR_FILE_MAPPED) >
+	    zone->min_unmapped_pages) {
+		/*
+		 * Free memory by calling shrink zone with increasing
+		 * priorities until we have enough memory freed.
+		 */
+		priority = ZONE_RECLAIM_PRIORITY;
+		do {
+			nr_reclaimed += shrink_zone(priority, zone, &sc);
+			priority--;
+		} while (priority >= 0 && nr_reclaimed < nr_pages);
+	}
 
-	if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+	slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+	if (slab_reclaimable > zone->min_slab_pages) {
 		/*
 		 * shrink_slab() does not currently allow us to determine how
-		 * many pages were freed in this zone. So we just shake the slab
-		 * a bit and then go off node for this particular allocation
-		 * despite possibly having freed enough memory to allocate in
-		 * this zone. If we freed local memory then the next
-		 * allocations will be local again.
+		 * many pages were freed in this zone. So we take the current
+		 * number of slab pages and shake the slab until it is reduced
+		 * by the same nr_pages that we used for reclaiming unmapped
+		 * pages.
 		 *
-		 * shrink_slab will free memory on all zones and may take
-		 * a long time.
+		 * Note that shrink_slab will free memory on all zones and may
+		 * take a long time.
 		 */
-		shrink_slab(sc.nr_scanned, gfp_mask, order);
-	}
+		while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+			zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
+				slab_reclaimable - nr_pages)
+			;
 
-	p->reclaim_state = NULL;
-	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
-
-	if (nr_reclaimed == 0) {
 		/*
-		 * We were unable to reclaim enough pages to stay on node. We
-		 * now allow off node accesses for a certain time period before
-		 * trying again to reclaim pages from the local zone.
+		 * Update nr_reclaimed by the number of slab pages we
+		 * reclaimed from this zone.
 		 */
-		zone->last_unsuccessful_zone_reclaim = jiffies;
+		nr_reclaimed += slab_reclaimable -
+			zone_page_state(zone, NR_SLAB_RECLAIMABLE);
 	}
 
+	p->reclaim_state = NULL;
+	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
 	return nr_reclaimed >= nr_pages;
 }
 
@@ -1597,14 +1649,20 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	int node_id;
 
 	/*
-	 * Do not reclaim if there was a recent unsuccessful attempt at zone
-	 * reclaim. In that case we let allocations go off node for the
-	 * zone_reclaim_interval. Otherwise we would scan for each off-node
-	 * page allocation.
+	 * Zone reclaim reclaims unmapped file backed pages and
+	 * slab pages if we are over the defined limits.
+	 *
+	 * A small portion of unmapped file backed pages is needed for
+	 * file I/O otherwise pages read by file I/O will be immediately
+	 * thrown out if the zone is overallocated. So we do not reclaim
+	 * if less than a specified percentage of the zone is used by
+	 * unmapped file backed pages.
 	 */
-	if (time_before(jiffies,
-		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
-			return 0;
+	if (zone_page_state(zone, NR_FILE_PAGES) -
+	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
+	    && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
+			<= zone->min_slab_pages)
+		return 0;
 
 	/*
 	 * Avoid concurrent zone reclaims, do not reclaim in a zone that does
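A worked example of the new gate, using the same comparisons as the hunk above with made-up counter values:

/* Illustrative numbers only; the comparisons mirror the hunk above. */
#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	unsigned long nr_file_pages  = 5000;	/* NR_FILE_PAGES            */
	unsigned long nr_file_mapped = 4500;	/* NR_FILE_MAPPED           */
	unsigned long nr_slab        = 2000;	/* NR_SLAB_RECLAIMABLE      */
	unsigned long min_unmapped   = 2621;	/* zone->min_unmapped_pages */
	unsigned long min_slab       = 13107;	/* zone->min_slab_pages     */

	bool skip = (nr_file_pages - nr_file_mapped <= min_unmapped) &&
		    (nr_slab <= min_slab);

	/* Only 500 unmapped file pages and little reclaimable slab:
	 * reclaiming them would hurt file I/O more than it helps, so the
	 * allocation goes off-node instead (zone_reclaim() returns 0).   */
	printf("zone_reclaim skipped: %d\n", skip);
	return 0;
}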
@@ -1623,7 +1681,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * over remote processors and spread off node memory allocations
 	 * as wide as possible.
 	 */
-	node_id = zone->zone_pgdat->node_id;
+	node_id = zone_to_nid(zone);
 	mask = node_to_cpumask(node_id);
 	if (!cpus_empty(mask) && node_id != numa_node_id())
 		return 0;