Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--   mm/vmscan.c   882
1 file changed, 216 insertions, 666 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4fe7e3aa02e2..fd572bbdc9f5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -33,39 +33,21 @@
33#include <linux/cpuset.h> 33#include <linux/cpuset.h>
34#include <linux/notifier.h> 34#include <linux/notifier.h>
35#include <linux/rwsem.h> 35#include <linux/rwsem.h>
36#include <linux/delay.h>
36 37
37#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
38#include <asm/div64.h> 39#include <asm/div64.h>
39 40
40#include <linux/swapops.h> 41#include <linux/swapops.h>
41 42
42/* possible outcome of pageout() */ 43#include "internal.h"
43typedef enum {
44 /* failed to write page out, page is locked */
45 PAGE_KEEP,
46 /* move page to the active list, page is locked */
47 PAGE_ACTIVATE,
48 /* page has been sent to the disk successfully, page is unlocked */
49 PAGE_SUCCESS,
50 /* page is clean and locked */
51 PAGE_CLEAN,
52} pageout_t;
53 44
54struct scan_control { 45struct scan_control {
55 /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
56 unsigned long nr_to_scan;
57
58 /* Incremented by the number of inactive pages that were scanned */ 46 /* Incremented by the number of inactive pages that were scanned */
59 unsigned long nr_scanned; 47 unsigned long nr_scanned;
60 48
61 /* Incremented by the number of pages reclaimed */
62 unsigned long nr_reclaimed;
63
64 unsigned long nr_mapped; /* From page_state */ 49 unsigned long nr_mapped; /* From page_state */
65 50
66 /* Ask shrink_caches, or shrink_zone to scan at this priority */
67 unsigned int priority;
68
69 /* This context's GFP mask */ 51 /* This context's GFP mask */
70 gfp_t gfp_mask; 52 gfp_t gfp_mask;
71 53
@@ -183,10 +165,11 @@ EXPORT_SYMBOL(remove_shrinker);
183 * 165 *
184 * Returns the number of slab objects which we shrunk. 166 * Returns the number of slab objects which we shrunk.
185 */ 167 */
186int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) 168unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
169 unsigned long lru_pages)
187{ 170{
188 struct shrinker *shrinker; 171 struct shrinker *shrinker;
189 int ret = 0; 172 unsigned long ret = 0;
190 173
191 if (scanned == 0) 174 if (scanned == 0)
192 scanned = SWAP_CLUSTER_MAX; 175 scanned = SWAP_CLUSTER_MAX;
@@ -306,9 +289,10 @@ static void handle_write_error(struct address_space *mapping,
306} 289}
307 290
308/* 291/*
309 * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). 292 * pageout is called by shrink_page_list() for each dirty page.
293 * Calls ->writepage().
310 */ 294 */
311static pageout_t pageout(struct page *page, struct address_space *mapping) 295pageout_t pageout(struct page *page, struct address_space *mapping)
312{ 296{
313 /* 297 /*
314 * If the page is dirty, only perform writeback if that write 298 * If the page is dirty, only perform writeback if that write
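
Annotation: the pageout_t outcome values deleted near the top of this patch (PAGE_KEEP, PAGE_ACTIVATE, PAGE_SUCCESS, PAGE_CLEAN) are what the now-global pageout() reports back to its callers; the new #include "internal.h" suggests the enum simply moved into a shared header, though that header is not part of this diff. Below is a small stand-alone C sketch of how a caller might react to each outcome; the handler is illustrative, not the actual shrink_page_list() logic.

        #include <stdio.h>

        /* Mirrors the enum removed from vmscan.c above. */
        typedef enum {
                PAGE_KEEP,      /* failed to write the page out, page stays locked */
                PAGE_ACTIVATE,  /* move the page to the active list, page stays locked */
                PAGE_SUCCESS,   /* page has been sent to disk, page is unlocked */
                PAGE_CLEAN,     /* page is clean and locked */
        } pageout_t;

        /* Hypothetical caller-side policy, for illustration only. */
        static const char *react(pageout_t outcome)
        {
                switch (outcome) {
                case PAGE_KEEP:         return "keep on the inactive list, retry later";
                case PAGE_ACTIVATE:     return "promote to the active list";
                case PAGE_SUCCESS:      return "writeback in flight, revisit when it completes";
                case PAGE_CLEAN:        return "clean and locked: try to free it right away";
                }
                return "?";
        }

        int main(void)
        {
                for (int o = PAGE_KEEP; o <= PAGE_CLEAN; o++)
                        printf("%d: %s\n", o, react(o));
                return 0;
        }
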
@@ -376,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
376 return PAGE_CLEAN; 360 return PAGE_CLEAN;
377} 361}
378 362
379static int remove_mapping(struct address_space *mapping, struct page *page) 363int remove_mapping(struct address_space *mapping, struct page *page)
380{ 364{
381 if (!mapping) 365 if (!mapping)
382 return 0; /* truncate got there first */ 366 return 0; /* truncate got there first */
@@ -414,14 +398,15 @@ cannot_free:
414} 398}
415 399
416/* 400/*
417 * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed 401 * shrink_page_list() returns the number of reclaimed pages
418 */ 402 */
419static int shrink_list(struct list_head *page_list, struct scan_control *sc) 403static unsigned long shrink_page_list(struct list_head *page_list,
404 struct scan_control *sc)
420{ 405{
421 LIST_HEAD(ret_pages); 406 LIST_HEAD(ret_pages);
422 struct pagevec freed_pvec; 407 struct pagevec freed_pvec;
423 int pgactivate = 0; 408 int pgactivate = 0;
424 int reclaimed = 0; 409 unsigned long nr_reclaimed = 0;
425 410
426 cond_resched(); 411 cond_resched();
427 412
@@ -464,12 +449,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
464 * Anonymous process memory has backing store? 449 * Anonymous process memory has backing store?
465 * Try to allocate it some swap space here. 450 * Try to allocate it some swap space here.
466 */ 451 */
467 if (PageAnon(page) && !PageSwapCache(page)) { 452 if (PageAnon(page) && !PageSwapCache(page))
468 if (!sc->may_swap)
469 goto keep_locked;
470 if (!add_to_swap(page, GFP_ATOMIC)) 453 if (!add_to_swap(page, GFP_ATOMIC))
471 goto activate_locked; 454 goto activate_locked;
472 }
473#endif /* CONFIG_SWAP */ 455#endif /* CONFIG_SWAP */
474 456
475 mapping = page_mapping(page); 457 mapping = page_mapping(page);
@@ -481,12 +463,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
481 * processes. Try to unmap it here. 463 * processes. Try to unmap it here.
482 */ 464 */
483 if (page_mapped(page) && mapping) { 465 if (page_mapped(page) && mapping) {
484 /*
485 * No unmapping if we do not swap
486 */
487 if (!sc->may_swap)
488 goto keep_locked;
489
490 switch (try_to_unmap(page, 0)) { 466 switch (try_to_unmap(page, 0)) {
491 case SWAP_FAIL: 467 case SWAP_FAIL:
492 goto activate_locked; 468 goto activate_locked;
@@ -561,7 +537,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
561 537
562free_it: 538free_it:
563 unlock_page(page); 539 unlock_page(page);
564 reclaimed++; 540 nr_reclaimed++;
565 if (!pagevec_add(&freed_pvec, page)) 541 if (!pagevec_add(&freed_pvec, page))
566 __pagevec_release_nonlru(&freed_pvec); 542 __pagevec_release_nonlru(&freed_pvec);
567 continue; 543 continue;
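
Annotation: the free_it path keeps the common case cheap by collecting freed pages in a pagevec and releasing a whole batch at once when pagevec_add() reports it is full, with a final flush via the pagevec_count() check at the end of the function. A stand-alone model of that pattern follows; the batch size and all names are ours, chosen only for illustration.

        #include <stdio.h>

        #define BATCH 14        /* small fixed batch, in the spirit of the kernel's pagevec */

        struct batch {
                int nr;
                int page[BATCH];
        };

        /* Like pagevec_add(): add one entry, return how much space is left (0 = full). */
        static int batch_add(struct batch *b, int page)
        {
                b->page[b->nr++] = page;
                return BATCH - b->nr;
        }

        static void batch_release(struct batch *b)
        {
                if (b->nr)
                        printf("releasing %d pages in one operation\n", b->nr);
                b->nr = 0;
        }

        int main(void)
        {
                struct batch b = { 0 };

                for (int page = 0; page < 40; page++)
                        if (!batch_add(&b, page))
                                batch_release(&b);
                batch_release(&b);      /* flush the remainder, like the final pagevec_count() check */
                return 0;
        }
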
@@ -579,483 +555,8 @@ keep:
579 if (pagevec_count(&freed_pvec)) 555 if (pagevec_count(&freed_pvec))
580 __pagevec_release_nonlru(&freed_pvec); 556 __pagevec_release_nonlru(&freed_pvec);
581 mod_page_state(pgactivate, pgactivate); 557 mod_page_state(pgactivate, pgactivate);
582 sc->nr_reclaimed += reclaimed; 558 return nr_reclaimed;
583 return reclaimed;
584}
585
586#ifdef CONFIG_MIGRATION
587static inline void move_to_lru(struct page *page)
588{
589 list_del(&page->lru);
590 if (PageActive(page)) {
591 /*
592 * lru_cache_add_active checks that
593 * the PG_active bit is off.
594 */
595 ClearPageActive(page);
596 lru_cache_add_active(page);
597 } else {
598 lru_cache_add(page);
599 }
600 put_page(page);
601}
602
603/*
604 * Add isolated pages on the list back to the LRU.
605 *
606 * returns the number of pages put back.
607 */
608int putback_lru_pages(struct list_head *l)
609{
610 struct page *page;
611 struct page *page2;
612 int count = 0;
613
614 list_for_each_entry_safe(page, page2, l, lru) {
615 move_to_lru(page);
616 count++;
617 }
618 return count;
619}
620
621/*
622 * Non migratable page
623 */
624int fail_migrate_page(struct page *newpage, struct page *page)
625{
626 return -EIO;
627}
628EXPORT_SYMBOL(fail_migrate_page);
629
630/*
631 * swapout a single page
632 * page is locked upon entry, unlocked on exit
633 */
634static int swap_page(struct page *page)
635{
636 struct address_space *mapping = page_mapping(page);
637
638 if (page_mapped(page) && mapping)
639 if (try_to_unmap(page, 1) != SWAP_SUCCESS)
640 goto unlock_retry;
641
642 if (PageDirty(page)) {
643 /* Page is dirty, try to write it out here */
644 switch(pageout(page, mapping)) {
645 case PAGE_KEEP:
646 case PAGE_ACTIVATE:
647 goto unlock_retry;
648
649 case PAGE_SUCCESS:
650 goto retry;
651
652 case PAGE_CLEAN:
653 ; /* try to free the page below */
654 }
655 }
656
657 if (PagePrivate(page)) {
658 if (!try_to_release_page(page, GFP_KERNEL) ||
659 (!mapping && page_count(page) == 1))
660 goto unlock_retry;
661 }
662
663 if (remove_mapping(mapping, page)) {
664 /* Success */
665 unlock_page(page);
666 return 0;
667 }
668
669unlock_retry:
670 unlock_page(page);
671
672retry:
673 return -EAGAIN;
674}
675EXPORT_SYMBOL(swap_page);
676
677/*
678 * Page migration was first developed in the context of the memory hotplug
679 * project. The main authors of the migration code are:
680 *
681 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
682 * Hirokazu Takahashi <taka@valinux.co.jp>
683 * Dave Hansen <haveblue@us.ibm.com>
684 * Christoph Lameter <clameter@sgi.com>
685 */
686
687/*
688 * Remove references for a page and establish the new page with the correct
689 * basic settings to be able to stop accesses to the page.
690 */
691int migrate_page_remove_references(struct page *newpage,
692 struct page *page, int nr_refs)
693{
694 struct address_space *mapping = page_mapping(page);
695 struct page **radix_pointer;
696
697 /*
698 * Avoid doing any of the following work if the page count
699 * indicates that the page is in use or truncate has removed
700 * the page.
701 */
702 if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
703 return -EAGAIN;
704
705 /*
706 * Establish swap ptes for anonymous pages or destroy pte
707 * maps for files.
708 *
709 * In order to reestablish file backed mappings the fault handlers
710 * will take the radix tree_lock which may then be used to stop
711 * processes from accessing this page until the new page is ready.
712 *
713 * A process accessing via a swap pte (an anonymous page) will take a
714 * page_lock on the old page which will block the process until the
715 * migration attempt is complete. At that time the PageSwapCache bit
716 * will be examined. If the page was migrated then the PageSwapCache
717 * bit will be clear and the operation to retrieve the page will be
718 * retried which will find the new page in the radix tree. Then a new
719 * direct mapping may be generated based on the radix tree contents.
720 *
721 * If the page was not migrated then the PageSwapCache bit
722 * is still set and the operation may continue.
723 */
724 if (try_to_unmap(page, 1) == SWAP_FAIL)
725 /* A vma has VM_LOCKED set -> Permanent failure */
726 return -EPERM;
727
728 /*
729 * Give up if we were unable to remove all mappings.
730 */
731 if (page_mapcount(page))
732 return -EAGAIN;
733
734 write_lock_irq(&mapping->tree_lock);
735
736 radix_pointer = (struct page **)radix_tree_lookup_slot(
737 &mapping->page_tree,
738 page_index(page));
739
740 if (!page_mapping(page) || page_count(page) != nr_refs ||
741 *radix_pointer != page) {
742 write_unlock_irq(&mapping->tree_lock);
743 return -EAGAIN;
744 }
745
746 /*
747 * Now we know that no one else is looking at the page.
748 *
749 * Certain minimal information about a page must be available
750 * in order for other subsystems to properly handle the page if they
751 * find it through the radix tree update before we are finished
752 * copying the page.
753 */
754 get_page(newpage);
755 newpage->index = page->index;
756 newpage->mapping = page->mapping;
757 if (PageSwapCache(page)) {
758 SetPageSwapCache(newpage);
759 set_page_private(newpage, page_private(page));
760 }
761
762 *radix_pointer = newpage;
763 __put_page(page);
764 write_unlock_irq(&mapping->tree_lock);
765
766 return 0;
767}
768EXPORT_SYMBOL(migrate_page_remove_references);
769
770/*
771 * Copy the page to its new location
772 */
773void migrate_page_copy(struct page *newpage, struct page *page)
774{
775 copy_highpage(newpage, page);
776
777 if (PageError(page))
778 SetPageError(newpage);
779 if (PageReferenced(page))
780 SetPageReferenced(newpage);
781 if (PageUptodate(page))
782 SetPageUptodate(newpage);
783 if (PageActive(page))
784 SetPageActive(newpage);
785 if (PageChecked(page))
786 SetPageChecked(newpage);
787 if (PageMappedToDisk(page))
788 SetPageMappedToDisk(newpage);
789
790 if (PageDirty(page)) {
791 clear_page_dirty_for_io(page);
792 set_page_dirty(newpage);
793 }
794
795 ClearPageSwapCache(page);
796 ClearPageActive(page);
797 ClearPagePrivate(page);
798 set_page_private(page, 0);
799 page->mapping = NULL;
800
801 /*
802 * If any waiters have accumulated on the new page then
803 * wake them up.
804 */
805 if (PageWriteback(newpage))
806 end_page_writeback(newpage);
807}
808EXPORT_SYMBOL(migrate_page_copy);
809
810/*
811 * Common logic to directly migrate a single page suitable for
812 * pages that do not use PagePrivate.
813 *
814 * Pages are locked upon entry and exit.
815 */
816int migrate_page(struct page *newpage, struct page *page)
817{
818 int rc;
819
820 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
821
822 rc = migrate_page_remove_references(newpage, page, 2);
823
824 if (rc)
825 return rc;
826
827 migrate_page_copy(newpage, page);
828
829 /*
830 * Remove auxiliary swap entries and replace
831 * them with real ptes.
832 *
833 * Note that a real pte entry will allow processes that are not
834 * waiting on the page lock to use the new page via the page tables
835 * before the new page is unlocked.
836 */
837 remove_from_swap(newpage);
838 return 0;
839} 559}
840EXPORT_SYMBOL(migrate_page);
841
842/*
843 * migrate_pages
844 *
845 * Two lists are passed to this function. The first list
846 * contains the pages isolated from the LRU to be migrated.
847 * The second list contains new pages that the pages isolated
848 * can be moved to. If the second list is NULL then all
849 * pages are swapped out.
850 *
851 * The function returns after 10 attempts or if no pages
852 * are movable anymore because the "to" list has become empty
853 * or no retryable pages exist anymore.
854 *
855 * Return: Number of pages not migrated when "to" ran empty.
856 */
857int migrate_pages(struct list_head *from, struct list_head *to,
858 struct list_head *moved, struct list_head *failed)
859{
860 int retry;
861 int nr_failed = 0;
862 int pass = 0;
863 struct page *page;
864 struct page *page2;
865 int swapwrite = current->flags & PF_SWAPWRITE;
866 int rc;
867
868 if (!swapwrite)
869 current->flags |= PF_SWAPWRITE;
870
871redo:
872 retry = 0;
873
874 list_for_each_entry_safe(page, page2, from, lru) {
875 struct page *newpage = NULL;
876 struct address_space *mapping;
877
878 cond_resched();
879
880 rc = 0;
881 if (page_count(page) == 1)
882 /* page was freed from under us. So we are done. */
883 goto next;
884
885 if (to && list_empty(to))
886 break;
887
888 /*
889 * Skip locked pages during the first two passes to give the
890 * functions holding the lock time to release the page. Later we
891 * use lock_page() to have a higher chance of acquiring the
892 * lock.
893 */
894 rc = -EAGAIN;
895 if (pass > 2)
896 lock_page(page);
897 else
898 if (TestSetPageLocked(page))
899 goto next;
900
901 /*
902 * Only wait on writeback if we have already done a pass where
903 * we may have triggered writeouts for lots of pages.
904 */
905 if (pass > 0) {
906 wait_on_page_writeback(page);
907 } else {
908 if (PageWriteback(page))
909 goto unlock_page;
910 }
911
912 /*
913 * Anonymous pages must have swap cache references otherwise
914 * the information contained in the page maps cannot be
915 * preserved.
916 */
917 if (PageAnon(page) && !PageSwapCache(page)) {
918 if (!add_to_swap(page, GFP_KERNEL)) {
919 rc = -ENOMEM;
920 goto unlock_page;
921 }
922 }
923
924 if (!to) {
925 rc = swap_page(page);
926 goto next;
927 }
928
929 newpage = lru_to_page(to);
930 lock_page(newpage);
931
932 /*
933 * Pages are properly locked and writeback is complete.
934 * Try to migrate the page.
935 */
936 mapping = page_mapping(page);
937 if (!mapping)
938 goto unlock_both;
939
940 if (mapping->a_ops->migratepage) {
941 /*
942 * Most pages have a mapping and most filesystems
943 * should provide a migration function. Anonymous
944 * pages are part of swap space which also has its
945 * own migration function. This is the most common
946 * path for page migration.
947 */
948 rc = mapping->a_ops->migratepage(newpage, page);
949 goto unlock_both;
950 }
951
952 /*
953 * Default handling if a filesystem does not provide
954 * a migration function. We can only migrate clean
955 * pages so try to write out any dirty pages first.
956 */
957 if (PageDirty(page)) {
958 switch (pageout(page, mapping)) {
959 case PAGE_KEEP:
960 case PAGE_ACTIVATE:
961 goto unlock_both;
962
963 case PAGE_SUCCESS:
964 unlock_page(newpage);
965 goto next;
966
967 case PAGE_CLEAN:
968 ; /* try to migrate the page below */
969 }
970 }
971
972 /*
973 * Buffers are managed in a filesystem specific way.
974 * We must have no buffers or drop them.
975 */
976 if (!page_has_buffers(page) ||
977 try_to_release_page(page, GFP_KERNEL)) {
978 rc = migrate_page(newpage, page);
979 goto unlock_both;
980 }
981
982 /*
983 * On early passes with mapped pages simply
984 * retry. There may be a lock held for some
985 * buffers that may go away. Later
986 * swap them out.
987 */
988 if (pass > 4) {
989 /*
990 * Persistently unable to drop buffers..... As a
991 * measure of last resort we fall back to
992 * swap_page().
993 */
994 unlock_page(newpage);
995 newpage = NULL;
996 rc = swap_page(page);
997 goto next;
998 }
999
1000unlock_both:
1001 unlock_page(newpage);
1002
1003unlock_page:
1004 unlock_page(page);
1005
1006next:
1007 if (rc == -EAGAIN) {
1008 retry++;
1009 } else if (rc) {
1010 /* Permanent failure */
1011 list_move(&page->lru, failed);
1012 nr_failed++;
1013 } else {
1014 if (newpage) {
1015 /* Successful migration. Return page to LRU */
1016 move_to_lru(newpage);
1017 }
1018 list_move(&page->lru, moved);
1019 }
1020 }
1021 if (retry && pass++ < 10)
1022 goto redo;
1023
1024 if (!swapwrite)
1025 current->flags &= ~PF_SWAPWRITE;
1026
1027 return nr_failed + retry;
1028}
1029
1030/*
1031 * Isolate one page from the LRU lists and put it on the
1032 * indicated list with elevated refcount.
1033 *
1034 * Result:
1035 * 0 = page not on LRU list
1036 * 1 = page removed from LRU list and added to the specified list.
1037 */
1038int isolate_lru_page(struct page *page)
1039{
1040 int ret = 0;
1041
1042 if (PageLRU(page)) {
1043 struct zone *zone = page_zone(page);
1044 spin_lock_irq(&zone->lru_lock);
1045 if (TestClearPageLRU(page)) {
1046 ret = 1;
1047 get_page(page);
1048 if (PageActive(page))
1049 del_page_from_active_list(zone, page);
1050 else
1051 del_page_from_inactive_list(zone, page);
1052 }
1053 spin_unlock_irq(&zone->lru_lock);
1054 }
1055
1056 return ret;
1057}
1058#endif
1059 560
1060/* 561/*
1061 * zone->lru_lock is heavily contended. Some of the functions that 562 * zone->lru_lock is heavily contended. Some of the functions that
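
Annotation: everything inside the old #ifdef CONFIG_MIGRATION block above (move_to_lru, putback_lru_pages, swap_page, the migrate_page_* helpers, migrate_pages and isolate_lru_page) disappears from vmscan.c in this hunk; it is apparently relocated into its own file by the same patch series rather than removed as a feature. The key gate in the removed migrate_page_remove_references() is pure reference accounting: migration may proceed only when the page's total reference count equals its mapped references plus the references the migration path itself expects to hold (nr_refs, passed as 2 from migrate_page()). A toy stand-alone model of that check, with made-up numbers:

        #include <stdio.h>

        /* Toy model of the refcount gate in the removed migrate_page_remove_references().
         * Real pages track this in page->_count and page->_mapcount; the structure and
         * numbers here are illustrative only. */
        struct toy_page {
                int count;      /* total references held on the page */
                int mapcount;   /* references coming from user page tables */
        };

        static int can_migrate(const struct toy_page *p, int nr_refs)
        {
                /* Anything beyond ptes + the expected migration references means some
                 * other user (e.g. I/O in flight) still holds the page: back off. */
                return p->mapcount + nr_refs == p->count;
        }

        int main(void)
        {
                struct toy_page quiet = { .count = 4, .mapcount = 2 }; /* ptes + expected refs only */
                struct toy_page busy  = { .count = 5, .mapcount = 2 }; /* one extra, unexplained pin */

                printf("quiet page: %s\n", can_migrate(&quiet, 2) ? "migrate" : "-EAGAIN, retry later");
                printf("busy page:  %s\n", can_migrate(&busy, 2) ? "migrate" : "-EAGAIN, retry later");
                return 0;
        }
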
@@ -1074,32 +575,35 @@ int isolate_lru_page(struct page *page)
1074 * 575 *
1075 * returns how many pages were moved onto *@dst. 576 * returns how many pages were moved onto *@dst.
1076 */ 577 */
1077static int isolate_lru_pages(int nr_to_scan, struct list_head *src, 578static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1078 struct list_head *dst, int *scanned) 579 struct list_head *src, struct list_head *dst,
580 unsigned long *scanned)
1079{ 581{
1080 int nr_taken = 0; 582 unsigned long nr_taken = 0;
1081 struct page *page; 583 struct page *page;
1082 int scan = 0; 584 unsigned long scan;
1083 585
1084 while (scan++ < nr_to_scan && !list_empty(src)) { 586 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
587 struct list_head *target;
1085 page = lru_to_page(src); 588 page = lru_to_page(src);
1086 prefetchw_prev_lru_page(page, src, flags); 589 prefetchw_prev_lru_page(page, src, flags);
1087 590
1088 if (!TestClearPageLRU(page)) 591 BUG_ON(!PageLRU(page));
1089 BUG(); 592
1090 list_del(&page->lru); 593 list_del(&page->lru);
1091 if (get_page_testone(page)) { 594 target = src;
595 if (likely(get_page_unless_zero(page))) {
1092 /* 596 /*
1093 * It is being freed elsewhere 597 * Be careful not to clear PageLRU until after we're
598 * sure the page is not being freed elsewhere -- the
599 * page release code relies on it.
1094 */ 600 */
1095 __put_page(page); 601 ClearPageLRU(page);
1096 SetPageLRU(page); 602 target = dst;
1097 list_add(&page->lru, src);
1098 continue;
1099 } else {
1100 list_add(&page->lru, dst);
1101 nr_taken++; 603 nr_taken++;
1102 } 604 } /* else it is being freed elsewhere */
605
606 list_add(&page->lru, target);
1103 } 607 }
1104 608
1105 *scanned = scan; 609 *scanned = scan;
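
Annotation: the rewritten isolate_lru_pages() is careful about ordering. It first tries to take a reference with get_page_unless_zero(), and only on success clears PageLRU, because (as the new comment in the hunk says) the page-release code relies on seeing PageLRU while a page can still be freed. The primitive itself is just a conditional refcount increment; here is a user-space sketch of the same idea using C11 atomics (names are ours, not the kernel's):

        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdio.h>

        /* Model of get_page_unless_zero(): take a reference only if the count has
         * not already dropped to zero, i.e. the object is not being torn down. */
        static bool get_ref_unless_zero(atomic_int *refs)
        {
                int old = atomic_load(refs);

                while (old != 0) {
                        if (atomic_compare_exchange_weak(refs, &old, old + 1))
                                return true;    /* reference taken */
                        /* the failed CAS reloaded 'old'; retry with the fresh value */
                }
                return false;                   /* too late: it is being freed elsewhere */
        }

        int main(void)
        {
                atomic_int live = 2;
                atomic_int dying = 0;

                printf("live page:  %s\n", get_ref_unless_zero(&live) ? "isolated" : "left alone");
                printf("dying page: %s\n", get_ref_unless_zero(&dying) ? "isolated" : "left alone");
                return 0;
        }
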
@@ -1107,23 +611,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
1107} 611}
1108 612
1109/* 613/*
1110 * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed 614 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
615 * of reclaimed pages
1111 */ 616 */
1112static void shrink_cache(struct zone *zone, struct scan_control *sc) 617static unsigned long shrink_inactive_list(unsigned long max_scan,
618 struct zone *zone, struct scan_control *sc)
1113{ 619{
1114 LIST_HEAD(page_list); 620 LIST_HEAD(page_list);
1115 struct pagevec pvec; 621 struct pagevec pvec;
1116 int max_scan = sc->nr_to_scan; 622 unsigned long nr_scanned = 0;
623 unsigned long nr_reclaimed = 0;
1117 624
1118 pagevec_init(&pvec, 1); 625 pagevec_init(&pvec, 1);
1119 626
1120 lru_add_drain(); 627 lru_add_drain();
1121 spin_lock_irq(&zone->lru_lock); 628 spin_lock_irq(&zone->lru_lock);
1122 while (max_scan > 0) { 629 do {
1123 struct page *page; 630 struct page *page;
1124 int nr_taken; 631 unsigned long nr_taken;
1125 int nr_scan; 632 unsigned long nr_scan;
1126 int nr_freed; 633 unsigned long nr_freed;
1127 634
1128 nr_taken = isolate_lru_pages(sc->swap_cluster_max, 635 nr_taken = isolate_lru_pages(sc->swap_cluster_max,
1129 &zone->inactive_list, 636 &zone->inactive_list,
@@ -1132,12 +639,9 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
1132 zone->pages_scanned += nr_scan; 639 zone->pages_scanned += nr_scan;
1133 spin_unlock_irq(&zone->lru_lock); 640 spin_unlock_irq(&zone->lru_lock);
1134 641
1135 if (nr_taken == 0) 642 nr_scanned += nr_scan;
1136 goto done; 643 nr_freed = shrink_page_list(&page_list, sc);
1137 644 nr_reclaimed += nr_freed;
1138 max_scan -= nr_scan;
1139 nr_freed = shrink_list(&page_list, sc);
1140
1141 local_irq_disable(); 645 local_irq_disable();
1142 if (current_is_kswapd()) { 646 if (current_is_kswapd()) {
1143 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); 647 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
@@ -1146,14 +650,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
1146 __mod_page_state_zone(zone, pgscan_direct, nr_scan); 650 __mod_page_state_zone(zone, pgscan_direct, nr_scan);
1147 __mod_page_state_zone(zone, pgsteal, nr_freed); 651 __mod_page_state_zone(zone, pgsteal, nr_freed);
1148 652
653 if (nr_taken == 0)
654 goto done;
655
1149 spin_lock(&zone->lru_lock); 656 spin_lock(&zone->lru_lock);
1150 /* 657 /*
1151 * Put back any unfreeable pages. 658 * Put back any unfreeable pages.
1152 */ 659 */
1153 while (!list_empty(&page_list)) { 660 while (!list_empty(&page_list)) {
1154 page = lru_to_page(&page_list); 661 page = lru_to_page(&page_list);
1155 if (TestSetPageLRU(page)) 662 BUG_ON(PageLRU(page));
1156 BUG(); 663 SetPageLRU(page);
1157 list_del(&page->lru); 664 list_del(&page->lru);
1158 if (PageActive(page)) 665 if (PageActive(page))
1159 add_page_to_active_list(zone, page); 666 add_page_to_active_list(zone, page);
@@ -1165,10 +672,12 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
1165 spin_lock_irq(&zone->lru_lock); 672 spin_lock_irq(&zone->lru_lock);
1166 } 673 }
1167 } 674 }
1168 } 675 } while (nr_scanned < max_scan);
1169 spin_unlock_irq(&zone->lru_lock); 676 spin_unlock(&zone->lru_lock);
1170done: 677done:
678 local_irq_enable();
1171 pagevec_release(&pvec); 679 pagevec_release(&pvec);
680 return nr_reclaimed;
1172} 681}
1173 682
1174/* 683/*
@@ -1188,13 +697,12 @@ done:
1188 * The downside is that we have to touch page->_count against each page. 697 * The downside is that we have to touch page->_count against each page.
1189 * But we had to alter page->flags anyway. 698 * But we had to alter page->flags anyway.
1190 */ 699 */
1191static void 700static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1192refill_inactive_zone(struct zone *zone, struct scan_control *sc) 701 struct scan_control *sc)
1193{ 702{
1194 int pgmoved; 703 unsigned long pgmoved;
1195 int pgdeactivate = 0; 704 int pgdeactivate = 0;
1196 int pgscanned; 705 unsigned long pgscanned;
1197 int nr_pages = sc->nr_to_scan;
1198 LIST_HEAD(l_hold); /* The pages which were snipped off */ 706 LIST_HEAD(l_hold); /* The pages which were snipped off */
1199 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ 707 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
1200 LIST_HEAD(l_active); /* Pages to go onto the active_list */ 708 LIST_HEAD(l_active); /* Pages to go onto the active_list */
@@ -1202,7 +710,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1202 struct pagevec pvec; 710 struct pagevec pvec;
1203 int reclaim_mapped = 0; 711 int reclaim_mapped = 0;
1204 712
1205 if (unlikely(sc->may_swap)) { 713 if (sc->may_swap) {
1206 long mapped_ratio; 714 long mapped_ratio;
1207 long distress; 715 long distress;
1208 long swap_tendency; 716 long swap_tendency;
@@ -1272,10 +780,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1272 while (!list_empty(&l_inactive)) { 780 while (!list_empty(&l_inactive)) {
1273 page = lru_to_page(&l_inactive); 781 page = lru_to_page(&l_inactive);
1274 prefetchw_prev_lru_page(page, &l_inactive, flags); 782 prefetchw_prev_lru_page(page, &l_inactive, flags);
1275 if (TestSetPageLRU(page)) 783 BUG_ON(PageLRU(page));
1276 BUG(); 784 SetPageLRU(page);
1277 if (!TestClearPageActive(page)) 785 BUG_ON(!PageActive(page));
1278 BUG(); 786 ClearPageActive(page);
787
1279 list_move(&page->lru, &zone->inactive_list); 788 list_move(&page->lru, &zone->inactive_list);
1280 pgmoved++; 789 pgmoved++;
1281 if (!pagevec_add(&pvec, page)) { 790 if (!pagevec_add(&pvec, page)) {
@@ -1301,8 +810,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1301 while (!list_empty(&l_active)) { 810 while (!list_empty(&l_active)) {
1302 page = lru_to_page(&l_active); 811 page = lru_to_page(&l_active);
1303 prefetchw_prev_lru_page(page, &l_active, flags); 812 prefetchw_prev_lru_page(page, &l_active, flags);
1304 if (TestSetPageLRU(page)) 813 BUG_ON(PageLRU(page));
1305 BUG(); 814 SetPageLRU(page);
1306 BUG_ON(!PageActive(page)); 815 BUG_ON(!PageActive(page));
1307 list_move(&page->lru, &zone->active_list); 816 list_move(&page->lru, &zone->active_list);
1308 pgmoved++; 817 pgmoved++;
@@ -1327,11 +836,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1327/* 836/*
1328 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 837 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1329 */ 838 */
1330static void 839static unsigned long shrink_zone(int priority, struct zone *zone,
1331shrink_zone(struct zone *zone, struct scan_control *sc) 840 struct scan_control *sc)
1332{ 841{
1333 unsigned long nr_active; 842 unsigned long nr_active;
1334 unsigned long nr_inactive; 843 unsigned long nr_inactive;
844 unsigned long nr_to_scan;
845 unsigned long nr_reclaimed = 0;
1335 846
1336 atomic_inc(&zone->reclaim_in_progress); 847 atomic_inc(&zone->reclaim_in_progress);
1337 848
@@ -1339,14 +850,14 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
1339 * Add one to `nr_to_scan' just to make sure that the kernel will 850 * Add one to `nr_to_scan' just to make sure that the kernel will
1340 * slowly sift through the active list. 851 * slowly sift through the active list.
1341 */ 852 */
1342 zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; 853 zone->nr_scan_active += (zone->nr_active >> priority) + 1;
1343 nr_active = zone->nr_scan_active; 854 nr_active = zone->nr_scan_active;
1344 if (nr_active >= sc->swap_cluster_max) 855 if (nr_active >= sc->swap_cluster_max)
1345 zone->nr_scan_active = 0; 856 zone->nr_scan_active = 0;
1346 else 857 else
1347 nr_active = 0; 858 nr_active = 0;
1348 859
1349 zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; 860 zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
1350 nr_inactive = zone->nr_scan_inactive; 861 nr_inactive = zone->nr_scan_inactive;
1351 if (nr_inactive >= sc->swap_cluster_max) 862 if (nr_inactive >= sc->swap_cluster_max)
1352 zone->nr_scan_inactive = 0; 863 zone->nr_scan_inactive = 0;
@@ -1355,23 +866,25 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
1355 866
1356 while (nr_active || nr_inactive) { 867 while (nr_active || nr_inactive) {
1357 if (nr_active) { 868 if (nr_active) {
1358 sc->nr_to_scan = min(nr_active, 869 nr_to_scan = min(nr_active,
1359 (unsigned long)sc->swap_cluster_max); 870 (unsigned long)sc->swap_cluster_max);
1360 nr_active -= sc->nr_to_scan; 871 nr_active -= nr_to_scan;
1361 refill_inactive_zone(zone, sc); 872 shrink_active_list(nr_to_scan, zone, sc);
1362 } 873 }
1363 874
1364 if (nr_inactive) { 875 if (nr_inactive) {
1365 sc->nr_to_scan = min(nr_inactive, 876 nr_to_scan = min(nr_inactive,
1366 (unsigned long)sc->swap_cluster_max); 877 (unsigned long)sc->swap_cluster_max);
1367 nr_inactive -= sc->nr_to_scan; 878 nr_inactive -= nr_to_scan;
1368 shrink_cache(zone, sc); 879 nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
880 sc);
1369 } 881 }
1370 } 882 }
1371 883
1372 throttle_vm_writeout(); 884 throttle_vm_writeout();
1373 885
1374 atomic_dec(&zone->reclaim_in_progress); 886 atomic_dec(&zone->reclaim_in_progress);
887 return nr_reclaimed;
1375} 888}
1376 889
1377/* 890/*
@@ -1390,9 +903,10 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
1390 * If a zone is deemed to be full of pinned pages then just give it a light 903 * If a zone is deemed to be full of pinned pages then just give it a light
1391 * scan then give up on it. 904 * scan then give up on it.
1392 */ 905 */
1393static void 906static unsigned long shrink_zones(int priority, struct zone **zones,
1394shrink_caches(struct zone **zones, struct scan_control *sc) 907 struct scan_control *sc)
1395{ 908{
909 unsigned long nr_reclaimed = 0;
1396 int i; 910 int i;
1397 911
1398 for (i = 0; zones[i] != NULL; i++) { 912 for (i = 0; zones[i] != NULL; i++) {
@@ -1404,15 +918,16 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
1404 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 918 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1405 continue; 919 continue;
1406 920
1407 zone->temp_priority = sc->priority; 921 zone->temp_priority = priority;
1408 if (zone->prev_priority > sc->priority) 922 if (zone->prev_priority > priority)
1409 zone->prev_priority = sc->priority; 923 zone->prev_priority = priority;
1410 924
1411 if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) 925 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1412 continue; /* Let kswapd poll it */ 926 continue; /* Let kswapd poll it */
1413 927
1414 shrink_zone(zone, sc); 928 nr_reclaimed += shrink_zone(priority, zone, sc);
1415 } 929 }
930 return nr_reclaimed;
1416} 931}
1417 932
1418/* 933/*
@@ -1428,19 +943,21 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
1428 * holds filesystem locks which prevent writeout this might not work, and the 943 * holds filesystem locks which prevent writeout this might not work, and the
1429 * allocation attempt will fail. 944 * allocation attempt will fail.
1430 */ 945 */
1431int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) 946unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1432{ 947{
1433 int priority; 948 int priority;
1434 int ret = 0; 949 int ret = 0;
1435 int total_scanned = 0, total_reclaimed = 0; 950 unsigned long total_scanned = 0;
951 unsigned long nr_reclaimed = 0;
1436 struct reclaim_state *reclaim_state = current->reclaim_state; 952 struct reclaim_state *reclaim_state = current->reclaim_state;
1437 struct scan_control sc;
1438 unsigned long lru_pages = 0; 953 unsigned long lru_pages = 0;
1439 int i; 954 int i;
1440 955 struct scan_control sc = {
1441 sc.gfp_mask = gfp_mask; 956 .gfp_mask = gfp_mask,
1442 sc.may_writepage = !laptop_mode; 957 .may_writepage = !laptop_mode,
1443 sc.may_swap = 1; 958 .swap_cluster_max = SWAP_CLUSTER_MAX,
959 .may_swap = 1,
960 };
1444 961
1445 inc_page_state(allocstall); 962 inc_page_state(allocstall);
1446 963
@@ -1457,20 +974,16 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1457 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 974 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1458 sc.nr_mapped = read_page_state(nr_mapped); 975 sc.nr_mapped = read_page_state(nr_mapped);
1459 sc.nr_scanned = 0; 976 sc.nr_scanned = 0;
1460 sc.nr_reclaimed = 0;
1461 sc.priority = priority;
1462 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1463 if (!priority) 977 if (!priority)
1464 disable_swap_token(); 978 disable_swap_token();
1465 shrink_caches(zones, &sc); 979 nr_reclaimed += shrink_zones(priority, zones, &sc);
1466 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); 980 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
1467 if (reclaim_state) { 981 if (reclaim_state) {
1468 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 982 nr_reclaimed += reclaim_state->reclaimed_slab;
1469 reclaim_state->reclaimed_slab = 0; 983 reclaim_state->reclaimed_slab = 0;
1470 } 984 }
1471 total_scanned += sc.nr_scanned; 985 total_scanned += sc.nr_scanned;
1472 total_reclaimed += sc.nr_reclaimed; 986 if (nr_reclaimed >= sc.swap_cluster_max) {
1473 if (total_reclaimed >= sc.swap_cluster_max) {
1474 ret = 1; 987 ret = 1;
1475 goto out; 988 goto out;
1476 } 989 }
@@ -1482,7 +995,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1482 * that's undesirable in laptop mode, where we *want* lumpy 995 * that's undesirable in laptop mode, where we *want* lumpy
1483 * writeout. So in laptop mode, write out the whole world. 996 * writeout. So in laptop mode, write out the whole world.
1484 */ 997 */
1485 if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { 998 if (total_scanned > sc.swap_cluster_max +
999 sc.swap_cluster_max / 2) {
1486 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1000 wakeup_pdflush(laptop_mode ? 0 : total_scanned);
1487 sc.may_writepage = 1; 1001 sc.may_writepage = 1;
1488 } 1002 }
@@ -1528,22 +1042,26 @@ out:
1528 * the page allocator fallback scheme to ensure that aging of pages is balanced 1042 * the page allocator fallback scheme to ensure that aging of pages is balanced
1529 * across the zones. 1043 * across the zones.
1530 */ 1044 */
1531static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) 1045static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
1046 int order)
1532{ 1047{
1533 int to_free = nr_pages; 1048 unsigned long to_free = nr_pages;
1534 int all_zones_ok; 1049 int all_zones_ok;
1535 int priority; 1050 int priority;
1536 int i; 1051 int i;
1537 int total_scanned, total_reclaimed; 1052 unsigned long total_scanned;
1053 unsigned long nr_reclaimed;
1538 struct reclaim_state *reclaim_state = current->reclaim_state; 1054 struct reclaim_state *reclaim_state = current->reclaim_state;
1539 struct scan_control sc; 1055 struct scan_control sc = {
1056 .gfp_mask = GFP_KERNEL,
1057 .may_swap = 1,
1058 .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
1059 };
1540 1060
1541loop_again: 1061loop_again:
1542 total_scanned = 0; 1062 total_scanned = 0;
1543 total_reclaimed = 0; 1063 nr_reclaimed = 0;
1544 sc.gfp_mask = GFP_KERNEL; 1064 sc.may_writepage = !laptop_mode,
1545 sc.may_writepage = !laptop_mode;
1546 sc.may_swap = 1;
1547 sc.nr_mapped = read_page_state(nr_mapped); 1065 sc.nr_mapped = read_page_state(nr_mapped);
1548 1066
1549 inc_page_state(pageoutrun); 1067 inc_page_state(pageoutrun);
@@ -1624,15 +1142,11 @@ scan:
1624 if (zone->prev_priority > priority) 1142 if (zone->prev_priority > priority)
1625 zone->prev_priority = priority; 1143 zone->prev_priority = priority;
1626 sc.nr_scanned = 0; 1144 sc.nr_scanned = 0;
1627 sc.nr_reclaimed = 0; 1145 nr_reclaimed += shrink_zone(priority, zone, &sc);
1628 sc.priority = priority;
1629 sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
1630 shrink_zone(zone, &sc);
1631 reclaim_state->reclaimed_slab = 0; 1146 reclaim_state->reclaimed_slab = 0;
1632 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1147 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1633 lru_pages); 1148 lru_pages);
1634 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 1149 nr_reclaimed += reclaim_state->reclaimed_slab;
1635 total_reclaimed += sc.nr_reclaimed;
1636 total_scanned += sc.nr_scanned; 1150 total_scanned += sc.nr_scanned;
1637 if (zone->all_unreclaimable) 1151 if (zone->all_unreclaimable)
1638 continue; 1152 continue;
@@ -1645,10 +1159,10 @@ scan:
1645 * even in laptop mode 1159 * even in laptop mode
1646 */ 1160 */
1647 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 1161 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
1648 total_scanned > total_reclaimed+total_reclaimed/2) 1162 total_scanned > nr_reclaimed + nr_reclaimed / 2)
1649 sc.may_writepage = 1; 1163 sc.may_writepage = 1;
1650 } 1164 }
1651 if (nr_pages && to_free > total_reclaimed) 1165 if (nr_pages && to_free > nr_reclaimed)
1652 continue; /* swsusp: need to do more work */ 1166 continue; /* swsusp: need to do more work */
1653 if (all_zones_ok) 1167 if (all_zones_ok)
1654 break; /* kswapd: all done */ 1168 break; /* kswapd: all done */
@@ -1665,7 +1179,7 @@ scan:
1665 * matches the direct reclaim path behaviour in terms of impact 1179 * matches the direct reclaim path behaviour in terms of impact
1666 * on zone->*_priority. 1180 * on zone->*_priority.
1667 */ 1181 */
1668 if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages)) 1182 if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
1669 break; 1183 break;
1670 } 1184 }
1671out: 1185out:
@@ -1679,7 +1193,7 @@ out:
1679 goto loop_again; 1193 goto loop_again;
1680 } 1194 }
1681 1195
1682 return total_reclaimed; 1196 return nr_reclaimed;
1683} 1197}
1684 1198
1685/* 1199/*
@@ -1779,24 +1293,31 @@ void wakeup_kswapd(struct zone *zone, int order)
1779 * Try to free `nr_pages' of memory, system-wide. Returns the number of freed 1293 * Try to free `nr_pages' of memory, system-wide. Returns the number of freed
1780 * pages. 1294 * pages.
1781 */ 1295 */
1782int shrink_all_memory(int nr_pages) 1296unsigned long shrink_all_memory(unsigned long nr_pages)
1783{ 1297{
1784 pg_data_t *pgdat; 1298 pg_data_t *pgdat;
1785 int nr_to_free = nr_pages; 1299 unsigned long nr_to_free = nr_pages;
1786 int ret = 0; 1300 unsigned long ret = 0;
1301 unsigned retry = 2;
1787 struct reclaim_state reclaim_state = { 1302 struct reclaim_state reclaim_state = {
1788 .reclaimed_slab = 0, 1303 .reclaimed_slab = 0,
1789 }; 1304 };
1790 1305
1791 current->reclaim_state = &reclaim_state; 1306 current->reclaim_state = &reclaim_state;
1307repeat:
1792 for_each_pgdat(pgdat) { 1308 for_each_pgdat(pgdat) {
1793 int freed; 1309 unsigned long freed;
1310
1794 freed = balance_pgdat(pgdat, nr_to_free, 0); 1311 freed = balance_pgdat(pgdat, nr_to_free, 0);
1795 ret += freed; 1312 ret += freed;
1796 nr_to_free -= freed; 1313 nr_to_free -= freed;
1797 if (nr_to_free <= 0) 1314 if ((long)nr_to_free <= 0)
1798 break; 1315 break;
1799 } 1316 }
1317 if (retry-- && ret < nr_pages) {
1318 blk_congestion_wait(WRITE, HZ/5);
1319 goto repeat;
1320 }
1800 current->reclaim_state = NULL; 1321 current->reclaim_state = NULL;
1801 return ret; 1322 return ret;
1802} 1323}
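
Annotation: one subtle point in shrink_all_memory(): nr_to_free is now unsigned long, and balance_pgdat() may free more pages than were requested, so the running subtraction can wrap past zero. The (long) cast in the loop exit test restores the intended "target met or exceeded" meaning. A short user-space illustration with made-up numbers:

        #include <stdio.h>

        int main(void)
        {
                unsigned long nr_to_free = 100;
                unsigned long freed = 130;              /* balance_pgdat() overshot the request */

                nr_to_free -= freed;                    /* wraps around to a huge positive value */
                printf("plain test:  %d\n", nr_to_free <= 0);           /* 0: loop would continue */
                printf("signed test: %d\n", (long)nr_to_free <= 0);     /* 1: loop stops as intended */
                return 0;
        }
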
@@ -1808,8 +1329,7 @@ int shrink_all_memory(int nr_pages)
1808 away, we get changed to run anywhere: as the first one comes back, 1329 away, we get changed to run anywhere: as the first one comes back,
1809 restore their cpu bindings. */ 1330 restore their cpu bindings. */
1810static int __devinit cpu_callback(struct notifier_block *nfb, 1331static int __devinit cpu_callback(struct notifier_block *nfb,
1811 unsigned long action, 1332 unsigned long action, void *hcpu)
1812 void *hcpu)
1813{ 1333{
1814 pg_data_t *pgdat; 1334 pg_data_t *pgdat;
1815 cpumask_t mask; 1335 cpumask_t mask;
@@ -1829,10 +1349,15 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
1829static int __init kswapd_init(void) 1349static int __init kswapd_init(void)
1830{ 1350{
1831 pg_data_t *pgdat; 1351 pg_data_t *pgdat;
1352
1832 swap_setup(); 1353 swap_setup();
1833 for_each_pgdat(pgdat) 1354 for_each_pgdat(pgdat) {
1834 pgdat->kswapd 1355 pid_t pid;
1835 = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); 1356
1357 pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
1358 BUG_ON(pid < 0);
1359 pgdat->kswapd = find_task_by_pid(pid);
1360 }
1836 total_memory = nr_free_pagecache_pages(); 1361 total_memory = nr_free_pagecache_pages();
1837 hotcpu_notifier(cpu_callback, 0); 1362 hotcpu_notifier(cpu_callback, 0);
1838 return 0; 1363 return 0;
@@ -1874,46 +1399,24 @@ int zone_reclaim_interval __read_mostly = 30*HZ;
1874/* 1399/*
1875 * Try to free up some pages from this zone through reclaim. 1400 * Try to free up some pages from this zone through reclaim.
1876 */ 1401 */
1877int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1402static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1878{ 1403{
1879 int nr_pages; 1404 /* Minimum pages needed in order to stay on node */
1405 const unsigned long nr_pages = 1 << order;
1880 struct task_struct *p = current; 1406 struct task_struct *p = current;
1881 struct reclaim_state reclaim_state; 1407 struct reclaim_state reclaim_state;
1882 struct scan_control sc; 1408 int priority;
1883 cpumask_t mask; 1409 unsigned long nr_reclaimed = 0;
1884 int node_id; 1410 struct scan_control sc = {
1885 1411 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
1886 if (time_before(jiffies, 1412 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
1887 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) 1413 .nr_mapped = read_page_state(nr_mapped),
1888 return 0; 1414 .swap_cluster_max = max_t(unsigned long, nr_pages,
1889 1415 SWAP_CLUSTER_MAX),
1890 if (!(gfp_mask & __GFP_WAIT) || 1416 .gfp_mask = gfp_mask,
1891 zone->all_unreclaimable || 1417 };
1892 atomic_read(&zone->reclaim_in_progress) > 0 ||
1893 (p->flags & PF_MEMALLOC))
1894 return 0;
1895
1896 node_id = zone->zone_pgdat->node_id;
1897 mask = node_to_cpumask(node_id);
1898 if (!cpus_empty(mask) && node_id != numa_node_id())
1899 return 0;
1900
1901 sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
1902 sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
1903 sc.nr_scanned = 0;
1904 sc.nr_reclaimed = 0;
1905 sc.priority = ZONE_RECLAIM_PRIORITY + 1;
1906 sc.nr_mapped = read_page_state(nr_mapped);
1907 sc.gfp_mask = gfp_mask;
1908 1418
1909 disable_swap_token(); 1419 disable_swap_token();
1910
1911 nr_pages = 1 << order;
1912 if (nr_pages > SWAP_CLUSTER_MAX)
1913 sc.swap_cluster_max = nr_pages;
1914 else
1915 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1916
1917 cond_resched(); 1420 cond_resched();
1918 /* 1421 /*
1919 * We need to be able to allocate from the reserves for RECLAIM_SWAP 1422 * We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -1928,17 +1431,20 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1928 * Free memory by calling shrink zone with increasing priorities 1431 * Free memory by calling shrink zone with increasing priorities
1929 * until we have enough memory freed. 1432 * until we have enough memory freed.
1930 */ 1433 */
1434 priority = ZONE_RECLAIM_PRIORITY;
1931 do { 1435 do {
1932 sc.priority--; 1436 nr_reclaimed += shrink_zone(priority, zone, &sc);
1933 shrink_zone(zone, &sc); 1437 priority--;
1438 } while (priority >= 0 && nr_reclaimed < nr_pages);
1934 1439
1935 } while (sc.nr_reclaimed < nr_pages && sc.priority > 0); 1440 if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
1936
1937 if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
1938 /* 1441 /*
1939 * shrink_slab does not currently allow us to determine 1442 * shrink_slab() does not currently allow us to determine how
1940 * how many pages were freed in the zone. So we just 1443 * many pages were freed in this zone. So we just shake the slab
1941 * shake the slab and then go offnode for a single allocation. 1444 * a bit and then go off node for this particular allocation
1445 * despite possibly having freed enough memory to allocate in
1446 * this zone. If we freed local memory then the next
1447 * allocations will be local again.
1942 * 1448 *
1943 * shrink_slab will free memory on all zones and may take 1449 * shrink_slab will free memory on all zones and may take
1944 * a long time. 1450 * a long time.
@@ -1949,10 +1455,54 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1949 p->reclaim_state = NULL; 1455 p->reclaim_state = NULL;
1950 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 1456 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
1951 1457
1952 if (sc.nr_reclaimed == 0) 1458 if (nr_reclaimed == 0) {
1459 /*
1460 * We were unable to reclaim enough pages to stay on node. We
1461 * now allow off node accesses for a certain time period before
1462 * trying again to reclaim pages from the local zone.
1463 */
1953 zone->last_unsuccessful_zone_reclaim = jiffies; 1464 zone->last_unsuccessful_zone_reclaim = jiffies;
1465 }
1954 1466
1955 return sc.nr_reclaimed >= nr_pages; 1467 return nr_reclaimed >= nr_pages;
1956} 1468}
1957#endif
1958 1469
1470int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1471{
1472 cpumask_t mask;
1473 int node_id;
1474
1475 /*
1476 * Do not reclaim if there was a recent unsuccessful attempt at zone
1477 * reclaim. In that case we let allocations go off node for the
1478 * zone_reclaim_interval. Otherwise we would scan for each off-node
1479 * page allocation.
1480 */
1481 if (time_before(jiffies,
1482 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
1483 return 0;
1484
1485 /*
1486 * Avoid concurrent zone reclaims, do not reclaim in a zone that does
1487 * not have reclaimable pages and if we should not delay the allocation
1488 * then do not scan.
1489 */
1490 if (!(gfp_mask & __GFP_WAIT) ||
1491 zone->all_unreclaimable ||
1492 atomic_read(&zone->reclaim_in_progress) > 0 ||
1493 (current->flags & PF_MEMALLOC))
1494 return 0;
1495
1496 /*
1497 * Only run zone reclaim on the local zone or on zones that do not
1498 * have associated processors. This will favor the local processor
1499 * over remote processors and spread off node memory allocations
1500 * as wide as possible.
1501 */
1502 node_id = zone->zone_pgdat->node_id;
1503 mask = node_to_cpumask(node_id);
1504 if (!cpus_empty(mask) && node_id != numa_node_id())
1505 return 0;
1506 return __zone_reclaim(zone, gfp_mask, order);
1507}
1508#endif
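
Annotation: the new zone_reclaim() wrapper keeps all the cheap early-exit tests (recent failure, non-blocking allocation, reclaim already in progress, PF_MEMALLOC, remote node with its own CPUs) in front of the expensive __zone_reclaim() scan. The "recent failure" test relies on the kernel's wraparound-safe jiffies comparison; here is a stand-alone sketch of why the signed-difference idiom is needed (the macro mirrors the kernel's time_before(), everything else is invented for the example):

        #include <stdio.h>

        /* Wraparound-safe "a is earlier than b", as used for jiffies comparisons. */
        #define time_before(a, b)       ((long)((a) - (b)) < 0)

        int main(void)
        {
                unsigned long hz = 250;
                unsigned long interval = 30 * hz;                /* zone_reclaim_interval = 30*HZ  */
                unsigned long last_fail = (unsigned long)-10000; /* counter was about to wrap      */
                unsigned long now = 100;                         /* 10100 ticks later, after wrap  */
                unsigned long deadline = last_fail + interval;   /* still a huge value, no wrap    */

                /* A naive comparison thinks "now" is long before the deadline and would keep
                 * skipping zone reclaim forever; the signed difference gets it right. */
                printf("naive  now < deadline      -> %d (wrongly 'too soon')\n", now < deadline);
                printf("time_before(now, deadline) -> %d (correctly expired)\n",
                       time_before(now, deadline));
                return 0;
        }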