aboutsummaryrefslogtreecommitdiffstats
path: root/mm/vmscan.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--mm/vmscan.c1026
1 files changed, 760 insertions, 266 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1ff1a58e7c10..3b5860294bb6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -39,6 +39,7 @@
39#include <linux/freezer.h> 39#include <linux/freezer.h>
40#include <linux/memcontrol.h> 40#include <linux/memcontrol.h>
41#include <linux/delayacct.h> 41#include <linux/delayacct.h>
42#include <linux/sysctl.h>
42 43
43#include <asm/tlbflush.h> 44#include <asm/tlbflush.h>
44#include <asm/div64.h> 45#include <asm/div64.h>
@@ -78,7 +79,7 @@ struct scan_control {
78 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, 79 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
79 unsigned long *scanned, int order, int mode, 80 unsigned long *scanned, int order, int mode,
80 struct zone *z, struct mem_cgroup *mem_cont, 81 struct zone *z, struct mem_cgroup *mem_cont,
81 int active); 82 int active, int file);
82}; 83};
83 84
84#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 85#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -470,6 +471,85 @@ int remove_mapping(struct address_space *mapping, struct page *page)
470 return 0; 471 return 0;
471} 472}
472 473
474/**
475 * putback_lru_page - put previously isolated page onto appropriate LRU list
476 * @page: page to be put back to appropriate lru list
477 *
478 * Add previously isolated @page to appropriate LRU list.
479 * Page may still be unevictable for other reasons.
480 *
481 * lru_lock must not be held, interrupts must be enabled.
482 */
483#ifdef CONFIG_UNEVICTABLE_LRU
484void putback_lru_page(struct page *page)
485{
486 int lru;
487 int active = !!TestClearPageActive(page);
488 int was_unevictable = PageUnevictable(page);
489
490 VM_BUG_ON(PageLRU(page));
491
492redo:
493 ClearPageUnevictable(page);
494
495 if (page_evictable(page, NULL)) {
496 /*
497 * For evictable pages, we can use the cache.
498 * In event of a race, worst case is we end up with an
499 * unevictable page on [in]active list.
500 * We know how to handle that.
501 */
502 lru = active + page_is_file_cache(page);
503 lru_cache_add_lru(page, lru);
504 } else {
505 /*
506 * Put unevictable pages directly on zone's unevictable
507 * list.
508 */
509 lru = LRU_UNEVICTABLE;
510 add_page_to_unevictable_list(page);
511 }
512 mem_cgroup_move_lists(page, lru);
513
514 /*
515 * page's status can change while we move it among lru. If an evictable
516 * page is on unevictable list, it will never be freed. To avoid that,
517 * check again after we have added it to the list.
518 */
519 if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
520 if (!isolate_lru_page(page)) {
521 put_page(page);
522 goto redo;
523 }
524 /* This means someone else dropped this page from LRU
525 * So, it will be freed or putback to LRU again. There is
526 * nothing to do here.
527 */
528 }
529
530 if (was_unevictable && lru != LRU_UNEVICTABLE)
531 count_vm_event(UNEVICTABLE_PGRESCUED);
532 else if (!was_unevictable && lru == LRU_UNEVICTABLE)
533 count_vm_event(UNEVICTABLE_PGCULLED);
534
535 put_page(page); /* drop ref from isolate */
536}
537
538#else /* CONFIG_UNEVICTABLE_LRU */
539
540void putback_lru_page(struct page *page)
541{
542 int lru;
543 VM_BUG_ON(PageLRU(page));
544
545 lru = !!TestClearPageActive(page) + page_is_file_cache(page);
546 lru_cache_add_lru(page, lru);
547 mem_cgroup_move_lists(page, lru);
548 put_page(page);
549}
550#endif /* CONFIG_UNEVICTABLE_LRU */
551
552
473/* 553/*
474 * shrink_page_list() returns the number of reclaimed pages 554 * shrink_page_list() returns the number of reclaimed pages
475 */ 555 */
@@ -503,6 +583,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
503 583
504 sc->nr_scanned++; 584 sc->nr_scanned++;
505 585
586 if (unlikely(!page_evictable(page, NULL)))
587 goto cull_mlocked;
588
506 if (!sc->may_swap && page_mapped(page)) 589 if (!sc->may_swap && page_mapped(page))
507 goto keep_locked; 590 goto keep_locked;
508 591
@@ -539,9 +622,19 @@ static unsigned long shrink_page_list(struct list_head *page_list,
539 * Anonymous process memory has backing store? 622 * Anonymous process memory has backing store?
540 * Try to allocate it some swap space here. 623 * Try to allocate it some swap space here.
541 */ 624 */
542 if (PageAnon(page) && !PageSwapCache(page)) 625 if (PageAnon(page) && !PageSwapCache(page)) {
626 switch (try_to_munlock(page)) {
627 case SWAP_FAIL: /* shouldn't happen */
628 case SWAP_AGAIN:
629 goto keep_locked;
630 case SWAP_MLOCK:
631 goto cull_mlocked;
632 case SWAP_SUCCESS:
633 ; /* fall thru'; add to swap cache */
634 }
543 if (!add_to_swap(page, GFP_ATOMIC)) 635 if (!add_to_swap(page, GFP_ATOMIC))
544 goto activate_locked; 636 goto activate_locked;
637 }
545#endif /* CONFIG_SWAP */ 638#endif /* CONFIG_SWAP */
546 639
547 mapping = page_mapping(page); 640 mapping = page_mapping(page);
@@ -556,6 +649,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
556 goto activate_locked; 649 goto activate_locked;
557 case SWAP_AGAIN: 650 case SWAP_AGAIN:
558 goto keep_locked; 651 goto keep_locked;
652 case SWAP_MLOCK:
653 goto cull_mlocked;
559 case SWAP_SUCCESS: 654 case SWAP_SUCCESS:
560 ; /* try to free the page below */ 655 ; /* try to free the page below */
561 } 656 }
@@ -602,7 +697,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
602 * possible for a page to have PageDirty set, but it is actually 697 * possible for a page to have PageDirty set, but it is actually
603 * clean (all its buffers are clean). This happens if the 698 * clean (all its buffers are clean). This happens if the
604 * buffers were written out directly, with submit_bh(). ext3 699 * buffers were written out directly, with submit_bh(). ext3
605 * will do this, as well as the blockdev mapping. 700 * will do this, as well as the blockdev mapping.
606 * try_to_release_page() will discover that cleanness and will 701 * try_to_release_page() will discover that cleanness and will
607 * drop the buffers and mark the page clean - it can be freed. 702 * drop the buffers and mark the page clean - it can be freed.
608 * 703 *
@@ -637,7 +732,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
637 if (!mapping || !__remove_mapping(mapping, page)) 732 if (!mapping || !__remove_mapping(mapping, page))
638 goto keep_locked; 733 goto keep_locked;
639 734
640 unlock_page(page); 735 /*
736 * At this point, we have no other references and there is
737 * no way to pick any more up (removed from LRU, removed
738 * from pagecache). Can use non-atomic bitops now (and
739 * we obviously don't have to worry about waking up a process
740 * waiting on the page lock, because there are no references).
741 */
742 __clear_page_locked(page);
641free_it: 743free_it:
642 nr_reclaimed++; 744 nr_reclaimed++;
643 if (!pagevec_add(&freed_pvec, page)) { 745 if (!pagevec_add(&freed_pvec, page)) {
@@ -646,14 +748,23 @@ free_it:
646 } 748 }
647 continue; 749 continue;
648 750
751cull_mlocked:
752 unlock_page(page);
753 putback_lru_page(page);
754 continue;
755
649activate_locked: 756activate_locked:
757 /* Not a candidate for swapping, so reclaim swap space. */
758 if (PageSwapCache(page) && vm_swap_full())
759 remove_exclusive_swap_page_ref(page);
760 VM_BUG_ON(PageActive(page));
650 SetPageActive(page); 761 SetPageActive(page);
651 pgactivate++; 762 pgactivate++;
652keep_locked: 763keep_locked:
653 unlock_page(page); 764 unlock_page(page);
654keep: 765keep:
655 list_add(&page->lru, &ret_pages); 766 list_add(&page->lru, &ret_pages);
656 VM_BUG_ON(PageLRU(page)); 767 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
657 } 768 }
658 list_splice(&ret_pages, page_list); 769 list_splice(&ret_pages, page_list);
659 if (pagevec_count(&freed_pvec)) 770 if (pagevec_count(&freed_pvec))
@@ -677,7 +788,7 @@ keep:
677 * 788 *
678 * returns 0 on success, -ve errno on failure. 789 * returns 0 on success, -ve errno on failure.
679 */ 790 */
680int __isolate_lru_page(struct page *page, int mode) 791int __isolate_lru_page(struct page *page, int mode, int file)
681{ 792{
682 int ret = -EINVAL; 793 int ret = -EINVAL;
683 794
@@ -693,6 +804,17 @@ int __isolate_lru_page(struct page *page, int mode)
693 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 804 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
694 return ret; 805 return ret;
695 806
807 if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
808 return ret;
809
810 /*
811 * When this function is being called for lumpy reclaim, we
812 * initially look into all LRU pages, active, inactive and
813 * unevictable; only give shrink_page_list evictable pages.
814 */
815 if (PageUnevictable(page))
816 return ret;
817
696 ret = -EBUSY; 818 ret = -EBUSY;
697 if (likely(get_page_unless_zero(page))) { 819 if (likely(get_page_unless_zero(page))) {
698 /* 820 /*
@@ -723,12 +845,13 @@ int __isolate_lru_page(struct page *page, int mode)
723 * @scanned: The number of pages that were scanned. 845 * @scanned: The number of pages that were scanned.
724 * @order: The caller's attempted allocation order 846 * @order: The caller's attempted allocation order
725 * @mode: One of the LRU isolation modes 847 * @mode: One of the LRU isolation modes
848 * @file: True [1] if isolating file [!anon] pages
726 * 849 *
727 * returns how many pages were moved onto *@dst. 850 * returns how many pages were moved onto *@dst.
728 */ 851 */
729static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 852static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
730 struct list_head *src, struct list_head *dst, 853 struct list_head *src, struct list_head *dst,
731 unsigned long *scanned, int order, int mode) 854 unsigned long *scanned, int order, int mode, int file)
732{ 855{
733 unsigned long nr_taken = 0; 856 unsigned long nr_taken = 0;
734 unsigned long scan; 857 unsigned long scan;
@@ -745,7 +868,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
745 868
746 VM_BUG_ON(!PageLRU(page)); 869 VM_BUG_ON(!PageLRU(page));
747 870
748 switch (__isolate_lru_page(page, mode)) { 871 switch (__isolate_lru_page(page, mode, file)) {
749 case 0: 872 case 0:
750 list_move(&page->lru, dst); 873 list_move(&page->lru, dst);
751 nr_taken++; 874 nr_taken++;
@@ -788,10 +911,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
788 break; 911 break;
789 912
790 cursor_page = pfn_to_page(pfn); 913 cursor_page = pfn_to_page(pfn);
914
791 /* Check that we have not crossed a zone boundary. */ 915 /* Check that we have not crossed a zone boundary. */
792 if (unlikely(page_zone_id(cursor_page) != zone_id)) 916 if (unlikely(page_zone_id(cursor_page) != zone_id))
793 continue; 917 continue;
794 switch (__isolate_lru_page(cursor_page, mode)) { 918 switch (__isolate_lru_page(cursor_page, mode, file)) {
795 case 0: 919 case 0:
796 list_move(&cursor_page->lru, dst); 920 list_move(&cursor_page->lru, dst);
797 nr_taken++; 921 nr_taken++;
@@ -802,7 +926,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
802 /* else it is being freed elsewhere */ 926 /* else it is being freed elsewhere */
803 list_move(&cursor_page->lru, src); 927 list_move(&cursor_page->lru, src);
804 default: 928 default:
805 break; 929 break; /* ! on LRU or wrong list */
806 } 930 }
807 } 931 }
808 } 932 }
@@ -816,40 +940,93 @@ static unsigned long isolate_pages_global(unsigned long nr,
816 unsigned long *scanned, int order, 940 unsigned long *scanned, int order,
817 int mode, struct zone *z, 941 int mode, struct zone *z,
818 struct mem_cgroup *mem_cont, 942 struct mem_cgroup *mem_cont,
819 int active) 943 int active, int file)
820{ 944{
945 int lru = LRU_BASE;
821 if (active) 946 if (active)
822 return isolate_lru_pages(nr, &z->active_list, dst, 947 lru += LRU_ACTIVE;
823 scanned, order, mode); 948 if (file)
824 else 949 lru += LRU_FILE;
825 return isolate_lru_pages(nr, &z->inactive_list, dst, 950 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
826 scanned, order, mode); 951 mode, !!file);
827} 952}
828 953
829/* 954/*
830 * clear_active_flags() is a helper for shrink_active_list(), clearing 955 * clear_active_flags() is a helper for shrink_active_list(), clearing
831 * any active bits from the pages in the list. 956 * any active bits from the pages in the list.
832 */ 957 */
833static unsigned long clear_active_flags(struct list_head *page_list) 958static unsigned long clear_active_flags(struct list_head *page_list,
959 unsigned int *count)
834{ 960{
835 int nr_active = 0; 961 int nr_active = 0;
962 int lru;
836 struct page *page; 963 struct page *page;
837 964
838 list_for_each_entry(page, page_list, lru) 965 list_for_each_entry(page, page_list, lru) {
966 lru = page_is_file_cache(page);
839 if (PageActive(page)) { 967 if (PageActive(page)) {
968 lru += LRU_ACTIVE;
840 ClearPageActive(page); 969 ClearPageActive(page);
841 nr_active++; 970 nr_active++;
842 } 971 }
972 count[lru]++;
973 }
843 974
844 return nr_active; 975 return nr_active;
845} 976}
846 977
978/**
979 * isolate_lru_page - tries to isolate a page from its LRU list
980 * @page: page to isolate from its LRU list
981 *
982 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
983 * vmstat statistic corresponding to whatever LRU list the page was on.
984 *
985 * Returns 0 if the page was removed from an LRU list.
986 * Returns -EBUSY if the page was not on an LRU list.
987 *
988 * The returned page will have PageLRU() cleared. If it was found on
989 * the active list, it will have PageActive set. If it was found on
990 * the unevictable list, it will have the PageUnevictable bit set. That flag
991 * may need to be cleared by the caller before letting the page go.
992 *
993 * The vmstat statistic corresponding to the list on which the page was
994 * found will be decremented.
995 *
996 * Restrictions:
997 * (1) Must be called with an elevated refcount on the page. This is a
998 * fundamentnal difference from isolate_lru_pages (which is called
999 * without a stable reference).
1000 * (2) the lru_lock must not be held.
1001 * (3) interrupts must be enabled.
1002 */
1003int isolate_lru_page(struct page *page)
1004{
1005 int ret = -EBUSY;
1006
1007 if (PageLRU(page)) {
1008 struct zone *zone = page_zone(page);
1009
1010 spin_lock_irq(&zone->lru_lock);
1011 if (PageLRU(page) && get_page_unless_zero(page)) {
1012 int lru = page_lru(page);
1013 ret = 0;
1014 ClearPageLRU(page);
1015
1016 del_page_from_lru_list(zone, page, lru);
1017 }
1018 spin_unlock_irq(&zone->lru_lock);
1019 }
1020 return ret;
1021}
1022
847/* 1023/*
848 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1024 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
849 * of reclaimed pages 1025 * of reclaimed pages
850 */ 1026 */
851static unsigned long shrink_inactive_list(unsigned long max_scan, 1027static unsigned long shrink_inactive_list(unsigned long max_scan,
852 struct zone *zone, struct scan_control *sc) 1028 struct zone *zone, struct scan_control *sc,
1029 int priority, int file)
853{ 1030{
854 LIST_HEAD(page_list); 1031 LIST_HEAD(page_list);
855 struct pagevec pvec; 1032 struct pagevec pvec;
@@ -866,20 +1043,43 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
866 unsigned long nr_scan; 1043 unsigned long nr_scan;
867 unsigned long nr_freed; 1044 unsigned long nr_freed;
868 unsigned long nr_active; 1045 unsigned long nr_active;
1046 unsigned int count[NR_LRU_LISTS] = { 0, };
1047 int mode = ISOLATE_INACTIVE;
1048
1049 /*
1050 * If we need a large contiguous chunk of memory, or have
1051 * trouble getting a small set of contiguous pages, we
1052 * will reclaim both active and inactive pages.
1053 *
1054 * We use the same threshold as pageout congestion_wait below.
1055 */
1056 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1057 mode = ISOLATE_BOTH;
1058 else if (sc->order && priority < DEF_PRIORITY - 2)
1059 mode = ISOLATE_BOTH;
869 1060
870 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 1061 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
871 &page_list, &nr_scan, sc->order, 1062 &page_list, &nr_scan, sc->order, mode,
872 (sc->order > PAGE_ALLOC_COSTLY_ORDER)? 1063 zone, sc->mem_cgroup, 0, file);
873 ISOLATE_BOTH : ISOLATE_INACTIVE, 1064 nr_active = clear_active_flags(&page_list, count);
874 zone, sc->mem_cgroup, 0);
875 nr_active = clear_active_flags(&page_list);
876 __count_vm_events(PGDEACTIVATE, nr_active); 1065 __count_vm_events(PGDEACTIVATE, nr_active);
877 1066
878 __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); 1067 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
879 __mod_zone_page_state(zone, NR_INACTIVE, 1068 -count[LRU_ACTIVE_FILE]);
880 -(nr_taken - nr_active)); 1069 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
881 if (scan_global_lru(sc)) 1070 -count[LRU_INACTIVE_FILE]);
1071 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1072 -count[LRU_ACTIVE_ANON]);
1073 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1074 -count[LRU_INACTIVE_ANON]);
1075
1076 if (scan_global_lru(sc)) {
882 zone->pages_scanned += nr_scan; 1077 zone->pages_scanned += nr_scan;
1078 zone->recent_scanned[0] += count[LRU_INACTIVE_ANON];
1079 zone->recent_scanned[0] += count[LRU_ACTIVE_ANON];
1080 zone->recent_scanned[1] += count[LRU_INACTIVE_FILE];
1081 zone->recent_scanned[1] += count[LRU_ACTIVE_FILE];
1082 }
883 spin_unlock_irq(&zone->lru_lock); 1083 spin_unlock_irq(&zone->lru_lock);
884 1084
885 nr_scanned += nr_scan; 1085 nr_scanned += nr_scan;
@@ -899,7 +1099,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
899 * The attempt at page out may have made some 1099 * The attempt at page out may have made some
900 * of the pages active, mark them inactive again. 1100 * of the pages active, mark them inactive again.
901 */ 1101 */
902 nr_active = clear_active_flags(&page_list); 1102 nr_active = clear_active_flags(&page_list, count);
903 count_vm_events(PGDEACTIVATE, nr_active); 1103 count_vm_events(PGDEACTIVATE, nr_active);
904 1104
905 nr_freed += shrink_page_list(&page_list, sc, 1105 nr_freed += shrink_page_list(&page_list, sc,
@@ -924,14 +1124,24 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
924 * Put back any unfreeable pages. 1124 * Put back any unfreeable pages.
925 */ 1125 */
926 while (!list_empty(&page_list)) { 1126 while (!list_empty(&page_list)) {
1127 int lru;
927 page = lru_to_page(&page_list); 1128 page = lru_to_page(&page_list);
928 VM_BUG_ON(PageLRU(page)); 1129 VM_BUG_ON(PageLRU(page));
929 SetPageLRU(page);
930 list_del(&page->lru); 1130 list_del(&page->lru);
931 if (PageActive(page)) 1131 if (unlikely(!page_evictable(page, NULL))) {
932 add_page_to_active_list(zone, page); 1132 spin_unlock_irq(&zone->lru_lock);
933 else 1133 putback_lru_page(page);
934 add_page_to_inactive_list(zone, page); 1134 spin_lock_irq(&zone->lru_lock);
1135 continue;
1136 }
1137 SetPageLRU(page);
1138 lru = page_lru(page);
1139 add_page_to_lru_list(zone, page, lru);
1140 mem_cgroup_move_lists(page, lru);
1141 if (PageActive(page) && scan_global_lru(sc)) {
1142 int file = !!page_is_file_cache(page);
1143 zone->recent_rotated[file]++;
1144 }
935 if (!pagevec_add(&pvec, page)) { 1145 if (!pagevec_add(&pvec, page)) {
936 spin_unlock_irq(&zone->lru_lock); 1146 spin_unlock_irq(&zone->lru_lock);
937 __pagevec_release(&pvec); 1147 __pagevec_release(&pvec);
@@ -962,115 +1172,7 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
962 1172
963static inline int zone_is_near_oom(struct zone *zone) 1173static inline int zone_is_near_oom(struct zone *zone)
964{ 1174{
965 return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE) 1175 return zone->pages_scanned >= (zone_lru_pages(zone) * 3);
966 + zone_page_state(zone, NR_INACTIVE))*3;
967}
968
969/*
970 * Determine we should try to reclaim mapped pages.
971 * This is called only when sc->mem_cgroup is NULL.
972 */
973static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
974 int priority)
975{
976 long mapped_ratio;
977 long distress;
978 long swap_tendency;
979 long imbalance;
980 int reclaim_mapped = 0;
981 int prev_priority;
982
983 if (scan_global_lru(sc) && zone_is_near_oom(zone))
984 return 1;
985 /*
986 * `distress' is a measure of how much trouble we're having
987 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
988 */
989 if (scan_global_lru(sc))
990 prev_priority = zone->prev_priority;
991 else
992 prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
993
994 distress = 100 >> min(prev_priority, priority);
995
996 /*
997 * The point of this algorithm is to decide when to start
998 * reclaiming mapped memory instead of just pagecache. Work out
999 * how much memory
1000 * is mapped.
1001 */
1002 if (scan_global_lru(sc))
1003 mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
1004 global_page_state(NR_ANON_PAGES)) * 100) /
1005 vm_total_pages;
1006 else
1007 mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);
1008
1009 /*
1010 * Now decide how much we really want to unmap some pages. The
1011 * mapped ratio is downgraded - just because there's a lot of
1012 * mapped memory doesn't necessarily mean that page reclaim
1013 * isn't succeeding.
1014 *
1015 * The distress ratio is important - we don't want to start
1016 * going oom.
1017 *
1018 * A 100% value of vm_swappiness overrides this algorithm
1019 * altogether.
1020 */
1021 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
1022
1023 /*
1024 * If there's huge imbalance between active and inactive
1025 * (think active 100 times larger than inactive) we should
1026 * become more permissive, or the system will take too much
1027 * cpu before it start swapping during memory pressure.
1028 * Distress is about avoiding early-oom, this is about
1029 * making swappiness graceful despite setting it to low
1030 * values.
1031 *
1032 * Avoid div by zero with nr_inactive+1, and max resulting
1033 * value is vm_total_pages.
1034 */
1035 if (scan_global_lru(sc)) {
1036 imbalance = zone_page_state(zone, NR_ACTIVE);
1037 imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
1038 } else
1039 imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);
1040
1041 /*
1042 * Reduce the effect of imbalance if swappiness is low,
1043 * this means for a swappiness very low, the imbalance
1044 * must be much higher than 100 for this logic to make
1045 * the difference.
1046 *
1047 * Max temporary value is vm_total_pages*100.
1048 */
1049 imbalance *= (vm_swappiness + 1);
1050 imbalance /= 100;
1051
1052 /*
1053 * If not much of the ram is mapped, makes the imbalance
1054 * less relevant, it's high priority we refill the inactive
1055 * list with mapped pages only in presence of high ratio of
1056 * mapped pages.
1057 *
1058 * Max temporary value is vm_total_pages*100.
1059 */
1060 imbalance *= mapped_ratio;
1061 imbalance /= 100;
1062
1063 /* apply imbalance feedback to swap_tendency */
1064 swap_tendency += imbalance;
1065
1066 /*
1067 * Now use this metric to decide whether to start moving mapped
1068 * memory onto the inactive list.
1069 */
1070 if (swap_tendency >= 100)
1071 reclaim_mapped = 1;
1072
1073 return reclaim_mapped;
1074} 1176}
1075 1177
1076/* 1178/*
@@ -1093,53 +1195,71 @@ static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
1093 1195
1094 1196
1095static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1197static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1096 struct scan_control *sc, int priority) 1198 struct scan_control *sc, int priority, int file)
1097{ 1199{
1098 unsigned long pgmoved; 1200 unsigned long pgmoved;
1099 int pgdeactivate = 0; 1201 int pgdeactivate = 0;
1100 unsigned long pgscanned; 1202 unsigned long pgscanned;
1101 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1203 LIST_HEAD(l_hold); /* The pages which were snipped off */
1102 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ 1204 LIST_HEAD(l_inactive);
1103 LIST_HEAD(l_active); /* Pages to go onto the active_list */
1104 struct page *page; 1205 struct page *page;
1105 struct pagevec pvec; 1206 struct pagevec pvec;
1106 int reclaim_mapped = 0; 1207 enum lru_list lru;
1107
1108 if (sc->may_swap)
1109 reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
1110 1208
1111 lru_add_drain(); 1209 lru_add_drain();
1112 spin_lock_irq(&zone->lru_lock); 1210 spin_lock_irq(&zone->lru_lock);
1113 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, 1211 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1114 ISOLATE_ACTIVE, zone, 1212 ISOLATE_ACTIVE, zone,
1115 sc->mem_cgroup, 1); 1213 sc->mem_cgroup, 1, file);
1116 /* 1214 /*
1117 * zone->pages_scanned is used for detect zone's oom 1215 * zone->pages_scanned is used for detect zone's oom
1118 * mem_cgroup remembers nr_scan by itself. 1216 * mem_cgroup remembers nr_scan by itself.
1119 */ 1217 */
1120 if (scan_global_lru(sc)) 1218 if (scan_global_lru(sc)) {
1121 zone->pages_scanned += pgscanned; 1219 zone->pages_scanned += pgscanned;
1220 zone->recent_scanned[!!file] += pgmoved;
1221 }
1122 1222
1123 __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); 1223 if (file)
1224 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
1225 else
1226 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
1124 spin_unlock_irq(&zone->lru_lock); 1227 spin_unlock_irq(&zone->lru_lock);
1125 1228
1229 pgmoved = 0;
1126 while (!list_empty(&l_hold)) { 1230 while (!list_empty(&l_hold)) {
1127 cond_resched(); 1231 cond_resched();
1128 page = lru_to_page(&l_hold); 1232 page = lru_to_page(&l_hold);
1129 list_del(&page->lru); 1233 list_del(&page->lru);
1130 if (page_mapped(page)) { 1234
1131 if (!reclaim_mapped || 1235 if (unlikely(!page_evictable(page, NULL))) {
1132 (total_swap_pages == 0 && PageAnon(page)) || 1236 putback_lru_page(page);
1133 page_referenced(page, 0, sc->mem_cgroup)) { 1237 continue;
1134 list_add(&page->lru, &l_active);
1135 continue;
1136 }
1137 } 1238 }
1239
1240 /* page_referenced clears PageReferenced */
1241 if (page_mapping_inuse(page) &&
1242 page_referenced(page, 0, sc->mem_cgroup))
1243 pgmoved++;
1244
1138 list_add(&page->lru, &l_inactive); 1245 list_add(&page->lru, &l_inactive);
1139 } 1246 }
1140 1247
1248 /*
1249 * Count referenced pages from currently used mappings as
1250 * rotated, even though they are moved to the inactive list.
1251 * This helps balance scan pressure between file and anonymous
1252 * pages in get_scan_ratio.
1253 */
1254 zone->recent_rotated[!!file] += pgmoved;
1255
1256 /*
1257 * Move the pages to the [file or anon] inactive list.
1258 */
1141 pagevec_init(&pvec, 1); 1259 pagevec_init(&pvec, 1);
1260
1142 pgmoved = 0; 1261 pgmoved = 0;
1262 lru = LRU_BASE + file * LRU_FILE;
1143 spin_lock_irq(&zone->lru_lock); 1263 spin_lock_irq(&zone->lru_lock);
1144 while (!list_empty(&l_inactive)) { 1264 while (!list_empty(&l_inactive)) {
1145 page = lru_to_page(&l_inactive); 1265 page = lru_to_page(&l_inactive);
@@ -1149,11 +1269,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1149 VM_BUG_ON(!PageActive(page)); 1269 VM_BUG_ON(!PageActive(page));
1150 ClearPageActive(page); 1270 ClearPageActive(page);
1151 1271
1152 list_move(&page->lru, &zone->inactive_list); 1272 list_move(&page->lru, &zone->lru[lru].list);
1153 mem_cgroup_move_lists(page, false); 1273 mem_cgroup_move_lists(page, lru);
1154 pgmoved++; 1274 pgmoved++;
1155 if (!pagevec_add(&pvec, page)) { 1275 if (!pagevec_add(&pvec, page)) {
1156 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1276 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1157 spin_unlock_irq(&zone->lru_lock); 1277 spin_unlock_irq(&zone->lru_lock);
1158 pgdeactivate += pgmoved; 1278 pgdeactivate += pgmoved;
1159 pgmoved = 0; 1279 pgmoved = 0;
@@ -1163,104 +1283,189 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1163 spin_lock_irq(&zone->lru_lock); 1283 spin_lock_irq(&zone->lru_lock);
1164 } 1284 }
1165 } 1285 }
1166 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1286 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1167 pgdeactivate += pgmoved; 1287 pgdeactivate += pgmoved;
1168 if (buffer_heads_over_limit) { 1288 if (buffer_heads_over_limit) {
1169 spin_unlock_irq(&zone->lru_lock); 1289 spin_unlock_irq(&zone->lru_lock);
1170 pagevec_strip(&pvec); 1290 pagevec_strip(&pvec);
1171 spin_lock_irq(&zone->lru_lock); 1291 spin_lock_irq(&zone->lru_lock);
1172 } 1292 }
1173
1174 pgmoved = 0;
1175 while (!list_empty(&l_active)) {
1176 page = lru_to_page(&l_active);
1177 prefetchw_prev_lru_page(page, &l_active, flags);
1178 VM_BUG_ON(PageLRU(page));
1179 SetPageLRU(page);
1180 VM_BUG_ON(!PageActive(page));
1181
1182 list_move(&page->lru, &zone->active_list);
1183 mem_cgroup_move_lists(page, true);
1184 pgmoved++;
1185 if (!pagevec_add(&pvec, page)) {
1186 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
1187 pgmoved = 0;
1188 spin_unlock_irq(&zone->lru_lock);
1189 __pagevec_release(&pvec);
1190 spin_lock_irq(&zone->lru_lock);
1191 }
1192 }
1193 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
1194
1195 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1293 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1196 __count_vm_events(PGDEACTIVATE, pgdeactivate); 1294 __count_vm_events(PGDEACTIVATE, pgdeactivate);
1197 spin_unlock_irq(&zone->lru_lock); 1295 spin_unlock_irq(&zone->lru_lock);
1296 if (vm_swap_full())
1297 pagevec_swap_free(&pvec);
1198 1298
1199 pagevec_release(&pvec); 1299 pagevec_release(&pvec);
1200} 1300}
1201 1301
1302static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1303 struct zone *zone, struct scan_control *sc, int priority)
1304{
1305 int file = is_file_lru(lru);
1306
1307 if (lru == LRU_ACTIVE_FILE) {
1308 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1309 return 0;
1310 }
1311
1312 if (lru == LRU_ACTIVE_ANON &&
1313 (!scan_global_lru(sc) || inactive_anon_is_low(zone))) {
1314 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1315 return 0;
1316 }
1317 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1318}
1319
1320/*
1321 * Determine how aggressively the anon and file LRU lists should be
1322 * scanned. The relative value of each set of LRU lists is determined
1323 * by looking at the fraction of the pages scanned we did rotate back
1324 * onto the active list instead of evict.
1325 *
1326 * percent[0] specifies how much pressure to put on ram/swap backed
1327 * memory, while percent[1] determines pressure on the file LRUs.
1328 */
1329static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1330 unsigned long *percent)
1331{
1332 unsigned long anon, file, free;
1333 unsigned long anon_prio, file_prio;
1334 unsigned long ap, fp;
1335
1336 anon = zone_page_state(zone, NR_ACTIVE_ANON) +
1337 zone_page_state(zone, NR_INACTIVE_ANON);
1338 file = zone_page_state(zone, NR_ACTIVE_FILE) +
1339 zone_page_state(zone, NR_INACTIVE_FILE);
1340 free = zone_page_state(zone, NR_FREE_PAGES);
1341
1342 /* If we have no swap space, do not bother scanning anon pages. */
1343 if (nr_swap_pages <= 0) {
1344 percent[0] = 0;
1345 percent[1] = 100;
1346 return;
1347 }
1348
1349 /* If we have very few page cache pages, force-scan anon pages. */
1350 if (unlikely(file + free <= zone->pages_high)) {
1351 percent[0] = 100;
1352 percent[1] = 0;
1353 return;
1354 }
1355
1356 /*
1357 * OK, so we have swap space and a fair amount of page cache
1358 * pages. We use the recently rotated / recently scanned
1359 * ratios to determine how valuable each cache is.
1360 *
1361 * Because workloads change over time (and to avoid overflow)
1362 * we keep these statistics as a floating average, which ends
1363 * up weighing recent references more than old ones.
1364 *
1365 * anon in [0], file in [1]
1366 */
1367 if (unlikely(zone->recent_scanned[0] > anon / 4)) {
1368 spin_lock_irq(&zone->lru_lock);
1369 zone->recent_scanned[0] /= 2;
1370 zone->recent_rotated[0] /= 2;
1371 spin_unlock_irq(&zone->lru_lock);
1372 }
1373
1374 if (unlikely(zone->recent_scanned[1] > file / 4)) {
1375 spin_lock_irq(&zone->lru_lock);
1376 zone->recent_scanned[1] /= 2;
1377 zone->recent_rotated[1] /= 2;
1378 spin_unlock_irq(&zone->lru_lock);
1379 }
1380
1381 /*
1382 * With swappiness at 100, anonymous and file have the same priority.
1383 * This scanning priority is essentially the inverse of IO cost.
1384 */
1385 anon_prio = sc->swappiness;
1386 file_prio = 200 - sc->swappiness;
1387
1388 /*
1389 * anon recent_rotated[0]
1390 * %anon = 100 * ----------- / ----------------- * IO cost
1391 * anon + file rotate_sum
1392 */
1393 ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1);
1394 ap /= zone->recent_rotated[0] + 1;
1395
1396 fp = (file_prio + 1) * (zone->recent_scanned[1] + 1);
1397 fp /= zone->recent_rotated[1] + 1;
1398
1399 /* Normalize to percentages */
1400 percent[0] = 100 * ap / (ap + fp + 1);
1401 percent[1] = 100 - percent[0];
1402}
1403
1404
1202/* 1405/*
1203 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1406 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1204 */ 1407 */
1205static unsigned long shrink_zone(int priority, struct zone *zone, 1408static unsigned long shrink_zone(int priority, struct zone *zone,
1206 struct scan_control *sc) 1409 struct scan_control *sc)
1207{ 1410{
1208 unsigned long nr_active; 1411 unsigned long nr[NR_LRU_LISTS];
1209 unsigned long nr_inactive;
1210 unsigned long nr_to_scan; 1412 unsigned long nr_to_scan;
1211 unsigned long nr_reclaimed = 0; 1413 unsigned long nr_reclaimed = 0;
1414 unsigned long percent[2]; /* anon @ 0; file @ 1 */
1415 enum lru_list l;
1212 1416
1213 if (scan_global_lru(sc)) { 1417 get_scan_ratio(zone, sc, percent);
1214 /*
1215 * Add one to nr_to_scan just to make sure that the kernel
1216 * will slowly sift through the active list.
1217 */
1218 zone->nr_scan_active +=
1219 (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
1220 nr_active = zone->nr_scan_active;
1221 zone->nr_scan_inactive +=
1222 (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
1223 nr_inactive = zone->nr_scan_inactive;
1224 if (nr_inactive >= sc->swap_cluster_max)
1225 zone->nr_scan_inactive = 0;
1226 else
1227 nr_inactive = 0;
1228
1229 if (nr_active >= sc->swap_cluster_max)
1230 zone->nr_scan_active = 0;
1231 else
1232 nr_active = 0;
1233 } else {
1234 /*
1235 * This reclaim occurs not because zone memory shortage but
1236 * because memory controller hits its limit.
1237 * Then, don't modify zone reclaim related data.
1238 */
1239 nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup,
1240 zone, priority);
1241
1242 nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup,
1243 zone, priority);
1244 }
1245 1418
1419 for_each_evictable_lru(l) {
1420 if (scan_global_lru(sc)) {
1421 int file = is_file_lru(l);
1422 int scan;
1246 1423
1247 while (nr_active || nr_inactive) { 1424 scan = zone_page_state(zone, NR_LRU_BASE + l);
1248 if (nr_active) { 1425 if (priority) {
1249 nr_to_scan = min(nr_active, 1426 scan >>= priority;
1250 (unsigned long)sc->swap_cluster_max); 1427 scan = (scan * percent[file]) / 100;
1251 nr_active -= nr_to_scan; 1428 }
1252 shrink_active_list(nr_to_scan, zone, sc, priority); 1429 zone->lru[l].nr_scan += scan;
1430 nr[l] = zone->lru[l].nr_scan;
1431 if (nr[l] >= sc->swap_cluster_max)
1432 zone->lru[l].nr_scan = 0;
1433 else
1434 nr[l] = 0;
1435 } else {
1436 /*
1437 * This reclaim occurs not because zone memory shortage
1438 * but because memory controller hits its limit.
1439 * Don't modify zone reclaim related data.
1440 */
1441 nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
1442 priority, l);
1253 } 1443 }
1444 }
1254 1445
1255 if (nr_inactive) { 1446 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1256 nr_to_scan = min(nr_inactive, 1447 nr[LRU_INACTIVE_FILE]) {
1448 for_each_evictable_lru(l) {
1449 if (nr[l]) {
1450 nr_to_scan = min(nr[l],
1257 (unsigned long)sc->swap_cluster_max); 1451 (unsigned long)sc->swap_cluster_max);
1258 nr_inactive -= nr_to_scan; 1452 nr[l] -= nr_to_scan;
1259 nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, 1453
1260 sc); 1454 nr_reclaimed += shrink_list(l, nr_to_scan,
1455 zone, sc, priority);
1456 }
1261 } 1457 }
1262 } 1458 }
1263 1459
1460 /*
1461 * Even if we did not try to evict anon pages at all, we want to
1462 * rebalance the anon lru active/inactive ratio.
1463 */
1464 if (!scan_global_lru(sc) || inactive_anon_is_low(zone))
1465 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1466 else if (!scan_global_lru(sc))
1467 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1468
1264 throttle_vm_writeout(sc->gfp_mask); 1469 throttle_vm_writeout(sc->gfp_mask);
1265 return nr_reclaimed; 1470 return nr_reclaimed;
1266} 1471}
@@ -1321,7 +1526,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1321 1526
1322 return nr_reclaimed; 1527 return nr_reclaimed;
1323} 1528}
1324 1529
1325/* 1530/*
1326 * This is the main entry point to direct page reclaim. 1531 * This is the main entry point to direct page reclaim.
1327 * 1532 *
@@ -1364,8 +1569,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1364 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1569 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1365 continue; 1570 continue;
1366 1571
1367 lru_pages += zone_page_state(zone, NR_ACTIVE) 1572 lru_pages += zone_lru_pages(zone);
1368 + zone_page_state(zone, NR_INACTIVE);
1369 } 1573 }
1370 } 1574 }
1371 1575
@@ -1555,6 +1759,14 @@ loop_again:
1555 priority != DEF_PRIORITY) 1759 priority != DEF_PRIORITY)
1556 continue; 1760 continue;
1557 1761
1762 /*
1763 * Do some background aging of the anon list, to give
1764 * pages a chance to be referenced before reclaiming.
1765 */
1766 if (inactive_anon_is_low(zone))
1767 shrink_active_list(SWAP_CLUSTER_MAX, zone,
1768 &sc, priority, 0);
1769
1558 if (!zone_watermark_ok(zone, order, zone->pages_high, 1770 if (!zone_watermark_ok(zone, order, zone->pages_high,
1559 0, 0)) { 1771 0, 0)) {
1560 end_zone = i; 1772 end_zone = i;
@@ -1567,8 +1779,7 @@ loop_again:
1567 for (i = 0; i <= end_zone; i++) { 1779 for (i = 0; i <= end_zone; i++) {
1568 struct zone *zone = pgdat->node_zones + i; 1780 struct zone *zone = pgdat->node_zones + i;
1569 1781
1570 lru_pages += zone_page_state(zone, NR_ACTIVE) 1782 lru_pages += zone_lru_pages(zone);
1571 + zone_page_state(zone, NR_INACTIVE);
1572 } 1783 }
1573 1784
1574 /* 1785 /*
@@ -1612,8 +1823,7 @@ loop_again:
1612 if (zone_is_all_unreclaimable(zone)) 1823 if (zone_is_all_unreclaimable(zone))
1613 continue; 1824 continue;
1614 if (nr_slab == 0 && zone->pages_scanned >= 1825 if (nr_slab == 0 && zone->pages_scanned >=
1615 (zone_page_state(zone, NR_ACTIVE) 1826 (zone_lru_pages(zone) * 6))
1616 + zone_page_state(zone, NR_INACTIVE)) * 6)
1617 zone_set_flag(zone, 1827 zone_set_flag(zone,
1618 ZONE_ALL_UNRECLAIMABLE); 1828 ZONE_ALL_UNRECLAIMABLE);
1619 /* 1829 /*
@@ -1667,7 +1877,7 @@ out:
1667 1877
1668/* 1878/*
1669 * The background pageout daemon, started as a kernel thread 1879 * The background pageout daemon, started as a kernel thread
1670 * from the init process. 1880 * from the init process.
1671 * 1881 *
1672 * This basically trickles out pages so that we have _some_ 1882 * This basically trickles out pages so that we have _some_
1673 * free memory available even if there is no other activity 1883 * free memory available even if there is no other activity
@@ -1761,6 +1971,14 @@ void wakeup_kswapd(struct zone *zone, int order)
1761 wake_up_interruptible(&pgdat->kswapd_wait); 1971 wake_up_interruptible(&pgdat->kswapd_wait);
1762} 1972}
1763 1973
1974unsigned long global_lru_pages(void)
1975{
1976 return global_page_state(NR_ACTIVE_ANON)
1977 + global_page_state(NR_ACTIVE_FILE)
1978 + global_page_state(NR_INACTIVE_ANON)
1979 + global_page_state(NR_INACTIVE_FILE);
1980}
1981
1764#ifdef CONFIG_PM 1982#ifdef CONFIG_PM
1765/* 1983/*
1766 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages 1984 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
@@ -1774,6 +1992,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1774{ 1992{
1775 struct zone *zone; 1993 struct zone *zone;
1776 unsigned long nr_to_scan, ret = 0; 1994 unsigned long nr_to_scan, ret = 0;
1995 enum lru_list l;
1777 1996
1778 for_each_zone(zone) { 1997 for_each_zone(zone) {
1779 1998
@@ -1783,38 +2002,31 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1783 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) 2002 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
1784 continue; 2003 continue;
1785 2004
1786 /* For pass = 0 we don't shrink the active list */ 2005 for_each_evictable_lru(l) {
1787 if (pass > 0) { 2006 /* For pass = 0, we don't shrink the active list */
1788 zone->nr_scan_active += 2007 if (pass == 0 &&
1789 (zone_page_state(zone, NR_ACTIVE) >> prio) + 1; 2008 (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE))
1790 if (zone->nr_scan_active >= nr_pages || pass > 3) { 2009 continue;
1791 zone->nr_scan_active = 0; 2010
2011 zone->lru[l].nr_scan +=
2012 (zone_page_state(zone, NR_LRU_BASE + l)
2013 >> prio) + 1;
2014 if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
2015 zone->lru[l].nr_scan = 0;
1792 nr_to_scan = min(nr_pages, 2016 nr_to_scan = min(nr_pages,
1793 zone_page_state(zone, NR_ACTIVE)); 2017 zone_page_state(zone,
1794 shrink_active_list(nr_to_scan, zone, sc, prio); 2018 NR_LRU_BASE + l));
2019 ret += shrink_list(l, nr_to_scan, zone,
2020 sc, prio);
2021 if (ret >= nr_pages)
2022 return ret;
1795 } 2023 }
1796 } 2024 }
1797
1798 zone->nr_scan_inactive +=
1799 (zone_page_state(zone, NR_INACTIVE) >> prio) + 1;
1800 if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
1801 zone->nr_scan_inactive = 0;
1802 nr_to_scan = min(nr_pages,
1803 zone_page_state(zone, NR_INACTIVE));
1804 ret += shrink_inactive_list(nr_to_scan, zone, sc);
1805 if (ret >= nr_pages)
1806 return ret;
1807 }
1808 } 2025 }
1809 2026
1810 return ret; 2027 return ret;
1811} 2028}
1812 2029
1813static unsigned long count_lru_pages(void)
1814{
1815 return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE);
1816}
1817
1818/* 2030/*
1819 * Try to free `nr_pages' of memory, system-wide, and return the number of 2031 * Try to free `nr_pages' of memory, system-wide, and return the number of
1820 * freed pages. 2032 * freed pages.
@@ -1840,7 +2052,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1840 2052
1841 current->reclaim_state = &reclaim_state; 2053 current->reclaim_state = &reclaim_state;
1842 2054
1843 lru_pages = count_lru_pages(); 2055 lru_pages = global_lru_pages();
1844 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 2056 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
1845 /* If slab caches are huge, it's better to hit them first */ 2057 /* If slab caches are huge, it's better to hit them first */
1846 while (nr_slab >= lru_pages) { 2058 while (nr_slab >= lru_pages) {
@@ -1883,7 +2095,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1883 2095
1884 reclaim_state.reclaimed_slab = 0; 2096 reclaim_state.reclaimed_slab = 0;
1885 shrink_slab(sc.nr_scanned, sc.gfp_mask, 2097 shrink_slab(sc.nr_scanned, sc.gfp_mask,
1886 count_lru_pages()); 2098 global_lru_pages());
1887 ret += reclaim_state.reclaimed_slab; 2099 ret += reclaim_state.reclaimed_slab;
1888 if (ret >= nr_pages) 2100 if (ret >= nr_pages)
1889 goto out; 2101 goto out;
@@ -1900,7 +2112,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1900 if (!ret) { 2112 if (!ret) {
1901 do { 2113 do {
1902 reclaim_state.reclaimed_slab = 0; 2114 reclaim_state.reclaimed_slab = 0;
1903 shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages()); 2115 shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
1904 ret += reclaim_state.reclaimed_slab; 2116 ret += reclaim_state.reclaimed_slab;
1905 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); 2117 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
1906 } 2118 }
@@ -2128,3 +2340,285 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2128 return ret; 2340 return ret;
2129} 2341}
2130#endif 2342#endif
2343
2344#ifdef CONFIG_UNEVICTABLE_LRU
/*
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 * @vma: the VMA in which the page is or will be mapped, may be NULL
 *
 * Decide whether @page belongs on the active/inactive lists (evictable)
 * or on the unevictable list.  @vma is non-NULL when called from the
 * fault path to determine how to instantiate a new page.
 *
 * A page is not evictable when
 * (1) its mapping is marked unevictable, or
 * (2) it is mlocked, or is part of an mlocked VMA.
 *
 * Returns 1 if the page is evictable, 0 otherwise.
 */
int page_evictable(struct page *page, struct vm_area_struct *vma)
{
	/* The checks are evaluated left to right and stop at the first hit. */
	return !mapping_unevictable(page_mapping(page)) &&
	       !PageMlocked(page) &&
	       !(vma && is_mlocked_vma(vma, page));
}
2370
2371static void show_page_path(struct page *page)
2372{
2373 char buf[256];
2374 if (page_is_file_cache(page)) {
2375 struct address_space *mapping = page->mapping;
2376 struct dentry *dentry;
2377 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
2378
2379 spin_lock(&mapping->i_mmap_lock);
2380 dentry = d_find_alias(mapping->host);
2381 printk(KERN_INFO "rescued: %s %lu\n",
2382 dentry_path(dentry, buf, 256), pgoff);
2383 spin_unlock(&mapping->i_mmap_lock);
2384 } else {
2385#if defined(CONFIG_MM_OWNER) && defined(CONFIG_MMU)
2386 struct anon_vma *anon_vma;
2387 struct vm_area_struct *vma;
2388
2389 anon_vma = page_lock_anon_vma(page);
2390 if (!anon_vma)
2391 return;
2392
2393 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
2394 printk(KERN_INFO "rescued: anon %s\n",
2395 vma->vm_mm->owner->comm);
2396 break;
2397 }
2398 page_unlock_anon_vma(anon_vma);
2399#endif
2400 }
2401}
2402
2403
2404/**
2405 * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
2406 * @page: page to check evictability and move to appropriate lru list
2407 * @zone: zone page is in
2408 *
2409 * Checks a page for evictability and moves the page to the appropriate
2410 * zone lru list.
2411 *
2412 * Restrictions: zone->lru_lock must be held, page must be on LRU and must
2413 * have PageUnevictable set.
2414 */
2415static void check_move_unevictable_page(struct page *page, struct zone *zone)
2416{
2417 VM_BUG_ON(PageActive(page));
2418
2419retry:
2420 ClearPageUnevictable(page);
2421 if (page_evictable(page, NULL)) {
2422 enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page);
2423
2424 show_page_path(page);
2425
2426 __dec_zone_state(zone, NR_UNEVICTABLE);
2427 list_move(&page->lru, &zone->lru[l].list);
2428 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
2429 __count_vm_event(UNEVICTABLE_PGRESCUED);
2430 } else {
2431 /*
2432 * rotate unevictable list
2433 */
2434 SetPageUnevictable(page);
2435 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
2436 if (page_evictable(page, NULL))
2437 goto retry;
2438 }
2439}
2440
2441/**
2442 * scan_mapping_unevictable_pages - scan an address space for evictable pages
2443 * @mapping: struct address_space to scan for evictable pages
2444 *
2445 * Scan all pages in mapping. Check unevictable pages for
2446 * evictability and move them to the appropriate zone lru list.
2447 */
2448void scan_mapping_unevictable_pages(struct address_space *mapping)
2449{
2450 pgoff_t next = 0;
2451 pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
2452 PAGE_CACHE_SHIFT;
2453 struct zone *zone;
2454 struct pagevec pvec;
2455
2456 if (mapping->nrpages == 0)
2457 return;
2458
2459 pagevec_init(&pvec, 0);
2460 while (next < end &&
2461 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
2462 int i;
2463 int pg_scanned = 0;
2464
2465 zone = NULL;
2466
2467 for (i = 0; i < pagevec_count(&pvec); i++) {
2468 struct page *page = pvec.pages[i];
2469 pgoff_t page_index = page->index;
2470 struct zone *pagezone = page_zone(page);
2471
2472 pg_scanned++;
2473 if (page_index > next)
2474 next = page_index;
2475 next++;
2476
2477 if (pagezone != zone) {
2478 if (zone)
2479 spin_unlock_irq(&zone->lru_lock);
2480 zone = pagezone;
2481 spin_lock_irq(&zone->lru_lock);
2482 }
2483
2484 if (PageLRU(page) && PageUnevictable(page))
2485 check_move_unevictable_page(page, zone);
2486 }
2487 if (zone)
2488 spin_unlock_irq(&zone->lru_lock);
2489 pagevec_release(&pvec);
2490
2491 count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
2492 }
2493
2494}
2495
2496/**
2497 * scan_zone_unevictable_pages - check unevictable list for evictable pages
2498 * @zone - zone of which to scan the unevictable list
2499 *
2500 * Scan @zone's unevictable LRU lists to check for pages that have become
2501 * evictable. Move those that have to @zone's inactive list where they
2502 * become candidates for reclaim, unless shrink_inactive_zone() decides
2503 * to reactivate them. Pages that are still unevictable are rotated
2504 * back onto @zone's unevictable list.
2505 */
2506#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
2507void scan_zone_unevictable_pages(struct zone *zone)
2508{
2509 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
2510 unsigned long scan;
2511 unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
2512
2513 while (nr_to_scan > 0) {
2514 unsigned long batch_size = min(nr_to_scan,
2515 SCAN_UNEVICTABLE_BATCH_SIZE);
2516
2517 spin_lock_irq(&zone->lru_lock);
2518 for (scan = 0; scan < batch_size; scan++) {
2519 struct page *page = lru_to_page(l_unevictable);
2520
2521 if (!trylock_page(page))
2522 continue;
2523
2524 prefetchw_prev_lru_page(page, l_unevictable, flags);
2525
2526 if (likely(PageLRU(page) && PageUnevictable(page)))
2527 check_move_unevictable_page(page, zone);
2528
2529 unlock_page(page);
2530 }
2531 spin_unlock_irq(&zone->lru_lock);
2532
2533 nr_to_scan -= batch_size;
2534 }
2535}
2536
2537
2538/**
2539 * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
2540 *
2541 * A really big hammer: scan all zones' unevictable LRU lists to check for
2542 * pages that have become evictable. Move those back to the zones'
2543 * inactive list where they become candidates for reclaim.
2544 * This occurs when, e.g., we have unswappable pages on the unevictable lists,
2545 * and we add swap to the system. As such, it runs in the context of a task
2546 * that has possibly/probably made some previously unevictable pages
2547 * evictable.
2548 */
2549void scan_all_zones_unevictable_pages(void)
2550{
2551 struct zone *zone;
2552
2553 for_each_zone(zone) {
2554 scan_zone_unevictable_pages(zone);
2555 }
2556}
2557
2558/*
2559 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
2560 * all nodes' unevictable lists for evictable pages
2561 */
2562unsigned long scan_unevictable_pages;
2563
2564int scan_unevictable_handler(struct ctl_table *table, int write,
2565 struct file *file, void __user *buffer,
2566 size_t *length, loff_t *ppos)
2567{
2568 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
2569
2570 if (write && *(unsigned long *)table->data)
2571 scan_all_zones_unevictable_pages();
2572
2573 scan_unevictable_pages = 0;
2574 return 0;
2575}
2576
2577/*
2578 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
2579 * a specified node's per zone unevictable lists for evictable pages.
2580 */
2581
2582static ssize_t read_scan_unevictable_node(struct sys_device *dev,
2583 struct sysdev_attribute *attr,
2584 char *buf)
2585{
2586 return sprintf(buf, "0\n"); /* always zero; should fit... */
2587}
2588
2589static ssize_t write_scan_unevictable_node(struct sys_device *dev,
2590 struct sysdev_attribute *attr,
2591 const char *buf, size_t count)
2592{
2593 struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
2594 struct zone *zone;
2595 unsigned long res;
2596 unsigned long req = strict_strtoul(buf, 10, &res);
2597
2598 if (!req)
2599 return 1; /* zero is no-op */
2600
2601 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2602 if (!populated_zone(zone))
2603 continue;
2604 scan_zone_unevictable_pages(zone);
2605 }
2606 return 1;
2607}
2608
2609
2610static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
2611 read_scan_unevictable_node,
2612 write_scan_unevictable_node);
2613
2614int scan_unevictable_register_node(struct node *node)
2615{
2616 return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
2617}
2618
2619void scan_unevictable_unregister_node(struct node *node)
2620{
2621 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
2622}
2623
2624#endif