path: root/mm/vmscan.c
author		Rik van Riel <riel@redhat.com>	2008-10-18 23:26:32 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-10-20 11:50:25 -0400
commit		4f98a2fee8acdb4ac84545df98cccecfd130f8db (patch)
tree		035a2937f4c3e2f7b4269412041c073ac646937c /mm/vmscan.c
parent		b2e185384f534781fd22f5ce170b2ad26f97df70 (diff)
vmscan: split LRU lists into anon & file sets
Split the LRU lists in two, one set for pages that are backed by real file
systems ("file") and one for pages that are backed by memory and swap
("anon").  The latter includes tmpfs.

The advantage of doing this is that the VM will not have to scan over lots
of anonymous pages (which we generally do not want to swap out), just to
find the page cache pages that it should evict.

This patch has the infrastructure and a basic policy to balance how much
we scan the anon lists and how much we scan the file lists.  The big
policy changes are in separate patches.

[lee.schermerhorn@hp.com: collect lru meminfo statistics from correct offset]
[kosaki.motohiro@jp.fujitsu.com: prevent incorrect oom under split_lru]
[kosaki.motohiro@jp.fujitsu.com: fix pagevec_move_tail() doesn't treat unevictable page]
[hugh@veritas.com: memcg swapbacked pages active]
[hugh@veritas.com: splitlru: BDI_CAP_SWAP_BACKED]
[akpm@linux-foundation.org: fix /proc/vmstat units]
[nishimura@mxp.nes.nec.co.jp: memcg: fix handling of shmem migration]
[kosaki.motohiro@jp.fujitsu.com: adjust Quicklists field of /proc/meminfo]
[kosaki.motohiro@jp.fujitsu.com: fix style issue of get_scan_ratio()]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
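Reading aid (not part of the original commit): the code below indexes each zone's LRU lists arithmetically rather than keeping separate active/inactive list heads. The following minimal standalone sketch (ordinary userspace C, not kernel code) shows that indexing; the constant values mirror what this series adds to include/linux/mmzone.h, which is outside the mm/vmscan.c diff shown here.

#include <stdio.h>

#define LRU_BASE	0
#define LRU_ACTIVE	1	/* bit 0: inactive vs. active */
#define LRU_FILE	2	/* bit 1: anon vs. file backed */

enum lru_list {
	LRU_INACTIVE_ANON = LRU_BASE,
	LRU_ACTIVE_ANON   = LRU_BASE + LRU_ACTIVE,
	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
	LRU_ACTIVE_FILE   = LRU_BASE + LRU_FILE + LRU_ACTIVE,
	NR_LRU_LISTS
};

/* Same arithmetic as isolate_pages_global()/isolate_lru_page() in the diff below. */
static enum lru_list lru_index(int active, int file)
{
	return LRU_BASE + (file ? LRU_FILE : 0) + (active ? LRU_ACTIVE : 0);
}

int main(void)
{
	static const char *name[NR_LRU_LISTS] = {
		"inactive_anon", "active_anon", "inactive_file", "active_file"
	};
	int active, file;

	for (file = 0; file <= 1; file++)
		for (active = 0; active <= 1; active++)
			printf("file=%d active=%d -> lru=%d (%s)\n",
			       file, active, lru_index(active, file),
			       name[lru_index(active, file)]);
	return 0;
}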
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--	mm/vmscan.c	416
1 files changed, 216 insertions, 200 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e656035d3406..d10d2f9a33f3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -78,7 +78,7 @@ struct scan_control {
 	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
 			unsigned long *scanned, int order, int mode,
 			struct zone *z, struct mem_cgroup *mem_cont,
-			int active);
+			int active, int file);
 };
 
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -680,7 +680,7 @@ keep:
  *
  * returns 0 on success, -ve errno on failure.
  */
-int __isolate_lru_page(struct page *page, int mode)
+int __isolate_lru_page(struct page *page, int mode, int file)
 {
 	int ret = -EINVAL;
 
@@ -696,6 +696,9 @@ int __isolate_lru_page(struct page *page, int mode)
 	if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
 		return ret;
 
+	if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
+		return ret;
+
 	ret = -EBUSY;
 	if (likely(get_page_unless_zero(page))) {
 		/*
@@ -726,12 +729,13 @@ int __isolate_lru_page(struct page *page, int mode)
  * @scanned:	The number of pages that were scanned.
  * @order:	The caller's attempted allocation order
  * @mode:	One of the LRU isolation modes
+ * @file:	True [1] if isolating file [!anon] pages
  *
  * returns how many pages were moved onto *@dst.
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		struct list_head *src, struct list_head *dst,
-		unsigned long *scanned, int order, int mode)
+		unsigned long *scanned, int order, int mode, int file)
 {
 	unsigned long nr_taken = 0;
 	unsigned long scan;
@@ -748,7 +752,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 
 		VM_BUG_ON(!PageLRU(page));
 
-		switch (__isolate_lru_page(page, mode)) {
+		switch (__isolate_lru_page(page, mode, file)) {
 		case 0:
 			list_move(&page->lru, dst);
 			nr_taken++;
@@ -791,10 +795,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 				break;
 
 			cursor_page = pfn_to_page(pfn);
+
 			/* Check that we have not crossed a zone boundary. */
 			if (unlikely(page_zone_id(cursor_page) != zone_id))
 				continue;
-			switch (__isolate_lru_page(cursor_page, mode)) {
+			switch (__isolate_lru_page(cursor_page, mode, file)) {
 			case 0:
 				list_move(&cursor_page->lru, dst);
 				nr_taken++;
@@ -819,30 +824,37 @@ static unsigned long isolate_pages_global(unsigned long nr,
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
 					struct mem_cgroup *mem_cont,
-					int active)
+					int active, int file)
 {
+	int lru = LRU_BASE;
 	if (active)
-		return isolate_lru_pages(nr, &z->lru[LRU_ACTIVE].list, dst,
-						scanned, order, mode);
-	else
-		return isolate_lru_pages(nr, &z->lru[LRU_INACTIVE].list, dst,
-						scanned, order, mode);
+		lru += LRU_ACTIVE;
+	if (file)
+		lru += LRU_FILE;
+	return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
+								mode, !!file);
 }
 
 /*
  * clear_active_flags() is a helper for shrink_active_list(), clearing
  * any active bits from the pages in the list.
  */
-static unsigned long clear_active_flags(struct list_head *page_list)
+static unsigned long clear_active_flags(struct list_head *page_list,
+					unsigned int *count)
 {
 	int nr_active = 0;
+	int lru;
 	struct page *page;
 
-	list_for_each_entry(page, page_list, lru)
+	list_for_each_entry(page, page_list, lru) {
+		lru = page_is_file_cache(page);
 		if (PageActive(page)) {
+			lru += LRU_ACTIVE;
 			ClearPageActive(page);
 			nr_active++;
 		}
+		count[lru]++;
+	}
 
 	return nr_active;
 }
@@ -880,12 +892,12 @@ int isolate_lru_page(struct page *page)
 
 		spin_lock_irq(&zone->lru_lock);
 		if (PageLRU(page) && get_page_unless_zero(page)) {
+			int lru = LRU_BASE;
 			ret = 0;
 			ClearPageLRU(page);
-			if (PageActive(page))
-				del_page_from_active_list(zone, page);
-			else
-				del_page_from_inactive_list(zone, page);
+
+			lru += page_is_file_cache(page) + !!PageActive(page);
+			del_page_from_lru_list(zone, page, lru);
 		}
 		spin_unlock_irq(&zone->lru_lock);
 	}
@@ -897,7 +909,7 @@ int isolate_lru_page(struct page *page)
  * of reclaimed pages
  */
 static unsigned long shrink_inactive_list(unsigned long max_scan,
-			struct zone *zone, struct scan_control *sc)
+			struct zone *zone, struct scan_control *sc, int file)
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
@@ -914,20 +926,32 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		unsigned long nr_scan;
 		unsigned long nr_freed;
 		unsigned long nr_active;
+		unsigned int count[NR_LRU_LISTS] = { 0, };
+		int mode = (sc->order > PAGE_ALLOC_COSTLY_ORDER) ?
+					ISOLATE_BOTH : ISOLATE_INACTIVE;
 
 		nr_taken = sc->isolate_pages(sc->swap_cluster_max,
-			     &page_list, &nr_scan, sc->order,
-			     (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
-					ISOLATE_BOTH : ISOLATE_INACTIVE,
-				zone, sc->mem_cgroup, 0);
-		nr_active = clear_active_flags(&page_list);
+			     &page_list, &nr_scan, sc->order, mode,
+				zone, sc->mem_cgroup, 0, file);
+		nr_active = clear_active_flags(&page_list, count);
 		__count_vm_events(PGDEACTIVATE, nr_active);
 
-		__mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
-		__mod_zone_page_state(zone, NR_INACTIVE,
-						-(nr_taken - nr_active));
-		if (scan_global_lru(sc))
+		__mod_zone_page_state(zone, NR_ACTIVE_FILE,
+						-count[LRU_ACTIVE_FILE]);
+		__mod_zone_page_state(zone, NR_INACTIVE_FILE,
+						-count[LRU_INACTIVE_FILE]);
+		__mod_zone_page_state(zone, NR_ACTIVE_ANON,
+						-count[LRU_ACTIVE_ANON]);
+		__mod_zone_page_state(zone, NR_INACTIVE_ANON,
+						-count[LRU_INACTIVE_ANON]);
+
+		if (scan_global_lru(sc)) {
 			zone->pages_scanned += nr_scan;
+			zone->recent_scanned[0] += count[LRU_INACTIVE_ANON];
+			zone->recent_scanned[0] += count[LRU_ACTIVE_ANON];
+			zone->recent_scanned[1] += count[LRU_INACTIVE_FILE];
+			zone->recent_scanned[1] += count[LRU_ACTIVE_FILE];
+		}
 		spin_unlock_irq(&zone->lru_lock);
 
 		nr_scanned += nr_scan;
@@ -947,7 +971,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		 * The attempt at page out may have made some
 		 * of the pages active, mark them inactive again.
 		 */
-		nr_active = clear_active_flags(&page_list);
+		nr_active = clear_active_flags(&page_list, count);
 		count_vm_events(PGDEACTIVATE, nr_active);
 
 		nr_freed += shrink_page_list(&page_list, sc,
@@ -977,6 +1001,10 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 			SetPageLRU(page);
 			list_del(&page->lru);
 			add_page_to_lru_list(zone, page, page_lru(page));
+			if (PageActive(page) && scan_global_lru(sc)) {
+				int file = !!page_is_file_cache(page);
+				zone->recent_rotated[file]++;
+			}
 			if (!pagevec_add(&pvec, page)) {
 				spin_unlock_irq(&zone->lru_lock);
 				__pagevec_release(&pvec);
@@ -1007,115 +1035,7 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
 
 static inline int zone_is_near_oom(struct zone *zone)
 {
-	return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE)
-					+ zone_page_state(zone, NR_INACTIVE))*3;
-}
-
-/*
- * Determine we should try to reclaim mapped pages.
- * This is called only when sc->mem_cgroup is NULL.
- */
-static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
-				int priority)
-{
-	long mapped_ratio;
-	long distress;
-	long swap_tendency;
-	long imbalance;
-	int reclaim_mapped = 0;
-	int prev_priority;
-
-	if (scan_global_lru(sc) && zone_is_near_oom(zone))
-		return 1;
-	/*
-	 * `distress' is a measure of how much trouble we're having
-	 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
-	 */
-	if (scan_global_lru(sc))
-		prev_priority = zone->prev_priority;
-	else
-		prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
-
-	distress = 100 >> min(prev_priority, priority);
-
-	/*
-	 * The point of this algorithm is to decide when to start
-	 * reclaiming mapped memory instead of just pagecache. Work out
-	 * how much memory
-	 * is mapped.
-	 */
-	if (scan_global_lru(sc))
-		mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
-				global_page_state(NR_ANON_PAGES)) * 100) /
-					vm_total_pages;
-	else
-		mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);
-
-	/*
-	 * Now decide how much we really want to unmap some pages. The
-	 * mapped ratio is downgraded - just because there's a lot of
-	 * mapped memory doesn't necessarily mean that page reclaim
-	 * isn't succeeding.
-	 *
-	 * The distress ratio is important - we don't want to start
-	 * going oom.
-	 *
-	 * A 100% value of vm_swappiness overrides this algorithm
-	 * altogether.
-	 */
-	swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
-
-	/*
-	 * If there's huge imbalance between active and inactive
-	 * (think active 100 times larger than inactive) we should
-	 * become more permissive, or the system will take too much
-	 * cpu before it start swapping during memory pressure.
-	 * Distress is about avoiding early-oom, this is about
-	 * making swappiness graceful despite setting it to low
-	 * values.
-	 *
-	 * Avoid div by zero with nr_inactive+1, and max resulting
-	 * value is vm_total_pages.
-	 */
-	if (scan_global_lru(sc)) {
-		imbalance = zone_page_state(zone, NR_ACTIVE);
-		imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
-	} else
-		imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);
-
-	/*
-	 * Reduce the effect of imbalance if swappiness is low,
-	 * this means for a swappiness very low, the imbalance
-	 * must be much higher than 100 for this logic to make
-	 * the difference.
-	 *
-	 * Max temporary value is vm_total_pages*100.
-	 */
-	imbalance *= (vm_swappiness + 1);
-	imbalance /= 100;
-
-	/*
-	 * If not much of the ram is mapped, makes the imbalance
-	 * less relevant, it's high priority we refill the inactive
-	 * list with mapped pages only in presence of high ratio of
-	 * mapped pages.
-	 *
-	 * Max temporary value is vm_total_pages*100.
-	 */
-	imbalance *= mapped_ratio;
-	imbalance /= 100;
-
-	/* apply imbalance feedback to swap_tendency */
-	swap_tendency += imbalance;
-
-	/*
-	 * Now use this metric to decide whether to start moving mapped
-	 * memory onto the inactive list.
-	 */
-	if (swap_tendency >= 100)
-		reclaim_mapped = 1;
-
-	return reclaim_mapped;
+	return zone->pages_scanned >= (zone_lru_pages(zone) * 3);
 }
 
 /*
@@ -1138,7 +1058,7 @@ static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
 
 
 static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
-				struct scan_control *sc, int priority)
+			struct scan_control *sc, int priority, int file)
 {
 	unsigned long pgmoved;
 	int pgdeactivate = 0;
@@ -1148,43 +1068,42 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	LIST_HEAD(l_inactive);
 	struct page *page;
 	struct pagevec pvec;
-	int reclaim_mapped = 0;
-
-	if (sc->may_swap)
-		reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
+	enum lru_list lru;
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
 	pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
 					ISOLATE_ACTIVE, zone,
-					sc->mem_cgroup, 1);
+					sc->mem_cgroup, 1, file);
 	/*
 	 * zone->pages_scanned is used for detect zone's oom
 	 * mem_cgroup remembers nr_scan by itself.
 	 */
-	if (scan_global_lru(sc))
+	if (scan_global_lru(sc)) {
 		zone->pages_scanned += pgscanned;
+		zone->recent_scanned[!!file] += pgmoved;
+	}
 
-	__mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
+	if (file)
+		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
+	else
+		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
 
 	while (!list_empty(&l_hold)) {
 		cond_resched();
 		page = lru_to_page(&l_hold);
 		list_del(&page->lru);
-		if (page_mapped(page)) {
-			if (!reclaim_mapped ||
-			    (total_swap_pages == 0 && PageAnon(page)) ||
-			    page_referenced(page, 0, sc->mem_cgroup)) {
-				list_add(&page->lru, &l_active);
-				continue;
-			}
-		}
 		list_add(&page->lru, &l_inactive);
 	}
 
+	/*
+	 * Now put the pages back on the appropriate [file or anon] inactive
+	 * and active lists.
+	 */
 	pagevec_init(&pvec, 1);
 	pgmoved = 0;
+	lru = LRU_BASE + file * LRU_FILE;
 	spin_lock_irq(&zone->lru_lock);
 	while (!list_empty(&l_inactive)) {
 		page = lru_to_page(&l_inactive);
@@ -1194,11 +1113,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		VM_BUG_ON(!PageActive(page));
 		ClearPageActive(page);
 
-		list_move(&page->lru, &zone->lru[LRU_INACTIVE].list);
+		list_move(&page->lru, &zone->lru[lru].list);
 		mem_cgroup_move_lists(page, false);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
-			__mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
+			__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
 			spin_unlock_irq(&zone->lru_lock);
 			pgdeactivate += pgmoved;
 			pgmoved = 0;
@@ -1208,7 +1127,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 			spin_lock_irq(&zone->lru_lock);
 		}
 	}
-	__mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
+	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
 	pgdeactivate += pgmoved;
 	if (buffer_heads_over_limit) {
 		spin_unlock_irq(&zone->lru_lock);
@@ -1217,6 +1136,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	}
 
 	pgmoved = 0;
+	lru = LRU_ACTIVE + file * LRU_FILE;
 	while (!list_empty(&l_active)) {
 		page = lru_to_page(&l_active);
 		prefetchw_prev_lru_page(page, &l_active, flags);
@@ -1224,11 +1144,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		SetPageLRU(page);
 		VM_BUG_ON(!PageActive(page));
 
-		list_move(&page->lru, &zone->lru[LRU_ACTIVE].list);
+		list_move(&page->lru, &zone->lru[lru].list);
 		mem_cgroup_move_lists(page, true);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
-			__mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
+			__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
 			pgmoved = 0;
 			spin_unlock_irq(&zone->lru_lock);
 			if (vm_swap_full())
@@ -1237,7 +1157,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 			spin_lock_irq(&zone->lru_lock);
 		}
 	}
-	__mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
+	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
+	zone->recent_rotated[!!file] += pgmoved;
 
 	__count_zone_vm_events(PGREFILL, zone, pgscanned);
 	__count_vm_events(PGDEACTIVATE, pgdeactivate);
@@ -1248,16 +1169,103 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	pagevec_release(&pvec);
 }
 
-static unsigned long shrink_list(enum lru_list l, unsigned long nr_to_scan,
+static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 	struct zone *zone, struct scan_control *sc, int priority)
 {
-	if (l == LRU_ACTIVE) {
-		shrink_active_list(nr_to_scan, zone, sc, priority);
+	int file = is_file_lru(lru);
+
+	if (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE) {
+		shrink_active_list(nr_to_scan, zone, sc, priority, file);
 		return 0;
 	}
-	return shrink_inactive_list(nr_to_scan, zone, sc);
+	return shrink_inactive_list(nr_to_scan, zone, sc, file);
+}
+
+/*
+ * Determine how aggressively the anon and file LRU lists should be
+ * scanned. The relative value of each set of LRU lists is determined
+ * by looking at the fraction of the pages scanned we did rotate back
+ * onto the active list instead of evict.
+ *
+ * percent[0] specifies how much pressure to put on ram/swap backed
+ * memory, while percent[1] determines pressure on the file LRUs.
+ */
+static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
+					unsigned long *percent)
+{
+	unsigned long anon, file, free;
+	unsigned long anon_prio, file_prio;
+	unsigned long ap, fp;
+
+	anon  = zone_page_state(zone, NR_ACTIVE_ANON) +
+		zone_page_state(zone, NR_INACTIVE_ANON);
+	file  = zone_page_state(zone, NR_ACTIVE_FILE) +
+		zone_page_state(zone, NR_INACTIVE_FILE);
+	free  = zone_page_state(zone, NR_FREE_PAGES);
+
+	/* If we have no swap space, do not bother scanning anon pages. */
+	if (nr_swap_pages <= 0) {
+		percent[0] = 0;
+		percent[1] = 100;
+		return;
+	}
+
+	/* If we have very few page cache pages, force-scan anon pages. */
+	if (unlikely(file + free <= zone->pages_high)) {
+		percent[0] = 100;
+		percent[1] = 0;
+		return;
+	}
+
+	/*
+	 * OK, so we have swap space and a fair amount of page cache
+	 * pages. We use the recently rotated / recently scanned
+	 * ratios to determine how valuable each cache is.
+	 *
+	 * Because workloads change over time (and to avoid overflow)
+	 * we keep these statistics as a floating average, which ends
+	 * up weighing recent references more than old ones.
+	 *
+	 * anon in [0], file in [1]
+	 */
+	if (unlikely(zone->recent_scanned[0] > anon / 4)) {
+		spin_lock_irq(&zone->lru_lock);
+		zone->recent_scanned[0] /= 2;
+		zone->recent_rotated[0] /= 2;
+		spin_unlock_irq(&zone->lru_lock);
+	}
+
+	if (unlikely(zone->recent_scanned[1] > file / 4)) {
+		spin_lock_irq(&zone->lru_lock);
+		zone->recent_scanned[1] /= 2;
+		zone->recent_rotated[1] /= 2;
+		spin_unlock_irq(&zone->lru_lock);
+	}
+
+	/*
+	 * With swappiness at 100, anonymous and file have the same priority.
+	 * This scanning priority is essentially the inverse of IO cost.
+	 */
+	anon_prio = sc->swappiness;
+	file_prio = 200 - sc->swappiness;
+
+	/*
+	 *                  anon       recent_rotated[0]
+	 * %anon = 100 * ----------- / ----------------- * IO cost
+	 *               anon + file      rotate_sum
+	 */
+	ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1);
+	ap /= zone->recent_rotated[0] + 1;
+
+	fp = (file_prio + 1) * (zone->recent_scanned[1] + 1);
+	fp /= zone->recent_rotated[1] + 1;
+
+	/* Normalize to percentages */
+	percent[0] = 100 * ap / (ap + fp + 1);
+	percent[1] = 100 - percent[0];
 }
 
+
 /*
  * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
  */
@@ -1267,36 +1275,43 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 	unsigned long nr[NR_LRU_LISTS];
 	unsigned long nr_to_scan;
 	unsigned long nr_reclaimed = 0;
+	unsigned long percent[2];	/* anon @ 0; file @ 1 */
 	enum lru_list l;
 
-	if (scan_global_lru(sc)) {
-		/*
-		 * Add one to nr_to_scan just to make sure that the kernel
-		 * will slowly sift through the active list.
-		 */
-		for_each_lru(l) {
-			zone->lru[l].nr_scan += (zone_page_state(zone,
-					NR_LRU_BASE + l) >> priority) + 1;
+	get_scan_ratio(zone, sc, percent);
+
+	for_each_lru(l) {
+		if (scan_global_lru(sc)) {
+			int file = is_file_lru(l);
+			int scan;
+			/*
+			 * Add one to nr_to_scan just to make sure that the
+			 * kernel will slowly sift through each list.
+			 */
+			scan = zone_page_state(zone, NR_LRU_BASE + l);
+			if (priority) {
+				scan >>= priority;
+				scan = (scan * percent[file]) / 100;
+			}
+			zone->lru[l].nr_scan += scan + 1;
 			nr[l] = zone->lru[l].nr_scan;
 			if (nr[l] >= sc->swap_cluster_max)
 				zone->lru[l].nr_scan = 0;
 			else
 				nr[l] = 0;
+		} else {
+			/*
+			 * This reclaim occurs not because zone memory shortage
+			 * but because memory controller hits its limit.
+			 * Don't modify zone reclaim related data.
+			 */
+			nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
+							priority, l);
 		}
-	} else {
-		/*
-		 * This reclaim occurs not because zone memory shortage but
-		 * because memory controller hits its limit.
-		 * Then, don't modify zone reclaim related data.
-		 */
-		nr[LRU_ACTIVE] = mem_cgroup_calc_reclaim(sc->mem_cgroup,
-					zone, priority, LRU_ACTIVE);
-
-		nr[LRU_INACTIVE] = mem_cgroup_calc_reclaim(sc->mem_cgroup,
-					zone, priority, LRU_INACTIVE);
 	}
 
-	while (nr[LRU_ACTIVE] || nr[LRU_INACTIVE]) {
+	while (nr[LRU_ACTIVE_ANON] || nr[LRU_INACTIVE_ANON] ||
+				nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) {
 		for_each_lru(l) {
 			if (nr[l]) {
 				nr_to_scan = min(nr[l],
@@ -1369,7 +1384,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
 
 	return nr_reclaimed;
 }
- 
+
 /*
  * This is the main entry point to direct page reclaim.
  *
@@ -1412,8 +1427,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 				continue;
 
-			lru_pages += zone_page_state(zone, NR_ACTIVE)
-				   + zone_page_state(zone, NR_INACTIVE);
+			lru_pages += zone_lru_pages(zone);
 		}
 	}
 
@@ -1615,8 +1629,7 @@ loop_again:
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 
-			lru_pages += zone_page_state(zone, NR_ACTIVE)
-					+ zone_page_state(zone, NR_INACTIVE);
+			lru_pages += zone_lru_pages(zone);
 		}
 
 		/*
@@ -1660,8 +1673,7 @@ loop_again:
 			if (zone_is_all_unreclaimable(zone))
 				continue;
 			if (nr_slab == 0 && zone->pages_scanned >=
-					(zone_page_state(zone, NR_ACTIVE)
-					+ zone_page_state(zone, NR_INACTIVE)) * 6)
+						(zone_lru_pages(zone) * 6))
 				zone_set_flag(zone,
 					ZONE_ALL_UNRECLAIMABLE);
 			/*
@@ -1715,7 +1727,7 @@ out:
 
 /*
  * The background pageout daemon, started as a kernel thread
- * from the init process. 
+ * from the init process.
  *
  * This basically trickles out pages so that we have _some_
  * free memory available even if there is no other activity
@@ -1809,6 +1821,14 @@ void wakeup_kswapd(struct zone *zone, int order)
 	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
+unsigned long global_lru_pages(void)
+{
+	return global_page_state(NR_ACTIVE_ANON)
+		+ global_page_state(NR_ACTIVE_FILE)
+		+ global_page_state(NR_INACTIVE_ANON)
+		+ global_page_state(NR_INACTIVE_FILE);
+}
+
 #ifdef CONFIG_PM
 /*
  * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
@@ -1834,7 +1854,8 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
 
 		for_each_lru(l) {
 			/* For pass = 0 we don't shrink the active list */
-			if (pass == 0 && l == LRU_ACTIVE)
+			if (pass == 0 &&
+				(l == LRU_ACTIVE || l == LRU_ACTIVE_FILE))
 				continue;
 
 			zone->lru[l].nr_scan +=
@@ -1856,11 +1877,6 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
 	return ret;
 }
 
-static unsigned long count_lru_pages(void)
-{
-	return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE);
-}
-
 /*
  * Try to free `nr_pages' of memory, system-wide, and return the number of
  * freed pages.
@@ -1886,7 +1902,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 
 	current->reclaim_state = &reclaim_state;
 
-	lru_pages = count_lru_pages();
+	lru_pages = global_lru_pages();
 	nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
 	/* If slab caches are huge, it's better to hit them first */
 	while (nr_slab >= lru_pages) {
@@ -1929,7 +1945,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 
 			reclaim_state.reclaimed_slab = 0;
 			shrink_slab(sc.nr_scanned, sc.gfp_mask,
-					count_lru_pages());
+					global_lru_pages());
 			ret += reclaim_state.reclaimed_slab;
 			if (ret >= nr_pages)
 				goto out;
@@ -1946,7 +1962,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 	if (!ret) {
 		do {
 			reclaim_state.reclaimed_slab = 0;
-			shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
+			shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
 			ret += reclaim_state.reclaimed_slab;
 		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
 	}
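Footnote (not part of the original commit): a worked example of the percentage split computed by the new get_scan_ratio() above, using hypothetical counters (illustrative numbers only, not from any real workload) and assuming neither early return is taken. With the default swappiness of 60, a zone whose anon pages mostly rotate back onto the active list while its file pages mostly get evicted ends up with roughly 4% of the scan pressure on the anon lists and 96% on the file lists.

#include <stdio.h>

int main(void)
{
	/* Hypothetical inputs; index [0] = anon, [1] = file, as in the patch. */
	unsigned long swappiness = 60;			/* default vm.swappiness */
	unsigned long anon_prio = swappiness;		/* 60 */
	unsigned long file_prio = 200 - swappiness;	/* 140 */
	unsigned long recent_scanned[2] = { 1000, 1000 };
	unsigned long recent_rotated[2] = {  900,  100 };
	unsigned long ap, fp, percent[2];

	/* Same arithmetic as the tail of get_scan_ratio(). */
	ap = (anon_prio + 1) * (recent_scanned[0] + 1);
	ap /= recent_rotated[0] + 1;			/* 61061 / 901  = 67   */

	fp = (file_prio + 1) * (recent_scanned[1] + 1);
	fp /= recent_rotated[1] + 1;			/* 141141 / 101 = 1397 */

	percent[0] = 100 * ap / (ap + fp + 1);		/* 6700 / 1465  = 4    */
	percent[1] = 100 - percent[0];			/* 96 */

	printf("anon %lu%%, file %lu%%\n", percent[0], percent[1]);
	return 0;
}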