aboutsummaryrefslogtreecommitdiffstats
path: root/mm/vmscan.c
diff options
context:
space:
mode:
author Balbir Singh <balbir@linux.vnet.ibm.com> 2008-02-07 03:13:56 -0500
committer Linus Torvalds <torvalds@woody.linux-foundation.org> 2008-02-07 11:42:18 -0500
commit 66e1707bc34609f626e2e7b4fe7e454c9748bad5 (patch)
tree d850a729887485874c976ba64eb85e3406e488a1 /mm/vmscan.c
parent 67e465a77ba658635309ee00b367bec6555ea544 (diff)
Memory controller: add per cgroup LRU and reclaim
Add the page_cgroup to the per-cgroup LRU. The reclaim algorithm has been modified to make isolate_lru_pages() a pluggable component. The scan_control data structure now accepts the cgroup on behalf of which reclaim is carried out. try_to_free_pages() has been extended to become cgroup aware. [akpm@linux-foundation.org: fix warning] [Lee.Schermerhorn@hp.com: initialize all scan_control's isolate_pages member] [bunk@kernel.org: make do_try_to_free_pages() static] [hugh@veritas.com: memcgroup: fix try_to_free order] [kamezawa.hiroyu@jp.fujitsu.com: this unlock_page_cgroup() is unnecessary] Signed-off-by: Pavel Emelianov <xemul@openvz.org> Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Paul Menage <menage@google.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Kirill Korotaev <dev@sw.ru> Cc: Herbert Poetzl <herbert@13thfloor.at> Cc: David Rientjes <rientjes@google.com> Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- mm/vmscan.c 128
1 files changed, 103 insertions, 25 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5a9597e3bbc..7408a8a7d882 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -37,6 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/kthread.h> 38#include <linux/kthread.h>
39#include <linux/freezer.h> 39#include <linux/freezer.h>
40#include <linux/memcontrol.h>
40 41
41#include <asm/tlbflush.h> 42#include <asm/tlbflush.h>
42#include <asm/div64.h> 43#include <asm/div64.h>
@@ -68,6 +69,15 @@ struct scan_control {
68 int all_unreclaimable; 69 int all_unreclaimable;
69 70
70 int order; 71 int order;
72
73 /* Which cgroup do we reclaim from */
74 struct mem_cgroup *mem_cgroup;
75
76 /* Pluggable isolate pages callback */
77 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
78 unsigned long *scanned, int order, int mode,
79 struct zone *z, struct mem_cgroup *mem_cont,
80 int active);
71}; 81};
72 82
73#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 83#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -626,7 +636,7 @@ keep:
626 * 636 *
627 * returns 0 on success, -ve errno on failure. 637 * returns 0 on success, -ve errno on failure.
628 */ 638 */
629static int __isolate_lru_page(struct page *page, int mode) 639int __isolate_lru_page(struct page *page, int mode)
630{ 640{
631 int ret = -EINVAL; 641 int ret = -EINVAL;
632 642
@@ -760,6 +770,21 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
760 return nr_taken; 770 return nr_taken;
761} 771}
762 772
773static unsigned long isolate_pages_global(unsigned long nr,
774 struct list_head *dst,
775 unsigned long *scanned, int order,
776 int mode, struct zone *z,
777 struct mem_cgroup *mem_cont,
778 int active)
779{
780 if (active)
781 return isolate_lru_pages(nr, &z->active_list, dst,
782 scanned, order, mode);
783 else
784 return isolate_lru_pages(nr, &z->inactive_list, dst,
785 scanned, order, mode);
786}
787
763/* 788/*
764 * clear_active_flags() is a helper for shrink_active_list(), clearing 789 * clear_active_flags() is a helper for shrink_active_list(), clearing
765 * any active bits from the pages in the list. 790 * any active bits from the pages in the list.
@@ -801,11 +826,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
801 unsigned long nr_freed; 826 unsigned long nr_freed;
802 unsigned long nr_active; 827 unsigned long nr_active;
803 828
804 nr_taken = isolate_lru_pages(sc->swap_cluster_max, 829 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
805 &zone->inactive_list,
806 &page_list, &nr_scan, sc->order, 830 &page_list, &nr_scan, sc->order,
807 (sc->order > PAGE_ALLOC_COSTLY_ORDER)? 831 (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
808 ISOLATE_BOTH : ISOLATE_INACTIVE); 832 ISOLATE_BOTH : ISOLATE_INACTIVE,
833 zone, sc->mem_cgroup, 0);
809 nr_active = clear_active_flags(&page_list); 834 nr_active = clear_active_flags(&page_list);
810 __count_vm_events(PGDEACTIVATE, nr_active); 835 __count_vm_events(PGDEACTIVATE, nr_active);
811 836
@@ -1018,8 +1043,9 @@ force_reclaim_mapped:
1018 1043
1019 lru_add_drain(); 1044 lru_add_drain();
1020 spin_lock_irq(&zone->lru_lock); 1045 spin_lock_irq(&zone->lru_lock);
1021 pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, 1046 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1022 &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE); 1047 ISOLATE_ACTIVE, zone,
1048 sc->mem_cgroup, 1);
1023 zone->pages_scanned += pgscanned; 1049 zone->pages_scanned += pgscanned;
1024 __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); 1050 __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
1025 spin_unlock_irq(&zone->lru_lock); 1051 spin_unlock_irq(&zone->lru_lock);
@@ -1051,6 +1077,7 @@ force_reclaim_mapped:
1051 ClearPageActive(page); 1077 ClearPageActive(page);
1052 1078
1053 list_move(&page->lru, &zone->inactive_list); 1079 list_move(&page->lru, &zone->inactive_list);
1080 mem_cgroup_move_lists(page_get_page_cgroup(page), false);
1054 pgmoved++; 1081 pgmoved++;
1055 if (!pagevec_add(&pvec, page)) { 1082 if (!pagevec_add(&pvec, page)) {
1056 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1083 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
@@ -1079,6 +1106,7 @@ force_reclaim_mapped:
1079 SetPageLRU(page); 1106 SetPageLRU(page);
1080 VM_BUG_ON(!PageActive(page)); 1107 VM_BUG_ON(!PageActive(page));
1081 list_move(&page->lru, &zone->active_list); 1108 list_move(&page->lru, &zone->active_list);
1109 mem_cgroup_move_lists(page_get_page_cgroup(page), true);
1082 pgmoved++; 1110 pgmoved++;
1083 if (!pagevec_add(&pvec, page)) { 1111 if (!pagevec_add(&pvec, page)) {
1084 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); 1112 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
@@ -1206,7 +1234,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
1206 * holds filesystem locks which prevent writeout this might not work, and the 1234 * holds filesystem locks which prevent writeout this might not work, and the
1207 * allocation attempt will fail. 1235 * allocation attempt will fail.
1208 */ 1236 */
1209unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) 1237static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
1238 struct scan_control *sc)
1210{ 1239{
1211 int priority; 1240 int priority;
1212 int ret = 0; 1241 int ret = 0;
@@ -1215,14 +1244,6 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
1215 struct reclaim_state *reclaim_state = current->reclaim_state; 1244 struct reclaim_state *reclaim_state = current->reclaim_state;
1216 unsigned long lru_pages = 0; 1245 unsigned long lru_pages = 0;
1217 int i; 1246 int i;
1218 struct scan_control sc = {
1219 .gfp_mask = gfp_mask,
1220 .may_writepage = !laptop_mode,
1221 .swap_cluster_max = SWAP_CLUSTER_MAX,
1222 .may_swap = 1,
1223 .swappiness = vm_swappiness,
1224 .order = order,
1225 };
1226 1247
1227 count_vm_event(ALLOCSTALL); 1248 count_vm_event(ALLOCSTALL);
1228 1249
@@ -1237,17 +1258,22 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
1237 } 1258 }
1238 1259
1239 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1260 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1240 sc.nr_scanned = 0; 1261 sc->nr_scanned = 0;
1241 if (!priority) 1262 if (!priority)
1242 disable_swap_token(); 1263 disable_swap_token();
1243 nr_reclaimed += shrink_zones(priority, zones, &sc); 1264 nr_reclaimed += shrink_zones(priority, zones, sc);
1244 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); 1265 /*
1266 * Don't shrink slabs when reclaiming memory from
1267 * over limit cgroups
1268 */
1269 if (sc->mem_cgroup == NULL)
1270 shrink_slab(sc->nr_scanned, gfp_mask, lru_pages);
1245 if (reclaim_state) { 1271 if (reclaim_state) {
1246 nr_reclaimed += reclaim_state->reclaimed_slab; 1272 nr_reclaimed += reclaim_state->reclaimed_slab;
1247 reclaim_state->reclaimed_slab = 0; 1273 reclaim_state->reclaimed_slab = 0;
1248 } 1274 }
1249 total_scanned += sc.nr_scanned; 1275 total_scanned += sc->nr_scanned;
1250 if (nr_reclaimed >= sc.swap_cluster_max) { 1276 if (nr_reclaimed >= sc->swap_cluster_max) {
1251 ret = 1; 1277 ret = 1;
1252 goto out; 1278 goto out;
1253 } 1279 }
@@ -1259,18 +1285,18 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
1259 * that's undesirable in laptop mode, where we *want* lumpy 1285 * that's undesirable in laptop mode, where we *want* lumpy
1260 * writeout. So in laptop mode, write out the whole world. 1286 * writeout. So in laptop mode, write out the whole world.
1261 */ 1287 */
1262 if (total_scanned > sc.swap_cluster_max + 1288 if (total_scanned > sc->swap_cluster_max +
1263 sc.swap_cluster_max / 2) { 1289 sc->swap_cluster_max / 2) {
1264 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1290 wakeup_pdflush(laptop_mode ? 0 : total_scanned);
1265 sc.may_writepage = 1; 1291 sc->may_writepage = 1;
1266 } 1292 }
1267 1293
1268 /* Take a nap, wait for some writeback to complete */ 1294 /* Take a nap, wait for some writeback to complete */
1269 if (sc.nr_scanned && priority < DEF_PRIORITY - 2) 1295 if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
1270 congestion_wait(WRITE, HZ/10); 1296 congestion_wait(WRITE, HZ/10);
1271 } 1297 }
1272 /* top priority shrink_caches still had more to do? don't OOM, then */ 1298 /* top priority shrink_caches still had more to do? don't OOM, then */
1273 if (!sc.all_unreclaimable) 1299 if (!sc->all_unreclaimable && sc->mem_cgroup == NULL)
1274 ret = 1; 1300 ret = 1;
1275out: 1301out:
1276 /* 1302 /*
@@ -1293,6 +1319,54 @@ out:
1293 return ret; 1319 return ret;
1294} 1320}
1295 1321
1322unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
1323{
1324 struct scan_control sc = {
1325 .gfp_mask = gfp_mask,
1326 .may_writepage = !laptop_mode,
1327 .swap_cluster_max = SWAP_CLUSTER_MAX,
1328 .may_swap = 1,
1329 .swappiness = vm_swappiness,
1330 .order = order,
1331 .mem_cgroup = NULL,
1332 .isolate_pages = isolate_pages_global,
1333 };
1334
1335 return do_try_to_free_pages(zones, gfp_mask, &sc);
1336}
1337
1338#ifdef CONFIG_CGROUP_MEM_CONT
1339
1340#ifdef CONFIG_HIGHMEM
1341#define ZONE_USERPAGES ZONE_HIGHMEM
1342#else
1343#define ZONE_USERPAGES ZONE_NORMAL
1344#endif
1345
1346unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont)
1347{
1348 struct scan_control sc = {
1349 .gfp_mask = GFP_KERNEL,
1350 .may_writepage = !laptop_mode,
1351 .may_swap = 1,
1352 .swap_cluster_max = SWAP_CLUSTER_MAX,
1353 .swappiness = vm_swappiness,
1354 .order = 0,
1355 .mem_cgroup = mem_cont,
1356 .isolate_pages = mem_cgroup_isolate_pages,
1357 };
1358 int node;
1359 struct zone **zones;
1360
1361 for_each_online_node(node) {
1362 zones = NODE_DATA(node)->node_zonelists[ZONE_USERPAGES].zones;
1363 if (do_try_to_free_pages(zones, sc.gfp_mask, &sc))
1364 return 1;
1365 }
1366 return 0;
1367}
1368#endif
1369
1296/* 1370/*
1297 * For kswapd, balance_pgdat() will work across all this node's zones until 1371 * For kswapd, balance_pgdat() will work across all this node's zones until
1298 * they are all at pages_high. 1372 * they are all at pages_high.
@@ -1328,6 +1402,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1328 .swap_cluster_max = SWAP_CLUSTER_MAX, 1402 .swap_cluster_max = SWAP_CLUSTER_MAX,
1329 .swappiness = vm_swappiness, 1403 .swappiness = vm_swappiness,
1330 .order = order, 1404 .order = order,
1405 .mem_cgroup = NULL,
1406 .isolate_pages = isolate_pages_global,
1331 }; 1407 };
1332 /* 1408 /*
1333 * temp_priority is used to remember the scanning priority at which 1409 * temp_priority is used to remember the scanning priority at which
@@ -1649,6 +1725,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1649 .swap_cluster_max = nr_pages, 1725 .swap_cluster_max = nr_pages,
1650 .may_writepage = 1, 1726 .may_writepage = 1,
1651 .swappiness = vm_swappiness, 1727 .swappiness = vm_swappiness,
1728 .isolate_pages = isolate_pages_global,
1652 }; 1729 };
1653 1730
1654 current->reclaim_state = &reclaim_state; 1731 current->reclaim_state = &reclaim_state;
@@ -1834,6 +1911,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1834 SWAP_CLUSTER_MAX), 1911 SWAP_CLUSTER_MAX),
1835 .gfp_mask = gfp_mask, 1912 .gfp_mask = gfp_mask,
1836 .swappiness = vm_swappiness, 1913 .swappiness = vm_swappiness,
1914 .isolate_pages = isolate_pages_global,
1837 }; 1915 };
1838 unsigned long slab_reclaimable; 1916 unsigned long slab_reclaimable;
1839 1917