author     Balbir Singh <balbir@linux.vnet.ibm.com>              2008-02-07 03:13:56 -0500
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>  2008-02-07 11:42:18 -0500
commit     66e1707bc34609f626e2e7b4fe7e454c9748bad5 (patch)
tree       d850a729887485874c976ba64eb85e3406e488a1 /mm/vmscan.c
parent     67e465a77ba658635309ee00b367bec6555ea544 (diff)
Memory controller: add per cgroup LRU and reclaim
Add the page_cgroup to the per cgroup LRU. The reclaim algorithm has
been modified to make isolate_lru_pages() a pluggable component. The
scan_control data structure now accepts the cgroup on behalf of which
reclaim is carried out. try_to_free_pages() has been extended to become
cgroup aware. (A simplified illustration of the pluggable callback
follows the sign-off block below.)
[akpm@linux-foundation.org: fix warning]
[Lee.Schermerhorn@hp.com: initialize all scan_control's isolate_pages member]
[bunk@kernel.org: make do_try_to_free_pages() static]
[hugh@veritas.com: memcgroup: fix try_to_free order]
[kamezawa.hiroyu@jp.fujitsu.com: this unlock_page_cgroup() is unnecessary]
Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: David Rientjes <rientjes@google.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
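The shape of the change is visible in the hunks below: scan_control grows a
mem_cgroup pointer and an isolate_pages function pointer, global reclaim plugs
in isolate_pages_global, and cgroup reclaim plugs in mem_cgroup_isolate_pages.
As a rough illustration of that shape only -- a standalone user-space sketch,
not kernel code, where the *_demo names and the simplified struct fields are
invented for the example -- the following compiles and shows how one shrink
path can serve both callers:

/*
 * Hypothetical sketch of the pluggable-isolator idea described above.
 * Types and helpers are simplified stand-ins, not the kernel's real
 * definitions: the shrink path only ever calls sc->isolate_pages(), and
 * the callback plus sc->mem_cgroup decide whether pages come from the
 * global LRU or from a cgroup's private LRU.
 */
#include <stdio.h>

struct mem_cgroup {
	const char *name;               /* stand-in for the real control group */
};

struct scan_control {
	struct mem_cgroup *mem_cgroup;  /* NULL means global reclaim */
	unsigned long (*isolate_pages)(unsigned long nr,
				       struct mem_cgroup *mem_cont,
				       int active);
};

/* Global-reclaim isolator: ignores the cgroup argument. */
static unsigned long isolate_pages_global_demo(unsigned long nr,
					       struct mem_cgroup *mem_cont,
					       int active)
{
	(void)mem_cont;
	printf("global reclaim: isolate %lu pages from the %s list\n",
	       nr, active ? "active" : "inactive");
	return nr;
}

/* Per-cgroup isolator: would walk only the cgroup's own LRU. */
static unsigned long mem_cgroup_isolate_pages_demo(unsigned long nr,
						   struct mem_cgroup *mem_cont,
						   int active)
{
	printf("cgroup '%s': isolate %lu pages from its %s list\n",
	       mem_cont->name, nr, active ? "active" : "inactive");
	return nr;
}

/* The reclaim loop never needs to know which isolator is plugged in. */
static unsigned long shrink_list_demo(struct scan_control *sc, unsigned long nr)
{
	return sc->isolate_pages(nr, sc->mem_cgroup, 0);
}

int main(void)
{
	struct mem_cgroup grp = { .name = "demo" };
	struct scan_control global_sc = {
		.mem_cgroup = NULL,
		.isolate_pages = isolate_pages_global_demo,
	};
	struct scan_control cgroup_sc = {
		.mem_cgroup = &grp,
		.isolate_pages = mem_cgroup_isolate_pages_demo,
	};

	shrink_list_demo(&global_sc, 32);
	shrink_list_demo(&cgroup_sc, 32);
	return 0;
}

In the actual patch the callback additionally takes the zone, scan order, and
isolation mode, and try_to_free_mem_cgroup_pages() builds a scan_control with
.mem_cgroup and .isolate_pages set before calling the shared
do_try_to_free_pages().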
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c  128
1 file changed, 103 insertions(+), 25 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5a9597e3bbc..7408a8a7d882 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -37,6 +37,7 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -68,6 +69,15 @@ struct scan_control {
 	int all_unreclaimable;
 
 	int order;
+
+	/* Which cgroup do we reclaim from */
+	struct mem_cgroup *mem_cgroup;
+
+	/* Pluggable isolate pages callback */
+	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
+			unsigned long *scanned, int order, int mode,
+			struct zone *z, struct mem_cgroup *mem_cont,
+			int active);
 };
 
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -626,7 +636,7 @@ keep:
  *
  * returns 0 on success, -ve errno on failure.
  */
-static int __isolate_lru_page(struct page *page, int mode)
+int __isolate_lru_page(struct page *page, int mode)
 {
 	int ret = -EINVAL;
 
@@ -760,6 +770,21 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	return nr_taken;
 }
 
+static unsigned long isolate_pages_global(unsigned long nr,
+					struct list_head *dst,
+					unsigned long *scanned, int order,
+					int mode, struct zone *z,
+					struct mem_cgroup *mem_cont,
+					int active)
+{
+	if (active)
+		return isolate_lru_pages(nr, &z->active_list, dst,
+						scanned, order, mode);
+	else
+		return isolate_lru_pages(nr, &z->inactive_list, dst,
+						scanned, order, mode);
+}
+
 /*
  * clear_active_flags() is a helper for shrink_active_list(), clearing
  * any active bits from the pages in the list.
@@ -801,11 +826,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		unsigned long nr_freed;
 		unsigned long nr_active;
 
-		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
-			     &zone->inactive_list,
+		nr_taken = sc->isolate_pages(sc->swap_cluster_max,
 			     &page_list, &nr_scan, sc->order,
 			     (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
-					     ISOLATE_BOTH : ISOLATE_INACTIVE);
+					     ISOLATE_BOTH : ISOLATE_INACTIVE,
+				zone, sc->mem_cgroup, 0);
 		nr_active = clear_active_flags(&page_list);
 		__count_vm_events(PGDEACTIVATE, nr_active);
 
@@ -1018,8 +1043,9 @@ force_reclaim_mapped:
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
-	pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
-			    &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE);
+	pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
+					ISOLATE_ACTIVE, zone,
+					sc->mem_cgroup, 1);
 	zone->pages_scanned += pgscanned;
 	__mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
@@ -1051,6 +1077,7 @@ force_reclaim_mapped:
 		ClearPageActive(page);
 
 		list_move(&page->lru, &zone->inactive_list);
+		mem_cgroup_move_lists(page_get_page_cgroup(page), false);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
 			__mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
@@ -1079,6 +1106,7 @@ force_reclaim_mapped:
 		SetPageLRU(page);
 		VM_BUG_ON(!PageActive(page));
 		list_move(&page->lru, &zone->active_list);
+		mem_cgroup_move_lists(page_get_page_cgroup(page), true);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
 			__mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
@@ -1206,7 +1234,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
  * holds filesystem locks which prevent writeout this might not work, and the
  * allocation attempt will fail.
  */
-unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
+static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
+					  struct scan_control *sc)
 {
 	int priority;
 	int ret = 0;
@@ -1215,14 +1244,6 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long lru_pages = 0;
 	int i;
-	struct scan_control sc = {
-		.gfp_mask = gfp_mask,
-		.may_writepage = !laptop_mode,
-		.swap_cluster_max = SWAP_CLUSTER_MAX,
-		.may_swap = 1,
-		.swappiness = vm_swappiness,
-		.order = order,
-	};
 
 	count_vm_event(ALLOCSTALL);
 
@@ -1237,17 +1258,22 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
 	}
 
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
-		sc.nr_scanned = 0;
+		sc->nr_scanned = 0;
 		if (!priority)
 			disable_swap_token();
-		nr_reclaimed += shrink_zones(priority, zones, &sc);
-		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
+		nr_reclaimed += shrink_zones(priority, zones, sc);
+		/*
+		 * Don't shrink slabs when reclaiming memory from
+		 * over limit cgroups
+		 */
+		if (sc->mem_cgroup == NULL)
+			shrink_slab(sc->nr_scanned, gfp_mask, lru_pages);
 		if (reclaim_state) {
 			nr_reclaimed += reclaim_state->reclaimed_slab;
 			reclaim_state->reclaimed_slab = 0;
 		}
-		total_scanned += sc.nr_scanned;
-		if (nr_reclaimed >= sc.swap_cluster_max) {
+		total_scanned += sc->nr_scanned;
+		if (nr_reclaimed >= sc->swap_cluster_max) {
 			ret = 1;
 			goto out;
 		}
@@ -1259,18 +1285,18 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
 		 * that's undesirable in laptop mode, where we *want* lumpy
 		 * writeout. So in laptop mode, write out the whole world.
 		 */
-		if (total_scanned > sc.swap_cluster_max +
-					sc.swap_cluster_max / 2) {
+		if (total_scanned > sc->swap_cluster_max +
+					sc->swap_cluster_max / 2) {
 			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
-			sc.may_writepage = 1;
+			sc->may_writepage = 1;
 		}
 
 		/* Take a nap, wait for some writeback to complete */
-		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
+		if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
 			congestion_wait(WRITE, HZ/10);
 	}
 	/* top priority shrink_caches still had more to do? don't OOM, then */
-	if (!sc.all_unreclaimable)
+	if (!sc->all_unreclaimable && sc->mem_cgroup == NULL)
 		ret = 1;
 out:
 	/*
@@ -1293,6 +1319,54 @@ out:
 	return ret;
 }
 
+unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
+{
+	struct scan_control sc = {
+		.gfp_mask = gfp_mask,
+		.may_writepage = !laptop_mode,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.may_swap = 1,
+		.swappiness = vm_swappiness,
+		.order = order,
+		.mem_cgroup = NULL,
+		.isolate_pages = isolate_pages_global,
+	};
+
+	return do_try_to_free_pages(zones, gfp_mask, &sc);
+}
+
+#ifdef CONFIG_CGROUP_MEM_CONT
+
+#ifdef CONFIG_HIGHMEM
+#define ZONE_USERPAGES ZONE_HIGHMEM
+#else
+#define ZONE_USERPAGES ZONE_NORMAL
+#endif
+
+unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont)
+{
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.may_writepage = !laptop_mode,
+		.may_swap = 1,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.swappiness = vm_swappiness,
+		.order = 0,
+		.mem_cgroup = mem_cont,
+		.isolate_pages = mem_cgroup_isolate_pages,
+	};
+	int node;
+	struct zone **zones;
+
+	for_each_online_node(node) {
+		zones = NODE_DATA(node)->node_zonelists[ZONE_USERPAGES].zones;
+		if (do_try_to_free_pages(zones, sc.gfp_mask, &sc))
+			return 1;
+	}
+	return 0;
+}
+#endif
+
 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
@@ -1328,6 +1402,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = vm_swappiness,
 		.order = order,
+		.mem_cgroup = NULL,
+		.isolate_pages = isolate_pages_global,
 	};
 	/*
 	 * temp_priority is used to remember the scanning priority at which
@@ -1649,6 +1725,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 		.swap_cluster_max = nr_pages,
 		.may_writepage = 1,
 		.swappiness = vm_swappiness,
+		.isolate_pages = isolate_pages_global,
 	};
 
 	current->reclaim_state = &reclaim_state;
@@ -1834,6 +1911,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 				       SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
 		.swappiness = vm_swappiness,
+		.isolate_pages = isolate_pages_global,
 	};
 	unsigned long slab_reclaimable;
 