author		Ying Han <yinghan@google.com>		2011-05-26 19:25:33 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-05-26 20:12:35 -0400
commit		889976dbcb1218119fdd950fb7819084e37d7d37 (patch)
tree		7508706ddb6bcbe0f673aca3744f30f281b17734 /mm
parent		4e4c941c108eff10844d2b441d96dab44f32f424 (diff)
memcg: reclaim memory from nodes in round-robin order
Presently, memory cgroup's direct reclaim frees memory from the current node. But this has some troubles. Usually, when a set of threads works in a cooperative way, they tend to operate on the same node. So if they hit limits under memcg they will reclaim memory from themselves, damaging the active working set.

For example, assume a 2-node system with Node 0 and Node 1, and a memcg with a 1G limit. After some work, file cache remains and the usage is:

Node 0: 1M
Node 1: 998M

Now run an application on Node 0: it will eat its own working set before freeing the unnecessary file cache on Node 1.

This patch adds round-robin selection for NUMA and applies equal pressure to each node. When used with cpuset's memory-spread feature, this works very well. But yes, a better algorithm is needed.

[akpm@linux-foundation.org: comment editing]
[kamezawa.hiroyu@jp.fujitsu.com: fix time comparisons]

Signed-off-by: Ying Han <yinghan@google.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--	mm/memcontrol.c	102
-rw-r--r--	mm/vmscan.c	10
2 files changed, 105 insertions(+), 7 deletions(-)
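Before the patch itself, a rough illustration of the round-robin selection it introduces. This is a minimal userspace sketch, not the kernel code below; struct fake_memcg, select_victim_node(), MAX_NODES and the plain bitmask standing in for nodemask_t are all invented for this example.

/* Minimal userspace sketch of round-robin victim-node selection. */
#include <stdio.h>

#define MAX_NODES 4

struct fake_memcg {
        unsigned int scan_nodes;   /* bit n set: node n has reclaimable pages */
        int last_scanned_node;
};

/* Pick the first node after last_scanned_node that has something to reclaim. */
static int select_victim_node(struct fake_memcg *mem)
{
        int node = mem->last_scanned_node;
        int i;

        for (i = 0; i < MAX_NODES; i++) {
                node = (node + 1) % MAX_NODES;
                if (mem->scan_nodes & (1u << node))
                        break;
        }
        if (i == MAX_NODES)   /* nothing reclaimable anywhere: fall back to node 0 */
                node = 0;

        mem->last_scanned_node = node;
        return node;
}

int main(void)
{
        /* Nodes 0 and 1 both hold file cache, as in the commit message's example. */
        struct fake_memcg mem = { .scan_nodes = 0x3, .last_scanned_node = -1 };
        int i;

        for (i = 0; i < 4; i++)
                printf("reclaim starts from node %d\n", select_victim_node(&mem));
        return 0;
}

Successive limit hits alternate between node 0 and node 1, which is the behaviour the commit message is after. The kernel version additionally rebuilds scan_nodes at most once every 10 seconds and falls back to numa_node_id() when no node has reclaimable pages.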
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fc62c714f3b6..1520efd1c7c4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -231,6 +231,11 @@ struct mem_cgroup {
 	 * reclaimed from.
 	 */
 	int last_scanned_child;
+	int last_scanned_node;
+#if MAX_NUMNODES > 1
+	nodemask_t	scan_nodes;
+	unsigned long	next_scan_node_update;
+#endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
@@ -624,18 +629,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 	preempt_enable();
 }
 
+static unsigned long
+mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+{
+	struct mem_cgroup_per_zone *mz;
+	u64 total = 0;
+	int zid;
+
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		mz = mem_cgroup_zoneinfo(mem, nid, zid);
+		total += MEM_CGROUP_ZSTAT(mz, idx);
+	}
+	return total;
+}
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
 					enum lru_list idx)
 {
-	int nid, zid;
-	struct mem_cgroup_per_zone *mz;
+	int nid;
 	u64 total = 0;
 
 	for_each_online_node(nid)
-		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-			mz = mem_cgroup_zoneinfo(mem, nid, zid);
-			total += MEM_CGROUP_ZSTAT(mz, idx);
-		}
+		total += mem_cgroup_get_zonestat_node(mem, nid, idx);
 	return total;
 }
 
@@ -1418,6 +1432,81 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 	return ret;
 }
 
+#if MAX_NUMNODES > 1
+
+/*
+ * Always updating the nodemask is not very good - even if we have an empty
+ * list or the wrong list here, we can start from some node and traverse all
+ * nodes based on the zonelist. So update the list loosely once per 10 secs.
+ *
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+{
+	int nid;
+
+	if (time_after(mem->next_scan_node_update, jiffies))
+		return;
+
+	mem->next_scan_node_update = jiffies + 10*HZ;
+	/* make a nodemask where this memcg uses memory from */
+	mem->scan_nodes = node_states[N_HIGH_MEMORY];
+
+	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
+		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
+			continue;
+
+		if (total_swap_pages &&
+		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
+		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
+			continue;
+		node_clear(nid, mem->scan_nodes);
+	}
+}
+
+/*
+ * Select a node to start reclaim from. Because all we need is to reduce
+ * the usage counter, starting from anywhere is OK. Reclaiming from the
+ * current node has both pros and cons.
+ *
+ * Freeing memory from the current node means freeing memory from a node
+ * which we will use or have used, so it may hurt the LRU. And if several
+ * threads hit their limits, they will contend on one node. But freeing
+ * from a remote node costs more because of memory latency.
+ *
+ * For now, we use round-robin. A better algorithm is welcome.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+	int node;
+
+	mem_cgroup_may_update_nodemask(mem);
+	node = mem->last_scanned_node;
+
+	node = next_node(node, mem->scan_nodes);
+	if (node == MAX_NUMNODES)
+		node = first_node(mem->scan_nodes);
+	/*
+	 * We call this when we hit the limit, not when pages are added to the
+	 * LRU. No LRU may hold pages because all pages are UNEVICTABLE, or the
+	 * memcg is too small and its pages are not on the LRU. In that case,
+	 * we use the current node.
+	 */
+	if (unlikely(node == MAX_NUMNODES))
+		node = numa_node_id();
+
+	mem->last_scanned_node = node;
+	return node;
+}
+
+#else
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+	return 0;
+}
+#endif
+
 /*
  * Scan the hierarchy if needed to reclaim memory. We remember the last child
  * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -4606,6 +4695,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		res_counter_init(&mem->memsw, NULL);
 	}
 	mem->last_scanned_child = 0;
+	mem->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&mem->oom_notify);
 
 	if (parent)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 884ae08c16cc..b0875871820d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2226,6 +2226,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 {
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
+	int nid;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
@@ -2242,7 +2243,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 		.gfp_mask = sc.gfp_mask,
 	};
 
-	zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+	/*
+	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
+	 * care which node pages come from, so the node where we start the
+	 * scan does not need to be the current node.
+	 */
+	nid = mem_cgroup_select_victim_node(mem_cont);
+
+	zonelist = NODE_DATA(nid)->node_zonelists;
 
 	trace_mm_vmscan_memcg_reclaim_begin(0,
 				sc.may_writepage,