author		Ying Han <yinghan@google.com>		2011-05-26 19:25:33 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-05-26 20:12:35 -0400
commit		889976dbcb1218119fdd950fb7819084e37d7d37 (patch)
tree		7508706ddb6bcbe0f673aca3744f30f281b17734 /mm
parent		4e4c941c108eff10844d2b441d96dab44f32f424 (diff)
memcg: reclaim memory from nodes in round-robin order
Presently, memory cgroup's direct reclaim frees memory from the current node. But this has some troubles. Usually, when a set of threads works in a cooperative way, they tend to operate on the same node. So if they hit limits under memcg they will reclaim memory from themselves, damaging the active working set.

For example, assume a 2-node system with Node 0 and Node 1, and a memcg with a 1G limit. After some work, file cache remains and the usage is:

Node 0: 1M
Node 1: 998M

Now run an application on Node 0: it will eat its own working set before freeing the unnecessary file cache on Node 1.

This patch adds round-robin selection for NUMA and applies equal pressure to each node. When used with cpuset's memory-spread feature, this works very well. But yes, a better algorithm is needed.

[akpm@linux-foundation.org: comment editing]
[kamezawa.hiroyu@jp.fujitsu.com: fix time comparisons]

Signed-off-by: Ying Han <yinghan@google.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--	mm/memcontrol.c	102
-rw-r--r--	mm/vmscan.c	10
2 files changed, 105 insertions(+), 7 deletions(-)
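Before the patch itself, a rough illustration of the round-robin selection it introduces. This is a minimal userspace sketch, not the kernel code below; struct fake_memcg, select_victim_node(), MAX_NODES and the plain bitmask standing in for nodemask_t are all invented for this example.

/* Minimal userspace sketch of round-robin victim-node selection. */
#include <stdio.h>

#define MAX_NODES 4

struct fake_memcg {
        unsigned int scan_nodes;   /* bit n set: node n has reclaimable pages */
        int last_scanned_node;
};

/* Pick the first node after last_scanned_node that has something to reclaim. */
static int select_victim_node(struct fake_memcg *mem)
{
        int node = mem->last_scanned_node;
        int i;

        for (i = 0; i < MAX_NODES; i++) {
                node = (node + 1) % MAX_NODES;
                if (mem->scan_nodes & (1u << node))
                        break;
        }
        if (i == MAX_NODES)   /* nothing reclaimable anywhere: fall back to node 0 */
                node = 0;

        mem->last_scanned_node = node;
        return node;
}

int main(void)
{
        /* Nodes 0 and 1 both hold file cache, as in the commit message's example. */
        struct fake_memcg mem = { .scan_nodes = 0x3, .last_scanned_node = -1 };
        int i;

        for (i = 0; i < 4; i++)
                printf("reclaim starts from node %d\n", select_victim_node(&mem));
        return 0;
}

Successive limit hits alternate between node 0 and node 1, which is the behaviour the commit message is after. The kernel version additionally rebuilds scan_nodes at most once every 10 seconds and falls back to numa_node_id() when no node has reclaimable pages.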
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fc62c714f3b6..1520efd1c7c4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -231,6 +231,11 @@ struct mem_cgroup {
 	 * reclaimed from.
 	 */
 	int last_scanned_child;
+	int last_scanned_node;
+#if MAX_NUMNODES > 1
+	nodemask_t	scan_nodes;
+	unsigned long	next_scan_node_update;
+#endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
@@ -624,18 +629,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 	preempt_enable();
 }
 
+static unsigned long
+mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+{
+	struct mem_cgroup_per_zone *mz;
+	u64 total = 0;
+	int zid;
+
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		mz = mem_cgroup_zoneinfo(mem, nid, zid);
+		total += MEM_CGROUP_ZSTAT(mz, idx);
+	}
+	return total;
+}
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
 					enum lru_list idx)
 {
-	int nid, zid;
-	struct mem_cgroup_per_zone *mz;
+	int nid;
 	u64 total = 0;
 
 	for_each_online_node(nid)
-		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-			mz = mem_cgroup_zoneinfo(mem, nid, zid);
-			total += MEM_CGROUP_ZSTAT(mz, idx);
-		}
+		total += mem_cgroup_get_zonestat_node(mem, nid, idx);
 	return total;
 }
 
@@ -1418,6 +1432,81 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 	return ret;
 }
 
+#if MAX_NUMNODES > 1
+
+/*
+ * Always updating the nodemask is not very good - even if we have an empty
+ * list or the wrong list here, we can start from some node and traverse all
+ * nodes based on the zonelist. So update the list loosely once per 10 secs.
+ *
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+{
+	int nid;
+
+	if (time_after(mem->next_scan_node_update, jiffies))
+		return;
+
+	mem->next_scan_node_update = jiffies + 10*HZ;
+	/* make a nodemask where this memcg uses memory from */
+	mem->scan_nodes = node_states[N_HIGH_MEMORY];
+
+	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
+		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
+			continue;
+
+		if (total_swap_pages &&
+		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
+		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
+			continue;
+		node_clear(nid, mem->scan_nodes);
+	}
+}
+
+/*
+ * Select a node to start reclaim from. Because all we need is to reduce
+ * the usage counter, starting from anywhere is OK. Reclaiming from the
+ * current node has both pros and cons.
+ *
+ * Freeing memory from the current node means freeing memory from a node
+ * which we will use or have used, so it may hurt the LRU. And if several
+ * threads hit their limits, they will contend on one node. But freeing
+ * from a remote node costs more because of memory latency.
+ *
+ * For now, we use round-robin. A better algorithm is welcome.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+	int node;
+
+	mem_cgroup_may_update_nodemask(mem);
+	node = mem->last_scanned_node;
+
+	node = next_node(node, mem->scan_nodes);
+	if (node == MAX_NUMNODES)
+		node = first_node(mem->scan_nodes);
+	/*
+	 * We call this when we hit the limit, not when pages are added to the
+	 * LRU. No LRU may hold pages because all pages are UNEVICTABLE, or the
+	 * memcg is too small and its pages are not on the LRU. In that case,
+	 * we use the current node.
+	 */
+	if (unlikely(node == MAX_NUMNODES))
+		node = numa_node_id();
+
+	mem->last_scanned_node = node;
+	return node;
+}
+
+#else
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+	return 0;
+}
+#endif
+
 /*
  * Scan the hierarchy if needed to reclaim memory. We remember the last child
  * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -4606,6 +4695,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		res_counter_init(&mem->memsw, NULL);
 	}
 	mem->last_scanned_child = 0;
+	mem->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&mem->oom_notify);
 
 	if (parent)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 884ae08c16cc..b0875871820d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2226,6 +2226,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 {
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
+	int nid;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
@@ -2242,7 +2243,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 		.gfp_mask = sc.gfp_mask,
 	};
 
-	zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+	/*
+	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
+	 * care which node pages come from, so the node where we start the
+	 * scan does not need to be the current node.
+	 */
+	nid = mem_cgroup_select_victim_node(mem_cont);
+
+	zonelist = NODE_DATA(nid)->node_zonelists;
 
 	trace_mm_vmscan_memcg_reclaim_begin(0,
 				sc.may_writepage,