 include/linux/memcontrol.h |   1 +
 mm/memcontrol.c            | 102 ++++++++++++++++++++++++++++++++++++++++++------
 mm/vmscan.c                |  10 +++++++++-
 3 files changed, 106 insertions(+), 7 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0629121f2c0b..16052117131e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -106,6 +106,7 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
  */
 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 					struct zone *zone,
 					enum lru_list lru);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fc62c714f3b6..1520efd1c7c4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -231,6 +231,11 @@ struct mem_cgroup {
 	 * reclaimed from.
 	 */
 	int last_scanned_child;
+	int last_scanned_node;
+#if MAX_NUMNODES > 1
+	nodemask_t	scan_nodes;
+	unsigned long	next_scan_node_update;
+#endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
@@ -624,18 +629,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 	preempt_enable();
 }
 
+static unsigned long
+mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+{
+	struct mem_cgroup_per_zone *mz;
+	u64 total = 0;
+	int zid;
+
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		mz = mem_cgroup_zoneinfo(mem, nid, zid);
+		total += MEM_CGROUP_ZSTAT(mz, idx);
+	}
+	return total;
+}
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
 					enum lru_list idx)
 {
-	int nid, zid;
-	struct mem_cgroup_per_zone *mz;
+	int nid;
 	u64 total = 0;
 
 	for_each_online_node(nid)
-		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-			mz = mem_cgroup_zoneinfo(mem, nid, zid);
-			total += MEM_CGROUP_ZSTAT(mz, idx);
-		}
+		total += mem_cgroup_get_zonestat_node(mem, nid, idx);
 	return total;
 }
 
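The hunk above factors the per-zone summation out of mem_cgroup_get_local_zonestat() into a per-node helper, so that later code can ask how much a single node holds. As a side illustration only (not part of the patch; the zstat[][] array, NR_NODES/NR_ZONES and the function names are made up), here is a standalone sketch of the same aggregation pattern:

#include <stdio.h>

#define NR_NODES 2
#define NR_ZONES 3

/* made-up per-node, per-zone counters standing in for MEM_CGROUP_ZSTAT() */
static unsigned long zstat[NR_NODES][NR_ZONES] = {
	{ 10, 20, 30 },		/* node 0 */
	{  5,  0, 15 },		/* node 1 */
};

/* analogue of mem_cgroup_get_zonestat_node(): one node, all of its zones */
static unsigned long node_total(int nid)
{
	unsigned long total = 0;
	int zid;

	for (zid = 0; zid < NR_ZONES; zid++)
		total += zstat[nid][zid];
	return total;
}

/* analogue of mem_cgroup_get_local_zonestat(): sum the helper over nodes */
static unsigned long all_nodes_total(void)
{
	unsigned long total = 0;
	int nid;

	for (nid = 0; nid < NR_NODES; nid++)
		total += node_total(nid);
	return total;
}

int main(void)
{
	/* prints: node 0: 60, node 1: 20, total: 80 */
	printf("node 0: %lu, node 1: %lu, total: %lu\n",
	       node_total(0), node_total(1), all_nodes_total());
	return 0;
}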
@@ -1418,6 +1432,81 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 	return ret;
 }
 
+#if MAX_NUMNODES > 1
+
+/*
+ * Always updating the nodemask is not very good - even if we have an empty
+ * or stale list here, we can start from some node and traverse all nodes
+ * based on the zonelist. So, update the list loosely, once every 10 seconds.
+ *
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+{
+	int nid;
+
+	if (time_after(mem->next_scan_node_update, jiffies))
+		return;
+
+	mem->next_scan_node_update = jiffies + 10*HZ;
+	/* make a nodemask where this memcg uses memory from */
+	mem->scan_nodes = node_states[N_HIGH_MEMORY];
+
+	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
+		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
+			continue;
+
+		if (total_swap_pages &&
+		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
+		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
+			continue;
+		node_clear(nid, mem->scan_nodes);
+	}
+}
+
+/*
+ * Select a node to start reclaim from. Because all we need is to reduce the
+ * usage counter, starting from any node is fine. Reclaiming from the current
+ * node has both pros and cons:
+ *
+ * Freeing memory from the current node means freeing memory from a node we
+ * will use or have used, so it may hurt that node's LRU. And if several
+ * threads hit their limits, they will contend on that node. But freeing from
+ * a remote node costs more for reclaim because of memory latency.
+ *
+ * For now, we use round-robin. A better algorithm is welcome.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+	int node;
+
+	mem_cgroup_may_update_nodemask(mem);
+	node = mem->last_scanned_node;
+
+	node = next_node(node, mem->scan_nodes);
+	if (node == MAX_NUMNODES)
+		node = first_node(mem->scan_nodes);
+	/*
+	 * We call this when we hit the limit, not when pages are added to the
+	 * LRU. The LRUs may hold no pages because all pages are UNEVICTABLE,
+	 * or the memcg is too small and its pages are not on any LRU yet. In
+	 * that case, use the current node.
+	 */
+	if (unlikely(node == MAX_NUMNODES))
+		node = numa_node_id();
+
+	mem->last_scanned_node = node;
+	return node;
+}
+
+#else
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+	return 0;
+}
+#endif
+
 /*
  * Scan the hierarchy if needed to reclaim memory. We remember the last child
  * we reclaimed from, so that we don't end up penalizing one child extensively
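As a side note (not part of the patch), the selection logic above is a plain round-robin walk over a cached nodemask with a remembered position. The standalone sketch below shows the same pattern using an unsigned long as the mask; next_set_bit(), first_set_bit() and select_victim_node() are made-up stand-ins for the kernel's next_node(), first_node() and mem_cgroup_select_victim_node():

#include <stdio.h>

#define MAX_NODES ((int)(sizeof(unsigned long) * 8))

/* return the first bit set at a position > prev, or MAX_NODES if none */
static int next_set_bit(unsigned long mask, int prev)
{
	int i;

	for (i = prev + 1; i < MAX_NODES; i++)
		if (mask & (1UL << i))
			return i;
	return MAX_NODES;
}

static int first_set_bit(unsigned long mask)
{
	return next_set_bit(mask, -1);
}

/* pick the node after *last, wrapping around; fall back to node 0 */
static int select_victim_node(unsigned long scan_nodes, int *last)
{
	int node = next_set_bit(scan_nodes, *last);

	if (node == MAX_NODES)
		node = first_set_bit(scan_nodes);
	if (node == MAX_NODES)	/* empty mask: the kernel uses numa_node_id(); use 0 here */
		node = 0;
	*last = node;
	return node;
}

int main(void)
{
	/* nodes 0, 1 and 3 hold memory charged to the group */
	unsigned long scan_nodes = (1UL << 0) | (1UL << 1) | (1UL << 3);
	int last = MAX_NODES;	/* matches last_scanned_node's initial value */
	int i;

	for (i = 0; i < 6; i++)
		printf("%d ", select_victim_node(scan_nodes, &last));
	printf("\n");		/* prints: 0 1 3 0 1 3 */
	return 0;
}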
@@ -4606,6 +4695,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		res_counter_init(&mem->memsw, NULL);
 	}
 	mem->last_scanned_child = 0;
+	mem->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&mem->oom_notify);
 
 	if (parent)
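One more side note before the vmscan.c part: mem_cgroup_may_update_nodemask() above rebuilds scan_nodes at most once every 10 seconds using the usual jiffies/time_after() idiom. A standalone sketch of just that rate limiting (not from the patch; "now", REFRESH_PERIOD and maybe_refresh() are made-up stand-ins for jiffies, 10*HZ and the real function):

#include <stdio.h>

#define REFRESH_PERIOD 10UL

static unsigned long next_update;	/* analogue of next_scan_node_update */
static unsigned long refreshes;

/* return 1 and re-arm the deadline when a refresh is due, otherwise 0 */
static int maybe_refresh(unsigned long now)
{
	/* analogous to time_after(next_update, now): deadline still ahead? */
	if ((long)(next_update - now) > 0)
		return 0;

	next_update = now + REFRESH_PERIOD;
	refreshes++;		/* the real code rebuilds scan_nodes here */
	return 1;
}

int main(void)
{
	unsigned long now;

	for (now = 0; now < 35; now++)
		if (maybe_refresh(now))
			printf("refresh at t=%lu\n", now);
	/* refreshes happen at t=0, 10, 20, 30 */
	printf("%lu refreshes\n", refreshes);
	return 0;
}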
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 884ae08c16cc..b0875871820d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2226,6 +2226,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 {
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
+	int nid;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
@@ -2242,7 +2243,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 		.gfp_mask = sc.gfp_mask,
 	};
 
-	zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+	/*
+	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
+	 * care about where the pages come from, so the node where we start
+	 * the scan does not need to be the current node.
+	 */
+	nid = mem_cgroup_select_victim_node(mem_cont);
+
+	zonelist = NODE_DATA(nid)->node_zonelists;
 
 	trace_mm_vmscan_memcg_reclaim_begin(0,
 					sc.may_writepage,