author     KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>  2010-10-27 18:33:40 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>      2010-10-27 21:03:09 -0400
commit     32047e2a85f06633ee4c53e2d0346fbcd34e480b (patch)
tree       25ed1e04bf60a46951581b0ad28a45e51b1602a2 /mm
parent     0c270f8f9988fb0d93ea214fdcff7ab90eb3d894 (diff)
memcg: avoid lock in updating file_mapped (was: fix race in file_mapped accounting flag management)
When accounting file events per memory cgroup, we need to find the memory
cgroup via page_cgroup->mem_cgroup. Currently we use lock_page_cgroup() to
guarantee that pc->mem_cgroup is not overwritten while we make use of it.
But considering the contexts in which the page_cgroup of a file page is
accessed, we can use a lighter-weight mutual exclusion in most cases.
When handling file caches, the only race we have to take care of is "moving"
an account, i.e. overwriting page_cgroup->mem_cgroup (see the comment in the
patch). Unlike charge/uncharge, a "move" does not happen frequently: only at
rmdir() and at task moving (with special settings).
This patch adds a race checker for file-cache-status accounting vs. account
moving: a new per-cpu, per-memcg counter, MEM_CGROUP_ON_MOVE. The routine
for account moving:
1. increments it before starting the move,
2. calls synchronize_rcu(),
3. decrements it after the move ends.
With this, the file-status-counting routine can check whether it needs to
call lock_page_cgroup(); in most cases it does not. A minimal userspace
sketch of the pattern is shown below.
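As a standalone illustration (not part of the patch), here is a hedged
userspace analogue of the scheme built on liburcu; on_move, start_move(),
end_move(), and update_file_stat() are hypothetical stand-ins for the
per-cpu MEM_CGROUP_ON_MOVE counter and the routines added by this patch,
and the pthread mutex stands in for lock_page_cgroup(). Compile with
"gcc demo.c -lurcu".

  #include <pthread.h>
  #include <stdatomic.h>
  #include <stdio.h>
  #include <urcu.h>

  static atomic_long on_move;   /* stand-in for per-cpu MEM_CGROUP_ON_MOVE */
  static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

  static void start_move(void)
  {
          atomic_fetch_add(&on_move, 1);
          /* wait until every reader that may have sampled on_move == 0
           * has left its read-side critical section */
          synchronize_rcu();
  }

  static void end_move(void)
  {
          atomic_fetch_sub(&on_move, 1);
  }

  static void update_file_stat(void)
  {
          rcu_read_lock();
          if (atomic_load(&on_move) > 0) {
                  /* slow path: a mover may rewrite the cgroup pointer,
                   * so fall back to the heavy lock */
                  pthread_mutex_lock(&page_lock);
                  printf("slow path: updated under the lock\n");
                  pthread_mutex_unlock(&page_lock);
          } else {
                  /* fast path: a mover that starts now must wait in
                   * synchronize_rcu() until we drop the read lock */
                  printf("fast path: lock-free update\n");
          }
          rcu_read_unlock();
  }

  int main(void)
  {
          rcu_register_thread();
          update_file_stat();     /* fast path: no move in progress */
          start_move();
          update_file_stat();     /* slow path: falls back to the mutex */
          end_move();
          rcu_unregister_thread();
          return 0;
  }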
The following is perf data from a process which mmap()s and munmap()s 32MB
of file cache repeatedly for one minute (a sketch of such a workload appears
after the numbers).
Before patch:
28.25% mmap mmap [.] main
22.64% mmap [kernel.kallsyms] [k] page_fault
9.96% mmap [kernel.kallsyms] [k] mem_cgroup_update_file_mapped
3.67% mmap [kernel.kallsyms] [k] filemap_fault
3.50% mmap [kernel.kallsyms] [k] unmap_vmas
2.99% mmap [kernel.kallsyms] [k] __do_fault
2.76% mmap [kernel.kallsyms] [k] find_get_page
After patch:
30.00% mmap mmap [.] main
23.78% mmap [kernel.kallsyms] [k] page_fault
5.52% mmap [kernel.kallsyms] [k] mem_cgroup_update_file_mapped
3.81% mmap [kernel.kallsyms] [k] unmap_vmas
3.26% mmap [kernel.kallsyms] [k] find_get_page
3.18% mmap [kernel.kallsyms] [k] __do_fault
3.03% mmap [kernel.kallsyms] [k] filemap_fault
2.40% mmap [kernel.kallsyms] [k] handle_mm_fault
2.40% mmap [kernel.kallsyms] [k] do_page_fault
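For reference, the measured workload can be approximated by a small program
along these lines (a hypothetical reconstruction; the actual benchmark
source is not part of the commit, and "testfile" must already exist and be
at least 32MB):

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/mman.h>
  #include <time.h>
  #include <unistd.h>

  #define MAP_SIZE        (32UL << 20)    /* 32MB */
  #define PAGE            4096UL

  int main(int argc, char **argv)
  {
          const char *path = argc > 1 ? argv[1] : "testfile";
          int fd = open(path, O_RDONLY);
          time_t deadline = time(NULL) + 60;      /* run for one minute */
          volatile char sink;
          size_t off;
          char *p;

          if (fd < 0) {
                  perror("open");
                  return 1;
          }
          while (time(NULL) < deadline) {
                  p = mmap(NULL, MAP_SIZE, PROT_READ, MAP_PRIVATE, fd, 0);
                  if (p == MAP_FAILED) {
                          perror("mmap");
                          return 1;
                  }
                  /* touch every page so file-mapped accounting fires */
                  for (off = 0; off < MAP_SIZE; off += PAGE)
                          sink = p[off];
                  munmap(p, MAP_SIZE);
          }
          close(fd);
          return 0;
  }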
This patch reduces memcg's cost to some extent
(mem_cgroup_update_file_mapped is called on both map and unmap).
Note: it seems some more improvement is possible, though it is not yet
clear how; perhaps the set/unset of the flag can be removed.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
 mm/memcontrol.c | 99
 1 file changed, 85 insertions(+), 14 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0e3fdbd809c7..dd845d25827a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -90,6 +90,7 @@ enum mem_cgroup_stat_index {
         MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
         MEM_CGROUP_STAT_SWAPOUT,        /* # of pages, swapped out */
         MEM_CGROUP_EVENTS,      /* incremented at every pagein/pageout */
+        MEM_CGROUP_ON_MOVE,     /* someone is moving account between groups */
 
         MEM_CGROUP_STAT_NSTATS,
 };
@@ -1051,7 +1052,46 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
         return swappiness;
 }
 
-/* A routine for testing mem is not under move_account */
+static void mem_cgroup_start_move(struct mem_cgroup *mem)
+{
+        int cpu;
+        /* Because this is for moving account, reuse mc.lock */
+        spin_lock(&mc.lock);
+        for_each_possible_cpu(cpu)
+                per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
+        spin_unlock(&mc.lock);
+
+        synchronize_rcu();
+}
+
+static void mem_cgroup_end_move(struct mem_cgroup *mem)
+{
+        int cpu;
+
+        if (!mem)
+                return;
+        spin_lock(&mc.lock);
+        for_each_possible_cpu(cpu)
+                per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
+        spin_unlock(&mc.lock);
+}
+
+/*
+ * Two routines for checking whether "mem" is under move_account().
+ *
+ * mem_cgroup_stealed() - checks whether a cgroup is mc.from. This is used
+ *                        to avoid races in accounting. If true,
+ *                        pc->mem_cgroup may be overwritten.
+ *
+ * mem_cgroup_under_move() - checks whether a cgroup is mc.from, mc.to, or
+ *                        under the hierarchy of moving cgroups. This is for
+ *                        waiting at high memory pressure caused by "move".
+ */
+static bool mem_cgroup_stealed(struct mem_cgroup *mem)
+{
+        VM_BUG_ON(!rcu_read_lock_held());
+        return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
+}
 
 static bool mem_cgroup_under_move(struct mem_cgroup *mem)
 {
@@ -1462,35 +1502,62 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 /*
  * Currently used to update mapped file statistics, but the routine can be
  * generalized to update other statistics as well.
+ *
+ * Notes: Race condition
+ *
+ * We usually use page_cgroup_lock() for accessing page_cgroup members, but
+ * it tends to be costly. Considering some conditions, we do not need
+ * to do so _always_.
+ *
+ * Considering "charge", lock_page_cgroup() is not required because all
+ * file-stat operations happen after a page is attached to the radix tree.
+ * There is no race with "charge".
+ *
+ * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
+ * at "uncharge" intentionally. So we always see a valid pc->mem_cgroup
+ * even if there is a race with "uncharge". The statistic itself is
+ * properly handled by flags.
+ *
+ * Considering "move", this is the only case where we see a race. To make
+ * the race window small, we check the MEM_CGROUP_ON_MOVE percpu value and
+ * detect the possibility of a race. If there is one, we take the lock.
  */
 void mem_cgroup_update_file_mapped(struct page *page, int val)
 {
         struct mem_cgroup *mem;
-        struct page_cgroup *pc;
+        struct page_cgroup *pc = lookup_page_cgroup(page);
+        bool need_unlock = false;
 
-        pc = lookup_page_cgroup(page);
         if (unlikely(!pc))
                 return;
 
-        lock_page_cgroup(pc);
+        rcu_read_lock();
         mem = pc->mem_cgroup;
-        if (!mem || !PageCgroupUsed(pc))
-                goto done;
-
-        /*
-         * Preemption is already disabled. We can use __this_cpu_xxx
-         */
+        if (unlikely(!mem || !PageCgroupUsed(pc)))
+                goto out;
+        /* is pc->mem_cgroup unstable? */
+        if (unlikely(mem_cgroup_stealed(mem))) {
+                /* take the lock to access pc->mem_cgroup safely */
+                lock_page_cgroup(pc);
+                need_unlock = true;
+                mem = pc->mem_cgroup;
+                if (!mem || !PageCgroupUsed(pc))
+                        goto out;
+        }
         if (val > 0) {
-                __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+                this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
                 SetPageCgroupFileMapped(pc);
         } else {
-                __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+                this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
                 if (!page_mapped(page)) /* for race between dec->inc counter */
                         ClearPageCgroupFileMapped(pc);
         }
 
-done:
-        unlock_page_cgroup(pc);
+out:
+        if (unlikely(need_unlock))
+                unlock_page_cgroup(pc);
+        rcu_read_unlock();
+        return;
 }
 
 /*
@@ -3039,6 +3106,7 @@ move_account:
                 lru_add_drain_all();
                 drain_all_stock_sync();
                 ret = 0;
+                mem_cgroup_start_move(mem);
                 for_each_node_state(node, N_HIGH_MEMORY) {
                         for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
                                 enum lru_list l;
@@ -3052,6 +3120,7 @@ move_account:
                         if (ret)
                                 break;
                 }
+                mem_cgroup_end_move(mem);
                 memcg_oom_recover(mem);
                 /* it seems parent cgroup doesn't have enough mem */
                 if (ret == -ENOMEM)
@@ -4514,6 +4583,7 @@ static void mem_cgroup_clear_mc(void)
         mc.to = NULL;
         mc.moving_task = NULL;
         spin_unlock(&mc.lock);
+        mem_cgroup_end_move(from);
         memcg_oom_recover(from);
         memcg_oom_recover(to);
         wake_up_all(&mc.waitq);
@@ -4544,6 +4614,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
                 VM_BUG_ON(mc.moved_charge);
                 VM_BUG_ON(mc.moved_swap);
                 VM_BUG_ON(mc.moving_task);
+                mem_cgroup_start_move(from);
                 spin_lock(&mc.lock);
                 mc.from = from;
                 mc.to = mem;