aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>2010-10-27 18:33:40 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2010-10-27 21:03:09 -0400
commit32047e2a85f06633ee4c53e2d0346fbcd34e480b (patch)
tree25ed1e04bf60a46951581b0ad28a45e51b1602a2
parent0c270f8f9988fb0d93ea214fdcff7ab90eb3d894 (diff)
memcg: avoid lock in updating file_mapped (Was fix race in file_mapped accounting flag management)
When accounting file events per memory cgroup, we need to find the memory cgroup via page_cgroup->mem_cgroup. Currently, we use lock_page_cgroup() to guarantee that pc->mem_cgroup is not overwritten while we make use of it. But, considering the contexts in which the page_cgroup of a file page is accessed, we can use a lighter-weight form of mutual exclusion in most cases. When handling file caches, the only race we have to take care of is "moving" an account, i.e. overwriting page_cgroup->mem_cgroup. (See the comment in the patch.) Unlike charge/uncharge, "move" does not happen frequently. It happens only on rmdir() and on task moving (with a special setting). This patch adds a race checker for file-cache-status accounting vs. account moving. A new per-cpu, per-memcg counter, MEM_CGROUP_ON_MOVE, is added. The account-move routine does the following: 1. Increment the counter before starting the move. 2. Call synchronize_rcu(). 3. Decrement the counter after the end of the move. With this, the file-status-counting routine can check whether it needs to call lock_page_cgroup(). In most cases, it doesn't need to call it. The following is perf data for a process which mmap()s/munmap()s 32MB of file cache in a minute. Before patch: 28.25% mmap mmap [.] main 22.64% mmap [kernel.kallsyms] [k] page_fault 9.96% mmap [kernel.kallsyms] [k] mem_cgroup_update_file_mapped 3.67% mmap [kernel.kallsyms] [k] filemap_fault 3.50% mmap [kernel.kallsyms] [k] unmap_vmas 2.99% mmap [kernel.kallsyms] [k] __do_fault 2.76% mmap [kernel.kallsyms] [k] find_get_page After patch: 30.00% mmap mmap [.] main 23.78% mmap [kernel.kallsyms] [k] page_fault 5.52% mmap [kernel.kallsyms] [k] mem_cgroup_update_file_mapped 3.81% mmap [kernel.kallsyms] [k] unmap_vmas 3.26% mmap [kernel.kallsyms] [k] find_get_page 3.18% mmap [kernel.kallsyms] [k] __do_fault 3.03% mmap [kernel.kallsyms] [k] filemap_fault 2.40% mmap [kernel.kallsyms] [k] handle_mm_fault 2.40% mmap [kernel.kallsyms] [k] do_page_fault This patch reduces memcg's cost to some extent. 
(mem_cgroup_update_file_mapped is called on both map and unmap.) Note: It seems some more improvements are required, but I have no concrete idea yet. Maybe removing the set/unset of the flag is required. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Greg Thelen <gthelen@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--mm/memcontrol.c99
1 files changed, 85 insertions, 14 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0e3fdbd809c7..dd845d25827a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -90,6 +90,7 @@ enum mem_cgroup_stat_index {
90 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 90 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
91 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 91 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
92 MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ 92 MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */
93 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
93 94
94 MEM_CGROUP_STAT_NSTATS, 95 MEM_CGROUP_STAT_NSTATS,
95}; 96};
@@ -1051,7 +1052,46 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
1051 return swappiness; 1052 return swappiness;
1052} 1053}
1053 1054
1054/* A routine for testing mem is not under move_account */ 1055static void mem_cgroup_start_move(struct mem_cgroup *mem)
1056{
1057 int cpu;
1058 /* Because this is for moving account, reuse mc.lock */
1059 spin_lock(&mc.lock);
1060 for_each_possible_cpu(cpu)
1061 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1062 spin_unlock(&mc.lock);
1063
1064 synchronize_rcu();
1065}
1066
1067static void mem_cgroup_end_move(struct mem_cgroup *mem)
1068{
1069 int cpu;
1070
1071 if (!mem)
1072 return;
1073 spin_lock(&mc.lock);
1074 for_each_possible_cpu(cpu)
1075 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1076 spin_unlock(&mc.lock);
1077}
1078/*
1079 * 2 routines for checking "mem" is under move_account() or not.
1080 *
1081 * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
1082 * for avoiding race in accounting. If true,
1083 * pc->mem_cgroup may be overwritten.
1084 *
1085 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
1086 * under hierarchy of moving cgroups. This is for
1087 * waiting at high-memory pressure caused by "move".
1088 */
1089
1090static bool mem_cgroup_stealed(struct mem_cgroup *mem)
1091{
1092 VM_BUG_ON(!rcu_read_lock_held());
1093 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1094}
1055 1095
1056static bool mem_cgroup_under_move(struct mem_cgroup *mem) 1096static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1057{ 1097{
@@ -1462,35 +1502,62 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1462/* 1502/*
1463 * Currently used to update mapped file statistics, but the routine can be 1503 * Currently used to update mapped file statistics, but the routine can be
1464 * generalized to update other statistics as well. 1504 * generalized to update other statistics as well.
1505 *
1506 * Notes: Race condition
1507 *
1508 * We usually use page_cgroup_lock() for accessing page_cgroup member but
1509 * it tends to be costly. But considering some conditions, we don't need
1510 * to do so _always_.
1511 *
1512 * Considering "charge", lock_page_cgroup() is not required because all
1513 * file-stat operations happen after a page is attached to radix-tree. There
1514 * is no race with "charge".
1515 *
1516 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
1517 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
1518 * if there is a race with "uncharge". The statistics themselves are properly handled
1519 * by flags.
1520 *
1521 * Considering "move", this is the only case where we see a race. To make the race
1522 * window small, we check the MEM_CGROUP_ON_MOVE percpu value to detect whether there is a
1523 * possibility of a race condition. If there is, we take a lock.
1465 */ 1524 */
1466void mem_cgroup_update_file_mapped(struct page *page, int val) 1525void mem_cgroup_update_file_mapped(struct page *page, int val)
1467{ 1526{
1468 struct mem_cgroup *mem; 1527 struct mem_cgroup *mem;
1469 struct page_cgroup *pc; 1528 struct page_cgroup *pc = lookup_page_cgroup(page);
1529 bool need_unlock = false;
1470 1530
1471 pc = lookup_page_cgroup(page);
1472 if (unlikely(!pc)) 1531 if (unlikely(!pc))
1473 return; 1532 return;
1474 1533
1475 lock_page_cgroup(pc); 1534 rcu_read_lock();
1476 mem = pc->mem_cgroup; 1535 mem = pc->mem_cgroup;
1477 if (!mem || !PageCgroupUsed(pc)) 1536 if (unlikely(!mem || !PageCgroupUsed(pc)))
1478 goto done; 1537 goto out;
1479 1538 /* pc->mem_cgroup is unstable ? */
1480 /* 1539 if (unlikely(mem_cgroup_stealed(mem))) {
1481 * Preemption is already disabled. We can use __this_cpu_xxx 1540 /* take a lock against to access pc->mem_cgroup */
1482 */ 1541 lock_page_cgroup(pc);
1542 need_unlock = true;
1543 mem = pc->mem_cgroup;
1544 if (!mem || !PageCgroupUsed(pc))
1545 goto out;
1546 }
1483 if (val > 0) { 1547 if (val > 0) {
1484 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1548 this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1485 SetPageCgroupFileMapped(pc); 1549 SetPageCgroupFileMapped(pc);
1486 } else { 1550 } else {
1487 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1551 this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1488 if (!page_mapped(page)) /* for race between dec->inc counter */ 1552 if (!page_mapped(page)) /* for race between dec->inc counter */
1489 ClearPageCgroupFileMapped(pc); 1553 ClearPageCgroupFileMapped(pc);
1490 } 1554 }
1491 1555
1492done: 1556out:
1493 unlock_page_cgroup(pc); 1557 if (unlikely(need_unlock))
1558 unlock_page_cgroup(pc);
1559 rcu_read_unlock();
1560 return;
1494} 1561}
1495 1562
1496/* 1563/*
@@ -3039,6 +3106,7 @@ move_account:
3039 lru_add_drain_all(); 3106 lru_add_drain_all();
3040 drain_all_stock_sync(); 3107 drain_all_stock_sync();
3041 ret = 0; 3108 ret = 0;
3109 mem_cgroup_start_move(mem);
3042 for_each_node_state(node, N_HIGH_MEMORY) { 3110 for_each_node_state(node, N_HIGH_MEMORY) {
3043 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3111 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3044 enum lru_list l; 3112 enum lru_list l;
@@ -3052,6 +3120,7 @@ move_account:
3052 if (ret) 3120 if (ret)
3053 break; 3121 break;
3054 } 3122 }
3123 mem_cgroup_end_move(mem);
3055 memcg_oom_recover(mem); 3124 memcg_oom_recover(mem);
3056 /* it seems parent cgroup doesn't have enough mem */ 3125 /* it seems parent cgroup doesn't have enough mem */
3057 if (ret == -ENOMEM) 3126 if (ret == -ENOMEM)
@@ -4514,6 +4583,7 @@ static void mem_cgroup_clear_mc(void)
4514 mc.to = NULL; 4583 mc.to = NULL;
4515 mc.moving_task = NULL; 4584 mc.moving_task = NULL;
4516 spin_unlock(&mc.lock); 4585 spin_unlock(&mc.lock);
4586 mem_cgroup_end_move(from);
4517 memcg_oom_recover(from); 4587 memcg_oom_recover(from);
4518 memcg_oom_recover(to); 4588 memcg_oom_recover(to);
4519 wake_up_all(&mc.waitq); 4589 wake_up_all(&mc.waitq);
@@ -4544,6 +4614,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4544 VM_BUG_ON(mc.moved_charge); 4614 VM_BUG_ON(mc.moved_charge);
4545 VM_BUG_ON(mc.moved_swap); 4615 VM_BUG_ON(mc.moved_swap);
4546 VM_BUG_ON(mc.moving_task); 4616 VM_BUG_ON(mc.moving_task);
4617 mem_cgroup_start_move(from);
4547 spin_lock(&mc.lock); 4618 spin_lock(&mc.lock);
4548 mc.from = from; 4619 mc.from = from;
4549 mc.to = mem; 4620 mc.to = mem;