diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2010-10-27 18:33:40 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-27 21:03:09 -0400 |
commit | 32047e2a85f06633ee4c53e2d0346fbcd34e480b (patch) | |
tree | 25ed1e04bf60a46951581b0ad28a45e51b1602a2 | |
parent | 0c270f8f9988fb0d93ea214fdcff7ab90eb3d894 (diff) |
memcg: avoid lock in updating file_mapped (Was fix race in file_mapped accouting flag management
At accounting file events per memory cgroup, we need to find memory cgroup
via page_cgroup->mem_cgroup. Now, we use lock_page_cgroup() for guarantee
pc->mem_cgroup is not overwritten while we make use of it.
But, considering the context which page-cgroup for files are accessed,
we can use alternative light-weight mutual execusion in the most case.
At handling file-caches, the only race we have to take care of is "moving"
account, IOW, overwriting page_cgroup->mem_cgroup. (See comment in the
patch)
Unlike charge/uncharge, "move" happens not so frequently. It happens only when
rmdir() and task-moving (with a special settings.)
This patch adds a race-checker for file-cache-status accounting v.s. account
moving. The new per-cpu-per-memcg counter MEM_CGROUP_ON_MOVE is added.
The routine for account move
1. Increment it before start moving
2. Call synchronize_rcu()
3. Decrement it after the end of moving.
By this, file-status-counting routine can check it needs to call
lock_page_cgroup(). In most case, I doesn't need to call it.
Following is a perf data of a process which mmap()/munmap 32MB of file cache
in a minute.
Before patch:
28.25% mmap mmap [.] main
22.64% mmap [kernel.kallsyms] [k] page_fault
9.96% mmap [kernel.kallsyms] [k] mem_cgroup_update_file_mapped
3.67% mmap [kernel.kallsyms] [k] filemap_fault
3.50% mmap [kernel.kallsyms] [k] unmap_vmas
2.99% mmap [kernel.kallsyms] [k] __do_fault
2.76% mmap [kernel.kallsyms] [k] find_get_page
After patch:
30.00% mmap mmap [.] main
23.78% mmap [kernel.kallsyms] [k] page_fault
5.52% mmap [kernel.kallsyms] [k] mem_cgroup_update_file_mapped
3.81% mmap [kernel.kallsyms] [k] unmap_vmas
3.26% mmap [kernel.kallsyms] [k] find_get_page
3.18% mmap [kernel.kallsyms] [k] __do_fault
3.03% mmap [kernel.kallsyms] [k] filemap_fault
2.40% mmap [kernel.kallsyms] [k] handle_mm_fault
2.40% mmap [kernel.kallsyms] [k] do_page_fault
This patch reduces memcg's cost to some extent.
(mem_cgroup_update_file_mapped is called by both of map/unmap)
Note: It seems some more improvements are required..but no idea.
maybe removing set/unset flag is required.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | mm/memcontrol.c | 99 |
1 files changed, 85 insertions, 14 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0e3fdbd809c7..dd845d25827a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -90,6 +90,7 @@ enum mem_cgroup_stat_index { | |||
90 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 90 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
91 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 91 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ |
92 | MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ | 92 | MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ |
93 | MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ | ||
93 | 94 | ||
94 | MEM_CGROUP_STAT_NSTATS, | 95 | MEM_CGROUP_STAT_NSTATS, |
95 | }; | 96 | }; |
@@ -1051,7 +1052,46 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg) | |||
1051 | return swappiness; | 1052 | return swappiness; |
1052 | } | 1053 | } |
1053 | 1054 | ||
1054 | /* A routine for testing mem is not under move_account */ | 1055 | static void mem_cgroup_start_move(struct mem_cgroup *mem) |
1056 | { | ||
1057 | int cpu; | ||
1058 | /* Because this is for moving account, reuse mc.lock */ | ||
1059 | spin_lock(&mc.lock); | ||
1060 | for_each_possible_cpu(cpu) | ||
1061 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; | ||
1062 | spin_unlock(&mc.lock); | ||
1063 | |||
1064 | synchronize_rcu(); | ||
1065 | } | ||
1066 | |||
1067 | static void mem_cgroup_end_move(struct mem_cgroup *mem) | ||
1068 | { | ||
1069 | int cpu; | ||
1070 | |||
1071 | if (!mem) | ||
1072 | return; | ||
1073 | spin_lock(&mc.lock); | ||
1074 | for_each_possible_cpu(cpu) | ||
1075 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; | ||
1076 | spin_unlock(&mc.lock); | ||
1077 | } | ||
1078 | /* | ||
1079 | * 2 routines for checking "mem" is under move_account() or not. | ||
1080 | * | ||
1081 | * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used | ||
1082 | * for avoiding race in accounting. If true, | ||
1083 | * pc->mem_cgroup may be overwritten. | ||
1084 | * | ||
1085 | * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or | ||
1086 | * under hierarchy of moving cgroups. This is for | ||
1087 | * waiting at hith-memory prressure caused by "move". | ||
1088 | */ | ||
1089 | |||
1090 | static bool mem_cgroup_stealed(struct mem_cgroup *mem) | ||
1091 | { | ||
1092 | VM_BUG_ON(!rcu_read_lock_held()); | ||
1093 | return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; | ||
1094 | } | ||
1055 | 1095 | ||
1056 | static bool mem_cgroup_under_move(struct mem_cgroup *mem) | 1096 | static bool mem_cgroup_under_move(struct mem_cgroup *mem) |
1057 | { | 1097 | { |
@@ -1462,35 +1502,62 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
1462 | /* | 1502 | /* |
1463 | * Currently used to update mapped file statistics, but the routine can be | 1503 | * Currently used to update mapped file statistics, but the routine can be |
1464 | * generalized to update other statistics as well. | 1504 | * generalized to update other statistics as well. |
1505 | * | ||
1506 | * Notes: Race condition | ||
1507 | * | ||
1508 | * We usually use page_cgroup_lock() for accessing page_cgroup member but | ||
1509 | * it tends to be costly. But considering some conditions, we doesn't need | ||
1510 | * to do so _always_. | ||
1511 | * | ||
1512 | * Considering "charge", lock_page_cgroup() is not required because all | ||
1513 | * file-stat operations happen after a page is attached to radix-tree. There | ||
1514 | * are no race with "charge". | ||
1515 | * | ||
1516 | * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup | ||
1517 | * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even | ||
1518 | * if there are race with "uncharge". Statistics itself is properly handled | ||
1519 | * by flags. | ||
1520 | * | ||
1521 | * Considering "move", this is an only case we see a race. To make the race | ||
1522 | * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are | ||
1523 | * possibility of race condition. If there is, we take a lock. | ||
1465 | */ | 1524 | */ |
1466 | void mem_cgroup_update_file_mapped(struct page *page, int val) | 1525 | void mem_cgroup_update_file_mapped(struct page *page, int val) |
1467 | { | 1526 | { |
1468 | struct mem_cgroup *mem; | 1527 | struct mem_cgroup *mem; |
1469 | struct page_cgroup *pc; | 1528 | struct page_cgroup *pc = lookup_page_cgroup(page); |
1529 | bool need_unlock = false; | ||
1470 | 1530 | ||
1471 | pc = lookup_page_cgroup(page); | ||
1472 | if (unlikely(!pc)) | 1531 | if (unlikely(!pc)) |
1473 | return; | 1532 | return; |
1474 | 1533 | ||
1475 | lock_page_cgroup(pc); | 1534 | rcu_read_lock(); |
1476 | mem = pc->mem_cgroup; | 1535 | mem = pc->mem_cgroup; |
1477 | if (!mem || !PageCgroupUsed(pc)) | 1536 | if (unlikely(!mem || !PageCgroupUsed(pc))) |
1478 | goto done; | 1537 | goto out; |
1479 | 1538 | /* pc->mem_cgroup is unstable ? */ | |
1480 | /* | 1539 | if (unlikely(mem_cgroup_stealed(mem))) { |
1481 | * Preemption is already disabled. We can use __this_cpu_xxx | 1540 | /* take a lock against to access pc->mem_cgroup */ |
1482 | */ | 1541 | lock_page_cgroup(pc); |
1542 | need_unlock = true; | ||
1543 | mem = pc->mem_cgroup; | ||
1544 | if (!mem || !PageCgroupUsed(pc)) | ||
1545 | goto out; | ||
1546 | } | ||
1483 | if (val > 0) { | 1547 | if (val > 0) { |
1484 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 1548 | this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
1485 | SetPageCgroupFileMapped(pc); | 1549 | SetPageCgroupFileMapped(pc); |
1486 | } else { | 1550 | } else { |
1487 | __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 1551 | this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
1488 | if (!page_mapped(page)) /* for race between dec->inc counter */ | 1552 | if (!page_mapped(page)) /* for race between dec->inc counter */ |
1489 | ClearPageCgroupFileMapped(pc); | 1553 | ClearPageCgroupFileMapped(pc); |
1490 | } | 1554 | } |
1491 | 1555 | ||
1492 | done: | 1556 | out: |
1493 | unlock_page_cgroup(pc); | 1557 | if (unlikely(need_unlock)) |
1558 | unlock_page_cgroup(pc); | ||
1559 | rcu_read_unlock(); | ||
1560 | return; | ||
1494 | } | 1561 | } |
1495 | 1562 | ||
1496 | /* | 1563 | /* |
@@ -3039,6 +3106,7 @@ move_account: | |||
3039 | lru_add_drain_all(); | 3106 | lru_add_drain_all(); |
3040 | drain_all_stock_sync(); | 3107 | drain_all_stock_sync(); |
3041 | ret = 0; | 3108 | ret = 0; |
3109 | mem_cgroup_start_move(mem); | ||
3042 | for_each_node_state(node, N_HIGH_MEMORY) { | 3110 | for_each_node_state(node, N_HIGH_MEMORY) { |
3043 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 3111 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
3044 | enum lru_list l; | 3112 | enum lru_list l; |
@@ -3052,6 +3120,7 @@ move_account: | |||
3052 | if (ret) | 3120 | if (ret) |
3053 | break; | 3121 | break; |
3054 | } | 3122 | } |
3123 | mem_cgroup_end_move(mem); | ||
3055 | memcg_oom_recover(mem); | 3124 | memcg_oom_recover(mem); |
3056 | /* it seems parent cgroup doesn't have enough mem */ | 3125 | /* it seems parent cgroup doesn't have enough mem */ |
3057 | if (ret == -ENOMEM) | 3126 | if (ret == -ENOMEM) |
@@ -4514,6 +4583,7 @@ static void mem_cgroup_clear_mc(void) | |||
4514 | mc.to = NULL; | 4583 | mc.to = NULL; |
4515 | mc.moving_task = NULL; | 4584 | mc.moving_task = NULL; |
4516 | spin_unlock(&mc.lock); | 4585 | spin_unlock(&mc.lock); |
4586 | mem_cgroup_end_move(from); | ||
4517 | memcg_oom_recover(from); | 4587 | memcg_oom_recover(from); |
4518 | memcg_oom_recover(to); | 4588 | memcg_oom_recover(to); |
4519 | wake_up_all(&mc.waitq); | 4589 | wake_up_all(&mc.waitq); |
@@ -4544,6 +4614,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | |||
4544 | VM_BUG_ON(mc.moved_charge); | 4614 | VM_BUG_ON(mc.moved_charge); |
4545 | VM_BUG_ON(mc.moved_swap); | 4615 | VM_BUG_ON(mc.moved_swap); |
4546 | VM_BUG_ON(mc.moving_task); | 4616 | VM_BUG_ON(mc.moving_task); |
4617 | mem_cgroup_start_move(from); | ||
4547 | spin_lock(&mc.lock); | 4618 | spin_lock(&mc.lock); |
4548 | mc.from = from; | 4619 | mc.from = from; |
4549 | mc.to = mem; | 4620 | mc.to = mem; |