author     KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>   2012-03-21 19:34:25 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>       2012-03-21 20:55:01 -0400
commit     89c06bd52fb9ffceddf84f7309d2e8c9f1666216 (patch)
tree       43ec3d97a89988bc143bed5796bcd7bef64212dc /mm/memcontrol.c
parent     312734c04e2fecc58429aec98194e4ff12d8f7d6 (diff)
memcg: use new logic for page stat accounting
Now, per-memcg page stats are recorded in a per-page_cgroup flag by
duplicating the page's status into that flag. The reason is that memcg
has a feature to move a page from one group to another, so we have a
race between "move" and "page stat accounting".

Under the current logic, assume CPU-A and CPU-B. CPU-A does "move" and
CPU-B does "page stat accounting".
When CPU-A goes 1st,

             CPU-A                           CPU-B
                                     update "struct page" info.
     move_lock_mem_cgroup(memcg)
     see pc->flags
     copy page stat to new group
     overwrite pc->mem_cgroup.
     move_unlock_mem_cgroup(memcg)
                                     move_lock_mem_cgroup(mem)
                                     set pc->flags
                                     update page stat accounting
                                     move_unlock_mem_cgroup(mem)
Stat accounting is guarded by move_lock_mem_cgroup(), and the "move"
logic (CPU-A) doesn't see the changes in "struct page" information.
But it's costly to have the same information both in 'struct page' and
'struct page_cgroup'. And there is a potential problem.

For example, assume we have PG_dirty accounting in memcg. PG_... is a
flag for struct page, and PCG_... is a flag for struct page_cgroup.
(This is just an example; the same problem can be found in any kind of
page stat accounting.)
             CPU-A                           CPU-B
      TestSet PG_dirty
      (delay)                        TestClear PG_dirty
                                     if (TestClear(PCG_dirty))
                                         memcg->nr_dirty--
      if (TestSet(PCG_dirty))
          memcg->nr_dirty++
Here, memcg->nr_dirty ends up at +1, which is wrong. This race was
reported by Greg Thelen <gthelen@google.com>. For now, only FILE_MAPPED
is supported, but fortunately it's serialized by the page table lock,
so this is not a real bug, _now_.
If this potential problem is caused by having duplicated information in
struct page and struct page_cgroup, we may be able to fix it by using
only the original 'struct page' information. But then we have a problem
in "move account":
Assume we use only PG_dirty.

             CPU-A                           CPU-B
      TestSet PG_dirty
      (delay)                        move_lock_mem_cgroup()
                                     if (PageDirty(page))
                                             new_memcg->nr_dirty++
                                     pc->mem_cgroup = new_memcg;
                                     move_unlock_mem_cgroup()
      move_lock_mem_cgroup()
      memcg = pc->mem_cgroup
      new_memcg->nr_dirty++
Accounting information may be double-counted. This was the original
reason to have the PCG_xxx flags, but it seems PCG_xxx has another
problem.
I think we need a bigger lock, as:

      move_lock_mem_cgroup(page)
      TestSetPageDirty(page)
      update page stats (without any checks)
      move_unlock_mem_cgroup(page)
This fixes both problems, and we don't have to duplicate the page flag
into page_cgroup. Please note: move_lock_mem_cgroup() is held only when
there is a possibility of "account move" on the system. So, on most
paths, the status update will go without atomic locks.
This patch introduces mem_cgroup_begin_update_page_stat() and
mem_cgroup_end_update_page_stat(); both should be called when modifying
'struct page' information, if memcg takes care of it, as:

      mem_cgroup_begin_update_page_stat()
      modify page information
      mem_cgroup_update_page_stat()
         => never check any 'struct page' info, just update counters.
      mem_cgroup_end_update_page_stat()
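
As a concrete illustration, here is a sketch of a converted call site,
modeled on the page_add_file_rmap() conversion done elsewhere in this
series (that conversion is not part of the mm/memcontrol.c diff below,
so take it as an example of the pattern rather than as part of this
patch):

      /*
       * Sketch of a call site using the begin/end pair. 'locked' and
       * 'flags' are out-parameters filled in by the begin helper;
       * FILE_MAPPED is the only stat converted so far.
       */
      void page_add_file_rmap(struct page *page)
      {
              bool locked;
              unsigned long flags;

              mem_cgroup_begin_update_page_stat(page, &locked, &flags);
              if (atomic_inc_and_test(&page->_mapcount))
                      mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
              mem_cgroup_end_update_page_stat(page, &locked, &flags);
      }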
This patch is slow because we need to call begin_update_page_stat()/
end_update_page_stat() regardless of whether the accounted state will be
changed or not. A following patch adds an easy optimization and reduces
the cost.
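
(For reference: mem_cgroup_begin/end_update_page_stat() are thin inline
wrappers in include/linux/memcontrol.h, outside the mm/memcontrol.c diff
shown below; the __-prefixed slow paths added here run under the
wrappers' rcu_read_lock(). Roughly, per this patch, they look like the
sketch below, which is where the unconditional rcu_read_lock()/unlock()
cost mentioned above comes from.)

      /*
       * Rough shape of the inline wrappers added in
       * include/linux/memcontrol.h (not shown in the diff below):
       * every caller pays rcu_read_lock()/unlock(); the move_lock is
       * taken in the __-prefixed slow path only while an account move
       * is possible.
       */
      static inline void mem_cgroup_begin_update_page_stat(struct page *page,
                                      bool *locked, unsigned long *flags)
      {
              if (mem_cgroup_disabled())
                      return;
              rcu_read_lock();
              *locked = false;
              __mem_cgroup_begin_update_page_stat(page, locked, flags);
      }

      static inline void mem_cgroup_end_update_page_stat(struct page *page,
                                      bool *locked, unsigned long *flags)
      {
              if (mem_cgroup_disabled())
                      return;
              if (*locked)
                      __mem_cgroup_end_update_page_stat(page, flags);
              rcu_read_unlock();
      }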
[akpm@linux-foundation.org: s/lock/locked/]
[hughd@google.com: fix deadlock by avoiding stat lock when anon]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--      mm/memcontrol.c      62
1 file changed, 42 insertions(+), 20 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8afed2819b8f..df1e180f6c30 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1910,32 +1910,59 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
  * If there is, we take a lock.
  */
 
+void __mem_cgroup_begin_update_page_stat(struct page *page,
+                               bool *locked, unsigned long *flags)
+{
+       struct mem_cgroup *memcg;
+       struct page_cgroup *pc;
+
+       pc = lookup_page_cgroup(page);
+again:
+       memcg = pc->mem_cgroup;
+       if (unlikely(!memcg || !PageCgroupUsed(pc)))
+               return;
+       /*
+        * If this memory cgroup is not under account moving, we don't
+        * need to take move_lock_page_cgroup(). Because we already hold
+        * rcu_read_lock(), any calls to move_account will be delayed until
+        * rcu_read_unlock() if mem_cgroup_stealed() == true.
+        */
+       if (!mem_cgroup_stealed(memcg))
+               return;
+
+       move_lock_mem_cgroup(memcg, flags);
+       if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
+               move_unlock_mem_cgroup(memcg, flags);
+               goto again;
+       }
+       *locked = true;
+}
+
+void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
+{
+       struct page_cgroup *pc = lookup_page_cgroup(page);
+
+       /*
+        * It's guaranteed that pc->mem_cgroup never changes while
+        * lock is held because a routine modifies pc->mem_cgroup
+        * should take move_lock_page_cgroup().
+        */
+       move_unlock_mem_cgroup(pc->mem_cgroup, flags);
+}
+
 void mem_cgroup_update_page_stat(struct page *page,
                                 enum mem_cgroup_page_stat_item idx, int val)
 {
        struct mem_cgroup *memcg;
        struct page_cgroup *pc = lookup_page_cgroup(page);
-       bool need_unlock = false;
        unsigned long uninitialized_var(flags);
 
        if (mem_cgroup_disabled())
                return;
-again:
-       rcu_read_lock();
+
        memcg = pc->mem_cgroup;
        if (unlikely(!memcg || !PageCgroupUsed(pc)))
-               goto out;
-       /* pc->mem_cgroup is unstable ? */
-       if (unlikely(mem_cgroup_stealed(memcg))) {
-               /* take a lock against to access pc->mem_cgroup */
-               move_lock_mem_cgroup(memcg, &flags);
-               if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
-                       move_unlock_mem_cgroup(memcg, &flags);
-                       rcu_read_unlock();
-                       goto again;
-               }
-               need_unlock = true;
-       }
+               return;
 
        switch (idx) {
        case MEMCG_NR_FILE_MAPPED:
@@ -1950,11 +1977,6 @@ again:
        }
 
        this_cpu_add(memcg->stat->count[idx], val);
-
-out:
-       if (unlikely(need_unlock))
-               move_unlock_mem_cgroup(memcg, &flags);
-       rcu_read_unlock();
 }
 
 /*