author		Johannes Weiner <hannes@cmpxchg.org>	2014-10-29 17:50:48 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-10-29 19:33:15 -0400
commit		d7365e783edb858279be1d03f61bc8d5d3383d90 (patch)
tree		ca8c1aea5763cace1eb63022cfea83c480eef487 /mm/memcontrol.c
parent		3a3c02ecf7f2852f122d6d16fb9b3d9cb0c6f201 (diff)
mm: memcontrol: fix missed end-writeback page accounting
Commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed
page migration to uncharge the old page right away.  The page is
locked, unmapped, truncated, and off the LRU, but it could race with
writeback ending, which then doesn't unaccount the page properly:

test_clear_page_writeback()              migration
                                           wait_on_page_writeback()
  TestClearPageWriteback()
                                           mem_cgroup_migrate()
                                             clear PCG_USED
  mem_cgroup_update_page_stat()
    if (PageCgroupUsed(pc))
      decrease memcg pages under writeback

  release pc->mem_cgroup->move_lock

The per-page statistics interface is heavily optimized to avoid a
function call and a lookup_page_cgroup() in the file unmap fast path,
which means it doesn't verify whether a page is still charged before
clearing PageWriteback() and it has to do it in the stat update later.

Rework it so that it looks up the page's memcg once at the beginning
of the transaction and then uses it throughout.  The charge will be
verified before clearing PageWriteback() and migration can't uncharge
the page as long as that is still set.  The RCU lock will protect the
memcg past uncharge.

As far as losing the optimization goes, the following test results
are from a microbenchmark that maps, faults, and unmaps a 4GB sparse
file three times in a nested fashion, so that there are two negative
passes that don't account but still go through the new transaction
overhead.  There is no actual difference:

old:     33.195102545 seconds time elapsed       ( +-  0.01% )
new:     33.199231369 seconds time elapsed       ( +-  0.03% )

The time spent in page_remove_rmap()'s callees still adds up to the
same, but the time spent in the function itself seems reduced:

    # Children      Self  Command        Shared Object       Symbol
old:     0.12%     0.11%  filemapstress  [kernel.kallsyms]  [k] page_remove_rmap
new:     0.12%     0.08%  filemapstress  [kernel.kallsyms]  [k] page_remove_rmap

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: <stable@vger.kernel.org>	[3.17.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
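To illustrate the reworked interface, here is a hedged sketch of a stat-update
site under the new API, modeled on the end-writeback path the changelog
describes.  The companion change to mm/page-writeback.c is outside this
memcontrol.c-limited view; the real test_clear_page_writeback() also handles
radix-tree tags and bdi/zone counters (omitted here), and the
MEM_CGROUP_STAT_WRITEBACK index is assumed from the same-era
enum mem_cgroup_stat_index:

	/*
	 * Sketch only: abridged end-writeback caller under the new API.
	 * The memcg is looked up before PageWriteback is cleared, so
	 * migration, which waits on writeback, cannot uncharge the page
	 * in the middle of the accounting transaction.
	 */
	int test_clear_page_writeback(struct page *page)
	{
		struct mem_cgroup *memcg;
		unsigned long flags;
		bool locked;
		int ret;

		memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
		ret = TestClearPageWriteback(page);
		if (ret)
			mem_cgroup_update_page_stat(memcg,
						    MEM_CGROUP_STAT_WRITEBACK, -1);
		mem_cgroup_end_page_stat(memcg, locked, flags);
		return ret;
	}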
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c | 105
1 file changed, 57 insertions(+), 48 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 23976fd885fd..d6ac0e33e150 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1536,12 +1536,8 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
  * start move here.
  */
 
-/* for quick checking without looking up memcg */
-atomic_t memcg_moving __read_mostly;
-
 static void mem_cgroup_start_move(struct mem_cgroup *memcg)
 {
-	atomic_inc(&memcg_moving);
 	atomic_inc(&memcg->moving_account);
 	synchronize_rcu();
 }
@@ -1552,10 +1548,8 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
 	 * Now, mem_cgroup_clear_mc() may call this function with NULL.
 	 * We check NULL in callee rather than caller.
 	 */
-	if (memcg) {
-		atomic_dec(&memcg_moving);
+	if (memcg)
 		atomic_dec(&memcg->moving_account);
-	}
 }
 
 /*
@@ -2204,41 +2198,52 @@ cleanup:
 	return true;
 }
 
-/*
- * Used to update mapped file or writeback or other statistics.
+/**
+ * mem_cgroup_begin_page_stat - begin a page state statistics transaction
+ * @page: page that is going to change accounted state
+ * @locked: &memcg->move_lock slowpath was taken
+ * @flags: IRQ-state flags for &memcg->move_lock
  *
- * Notes: Race condition
+ * This function must mark the beginning of an accounted page state
+ * change to prevent double accounting when the page is concurrently
+ * being moved to another memcg:
  *
- * Charging occurs during page instantiation, while the page is
- * unmapped and locked in page migration, or while the page table is
- * locked in THP migration.  No race is possible.
+ *   memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+ *   if (TestClearPageState(page))
+ *     mem_cgroup_update_page_stat(memcg, state, -1);
+ *   mem_cgroup_end_page_stat(memcg, locked, flags);
  *
- * Uncharge happens to pages with zero references, no race possible.
+ * The RCU lock is held throughout the transaction.  The fast path can
+ * get away without acquiring the memcg->move_lock (@locked is false)
+ * because page moving starts with an RCU grace period.
  *
- * Charge moving between groups is protected by checking mm->moving
- * account and taking the move_lock in the slowpath.
+ * The RCU lock also protects the memcg from being freed when the page
+ * state that is going to change is the only thing preventing the page
+ * from being uncharged.  E.g. end-writeback clearing PageWriteback(),
+ * which allows migration to go ahead and uncharge the page before the
+ * account transaction might be complete.
  */
-
-void __mem_cgroup_begin_update_page_stat(struct page *page,
-				bool *locked, unsigned long *flags)
+struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
+					      bool *locked,
+					      unsigned long *flags)
 {
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc;
 
+	rcu_read_lock();
+
+	if (mem_cgroup_disabled())
+		return NULL;
+
 	pc = lookup_page_cgroup(page);
 again:
 	memcg = pc->mem_cgroup;
 	if (unlikely(!memcg || !PageCgroupUsed(pc)))
-		return;
-	/*
-	 * If this memory cgroup is not under account moving, we don't
-	 * need to take move_lock_mem_cgroup(). Because we already hold
-	 * rcu_read_lock(), any calls to move_account will be delayed until
-	 * rcu_read_unlock().
-	 */
-	VM_BUG_ON(!rcu_read_lock_held());
+		return NULL;
+
+	*locked = false;
 	if (atomic_read(&memcg->moving_account) <= 0)
-		return;
+		return memcg;
 
 	move_lock_mem_cgroup(memcg, flags);
 	if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
@@ -2246,36 +2251,40 @@ again:
 		goto again;
 	}
 	*locked = true;
+
+	return memcg;
 }
 
-void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
+/**
+ * mem_cgroup_end_page_stat - finish a page state statistics transaction
+ * @memcg: the memcg that was accounted against
+ * @locked: value received from mem_cgroup_begin_page_stat()
+ * @flags: value received from mem_cgroup_begin_page_stat()
+ */
+void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
+			      unsigned long flags)
 {
-	struct page_cgroup *pc = lookup_page_cgroup(page);
+	if (memcg && locked)
+		move_unlock_mem_cgroup(memcg, &flags);
 
-	/*
-	 * It's guaranteed that pc->mem_cgroup never changes while
-	 * lock is held because a routine modifies pc->mem_cgroup
-	 * should take move_lock_mem_cgroup().
-	 */
-	move_unlock_mem_cgroup(pc->mem_cgroup, flags);
+	rcu_read_unlock();
 }
 
-void mem_cgroup_update_page_stat(struct page *page,
+/**
+ * mem_cgroup_update_page_stat - update page state statistics
+ * @memcg: memcg to account against
+ * @idx: page state item to account
+ * @val: number of pages (positive or negative)
+ *
+ * See mem_cgroup_begin_page_stat() for locking requirements.
+ */
+void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
 				 enum mem_cgroup_stat_index idx, int val)
 {
-	struct mem_cgroup *memcg;
-	struct page_cgroup *pc = lookup_page_cgroup(page);
-	unsigned long uninitialized_var(flags);
-
-	if (mem_cgroup_disabled())
-		return;
-
 	VM_BUG_ON(!rcu_read_lock_held());
-	memcg = pc->mem_cgroup;
-	if (unlikely(!memcg || !PageCgroupUsed(pc)))
-		return;
 
-	this_cpu_add(memcg->stat->count[idx], val);
+	if (memcg)
+		this_cpu_add(memcg->stat->count[idx], val);
 }
 
 /*
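Note that the inline helpers in include/linux/memcontrol.h presumably change
in lockstep with the new mem_cgroup_update_page_stat() signature; they are not
part of this memcontrol.c-limited view, but a sketch consistent with the diff
above would be:

	/* Sketch: header-side helpers now take the memcg returned by
	 * mem_cgroup_begin_page_stat() instead of re-deriving it from
	 * the page inside the stat update. */
	static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
						    enum mem_cgroup_stat_index idx)
	{
		mem_cgroup_update_page_stat(memcg, idx, 1);
	}

	static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg,
						    enum mem_cgroup_stat_index idx)
	{
		mem_cgroup_update_page_stat(memcg, idx, -1);
	}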