author		Hugh Dickins <hughd@google.com>	2012-07-11 17:02:47 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-07-11 19:04:48 -0400
commit		d189922862e03ce6c7adc1e99d3b94e632dc8e89
tree		f3408f4208ae51eefac07292dfa559257658d2d8
parent		f21f8062201fc6361f65de92e758a76375ba8c59
shmem: fix negative rss in memcg memory.stat
When adding the page_private checks before calling shmem_replace_page(), I did realize that there is a further race, but thought it too unlikely to need a hurried fix.

But independently I've been chasing why a mem cgroup's memory.stat sometimes shows negative rss after all tasks have gone: I expected it to be a stats gathering bug, but actually it's shmem swapping's fault.

It's an old surprise, that when you lock_page(lookup_swap_cache(swap)), the page may have been removed from swapcache before getting the lock; or it may have been freed and reused and be back in swapcache; and it can even be using the same swap location as before (page_private same).

The swapoff case is already secure against this (swap cannot be reused until the whole area has been swapped off, and a new swapped on); and shmem_getpage_gfp() is protected by shmem_add_to_page_cache()'s check for the expected radix_tree entry - but a little too late.

By that time, we might have already decided to shmem_replace_page(): I don't know of a problem from that, but I'd feel more at ease not to do so spuriously. And we have already done mem_cgroup_cache_charge(), on perhaps the wrong mem cgroup: and this charge is not then undone on the error path, because PageSwapCache ends up preventing that.

It's this last case which causes the occasional negative rss in memory.stat: the page is charged here as cache, but (sometimes) found to be anon when eventually it's uncharged - and in between, it's an undeserved charge on the wrong memcg.

Fix this by adding an earlier check on the radix_tree entry: it's inelegant to descend the tree twice, but swapping is not the fast path, and a better solution would need a pair (try+commit) of memcg calls, and a rework of shmem_replace_page() to keep out of the swapcache.

We can use the added shmem_confirm_swap() function to replace the find_get_page+page_cache_release we were already doing on the error path. And add a comment on that -EEXIST: it seems a peculiar errno to be using, but originates from its use in radix_tree_insert().

[It can be surprising to see positive rss left in a memcg's memory.stat after all tasks have gone, since it is supposed to count anonymous but not shmem. Aside from sharing anon pages via fork with a task in some other memcg, it often happens after swapping: because a swap page can't be freed while under writeback, nor while locked. So it's not an error, and these residual pages are easily freed once pressure demands.]

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
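[Editor's note: the essence of the fix is a second, tree-level check after lock_page(). The sketch below condenses that pattern; shmem_swapin_check() is a hypothetical helper written for illustration, reusing the 3.5-era identifiers that appear in the diff below, and is not part of the patch.]

/*
 * Hypothetical helper condensing the check this patch adds. Called
 * with page returned by lookup_swap_cache(swap) and already locked
 * by lock_page(page).
 */
static int shmem_swapin_check(struct address_space *mapping, pgoff_t index,
			      swp_entry_t swap, struct page *page)
{
	/*
	 * Page-only checks do not suffice: between lookup_swap_cache()
	 * and lock_page() the page may have left swapcache, been freed,
	 * reused, and entered swapcache again - even at the same swap
	 * slot, so page_private(page) == swap.val can hold spuriously.
	 */
	if (!PageSwapCache(page) || page_private(page) != swap.val)
		return -EEXIST;

	/*
	 * So also confirm that mapping's radix_tree slot at index still
	 * holds the expected swap entry; only then is it safe to call
	 * mem_cgroup_cache_charge() and shmem_add_to_page_cache().
	 */
	if (!shmem_confirm_swap(mapping, index, swap))
		return -EEXIST;		/* caller unlocks and retries */

	return 0;
}

Returning -EEXIST for both failures matches the errno radix_tree_insert() gives for an occupied slot, so the caller's repeat: loop handles every "entry changed under us" case the same way.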
Diffstat (limited to 'mm/shmem.c')
-rw-r--r--	mm/shmem.c	41
1 file changed, 29 insertions(+), 12 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index 3f696f7d9bac..294364a24a1f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -264,6 +264,24 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
 }
 
 /*
+ * Sometimes, before we decide whether to proceed or to fail, we must check
+ * that an entry was not already brought back from swap by a racing thread.
+ *
+ * Checking page is not enough: by the time a SwapCache page is locked, it
+ * might be reused, and again be SwapCache, using the same swap as before.
+ */
+static bool shmem_confirm_swap(struct address_space *mapping,
+			       pgoff_t index, swp_entry_t swap)
+{
+	void *item;
+
+	rcu_read_lock();
+	item = radix_tree_lookup(&mapping->page_tree, index);
+	rcu_read_unlock();
+	return item == swp_to_radix_entry(swap);
+}
+
+/*
  * Like add_to_page_cache_locked, but error if expected item has gone.
  */
 static int shmem_add_to_page_cache(struct page *page,
@@ -1124,9 +1142,9 @@ repeat:
 		/* We have to do this with page locked to prevent races */
 		lock_page(page);
 		if (!PageSwapCache(page) || page_private(page) != swap.val ||
-		    page->mapping) {
+		    !shmem_confirm_swap(mapping, index, swap)) {
 			error = -EEXIST;	/* try again */
-			goto failed;
+			goto unlock;
 		}
 		if (!PageUptodate(page)) {
 			error = -EIO;
@@ -1142,9 +1160,12 @@ repeat:
 
 		error = mem_cgroup_cache_charge(page, current->mm,
 						gfp & GFP_RECLAIM_MASK);
-		if (!error)
+		if (!error) {
 			error = shmem_add_to_page_cache(page, mapping, index,
 						gfp, swp_to_radix_entry(swap));
+			/* We already confirmed swap, and make no allocation */
+			VM_BUG_ON(error);
+		}
 		if (error)
 			goto failed;
 
@@ -1245,14 +1266,10 @@ decused:
 unacct:
 	shmem_unacct_blocks(info->flags, 1);
 failed:
-	if (swap.val && error != -EINVAL) {
-		struct page *test = find_get_page(mapping, index);
-		if (test && !radix_tree_exceptional_entry(test))
-			page_cache_release(test);
-		/* Have another try if the entry has changed */
-		if (test != swp_to_radix_entry(swap))
-			error = -EEXIST;
-	}
+	if (swap.val && error != -EINVAL &&
+	    !shmem_confirm_swap(mapping, index, swap))
+		error = -EEXIST;
+unlock:
 	if (page) {
 		unlock_page(page);
 		page_cache_release(page);
@@ -1264,7 +1281,7 @@ failed:
 		spin_unlock(&info->lock);
 		goto repeat;
 	}
-	if (error == -EEXIST)
+	if (error == -EEXIST)	/* from above or from radix_tree_insert */
 		goto repeat;
 	return error;
 }