author	Hugh Dickins <hughd@google.com>	2012-07-11 17:02:47 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-07-11 19:04:48 -0400
commit	d189922862e03ce6c7adc1e99d3b94e632dc8e89 (patch)
tree	f3408f4208ae51eefac07292dfa559257658d2d8 /mm/shmem.c
parent	f21f8062201fc6361f65de92e758a76375ba8c59 (diff)
shmem: fix negative rss in memcg memory.stat
When adding the page_private checks before calling shmem_replace_page(), I
did realize that there is a further race, but thought it too unlikely to
need a hurried fix.
But independently I've been chasing why a mem cgroup's memory.stat
sometimes shows negative rss after all tasks have gone: I expected it to
be a stats gathering bug, but actually it's shmem swapping's fault.
It's an old surprise, that when you lock_page(lookup_swap_cache(swap)),
the page may have been removed from swapcache before getting the lock; or
it may have been freed and reused and be back in swapcache; and it can
even be using the same swap location as before (page_private same).
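To make the window concrete, here is a rough sketch of the swap-in sequence
and how it can be fooled (illustrative only, not the literal
shmem_getpage_gfp() code; the interleaving shown for the other task is
hypothetical):

	page = lookup_swap_cache(swap);	/* page found in swapcache */
					/* meanwhile, another task may: */
					/*   delete it from swapcache, */
					/*   free and reuse it for new data, */
					/*   swap that out to the same slot */
	lock_page(page);
	if (PageSwapCache(page) && page_private(page) == swap.val)
		/* every page-based check passes, yet the page no longer
		 * holds the data that belongs at our index */

Only looking the index up again in the mapping's radix_tree, as the
shmem_confirm_swap() helper added below does, can tell whether this slot
still expects that swap entry.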
The swapoff case is already secure against this (swap cannot be reused
until the whole area has been swapped off, and a new one swapped on); and
shmem_getpage_gfp() is protected by shmem_add_to_page_cache()'s check for
the expected radix_tree entry - but a little too late.
By that time, we might have already decided to shmem_replace_page(): I
don't know of a problem from that, but I'd feel more at ease not to do so
spuriously. And we have already done mem_cgroup_cache_charge(), on
perhaps the wrong mem cgroup: and this charge is not then undone on the
error path, because PageSwapCache ends up preventing that.
It's this last case which causes the occasional negative rss in
memory.stat: the page is charged here as cache, but (sometimes) found to
be anon when eventually it's uncharged - and in between, it's an
undeserved charge on the wrong memcg.
Fix this by adding an earlier check on the radix_tree entry: it's
inelegant to descend the tree twice, but swapping is not the fast path; a
better solution would need a pair (try+commit) of memcg calls, and a
rework of shmem_replace_page() to keep out of the swapcache.
We can use the added shmem_confirm_swap() function to replace the
find_get_page+page_cache_release we were already doing on the error path.
And add a comment on that -EEXIST: it seems a peculiar errno to be using,
but originates from its use in radix_tree_insert().
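In outline, the swap-in path then reads as follows (a condensed paraphrase
of the hunks below, not a literal copy of them):

	lock_page(page);
	if (!PageSwapCache(page) || page_private(page) != swap.val ||
	    !shmem_confirm_swap(mapping, index, swap)) {
		error = -EEXIST;	/* raced: try again */
		goto unlock;
	}
	...
	error = mem_cgroup_cache_charge(page, current->mm,
					gfp & GFP_RECLAIM_MASK);
	if (!error) {
		error = shmem_add_to_page_cache(page, mapping, index,
					gfp, swp_to_radix_entry(swap));
		/* entry already confirmed, no allocation: cannot fail */
		VM_BUG_ON(error);
	}

so the radix_tree entry is confirmed under the page lock before the memcg
charge, instead of being found to have changed only afterwards.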
[It can be surprising to see positive rss left in a memcg's memory.stat
after all tasks have gone, since it is supposed to count anonymous but not
shmem. Aside from sharing anon pages via fork with a task in some other
memcg, it often happens after swapping: because a swap page can't be freed
while under writeback, nor while locked. So it's not an error, and these
residual pages are easily freed once pressure demands.]
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/shmem.c')
-rw-r--r--	mm/shmem.c	41
1 file changed, 29 insertions, 12 deletions
diff --git a/mm/shmem.c b/mm/shmem.c
index 3f696f7d9bac..294364a24a1f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -264,6 +264,24 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
 }
 
 /*
+ * Sometimes, before we decide whether to proceed or to fail, we must check
+ * that an entry was not already brought back from swap by a racing thread.
+ *
+ * Checking page is not enough: by the time a SwapCache page is locked, it
+ * might be reused, and again be SwapCache, using the same swap as before.
+ */
+static bool shmem_confirm_swap(struct address_space *mapping,
+			       pgoff_t index, swp_entry_t swap)
+{
+	void *item;
+
+	rcu_read_lock();
+	item = radix_tree_lookup(&mapping->page_tree, index);
+	rcu_read_unlock();
+	return item == swp_to_radix_entry(swap);
+}
+
+/*
  * Like add_to_page_cache_locked, but error if expected item has gone.
  */
 static int shmem_add_to_page_cache(struct page *page,
@@ -1124,9 +1142,9 @@ repeat:
 		/* We have to do this with page locked to prevent races */
 		lock_page(page);
 		if (!PageSwapCache(page) || page_private(page) != swap.val ||
-		    page->mapping) {
+		    !shmem_confirm_swap(mapping, index, swap)) {
 			error = -EEXIST;	/* try again */
-			goto failed;
+			goto unlock;
 		}
 		if (!PageUptodate(page)) {
 			error = -EIO;
@@ -1142,9 +1160,12 @@ repeat:
 
 		error = mem_cgroup_cache_charge(page, current->mm,
 						gfp & GFP_RECLAIM_MASK);
-		if (!error)
+		if (!error) {
 			error = shmem_add_to_page_cache(page, mapping, index,
 						gfp, swp_to_radix_entry(swap));
+			/* We already confirmed swap, and make no allocation */
+			VM_BUG_ON(error);
+		}
 		if (error)
 			goto failed;
 
@@ -1245,14 +1266,10 @@ decused:
 unacct:
 	shmem_unacct_blocks(info->flags, 1);
 failed:
-	if (swap.val && error != -EINVAL) {
-		struct page *test = find_get_page(mapping, index);
-		if (test && !radix_tree_exceptional_entry(test))
-			page_cache_release(test);
-		/* Have another try if the entry has changed */
-		if (test != swp_to_radix_entry(swap))
-			error = -EEXIST;
-	}
+	if (swap.val && error != -EINVAL &&
+	    !shmem_confirm_swap(mapping, index, swap))
+		error = -EEXIST;
+unlock:
 	if (page) {
 		unlock_page(page);
 		page_cache_release(page);
@@ -1264,7 +1281,7 @@ failed:
 		spin_unlock(&info->lock);
 		goto repeat;
 	}
-	if (error == -EEXIST)
+	if (error == -EEXIST)	/* from above or from radix_tree_insert */
 		goto repeat;
 	return error;
 }