diff options
author | Johannes Weiner <hannes@cmpxchg.org> | 2014-05-06 15:50:05 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-05-06 16:04:59 -0400 |
commit | 139b6a6fb1539e04b01663d61baff3088c63dbb5 (patch) | |
tree | 299fc6452057660ce3626e9e8c00d675d6647423 /mm/filemap.c | |
parent | 49e068f0b73dd042c186ffa9b420a9943e90389a (diff) |
mm: filemap: update find_get_pages_tag() to deal with shadow entries
Dave Jones reports the following crash when find_get_pages_tag() runs
into an exceptional entry:
kernel BUG at mm/filemap.c:1347!
RIP: find_get_pages_tag+0x1cb/0x220
Call Trace:
find_get_pages_tag+0x36/0x220
pagevec_lookup_tag+0x21/0x30
filemap_fdatawait_range+0xbe/0x1e0
filemap_fdatawait+0x27/0x30
sync_inodes_sb+0x204/0x2a0
sync_inodes_one_sb+0x19/0x20
iterate_supers+0xb2/0x110
sys_sync+0x44/0xb0
ia32_do_call+0x13/0x13
1343 /*
1344 * This function is never used on a shmem/tmpfs
1345 * mapping, so a swap entry won't be found here.
1346 */
1347 BUG();
After commit 0cd6144aadd2 ("mm + fs: prepare for non-page entries in
page cache radix trees") this comment and BUG() are out of date because
exceptional entries can now appear in all mappings - as shadows of
recently evicted pages.
However, as Hugh Dickins notes,
"it is truly surprising for a PAGECACHE_TAG_WRITEBACK (and probably
any other PAGECACHE_TAG_*) to appear on an exceptional entry.
I expect it comes down to an occasional race in RCU lookup of the
radix_tree: lacking absolute synchronization, we might sometimes
catch an exceptional entry, with the tag which really belongs with
the unexceptional entry which was there an instant before."
And indeed, not only is the tree walk lockless, the tags are also read
in chunks, one radix tree node at a time. There is plenty of time for
page reclaim to swoop in and evict a page that was already looked up
as tagged, leaving a shadow entry in its place.
Remove the BUG() and update the comment. While reviewing all other
lookup sites for whether they properly deal with shadow entries of
evicted pages, update all the comments and fix memcg file charge moving
to not miss shmem/tmpfs swapcache pages.
Fixes: 0cd6144aadd2 ("mm + fs: prepare for non-page entries in page cache radix trees")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Dave Jones <davej@redhat.com>
Acked-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/filemap.c')
-rw-r--r-- | mm/filemap.c | 49 |
1 file changed, 28 insertions, 21 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 5020b280a771..000a220e2a41 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -906,8 +906,8 @@ EXPORT_SYMBOL(page_cache_prev_hole); | |||
906 | * Looks up the page cache slot at @mapping & @offset. If there is a | 906 | * Looks up the page cache slot at @mapping & @offset. If there is a |
907 | * page cache page, it is returned with an increased refcount. | 907 | * page cache page, it is returned with an increased refcount. |
908 | * | 908 | * |
909 | * If the slot holds a shadow entry of a previously evicted page, it | 909 | * If the slot holds a shadow entry of a previously evicted page, or a |
910 | * is returned. | 910 | * swap entry from shmem/tmpfs, it is returned. |
911 | * | 911 | * |
912 | * Otherwise, %NULL is returned. | 912 | * Otherwise, %NULL is returned. |
913 | */ | 913 | */ |
@@ -928,9 +928,9 @@ repeat: | |||
928 | if (radix_tree_deref_retry(page)) | 928 | if (radix_tree_deref_retry(page)) |
929 | goto repeat; | 929 | goto repeat; |
930 | /* | 930 | /* |
931 | * Otherwise, shmem/tmpfs must be storing a swap entry | 931 | * A shadow entry of a recently evicted page, |
932 | * here as an exceptional entry: so return it without | 932 | * or a swap entry from shmem/tmpfs. Return |
933 | * attempting to raise page count. | 933 | * it without attempting to raise page count. |
934 | */ | 934 | */ |
935 | goto out; | 935 | goto out; |
936 | } | 936 | } |
@@ -983,8 +983,8 @@ EXPORT_SYMBOL(find_get_page); | |||
983 | * page cache page, it is returned locked and with an increased | 983 | * page cache page, it is returned locked and with an increased |
984 | * refcount. | 984 | * refcount. |
985 | * | 985 | * |
986 | * If the slot holds a shadow entry of a previously evicted page, it | 986 | * If the slot holds a shadow entry of a previously evicted page, or a |
987 | * is returned. | 987 | * swap entry from shmem/tmpfs, it is returned. |
988 | * | 988 | * |
989 | * Otherwise, %NULL is returned. | 989 | * Otherwise, %NULL is returned. |
990 | * | 990 | * |
@@ -1099,8 +1099,8 @@ EXPORT_SYMBOL(find_or_create_page); | |||
1099 | * with ascending indexes. There may be holes in the indices due to | 1099 | * with ascending indexes. There may be holes in the indices due to |
1100 | * not-present pages. | 1100 | * not-present pages. |
1101 | * | 1101 | * |
1102 | * Any shadow entries of evicted pages are included in the returned | 1102 | * Any shadow entries of evicted pages, or swap entries from |
1103 | * array. | 1103 | * shmem/tmpfs, are included in the returned array. |
1104 | * | 1104 | * |
1105 | * find_get_entries() returns the number of pages and shadow entries | 1105 | * find_get_entries() returns the number of pages and shadow entries |
1106 | * which were found. | 1106 | * which were found. |
@@ -1128,9 +1128,9 @@ repeat: | |||
1128 | if (radix_tree_deref_retry(page)) | 1128 | if (radix_tree_deref_retry(page)) |
1129 | goto restart; | 1129 | goto restart; |
1130 | /* | 1130 | /* |
1131 | * Otherwise, we must be storing a swap entry | 1131 | * A shadow entry of a recently evicted page, |
1132 | * here as an exceptional entry: so return it | 1132 | * or a swap entry from shmem/tmpfs. Return |
1133 | * without attempting to raise page count. | 1133 | * it without attempting to raise page count. |
1134 | */ | 1134 | */ |
1135 | goto export; | 1135 | goto export; |
1136 | } | 1136 | } |
@@ -1198,9 +1198,9 @@ repeat: | |||
1198 | goto restart; | 1198 | goto restart; |
1199 | } | 1199 | } |
1200 | /* | 1200 | /* |
1201 | * Otherwise, shmem/tmpfs must be storing a swap entry | 1201 | * A shadow entry of a recently evicted page, |
1202 | * here as an exceptional entry: so skip over it - | 1202 | * or a swap entry from shmem/tmpfs. Skip |
1203 | * we only reach this from invalidate_mapping_pages(). | 1203 | * over it. |
1204 | */ | 1204 | */ |
1205 | continue; | 1205 | continue; |
1206 | } | 1206 | } |
@@ -1265,9 +1265,9 @@ repeat: | |||
1265 | goto restart; | 1265 | goto restart; |
1266 | } | 1266 | } |
1267 | /* | 1267 | /* |
1268 | * Otherwise, shmem/tmpfs must be storing a swap entry | 1268 | * A shadow entry of a recently evicted page, |
1269 | * here as an exceptional entry: so stop looking for | 1269 | * or a swap entry from shmem/tmpfs. Stop |
1270 | * contiguous pages. | 1270 | * looking for contiguous pages. |
1271 | */ | 1271 | */ |
1272 | break; | 1272 | break; |
1273 | } | 1273 | } |
@@ -1341,10 +1341,17 @@ repeat: | |||
1341 | goto restart; | 1341 | goto restart; |
1342 | } | 1342 | } |
1343 | /* | 1343 | /* |
1344 | * This function is never used on a shmem/tmpfs | 1344 | * A shadow entry of a recently evicted page. |
1345 | * mapping, so a swap entry won't be found here. | 1345 | * |
1346 | * Those entries should never be tagged, but | ||
1347 | * this tree walk is lockless and the tags are | ||
1348 | * looked up in bulk, one radix tree node at a | ||
1349 | * time, so there is a sizable window for page | ||
1350 | * reclaim to evict a page we saw tagged. | ||
1351 | * | ||
1352 | * Skip over it. | ||
1346 | */ | 1353 | */ |
1347 | BUG(); | 1354 | continue; |
1348 | } | 1355 | } |
1349 | 1356 | ||
1350 | if (!page_cache_get_speculative(page)) | 1357 | if (!page_cache_get_speculative(page)) |