aboutsummaryrefslogtreecommitdiffstats
path: root/mm/swapfile.c
diff options
context:
space:
mode:
authorShaohua Li <shli@kernel.org>2013-09-11 17:20:31 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2013-09-11 18:57:16 -0400
commitedfe23dac3e2981277087b05bec7fec7790d1835 (patch)
treeae5b30cc84893e97f3a68f21612cb9288755555e /mm/swapfile.c
parent815c2c543d3aeb914a361f981440ece552778724 (diff)
swap: fix races exposed by swap discard
The previous patch can expose races, according to Hugh: swapoff was sometimes failing with "Cannot allocate memory", coming from try_to_unuse()'s -ENOMEM: it needs to allow for swap_duplicate() failing on a free entry temporarily SWAP_MAP_BAD while being discarded. We should use ACCESS_ONCE() there, and whenever accessing swap_map locklessly; but rather than peppering it throughout try_to_unuse(), just declare *swap_map with volatile. try_to_unuse() is accustomed to *swap_map going down racily, but not necessarily to it jumping up from 0 to SWAP_MAP_BAD: we'll be safer to prevent that transition once SWP_WRITEOK is switched off, when it's a waste of time to issue discards anyway (swapon can do a whole discard). Another issue is: In swapin_readahead(), read_swap_cache_async() can read a bad swap entry, because we don't check if readahead swap entry is bad. This doesn't break anything but such swapin page is wasteful and can only be freed at page reclaim. We should avoid read such swap entry. And in discard, we mark swap entry SWAP_MAP_BAD and then switch it to normal when discard is finished. If readahead reads such swap entry, we have the same issue, so we much check if swap entry is bad too. Thanks Hugh to inspire swapin_readahead could use bad swap entry. [include Hugh's patch 'swap: fix swapoff ENOMEMs from discard'] Signed-off-by: Shaohua Li <shli@fusionio.com> Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Kyungmin Park <kmpark@infradead.org> Cc: Rafael Aquini <aquini@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--mm/swapfile.c31
1 files changed, 27 insertions, 4 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index dac47c66055c..98e52e373bd8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -370,7 +370,8 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
370 * instead of free it immediately. The cluster will be freed 370 * instead of free it immediately. The cluster will be freed
371 * after discard. 371 * after discard.
372 */ 372 */
373 if (p->flags & SWP_PAGE_DISCARD) { 373 if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
374 (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
374 swap_cluster_schedule_discard(p, idx); 375 swap_cluster_schedule_discard(p, idx);
375 return; 376 return;
376 } 377 }
@@ -1288,7 +1289,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1288 else 1289 else
1289 continue; 1290 continue;
1290 } 1291 }
1291 count = si->swap_map[i]; 1292 count = ACCESS_ONCE(si->swap_map[i]);
1292 if (count && swap_count(count) != SWAP_MAP_BAD) 1293 if (count && swap_count(count) != SWAP_MAP_BAD)
1293 break; 1294 break;
1294 } 1295 }
@@ -1308,7 +1309,11 @@ int try_to_unuse(unsigned int type, bool frontswap,
1308{ 1309{
1309 struct swap_info_struct *si = swap_info[type]; 1310 struct swap_info_struct *si = swap_info[type];
1310 struct mm_struct *start_mm; 1311 struct mm_struct *start_mm;
1311 unsigned char *swap_map; 1312 volatile unsigned char *swap_map; /* swap_map is accessed without
1313 * locking. Mark it as volatile
1314 * to prevent compiler doing
1315 * something odd.
1316 */
1312 unsigned char swcount; 1317 unsigned char swcount;
1313 struct page *page; 1318 struct page *page;
1314 swp_entry_t entry; 1319 swp_entry_t entry;
@@ -1359,7 +1364,15 @@ int try_to_unuse(unsigned int type, bool frontswap,
1359 * reused since sys_swapoff() already disabled 1364 * reused since sys_swapoff() already disabled
1360 * allocation from here, or alloc_page() failed. 1365 * allocation from here, or alloc_page() failed.
1361 */ 1366 */
1362 if (!*swap_map) 1367 swcount = *swap_map;
1368 /*
1369 * We don't hold lock here, so the swap entry could be
1370 * SWAP_MAP_BAD (when the cluster is discarding).
1371 * Instead of fail out, We can just skip the swap
1372 * entry because swapoff will wait for discarding
1373 * finish anyway.
1374 */
1375 if (!swcount || swcount == SWAP_MAP_BAD)
1363 continue; 1376 continue;
1364 retval = -ENOMEM; 1377 retval = -ENOMEM;
1365 break; 1378 break;
@@ -2543,6 +2556,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2543 goto unlock_out; 2556 goto unlock_out;
2544 2557
2545 count = p->swap_map[offset]; 2558 count = p->swap_map[offset];
2559
2560 /*
2561 * swapin_readahead() doesn't check if a swap entry is valid, so the
2562 * swap entry could be SWAP_MAP_BAD. Check here with lock held.
2563 */
2564 if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
2565 err = -ENOENT;
2566 goto unlock_out;
2567 }
2568
2546 has_cache = count & SWAP_HAS_CACHE; 2569 has_cache = count & SWAP_HAS_CACHE;
2547 count &= ~SWAP_HAS_CACHE; 2570 count &= ~SWAP_HAS_CACHE;
2548 err = 0; 2571 err = 0;