author     Huang Ying <ying.huang@intel.com>               2019-07-11 23:55:33 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-07-12 14:05:43 -0400
commit     eb085574a7526c4375965c5fbf7e5b0c19cdd336
tree       d650ed2fa646d7ba36d665c4b809c2f7db088b49 /mm/swapfile.c
parent     a4985833885b8f568bab90d5dc1886ae68dc82cf
mm, swap: fix race between swapoff and some swap operations
When swapin is performed, after getting the swap entry information from
the page table, the system will swap in the swap entry without any lock
held to prevent the swap device from being swapoff.  This may cause a
race like the one below:

CPU 1				CPU 2
-----				-----
do_swap_page
  swapin_readahead
    __read_swap_cache_async
				swapoff
      swapcache_prepare		  p->swap_map = NULL
	__swap_duplicate
	  p->swap_map[?] /* !!! NULL pointer access */

Because swapoff is usually done only at system shutdown, the race may
not hit many people in practice.  But it is still a race that needs to
be fixed.

To fix the race, get_swap_device() is added to check whether the
specified swap entry is valid in its swap device.  If so, it will keep
the swap entry valid by preventing the swap device from being swapoff,
until put_swap_device() is called.  Because swapoff() is a very rare
code path, to make the normal paths run as fast as possible,
rcu_read_lock/unlock() and synchronize_rcu() are used instead of a
reference count to implement get/put_swap_device().  From
get_swap_device() to put_swap_device(), the RCU reader side is locked,
so synchronize_rcu() in swapoff() will wait until put_swap_device() is
called.

In addition to the swap_map, cluster_info, etc. data structures in
struct swap_info_struct, the swap cache radix tree will be freed after
swapoff, so this patch fixes the race between swap cache lookup and
swapoff too.  Races between some other swap cache usages and swapoff
are fixed as well, by calling synchronize_rcu() between clearing
PageSwapCache() and freeing the swap cache data structure.

Another possible method to fix this is to use preempt_off() +
stop_machine() to prevent the swap device from being swapoff while its
data structures are being accessed.  The overhead in the hot path of
both methods is similar.  The advantages of the RCU-based method are:

1. stop_machine() may disturb the normal execution code path on other
   CPUs, while RCU read-side critical sections do not.

2. File cache uses RCU to protect its radix tree.  If a similar
   mechanism is used for the swap cache too, it is easier to share
   code between them.

3. RCU is already used to protect the swap cache in
   total_swapcache_pages() and exit_swap_address_space().  The two
   mechanisms can be merged to simplify the logic.

Link: http://lkml.kernel.org/r/20190522015423.14418-1-ying.huang@intel.com
Fixes: 235b62176712 ("mm/swap: add cluster lock")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Reviewed-by: Andrea Parri <andrea.parri@amarulasolutions.com>
Not-nacked-by: Hugh Dickins <hughd@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
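The mm/swapfile.c diff below shows only the get_swap_device() side; the
matching put_swap_device() is added outside this file by the full patch
and, per the description above, amounts to exiting the RCU read-side
critical section.  As a hedged sketch (not the literal patch code,
though it mirrors the new __swap_count() in the diff), the caller
pattern the patch establishes is:

	struct swap_info_struct *si;
	int count = 0;

	si = get_swap_device(entry);	/* enters RCU read-side section */
	if (si) {
		/*
		 * Until put_swap_device(), swapoff() cannot free
		 * si->swap_map etc.: its synchronize_rcu() waits for
		 * this RCU read-side critical section to end.
		 */
		count = swap_count(si->swap_map[swp_offset(entry)]);
		put_swap_device(si);	/* exits RCU read-side section */
	}

Callers that must also rule out a concurrent swapoff+swapon need an
extra check afterwards; for example, do_swap_page() re-verifies the PTE
under the page table lock, as the comment added in the diff explains.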
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--  mm/swapfile.c  154
1 file changed, 122 insertions(+), 32 deletions(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 596ac98051c5..dbab16ddefa6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1079,12 +1079,11 @@ fail:
 static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
 {
 	struct swap_info_struct *p;
-	unsigned long offset, type;
+	unsigned long offset;
 
 	if (!entry.val)
 		goto out;
-	type = swp_type(entry);
-	p = swap_type_to_swap_info(type);
+	p = swp_swap_info(entry);
 	if (!p)
 		goto bad_nofile;
 	if (!(p->flags & SWP_USED))
@@ -1187,6 +1186,69 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
 	return usage;
 }
 
+/*
+ * Check whether swap entry is valid in the swap device.  If so,
+ * return pointer to swap_info_struct, and keep the swap entry valid
+ * via preventing the swap device from being swapoff, until
+ * put_swap_device() is called.  Otherwise return NULL.
+ *
+ * The entirety of the RCU read critical section must come before the
+ * return from or after the call to synchronize_rcu() in
+ * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
+ * true, then si->swap_map, si->cluster_info, etc. must be valid in the
+ * critical section.
+ *
+ * Notice that swapoff or swapoff+swapon can still happen before the
+ * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
+ * in put_swap_device() if there isn't any other way to prevent
+ * swapoff, such as page lock, page table lock, etc.  The caller must
+ * be prepared for that.  For example, the following situation is
+ * possible.
+ *
+ *   CPU1				CPU2
+ *   do_swap_page()
+ *     ...				swapoff+swapon
+ *     __read_swap_cache_async()
+ *       swapcache_prepare()
+ *         __swap_duplicate()
+ *           // check swap_map
+ *     // verify PTE not changed
+ *
+ * In __swap_duplicate(), the swap_map needs to be checked before
+ * changing, partly because the specified swap entry may be for another
+ * swap device which has been swapoff.  And in do_swap_page(), after
+ * the page is read from the swap device, the PTE is verified to be
+ * unchanged with the page table locked, to check whether the swap
+ * device has been swapoff or swapoff+swapon.
+ */
+struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+	struct swap_info_struct *si;
+	unsigned long offset;
+
+	if (!entry.val)
+		goto out;
+	si = swp_swap_info(entry);
+	if (!si)
+		goto bad_nofile;
+
+	rcu_read_lock();
+	if (!(si->flags & SWP_VALID))
+		goto unlock_out;
+	offset = swp_offset(entry);
+	if (offset >= si->max)
+		goto unlock_out;
+
+	return si;
+bad_nofile:
+	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
+out:
+	return NULL;
+unlock_out:
+	rcu_read_unlock();
+	return NULL;
+}
+
 static unsigned char __swap_entry_free(struct swap_info_struct *p,
 				       swp_entry_t entry, unsigned char usage)
 {
@@ -1358,11 +1420,18 @@ int page_swapcount(struct page *page)
 	return count;
 }
 
-int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
+int __swap_count(swp_entry_t entry)
 {
+	struct swap_info_struct *si;
 	pgoff_t offset = swp_offset(entry);
+	int count = 0;
 
-	return swap_count(si->swap_map[offset]);
+	si = get_swap_device(entry);
+	if (si) {
+		count = swap_count(si->swap_map[offset]);
+		put_swap_device(si);
+	}
+	return count;
 }
 
 static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
@@ -1387,9 +1456,11 @@ int __swp_swapcount(swp_entry_t entry)
 	int count = 0;
 	struct swap_info_struct *si;
 
-	si = __swap_info_get(entry);
-	if (si)
+	si = get_swap_device(entry);
+	if (si) {
 		count = swap_swapcount(si, entry);
+		put_swap_device(si);
+	}
 	return count;
 }
 
@@ -2335,9 +2406,9 @@ static int swap_node(struct swap_info_struct *p)
 	return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
 }
 
-static void _enable_swap_info(struct swap_info_struct *p, int prio,
-				unsigned char *swap_map,
-				struct swap_cluster_info *cluster_info)
+static void setup_swap_info(struct swap_info_struct *p, int prio,
+			    unsigned char *swap_map,
+			    struct swap_cluster_info *cluster_info)
 {
 	int i;
 
@@ -2362,7 +2433,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
 	}
 	p->swap_map = swap_map;
 	p->cluster_info = cluster_info;
-	p->flags |= SWP_WRITEOK;
+}
+
+static void _enable_swap_info(struct swap_info_struct *p)
+{
+	p->flags |= SWP_WRITEOK | SWP_VALID;
 	atomic_long_add(p->pages, &nr_swap_pages);
 	total_swap_pages += p->pages;
 
@@ -2389,7 +2464,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
 	frontswap_init(p->type, frontswap_map);
 	spin_lock(&swap_lock);
 	spin_lock(&p->lock);
-	_enable_swap_info(p, prio, swap_map, cluster_info);
+	setup_swap_info(p, prio, swap_map, cluster_info);
+	spin_unlock(&p->lock);
+	spin_unlock(&swap_lock);
+	/*
+	 * Guarantee swap_map, cluster_info, etc. fields are valid
+	 * between get/put_swap_device() if SWP_VALID bit is set
+	 */
+	synchronize_rcu();
+	spin_lock(&swap_lock);
+	spin_lock(&p->lock);
+	_enable_swap_info(p);
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
 }
@@ -2398,7 +2483,8 @@ static void reinsert_swap_info(struct swap_info_struct *p)
 {
 	spin_lock(&swap_lock);
 	spin_lock(&p->lock);
-	_enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+	setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+	_enable_swap_info(p);
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
 }
@@ -2501,6 +2587,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 
 	reenable_swap_slots_cache_unlock();
 
+	spin_lock(&swap_lock);
+	spin_lock(&p->lock);
+	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
+	spin_unlock(&p->lock);
+	spin_unlock(&swap_lock);
+	/*
+	 * wait for swap operations protected by get/put_swap_device()
+	 * to complete
+	 */
+	synchronize_rcu();
+
 	flush_work(&p->discard_work);
 
 	destroy_swap_extents(p);
@@ -3265,17 +3362,11 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 	unsigned char has_cache;
 	int err = -EINVAL;
 
-	if (non_swap_entry(entry))
-		goto out;
-
-	p = swp_swap_info(entry);
+	p = get_swap_device(entry);
 	if (!p)
-		goto bad_file;
-
-	offset = swp_offset(entry);
-	if (unlikely(offset >= p->max))
 		goto out;
 
+	offset = swp_offset(entry);
 	ci = lock_cluster_or_swap_info(p, offset);
 
 	count = p->swap_map[offset];
@@ -3321,11 +3412,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 unlock_out:
 	unlock_cluster_or_swap_info(p, ci);
 out:
+	if (p)
+		put_swap_device(p);
 	return err;
-
-bad_file:
-	pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
-	goto out;
 }
 
 /*
@@ -3417,6 +3506,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	struct page *list_page;
 	pgoff_t offset;
 	unsigned char count;
+	int ret = 0;
 
 	/*
 	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
@@ -3424,15 +3514,15 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	 */
 	page = alloc_page(gfp_mask | __GFP_HIGHMEM);
 
-	si = swap_info_get(entry);
+	si = get_swap_device(entry);
 	if (!si) {
 		/*
 		 * An acceptable race has occurred since the failing
-		 * __swap_duplicate(): the swap entry has been freed,
-		 * perhaps even the whole swap_map cleared for swapoff.
+		 * __swap_duplicate(): the swap device may be swapoff
 		 */
 		goto outer;
 	}
+	spin_lock(&si->lock);
 
 	offset = swp_offset(entry);
 
@@ -3450,9 +3540,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	}
 
 	if (!page) {
-		unlock_cluster(ci);
-		spin_unlock(&si->lock);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 
 	/*
@@ -3504,10 +3593,11 @@ out_unlock_cont:
 out:
 	unlock_cluster(ci);
 	spin_unlock(&si->lock);
+	put_swap_device(si);
 outer:
 	if (page)
 		__free_page(page);
-	return 0;
+	return ret;
 }
 
 /*