summaryrefslogtreecommitdiffstats
path: root/mm/swapfile.c
diff options
context:
space:
mode:
authorShaohua Li <shli@kernel.org>2013-09-11 17:20:32 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2013-09-11 18:57:17 -0400
commitebc2a1a69111eadfeda8487e577f1a5d42ef0dae (patch)
tree8a1d08bc6c0a1eb7e1bcd93056141614c22a7d40 /mm/swapfile.c
parentedfe23dac3e2981277087b05bec7fec7790d1835 (diff)
swap: make cluster allocation per-cpu
Swap cluster allocation exists to get better request merging and thereby improve performance. But the cluster is shared globally; if multiple tasks are doing swap, this will cause interleaved disk access. Multiple tasks swapping is quite common: for example, each NUMA node has a kswapd thread doing swap, and multiple threads/processes may be doing direct page reclaim. The I/O scheduler can't help much here, because tasks don't send swapout IO down to the block layer at the same time. The block layer does merge some IOs, but many are not merged, depending on how many tasks are doing swapout concurrently. In practice, I've seen a lot of small-size IO in swapout workloads. We make the cluster allocation per-cpu here. The interleaved disk access issue goes away. All tasks swap out to their own cluster, so swapout becomes sequential and can be easily merged into large IOs. If one CPU can't get its per-cpu cluster (for example, there is no free cluster anymore in the swap), it will fall back to scanning swap_map. The CPU can still continue to swap. We don't need to recycle free swap entries of other CPUs. In my test (swap to a 2-disk raid0 partition), this improves swapout throughput by around 10%, and request size increases significantly. How this impacts swap readahead is uncertain, though. On one hand, page reclaim always isolates and swaps several adjacent pages; this makes page reclaim write the pages sequentially and benefits readahead. On the other hand, several CPUs writing pages in an interleaved fashion means the pages don't live _sequentially_ but relatively _near_. In the per-cpu allocation case, if adjacent pages are written by different CPUs, they will live relatively _far_. So how this impacts swap readahead depends on how many pages page reclaim isolates and swaps at one time. If the number is big, this patch will benefit swap readahead. Of course, this is about the sequential access pattern. The patch has no impact on the random access pattern, because the new cluster allocation algorithm is just for SSD. 
An alternative solution is to organize the swap layout per-mm instead of using this per-cpu approach. In the per-mm layout, we allocate a disk range for each mm, so pages of one mm live adjacently on the swap disk. The per-mm layout has potential lock contention issues if multiple reclaimers are swapping pages from one mm. For a sequential workload, the per-mm layout is better for implementing swap readahead, because pages from the mm are adjacent on disk. But the per-cpu layout isn't very bad in this workload: as page reclaim always isolates and swaps several pages at one time, such pages will still live on disk sequentially and readahead can utilize this. For a random workload, the per-mm layout isn't beneficial for request merging, because it's quite possible that pages from different mms are swapped out at the same time and the IO can't be merged in the per-mm layout, while with the per-cpu layout we can merge requests from any mm. Considering that random workloads are more common among workloads that swap (and the per-cpu approach isn't too bad for sequential workloads either), I'm choosing the per-cpu layout. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Shaohua Li <shli@fusionio.com> Cc: Rik van Riel <riel@redhat.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Kyungmin Park <kmpark@infradead.org> Cc: Hugh Dickins <hughd@google.com> Cc: Rafael Aquini <aquini@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--mm/swapfile.c125
1 files changed, 91 insertions, 34 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 98e52e373bd8..3963fc24fcc1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -392,13 +392,78 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
392 * It's possible scan_swap_map() uses a free cluster in the middle of free 392 * It's possible scan_swap_map() uses a free cluster in the middle of free
393 * cluster list. Avoiding such abuse to avoid list corruption. 393 * cluster list. Avoiding such abuse to avoid list corruption.
394 */ 394 */
395static inline bool scan_swap_map_recheck_cluster(struct swap_info_struct *si, 395static bool
396scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
396 unsigned long offset) 397 unsigned long offset)
397{ 398{
399 struct percpu_cluster *percpu_cluster;
400 bool conflict;
401
398 offset /= SWAPFILE_CLUSTER; 402 offset /= SWAPFILE_CLUSTER;
399 return !cluster_is_null(&si->free_cluster_head) && 403 conflict = !cluster_is_null(&si->free_cluster_head) &&
400 offset != cluster_next(&si->free_cluster_head) && 404 offset != cluster_next(&si->free_cluster_head) &&
401 cluster_is_free(&si->cluster_info[offset]); 405 cluster_is_free(&si->cluster_info[offset]);
406
407 if (!conflict)
408 return false;
409
410 percpu_cluster = this_cpu_ptr(si->percpu_cluster);
411 cluster_set_null(&percpu_cluster->index);
412 return true;
413}
414
415/*
416 * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
417 * might involve allocating a new cluster for current CPU too.
418 */
419static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
420 unsigned long *offset, unsigned long *scan_base)
421{
422 struct percpu_cluster *cluster;
423 bool found_free;
424 unsigned long tmp;
425
426new_cluster:
427 cluster = this_cpu_ptr(si->percpu_cluster);
428 if (cluster_is_null(&cluster->index)) {
429 if (!cluster_is_null(&si->free_cluster_head)) {
430 cluster->index = si->free_cluster_head;
431 cluster->next = cluster_next(&cluster->index) *
432 SWAPFILE_CLUSTER;
433 } else if (!cluster_is_null(&si->discard_cluster_head)) {
434 /*
435 * we don't have free cluster but have some clusters in
436 * discarding, do discard now and reclaim them
437 */
438 swap_do_scheduled_discard(si);
439 *scan_base = *offset = si->cluster_next;
440 goto new_cluster;
441 } else
442 return;
443 }
444
445 found_free = false;
446
447 /*
448 * Other CPUs can use our cluster if they can't find a free cluster,
449 * check if there is still free entry in the cluster
450 */
451 tmp = cluster->next;
452 while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
453 SWAPFILE_CLUSTER) {
454 if (!si->swap_map[tmp]) {
455 found_free = true;
456 break;
457 }
458 tmp++;
459 }
460 if (!found_free) {
461 cluster_set_null(&cluster->index);
462 goto new_cluster;
463 }
464 cluster->next = tmp + 1;
465 *offset = tmp;
466 *scan_base = tmp;
402} 467}
403 468
404static unsigned long scan_swap_map(struct swap_info_struct *si, 469static unsigned long scan_swap_map(struct swap_info_struct *si,
@@ -423,41 +488,17 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
423 si->flags += SWP_SCANNING; 488 si->flags += SWP_SCANNING;
424 scan_base = offset = si->cluster_next; 489 scan_base = offset = si->cluster_next;
425 490
491 /* SSD algorithm */
492 if (si->cluster_info) {
493 scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
494 goto checks;
495 }
496
426 if (unlikely(!si->cluster_nr--)) { 497 if (unlikely(!si->cluster_nr--)) {
427 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { 498 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
428 si->cluster_nr = SWAPFILE_CLUSTER - 1; 499 si->cluster_nr = SWAPFILE_CLUSTER - 1;
429 goto checks; 500 goto checks;
430 } 501 }
431check_cluster:
432 if (!cluster_is_null(&si->free_cluster_head)) {
433 offset = cluster_next(&si->free_cluster_head) *
434 SWAPFILE_CLUSTER;
435 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
436 si->cluster_next = offset;
437 si->cluster_nr = SWAPFILE_CLUSTER - 1;
438 goto checks;
439 } else if (si->cluster_info) {
440 /*
441 * we don't have free cluster but have some clusters in
442 * discarding, do discard now and reclaim them
443 */
444 if (!cluster_is_null(&si->discard_cluster_head)) {
445 si->cluster_nr = 0;
446 swap_do_scheduled_discard(si);
447 scan_base = offset = si->cluster_next;
448 if (!si->cluster_nr)
449 goto check_cluster;
450 si->cluster_nr--;
451 goto checks;
452 }
453
454 /*
455 * Checking free cluster is fast enough, we can do the
456 * check every time
457 */
458 si->cluster_nr = 0;
459 goto checks;
460 }
461 502
462 spin_unlock(&si->lock); 503 spin_unlock(&si->lock);
463 504
@@ -516,8 +557,10 @@ check_cluster:
516 } 557 }
517 558
518checks: 559checks:
519 if (scan_swap_map_recheck_cluster(si, offset)) 560 if (si->cluster_info) {
520 goto check_cluster; 561 while (scan_swap_map_ssd_cluster_conflict(si, offset))
562 scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
563 }
521 if (!(si->flags & SWP_WRITEOK)) 564 if (!(si->flags & SWP_WRITEOK))
522 goto no_page; 565 goto no_page;
523 if (!si->highest_bit) 566 if (!si->highest_bit)
@@ -1884,6 +1927,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1884 spin_unlock(&swap_lock); 1927 spin_unlock(&swap_lock);
1885 frontswap_invalidate_area(type); 1928 frontswap_invalidate_area(type);
1886 mutex_unlock(&swapon_mutex); 1929 mutex_unlock(&swapon_mutex);
1930 free_percpu(p->percpu_cluster);
1931 p->percpu_cluster = NULL;
1887 vfree(swap_map); 1932 vfree(swap_map);
1888 vfree(cluster_info); 1933 vfree(cluster_info);
1889 vfree(frontswap_map); 1934 vfree(frontswap_map);
@@ -2403,6 +2448,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2403 error = -ENOMEM; 2448 error = -ENOMEM;
2404 goto bad_swap; 2449 goto bad_swap;
2405 } 2450 }
2451 p->percpu_cluster = alloc_percpu(struct percpu_cluster);
2452 if (!p->percpu_cluster) {
2453 error = -ENOMEM;
2454 goto bad_swap;
2455 }
2456 for_each_possible_cpu(i) {
2457 struct percpu_cluster *cluster;
2458 cluster = per_cpu_ptr(p->percpu_cluster, i);
2459 cluster_set_null(&cluster->index);
2460 }
2406 } 2461 }
2407 2462
2408 error = swap_cgroup_swapon(p->type, maxpages); 2463 error = swap_cgroup_swapon(p->type, maxpages);
@@ -2475,6 +2530,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2475 error = 0; 2530 error = 0;
2476 goto out; 2531 goto out;
2477bad_swap: 2532bad_swap:
2533 free_percpu(p->percpu_cluster);
2534 p->percpu_cluster = NULL;
2478 if (inode && S_ISBLK(inode->i_mode) && p->bdev) { 2535 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2479 set_blocksize(p->bdev, p->old_block_size); 2536 set_blocksize(p->bdev, p->old_block_size);
2480 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 2537 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);