about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- include/linux/swap.h 11
-rw-r--r-- mm/swapfile.c 125
2 files changed, 102 insertions, 34 deletions
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8a3c4a1caa14..24db9142e93b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -199,6 +199,16 @@ struct swap_cluster_info {
199#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ 199#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
200 200
201/* 201/*
202 * We assign a cluster to each CPU, so each CPU can allocate swap entry from
203 * its own cluster and swapout sequentially. The purpose is to optimize swapout
204 * throughput.
205 */
206struct percpu_cluster {
207 struct swap_cluster_info index; /* Current cluster index */
208 unsigned int next; /* Likely next allocation offset */
209};
210
211/*
202 * The in-memory structure used to track swap areas. 212 * The in-memory structure used to track swap areas.
203 */ 213 */
204struct swap_info_struct { 214struct swap_info_struct {
@@ -217,6 +227,7 @@ struct swap_info_struct {
217 unsigned int inuse_pages; /* number of those currently in use */ 227 unsigned int inuse_pages; /* number of those currently in use */
218 unsigned int cluster_next; /* likely index for next allocation */ 228 unsigned int cluster_next; /* likely index for next allocation */
219 unsigned int cluster_nr; /* countdown to next cluster search */ 229 unsigned int cluster_nr; /* countdown to next cluster search */
230 struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
220 struct swap_extent *curr_swap_extent; 231 struct swap_extent *curr_swap_extent;
221 struct swap_extent first_swap_extent; 232 struct swap_extent first_swap_extent;
222 struct block_device *bdev; /* swap device or bdev of swap file */ 233 struct block_device *bdev; /* swap device or bdev of swap file */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 98e52e373bd8..3963fc24fcc1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -392,13 +392,78 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
392 * It's possible scan_swap_map() uses a free cluster in the middle of free 392 * It's possible scan_swap_map() uses a free cluster in the middle of free
393 * cluster list. Avoiding such abuse to avoid list corruption. 393 * cluster list. Avoiding such abuse to avoid list corruption.
394 */ 394 */
395static inline bool scan_swap_map_recheck_cluster(struct swap_info_struct *si, 395static bool
396scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
396 unsigned long offset) 397 unsigned long offset)
397{ 398{
399 struct percpu_cluster *percpu_cluster;
400 bool conflict;
401
398 offset /= SWAPFILE_CLUSTER; 402 offset /= SWAPFILE_CLUSTER;
399 return !cluster_is_null(&si->free_cluster_head) && 403 conflict = !cluster_is_null(&si->free_cluster_head) &&
400 offset != cluster_next(&si->free_cluster_head) && 404 offset != cluster_next(&si->free_cluster_head) &&
401 cluster_is_free(&si->cluster_info[offset]); 405 cluster_is_free(&si->cluster_info[offset]);
406
407 if (!conflict)
408 return false;
409
410 percpu_cluster = this_cpu_ptr(si->percpu_cluster);
411 cluster_set_null(&percpu_cluster->index);
412 return true;
413}
414
415/*
416 * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
417 * might involve allocating a new cluster for current CPU too.
418 */
419static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
420 unsigned long *offset, unsigned long *scan_base)
421{
422 struct percpu_cluster *cluster;
423 bool found_free;
424 unsigned long tmp;
425
426new_cluster:
427 cluster = this_cpu_ptr(si->percpu_cluster);
428 if (cluster_is_null(&cluster->index)) {
429 if (!cluster_is_null(&si->free_cluster_head)) {
430 cluster->index = si->free_cluster_head;
431 cluster->next = cluster_next(&cluster->index) *
432 SWAPFILE_CLUSTER;
433 } else if (!cluster_is_null(&si->discard_cluster_head)) {
434 /*
435 * we don't have free cluster but have some clusters in
436 * discarding, do discard now and reclaim them
437 */
438 swap_do_scheduled_discard(si);
439 *scan_base = *offset = si->cluster_next;
440 goto new_cluster;
441 } else
442 return;
443 }
444
445 found_free = false;
446
447 /*
448 * Other CPUs can use our cluster if they can't find a free cluster,
449 * check if there is still free entry in the cluster
450 */
451 tmp = cluster->next;
452 while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
453 SWAPFILE_CLUSTER) {
454 if (!si->swap_map[tmp]) {
455 found_free = true;
456 break;
457 }
458 tmp++;
459 }
460 if (!found_free) {
461 cluster_set_null(&cluster->index);
462 goto new_cluster;
463 }
464 cluster->next = tmp + 1;
465 *offset = tmp;
466 *scan_base = tmp;
402} 467}
403 468
404static unsigned long scan_swap_map(struct swap_info_struct *si, 469static unsigned long scan_swap_map(struct swap_info_struct *si,
@@ -423,41 +488,17 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
423 si->flags += SWP_SCANNING; 488 si->flags += SWP_SCANNING;
424 scan_base = offset = si->cluster_next; 489 scan_base = offset = si->cluster_next;
425 490
491 /* SSD algorithm */
492 if (si->cluster_info) {
493 scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
494 goto checks;
495 }
496
426 if (unlikely(!si->cluster_nr--)) { 497 if (unlikely(!si->cluster_nr--)) {
427 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { 498 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
428 si->cluster_nr = SWAPFILE_CLUSTER - 1; 499 si->cluster_nr = SWAPFILE_CLUSTER - 1;
429 goto checks; 500 goto checks;
430 } 501 }
431check_cluster:
432 if (!cluster_is_null(&si->free_cluster_head)) {
433 offset = cluster_next(&si->free_cluster_head) *
434 SWAPFILE_CLUSTER;
435 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
436 si->cluster_next = offset;
437 si->cluster_nr = SWAPFILE_CLUSTER - 1;
438 goto checks;
439 } else if (si->cluster_info) {
440 /*
441 * we don't have free cluster but have some clusters in
442 * discarding, do discard now and reclaim them
443 */
444 if (!cluster_is_null(&si->discard_cluster_head)) {
445 si->cluster_nr = 0;
446 swap_do_scheduled_discard(si);
447 scan_base = offset = si->cluster_next;
448 if (!si->cluster_nr)
449 goto check_cluster;
450 si->cluster_nr--;
451 goto checks;
452 }
453
454 /*
455 * Checking free cluster is fast enough, we can do the
456 * check every time
457 */
458 si->cluster_nr = 0;
459 goto checks;
460 }
461 502
462 spin_unlock(&si->lock); 503 spin_unlock(&si->lock);
463 504
@@ -516,8 +557,10 @@ check_cluster:
516 } 557 }
517 558
518checks: 559checks:
519 if (scan_swap_map_recheck_cluster(si, offset)) 560 if (si->cluster_info) {
520 goto check_cluster; 561 while (scan_swap_map_ssd_cluster_conflict(si, offset))
562 scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
563 }
521 if (!(si->flags & SWP_WRITEOK)) 564 if (!(si->flags & SWP_WRITEOK))
522 goto no_page; 565 goto no_page;
523 if (!si->highest_bit) 566 if (!si->highest_bit)
@@ -1884,6 +1927,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1884 spin_unlock(&swap_lock); 1927 spin_unlock(&swap_lock);
1885 frontswap_invalidate_area(type); 1928 frontswap_invalidate_area(type);
1886 mutex_unlock(&swapon_mutex); 1929 mutex_unlock(&swapon_mutex);
1930 free_percpu(p->percpu_cluster);
1931 p->percpu_cluster = NULL;
1887 vfree(swap_map); 1932 vfree(swap_map);
1888 vfree(cluster_info); 1933 vfree(cluster_info);
1889 vfree(frontswap_map); 1934 vfree(frontswap_map);
@@ -2403,6 +2448,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2403 error = -ENOMEM; 2448 error = -ENOMEM;
2404 goto bad_swap; 2449 goto bad_swap;
2405 } 2450 }
2451 p->percpu_cluster = alloc_percpu(struct percpu_cluster);
2452 if (!p->percpu_cluster) {
2453 error = -ENOMEM;
2454 goto bad_swap;
2455 }
2456 for_each_possible_cpu(i) {
2457 struct percpu_cluster *cluster;
2458 cluster = per_cpu_ptr(p->percpu_cluster, i);
2459 cluster_set_null(&cluster->index);
2460 }
2406 } 2461 }
2407 2462
2408 error = swap_cgroup_swapon(p->type, maxpages); 2463 error = swap_cgroup_swapon(p->type, maxpages);
@@ -2475,6 +2530,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2475 error = 0; 2530 error = 0;
2476 goto out; 2531 goto out;
2477bad_swap: 2532bad_swap:
2533 free_percpu(p->percpu_cluster);
2534 p->percpu_cluster = NULL;
2478 if (inode && S_ISBLK(inode->i_mode) && p->bdev) { 2535 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2479 set_blocksize(p->bdev, p->old_block_size); 2536 set_blocksize(p->bdev, p->old_block_size);
2480 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 2537 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);