 include/linux/swap.h |  11 ++++
 mm/swapfile.c        | 125 ++++++++++++++++++++++++++++++++-------------
 2 files changed, 102 insertions(+), 34 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8a3c4a1caa14..24db9142e93b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -199,6 +199,16 @@ struct swap_cluster_info {
 #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
 
 /*
+ * We assign a cluster to each CPU, so each CPU can allocate swap entry from
+ * its own cluster and swapout sequentially. The purpose is to optimize swapout
+ * throughput.
+ */
+struct percpu_cluster {
+	struct swap_cluster_info index; /* Current cluster index */
+	unsigned int next; /* Likely next allocation offset */
+};
+
+/*
  * The in-memory structure used to track swap areas.
  */
 struct swap_info_struct {
@@ -217,6 +227,7 @@ struct swap_info_struct {
 	unsigned int inuse_pages;	/* number of those currently in use */
 	unsigned int cluster_next;	/* likely index for next allocation */
 	unsigned int cluster_nr;	/* countdown to next cluster search */
+	struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
 	struct swap_extent *curr_swap_extent;
 	struct swap_extent first_swap_extent;
 	struct block_device *bdev;	/* swap device or bdev of swap file */
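The patch is built on the kernel's dynamic percpu allocator: alloc_percpu() returns one instance of the type per possible CPU, per_cpu_ptr() addresses a given CPU's copy, and this_cpu_ptr() the running CPU's. A minimal sketch of that lifecycle, with hypothetical identifiers (my_state, my_setup, and so on) rather than code from this patch:

	#include <linux/percpu.h>

	struct my_state {
		unsigned int value;
	};

	static struct my_state __percpu *state;

	static int my_setup(void)
	{
		int cpu;

		state = alloc_percpu(struct my_state);	/* one copy per CPU */
		if (!state)
			return -ENOMEM;
		for_each_possible_cpu(cpu)
			per_cpu_ptr(state, cpu)->value = 0;	/* init every copy */
		return 0;
	}

	static void my_touch(void)
	{
		/* Resolves to the running CPU's private copy; the caller must
		 * keep the task from migrating (swapfile.c holds si->lock, a
		 * spinlock, around its this_cpu_ptr() uses). */
		this_cpu_ptr(state)->value++;
	}

	static void my_teardown(void)
	{
		free_percpu(state);
		state = NULL;
	}

swapon()'s for_each_possible_cpu() initialization and swapoff()'s free_percpu()/NULL pair later in this patch follow exactly this shape.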
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 98e52e373bd8..3963fc24fcc1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -392,13 +392,78 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
  * It's possible scan_swap_map() uses a free cluster in the middle of free
  * cluster list. Avoiding such abuse to avoid list corruption.
  */
-static inline bool scan_swap_map_recheck_cluster(struct swap_info_struct *si,
+static bool
+scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
 	unsigned long offset)
 {
+	struct percpu_cluster *percpu_cluster;
+	bool conflict;
+
 	offset /= SWAPFILE_CLUSTER;
-	return !cluster_is_null(&si->free_cluster_head) &&
-		offset != cluster_next(&si->free_cluster_head) &&
-		cluster_is_free(&si->cluster_info[offset]);
+	conflict = !cluster_is_null(&si->free_cluster_head) &&
+		offset != cluster_next(&si->free_cluster_head) &&
+		cluster_is_free(&si->cluster_info[offset]);
+
+	if (!conflict)
+		return false;
+
+	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
+	cluster_set_null(&percpu_cluster->index);
+	return true;
+}
+
+/*
+ * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
+ * might involve allocating a new cluster for current CPU too.
+ */
+static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
+	unsigned long *offset, unsigned long *scan_base)
+{
+	struct percpu_cluster *cluster;
+	bool found_free;
+	unsigned long tmp;
+
+new_cluster:
+	cluster = this_cpu_ptr(si->percpu_cluster);
+	if (cluster_is_null(&cluster->index)) {
+		if (!cluster_is_null(&si->free_cluster_head)) {
+			cluster->index = si->free_cluster_head;
+			cluster->next = cluster_next(&cluster->index) *
+					SWAPFILE_CLUSTER;
+		} else if (!cluster_is_null(&si->discard_cluster_head)) {
+			/*
+			 * we don't have free cluster but have some clusters in
+			 * discarding, do discard now and reclaim them
+			 */
+			swap_do_scheduled_discard(si);
+			*scan_base = *offset = si->cluster_next;
+			goto new_cluster;
+		} else
+			return;
+	}
+
+	found_free = false;
+
+	/*
+	 * Other CPUs can use our cluster if they can't find a free cluster,
+	 * check if there is still free entry in the cluster
+	 */
+	tmp = cluster->next;
+	while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
+	       SWAPFILE_CLUSTER) {
+		if (!si->swap_map[tmp]) {
+			found_free = true;
+			break;
+		}
+		tmp++;
+	}
+	if (!found_free) {
+		cluster_set_null(&cluster->index);
+		goto new_cluster;
+	}
+	cluster->next = tmp + 1;
+	*offset = tmp;
+	*scan_base = tmp;
 }
 
 static unsigned long scan_swap_map(struct swap_info_struct *si,
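The index arithmetic here is easier to follow with SWAPFILE_CLUSTER made concrete; it is defined as 256 earlier in mm/swapfile.c, so cluster idx owns swap offsets [idx * 256, (idx + 1) * 256). The helpers below are hypothetical, added only to spell out the two directions of that mapping:

	/* Hypothetical helpers, not part of the patch. */
	static unsigned long offset_to_cluster(unsigned long offset)
	{
		return offset / SWAPFILE_CLUSTER;	/* e.g. offset 513 -> cluster 2 */
	}

	static unsigned long cluster_first_offset(unsigned long idx)
	{
		return idx * SWAPFILE_CLUSTER;		/* e.g. cluster 2 -> offset 512 */
	}

With those names, scan_swap_map_try_ssd_cluster() scans swap_map[] from cluster->next up to cluster_first_offset(cluster_index + 1), additionally capped by si->max, and claims the first zero (free) entry it finds; if the whole window is taken, which can happen because other CPUs may also allocate from this cluster, it drops the cluster and loops back for a fresh one.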
@@ -423,41 +488,17 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 	si->flags += SWP_SCANNING;
 	scan_base = offset = si->cluster_next;
 
+	/* SSD algorithm */
+	if (si->cluster_info) {
+		scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
+		goto checks;
+	}
+
 	if (unlikely(!si->cluster_nr--)) {
 		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
 			si->cluster_nr = SWAPFILE_CLUSTER - 1;
 			goto checks;
 		}
-check_cluster:
-		if (!cluster_is_null(&si->free_cluster_head)) {
-			offset = cluster_next(&si->free_cluster_head) *
-					SWAPFILE_CLUSTER;
-			last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
-			si->cluster_next = offset;
-			si->cluster_nr = SWAPFILE_CLUSTER - 1;
-			goto checks;
-		} else if (si->cluster_info) {
-			/*
-			 * we don't have free cluster but have some clusters in
-			 * discarding, do discard now and reclaim them
-			 */
-			if (!cluster_is_null(&si->discard_cluster_head)) {
-				si->cluster_nr = 0;
-				swap_do_scheduled_discard(si);
-				scan_base = offset = si->cluster_next;
-				if (!si->cluster_nr)
-					goto check_cluster;
-				si->cluster_nr--;
-				goto checks;
-			}
-
-			/*
-			 * Checking free cluster is fast enough, we can do the
-			 * check every time
-			 */
-			si->cluster_nr = 0;
-			goto checks;
-		}
 
 		spin_unlock(&si->lock);
 
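The net effect of this hunk on scan_swap_map()'s entry path: devices with cluster_info (SSDs) branch straight into the per-CPU allocator and jump to checks:, never reaching the legacy heuristic, while rotational devices keep the old cluster_nr countdown. The discard-and-reclaim fallback that the deleted check_cluster block used to handle now lives inside scan_swap_map_try_ssd_cluster() itself. A condensed sketch of the resulting flow, not the verbatim function:

	scan_base = offset = si->cluster_next;

	if (si->cluster_info) {
		/* SSD: take an entry from this CPU's cluster */
		scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
		goto checks;
	}
	if (unlikely(!si->cluster_nr--)) {
		/* HDD: unchanged, look for a SWAPFILE_CLUSTER-sized free span */
	}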
@@ -516,8 +557,10 @@ check_cluster:
 	}
 
 checks:
-	if (scan_swap_map_recheck_cluster(si, offset))
-		goto check_cluster;
+	if (si->cluster_info) {
+		while (scan_swap_map_ssd_cluster_conflict(si, offset))
+			scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
+	}
 	if (!(si->flags & SWP_WRITEOK))
 		goto no_page;
 	if (!si->highest_bit)
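A while loop replaces the old single recheck because the replacement cluster fetched after a conflict can itself have been freed in the interim, so the test must repeat until a non-conflicting offset comes back. The interleaving being guarded against is roughly the following; this is an illustration inferred from the comment above scan_swap_map_ssd_cluster_conflict(), not a trace from the patch:

	/*
	 *   this CPU                          another CPU
	 *   --------                          -----------
	 *   pick offset in cluster C
	 *   drop si->lock while scanning
	 *                                     free every entry in C;
	 *                                     C is linked into the free list
	 *   retake si->lock, reach checks:
	 *   C is now free but is not the
	 *   free-list head; allocating from it
	 *   would carve the middle of the list,
	 *   so invalidate the per-CPU cluster
	 *   and retry
	 */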
@@ -1884,6 +1927,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	spin_unlock(&swap_lock);
 	frontswap_invalidate_area(type);
 	mutex_unlock(&swapon_mutex);
+	free_percpu(p->percpu_cluster);
+	p->percpu_cluster = NULL;
 	vfree(swap_map);
 	vfree(cluster_info);
 	vfree(frontswap_map);
@@ -2403,6 +2448,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 			error = -ENOMEM;
 			goto bad_swap;
 		}
+		p->percpu_cluster = alloc_percpu(struct percpu_cluster);
+		if (!p->percpu_cluster) {
+			error = -ENOMEM;
+			goto bad_swap;
+		}
+		for_each_possible_cpu(i) {
+			struct percpu_cluster *cluster;
+			cluster = per_cpu_ptr(p->percpu_cluster, i);
+			cluster_set_null(&cluster->index);
+		}
 	}
 
 	error = swap_cgroup_swapon(p->type, maxpages);
@@ -2475,6 +2530,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	error = 0;
 	goto out;
 bad_swap:
+	free_percpu(p->percpu_cluster);
+	p->percpu_cluster = NULL;
 	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
 		set_blocksize(p->bdev, p->old_block_size);
 		blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
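One detail worth noting in both teardown sites: bad_swap can be reached before or after the per-CPU allocation succeeds, and free_percpu() accepts a NULL pointer as a no-op, so the unconditional free-and-NULL pair keeps the cleanup idempotent without tracking a separate flag. A reduced sketch of the idiom:

	/* Safe whether or not alloc_percpu() ever ran or succeeded:
	 * free_percpu(NULL) is a no-op, and NULLing the pointer makes a
	 * second pass (e.g. a later swapoff) harmless. */
	free_percpu(p->percpu_cluster);
	p->percpu_cluster = NULL;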