path: root/mm/swapfile.c
author	Shaohua Li <shli@kernel.org>	2013-09-11 17:20:32 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-11 18:57:17 -0400
commit	ebc2a1a69111eadfeda8487e577f1a5d42ef0dae (patch)
tree	8a1d08bc6c0a1eb7e1bcd93056141614c22a7d40 /mm/swapfile.c
parent	edfe23dac3e2981277087b05bec7fec7790d1835 (diff)
swap: make cluster allocation per-cpu
Swap cluster allocation exists to get better request merging and improve performance. But the cluster is shared globally; if multiple tasks are swapping, this causes interleaved disk access. Multiple tasks swapping is quite common: for example, each NUMA node has a kswapd thread doing swap, and multiple threads/processes may be doing direct page reclaim. The IO scheduler can't help much here, because tasks don't send swapout IO down to the block layer at the same time. The block layer does merge some IOs, but many are not merged, depending on how many tasks are doing swapout concurrently. In practice, I've seen a lot of small-size IO in swapout workloads.

We make the cluster allocation per-cpu here, and the interleaved disk access issue goes away. Each task swaps out to its own cluster, so swapout becomes sequential and can easily be merged into big-size IO. If a CPU can't get a per-cpu cluster (for example, there is no free cluster left in the swap device), it falls back to scanning swap_map, so the CPU can still continue to swap; we don't need to recycle free swap entries of other CPUs.

In my test (swap to a 2-disk raid0 partition), this improves swapout throughput by around 10%, and request size is increased significantly.

How this impacts swap readahead is uncertain, though. On one hand, page reclaim always isolates and swaps several adjacent pages, which makes page reclaim write the pages sequentially and benefits readahead. On the other hand, several CPUs writing pages in an interleaved fashion means the pages don't live _sequentially_ but only relatively _near_ each other; in the per-cpu allocation case, if adjacent pages are written by different CPUs, they will live relatively _far_ apart. So the impact on swap readahead depends on how many pages page reclaim isolates and swaps at a time: if the number is big, this patch will benefit swap readahead. Of course, this only concerns the sequential access pattern. The patch has no impact on random access patterns, because the new cluster allocation algorithm is used only for SSD.

An alternative solution is to organize the swap layout per-mm instead of per-cpu. In the per-mm layout, we allocate a disk range for each mm, so pages of one mm live adjacently on the swap disk. The per-mm layout has potential lock contention issues if multiple reclaimers are swapping pages from the same mm. For a sequential workload, the per-mm layout is better for implementing swap readahead, because pages from one mm are adjacent on disk; but the per-cpu layout isn't very bad in this workload either, as page reclaim always isolates and swaps several pages at a time, so such pages still live sequentially on disk and readahead can exploit this. For a random workload, the per-mm layout doesn't benefit request merging, because pages from different mms are quite likely swapped out at the same time and their IO can't be merged in a per-mm layout, while with the per-cpu layout we can merge requests from any mm. Considering that random workloads are more common in workloads with swap (and the per-cpu approach isn't too bad for sequential workloads either), I'm choosing the per-cpu layout.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Shaohua Li <shli@fusionio.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Kyungmin Park <kmpark@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rafael Aquini <aquini@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
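To make the idea above concrete, here is a small userspace toy model. It is not taken from this patch: names such as cpu_cluster, grab_free_cluster and alloc_slot are invented for illustration, and the sizes are shrunk. It shows how giving each CPU a private cluster turns swapout from two concurrent writers into two sequential streams of slots instead of one interleaved stream.

#include <stdio.h>
#include <stdbool.h>

#define CLUSTER_SIZE 8          /* kernel uses SWAPFILE_CLUSTER (much larger) */
#define NR_CLUSTERS  4
#define NR_CPUS      2

/* Simplified global free-cluster list: just a used/free flag per cluster. */
static bool cluster_used[NR_CLUSTERS];

struct cpu_cluster {
	int base;               /* first slot of the cluster owned by this CPU */
	int next;               /* next slot to hand out, -1 = no cluster yet */
};

static struct cpu_cluster cpus[NR_CPUS] = { { 0, -1 }, { 0, -1 } };

/* Take one cluster off the global free list, or fail. */
static bool grab_free_cluster(struct cpu_cluster *c)
{
	for (int i = 0; i < NR_CLUSTERS; i++) {
		if (!cluster_used[i]) {
			cluster_used[i] = true;
			c->base = i * CLUSTER_SIZE;
			c->next = c->base;
			return true;
		}
	}
	return false;           /* the kernel falls back to scanning swap_map here */
}

/* Hand out the next slot from this CPU's private cluster. */
static int alloc_slot(int cpu)
{
	struct cpu_cluster *c = &cpus[cpu];

	if (c->next < 0 || c->next >= c->base + CLUSTER_SIZE) {
		if (!grab_free_cluster(c))
			return -1;
	}
	return c->next++;
}

int main(void)
{
	/* Interleave "swapout" from two CPUs: each still writes sequentially. */
	for (int i = 0; i < 6; i++) {
		int s0 = alloc_slot(0);
		int s1 = alloc_slot(1);
		printf("cpu0 -> slot %d, cpu1 -> slot %d\n", s0, s1);
	}
	return 0;
}

The real kernel code differs in the details (it holds si->lock, skips in-use entries in swap_map, and reclaims discarded clusters before giving up), but the allocation pattern is the same: each CPU drains its own cluster sequentially.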
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--  mm/swapfile.c  125
1 file changed, 91 insertions(+), 34 deletions(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 98e52e373bd8..3963fc24fcc1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -392,13 +392,78 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
  * It's possible scan_swap_map() uses a free cluster in the middle of free
  * cluster list. Avoiding such abuse to avoid list corruption.
  */
-static inline bool scan_swap_map_recheck_cluster(struct swap_info_struct *si,
+static bool
+scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
 	unsigned long offset)
 {
+	struct percpu_cluster *percpu_cluster;
+	bool conflict;
+
 	offset /= SWAPFILE_CLUSTER;
-	return !cluster_is_null(&si->free_cluster_head) &&
+	conflict = !cluster_is_null(&si->free_cluster_head) &&
 		offset != cluster_next(&si->free_cluster_head) &&
 		cluster_is_free(&si->cluster_info[offset]);
+
+	if (!conflict)
+		return false;
+
+	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
+	cluster_set_null(&percpu_cluster->index);
+	return true;
+}
+
+/*
+ * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
+ * might involve allocating a new cluster for current CPU too.
+ */
+static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
+	unsigned long *offset, unsigned long *scan_base)
+{
+	struct percpu_cluster *cluster;
+	bool found_free;
+	unsigned long tmp;
+
+new_cluster:
+	cluster = this_cpu_ptr(si->percpu_cluster);
+	if (cluster_is_null(&cluster->index)) {
+		if (!cluster_is_null(&si->free_cluster_head)) {
+			cluster->index = si->free_cluster_head;
+			cluster->next = cluster_next(&cluster->index) *
+					SWAPFILE_CLUSTER;
+		} else if (!cluster_is_null(&si->discard_cluster_head)) {
+			/*
+			 * we don't have free cluster but have some clusters in
+			 * discarding, do discard now and reclaim them
+			 */
+			swap_do_scheduled_discard(si);
+			*scan_base = *offset = si->cluster_next;
+			goto new_cluster;
+		} else
+			return;
+	}
+
+	found_free = false;
+
+	/*
+	 * Other CPUs can use our cluster if they can't find a free cluster,
+	 * check if there is still free entry in the cluster
+	 */
+	tmp = cluster->next;
+	while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
+	       SWAPFILE_CLUSTER) {
+		if (!si->swap_map[tmp]) {
+			found_free = true;
+			break;
+		}
+		tmp++;
+	}
+	if (!found_free) {
+		cluster_set_null(&cluster->index);
+		goto new_cluster;
+	}
+	cluster->next = tmp + 1;
+	*offset = tmp;
+	*scan_base = tmp;
 }
 
 static unsigned long scan_swap_map(struct swap_info_struct *si,
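For reference, the new helpers above rely on a per-cpu structure that this commit adds outside mm/swapfile.c (in include/linux/swap.h), so it does not appear in this diffstat-limited view. Judging from how the hunk uses it (cluster->index, cluster->next), its shape is roughly the sketch below; treat the exact field types as an approximation rather than the authoritative definition:

/*
 * Rough shape of the per-cpu bookkeeping used above, inferred from usage in
 * this hunk. The real definition lives in include/linux/swap.h.
 */
struct percpu_cluster {
	struct swap_cluster_info index;	/* current cluster this CPU allocates from */
	unsigned int next;		/* likely next swap offset inside that cluster */
};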
@@ -423,41 +488,17 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
423 si->flags += SWP_SCANNING; 488 si->flags += SWP_SCANNING;
424 scan_base = offset = si->cluster_next; 489 scan_base = offset = si->cluster_next;
425 490
491 /* SSD algorithm */
492 if (si->cluster_info) {
493 scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
494 goto checks;
495 }
496
426 if (unlikely(!si->cluster_nr--)) { 497 if (unlikely(!si->cluster_nr--)) {
427 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { 498 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
428 si->cluster_nr = SWAPFILE_CLUSTER - 1; 499 si->cluster_nr = SWAPFILE_CLUSTER - 1;
429 goto checks; 500 goto checks;
430 } 501 }
431check_cluster:
432 if (!cluster_is_null(&si->free_cluster_head)) {
433 offset = cluster_next(&si->free_cluster_head) *
434 SWAPFILE_CLUSTER;
435 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
436 si->cluster_next = offset;
437 si->cluster_nr = SWAPFILE_CLUSTER - 1;
438 goto checks;
439 } else if (si->cluster_info) {
440 /*
441 * we don't have free cluster but have some clusters in
442 * discarding, do discard now and reclaim them
443 */
444 if (!cluster_is_null(&si->discard_cluster_head)) {
445 si->cluster_nr = 0;
446 swap_do_scheduled_discard(si);
447 scan_base = offset = si->cluster_next;
448 if (!si->cluster_nr)
449 goto check_cluster;
450 si->cluster_nr--;
451 goto checks;
452 }
453
454 /*
455 * Checking free cluster is fast enough, we can do the
456 * check every time
457 */
458 si->cluster_nr = 0;
459 goto checks;
460 }
461 502
462 spin_unlock(&si->lock); 503 spin_unlock(&si->lock);
463 504
@@ -516,8 +557,10 @@ check_cluster:
 	}
 
 checks:
-	if (scan_swap_map_recheck_cluster(si, offset))
-		goto check_cluster;
+	if (si->cluster_info) {
+		while (scan_swap_map_ssd_cluster_conflict(si, offset))
+			scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
+	}
 	if (!(si->flags & SWP_WRITEOK))
 		goto no_page;
 	if (!si->highest_bit)
@@ -1884,6 +1927,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1884 spin_unlock(&swap_lock); 1927 spin_unlock(&swap_lock);
1885 frontswap_invalidate_area(type); 1928 frontswap_invalidate_area(type);
1886 mutex_unlock(&swapon_mutex); 1929 mutex_unlock(&swapon_mutex);
1930 free_percpu(p->percpu_cluster);
1931 p->percpu_cluster = NULL;
1887 vfree(swap_map); 1932 vfree(swap_map);
1888 vfree(cluster_info); 1933 vfree(cluster_info);
1889 vfree(frontswap_map); 1934 vfree(frontswap_map);
@@ -2403,6 +2448,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 			error = -ENOMEM;
 			goto bad_swap;
 		}
+		p->percpu_cluster = alloc_percpu(struct percpu_cluster);
+		if (!p->percpu_cluster) {
+			error = -ENOMEM;
+			goto bad_swap;
+		}
+		for_each_possible_cpu(i) {
+			struct percpu_cluster *cluster;
+			cluster = per_cpu_ptr(p->percpu_cluster, i);
+			cluster_set_null(&cluster->index);
+		}
 	}
 
 	error = swap_cgroup_swapon(p->type, maxpages);
@@ -2475,6 +2530,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	error = 0;
 	goto out;
 bad_swap:
+	free_percpu(p->percpu_cluster);
+	p->percpu_cluster = NULL;
 	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
 		set_blocksize(p->bdev, p->old_block_size);
 		blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);