author     Shaohua Li <shli@fb.com>                          2017-07-10 18:47:11 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2017-07-10 19:32:30 -0400
commit     23955622ff8d231bcc9650b3d06583f117a6e3ba (patch)
tree       f0cea083054f298d7fd7eff3371ae07005e28506
parent     9eb788800510ae1a6bc419636a66071ee4deafd5 (diff)
swap: add block io poll in swapin path
For fast flash disk, async IO could introduce overhead because of context switch. block-mq now supports IO poll, which improves performance and latency a lot. swapin is a good place to use this technique, because the task is waiting for the swapin page to continue execution.

In my virtual machine, directly reading 4k data from an NVMe with iopoll is about 60% better than without poll. With iopoll support in the swapin path, my microbenchmark (a task doing random memory writes) is about 10%~25% faster. CPU utilization increases a lot though, 2x and even 3x; this will depend on disk speed.

While iopoll in swapin isn't intended for all use cases, it's a win for latency-sensitive workloads with a high speed swap disk. The block layer has a knob to control poll at runtime. If poll isn't enabled in the block layer, there should be no noticeable change in swapin.

I got a chance to run the same test on an NVMe with DRAM as the media. In a simple fio IO test, blkpoll boosts performance by 50% in the single thread test and ~20% in the 8 threads test, so this is the baseline. In the above swap test, blkpoll boosts performance by ~27% in the single thread test, though it uses 2x CPU time. If we enable hybrid polling, the performance gain drops very slightly but CPU time is only 50% worse than without blkpoll. Adjusting the hybrid poll parameter reduces the CPU time penalty further. In the 8 threads test, blkpoll doesn't help: performance is similar to that without blkpoll, but so is CPU utilization, because there is lock contention in the swap path and the CPU time spent on blkpoll isn't high. So overall, blkpoll swapin isn't worse than without it.

Swapin readahead might read several pages at the same time and form a big IO request. Since that IO takes longer, it doesn't make sense to poll it, so the patch only does iopoll for single page swapin.

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/070c3c3e40b711e7b1390002c991e86a-b5408f0@7511894063d3764ff01ea8111f5a004d7dd700ed078797c204a24e620ddb965c
Signed-off-by: Shaohua Li <shli@fb.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Jens Axboe <axboe@fb.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
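The "knob to control poll at runtime" mentioned above is the block queue's sysfs attribute pair queue/io_poll (polling on/off) and queue/io_poll_delay (classic vs. hybrid polling). A minimal userspace sketch of flipping them, assuming the swap device is nvme0n1 (the device name is an assumption; verify the knob semantics against Documentation/block/queue-sysfs.txt for your kernel):

/* Sketch: enable polled IO and hybrid polling for a block queue. */
#include <stdio.h>
#include <stdlib.h>

static void write_knob(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                exit(EXIT_FAILURE);
        }
        fputs(val, f);
        fclose(f);
}

int main(void)
{
        /* 1 = polled IO allowed on this queue; with 0, the poll loop the
         * patch adds to swap_readpage() exits on its first failed poll. */
        write_knob("/sys/block/nvme0n1/queue/io_poll", "1");

        /* -1 = classic busy poll, 0 = adaptive hybrid poll (sleep roughly
         * half the expected IO time before spinning), >0 = sleep that many
         * microseconds before polling. */
        write_knob("/sys/block/nvme0n1/queue/io_poll_delay", "0");
        return 0;
}

Run as root. The hybrid-polling numbers in the message above correspond to io_poll_delay >= 0.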
-rw-r--r--  include/linux/swap.h   5
-rw-r--r--  mm/madvise.c           4
-rw-r--r--  mm/page_io.c          23
-rw-r--r--  mm/swap_state.c       10
-rw-r--r--  mm/swapfile.c          2
5 files changed, 33 insertions, 11 deletions
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 5ab1c98c7d27..61e7180cee21 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -331,7 +331,7 @@ extern void kswapd_stop(int nid);
 #include <linux/blk_types.h> /* for bio_end_io_t */
 
 /* linux/mm/page_io.c */
-extern int swap_readpage(struct page *);
+extern int swap_readpage(struct page *page, bool do_poll);
 extern int swap_writepage(struct page *page, struct writeback_control *wbc);
 extern void end_swap_bio_write(struct bio *bio);
 extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
@@ -362,7 +362,8 @@ extern void free_page_and_swap_cache(struct page *);
 extern void free_pages_and_swap_cache(struct page **, int);
 extern struct page *lookup_swap_cache(swp_entry_t);
 extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
-                        struct vm_area_struct *vma, unsigned long addr);
+                        struct vm_area_struct *vma, unsigned long addr,
+                        bool do_poll);
 extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
                         struct vm_area_struct *vma, unsigned long addr,
                         bool *new_page_allocated);
diff --git a/mm/madvise.c b/mm/madvise.c
index 25b78ee4fc2c..8eda1841c576 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -205,7 +205,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
                         continue;
 
                 page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
-                                                        vma, index);
+                                                vma, index, false);
                 if (page)
                         put_page(page);
         }
@@ -246,7 +246,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
                 }
                 swap = radix_to_swp_entry(page);
                 page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
-                                                        NULL, 0);
+                                                NULL, 0, false);
                 if (page)
                         put_page(page);
         }
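Both mm/madvise.c call sites pass do_poll == false: MADV_WILLNEED swapin is speculative prefetch with no task blocked waiting on the page, so busy-polling would only burn CPU. From userspace, the path that reaches swapin_walk_pmd_entry() looks like this (minimal sketch; the buffer itself is hypothetical):

/* Sketch: MADV_WILLNEED prefetch of possibly swapped-out memory.
 * Swapins triggered here take the non-polling path added above. */
#define _GNU_SOURCE
#include <sys/mman.h>

int main(void)
{
        size_t len = 1 << 20;
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;
        /* Asynchronous readahead of any swapped-out pages in
         * [buf, buf + len); the kernel reads them with do_poll == false. */
        madvise(buf, len, MADV_WILLNEED);
        return 0;
}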
diff --git a/mm/page_io.c b/mm/page_io.c
index 2da71e627812..b6c4ac388209 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -117,6 +117,7 @@ static void swap_slot_free_notify(struct page *page)
 static void end_swap_bio_read(struct bio *bio)
 {
         struct page *page = bio->bi_io_vec[0].bv_page;
+        struct task_struct *waiter = bio->bi_private;
 
         if (bio->bi_status) {
                 SetPageError(page);
@@ -132,7 +133,9 @@ static void end_swap_bio_read(struct bio *bio)
         swap_slot_free_notify(page);
 out:
         unlock_page(page);
+        WRITE_ONCE(bio->bi_private, NULL);
         bio_put(bio);
+        wake_up_process(waiter);
 }
 
 int generic_swapfile_activate(struct swap_info_struct *sis,
@@ -329,11 +332,13 @@ out:
         return ret;
 }
 
-int swap_readpage(struct page *page)
+int swap_readpage(struct page *page, bool do_poll)
 {
         struct bio *bio;
         int ret = 0;
         struct swap_info_struct *sis = page_swap_info(page);
+        blk_qc_t qc;
+        struct block_device *bdev;
 
         VM_BUG_ON_PAGE(!PageSwapCache(page), page);
         VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -372,9 +377,23 @@ int swap_readpage(struct page *page)
                 ret = -ENOMEM;
                 goto out;
         }
+        bdev = bio->bi_bdev;
+        bio->bi_private = current;
         bio_set_op_attrs(bio, REQ_OP_READ, 0);
         count_vm_event(PSWPIN);
-        submit_bio(bio);
+        bio_get(bio);
+        qc = submit_bio(bio);
+        while (do_poll) {
+                set_current_state(TASK_UNINTERRUPTIBLE);
+                if (!READ_ONCE(bio->bi_private))
+                        break;
+
+                if (!blk_mq_poll(bdev_get_queue(bdev), qc))
+                        break;
+        }
+        __set_current_state(TASK_RUNNING);
+        bio_put(bio);
+
 out:
         return ret;
 }
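The hunk above is the core of the patch. Read as one contiguous excerpt with explanatory comments (a restatement of the diff, not additional code; kernel context such as bio allocation and error handling is elided):

/* Annotated restatement of the new polling path in swap_readpage(). */
bdev = bio->bi_bdev;
bio->bi_private = current;      /* end_swap_bio_read() wakes this task */
bio_set_op_attrs(bio, REQ_OP_READ, 0);
count_vm_event(PSWPIN);
bio_get(bio);                   /* extra reference: the completion handler
                                 * does its own bio_put(), and we still need
                                 * to read bio->bi_private afterwards */
qc = submit_bio(bio);
while (do_poll) {
        set_current_state(TASK_UNINTERRUPTIBLE);
        /* end_swap_bio_read() clears bi_private with WRITE_ONCE() before
         * waking us; seeing NULL means the read already completed. */
        if (!READ_ONCE(bio->bi_private))
                break;
        /* Reap completions directly on this CPU. Returns 0 when polling
         * is disabled on the queue (the runtime knob) or nothing could be
         * polled; then we stop spinning and the normal interrupt path
         * completes the IO, with the caller sleeping on the page lock. */
        if (!blk_mq_poll(bdev_get_queue(bdev), qc))
                break;
}
__set_current_state(TASK_RUNNING);
bio_put(bio);                   /* drop the polling reference */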
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 9c71b6b2562f..b68c93014f50 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -412,14 +412,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
  * the swap entry is no longer in use.
  */
 struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
-                        struct vm_area_struct *vma, unsigned long addr)
+                struct vm_area_struct *vma, unsigned long addr, bool do_poll)
 {
         bool page_was_allocated;
         struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
                         vma, addr, &page_was_allocated);
 
         if (page_was_allocated)
-                swap_readpage(retpage);
+                swap_readpage(retpage, do_poll);
 
         return retpage;
 }
@@ -496,11 +496,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
         unsigned long start_offset, end_offset;
         unsigned long mask;
         struct blk_plug plug;
+        bool do_poll = true;
 
         mask = swapin_nr_pages(offset) - 1;
         if (!mask)
                 goto skip;
 
+        do_poll = false;
         /* Read a page_cluster sized and aligned cluster around offset. */
         start_offset = offset & ~mask;
         end_offset = offset | mask;
@@ -511,7 +513,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
         for (offset = start_offset; offset <= end_offset ; offset++) {
                 /* Ok, do the async read-ahead now */
                 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
-                                                gfp_mask, vma, addr);
+                                                gfp_mask, vma, addr, false);
                 if (!page)
                         continue;
                 if (offset != entry_offset && likely(!PageTransCompound(page)))
@@ -522,7 +524,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
 
         lru_add_drain();        /* Push any new pages onto the LRU now */
 skip:
-        return read_swap_cache_async(entry, gfp_mask, vma, addr);
+        return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
 }
 
 int init_swap_address_space(unsigned int type, unsigned long nr_pages)
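Taken together, the swap_state.c changes reserve polling for the single page the fault actually needs: all speculative readahead pages are issued with do_poll == false, and the final read_swap_cache_async() may poll only when cluster readahead was skipped. A condensed sketch of the gating (kernel context elided; not compilable standalone):

/* Sketch of the do_poll gating in swapin_readahead(). */
bool do_poll = true;
unsigned long mask = swapin_nr_pages(offset) - 1;

if (mask) {
        /* Cluster readahead merges several pages into bigger, longer
         * IOs; busy-polling those is a net loss, so disable it. */
        do_poll = false;
        /* ... surrounding pages issued via
         * read_swap_cache_async(..., false) ... */
}
/* Only a single-page swapin may poll. */
return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);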
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 811d90e1c929..6ba4aab2db0b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1868,7 +1868,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
                 swap_map = &si->swap_map[i];
                 entry = swp_entry(type, i);
                 page = read_swap_cache_async(entry,
-                                        GFP_HIGHUSER_MOVABLE, NULL, 0);
+                                        GFP_HIGHUSER_MOVABLE, NULL, 0, false);
                 if (!page) {
                         /*
                          * Either swap_duplicate() failed because entry