author     Xiao Ni <xni@redhat.com>      2018-05-20 23:49:54 -0400
committer  Shaohua Li <shli@fb.com>      2018-05-21 12:30:26 -0400
commit     5a409b4f56d50b212334f338cb8465d65550cd85
tree       05c7ca56936fa8f658a2b08d09b74becd0d36f42
parent     448ec638c6bcf36923db695b6188373b933e7ddc
MD: fix lock contention for flush bios
There is lock contention when many processes send flush bios to an md device,
e.g. when many LVs are created on one raid device and mkfs.xfs is run on each
LV. Currently flush requests can only be handled sequentially: under
mddev->lock, each new flush has to wait for mddev->flush_bio to become NULL
before it can proceed. This patch removes mddev->flush_bio and handles flush
bios asynchronously.

I ran a test with the command dbench -s 128 -t 300. This is the test result:

=================Without the patch============================
 Operation                Count    AvgLat    MaxLat
 --------------------------------------------------
 Flush                    11165   167.595  5879.560
 Close                   107469     1.391  2231.094
 LockX                      384     0.003     0.019
 Rename                    5944     2.141  1856.001
 ReadX                   208121     0.003     0.074
 WriteX                   98259  1925.402 15204.895
 Unlink                   25198    13.264  3457.268
 UnlockX                    384     0.001     0.009
 FIND_FIRST               47111     0.012     0.076
 SET_FILE_INFORMATION     12966     0.007     0.065
 QUERY_FILE_INFORMATION   27921     0.004     0.085
 QUERY_PATH_INFORMATION  124650     0.005     5.766
 QUERY_FS_INFORMATION     22519     0.003     0.053
 NTCreateX               141086     4.291  2502.812

Throughput 3.7181 MB/sec (sync open)  128 clients  128 procs  max_latency=15204.905 ms

=================With the patch============================
 Operation                Count    AvgLat    MaxLat
 --------------------------------------------------
 Flush                     4500   174.134   406.398
 Close                    48195     0.060   467.062
 LockX                      256     0.003     0.029
 Rename                    2324     0.026     0.360
 ReadX                    78846     0.004     0.504
 WriteX                   66832   562.775  1467.037
 Unlink                    5516     3.665  1141.740
 UnlockX                    256     0.002     0.019
 FIND_FIRST               16428     0.015     0.313
 SET_FILE_INFORMATION      6400     0.009     0.520
 QUERY_FILE_INFORMATION   17865     0.003     0.089
 QUERY_PATH_INFORMATION   47060     0.078   416.299
 QUERY_FS_INFORMATION      7024     0.004     0.032
 NTCreateX                55921     0.854  1141.452

Throughput 11.744 MB/sec (sync open)  128 clients  128 procs  max_latency=1467.041 ms

The test was done on a raid1 device backed by two rotational disks.

V5: V4 is more complicated than the version using a memory pool, so revert
    to the memory pool version.
V4: Use the address of fbio as a hash to choose a free flush_info.
V3: Shaohua suggested that a mempool is overkill. In v3 the memory is
    allocated when the raid device is created, and a simple bitmap records
    which resources are free. Also fix a bug from v2: flush_pending must be
    set to 1 first.
V2: Neil pointed out two problems: a counting error, and the return value
    when memory allocation fails.
    1. Counting error: it is only safe to call rdev_dec_pending() on rdevs
       for which atomic_inc(&rdev->nr_pending) was previously called. If an
       rdev is added to the list between the start and end of the flush,
       this will do something bad. bio_chain is no longer used; each flush
       bio now gets its own completion callback.
    2. Returning an IO error when kmalloc fails is wrong; the mempool
       suggested by Neil is used instead.
    3. Fixed some places pointed out by Guoqing.

Suggested-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Shaohua Li <shli@fb.com>
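For reference, the counting scheme the patch relies on (flush_pending starts at 1, is incremented once per per-device flush bio, and the submitter drops the initial reference last, so completion can only fire after everything is in flight) can be sketched in isolation. The snippet below is a minimal userspace analogue using C11 atomics, not kernel code; the names flush_ctx, child_complete and flush_done are illustrative stand-ins for flush_info, md_end_flush and the bio_endio()/queue_work() step.

/*
 * Userspace analogue of the flush_pending counting scheme (illustrative
 * only -- not kernel code). The counter starts at 1 so completion cannot
 * fire while child flushes are still being submitted; the submitter drops
 * its own reference last.
 */
#include <stdatomic.h>
#include <stdio.h>

struct flush_ctx {
        atomic_int pending;     /* plays the role of fi->flush_pending */
        int done;               /* set when the whole flush completes */
};

static void flush_done(struct flush_ctx *ctx)
{
        ctx->done = 1;          /* kernel code would bio_endio() or queue work here */
}

/* called once per completed per-device flush, like md_end_flush() */
static void child_complete(struct flush_ctx *ctx)
{
        if (atomic_fetch_sub(&ctx->pending, 1) == 1)
                flush_done(ctx);    /* last reference dropped */
}

int main(void)
{
        struct flush_ctx ctx = { .pending = 1, .done = 0 };
        int nr_children = 3;

        /* submit one child flush per member device */
        for (int i = 0; i < nr_children; i++)
                atomic_fetch_add(&ctx.pending, 1);

        /* children complete (on other CPUs in the real driver) */
        for (int i = 0; i < nr_children; i++)
                child_complete(&ctx);

        /* submitter drops its initial reference; this is the last put */
        child_complete(&ctx);

        printf("flush complete: %d\n", ctx.done);
        return 0;
}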
-rw-r--r--  drivers/md/md.c  | 158
-rw-r--r--  drivers/md/md.h  |  22
2 files changed, 120 insertions(+), 60 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index df9eb1a04f26..6b4e2f29fe4e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -130,6 +130,24 @@ static inline int speed_max(struct mddev *mddev)
                mddev->sync_speed_max : sysctl_speed_limit_max;
 }
 
+static void * flush_info_alloc(gfp_t gfp_flags, void *data)
+{
+        return kzalloc(sizeof(struct flush_info), gfp_flags);
+}
+static void flush_info_free(void *flush_info, void *data)
+{
+        kfree(flush_info);
+}
+
+static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
+{
+        return kzalloc(sizeof(struct flush_bio), gfp_flags);
+}
+static void flush_bio_free(void *flush_bio, void *data)
+{
+        kfree(flush_bio);
+}
+
 static struct ctl_table_header *raid_table_header;
 
 static struct ctl_table raid_table[] = {
@@ -412,30 +430,53 @@ static int md_congested(void *data, int bits)
 /*
  * Generic flush handling for md
  */
+static void submit_flushes(struct work_struct *ws)
+{
+        struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
+        struct mddev *mddev = fi->mddev;
+        struct bio *bio = fi->bio;
+
+        bio->bi_opf &= ~REQ_PREFLUSH;
+        md_handle_request(mddev, bio);
+
+        mempool_free(fi, mddev->flush_pool);
+}
 
-static void md_end_flush(struct bio *bio)
+static void md_end_flush(struct bio *fbio)
 {
-        struct md_rdev *rdev = bio->bi_private;
-        struct mddev *mddev = rdev->mddev;
+        struct flush_bio *fb = fbio->bi_private;
+        struct md_rdev *rdev = fb->rdev;
+        struct flush_info *fi = fb->fi;
+        struct bio *bio = fi->bio;
+        struct mddev *mddev = fi->mddev;
 
         rdev_dec_pending(rdev, mddev);
 
-        if (atomic_dec_and_test(&mddev->flush_pending)) {
-                /* The pre-request flush has finished */
-                queue_work(md_wq, &mddev->flush_work);
+        if (atomic_dec_and_test(&fi->flush_pending)) {
+                if (bio->bi_iter.bi_size == 0)
+                        /* an empty barrier - all done */
+                        bio_endio(bio);
+                else {
+                        INIT_WORK(&fi->flush_work, submit_flushes);
+                        queue_work(md_wq, &fi->flush_work);
+                }
         }
-        bio_put(bio);
-}
 
-static void md_submit_flush_data(struct work_struct *ws);
+        mempool_free(fb, mddev->flush_bio_pool);
+        bio_put(fbio);
+}
 
-static void submit_flushes(struct work_struct *ws)
+void md_flush_request(struct mddev *mddev, struct bio *bio)
 {
-        struct mddev *mddev = container_of(ws, struct mddev, flush_work);
         struct md_rdev *rdev;
+        struct flush_info *fi;
+
+        fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
+
+        fi->bio = bio;
+        fi->mddev = mddev;
+        atomic_set(&fi->flush_pending, 1);
 
-        INIT_WORK(&mddev->flush_work, md_submit_flush_data);
-        atomic_set(&mddev->flush_pending, 1);
         rcu_read_lock();
         rdev_for_each_rcu(rdev, mddev)
                 if (rdev->raid_disk >= 0 &&
@@ -445,59 +486,39 @@ static void submit_flushes(struct work_struct *ws)
                         * we reclaim rcu_read_lock
                         */
                        struct bio *bi;
+                       struct flush_bio *fb;
                        atomic_inc(&rdev->nr_pending);
                        atomic_inc(&rdev->nr_pending);
                        rcu_read_unlock();
+
+                       fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
+                       fb->fi = fi;
+                       fb->rdev = rdev;
+
                        bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
-                       bi->bi_end_io = md_end_flush;
-                       bi->bi_private = rdev;
                        bio_set_dev(bi, rdev->bdev);
+                       bi->bi_end_io = md_end_flush;
+                       bi->bi_private = fb;
                        bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-                       atomic_inc(&mddev->flush_pending);
+
+                       atomic_inc(&fi->flush_pending);
                        submit_bio(bi);
+
                        rcu_read_lock();
                        rdev_dec_pending(rdev, mddev);
                }
        rcu_read_unlock();
-       if (atomic_dec_and_test(&mddev->flush_pending))
-               queue_work(md_wq, &mddev->flush_work);
-}
-
-static void md_submit_flush_data(struct work_struct *ws)
-{
-       struct mddev *mddev = container_of(ws, struct mddev, flush_work);
-       struct bio *bio = mddev->flush_bio;
 
-       /*
-        * must reset flush_bio before calling into md_handle_request to avoid a
-        * deadlock, because other bios passed md_handle_request suspend check
-        * could wait for this and below md_handle_request could wait for those
-        * bios because of suspend check
-        */
-       mddev->flush_bio = NULL;
-       wake_up(&mddev->sb_wait);
-
-       if (bio->bi_iter.bi_size == 0)
-               /* an empty barrier - all done */
-               bio_endio(bio);
-       else {
-               bio->bi_opf &= ~REQ_PREFLUSH;
-               md_handle_request(mddev, bio);
+       if (atomic_dec_and_test(&fi->flush_pending)) {
+               if (bio->bi_iter.bi_size == 0)
+                       /* an empty barrier - all done */
+                       bio_endio(bio);
+               else {
+                       INIT_WORK(&fi->flush_work, submit_flushes);
+                       queue_work(md_wq, &fi->flush_work);
+               }
        }
 }
-
-void md_flush_request(struct mddev *mddev, struct bio *bio)
-{
-       spin_lock_irq(&mddev->lock);
-       wait_event_lock_irq(mddev->sb_wait,
-                           !mddev->flush_bio,
-                           mddev->lock);
-       mddev->flush_bio = bio;
-       spin_unlock_irq(&mddev->lock);
-
-       INIT_WORK(&mddev->flush_work, submit_flushes);
-       queue_work(md_wq, &mddev->flush_work);
-}
 EXPORT_SYMBOL(md_flush_request);
 
 static inline struct mddev *mddev_get(struct mddev *mddev)
@@ -555,7 +576,6 @@ void mddev_init(struct mddev *mddev)
        atomic_set(&mddev->openers, 0);
        atomic_set(&mddev->active_io, 0);
        spin_lock_init(&mddev->lock);
-       atomic_set(&mddev->flush_pending, 0);
        init_waitqueue_head(&mddev->sb_wait);
        init_waitqueue_head(&mddev->recovery_wait);
        mddev->reshape_position = MaxSector;
@@ -5510,6 +5530,22 @@ int md_run(struct mddev *mddev)
                        goto abort;
                }
        }
+       if (mddev->flush_pool == NULL) {
+               mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
+                                                  flush_info_free, mddev);
+               if (!mddev->flush_pool) {
+                       err = -ENOMEM;
+                       goto abort;
+               }
+       }
+       if (mddev->flush_bio_pool == NULL) {
+               mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
+                                                      flush_bio_free, mddev);
+               if (!mddev->flush_bio_pool) {
+                       err = -ENOMEM;
+                       goto abort;
+               }
+       }
 
        spin_lock(&pers_lock);
        pers = find_pers(mddev->level, mddev->clevel);
@@ -5669,6 +5705,14 @@ int md_run(struct mddev *mddev)
        return 0;
 
 abort:
+       if (mddev->flush_bio_pool) {
+               mempool_destroy(mddev->flush_bio_pool);
+               mddev->flush_bio_pool = NULL;
+       }
+       if (mddev->flush_pool){
+               mempool_destroy(mddev->flush_pool);
+               mddev->flush_pool = NULL;
+       }
        if (mddev->bio_set) {
                bioset_free(mddev->bio_set);
                mddev->bio_set = NULL;
@@ -5889,6 +5933,14 @@ void md_stop(struct mddev *mddev)
         * This is called from dm-raid
         */
        __md_stop(mddev);
+       if (mddev->flush_bio_pool) {
+               mempool_destroy(mddev->flush_bio_pool);
+               mddev->flush_bio_pool = NULL;
+       }
+       if (mddev->flush_pool) {
+               mempool_destroy(mddev->flush_pool);
+               mddev->flush_pool = NULL;
+       }
        if (mddev->bio_set) {
                bioset_free(mddev->bio_set);
                mddev->bio_set = NULL;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fbc925cce810..ffcb1ae217fe 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -252,6 +252,19 @@ enum mddev_sb_flags {
        MD_SB_NEED_REWRITE,     /* metadata write needs to be repeated */
 };
 
+#define NR_FLUSH_INFOS 8
+#define NR_FLUSH_BIOS 64
+struct flush_info {
+       struct bio              *bio;
+       struct mddev            *mddev;
+       struct work_struct      flush_work;
+       atomic_t                flush_pending;
+};
+struct flush_bio {
+       struct flush_info *fi;
+       struct md_rdev *rdev;
+};
+
 struct mddev {
        void                            *private;
        struct md_personality           *pers;
@@ -457,13 +470,8 @@ struct mddev {
                                           * metadata and bitmap writes
                                           */
 
-       /* Generic flush handling.
-        * The last to finish preflush schedules a worker to submit
-        * the rest of the request (without the REQ_PREFLUSH flag).
-        */
-       struct bio *flush_bio;
-       atomic_t flush_pending;
-       struct work_struct flush_work;
+       mempool_t *flush_pool;
+       mempool_t *flush_bio_pool;
        struct work_struct event_work;  /* used by dm to report failure event */
        void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
        struct md_cluster_info          *cluster_info;