author      NeilBrown <neilb@suse.com>      2017-04-05 00:05:51 -0400
committer   Shaohua Li <shli@fb.com>        2017-04-11 13:13:02 -0400
commit      fc9977dd069e4f82fcacb262652117c488647319 (patch)
tree        329ec8a20b630d9f3cfdf2596dfe71ea71368c41
parent      673ca68d93879b9ffbbed874c9e70ca6e37cab15 (diff)
md/raid10: simplify the splitting of requests.
raid10 splits requests in two different ways for two different
reasons.
First, bio_split() is used to ensure the bio fits within a chunk.
Second, multiple r10bio structures are allocated to represent the
different sections that need to go to different devices, to avoid
known bad blocks.
This can be simplified to just use bio_split() once, and not to use
multiple r10bios.
We delay the split until we know a maximum bio size that can
be handled with a single r10bio, and then split the bio and queue
the remainder for later handling.
As with raid1, we allocate a new bio_set to help with the splitting.
It is not correct to use fs_bio_set in a device driver.
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
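
For context, here is a minimal sketch of the split-and-requeue pattern the patch adopts, written against the block-layer API of this kernel generation (bio_split(), bio_chain(), generic_make_request()). The helper name split_for_r10bio() is hypothetical; the patch itself open-codes the same steps in raid10_read_request() and raid10_write_request(), passing the driver's private conf->bio_split rather than fs_bio_set.

#include <linux/bio.h>
#include <linux/blkdev.h>

/*
 * Hypothetical helper, not part of the patch: carve off the front part of
 * 'bio' that a single r10bio can cover, chain the remainder to it, and
 * resubmit the remainder so it is handled later as a separate request.
 */
static struct bio *split_for_r10bio(struct bio *bio, int max_sectors,
                                    struct bio_set *bs)
{
        if (max_sectors < bio_sectors(bio)) {
                /* Front part that one r10bio can handle now. */
                struct bio *split = bio_split(bio, max_sectors, GFP_NOIO, bs);

                /* The parent 'bio' completes only after 'split' completes. */
                bio_chain(split, bio);

                /* Requeue the remainder; it re-enters the make_request path. */
                generic_make_request(bio);

                bio = split;
        }
        return bio;
}

Allocating a private bio_set for these splits (bioset_create(BIO_POOL_SIZE, 0) in setup_conf()) keeps the driver off fs_bio_set; sharing the filesystem's pool from inside a stacking driver is what the commit message calls out as incorrect.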
-rw-r--r--   drivers/md/raid10.c   164
-rw-r--r--   drivers/md/raid10.h   1
2 files changed, 51 insertions, 114 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e055ec94b9a8..41845bae67be 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1127,7 +1127,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
         struct bio *read_bio;
         const int op = bio_op(bio);
         const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
-        int sectors_handled;
         int max_sectors;
         sector_t sectors;
         struct md_rdev *rdev;
@@ -1140,7 +1139,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
          */
         wait_barrier(conf);
 
-        sectors = bio_sectors(bio);
+        sectors = r10_bio->sectors;
         while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
             bio->bi_iter.bi_sector < conf->reshape_progress &&
             bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
@@ -1157,17 +1156,23 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
                 wait_barrier(conf);
         }
 
-read_again:
         rdev = read_balance(conf, r10_bio, &max_sectors);
         if (!rdev) {
                 raid_end_bio_io(r10_bio);
                 return;
         }
+        if (max_sectors < bio_sectors(bio)) {
+                struct bio *split = bio_split(bio, max_sectors,
+                                              GFP_NOIO, conf->bio_split);
+                bio_chain(split, bio);
+                generic_make_request(bio);
+                bio = split;
+                r10_bio->master_bio = bio;
+                r10_bio->sectors = max_sectors;
+        }
         slot = r10_bio->read_slot;
 
         read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
-        bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
-                 max_sectors);
 
         r10_bio->devs[slot].bio = read_bio;
         r10_bio->devs[slot].rdev = rdev;
@@ -1186,40 +1191,13 @@ read_again:
                 trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
                                       read_bio, disk_devt(mddev->gendisk),
                                       r10_bio->sector);
-        if (max_sectors < r10_bio->sectors) {
-                /*
-                 * Could not read all from this device, so we will need another
-                 * r10_bio.
-                 */
-                sectors_handled = (r10_bio->sector + max_sectors
-                                   - bio->bi_iter.bi_sector);
-                r10_bio->sectors = max_sectors;
-                inc_pending(conf);
-                bio_inc_remaining(bio);
-                /*
-                 * Cannot call generic_make_request directly as that will be
-                 * queued in __generic_make_request and subsequent
-                 * mempool_alloc might block waiting for it. so hand bio over
-                 * to raid10d.
-                 */
-                reschedule_retry(r10_bio);
-
-                r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
-                r10_bio->master_bio = bio;
-                r10_bio->sectors = bio_sectors(bio) - sectors_handled;
-                r10_bio->state = 0;
-                r10_bio->mddev = mddev;
-                r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
-                goto read_again;
-        } else
-                generic_make_request(read_bio);
+        generic_make_request(read_bio);
         return;
 }
 
 static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
                                   struct bio *bio, bool replacement,
-                                  int n_copy, int max_sectors)
+                                  int n_copy)
 {
         const int op = bio_op(bio);
         const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
@@ -1243,7 +1221,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
         rdev = conf->mirrors[devnum].rdev;
 
         mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
-        bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
         if (replacement)
                 r10_bio->devs[n_copy].repl_bio = mbio;
         else
@@ -1294,7 +1271,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
         int i;
         struct md_rdev *blocked_rdev;
         sector_t sectors;
-        int sectors_handled;
         int max_sectors;
 
         md_write_start(mddev, bio);
@@ -1306,7 +1282,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
          */
         wait_barrier(conf);
 
-        sectors = bio_sectors(bio);
+        sectors = r10_bio->sectors;
         while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
             bio->bi_iter.bi_sector < conf->reshape_progress &&
             bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
@@ -1476,44 +1452,29 @@ retry_write:
 
         if (max_sectors < r10_bio->sectors)
                 r10_bio->sectors = max_sectors;
-        sectors_handled = r10_bio->sector + max_sectors -
-                bio->bi_iter.bi_sector;
+
+        if (r10_bio->sectors < bio_sectors(bio)) {
+                struct bio *split = bio_split(bio, r10_bio->sectors,
+                                              GFP_NOIO, conf->bio_split);
+                bio_chain(split, bio);
+                generic_make_request(bio);
+                bio = split;
+                r10_bio->master_bio = bio;
+        }
 
         atomic_set(&r10_bio->remaining, 1);
         bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
 
         for (i = 0; i < conf->copies; i++) {
                 if (r10_bio->devs[i].bio)
-                        raid10_write_one_disk(mddev, r10_bio, bio, false,
-                                              i, max_sectors);
+                        raid10_write_one_disk(mddev, r10_bio, bio, false, i);
                 if (r10_bio->devs[i].repl_bio)
-                        raid10_write_one_disk(mddev, r10_bio, bio, true,
-                                              i, max_sectors);
-        }
-
-        /* Don't remove the bias on 'remaining' (one_write_done) until
-         * after checking if we need to go around again.
-         */
-
-        if (sectors_handled < bio_sectors(bio)) {
-                /* We need another r10_bio and it needs to be counted */
-                inc_pending(conf);
-                bio_inc_remaining(bio);
-                one_write_done(r10_bio);
-                r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
-                r10_bio->master_bio = bio;
-                r10_bio->sectors = bio_sectors(bio) - sectors_handled;
-
-                r10_bio->mddev = mddev;
-                r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
-                r10_bio->state = 0;
-                goto retry_write;
+                        raid10_write_one_disk(mddev, r10_bio, bio, true, i);
         }
         one_write_done(r10_bio);
 }
 
-static void __make_request(struct mddev *mddev, struct bio *bio)
+static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
 {
         struct r10conf *conf = mddev->private;
         struct r10bio *r10_bio;
@@ -1521,7 +1482,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
         r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
         r10_bio->master_bio = bio;
-        r10_bio->sectors = bio_sectors(bio);
+        r10_bio->sectors = sectors;
 
         r10_bio->mddev = mddev;
         r10_bio->sector = bio->bi_iter.bi_sector;
@@ -1538,54 +1499,26 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio)
         struct r10conf *conf = mddev->private;
         sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
         int chunk_sects = chunk_mask + 1;
-
-        struct bio *split;
+        int sectors = bio_sectors(bio);
 
         if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
                 md_flush_request(mddev, bio);
                 return;
         }
 
-        do {
-
-                /*
-                 * If this request crosses a chunk boundary, we need to split
-                 * it.
-                 */
-                if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
-                             bio_sectors(bio) > chunk_sects
-                             && (conf->geo.near_copies < conf->geo.raid_disks
-                                 || conf->prev.near_copies <
-                                 conf->prev.raid_disks))) {
-                        split = bio_split(bio, chunk_sects -
-                                          (bio->bi_iter.bi_sector &
-                                           (chunk_sects - 1)),
-                                          GFP_NOIO, fs_bio_set);
-                        bio_chain(split, bio);
-                } else {
-                        split = bio;
-                }
-
-                /*
-                 * If a bio is splitted, the first part of bio will pass
-                 * barrier but the bio is queued in current->bio_list (see
-                 * generic_make_request). If there is a raise_barrier() called
-                 * here, the second part of bio can't pass barrier. But since
-                 * the first part bio isn't dispatched to underlaying disks
-                 * yet, the barrier is never released, hence raise_barrier will
-                 * alays wait. We have a deadlock.
-                 * Note, this only happens in read path. For write path, the
-                 * first part of bio is dispatched in a schedule() call
-                 * (because of blk plug) or offloaded to raid10d.
-                 * Quitting from the function immediately can change the bio
-                 * order queued in bio_list and avoid the deadlock.
-                 */
-                __make_request(mddev, split);
-                if (split != bio && bio_data_dir(bio) == READ) {
-                        generic_make_request(bio);
-                        break;
-                }
-        } while (split != bio);
+        /*
+         * If this request crosses a chunk boundary, we need to split
+         * it.
+         */
+        if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
+                     sectors > chunk_sects
+                     && (conf->geo.near_copies < conf->geo.raid_disks
+                         || conf->prev.near_copies <
+                         conf->prev.raid_disks)))
+                sectors = chunk_sects -
+                        (bio->bi_iter.bi_sector &
+                         (chunk_sects - 1));
+        __make_request(mddev, bio, sectors);
 
         /* In case raid10d snuck in to freeze_array */
         wake_up(&conf->wait_barrier);
@@ -2873,13 +2806,8 @@ static void raid10d(struct md_thread *thread)
                         recovery_request_write(mddev, r10_bio);
                 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
                         handle_read_error(mddev, r10_bio);
-                else {
-                        /* just a partial read to be scheduled from a
-                         * separate context
-                         */
-                        int slot = r10_bio->read_slot;
-                        generic_make_request(r10_bio->devs[slot].bio);
-                }
+                else
+                        WARN_ON_ONCE(1);
 
                 cond_resched();
                 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
@@ -3652,6 +3580,10 @@ static struct r10conf *setup_conf(struct mddev *mddev)
         if (!conf->r10bio_pool)
                 goto out;
 
+        conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+        if (!conf->bio_split)
+                goto out;
+
         calc_sectors(conf, mddev->dev_sectors);
         if (mddev->reshape_position == MaxSector) {
                 conf->prev = conf->geo;
@@ -3689,6 +3621,8 @@ static struct r10conf *setup_conf(struct mddev *mddev)
                 mempool_destroy(conf->r10bio_pool);
                 kfree(conf->mirrors);
                 safe_put_page(conf->tmppage);
+                if (conf->bio_split)
+                        bioset_free(conf->bio_split);
                 kfree(conf);
         }
         return ERR_PTR(err);
@@ -3899,6 +3833,8 @@ static void raid10_free(struct mddev *mddev, void *priv)
         kfree(conf->mirrors);
         kfree(conf->mirrors_old);
         kfree(conf->mirrors_new);
+        if (conf->bio_split)
+                bioset_free(conf->bio_split);
         kfree(conf);
 }
 
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 3162615e57bd..735ce1a3d260 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -82,6 +82,7 @@ struct r10conf {
         mempool_t *r10bio_pool;
         mempool_t *r10buf_pool;
         struct page *tmppage;
+        struct bio_set *bio_split;
 
         /* When taking over an array from a different personality, we store
          * the new thread here until we fully activate the array.