author    NeilBrown <neilb@suse.com>    2017-04-05 00:05:51 -0400
committer Shaohua Li <shli@fb.com>      2017-04-11 13:13:02 -0400
commit    fc9977dd069e4f82fcacb262652117c488647319 (patch)
tree      329ec8a20b630d9f3cfdf2596dfe71ea71368c41
parent    673ca68d93879b9ffbbed874c9e70ca6e37cab15 (diff)
md/raid10: simplify the splitting of requests.
raid10 splits requests in two different ways for two different reasons.

First, bio_split() is used to ensure the bio fits within a chunk.
Second, multiple r10bio structures are allocated to represent the
different sections that need to go to different devices, to avoid
known bad blocks.

This can be simplified to just use bio_split() once, and not to use
multiple r10bios. We delay the split until we know a maximum bio size
that can be handled with a single r10bio, and then split the bio and
queue the remainder for later handling.

As with raid1, we allocate a new bio_set to help with the splitting.
It is not correct to use fs_bio_set in a device driver.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
-rw-r--r--    drivers/md/raid10.c    164
-rw-r--r--    drivers/md/raid10.h      1
2 files changed, 51 insertions(+), 114 deletions(-)
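The core of the change is a single split-and-defer step that the read and write paths now share: once the largest request one r10bio can cover is known, the front of the bio is split off with bio_split(), the tail is chained back to the parent and resubmitted, and only the front piece is processed. Below is a minimal sketch of that pattern using the same kernel APIs the patch uses (bio_split, bio_chain, generic_make_request); the helper name split_to_max_sectors is hypothetical and not part of the patch.

#include <linux/bio.h>
#include <linux/blkdev.h>

/*
 * Hypothetical helper sketching the split-and-defer pattern adopted by
 * this patch: if the bio covers more sectors than one r10bio can handle,
 * split off the front, hand the tail back to the block layer, and
 * continue with the front piece only.
 */
static struct bio *split_to_max_sectors(struct bio *bio, int max_sectors,
                                        struct bio_set *bs)
{
        if (max_sectors < bio_sectors(bio)) {
                struct bio *split = bio_split(bio, max_sectors, GFP_NOIO, bs);

                bio_chain(split, bio);          /* tail completes through the parent bio */
                generic_make_request(bio);      /* requeue the tail for a later pass */
                bio = split;                    /* carry on with the front piece */
        }
        return bio;
}

The dedicated bio_set passed in as 'bs' corresponds to the conf->bio_split the patch allocates in setup_conf(), since (as the commit message notes) fs_bio_set must not be used from a device driver.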
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e055ec94b9a8..41845bae67be 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1127,7 +1127,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
         struct bio *read_bio;
         const int op = bio_op(bio);
         const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
-        int sectors_handled;
         int max_sectors;
         sector_t sectors;
         struct md_rdev *rdev;
@@ -1140,7 +1139,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
          */
         wait_barrier(conf);
 
-        sectors = bio_sectors(bio);
+        sectors = r10_bio->sectors;
         while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
             bio->bi_iter.bi_sector < conf->reshape_progress &&
             bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
@@ -1157,17 +1156,23 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
                 wait_barrier(conf);
         }
 
-read_again:
         rdev = read_balance(conf, r10_bio, &max_sectors);
         if (!rdev) {
                 raid_end_bio_io(r10_bio);
                 return;
         }
+        if (max_sectors < bio_sectors(bio)) {
+                struct bio *split = bio_split(bio, max_sectors,
+                                              GFP_NOIO, conf->bio_split);
+                bio_chain(split, bio);
+                generic_make_request(bio);
+                bio = split;
+                r10_bio->master_bio = bio;
+                r10_bio->sectors = max_sectors;
+        }
         slot = r10_bio->read_slot;
 
         read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
-        bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
-                 max_sectors);
 
         r10_bio->devs[slot].bio = read_bio;
         r10_bio->devs[slot].rdev = rdev;
@@ -1186,40 +1191,13 @@ read_again:
                 trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
                                       read_bio, disk_devt(mddev->gendisk),
                                       r10_bio->sector);
-        if (max_sectors < r10_bio->sectors) {
-                /*
-                 * Could not read all from this device, so we will need another
-                 * r10_bio.
-                 */
-                sectors_handled = (r10_bio->sector + max_sectors
-                                   - bio->bi_iter.bi_sector);
-                r10_bio->sectors = max_sectors;
-                inc_pending(conf);
-                bio_inc_remaining(bio);
-                /*
-                 * Cannot call generic_make_request directly as that will be
-                 * queued in __generic_make_request and subsequent
-                 * mempool_alloc might block waiting for it. so hand bio over
-                 * to raid10d.
-                 */
-                reschedule_retry(r10_bio);
-
-                r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
-                r10_bio->master_bio = bio;
-                r10_bio->sectors = bio_sectors(bio) - sectors_handled;
-                r10_bio->state = 0;
-                r10_bio->mddev = mddev;
-                r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
-                goto read_again;
-        } else
-                generic_make_request(read_bio);
+        generic_make_request(read_bio);
         return;
 }
 
 static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
                                   struct bio *bio, bool replacement,
-                                  int n_copy, int max_sectors)
+                                  int n_copy)
 {
         const int op = bio_op(bio);
         const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
@@ -1243,7 +1221,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
                 rdev = conf->mirrors[devnum].rdev;
 
         mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
-        bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
         if (replacement)
                 r10_bio->devs[n_copy].repl_bio = mbio;
         else
@@ -1294,7 +1271,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
         int i;
         struct md_rdev *blocked_rdev;
         sector_t sectors;
-        int sectors_handled;
         int max_sectors;
 
         md_write_start(mddev, bio);
@@ -1306,7 +1282,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
          */
         wait_barrier(conf);
 
-        sectors = bio_sectors(bio);
+        sectors = r10_bio->sectors;
         while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
             bio->bi_iter.bi_sector < conf->reshape_progress &&
             bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
@@ -1476,44 +1452,29 @@ retry_write:
 
         if (max_sectors < r10_bio->sectors)
                 r10_bio->sectors = max_sectors;
-        sectors_handled = r10_bio->sector + max_sectors -
-                bio->bi_iter.bi_sector;
+
+        if (r10_bio->sectors < bio_sectors(bio)) {
+                struct bio *split = bio_split(bio, r10_bio->sectors,
+                                              GFP_NOIO, conf->bio_split);
+                bio_chain(split, bio);
+                generic_make_request(bio);
+                bio = split;
+                r10_bio->master_bio = bio;
+        }
 
         atomic_set(&r10_bio->remaining, 1);
         bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
 
         for (i = 0; i < conf->copies; i++) {
                 if (r10_bio->devs[i].bio)
-                        raid10_write_one_disk(mddev, r10_bio, bio, false,
-                                              i, max_sectors);
+                        raid10_write_one_disk(mddev, r10_bio, bio, false, i);
                 if (r10_bio->devs[i].repl_bio)
-                        raid10_write_one_disk(mddev, r10_bio, bio, true,
-                                              i, max_sectors);
-        }
-
-        /* Don't remove the bias on 'remaining' (one_write_done) until
-         * after checking if we need to go around again.
-         */
-
-        if (sectors_handled < bio_sectors(bio)) {
-                /* We need another r10_bio and it needs to be counted */
-                inc_pending(conf);
-                bio_inc_remaining(bio);
-                one_write_done(r10_bio);
-                r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
-                r10_bio->master_bio = bio;
-                r10_bio->sectors = bio_sectors(bio) - sectors_handled;
-
-                r10_bio->mddev = mddev;
-                r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
-                r10_bio->state = 0;
-                goto retry_write;
+                        raid10_write_one_disk(mddev, r10_bio, bio, true, i);
         }
         one_write_done(r10_bio);
 }
 
-static void __make_request(struct mddev *mddev, struct bio *bio)
+static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
 {
         struct r10conf *conf = mddev->private;
         struct r10bio *r10_bio;
@@ -1521,7 +1482,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
         r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
         r10_bio->master_bio = bio;
-        r10_bio->sectors = bio_sectors(bio);
+        r10_bio->sectors = sectors;
 
         r10_bio->mddev = mddev;
         r10_bio->sector = bio->bi_iter.bi_sector;
@@ -1538,54 +1499,26 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio)
         struct r10conf *conf = mddev->private;
         sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
         int chunk_sects = chunk_mask + 1;
-
-        struct bio *split;
+        int sectors = bio_sectors(bio);
 
         if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
                 md_flush_request(mddev, bio);
                 return;
         }
 
-        do {
-
-                /*
-                 * If this request crosses a chunk boundary, we need to split
-                 * it.
-                 */
-                if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
-                             bio_sectors(bio) > chunk_sects
-                             && (conf->geo.near_copies < conf->geo.raid_disks
-                                 || conf->prev.near_copies <
-                                 conf->prev.raid_disks))) {
-                        split = bio_split(bio, chunk_sects -
-                                          (bio->bi_iter.bi_sector &
-                                           (chunk_sects - 1)),
-                                          GFP_NOIO, fs_bio_set);
-                        bio_chain(split, bio);
-                } else {
-                        split = bio;
-                }
-
-                /*
-                 * If a bio is splitted, the first part of bio will pass
-                 * barrier but the bio is queued in current->bio_list (see
-                 * generic_make_request). If there is a raise_barrier() called
-                 * here, the second part of bio can't pass barrier. But since
-                 * the first part bio isn't dispatched to underlaying disks
-                 * yet, the barrier is never released, hence raise_barrier will
-                 * alays wait. We have a deadlock.
-                 * Note, this only happens in read path. For write path, the
-                 * first part of bio is dispatched in a schedule() call
-                 * (because of blk plug) or offloaded to raid10d.
-                 * Quitting from the function immediately can change the bio
-                 * order queued in bio_list and avoid the deadlock.
-                 */
-                __make_request(mddev, split);
-                if (split != bio && bio_data_dir(bio) == READ) {
-                        generic_make_request(bio);
-                        break;
-                }
-        } while (split != bio);
+        /*
+         * If this request crosses a chunk boundary, we need to split
+         * it.
+         */
+        if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
+                     sectors > chunk_sects
+                     && (conf->geo.near_copies < conf->geo.raid_disks
+                         || conf->prev.near_copies <
+                         conf->prev.raid_disks)))
+                sectors = chunk_sects -
+                        (bio->bi_iter.bi_sector &
+                         (chunk_sects - 1));
+        __make_request(mddev, bio, sectors);
 
         /* In case raid10d snuck in to freeze_array */
         wake_up(&conf->wait_barrier);
@@ -2873,13 +2806,8 @@ static void raid10d(struct md_thread *thread)
                         recovery_request_write(mddev, r10_bio);
                 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
                         handle_read_error(mddev, r10_bio);
-                else {
-                        /* just a partial read to be scheduled from a
-                         * separate context
-                         */
-                        int slot = r10_bio->read_slot;
-                        generic_make_request(r10_bio->devs[slot].bio);
-                }
+                else
+                        WARN_ON_ONCE(1);
 
                 cond_resched();
                 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
@@ -3652,6 +3580,10 @@ static struct r10conf *setup_conf(struct mddev *mddev)
         if (!conf->r10bio_pool)
                 goto out;
 
+        conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+        if (!conf->bio_split)
+                goto out;
+
         calc_sectors(conf, mddev->dev_sectors);
         if (mddev->reshape_position == MaxSector) {
                 conf->prev = conf->geo;
@@ -3689,6 +3621,8 @@ static struct r10conf *setup_conf(struct mddev *mddev)
                 mempool_destroy(conf->r10bio_pool);
                 kfree(conf->mirrors);
                 safe_put_page(conf->tmppage);
+                if (conf->bio_split)
+                        bioset_free(conf->bio_split);
                 kfree(conf);
         }
         return ERR_PTR(err);
@@ -3899,6 +3833,8 @@ static void raid10_free(struct mddev *mddev, void *priv)
         kfree(conf->mirrors);
         kfree(conf->mirrors_old);
         kfree(conf->mirrors_new);
+        if (conf->bio_split)
+                bioset_free(conf->bio_split);
         kfree(conf);
 }
 
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 3162615e57bd..735ce1a3d260 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -82,6 +82,7 @@ struct r10conf {
         mempool_t               *r10bio_pool;
         mempool_t               *r10buf_pool;
         struct page             *tmppage;
+        struct bio_set          *bio_split;
 
         /* When taking over an array from a different personality, we store
          * the new thread here until we fully activate the array.