author		Kiyoshi Ueda <k-ueda@ct.jp.nec.com>	2009-06-22 05:12:37 -0400
committer	Alasdair G Kergon <agk@redhat.com>	2009-06-22 05:12:37 -0400
commit		f40c67f0f7e2767f80f7cbcbc1ab86c4113c202e (patch)
tree		e8a20f3ceac54ac6edd0f25ffa95c0216a949c52
parent		523d9297d43cce3fa6de6474b7674329e98743b1 (diff)
dm mpath: change to be request based
This patch converts the dm-multipath target from bio-based to request-based.
Basically, the patch just converts the I/O unit from struct bio to
struct request.  In the course of the conversion, it also changes the I/O
queueing mechanism, described in detail below.

I/O queueing mechanism change
-----------------------------
In I/O submission, map_io(), there is no mechanism change from bio-based,
since the clone request is ready for retry as it is.

However, in I/O completion, do_end_io(), there is a mechanism change from
bio-based, since the clone request is not ready for retry.

In do_end_io() of bio-based, the clone bio has all the memory needed for
resubmission, so the target driver can queue it and resubmit it later
without any memory allocation.  That mechanism has almost no overhead.

On the other hand, in do_end_io() of request-based, the clone request
doesn't have clone bios, so the target driver can't resubmit it as it is.
To resubmit the clone request, memory for clone bios would have to be
allocated, which adds overhead.

To avoid that overhead just for queueing, the target driver doesn't queue
the clone request inside itself.  Instead, it asks dm core to queue and
remap the original request of the clone request, since the only cost of
queueing there is freeing the memory of the clone request.

As a result, the target driver no longer needs to record and restore the
information of the original request in order to resubmit the clone
request, so dm_bio_details is removed from dm_mpath_io.

multipath_busy()
----------------
The target driver returns "busy" only when both of the following hold:
  o the target driver will map I/Os if its map function is called, and
  o the mapped I/Os would wait on the underlying devices' queues because
    those queues are congested.

In all other cases, the target driver doesn't return "busy".  Otherwise,
dm core would hold back the I/Os and the target driver couldn't do what
it wants with them (e.g. when the target driver can't map I/Os now, it
may want to kill them instead).

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Acked-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
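As a rough illustration of the end_io policy described above, here is a
minimal, self-contained user-space sketch of the decision flow.  It is not
kernel code and not part of the patch; the struct, the field names and the
numeric return values are simplified stand-ins for the real dm definitions
such as DM_ENDIO_REQUEUE, queue_if_no_path and __must_push_back().

#include <stdio.h>

/*
 * Toy model of the request-based end_io policy described above.
 * NOT kernel code: types and return values are simplified stand-ins.
 */

enum endio_result {
	ENDIO_DONE    = 0,	/* I/O completed successfully */
	ENDIO_REQUEUE = 1,	/* ask dm core to requeue the original request */
	ENDIO_FAIL    = -5	/* fail the I/O (stand-in for -EIO) */
};

struct mpath_state {
	int nr_valid_paths;	/* usable paths remaining */
	int queue_if_no_path;	/* queue_if_no_path feature enabled */
	int must_push_back;	/* noflush suspend wants the I/O pushed back */
};

/*
 * On error, the target never queues the clone itself (that would require
 * re-allocating bio clones); it either fails the I/O or returns
 * ENDIO_REQUEUE so that dm core requeues, remaps and resubmits the
 * original request later.
 */
static enum endio_result end_io_decision(const struct mpath_state *m, int error)
{
	if (!error)
		return ENDIO_DONE;
	if (!m->nr_valid_paths && !m->queue_if_no_path && !m->must_push_back)
		return ENDIO_FAIL;
	return ENDIO_REQUEUE;
}

int main(void)
{
	struct mpath_state m = { 0, 0, 0 };

	printf("no path, no queueing: %d\n", end_io_decision(&m, -5));
	m.queue_if_no_path = 1;
	printf("no path, queue_if_no_path set: %d\n", end_io_decision(&m, -5));
	return 0;
}

Built with any C compiler, the first call prints -5 (fail the I/O) and the
second prints 1 (hand the original request back to dm core for requeueing),
mirroring the do_end_io() hunk further down.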
-rw-r--r--	drivers/md/dm-mpath.c	193
1 file changed, 128 insertions, 65 deletions
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f8aeaaa54afe..c70604a20897 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -8,7 +8,6 @@
 #include <linux/device-mapper.h>
 
 #include "dm-path-selector.h"
-#include "dm-bio-record.h"
 #include "dm-uevent.h"
 
 #include <linux/ctype.h>
@@ -83,7 +82,7 @@ struct multipath {
 	unsigned pg_init_count;		/* Number of times pg_init called */
 
 	struct work_struct process_queued_ios;
-	struct bio_list queued_ios;
+	struct list_head queued_ios;
 	unsigned queue_size;
 
 	struct work_struct trigger_event;
@@ -100,7 +99,6 @@ struct multipath {
  */
 struct dm_mpath_io {
 	struct pgpath *pgpath;
-	struct dm_bio_details details;
 	size_t nr_bytes;
 };
 
@@ -194,6 +192,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
 	m = kzalloc(sizeof(*m), GFP_KERNEL);
 	if (m) {
 		INIT_LIST_HEAD(&m->priority_groups);
+		INIT_LIST_HEAD(&m->queued_ios);
 		spin_lock_init(&m->lock);
 		m->queue_io = 1;
 		INIT_WORK(&m->process_queued_ios, process_queued_ios);
@@ -318,13 +317,14 @@ static int __must_push_back(struct multipath *m)
 		 dm_noflush_suspending(m->ti));
 }
 
-static int map_io(struct multipath *m, struct bio *bio,
+static int map_io(struct multipath *m, struct request *clone,
 		  struct dm_mpath_io *mpio, unsigned was_queued)
 {
 	int r = DM_MAPIO_REMAPPED;
-	size_t nr_bytes = bio->bi_size;
+	size_t nr_bytes = blk_rq_bytes(clone);
 	unsigned long flags;
 	struct pgpath *pgpath;
+	struct block_device *bdev;
 
 	spin_lock_irqsave(&m->lock, flags);
 
@@ -341,16 +341,18 @@ static int map_io(struct multipath *m, struct bio *bio,
 	if ((pgpath && m->queue_io) ||
 	    (!pgpath && m->queue_if_no_path)) {
 		/* Queue for the daemon to resubmit */
-		bio_list_add(&m->queued_ios, bio);
+		list_add_tail(&clone->queuelist, &m->queued_ios);
 		m->queue_size++;
 		if ((m->pg_init_required && !m->pg_init_in_progress) ||
 		    !m->queue_io)
 			queue_work(kmultipathd, &m->process_queued_ios);
 		pgpath = NULL;
 		r = DM_MAPIO_SUBMITTED;
-	} else if (pgpath)
-		bio->bi_bdev = pgpath->path.dev->bdev;
-	else if (__must_push_back(m))
+	} else if (pgpath) {
+		bdev = pgpath->path.dev->bdev;
+		clone->q = bdev_get_queue(bdev);
+		clone->rq_disk = bdev->bd_disk;
+	} else if (__must_push_back(m))
 		r = DM_MAPIO_REQUEUE;
 	else
 		r = -EIO;	/* Failed */
@@ -398,30 +400,31 @@ static void dispatch_queued_ios(struct multipath *m)
 {
 	int r;
 	unsigned long flags;
-	struct bio *bio = NULL, *next;
 	struct dm_mpath_io *mpio;
 	union map_info *info;
+	struct request *clone, *n;
+	LIST_HEAD(cl);
 
 	spin_lock_irqsave(&m->lock, flags);
-	bio = bio_list_get(&m->queued_ios);
+	list_splice_init(&m->queued_ios, &cl);
 	spin_unlock_irqrestore(&m->lock, flags);
 
-	while (bio) {
-		next = bio->bi_next;
-		bio->bi_next = NULL;
+	list_for_each_entry_safe(clone, n, &cl, queuelist) {
+		list_del_init(&clone->queuelist);
 
-		info = dm_get_mapinfo(bio);
+		info = dm_get_rq_mapinfo(clone);
 		mpio = info->ptr;
 
-		r = map_io(m, bio, mpio, 1);
-		if (r < 0)
-			bio_endio(bio, r);
-		else if (r == DM_MAPIO_REMAPPED)
-			generic_make_request(bio);
-		else if (r == DM_MAPIO_REQUEUE)
-			bio_endio(bio, -EIO);
-
-		bio = next;
+		r = map_io(m, clone, mpio, 1);
+		if (r < 0) {
+			mempool_free(mpio, m->mpio_pool);
+			dm_kill_unmapped_request(clone, r);
+		} else if (r == DM_MAPIO_REMAPPED)
+			dm_dispatch_request(clone);
+		else if (r == DM_MAPIO_REQUEUE) {
+			mempool_free(mpio, m->mpio_pool);
+			dm_requeue_unmapped_request(clone);
+		}
 	}
 }
 
@@ -863,21 +866,24 @@ static void multipath_dtr(struct dm_target *ti)
 }
 
 /*
- * Map bios, recording original fields for later in case we have to resubmit
+ * Map cloned requests
  */
-static int multipath_map(struct dm_target *ti, struct bio *bio,
+static int multipath_map(struct dm_target *ti, struct request *clone,
 			 union map_info *map_context)
 {
 	int r;
 	struct dm_mpath_io *mpio;
 	struct multipath *m = (struct multipath *) ti->private;
 
-	mpio = mempool_alloc(m->mpio_pool, GFP_NOIO);
-	dm_bio_record(&mpio->details, bio);
+	mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
+	if (!mpio)
+		/* ENOMEM, requeue */
+		return DM_MAPIO_REQUEUE;
+	memset(mpio, 0, sizeof(*mpio));
 
 	map_context->ptr = mpio;
-	bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT);
-	r = map_io(m, bio, mpio, 0);
+	clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
+	r = map_io(m, clone, mpio, 0);
 	if (r < 0 || r == DM_MAPIO_REQUEUE)
 		mempool_free(mpio, m->mpio_pool);
 
@@ -1158,53 +1164,41 @@ static void activate_path(struct work_struct *work)
 /*
  * end_io handling
  */
-static int do_end_io(struct multipath *m, struct bio *bio,
+static int do_end_io(struct multipath *m, struct request *clone,
 		     int error, struct dm_mpath_io *mpio)
 {
+	/*
+	 * We don't queue any clone request inside the multipath target
+	 * during end I/O handling, since those clone requests don't have
+	 * bio clones.  If we queue them inside the multipath target,
+	 * we need to make bio clones, that requires memory allocation.
+	 * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
+	 *  don't have bio clones.)
+	 * Instead of queueing the clone request here, we queue the original
+	 * request into dm core, which will remake a clone request and
+	 * clone bios for it and resubmit it later.
+	 */
+	int r = DM_ENDIO_REQUEUE;
 	unsigned long flags;
 
-	if (!error)
+	if (!error && !clone->errors)
 		return 0;	/* I/O complete */
 
-	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
-		return error;
-
 	if (error == -EOPNOTSUPP)
 		return error;
 
-	spin_lock_irqsave(&m->lock, flags);
-	if (!m->nr_valid_paths) {
-		if (__must_push_back(m)) {
-			spin_unlock_irqrestore(&m->lock, flags);
-			return DM_ENDIO_REQUEUE;
-		} else if (!m->queue_if_no_path) {
-			spin_unlock_irqrestore(&m->lock, flags);
-			return -EIO;
-		} else {
-			spin_unlock_irqrestore(&m->lock, flags);
-			goto requeue;
-		}
-	}
-	spin_unlock_irqrestore(&m->lock, flags);
-
 	if (mpio->pgpath)
 		fail_path(mpio->pgpath);
 
- requeue:
-	dm_bio_restore(&mpio->details, bio);
-
-	/* queue for the daemon to resubmit or fail */
 	spin_lock_irqsave(&m->lock, flags);
-	bio_list_add(&m->queued_ios, bio);
-	m->queue_size++;
-	if (!m->queue_io)
-		queue_work(kmultipathd, &m->process_queued_ios);
+	if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m))
+		r = -EIO;
 	spin_unlock_irqrestore(&m->lock, flags);
 
-	return DM_ENDIO_INCOMPLETE;	/* io not complete */
+	return r;
 }
 
-static int multipath_end_io(struct dm_target *ti, struct bio *bio,
+static int multipath_end_io(struct dm_target *ti, struct request *clone,
 			    int error, union map_info *map_context)
 {
 	struct multipath *m = ti->private;
@@ -1213,14 +1207,13 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio,
 	struct path_selector *ps;
 	int r;
 
-	r = do_end_io(m, bio, error, mpio);
+	r = do_end_io(m, clone, error, mpio);
 	if (pgpath) {
 		ps = &pgpath->pg->ps;
 		if (ps->type->end_io)
 			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
 	}
-	if (r != DM_ENDIO_INCOMPLETE)
-		mempool_free(mpio, m->mpio_pool);
+	mempool_free(mpio, m->mpio_pool);
 
 	return r;
 }
@@ -1470,6 +1463,75 @@ out:
 	return ret;
 }
 
+static int __pgpath_busy(struct pgpath *pgpath)
+{
+	struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
+
+	return dm_underlying_device_busy(q);
+}
+
+/*
+ * We return "busy", only when we can map I/Os but underlying devices
+ * are busy (so even if we map I/Os now, the I/Os will wait on
+ * the underlying queue).
+ * In other words, if we want to kill I/Os or queue them inside us
+ * due to map unavailability, we don't return "busy".  Otherwise,
+ * dm core won't give us the I/Os and we can't do what we want.
+ */
+static int multipath_busy(struct dm_target *ti)
+{
+	int busy = 0, has_active = 0;
+	struct multipath *m = ti->private;
+	struct priority_group *pg;
+	struct pgpath *pgpath;
+	unsigned long flags;
+
+	spin_lock_irqsave(&m->lock, flags);
+
+	/* Guess which priority_group will be used at next mapping time */
+	if (unlikely(!m->current_pgpath && m->next_pg))
+		pg = m->next_pg;
+	else if (likely(m->current_pg))
+		pg = m->current_pg;
+	else
+		/*
+		 * We don't know which pg will be used at next mapping time.
+		 * We don't call __choose_pgpath() here to avoid to trigger
+		 * pg_init just by busy checking.
+		 * So we don't know whether underlying devices we will be using
+		 * at next mapping time are busy or not.  Just try mapping.
+		 */
+		goto out;
+
+	/*
+	 * If there is one non-busy active path at least, the path selector
+	 * will be able to select it.  So we consider such a pg as not busy.
+	 */
+	busy = 1;
+	list_for_each_entry(pgpath, &pg->pgpaths, list)
+		if (pgpath->is_active) {
+			has_active = 1;
+
+			if (!__pgpath_busy(pgpath)) {
+				busy = 0;
+				break;
+			}
+		}
+
+	if (!has_active)
+		/*
+		 * No active path in this pg, so this pg won't be used and
+		 * the current_pg will be changed at next mapping time.
+		 * We need to try mapping to determine it.
+		 */
+		busy = 0;
+
+out:
+	spin_unlock_irqrestore(&m->lock, flags);
+
+	return busy;
+}
+
 /*-----------------------------------------------------------------
  * Module setup
  *---------------------------------------------------------------*/
@@ -1479,14 +1541,15 @@ static struct target_type multipath_target = {
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.dtr = multipath_dtr,
-	.map = multipath_map,
-	.end_io = multipath_end_io,
+	.map_rq = multipath_map,
+	.rq_end_io = multipath_end_io,
 	.presuspend = multipath_presuspend,
 	.resume = multipath_resume,
 	.status = multipath_status,
 	.message = multipath_message,
 	.ioctl = multipath_ioctl,
 	.iterate_devices = multipath_iterate_devices,
+	.busy = multipath_busy,
 };
 
 static int __init dm_multipath_init(void)