author    Mike Snitzer <snitzer@redhat.com>    2016-03-17 18:38:17 -0400
committer Mike Snitzer <snitzer@redhat.com>    2016-05-05 15:25:52 -0400
commit    2da1610ae20e995e53658c3b10166d2ad74e30bd (patch)
tree      4bee08cc145727a6366b2d516e41d2f703e3090c
parent    20800cb3450ee44ec1827d7e8bbfd5a9dc02e6cd (diff)
dm mpath: eliminate use of spinlock in IO fast-paths
The primary motivation of this commit is to improve the scalability of
DM multipath on large NUMA systems where m->lock spinlock contention has
been proven to be a serious bottleneck on really fast storage.
The ability to atomically read a pointer, using lockless_dereference(),
is leveraged in this commit. But all pointer writes are still protected
by the m->lock spinlock (which is fine since these all now occur in the
slow-path).
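
As a rough, self-contained sketch of that scheme (the struct and helper
names below are invented for illustration; only the pattern matches the
patch):

	/*
	 * Illustrative sketch only -- not code from this patch.
	 * Readers do a dependency-ordered load with no lock; writers
	 * still serialize on the spinlock.
	 */
	#include <linux/compiler.h>	/* lockless_dereference() (pre-4.15 kernels) */
	#include <linux/spinlock.h>

	struct pgpath;			/* opaque here; defined in dm-mpath.c */

	struct example_mpath {
		spinlock_t lock;
		struct pgpath *current_pgpath;	/* written only under ->lock */
	};

	/* Fast path: no lock taken, just an ordered pointer read. */
	static struct pgpath *example_read_current(struct example_mpath *m)
	{
		return lockless_dereference(m->current_pgpath);
	}

	/* Slow path: every writer of the pointer still takes the lock. */
	static void example_update_current(struct example_mpath *m,
					   struct pgpath *pgpath)
	{
		unsigned long flags;

		spin_lock_irqsave(&m->lock, flags);
		m->current_pgpath = pgpath;
		spin_unlock_irqrestore(&m->lock, flags);
	}

Because every store to the pointer happens under m->lock, writers cannot
race with each other, and a lockless reader sees either NULL or a pgpath
that remains valid while the map is live.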
The following functions no longer require the m->lock spinlock in their
fast-path: multipath_busy(), __multipath_map(), and do_end_io().
And choose_pgpath() is modified to _not_ update m->current_pgpath unless
it also switches the path-group. This is done to avoid needing to take
the m->lock every time __multipath_map() calls choose_pgpath().
But m->current_pgpath will be reset if it is failed via fail_path().
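
The resulting check in choose_path_in_pg() (excerpted from the
corresponding hunk below) boils down to taking m->lock only when the
priority group actually changes:

	pgpath = path_to_pgpath(path);

	if (unlikely(lockless_dereference(m->current_pg) != pg)) {
		/* Only update current_pgpath if pg changed */
		spin_lock_irqsave(&m->lock, flags);
		m->current_pgpath = pgpath;
		__switch_pg(m, pg);
		spin_unlock_irqrestore(&m->lock, flags);
	}

	return pgpath;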
Suggested-by: Jeff Moyer <jmoyer@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
-rw-r--r--	drivers/md/dm-mpath.c | 170
1 file changed, 93 insertions, 77 deletions
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 54daf96980c2..52baf8a5b0f4 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -305,9 +305,21 @@ static int __pg_init_all_paths(struct multipath *m)
 	return atomic_read(&m->pg_init_in_progress);
 }
 
-static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
+static int pg_init_all_paths(struct multipath *m)
+{
+	int r;
+	unsigned long flags;
+
+	spin_lock_irqsave(&m->lock, flags);
+	r = __pg_init_all_paths(m);
+	spin_unlock_irqrestore(&m->lock, flags);
+
+	return r;
+}
+
+static void __switch_pg(struct multipath *m, struct priority_group *pg)
 {
-	m->current_pg = pgpath->pg;
+	m->current_pg = pg;
 
 	/* Must we initialise the PG first, and queue I/O till it's ready? */
 	if (m->hw_handler_name) {
@@ -321,26 +333,36 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
 	atomic_set(&m->pg_init_count, 0);
 }
 
-static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
-			       size_t nr_bytes)
+static struct pgpath *choose_path_in_pg(struct multipath *m,
+					struct priority_group *pg,
+					size_t nr_bytes)
 {
+	unsigned long flags;
 	struct dm_path *path;
+	struct pgpath *pgpath;
 
 	path = pg->ps.type->select_path(&pg->ps, nr_bytes);
 	if (!path)
-		return -ENXIO;
+		return ERR_PTR(-ENXIO);
 
-	m->current_pgpath = path_to_pgpath(path);
+	pgpath = path_to_pgpath(path);
 
-	if (m->current_pg != pg)
-		__switch_pg(m, m->current_pgpath);
+	if (unlikely(lockless_dereference(m->current_pg) != pg)) {
+		/* Only update current_pgpath if pg changed */
+		spin_lock_irqsave(&m->lock, flags);
+		m->current_pgpath = pgpath;
+		__switch_pg(m, pg);
+		spin_unlock_irqrestore(&m->lock, flags);
+	}
 
-	return 0;
+	return pgpath;
 }
 
-static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
+static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
 {
+	unsigned long flags;
 	struct priority_group *pg;
+	struct pgpath *pgpath;
 	bool bypassed = true;
 
 	if (!atomic_read(&m->nr_valid_paths)) {
@@ -349,16 +371,28 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
 	}
 
 	/* Were we instructed to switch PG? */
-	if (m->next_pg) {
+	if (lockless_dereference(m->next_pg)) {
+		spin_lock_irqsave(&m->lock, flags);
 		pg = m->next_pg;
+		if (!pg) {
+			spin_unlock_irqrestore(&m->lock, flags);
+			goto check_current_pg;
+		}
 		m->next_pg = NULL;
-		if (!__choose_path_in_pg(m, pg, nr_bytes))
-			return;
+		spin_unlock_irqrestore(&m->lock, flags);
+		pgpath = choose_path_in_pg(m, pg, nr_bytes);
+		if (!IS_ERR_OR_NULL(pgpath))
+			return pgpath;
 	}
 
 	/* Don't change PG until it has no remaining paths */
-	if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
-		return;
+check_current_pg:
+	pg = lockless_dereference(m->current_pg);
+	if (pg) {
+		pgpath = choose_path_in_pg(m, pg, nr_bytes);
+		if (!IS_ERR_OR_NULL(pgpath))
+			return pgpath;
+	}
 
 	/*
 	 * Loop through priority groups until we find a valid path.
@@ -370,31 +404,34 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
 		list_for_each_entry(pg, &m->priority_groups, list) {
 			if (pg->bypassed == bypassed)
 				continue;
-			if (!__choose_path_in_pg(m, pg, nr_bytes)) {
+			pgpath = choose_path_in_pg(m, pg, nr_bytes);
+			if (!IS_ERR_OR_NULL(pgpath)) {
 				if (!bypassed)
 					set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
-				return;
+				return pgpath;
 			}
 		}
 	} while (bypassed--);
 
 failed:
+	spin_lock_irqsave(&m->lock, flags);
 	m->current_pgpath = NULL;
 	m->current_pg = NULL;
+	spin_unlock_irqrestore(&m->lock, flags);
+
+	return NULL;
 }
 
 /*
  * Check whether bios must be queued in the device-mapper core rather
  * than here in the target.
  *
- * m->lock must be held on entry.
- *
  * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
  * same value then we are not between multipath_presuspend()
  * and multipath_resume() calls and we have no need to check
  * for the DMF_NOFLUSH_SUSPENDING flag.
  */
-static int __must_push_back(struct multipath *m)
+static int must_push_back(struct multipath *m)
 {
 	return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) ||
 		((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) !=
@@ -416,36 +453,31 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 	struct block_device *bdev;
 	struct dm_mpath_io *mpio;
 
-	spin_lock_irq(&m->lock);
-
 	/* Do we need to select a new pgpath? */
-	if (!m->current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
-		__choose_pgpath(m, nr_bytes);
-
-	pgpath = m->current_pgpath;
+	pgpath = lockless_dereference(m->current_pgpath);
+	if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
+		pgpath = choose_pgpath(m, nr_bytes);
 
 	if (!pgpath) {
-		if (!__must_push_back(m))
+		if (!must_push_back(m))
 			r = -EIO;	/* Failed */
-		goto out_unlock;
+		return r;
 	} else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
 		   test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
-		__pg_init_all_paths(m);
-		goto out_unlock;
+		pg_init_all_paths(m);
+		return r;
 	}
 
 	mpio = set_mpio(m, map_context);
 	if (!mpio)
 		/* ENOMEM, requeue */
-		goto out_unlock;
+		return r;
 
 	mpio->pgpath = pgpath;
 	mpio->nr_bytes = nr_bytes;
 
 	bdev = pgpath->path.dev->bdev;
 
-	spin_unlock_irq(&m->lock);
-
 	if (clone) {
 		/*
 		 * Old request-based interface: allocated clone is passed in.
@@ -477,11 +509,6 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 					      &pgpath->path,
 					      nr_bytes);
 	return DM_MAPIO_REMAPPED;
-
-out_unlock:
-	spin_unlock_irq(&m->lock);
-
-	return r;
 }
 
 static int multipath_map(struct dm_target *ti, struct request *clone,
@@ -1308,7 +1335,6 @@ static int do_end_io(struct multipath *m, struct request *clone,
 	 * clone bios for it and resubmit it later.
 	 */
 	int r = DM_ENDIO_REQUEUE;
-	unsigned long flags;
 
 	if (!error && !clone->errors)
 		return 0;	/* I/O complete */
@@ -1319,17 +1345,15 @@ static int do_end_io(struct multipath *m, struct request *clone,
 	if (mpio->pgpath)
 		fail_path(mpio->pgpath);
 
-	spin_lock_irqsave(&m->lock, flags);
 	if (!atomic_read(&m->nr_valid_paths)) {
 		if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
-			if (!__must_push_back(m))
+			if (!must_push_back(m))
 				r = -EIO;
 		} else {
 			if (error == -EBADE)
 				r = error;
 		}
 	}
-	spin_unlock_irqrestore(&m->lock, flags);
 
 	return r;
 }
@@ -1586,18 +1610,17 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
 				   struct block_device **bdev, fmode_t *mode)
 {
 	struct multipath *m = ti->private;
-	unsigned long flags;
+	struct pgpath *current_pgpath;
 	int r;
 
-	spin_lock_irqsave(&m->lock, flags);
-
-	if (!m->current_pgpath)
-		__choose_pgpath(m, 0);
+	current_pgpath = lockless_dereference(m->current_pgpath);
+	if (!current_pgpath)
+		current_pgpath = choose_pgpath(m, 0);
 
-	if (m->current_pgpath) {
+	if (current_pgpath) {
 		if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) {
-			*bdev = m->current_pgpath->path.dev->bdev;
-			*mode = m->current_pgpath->path.dev->mode;
+			*bdev = current_pgpath->path.dev->bdev;
+			*mode = current_pgpath->path.dev->mode;
 			r = 0;
 		} else {
 			/* pg_init has not started or completed */
@@ -1611,17 +1634,13 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
 			r = -EIO;
 	}
 
-	spin_unlock_irqrestore(&m->lock, flags);
-
 	if (r == -ENOTCONN) {
-		spin_lock_irqsave(&m->lock, flags);
-		if (!m->current_pg) {
+		if (!lockless_dereference(m->current_pg)) {
 			/* Path status changed, redo selection */
-			__choose_pgpath(m, 0);
+			(void) choose_pgpath(m, 0);
 		}
 		if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
-			__pg_init_all_paths(m);
-		spin_unlock_irqrestore(&m->lock, flags);
+			pg_init_all_paths(m);
 		dm_table_run_md_queue_async(m->ti->table);
 	}
 
@@ -1672,39 +1691,37 @@ static int multipath_busy(struct dm_target *ti)
 {
 	bool busy = false, has_active = false;
 	struct multipath *m = ti->private;
-	struct priority_group *pg;
+	struct priority_group *pg, *next_pg;
 	struct pgpath *pgpath;
-	unsigned long flags;
-
-	spin_lock_irqsave(&m->lock, flags);
 
 	/* pg_init in progress or no paths available */
 	if (atomic_read(&m->pg_init_in_progress) ||
-	    (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) {
-		busy = true;
-		goto out;
-	}
+	    (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)))
+		return true;
+
 	/* Guess which priority_group will be used at next mapping time */
-	if (unlikely(!m->current_pgpath && m->next_pg))
-		pg = m->next_pg;
-	else if (likely(m->current_pg))
-		pg = m->current_pg;
-	else
+	pg = lockless_dereference(m->current_pg);
+	next_pg = lockless_dereference(m->next_pg);
+	if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg))
+		pg = next_pg;
+
+	if (!pg) {
 		/*
 		 * We don't know which pg will be used at next mapping time.
-		 * We don't call __choose_pgpath() here to avoid to trigger
+		 * We don't call choose_pgpath() here to avoid to trigger
 		 * pg_init just by busy checking.
 		 * So we don't know whether underlying devices we will be using
 		 * at next mapping time are busy or not. Just try mapping.
 		 */
-		goto out;
+		return busy;
+	}
 
 	/*
 	 * If there is one non-busy active path at least, the path selector
 	 * will be able to select it. So we consider such a pg as not busy.
 	 */
 	busy = true;
-	list_for_each_entry(pgpath, &pg->pgpaths, list)
+	list_for_each_entry(pgpath, &pg->pgpaths, list) {
 		if (pgpath->is_active) {
 			has_active = true;
 			if (!pgpath_busy(pgpath)) {
@@ -1712,17 +1729,16 @@ static int multipath_busy(struct dm_target *ti)
 				break;
 			}
 		}
+	}
 
-	if (!has_active)
+	if (!has_active) {
 		/*
 		 * No active path in this pg, so this pg won't be used and
 		 * the current_pg will be changed at next mapping time.
 		 * We need to try mapping to determine it.
 		 */
 		busy = false;
-
-out:
-	spin_unlock_irqrestore(&m->lock, flags);
+	}
 
 	return busy;
 }