author	Mike Snitzer <snitzer@redhat.com>	2016-03-17 18:38:17 -0400
committer	Mike Snitzer <snitzer@redhat.com>	2016-05-05 15:25:52 -0400
commit	2da1610ae20e995e53658c3b10166d2ad74e30bd (patch)
tree	4bee08cc145727a6366b2d516e41d2f703e3090c
parent	20800cb3450ee44ec1827d7e8bbfd5a9dc02e6cd (diff)
dm mpath: eliminate use of spinlock in IO fast-paths
The primary motivation of this commit is to improve the scalability of DM multipath on large NUMA systems where m->lock spinlock contention has been proven to be a serious bottleneck on really fast storage.

The ability to atomically read a pointer, using lockless_dereference(), is leveraged in this commit.  But all pointer writes are still protected by the m->lock spinlock (which is fine since these all now occur in the slow-path).

The following functions no longer require the m->lock spinlock in their fast-path: multipath_busy(), __multipath_map(), and do_end_io().

And choose_pgpath() is modified to _not_ update m->current_pgpath unless it also switches the path-group.  This is done to avoid needing to take the m->lock every time __multipath_map() calls choose_pgpath().  But m->current_pgpath will be reset if it is failed via fail_path().

Suggested-by: Jeff Moyer <jmoyer@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
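For readers unfamiliar with the pattern, the sketch below illustrates the read-mostly scheme the commit message describes: the I/O fast-path dereferences the shared pointer locklessly while every pointer write still serializes on the spinlock. This is a minimal, hypothetical example, not the dm-mpath code itself; struct mpath_example, example_fast_read() and example_slow_update() are made-up names for illustration. lockless_dereference() was the kernel's dependency-ordered read primitive at the time of this commit (later folded into READ_ONCE()).

```c
#include <linux/spinlock.h>
#include <linux/compiler.h>

struct pgpath;	/* only used as an opaque pointer here */

/* Hypothetical stand-in for the relevant parts of struct multipath. */
struct mpath_example {
	spinlock_t lock;		/* still taken for every pointer write */
	struct pgpath *current_pgpath;	/* read locklessly in the fast-path */
};

/* Fast-path reader: no spinlock, just a dependency-ordered load. */
static struct pgpath *example_fast_read(struct mpath_example *m)
{
	return lockless_dereference(m->current_pgpath);
}

/* Slow-path writer: pointer updates remain serialized by m->lock. */
static void example_slow_update(struct mpath_example *m,
				struct pgpath *new_pgpath)
{
	unsigned long flags;

	spin_lock_irqsave(&m->lock, flags);
	m->current_pgpath = new_pgpath;
	spin_unlock_irqrestore(&m->lock, flags);
}
```

The key property is that readers never contend on m->lock; correctness relies on pointer assignment being atomic and on writers publishing only fully initialized objects, which is exactly why all writes stay under the spinlock in the slow-path.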
-rw-r--r--	drivers/md/dm-mpath.c	170
1 file changed, 93 insertions(+), 77 deletions(-)
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 54daf96980c2..52baf8a5b0f4 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -305,9 +305,21 @@ static int __pg_init_all_paths(struct multipath *m)
 	return atomic_read(&m->pg_init_in_progress);
 }
 
-static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
+static int pg_init_all_paths(struct multipath *m)
+{
+	int r;
+	unsigned long flags;
+
+	spin_lock_irqsave(&m->lock, flags);
+	r = __pg_init_all_paths(m);
+	spin_unlock_irqrestore(&m->lock, flags);
+
+	return r;
+}
+
+static void __switch_pg(struct multipath *m, struct priority_group *pg)
 {
-	m->current_pg = pgpath->pg;
+	m->current_pg = pg;
 
 	/* Must we initialise the PG first, and queue I/O till it's ready? */
 	if (m->hw_handler_name) {
@@ -321,26 +333,36 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
 	atomic_set(&m->pg_init_count, 0);
 }
 
-static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
-			       size_t nr_bytes)
+static struct pgpath *choose_path_in_pg(struct multipath *m,
+					struct priority_group *pg,
+					size_t nr_bytes)
 {
+	unsigned long flags;
 	struct dm_path *path;
+	struct pgpath *pgpath;
 
 	path = pg->ps.type->select_path(&pg->ps, nr_bytes);
 	if (!path)
-		return -ENXIO;
+		return ERR_PTR(-ENXIO);
 
-	m->current_pgpath = path_to_pgpath(path);
+	pgpath = path_to_pgpath(path);
 
-	if (m->current_pg != pg)
-		__switch_pg(m, m->current_pgpath);
+	if (unlikely(lockless_dereference(m->current_pg) != pg)) {
+		/* Only update current_pgpath if pg changed */
+		spin_lock_irqsave(&m->lock, flags);
+		m->current_pgpath = pgpath;
+		__switch_pg(m, pg);
+		spin_unlock_irqrestore(&m->lock, flags);
+	}
 
-	return 0;
+	return pgpath;
 }
 
-static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
+static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
 {
+	unsigned long flags;
 	struct priority_group *pg;
+	struct pgpath *pgpath;
 	bool bypassed = true;
 
 	if (!atomic_read(&m->nr_valid_paths)) {
@@ -349,16 +371,28 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
 	}
 
 	/* Were we instructed to switch PG? */
-	if (m->next_pg) {
+	if (lockless_dereference(m->next_pg)) {
+		spin_lock_irqsave(&m->lock, flags);
 		pg = m->next_pg;
+		if (!pg) {
+			spin_unlock_irqrestore(&m->lock, flags);
+			goto check_current_pg;
+		}
 		m->next_pg = NULL;
-		if (!__choose_path_in_pg(m, pg, nr_bytes))
-			return;
+		spin_unlock_irqrestore(&m->lock, flags);
+		pgpath = choose_path_in_pg(m, pg, nr_bytes);
+		if (!IS_ERR_OR_NULL(pgpath))
+			return pgpath;
 	}
 
 	/* Don't change PG until it has no remaining paths */
-	if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
-		return;
+check_current_pg:
+	pg = lockless_dereference(m->current_pg);
+	if (pg) {
+		pgpath = choose_path_in_pg(m, pg, nr_bytes);
+		if (!IS_ERR_OR_NULL(pgpath))
+			return pgpath;
+	}
 
 	/*
 	 * Loop through priority groups until we find a valid path.
@@ -370,31 +404,34 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
 		list_for_each_entry(pg, &m->priority_groups, list) {
 			if (pg->bypassed == bypassed)
 				continue;
-			if (!__choose_path_in_pg(m, pg, nr_bytes)) {
+			pgpath = choose_path_in_pg(m, pg, nr_bytes);
+			if (!IS_ERR_OR_NULL(pgpath)) {
 				if (!bypassed)
 					set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
-				return;
+				return pgpath;
 			}
 		}
 	} while (bypassed--);
 
 failed:
+	spin_lock_irqsave(&m->lock, flags);
 	m->current_pgpath = NULL;
 	m->current_pg = NULL;
+	spin_unlock_irqrestore(&m->lock, flags);
+
+	return NULL;
 }
 
 /*
  * Check whether bios must be queued in the device-mapper core rather
  * than here in the target.
  *
- * m->lock must be held on entry.
- *
  * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
  * same value then we are not between multipath_presuspend()
  * and multipath_resume() calls and we have no need to check
  * for the DMF_NOFLUSH_SUSPENDING flag.
  */
-static int __must_push_back(struct multipath *m)
+static int must_push_back(struct multipath *m)
 {
 	return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) ||
 		((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) !=
@@ -416,36 +453,31 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 	struct block_device *bdev;
 	struct dm_mpath_io *mpio;
 
-	spin_lock_irq(&m->lock);
-
 	/* Do we need to select a new pgpath? */
-	if (!m->current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
-		__choose_pgpath(m, nr_bytes);
-
-	pgpath = m->current_pgpath;
+	pgpath = lockless_dereference(m->current_pgpath);
+	if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
+		pgpath = choose_pgpath(m, nr_bytes);
 
 	if (!pgpath) {
-		if (!__must_push_back(m))
+		if (!must_push_back(m))
 			r = -EIO;	/* Failed */
-		goto out_unlock;
+		return r;
 	} else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
 		   test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
-		__pg_init_all_paths(m);
-		goto out_unlock;
+		pg_init_all_paths(m);
+		return r;
 	}
 
 	mpio = set_mpio(m, map_context);
 	if (!mpio)
 		/* ENOMEM, requeue */
-		goto out_unlock;
+		return r;
 
 	mpio->pgpath = pgpath;
 	mpio->nr_bytes = nr_bytes;
 
 	bdev = pgpath->path.dev->bdev;
 
-	spin_unlock_irq(&m->lock);
-
 	if (clone) {
 		/*
 		 * Old request-based interface: allocated clone is passed in.
@@ -477,11 +509,6 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 					      &pgpath->path,
 					      nr_bytes);
 	return DM_MAPIO_REMAPPED;
-
-out_unlock:
-	spin_unlock_irq(&m->lock);
-
-	return r;
 }
 
 static int multipath_map(struct dm_target *ti, struct request *clone,
@@ -1308,7 +1335,6 @@ static int do_end_io(struct multipath *m, struct request *clone,
 	 * clone bios for it and resubmit it later.
 	 */
 	int r = DM_ENDIO_REQUEUE;
-	unsigned long flags;
 
 	if (!error && !clone->errors)
 		return 0;	/* I/O complete */
@@ -1319,17 +1345,15 @@ static int do_end_io(struct multipath *m, struct request *clone,
 	if (mpio->pgpath)
 		fail_path(mpio->pgpath);
 
-	spin_lock_irqsave(&m->lock, flags);
 	if (!atomic_read(&m->nr_valid_paths)) {
 		if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
-			if (!__must_push_back(m))
+			if (!must_push_back(m))
 				r = -EIO;
 		} else {
 			if (error == -EBADE)
 				r = error;
 		}
 	}
-	spin_unlock_irqrestore(&m->lock, flags);
 
 	return r;
 }
@@ -1586,18 +1610,17 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
 				   struct block_device **bdev, fmode_t *mode)
 {
 	struct multipath *m = ti->private;
-	unsigned long flags;
+	struct pgpath *current_pgpath;
 	int r;
 
-	spin_lock_irqsave(&m->lock, flags);
-
-	if (!m->current_pgpath)
-		__choose_pgpath(m, 0);
+	current_pgpath = lockless_dereference(m->current_pgpath);
+	if (!current_pgpath)
+		current_pgpath = choose_pgpath(m, 0);
 
-	if (m->current_pgpath) {
+	if (current_pgpath) {
 		if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) {
-			*bdev = m->current_pgpath->path.dev->bdev;
-			*mode = m->current_pgpath->path.dev->mode;
+			*bdev = current_pgpath->path.dev->bdev;
+			*mode = current_pgpath->path.dev->mode;
 			r = 0;
 		} else {
 			/* pg_init has not started or completed */
@@ -1611,17 +1634,13 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
 			r = -EIO;
 	}
 
-	spin_unlock_irqrestore(&m->lock, flags);
-
 	if (r == -ENOTCONN) {
-		spin_lock_irqsave(&m->lock, flags);
-		if (!m->current_pg) {
+		if (!lockless_dereference(m->current_pg)) {
 			/* Path status changed, redo selection */
-			__choose_pgpath(m, 0);
+			(void) choose_pgpath(m, 0);
 		}
 		if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
-			__pg_init_all_paths(m);
-		spin_unlock_irqrestore(&m->lock, flags);
+			pg_init_all_paths(m);
 		dm_table_run_md_queue_async(m->ti->table);
 	}
 
@@ -1672,39 +1691,37 @@ static int multipath_busy(struct dm_target *ti)
 {
 	bool busy = false, has_active = false;
 	struct multipath *m = ti->private;
-	struct priority_group *pg;
+	struct priority_group *pg, *next_pg;
 	struct pgpath *pgpath;
-	unsigned long flags;
-
-	spin_lock_irqsave(&m->lock, flags);
 
 	/* pg_init in progress or no paths available */
 	if (atomic_read(&m->pg_init_in_progress) ||
-	    (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) {
-		busy = true;
-		goto out;
-	}
+	    (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)))
+		return true;
+
 	/* Guess which priority_group will be used at next mapping time */
-	if (unlikely(!m->current_pgpath && m->next_pg))
-		pg = m->next_pg;
-	else if (likely(m->current_pg))
-		pg = m->current_pg;
-	else
+	pg = lockless_dereference(m->current_pg);
+	next_pg = lockless_dereference(m->next_pg);
+	if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg))
+		pg = next_pg;
+
+	if (!pg) {
 		/*
 		 * We don't know which pg will be used at next mapping time.
-		 * We don't call __choose_pgpath() here to avoid to trigger
+		 * We don't call choose_pgpath() here to avoid to trigger
 		 * pg_init just by busy checking.
 		 * So we don't know whether underlying devices we will be using
 		 * at next mapping time are busy or not. Just try mapping.
 		 */
-		goto out;
+		return busy;
+	}
 
 	/*
 	 * If there is one non-busy active path at least, the path selector
 	 * will be able to select it. So we consider such a pg as not busy.
 	 */
 	busy = true;
-	list_for_each_entry(pgpath, &pg->pgpaths, list)
+	list_for_each_entry(pgpath, &pg->pgpaths, list) {
 		if (pgpath->is_active) {
 			has_active = true;
 			if (!pgpath_busy(pgpath)) {
@@ -1712,17 +1729,16 @@ static int multipath_busy(struct dm_target *ti)
 				break;
 			}
 		}
+	}
 
-	if (!has_active)
+	if (!has_active) {
 		/*
 		 * No active path in this pg, so this pg won't be used and
 		 * the current_pg will be changed at next mapping time.
 		 * We need to try mapping to determine it.
 		 */
 		busy = false;
-
-out:
-	spin_unlock_irqrestore(&m->lock, flags);
+	}
 
 	return busy;
 }