aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-05-17 19:13:00 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2016-05-17 19:13:00 -0400
commitb80fed9595513384424cd141923c9161c4b5021b (patch)
treea7ca08c40a41f157f3cb472b9bc7cfc123859d8d
parent24b9f0cf00c8e8df29a4ddfec8c139ad62753113 (diff)
parent202bae52934d4eb79ffaebf49f49b1cc64d8e40b (diff)
Merge tag 'dm-4.7-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer: - based on Jens' 'for-4.7/core' to have DM thinp's discard support use bio_inc_remaining() and the block core's new async __blkdev_issue_discard() interface - make DM multipath's fast code-paths lockless, using lockless_dereference(), to significantly improve large NUMA performance when using blk-mq. The m->lock spinlock contention was a serious bottleneck. - a few other small code cleanups and Documentation fixes * tag 'dm-4.7-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: dm thin: unroll issue_discard() to create longer discard bio chains dm thin: use __blkdev_issue_discard for async discard support dm thin: remove __bio_inc_remaining() and switch to using bio_inc_remaining() dm raid: make sure no feature flags are set in metadata dm ioctl: drop use of __GFP_REPEAT in copy_params()'s __vmalloc() call dm stats: fix spelling mistake in Documentation dm cache: update cache-policies.txt now that mq is an alias for smq dm mpath: eliminate use of spinlock in IO fast-paths dm mpath: move trigger_event member to the end of 'struct multipath' dm mpath: use atomic_t for counting members of 'struct multipath' dm mpath: switch to using bitops for state flags dm thin: Remove return statement from void function dm: remove unused mapped_device argument from free_tio()
-rw-r--r--Documentation/device-mapper/cache-policies.txt34
-rw-r--r--Documentation/device-mapper/statistics.txt2
-rw-r--r--drivers/md/dm-ioctl.c2
-rw-r--r--drivers/md/dm-mpath.c351
-rw-r--r--drivers/md/dm-raid.c7
-rw-r--r--drivers/md/dm-thin.c165
-rw-r--r--drivers/md/dm.c10
7 files changed, 298 insertions, 273 deletions
diff --git a/Documentation/device-mapper/cache-policies.txt b/Documentation/device-mapper/cache-policies.txt
index e5062ad18717..d3ca8af21a31 100644
--- a/Documentation/device-mapper/cache-policies.txt
+++ b/Documentation/device-mapper/cache-policies.txt
@@ -11,7 +11,7 @@ Every bio that is mapped by the target is referred to the policy.
11The policy can return a simple HIT or MISS or issue a migration. 11The policy can return a simple HIT or MISS or issue a migration.
12 12
13Currently there's no way for the policy to issue background work, 13Currently there's no way for the policy to issue background work,
14e.g. to start writing back dirty blocks that are going to be evicte 14e.g. to start writing back dirty blocks that are going to be evicted
15soon. 15soon.
16 16
17Because we map bios, rather than requests it's easy for the policy 17Because we map bios, rather than requests it's easy for the policy
@@ -48,7 +48,7 @@ with the multiqueue (mq) policy.
48 48
49The smq policy (vs mq) offers the promise of less memory utilization, 49The smq policy (vs mq) offers the promise of less memory utilization,
50improved performance and increased adaptability in the face of changing 50improved performance and increased adaptability in the face of changing
51workloads. SMQ also does not have any cumbersome tuning knobs. 51workloads. smq also does not have any cumbersome tuning knobs.
52 52
53Users may switch from "mq" to "smq" simply by appropriately reloading a 53Users may switch from "mq" to "smq" simply by appropriately reloading a
54DM table that is using the cache target. Doing so will cause all of the 54DM table that is using the cache target. Doing so will cause all of the
@@ -57,47 +57,45 @@ degrade slightly until smq recalculates the origin device's hotspots
57that should be cached. 57that should be cached.
58 58
59Memory usage: 59Memory usage:
60The mq policy uses a lot of memory; 88 bytes per cache block on a 64 60The mq policy used a lot of memory; 88 bytes per cache block on a 64
61bit machine. 61bit machine.
62 62
63SMQ uses 28bit indexes to implement it's data structures rather than 63smq uses 28bit indexes to implement it's data structures rather than
64pointers. It avoids storing an explicit hit count for each block. It 64pointers. It avoids storing an explicit hit count for each block. It
65has a 'hotspot' queue rather than a pre cache which uses a quarter of 65has a 'hotspot' queue, rather than a pre-cache, which uses a quarter of
66the entries (each hotspot block covers a larger area than a single 66the entries (each hotspot block covers a larger area than a single
67cache block). 67cache block).
68 68
69All these mean smq uses ~25bytes per cache block. Still a lot of 69All this means smq uses ~25bytes per cache block. Still a lot of
70memory, but a substantial improvement nontheless. 70memory, but a substantial improvement nontheless.
71 71
72Level balancing: 72Level balancing:
73MQ places entries in different levels of the multiqueue structures 73mq placed entries in different levels of the multiqueue structures
74based on their hit count (~ln(hit count)). This means the bottom 74based on their hit count (~ln(hit count)). This meant the bottom
75levels generally have the most entries, and the top ones have very 75levels generally had the most entries, and the top ones had very
76few. Having unbalanced levels like this reduces the efficacy of the 76few. Having unbalanced levels like this reduced the efficacy of the
77multiqueue. 77multiqueue.
78 78
79SMQ does not maintain a hit count, instead it swaps hit entries with 79smq does not maintain a hit count, instead it swaps hit entries with
80the least recently used entry from the level above. The over all 80the least recently used entry from the level above. The overall
81ordering being a side effect of this stochastic process. With this 81ordering being a side effect of this stochastic process. With this
82scheme we can decide how many entries occupy each multiqueue level, 82scheme we can decide how many entries occupy each multiqueue level,
83resulting in better promotion/demotion decisions. 83resulting in better promotion/demotion decisions.
84 84
85Adaptability: 85Adaptability:
86The MQ policy maintains a hit count for each cache block. For a 86The mq policy maintained a hit count for each cache block. For a
87different block to get promoted to the cache it's hit count has to 87different block to get promoted to the cache it's hit count has to
88exceed the lowest currently in the cache. This means it can take a 88exceed the lowest currently in the cache. This meant it could take a
89long time for the cache to adapt between varying IO patterns. 89long time for the cache to adapt between varying IO patterns.
90Periodically degrading the hit counts could help with this, but I
91haven't found a nice general solution.
92 90
93SMQ doesn't maintain hit counts, so a lot of this problem just goes 91smq doesn't maintain hit counts, so a lot of this problem just goes
94away. In addition it tracks performance of the hotspot queue, which 92away. In addition it tracks performance of the hotspot queue, which
95is used to decide which blocks to promote. If the hotspot queue is 93is used to decide which blocks to promote. If the hotspot queue is
96performing badly then it starts moving entries more quickly between 94performing badly then it starts moving entries more quickly between
97levels. This lets it adapt to new IO patterns very quickly. 95levels. This lets it adapt to new IO patterns very quickly.
98 96
99Performance: 97Performance:
100Testing SMQ shows substantially better performance than MQ. 98Testing smq shows substantially better performance than mq.
101 99
102cleaner 100cleaner
103------- 101-------
diff --git a/Documentation/device-mapper/statistics.txt b/Documentation/device-mapper/statistics.txt
index 6f5ef944ca4c..170ac02a1f50 100644
--- a/Documentation/device-mapper/statistics.txt
+++ b/Documentation/device-mapper/statistics.txt
@@ -205,7 +205,7 @@ statistics on them:
205 205
206 dmsetup message vol 0 @stats_create - /100 206 dmsetup message vol 0 @stats_create - /100
207 207
208Set the auxillary data string to "foo bar baz" (the escape for each 208Set the auxiliary data string to "foo bar baz" (the escape for each
209space must also be escaped, otherwise the shell will consume them): 209space must also be escaped, otherwise the shell will consume them):
210 210
211 dmsetup message vol 0 @stats_set_aux 0 foo\\ bar\\ baz 211 dmsetup message vol 0 @stats_set_aux 0 foo\\ bar\\ baz
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 2adf81d81fca..2c7ca258c4e4 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1723,7 +1723,7 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
1723 if (!dmi) { 1723 if (!dmi) {
1724 unsigned noio_flag; 1724 unsigned noio_flag;
1725 noio_flag = memalloc_noio_save(); 1725 noio_flag = memalloc_noio_save();
1726 dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL); 1726 dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL);
1727 memalloc_noio_restore(noio_flag); 1727 memalloc_noio_restore(noio_flag);
1728 if (dmi) 1728 if (dmi)
1729 *param_flags |= DM_PARAMS_VMALLOC; 1729 *param_flags |= DM_PARAMS_VMALLOC;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 677ba223e2ae..52baf8a5b0f4 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -76,26 +76,18 @@ struct multipath {
76 76
77 wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ 77 wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
78 78
79 unsigned pg_init_in_progress; /* Only one pg_init allowed at once */
80
81 unsigned nr_valid_paths; /* Total number of usable paths */
82 struct pgpath *current_pgpath; 79 struct pgpath *current_pgpath;
83 struct priority_group *current_pg; 80 struct priority_group *current_pg;
84 struct priority_group *next_pg; /* Switch to this PG if set */ 81 struct priority_group *next_pg; /* Switch to this PG if set */
85 82
86 bool queue_io:1; /* Must we queue all I/O? */ 83 unsigned long flags; /* Multipath state flags */
87 bool queue_if_no_path:1; /* Queue I/O if last path fails? */
88 bool saved_queue_if_no_path:1; /* Saved state during suspension */
89 bool retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
90 bool pg_init_disabled:1; /* pg_init is not currently allowed */
91 bool pg_init_required:1; /* pg_init needs calling? */
92 bool pg_init_delay_retry:1; /* Delay pg_init retry? */
93 84
94 unsigned pg_init_retries; /* Number of times to retry pg_init */ 85 unsigned pg_init_retries; /* Number of times to retry pg_init */
95 unsigned pg_init_count; /* Number of times pg_init called */
96 unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ 86 unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */
97 87
98 struct work_struct trigger_event; 88 atomic_t nr_valid_paths; /* Total number of usable paths */
89 atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */
90 atomic_t pg_init_count; /* Number of times pg_init called */
99 91
100 /* 92 /*
101 * We must use a mempool of dm_mpath_io structs so that we 93 * We must use a mempool of dm_mpath_io structs so that we
@@ -104,6 +96,7 @@ struct multipath {
104 mempool_t *mpio_pool; 96 mempool_t *mpio_pool;
105 97
106 struct mutex work_mutex; 98 struct mutex work_mutex;
99 struct work_struct trigger_event;
107}; 100};
108 101
109/* 102/*
@@ -122,6 +115,17 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
122static void trigger_event(struct work_struct *work); 115static void trigger_event(struct work_struct *work);
123static void activate_path(struct work_struct *work); 116static void activate_path(struct work_struct *work);
124 117
118/*-----------------------------------------------
119 * Multipath state flags.
120 *-----------------------------------------------*/
121
122#define MPATHF_QUEUE_IO 0 /* Must we queue all I/O? */
123#define MPATHF_QUEUE_IF_NO_PATH 1 /* Queue I/O if last path fails? */
124#define MPATHF_SAVED_QUEUE_IF_NO_PATH 2 /* Saved state during suspension */
125#define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3 /* If there's already a hw_handler present, don't change it. */
126#define MPATHF_PG_INIT_DISABLED 4 /* pg_init is not currently allowed */
127#define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */
128#define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */
125 129
126/*----------------------------------------------- 130/*-----------------------------------------------
127 * Allocation routines 131 * Allocation routines
@@ -189,7 +193,10 @@ static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq)
189 if (m) { 193 if (m) {
190 INIT_LIST_HEAD(&m->priority_groups); 194 INIT_LIST_HEAD(&m->priority_groups);
191 spin_lock_init(&m->lock); 195 spin_lock_init(&m->lock);
192 m->queue_io = true; 196 set_bit(MPATHF_QUEUE_IO, &m->flags);
197 atomic_set(&m->nr_valid_paths, 0);
198 atomic_set(&m->pg_init_in_progress, 0);
199 atomic_set(&m->pg_init_count, 0);
193 m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; 200 m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
194 INIT_WORK(&m->trigger_event, trigger_event); 201 INIT_WORK(&m->trigger_event, trigger_event);
195 init_waitqueue_head(&m->pg_init_wait); 202 init_waitqueue_head(&m->pg_init_wait);
@@ -274,17 +281,17 @@ static int __pg_init_all_paths(struct multipath *m)
274 struct pgpath *pgpath; 281 struct pgpath *pgpath;
275 unsigned long pg_init_delay = 0; 282 unsigned long pg_init_delay = 0;
276 283
277 if (m->pg_init_in_progress || m->pg_init_disabled) 284 if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
278 return 0; 285 return 0;
279 286
280 m->pg_init_count++; 287 atomic_inc(&m->pg_init_count);
281 m->pg_init_required = false; 288 clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
282 289
283 /* Check here to reset pg_init_required */ 290 /* Check here to reset pg_init_required */
284 if (!m->current_pg) 291 if (!m->current_pg)
285 return 0; 292 return 0;
286 293
287 if (m->pg_init_delay_retry) 294 if (test_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags))
288 pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? 295 pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
289 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); 296 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
290 list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { 297 list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
@@ -293,65 +300,99 @@ static int __pg_init_all_paths(struct multipath *m)
293 continue; 300 continue;
294 if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path, 301 if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
295 pg_init_delay)) 302 pg_init_delay))
296 m->pg_init_in_progress++; 303 atomic_inc(&m->pg_init_in_progress);
297 } 304 }
298 return m->pg_init_in_progress; 305 return atomic_read(&m->pg_init_in_progress);
306}
307
308static int pg_init_all_paths(struct multipath *m)
309{
310 int r;
311 unsigned long flags;
312
313 spin_lock_irqsave(&m->lock, flags);
314 r = __pg_init_all_paths(m);
315 spin_unlock_irqrestore(&m->lock, flags);
316
317 return r;
299} 318}
300 319
301static void __switch_pg(struct multipath *m, struct pgpath *pgpath) 320static void __switch_pg(struct multipath *m, struct priority_group *pg)
302{ 321{
303 m->current_pg = pgpath->pg; 322 m->current_pg = pg;
304 323
305 /* Must we initialise the PG first, and queue I/O till it's ready? */ 324 /* Must we initialise the PG first, and queue I/O till it's ready? */
306 if (m->hw_handler_name) { 325 if (m->hw_handler_name) {
307 m->pg_init_required = true; 326 set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
308 m->queue_io = true; 327 set_bit(MPATHF_QUEUE_IO, &m->flags);
309 } else { 328 } else {
310 m->pg_init_required = false; 329 clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
311 m->queue_io = false; 330 clear_bit(MPATHF_QUEUE_IO, &m->flags);
312 } 331 }
313 332
314 m->pg_init_count = 0; 333 atomic_set(&m->pg_init_count, 0);
315} 334}
316 335
317static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, 336static struct pgpath *choose_path_in_pg(struct multipath *m,
318 size_t nr_bytes) 337 struct priority_group *pg,
338 size_t nr_bytes)
319{ 339{
340 unsigned long flags;
320 struct dm_path *path; 341 struct dm_path *path;
342 struct pgpath *pgpath;
321 343
322 path = pg->ps.type->select_path(&pg->ps, nr_bytes); 344 path = pg->ps.type->select_path(&pg->ps, nr_bytes);
323 if (!path) 345 if (!path)
324 return -ENXIO; 346 return ERR_PTR(-ENXIO);
325 347
326 m->current_pgpath = path_to_pgpath(path); 348 pgpath = path_to_pgpath(path);
327 349
328 if (m->current_pg != pg) 350 if (unlikely(lockless_dereference(m->current_pg) != pg)) {
329 __switch_pg(m, m->current_pgpath); 351 /* Only update current_pgpath if pg changed */
352 spin_lock_irqsave(&m->lock, flags);
353 m->current_pgpath = pgpath;
354 __switch_pg(m, pg);
355 spin_unlock_irqrestore(&m->lock, flags);
356 }
330 357
331 return 0; 358 return pgpath;
332} 359}
333 360
334static void __choose_pgpath(struct multipath *m, size_t nr_bytes) 361static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
335{ 362{
363 unsigned long flags;
336 struct priority_group *pg; 364 struct priority_group *pg;
365 struct pgpath *pgpath;
337 bool bypassed = true; 366 bool bypassed = true;
338 367
339 if (!m->nr_valid_paths) { 368 if (!atomic_read(&m->nr_valid_paths)) {
340 m->queue_io = false; 369 clear_bit(MPATHF_QUEUE_IO, &m->flags);
341 goto failed; 370 goto failed;
342 } 371 }
343 372
344 /* Were we instructed to switch PG? */ 373 /* Were we instructed to switch PG? */
345 if (m->next_pg) { 374 if (lockless_dereference(m->next_pg)) {
375 spin_lock_irqsave(&m->lock, flags);
346 pg = m->next_pg; 376 pg = m->next_pg;
377 if (!pg) {
378 spin_unlock_irqrestore(&m->lock, flags);
379 goto check_current_pg;
380 }
347 m->next_pg = NULL; 381 m->next_pg = NULL;
348 if (!__choose_path_in_pg(m, pg, nr_bytes)) 382 spin_unlock_irqrestore(&m->lock, flags);
349 return; 383 pgpath = choose_path_in_pg(m, pg, nr_bytes);
384 if (!IS_ERR_OR_NULL(pgpath))
385 return pgpath;
350 } 386 }
351 387
352 /* Don't change PG until it has no remaining paths */ 388 /* Don't change PG until it has no remaining paths */
353 if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes)) 389check_current_pg:
354 return; 390 pg = lockless_dereference(m->current_pg);
391 if (pg) {
392 pgpath = choose_path_in_pg(m, pg, nr_bytes);
393 if (!IS_ERR_OR_NULL(pgpath))
394 return pgpath;
395 }
355 396
356 /* 397 /*
357 * Loop through priority groups until we find a valid path. 398 * Loop through priority groups until we find a valid path.
@@ -363,34 +404,38 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
363 list_for_each_entry(pg, &m->priority_groups, list) { 404 list_for_each_entry(pg, &m->priority_groups, list) {
364 if (pg->bypassed == bypassed) 405 if (pg->bypassed == bypassed)
365 continue; 406 continue;
366 if (!__choose_path_in_pg(m, pg, nr_bytes)) { 407 pgpath = choose_path_in_pg(m, pg, nr_bytes);
408 if (!IS_ERR_OR_NULL(pgpath)) {
367 if (!bypassed) 409 if (!bypassed)
368 m->pg_init_delay_retry = true; 410 set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
369 return; 411 return pgpath;
370 } 412 }
371 } 413 }
372 } while (bypassed--); 414 } while (bypassed--);
373 415
374failed: 416failed:
417 spin_lock_irqsave(&m->lock, flags);
375 m->current_pgpath = NULL; 418 m->current_pgpath = NULL;
376 m->current_pg = NULL; 419 m->current_pg = NULL;
420 spin_unlock_irqrestore(&m->lock, flags);
421
422 return NULL;
377} 423}
378 424
379/* 425/*
380 * Check whether bios must be queued in the device-mapper core rather 426 * Check whether bios must be queued in the device-mapper core rather
381 * than here in the target. 427 * than here in the target.
382 * 428 *
383 * m->lock must be held on entry.
384 *
385 * If m->queue_if_no_path and m->saved_queue_if_no_path hold the 429 * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
386 * same value then we are not between multipath_presuspend() 430 * same value then we are not between multipath_presuspend()
387 * and multipath_resume() calls and we have no need to check 431 * and multipath_resume() calls and we have no need to check
388 * for the DMF_NOFLUSH_SUSPENDING flag. 432 * for the DMF_NOFLUSH_SUSPENDING flag.
389 */ 433 */
390static int __must_push_back(struct multipath *m) 434static int must_push_back(struct multipath *m)
391{ 435{
392 return (m->queue_if_no_path || 436 return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) ||
393 (m->queue_if_no_path != m->saved_queue_if_no_path && 437 ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) !=
438 test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) &&
394 dm_noflush_suspending(m->ti))); 439 dm_noflush_suspending(m->ti)));
395} 440}
396 441
@@ -408,35 +453,31 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
408 struct block_device *bdev; 453 struct block_device *bdev;
409 struct dm_mpath_io *mpio; 454 struct dm_mpath_io *mpio;
410 455
411 spin_lock_irq(&m->lock);
412
413 /* Do we need to select a new pgpath? */ 456 /* Do we need to select a new pgpath? */
414 if (!m->current_pgpath || !m->queue_io) 457 pgpath = lockless_dereference(m->current_pgpath);
415 __choose_pgpath(m, nr_bytes); 458 if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
416 459 pgpath = choose_pgpath(m, nr_bytes);
417 pgpath = m->current_pgpath;
418 460
419 if (!pgpath) { 461 if (!pgpath) {
420 if (!__must_push_back(m)) 462 if (!must_push_back(m))
421 r = -EIO; /* Failed */ 463 r = -EIO; /* Failed */
422 goto out_unlock; 464 return r;
423 } else if (m->queue_io || m->pg_init_required) { 465 } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
424 __pg_init_all_paths(m); 466 test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
425 goto out_unlock; 467 pg_init_all_paths(m);
468 return r;
426 } 469 }
427 470
428 mpio = set_mpio(m, map_context); 471 mpio = set_mpio(m, map_context);
429 if (!mpio) 472 if (!mpio)
430 /* ENOMEM, requeue */ 473 /* ENOMEM, requeue */
431 goto out_unlock; 474 return r;
432 475
433 mpio->pgpath = pgpath; 476 mpio->pgpath = pgpath;
434 mpio->nr_bytes = nr_bytes; 477 mpio->nr_bytes = nr_bytes;
435 478
436 bdev = pgpath->path.dev->bdev; 479 bdev = pgpath->path.dev->bdev;
437 480
438 spin_unlock_irq(&m->lock);
439
440 if (clone) { 481 if (clone) {
441 /* 482 /*
442 * Old request-based interface: allocated clone is passed in. 483 * Old request-based interface: allocated clone is passed in.
@@ -468,11 +509,6 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
468 &pgpath->path, 509 &pgpath->path,
469 nr_bytes); 510 nr_bytes);
470 return DM_MAPIO_REMAPPED; 511 return DM_MAPIO_REMAPPED;
471
472out_unlock:
473 spin_unlock_irq(&m->lock);
474
475 return r;
476} 512}
477 513
478static int multipath_map(struct dm_target *ti, struct request *clone, 514static int multipath_map(struct dm_target *ti, struct request *clone,
@@ -503,11 +539,22 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
503 539
504 spin_lock_irqsave(&m->lock, flags); 540 spin_lock_irqsave(&m->lock, flags);
505 541
506 if (save_old_value) 542 if (save_old_value) {
507 m->saved_queue_if_no_path = m->queue_if_no_path; 543 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
544 set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
545 else
546 clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
547 } else {
548 if (queue_if_no_path)
549 set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
550 else
551 clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
552 }
553 if (queue_if_no_path)
554 set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
508 else 555 else
509 m->saved_queue_if_no_path = queue_if_no_path; 556 clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
510 m->queue_if_no_path = queue_if_no_path; 557
511 spin_unlock_irqrestore(&m->lock, flags); 558 spin_unlock_irqrestore(&m->lock, flags);
512 559
513 if (!queue_if_no_path) 560 if (!queue_if_no_path)
@@ -600,10 +647,10 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
600 goto bad; 647 goto bad;
601 } 648 }
602 649
603 if (m->retain_attached_hw_handler || m->hw_handler_name) 650 if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) || m->hw_handler_name)
604 q = bdev_get_queue(p->path.dev->bdev); 651 q = bdev_get_queue(p->path.dev->bdev);
605 652
606 if (m->retain_attached_hw_handler) { 653 if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) {
607retain: 654retain:
608 attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL); 655 attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
609 if (attached_handler_name) { 656 if (attached_handler_name) {
@@ -808,7 +855,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
808 } 855 }
809 856
810 if (!strcasecmp(arg_name, "retain_attached_hw_handler")) { 857 if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
811 m->retain_attached_hw_handler = true; 858 set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
812 continue; 859 continue;
813 } 860 }
814 861
@@ -884,6 +931,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
884 /* parse the priority groups */ 931 /* parse the priority groups */
885 while (as.argc) { 932 while (as.argc) {
886 struct priority_group *pg; 933 struct priority_group *pg;
934 unsigned nr_valid_paths = atomic_read(&m->nr_valid_paths);
887 935
888 pg = parse_priority_group(&as, m); 936 pg = parse_priority_group(&as, m);
889 if (IS_ERR(pg)) { 937 if (IS_ERR(pg)) {
@@ -891,7 +939,9 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
891 goto bad; 939 goto bad;
892 } 940 }
893 941
894 m->nr_valid_paths += pg->nr_pgpaths; 942 nr_valid_paths += pg->nr_pgpaths;
943 atomic_set(&m->nr_valid_paths, nr_valid_paths);
944
895 list_add_tail(&pg->list, &m->priority_groups); 945 list_add_tail(&pg->list, &m->priority_groups);
896 pg_count++; 946 pg_count++;
897 pg->pg_num = pg_count; 947 pg->pg_num = pg_count;
@@ -921,19 +971,14 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
921static void multipath_wait_for_pg_init_completion(struct multipath *m) 971static void multipath_wait_for_pg_init_completion(struct multipath *m)
922{ 972{
923 DECLARE_WAITQUEUE(wait, current); 973 DECLARE_WAITQUEUE(wait, current);
924 unsigned long flags;
925 974
926 add_wait_queue(&m->pg_init_wait, &wait); 975 add_wait_queue(&m->pg_init_wait, &wait);
927 976
928 while (1) { 977 while (1) {
929 set_current_state(TASK_UNINTERRUPTIBLE); 978 set_current_state(TASK_UNINTERRUPTIBLE);
930 979
931 spin_lock_irqsave(&m->lock, flags); 980 if (!atomic_read(&m->pg_init_in_progress))
932 if (!m->pg_init_in_progress) {
933 spin_unlock_irqrestore(&m->lock, flags);
934 break; 981 break;
935 }
936 spin_unlock_irqrestore(&m->lock, flags);
937 982
938 io_schedule(); 983 io_schedule();
939 } 984 }
@@ -944,20 +989,16 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m)
944 989
945static void flush_multipath_work(struct multipath *m) 990static void flush_multipath_work(struct multipath *m)
946{ 991{
947 unsigned long flags; 992 set_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
948 993 smp_mb__after_atomic();
949 spin_lock_irqsave(&m->lock, flags);
950 m->pg_init_disabled = true;
951 spin_unlock_irqrestore(&m->lock, flags);
952 994
953 flush_workqueue(kmpath_handlerd); 995 flush_workqueue(kmpath_handlerd);
954 multipath_wait_for_pg_init_completion(m); 996 multipath_wait_for_pg_init_completion(m);
955 flush_workqueue(kmultipathd); 997 flush_workqueue(kmultipathd);
956 flush_work(&m->trigger_event); 998 flush_work(&m->trigger_event);
957 999
958 spin_lock_irqsave(&m->lock, flags); 1000 clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
959 m->pg_init_disabled = false; 1001 smp_mb__after_atomic();
960 spin_unlock_irqrestore(&m->lock, flags);
961} 1002}
962 1003
963static void multipath_dtr(struct dm_target *ti) 1004static void multipath_dtr(struct dm_target *ti)
@@ -987,13 +1028,13 @@ static int fail_path(struct pgpath *pgpath)
987 pgpath->is_active = false; 1028 pgpath->is_active = false;
988 pgpath->fail_count++; 1029 pgpath->fail_count++;
989 1030
990 m->nr_valid_paths--; 1031 atomic_dec(&m->nr_valid_paths);
991 1032
992 if (pgpath == m->current_pgpath) 1033 if (pgpath == m->current_pgpath)
993 m->current_pgpath = NULL; 1034 m->current_pgpath = NULL;
994 1035
995 dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti, 1036 dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
996 pgpath->path.dev->name, m->nr_valid_paths); 1037 pgpath->path.dev->name, atomic_read(&m->nr_valid_paths));
997 1038
998 schedule_work(&m->trigger_event); 1039 schedule_work(&m->trigger_event);
999 1040
@@ -1011,6 +1052,7 @@ static int reinstate_path(struct pgpath *pgpath)
1011 int r = 0, run_queue = 0; 1052 int r = 0, run_queue = 0;
1012 unsigned long flags; 1053 unsigned long flags;
1013 struct multipath *m = pgpath->pg->m; 1054 struct multipath *m = pgpath->pg->m;
1055 unsigned nr_valid_paths;
1014 1056
1015 spin_lock_irqsave(&m->lock, flags); 1057 spin_lock_irqsave(&m->lock, flags);
1016 1058
@@ -1025,16 +1067,17 @@ static int reinstate_path(struct pgpath *pgpath)
1025 1067
1026 pgpath->is_active = true; 1068 pgpath->is_active = true;
1027 1069
1028 if (!m->nr_valid_paths++) { 1070 nr_valid_paths = atomic_inc_return(&m->nr_valid_paths);
1071 if (nr_valid_paths == 1) {
1029 m->current_pgpath = NULL; 1072 m->current_pgpath = NULL;
1030 run_queue = 1; 1073 run_queue = 1;
1031 } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { 1074 } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
1032 if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) 1075 if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
1033 m->pg_init_in_progress++; 1076 atomic_inc(&m->pg_init_in_progress);
1034 } 1077 }
1035 1078
1036 dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, 1079 dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
1037 pgpath->path.dev->name, m->nr_valid_paths); 1080 pgpath->path.dev->name, nr_valid_paths);
1038 1081
1039 schedule_work(&m->trigger_event); 1082 schedule_work(&m->trigger_event);
1040 1083
@@ -1152,8 +1195,9 @@ static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
1152 1195
1153 spin_lock_irqsave(&m->lock, flags); 1196 spin_lock_irqsave(&m->lock, flags);
1154 1197
1155 if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled) 1198 if (atomic_read(&m->pg_init_count) <= m->pg_init_retries &&
1156 m->pg_init_required = true; 1199 !test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
1200 set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
1157 else 1201 else
1158 limit_reached = true; 1202 limit_reached = true;
1159 1203
@@ -1219,19 +1263,23 @@ static void pg_init_done(void *data, int errors)
1219 m->current_pgpath = NULL; 1263 m->current_pgpath = NULL;
1220 m->current_pg = NULL; 1264 m->current_pg = NULL;
1221 } 1265 }
1222 } else if (!m->pg_init_required) 1266 } else if (!test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
1223 pg->bypassed = false; 1267 pg->bypassed = false;
1224 1268
1225 if (--m->pg_init_in_progress) 1269 if (atomic_dec_return(&m->pg_init_in_progress) > 0)
1226 /* Activations of other paths are still on going */ 1270 /* Activations of other paths are still on going */
1227 goto out; 1271 goto out;
1228 1272
1229 if (m->pg_init_required) { 1273 if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
1230 m->pg_init_delay_retry = delay_retry; 1274 if (delay_retry)
1275 set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
1276 else
1277 clear_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
1278
1231 if (__pg_init_all_paths(m)) 1279 if (__pg_init_all_paths(m))
1232 goto out; 1280 goto out;
1233 } 1281 }
1234 m->queue_io = false; 1282 clear_bit(MPATHF_QUEUE_IO, &m->flags);
1235 1283
1236 /* 1284 /*
1237 * Wake up any thread waiting to suspend. 1285 * Wake up any thread waiting to suspend.
@@ -1287,7 +1335,6 @@ static int do_end_io(struct multipath *m, struct request *clone,
1287 * clone bios for it and resubmit it later. 1335 * clone bios for it and resubmit it later.
1288 */ 1336 */
1289 int r = DM_ENDIO_REQUEUE; 1337 int r = DM_ENDIO_REQUEUE;
1290 unsigned long flags;
1291 1338
1292 if (!error && !clone->errors) 1339 if (!error && !clone->errors)
1293 return 0; /* I/O complete */ 1340 return 0; /* I/O complete */
@@ -1298,17 +1345,15 @@ static int do_end_io(struct multipath *m, struct request *clone,
1298 if (mpio->pgpath) 1345 if (mpio->pgpath)
1299 fail_path(mpio->pgpath); 1346 fail_path(mpio->pgpath);
1300 1347
1301 spin_lock_irqsave(&m->lock, flags); 1348 if (!atomic_read(&m->nr_valid_paths)) {
1302 if (!m->nr_valid_paths) { 1349 if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
1303 if (!m->queue_if_no_path) { 1350 if (!must_push_back(m))
1304 if (!__must_push_back(m))
1305 r = -EIO; 1351 r = -EIO;
1306 } else { 1352 } else {
1307 if (error == -EBADE) 1353 if (error == -EBADE)
1308 r = error; 1354 r = error;
1309 } 1355 }
1310 } 1356 }
1311 spin_unlock_irqrestore(&m->lock, flags);
1312 1357
1313 return r; 1358 return r;
1314} 1359}
@@ -1364,11 +1409,12 @@ static void multipath_postsuspend(struct dm_target *ti)
1364static void multipath_resume(struct dm_target *ti) 1409static void multipath_resume(struct dm_target *ti)
1365{ 1410{
1366 struct multipath *m = ti->private; 1411 struct multipath *m = ti->private;
1367 unsigned long flags;
1368 1412
1369 spin_lock_irqsave(&m->lock, flags); 1413 if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags))
1370 m->queue_if_no_path = m->saved_queue_if_no_path; 1414 set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
1371 spin_unlock_irqrestore(&m->lock, flags); 1415 else
1416 clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
1417 smp_mb__after_atomic();
1372} 1418}
1373 1419
1374/* 1420/*
@@ -1402,19 +1448,20 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
1402 1448
1403 /* Features */ 1449 /* Features */
1404 if (type == STATUSTYPE_INFO) 1450 if (type == STATUSTYPE_INFO)
1405 DMEMIT("2 %u %u ", m->queue_io, m->pg_init_count); 1451 DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags),
1452 atomic_read(&m->pg_init_count));
1406 else { 1453 else {
1407 DMEMIT("%u ", m->queue_if_no_path + 1454 DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) +
1408 (m->pg_init_retries > 0) * 2 + 1455 (m->pg_init_retries > 0) * 2 +
1409 (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 + 1456 (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
1410 m->retain_attached_hw_handler); 1457 test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags));
1411 if (m->queue_if_no_path) 1458 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
1412 DMEMIT("queue_if_no_path "); 1459 DMEMIT("queue_if_no_path ");
1413 if (m->pg_init_retries) 1460 if (m->pg_init_retries)
1414 DMEMIT("pg_init_retries %u ", m->pg_init_retries); 1461 DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1415 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) 1462 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1416 DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); 1463 DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1417 if (m->retain_attached_hw_handler) 1464 if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags))
1418 DMEMIT("retain_attached_hw_handler "); 1465 DMEMIT("retain_attached_hw_handler ");
1419 } 1466 }
1420 1467
@@ -1563,18 +1610,17 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
1563 struct block_device **bdev, fmode_t *mode) 1610 struct block_device **bdev, fmode_t *mode)
1564{ 1611{
1565 struct multipath *m = ti->private; 1612 struct multipath *m = ti->private;
1566 unsigned long flags; 1613 struct pgpath *current_pgpath;
1567 int r; 1614 int r;
1568 1615
1569 spin_lock_irqsave(&m->lock, flags); 1616 current_pgpath = lockless_dereference(m->current_pgpath);
1617 if (!current_pgpath)
1618 current_pgpath = choose_pgpath(m, 0);
1570 1619
1571 if (!m->current_pgpath) 1620 if (current_pgpath) {
1572 __choose_pgpath(m, 0); 1621 if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) {
1573 1622 *bdev = current_pgpath->path.dev->bdev;
1574 if (m->current_pgpath) { 1623 *mode = current_pgpath->path.dev->mode;
1575 if (!m->queue_io) {
1576 *bdev = m->current_pgpath->path.dev->bdev;
1577 *mode = m->current_pgpath->path.dev->mode;
1578 r = 0; 1624 r = 0;
1579 } else { 1625 } else {
1580 /* pg_init has not started or completed */ 1626 /* pg_init has not started or completed */
@@ -1582,23 +1628,19 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
1582 } 1628 }
1583 } else { 1629 } else {
1584 /* No path is available */ 1630 /* No path is available */
1585 if (m->queue_if_no_path) 1631 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
1586 r = -ENOTCONN; 1632 r = -ENOTCONN;
1587 else 1633 else
1588 r = -EIO; 1634 r = -EIO;
1589 } 1635 }
1590 1636
1591 spin_unlock_irqrestore(&m->lock, flags);
1592
1593 if (r == -ENOTCONN) { 1637 if (r == -ENOTCONN) {
1594 spin_lock_irqsave(&m->lock, flags); 1638 if (!lockless_dereference(m->current_pg)) {
1595 if (!m->current_pg) {
1596 /* Path status changed, redo selection */ 1639 /* Path status changed, redo selection */
1597 __choose_pgpath(m, 0); 1640 (void) choose_pgpath(m, 0);
1598 } 1641 }
1599 if (m->pg_init_required) 1642 if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
1600 __pg_init_all_paths(m); 1643 pg_init_all_paths(m);
1601 spin_unlock_irqrestore(&m->lock, flags);
1602 dm_table_run_md_queue_async(m->ti->table); 1644 dm_table_run_md_queue_async(m->ti->table);
1603 } 1645 }
1604 1646
@@ -1649,39 +1691,37 @@ static int multipath_busy(struct dm_target *ti)
1649{ 1691{
1650 bool busy = false, has_active = false; 1692 bool busy = false, has_active = false;
1651 struct multipath *m = ti->private; 1693 struct multipath *m = ti->private;
1652 struct priority_group *pg; 1694 struct priority_group *pg, *next_pg;
1653 struct pgpath *pgpath; 1695 struct pgpath *pgpath;
1654 unsigned long flags;
1655
1656 spin_lock_irqsave(&m->lock, flags);
1657 1696
1658 /* pg_init in progress or no paths available */ 1697 /* pg_init in progress or no paths available */
1659 if (m->pg_init_in_progress || 1698 if (atomic_read(&m->pg_init_in_progress) ||
1660 (!m->nr_valid_paths && m->queue_if_no_path)) { 1699 (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)))
1661 busy = true; 1700 return true;
1662 goto out; 1701
1663 }
1664 /* Guess which priority_group will be used at next mapping time */ 1702 /* Guess which priority_group will be used at next mapping time */
1665 if (unlikely(!m->current_pgpath && m->next_pg)) 1703 pg = lockless_dereference(m->current_pg);
1666 pg = m->next_pg; 1704 next_pg = lockless_dereference(m->next_pg);
1667 else if (likely(m->current_pg)) 1705 if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg))
1668 pg = m->current_pg; 1706 pg = next_pg;
1669 else 1707
1708 if (!pg) {
1670 /* 1709 /*
1671 * We don't know which pg will be used at next mapping time. 1710 * We don't know which pg will be used at next mapping time.
1672 * We don't call __choose_pgpath() here to avoid to trigger 1711 * We don't call choose_pgpath() here to avoid to trigger
1673 * pg_init just by busy checking. 1712 * pg_init just by busy checking.
1674 * So we don't know whether underlying devices we will be using 1713 * So we don't know whether underlying devices we will be using
1675 * at next mapping time are busy or not. Just try mapping. 1714 * at next mapping time are busy or not. Just try mapping.
1676 */ 1715 */
1677 goto out; 1716 return busy;
1717 }
1678 1718
1679 /* 1719 /*
1680 * If there is one non-busy active path at least, the path selector 1720 * If there is one non-busy active path at least, the path selector
1681 * will be able to select it. So we consider such a pg as not busy. 1721 * will be able to select it. So we consider such a pg as not busy.
1682 */ 1722 */
1683 busy = true; 1723 busy = true;
1684 list_for_each_entry(pgpath, &pg->pgpaths, list) 1724 list_for_each_entry(pgpath, &pg->pgpaths, list) {
1685 if (pgpath->is_active) { 1725 if (pgpath->is_active) {
1686 has_active = true; 1726 has_active = true;
1687 if (!pgpath_busy(pgpath)) { 1727 if (!pgpath_busy(pgpath)) {
@@ -1689,17 +1729,16 @@ static int multipath_busy(struct dm_target *ti)
1689 break; 1729 break;
1690 } 1730 }
1691 } 1731 }
1732 }
1692 1733
1693 if (!has_active) 1734 if (!has_active) {
1694 /* 1735 /*
1695 * No active path in this pg, so this pg won't be used and 1736 * No active path in this pg, so this pg won't be used and
1696 * the current_pg will be changed at next mapping time. 1737 * the current_pg will be changed at next mapping time.
1697 * We need to try mapping to determine it. 1738 * We need to try mapping to determine it.
1698 */ 1739 */
1699 busy = false; 1740 busy = false;
1700 1741 }
1701out:
1702 spin_unlock_irqrestore(&m->lock, flags);
1703 1742
1704 return busy; 1743 return busy;
1705} 1744}
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index a0901214aef5..52532745a50f 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1037,6 +1037,11 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
1037 if (!mddev->events && super_init_validation(mddev, rdev)) 1037 if (!mddev->events && super_init_validation(mddev, rdev))
1038 return -EINVAL; 1038 return -EINVAL;
1039 1039
1040 if (le32_to_cpu(sb->features)) {
1041 rs->ti->error = "Unable to assemble array: No feature flags supported yet";
1042 return -EINVAL;
1043 }
1044
1040 /* Enable bitmap creation for RAID levels != 0 */ 1045 /* Enable bitmap creation for RAID levels != 0 */
1041 mddev->bitmap_info.offset = (rs->raid_type->level) ? to_sector(4096) : 0; 1046 mddev->bitmap_info.offset = (rs->raid_type->level) ? to_sector(4096) : 0;
1042 rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; 1047 rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
@@ -1718,7 +1723,7 @@ static void raid_resume(struct dm_target *ti)
1718 1723
1719static struct target_type raid_target = { 1724static struct target_type raid_target = {
1720 .name = "raid", 1725 .name = "raid",
1721 .version = {1, 7, 0}, 1726 .version = {1, 8, 0},
1722 .module = THIS_MODULE, 1727 .module = THIS_MODULE,
1723 .ctr = raid_ctr, 1728 .ctr = raid_ctr,
1724 .dtr = raid_dtr, 1729 .dtr = raid_dtr,
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 92237b6fa8cd..fc803d50f9f0 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -322,56 +322,6 @@ struct thin_c {
322 322
323/*----------------------------------------------------------------*/ 323/*----------------------------------------------------------------*/
324 324
325/**
326 * __blkdev_issue_discard_async - queue a discard with async completion
327 * @bdev: blockdev to issue discard for
328 * @sector: start sector
329 * @nr_sects: number of sectors to discard
330 * @gfp_mask: memory allocation flags (for bio_alloc)
331 * @flags: BLKDEV_IFL_* flags to control behaviour
332 * @parent_bio: parent discard bio that all sub discards get chained to
333 *
334 * Description:
335 * Asynchronously issue a discard request for the sectors in question.
336 */
337static int __blkdev_issue_discard_async(struct block_device *bdev, sector_t sector,
338 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags,
339 struct bio *parent_bio)
340{
341 struct request_queue *q = bdev_get_queue(bdev);
342 int type = REQ_WRITE | REQ_DISCARD;
343 struct bio *bio;
344
345 if (!q || !nr_sects)
346 return -ENXIO;
347
348 if (!blk_queue_discard(q))
349 return -EOPNOTSUPP;
350
351 if (flags & BLKDEV_DISCARD_SECURE) {
352 if (!blk_queue_secdiscard(q))
353 return -EOPNOTSUPP;
354 type |= REQ_SECURE;
355 }
356
357 /*
358 * Required bio_put occurs in bio_endio thanks to bio_chain below
359 */
360 bio = bio_alloc(gfp_mask, 1);
361 if (!bio)
362 return -ENOMEM;
363
364 bio_chain(bio, parent_bio);
365
366 bio->bi_iter.bi_sector = sector;
367 bio->bi_bdev = bdev;
368 bio->bi_iter.bi_size = nr_sects << 9;
369
370 submit_bio(type, bio);
371
372 return 0;
373}
374
375static bool block_size_is_power_of_two(struct pool *pool) 325static bool block_size_is_power_of_two(struct pool *pool)
376{ 326{
377 return pool->sectors_per_block_shift >= 0; 327 return pool->sectors_per_block_shift >= 0;
@@ -384,14 +334,55 @@ static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
384 (b * pool->sectors_per_block); 334 (b * pool->sectors_per_block);
385} 335}
386 336
387static int issue_discard(struct thin_c *tc, dm_block_t data_b, dm_block_t data_e, 337/*----------------------------------------------------------------*/
388 struct bio *parent_bio) 338
339struct discard_op {
340 struct thin_c *tc;
341 struct blk_plug plug;
342 struct bio *parent_bio;
343 struct bio *bio;
344};
345
346static void begin_discard(struct discard_op *op, struct thin_c *tc, struct bio *parent)
347{
348 BUG_ON(!parent);
349
350 op->tc = tc;
351 blk_start_plug(&op->plug);
352 op->parent_bio = parent;
353 op->bio = NULL;
354}
355
356static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e)
389{ 357{
358 struct thin_c *tc = op->tc;
390 sector_t s = block_to_sectors(tc->pool, data_b); 359 sector_t s = block_to_sectors(tc->pool, data_b);
391 sector_t len = block_to_sectors(tc->pool, data_e - data_b); 360 sector_t len = block_to_sectors(tc->pool, data_e - data_b);
392 361
393 return __blkdev_issue_discard_async(tc->pool_dev->bdev, s, len, 362 return __blkdev_issue_discard(tc->pool_dev->bdev, s, len,
394 GFP_NOWAIT, 0, parent_bio); 363 GFP_NOWAIT, REQ_WRITE | REQ_DISCARD, &op->bio);
364}
365
366static void end_discard(struct discard_op *op, int r)
367{
368 if (op->bio) {
369 /*
370 * Even if one of the calls to issue_discard failed, we
371 * need to wait for the chain to complete.
372 */
373 bio_chain(op->bio, op->parent_bio);
374 submit_bio(REQ_WRITE | REQ_DISCARD, op->bio);
375 }
376
377 blk_finish_plug(&op->plug);
378
379 /*
380 * Even if r is set, there could be sub discards in flight that we
381 * need to wait for.
382 */
383 if (r && !op->parent_bio->bi_error)
384 op->parent_bio->bi_error = r;
385 bio_endio(op->parent_bio);
395} 386}
396 387
397/*----------------------------------------------------------------*/ 388/*----------------------------------------------------------------*/
@@ -632,7 +623,7 @@ static void error_retry_list(struct pool *pool)
632{ 623{
633 int error = get_pool_io_error_code(pool); 624 int error = get_pool_io_error_code(pool);
634 625
635 return error_retry_list_with_code(pool, error); 626 error_retry_list_with_code(pool, error);
636} 627}
637 628
638/* 629/*
@@ -1006,24 +997,28 @@ static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
1006 mempool_free(m, tc->pool->mapping_pool); 997 mempool_free(m, tc->pool->mapping_pool);
1007} 998}
1008 999
1009static int passdown_double_checking_shared_status(struct dm_thin_new_mapping *m) 1000/*----------------------------------------------------------------*/
1001
1002static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m)
1010{ 1003{
1011 /* 1004 /*
1012 * We've already unmapped this range of blocks, but before we 1005 * We've already unmapped this range of blocks, but before we
1013 * passdown we have to check that these blocks are now unused. 1006 * passdown we have to check that these blocks are now unused.
1014 */ 1007 */
1015 int r; 1008 int r = 0;
1016 bool used = true; 1009 bool used = true;
1017 struct thin_c *tc = m->tc; 1010 struct thin_c *tc = m->tc;
1018 struct pool *pool = tc->pool; 1011 struct pool *pool = tc->pool;
1019 dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin; 1012 dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
1013 struct discard_op op;
1020 1014
1015 begin_discard(&op, tc, m->bio);
1021 while (b != end) { 1016 while (b != end) {
1022 /* find start of unmapped run */ 1017 /* find start of unmapped run */
1023 for (; b < end; b++) { 1018 for (; b < end; b++) {
1024 r = dm_pool_block_is_used(pool->pmd, b, &used); 1019 r = dm_pool_block_is_used(pool->pmd, b, &used);
1025 if (r) 1020 if (r)
1026 return r; 1021 goto out;
1027 1022
1028 if (!used) 1023 if (!used)
1029 break; 1024 break;
@@ -1036,20 +1031,20 @@ static int passdown_double_checking_shared_status(struct dm_thin_new_mapping *m)
1036 for (e = b + 1; e != end; e++) { 1031 for (e = b + 1; e != end; e++) {
1037 r = dm_pool_block_is_used(pool->pmd, e, &used); 1032 r = dm_pool_block_is_used(pool->pmd, e, &used);
1038 if (r) 1033 if (r)
1039 return r; 1034 goto out;
1040 1035
1041 if (used) 1036 if (used)
1042 break; 1037 break;
1043 } 1038 }
1044 1039
1045 r = issue_discard(tc, b, e, m->bio); 1040 r = issue_discard(&op, b, e);
1046 if (r) 1041 if (r)
1047 return r; 1042 goto out;
1048 1043
1049 b = e; 1044 b = e;
1050 } 1045 }
1051 1046out:
1052 return 0; 1047 end_discard(&op, r);
1053} 1048}
1054 1049
1055static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m) 1050static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
@@ -1059,20 +1054,21 @@ static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
1059 struct pool *pool = tc->pool; 1054 struct pool *pool = tc->pool;
1060 1055
1061 r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end); 1056 r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
1062 if (r) 1057 if (r) {
1063 metadata_operation_failed(pool, "dm_thin_remove_range", r); 1058 metadata_operation_failed(pool, "dm_thin_remove_range", r);
1059 bio_io_error(m->bio);
1064 1060
1065 else if (m->maybe_shared) 1061 } else if (m->maybe_shared) {
1066 r = passdown_double_checking_shared_status(m); 1062 passdown_double_checking_shared_status(m);
1067 else 1063
1068 r = issue_discard(tc, m->data_block, m->data_block + (m->virt_end - m->virt_begin), m->bio); 1064 } else {
1065 struct discard_op op;
1066 begin_discard(&op, tc, m->bio);
1067 r = issue_discard(&op, m->data_block,
1068 m->data_block + (m->virt_end - m->virt_begin));
1069 end_discard(&op, r);
1070 }
1069 1071
1070 /*
1071 * Even if r is set, there could be sub discards in flight that we
1072 * need to wait for.
1073 */
1074 m->bio->bi_error = r;
1075 bio_endio(m->bio);
1076 cell_defer_no_holder(tc, m->cell); 1072 cell_defer_no_holder(tc, m->cell);
1077 mempool_free(m, pool->mapping_pool); 1073 mempool_free(m, pool->mapping_pool);
1078} 1074}
@@ -1494,17 +1490,6 @@ static void process_discard_cell_no_passdown(struct thin_c *tc,
1494 pool->process_prepared_discard(m); 1490 pool->process_prepared_discard(m);
1495} 1491}
1496 1492
1497/*
1498 * __bio_inc_remaining() is used to defer parent bios's end_io until
1499 * we _know_ all chained sub range discard bios have completed.
1500 */
1501static inline void __bio_inc_remaining(struct bio *bio)
1502{
1503 bio->bi_flags |= (1 << BIO_CHAIN);
1504 smp_mb__before_atomic();
1505 atomic_inc(&bio->__bi_remaining);
1506}
1507
1508static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end, 1493static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
1509 struct bio *bio) 1494 struct bio *bio)
1510{ 1495{
@@ -1554,13 +1539,13 @@ static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t
1554 1539
1555 /* 1540 /*
1556 * The parent bio must not complete before sub discard bios are 1541 * The parent bio must not complete before sub discard bios are
1557 * chained to it (see __blkdev_issue_discard_async's bio_chain)! 1542 * chained to it (see end_discard's bio_chain)!
1558 * 1543 *
1559 * This per-mapping bi_remaining increment is paired with 1544 * This per-mapping bi_remaining increment is paired with
1560 * the implicit decrement that occurs via bio_endio() in 1545 * the implicit decrement that occurs via bio_endio() in
1561 * process_prepared_discard_{passdown,no_passdown}. 1546 * end_discard().
1562 */ 1547 */
1563 __bio_inc_remaining(bio); 1548 bio_inc_remaining(bio);
1564 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) 1549 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
1565 pool->process_prepared_discard(m); 1550 pool->process_prepared_discard(m);
1566 1551
@@ -3899,7 +3884,7 @@ static struct target_type pool_target = {
3899 .name = "thin-pool", 3884 .name = "thin-pool",
3900 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 3885 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
3901 DM_TARGET_IMMUTABLE, 3886 DM_TARGET_IMMUTABLE,
3902 .version = {1, 18, 0}, 3887 .version = {1, 19, 0},
3903 .module = THIS_MODULE, 3888 .module = THIS_MODULE,
3904 .ctr = pool_ctr, 3889 .ctr = pool_ctr,
3905 .dtr = pool_dtr, 3890 .dtr = pool_dtr,
@@ -4273,7 +4258,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
4273 4258
4274static struct target_type thin_target = { 4259static struct target_type thin_target = {
4275 .name = "thin", 4260 .name = "thin",
4276 .version = {1, 18, 0}, 4261 .version = {1, 19, 0},
4277 .module = THIS_MODULE, 4262 .module = THIS_MODULE,
4278 .ctr = thin_ctr, 4263 .ctr = thin_ctr,
4279 .dtr = thin_dtr, 4264 .dtr = thin_dtr,
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3d3ac13287a4..1b2f96205361 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -674,7 +674,7 @@ static void free_io(struct mapped_device *md, struct dm_io *io)
674 mempool_free(io, md->io_pool); 674 mempool_free(io, md->io_pool);
675} 675}
676 676
677static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 677static void free_tio(struct dm_target_io *tio)
678{ 678{
679 bio_put(&tio->clone); 679 bio_put(&tio->clone);
680} 680}
@@ -1055,7 +1055,7 @@ static void clone_endio(struct bio *bio)
1055 !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)) 1055 !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
1056 disable_write_same(md); 1056 disable_write_same(md);
1057 1057
1058 free_tio(md, tio); 1058 free_tio(tio);
1059 dec_pending(io, error); 1059 dec_pending(io, error);
1060} 1060}
1061 1061
@@ -1517,7 +1517,6 @@ static void __map_bio(struct dm_target_io *tio)
1517{ 1517{
1518 int r; 1518 int r;
1519 sector_t sector; 1519 sector_t sector;
1520 struct mapped_device *md;
1521 struct bio *clone = &tio->clone; 1520 struct bio *clone = &tio->clone;
1522 struct dm_target *ti = tio->ti; 1521 struct dm_target *ti = tio->ti;
1523 1522
@@ -1540,9 +1539,8 @@ static void __map_bio(struct dm_target_io *tio)
1540 generic_make_request(clone); 1539 generic_make_request(clone);
1541 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 1540 } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
1542 /* error the io and bail out, or requeue it if needed */ 1541 /* error the io and bail out, or requeue it if needed */
1543 md = tio->io->md;
1544 dec_pending(tio->io, r); 1542 dec_pending(tio->io, r);
1545 free_tio(md, tio); 1543 free_tio(tio);
1546 } else if (r != DM_MAPIO_SUBMITTED) { 1544 } else if (r != DM_MAPIO_SUBMITTED) {
1547 DMWARN("unimplemented target map return value: %d", r); 1545 DMWARN("unimplemented target map return value: %d", r);
1548 BUG(); 1546 BUG();
@@ -1663,7 +1661,7 @@ static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1663 tio->len_ptr = len; 1661 tio->len_ptr = len;
1664 r = clone_bio(tio, bio, sector, *len); 1662 r = clone_bio(tio, bio, sector, *len);
1665 if (r < 0) { 1663 if (r < 0) {
1666 free_tio(ci->md, tio); 1664 free_tio(tio);
1667 break; 1665 break;
1668 } 1666 }
1669 __map_bio(tio); 1667 __map_bio(tio);