diff options
Diffstat (limited to 'drivers/md')
| -rw-r--r-- | drivers/md/dm-bufio.c | 5 | ||||
| -rw-r--r-- | drivers/md/dm-cache-policy-mq.c | 13 | ||||
| -rw-r--r-- | drivers/md/dm-cache-target.c | 2 | ||||
| -rw-r--r-- | drivers/md/dm-delay.c | 23 | ||||
| -rw-r--r-- | drivers/md/dm-snap.c | 71 | ||||
| -rw-r--r-- | drivers/md/dm-stats.c | 1 | ||||
| -rw-r--r-- | drivers/md/dm-table.c | 5 | ||||
| -rw-r--r-- | drivers/md/dm-thin-metadata.c | 8 | ||||
| -rw-r--r-- | drivers/md/dm-thin-metadata.h | 1 | ||||
| -rw-r--r-- | drivers/md/dm-thin.c | 66 | ||||
| -rw-r--r-- | drivers/md/md.c | 147 | ||||
| -rw-r--r-- | drivers/md/persistent-data/dm-array.c | 10 | ||||
| -rw-r--r-- | drivers/md/persistent-data/dm-block-manager.c | 6 | ||||
| -rw-r--r-- | drivers/md/persistent-data/dm-block-manager.h | 7 | ||||
| -rw-r--r-- | drivers/md/persistent-data/dm-space-map-common.c | 32 | ||||
| -rw-r--r-- | drivers/md/persistent-data/dm-space-map-metadata.c | 8 | ||||
| -rw-r--r-- | drivers/md/raid1.c | 162 | ||||
| -rw-r--r-- | drivers/md/raid1.h | 15 | ||||
| -rw-r--r-- | drivers/md/raid10.c | 6 | ||||
| -rw-r--r-- | drivers/md/raid5.c | 425 | ||||
| -rw-r--r-- | drivers/md/raid5.h | 16 |
21 files changed, 760 insertions, 269 deletions
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 173cbb20d104..54bdd923316f 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c | |||
| @@ -1717,6 +1717,11 @@ static int __init dm_bufio_init(void) | |||
| 1717 | { | 1717 | { |
| 1718 | __u64 mem; | 1718 | __u64 mem; |
| 1719 | 1719 | ||
| 1720 | dm_bufio_allocated_kmem_cache = 0; | ||
| 1721 | dm_bufio_allocated_get_free_pages = 0; | ||
| 1722 | dm_bufio_allocated_vmalloc = 0; | ||
| 1723 | dm_bufio_current_allocated = 0; | ||
| 1724 | |||
| 1720 | memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches); | 1725 | memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches); |
| 1721 | memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names); | 1726 | memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names); |
| 1722 | 1727 | ||
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 416b7b752a6e..64780ad73bb0 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c | |||
| @@ -730,15 +730,18 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e, | |||
| 730 | int r = 0; | 730 | int r = 0; |
| 731 | bool updated = updated_this_tick(mq, e); | 731 | bool updated = updated_this_tick(mq, e); |
| 732 | 732 | ||
| 733 | requeue_and_update_tick(mq, e); | ||
| 734 | |||
| 735 | if ((!discarded_oblock && updated) || | 733 | if ((!discarded_oblock && updated) || |
| 736 | !should_promote(mq, e, discarded_oblock, data_dir)) | 734 | !should_promote(mq, e, discarded_oblock, data_dir)) { |
| 735 | requeue_and_update_tick(mq, e); | ||
| 737 | result->op = POLICY_MISS; | 736 | result->op = POLICY_MISS; |
| 738 | else if (!can_migrate) | 737 | |
| 738 | } else if (!can_migrate) | ||
| 739 | r = -EWOULDBLOCK; | 739 | r = -EWOULDBLOCK; |
| 740 | else | 740 | |
| 741 | else { | ||
| 742 | requeue_and_update_tick(mq, e); | ||
| 741 | r = pre_cache_to_cache(mq, e, result); | 743 | r = pre_cache_to_cache(mq, e, result); |
| 744 | } | ||
| 742 | 745 | ||
| 743 | return r; | 746 | return r; |
| 744 | } | 747 | } |
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 9efcf1059b99..1b1469ebe5cb 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c | |||
| @@ -2755,7 +2755,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) | |||
| 2755 | { | 2755 | { |
| 2756 | int r; | 2756 | int r; |
| 2757 | 2757 | ||
| 2758 | r = dm_cache_resize(cache->cmd, cache->cache_size); | 2758 | r = dm_cache_resize(cache->cmd, new_size); |
| 2759 | if (r) { | 2759 | if (r) { |
| 2760 | DMERR("could not resize cache metadata"); | 2760 | DMERR("could not resize cache metadata"); |
| 2761 | return r; | 2761 | return r; |
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 496d5f3646a5..2f91d6d4a2cc 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | struct delay_c { | 20 | struct delay_c { |
| 21 | struct timer_list delay_timer; | 21 | struct timer_list delay_timer; |
| 22 | struct mutex timer_lock; | 22 | struct mutex timer_lock; |
| 23 | struct workqueue_struct *kdelayd_wq; | ||
| 23 | struct work_struct flush_expired_bios; | 24 | struct work_struct flush_expired_bios; |
| 24 | struct list_head delayed_bios; | 25 | struct list_head delayed_bios; |
| 25 | atomic_t may_delay; | 26 | atomic_t may_delay; |
| @@ -45,14 +46,13 @@ struct dm_delay_info { | |||
| 45 | 46 | ||
| 46 | static DEFINE_MUTEX(delayed_bios_lock); | 47 | static DEFINE_MUTEX(delayed_bios_lock); |
| 47 | 48 | ||
| 48 | static struct workqueue_struct *kdelayd_wq; | ||
| 49 | static struct kmem_cache *delayed_cache; | 49 | static struct kmem_cache *delayed_cache; |
| 50 | 50 | ||
| 51 | static void handle_delayed_timer(unsigned long data) | 51 | static void handle_delayed_timer(unsigned long data) |
| 52 | { | 52 | { |
| 53 | struct delay_c *dc = (struct delay_c *)data; | 53 | struct delay_c *dc = (struct delay_c *)data; |
| 54 | 54 | ||
| 55 | queue_work(kdelayd_wq, &dc->flush_expired_bios); | 55 | queue_work(dc->kdelayd_wq, &dc->flush_expired_bios); |
| 56 | } | 56 | } |
| 57 | 57 | ||
| 58 | static void queue_timeout(struct delay_c *dc, unsigned long expires) | 58 | static void queue_timeout(struct delay_c *dc, unsigned long expires) |
| @@ -191,6 +191,12 @@ out: | |||
| 191 | goto bad_dev_write; | 191 | goto bad_dev_write; |
| 192 | } | 192 | } |
| 193 | 193 | ||
| 194 | dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); | ||
| 195 | if (!dc->kdelayd_wq) { | ||
| 196 | DMERR("Couldn't start kdelayd"); | ||
| 197 | goto bad_queue; | ||
| 198 | } | ||
| 199 | |||
| 194 | setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc); | 200 | setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc); |
| 195 | 201 | ||
| 196 | INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); | 202 | INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); |
| @@ -203,6 +209,8 @@ out: | |||
| 203 | ti->private = dc; | 209 | ti->private = dc; |
| 204 | return 0; | 210 | return 0; |
| 205 | 211 | ||
| 212 | bad_queue: | ||
| 213 | mempool_destroy(dc->delayed_pool); | ||
| 206 | bad_dev_write: | 214 | bad_dev_write: |
| 207 | if (dc->dev_write) | 215 | if (dc->dev_write) |
| 208 | dm_put_device(ti, dc->dev_write); | 216 | dm_put_device(ti, dc->dev_write); |
| @@ -217,7 +225,7 @@ static void delay_dtr(struct dm_target *ti) | |||
| 217 | { | 225 | { |
| 218 | struct delay_c *dc = ti->private; | 226 | struct delay_c *dc = ti->private; |
| 219 | 227 | ||
| 220 | flush_workqueue(kdelayd_wq); | 228 | destroy_workqueue(dc->kdelayd_wq); |
| 221 | 229 | ||
| 222 | dm_put_device(ti, dc->dev_read); | 230 | dm_put_device(ti, dc->dev_read); |
| 223 | 231 | ||
| @@ -350,12 +358,6 @@ static int __init dm_delay_init(void) | |||
| 350 | { | 358 | { |
| 351 | int r = -ENOMEM; | 359 | int r = -ENOMEM; |
| 352 | 360 | ||
| 353 | kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); | ||
| 354 | if (!kdelayd_wq) { | ||
| 355 | DMERR("Couldn't start kdelayd"); | ||
| 356 | goto bad_queue; | ||
| 357 | } | ||
| 358 | |||
| 359 | delayed_cache = KMEM_CACHE(dm_delay_info, 0); | 361 | delayed_cache = KMEM_CACHE(dm_delay_info, 0); |
| 360 | if (!delayed_cache) { | 362 | if (!delayed_cache) { |
| 361 | DMERR("Couldn't create delayed bio cache."); | 363 | DMERR("Couldn't create delayed bio cache."); |
| @@ -373,8 +375,6 @@ static int __init dm_delay_init(void) | |||
| 373 | bad_register: | 375 | bad_register: |
| 374 | kmem_cache_destroy(delayed_cache); | 376 | kmem_cache_destroy(delayed_cache); |
| 375 | bad_memcache: | 377 | bad_memcache: |
| 376 | destroy_workqueue(kdelayd_wq); | ||
| 377 | bad_queue: | ||
| 378 | return r; | 378 | return r; |
| 379 | } | 379 | } |
| 380 | 380 | ||
| @@ -382,7 +382,6 @@ static void __exit dm_delay_exit(void) | |||
| 382 | { | 382 | { |
| 383 | dm_unregister_target(&delay_target); | 383 | dm_unregister_target(&delay_target); |
| 384 | kmem_cache_destroy(delayed_cache); | 384 | kmem_cache_destroy(delayed_cache); |
| 385 | destroy_workqueue(kdelayd_wq); | ||
| 386 | } | 385 | } |
| 387 | 386 | ||
| 388 | /* Module hooks */ | 387 | /* Module hooks */ |
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index aec57d76db5d..944690bafd93 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c | |||
| @@ -66,6 +66,18 @@ struct dm_snapshot { | |||
| 66 | 66 | ||
| 67 | atomic_t pending_exceptions_count; | 67 | atomic_t pending_exceptions_count; |
| 68 | 68 | ||
| 69 | /* Protected by "lock" */ | ||
| 70 | sector_t exception_start_sequence; | ||
| 71 | |||
| 72 | /* Protected by kcopyd single-threaded callback */ | ||
| 73 | sector_t exception_complete_sequence; | ||
| 74 | |||
| 75 | /* | ||
| 76 | * A list of pending exceptions that completed out of order. | ||
| 77 | * Protected by kcopyd single-threaded callback. | ||
| 78 | */ | ||
| 79 | struct list_head out_of_order_list; | ||
| 80 | |||
| 69 | mempool_t *pending_pool; | 81 | mempool_t *pending_pool; |
| 70 | 82 | ||
| 71 | struct dm_exception_table pending; | 83 | struct dm_exception_table pending; |
| @@ -173,6 +185,14 @@ struct dm_snap_pending_exception { | |||
| 173 | */ | 185 | */ |
| 174 | int started; | 186 | int started; |
| 175 | 187 | ||
| 188 | /* There was copying error. */ | ||
| 189 | int copy_error; | ||
| 190 | |||
| 191 | /* A sequence number, it is used for in-order completion. */ | ||
| 192 | sector_t exception_sequence; | ||
| 193 | |||
| 194 | struct list_head out_of_order_entry; | ||
| 195 | |||
| 176 | /* | 196 | /* |
| 177 | * For writing a complete chunk, bypassing the copy. | 197 | * For writing a complete chunk, bypassing the copy. |
| 178 | */ | 198 | */ |
| @@ -1094,6 +1114,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1094 | s->valid = 1; | 1114 | s->valid = 1; |
| 1095 | s->active = 0; | 1115 | s->active = 0; |
| 1096 | atomic_set(&s->pending_exceptions_count, 0); | 1116 | atomic_set(&s->pending_exceptions_count, 0); |
| 1117 | s->exception_start_sequence = 0; | ||
| 1118 | s->exception_complete_sequence = 0; | ||
| 1119 | INIT_LIST_HEAD(&s->out_of_order_list); | ||
| 1097 | init_rwsem(&s->lock); | 1120 | init_rwsem(&s->lock); |
| 1098 | INIT_LIST_HEAD(&s->list); | 1121 | INIT_LIST_HEAD(&s->list); |
| 1099 | spin_lock_init(&s->pe_lock); | 1122 | spin_lock_init(&s->pe_lock); |
| @@ -1443,6 +1466,19 @@ static void commit_callback(void *context, int success) | |||
| 1443 | pending_complete(pe, success); | 1466 | pending_complete(pe, success); |
| 1444 | } | 1467 | } |
| 1445 | 1468 | ||
| 1469 | static void complete_exception(struct dm_snap_pending_exception *pe) | ||
| 1470 | { | ||
| 1471 | struct dm_snapshot *s = pe->snap; | ||
| 1472 | |||
| 1473 | if (unlikely(pe->copy_error)) | ||
| 1474 | pending_complete(pe, 0); | ||
| 1475 | |||
| 1476 | else | ||
| 1477 | /* Update the metadata if we are persistent */ | ||
| 1478 | s->store->type->commit_exception(s->store, &pe->e, | ||
| 1479 | commit_callback, pe); | ||
| 1480 | } | ||
| 1481 | |||
| 1446 | /* | 1482 | /* |
| 1447 | * Called when the copy I/O has finished. kcopyd actually runs | 1483 | * Called when the copy I/O has finished. kcopyd actually runs |
| 1448 | * this code so don't block. | 1484 | * this code so don't block. |
| @@ -1452,13 +1488,32 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) | |||
| 1452 | struct dm_snap_pending_exception *pe = context; | 1488 | struct dm_snap_pending_exception *pe = context; |
| 1453 | struct dm_snapshot *s = pe->snap; | 1489 | struct dm_snapshot *s = pe->snap; |
| 1454 | 1490 | ||
| 1455 | if (read_err || write_err) | 1491 | pe->copy_error = read_err || write_err; |
| 1456 | pending_complete(pe, 0); | ||
| 1457 | 1492 | ||
| 1458 | else | 1493 | if (pe->exception_sequence == s->exception_complete_sequence) { |
| 1459 | /* Update the metadata if we are persistent */ | 1494 | s->exception_complete_sequence++; |
| 1460 | s->store->type->commit_exception(s->store, &pe->e, | 1495 | complete_exception(pe); |
| 1461 | commit_callback, pe); | 1496 | |
| 1497 | while (!list_empty(&s->out_of_order_list)) { | ||
| 1498 | pe = list_entry(s->out_of_order_list.next, | ||
| 1499 | struct dm_snap_pending_exception, out_of_order_entry); | ||
| 1500 | if (pe->exception_sequence != s->exception_complete_sequence) | ||
| 1501 | break; | ||
| 1502 | s->exception_complete_sequence++; | ||
| 1503 | list_del(&pe->out_of_order_entry); | ||
| 1504 | complete_exception(pe); | ||
| 1505 | } | ||
| 1506 | } else { | ||
| 1507 | struct list_head *lh; | ||
| 1508 | struct dm_snap_pending_exception *pe2; | ||
| 1509 | |||
| 1510 | list_for_each_prev(lh, &s->out_of_order_list) { | ||
| 1511 | pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry); | ||
| 1512 | if (pe2->exception_sequence < pe->exception_sequence) | ||
| 1513 | break; | ||
| 1514 | } | ||
| 1515 | list_add(&pe->out_of_order_entry, lh); | ||
| 1516 | } | ||
| 1462 | } | 1517 | } |
| 1463 | 1518 | ||
| 1464 | /* | 1519 | /* |
| @@ -1553,6 +1608,8 @@ __find_pending_exception(struct dm_snapshot *s, | |||
| 1553 | return NULL; | 1608 | return NULL; |
| 1554 | } | 1609 | } |
| 1555 | 1610 | ||
| 1611 | pe->exception_sequence = s->exception_start_sequence++; | ||
| 1612 | |||
| 1556 | dm_insert_exception(&s->pending, &pe->e); | 1613 | dm_insert_exception(&s->pending, &pe->e); |
| 1557 | 1614 | ||
| 1558 | return pe; | 1615 | return pe; |
| @@ -2192,7 +2249,7 @@ static struct target_type origin_target = { | |||
| 2192 | 2249 | ||
| 2193 | static struct target_type snapshot_target = { | 2250 | static struct target_type snapshot_target = { |
| 2194 | .name = "snapshot", | 2251 | .name = "snapshot", |
| 2195 | .version = {1, 11, 1}, | 2252 | .version = {1, 12, 0}, |
| 2196 | .module = THIS_MODULE, | 2253 | .module = THIS_MODULE, |
| 2197 | .ctr = snapshot_ctr, | 2254 | .ctr = snapshot_ctr, |
| 2198 | .dtr = snapshot_dtr, | 2255 | .dtr = snapshot_dtr, |
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 3d404c1371ed..28a90122a5a8 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c | |||
| @@ -964,6 +964,7 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv, | |||
| 964 | 964 | ||
| 965 | int __init dm_statistics_init(void) | 965 | int __init dm_statistics_init(void) |
| 966 | { | 966 | { |
| 967 | shared_memory_amount = 0; | ||
| 967 | dm_stat_need_rcu_barrier = 0; | 968 | dm_stat_need_rcu_barrier = 0; |
| 968 | return 0; | 969 | return 0; |
| 969 | } | 970 | } |
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 465f08ca62b1..3ba6a3859ce3 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
| @@ -200,6 +200,11 @@ int dm_table_create(struct dm_table **result, fmode_t mode, | |||
| 200 | 200 | ||
| 201 | num_targets = dm_round_up(num_targets, KEYS_PER_NODE); | 201 | num_targets = dm_round_up(num_targets, KEYS_PER_NODE); |
| 202 | 202 | ||
| 203 | if (!num_targets) { | ||
| 204 | kfree(t); | ||
| 205 | return -ENOMEM; | ||
| 206 | } | ||
| 207 | |||
| 203 | if (alloc_targets(t, num_targets)) { | 208 | if (alloc_targets(t, num_targets)) { |
| 204 | kfree(t); | 209 | kfree(t); |
| 205 | return -ENOMEM; | 210 | return -ENOMEM; |
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 60bce435f4fa..8a30ad54bd46 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c | |||
| @@ -1697,6 +1697,14 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd) | |||
| 1697 | up_write(&pmd->root_lock); | 1697 | up_write(&pmd->root_lock); |
| 1698 | } | 1698 | } |
| 1699 | 1699 | ||
| 1700 | void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd) | ||
| 1701 | { | ||
| 1702 | down_write(&pmd->root_lock); | ||
| 1703 | pmd->read_only = false; | ||
| 1704 | dm_bm_set_read_write(pmd->bm); | ||
| 1705 | up_write(&pmd->root_lock); | ||
| 1706 | } | ||
| 1707 | |||
| 1700 | int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, | 1708 | int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, |
| 1701 | dm_block_t threshold, | 1709 | dm_block_t threshold, |
| 1702 | dm_sm_threshold_fn fn, | 1710 | dm_sm_threshold_fn fn, |
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 845ebbe589a9..7bcc0e1d6238 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h | |||
| @@ -193,6 +193,7 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_siz | |||
| 193 | * that nothing is changing. | 193 | * that nothing is changing. |
| 194 | */ | 194 | */ |
| 195 | void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd); | 195 | void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd); |
| 196 | void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd); | ||
| 196 | 197 | ||
| 197 | int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, | 198 | int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, |
| 198 | dm_block_t threshold, | 199 | dm_block_t threshold, |
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 2c0cf511ec23..ee29037ffc2e 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c | |||
| @@ -640,7 +640,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) | |||
| 640 | */ | 640 | */ |
| 641 | r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); | 641 | r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); |
| 642 | if (r) { | 642 | if (r) { |
| 643 | DMERR_LIMIT("dm_thin_insert_block() failed"); | 643 | DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d", |
| 644 | dm_device_name(pool->pool_md), r); | ||
| 645 | set_pool_mode(pool, PM_READ_ONLY); | ||
| 644 | cell_error(pool, m->cell); | 646 | cell_error(pool, m->cell); |
| 645 | goto out; | 647 | goto out; |
| 646 | } | 648 | } |
| @@ -881,32 +883,23 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, | |||
| 881 | } | 883 | } |
| 882 | } | 884 | } |
| 883 | 885 | ||
| 884 | static int commit(struct pool *pool) | ||
| 885 | { | ||
| 886 | int r; | ||
| 887 | |||
| 888 | r = dm_pool_commit_metadata(pool->pmd); | ||
| 889 | if (r) | ||
| 890 | DMERR_LIMIT("%s: commit failed: error = %d", | ||
| 891 | dm_device_name(pool->pool_md), r); | ||
| 892 | |||
| 893 | return r; | ||
| 894 | } | ||
| 895 | |||
| 896 | /* | 886 | /* |
| 897 | * A non-zero return indicates read_only or fail_io mode. | 887 | * A non-zero return indicates read_only or fail_io mode. |
| 898 | * Many callers don't care about the return value. | 888 | * Many callers don't care about the return value. |
| 899 | */ | 889 | */ |
| 900 | static int commit_or_fallback(struct pool *pool) | 890 | static int commit(struct pool *pool) |
| 901 | { | 891 | { |
| 902 | int r; | 892 | int r; |
| 903 | 893 | ||
| 904 | if (get_pool_mode(pool) != PM_WRITE) | 894 | if (get_pool_mode(pool) != PM_WRITE) |
| 905 | return -EINVAL; | 895 | return -EINVAL; |
| 906 | 896 | ||
| 907 | r = commit(pool); | 897 | r = dm_pool_commit_metadata(pool->pmd); |
| 908 | if (r) | 898 | if (r) { |
| 899 | DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d", | ||
| 900 | dm_device_name(pool->pool_md), r); | ||
| 909 | set_pool_mode(pool, PM_READ_ONLY); | 901 | set_pool_mode(pool, PM_READ_ONLY); |
| 902 | } | ||
| 910 | 903 | ||
| 911 | return r; | 904 | return r; |
| 912 | } | 905 | } |
| @@ -943,7 +936,9 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) | |||
| 943 | * Try to commit to see if that will free up some | 936 | * Try to commit to see if that will free up some |
| 944 | * more space. | 937 | * more space. |
| 945 | */ | 938 | */ |
| 946 | (void) commit_or_fallback(pool); | 939 | r = commit(pool); |
| 940 | if (r) | ||
| 941 | return r; | ||
| 947 | 942 | ||
| 948 | r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); | 943 | r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); |
| 949 | if (r) | 944 | if (r) |
| @@ -957,7 +952,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) | |||
| 957 | * table reload). | 952 | * table reload). |
| 958 | */ | 953 | */ |
| 959 | if (!free_blocks) { | 954 | if (!free_blocks) { |
| 960 | DMWARN("%s: no free space available.", | 955 | DMWARN("%s: no free data space available.", |
| 961 | dm_device_name(pool->pool_md)); | 956 | dm_device_name(pool->pool_md)); |
| 962 | spin_lock_irqsave(&pool->lock, flags); | 957 | spin_lock_irqsave(&pool->lock, flags); |
| 963 | pool->no_free_space = 1; | 958 | pool->no_free_space = 1; |
| @@ -967,8 +962,16 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) | |||
| 967 | } | 962 | } |
| 968 | 963 | ||
| 969 | r = dm_pool_alloc_data_block(pool->pmd, result); | 964 | r = dm_pool_alloc_data_block(pool->pmd, result); |
| 970 | if (r) | 965 | if (r) { |
| 966 | if (r == -ENOSPC && | ||
| 967 | !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) && | ||
| 968 | !free_blocks) { | ||
| 969 | DMWARN("%s: no free metadata space available.", | ||
| 970 | dm_device_name(pool->pool_md)); | ||
| 971 | set_pool_mode(pool, PM_READ_ONLY); | ||
| 972 | } | ||
| 971 | return r; | 973 | return r; |
| 974 | } | ||
| 972 | 975 | ||
| 973 | return 0; | 976 | return 0; |
| 974 | } | 977 | } |
| @@ -1349,7 +1352,7 @@ static void process_deferred_bios(struct pool *pool) | |||
| 1349 | if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) | 1352 | if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) |
| 1350 | return; | 1353 | return; |
| 1351 | 1354 | ||
| 1352 | if (commit_or_fallback(pool)) { | 1355 | if (commit(pool)) { |
| 1353 | while ((bio = bio_list_pop(&bios))) | 1356 | while ((bio = bio_list_pop(&bios))) |
| 1354 | bio_io_error(bio); | 1357 | bio_io_error(bio); |
| 1355 | return; | 1358 | return; |
| @@ -1397,6 +1400,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode) | |||
| 1397 | case PM_FAIL: | 1400 | case PM_FAIL: |
| 1398 | DMERR("%s: switching pool to failure mode", | 1401 | DMERR("%s: switching pool to failure mode", |
| 1399 | dm_device_name(pool->pool_md)); | 1402 | dm_device_name(pool->pool_md)); |
| 1403 | dm_pool_metadata_read_only(pool->pmd); | ||
| 1400 | pool->process_bio = process_bio_fail; | 1404 | pool->process_bio = process_bio_fail; |
| 1401 | pool->process_discard = process_bio_fail; | 1405 | pool->process_discard = process_bio_fail; |
| 1402 | pool->process_prepared_mapping = process_prepared_mapping_fail; | 1406 | pool->process_prepared_mapping = process_prepared_mapping_fail; |
| @@ -1421,6 +1425,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode) | |||
| 1421 | break; | 1425 | break; |
| 1422 | 1426 | ||
| 1423 | case PM_WRITE: | 1427 | case PM_WRITE: |
| 1428 | dm_pool_metadata_read_write(pool->pmd); | ||
| 1424 | pool->process_bio = process_bio; | 1429 | pool->process_bio = process_bio; |
| 1425 | pool->process_discard = process_discard; | 1430 | pool->process_discard = process_discard; |
| 1426 | pool->process_prepared_mapping = process_prepared_mapping; | 1431 | pool->process_prepared_mapping = process_prepared_mapping; |
| @@ -1637,12 +1642,19 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) | |||
| 1637 | struct pool_c *pt = ti->private; | 1642 | struct pool_c *pt = ti->private; |
| 1638 | 1643 | ||
| 1639 | /* | 1644 | /* |
| 1640 | * We want to make sure that degraded pools are never upgraded. | 1645 | * We want to make sure that a pool in PM_FAIL mode is never upgraded. |
| 1641 | */ | 1646 | */ |
| 1642 | enum pool_mode old_mode = pool->pf.mode; | 1647 | enum pool_mode old_mode = pool->pf.mode; |
| 1643 | enum pool_mode new_mode = pt->adjusted_pf.mode; | 1648 | enum pool_mode new_mode = pt->adjusted_pf.mode; |
| 1644 | 1649 | ||
| 1645 | if (old_mode > new_mode) | 1650 | /* |
| 1651 | * If we were in PM_FAIL mode, rollback of metadata failed. We're | ||
| 1652 | * not going to recover without a thin_repair. So we never let the | ||
| 1653 | * pool move out of the old mode. On the other hand a PM_READ_ONLY | ||
| 1654 | * may have been due to a lack of metadata or data space, and may | ||
| 1655 | * now work (ie. if the underlying devices have been resized). | ||
| 1656 | */ | ||
| 1657 | if (old_mode == PM_FAIL) | ||
| 1646 | new_mode = old_mode; | 1658 | new_mode = old_mode; |
| 1647 | 1659 | ||
| 1648 | pool->ti = ti; | 1660 | pool->ti = ti; |
| @@ -2266,7 +2278,7 @@ static int pool_preresume(struct dm_target *ti) | |||
| 2266 | return r; | 2278 | return r; |
| 2267 | 2279 | ||
| 2268 | if (need_commit1 || need_commit2) | 2280 | if (need_commit1 || need_commit2) |
| 2269 | (void) commit_or_fallback(pool); | 2281 | (void) commit(pool); |
| 2270 | 2282 | ||
| 2271 | return 0; | 2283 | return 0; |
| 2272 | } | 2284 | } |
| @@ -2293,7 +2305,7 @@ static void pool_postsuspend(struct dm_target *ti) | |||
| 2293 | 2305 | ||
| 2294 | cancel_delayed_work(&pool->waker); | 2306 | cancel_delayed_work(&pool->waker); |
| 2295 | flush_workqueue(pool->wq); | 2307 | flush_workqueue(pool->wq); |
| 2296 | (void) commit_or_fallback(pool); | 2308 | (void) commit(pool); |
| 2297 | } | 2309 | } |
| 2298 | 2310 | ||
| 2299 | static int check_arg_count(unsigned argc, unsigned args_required) | 2311 | static int check_arg_count(unsigned argc, unsigned args_required) |
| @@ -2427,7 +2439,7 @@ static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct | |||
| 2427 | if (r) | 2439 | if (r) |
| 2428 | return r; | 2440 | return r; |
| 2429 | 2441 | ||
| 2430 | (void) commit_or_fallback(pool); | 2442 | (void) commit(pool); |
| 2431 | 2443 | ||
| 2432 | r = dm_pool_reserve_metadata_snap(pool->pmd); | 2444 | r = dm_pool_reserve_metadata_snap(pool->pmd); |
| 2433 | if (r) | 2445 | if (r) |
| @@ -2489,7 +2501,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv) | |||
| 2489 | DMWARN("Unrecognised thin pool target message received: %s", argv[0]); | 2501 | DMWARN("Unrecognised thin pool target message received: %s", argv[0]); |
| 2490 | 2502 | ||
| 2491 | if (!r) | 2503 | if (!r) |
| 2492 | (void) commit_or_fallback(pool); | 2504 | (void) commit(pool); |
| 2493 | 2505 | ||
| 2494 | return r; | 2506 | return r; |
| 2495 | } | 2507 | } |
| @@ -2544,7 +2556,7 @@ static void pool_status(struct dm_target *ti, status_type_t type, | |||
| 2544 | 2556 | ||
| 2545 | /* Commit to ensure statistics aren't out-of-date */ | 2557 | /* Commit to ensure statistics aren't out-of-date */ |
| 2546 | if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) | 2558 | if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) |
| 2547 | (void) commit_or_fallback(pool); | 2559 | (void) commit(pool); |
| 2548 | 2560 | ||
| 2549 | r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id); | 2561 | r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id); |
| 2550 | if (r) { | 2562 | if (r) { |
diff --git a/drivers/md/md.c b/drivers/md/md.c index 8766eabb0014..21f4d7ff0da2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
| @@ -112,7 +112,7 @@ static inline int speed_max(struct mddev *mddev) | |||
| 112 | 112 | ||
| 113 | static struct ctl_table_header *raid_table_header; | 113 | static struct ctl_table_header *raid_table_header; |
| 114 | 114 | ||
| 115 | static ctl_table raid_table[] = { | 115 | static struct ctl_table raid_table[] = { |
| 116 | { | 116 | { |
| 117 | .procname = "speed_limit_min", | 117 | .procname = "speed_limit_min", |
| 118 | .data = &sysctl_speed_limit_min, | 118 | .data = &sysctl_speed_limit_min, |
| @@ -130,7 +130,7 @@ static ctl_table raid_table[] = { | |||
| 130 | { } | 130 | { } |
| 131 | }; | 131 | }; |
| 132 | 132 | ||
| 133 | static ctl_table raid_dir_table[] = { | 133 | static struct ctl_table raid_dir_table[] = { |
| 134 | { | 134 | { |
| 135 | .procname = "raid", | 135 | .procname = "raid", |
| 136 | .maxlen = 0, | 136 | .maxlen = 0, |
| @@ -140,7 +140,7 @@ static ctl_table raid_dir_table[] = { | |||
| 140 | { } | 140 | { } |
| 141 | }; | 141 | }; |
| 142 | 142 | ||
| 143 | static ctl_table raid_root_table[] = { | 143 | static struct ctl_table raid_root_table[] = { |
| 144 | { | 144 | { |
| 145 | .procname = "dev", | 145 | .procname = "dev", |
| 146 | .maxlen = 0, | 146 | .maxlen = 0, |
| @@ -562,11 +562,19 @@ static struct mddev * mddev_find(dev_t unit) | |||
| 562 | goto retry; | 562 | goto retry; |
| 563 | } | 563 | } |
| 564 | 564 | ||
| 565 | static inline int mddev_lock(struct mddev * mddev) | 565 | static inline int __must_check mddev_lock(struct mddev * mddev) |
| 566 | { | 566 | { |
| 567 | return mutex_lock_interruptible(&mddev->reconfig_mutex); | 567 | return mutex_lock_interruptible(&mddev->reconfig_mutex); |
| 568 | } | 568 | } |
| 569 | 569 | ||
| 570 | /* Sometimes we need to take the lock in a situation where | ||
| 571 | * failure due to interrupts is not acceptable. | ||
| 572 | */ | ||
| 573 | static inline void mddev_lock_nointr(struct mddev * mddev) | ||
| 574 | { | ||
| 575 | mutex_lock(&mddev->reconfig_mutex); | ||
| 576 | } | ||
| 577 | |||
| 570 | static inline int mddev_is_locked(struct mddev *mddev) | 578 | static inline int mddev_is_locked(struct mddev *mddev) |
| 571 | { | 579 | { |
| 572 | return mutex_is_locked(&mddev->reconfig_mutex); | 580 | return mutex_is_locked(&mddev->reconfig_mutex); |
| @@ -768,16 +776,10 @@ void md_super_wait(struct mddev *mddev) | |||
| 768 | finish_wait(&mddev->sb_wait, &wq); | 776 | finish_wait(&mddev->sb_wait, &wq); |
| 769 | } | 777 | } |
| 770 | 778 | ||
| 771 | static void bi_complete(struct bio *bio, int error) | ||
| 772 | { | ||
| 773 | complete((struct completion*)bio->bi_private); | ||
| 774 | } | ||
| 775 | |||
| 776 | int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, | 779 | int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, |
| 777 | struct page *page, int rw, bool metadata_op) | 780 | struct page *page, int rw, bool metadata_op) |
| 778 | { | 781 | { |
| 779 | struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); | 782 | struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); |
| 780 | struct completion event; | ||
| 781 | int ret; | 783 | int ret; |
| 782 | 784 | ||
| 783 | rw |= REQ_SYNC; | 785 | rw |= REQ_SYNC; |
| @@ -793,11 +795,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, | |||
| 793 | else | 795 | else |
| 794 | bio->bi_sector = sector + rdev->data_offset; | 796 | bio->bi_sector = sector + rdev->data_offset; |
| 795 | bio_add_page(bio, page, size, 0); | 797 | bio_add_page(bio, page, size, 0); |
| 796 | init_completion(&event); | 798 | submit_bio_wait(rw, bio); |
| 797 | bio->bi_private = &event; | ||
| 798 | bio->bi_end_io = bi_complete; | ||
| 799 | submit_bio(rw, bio); | ||
| 800 | wait_for_completion(&event); | ||
| 801 | 799 | ||
| 802 | ret = test_bit(BIO_UPTODATE, &bio->bi_flags); | 800 | ret = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 803 | bio_put(bio); | 801 | bio_put(bio); |
| @@ -2978,7 +2976,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
| 2978 | for_each_mddev(mddev, tmp) { | 2976 | for_each_mddev(mddev, tmp) { |
| 2979 | struct md_rdev *rdev2; | 2977 | struct md_rdev *rdev2; |
| 2980 | 2978 | ||
| 2981 | mddev_lock(mddev); | 2979 | mddev_lock_nointr(mddev); |
| 2982 | rdev_for_each(rdev2, mddev) | 2980 | rdev_for_each(rdev2, mddev) |
| 2983 | if (rdev->bdev == rdev2->bdev && | 2981 | if (rdev->bdev == rdev2->bdev && |
| 2984 | rdev != rdev2 && | 2982 | rdev != rdev2 && |
| @@ -2994,7 +2992,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
| 2994 | break; | 2992 | break; |
| 2995 | } | 2993 | } |
| 2996 | } | 2994 | } |
| 2997 | mddev_lock(my_mddev); | 2995 | mddev_lock_nointr(my_mddev); |
| 2998 | if (overlap) { | 2996 | if (overlap) { |
| 2999 | /* Someone else could have slipped in a size | 2997 | /* Someone else could have slipped in a size |
| 3000 | * change here, but doing so is just silly. | 2998 | * change here, but doing so is just silly. |
| @@ -3580,6 +3578,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 3580 | mddev->in_sync = 1; | 3578 | mddev->in_sync = 1; |
| 3581 | del_timer_sync(&mddev->safemode_timer); | 3579 | del_timer_sync(&mddev->safemode_timer); |
| 3582 | } | 3580 | } |
| 3581 | blk_set_stacking_limits(&mddev->queue->limits); | ||
| 3583 | pers->run(mddev); | 3582 | pers->run(mddev); |
| 3584 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 3583 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
| 3585 | mddev_resume(mddev); | 3584 | mddev_resume(mddev); |
| @@ -5258,7 +5257,7 @@ static void __md_stop_writes(struct mddev *mddev) | |||
| 5258 | 5257 | ||
| 5259 | void md_stop_writes(struct mddev *mddev) | 5258 | void md_stop_writes(struct mddev *mddev) |
| 5260 | { | 5259 | { |
| 5261 | mddev_lock(mddev); | 5260 | mddev_lock_nointr(mddev); |
| 5262 | __md_stop_writes(mddev); | 5261 | __md_stop_writes(mddev); |
| 5263 | mddev_unlock(mddev); | 5262 | mddev_unlock(mddev); |
| 5264 | } | 5263 | } |
| @@ -5291,20 +5290,35 @@ EXPORT_SYMBOL_GPL(md_stop); | |||
| 5291 | static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) | 5290 | static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) |
| 5292 | { | 5291 | { |
| 5293 | int err = 0; | 5292 | int err = 0; |
| 5293 | int did_freeze = 0; | ||
| 5294 | |||
| 5295 | if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { | ||
| 5296 | did_freeze = 1; | ||
| 5297 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | ||
| 5298 | md_wakeup_thread(mddev->thread); | ||
| 5299 | } | ||
| 5300 | if (mddev->sync_thread) { | ||
| 5301 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
| 5302 | /* Thread might be blocked waiting for metadata update | ||
| 5303 | * which will now never happen */ | ||
| 5304 | wake_up_process(mddev->sync_thread->tsk); | ||
| 5305 | } | ||
| 5306 | mddev_unlock(mddev); | ||
| 5307 | wait_event(resync_wait, mddev->sync_thread == NULL); | ||
| 5308 | mddev_lock_nointr(mddev); | ||
| 5309 | |||
| 5294 | mutex_lock(&mddev->open_mutex); | 5310 | mutex_lock(&mddev->open_mutex); |
| 5295 | if (atomic_read(&mddev->openers) > !!bdev) { | 5311 | if (atomic_read(&mddev->openers) > !!bdev || |
| 5312 | mddev->sync_thread || | ||
| 5313 | (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { | ||
| 5296 | printk("md: %s still in use.\n",mdname(mddev)); | 5314 | printk("md: %s still in use.\n",mdname(mddev)); |
| 5315 | if (did_freeze) { | ||
| 5316 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | ||
| 5317 | md_wakeup_thread(mddev->thread); | ||
| 5318 | } | ||
| 5297 | err = -EBUSY; | 5319 | err = -EBUSY; |
| 5298 | goto out; | 5320 | goto out; |
| 5299 | } | 5321 | } |
| 5300 | if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { | ||
| 5301 | /* Someone opened the device since we flushed it | ||
| 5302 | * so page cache could be dirty and it is too late | ||
| 5303 | * to flush. So abort | ||
| 5304 | */ | ||
| 5305 | mutex_unlock(&mddev->open_mutex); | ||
| 5306 | return -EBUSY; | ||
| 5307 | } | ||
| 5308 | if (mddev->pers) { | 5322 | if (mddev->pers) { |
| 5309 | __md_stop_writes(mddev); | 5323 | __md_stop_writes(mddev); |
| 5310 | 5324 | ||
| @@ -5315,7 +5329,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) | |||
| 5315 | set_disk_ro(mddev->gendisk, 1); | 5329 | set_disk_ro(mddev->gendisk, 1); |
| 5316 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | 5330 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); |
| 5317 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 5331 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
| 5318 | err = 0; | 5332 | err = 0; |
| 5319 | } | 5333 | } |
| 5320 | out: | 5334 | out: |
| 5321 | mutex_unlock(&mddev->open_mutex); | 5335 | mutex_unlock(&mddev->open_mutex); |
| @@ -5331,20 +5345,34 @@ static int do_md_stop(struct mddev * mddev, int mode, | |||
| 5331 | { | 5345 | { |
| 5332 | struct gendisk *disk = mddev->gendisk; | 5346 | struct gendisk *disk = mddev->gendisk; |
| 5333 | struct md_rdev *rdev; | 5347 | struct md_rdev *rdev; |
| 5348 | int did_freeze = 0; | ||
| 5349 | |||
| 5350 | if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { | ||
| 5351 | did_freeze = 1; | ||
| 5352 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | ||
| 5353 | md_wakeup_thread(mddev->thread); | ||
| 5354 | } | ||
| 5355 | if (mddev->sync_thread) { | ||
| 5356 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
| 5357 | /* Thread might be blocked waiting for metadata update | ||
| 5358 | * which will now never happen */ | ||
| 5359 | wake_up_process(mddev->sync_thread->tsk); | ||
| 5360 | } | ||
| 5361 | mddev_unlock(mddev); | ||
| 5362 | wait_event(resync_wait, mddev->sync_thread == NULL); | ||
| 5363 | mddev_lock_nointr(mddev); | ||
| 5334 | 5364 | ||
| 5335 | mutex_lock(&mddev->open_mutex); | 5365 | mutex_lock(&mddev->open_mutex); |
| 5336 | if (atomic_read(&mddev->openers) > !!bdev || | 5366 | if (atomic_read(&mddev->openers) > !!bdev || |
| 5337 | mddev->sysfs_active) { | 5367 | mddev->sysfs_active || |
| 5368 | mddev->sync_thread || | ||
| 5369 | (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { | ||
| 5338 | printk("md: %s still in use.\n",mdname(mddev)); | 5370 | printk("md: %s still in use.\n",mdname(mddev)); |
| 5339 | mutex_unlock(&mddev->open_mutex); | 5371 | mutex_unlock(&mddev->open_mutex); |
| 5340 | return -EBUSY; | 5372 | if (did_freeze) { |
| 5341 | } | 5373 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); |
| 5342 | if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { | 5374 | md_wakeup_thread(mddev->thread); |
| 5343 | /* Someone opened the device since we flushed it | 5375 | } |
| 5344 | * so page cache could be dirty and it is too late | ||
| 5345 | * to flush. So abort | ||
| 5346 | */ | ||
| 5347 | mutex_unlock(&mddev->open_mutex); | ||
| 5348 | return -EBUSY; | 5376 | return -EBUSY; |
| 5349 | } | 5377 | } |
| 5350 | if (mddev->pers) { | 5378 | if (mddev->pers) { |
| @@ -6551,7 +6579,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
| 6551 | wait_event(mddev->sb_wait, | 6579 | wait_event(mddev->sb_wait, |
| 6552 | !test_bit(MD_CHANGE_DEVS, &mddev->flags) && | 6580 | !test_bit(MD_CHANGE_DEVS, &mddev->flags) && |
| 6553 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | 6581 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); |
| 6554 | mddev_lock(mddev); | 6582 | mddev_lock_nointr(mddev); |
| 6555 | } | 6583 | } |
| 6556 | } else { | 6584 | } else { |
| 6557 | err = -EROFS; | 6585 | err = -EROFS; |
| @@ -7361,9 +7389,6 @@ void md_do_sync(struct md_thread *thread) | |||
| 7361 | mddev->curr_resync = 2; | 7389 | mddev->curr_resync = 2; |
| 7362 | 7390 | ||
| 7363 | try_again: | 7391 | try_again: |
| 7364 | if (kthread_should_stop()) | ||
| 7365 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
| 7366 | |||
| 7367 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | 7392 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
| 7368 | goto skip; | 7393 | goto skip; |
| 7369 | for_each_mddev(mddev2, tmp) { | 7394 | for_each_mddev(mddev2, tmp) { |
| @@ -7388,7 +7413,7 @@ void md_do_sync(struct md_thread *thread) | |||
| 7388 | * be caught by 'softlockup' | 7413 | * be caught by 'softlockup' |
| 7389 | */ | 7414 | */ |
| 7390 | prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); | 7415 | prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); |
| 7391 | if (!kthread_should_stop() && | 7416 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && |
| 7392 | mddev2->curr_resync >= mddev->curr_resync) { | 7417 | mddev2->curr_resync >= mddev->curr_resync) { |
| 7393 | printk(KERN_INFO "md: delaying %s of %s" | 7418 | printk(KERN_INFO "md: delaying %s of %s" |
| 7394 | " until %s has finished (they" | 7419 | " until %s has finished (they" |
| @@ -7464,7 +7489,7 @@ void md_do_sync(struct md_thread *thread) | |||
| 7464 | last_check = 0; | 7489 | last_check = 0; |
| 7465 | 7490 | ||
| 7466 | if (j>2) { | 7491 | if (j>2) { |
| 7467 | printk(KERN_INFO | 7492 | printk(KERN_INFO |
| 7468 | "md: resuming %s of %s from checkpoint.\n", | 7493 | "md: resuming %s of %s from checkpoint.\n", |
| 7469 | desc, mdname(mddev)); | 7494 | desc, mdname(mddev)); |
| 7470 | mddev->curr_resync = j; | 7495 | mddev->curr_resync = j; |
| @@ -7501,7 +7526,8 @@ void md_do_sync(struct md_thread *thread) | |||
| 7501 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 7526 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
| 7502 | } | 7527 | } |
| 7503 | 7528 | ||
| 7504 | while (j >= mddev->resync_max && !kthread_should_stop()) { | 7529 | while (j >= mddev->resync_max && |
| 7530 | !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | ||
| 7505 | /* As this condition is controlled by user-space, | 7531 | /* As this condition is controlled by user-space, |
| 7506 | * we can block indefinitely, so use '_interruptible' | 7532 | * we can block indefinitely, so use '_interruptible' |
| 7507 | * to avoid triggering warnings. | 7533 | * to avoid triggering warnings. |
| @@ -7509,17 +7535,18 @@ void md_do_sync(struct md_thread *thread) | |||
| 7509 | flush_signals(current); /* just in case */ | 7535 | flush_signals(current); /* just in case */ |
| 7510 | wait_event_interruptible(mddev->recovery_wait, | 7536 | wait_event_interruptible(mddev->recovery_wait, |
| 7511 | mddev->resync_max > j | 7537 | mddev->resync_max > j |
| 7512 | || kthread_should_stop()); | 7538 | || test_bit(MD_RECOVERY_INTR, |
| 7539 | &mddev->recovery)); | ||
| 7513 | } | 7540 | } |
| 7514 | 7541 | ||
| 7515 | if (kthread_should_stop()) | 7542 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
| 7516 | goto interrupted; | 7543 | break; |
| 7517 | 7544 | ||
| 7518 | sectors = mddev->pers->sync_request(mddev, j, &skipped, | 7545 | sectors = mddev->pers->sync_request(mddev, j, &skipped, |
| 7519 | currspeed < speed_min(mddev)); | 7546 | currspeed < speed_min(mddev)); |
| 7520 | if (sectors == 0) { | 7547 | if (sectors == 0) { |
| 7521 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 7548 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
| 7522 | goto out; | 7549 | break; |
| 7523 | } | 7550 | } |
| 7524 | 7551 | ||
| 7525 | if (!skipped) { /* actual IO requested */ | 7552 | if (!skipped) { /* actual IO requested */ |
| @@ -7556,10 +7583,8 @@ void md_do_sync(struct md_thread *thread) | |||
| 7556 | last_mark = next; | 7583 | last_mark = next; |
| 7557 | } | 7584 | } |
| 7558 | 7585 | ||
| 7559 | 7586 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | |
| 7560 | if (kthread_should_stop()) | 7587 | break; |
| 7561 | goto interrupted; | ||
| 7562 | |||
| 7563 | 7588 | ||
| 7564 | /* | 7589 | /* |
| 7565 | * this loop exits only if either when we are slower than | 7590 | * this loop exits only if either when we are slower than |
| @@ -7582,11 +7607,12 @@ void md_do_sync(struct md_thread *thread) | |||
| 7582 | } | 7607 | } |
| 7583 | } | 7608 | } |
| 7584 | } | 7609 | } |
| 7585 | printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); | 7610 | printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, |
| 7611 | test_bit(MD_RECOVERY_INTR, &mddev->recovery) | ||
| 7612 | ? "interrupted" : "done"); | ||
| 7586 | /* | 7613 | /* |
| 7587 | * this also signals 'finished resyncing' to md_stop | 7614 | * this also signals 'finished resyncing' to md_stop |
| 7588 | */ | 7615 | */ |
| 7589 | out: | ||
| 7590 | blk_finish_plug(&plug); | 7616 | blk_finish_plug(&plug); |
| 7591 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); | 7617 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); |
| 7592 | 7618 | ||
| @@ -7640,16 +7666,6 @@ void md_do_sync(struct md_thread *thread) | |||
| 7640 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); | 7666 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); |
| 7641 | md_wakeup_thread(mddev->thread); | 7667 | md_wakeup_thread(mddev->thread); |
| 7642 | return; | 7668 | return; |
| 7643 | |||
| 7644 | interrupted: | ||
| 7645 | /* | ||
| 7646 | * got a signal, exit. | ||
| 7647 | */ | ||
| 7648 | printk(KERN_INFO | ||
| 7649 | "md: md_do_sync() got signal ... exiting\n"); | ||
| 7650 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
| 7651 | goto out; | ||
| 7652 | |||
| 7653 | } | 7669 | } |
| 7654 | EXPORT_SYMBOL_GPL(md_do_sync); | 7670 | EXPORT_SYMBOL_GPL(md_do_sync); |
| 7655 | 7671 | ||
| @@ -7751,7 +7767,7 @@ void md_check_recovery(struct mddev *mddev) | |||
| 7751 | if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) | 7767 | if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) |
| 7752 | return; | 7768 | return; |
| 7753 | if ( ! ( | 7769 | if ( ! ( |
| 7754 | (mddev->flags & ~ (1<<MD_CHANGE_PENDING)) || | 7770 | (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) || |
| 7755 | test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || | 7771 | test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || |
| 7756 | test_bit(MD_RECOVERY_DONE, &mddev->recovery) || | 7772 | test_bit(MD_RECOVERY_DONE, &mddev->recovery) || |
| 7757 | (mddev->external == 0 && mddev->safemode == 1) || | 7773 | (mddev->external == 0 && mddev->safemode == 1) || |
| @@ -7894,6 +7910,7 @@ void md_reap_sync_thread(struct mddev *mddev) | |||
| 7894 | 7910 | ||
| 7895 | /* resync has finished, collect result */ | 7911 | /* resync has finished, collect result */ |
| 7896 | md_unregister_thread(&mddev->sync_thread); | 7912 | md_unregister_thread(&mddev->sync_thread); |
| 7913 | wake_up(&resync_wait); | ||
| 7897 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && | 7914 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && |
| 7898 | !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | 7915 | !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { |
| 7899 | /* success...*/ | 7916 | /* success...*/ |
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index af96e24ec328..1d75b1dc1e2e 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c | |||
| @@ -317,8 +317,16 @@ static int shadow_ablock(struct dm_array_info *info, dm_block_t *root, | |||
| 317 | * The shadow op will often be a noop. Only insert if it really | 317 | * The shadow op will often be a noop. Only insert if it really |
| 318 | * copied data. | 318 | * copied data. |
| 319 | */ | 319 | */ |
| 320 | if (dm_block_location(*block) != b) | 320 | if (dm_block_location(*block) != b) { |
| 321 | /* | ||
| 322 | * dm_tm_shadow_block will have already decremented the old | ||
| 323 | * block, but it is still referenced by the btree. We | ||
| 324 | * increment to stop the insert decrementing it below zero | ||
| 325 | * when overwriting the old value. | ||
| 326 | */ | ||
| 327 | dm_tm_inc(info->btree_info.tm, b); | ||
| 321 | r = insert_ablock(info, index, *block, root); | 328 | r = insert_ablock(info, index, *block, root); |
| 329 | } | ||
| 322 | 330 | ||
| 323 | return r; | 331 | return r; |
| 324 | } | 332 | } |
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index a7e8bf296388..064a3c271baa 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c | |||
| @@ -626,6 +626,12 @@ void dm_bm_set_read_only(struct dm_block_manager *bm) | |||
| 626 | } | 626 | } |
| 627 | EXPORT_SYMBOL_GPL(dm_bm_set_read_only); | 627 | EXPORT_SYMBOL_GPL(dm_bm_set_read_only); |
| 628 | 628 | ||
| 629 | void dm_bm_set_read_write(struct dm_block_manager *bm) | ||
| 630 | { | ||
| 631 | bm->read_only = false; | ||
| 632 | } | ||
| 633 | EXPORT_SYMBOL_GPL(dm_bm_set_read_write); | ||
| 634 | |||
| 629 | u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor) | 635 | u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor) |
| 630 | { | 636 | { |
| 631 | return crc32c(~(u32) 0, data, len) ^ init_xor; | 637 | return crc32c(~(u32) 0, data, len) ^ init_xor; |
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h index 9a82083a66b6..13cd58e1fe69 100644 --- a/drivers/md/persistent-data/dm-block-manager.h +++ b/drivers/md/persistent-data/dm-block-manager.h | |||
| @@ -108,9 +108,9 @@ int dm_bm_unlock(struct dm_block *b); | |||
| 108 | int dm_bm_flush_and_unlock(struct dm_block_manager *bm, | 108 | int dm_bm_flush_and_unlock(struct dm_block_manager *bm, |
| 109 | struct dm_block *superblock); | 109 | struct dm_block *superblock); |
| 110 | 110 | ||
| 111 | /* | 111 | /* |
| 112 | * Request data be prefetched into the cache. | 112 | * Request data is prefetched into the cache. |
| 113 | */ | 113 | */ |
| 114 | void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b); | 114 | void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b); |
| 115 | 115 | ||
| 116 | /* | 116 | /* |
| @@ -125,6 +125,7 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b); | |||
| 125 | * be returned if you do. | 125 | * be returned if you do. |
| 126 | */ | 126 | */ |
| 127 | void dm_bm_set_read_only(struct dm_block_manager *bm); | 127 | void dm_bm_set_read_only(struct dm_block_manager *bm); |
| 128 | void dm_bm_set_read_write(struct dm_block_manager *bm); | ||
| 128 | 129 | ||
| 129 | u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor); | 130 | u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor); |
| 130 | 131 | ||
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index 6058569fe86c..466a60bbd716 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c | |||
| @@ -381,7 +381,7 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, | |||
| 381 | } | 381 | } |
| 382 | 382 | ||
| 383 | static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, | 383 | static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, |
| 384 | uint32_t (*mutator)(void *context, uint32_t old), | 384 | int (*mutator)(void *context, uint32_t old, uint32_t *new), |
| 385 | void *context, enum allocation_event *ev) | 385 | void *context, enum allocation_event *ev) |
| 386 | { | 386 | { |
| 387 | int r; | 387 | int r; |
| @@ -410,11 +410,17 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, | |||
| 410 | 410 | ||
| 411 | if (old > 2) { | 411 | if (old > 2) { |
| 412 | r = sm_ll_lookup_big_ref_count(ll, b, &old); | 412 | r = sm_ll_lookup_big_ref_count(ll, b, &old); |
| 413 | if (r < 0) | 413 | if (r < 0) { |
| 414 | dm_tm_unlock(ll->tm, nb); | ||
| 414 | return r; | 415 | return r; |
| 416 | } | ||
| 415 | } | 417 | } |
| 416 | 418 | ||
| 417 | ref_count = mutator(context, old); | 419 | r = mutator(context, old, &ref_count); |
| 420 | if (r) { | ||
| 421 | dm_tm_unlock(ll->tm, nb); | ||
| 422 | return r; | ||
| 423 | } | ||
| 418 | 424 | ||
| 419 | if (ref_count <= 2) { | 425 | if (ref_count <= 2) { |
| 420 | sm_set_bitmap(bm_le, bit, ref_count); | 426 | sm_set_bitmap(bm_le, bit, ref_count); |
| @@ -465,9 +471,10 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, | |||
| 465 | return ll->save_ie(ll, index, &ie_disk); | 471 | return ll->save_ie(ll, index, &ie_disk); |
| 466 | } | 472 | } |
| 467 | 473 | ||
| 468 | static uint32_t set_ref_count(void *context, uint32_t old) | 474 | static int set_ref_count(void *context, uint32_t old, uint32_t *new) |
| 469 | { | 475 | { |
| 470 | return *((uint32_t *) context); | 476 | *new = *((uint32_t *) context); |
| 477 | return 0; | ||
| 471 | } | 478 | } |
| 472 | 479 | ||
| 473 | int sm_ll_insert(struct ll_disk *ll, dm_block_t b, | 480 | int sm_ll_insert(struct ll_disk *ll, dm_block_t b, |
| @@ -476,9 +483,10 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b, | |||
| 476 | return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev); | 483 | return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev); |
| 477 | } | 484 | } |
| 478 | 485 | ||
| 479 | static uint32_t inc_ref_count(void *context, uint32_t old) | 486 | static int inc_ref_count(void *context, uint32_t old, uint32_t *new) |
| 480 | { | 487 | { |
| 481 | return old + 1; | 488 | *new = old + 1; |
| 489 | return 0; | ||
| 482 | } | 490 | } |
| 483 | 491 | ||
| 484 | int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) | 492 | int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) |
| @@ -486,9 +494,15 @@ int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) | |||
| 486 | return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev); | 494 | return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev); |
| 487 | } | 495 | } |
| 488 | 496 | ||
| 489 | static uint32_t dec_ref_count(void *context, uint32_t old) | 497 | static int dec_ref_count(void *context, uint32_t old, uint32_t *new) |
| 490 | { | 498 | { |
| 491 | return old - 1; | 499 | if (!old) { |
| 500 | DMERR_LIMIT("unable to decrement a reference count below 0"); | ||
| 501 | return -EINVAL; | ||
| 502 | } | ||
| 503 | |||
| 504 | *new = old - 1; | ||
| 505 | return 0; | ||
| 492 | } | 506 | } |
| 493 | 507 | ||
| 494 | int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) | 508 | int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) |
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 1c959684caef..58fc1eef7499 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c | |||
| @@ -384,12 +384,16 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b) | |||
| 384 | struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); | 384 | struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); |
| 385 | 385 | ||
| 386 | int r = sm_metadata_new_block_(sm, b); | 386 | int r = sm_metadata_new_block_(sm, b); |
| 387 | if (r) | 387 | if (r) { |
| 388 | DMERR("unable to allocate new metadata block"); | 388 | DMERR("unable to allocate new metadata block"); |
| 389 | return r; | ||
| 390 | } | ||
| 389 | 391 | ||
| 390 | r = sm_metadata_get_nr_free(sm, &count); | 392 | r = sm_metadata_get_nr_free(sm, &count); |
| 391 | if (r) | 393 | if (r) { |
| 392 | DMERR("couldn't get free block count"); | 394 | DMERR("couldn't get free block count"); |
| 395 | return r; | ||
| 396 | } | ||
| 393 | 397 | ||
| 394 | check_threshold(&smm->threshold, count); | 398 | check_threshold(&smm->threshold, count); |
| 395 | 399 | ||
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index af6681b19776..1e5a540995e9 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
| @@ -66,7 +66,8 @@ | |||
| 66 | */ | 66 | */ |
| 67 | static int max_queued_requests = 1024; | 67 | static int max_queued_requests = 1024; |
| 68 | 68 | ||
| 69 | static void allow_barrier(struct r1conf *conf); | 69 | static void allow_barrier(struct r1conf *conf, sector_t start_next_window, |
| 70 | sector_t bi_sector); | ||
| 70 | static void lower_barrier(struct r1conf *conf); | 71 | static void lower_barrier(struct r1conf *conf); |
| 71 | 72 | ||
| 72 | static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) | 73 | static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) |
| @@ -84,10 +85,12 @@ static void r1bio_pool_free(void *r1_bio, void *data) | |||
| 84 | } | 85 | } |
| 85 | 86 | ||
| 86 | #define RESYNC_BLOCK_SIZE (64*1024) | 87 | #define RESYNC_BLOCK_SIZE (64*1024) |
| 87 | //#define RESYNC_BLOCK_SIZE PAGE_SIZE | 88 | #define RESYNC_DEPTH 32 |
| 88 | #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) | 89 | #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) |
| 89 | #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) | 90 | #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) |
| 90 | #define RESYNC_WINDOW (2048*1024) | 91 | #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) |
| 92 | #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) | ||
| 93 | #define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) | ||
| 91 | 94 | ||
| 92 | static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) | 95 | static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) |
| 93 | { | 96 | { |
| @@ -225,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio) | |||
| 225 | struct bio *bio = r1_bio->master_bio; | 228 | struct bio *bio = r1_bio->master_bio; |
| 226 | int done; | 229 | int done; |
| 227 | struct r1conf *conf = r1_bio->mddev->private; | 230 | struct r1conf *conf = r1_bio->mddev->private; |
| 231 | sector_t start_next_window = r1_bio->start_next_window; | ||
| 232 | sector_t bi_sector = bio->bi_sector; | ||
| 228 | 233 | ||
| 229 | if (bio->bi_phys_segments) { | 234 | if (bio->bi_phys_segments) { |
| 230 | unsigned long flags; | 235 | unsigned long flags; |
| @@ -232,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio) | |||
| 232 | bio->bi_phys_segments--; | 237 | bio->bi_phys_segments--; |
| 233 | done = (bio->bi_phys_segments == 0); | 238 | done = (bio->bi_phys_segments == 0); |
| 234 | spin_unlock_irqrestore(&conf->device_lock, flags); | 239 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 240 | /* | ||
| 241 | * make_request() might be waiting for | ||
| 242 | * bi_phys_segments to decrease | ||
| 243 | */ | ||
| 244 | wake_up(&conf->wait_barrier); | ||
| 235 | } else | 245 | } else |
| 236 | done = 1; | 246 | done = 1; |
| 237 | 247 | ||
| @@ -243,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio) | |||
| 243 | * Wake up any possible resync thread that waits for the device | 253 | * Wake up any possible resync thread that waits for the device |
| 244 | * to go idle. | 254 | * to go idle. |
| 245 | */ | 255 | */ |
| 246 | allow_barrier(conf); | 256 | allow_barrier(conf, start_next_window, bi_sector); |
| 247 | } | 257 | } |
| 248 | } | 258 | } |
| 249 | 259 | ||
| @@ -814,8 +824,6 @@ static void flush_pending_writes(struct r1conf *conf) | |||
| 814 | * there is no normal IO happeing. It must arrange to call | 824 | * there is no normal IO happeing. It must arrange to call |
| 815 | * lower_barrier when the particular background IO completes. | 825 | * lower_barrier when the particular background IO completes. |
| 816 | */ | 826 | */ |
| 817 | #define RESYNC_DEPTH 32 | ||
| 818 | |||
| 819 | static void raise_barrier(struct r1conf *conf) | 827 | static void raise_barrier(struct r1conf *conf) |
| 820 | { | 828 | { |
| 821 | spin_lock_irq(&conf->resync_lock); | 829 | spin_lock_irq(&conf->resync_lock); |
| @@ -827,9 +835,19 @@ static void raise_barrier(struct r1conf *conf) | |||
| 827 | /* block any new IO from starting */ | 835 | /* block any new IO from starting */ |
| 828 | conf->barrier++; | 836 | conf->barrier++; |
| 829 | 837 | ||
| 830 | /* Now wait for all pending IO to complete */ | 838 | /* For these conditions we must wait: |
| 839 | * A: while the array is in frozen state | ||
| 840 | * B: while barrier >= RESYNC_DEPTH, meaning resync reach | ||
| 841 | * the max count which allowed. | ||
| 842 | * C: next_resync + RESYNC_SECTORS > start_next_window, meaning | ||
| 843 | * next resync will reach to the window which normal bios are | ||
| 844 | * handling. | ||
| 845 | */ | ||
| 831 | wait_event_lock_irq(conf->wait_barrier, | 846 | wait_event_lock_irq(conf->wait_barrier, |
| 832 | !conf->nr_pending && conf->barrier < RESYNC_DEPTH, | 847 | !conf->array_frozen && |
| 848 | conf->barrier < RESYNC_DEPTH && | ||
| 849 | (conf->start_next_window >= | ||
| 850 | conf->next_resync + RESYNC_SECTORS), | ||
| 833 | conf->resync_lock); | 851 | conf->resync_lock); |
| 834 | 852 | ||
| 835 | spin_unlock_irq(&conf->resync_lock); | 853 | spin_unlock_irq(&conf->resync_lock); |
| @@ -845,10 +863,33 @@ static void lower_barrier(struct r1conf *conf) | |||
| 845 | wake_up(&conf->wait_barrier); | 863 | wake_up(&conf->wait_barrier); |
| 846 | } | 864 | } |
| 847 | 865 | ||
| 848 | static void wait_barrier(struct r1conf *conf) | 866 | static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) |
| 849 | { | 867 | { |
| 868 | bool wait = false; | ||
| 869 | |||
| 870 | if (conf->array_frozen || !bio) | ||
| 871 | wait = true; | ||
| 872 | else if (conf->barrier && bio_data_dir(bio) == WRITE) { | ||
| 873 | if (conf->next_resync < RESYNC_WINDOW_SECTORS) | ||
| 874 | wait = true; | ||
| 875 | else if ((conf->next_resync - RESYNC_WINDOW_SECTORS | ||
| 876 | >= bio_end_sector(bio)) || | ||
| 877 | (conf->next_resync + NEXT_NORMALIO_DISTANCE | ||
| 878 | <= bio->bi_sector)) | ||
| 879 | wait = false; | ||
| 880 | else | ||
| 881 | wait = true; | ||
| 882 | } | ||
| 883 | |||
| 884 | return wait; | ||
| 885 | } | ||
| 886 | |||
| 887 | static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) | ||
| 888 | { | ||
| 889 | sector_t sector = 0; | ||
| 890 | |||
| 850 | spin_lock_irq(&conf->resync_lock); | 891 | spin_lock_irq(&conf->resync_lock); |
| 851 | if (conf->barrier) { | 892 | if (need_to_wait_for_sync(conf, bio)) { |
| 852 | conf->nr_waiting++; | 893 | conf->nr_waiting++; |
| 853 | /* Wait for the barrier to drop. | 894 | /* Wait for the barrier to drop. |
| 854 | * However if there are already pending | 895 | * However if there are already pending |
| @@ -860,22 +901,67 @@ static void wait_barrier(struct r1conf *conf) | |||
| 860 | * count down. | 901 | * count down. |
| 861 | */ | 902 | */ |
| 862 | wait_event_lock_irq(conf->wait_barrier, | 903 | wait_event_lock_irq(conf->wait_barrier, |
| 863 | !conf->barrier || | 904 | !conf->array_frozen && |
| 864 | (conf->nr_pending && | 905 | (!conf->barrier || |
| 906 | ((conf->start_next_window < | ||
| 907 | conf->next_resync + RESYNC_SECTORS) && | ||
| 865 | current->bio_list && | 908 | current->bio_list && |
| 866 | !bio_list_empty(current->bio_list)), | 909 | !bio_list_empty(current->bio_list))), |
| 867 | conf->resync_lock); | 910 | conf->resync_lock); |
| 868 | conf->nr_waiting--; | 911 | conf->nr_waiting--; |
| 869 | } | 912 | } |
| 913 | |||
| 914 | if (bio && bio_data_dir(bio) == WRITE) { | ||
| 915 | if (conf->next_resync + NEXT_NORMALIO_DISTANCE | ||
| 916 | <= bio->bi_sector) { | ||
| 917 | if (conf->start_next_window == MaxSector) | ||
| 918 | conf->start_next_window = | ||
| 919 | conf->next_resync + | ||
| 920 | NEXT_NORMALIO_DISTANCE; | ||
| 921 | |||
| 922 | if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) | ||
| 923 | <= bio->bi_sector) | ||
| 924 | conf->next_window_requests++; | ||
| 925 | else | ||
| 926 | conf->current_window_requests++; | ||
| 927 | } | ||
| 928 | if (bio->bi_sector >= conf->start_next_window) | ||
| 929 | sector = conf->start_next_window; | ||
| 930 | } | ||
| 931 | |||
| 870 | conf->nr_pending++; | 932 | conf->nr_pending++; |
| 871 | spin_unlock_irq(&conf->resync_lock); | 933 | spin_unlock_irq(&conf->resync_lock); |
| 934 | return sector; | ||
| 872 | } | 935 | } |
| 873 | 936 | ||
| 874 | static void allow_barrier(struct r1conf *conf) | 937 | static void allow_barrier(struct r1conf *conf, sector_t start_next_window, |
| 938 | sector_t bi_sector) | ||
| 875 | { | 939 | { |
| 876 | unsigned long flags; | 940 | unsigned long flags; |
| 941 | |||
| 877 | spin_lock_irqsave(&conf->resync_lock, flags); | 942 | spin_lock_irqsave(&conf->resync_lock, flags); |
| 878 | conf->nr_pending--; | 943 | conf->nr_pending--; |
| 944 | if (start_next_window) { | ||
| 945 | if (start_next_window == conf->start_next_window) { | ||
| 946 | if (conf->start_next_window + NEXT_NORMALIO_DISTANCE | ||
| 947 | <= bi_sector) | ||
| 948 | conf->next_window_requests--; | ||
| 949 | else | ||
| 950 | conf->current_window_requests--; | ||
| 951 | } else | ||
| 952 | conf->current_window_requests--; | ||
| 953 | |||
| 954 | if (!conf->current_window_requests) { | ||
| 955 | if (conf->next_window_requests) { | ||
| 956 | conf->current_window_requests = | ||
| 957 | conf->next_window_requests; | ||
| 958 | conf->next_window_requests = 0; | ||
| 959 | conf->start_next_window += | ||
| 960 | NEXT_NORMALIO_DISTANCE; | ||
| 961 | } else | ||
| 962 | conf->start_next_window = MaxSector; | ||
| 963 | } | ||
| 964 | } | ||
| 879 | spin_unlock_irqrestore(&conf->resync_lock, flags); | 965 | spin_unlock_irqrestore(&conf->resync_lock, flags); |
| 880 | wake_up(&conf->wait_barrier); | 966 | wake_up(&conf->wait_barrier); |
| 881 | } | 967 | } |
| @@ -884,8 +970,7 @@ static void freeze_array(struct r1conf *conf, int extra) | |||
| 884 | { | 970 | { |
| 885 | /* stop syncio and normal IO and wait for everything to | 971 | /* stop syncio and normal IO and wait for everything to |
| 886 | * go quiet. | 972 | * go quiet. |
| 887 | * We increment barrier and nr_waiting, and then | 973 | * We wait until nr_pending match nr_queued+extra |
| 888 | * wait until nr_pending match nr_queued+extra | ||
| 889 | * This is called in the context of one normal IO request | 974 | * This is called in the context of one normal IO request |
| 890 | * that has failed. Thus any sync request that might be pending | 975 | * that has failed. Thus any sync request that might be pending |
| 891 | * will be blocked by nr_pending, and we need to wait for | 976 | * will be blocked by nr_pending, and we need to wait for |
| @@ -895,8 +980,7 @@ static void freeze_array(struct r1conf *conf, int extra) | |||
| 895 | * we continue. | 980 | * we continue. |
| 896 | */ | 981 | */ |
| 897 | spin_lock_irq(&conf->resync_lock); | 982 | spin_lock_irq(&conf->resync_lock); |
| 898 | conf->barrier++; | 983 | conf->array_frozen = 1; |
| 899 | conf->nr_waiting++; | ||
| 900 | wait_event_lock_irq_cmd(conf->wait_barrier, | 984 | wait_event_lock_irq_cmd(conf->wait_barrier, |
| 901 | conf->nr_pending == conf->nr_queued+extra, | 985 | conf->nr_pending == conf->nr_queued+extra, |
| 902 | conf->resync_lock, | 986 | conf->resync_lock, |
| @@ -907,8 +991,7 @@ static void unfreeze_array(struct r1conf *conf) | |||
| 907 | { | 991 | { |
| 908 | /* reverse the effect of the freeze */ | 992 | /* reverse the effect of the freeze */ |
| 909 | spin_lock_irq(&conf->resync_lock); | 993 | spin_lock_irq(&conf->resync_lock); |
| 910 | conf->barrier--; | 994 | conf->array_frozen = 0; |
| 911 | conf->nr_waiting--; | ||
| 912 | wake_up(&conf->wait_barrier); | 995 | wake_up(&conf->wait_barrier); |
| 913 | spin_unlock_irq(&conf->resync_lock); | 996 | spin_unlock_irq(&conf->resync_lock); |
| 914 | } | 997 | } |
| @@ -1013,6 +1096,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
| 1013 | int first_clone; | 1096 | int first_clone; |
| 1014 | int sectors_handled; | 1097 | int sectors_handled; |
| 1015 | int max_sectors; | 1098 | int max_sectors; |
| 1099 | sector_t start_next_window; | ||
| 1016 | 1100 | ||
| 1017 | /* | 1101 | /* |
| 1018 | * Register the new request and wait if the reconstruction | 1102 | * Register the new request and wait if the reconstruction |
| @@ -1042,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
| 1042 | finish_wait(&conf->wait_barrier, &w); | 1126 | finish_wait(&conf->wait_barrier, &w); |
| 1043 | } | 1127 | } |
| 1044 | 1128 | ||
| 1045 | wait_barrier(conf); | 1129 | start_next_window = wait_barrier(conf, bio); |
| 1046 | 1130 | ||
| 1047 | bitmap = mddev->bitmap; | 1131 | bitmap = mddev->bitmap; |
| 1048 | 1132 | ||
| @@ -1163,6 +1247,7 @@ read_again: | |||
| 1163 | 1247 | ||
| 1164 | disks = conf->raid_disks * 2; | 1248 | disks = conf->raid_disks * 2; |
| 1165 | retry_write: | 1249 | retry_write: |
| 1250 | r1_bio->start_next_window = start_next_window; | ||
| 1166 | blocked_rdev = NULL; | 1251 | blocked_rdev = NULL; |
| 1167 | rcu_read_lock(); | 1252 | rcu_read_lock(); |
| 1168 | max_sectors = r1_bio->sectors; | 1253 | max_sectors = r1_bio->sectors; |
| @@ -1231,14 +1316,24 @@ read_again: | |||
| 1231 | if (unlikely(blocked_rdev)) { | 1316 | if (unlikely(blocked_rdev)) { |
| 1232 | /* Wait for this device to become unblocked */ | 1317 | /* Wait for this device to become unblocked */ |
| 1233 | int j; | 1318 | int j; |
| 1319 | sector_t old = start_next_window; | ||
| 1234 | 1320 | ||
| 1235 | for (j = 0; j < i; j++) | 1321 | for (j = 0; j < i; j++) |
| 1236 | if (r1_bio->bios[j]) | 1322 | if (r1_bio->bios[j]) |
| 1237 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); | 1323 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); |
| 1238 | r1_bio->state = 0; | 1324 | r1_bio->state = 0; |
| 1239 | allow_barrier(conf); | 1325 | allow_barrier(conf, start_next_window, bio->bi_sector); |
| 1240 | md_wait_for_blocked_rdev(blocked_rdev, mddev); | 1326 | md_wait_for_blocked_rdev(blocked_rdev, mddev); |
| 1241 | wait_barrier(conf); | 1327 | start_next_window = wait_barrier(conf, bio); |
| 1328 | /* | ||
| 1329 | * We must make sure the multi r1bios of bio have | ||
| 1330 | * the same value of bi_phys_segments | ||
| 1331 | */ | ||
| 1332 | if (bio->bi_phys_segments && old && | ||
| 1333 | old != start_next_window) | ||
| 1334 | /* Wait for the former r1bio(s) to complete */ | ||
| 1335 | wait_event(conf->wait_barrier, | ||
| 1336 | bio->bi_phys_segments == 1); | ||
| 1242 | goto retry_write; | 1337 | goto retry_write; |
| 1243 | } | 1338 | } |
| 1244 | 1339 | ||
| @@ -1438,11 +1533,14 @@ static void print_conf(struct r1conf *conf) | |||
| 1438 | 1533 | ||
| 1439 | static void close_sync(struct r1conf *conf) | 1534 | static void close_sync(struct r1conf *conf) |
| 1440 | { | 1535 | { |
| 1441 | wait_barrier(conf); | 1536 | wait_barrier(conf, NULL); |
| 1442 | allow_barrier(conf); | 1537 | allow_barrier(conf, 0, 0); |
| 1443 | 1538 | ||
| 1444 | mempool_destroy(conf->r1buf_pool); | 1539 | mempool_destroy(conf->r1buf_pool); |
| 1445 | conf->r1buf_pool = NULL; | 1540 | conf->r1buf_pool = NULL; |
| 1541 | |||
| 1542 | conf->next_resync = 0; | ||
| 1543 | conf->start_next_window = MaxSector; | ||
| 1446 | } | 1544 | } |
| 1447 | 1545 | ||
| 1448 | static int raid1_spare_active(struct mddev *mddev) | 1546 | static int raid1_spare_active(struct mddev *mddev) |
| @@ -2714,6 +2812,9 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
| 2714 | conf->pending_count = 0; | 2812 | conf->pending_count = 0; |
| 2715 | conf->recovery_disabled = mddev->recovery_disabled - 1; | 2813 | conf->recovery_disabled = mddev->recovery_disabled - 1; |
| 2716 | 2814 | ||
| 2815 | conf->start_next_window = MaxSector; | ||
| 2816 | conf->current_window_requests = conf->next_window_requests = 0; | ||
| 2817 | |||
| 2717 | err = -EIO; | 2818 | err = -EIO; |
| 2718 | for (i = 0; i < conf->raid_disks * 2; i++) { | 2819 | for (i = 0; i < conf->raid_disks * 2; i++) { |
| 2719 | 2820 | ||
| @@ -2871,8 +2972,8 @@ static int stop(struct mddev *mddev) | |||
| 2871 | atomic_read(&bitmap->behind_writes) == 0); | 2972 | atomic_read(&bitmap->behind_writes) == 0); |
| 2872 | } | 2973 | } |
| 2873 | 2974 | ||
| 2874 | raise_barrier(conf); | 2975 | freeze_array(conf, 0); |
| 2875 | lower_barrier(conf); | 2976 | unfreeze_array(conf); |
| 2876 | 2977 | ||
| 2877 | md_unregister_thread(&mddev->thread); | 2978 | md_unregister_thread(&mddev->thread); |
| 2878 | if (conf->r1bio_pool) | 2979 | if (conf->r1bio_pool) |
| @@ -3031,10 +3132,10 @@ static void raid1_quiesce(struct mddev *mddev, int state) | |||
| 3031 | wake_up(&conf->wait_barrier); | 3132 | wake_up(&conf->wait_barrier); |
| 3032 | break; | 3133 | break; |
| 3033 | case 1: | 3134 | case 1: |
| 3034 | raise_barrier(conf); | 3135 | freeze_array(conf, 0); |
| 3035 | break; | 3136 | break; |
| 3036 | case 0: | 3137 | case 0: |
| 3037 | lower_barrier(conf); | 3138 | unfreeze_array(conf); |
| 3038 | break; | 3139 | break; |
| 3039 | } | 3140 | } |
| 3040 | } | 3141 | } |
| @@ -3051,7 +3152,8 @@ static void *raid1_takeover(struct mddev *mddev) | |||
| 3051 | mddev->new_chunk_sectors = 0; | 3152 | mddev->new_chunk_sectors = 0; |
| 3052 | conf = setup_conf(mddev); | 3153 | conf = setup_conf(mddev); |
| 3053 | if (!IS_ERR(conf)) | 3154 | if (!IS_ERR(conf)) |
| 3054 | conf->barrier = 1; | 3155 | /* Array must appear to be quiesced */ |
| 3156 | conf->array_frozen = 1; | ||
| 3055 | return conf; | 3157 | return conf; |
| 3056 | } | 3158 | } |
| 3057 | return ERR_PTR(-EINVAL); | 3159 | return ERR_PTR(-EINVAL); |
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 0ff3715fb7eb..9bebca7bff2f 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h | |||
| @@ -41,6 +41,19 @@ struct r1conf { | |||
| 41 | */ | 41 | */ |
| 42 | sector_t next_resync; | 42 | sector_t next_resync; |
| 43 | 43 | ||
| 44 | /* When raid1 starts resync, we divide array into four partitions | ||
| 45 | * |---------|--------------|---------------------|-------------| | ||
| 46 | * next_resync start_next_window end_window | ||
| 47 | * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE | ||
| 48 | * end_window = start_next_window + NEXT_NORMALIO_DISTANCE | ||
| 49 | * current_window_requests means the count of normalIO between | ||
| 50 | * start_next_window and end_window. | ||
| 51 | * next_window_requests means the count of normalIO after end_window. | ||
| 52 | * */ | ||
| 53 | sector_t start_next_window; | ||
| 54 | int current_window_requests; | ||
| 55 | int next_window_requests; | ||
| 56 | |||
| 44 | spinlock_t device_lock; | 57 | spinlock_t device_lock; |
| 45 | 58 | ||
| 46 | /* list of 'struct r1bio' that need to be processed by raid1d, | 59 | /* list of 'struct r1bio' that need to be processed by raid1d, |
| @@ -65,6 +78,7 @@ struct r1conf { | |||
| 65 | int nr_waiting; | 78 | int nr_waiting; |
| 66 | int nr_queued; | 79 | int nr_queued; |
| 67 | int barrier; | 80 | int barrier; |
| 81 | int array_frozen; | ||
| 68 | 82 | ||
| 69 | /* Set to 1 if a full sync is needed, (fresh device added). | 83 | /* Set to 1 if a full sync is needed, (fresh device added). |
| 70 | * Cleared when a sync completes. | 84 | * Cleared when a sync completes. |
| @@ -111,6 +125,7 @@ struct r1bio { | |||
| 111 | * in this BehindIO request | 125 | * in this BehindIO request |
| 112 | */ | 126 | */ |
| 113 | sector_t sector; | 127 | sector_t sector; |
| 128 | sector_t start_next_window; | ||
| 114 | int sectors; | 129 | int sectors; |
| 115 | unsigned long state; | 130 | unsigned long state; |
| 116 | struct mddev *mddev; | 131 | struct mddev *mddev; |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7c3508abb5e1..c504e8389e69 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
| @@ -4384,7 +4384,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, | |||
| 4384 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 4384 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
| 4385 | md_wakeup_thread(mddev->thread); | 4385 | md_wakeup_thread(mddev->thread); |
| 4386 | wait_event(mddev->sb_wait, mddev->flags == 0 || | 4386 | wait_event(mddev->sb_wait, mddev->flags == 0 || |
| 4387 | kthread_should_stop()); | 4387 | test_bit(MD_RECOVERY_INTR, &mddev->recovery)); |
| 4388 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | ||
| 4389 | allow_barrier(conf); | ||
| 4390 | return sectors_done; | ||
| 4391 | } | ||
| 4388 | conf->reshape_safe = mddev->reshape_position; | 4392 | conf->reshape_safe = mddev->reshape_position; |
| 4389 | allow_barrier(conf); | 4393 | allow_barrier(conf); |
| 4390 | } | 4394 | } |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7f0e17a27aeb..cc055da02e2a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
| @@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) | |||
| 85 | return &conf->stripe_hashtbl[hash]; | 85 | return &conf->stripe_hashtbl[hash]; |
| 86 | } | 86 | } |
| 87 | 87 | ||
| 88 | static inline int stripe_hash_locks_hash(sector_t sect) | ||
| 89 | { | ||
| 90 | return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; | ||
| 91 | } | ||
| 92 | |||
| 93 | static inline void lock_device_hash_lock(struct r5conf *conf, int hash) | ||
| 94 | { | ||
| 95 | spin_lock_irq(conf->hash_locks + hash); | ||
| 96 | spin_lock(&conf->device_lock); | ||
| 97 | } | ||
| 98 | |||
| 99 | static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) | ||
| 100 | { | ||
| 101 | spin_unlock(&conf->device_lock); | ||
| 102 | spin_unlock_irq(conf->hash_locks + hash); | ||
| 103 | } | ||
| 104 | |||
| 105 | static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) | ||
| 106 | { | ||
| 107 | int i; | ||
| 108 | local_irq_disable(); | ||
| 109 | spin_lock(conf->hash_locks); | ||
| 110 | for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) | ||
| 111 | spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); | ||
| 112 | spin_lock(&conf->device_lock); | ||
| 113 | } | ||
| 114 | |||
| 115 | static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) | ||
| 116 | { | ||
| 117 | int i; | ||
| 118 | spin_unlock(&conf->device_lock); | ||
| 119 | for (i = NR_STRIPE_HASH_LOCKS; i; i--) | ||
| 120 | spin_unlock(conf->hash_locks + i - 1); | ||
| 121 | local_irq_enable(); | ||
| 122 | } | ||
| 123 | |||
| 88 | /* bio's attached to a stripe+device for I/O are linked together in bi_sector | 124 | /* bio's attached to a stripe+device for I/O are linked together in bi_sector |
| 89 | * order without overlap. There may be several bio's per stripe+device, and | 125 | * order without overlap. There may be several bio's per stripe+device, and |
| 90 | * a bio could span several devices. | 126 | * a bio could span several devices. |
| @@ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh) | |||
| 249 | } | 285 | } |
| 250 | } | 286 | } |
| 251 | 287 | ||
| 252 | static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) | 288 | static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, |
| 289 | struct list_head *temp_inactive_list) | ||
| 253 | { | 290 | { |
| 254 | BUG_ON(!list_empty(&sh->lru)); | 291 | BUG_ON(!list_empty(&sh->lru)); |
| 255 | BUG_ON(atomic_read(&conf->active_stripes)==0); | 292 | BUG_ON(atomic_read(&conf->active_stripes)==0); |
| @@ -278,23 +315,68 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) | |||
| 278 | < IO_THRESHOLD) | 315 | < IO_THRESHOLD) |
| 279 | md_wakeup_thread(conf->mddev->thread); | 316 | md_wakeup_thread(conf->mddev->thread); |
| 280 | atomic_dec(&conf->active_stripes); | 317 | atomic_dec(&conf->active_stripes); |
| 281 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) { | 318 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) |
| 282 | list_add_tail(&sh->lru, &conf->inactive_list); | 319 | list_add_tail(&sh->lru, temp_inactive_list); |
| 283 | wake_up(&conf->wait_for_stripe); | ||
| 284 | if (conf->retry_read_aligned) | ||
| 285 | md_wakeup_thread(conf->mddev->thread); | ||
| 286 | } | ||
| 287 | } | 320 | } |
| 288 | } | 321 | } |
| 289 | 322 | ||
| 290 | static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) | 323 | static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, |
| 324 | struct list_head *temp_inactive_list) | ||
| 291 | { | 325 | { |
| 292 | if (atomic_dec_and_test(&sh->count)) | 326 | if (atomic_dec_and_test(&sh->count)) |
| 293 | do_release_stripe(conf, sh); | 327 | do_release_stripe(conf, sh, temp_inactive_list); |
| 328 | } | ||
| 329 | |||
| 330 | /* | ||
| 331 | * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list | ||
| 332 | * | ||
| 333 | * Be careful: Only one task can add/delete stripes from temp_inactive_list at | ||
| 334 | * given time. Adding stripes only takes device lock, while deleting stripes | ||
| 335 | * only takes hash lock. | ||
| 336 | */ | ||
| 337 | static void release_inactive_stripe_list(struct r5conf *conf, | ||
| 338 | struct list_head *temp_inactive_list, | ||
| 339 | int hash) | ||
| 340 | { | ||
| 341 | int size; | ||
| 342 | bool do_wakeup = false; | ||
| 343 | unsigned long flags; | ||
| 344 | |||
| 345 | if (hash == NR_STRIPE_HASH_LOCKS) { | ||
| 346 | size = NR_STRIPE_HASH_LOCKS; | ||
| 347 | hash = NR_STRIPE_HASH_LOCKS - 1; | ||
| 348 | } else | ||
| 349 | size = 1; | ||
| 350 | while (size) { | ||
| 351 | struct list_head *list = &temp_inactive_list[size - 1]; | ||
| 352 | |||
| 353 | /* | ||
| 354 | * We don't hold any lock here yet, get_active_stripe() might | ||
| 355 | * remove stripes from the list | ||
| 356 | */ | ||
| 357 | if (!list_empty_careful(list)) { | ||
| 358 | spin_lock_irqsave(conf->hash_locks + hash, flags); | ||
| 359 | if (list_empty(conf->inactive_list + hash) && | ||
| 360 | !list_empty(list)) | ||
| 361 | atomic_dec(&conf->empty_inactive_list_nr); | ||
| 362 | list_splice_tail_init(list, conf->inactive_list + hash); | ||
| 363 | do_wakeup = true; | ||
| 364 | spin_unlock_irqrestore(conf->hash_locks + hash, flags); | ||
| 365 | } | ||
| 366 | size--; | ||
| 367 | hash--; | ||
| 368 | } | ||
| 369 | |||
| 370 | if (do_wakeup) { | ||
| 371 | wake_up(&conf->wait_for_stripe); | ||
| 372 | if (conf->retry_read_aligned) | ||
| 373 | md_wakeup_thread(conf->mddev->thread); | ||
| 374 | } | ||
| 294 | } | 375 | } |
| 295 | 376 | ||
| 296 | /* should hold conf->device_lock already */ | 377 | /* should hold conf->device_lock already */ |
| 297 | static int release_stripe_list(struct r5conf *conf) | 378 | static int release_stripe_list(struct r5conf *conf, |
| 379 | struct list_head *temp_inactive_list) | ||
| 298 | { | 380 | { |
| 299 | struct stripe_head *sh; | 381 | struct stripe_head *sh; |
| 300 | int count = 0; | 382 | int count = 0; |
| @@ -303,6 +385,8 @@ static int release_stripe_list(struct r5conf *conf) | |||
| 303 | head = llist_del_all(&conf->released_stripes); | 385 | head = llist_del_all(&conf->released_stripes); |
| 304 | head = llist_reverse_order(head); | 386 | head = llist_reverse_order(head); |
| 305 | while (head) { | 387 | while (head) { |
| 388 | int hash; | ||
| 389 | |||
| 306 | sh = llist_entry(head, struct stripe_head, release_list); | 390 | sh = llist_entry(head, struct stripe_head, release_list); |
| 307 | head = llist_next(head); | 391 | head = llist_next(head); |
| 308 | /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleared */ | 392 | /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleared */ |
| @@ -313,7 +397,8 @@ static int release_stripe_list(struct r5conf *conf) | |||
| 313 | * again, the count is always > 1. This is true for | 397 | * again, the count is always > 1. This is true for |
| 314 | * STRIPE_ON_UNPLUG_LIST bit too. | 398 | * STRIPE_ON_UNPLUG_LIST bit too. |
| 315 | */ | 399 | */ |
| 316 | __release_stripe(conf, sh); | 400 | hash = sh->hash_lock_index; |
| 401 | __release_stripe(conf, sh, &temp_inactive_list[hash]); | ||
| 317 | count++; | 402 | count++; |
| 318 | } | 403 | } |
| 319 | 404 | ||
| @@ -324,9 +409,12 @@ static void release_stripe(struct stripe_head *sh) | |||
| 324 | { | 409 | { |
| 325 | struct r5conf *conf = sh->raid_conf; | 410 | struct r5conf *conf = sh->raid_conf; |
| 326 | unsigned long flags; | 411 | unsigned long flags; |
| 412 | struct list_head list; | ||
| 413 | int hash; | ||
| 327 | bool wakeup; | 414 | bool wakeup; |
| 328 | 415 | ||
| 329 | if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) | 416 | if (unlikely(!conf->mddev->thread) || |
| 417 | test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) | ||
| 330 | goto slow_path; | 418 | goto slow_path; |
| 331 | wakeup = llist_add(&sh->release_list, &conf->released_stripes); | 419 | wakeup = llist_add(&sh->release_list, &conf->released_stripes); |
| 332 | if (wakeup) | 420 | if (wakeup) |
| @@ -336,8 +424,11 @@ slow_path: | |||
| 336 | local_irq_save(flags); | 424 | local_irq_save(flags); |
| 337 | /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ | 425 | /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ |
| 338 | if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { | 426 | if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { |
| 339 | do_release_stripe(conf, sh); | 427 | INIT_LIST_HEAD(&list); |
| 428 | hash = sh->hash_lock_index; | ||
| 429 | do_release_stripe(conf, sh, &list); | ||
| 340 | spin_unlock(&conf->device_lock); | 430 | spin_unlock(&conf->device_lock); |
| 431 | release_inactive_stripe_list(conf, &list, hash); | ||
| 341 | } | 432 | } |
| 342 | local_irq_restore(flags); | 433 | local_irq_restore(flags); |
| 343 | } | 434 | } |
| @@ -362,18 +453,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) | |||
| 362 | 453 | ||
| 363 | 454 | ||
| 364 | /* find an idle stripe, make sure it is unhashed, and return it. */ | 455 | /* find an idle stripe, make sure it is unhashed, and return it. */ |
| 365 | static struct stripe_head *get_free_stripe(struct r5conf *conf) | 456 | static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) |
| 366 | { | 457 | { |
| 367 | struct stripe_head *sh = NULL; | 458 | struct stripe_head *sh = NULL; |
| 368 | struct list_head *first; | 459 | struct list_head *first; |
| 369 | 460 | ||
| 370 | if (list_empty(&conf->inactive_list)) | 461 | if (list_empty(conf->inactive_list + hash)) |
| 371 | goto out; | 462 | goto out; |
| 372 | first = conf->inactive_list.next; | 463 | first = (conf->inactive_list + hash)->next; |
| 373 | sh = list_entry(first, struct stripe_head, lru); | 464 | sh = list_entry(first, struct stripe_head, lru); |
| 374 | list_del_init(first); | 465 | list_del_init(first); |
| 375 | remove_hash(sh); | 466 | remove_hash(sh); |
| 376 | atomic_inc(&conf->active_stripes); | 467 | atomic_inc(&conf->active_stripes); |
| 468 | BUG_ON(hash != sh->hash_lock_index); | ||
| 469 | if (list_empty(conf->inactive_list + hash)) | ||
| 470 | atomic_inc(&conf->empty_inactive_list_nr); | ||
| 377 | out: | 471 | out: |
| 378 | return sh; | 472 | return sh; |
| 379 | } | 473 | } |
| @@ -416,7 +510,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, | |||
| 416 | static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) | 510 | static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) |
| 417 | { | 511 | { |
| 418 | struct r5conf *conf = sh->raid_conf; | 512 | struct r5conf *conf = sh->raid_conf; |
| 419 | int i; | 513 | int i, seq; |
| 420 | 514 | ||
| 421 | BUG_ON(atomic_read(&sh->count) != 0); | 515 | BUG_ON(atomic_read(&sh->count) != 0); |
| 422 | BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); | 516 | BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); |
| @@ -426,7 +520,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) | |||
| 426 | (unsigned long long)sh->sector); | 520 | (unsigned long long)sh->sector); |
| 427 | 521 | ||
| 428 | remove_hash(sh); | 522 | remove_hash(sh); |
| 429 | 523 | retry: | |
| 524 | seq = read_seqcount_begin(&conf->gen_lock); | ||
| 430 | sh->generation = conf->generation - previous; | 525 | sh->generation = conf->generation - previous; |
| 431 | sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; | 526 | sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; |
| 432 | sh->sector = sector; | 527 | sh->sector = sector; |
| @@ -448,6 +543,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) | |||
| 448 | dev->flags = 0; | 543 | dev->flags = 0; |
| 449 | raid5_build_block(sh, i, previous); | 544 | raid5_build_block(sh, i, previous); |
| 450 | } | 545 | } |
| 546 | if (read_seqcount_retry(&conf->gen_lock, seq)) | ||
| 547 | goto retry; | ||
| 451 | insert_hash(conf, sh); | 548 | insert_hash(conf, sh); |
| 452 | sh->cpu = smp_processor_id(); | 549 | sh->cpu = smp_processor_id(); |
| 453 | } | 550 | } |
| @@ -552,57 +649,59 @@ get_active_stripe(struct r5conf *conf, sector_t sector, | |||
| 552 | int previous, int noblock, int noquiesce) | 649 | int previous, int noblock, int noquiesce) |
| 553 | { | 650 | { |
| 554 | struct stripe_head *sh; | 651 | struct stripe_head *sh; |
| 652 | int hash = stripe_hash_locks_hash(sector); | ||
| 555 | 653 | ||
| 556 | pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); | 654 | pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); |
| 557 | 655 | ||
| 558 | spin_lock_irq(&conf->device_lock); | 656 | spin_lock_irq(conf->hash_locks + hash); |
| 559 | 657 | ||
| 560 | do { | 658 | do { |
| 561 | wait_event_lock_irq(conf->wait_for_stripe, | 659 | wait_event_lock_irq(conf->wait_for_stripe, |
| 562 | conf->quiesce == 0 || noquiesce, | 660 | conf->quiesce == 0 || noquiesce, |
| 563 | conf->device_lock); | 661 | *(conf->hash_locks + hash)); |
| 564 | sh = __find_stripe(conf, sector, conf->generation - previous); | 662 | sh = __find_stripe(conf, sector, conf->generation - previous); |
| 565 | if (!sh) { | 663 | if (!sh) { |
| 566 | if (!conf->inactive_blocked) | 664 | if (!conf->inactive_blocked) |
| 567 | sh = get_free_stripe(conf); | 665 | sh = get_free_stripe(conf, hash); |
| 568 | if (noblock && sh == NULL) | 666 | if (noblock && sh == NULL) |
| 569 | break; | 667 | break; |
| 570 | if (!sh) { | 668 | if (!sh) { |
| 571 | conf->inactive_blocked = 1; | 669 | conf->inactive_blocked = 1; |
| 572 | wait_event_lock_irq(conf->wait_for_stripe, | 670 | wait_event_lock_irq( |
| 573 | !list_empty(&conf->inactive_list) && | 671 | conf->wait_for_stripe, |
| 574 | (atomic_read(&conf->active_stripes) | 672 | !list_empty(conf->inactive_list + hash) && |
| 575 | < (conf->max_nr_stripes *3/4) | 673 | (atomic_read(&conf->active_stripes) |
| 576 | || !conf->inactive_blocked), | 674 | < (conf->max_nr_stripes * 3 / 4) |
| 577 | conf->device_lock); | 675 | || !conf->inactive_blocked), |
| 676 | *(conf->hash_locks + hash)); | ||
| 578 | conf->inactive_blocked = 0; | 677 | conf->inactive_blocked = 0; |
| 579 | } else | 678 | } else |
| 580 | init_stripe(sh, sector, previous); | 679 | init_stripe(sh, sector, previous); |
| 581 | } else { | 680 | } else { |
| 681 | spin_lock(&conf->device_lock); | ||
| 582 | if (atomic_read(&sh->count)) { | 682 | if (atomic_read(&sh->count)) { |
| 583 | BUG_ON(!list_empty(&sh->lru) | 683 | BUG_ON(!list_empty(&sh->lru) |
| 584 | && !test_bit(STRIPE_EXPANDING, &sh->state) | 684 | && !test_bit(STRIPE_EXPANDING, &sh->state) |
| 585 | && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) | 685 | && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) |
| 586 | && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); | 686 | ); |
| 587 | } else { | 687 | } else { |
| 588 | if (!test_bit(STRIPE_HANDLE, &sh->state)) | 688 | if (!test_bit(STRIPE_HANDLE, &sh->state)) |
| 589 | atomic_inc(&conf->active_stripes); | 689 | atomic_inc(&conf->active_stripes); |
| 590 | if (list_empty(&sh->lru) && | 690 | BUG_ON(list_empty(&sh->lru)); |
| 591 | !test_bit(STRIPE_EXPANDING, &sh->state)) | ||
| 592 | BUG(); | ||
| 593 | list_del_init(&sh->lru); | 691 | list_del_init(&sh->lru); |
| 594 | if (sh->group) { | 692 | if (sh->group) { |
| 595 | sh->group->stripes_cnt--; | 693 | sh->group->stripes_cnt--; |
| 596 | sh->group = NULL; | 694 | sh->group = NULL; |
| 597 | } | 695 | } |
| 598 | } | 696 | } |
| 697 | spin_unlock(&conf->device_lock); | ||
| 599 | } | 698 | } |
| 600 | } while (sh == NULL); | 699 | } while (sh == NULL); |
| 601 | 700 | ||
| 602 | if (sh) | 701 | if (sh) |
| 603 | atomic_inc(&sh->count); | 702 | atomic_inc(&sh->count); |
| 604 | 703 | ||
| 605 | spin_unlock_irq(&conf->device_lock); | 704 | spin_unlock_irq(conf->hash_locks + hash); |
| 606 | return sh; | 705 | return sh; |
| 607 | } | 706 | } |
| 608 | 707 | ||
| @@ -758,7 +857,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 758 | bi->bi_sector = (sh->sector | 857 | bi->bi_sector = (sh->sector |
| 759 | + rdev->data_offset); | 858 | + rdev->data_offset); |
| 760 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) | 859 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) |
| 761 | bi->bi_rw |= REQ_FLUSH; | 860 | bi->bi_rw |= REQ_NOMERGE; |
| 762 | 861 | ||
| 763 | bi->bi_vcnt = 1; | 862 | bi->bi_vcnt = 1; |
| 764 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | 863 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; |
| @@ -1582,7 +1681,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
| 1582 | put_cpu(); | 1681 | put_cpu(); |
| 1583 | } | 1682 | } |
| 1584 | 1683 | ||
| 1585 | static int grow_one_stripe(struct r5conf *conf) | 1684 | static int grow_one_stripe(struct r5conf *conf, int hash) |
| 1586 | { | 1685 | { |
| 1587 | struct stripe_head *sh; | 1686 | struct stripe_head *sh; |
| 1588 | sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); | 1687 | sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); |
| @@ -1598,6 +1697,7 @@ static int grow_one_stripe(struct r5conf *conf) | |||
| 1598 | kmem_cache_free(conf->slab_cache, sh); | 1697 | kmem_cache_free(conf->slab_cache, sh); |
| 1599 | return 0; | 1698 | return 0; |
| 1600 | } | 1699 | } |
| 1700 | sh->hash_lock_index = hash; | ||
| 1601 | /* we just created an active stripe so... */ | 1701 | /* we just created an active stripe so... */ |
| 1602 | atomic_set(&sh->count, 1); | 1702 | atomic_set(&sh->count, 1); |
| 1603 | atomic_inc(&conf->active_stripes); | 1703 | atomic_inc(&conf->active_stripes); |
| @@ -1610,6 +1710,7 @@ static int grow_stripes(struct r5conf *conf, int num) | |||
| 1610 | { | 1710 | { |
| 1611 | struct kmem_cache *sc; | 1711 | struct kmem_cache *sc; |
| 1612 | int devs = max(conf->raid_disks, conf->previous_raid_disks); | 1712 | int devs = max(conf->raid_disks, conf->previous_raid_disks); |
| 1713 | int hash; | ||
| 1613 | 1714 | ||
| 1614 | if (conf->mddev->gendisk) | 1715 | if (conf->mddev->gendisk) |
| 1615 | sprintf(conf->cache_name[0], | 1716 | sprintf(conf->cache_name[0], |
| @@ -1627,9 +1728,13 @@ static int grow_stripes(struct r5conf *conf, int num) | |||
| 1627 | return 1; | 1728 | return 1; |
| 1628 | conf->slab_cache = sc; | 1729 | conf->slab_cache = sc; |
| 1629 | conf->pool_size = devs; | 1730 | conf->pool_size = devs; |
| 1630 | while (num--) | 1731 | hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; |
| 1631 | if (!grow_one_stripe(conf)) | 1732 | while (num--) { |
| 1733 | if (!grow_one_stripe(conf, hash)) | ||
| 1632 | return 1; | 1734 | return 1; |
| 1735 | conf->max_nr_stripes++; | ||
| 1736 | hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; | ||
| 1737 | } | ||
| 1633 | return 0; | 1738 | return 0; |
| 1634 | } | 1739 | } |
| 1635 | 1740 | ||
| @@ -1687,6 +1792,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
| 1687 | int err; | 1792 | int err; |
| 1688 | struct kmem_cache *sc; | 1793 | struct kmem_cache *sc; |
| 1689 | int i; | 1794 | int i; |
| 1795 | int hash, cnt; | ||
| 1690 | 1796 | ||
| 1691 | if (newsize <= conf->pool_size) | 1797 | if (newsize <= conf->pool_size) |
| 1692 | return 0; /* never bother to shrink */ | 1798 | return 0; /* never bother to shrink */ |
| @@ -1726,19 +1832,29 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
| 1726 | * OK, we have enough stripes, start collecting inactive | 1832 | * OK, we have enough stripes, start collecting inactive |
| 1727 | * stripes and copying them over | 1833 | * stripes and copying them over |
| 1728 | */ | 1834 | */ |
| 1835 | hash = 0; | ||
| 1836 | cnt = 0; | ||
| 1729 | list_for_each_entry(nsh, &newstripes, lru) { | 1837 | list_for_each_entry(nsh, &newstripes, lru) { |
| 1730 | spin_lock_irq(&conf->device_lock); | 1838 | lock_device_hash_lock(conf, hash); |
| 1731 | wait_event_lock_irq(conf->wait_for_stripe, | 1839 | wait_event_cmd(conf->wait_for_stripe, |
| 1732 | !list_empty(&conf->inactive_list), | 1840 | !list_empty(conf->inactive_list + hash), |
| 1733 | conf->device_lock); | 1841 | unlock_device_hash_lock(conf, hash), |
| 1734 | osh = get_free_stripe(conf); | 1842 | lock_device_hash_lock(conf, hash)); |
| 1735 | spin_unlock_irq(&conf->device_lock); | 1843 | osh = get_free_stripe(conf, hash); |
| 1844 | unlock_device_hash_lock(conf, hash); | ||
| 1736 | atomic_set(&nsh->count, 1); | 1845 | atomic_set(&nsh->count, 1); |
| 1737 | for(i=0; i<conf->pool_size; i++) | 1846 | for(i=0; i<conf->pool_size; i++) |
| 1738 | nsh->dev[i].page = osh->dev[i].page; | 1847 | nsh->dev[i].page = osh->dev[i].page; |
| 1739 | for( ; i<newsize; i++) | 1848 | for( ; i<newsize; i++) |
| 1740 | nsh->dev[i].page = NULL; | 1849 | nsh->dev[i].page = NULL; |
| 1850 | nsh->hash_lock_index = hash; | ||
| 1741 | kmem_cache_free(conf->slab_cache, osh); | 1851 | kmem_cache_free(conf->slab_cache, osh); |
| 1852 | cnt++; | ||
| 1853 | if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + | ||
| 1854 | !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { | ||
| 1855 | hash++; | ||
| 1856 | cnt = 0; | ||
| 1857 | } | ||
| 1742 | } | 1858 | } |
| 1743 | kmem_cache_destroy(conf->slab_cache); | 1859 | kmem_cache_destroy(conf->slab_cache); |
| 1744 | 1860 | ||
| @@ -1797,13 +1913,13 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
| 1797 | return err; | 1913 | return err; |
| 1798 | } | 1914 | } |
| 1799 | 1915 | ||
| 1800 | static int drop_one_stripe(struct r5conf *conf) | 1916 | static int drop_one_stripe(struct r5conf *conf, int hash) |
| 1801 | { | 1917 | { |
| 1802 | struct stripe_head *sh; | 1918 | struct stripe_head *sh; |
| 1803 | 1919 | ||
| 1804 | spin_lock_irq(&conf->device_lock); | 1920 | spin_lock_irq(conf->hash_locks + hash); |
| 1805 | sh = get_free_stripe(conf); | 1921 | sh = get_free_stripe(conf, hash); |
| 1806 | spin_unlock_irq(&conf->device_lock); | 1922 | spin_unlock_irq(conf->hash_locks + hash); |
| 1807 | if (!sh) | 1923 | if (!sh) |
| 1808 | return 0; | 1924 | return 0; |
| 1809 | BUG_ON(atomic_read(&sh->count)); | 1925 | BUG_ON(atomic_read(&sh->count)); |
| @@ -1815,8 +1931,10 @@ static int drop_one_stripe(struct r5conf *conf) | |||
| 1815 | 1931 | ||
| 1816 | static void shrink_stripes(struct r5conf *conf) | 1932 | static void shrink_stripes(struct r5conf *conf) |
| 1817 | { | 1933 | { |
| 1818 | while (drop_one_stripe(conf)) | 1934 | int hash; |
| 1819 | ; | 1935 | for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++) |
| 1936 | while (drop_one_stripe(conf, hash)) | ||
| 1937 | ; | ||
| 1820 | 1938 | ||
| 1821 | if (conf->slab_cache) | 1939 | if (conf->slab_cache) |
| 1822 | kmem_cache_destroy(conf->slab_cache); | 1940 | kmem_cache_destroy(conf->slab_cache); |
| @@ -1921,6 +2039,9 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
| 1921 | mdname(conf->mddev), bdn); | 2039 | mdname(conf->mddev), bdn); |
| 1922 | else | 2040 | else |
| 1923 | retry = 1; | 2041 | retry = 1; |
| 2042 | if (set_bad && test_bit(In_sync, &rdev->flags) | ||
| 2043 | && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) | ||
| 2044 | retry = 1; | ||
| 1924 | if (retry) | 2045 | if (retry) |
| 1925 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { | 2046 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { |
| 1926 | set_bit(R5_ReadError, &sh->dev[i].flags); | 2047 | set_bit(R5_ReadError, &sh->dev[i].flags); |
| @@ -3900,7 +4021,8 @@ static void raid5_activate_delayed(struct r5conf *conf) | |||
| 3900 | } | 4021 | } |
| 3901 | } | 4022 | } |
| 3902 | 4023 | ||
| 3903 | static void activate_bit_delay(struct r5conf *conf) | 4024 | static void activate_bit_delay(struct r5conf *conf, |
| 4025 | struct list_head *temp_inactive_list) | ||
| 3904 | { | 4026 | { |
| 3905 | /* device_lock is held */ | 4027 | /* device_lock is held */ |
| 3906 | struct list_head head; | 4028 | struct list_head head; |
| @@ -3908,9 +4030,11 @@ static void activate_bit_delay(struct r5conf *conf) | |||
| 3908 | list_del_init(&conf->bitmap_list); | 4030 | list_del_init(&conf->bitmap_list); |
| 3909 | while (!list_empty(&head)) { | 4031 | while (!list_empty(&head)) { |
| 3910 | struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); | 4032 | struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); |
| 4033 | int hash; | ||
| 3911 | list_del_init(&sh->lru); | 4034 | list_del_init(&sh->lru); |
| 3912 | atomic_inc(&sh->count); | 4035 | atomic_inc(&sh->count); |
| 3913 | __release_stripe(conf, sh); | 4036 | hash = sh->hash_lock_index; |
| 4037 | __release_stripe(conf, sh, &temp_inactive_list[hash]); | ||
| 3914 | } | 4038 | } |
| 3915 | } | 4039 | } |
| 3916 | 4040 | ||
| @@ -3926,7 +4050,7 @@ int md_raid5_congested(struct mddev *mddev, int bits) | |||
| 3926 | return 1; | 4050 | return 1; |
| 3927 | if (conf->quiesce) | 4051 | if (conf->quiesce) |
| 3928 | return 1; | 4052 | return 1; |
| 3929 | if (list_empty_careful(&conf->inactive_list)) | 4053 | if (atomic_read(&conf->empty_inactive_list_nr)) |
| 3930 | return 1; | 4054 | return 1; |
| 3931 | 4055 | ||
| 3932 | return 0; | 4056 | return 0; |
| @@ -4256,6 +4380,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) | |||
| 4256 | struct raid5_plug_cb { | 4380 | struct raid5_plug_cb { |
| 4257 | struct blk_plug_cb cb; | 4381 | struct blk_plug_cb cb; |
| 4258 | struct list_head list; | 4382 | struct list_head list; |
| 4383 | struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; | ||
| 4259 | }; | 4384 | }; |
| 4260 | 4385 | ||
| 4261 | static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) | 4386 | static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) |
| @@ -4266,6 +4391,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) | |||
| 4266 | struct mddev *mddev = cb->cb.data; | 4391 | struct mddev *mddev = cb->cb.data; |
| 4267 | struct r5conf *conf = mddev->private; | 4392 | struct r5conf *conf = mddev->private; |
| 4268 | int cnt = 0; | 4393 | int cnt = 0; |
| 4394 | int hash; | ||
| 4269 | 4395 | ||
| 4270 | if (cb->list.next && !list_empty(&cb->list)) { | 4396 | if (cb->list.next && !list_empty(&cb->list)) { |
| 4271 | spin_lock_irq(&conf->device_lock); | 4397 | spin_lock_irq(&conf->device_lock); |
| @@ -4283,11 +4409,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) | |||
| 4283 | * STRIPE_ON_RELEASE_LIST could be set here. In that | 4409 | * STRIPE_ON_RELEASE_LIST could be set here. In that |
| 4284 | * case, the count is always > 1 here | 4410 | * case, the count is always > 1 here |
| 4285 | */ | 4411 | */ |
| 4286 | __release_stripe(conf, sh); | 4412 | hash = sh->hash_lock_index; |
| 4413 | __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); | ||
| 4287 | cnt++; | 4414 | cnt++; |
| 4288 | } | 4415 | } |
| 4289 | spin_unlock_irq(&conf->device_lock); | 4416 | spin_unlock_irq(&conf->device_lock); |
| 4290 | } | 4417 | } |
| 4418 | release_inactive_stripe_list(conf, cb->temp_inactive_list, | ||
| 4419 | NR_STRIPE_HASH_LOCKS); | ||
| 4291 | if (mddev->queue) | 4420 | if (mddev->queue) |
| 4292 | trace_block_unplug(mddev->queue, cnt, !from_schedule); | 4421 | trace_block_unplug(mddev->queue, cnt, !from_schedule); |
| 4293 | kfree(cb); | 4422 | kfree(cb); |
| @@ -4308,8 +4437,12 @@ static void release_stripe_plug(struct mddev *mddev, | |||
| 4308 | 4437 | ||
| 4309 | cb = container_of(blk_cb, struct raid5_plug_cb, cb); | 4438 | cb = container_of(blk_cb, struct raid5_plug_cb, cb); |
| 4310 | 4439 | ||
| 4311 | if (cb->list.next == NULL) | 4440 | if (cb->list.next == NULL) { |
| 4441 | int i; | ||
| 4312 | INIT_LIST_HEAD(&cb->list); | 4442 | INIT_LIST_HEAD(&cb->list); |
| 4443 | for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) | ||
| 4444 | INIT_LIST_HEAD(cb->temp_inactive_list + i); | ||
| 4445 | } | ||
| 4313 | 4446 | ||
| 4314 | if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) | 4447 | if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) |
| 4315 | list_add_tail(&sh->lru, &cb->list); | 4448 | list_add_tail(&sh->lru, &cb->list); |
| @@ -4692,14 +4825,19 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
| 4692 | time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { | 4825 | time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { |
| 4693 | /* Cannot proceed until we've updated the superblock... */ | 4826 | /* Cannot proceed until we've updated the superblock... */ |
| 4694 | wait_event(conf->wait_for_overlap, | 4827 | wait_event(conf->wait_for_overlap, |
| 4695 | atomic_read(&conf->reshape_stripes)==0); | 4828 | atomic_read(&conf->reshape_stripes)==0 |
| 4829 | || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); | ||
| 4830 | if (atomic_read(&conf->reshape_stripes) != 0) | ||
| 4831 | return 0; | ||
| 4696 | mddev->reshape_position = conf->reshape_progress; | 4832 | mddev->reshape_position = conf->reshape_progress; |
| 4697 | mddev->curr_resync_completed = sector_nr; | 4833 | mddev->curr_resync_completed = sector_nr; |
| 4698 | conf->reshape_checkpoint = jiffies; | 4834 | conf->reshape_checkpoint = jiffies; |
| 4699 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 4835 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
| 4700 | md_wakeup_thread(mddev->thread); | 4836 | md_wakeup_thread(mddev->thread); |
| 4701 | wait_event(mddev->sb_wait, mddev->flags == 0 || | 4837 | wait_event(mddev->sb_wait, mddev->flags == 0 || |
| 4702 | kthread_should_stop()); | 4838 | test_bit(MD_RECOVERY_INTR, &mddev->recovery)); |
| 4839 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
| 4840 | return 0; | ||
| 4703 | spin_lock_irq(&conf->device_lock); | 4841 | spin_lock_irq(&conf->device_lock); |
| 4704 | conf->reshape_safe = mddev->reshape_position; | 4842 | conf->reshape_safe = mddev->reshape_position; |
| 4705 | spin_unlock_irq(&conf->device_lock); | 4843 | spin_unlock_irq(&conf->device_lock); |
| @@ -4782,7 +4920,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
| 4782 | >= mddev->resync_max - mddev->curr_resync_completed) { | 4920 | >= mddev->resync_max - mddev->curr_resync_completed) { |
| 4783 | /* Cannot proceed until we've updated the superblock... */ | 4921 | /* Cannot proceed until we've updated the superblock... */ |
| 4784 | wait_event(conf->wait_for_overlap, | 4922 | wait_event(conf->wait_for_overlap, |
| 4785 | atomic_read(&conf->reshape_stripes) == 0); | 4923 | atomic_read(&conf->reshape_stripes) == 0 |
| 4924 | || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); | ||
| 4925 | if (atomic_read(&conf->reshape_stripes) != 0) | ||
| 4926 | goto ret; | ||
| 4786 | mddev->reshape_position = conf->reshape_progress; | 4927 | mddev->reshape_position = conf->reshape_progress; |
| 4787 | mddev->curr_resync_completed = sector_nr; | 4928 | mddev->curr_resync_completed = sector_nr; |
| 4788 | conf->reshape_checkpoint = jiffies; | 4929 | conf->reshape_checkpoint = jiffies; |
| @@ -4790,13 +4931,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
| 4790 | md_wakeup_thread(mddev->thread); | 4931 | md_wakeup_thread(mddev->thread); |
| 4791 | wait_event(mddev->sb_wait, | 4932 | wait_event(mddev->sb_wait, |
| 4792 | !test_bit(MD_CHANGE_DEVS, &mddev->flags) | 4933 | !test_bit(MD_CHANGE_DEVS, &mddev->flags) |
| 4793 | || kthread_should_stop()); | 4934 | || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); |
| 4935 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
| 4936 | goto ret; | ||
| 4794 | spin_lock_irq(&conf->device_lock); | 4937 | spin_lock_irq(&conf->device_lock); |
| 4795 | conf->reshape_safe = mddev->reshape_position; | 4938 | conf->reshape_safe = mddev->reshape_position; |
| 4796 | spin_unlock_irq(&conf->device_lock); | 4939 | spin_unlock_irq(&conf->device_lock); |
| 4797 | wake_up(&conf->wait_for_overlap); | 4940 | wake_up(&conf->wait_for_overlap); |
| 4798 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 4941 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
| 4799 | } | 4942 | } |
| 4943 | ret: | ||
| 4800 | return reshape_sectors; | 4944 | return reshape_sectors; |
| 4801 | } | 4945 | } |
| 4802 | 4946 | ||
| @@ -4954,27 +5098,45 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | |||
| 4954 | } | 5098 | } |
| 4955 | 5099 | ||
| 4956 | static int handle_active_stripes(struct r5conf *conf, int group, | 5100 | static int handle_active_stripes(struct r5conf *conf, int group, |
| 4957 | struct r5worker *worker) | 5101 | struct r5worker *worker, |
| 5102 | struct list_head *temp_inactive_list) | ||
| 4958 | { | 5103 | { |
| 4959 | struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; | 5104 | struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; |
| 4960 | int i, batch_size = 0; | 5105 | int i, batch_size = 0, hash; |
| 5106 | bool release_inactive = false; | ||
| 4961 | 5107 | ||
| 4962 | while (batch_size < MAX_STRIPE_BATCH && | 5108 | while (batch_size < MAX_STRIPE_BATCH && |
| 4963 | (sh = __get_priority_stripe(conf, group)) != NULL) | 5109 | (sh = __get_priority_stripe(conf, group)) != NULL) |
| 4964 | batch[batch_size++] = sh; | 5110 | batch[batch_size++] = sh; |
| 4965 | 5111 | ||
| 4966 | if (batch_size == 0) | 5112 | if (batch_size == 0) { |
| 4967 | return batch_size; | 5113 | for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) |
| 5114 | if (!list_empty(temp_inactive_list + i)) | ||
| 5115 | break; | ||
| 5116 | if (i == NR_STRIPE_HASH_LOCKS) | ||
| 5117 | return batch_size; | ||
| 5118 | release_inactive = true; | ||
| 5119 | } | ||
| 4968 | spin_unlock_irq(&conf->device_lock); | 5120 | spin_unlock_irq(&conf->device_lock); |
| 4969 | 5121 | ||
| 5122 | release_inactive_stripe_list(conf, temp_inactive_list, | ||
| 5123 | NR_STRIPE_HASH_LOCKS); | ||
| 5124 | |||
| 5125 | if (release_inactive) { | ||
| 5126 | spin_lock_irq(&conf->device_lock); | ||
| 5127 | return 0; | ||
| 5128 | } | ||
| 5129 | |||
| 4970 | for (i = 0; i < batch_size; i++) | 5130 | for (i = 0; i < batch_size; i++) |
| 4971 | handle_stripe(batch[i]); | 5131 | handle_stripe(batch[i]); |
| 4972 | 5132 | ||
| 4973 | cond_resched(); | 5133 | cond_resched(); |
| 4974 | 5134 | ||
| 4975 | spin_lock_irq(&conf->device_lock); | 5135 | spin_lock_irq(&conf->device_lock); |
| 4976 | for (i = 0; i < batch_size; i++) | 5136 | for (i = 0; i < batch_size; i++) { |
| 4977 | __release_stripe(conf, batch[i]); | 5137 | hash = batch[i]->hash_lock_index; |
| 5138 | __release_stripe(conf, batch[i], &temp_inactive_list[hash]); | ||
| 5139 | } | ||
| 4978 | return batch_size; | 5140 | return batch_size; |
| 4979 | } | 5141 | } |
| 4980 | 5142 | ||
| @@ -4995,9 +5157,10 @@ static void raid5_do_work(struct work_struct *work) | |||
| 4995 | while (1) { | 5157 | while (1) { |
| 4996 | int batch_size, released; | 5158 | int batch_size, released; |
| 4997 | 5159 | ||
| 4998 | released = release_stripe_list(conf); | 5160 | released = release_stripe_list(conf, worker->temp_inactive_list); |
| 4999 | 5161 | ||
| 5000 | batch_size = handle_active_stripes(conf, group_id, worker); | 5162 | batch_size = handle_active_stripes(conf, group_id, worker, |
| 5163 | worker->temp_inactive_list); | ||
| 5001 | worker->working = false; | 5164 | worker->working = false; |
| 5002 | if (!batch_size && !released) | 5165 | if (!batch_size && !released) |
| 5003 | break; | 5166 | break; |
| @@ -5036,7 +5199,7 @@ static void raid5d(struct md_thread *thread) | |||
| 5036 | struct bio *bio; | 5199 | struct bio *bio; |
| 5037 | int batch_size, released; | 5200 | int batch_size, released; |
| 5038 | 5201 | ||
| 5039 | released = release_stripe_list(conf); | 5202 | released = release_stripe_list(conf, conf->temp_inactive_list); |
| 5040 | 5203 | ||
| 5041 | if ( | 5204 | if ( |
| 5042 | !list_empty(&conf->bitmap_list)) { | 5205 | !list_empty(&conf->bitmap_list)) { |
| @@ -5046,7 +5209,7 @@ static void raid5d(struct md_thread *thread) | |||
| 5046 | bitmap_unplug(mddev->bitmap); | 5209 | bitmap_unplug(mddev->bitmap); |
| 5047 | spin_lock_irq(&conf->device_lock); | 5210 | spin_lock_irq(&conf->device_lock); |
| 5048 | conf->seq_write = conf->seq_flush; | 5211 | conf->seq_write = conf->seq_flush; |
| 5049 | activate_bit_delay(conf); | 5212 | activate_bit_delay(conf, conf->temp_inactive_list); |
| 5050 | } | 5213 | } |
| 5051 | raid5_activate_delayed(conf); | 5214 | raid5_activate_delayed(conf); |
| 5052 | 5215 | ||
| @@ -5060,7 +5223,8 @@ static void raid5d(struct md_thread *thread) | |||
| 5060 | handled++; | 5223 | handled++; |
| 5061 | } | 5224 | } |
| 5062 | 5225 | ||
| 5063 | batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); | 5226 | batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, |
| 5227 | conf->temp_inactive_list); | ||
| 5064 | if (!batch_size && !released) | 5228 | if (!batch_size && !released) |
| 5065 | break; | 5229 | break; |
| 5066 | handled += batch_size; | 5230 | handled += batch_size; |
| @@ -5096,22 +5260,29 @@ raid5_set_cache_size(struct mddev *mddev, int size) | |||
| 5096 | { | 5260 | { |
| 5097 | struct r5conf *conf = mddev->private; | 5261 | struct r5conf *conf = mddev->private; |
| 5098 | int err; | 5262 | int err; |
| 5263 | int hash; | ||
| 5099 | 5264 | ||
| 5100 | if (size <= 16 || size > 32768) | 5265 | if (size <= 16 || size > 32768) |
| 5101 | return -EINVAL; | 5266 | return -EINVAL; |
| 5267 | hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; | ||
| 5102 | while (size < conf->max_nr_stripes) { | 5268 | while (size < conf->max_nr_stripes) { |
| 5103 | if (drop_one_stripe(conf)) | 5269 | if (drop_one_stripe(conf, hash)) |
| 5104 | conf->max_nr_stripes--; | 5270 | conf->max_nr_stripes--; |
| 5105 | else | 5271 | else |
| 5106 | break; | 5272 | break; |
| 5273 | hash--; | ||
| 5274 | if (hash < 0) | ||
| 5275 | hash = NR_STRIPE_HASH_LOCKS - 1; | ||
| 5107 | } | 5276 | } |
| 5108 | err = md_allow_write(mddev); | 5277 | err = md_allow_write(mddev); |
| 5109 | if (err) | 5278 | if (err) |
| 5110 | return err; | 5279 | return err; |
| 5280 | hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; | ||
| 5111 | while (size > conf->max_nr_stripes) { | 5281 | while (size > conf->max_nr_stripes) { |
| 5112 | if (grow_one_stripe(conf)) | 5282 | if (grow_one_stripe(conf, hash)) |
| 5113 | conf->max_nr_stripes++; | 5283 | conf->max_nr_stripes++; |
| 5114 | else break; | 5284 | else break; |
| 5285 | hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; | ||
| 5115 | } | 5286 | } |
| 5116 | return 0; | 5287 | return 0; |
| 5117 | } | 5288 | } |
| @@ -5199,15 +5370,18 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page) | |||
| 5199 | return 0; | 5370 | return 0; |
| 5200 | } | 5371 | } |
| 5201 | 5372 | ||
| 5202 | static int alloc_thread_groups(struct r5conf *conf, int cnt); | 5373 | static int alloc_thread_groups(struct r5conf *conf, int cnt, |
| 5374 | int *group_cnt, | ||
| 5375 | int *worker_cnt_per_group, | ||
| 5376 | struct r5worker_group **worker_groups); | ||
| 5203 | static ssize_t | 5377 | static ssize_t |
| 5204 | raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) | 5378 | raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) |
| 5205 | { | 5379 | { |
| 5206 | struct r5conf *conf = mddev->private; | 5380 | struct r5conf *conf = mddev->private; |
| 5207 | unsigned long new; | 5381 | unsigned long new; |
| 5208 | int err; | 5382 | int err; |
| 5209 | struct r5worker_group *old_groups; | 5383 | struct r5worker_group *new_groups, *old_groups; |
| 5210 | int old_group_cnt; | 5384 | int group_cnt, worker_cnt_per_group; |
| 5211 | 5385 | ||
| 5212 | if (len >= PAGE_SIZE) | 5386 | if (len >= PAGE_SIZE) |
| 5213 | return -EINVAL; | 5387 | return -EINVAL; |
| @@ -5223,14 +5397,19 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) | |||
| 5223 | mddev_suspend(mddev); | 5397 | mddev_suspend(mddev); |
| 5224 | 5398 | ||
| 5225 | old_groups = conf->worker_groups; | 5399 | old_groups = conf->worker_groups; |
| 5226 | old_group_cnt = conf->worker_cnt_per_group; | 5400 | if (old_groups) |
| 5401 | flush_workqueue(raid5_wq); | ||
| 5402 | |||
| 5403 | err = alloc_thread_groups(conf, new, | ||
| 5404 | &group_cnt, &worker_cnt_per_group, | ||
| 5405 | &new_groups); | ||
| 5406 | if (!err) { | ||
| 5407 | spin_lock_irq(&conf->device_lock); | ||
| 5408 | conf->group_cnt = group_cnt; | ||
| 5409 | conf->worker_cnt_per_group = worker_cnt_per_group; | ||
| 5410 | conf->worker_groups = new_groups; | ||
| 5411 | spin_unlock_irq(&conf->device_lock); | ||
| 5227 | 5412 | ||
| 5228 | conf->worker_groups = NULL; | ||
| 5229 | err = alloc_thread_groups(conf, new); | ||
| 5230 | if (err) { | ||
| 5231 | conf->worker_groups = old_groups; | ||
| 5232 | conf->worker_cnt_per_group = old_group_cnt; | ||
| 5233 | } else { | ||
| 5234 | if (old_groups) | 5413 | if (old_groups) |
| 5235 | kfree(old_groups[0].workers); | 5414 | kfree(old_groups[0].workers); |
| 5236 | kfree(old_groups); | 5415 | kfree(old_groups); |
| @@ -5260,40 +5439,47 @@ static struct attribute_group raid5_attrs_group = { | |||
| 5260 | .attrs = raid5_attrs, | 5439 | .attrs = raid5_attrs, |
| 5261 | }; | 5440 | }; |
| 5262 | 5441 | ||
| 5263 | static int alloc_thread_groups(struct r5conf *conf, int cnt) | 5442 | static int alloc_thread_groups(struct r5conf *conf, int cnt, |
| 5443 | int *group_cnt, | ||
| 5444 | int *worker_cnt_per_group, | ||
| 5445 | struct r5worker_group **worker_groups) | ||
| 5264 | { | 5446 | { |
| 5265 | int i, j; | 5447 | int i, j, k; |
| 5266 | ssize_t size; | 5448 | ssize_t size; |
| 5267 | struct r5worker *workers; | 5449 | struct r5worker *workers; |
| 5268 | 5450 | ||
| 5269 | conf->worker_cnt_per_group = cnt; | 5451 | *worker_cnt_per_group = cnt; |
| 5270 | if (cnt == 0) { | 5452 | if (cnt == 0) { |
| 5271 | conf->worker_groups = NULL; | 5453 | *group_cnt = 0; |
| 5454 | *worker_groups = NULL; | ||
| 5272 | return 0; | 5455 | return 0; |
| 5273 | } | 5456 | } |
| 5274 | conf->group_cnt = num_possible_nodes(); | 5457 | *group_cnt = num_possible_nodes(); |
| 5275 | size = sizeof(struct r5worker) * cnt; | 5458 | size = sizeof(struct r5worker) * cnt; |
| 5276 | workers = kzalloc(size * conf->group_cnt, GFP_NOIO); | 5459 | workers = kzalloc(size * *group_cnt, GFP_NOIO); |
| 5277 | conf->worker_groups = kzalloc(sizeof(struct r5worker_group) * | 5460 | *worker_groups = kzalloc(sizeof(struct r5worker_group) * |
| 5278 | conf->group_cnt, GFP_NOIO); | 5461 | *group_cnt, GFP_NOIO); |
| 5279 | if (!conf->worker_groups || !workers) { | 5462 | if (!*worker_groups || !workers) { |
| 5280 | kfree(workers); | 5463 | kfree(workers); |
| 5281 | kfree(conf->worker_groups); | 5464 | kfree(*worker_groups); |
| 5282 | conf->worker_groups = NULL; | ||
| 5283 | return -ENOMEM; | 5465 | return -ENOMEM; |
| 5284 | } | 5466 | } |
| 5285 | 5467 | ||
| 5286 | for (i = 0; i < conf->group_cnt; i++) { | 5468 | for (i = 0; i < *group_cnt; i++) { |
| 5287 | struct r5worker_group *group; | 5469 | struct r5worker_group *group; |
| 5288 | 5470 | ||
| 5289 | group = &conf->worker_groups[i]; | 5471 | group = &(*worker_groups)[i]; |
| 5290 | INIT_LIST_HEAD(&group->handle_list); | 5472 | INIT_LIST_HEAD(&group->handle_list); |
| 5291 | group->conf = conf; | 5473 | group->conf = conf; |
| 5292 | group->workers = workers + i * cnt; | 5474 | group->workers = workers + i * cnt; |
| 5293 | 5475 | ||
| 5294 | for (j = 0; j < cnt; j++) { | 5476 | for (j = 0; j < cnt; j++) { |
| 5295 | group->workers[j].group = group; | 5477 | struct r5worker *worker = group->workers + j; |
| 5296 | INIT_WORK(&group->workers[j].work, raid5_do_work); | 5478 | worker->group = group; |
| 5479 | INIT_WORK(&worker->work, raid5_do_work); | ||
| 5480 | |||
| 5481 | for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) | ||
| 5482 | INIT_LIST_HEAD(worker->temp_inactive_list + k); | ||
| 5297 | } | 5483 | } |
| 5298 | } | 5484 | } |
| 5299 | 5485 | ||
| @@ -5444,6 +5630,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 5444 | struct md_rdev *rdev; | 5630 | struct md_rdev *rdev; |
| 5445 | struct disk_info *disk; | 5631 | struct disk_info *disk; |
| 5446 | char pers_name[6]; | 5632 | char pers_name[6]; |
| 5633 | int i; | ||
| 5634 | int group_cnt, worker_cnt_per_group; | ||
| 5635 | struct r5worker_group *new_group; | ||
| 5447 | 5636 | ||
| 5448 | if (mddev->new_level != 5 | 5637 | if (mddev->new_level != 5 |
| 5449 | && mddev->new_level != 4 | 5638 | && mddev->new_level != 4 |
| @@ -5478,7 +5667,12 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 5478 | if (conf == NULL) | 5667 | if (conf == NULL) |
| 5479 | goto abort; | 5668 | goto abort; |
| 5480 | /* Don't enable multi-threading by default*/ | 5669 | /* Don't enable multi-threading by default*/ |
| 5481 | if (alloc_thread_groups(conf, 0)) | 5670 | if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, |
| 5671 | &new_group)) { | ||
| 5672 | conf->group_cnt = group_cnt; | ||
| 5673 | conf->worker_cnt_per_group = worker_cnt_per_group; | ||
| 5674 | conf->worker_groups = new_group; | ||
| 5675 | } else | ||
| 5482 | goto abort; | 5676 | goto abort; |
| 5483 | spin_lock_init(&conf->device_lock); | 5677 | spin_lock_init(&conf->device_lock); |
| 5484 | seqcount_init(&conf->gen_lock); | 5678 | seqcount_init(&conf->gen_lock); |
| @@ -5488,7 +5682,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 5488 | INIT_LIST_HEAD(&conf->hold_list); | 5682 | INIT_LIST_HEAD(&conf->hold_list); |
| 5489 | INIT_LIST_HEAD(&conf->delayed_list); | 5683 | INIT_LIST_HEAD(&conf->delayed_list); |
| 5490 | INIT_LIST_HEAD(&conf->bitmap_list); | 5684 | INIT_LIST_HEAD(&conf->bitmap_list); |
| 5491 | INIT_LIST_HEAD(&conf->inactive_list); | ||
| 5492 | init_llist_head(&conf->released_stripes); | 5685 | init_llist_head(&conf->released_stripes); |
| 5493 | atomic_set(&conf->active_stripes, 0); | 5686 | atomic_set(&conf->active_stripes, 0); |
| 5494 | atomic_set(&conf->preread_active_stripes, 0); | 5687 | atomic_set(&conf->preread_active_stripes, 0); |
| @@ -5514,6 +5707,21 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 5514 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) | 5707 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) |
| 5515 | goto abort; | 5708 | goto abort; |
| 5516 | 5709 | ||
| 5710 | /* We init hash_locks[0] separately so that it can be used | ||
| 5711 | * as the reference lock in the spin_lock_nest_lock() call | ||
| 5712 | * in lock_all_device_hash_locks_irq in order to convince | ||
| 5713 | * lockdep that we know what we are doing. | ||
| 5714 | */ | ||
| 5715 | spin_lock_init(conf->hash_locks); | ||
| 5716 | for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) | ||
| 5717 | spin_lock_init(conf->hash_locks + i); | ||
| 5718 | |||
| 5719 | for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) | ||
| 5720 | INIT_LIST_HEAD(conf->inactive_list + i); | ||
| 5721 | |||
| 5722 | for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) | ||
| 5723 | INIT_LIST_HEAD(conf->temp_inactive_list + i); | ||
| 5724 | |||
| 5517 | conf->level = mddev->new_level; | 5725 | conf->level = mddev->new_level; |
| 5518 | if (raid5_alloc_percpu(conf) != 0) | 5726 | if (raid5_alloc_percpu(conf) != 0) |
| 5519 | goto abort; | 5727 | goto abort; |
| @@ -5554,7 +5762,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 5554 | else | 5762 | else |
| 5555 | conf->max_degraded = 1; | 5763 | conf->max_degraded = 1; |
| 5556 | conf->algorithm = mddev->new_layout; | 5764 | conf->algorithm = mddev->new_layout; |
| 5557 | conf->max_nr_stripes = NR_STRIPES; | ||
| 5558 | conf->reshape_progress = mddev->reshape_position; | 5765 | conf->reshape_progress = mddev->reshape_position; |
| 5559 | if (conf->reshape_progress != MaxSector) { | 5766 | if (conf->reshape_progress != MaxSector) { |
| 5560 | conf->prev_chunk_sectors = mddev->chunk_sectors; | 5767 | conf->prev_chunk_sectors = mddev->chunk_sectors; |
| @@ -5563,7 +5770,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 5563 | 5770 | ||
| 5564 | memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + | 5771 | memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + |
| 5565 | max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; | 5772 | max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; |
| 5566 | if (grow_stripes(conf, conf->max_nr_stripes)) { | 5773 | atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); |
| 5774 | if (grow_stripes(conf, NR_STRIPES)) { | ||
| 5567 | printk(KERN_ERR | 5775 | printk(KERN_ERR |
| 5568 | "md/raid:%s: couldn't allocate %dkB for buffers\n", | 5776 | "md/raid:%s: couldn't allocate %dkB for buffers\n", |
| 5569 | mdname(mddev), memory); | 5777 | mdname(mddev), memory); |
| @@ -6369,12 +6577,18 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
| 6369 | if (!mddev->sync_thread) { | 6577 | if (!mddev->sync_thread) { |
| 6370 | mddev->recovery = 0; | 6578 | mddev->recovery = 0; |
| 6371 | spin_lock_irq(&conf->device_lock); | 6579 | spin_lock_irq(&conf->device_lock); |
| 6580 | write_seqcount_begin(&conf->gen_lock); | ||
| 6372 | mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; | 6581 | mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; |
| 6582 | mddev->new_chunk_sectors = | ||
| 6583 | conf->chunk_sectors = conf->prev_chunk_sectors; | ||
| 6584 | mddev->new_layout = conf->algorithm = conf->prev_algo; | ||
| 6373 | rdev_for_each(rdev, mddev) | 6585 | rdev_for_each(rdev, mddev) |
| 6374 | rdev->new_data_offset = rdev->data_offset; | 6586 | rdev->new_data_offset = rdev->data_offset; |
| 6375 | smp_wmb(); | 6587 | smp_wmb(); |
| 6588 | conf->generation --; | ||
| 6376 | conf->reshape_progress = MaxSector; | 6589 | conf->reshape_progress = MaxSector; |
| 6377 | mddev->reshape_position = MaxSector; | 6590 | mddev->reshape_position = MaxSector; |
| 6591 | write_seqcount_end(&conf->gen_lock); | ||
| 6378 | spin_unlock_irq(&conf->device_lock); | 6592 | spin_unlock_irq(&conf->device_lock); |
| 6379 | return -EAGAIN; | 6593 | return -EAGAIN; |
| 6380 | } | 6594 | } |
| @@ -6462,27 +6676,28 @@ static void raid5_quiesce(struct mddev *mddev, int state) | |||
| 6462 | break; | 6676 | break; |
| 6463 | 6677 | ||
| 6464 | case 1: /* stop all writes */ | 6678 | case 1: /* stop all writes */ |
| 6465 | spin_lock_irq(&conf->device_lock); | 6679 | lock_all_device_hash_locks_irq(conf); |
| 6466 | /* '2' tells resync/reshape to pause so that all | 6680 | /* '2' tells resync/reshape to pause so that all |
| 6467 | * active stripes can drain | 6681 | * active stripes can drain |
| 6468 | */ | 6682 | */ |
| 6469 | conf->quiesce = 2; | 6683 | conf->quiesce = 2; |
| 6470 | wait_event_lock_irq(conf->wait_for_stripe, | 6684 | wait_event_cmd(conf->wait_for_stripe, |
| 6471 | atomic_read(&conf->active_stripes) == 0 && | 6685 | atomic_read(&conf->active_stripes) == 0 && |
| 6472 | atomic_read(&conf->active_aligned_reads) == 0, | 6686 | atomic_read(&conf->active_aligned_reads) == 0, |
| 6473 | conf->device_lock); | 6687 | unlock_all_device_hash_locks_irq(conf), |
| 6688 | lock_all_device_hash_locks_irq(conf)); | ||
| 6474 | conf->quiesce = 1; | 6689 | conf->quiesce = 1; |
| 6475 | spin_unlock_irq(&conf->device_lock); | 6690 | unlock_all_device_hash_locks_irq(conf); |
| 6476 | /* allow reshape to continue */ | 6691 | /* allow reshape to continue */ |
| 6477 | wake_up(&conf->wait_for_overlap); | 6692 | wake_up(&conf->wait_for_overlap); |
| 6478 | break; | 6693 | break; |
| 6479 | 6694 | ||
| 6480 | case 0: /* re-enable writes */ | 6695 | case 0: /* re-enable writes */ |
| 6481 | spin_lock_irq(&conf->device_lock); | 6696 | lock_all_device_hash_locks_irq(conf); |
| 6482 | conf->quiesce = 0; | 6697 | conf->quiesce = 0; |
| 6483 | wake_up(&conf->wait_for_stripe); | 6698 | wake_up(&conf->wait_for_stripe); |
| 6484 | wake_up(&conf->wait_for_overlap); | 6699 | wake_up(&conf->wait_for_overlap); |
| 6485 | spin_unlock_irq(&conf->device_lock); | 6700 | unlock_all_device_hash_locks_irq(conf); |
| 6486 | break; | 6701 | break; |
| 6487 | } | 6702 | } |
| 6488 | } | 6703 | } |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index b42e6b462eda..01ad8ae8f578 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
| @@ -205,6 +205,7 @@ struct stripe_head { | |||
| 205 | short pd_idx; /* parity disk index */ | 205 | short pd_idx; /* parity disk index */ |
| 206 | short qd_idx; /* 'Q' disk index for raid6 */ | 206 | short qd_idx; /* 'Q' disk index for raid6 */ |
| 207 | short ddf_layout;/* use DDF ordering to calculate Q */ | 207 | short ddf_layout;/* use DDF ordering to calculate Q */ |
| 208 | short hash_lock_index; | ||
| 208 | unsigned long state; /* state flags */ | 209 | unsigned long state; /* state flags */ |
| 209 | atomic_t count; /* nr of active thread/requests */ | 210 | atomic_t count; /* nr of active thread/requests */ |
| 210 | int bm_seq; /* sequence number for bitmap flushes */ | 211 | int bm_seq; /* sequence number for bitmap flushes */ |
| @@ -367,9 +368,18 @@ struct disk_info { | |||
| 367 | struct md_rdev *rdev, *replacement; | 368 | struct md_rdev *rdev, *replacement; |
| 368 | }; | 369 | }; |
| 369 | 370 | ||
| 371 | /* NOTE: NR_STRIPE_HASH_LOCKS must remain below 64. | ||
| 372 | * This is because we sometimes take all the spinlocks | ||
| 373 | * at once, and creating that much locking depth can cause | ||
| 374 | * problems. | ||
| 375 | */ | ||
| 376 | #define NR_STRIPE_HASH_LOCKS 8 | ||
| 377 | #define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1) | ||
| 378 | |||
| 370 | struct r5worker { | 379 | struct r5worker { |
| 371 | struct work_struct work; | 380 | struct work_struct work; |
| 372 | struct r5worker_group *group; | 381 | struct r5worker_group *group; |
| 382 | struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; | ||
| 373 | bool working; | 383 | bool working; |
| 374 | }; | 384 | }; |
| 375 | 385 | ||
| @@ -382,6 +392,8 @@ struct r5worker_group { | |||
| 382 | 392 | ||
| 383 | struct r5conf { | 393 | struct r5conf { |
| 384 | struct hlist_head *stripe_hashtbl; | 394 | struct hlist_head *stripe_hashtbl; |
| 395 | /* each lock only protects its corresponding hash list and inactive_list */ | ||
| 396 | spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS]; | ||
| 385 | struct mddev *mddev; | 397 | struct mddev *mddev; |
| 386 | int chunk_sectors; | 398 | int chunk_sectors; |
| 387 | int level, algorithm; | 399 | int level, algorithm; |
| @@ -462,7 +474,8 @@ struct r5conf { | |||
| 462 | * Free stripes pool | 474 | * Free stripes pool |
| 463 | */ | 475 | */ |
| 464 | atomic_t active_stripes; | 476 | atomic_t active_stripes; |
| 465 | struct list_head inactive_list; | 477 | struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; |
| 478 | atomic_t empty_inactive_list_nr; | ||
| 466 | struct llist_head released_stripes; | 479 | struct llist_head released_stripes; |
| 467 | wait_queue_head_t wait_for_stripe; | 480 | wait_queue_head_t wait_for_stripe; |
| 468 | wait_queue_head_t wait_for_overlap; | 481 | wait_queue_head_t wait_for_overlap; |
| @@ -477,6 +490,7 @@ struct r5conf { | |||
| 477 | * the new thread here until we fully activate the array. | 490 | * the new thread here until we fully activate the array. |
| 478 | */ | 491 | */ |
| 479 | struct md_thread *thread; | 492 | struct md_thread *thread; |
| 493 | struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; | ||
| 480 | struct r5worker_group *worker_groups; | 494 | struct r5worker_group *worker_groups; |
| 481 | int group_cnt; | 495 | int group_cnt; |
| 482 | int worker_cnt_per_group; | 496 | int worker_cnt_per_group; |
