author    Ingo Molnar <mingo@kernel.org>  2013-12-17 09:27:08 -0500
committer Ingo Molnar <mingo@kernel.org>  2013-12-17 09:27:08 -0500
commit    bb799d3b980eb803ca2da4a4eefbd9308f8d988a (patch)
tree      69fbe0cd6d47b23a50f5e1d87bf7489532fae149 /drivers/md
parent    919fc6e34831d1c2b58bfb5ae261dc3facc9b269 (diff)
parent    319e2e3f63c348a9b66db4667efa73178e18b17d (diff)
Merge tag 'v3.13-rc4' into core/locking
Merge Linux 3.13-rc4, to refresh this rather old tree with the latest fixes.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/dm-bufio.c                                 5
-rw-r--r--  drivers/md/dm-cache-policy-mq.c                      13
-rw-r--r--  drivers/md/dm-cache-target.c                          2
-rw-r--r--  drivers/md/dm-delay.c                                23
-rw-r--r--  drivers/md/dm-snap.c                                 71
-rw-r--r--  drivers/md/dm-stats.c                                 1
-rw-r--r--  drivers/md/dm-table.c                                 5
-rw-r--r--  drivers/md/dm-thin-metadata.c                         8
-rw-r--r--  drivers/md/dm-thin-metadata.h                         1
-rw-r--r--  drivers/md/dm-thin.c                                 66
-rw-r--r--  drivers/md/md.c                                     147
-rw-r--r--  drivers/md/persistent-data/dm-array.c                10
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c         6
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.h         7
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c     32
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.c    8
-rw-r--r--  drivers/md/raid1.c                                  162
-rw-r--r--  drivers/md/raid1.h                                   15
-rw-r--r--  drivers/md/raid10.c                                   6
-rw-r--r--  drivers/md/raid5.c                                  425
-rw-r--r--  drivers/md/raid5.h                                   16
21 files changed, 760 insertions(+), 269 deletions(-)
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 173cbb20d104..54bdd923316f 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1717,6 +1717,11 @@ static int __init dm_bufio_init(void)
 {
 	__u64 mem;
 
+	dm_bufio_allocated_kmem_cache = 0;
+	dm_bufio_allocated_get_free_pages = 0;
+	dm_bufio_allocated_vmalloc = 0;
+	dm_bufio_current_allocated = 0;
+
 	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
 	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
 
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 416b7b752a6e..64780ad73bb0 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -730,15 +730,18 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
 	int r = 0;
 	bool updated = updated_this_tick(mq, e);
 
-	requeue_and_update_tick(mq, e);
-
 	if ((!discarded_oblock && updated) ||
-	    !should_promote(mq, e, discarded_oblock, data_dir))
+	    !should_promote(mq, e, discarded_oblock, data_dir)) {
+		requeue_and_update_tick(mq, e);
 		result->op = POLICY_MISS;
-	else if (!can_migrate)
+
+	} else if (!can_migrate)
 		r = -EWOULDBLOCK;
-	else
+
+	else {
+		requeue_and_update_tick(mq, e);
 		r = pre_cache_to_cache(mq, e, result);
+	}
 
 	return r;
 }
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 9efcf1059b99..1b1469ebe5cb 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2755,7 +2755,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
 {
 	int r;
 
-	r = dm_cache_resize(cache->cmd, cache->cache_size);
+	r = dm_cache_resize(cache->cmd, new_size);
 	if (r) {
 		DMERR("could not resize cache metadata");
 		return r;
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 496d5f3646a5..2f91d6d4a2cc 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -20,6 +20,7 @@
 struct delay_c {
 	struct timer_list delay_timer;
 	struct mutex timer_lock;
+	struct workqueue_struct *kdelayd_wq;
 	struct work_struct flush_expired_bios;
 	struct list_head delayed_bios;
 	atomic_t may_delay;
@@ -45,14 +46,13 @@ struct dm_delay_info {
 
 static DEFINE_MUTEX(delayed_bios_lock);
 
-static struct workqueue_struct *kdelayd_wq;
 static struct kmem_cache *delayed_cache;
 
 static void handle_delayed_timer(unsigned long data)
 {
 	struct delay_c *dc = (struct delay_c *)data;
 
-	queue_work(kdelayd_wq, &dc->flush_expired_bios);
+	queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
 }
 
 static void queue_timeout(struct delay_c *dc, unsigned long expires)
@@ -191,6 +191,12 @@ out:
 		goto bad_dev_write;
 	}
 
+	dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
+	if (!dc->kdelayd_wq) {
+		DMERR("Couldn't start kdelayd");
+		goto bad_queue;
+	}
+
 	setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc);
 
 	INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
@@ -203,6 +209,8 @@ out:
 	ti->private = dc;
 	return 0;
 
+bad_queue:
+	mempool_destroy(dc->delayed_pool);
 bad_dev_write:
 	if (dc->dev_write)
 		dm_put_device(ti, dc->dev_write);
@@ -217,7 +225,7 @@ static void delay_dtr(struct dm_target *ti)
 {
 	struct delay_c *dc = ti->private;
 
-	flush_workqueue(kdelayd_wq);
+	destroy_workqueue(dc->kdelayd_wq);
 
 	dm_put_device(ti, dc->dev_read);
 
@@ -350,12 +358,6 @@ static int __init dm_delay_init(void)
 {
 	int r = -ENOMEM;
 
-	kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
-	if (!kdelayd_wq) {
-		DMERR("Couldn't start kdelayd");
-		goto bad_queue;
-	}
-
 	delayed_cache = KMEM_CACHE(dm_delay_info, 0);
 	if (!delayed_cache) {
 		DMERR("Couldn't create delayed bio cache.");
@@ -373,8 +375,6 @@ static int __init dm_delay_init(void)
 bad_register:
 	kmem_cache_destroy(delayed_cache);
 bad_memcache:
-	destroy_workqueue(kdelayd_wq);
-bad_queue:
 	return r;
 }
 
@@ -382,7 +382,6 @@ static void __exit dm_delay_exit(void)
 {
 	dm_unregister_target(&delay_target);
 	kmem_cache_destroy(delayed_cache);
-	destroy_workqueue(kdelayd_wq);
 }
 
 /* Module hooks */
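
The dm-delay hunks above move the kdelayd workqueue from a module-wide global into struct delay_c, so each target instance creates the queue in its constructor and tears it down in its destructor. Below is a minimal userspace C sketch of that per-instance ownership pattern; the worker type and its create/destroy helpers are stand-ins for the kernel workqueue API, not real kernel calls.

    #include <stdio.h>
    #include <stdlib.h>

    struct worker {             /* stand-in for struct workqueue_struct */
            const char *name;
    };

    struct delay_instance {     /* stand-in for struct delay_c */
            struct worker *wq;  /* per-instance, no longer a global */
    };

    static struct worker *worker_create(const char *name)
    {
            struct worker *w = malloc(sizeof(*w));
            if (w)
                    w->name = name;
            return w;
    }

    static void worker_destroy(struct worker *w)
    {
            free(w);
    }

    /* constructor: each instance owns its worker, mirroring delay_ctr() */
    static struct delay_instance *instance_create(void)
    {
            struct delay_instance *dc = malloc(sizeof(*dc));
            if (!dc)
                    return NULL;
            dc->wq = worker_create("kdelayd");
            if (!dc->wq) {      /* mirrors the new "goto bad_queue" path */
                    free(dc);
                    return NULL;
            }
            return dc;
    }

    /* destructor: mirrors delay_dtr() calling destroy_workqueue() */
    static void instance_destroy(struct delay_instance *dc)
    {
            worker_destroy(dc->wq);
            free(dc);
    }

    int main(void)
    {
            struct delay_instance *a = instance_create();
            struct delay_instance *b = instance_create();  /* independent lifetimes */

            instance_destroy(a);
            instance_destroy(b);
            printf("two instances created and destroyed independently\n");
            return 0;
    }

With the queue owned per instance, module init/exit no longer has to manage it, which is why the bad_queue/destroy_workqueue lines disappear from dm_delay_init() and dm_delay_exit().
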
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index aec57d76db5d..944690bafd93 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -66,6 +66,18 @@ struct dm_snapshot {
 
 	atomic_t pending_exceptions_count;
 
+	/* Protected by "lock" */
+	sector_t exception_start_sequence;
+
+	/* Protected by kcopyd single-threaded callback */
+	sector_t exception_complete_sequence;
+
+	/*
+	 * A list of pending exceptions that completed out of order.
+	 * Protected by kcopyd single-threaded callback.
+	 */
+	struct list_head out_of_order_list;
+
 	mempool_t *pending_pool;
 
 	struct dm_exception_table pending;
@@ -173,6 +185,14 @@ struct dm_snap_pending_exception {
 	 */
 	int started;
 
+	/* There was copying error. */
+	int copy_error;
+
+	/* A sequence number, it is used for in-order completion. */
+	sector_t exception_sequence;
+
+	struct list_head out_of_order_entry;
+
 	/*
 	 * For writing a complete chunk, bypassing the copy.
 	 */
@@ -1094,6 +1114,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	s->valid = 1;
 	s->active = 0;
 	atomic_set(&s->pending_exceptions_count, 0);
+	s->exception_start_sequence = 0;
+	s->exception_complete_sequence = 0;
+	INIT_LIST_HEAD(&s->out_of_order_list);
 	init_rwsem(&s->lock);
 	INIT_LIST_HEAD(&s->list);
 	spin_lock_init(&s->pe_lock);
@@ -1443,6 +1466,19 @@ static void commit_callback(void *context, int success)
 	pending_complete(pe, success);
 }
 
+static void complete_exception(struct dm_snap_pending_exception *pe)
+{
+	struct dm_snapshot *s = pe->snap;
+
+	if (unlikely(pe->copy_error))
+		pending_complete(pe, 0);
+
+	else
+		/* Update the metadata if we are persistent */
+		s->store->type->commit_exception(s->store, &pe->e,
+						 commit_callback, pe);
+}
+
 /*
  * Called when the copy I/O has finished.  kcopyd actually runs
  * this code so don't block.
@@ -1452,13 +1488,32 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
 	struct dm_snap_pending_exception *pe = context;
 	struct dm_snapshot *s = pe->snap;
 
-	if (read_err || write_err)
-		pending_complete(pe, 0);
+	pe->copy_error = read_err || write_err;
 
-	else
-		/* Update the metadata if we are persistent */
-		s->store->type->commit_exception(s->store, &pe->e,
-						 commit_callback, pe);
+	if (pe->exception_sequence == s->exception_complete_sequence) {
+		s->exception_complete_sequence++;
+		complete_exception(pe);
+
+		while (!list_empty(&s->out_of_order_list)) {
+			pe = list_entry(s->out_of_order_list.next,
+					struct dm_snap_pending_exception, out_of_order_entry);
+			if (pe->exception_sequence != s->exception_complete_sequence)
+				break;
+			s->exception_complete_sequence++;
+			list_del(&pe->out_of_order_entry);
+			complete_exception(pe);
+		}
+	} else {
+		struct list_head *lh;
+		struct dm_snap_pending_exception *pe2;
+
+		list_for_each_prev(lh, &s->out_of_order_list) {
+			pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry);
+			if (pe2->exception_sequence < pe->exception_sequence)
+				break;
+		}
+		list_add(&pe->out_of_order_entry, lh);
+	}
 }
 
 /*
@@ -1553,6 +1608,8 @@ __find_pending_exception(struct dm_snapshot *s,
 		return NULL;
 	}
 
+	pe->exception_sequence = s->exception_start_sequence++;
+
 	dm_insert_exception(&s->pending, &pe->e);
 
 	return pe;
@@ -2192,7 +2249,7 @@ static struct target_type origin_target = {
 
 static struct target_type snapshot_target = {
 	.name    = "snapshot",
-	.version = {1, 11, 1},
+	.version = {1, 12, 0},
 	.module  = THIS_MODULE,
 	.ctr     = snapshot_ctr,
 	.dtr     = snapshot_dtr,
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 3d404c1371ed..28a90122a5a8 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -964,6 +964,7 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
 
 int __init dm_statistics_init(void)
 {
+	shared_memory_amount = 0;
 	dm_stat_need_rcu_barrier = 0;
 	return 0;
 }
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 465f08ca62b1..3ba6a3859ce3 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -200,6 +200,11 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 
 	num_targets = dm_round_up(num_targets, KEYS_PER_NODE);
 
+	if (!num_targets) {
+		kfree(t);
+		return -ENOMEM;
+	}
+
 	if (alloc_targets(t, num_targets)) {
 		kfree(t);
 		return -ENOMEM;
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 60bce435f4fa..8a30ad54bd46 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1697,6 +1697,14 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
 	up_write(&pmd->root_lock);
 }
 
+void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
+{
+	down_write(&pmd->root_lock);
+	pmd->read_only = false;
+	dm_bm_set_read_write(pmd->bm);
+	up_write(&pmd->root_lock);
+}
+
 int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 					dm_block_t threshold,
 					dm_sm_threshold_fn fn,
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 845ebbe589a9..7bcc0e1d6238 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -193,6 +193,7 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_siz
  * that nothing is changing.
  */
 void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd);
+void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd);
 
 int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 					dm_block_t threshold,
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 2c0cf511ec23..ee29037ffc2e 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -640,7 +640,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	 */
 	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
 	if (r) {
-		DMERR_LIMIT("dm_thin_insert_block() failed");
+		DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d",
+			    dm_device_name(pool->pool_md), r);
+		set_pool_mode(pool, PM_READ_ONLY);
 		cell_error(pool, m->cell);
 		goto out;
 	}
@@ -881,32 +883,23 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 	}
 }
 
-static int commit(struct pool *pool)
-{
-	int r;
-
-	r = dm_pool_commit_metadata(pool->pmd);
-	if (r)
-		DMERR_LIMIT("%s: commit failed: error = %d",
-			    dm_device_name(pool->pool_md), r);
-
-	return r;
-}
-
 /*
  * A non-zero return indicates read_only or fail_io mode.
  * Many callers don't care about the return value.
  */
-static int commit_or_fallback(struct pool *pool)
+static int commit(struct pool *pool)
 {
 	int r;
 
 	if (get_pool_mode(pool) != PM_WRITE)
 		return -EINVAL;
 
-	r = commit(pool);
-	if (r)
+	r = dm_pool_commit_metadata(pool->pmd);
+	if (r) {
+		DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d",
+			    dm_device_name(pool->pool_md), r);
 		set_pool_mode(pool, PM_READ_ONLY);
+	}
 
 	return r;
 }
@@ -943,7 +936,9 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 		 * Try to commit to see if that will free up some
 		 * more space.
 		 */
-		(void) commit_or_fallback(pool);
+		r = commit(pool);
+		if (r)
+			return r;
 
 		r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
 		if (r)
@@ -957,7 +952,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 		 * table reload).
 		 */
 		if (!free_blocks) {
-			DMWARN("%s: no free space available.",
+			DMWARN("%s: no free data space available.",
 			       dm_device_name(pool->pool_md));
 			spin_lock_irqsave(&pool->lock, flags);
 			pool->no_free_space = 1;
@@ -967,8 +962,16 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 	}
 
 	r = dm_pool_alloc_data_block(pool->pmd, result);
-	if (r)
+	if (r) {
+		if (r == -ENOSPC &&
+		    !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
+		    !free_blocks) {
+			DMWARN("%s: no free metadata space available.",
+			       dm_device_name(pool->pool_md));
+			set_pool_mode(pool, PM_READ_ONLY);
+		}
 		return r;
+	}
 
 	return 0;
 }
@@ -1349,7 +1352,7 @@ static void process_deferred_bios(struct pool *pool)
 	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
 		return;
 
-	if (commit_or_fallback(pool)) {
+	if (commit(pool)) {
 		while ((bio = bio_list_pop(&bios)))
 			bio_io_error(bio);
 		return;
@@ -1397,6 +1400,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
 	case PM_FAIL:
 		DMERR("%s: switching pool to failure mode",
 		      dm_device_name(pool->pool_md));
+		dm_pool_metadata_read_only(pool->pmd);
 		pool->process_bio = process_bio_fail;
 		pool->process_discard = process_bio_fail;
 		pool->process_prepared_mapping = process_prepared_mapping_fail;
@@ -1421,6 +1425,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
 		break;
 
 	case PM_WRITE:
+		dm_pool_metadata_read_write(pool->pmd);
 		pool->process_bio = process_bio;
 		pool->process_discard = process_discard;
 		pool->process_prepared_mapping = process_prepared_mapping;
@@ -1637,12 +1642,19 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 	struct pool_c *pt = ti->private;
 
 	/*
-	 * We want to make sure that degraded pools are never upgraded.
+	 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
 	 */
 	enum pool_mode old_mode = pool->pf.mode;
 	enum pool_mode new_mode = pt->adjusted_pf.mode;
 
-	if (old_mode > new_mode)
+	/*
+	 * If we were in PM_FAIL mode, rollback of metadata failed.  We're
+	 * not going to recover without a thin_repair.  So we never let the
+	 * pool move out of the old mode.  On the other hand a PM_READ_ONLY
+	 * may have been due to a lack of metadata or data space, and may
+	 * now work (ie. if the underlying devices have been resized).
+	 */
+	if (old_mode == PM_FAIL)
 		new_mode = old_mode;
 
 	pool->ti = ti;
@@ -2266,7 +2278,7 @@ static int pool_preresume(struct dm_target *ti)
 		return r;
 
 	if (need_commit1 || need_commit2)
-		(void) commit_or_fallback(pool);
+		(void) commit(pool);
 
 	return 0;
 }
@@ -2293,7 +2305,7 @@ static void pool_postsuspend(struct dm_target *ti)
 
 	cancel_delayed_work(&pool->waker);
 	flush_workqueue(pool->wq);
-	(void) commit_or_fallback(pool);
+	(void) commit(pool);
 }
 
 static int check_arg_count(unsigned argc, unsigned args_required)
@@ -2427,7 +2439,7 @@ static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct
 	if (r)
 		return r;
 
-	(void) commit_or_fallback(pool);
+	(void) commit(pool);
 
 	r = dm_pool_reserve_metadata_snap(pool->pmd);
 	if (r)
@@ -2489,7 +2501,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2489 DMWARN("Unrecognised thin pool target message received: %s", argv[0]); 2501 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2490 2502
2491 if (!r) 2503 if (!r)
2492 (void) commit_or_fallback(pool); 2504 (void) commit(pool);
2493 2505
2494 return r; 2506 return r;
2495} 2507}
@@ -2544,7 +2556,7 @@ static void pool_status(struct dm_target *ti, status_type_t type,
 
 	/* Commit to ensure statistics aren't out-of-date */
 	if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
-		(void) commit_or_fallback(pool);
+		(void) commit(pool);
 
 	r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
 	if (r) {
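
The dm-thin hunks fold commit_or_fallback() into commit() itself: the pool must be in PM_WRITE mode to commit at all, and any metadata commit failure degrades the pool to PM_READ_ONLY in one place, so every caller can simply call commit(). A simplified userspace C sketch of that fail-into-read-only pattern follows; the names mirror the kernel's, but the bodies are stand-ins.

    #include <stdio.h>

    enum pool_mode { PM_WRITE, PM_READ_ONLY, PM_FAIL };

    struct pool {
            enum pool_mode mode;
    };

    /* stand-in for dm_pool_commit_metadata(); returns 0 on success */
    static int metadata_commit(struct pool *pool, int simulate_error)
    {
            (void)pool;
            return simulate_error ? -1 : 0;
    }

    static int commit(struct pool *pool, int simulate_error)
    {
            int r;

            if (pool->mode != PM_WRITE)
                    return -1;           /* read-only or failed pool */

            r = metadata_commit(pool, simulate_error);
            if (r) {
                    fprintf(stderr, "commit failed, degrading to read-only\n");
                    pool->mode = PM_READ_ONLY;   /* single fallback point */
            }
            return r;
    }

    int main(void)
    {
            struct pool pool = { .mode = PM_WRITE };

            commit(&pool, 0);    /* succeeds, pool stays writable */
            commit(&pool, 1);    /* fails, pool degrades */
            commit(&pool, 0);    /* rejected: pool no longer PM_WRITE */
            printf("final mode: %d\n", pool.mode);
            return 0;
    }

Centralising the degradation is also what makes the bind_control_target() change safe: PM_READ_ONLY can later be upgraded when space returns, while PM_FAIL never is.
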
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8766eabb0014..21f4d7ff0da2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -112,7 +112,7 @@ static inline int speed_max(struct mddev *mddev)
 
 static struct ctl_table_header *raid_table_header;
 
-static ctl_table raid_table[] = {
+static struct ctl_table raid_table[] = {
 	{
 		.procname	= "speed_limit_min",
 		.data		= &sysctl_speed_limit_min,
@@ -130,7 +130,7 @@ static ctl_table raid_table[] = {
 	{ }
 };
 
-static ctl_table raid_dir_table[] = {
+static struct ctl_table raid_dir_table[] = {
 	{
 		.procname	= "raid",
 		.maxlen		= 0,
@@ -140,7 +140,7 @@ static ctl_table raid_dir_table[] = {
 	{ }
 };
 
-static ctl_table raid_root_table[] = {
+static struct ctl_table raid_root_table[] = {
 	{
 		.procname	= "dev",
 		.maxlen		= 0,
@@ -562,11 +562,19 @@ static struct mddev * mddev_find(dev_t unit)
 	goto retry;
 }
 
-static inline int mddev_lock(struct mddev * mddev)
+static inline int __must_check mddev_lock(struct mddev * mddev)
 {
 	return mutex_lock_interruptible(&mddev->reconfig_mutex);
 }
 
+/* Sometimes we need to take the lock in a situation where
+ * failure due to interrupts is not acceptable.
+ */
+static inline void mddev_lock_nointr(struct mddev * mddev)
+{
+	mutex_lock(&mddev->reconfig_mutex);
+}
+
 static inline int mddev_is_locked(struct mddev *mddev)
 {
 	return mutex_is_locked(&mddev->reconfig_mutex);
@@ -768,16 +776,10 @@ void md_super_wait(struct mddev *mddev)
 	finish_wait(&mddev->sb_wait, &wq);
 }
 
-static void bi_complete(struct bio *bio, int error)
-{
-	complete((struct completion*)bio->bi_private);
-}
-
 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 		 struct page *page, int rw, bool metadata_op)
 {
 	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
-	struct completion event;
 	int ret;
 
 	rw |= REQ_SYNC;
783 rw |= REQ_SYNC; 785 rw |= REQ_SYNC;
@@ -793,11 +795,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
793 else 795 else
794 bio->bi_sector = sector + rdev->data_offset; 796 bio->bi_sector = sector + rdev->data_offset;
795 bio_add_page(bio, page, size, 0); 797 bio_add_page(bio, page, size, 0);
796 init_completion(&event); 798 submit_bio_wait(rw, bio);
797 bio->bi_private = &event;
798 bio->bi_end_io = bi_complete;
799 submit_bio(rw, bio);
800 wait_for_completion(&event);
801 799
802 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 800 ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
803 bio_put(bio); 801 bio_put(bio);
@@ -2978,7 +2976,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
 	for_each_mddev(mddev, tmp) {
 		struct md_rdev *rdev2;
 
-		mddev_lock(mddev);
+		mddev_lock_nointr(mddev);
 		rdev_for_each(rdev2, mddev)
 			if (rdev->bdev == rdev2->bdev &&
 			    rdev != rdev2 &&
@@ -2994,7 +2992,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
 			break;
 		}
 	}
-	mddev_lock(my_mddev);
+	mddev_lock_nointr(my_mddev);
 	if (overlap) {
 		/* Someone else could have slipped in a size
 		 * change here, but doing so is just silly.
@@ -3580,6 +3578,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 		mddev->in_sync = 1;
 		del_timer_sync(&mddev->safemode_timer);
 	}
+	blk_set_stacking_limits(&mddev->queue->limits);
 	pers->run(mddev);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 	mddev_resume(mddev);
@@ -5258,7 +5257,7 @@ static void __md_stop_writes(struct mddev *mddev)
 
 void md_stop_writes(struct mddev *mddev)
 {
-	mddev_lock(mddev);
+	mddev_lock_nointr(mddev);
 	__md_stop_writes(mddev);
 	mddev_unlock(mddev);
 }
@@ -5291,20 +5290,35 @@ EXPORT_SYMBOL_GPL(md_stop);
 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 {
 	int err = 0;
+	int did_freeze = 0;
+
+	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+		did_freeze = 1;
+		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+	}
+	if (mddev->sync_thread) {
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		/* Thread might be blocked waiting for metadata update
+		 * which will now never happen */
+		wake_up_process(mddev->sync_thread->tsk);
+	}
+	mddev_unlock(mddev);
+	wait_event(resync_wait, mddev->sync_thread == NULL);
+	mddev_lock_nointr(mddev);
+
 	mutex_lock(&mddev->open_mutex);
-	if (atomic_read(&mddev->openers) > !!bdev) {
+	if (atomic_read(&mddev->openers) > !!bdev ||
+	    mddev->sync_thread ||
+	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
 		printk("md: %s still in use.\n",mdname(mddev));
+		if (did_freeze) {
+			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+			md_wakeup_thread(mddev->thread);
+		}
 		err = -EBUSY;
 		goto out;
 	}
-	if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
-		/* Someone opened the device since we flushed it
-		 * so page cache could be dirty and it is too late
-		 * to flush.  So abort
-		 */
-		mutex_unlock(&mddev->open_mutex);
-		return -EBUSY;
-	}
 	if (mddev->pers) {
 		__md_stop_writes(mddev);
 
@@ -5315,7 +5329,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 		set_disk_ro(mddev->gendisk, 1);
 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
-		err = 0;	
+		err = 0;
 	}
 out:
 	mutex_unlock(&mddev->open_mutex);
@@ -5331,20 +5345,34 @@ static int do_md_stop(struct mddev * mddev, int mode,
 {
 	struct gendisk *disk = mddev->gendisk;
 	struct md_rdev *rdev;
+	int did_freeze = 0;
+
+	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+		did_freeze = 1;
+		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+	}
+	if (mddev->sync_thread) {
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		/* Thread might be blocked waiting for metadata update
+		 * which will now never happen */
+		wake_up_process(mddev->sync_thread->tsk);
+	}
+	mddev_unlock(mddev);
+	wait_event(resync_wait, mddev->sync_thread == NULL);
+	mddev_lock_nointr(mddev);
 
 	mutex_lock(&mddev->open_mutex);
 	if (atomic_read(&mddev->openers) > !!bdev ||
-	    mddev->sysfs_active) {
+	    mddev->sysfs_active ||
+	    mddev->sync_thread ||
+	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
 		printk("md: %s still in use.\n",mdname(mddev));
 		mutex_unlock(&mddev->open_mutex);
-		return -EBUSY;
-	}
-	if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
-		/* Someone opened the device since we flushed it
-		 * so page cache could be dirty and it is too late
-		 * to flush.  So abort
-		 */
-		mutex_unlock(&mddev->open_mutex);
+		if (did_freeze) {
+			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+			md_wakeup_thread(mddev->thread);
+		}
 		return -EBUSY;
 	}
 	if (mddev->pers) {
@@ -6551,7 +6579,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 			wait_event(mddev->sb_wait,
 				   !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
 				   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
-			mddev_lock(mddev);
+			mddev_lock_nointr(mddev);
 		}
 	} else {
 		err = -EROFS;
@@ -7361,9 +7389,6 @@ void md_do_sync(struct md_thread *thread)
 	mddev->curr_resync = 2;
 
 	try_again:
-	if (kthread_should_stop())
-		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-
 	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 		goto skip;
 	for_each_mddev(mddev2, tmp) {
@@ -7388,7 +7413,7 @@ void md_do_sync(struct md_thread *thread)
 				 * be caught by 'softlockup'
 				 */
 				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
-				if (!kthread_should_stop() &&
+				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 				    mddev2->curr_resync >= mddev->curr_resync) {
 					printk(KERN_INFO "md: delaying %s of %s"
 					       " until %s has finished (they"
@@ -7464,7 +7489,7 @@ void md_do_sync(struct md_thread *thread)
 		last_check = 0;
 
 		if (j>2) {
-			printk(KERN_INFO 
+			printk(KERN_INFO
 				"md: resuming %s of %s from checkpoint.\n",
 				desc, mdname(mddev));
 			mddev->curr_resync = j;
@@ -7501,7 +7526,8 @@ void md_do_sync(struct md_thread *thread)
 			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 		}
 
-		while (j >= mddev->resync_max && !kthread_should_stop()) {
+		while (j >= mddev->resync_max &&
+		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
 			/* As this condition is controlled by user-space,
 			 * we can block indefinitely, so use '_interruptible'
 			 * to avoid triggering warnings.
@@ -7509,17 +7535,18 @@ void md_do_sync(struct md_thread *thread)
 			flush_signals(current); /* just in case */
 			wait_event_interruptible(mddev->recovery_wait,
 						 mddev->resync_max > j
-						 || kthread_should_stop());
+						 || test_bit(MD_RECOVERY_INTR,
+							     &mddev->recovery));
 		}
 
-		if (kthread_should_stop())
-			goto interrupted;
+		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+			break;
 
 		sectors = mddev->pers->sync_request(mddev, j, &skipped,
 						  currspeed < speed_min(mddev));
 		if (sectors == 0) {
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-			goto out;
+			break;
 		}
 
 		if (!skipped) { /* actual IO requested */
@@ -7556,10 +7583,8 @@ void md_do_sync(struct md_thread *thread)
 			last_mark = next;
 		}
 
-
-		if (kthread_should_stop())
-			goto interrupted;
-
+		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+			break;
 
 		/*
 		 * this loop exits only if either when we are slower than
@@ -7582,11 +7607,12 @@ void md_do_sync(struct md_thread *thread)
 			}
 		}
 	}
-	printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
+	printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
+	       test_bit(MD_RECOVERY_INTR, &mddev->recovery)
+	       ? "interrupted" : "done");
 	/*
 	 * this also signals 'finished resyncing' to md_stop
 	 */
- out:
 	blk_finish_plug(&plug);
 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
@@ -7640,16 +7666,6 @@ void md_do_sync(struct md_thread *thread)
 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
 	return;
-
- interrupted:
-	/*
-	 * got a signal, exit.
-	 */
-	printk(KERN_INFO
-	       "md: md_do_sync() got signal ... exiting\n");
-	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-	goto out;
-
 }
 EXPORT_SYMBOL_GPL(md_do_sync);
 
7655 7671
@@ -7751,7 +7767,7 @@ void md_check_recovery(struct mddev *mddev)
7751 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 7767 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
7752 return; 7768 return;
7753 if ( ! ( 7769 if ( ! (
7754 (mddev->flags & ~ (1<<MD_CHANGE_PENDING)) || 7770 (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
7755 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 7771 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
7756 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 7772 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
7757 (mddev->external == 0 && mddev->safemode == 1) || 7773 (mddev->external == 0 && mddev->safemode == 1) ||
@@ -7894,6 +7910,7 @@ void md_reap_sync_thread(struct mddev *mddev)
 
 	/* resync has finished, collect result */
 	md_unregister_thread(&mddev->sync_thread);
+	wake_up(&resync_wait);
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 		/* success...*/
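
Most of the md.c hunks replace kthread_should_stop() tests in md_do_sync() with tests of the MD_RECOVERY_INTR flag, turning the old interrupted:/out: goto labels into plain loop breaks with a single exit path. A minimal userspace sketch of that control-flow shape follows; a plain bool stands in for the atomic flag bit, and the chunk numbers are made up for the demo.

    #include <stdio.h>
    #include <stdbool.h>

    static bool recovery_intr;   /* stand-in for test_bit(MD_RECOVERY_INTR, ...) */

    static long sync_one_chunk(long j)
    {
            if (j == 3)
                    recovery_intr = true;   /* someone requested interruption */
            return 1;                       /* sectors handled */
    }

    int main(void)
    {
            long j;

            for (j = 0; j < 10; j++) {
                    if (recovery_intr)
                            break;          /* was: goto interrupted */
                    sync_one_chunk(j);
            }

            /* one common exit path; the old out:/interrupted: labels are gone */
            printf("resync %s at chunk %ld\n",
                   recovery_intr ? "interrupted" : "done", j);
            return 0;
    }

Because interruption is just a flag, md_set_readonly() and do_md_stop() can request it, drop the lock, and wait_event() on resync_wait until the sync thread is gone, instead of racing with a kthread stop.
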
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index af96e24ec328..1d75b1dc1e2e 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -317,8 +317,16 @@ static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
 	 * The shadow op will often be a noop.  Only insert if it really
 	 * copied data.
 	 */
-	if (dm_block_location(*block) != b)
+	if (dm_block_location(*block) != b) {
+		/*
+		 * dm_tm_shadow_block will have already decremented the old
+		 * block, but it is still referenced by the btree.  We
+		 * increment to stop the insert decrementing it below zero
+		 * when overwriting the old value.
+		 */
+		dm_tm_inc(info->btree_info.tm, b);
 		r = insert_ablock(info, index, *block, root);
+	}
 
 	return r;
 }
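
The dm-array fix above takes an extra reference on the old block between the shadow operation (which drops one reference) and the btree insert (which drops another when it overwrites the stale pointer), so the count cannot go below zero. The arithmetic, as a tiny standalone sketch:

    #include <assert.h>
    #include <stdio.h>

    static int refcount = 1;     /* old block starts with one reference */

    static void inc(void) { refcount++; }
    static void dec(void) { assert(refcount > 0); refcount--; }

    int main(void)
    {
            /* shadow: copies the block and drops one ref on the original */
            dec();               /* refcount now 0 */

            inc();               /* the fix: re-take a reference ... */
            dec();               /* ... so the insert's drop stays legal */

            printf("refcount = %d\n", refcount);
            return 0;
    }

Without the inc(), the second dec() would underflow, which is exactly the below-zero decrement the comment in the patch describes.
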
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index a7e8bf296388..064a3c271baa 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -626,6 +626,12 @@ void dm_bm_set_read_only(struct dm_block_manager *bm)
 }
 EXPORT_SYMBOL_GPL(dm_bm_set_read_only);
 
+void dm_bm_set_read_write(struct dm_block_manager *bm)
+{
+	bm->read_only = false;
+}
+EXPORT_SYMBOL_GPL(dm_bm_set_read_write);
+
 u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
 {
 	return crc32c(~(u32) 0, data, len) ^ init_xor;
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 9a82083a66b6..13cd58e1fe69 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -108,9 +108,9 @@ int dm_bm_unlock(struct dm_block *b);
 int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
 			   struct dm_block *superblock);
 
- /*
-  * Request data be prefetched into the cache.
-  */
+/*
+ * Request data is prefetched into the cache.
+ */
 void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
 
 /*
@@ -125,6 +125,7 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
  * be returned if you do.
  */
 void dm_bm_set_read_only(struct dm_block_manager *bm);
+void dm_bm_set_read_write(struct dm_block_manager *bm);
 
 u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
 
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 6058569fe86c..466a60bbd716 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -381,7 +381,7 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
 }
 
 static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
-			uint32_t (*mutator)(void *context, uint32_t old),
+			int (*mutator)(void *context, uint32_t old, uint32_t *new),
 			void *context, enum allocation_event *ev)
 {
 	int r;
@@ -410,11 +410,17 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
 
 	if (old > 2) {
 		r = sm_ll_lookup_big_ref_count(ll, b, &old);
-		if (r < 0)
+		if (r < 0) {
+			dm_tm_unlock(ll->tm, nb);
 			return r;
+		}
 	}
 
-	ref_count = mutator(context, old);
+	r = mutator(context, old, &ref_count);
+	if (r) {
+		dm_tm_unlock(ll->tm, nb);
+		return r;
+	}
 
 	if (ref_count <= 2) {
 		sm_set_bitmap(bm_le, bit, ref_count);
@@ -465,9 +471,10 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
 	return ll->save_ie(ll, index, &ie_disk);
 }
 
-static uint32_t set_ref_count(void *context, uint32_t old)
+static int set_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	return *((uint32_t *) context);
+	*new = *((uint32_t *) context);
+	return 0;
 }
 
 int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
@@ -476,9 +483,10 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
 	return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev);
 }
 
-static uint32_t inc_ref_count(void *context, uint32_t old)
+static int inc_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	return old + 1;
+	*new = old + 1;
+	return 0;
 }
 
 int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
@@ -486,9 +494,15 @@ int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
 	return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev);
 }
 
-static uint32_t dec_ref_count(void *context, uint32_t old)
+static int dec_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	return old - 1;
+	if (!old) {
+		DMERR_LIMIT("unable to decrement a reference count below 0");
+		return -EINVAL;
+	}
+
+	*new = old - 1;
+	return 0;
 }
 
 int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
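
The dm-space-map-common hunks change the mutator callback from returning the new count directly to returning a status code with the count in an out-parameter, which is what lets dec_ref_count() reject an underflow instead of silently wrapping a uint32_t to 0xffffffff. A self-contained userspace sketch of the new interface shape (only the API shape matches the kernel code; the bookkeeping is a stand-in):

    #include <stdio.h>
    #include <stdint.h>
    #include <errno.h>

    /* new-style mutator: status in the return value, result via *new */
    typedef int (*mutator_fn)(void *context, uint32_t old, uint32_t *new);

    static int dec_ref_count(void *context, uint32_t old, uint32_t *new)
    {
            (void)context;
            if (!old)
                    return -EINVAL;   /* refcount would go below zero */
            *new = old - 1;
            return 0;
    }

    static int mutate(uint32_t *count, mutator_fn fn, void *context)
    {
            uint32_t new;
            int r = fn(context, *count, &new);

            if (r)
                    return r;         /* nothing is written on failure */
            *count = new;
            return 0;
    }

    int main(void)
    {
            uint32_t count = 1;

            printf("dec: %d (count=%u)\n",
                   mutate(&count, dec_ref_count, NULL), (unsigned)count);
            printf("dec: %d (count=%u)\n",
                   mutate(&count, dec_ref_count, NULL), (unsigned)count);
            return 0;
    }

The second decrement fails with -EINVAL and leaves the count untouched, mirroring how sm_ll_mutate() now unlocks and bails out when the mutator reports an error.
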
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 1c959684caef..58fc1eef7499 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -384,12 +384,16 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
 	int r = sm_metadata_new_block_(sm, b);
-	if (r)
+	if (r) {
 		DMERR("unable to allocate new metadata block");
+		return r;
+	}
 
 	r = sm_metadata_get_nr_free(sm, &count);
-	if (r)
+	if (r) {
 		DMERR("couldn't get free block count");
+		return r;
+	}
 
 	check_threshold(&smm->threshold, count);
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index af6681b19776..1e5a540995e9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -66,7 +66,8 @@
  */
 static int max_queued_requests = 1024;
 
-static void allow_barrier(struct r1conf *conf);
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+			  sector_t bi_sector);
 static void lower_barrier(struct r1conf *conf);
 
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
@@ -84,10 +85,12 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 }
 
 #define RESYNC_BLOCK_SIZE (64*1024)
-//#define RESYNC_BLOCK_SIZE PAGE_SIZE
+#define RESYNC_DEPTH 32
 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
-#define RESYNC_WINDOW (2048*1024)
+#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
+#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
+#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
 
 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
@@ -225,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
 	struct bio *bio = r1_bio->master_bio;
 	int done;
 	struct r1conf *conf = r1_bio->mddev->private;
+	sector_t start_next_window = r1_bio->start_next_window;
+	sector_t bi_sector = bio->bi_sector;
 
 	if (bio->bi_phys_segments) {
 		unsigned long flags;
@@ -232,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio)
 		bio->bi_phys_segments--;
 		done = (bio->bi_phys_segments == 0);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
+		/*
+		 * make_request() might be waiting for
+		 * bi_phys_segments to decrease
+		 */
+		wake_up(&conf->wait_barrier);
 	} else
 		done = 1;
 
@@ -243,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
 		 * Wake up any possible resync thread that waits for the device
 		 * to go idle.
 		 */
-		allow_barrier(conf);
+		allow_barrier(conf, start_next_window, bi_sector);
 	}
 }
 
@@ -814,8 +824,6 @@ static void flush_pending_writes(struct r1conf *conf)
  * there is no normal IO happeing.  It must arrange to call
  * lower_barrier when the particular background IO completes.
  */
-#define RESYNC_DEPTH 32
-
 static void raise_barrier(struct r1conf *conf)
 {
 	spin_lock_irq(&conf->resync_lock);
@@ -827,9 +835,19 @@ static void raise_barrier(struct r1conf *conf)
 	/* block any new IO from starting */
 	conf->barrier++;
 
-	/* Now wait for all pending IO to complete */
+	/* For these conditions we must wait:
+	 * A: while the array is in frozen state
+	 * B: while barrier >= RESYNC_DEPTH, meaning resync reach
+	 *    the max count which allowed.
+	 * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
+	 *    next resync will reach to the window which normal bios are
+	 *    handling.
+	 */
 	wait_event_lock_irq(conf->wait_barrier,
-			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+			    !conf->array_frozen &&
+			    conf->barrier < RESYNC_DEPTH &&
+			    (conf->start_next_window >=
+			     conf->next_resync + RESYNC_SECTORS),
 			    conf->resync_lock);
 
 	spin_unlock_irq(&conf->resync_lock);
@@ -845,10 +863,33 @@ static void lower_barrier(struct r1conf *conf)
 	wake_up(&conf->wait_barrier);
 }
 
-static void wait_barrier(struct r1conf *conf)
+static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
+{
+	bool wait = false;
+
+	if (conf->array_frozen || !bio)
+		wait = true;
+	else if (conf->barrier && bio_data_dir(bio) == WRITE) {
+		if (conf->next_resync < RESYNC_WINDOW_SECTORS)
+			wait = true;
+		else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
+				>= bio_end_sector(bio)) ||
+			 (conf->next_resync + NEXT_NORMALIO_DISTANCE
+				<= bio->bi_sector))
+			wait = false;
+		else
+			wait = true;
+	}
+
+	return wait;
+}
+
+static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
 {
+	sector_t sector = 0;
+
 	spin_lock_irq(&conf->resync_lock);
-	if (conf->barrier) {
+	if (need_to_wait_for_sync(conf, bio)) {
 		conf->nr_waiting++;
 		/* Wait for the barrier to drop.
 		 * However if there are already pending
@@ -860,22 +901,67 @@ static void wait_barrier(struct r1conf *conf)
 		 * count down.
 		 */
 		wait_event_lock_irq(conf->wait_barrier,
-				    !conf->barrier ||
-				    (conf->nr_pending &&
+				    !conf->array_frozen &&
+				    (!conf->barrier ||
+				     ((conf->start_next_window <
+				       conf->next_resync + RESYNC_SECTORS) &&
 				     current->bio_list &&
-				     !bio_list_empty(current->bio_list)),
+				     !bio_list_empty(current->bio_list))),
 				    conf->resync_lock);
 		conf->nr_waiting--;
 	}
+
+	if (bio && bio_data_dir(bio) == WRITE) {
+		if (conf->next_resync + NEXT_NORMALIO_DISTANCE
+		    <= bio->bi_sector) {
+			if (conf->start_next_window == MaxSector)
+				conf->start_next_window =
+					conf->next_resync +
+					NEXT_NORMALIO_DISTANCE;
+
+			if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
+			    <= bio->bi_sector)
+				conf->next_window_requests++;
+			else
+				conf->current_window_requests++;
+		}
+		if (bio->bi_sector >= conf->start_next_window)
+			sector = conf->start_next_window;
+	}
+
 	conf->nr_pending++;
 	spin_unlock_irq(&conf->resync_lock);
+	return sector;
 }
 
-static void allow_barrier(struct r1conf *conf)
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+			  sector_t bi_sector)
 {
 	unsigned long flags;
+
 	spin_lock_irqsave(&conf->resync_lock, flags);
 	conf->nr_pending--;
+	if (start_next_window) {
+		if (start_next_window == conf->start_next_window) {
+			if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
+			    <= bi_sector)
+				conf->next_window_requests--;
+			else
+				conf->current_window_requests--;
+		} else
+			conf->current_window_requests--;
+
+		if (!conf->current_window_requests) {
+			if (conf->next_window_requests) {
+				conf->current_window_requests =
+					conf->next_window_requests;
+				conf->next_window_requests = 0;
+				conf->start_next_window +=
+					NEXT_NORMALIO_DISTANCE;
+			} else
+				conf->start_next_window = MaxSector;
+		}
+	}
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
@@ -884,8 +970,7 @@ static void freeze_array(struct r1conf *conf, int extra)
 {
 	/* stop syncio and normal IO and wait for everything to
 	 * go quite.
-	 * We increment barrier and nr_waiting, and then
-	 * wait until nr_pending match nr_queued+extra
+	 * We wait until nr_pending match nr_queued+extra
 	 * This is called in the context of one normal IO request
 	 * that has failed. Thus any sync request that might be pending
 	 * will be blocked by nr_pending, and we need to wait for
@@ -895,8 +980,7 @@ static void freeze_array(struct r1conf *conf, int extra)
 	 * we continue.
 	 */
 	spin_lock_irq(&conf->resync_lock);
-	conf->barrier++;
-	conf->nr_waiting++;
+	conf->array_frozen = 1;
 	wait_event_lock_irq_cmd(conf->wait_barrier,
 				conf->nr_pending == conf->nr_queued+extra,
 				conf->resync_lock,
@@ -907,8 +991,7 @@ static void unfreeze_array(struct r1conf *conf)
 {
 	/* reverse the effect of the freeze */
 	spin_lock_irq(&conf->resync_lock);
-	conf->barrier--;
-	conf->nr_waiting--;
+	conf->array_frozen = 0;
 	wake_up(&conf->wait_barrier);
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -1013,6 +1096,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	int first_clone;
 	int sectors_handled;
 	int max_sectors;
+	sector_t start_next_window;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -1042,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 		finish_wait(&conf->wait_barrier, &w);
 	}
 
-	wait_barrier(conf);
+	start_next_window = wait_barrier(conf, bio);
 
 	bitmap = mddev->bitmap;
 
@@ -1163,6 +1247,7 @@ read_again:
1163 1247
1164 disks = conf->raid_disks * 2; 1248 disks = conf->raid_disks * 2;
1165 retry_write: 1249 retry_write:
1250 r1_bio->start_next_window = start_next_window;
1166 blocked_rdev = NULL; 1251 blocked_rdev = NULL;
1167 rcu_read_lock(); 1252 rcu_read_lock();
1168 max_sectors = r1_bio->sectors; 1253 max_sectors = r1_bio->sectors;
@@ -1231,14 +1316,24 @@ read_again:
1231 if (unlikely(blocked_rdev)) { 1316 if (unlikely(blocked_rdev)) {
1232 /* Wait for this device to become unblocked */ 1317 /* Wait for this device to become unblocked */
1233 int j; 1318 int j;
1319 sector_t old = start_next_window;
1234 1320
1235 for (j = 0; j < i; j++) 1321 for (j = 0; j < i; j++)
1236 if (r1_bio->bios[j]) 1322 if (r1_bio->bios[j])
1237 rdev_dec_pending(conf->mirrors[j].rdev, mddev); 1323 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
1238 r1_bio->state = 0; 1324 r1_bio->state = 0;
1239 allow_barrier(conf); 1325 allow_barrier(conf, start_next_window, bio->bi_sector);
1240 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1326 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1241 wait_barrier(conf); 1327 start_next_window = wait_barrier(conf, bio);
1328 /*
1329 * We must make sure that all the r1bios cloned from this bio
1330 * have the same value of bi_phys_segments
1331 */
1332 if (bio->bi_phys_segments && old &&
1333 old != start_next_window)
1334 /* Wait for the former r1bio(s) to complete */
1335 wait_event(conf->wait_barrier,
1336 bio->bi_phys_segments == 1);
1242 goto retry_write; 1337 goto retry_write;
1243 } 1338 }
1244 1339
@@ -1438,11 +1533,14 @@ static void print_conf(struct r1conf *conf)
1438 1533
1439static void close_sync(struct r1conf *conf) 1534static void close_sync(struct r1conf *conf)
1440{ 1535{
1441 wait_barrier(conf); 1536 wait_barrier(conf, NULL);
1442 allow_barrier(conf); 1537 allow_barrier(conf, 0, 0);
1443 1538
1444 mempool_destroy(conf->r1buf_pool); 1539 mempool_destroy(conf->r1buf_pool);
1445 conf->r1buf_pool = NULL; 1540 conf->r1buf_pool = NULL;
1541
1542 conf->next_resync = 0;
1543 conf->start_next_window = MaxSector;
1446} 1544}
1447 1545
1448static int raid1_spare_active(struct mddev *mddev) 1546static int raid1_spare_active(struct mddev *mddev)
@@ -2714,6 +2812,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2714 conf->pending_count = 0; 2812 conf->pending_count = 0;
2715 conf->recovery_disabled = mddev->recovery_disabled - 1; 2813 conf->recovery_disabled = mddev->recovery_disabled - 1;
2716 2814
2815 conf->start_next_window = MaxSector;
2816 conf->current_window_requests = conf->next_window_requests = 0;
2817
2717 err = -EIO; 2818 err = -EIO;
2718 for (i = 0; i < conf->raid_disks * 2; i++) { 2819 for (i = 0; i < conf->raid_disks * 2; i++) {
2719 2820
@@ -2871,8 +2972,8 @@ static int stop(struct mddev *mddev)
2871 atomic_read(&bitmap->behind_writes) == 0); 2972 atomic_read(&bitmap->behind_writes) == 0);
2872 } 2973 }
2873 2974
2874 raise_barrier(conf); 2975 freeze_array(conf, 0);
2875 lower_barrier(conf); 2976 unfreeze_array(conf);
2876 2977
2877 md_unregister_thread(&mddev->thread); 2978 md_unregister_thread(&mddev->thread);
2878 if (conf->r1bio_pool) 2979 if (conf->r1bio_pool)
@@ -3031,10 +3132,10 @@ static void raid1_quiesce(struct mddev *mddev, int state)
3031 wake_up(&conf->wait_barrier); 3132 wake_up(&conf->wait_barrier);
3032 break; 3133 break;
3033 case 1: 3134 case 1:
3034 raise_barrier(conf); 3135 freeze_array(conf, 0);
3035 break; 3136 break;
3036 case 0: 3137 case 0:
3037 lower_barrier(conf); 3138 unfreeze_array(conf);
3038 break; 3139 break;
3039 } 3140 }
3040} 3141}
@@ -3051,7 +3152,8 @@ static void *raid1_takeover(struct mddev *mddev)
3051 mddev->new_chunk_sectors = 0; 3152 mddev->new_chunk_sectors = 0;
3052 conf = setup_conf(mddev); 3153 conf = setup_conf(mddev);
3053 if (!IS_ERR(conf)) 3154 if (!IS_ERR(conf))
3054 conf->barrier = 1; 3155 /* Array must appear to be quiesced */
3156 conf->array_frozen = 1;
3055 return conf; 3157 return conf;
3056 } 3158 }
3057 return ERR_PTR(-EINVAL); 3159 return ERR_PTR(-EINVAL);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 0ff3715fb7eb..9bebca7bff2f 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -41,6 +41,19 @@ struct r1conf {
41 */ 41 */
42 sector_t next_resync; 42 sector_t next_resync;
43 43
44 /* When raid1 starts resync, we divide the array into four partitions
45 * |---------|--------------|---------------------|-------------|
46 * next_resync start_next_window end_window
47 * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE
48 * end_window = start_next_window + NEXT_NORMALIO_DISTANCE
49 * current_window_requests is the count of normal IO between
50 * start_next_window and end_window.
51 * next_window_requests is the count of normal IO after end_window.
52 */
53 sector_t start_next_window;
54 int current_window_requests;
55 int next_window_requests;
56
44 spinlock_t device_lock; 57 spinlock_t device_lock;
45 58
46 /* list of 'struct r1bio' that need to be processed by raid1d, 59 /* list of 'struct r1bio' that need to be processed by raid1d,
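
The partition comment above is easier to check with numbers. Here is a small sketch of the boundary arithmetic; the NEXT_NORMALIO_DISTANCE value is illustrative, since the real constant lives in raid1.c and is not part of this hunk.

    #include <stdio.h>

    #define NEXT_NORMALIO_DISTANCE 3072ULL  /* illustrative, in sectors */

    int main(void)
    {
            unsigned long long next_resync = 10240;
            unsigned long long start_next_window =
                    next_resync + NEXT_NORMALIO_DISTANCE;
            unsigned long long end_window =
                    start_next_window + NEXT_NORMALIO_DISTANCE;

            /* Normal IO in [start_next_window, end_window) is counted in
             * current_window_requests; IO at or past end_window is
             * counted in next_window_requests. */
            printf("resync at %llu, current window [%llu, %llu)\n",
                   next_resync, start_next_window, end_window);
            return 0;
    }
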
@@ -65,6 +78,7 @@ struct r1conf {
65 int nr_waiting; 78 int nr_waiting;
66 int nr_queued; 79 int nr_queued;
67 int barrier; 80 int barrier;
81 int array_frozen;
68 82
69 /* Set to 1 if a full sync is needed, (fresh device added). 83 /* Set to 1 if a full sync is needed, (fresh device added).
70 * Cleared when a sync completes. 84 * Cleared when a sync completes.
@@ -111,6 +125,7 @@ struct r1bio {
111 * in this BehindIO request 125 * in this BehindIO request
112 */ 126 */
113 sector_t sector; 127 sector_t sector;
128 sector_t start_next_window;
114 int sectors; 129 int sectors;
115 unsigned long state; 130 unsigned long state;
116 struct mddev *mddev; 131 struct mddev *mddev;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7c3508abb5e1..c504e8389e69 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4384,7 +4384,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4384 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4384 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4385 md_wakeup_thread(mddev->thread); 4385 md_wakeup_thread(mddev->thread);
4386 wait_event(mddev->sb_wait, mddev->flags == 0 || 4386 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4387 kthread_should_stop()); 4387 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4388 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4389 allow_barrier(conf);
4390 return sectors_done;
4391 }
4388 conf->reshape_safe = mddev->reshape_position; 4392 conf->reshape_safe = mddev->reshape_position;
4389 allow_barrier(conf); 4393 allow_barrier(conf);
4390 } 4394 }
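
Here, and again in the raid5 reshape hunks below, kthread_should_stop() is replaced by a test of MD_RECOVERY_INTR, so an interrupted reshape backs out with its progress instead of relying on the thread being stopped mid-wait. A reduced model of the control flow, with the md machinery collapsed to an enum:

    #include <stdio.h>

    enum wakeup { SB_WRITTEN, RECOVERY_INTR };

    /* Model of the checkpoint: the reshape thread sleeps until either
     * the superblock write completes or recovery is interrupted, and in
     * the second case it drops its barrier and reports progress so far. */
    static long checkpoint(enum wakeup ev, long sectors_done)
    {
            if (ev == RECOVERY_INTR) {
                    /* allow_barrier(conf) in the real code */
                    return sectors_done;
            }
            /* conf->reshape_safe = mddev->reshape_position; keep going */
            return -1;      /* sentinel: continue reshaping */
    }

    int main(void)
    {
            printf("interrupted after %ld sectors\n",
                   checkpoint(RECOVERY_INTR, 512));
            return 0;
    }
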
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7f0e17a27aeb..cc055da02e2a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
85 return &conf->stripe_hashtbl[hash]; 85 return &conf->stripe_hashtbl[hash];
86} 86}
87 87
88static inline int stripe_hash_locks_hash(sector_t sect)
89{
90 return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
91}
92
93static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
94{
95 spin_lock_irq(conf->hash_locks + hash);
96 spin_lock(&conf->device_lock);
97}
98
99static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
100{
101 spin_unlock(&conf->device_lock);
102 spin_unlock_irq(conf->hash_locks + hash);
103}
104
105static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
106{
107 int i;
108 local_irq_disable();
109 spin_lock(conf->hash_locks);
110 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
111 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
112 spin_lock(&conf->device_lock);
113}
114
115static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
116{
117 int i;
118 spin_unlock(&conf->device_lock);
119 for (i = NR_STRIPE_HASH_LOCKS; i; i--)
120 spin_unlock(conf->hash_locks + i - 1);
121 local_irq_enable();
122}
123
88/* bio's attached to a stripe+device for I/O are linked together in bi_sector 124/* bio's attached to a stripe+device for I/O are linked together in bi_sector
89 * order without overlap. There may be several bio's per stripe+device, and 125 * order without overlap. There may be several bio's per stripe+device, and
90 * a bio could span several devices. 126 * a bio could span several devices.
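
A quick model of how sectors map onto the hash locks just introduced. STRIPE_SHIFT is PAGE_SHIFT - 9 in the kernel; the value 3 below assumes 4K pages.

    /* Model of stripe_hash_locks_hash(): consecutive stripes spread
     * round-robin over NR_STRIPE_HASH_LOCKS buckets. */
    #include <stdio.h>

    #define STRIPE_SHIFT 3
    #define NR_STRIPE_HASH_LOCKS 8
    #define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)

    static int stripe_hash_locks_hash(unsigned long long sect)
    {
            return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
    }

    int main(void)
    {
            unsigned long long sect;

            /* Each 8-sector stripe lands on the next bucket in turn. */
            for (sect = 0; sect < 64; sect += 8)
                    printf("sector %2llu -> hash %d\n",
                           sect, stripe_hash_locks_hash(sect));
            return 0;
    }

Note that lock_all_device_hash_locks_irq() above takes the buckets in ascending order with hash_locks[0] as the lockdep reference, so the take-everything path has one consistent order against lock_device_hash_lock().
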
@@ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
249 } 285 }
250} 286}
251 287
252static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) 288static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
289 struct list_head *temp_inactive_list)
253{ 290{
254 BUG_ON(!list_empty(&sh->lru)); 291 BUG_ON(!list_empty(&sh->lru));
255 BUG_ON(atomic_read(&conf->active_stripes)==0); 292 BUG_ON(atomic_read(&conf->active_stripes)==0);
@@ -278,23 +315,68 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
278 < IO_THRESHOLD) 315 < IO_THRESHOLD)
279 md_wakeup_thread(conf->mddev->thread); 316 md_wakeup_thread(conf->mddev->thread);
280 atomic_dec(&conf->active_stripes); 317 atomic_dec(&conf->active_stripes);
281 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 318 if (!test_bit(STRIPE_EXPANDING, &sh->state))
282 list_add_tail(&sh->lru, &conf->inactive_list); 319 list_add_tail(&sh->lru, temp_inactive_list);
283 wake_up(&conf->wait_for_stripe);
284 if (conf->retry_read_aligned)
285 md_wakeup_thread(conf->mddev->thread);
286 }
287 } 320 }
288} 321}
289 322
290static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 323static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
324 struct list_head *temp_inactive_list)
291{ 325{
292 if (atomic_dec_and_test(&sh->count)) 326 if (atomic_dec_and_test(&sh->count))
293 do_release_stripe(conf, sh); 327 do_release_stripe(conf, sh, temp_inactive_list);
328}
329
330/*
331 * @hash may be NR_STRIPE_HASH_LOCKS, in which case temp_inactive_list
332 * holds one staging list per hash bucket.
333 * Be careful: only one task can add/delete stripes from temp_inactive_list
334 * at a given time. Adding stripes only takes the device lock, while
335 * deleting stripes only takes the hash lock.
336 */
337static void release_inactive_stripe_list(struct r5conf *conf,
338 struct list_head *temp_inactive_list,
339 int hash)
340{
341 int size;
342 bool do_wakeup = false;
343 unsigned long flags;
344
345 if (hash == NR_STRIPE_HASH_LOCKS) {
346 size = NR_STRIPE_HASH_LOCKS;
347 hash = NR_STRIPE_HASH_LOCKS - 1;
348 } else
349 size = 1;
350 while (size) {
351 struct list_head *list = &temp_inactive_list[size - 1];
352
353 /*
354 * We don't hold any lock here yet, so get_active_stripe() might
355 * remove stripes from the list
356 */
357 if (!list_empty_careful(list)) {
358 spin_lock_irqsave(conf->hash_locks + hash, flags);
359 if (list_empty(conf->inactive_list + hash) &&
360 !list_empty(list))
361 atomic_dec(&conf->empty_inactive_list_nr);
362 list_splice_tail_init(list, conf->inactive_list + hash);
363 do_wakeup = true;
364 spin_unlock_irqrestore(conf->hash_locks + hash, flags);
365 }
366 size--;
367 hash--;
368 }
369
370 if (do_wakeup) {
371 wake_up(&conf->wait_for_stripe);
372 if (conf->retry_read_aligned)
373 md_wakeup_thread(conf->mddev->thread);
374 }
294} 375}
295 376
296/* should hold conf->device_lock already */ 377/* should hold conf->device_lock already */
297static int release_stripe_list(struct r5conf *conf) 378static int release_stripe_list(struct r5conf *conf,
379 struct list_head *temp_inactive_list)
298{ 380{
299 struct stripe_head *sh; 381 struct stripe_head *sh;
300 int count = 0; 382 int count = 0;
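
The comment above states the ownership rule; the splice step itself is worth seeing in isolation. Below is a sketch with the lists reduced to counters, relying on the convention visible in this patch that empty_inactive_list_nr counts how many shared inactive lists are currently empty.

    #include <stdio.h>

    #define NR_STRIPE_HASH_LOCKS 8

    static int inactive[NR_STRIPE_HASH_LOCKS]; /* conf->inactive_list */
    static int empty_inactive_list_nr = NR_STRIPE_HASH_LOCKS;

    /* Model of release_inactive_stripe_list(): drain each per-bucket
     * staging list into the shared list under only that bucket's hash
     * lock, tracking the empty -> non-empty transition. */
    static int release_inactive(int *temp)
    {
            int hash, do_wakeup = 0;

            for (hash = NR_STRIPE_HASH_LOCKS - 1; hash >= 0; hash--) {
                    if (!temp[hash])
                            continue;
                    /* spin_lock_irqsave(conf->hash_locks + hash, flags); */
                    if (!inactive[hash])
                            empty_inactive_list_nr--;
                    inactive[hash] += temp[hash]; /* list_splice_tail_init */
                    temp[hash] = 0;
                    do_wakeup = 1;
                    /* spin_unlock_irqrestore(...); */
            }
            return do_wakeup;       /* caller wakes wait_for_stripe */
    }

    int main(void)
    {
            int temp[NR_STRIPE_HASH_LOCKS] = { [2] = 3, [5] = 1 };

            if (release_inactive(temp))
                    printf("non-empty lists: %d\n",
                           NR_STRIPE_HASH_LOCKS - empty_inactive_list_nr);
            return 0;
    }
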
@@ -303,6 +385,8 @@ static int release_stripe_list(struct r5conf *conf)
303 head = llist_del_all(&conf->released_stripes); 385 head = llist_del_all(&conf->released_stripes);
304 head = llist_reverse_order(head); 386 head = llist_reverse_order(head);
305 while (head) { 387 while (head) {
388 int hash;
389
306 sh = llist_entry(head, struct stripe_head, release_list); 390 sh = llist_entry(head, struct stripe_head, release_list);
307 head = llist_next(head); 391 head = llist_next(head);
308 /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */ 392 /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
@@ -313,7 +397,8 @@ static int release_stripe_list(struct r5conf *conf)
313 * again, the count is always > 1. This is true for 397 * again, the count is always > 1. This is true for
314 * STRIPE_ON_UNPLUG_LIST bit too. 398 * STRIPE_ON_UNPLUG_LIST bit too.
315 */ 399 */
316 __release_stripe(conf, sh); 400 hash = sh->hash_lock_index;
401 __release_stripe(conf, sh, &temp_inactive_list[hash]);
317 count++; 402 count++;
318 } 403 }
319 404
@@ -324,9 +409,12 @@ static void release_stripe(struct stripe_head *sh)
324{ 409{
325 struct r5conf *conf = sh->raid_conf; 410 struct r5conf *conf = sh->raid_conf;
326 unsigned long flags; 411 unsigned long flags;
412 struct list_head list;
413 int hash;
327 bool wakeup; 414 bool wakeup;
328 415
329 if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) 416 if (unlikely(!conf->mddev->thread) ||
417 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
330 goto slow_path; 418 goto slow_path;
331 wakeup = llist_add(&sh->release_list, &conf->released_stripes); 419 wakeup = llist_add(&sh->release_list, &conf->released_stripes);
332 if (wakeup) 420 if (wakeup)
@@ -336,8 +424,11 @@ slow_path:
336 local_irq_save(flags); 424 local_irq_save(flags);
337 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 425 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
338 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 426 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
339 do_release_stripe(conf, sh); 427 INIT_LIST_HEAD(&list);
428 hash = sh->hash_lock_index;
429 do_release_stripe(conf, sh, &list);
340 spin_unlock(&conf->device_lock); 430 spin_unlock(&conf->device_lock);
431 release_inactive_stripe_list(conf, &list, hash);
341 } 432 }
342 local_irq_restore(flags); 433 local_irq_restore(flags);
343} 434}
@@ -362,18 +453,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
362 453
363 454
364/* find an idle stripe, make sure it is unhashed, and return it. */ 455/* find an idle stripe, make sure it is unhashed, and return it. */
365static struct stripe_head *get_free_stripe(struct r5conf *conf) 456static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
366{ 457{
367 struct stripe_head *sh = NULL; 458 struct stripe_head *sh = NULL;
368 struct list_head *first; 459 struct list_head *first;
369 460
370 if (list_empty(&conf->inactive_list)) 461 if (list_empty(conf->inactive_list + hash))
371 goto out; 462 goto out;
372 first = conf->inactive_list.next; 463 first = (conf->inactive_list + hash)->next;
373 sh = list_entry(first, struct stripe_head, lru); 464 sh = list_entry(first, struct stripe_head, lru);
374 list_del_init(first); 465 list_del_init(first);
375 remove_hash(sh); 466 remove_hash(sh);
376 atomic_inc(&conf->active_stripes); 467 atomic_inc(&conf->active_stripes);
468 BUG_ON(hash != sh->hash_lock_index);
469 if (list_empty(conf->inactive_list + hash))
470 atomic_inc(&conf->empty_inactive_list_nr);
377out: 471out:
378 return sh; 472 return sh;
379} 473}
@@ -416,7 +510,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
416static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 510static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
417{ 511{
418 struct r5conf *conf = sh->raid_conf; 512 struct r5conf *conf = sh->raid_conf;
419 int i; 513 int i, seq;
420 514
421 BUG_ON(atomic_read(&sh->count) != 0); 515 BUG_ON(atomic_read(&sh->count) != 0);
422 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 516 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
@@ -426,7 +520,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
426 (unsigned long long)sh->sector); 520 (unsigned long long)sh->sector);
427 521
428 remove_hash(sh); 522 remove_hash(sh);
429 523retry:
524 seq = read_seqcount_begin(&conf->gen_lock);
430 sh->generation = conf->generation - previous; 525 sh->generation = conf->generation - previous;
431 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 526 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
432 sh->sector = sector; 527 sh->sector = sector;
@@ -448,6 +543,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
448 dev->flags = 0; 543 dev->flags = 0;
449 raid5_build_block(sh, i, previous); 544 raid5_build_block(sh, i, previous);
450 } 545 }
546 if (read_seqcount_retry(&conf->gen_lock, seq))
547 goto retry;
451 insert_hash(conf, sh); 548 insert_hash(conf, sh);
452 sh->cpu = smp_processor_id(); 549 sh->cpu = smp_processor_id();
453} 550}
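
init_stripe() now snapshots conf->gen_lock and redoes the initialisation if a reshape changed the generation underneath it. Here is the read side of that pattern, modeled with a bare sequence counter; the kernel's seqcount_t additionally provides the memory barriers and lockdep tracking omitted here.

    #include <stdio.h>

    static unsigned gen_seq;        /* stands in for conf->gen_lock */
    static int generation;          /* stands in for conf->generation */

    /* Readers retry if a write was in progress or the count changed. */
    static unsigned read_begin(void) { return gen_seq & ~1u; }
    static int read_retry(unsigned seq) { return gen_seq != seq; }

    static int init_stripe_generation(void)
    {
            unsigned seq;
            int gen;

    retry:
            seq = read_begin();
            gen = generation;       /* ... plus disks, sector, etc. */
            if (read_retry(seq))
                    goto retry;
            return gen;
    }

    int main(void)
    {
            generation = 4;
            printf("stripe generation %d\n", init_stripe_generation());
            return 0;
    }
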
@@ -552,57 +649,59 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
552 int previous, int noblock, int noquiesce) 649 int previous, int noblock, int noquiesce)
553{ 650{
554 struct stripe_head *sh; 651 struct stripe_head *sh;
652 int hash = stripe_hash_locks_hash(sector);
555 653
556 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 654 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
557 655
558 spin_lock_irq(&conf->device_lock); 656 spin_lock_irq(conf->hash_locks + hash);
559 657
560 do { 658 do {
561 wait_event_lock_irq(conf->wait_for_stripe, 659 wait_event_lock_irq(conf->wait_for_stripe,
562 conf->quiesce == 0 || noquiesce, 660 conf->quiesce == 0 || noquiesce,
563 conf->device_lock); 661 *(conf->hash_locks + hash));
564 sh = __find_stripe(conf, sector, conf->generation - previous); 662 sh = __find_stripe(conf, sector, conf->generation - previous);
565 if (!sh) { 663 if (!sh) {
566 if (!conf->inactive_blocked) 664 if (!conf->inactive_blocked)
567 sh = get_free_stripe(conf); 665 sh = get_free_stripe(conf, hash);
568 if (noblock && sh == NULL) 666 if (noblock && sh == NULL)
569 break; 667 break;
570 if (!sh) { 668 if (!sh) {
571 conf->inactive_blocked = 1; 669 conf->inactive_blocked = 1;
572 wait_event_lock_irq(conf->wait_for_stripe, 670 wait_event_lock_irq(
573 !list_empty(&conf->inactive_list) && 671 conf->wait_for_stripe,
574 (atomic_read(&conf->active_stripes) 672 !list_empty(conf->inactive_list + hash) &&
575 < (conf->max_nr_stripes *3/4) 673 (atomic_read(&conf->active_stripes)
576 || !conf->inactive_blocked), 674 < (conf->max_nr_stripes * 3 / 4)
577 conf->device_lock); 675 || !conf->inactive_blocked),
676 *(conf->hash_locks + hash));
578 conf->inactive_blocked = 0; 677 conf->inactive_blocked = 0;
579 } else 678 } else
580 init_stripe(sh, sector, previous); 679 init_stripe(sh, sector, previous);
581 } else { 680 } else {
681 spin_lock(&conf->device_lock);
582 if (atomic_read(&sh->count)) { 682 if (atomic_read(&sh->count)) {
583 BUG_ON(!list_empty(&sh->lru) 683 BUG_ON(!list_empty(&sh->lru)
584 && !test_bit(STRIPE_EXPANDING, &sh->state) 684 && !test_bit(STRIPE_EXPANDING, &sh->state)
585 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) 685 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
586 && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); 686 );
587 } else { 687 } else {
588 if (!test_bit(STRIPE_HANDLE, &sh->state)) 688 if (!test_bit(STRIPE_HANDLE, &sh->state))
589 atomic_inc(&conf->active_stripes); 689 atomic_inc(&conf->active_stripes);
590 if (list_empty(&sh->lru) && 690 BUG_ON(list_empty(&sh->lru));
591 !test_bit(STRIPE_EXPANDING, &sh->state))
592 BUG();
593 list_del_init(&sh->lru); 691 list_del_init(&sh->lru);
594 if (sh->group) { 692 if (sh->group) {
595 sh->group->stripes_cnt--; 693 sh->group->stripes_cnt--;
596 sh->group = NULL; 694 sh->group = NULL;
597 } 695 }
598 } 696 }
697 spin_unlock(&conf->device_lock);
599 } 698 }
600 } while (sh == NULL); 699 } while (sh == NULL);
601 700
602 if (sh) 701 if (sh)
603 atomic_inc(&sh->count); 702 atomic_inc(&sh->count);
604 703
605 spin_unlock_irq(&conf->device_lock); 704 spin_unlock_irq(conf->hash_locks + hash);
606 return sh; 705 return sh;
607} 706}
608 707
@@ -758,7 +857,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
758 bi->bi_sector = (sh->sector 857 bi->bi_sector = (sh->sector
759 + rdev->data_offset); 858 + rdev->data_offset);
760 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 859 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
761 bi->bi_rw |= REQ_FLUSH; 860 bi->bi_rw |= REQ_NOMERGE;
762 861
763 bi->bi_vcnt = 1; 862 bi->bi_vcnt = 1;
764 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 863 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -1582,7 +1681,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1582 put_cpu(); 1681 put_cpu();
1583} 1682}
1584 1683
1585static int grow_one_stripe(struct r5conf *conf) 1684static int grow_one_stripe(struct r5conf *conf, int hash)
1586{ 1685{
1587 struct stripe_head *sh; 1686 struct stripe_head *sh;
1588 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); 1687 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
@@ -1598,6 +1697,7 @@ static int grow_one_stripe(struct r5conf *conf)
1598 kmem_cache_free(conf->slab_cache, sh); 1697 kmem_cache_free(conf->slab_cache, sh);
1599 return 0; 1698 return 0;
1600 } 1699 }
1700 sh->hash_lock_index = hash;
1601 /* we just created an active stripe so... */ 1701 /* we just created an active stripe so... */
1602 atomic_set(&sh->count, 1); 1702 atomic_set(&sh->count, 1);
1603 atomic_inc(&conf->active_stripes); 1703 atomic_inc(&conf->active_stripes);
@@ -1610,6 +1710,7 @@ static int grow_stripes(struct r5conf *conf, int num)
1610{ 1710{
1611 struct kmem_cache *sc; 1711 struct kmem_cache *sc;
1612 int devs = max(conf->raid_disks, conf->previous_raid_disks); 1712 int devs = max(conf->raid_disks, conf->previous_raid_disks);
1713 int hash;
1613 1714
1614 if (conf->mddev->gendisk) 1715 if (conf->mddev->gendisk)
1615 sprintf(conf->cache_name[0], 1716 sprintf(conf->cache_name[0],
@@ -1627,9 +1728,13 @@ static int grow_stripes(struct r5conf *conf, int num)
1627 return 1; 1728 return 1;
1628 conf->slab_cache = sc; 1729 conf->slab_cache = sc;
1629 conf->pool_size = devs; 1730 conf->pool_size = devs;
1630 while (num--) 1731 hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
1631 if (!grow_one_stripe(conf)) 1732 while (num--) {
1733 if (!grow_one_stripe(conf, hash))
1632 return 1; 1734 return 1;
1735 conf->max_nr_stripes++;
1736 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
1737 }
1633 return 0; 1738 return 0;
1634} 1739}
1635 1740
@@ -1687,6 +1792,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1687 int err; 1792 int err;
1688 struct kmem_cache *sc; 1793 struct kmem_cache *sc;
1689 int i; 1794 int i;
1795 int hash, cnt;
1690 1796
1691 if (newsize <= conf->pool_size) 1797 if (newsize <= conf->pool_size)
1692 return 0; /* never bother to shrink */ 1798 return 0; /* never bother to shrink */
@@ -1726,19 +1832,29 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1726 * OK, we have enough stripes, start collecting inactive 1832 * OK, we have enough stripes, start collecting inactive
1727 * stripes and copying them over 1833 * stripes and copying them over
1728 */ 1834 */
1835 hash = 0;
1836 cnt = 0;
1729 list_for_each_entry(nsh, &newstripes, lru) { 1837 list_for_each_entry(nsh, &newstripes, lru) {
1730 spin_lock_irq(&conf->device_lock); 1838 lock_device_hash_lock(conf, hash);
1731 wait_event_lock_irq(conf->wait_for_stripe, 1839 wait_event_cmd(conf->wait_for_stripe,
1732 !list_empty(&conf->inactive_list), 1840 !list_empty(conf->inactive_list + hash),
1733 conf->device_lock); 1841 unlock_device_hash_lock(conf, hash),
1734 osh = get_free_stripe(conf); 1842 lock_device_hash_lock(conf, hash));
1735 spin_unlock_irq(&conf->device_lock); 1843 osh = get_free_stripe(conf, hash);
1844 unlock_device_hash_lock(conf, hash);
1736 atomic_set(&nsh->count, 1); 1845 atomic_set(&nsh->count, 1);
1737 for(i=0; i<conf->pool_size; i++) 1846 for(i=0; i<conf->pool_size; i++)
1738 nsh->dev[i].page = osh->dev[i].page; 1847 nsh->dev[i].page = osh->dev[i].page;
1739 for( ; i<newsize; i++) 1848 for( ; i<newsize; i++)
1740 nsh->dev[i].page = NULL; 1849 nsh->dev[i].page = NULL;
1850 nsh->hash_lock_index = hash;
1741 kmem_cache_free(conf->slab_cache, osh); 1851 kmem_cache_free(conf->slab_cache, osh);
1852 cnt++;
1853 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
1854 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
1855 hash++;
1856 cnt = 0;
1857 }
1742 } 1858 }
1743 kmem_cache_destroy(conf->slab_cache); 1859 kmem_cache_destroy(conf->slab_cache);
1744 1860
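
The cnt/hash stepping above gives each hash bucket either floor(max_nr_stripes / NR_STRIPE_HASH_LOCKS) stripes or one more, with the remainder going to the lowest-numbered buckets. The quota expression is easy to verify standalone:

    #include <assert.h>
    #include <stdio.h>

    #define NR_STRIPE_HASH_LOCKS 8

    /* Buckets below the remainder get one extra stripe, so the quotas
     * sum to exactly max_nr_stripes. */
    static int bucket_quota(int max_nr_stripes, int hash)
    {
            return max_nr_stripes / NR_STRIPE_HASH_LOCKS +
                   !!((max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash);
    }

    int main(void)
    {
            int max_nr_stripes = 259;       /* illustrative value */
            int hash, total = 0;

            for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++) {
                    printf("hash %d: %d stripes\n",
                           hash, bucket_quota(max_nr_stripes, hash));
                    total += bucket_quota(max_nr_stripes, hash);
            }
            assert(total == max_nr_stripes); /* nothing lost or doubled */
            return 0;
    }
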
@@ -1797,13 +1913,13 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1797 return err; 1913 return err;
1798} 1914}
1799 1915
1800static int drop_one_stripe(struct r5conf *conf) 1916static int drop_one_stripe(struct r5conf *conf, int hash)
1801{ 1917{
1802 struct stripe_head *sh; 1918 struct stripe_head *sh;
1803 1919
1804 spin_lock_irq(&conf->device_lock); 1920 spin_lock_irq(conf->hash_locks + hash);
1805 sh = get_free_stripe(conf); 1921 sh = get_free_stripe(conf, hash);
1806 spin_unlock_irq(&conf->device_lock); 1922 spin_unlock_irq(conf->hash_locks + hash);
1807 if (!sh) 1923 if (!sh)
1808 return 0; 1924 return 0;
1809 BUG_ON(atomic_read(&sh->count)); 1925 BUG_ON(atomic_read(&sh->count));
@@ -1815,8 +1931,10 @@ static int drop_one_stripe(struct r5conf *conf)
1815 1931
1816static void shrink_stripes(struct r5conf *conf) 1932static void shrink_stripes(struct r5conf *conf)
1817{ 1933{
1818 while (drop_one_stripe(conf)) 1934 int hash;
1819 ; 1935 for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
1936 while (drop_one_stripe(conf, hash))
1937 ;
1820 1938
1821 if (conf->slab_cache) 1939 if (conf->slab_cache)
1822 kmem_cache_destroy(conf->slab_cache); 1940 kmem_cache_destroy(conf->slab_cache);
@@ -1921,6 +2039,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
1921 mdname(conf->mddev), bdn); 2039 mdname(conf->mddev), bdn);
1922 else 2040 else
1923 retry = 1; 2041 retry = 1;
2042 if (set_bad && test_bit(In_sync, &rdev->flags)
2043 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2044 retry = 1;
1924 if (retry) 2045 if (retry)
1925 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2046 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
1926 set_bit(R5_ReadError, &sh->dev[i].flags); 2047 set_bit(R5_ReadError, &sh->dev[i].flags);
@@ -3900,7 +4021,8 @@ static void raid5_activate_delayed(struct r5conf *conf)
3900 } 4021 }
3901} 4022}
3902 4023
3903static void activate_bit_delay(struct r5conf *conf) 4024static void activate_bit_delay(struct r5conf *conf,
4025 struct list_head *temp_inactive_list)
3904{ 4026{
3905 /* device_lock is held */ 4027 /* device_lock is held */
3906 struct list_head head; 4028 struct list_head head;
@@ -3908,9 +4030,11 @@ static void activate_bit_delay(struct r5conf *conf)
3908 list_del_init(&conf->bitmap_list); 4030 list_del_init(&conf->bitmap_list);
3909 while (!list_empty(&head)) { 4031 while (!list_empty(&head)) {
3910 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 4032 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
4033 int hash;
3911 list_del_init(&sh->lru); 4034 list_del_init(&sh->lru);
3912 atomic_inc(&sh->count); 4035 atomic_inc(&sh->count);
3913 __release_stripe(conf, sh); 4036 hash = sh->hash_lock_index;
4037 __release_stripe(conf, sh, &temp_inactive_list[hash]);
3914 } 4038 }
3915} 4039}
3916 4040
@@ -3926,7 +4050,7 @@ int md_raid5_congested(struct mddev *mddev, int bits)
3926 return 1; 4050 return 1;
3927 if (conf->quiesce) 4051 if (conf->quiesce)
3928 return 1; 4052 return 1;
3929 if (list_empty_careful(&conf->inactive_list)) 4053 if (atomic_read(&conf->empty_inactive_list_nr))
3930 return 1; 4054 return 1;
3931 4055
3932 return 0; 4056 return 0;
@@ -4256,6 +4380,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
4256struct raid5_plug_cb { 4380struct raid5_plug_cb {
4257 struct blk_plug_cb cb; 4381 struct blk_plug_cb cb;
4258 struct list_head list; 4382 struct list_head list;
4383 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
4259}; 4384};
4260 4385
4261static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 4386static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
@@ -4266,6 +4391,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4266 struct mddev *mddev = cb->cb.data; 4391 struct mddev *mddev = cb->cb.data;
4267 struct r5conf *conf = mddev->private; 4392 struct r5conf *conf = mddev->private;
4268 int cnt = 0; 4393 int cnt = 0;
4394 int hash;
4269 4395
4270 if (cb->list.next && !list_empty(&cb->list)) { 4396 if (cb->list.next && !list_empty(&cb->list)) {
4271 spin_lock_irq(&conf->device_lock); 4397 spin_lock_irq(&conf->device_lock);
@@ -4283,11 +4409,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4283 * STRIPE_ON_RELEASE_LIST could be set here. In that 4409 * STRIPE_ON_RELEASE_LIST could be set here. In that
4284 * case, the count is always > 1 here 4410 * case, the count is always > 1 here
4285 */ 4411 */
4286 __release_stripe(conf, sh); 4412 hash = sh->hash_lock_index;
4413 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
4287 cnt++; 4414 cnt++;
4288 } 4415 }
4289 spin_unlock_irq(&conf->device_lock); 4416 spin_unlock_irq(&conf->device_lock);
4290 } 4417 }
4418 release_inactive_stripe_list(conf, cb->temp_inactive_list,
4419 NR_STRIPE_HASH_LOCKS);
4291 if (mddev->queue) 4420 if (mddev->queue)
4292 trace_block_unplug(mddev->queue, cnt, !from_schedule); 4421 trace_block_unplug(mddev->queue, cnt, !from_schedule);
4293 kfree(cb); 4422 kfree(cb);
@@ -4308,8 +4437,12 @@ static void release_stripe_plug(struct mddev *mddev,
4308 4437
4309 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 4438 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
4310 4439
4311 if (cb->list.next == NULL) 4440 if (cb->list.next == NULL) {
4441 int i;
4312 INIT_LIST_HEAD(&cb->list); 4442 INIT_LIST_HEAD(&cb->list);
4443 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
4444 INIT_LIST_HEAD(cb->temp_inactive_list + i);
4445 }
4313 4446
4314 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 4447 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
4315 list_add_tail(&sh->lru, &cb->list); 4448 list_add_tail(&sh->lru, &cb->list);
@@ -4692,14 +4825,19 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4692 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4825 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4693 /* Cannot proceed until we've updated the superblock... */ 4826 /* Cannot proceed until we've updated the superblock... */
4694 wait_event(conf->wait_for_overlap, 4827 wait_event(conf->wait_for_overlap,
4695 atomic_read(&conf->reshape_stripes)==0); 4828 atomic_read(&conf->reshape_stripes)==0
4829 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4830 if (atomic_read(&conf->reshape_stripes) != 0)
4831 return 0;
4696 mddev->reshape_position = conf->reshape_progress; 4832 mddev->reshape_position = conf->reshape_progress;
4697 mddev->curr_resync_completed = sector_nr; 4833 mddev->curr_resync_completed = sector_nr;
4698 conf->reshape_checkpoint = jiffies; 4834 conf->reshape_checkpoint = jiffies;
4699 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4835 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4700 md_wakeup_thread(mddev->thread); 4836 md_wakeup_thread(mddev->thread);
4701 wait_event(mddev->sb_wait, mddev->flags == 0 || 4837 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4702 kthread_should_stop()); 4838 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4839 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4840 return 0;
4703 spin_lock_irq(&conf->device_lock); 4841 spin_lock_irq(&conf->device_lock);
4704 conf->reshape_safe = mddev->reshape_position; 4842 conf->reshape_safe = mddev->reshape_position;
4705 spin_unlock_irq(&conf->device_lock); 4843 spin_unlock_irq(&conf->device_lock);
@@ -4782,7 +4920,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4782 >= mddev->resync_max - mddev->curr_resync_completed) { 4920 >= mddev->resync_max - mddev->curr_resync_completed) {
4783 /* Cannot proceed until we've updated the superblock... */ 4921 /* Cannot proceed until we've updated the superblock... */
4784 wait_event(conf->wait_for_overlap, 4922 wait_event(conf->wait_for_overlap,
4785 atomic_read(&conf->reshape_stripes) == 0); 4923 atomic_read(&conf->reshape_stripes) == 0
4924 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4925 if (atomic_read(&conf->reshape_stripes) != 0)
4926 goto ret;
4786 mddev->reshape_position = conf->reshape_progress; 4927 mddev->reshape_position = conf->reshape_progress;
4787 mddev->curr_resync_completed = sector_nr; 4928 mddev->curr_resync_completed = sector_nr;
4788 conf->reshape_checkpoint = jiffies; 4929 conf->reshape_checkpoint = jiffies;
@@ -4790,13 +4931,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4790 md_wakeup_thread(mddev->thread); 4931 md_wakeup_thread(mddev->thread);
4791 wait_event(mddev->sb_wait, 4932 wait_event(mddev->sb_wait,
4792 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4933 !test_bit(MD_CHANGE_DEVS, &mddev->flags)
4793 || kthread_should_stop()); 4934 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4935 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4936 goto ret;
4794 spin_lock_irq(&conf->device_lock); 4937 spin_lock_irq(&conf->device_lock);
4795 conf->reshape_safe = mddev->reshape_position; 4938 conf->reshape_safe = mddev->reshape_position;
4796 spin_unlock_irq(&conf->device_lock); 4939 spin_unlock_irq(&conf->device_lock);
4797 wake_up(&conf->wait_for_overlap); 4940 wake_up(&conf->wait_for_overlap);
4798 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4941 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4799 } 4942 }
4943ret:
4800 return reshape_sectors; 4944 return reshape_sectors;
4801} 4945}
4802 4946
@@ -4954,27 +5098,45 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4954} 5098}
4955 5099
4956static int handle_active_stripes(struct r5conf *conf, int group, 5100static int handle_active_stripes(struct r5conf *conf, int group,
4957 struct r5worker *worker) 5101 struct r5worker *worker,
5102 struct list_head *temp_inactive_list)
4958{ 5103{
4959 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 5104 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
4960 int i, batch_size = 0; 5105 int i, batch_size = 0, hash;
5106 bool release_inactive = false;
4961 5107
4962 while (batch_size < MAX_STRIPE_BATCH && 5108 while (batch_size < MAX_STRIPE_BATCH &&
4963 (sh = __get_priority_stripe(conf, group)) != NULL) 5109 (sh = __get_priority_stripe(conf, group)) != NULL)
4964 batch[batch_size++] = sh; 5110 batch[batch_size++] = sh;
4965 5111
4966 if (batch_size == 0) 5112 if (batch_size == 0) {
4967 return batch_size; 5113 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5114 if (!list_empty(temp_inactive_list + i))
5115 break;
5116 if (i == NR_STRIPE_HASH_LOCKS)
5117 return batch_size;
5118 release_inactive = true;
5119 }
4968 spin_unlock_irq(&conf->device_lock); 5120 spin_unlock_irq(&conf->device_lock);
4969 5121
5122 release_inactive_stripe_list(conf, temp_inactive_list,
5123 NR_STRIPE_HASH_LOCKS);
5124
5125 if (release_inactive) {
5126 spin_lock_irq(&conf->device_lock);
5127 return 0;
5128 }
5129
4970 for (i = 0; i < batch_size; i++) 5130 for (i = 0; i < batch_size; i++)
4971 handle_stripe(batch[i]); 5131 handle_stripe(batch[i]);
4972 5132
4973 cond_resched(); 5133 cond_resched();
4974 5134
4975 spin_lock_irq(&conf->device_lock); 5135 spin_lock_irq(&conf->device_lock);
4976 for (i = 0; i < batch_size; i++) 5136 for (i = 0; i < batch_size; i++) {
4977 __release_stripe(conf, batch[i]); 5137 hash = batch[i]->hash_lock_index;
5138 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
5139 }
4978 return batch_size; 5140 return batch_size;
4979} 5141}
4980 5142
@@ -4995,9 +5157,10 @@ static void raid5_do_work(struct work_struct *work)
4995 while (1) { 5157 while (1) {
4996 int batch_size, released; 5158 int batch_size, released;
4997 5159
4998 released = release_stripe_list(conf); 5160 released = release_stripe_list(conf, worker->temp_inactive_list);
4999 5161
5000 batch_size = handle_active_stripes(conf, group_id, worker); 5162 batch_size = handle_active_stripes(conf, group_id, worker,
5163 worker->temp_inactive_list);
5001 worker->working = false; 5164 worker->working = false;
5002 if (!batch_size && !released) 5165 if (!batch_size && !released)
5003 break; 5166 break;
@@ -5036,7 +5199,7 @@ static void raid5d(struct md_thread *thread)
5036 struct bio *bio; 5199 struct bio *bio;
5037 int batch_size, released; 5200 int batch_size, released;
5038 5201
5039 released = release_stripe_list(conf); 5202 released = release_stripe_list(conf, conf->temp_inactive_list);
5040 5203
5041 if ( 5204 if (
5042 !list_empty(&conf->bitmap_list)) { 5205 !list_empty(&conf->bitmap_list)) {
@@ -5046,7 +5209,7 @@ static void raid5d(struct md_thread *thread)
5046 bitmap_unplug(mddev->bitmap); 5209 bitmap_unplug(mddev->bitmap);
5047 spin_lock_irq(&conf->device_lock); 5210 spin_lock_irq(&conf->device_lock);
5048 conf->seq_write = conf->seq_flush; 5211 conf->seq_write = conf->seq_flush;
5049 activate_bit_delay(conf); 5212 activate_bit_delay(conf, conf->temp_inactive_list);
5050 } 5213 }
5051 raid5_activate_delayed(conf); 5214 raid5_activate_delayed(conf);
5052 5215
@@ -5060,7 +5223,8 @@ static void raid5d(struct md_thread *thread)
5060 handled++; 5223 handled++;
5061 } 5224 }
5062 5225
5063 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); 5226 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
5227 conf->temp_inactive_list);
5064 if (!batch_size && !released) 5228 if (!batch_size && !released)
5065 break; 5229 break;
5066 handled += batch_size; 5230 handled += batch_size;
@@ -5096,22 +5260,29 @@ raid5_set_cache_size(struct mddev *mddev, int size)
5096{ 5260{
5097 struct r5conf *conf = mddev->private; 5261 struct r5conf *conf = mddev->private;
5098 int err; 5262 int err;
5263 int hash;
5099 5264
5100 if (size <= 16 || size > 32768) 5265 if (size <= 16 || size > 32768)
5101 return -EINVAL; 5266 return -EINVAL;
5267 hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
5102 while (size < conf->max_nr_stripes) { 5268 while (size < conf->max_nr_stripes) {
5103 if (drop_one_stripe(conf)) 5269 if (drop_one_stripe(conf, hash))
5104 conf->max_nr_stripes--; 5270 conf->max_nr_stripes--;
5105 else 5271 else
5106 break; 5272 break;
5273 hash--;
5274 if (hash < 0)
5275 hash = NR_STRIPE_HASH_LOCKS - 1;
5107 } 5276 }
5108 err = md_allow_write(mddev); 5277 err = md_allow_write(mddev);
5109 if (err) 5278 if (err)
5110 return err; 5279 return err;
5280 hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
5111 while (size > conf->max_nr_stripes) { 5281 while (size > conf->max_nr_stripes) {
5112 if (grow_one_stripe(conf)) 5282 if (grow_one_stripe(conf, hash))
5113 conf->max_nr_stripes++; 5283 conf->max_nr_stripes++;
5114 else break; 5284 else break;
5285 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
5115 } 5286 }
5116 return 0; 5287 return 0;
5117} 5288}
@@ -5199,15 +5370,18 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
5199 return 0; 5370 return 0;
5200} 5371}
5201 5372
5202static int alloc_thread_groups(struct r5conf *conf, int cnt); 5373static int alloc_thread_groups(struct r5conf *conf, int cnt,
5374 int *group_cnt,
5375 int *worker_cnt_per_group,
5376 struct r5worker_group **worker_groups);
5203static ssize_t 5377static ssize_t
5204raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 5378raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
5205{ 5379{
5206 struct r5conf *conf = mddev->private; 5380 struct r5conf *conf = mddev->private;
5207 unsigned long new; 5381 unsigned long new;
5208 int err; 5382 int err;
5209 struct r5worker_group *old_groups; 5383 struct r5worker_group *new_groups, *old_groups;
5210 int old_group_cnt; 5384 int group_cnt, worker_cnt_per_group;
5211 5385
5212 if (len >= PAGE_SIZE) 5386 if (len >= PAGE_SIZE)
5213 return -EINVAL; 5387 return -EINVAL;
@@ -5223,14 +5397,19 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
5223 mddev_suspend(mddev); 5397 mddev_suspend(mddev);
5224 5398
5225 old_groups = conf->worker_groups; 5399 old_groups = conf->worker_groups;
5226 old_group_cnt = conf->worker_cnt_per_group; 5400 if (old_groups)
5401 flush_workqueue(raid5_wq);
5402
5403 err = alloc_thread_groups(conf, new,
5404 &group_cnt, &worker_cnt_per_group,
5405 &new_groups);
5406 if (!err) {
5407 spin_lock_irq(&conf->device_lock);
5408 conf->group_cnt = group_cnt;
5409 conf->worker_cnt_per_group = worker_cnt_per_group;
5410 conf->worker_groups = new_groups;
5411 spin_unlock_irq(&conf->device_lock);
5227 5412
5228 conf->worker_groups = NULL;
5229 err = alloc_thread_groups(conf, new);
5230 if (err) {
5231 conf->worker_groups = old_groups;
5232 conf->worker_cnt_per_group = old_group_cnt;
5233 } else {
5234 if (old_groups) 5413 if (old_groups)
5235 kfree(old_groups[0].workers); 5414 kfree(old_groups[0].workers);
5236 kfree(old_groups); 5415 kfree(old_groups);
@@ -5260,40 +5439,47 @@ static struct attribute_group raid5_attrs_group = {
5260 .attrs = raid5_attrs, 5439 .attrs = raid5_attrs,
5261}; 5440};
5262 5441
5263static int alloc_thread_groups(struct r5conf *conf, int cnt) 5442static int alloc_thread_groups(struct r5conf *conf, int cnt,
5443 int *group_cnt,
5444 int *worker_cnt_per_group,
5445 struct r5worker_group **worker_groups)
5264{ 5446{
5265 int i, j; 5447 int i, j, k;
5266 ssize_t size; 5448 ssize_t size;
5267 struct r5worker *workers; 5449 struct r5worker *workers;
5268 5450
5269 conf->worker_cnt_per_group = cnt; 5451 *worker_cnt_per_group = cnt;
5270 if (cnt == 0) { 5452 if (cnt == 0) {
5271 conf->worker_groups = NULL; 5453 *group_cnt = 0;
5454 *worker_groups = NULL;
5272 return 0; 5455 return 0;
5273 } 5456 }
5274 conf->group_cnt = num_possible_nodes(); 5457 *group_cnt = num_possible_nodes();
5275 size = sizeof(struct r5worker) * cnt; 5458 size = sizeof(struct r5worker) * cnt;
5276 workers = kzalloc(size * conf->group_cnt, GFP_NOIO); 5459 workers = kzalloc(size * *group_cnt, GFP_NOIO);
5277 conf->worker_groups = kzalloc(sizeof(struct r5worker_group) * 5460 *worker_groups = kzalloc(sizeof(struct r5worker_group) *
5278 conf->group_cnt, GFP_NOIO); 5461 *group_cnt, GFP_NOIO);
5279 if (!conf->worker_groups || !workers) { 5462 if (!*worker_groups || !workers) {
5280 kfree(workers); 5463 kfree(workers);
5281 kfree(conf->worker_groups); 5464 kfree(*worker_groups);
5282 conf->worker_groups = NULL;
5283 return -ENOMEM; 5465 return -ENOMEM;
5284 } 5466 }
5285 5467
5286 for (i = 0; i < conf->group_cnt; i++) { 5468 for (i = 0; i < *group_cnt; i++) {
5287 struct r5worker_group *group; 5469 struct r5worker_group *group;
5288 5470
5289 group = &conf->worker_groups[i]; 5471 group = &(*worker_groups)[i];
5290 INIT_LIST_HEAD(&group->handle_list); 5472 INIT_LIST_HEAD(&group->handle_list);
5291 group->conf = conf; 5473 group->conf = conf;
5292 group->workers = workers + i * cnt; 5474 group->workers = workers + i * cnt;
5293 5475
5294 for (j = 0; j < cnt; j++) { 5476 for (j = 0; j < cnt; j++) {
5295 group->workers[j].group = group; 5477 struct r5worker *worker = group->workers + j;
5296 INIT_WORK(&group->workers[j].work, raid5_do_work); 5478 worker->group = group;
5479 INIT_WORK(&worker->work, raid5_do_work);
5480
5481 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
5482 INIT_LIST_HEAD(worker->temp_inactive_list + k);
5297 } 5483 }
5298 } 5484 }
5299 5485
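
The reworked alloc_thread_groups() returns the new groups through out-parameters instead of writing conf directly, which is what lets raid5_store_group_thread_cnt() above allocate with no locks held and publish under device_lock. A simplified model of that allocate-then-swap shape; types and lifetime rules are reduced, and in the kernel freeing the old groups is only safe because the array is suspended and the workqueue flushed first.

    #include <pthread.h>
    #include <stdlib.h>

    struct groups { int cnt; };

    static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct groups *worker_groups;

    static int store_group_cnt(int cnt)
    {
            struct groups *new_groups, *old_groups;

            new_groups = malloc(sizeof(*new_groups)); /* alloc unlocked */
            if (!new_groups)
                    return -1;      /* old configuration stays in place */
            new_groups->cnt = cnt;

            pthread_mutex_lock(&device_lock);
            old_groups = worker_groups;
            worker_groups = new_groups;     /* publish under the lock */
            pthread_mutex_unlock(&device_lock);

            /* Safe here only because no worker can still see old_groups
             * (suspended array + flushed workqueue in the real code). */
            free(old_groups);
            return 0;
    }

    int main(void) { return store_group_cnt(4); }
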
@@ -5444,6 +5630,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5444 struct md_rdev *rdev; 5630 struct md_rdev *rdev;
5445 struct disk_info *disk; 5631 struct disk_info *disk;
5446 char pers_name[6]; 5632 char pers_name[6];
5633 int i;
5634 int group_cnt, worker_cnt_per_group;
5635 struct r5worker_group *new_group;
5447 5636
5448 if (mddev->new_level != 5 5637 if (mddev->new_level != 5
5449 && mddev->new_level != 4 5638 && mddev->new_level != 4
@@ -5478,7 +5667,12 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5478 if (conf == NULL) 5667 if (conf == NULL)
5479 goto abort; 5668 goto abort;
5480 /* Don't enable multi-threading by default*/ 5669 /* Don't enable multi-threading by default*/
5481 if (alloc_thread_groups(conf, 0)) 5670 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
5671 &new_group)) {
5672 conf->group_cnt = group_cnt;
5673 conf->worker_cnt_per_group = worker_cnt_per_group;
5674 conf->worker_groups = new_group;
5675 } else
5482 goto abort; 5676 goto abort;
5483 spin_lock_init(&conf->device_lock); 5677 spin_lock_init(&conf->device_lock);
5484 seqcount_init(&conf->gen_lock); 5678 seqcount_init(&conf->gen_lock);
@@ -5488,7 +5682,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5488 INIT_LIST_HEAD(&conf->hold_list); 5682 INIT_LIST_HEAD(&conf->hold_list);
5489 INIT_LIST_HEAD(&conf->delayed_list); 5683 INIT_LIST_HEAD(&conf->delayed_list);
5490 INIT_LIST_HEAD(&conf->bitmap_list); 5684 INIT_LIST_HEAD(&conf->bitmap_list);
5491 INIT_LIST_HEAD(&conf->inactive_list);
5492 init_llist_head(&conf->released_stripes); 5685 init_llist_head(&conf->released_stripes);
5493 atomic_set(&conf->active_stripes, 0); 5686 atomic_set(&conf->active_stripes, 0);
5494 atomic_set(&conf->preread_active_stripes, 0); 5687 atomic_set(&conf->preread_active_stripes, 0);
@@ -5514,6 +5707,21 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5514 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 5707 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
5515 goto abort; 5708 goto abort;
5516 5709
5710 /* We init hash_locks[0] separately so that it can be used
5711 * as the reference lock in the spin_lock_nest_lock() call
5712 * in lock_all_device_hash_locks_irq in order to convince
5713 * lockdep that we know what we are doing.
5714 */
5715 spin_lock_init(conf->hash_locks);
5716 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
5717 spin_lock_init(conf->hash_locks + i);
5718
5719 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5720 INIT_LIST_HEAD(conf->inactive_list + i);
5721
5722 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5723 INIT_LIST_HEAD(conf->temp_inactive_list + i);
5724
5517 conf->level = mddev->new_level; 5725 conf->level = mddev->new_level;
5518 if (raid5_alloc_percpu(conf) != 0) 5726 if (raid5_alloc_percpu(conf) != 0)
5519 goto abort; 5727 goto abort;
@@ -5554,7 +5762,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5554 else 5762 else
5555 conf->max_degraded = 1; 5763 conf->max_degraded = 1;
5556 conf->algorithm = mddev->new_layout; 5764 conf->algorithm = mddev->new_layout;
5557 conf->max_nr_stripes = NR_STRIPES;
5558 conf->reshape_progress = mddev->reshape_position; 5765 conf->reshape_progress = mddev->reshape_position;
5559 if (conf->reshape_progress != MaxSector) { 5766 if (conf->reshape_progress != MaxSector) {
5560 conf->prev_chunk_sectors = mddev->chunk_sectors; 5767 conf->prev_chunk_sectors = mddev->chunk_sectors;
@@ -5563,7 +5770,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5563 5770
5564 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 5771 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
5565 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 5772 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
5566 if (grow_stripes(conf, conf->max_nr_stripes)) { 5773 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
5774 if (grow_stripes(conf, NR_STRIPES)) {
5567 printk(KERN_ERR 5775 printk(KERN_ERR
5568 "md/raid:%s: couldn't allocate %dkB for buffers\n", 5776 "md/raid:%s: couldn't allocate %dkB for buffers\n",
5569 mdname(mddev), memory); 5777 mdname(mddev), memory);
@@ -6369,12 +6577,18 @@ static int raid5_start_reshape(struct mddev *mddev)
6369 if (!mddev->sync_thread) { 6577 if (!mddev->sync_thread) {
6370 mddev->recovery = 0; 6578 mddev->recovery = 0;
6371 spin_lock_irq(&conf->device_lock); 6579 spin_lock_irq(&conf->device_lock);
6580 write_seqcount_begin(&conf->gen_lock);
6372 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 6581 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
6582 mddev->new_chunk_sectors =
6583 conf->chunk_sectors = conf->prev_chunk_sectors;
6584 mddev->new_layout = conf->algorithm = conf->prev_algo;
6373 rdev_for_each(rdev, mddev) 6585 rdev_for_each(rdev, mddev)
6374 rdev->new_data_offset = rdev->data_offset; 6586 rdev->new_data_offset = rdev->data_offset;
6375 smp_wmb(); 6587 smp_wmb();
6588 conf->generation --;
6376 conf->reshape_progress = MaxSector; 6589 conf->reshape_progress = MaxSector;
6377 mddev->reshape_position = MaxSector; 6590 mddev->reshape_position = MaxSector;
6591 write_seqcount_end(&conf->gen_lock);
6378 spin_unlock_irq(&conf->device_lock); 6592 spin_unlock_irq(&conf->device_lock);
6379 return -EAGAIN; 6593 return -EAGAIN;
6380 } 6594 }
@@ -6462,27 +6676,28 @@ static void raid5_quiesce(struct mddev *mddev, int state)
6462 break; 6676 break;
6463 6677
6464 case 1: /* stop all writes */ 6678 case 1: /* stop all writes */
6465 spin_lock_irq(&conf->device_lock); 6679 lock_all_device_hash_locks_irq(conf);
6466 /* '2' tells resync/reshape to pause so that all 6680 /* '2' tells resync/reshape to pause so that all
6467 * active stripes can drain 6681 * active stripes can drain
6468 */ 6682 */
6469 conf->quiesce = 2; 6683 conf->quiesce = 2;
6470 wait_event_lock_irq(conf->wait_for_stripe, 6684 wait_event_cmd(conf->wait_for_stripe,
6471 atomic_read(&conf->active_stripes) == 0 && 6685 atomic_read(&conf->active_stripes) == 0 &&
6472 atomic_read(&conf->active_aligned_reads) == 0, 6686 atomic_read(&conf->active_aligned_reads) == 0,
6473 conf->device_lock); 6687 unlock_all_device_hash_locks_irq(conf),
6688 lock_all_device_hash_locks_irq(conf));
6474 conf->quiesce = 1; 6689 conf->quiesce = 1;
6475 spin_unlock_irq(&conf->device_lock); 6690 unlock_all_device_hash_locks_irq(conf);
6476 /* allow reshape to continue */ 6691 /* allow reshape to continue */
6477 wake_up(&conf->wait_for_overlap); 6692 wake_up(&conf->wait_for_overlap);
6478 break; 6693 break;
6479 6694
6480 case 0: /* re-enable writes */ 6695 case 0: /* re-enable writes */
6481 spin_lock_irq(&conf->device_lock); 6696 lock_all_device_hash_locks_irq(conf);
6482 conf->quiesce = 0; 6697 conf->quiesce = 0;
6483 wake_up(&conf->wait_for_stripe); 6698 wake_up(&conf->wait_for_stripe);
6484 wake_up(&conf->wait_for_overlap); 6699 wake_up(&conf->wait_for_overlap);
6485 spin_unlock_irq(&conf->device_lock); 6700 unlock_all_device_hash_locks_irq(conf);
6486 break; 6701 break;
6487 } 6702 }
6488} 6703}
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index b42e6b462eda..01ad8ae8f578 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -205,6 +205,7 @@ struct stripe_head {
205 short pd_idx; /* parity disk index */ 205 short pd_idx; /* parity disk index */
206 short qd_idx; /* 'Q' disk index for raid6 */ 206 short qd_idx; /* 'Q' disk index for raid6 */
207 short ddf_layout;/* use DDF ordering to calculate Q */ 207 short ddf_layout;/* use DDF ordering to calculate Q */
208 short hash_lock_index;
208 unsigned long state; /* state flags */ 209 unsigned long state; /* state flags */
209 atomic_t count; /* nr of active thread/requests */ 210 atomic_t count; /* nr of active thread/requests */
210 int bm_seq; /* sequence number for bitmap flushes */ 211 int bm_seq; /* sequence number for bitmap flushes */
@@ -367,9 +368,18 @@ struct disk_info {
367 struct md_rdev *rdev, *replacement; 368 struct md_rdev *rdev, *replacement;
368}; 369};
369 370
371/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
372 * This is because we sometimes take all the spinlocks
373 * and creating that much locking depth can cause
374 * problems.
375 */
376#define NR_STRIPE_HASH_LOCKS 8
377#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)
378
370struct r5worker { 379struct r5worker {
371 struct work_struct work; 380 struct work_struct work;
372 struct r5worker_group *group; 381 struct r5worker_group *group;
382 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
373 bool working; 383 bool working;
374}; 384};
375 385
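
One invariant the note above leaves implicit: because stripe_hash_locks_hash() masks with STRIPE_HASH_LOCKS_MASK, NR_STRIPE_HASH_LOCKS must also be a power of two, not merely below 64. A hypothetical compile-time guard for both constraints, not part of the patch itself:

    #define NR_STRIPE_HASH_LOCKS 8
    #define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)

    /* power of two, so masking with NR - 1 is a uniform hash */
    typedef char nr_hash_locks_pow2[(NR_STRIPE_HASH_LOCKS &
                                     STRIPE_HASH_LOCKS_MASK) == 0 ? 1 : -1];
    /* below 64, per the locking-depth note above */
    typedef char nr_hash_locks_depth[NR_STRIPE_HASH_LOCKS < 64 ? 1 : -1];

    int main(void) { return 0; }
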
@@ -382,6 +392,8 @@ struct r5worker_group {
382 392
383struct r5conf { 393struct r5conf {
384 struct hlist_head *stripe_hashtbl; 394 struct hlist_head *stripe_hashtbl;
395 /* only protect corresponding hash list and inactive_list */
396 spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS];
385 struct mddev *mddev; 397 struct mddev *mddev;
386 int chunk_sectors; 398 int chunk_sectors;
387 int level, algorithm; 399 int level, algorithm;
@@ -462,7 +474,8 @@ struct r5conf {
462 * Free stripes pool 474 * Free stripes pool
463 */ 475 */
464 atomic_t active_stripes; 476 atomic_t active_stripes;
465 struct list_head inactive_list; 477 struct list_head inactive_list[NR_STRIPE_HASH_LOCKS];
478 atomic_t empty_inactive_list_nr;
466 struct llist_head released_stripes; 479 struct llist_head released_stripes;
467 wait_queue_head_t wait_for_stripe; 480 wait_queue_head_t wait_for_stripe;
468 wait_queue_head_t wait_for_overlap; 481 wait_queue_head_t wait_for_overlap;
@@ -477,6 +490,7 @@ struct r5conf {
477 * the new thread here until we fully activate the array. 490 * the new thread here until we fully activate the array.
478 */ 491 */
479 struct md_thread *thread; 492 struct md_thread *thread;
493 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
480 struct r5worker_group *worker_groups; 494 struct r5worker_group *worker_groups;
481 int group_cnt; 495 int group_cnt;
482 int worker_cnt_per_group; 496 int worker_cnt_per_group;