author    Ingo Molnar <mingo@kernel.org>  2013-12-17 09:27:08 -0500
committer Ingo Molnar <mingo@kernel.org>  2013-12-17 09:27:08 -0500
commit    bb799d3b980eb803ca2da4a4eefbd9308f8d988a (patch)
tree      69fbe0cd6d47b23a50f5e1d87bf7489532fae149 /drivers/md
parent    919fc6e34831d1c2b58bfb5ae261dc3facc9b269 (diff)
parent    319e2e3f63c348a9b66db4667efa73178e18b17d (diff)
Merge tag 'v3.13-rc4' into core/locking
Merge Linux 3.13-rc4, to refresh this rather old tree with the latest fixes.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/dm-bufio.c                                 5
-rw-r--r--  drivers/md/dm-cache-policy-mq.c                      13
-rw-r--r--  drivers/md/dm-cache-target.c                          2
-rw-r--r--  drivers/md/dm-delay.c                                23
-rw-r--r--  drivers/md/dm-snap.c                                 71
-rw-r--r--  drivers/md/dm-stats.c                                 1
-rw-r--r--  drivers/md/dm-table.c                                 5
-rw-r--r--  drivers/md/dm-thin-metadata.c                         8
-rw-r--r--  drivers/md/dm-thin-metadata.h                         1
-rw-r--r--  drivers/md/dm-thin.c                                 66
-rw-r--r--  drivers/md/md.c                                     147
-rw-r--r--  drivers/md/persistent-data/dm-array.c                10
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c         6
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.h         7
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c     32
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.c    8
-rw-r--r--  drivers/md/raid1.c                                  162
-rw-r--r--  drivers/md/raid1.h                                   15
-rw-r--r--  drivers/md/raid10.c                                   6
-rw-r--r--  drivers/md/raid5.c                                  425
-rw-r--r--  drivers/md/raid5.h                                   16
21 files changed, 760 insertions(+), 269 deletions(-)
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 173cbb20d104..54bdd923316f 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1717,6 +1717,11 @@ static int __init dm_bufio_init(void)
 {
 	__u64 mem;
 
+	dm_bufio_allocated_kmem_cache = 0;
+	dm_bufio_allocated_get_free_pages = 0;
+	dm_bufio_allocated_vmalloc = 0;
+	dm_bufio_current_allocated = 0;
+
 	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
 	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
 
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 416b7b752a6e..64780ad73bb0 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -730,15 +730,18 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
 	int r = 0;
 	bool updated = updated_this_tick(mq, e);
 
-	requeue_and_update_tick(mq, e);
-
 	if ((!discarded_oblock && updated) ||
-	    !should_promote(mq, e, discarded_oblock, data_dir))
+	    !should_promote(mq, e, discarded_oblock, data_dir)) {
+		requeue_and_update_tick(mq, e);
 		result->op = POLICY_MISS;
-	else if (!can_migrate)
+
+	} else if (!can_migrate)
 		r = -EWOULDBLOCK;
-	else
+
+	else {
+		requeue_and_update_tick(mq, e);
 		r = pre_cache_to_cache(mq, e, result);
+	}
 
 	return r;
 }
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 9efcf1059b99..1b1469ebe5cb 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2755,7 +2755,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
 {
 	int r;
 
-	r = dm_cache_resize(cache->cmd, cache->cache_size);
+	r = dm_cache_resize(cache->cmd, new_size);
 	if (r) {
 		DMERR("could not resize cache metadata");
 		return r;
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 496d5f3646a5..2f91d6d4a2cc 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -20,6 +20,7 @@
 struct delay_c {
 	struct timer_list delay_timer;
 	struct mutex timer_lock;
+	struct workqueue_struct *kdelayd_wq;
 	struct work_struct flush_expired_bios;
 	struct list_head delayed_bios;
 	atomic_t may_delay;
@@ -45,14 +46,13 @@ struct dm_delay_info {
 
 static DEFINE_MUTEX(delayed_bios_lock);
 
-static struct workqueue_struct *kdelayd_wq;
 static struct kmem_cache *delayed_cache;
 
 static void handle_delayed_timer(unsigned long data)
 {
 	struct delay_c *dc = (struct delay_c *)data;
 
-	queue_work(kdelayd_wq, &dc->flush_expired_bios);
+	queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
 }
 
 static void queue_timeout(struct delay_c *dc, unsigned long expires)
@@ -191,6 +191,12 @@ out:
 		goto bad_dev_write;
 	}
 
+	dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
+	if (!dc->kdelayd_wq) {
+		DMERR("Couldn't start kdelayd");
+		goto bad_queue;
+	}
+
 	setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc);
 
 	INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
@@ -203,6 +209,8 @@ out:
 	ti->private = dc;
 	return 0;
 
+bad_queue:
+	mempool_destroy(dc->delayed_pool);
 bad_dev_write:
 	if (dc->dev_write)
 		dm_put_device(ti, dc->dev_write);
@@ -217,7 +225,7 @@ static void delay_dtr(struct dm_target *ti)
 {
 	struct delay_c *dc = ti->private;
 
-	flush_workqueue(kdelayd_wq);
+	destroy_workqueue(dc->kdelayd_wq);
 
 	dm_put_device(ti, dc->dev_read);
 
@@ -350,12 +358,6 @@ static int __init dm_delay_init(void)
 {
 	int r = -ENOMEM;
 
-	kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
-	if (!kdelayd_wq) {
-		DMERR("Couldn't start kdelayd");
-		goto bad_queue;
-	}
-
 	delayed_cache = KMEM_CACHE(dm_delay_info, 0);
 	if (!delayed_cache) {
 		DMERR("Couldn't create delayed bio cache.");
@@ -373,8 +375,6 @@ static int __init dm_delay_init(void)
 bad_register:
 	kmem_cache_destroy(delayed_cache);
 bad_memcache:
-	destroy_workqueue(kdelayd_wq);
-bad_queue:
 	return r;
 }
 
@@ -382,7 +382,6 @@ static void __exit dm_delay_exit(void)
 {
 	dm_unregister_target(&delay_target);
 	kmem_cache_destroy(delayed_cache);
-	destroy_workqueue(kdelayd_wq);
 }
 
 /* Module hooks */
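
The dm-delay hunks above move the kdelayd workqueue from a module-wide global into struct delay_c, so each target instance creates the queue in its constructor and tears it down in its destructor. Below is a minimal userspace C sketch of that per-instance ownership pattern; the worker type and its create/destroy helpers are stand-ins for the kernel workqueue API, not real kernel calls.

    #include <stdio.h>
    #include <stdlib.h>

    struct worker {             /* stand-in for struct workqueue_struct */
            const char *name;
    };

    struct delay_instance {     /* stand-in for struct delay_c */
            struct worker *wq;  /* per-instance, no longer a global */
    };

    static struct worker *worker_create(const char *name)
    {
            struct worker *w = malloc(sizeof(*w));
            if (w)
                    w->name = name;
            return w;
    }

    static void worker_destroy(struct worker *w)
    {
            free(w);
    }

    /* constructor: each instance owns its worker, mirroring delay_ctr() */
    static struct delay_instance *instance_create(void)
    {
            struct delay_instance *dc = malloc(sizeof(*dc));
            if (!dc)
                    return NULL;
            dc->wq = worker_create("kdelayd");
            if (!dc->wq) {      /* mirrors the new "goto bad_queue" path */
                    free(dc);
                    return NULL;
            }
            return dc;
    }

    /* destructor: mirrors delay_dtr() calling destroy_workqueue() */
    static void instance_destroy(struct delay_instance *dc)
    {
            worker_destroy(dc->wq);
            free(dc);
    }

    int main(void)
    {
            struct delay_instance *a = instance_create();
            struct delay_instance *b = instance_create();  /* independent lifetimes */

            instance_destroy(a);
            instance_destroy(b);
            printf("two instances created and destroyed independently\n");
            return 0;
    }

With the queue owned per instance, module init/exit no longer has to manage it, which is why the bad_queue/destroy_workqueue lines disappear from dm_delay_init() and dm_delay_exit().
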
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index aec57d76db5d..944690bafd93 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -66,6 +66,18 @@ struct dm_snapshot {
 
 	atomic_t pending_exceptions_count;
 
+	/* Protected by "lock" */
+	sector_t exception_start_sequence;
+
+	/* Protected by kcopyd single-threaded callback */
+	sector_t exception_complete_sequence;
+
+	/*
+	 * A list of pending exceptions that completed out of order.
+	 * Protected by kcopyd single-threaded callback.
+	 */
+	struct list_head out_of_order_list;
+
 	mempool_t *pending_pool;
 
 	struct dm_exception_table pending;
@@ -173,6 +185,14 @@ struct dm_snap_pending_exception {
 	 */
 	int started;
 
+	/* There was copying error. */
+	int copy_error;
+
+	/* A sequence number, it is used for in-order completion. */
+	sector_t exception_sequence;
+
+	struct list_head out_of_order_entry;
+
 	/*
 	 * For writing a complete chunk, bypassing the copy.
 	 */
@@ -1094,6 +1114,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	s->valid = 1;
 	s->active = 0;
 	atomic_set(&s->pending_exceptions_count, 0);
+	s->exception_start_sequence = 0;
+	s->exception_complete_sequence = 0;
+	INIT_LIST_HEAD(&s->out_of_order_list);
 	init_rwsem(&s->lock);
 	INIT_LIST_HEAD(&s->list);
 	spin_lock_init(&s->pe_lock);
@@ -1443,6 +1466,19 @@ static void commit_callback(void *context, int success)
 	pending_complete(pe, success);
 }
 
+static void complete_exception(struct dm_snap_pending_exception *pe)
+{
+	struct dm_snapshot *s = pe->snap;
+
+	if (unlikely(pe->copy_error))
+		pending_complete(pe, 0);
+
+	else
+		/* Update the metadata if we are persistent */
+		s->store->type->commit_exception(s->store, &pe->e,
+						 commit_callback, pe);
+}
+
 /*
  * Called when the copy I/O has finished.  kcopyd actually runs
  * this code so don't block.
@@ -1452,13 +1488,32 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
 	struct dm_snap_pending_exception *pe = context;
 	struct dm_snapshot *s = pe->snap;
 
-	if (read_err || write_err)
-		pending_complete(pe, 0);
+	pe->copy_error = read_err || write_err;
 
-	else
-		/* Update the metadata if we are persistent */
-		s->store->type->commit_exception(s->store, &pe->e,
-						 commit_callback, pe);
+	if (pe->exception_sequence == s->exception_complete_sequence) {
+		s->exception_complete_sequence++;
+		complete_exception(pe);
+
+		while (!list_empty(&s->out_of_order_list)) {
+			pe = list_entry(s->out_of_order_list.next,
+					struct dm_snap_pending_exception, out_of_order_entry);
+			if (pe->exception_sequence != s->exception_complete_sequence)
+				break;
+			s->exception_complete_sequence++;
+			list_del(&pe->out_of_order_entry);
+			complete_exception(pe);
+		}
+	} else {
+		struct list_head *lh;
+		struct dm_snap_pending_exception *pe2;
+
+		list_for_each_prev(lh, &s->out_of_order_list) {
+			pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry);
+			if (pe2->exception_sequence < pe->exception_sequence)
+				break;
+		}
+		list_add(&pe->out_of_order_entry, lh);
+	}
 }
 
 /*
@@ -1553,6 +1608,8 @@ __find_pending_exception(struct dm_snapshot *s,
 		return NULL;
 	}
 
+	pe->exception_sequence = s->exception_start_sequence++;
+
 	dm_insert_exception(&s->pending, &pe->e);
 
 	return pe;
@@ -2192,7 +2249,7 @@ static struct target_type origin_target = {
 
 static struct target_type snapshot_target = {
 	.name    = "snapshot",
-	.version = {1, 11, 1},
+	.version = {1, 12, 0},
 	.module  = THIS_MODULE,
 	.ctr     = snapshot_ctr,
 	.dtr     = snapshot_dtr,
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 3d404c1371ed..28a90122a5a8 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -964,6 +964,7 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
 
 int __init dm_statistics_init(void)
 {
+	shared_memory_amount = 0;
 	dm_stat_need_rcu_barrier = 0;
 	return 0;
 }
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 465f08ca62b1..3ba6a3859ce3 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -200,6 +200,11 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 
 	num_targets = dm_round_up(num_targets, KEYS_PER_NODE);
 
+	if (!num_targets) {
+		kfree(t);
+		return -ENOMEM;
+	}
+
 	if (alloc_targets(t, num_targets)) {
 		kfree(t);
 		return -ENOMEM;
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 60bce435f4fa..8a30ad54bd46 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1697,6 +1697,14 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
 	up_write(&pmd->root_lock);
 }
 
+void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
+{
+	down_write(&pmd->root_lock);
+	pmd->read_only = false;
+	dm_bm_set_read_write(pmd->bm);
+	up_write(&pmd->root_lock);
+}
+
 int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 					dm_block_t threshold,
 					dm_sm_threshold_fn fn,
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 845ebbe589a9..7bcc0e1d6238 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -193,6 +193,7 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_siz
  * that nothing is changing.
  */
 void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd);
+void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd);
 
 int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 					dm_block_t threshold,
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 2c0cf511ec23..ee29037ffc2e 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -640,7 +640,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	 */
 	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
 	if (r) {
-		DMERR_LIMIT("dm_thin_insert_block() failed");
+		DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d",
+			    dm_device_name(pool->pool_md), r);
+		set_pool_mode(pool, PM_READ_ONLY);
 		cell_error(pool, m->cell);
 		goto out;
 	}
@@ -881,32 +883,23 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 	}
 }
 
-static int commit(struct pool *pool)
-{
-	int r;
-
-	r = dm_pool_commit_metadata(pool->pmd);
-	if (r)
-		DMERR_LIMIT("%s: commit failed: error = %d",
-			    dm_device_name(pool->pool_md), r);
-
-	return r;
-}
-
 /*
  * A non-zero return indicates read_only or fail_io mode.
  * Many callers don't care about the return value.
  */
-static int commit_or_fallback(struct pool *pool)
+static int commit(struct pool *pool)
 {
 	int r;
 
 	if (get_pool_mode(pool) != PM_WRITE)
 		return -EINVAL;
 
-	r = commit(pool);
-	if (r)
+	r = dm_pool_commit_metadata(pool->pmd);
+	if (r) {
+		DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d",
+			    dm_device_name(pool->pool_md), r);
 		set_pool_mode(pool, PM_READ_ONLY);
+	}
 
 	return r;
 }
@@ -943,7 +936,9 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 		 * Try to commit to see if that will free up some
 		 * more space.
 		 */
-		(void) commit_or_fallback(pool);
+		r = commit(pool);
+		if (r)
+			return r;
 
 		r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
 		if (r)
@@ -957,7 +952,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 		 * table reload).
 		 */
 		if (!free_blocks) {
-			DMWARN("%s: no free space available.",
+			DMWARN("%s: no free data space available.",
 			       dm_device_name(pool->pool_md));
 			spin_lock_irqsave(&pool->lock, flags);
 			pool->no_free_space = 1;
@@ -967,8 +962,16 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 	}
 
 	r = dm_pool_alloc_data_block(pool->pmd, result);
-	if (r)
+	if (r) {
+		if (r == -ENOSPC &&
+		    !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
+		    !free_blocks) {
+			DMWARN("%s: no free metadata space available.",
+			       dm_device_name(pool->pool_md));
+			set_pool_mode(pool, PM_READ_ONLY);
+		}
 		return r;
+	}
 
 	return 0;
 }
@@ -1349,7 +1352,7 @@ static void process_deferred_bios(struct pool *pool)
 	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
 		return;
 
-	if (commit_or_fallback(pool)) {
+	if (commit(pool)) {
 		while ((bio = bio_list_pop(&bios)))
 			bio_io_error(bio);
 		return;
@@ -1397,6 +1400,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
 	case PM_FAIL:
 		DMERR("%s: switching pool to failure mode",
 		      dm_device_name(pool->pool_md));
+		dm_pool_metadata_read_only(pool->pmd);
 		pool->process_bio = process_bio_fail;
 		pool->process_discard = process_bio_fail;
 		pool->process_prepared_mapping = process_prepared_mapping_fail;
@@ -1421,6 +1425,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
 		break;
 
 	case PM_WRITE:
+		dm_pool_metadata_read_write(pool->pmd);
 		pool->process_bio = process_bio;
 		pool->process_discard = process_discard;
 		pool->process_prepared_mapping = process_prepared_mapping;
@@ -1637,12 +1642,19 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 	struct pool_c *pt = ti->private;
 
 	/*
-	 * We want to make sure that degraded pools are never upgraded.
+	 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
 	 */
 	enum pool_mode old_mode = pool->pf.mode;
 	enum pool_mode new_mode = pt->adjusted_pf.mode;
 
-	if (old_mode > new_mode)
+	/*
+	 * If we were in PM_FAIL mode, rollback of metadata failed.  We're
+	 * not going to recover without a thin_repair.  So we never let the
+	 * pool move out of the old mode.  On the other hand a PM_READ_ONLY
+	 * may have been due to a lack of metadata or data space, and may
+	 * now work (ie. if the underlying devices have been resized).
+	 */
+	if (old_mode == PM_FAIL)
 		new_mode = old_mode;
 
 	pool->ti = ti;
@@ -2266,7 +2278,7 @@ static int pool_preresume(struct dm_target *ti)
 		return r;
 
 	if (need_commit1 || need_commit2)
-		(void) commit_or_fallback(pool);
+		(void) commit(pool);
 
 	return 0;
 }
@@ -2293,7 +2305,7 @@ static void pool_postsuspend(struct dm_target *ti)
 
 	cancel_delayed_work(&pool->waker);
 	flush_workqueue(pool->wq);
-	(void) commit_or_fallback(pool);
+	(void) commit(pool);
 }
 
 static int check_arg_count(unsigned argc, unsigned args_required)
@@ -2427,7 +2439,7 @@ static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct
 	if (r)
 		return r;
 
-	(void) commit_or_fallback(pool);
+	(void) commit(pool);
 
 	r = dm_pool_reserve_metadata_snap(pool->pmd);
 	if (r)
@@ -2489,7 +2501,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2489 DMWARN("Unrecognised thin pool target message received: %s", argv[0]); 2501 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2490 2502
2491 if (!r) 2503 if (!r)
2492 (void) commit_or_fallback(pool); 2504 (void) commit(pool);
2493 2505
2494 return r; 2506 return r;
2495} 2507}
@@ -2544,7 +2556,7 @@ static void pool_status(struct dm_target *ti, status_type_t type,
 
 	/* Commit to ensure statistics aren't out-of-date */
 	if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
-		(void) commit_or_fallback(pool);
+		(void) commit(pool);
 
 	r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
 	if (r) {
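
The dm-thin hunks fold commit_or_fallback() into commit() itself: the pool must be in PM_WRITE mode to commit at all, and any metadata commit failure degrades the pool to PM_READ_ONLY in one place, so every caller can simply call commit(). A simplified userspace C sketch of that fail-into-read-only pattern follows; the names mirror the kernel's, but the bodies are stand-ins.

    #include <stdio.h>

    enum pool_mode { PM_WRITE, PM_READ_ONLY, PM_FAIL };

    struct pool {
            enum pool_mode mode;
    };

    /* stand-in for dm_pool_commit_metadata(); returns 0 on success */
    static int metadata_commit(struct pool *pool, int simulate_error)
    {
            (void)pool;
            return simulate_error ? -1 : 0;
    }

    static int commit(struct pool *pool, int simulate_error)
    {
            int r;

            if (pool->mode != PM_WRITE)
                    return -1;           /* read-only or failed pool */

            r = metadata_commit(pool, simulate_error);
            if (r) {
                    fprintf(stderr, "commit failed, degrading to read-only\n");
                    pool->mode = PM_READ_ONLY;   /* single fallback point */
            }
            return r;
    }

    int main(void)
    {
            struct pool pool = { .mode = PM_WRITE };

            commit(&pool, 0);    /* succeeds, pool stays writable */
            commit(&pool, 1);    /* fails, pool degrades */
            commit(&pool, 0);    /* rejected: pool no longer PM_WRITE */
            printf("final mode: %d\n", pool.mode);
            return 0;
    }

Centralising the degradation is also what makes the bind_control_target() change safe: PM_READ_ONLY can later be upgraded when space returns, while PM_FAIL never is.
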
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8766eabb0014..21f4d7ff0da2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -112,7 +112,7 @@ static inline int speed_max(struct mddev *mddev)
 
 static struct ctl_table_header *raid_table_header;
 
-static ctl_table raid_table[] = {
+static struct ctl_table raid_table[] = {
 	{
 		.procname	= "speed_limit_min",
 		.data		= &sysctl_speed_limit_min,
@@ -130,7 +130,7 @@ static ctl_table raid_table[] = {
 	{ }
 };
 
-static ctl_table raid_dir_table[] = {
+static struct ctl_table raid_dir_table[] = {
 	{
 		.procname	= "raid",
 		.maxlen		= 0,
@@ -140,7 +140,7 @@ static ctl_table raid_dir_table[] = {
 	{ }
 };
 
-static ctl_table raid_root_table[] = {
+static struct ctl_table raid_root_table[] = {
 	{
 		.procname	= "dev",
 		.maxlen		= 0,
@@ -562,11 +562,19 @@ static struct mddev * mddev_find(dev_t unit)
 	goto retry;
 }
 
-static inline int mddev_lock(struct mddev * mddev)
+static inline int __must_check mddev_lock(struct mddev * mddev)
 {
 	return mutex_lock_interruptible(&mddev->reconfig_mutex);
 }
 
+/* Sometimes we need to take the lock in a situation where
+ * failure due to interrupts is not acceptable.
+ */
+static inline void mddev_lock_nointr(struct mddev * mddev)
+{
+	mutex_lock(&mddev->reconfig_mutex);
+}
+
 static inline int mddev_is_locked(struct mddev *mddev)
 {
 	return mutex_is_locked(&mddev->reconfig_mutex);
@@ -768,16 +776,10 @@ void md_super_wait(struct mddev *mddev)
 	finish_wait(&mddev->sb_wait, &wq);
 }
 
-static void bi_complete(struct bio *bio, int error)
-{
-	complete((struct completion*)bio->bi_private);
-}
-
 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 		 struct page *page, int rw, bool metadata_op)
 {
 	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
-	struct completion event;
 	int ret;
 
 	rw |= REQ_SYNC;
783 rw |= REQ_SYNC; 785 rw |= REQ_SYNC;
@@ -793,11 +795,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
793 else 795 else
794 bio->bi_sector = sector + rdev->data_offset; 796 bio->bi_sector = sector + rdev->data_offset;
795 bio_add_page(bio, page, size, 0); 797 bio_add_page(bio, page, size, 0);
796 init_completion(&event); 798 submit_bio_wait(rw, bio);
797 bio->bi_private = &event;
798 bio->bi_end_io = bi_complete;
799 submit_bio(rw, bio);
800 wait_for_completion(&event);
801 799
802 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 800 ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
803 bio_put(bio); 801 bio_put(bio);
@@ -2978,7 +2976,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
 	for_each_mddev(mddev, tmp) {
 		struct md_rdev *rdev2;
 
-		mddev_lock(mddev);
+		mddev_lock_nointr(mddev);
 		rdev_for_each(rdev2, mddev)
 			if (rdev->bdev == rdev2->bdev &&
 			    rdev != rdev2 &&
@@ -2994,7 +2992,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
 			break;
 		}
 	}
-	mddev_lock(my_mddev);
+	mddev_lock_nointr(my_mddev);
 	if (overlap) {
 		/* Someone else could have slipped in a size
 		 * change here, but doing so is just silly.
@@ -3580,6 +3578,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 		mddev->in_sync = 1;
 		del_timer_sync(&mddev->safemode_timer);
 	}
+	blk_set_stacking_limits(&mddev->queue->limits);
 	pers->run(mddev);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 	mddev_resume(mddev);
@@ -5258,7 +5257,7 @@ static void __md_stop_writes(struct mddev *mddev)
 
 void md_stop_writes(struct mddev *mddev)
 {
-	mddev_lock(mddev);
+	mddev_lock_nointr(mddev);
 	__md_stop_writes(mddev);
 	mddev_unlock(mddev);
 }
@@ -5291,20 +5290,35 @@ EXPORT_SYMBOL_GPL(md_stop);
 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 {
 	int err = 0;
+	int did_freeze = 0;
+
+	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+		did_freeze = 1;
+		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+	}
+	if (mddev->sync_thread) {
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		/* Thread might be blocked waiting for metadata update
+		 * which will now never happen */
+		wake_up_process(mddev->sync_thread->tsk);
+	}
+	mddev_unlock(mddev);
+	wait_event(resync_wait, mddev->sync_thread == NULL);
+	mddev_lock_nointr(mddev);
+
 	mutex_lock(&mddev->open_mutex);
-	if (atomic_read(&mddev->openers) > !!bdev) {
+	if (atomic_read(&mddev->openers) > !!bdev ||
+	    mddev->sync_thread ||
+	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
 		printk("md: %s still in use.\n",mdname(mddev));
+		if (did_freeze) {
+			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+			md_wakeup_thread(mddev->thread);
+		}
 		err = -EBUSY;
 		goto out;
 	}
-	if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
-		/* Someone opened the device since we flushed it
-		 * so page cache could be dirty and it is too late
-		 * to flush.  So abort
-		 */
-		mutex_unlock(&mddev->open_mutex);
-		return -EBUSY;
-	}
 	if (mddev->pers) {
 		__md_stop_writes(mddev);
 
@@ -5315,7 +5329,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 		set_disk_ro(mddev->gendisk, 1);
 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
-		err = 0;	
+		err = 0;
 	}
 out:
 	mutex_unlock(&mddev->open_mutex);
@@ -5331,20 +5345,34 @@ static int do_md_stop(struct mddev * mddev, int mode,
 {
 	struct gendisk *disk = mddev->gendisk;
 	struct md_rdev *rdev;
+	int did_freeze = 0;
+
+	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+		did_freeze = 1;
+		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+	}
+	if (mddev->sync_thread) {
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		/* Thread might be blocked waiting for metadata update
+		 * which will now never happen */
+		wake_up_process(mddev->sync_thread->tsk);
+	}
+	mddev_unlock(mddev);
+	wait_event(resync_wait, mddev->sync_thread == NULL);
+	mddev_lock_nointr(mddev);
 
 	mutex_lock(&mddev->open_mutex);
 	if (atomic_read(&mddev->openers) > !!bdev ||
-	    mddev->sysfs_active) {
+	    mddev->sysfs_active ||
+	    mddev->sync_thread ||
+	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
 		printk("md: %s still in use.\n",mdname(mddev));
 		mutex_unlock(&mddev->open_mutex);
-		return -EBUSY;
-	}
-	if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
-		/* Someone opened the device since we flushed it
-		 * so page cache could be dirty and it is too late
-		 * to flush.  So abort
-		 */
-		mutex_unlock(&mddev->open_mutex);
+		if (did_freeze) {
+			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+			md_wakeup_thread(mddev->thread);
+		}
 		return -EBUSY;
 	}
 	if (mddev->pers) {
@@ -6551,7 +6579,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 			wait_event(mddev->sb_wait,
 				   !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
 				   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
-			mddev_lock(mddev);
+			mddev_lock_nointr(mddev);
 		}
 	} else {
 		err = -EROFS;
@@ -7361,9 +7389,6 @@ void md_do_sync(struct md_thread *thread)
 	mddev->curr_resync = 2;
 
 	try_again:
-	if (kthread_should_stop())
-		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-
 	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 		goto skip;
 	for_each_mddev(mddev2, tmp) {
@@ -7388,7 +7413,7 @@ void md_do_sync(struct md_thread *thread)
 				 * be caught by 'softlockup'
 				 */
 				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
-				if (!kthread_should_stop() &&
+				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 				    mddev2->curr_resync >= mddev->curr_resync) {
 					printk(KERN_INFO "md: delaying %s of %s"
 					       " until %s has finished (they"
@@ -7464,7 +7489,7 @@ void md_do_sync(struct md_thread *thread)
 		last_check = 0;
 
 		if (j>2) {
-			printk(KERN_INFO 
+			printk(KERN_INFO
 				"md: resuming %s of %s from checkpoint.\n",
 				desc, mdname(mddev));
 			mddev->curr_resync = j;
@@ -7501,7 +7526,8 @@ void md_do_sync(struct md_thread *thread)
 			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 		}
 
-		while (j >= mddev->resync_max && !kthread_should_stop()) {
+		while (j >= mddev->resync_max &&
+		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
 			/* As this condition is controlled by user-space,
 			 * we can block indefinitely, so use '_interruptible'
 			 * to avoid triggering warnings.
@@ -7509,17 +7535,18 @@ void md_do_sync(struct md_thread *thread)
 			flush_signals(current); /* just in case */
 			wait_event_interruptible(mddev->recovery_wait,
 						 mddev->resync_max > j
-						 || kthread_should_stop());
+						 || test_bit(MD_RECOVERY_INTR,
+							     &mddev->recovery));
 		}
 
-		if (kthread_should_stop())
-			goto interrupted;
+		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+			break;
 
 		sectors = mddev->pers->sync_request(mddev, j, &skipped,
 						  currspeed < speed_min(mddev));
 		if (sectors == 0) {
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-			goto out;
+			break;
 		}
 
 		if (!skipped) { /* actual IO requested */
@@ -7556,10 +7583,8 @@ void md_do_sync(struct md_thread *thread)
 			last_mark = next;
 		}
 
-
-		if (kthread_should_stop())
-			goto interrupted;
-
+		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+			break;
 
 		/*
 		 * this loop exits only if either when we are slower than
@@ -7582,11 +7607,12 @@ void md_do_sync(struct md_thread *thread)
 			}
 		}
 	}
-	printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
+	printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
+	       test_bit(MD_RECOVERY_INTR, &mddev->recovery)
+	       ? "interrupted" : "done");
 	/*
 	 * this also signals 'finished resyncing' to md_stop
 	 */
- out:
 	blk_finish_plug(&plug);
 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
@@ -7640,16 +7666,6 @@ void md_do_sync(struct md_thread *thread)
 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
 	return;
-
- interrupted:
-	/*
-	 * got a signal, exit.
-	 */
-	printk(KERN_INFO
-	       "md: md_do_sync() got signal ... exiting\n");
-	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-	goto out;
-
 }
 EXPORT_SYMBOL_GPL(md_do_sync);
 
7655 7671
@@ -7751,7 +7767,7 @@ void md_check_recovery(struct mddev *mddev)
7751 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 7767 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
7752 return; 7768 return;
7753 if ( ! ( 7769 if ( ! (
7754 (mddev->flags & ~ (1<<MD_CHANGE_PENDING)) || 7770 (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
7755 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 7771 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
7756 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 7772 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
7757 (mddev->external == 0 && mddev->safemode == 1) || 7773 (mddev->external == 0 && mddev->safemode == 1) ||
@@ -7894,6 +7910,7 @@ void md_reap_sync_thread(struct mddev *mddev)
 
 	/* resync has finished, collect result */
 	md_unregister_thread(&mddev->sync_thread);
+	wake_up(&resync_wait);
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 		/* success...*/
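
Most of the md.c hunks replace kthread_should_stop() tests in md_do_sync() with tests of the MD_RECOVERY_INTR flag, turning the old interrupted:/out: goto labels into plain loop breaks with a single exit path. A minimal userspace sketch of that control-flow shape follows; a plain bool stands in for the atomic flag bit, and the chunk numbers are made up for the demo.

    #include <stdio.h>
    #include <stdbool.h>

    static bool recovery_intr;   /* stand-in for test_bit(MD_RECOVERY_INTR, ...) */

    static long sync_one_chunk(long j)
    {
            if (j == 3)
                    recovery_intr = true;   /* someone requested interruption */
            return 1;                       /* sectors handled */
    }

    int main(void)
    {
            long j;

            for (j = 0; j < 10; j++) {
                    if (recovery_intr)
                            break;          /* was: goto interrupted */
                    sync_one_chunk(j);
            }

            /* one common exit path; the old out:/interrupted: labels are gone */
            printf("resync %s at chunk %ld\n",
                   recovery_intr ? "interrupted" : "done", j);
            return 0;
    }

Because interruption is just a flag, md_set_readonly() and do_md_stop() can request it, drop the lock, and wait_event() on resync_wait until the sync thread is gone, instead of racing with a kthread stop.
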
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index af96e24ec328..1d75b1dc1e2e 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -317,8 +317,16 @@ static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
 	 * The shadow op will often be a noop.  Only insert if it really
 	 * copied data.
 	 */
-	if (dm_block_location(*block) != b)
+	if (dm_block_location(*block) != b) {
+		/*
+		 * dm_tm_shadow_block will have already decremented the old
+		 * block, but it is still referenced by the btree.  We
+		 * increment to stop the insert decrementing it below zero
+		 * when overwriting the old value.
+		 */
+		dm_tm_inc(info->btree_info.tm, b);
 		r = insert_ablock(info, index, *block, root);
+	}
 
 	return r;
 }
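
The dm-array fix above takes an extra reference on the old block between the shadow operation (which drops one reference) and the btree insert (which drops another when it overwrites the stale pointer), so the count cannot go below zero. The arithmetic, as a tiny standalone sketch:

    #include <assert.h>
    #include <stdio.h>

    static int refcount = 1;     /* old block starts with one reference */

    static void inc(void) { refcount++; }
    static void dec(void) { assert(refcount > 0); refcount--; }

    int main(void)
    {
            /* shadow: copies the block and drops one ref on the original */
            dec();               /* refcount now 0 */

            inc();               /* the fix: re-take a reference ... */
            dec();               /* ... so the insert's drop stays legal */

            printf("refcount = %d\n", refcount);
            return 0;
    }

Without the inc(), the second dec() would underflow, which is exactly the below-zero decrement the comment in the patch describes.
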
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index a7e8bf296388..064a3c271baa 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -626,6 +626,12 @@ void dm_bm_set_read_only(struct dm_block_manager *bm)
 }
 EXPORT_SYMBOL_GPL(dm_bm_set_read_only);
 
+void dm_bm_set_read_write(struct dm_block_manager *bm)
+{
+	bm->read_only = false;
+}
+EXPORT_SYMBOL_GPL(dm_bm_set_read_write);
+
 u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
 {
 	return crc32c(~(u32) 0, data, len) ^ init_xor;
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 9a82083a66b6..13cd58e1fe69 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -108,9 +108,9 @@ int dm_bm_unlock(struct dm_block *b);
 int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
 			   struct dm_block *superblock);
 
- /*
-  * Request data be prefetched into the cache.
-  */
+/*
+ * Request data is prefetched into the cache.
+ */
 void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
 
 /*
@@ -125,6 +125,7 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
  * be returned if you do.
  */
 void dm_bm_set_read_only(struct dm_block_manager *bm);
+void dm_bm_set_read_write(struct dm_block_manager *bm);
 
 u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
 
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 6058569fe86c..466a60bbd716 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -381,7 +381,7 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
 }
 
 static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
-			uint32_t (*mutator)(void *context, uint32_t old),
+			int (*mutator)(void *context, uint32_t old, uint32_t *new),
 			void *context, enum allocation_event *ev)
 {
 	int r;
@@ -410,11 +410,17 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
 
 	if (old > 2) {
 		r = sm_ll_lookup_big_ref_count(ll, b, &old);
-		if (r < 0)
+		if (r < 0) {
+			dm_tm_unlock(ll->tm, nb);
 			return r;
+		}
 	}
 
-	ref_count = mutator(context, old);
+	r = mutator(context, old, &ref_count);
+	if (r) {
+		dm_tm_unlock(ll->tm, nb);
+		return r;
+	}
 
 	if (ref_count <= 2) {
 		sm_set_bitmap(bm_le, bit, ref_count);
@@ -465,9 +471,10 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
 	return ll->save_ie(ll, index, &ie_disk);
 }
 
-static uint32_t set_ref_count(void *context, uint32_t old)
+static int set_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	return *((uint32_t *) context);
+	*new = *((uint32_t *) context);
+	return 0;
 }
 
 int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
@@ -476,9 +483,10 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
 	return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev);
 }
 
-static uint32_t inc_ref_count(void *context, uint32_t old)
+static int inc_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	return old + 1;
+	*new = old + 1;
+	return 0;
 }
 
 int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
@@ -486,9 +494,15 @@ int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
 	return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev);
 }
 
-static uint32_t dec_ref_count(void *context, uint32_t old)
+static int dec_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-	return old - 1;
+	if (!old) {
+		DMERR_LIMIT("unable to decrement a reference count below 0");
+		return -EINVAL;
+	}
+
+	*new = old - 1;
+	return 0;
 }
 
 int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
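
The dm-space-map-common hunks change the mutator callback from returning the new count directly to returning a status code with the count in an out-parameter, which is what lets dec_ref_count() reject an underflow instead of silently wrapping a uint32_t to 0xffffffff. A self-contained userspace sketch of the new interface shape (only the API shape matches the kernel code; the bookkeeping is a stand-in):

    #include <stdio.h>
    #include <stdint.h>
    #include <errno.h>

    /* new-style mutator: status in the return value, result via *new */
    typedef int (*mutator_fn)(void *context, uint32_t old, uint32_t *new);

    static int dec_ref_count(void *context, uint32_t old, uint32_t *new)
    {
            (void)context;
            if (!old)
                    return -EINVAL;   /* refcount would go below zero */
            *new = old - 1;
            return 0;
    }

    static int mutate(uint32_t *count, mutator_fn fn, void *context)
    {
            uint32_t new;
            int r = fn(context, *count, &new);

            if (r)
                    return r;         /* nothing is written on failure */
            *count = new;
            return 0;
    }

    int main(void)
    {
            uint32_t count = 1;

            printf("dec: %d (count=%u)\n",
                   mutate(&count, dec_ref_count, NULL), (unsigned)count);
            printf("dec: %d (count=%u)\n",
                   mutate(&count, dec_ref_count, NULL), (unsigned)count);
            return 0;
    }

The second decrement fails with -EINVAL and leaves the count untouched, mirroring how sm_ll_mutate() now unlocks and bails out when the mutator reports an error.
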
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 1c959684caef..58fc1eef7499 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -384,12 +384,16 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
 	int r = sm_metadata_new_block_(sm, b);
-	if (r)
+	if (r) {
 		DMERR("unable to allocate new metadata block");
+		return r;
+	}
 
 	r = sm_metadata_get_nr_free(sm, &count);
-	if (r)
+	if (r) {
 		DMERR("couldn't get free block count");
+		return r;
+	}
 
 	check_threshold(&smm->threshold, count);
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index af6681b19776..1e5a540995e9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -66,7 +66,8 @@
  */
 static int max_queued_requests = 1024;
 
-static void allow_barrier(struct r1conf *conf);
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+			  sector_t bi_sector);
 static void lower_barrier(struct r1conf *conf);
 
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
@@ -84,10 +85,12 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 }
 
 #define RESYNC_BLOCK_SIZE (64*1024)
-//#define RESYNC_BLOCK_SIZE PAGE_SIZE
+#define RESYNC_DEPTH 32
 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
-#define RESYNC_WINDOW (2048*1024)
+#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
+#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
+#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
 
 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
@@ -225,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
 	struct bio *bio = r1_bio->master_bio;
 	int done;
 	struct r1conf *conf = r1_bio->mddev->private;
+	sector_t start_next_window = r1_bio->start_next_window;
+	sector_t bi_sector = bio->bi_sector;
 
 	if (bio->bi_phys_segments) {
 		unsigned long flags;
@@ -232,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio)
 		bio->bi_phys_segments--;
 		done = (bio->bi_phys_segments == 0);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
+		/*
+		 * make_request() might be waiting for
+		 * bi_phys_segments to decrease
+		 */
+		wake_up(&conf->wait_barrier);
 	} else
 		done = 1;
 
@@ -243,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
 		 * Wake up any possible resync thread that waits for the device
 		 * to go idle.
 		 */
-		allow_barrier(conf);
+		allow_barrier(conf, start_next_window, bi_sector);
 	}
 }
 
@@ -814,8 +824,6 @@ static void flush_pending_writes(struct r1conf *conf)
  * there is no normal IO happeing.  It must arrange to call
  * lower_barrier when the particular background IO completes.
  */
-#define RESYNC_DEPTH 32
-
 static void raise_barrier(struct r1conf *conf)
 {
 	spin_lock_irq(&conf->resync_lock);
@@ -827,9 +835,19 @@ static void raise_barrier(struct r1conf *conf)
 	/* block any new IO from starting */
 	conf->barrier++;
 
-	/* Now wait for all pending IO to complete */
+	/* For these conditions we must wait:
+	 * A: while the array is in frozen state
+	 * B: while barrier >= RESYNC_DEPTH, meaning resync reach
+	 *    the max count which allowed.
+	 * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
+	 *    next resync will reach to the window which normal bios are
+	 *    handling.
+	 */
 	wait_event_lock_irq(conf->wait_barrier,
-			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+			    !conf->array_frozen &&
+			    conf->barrier < RESYNC_DEPTH &&
+			    (conf->start_next_window >=
+			     conf->next_resync + RESYNC_SECTORS),
 			    conf->resync_lock);
 
 	spin_unlock_irq(&conf->resync_lock);
@@ -845,10 +863,33 @@ static void lower_barrier(struct r1conf *conf)
 	wake_up(&conf->wait_barrier);
 }
 
-static void wait_barrier(struct r1conf *conf)
+static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
+{
+	bool wait = false;
+
+	if (conf->array_frozen || !bio)
+		wait = true;
+	else if (conf->barrier && bio_data_dir(bio) == WRITE) {
+		if (conf->next_resync < RESYNC_WINDOW_SECTORS)
+			wait = true;
+		else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
+				>= bio_end_sector(bio)) ||
+			 (conf->next_resync + NEXT_NORMALIO_DISTANCE
+				<= bio->bi_sector))
+			wait = false;
+		else
+			wait = true;
+	}
+
+	return wait;
+}
+
+static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
 {
+	sector_t sector = 0;
+
 	spin_lock_irq(&conf->resync_lock);
-	if (conf->barrier) {
+	if (need_to_wait_for_sync(conf, bio)) {
 		conf->nr_waiting++;
 		/* Wait for the barrier to drop.
 		 * However if there are already pending
@@ -860,22 +901,67 @@ static void wait_barrier(struct r1conf *conf)
 		 * count down.
 		 */
 		wait_event_lock_irq(conf->wait_barrier,
-				    !conf->barrier ||
-				    (conf->nr_pending &&
+				    !conf->array_frozen &&
+				    (!conf->barrier ||
+				     ((conf->start_next_window <
+				       conf->next_resync + RESYNC_SECTORS) &&
 				     current->bio_list &&
-				     !bio_list_empty(current->bio_list)),
+				     !bio_list_empty(current->bio_list))),
 				    conf->resync_lock);
 		conf->nr_waiting--;
 	}
+
+	if (bio && bio_data_dir(bio) == WRITE) {
+		if (conf->next_resync + NEXT_NORMALIO_DISTANCE
+		    <= bio->bi_sector) {
+			if (conf->start_next_window == MaxSector)
+				conf->start_next_window =
+					conf->next_resync +
+					NEXT_NORMALIO_DISTANCE;
+
+			if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
+			    <= bio->bi_sector)
+				conf->next_window_requests++;
+			else
+				conf->current_window_requests++;
+		}
+		if (bio->bi_sector >= conf->start_next_window)
+			sector = conf->start_next_window;
+	}
+
 	conf->nr_pending++;
 	spin_unlock_irq(&conf->resync_lock);
+	return sector;
 }
 
-static void allow_barrier(struct r1conf *conf)
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+			  sector_t bi_sector)
 {
 	unsigned long flags;
+
 	spin_lock_irqsave(&conf->resync_lock, flags);
 	conf->nr_pending--;
+	if (start_next_window) {
+		if (start_next_window == conf->start_next_window) {
+			if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
+			    <= bi_sector)
+				conf->next_window_requests--;
+			else
+				conf->current_window_requests--;
+		} else
+			conf->current_window_requests--;
+
+		if (!conf->current_window_requests) {
+			if (conf->next_window_requests) {
+				conf->current_window_requests =
+					conf->next_window_requests;
+				conf->next_window_requests = 0;
+				conf->start_next_window +=
+					NEXT_NORMALIO_DISTANCE;
+			} else
+				conf->start_next_window = MaxSector;
+		}
+	}
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
@@ -884,8 +970,7 @@ static void freeze_array(struct r1conf *conf, int extra)
 {
 	/* stop syncio and normal IO and wait for everything to
 	 * go quite.
-	 * We increment barrier and nr_waiting, and then
-	 * wait until nr_pending match nr_queued+extra
+	 * We wait until nr_pending match nr_queued+extra
 	 * This is called in the context of one normal IO request
 	 * that has failed. Thus any sync request that might be pending
 	 * will be blocked by nr_pending, and we need to wait for
@@ -895,8 +980,7 @@ static void freeze_array(struct r1conf *conf, int extra)
 	 * we continue.
 	 */
 	spin_lock_irq(&conf->resync_lock);
-	conf->barrier++;
-	conf->nr_waiting++;
+	conf->array_frozen = 1;
 	wait_event_lock_irq_cmd(conf->wait_barrier,
 				conf->nr_pending == conf->nr_queued+extra,
 				conf->resync_lock,
@@ -907,8 +991,7 @@ static void unfreeze_array(struct r1conf *conf)
 {
 	/* reverse the effect of the freeze */
 	spin_lock_irq(&conf->resync_lock);
-	conf->barrier--;
-	conf->nr_waiting--;
+	conf->array_frozen = 0;
 	wake_up(&conf->wait_barrier);
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -1013,6 +1096,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	int first_clone;
 	int sectors_handled;
 	int max_sectors;
+	sector_t start_next_window;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -1042,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 		finish_wait(&conf->wait_barrier, &w);
 	}
 
-	wait_barrier(conf);
+	start_next_window = wait_barrier(conf, bio);
 
 	bitmap = mddev->bitmap;
 
@@ -1163,6 +1247,7 @@ read_again:
1163 1247
1164 disks = conf->raid_disks * 2; 1248 disks = conf->raid_disks * 2;
1165 retry_write: 1249 retry_write:
1250 r1_bio->start_next_window = start_next_window;
1166 blocked_rdev = NULL; 1251 blocked_rdev = NULL;
1167 rcu_read_lock(); 1252 rcu_read_lock();
1168 max_sectors = r1_bio->sectors; 1253 max_sectors = r1_bio->sectors;
@@ -1231,14 +1316,24 @@ read_again:
1231 if (unlikely(blocked_rdev)) { 1316 if (unlikely(blocked_rdev)) {
1232 /* Wait for this device to become unblocked */ 1317 /* Wait for this device to become unblocked */
1233 int j; 1318 int j;
1319 sector_t old = start_next_window;
1234 1320
1235 for (j = 0; j < i; j++) 1321 for (j = 0; j < i; j++)
1236 if (r1_bio->bios[j]) 1322 if (r1_bio->bios[j])
1237 rdev_dec_pending(conf->mirrors[j].rdev, mddev); 1323 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
1238 r1_bio->state = 0; 1324 r1_bio->state = 0;
1239 allow_barrier(conf); 1325 allow_barrier(conf, start_next_window, bio->bi_sector);
1240 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1326 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1241 wait_barrier(conf); 1327 start_next_window = wait_barrier(conf, bio);
1328 /*
1329 * We must make sure that all the r1bios cloned from this bio
1330 * have the same value of bi_phys_segments
1331 */
1332 if (bio->bi_phys_segments && old &&
1333 old != start_next_window)
1334 /* Wait for the former r1bio(s) to complete */
1335 wait_event(conf->wait_barrier,
1336 bio->bi_phys_segments == 1);
1242 goto retry_write; 1337 goto retry_write;
1243 } 1338 }
1244 1339
@@ -1438,11 +1533,14 @@ static void print_conf(struct r1conf *conf)
1438 1533
1439static void close_sync(struct r1conf *conf) 1534static void close_sync(struct r1conf *conf)
1440{ 1535{
1441 wait_barrier(conf); 1536 wait_barrier(conf, NULL);
1442 allow_barrier(conf); 1537 allow_barrier(conf, 0, 0);
1443 1538
1444 mempool_destroy(conf->r1buf_pool); 1539 mempool_destroy(conf->r1buf_pool);
1445 conf->r1buf_pool = NULL; 1540 conf->r1buf_pool = NULL;
1541
1542 conf->next_resync = 0;
1543 conf->start_next_window = MaxSector;
1446} 1544}
1447 1545
1448static int raid1_spare_active(struct mddev *mddev) 1546static int raid1_spare_active(struct mddev *mddev)
@@ -2714,6 +2812,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2714 conf->pending_count = 0; 2812 conf->pending_count = 0;
2715 conf->recovery_disabled = mddev->recovery_disabled - 1; 2813 conf->recovery_disabled = mddev->recovery_disabled - 1;
2716 2814
2815 conf->start_next_window = MaxSector;
2816 conf->current_window_requests = conf->next_window_requests = 0;
2817
2717 err = -EIO; 2818 err = -EIO;
2718 for (i = 0; i < conf->raid_disks * 2; i++) { 2819 for (i = 0; i < conf->raid_disks * 2; i++) {
2719 2820
@@ -2871,8 +2972,8 @@ static int stop(struct mddev *mddev)
2871 atomic_read(&bitmap->behind_writes) == 0); 2972 atomic_read(&bitmap->behind_writes) == 0);
2872 } 2973 }
2873 2974
2874 raise_barrier(conf); 2975 freeze_array(conf, 0);
2875 lower_barrier(conf); 2976 unfreeze_array(conf);
2876 2977
2877 md_unregister_thread(&mddev->thread); 2978 md_unregister_thread(&mddev->thread);
2878 if (conf->r1bio_pool) 2979 if (conf->r1bio_pool)
@@ -3031,10 +3132,10 @@ static void raid1_quiesce(struct mddev *mddev, int state)
3031 wake_up(&conf->wait_barrier); 3132 wake_up(&conf->wait_barrier);
3032 break; 3133 break;
3033 case 1: 3134 case 1:
3034 raise_barrier(conf); 3135 freeze_array(conf, 0);
3035 break; 3136 break;
3036 case 0: 3137 case 0:
3037 lower_barrier(conf); 3138 unfreeze_array(conf);
3038 break; 3139 break;
3039 } 3140 }
3040} 3141}
@@ -3051,7 +3152,8 @@ static void *raid1_takeover(struct mddev *mddev)
3051 mddev->new_chunk_sectors = 0; 3152 mddev->new_chunk_sectors = 0;
3052 conf = setup_conf(mddev); 3153 conf = setup_conf(mddev);
3053 if (!IS_ERR(conf)) 3154 if (!IS_ERR(conf))
3054 conf->barrier = 1; 3155 /* Array must appear to be quiesced */
3156 conf->array_frozen = 1;
3055 return conf; 3157 return conf;
3056 } 3158 }
3057 return ERR_PTR(-EINVAL); 3159 return ERR_PTR(-EINVAL);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 0ff3715fb7eb..9bebca7bff2f 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -41,6 +41,19 @@ struct r1conf {
41 */ 41 */
42 sector_t next_resync; 42 sector_t next_resync;
43 43
44 /* When raid1 starts resync, we divide the array into four partitions
45 * |---------|--------------|---------------------|-------------|
46 * next_resync start_next_window end_window
47 * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE
48 * end_window = start_next_window + NEXT_NORMALIO_DISTANCE
49 * current_window_requests is the count of normal IO between
50 * start_next_window and end_window.
51 * next_window_requests is the count of normal IO after end_window.
52 */
53 sector_t start_next_window;
54 int current_window_requests;
55 int next_window_requests;
56
44 spinlock_t device_lock; 57 spinlock_t device_lock;
45 58
46 /* list of 'struct r1bio' that need to be processed by raid1d, 59 /* list of 'struct r1bio' that need to be processed by raid1d,
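
The partition comment above is easier to check with numbers. Here is a small sketch of the boundary arithmetic; the NEXT_NORMALIO_DISTANCE value is illustrative, since the real constant lives in raid1.c and is not part of this hunk.

    #include <stdio.h>

    #define NEXT_NORMALIO_DISTANCE 3072ULL  /* illustrative, in sectors */

    int main(void)
    {
            unsigned long long next_resync = 10240;
            unsigned long long start_next_window =
                    next_resync + NEXT_NORMALIO_DISTANCE;
            unsigned long long end_window =
                    start_next_window + NEXT_NORMALIO_DISTANCE;

            /* Normal IO in [start_next_window, end_window) is counted in
             * current_window_requests; IO at or past end_window is
             * counted in next_window_requests. */
            printf("resync at %llu, current window [%llu, %llu)\n",
                   next_resync, start_next_window, end_window);
            return 0;
    }
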
@@ -65,6 +78,7 @@ struct r1conf {
65 int nr_waiting; 78 int nr_waiting;
66 int nr_queued; 79 int nr_queued;
67 int barrier; 80 int barrier;
81 int array_frozen;
68 82
69 /* Set to 1 if a full sync is needed, (fresh device added). 83 /* Set to 1 if a full sync is needed, (fresh device added).
70 * Cleared when a sync completes. 84 * Cleared when a sync completes.
@@ -111,6 +125,7 @@ struct r1bio {
111 * in this BehindIO request 125 * in this BehindIO request
112 */ 126 */
113 sector_t sector; 127 sector_t sector;
128 sector_t start_next_window;
114 int sectors; 129 int sectors;
115 unsigned long state; 130 unsigned long state;
116 struct mddev *mddev; 131 struct mddev *mddev;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7c3508abb5e1..c504e8389e69 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4384,7 +4384,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4384 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4384 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4385 md_wakeup_thread(mddev->thread); 4385 md_wakeup_thread(mddev->thread);
4386 wait_event(mddev->sb_wait, mddev->flags == 0 || 4386 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4387 kthread_should_stop()); 4387 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4388 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4389 allow_barrier(conf);
4390 return sectors_done;
4391 }
4388 conf->reshape_safe = mddev->reshape_position; 4392 conf->reshape_safe = mddev->reshape_position;
4389 allow_barrier(conf); 4393 allow_barrier(conf);
4390 } 4394 }
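
Here, and again in the raid5 reshape hunks below, kthread_should_stop() is replaced by a test of MD_RECOVERY_INTR, so an interrupted reshape backs out with its progress instead of relying on the thread being stopped mid-wait. A reduced model of the control flow, with the md machinery collapsed to an enum:

    #include <stdio.h>

    enum wakeup { SB_WRITTEN, RECOVERY_INTR };

    /* Model of the checkpoint: the reshape thread sleeps until either
     * the superblock write completes or recovery is interrupted, and in
     * the second case it drops its barrier and reports progress so far. */
    static long checkpoint(enum wakeup ev, long sectors_done)
    {
            if (ev == RECOVERY_INTR) {
                    /* allow_barrier(conf) in the real code */
                    return sectors_done;
            }
            /* conf->reshape_safe = mddev->reshape_position; keep going */
            return -1;      /* sentinel: continue reshaping */
    }

    int main(void)
    {
            printf("interrupted after %ld sectors\n",
                   checkpoint(RECOVERY_INTR, 512));
            return 0;
    }
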
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7f0e17a27aeb..cc055da02e2a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
85 return &conf->stripe_hashtbl[hash]; 85 return &conf->stripe_hashtbl[hash];
86} 86}
87 87
88static inline int stripe_hash_locks_hash(sector_t sect)
89{
90 return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
91}
92
93static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
94{
95 spin_lock_irq(conf->hash_locks + hash);
96 spin_lock(&conf->device_lock);
97}
98
99static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
100{
101 spin_unlock(&conf->device_lock);
102 spin_unlock_irq(conf->hash_locks + hash);
103}
104
105static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
106{
107 int i;
108 local_irq_disable();
109 spin_lock(conf->hash_locks);
110 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
111 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
112 spin_lock(&conf->device_lock);
113}
114
115static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
116{
117 int i;
118 spin_unlock(&conf->device_lock);
119 for (i = NR_STRIPE_HASH_LOCKS; i; i--)
120 spin_unlock(conf->hash_locks + i - 1);
121 local_irq_enable();
122}
123
88/* bio's attached to a stripe+device for I/O are linked together in bi_sector 124/* bio's attached to a stripe+device for I/O are linked together in bi_sector
89 * order without overlap. There may be several bio's per stripe+device, and 125 * order without overlap. There may be several bio's per stripe+device, and
90 * a bio could span several devices. 126 * a bio could span several devices.
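
A quick model of how sectors map onto the hash locks just introduced. STRIPE_SHIFT is PAGE_SHIFT - 9 in the kernel; the value 3 below assumes 4K pages.

    /* Model of stripe_hash_locks_hash(): consecutive stripes spread
     * round-robin over NR_STRIPE_HASH_LOCKS buckets. */
    #include <stdio.h>

    #define STRIPE_SHIFT 3
    #define NR_STRIPE_HASH_LOCKS 8
    #define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)

    static int stripe_hash_locks_hash(unsigned long long sect)
    {
            return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
    }

    int main(void)
    {
            unsigned long long sect;

            /* Each 8-sector stripe lands on the next bucket in turn. */
            for (sect = 0; sect < 64; sect += 8)
                    printf("sector %2llu -> hash %d\n",
                           sect, stripe_hash_locks_hash(sect));
            return 0;
    }

Note that lock_all_device_hash_locks_irq() above takes the buckets in ascending order with hash_locks[0] as the lockdep reference, so the take-everything path has one consistent order against lock_device_hash_lock().
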
@@ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
249 } 285 }
250} 286}
251 287
252static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) 288static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
289 struct list_head *temp_inactive_list)
253{ 290{
254 BUG_ON(!list_empty(&sh->lru)); 291 BUG_ON(!list_empty(&sh->lru));
255 BUG_ON(atomic_read(&conf->active_stripes)==0); 292 BUG_ON(atomic_read(&conf->active_stripes)==0);
@@ -278,23 +315,68 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
278 < IO_THRESHOLD) 315 < IO_THRESHOLD)
279 md_wakeup_thread(conf->mddev->thread); 316 md_wakeup_thread(conf->mddev->thread);
280 atomic_dec(&conf->active_stripes); 317 atomic_dec(&conf->active_stripes);
281 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 318 if (!test_bit(STRIPE_EXPANDING, &sh->state))
282 list_add_tail(&sh->lru, &conf->inactive_list); 319 list_add_tail(&sh->lru, temp_inactive_list);
283 wake_up(&conf->wait_for_stripe);
284 if (conf->retry_read_aligned)
285 md_wakeup_thread(conf->mddev->thread);
286 }
287 } 320 }
288} 321}
289 322
290static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 323static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
324 struct list_head *temp_inactive_list)
291{ 325{
292 if (atomic_dec_and_test(&sh->count)) 326 if (atomic_dec_and_test(&sh->count))
293 do_release_stripe(conf, sh); 327 do_release_stripe(conf, sh, temp_inactive_list);
328}
329
330/*
331 * @hash may be NR_STRIPE_HASH_LOCKS, in which case temp_inactive_list
332 * holds one staging list per hash bucket.
333 * Be careful: only one task can add/delete stripes from temp_inactive_list
334 * at a given time. Adding stripes only takes the device lock, while
335 * deleting stripes only takes the hash lock.
336 */
337static void release_inactive_stripe_list(struct r5conf *conf,
338 struct list_head *temp_inactive_list,
339 int hash)
340{
341 int size;
342 bool do_wakeup = false;
343 unsigned long flags;
344
345 if (hash == NR_STRIPE_HASH_LOCKS) {
346 size = NR_STRIPE_HASH_LOCKS;
347 hash = NR_STRIPE_HASH_LOCKS - 1;
348 } else
349 size = 1;
350 while (size) {
351 struct list_head *list = &temp_inactive_list[size - 1];
352
353 /*
354 * We don't hold any lock here yet, so get_active_stripe() might
355 * remove stripes from the list
356 */
357 if (!list_empty_careful(list)) {
358 spin_lock_irqsave(conf->hash_locks + hash, flags);
359 if (list_empty(conf->inactive_list + hash) &&
360 !list_empty(list))
361 atomic_dec(&conf->empty_inactive_list_nr);
362 list_splice_tail_init(list, conf->inactive_list + hash);
363 do_wakeup = true;
364 spin_unlock_irqrestore(conf->hash_locks + hash, flags);
365 }
366 size--;
367 hash--;
368 }
369
370 if (do_wakeup) {
371 wake_up(&conf->wait_for_stripe);
372 if (conf->retry_read_aligned)
373 md_wakeup_thread(conf->mddev->thread);
374 }
294} 375}
295 376
296/* should hold conf->device_lock already */ 377/* should hold conf->device_lock already */
297static int release_stripe_list(struct r5conf *conf) 378static int release_stripe_list(struct r5conf *conf,
379 struct list_head *temp_inactive_list)
298{ 380{
299 struct stripe_head *sh; 381 struct stripe_head *sh;
300 int count = 0; 382 int count = 0;
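
The comment above states the ownership rule; the splice step itself is worth seeing in isolation. Below is a sketch with the lists reduced to counters, relying on the convention visible in this patch that empty_inactive_list_nr counts how many shared inactive lists are currently empty.

    #include <stdio.h>

    #define NR_STRIPE_HASH_LOCKS 8

    static int inactive[NR_STRIPE_HASH_LOCKS]; /* conf->inactive_list */
    static int empty_inactive_list_nr = NR_STRIPE_HASH_LOCKS;

    /* Model of release_inactive_stripe_list(): drain each per-bucket
     * staging list into the shared list under only that bucket's hash
     * lock, tracking the empty -> non-empty transition. */
    static int release_inactive(int *temp)
    {
            int hash, do_wakeup = 0;

            for (hash = NR_STRIPE_HASH_LOCKS - 1; hash >= 0; hash--) {
                    if (!temp[hash])
                            continue;
                    /* spin_lock_irqsave(conf->hash_locks + hash, flags); */
                    if (!inactive[hash])
                            empty_inactive_list_nr--;
                    inactive[hash] += temp[hash]; /* list_splice_tail_init */
                    temp[hash] = 0;
                    do_wakeup = 1;
                    /* spin_unlock_irqrestore(...); */
            }
            return do_wakeup;       /* caller wakes wait_for_stripe */
    }

    int main(void)
    {
            int temp[NR_STRIPE_HASH_LOCKS] = { [2] = 3, [5] = 1 };

            if (release_inactive(temp))
                    printf("non-empty lists: %d\n",
                           NR_STRIPE_HASH_LOCKS - empty_inactive_list_nr);
            return 0;
    }
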
@@ -303,6 +385,8 @@ static int release_stripe_list(struct r5conf *conf)
303 head = llist_del_all(&conf->released_stripes); 385 head = llist_del_all(&conf->released_stripes);
304 head = llist_reverse_order(head); 386 head = llist_reverse_order(head);
305 while (head) { 387 while (head) {
388 int hash;
389
306 sh = llist_entry(head, struct stripe_head, release_list); 390 sh = llist_entry(head, struct stripe_head, release_list);
307 head = llist_next(head); 391 head = llist_next(head);
308 /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */ 392 /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
@@ -313,7 +397,8 @@ static int release_stripe_list(struct r5conf *conf)
313 * again, the count is always > 1. This is true for 397 * again, the count is always > 1. This is true for
314 * STRIPE_ON_UNPLUG_LIST bit too. 398 * STRIPE_ON_UNPLUG_LIST bit too.
315 */ 399 */
316 __release_stripe(conf, sh); 400 hash = sh->hash_lock_index;
401 __release_stripe(conf, sh, &temp_inactive_list[hash]);
317 count++; 402 count++;
318 } 403 }
319 404
@@ -324,9 +409,12 @@ static void release_stripe(struct stripe_head *sh)
324{ 409{
325 struct r5conf *conf = sh->raid_conf; 410 struct r5conf *conf = sh->raid_conf;
326 unsigned long flags; 411 unsigned long flags;
412 struct list_head list;
413 int hash;
327 bool wakeup; 414 bool wakeup;
328 415
329 if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) 416 if (unlikely(!conf->mddev->thread) ||
417 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
330 goto slow_path; 418 goto slow_path;
331 wakeup = llist_add(&sh->release_list, &conf->released_stripes); 419 wakeup = llist_add(&sh->release_list, &conf->released_stripes);
332 if (wakeup) 420 if (wakeup)
@@ -336,8 +424,11 @@ slow_path:
336 local_irq_save(flags); 424 local_irq_save(flags);
337 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 425 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
338 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 426 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
339 do_release_stripe(conf, sh); 427 INIT_LIST_HEAD(&list);
428 hash = sh->hash_lock_index;
429 do_release_stripe(conf, sh, &list);
340 spin_unlock(&conf->device_lock); 430 spin_unlock(&conf->device_lock);
431 release_inactive_stripe_list(conf, &list, hash);
341 } 432 }
342 local_irq_restore(flags); 433 local_irq_restore(flags);
343} 434}
@@ -362,18 +453,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
362 453
363 454
364/* find an idle stripe, make sure it is unhashed, and return it. */ 455/* find an idle stripe, make sure it is unhashed, and return it. */
365static struct stripe_head *get_free_stripe(struct r5conf *conf) 456static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
366{ 457{
367 struct stripe_head *sh = NULL; 458 struct stripe_head *sh = NULL;
368 struct list_head *first; 459 struct list_head *first;
369 460
370 if (list_empty(&conf->inactive_list)) 461 if (list_empty(conf->inactive_list + hash))
371 goto out; 462 goto out;
372 first = conf->inactive_list.next; 463 first = (conf->inactive_list + hash)->next;
373 sh = list_entry(first, struct stripe_head, lru); 464 sh = list_entry(first, struct stripe_head, lru);
374 list_del_init(first); 465 list_del_init(first);
375 remove_hash(sh); 466 remove_hash(sh);
376 atomic_inc(&conf->active_stripes); 467 atomic_inc(&conf->active_stripes);
468 BUG_ON(hash != sh->hash_lock_index);
469 if (list_empty(conf->inactive_list + hash))
470 atomic_inc(&conf->empty_inactive_list_nr);
377out: 471out:
378 return sh; 472 return sh;
379} 473}
@@ -416,7 +510,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
416static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 510static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
417{ 511{
418 struct r5conf *conf = sh->raid_conf; 512 struct r5conf *conf = sh->raid_conf;
419 int i; 513 int i, seq;
420 514
421 BUG_ON(atomic_read(&sh->count) != 0); 515 BUG_ON(atomic_read(&sh->count) != 0);
422 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 516 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
@@ -426,7 +520,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
426 (unsigned long long)sh->sector); 520 (unsigned long long)sh->sector);
427 521
428 remove_hash(sh); 522 remove_hash(sh);
429 523retry:
524 seq = read_seqcount_begin(&conf->gen_lock);
430 sh->generation = conf->generation - previous; 525 sh->generation = conf->generation - previous;
431 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 526 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
432 sh->sector = sector; 527 sh->sector = sector;
@@ -448,6 +543,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
448 dev->flags = 0; 543 dev->flags = 0;
449 raid5_build_block(sh, i, previous); 544 raid5_build_block(sh, i, previous);
450 } 545 }
546 if (read_seqcount_retry(&conf->gen_lock, seq))
547 goto retry;
451 insert_hash(conf, sh); 548 insert_hash(conf, sh);
452 sh->cpu = smp_processor_id(); 549 sh->cpu = smp_processor_id();
453} 550}
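
init_stripe() now snapshots conf->gen_lock and redoes the initialisation if a reshape changed the generation underneath it. Here is the read side of that pattern, modeled with a bare sequence counter; the kernel's seqcount_t additionally provides the memory barriers and lockdep tracking omitted here.

    #include <stdio.h>

    static unsigned gen_seq;        /* stands in for conf->gen_lock */
    static int generation;          /* stands in for conf->generation */

    /* Readers retry if a write was in progress or the count changed. */
    static unsigned read_begin(void) { return gen_seq & ~1u; }
    static int read_retry(unsigned seq) { return gen_seq != seq; }

    static int init_stripe_generation(void)
    {
            unsigned seq;
            int gen;

    retry:
            seq = read_begin();
            gen = generation;       /* ... plus disks, sector, etc. */
            if (read_retry(seq))
                    goto retry;
            return gen;
    }

    int main(void)
    {
            generation = 4;
            printf("stripe generation %d\n", init_stripe_generation());
            return 0;
    }
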
@@ -552,57 +649,59 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
552 int previous, int noblock, int noquiesce) 649 int previous, int noblock, int noquiesce)
553{ 650{
554 struct stripe_head *sh; 651 struct stripe_head *sh;
652 int hash = stripe_hash_locks_hash(sector);
555 653
556 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 654 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
557 655
558 spin_lock_irq(&conf->device_lock); 656 spin_lock_irq(conf->hash_locks + hash);
559 657
560 do { 658 do {
561 wait_event_lock_irq(conf->wait_for_stripe, 659 wait_event_lock_irq(conf->wait_for_stripe,
562 conf->quiesce == 0 || noquiesce, 660 conf->quiesce == 0 || noquiesce,
563 conf->device_lock); 661 *(conf->hash_locks + hash));
564 sh = __find_stripe(conf, sector, conf->generation - previous); 662 sh = __find_stripe(conf, sector, conf->generation - previous);
565 if (!sh) { 663 if (!sh) {
566 if (!conf->inactive_blocked) 664 if (!conf->inactive_blocked)
567 sh = get_free_stripe(conf); 665 sh = get_free_stripe(conf, hash);
568 if (noblock && sh == NULL) 666 if (noblock && sh == NULL)
569 break; 667 break;
570 if (!sh) { 668 if (!sh) {
571 conf->inactive_blocked = 1; 669 conf->inactive_blocked = 1;
572 wait_event_lock_irq(conf->wait_for_stripe, 670 wait_event_lock_irq(
573 !list_empty(&conf->inactive_list) && 671 conf->wait_for_stripe,
574 (atomic_read(&conf->active_stripes) 672 !list_empty(conf->inactive_list + hash) &&
575 < (conf->max_nr_stripes *3/4) 673 (atomic_read(&conf->active_stripes)
576 || !conf->inactive_blocked), 674 < (conf->max_nr_stripes * 3 / 4)
577 conf->device_lock); 675 || !conf->inactive_blocked),
676 *(conf->hash_locks + hash));
578 conf->inactive_blocked = 0; 677 conf->inactive_blocked = 0;
579 } else 678 } else
580 init_stripe(sh, sector, previous); 679 init_stripe(sh, sector, previous);
581 } else { 680 } else {
681 spin_lock(&conf->device_lock);
582 if (atomic_read(&sh->count)) { 682 if (atomic_read(&sh->count)) {
583 BUG_ON(!list_empty(&sh->lru) 683 BUG_ON(!list_empty(&sh->lru)
584 && !test_bit(STRIPE_EXPANDING, &sh->state) 684 && !test_bit(STRIPE_EXPANDING, &sh->state)
585 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) 685 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
586 && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); 686 );
587 } else { 687 } else {
588 if (!test_bit(STRIPE_HANDLE, &sh->state)) 688 if (!test_bit(STRIPE_HANDLE, &sh->state))
589 atomic_inc(&conf->active_stripes); 689 atomic_inc(&conf->active_stripes);
590 if (list_empty(&sh->lru) && 690 BUG_ON(list_empty(&sh->lru));
591 !test_bit(STRIPE_EXPANDING, &sh->state))
592 BUG();
593 list_del_init(&sh->lru); 691 list_del_init(&sh->lru);
594 if (sh->group) { 692 if (sh->group) {
595 sh->group->stripes_cnt--; 693 sh->group->stripes_cnt--;
596 sh->group = NULL; 694 sh->group = NULL;
597 } 695 }
598 } 696 }
697 spin_unlock(&conf->device_lock);
599 } 698 }
600 } while (sh == NULL); 699 } while (sh == NULL);
601 700
602 if (sh) 701 if (sh)
603 atomic_inc(&sh->count); 702 atomic_inc(&sh->count);
604 703
605 spin_unlock_irq(&conf->device_lock); 704 spin_unlock_irq(conf->hash_locks + hash);
606 return sh; 705 return sh;
607} 706}
608 707
@@ -758,7 +857,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
758 bi->bi_sector = (sh->sector 857 bi->bi_sector = (sh->sector
759 + rdev->data_offset); 858 + rdev->data_offset);
760 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 859 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
761 bi->bi_rw |= REQ_FLUSH; 860 bi->bi_rw |= REQ_NOMERGE;
762 861
763 bi->bi_vcnt = 1; 862 bi->bi_vcnt = 1;
764 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 863 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -1582,7 +1681,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1582 put_cpu(); 1681 put_cpu();
1583} 1682}
1584 1683
1585static int grow_one_stripe(struct r5conf *conf) 1684static int grow_one_stripe(struct r5conf *conf, int hash)
1586{ 1685{
1587 struct stripe_head *sh; 1686 struct stripe_head *sh;
1588 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); 1687 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
@@ -1598,6 +1697,7 @@ static int grow_one_stripe(struct r5conf *conf)
1598 kmem_cache_free(conf->slab_cache, sh); 1697 kmem_cache_free(conf->slab_cache, sh);
1599 return 0; 1698 return 0;
1600 } 1699 }
1700 sh->hash_lock_index = hash;
1601 /* we just created an active stripe so... */ 1701 /* we just created an active stripe so... */
1602 atomic_set(&sh->count, 1); 1702 atomic_set(&sh->count, 1);
1603 atomic_inc(&conf->active_stripes); 1703 atomic_inc(&conf->active_stripes);
@@ -1610,6 +1710,7 @@ static int grow_stripes(struct r5conf *conf, int num)
1610{ 1710{
1611 struct kmem_cache *sc; 1711 struct kmem_cache *sc;
1612 int devs = max(conf->raid_disks, conf->previous_raid_disks); 1712 int devs = max(conf->raid_disks, conf->previous_raid_disks);
1713 int hash;
1613 1714
1614 if (conf->mddev->gendisk) 1715 if (conf->mddev->gendisk)
1615 sprintf(conf->cache_name[0], 1716 sprintf(conf->cache_name[0],
@@ -1627,9 +1728,13 @@ static int grow_stripes(struct r5conf *conf, int num)
1627 return 1; 1728 return 1;
1628 conf->slab_cache = sc; 1729 conf->slab_cache = sc;
1629 conf->pool_size = devs; 1730 conf->pool_size = devs;
1630 while (num--) 1731 hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
1631 if (!grow_one_stripe(conf)) 1732 while (num--) {
1733 if (!grow_one_stripe(conf, hash))
1632 return 1; 1734 return 1;
1735 conf->max_nr_stripes++;
1736 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
1737 }
1633 return 0; 1738 return 0;
1634} 1739}
1635 1740
@@ -1687,6 +1792,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1687 int err; 1792 int err;
1688 struct kmem_cache *sc; 1793 struct kmem_cache *sc;
1689 int i; 1794 int i;
1795 int hash, cnt;
1690 1796
1691 if (newsize <= conf->pool_size) 1797 if (newsize <= conf->pool_size)
1692 return 0; /* never bother to shrink */ 1798 return 0; /* never bother to shrink */
@@ -1726,19 +1832,29 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1726 * OK, we have enough stripes, start collecting inactive 1832 * OK, we have enough stripes, start collecting inactive
1727 * stripes and copying them over 1833 * stripes and copying them over
1728 */ 1834 */
1835 hash = 0;
1836 cnt = 0;
1729 list_for_each_entry(nsh, &newstripes, lru) { 1837 list_for_each_entry(nsh, &newstripes, lru) {
1730 spin_lock_irq(&conf->device_lock); 1838 lock_device_hash_lock(conf, hash);
1731 wait_event_lock_irq(conf->wait_for_stripe, 1839 wait_event_cmd(conf->wait_for_stripe,
1732 !list_empty(&conf->inactive_list), 1840 !list_empty(conf->inactive_list + hash),
1733 conf->device_lock); 1841 unlock_device_hash_lock(conf, hash),
1734 osh = get_free_stripe(conf); 1842 lock_device_hash_lock(conf, hash));
1735 spin_unlock_irq(&conf->device_lock); 1843 osh = get_free_stripe(conf, hash);
1844 unlock_device_hash_lock(conf, hash);
1736 atomic_set(&nsh->count, 1); 1845 atomic_set(&nsh->count, 1);
1737 for(i=0; i<conf->pool_size; i++) 1846 for(i=0; i<conf->pool_size; i++)
1738 nsh->dev[i].page = osh->dev[i].page; 1847 nsh->dev[i].page = osh->dev[i].page;
1739 for( ; i<newsize; i++) 1848 for( ; i<newsize; i++)
1740 nsh->dev[i].page = NULL; 1849 nsh->dev[i].page = NULL;
1850 nsh->hash_lock_index = hash;
1741 kmem_cache_free(conf->slab_cache, osh); 1851 kmem_cache_free(conf->slab_cache, osh);
1852 cnt++;
1853 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
1854 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
1855 hash++;
1856 cnt = 0;
1857 }
1742 } 1858 }
1743 kmem_cache_destroy(conf->slab_cache); 1859 kmem_cache_destroy(conf->slab_cache);
1744 1860
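
The cnt/hash stepping above gives each hash bucket either floor(max_nr_stripes / NR_STRIPE_HASH_LOCKS) stripes or one more, with the remainder going to the lowest-numbered buckets. The quota expression is easy to verify standalone:

    #include <assert.h>
    #include <stdio.h>

    #define NR_STRIPE_HASH_LOCKS 8

    /* Buckets below the remainder get one extra stripe, so the quotas
     * sum to exactly max_nr_stripes. */
    static int bucket_quota(int max_nr_stripes, int hash)
    {
            return max_nr_stripes / NR_STRIPE_HASH_LOCKS +
                   !!((max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash);
    }

    int main(void)
    {
            int max_nr_stripes = 259;       /* illustrative value */
            int hash, total = 0;

            for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++) {
                    printf("hash %d: %d stripes\n",
                           hash, bucket_quota(max_nr_stripes, hash));
                    total += bucket_quota(max_nr_stripes, hash);
            }
            assert(total == max_nr_stripes); /* nothing lost or doubled */
            return 0;
    }
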
@@ -1797,13 +1913,13 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1797 return err; 1913 return err;
1798} 1914}
1799 1915
1800static int drop_one_stripe(struct r5conf *conf) 1916static int drop_one_stripe(struct r5conf *conf, int hash)
1801{ 1917{
1802 struct stripe_head *sh; 1918 struct stripe_head *sh;
1803 1919
1804 spin_lock_irq(&conf->device_lock); 1920 spin_lock_irq(conf->hash_locks + hash);
1805 sh = get_free_stripe(conf); 1921 sh = get_free_stripe(conf, hash);
1806 spin_unlock_irq(&conf->device_lock); 1922 spin_unlock_irq(conf->hash_locks + hash);
1807 if (!sh) 1923 if (!sh)
1808 return 0; 1924 return 0;
1809 BUG_ON(atomic_read(&sh->count)); 1925 BUG_ON(atomic_read(&sh->count));
@@ -1815,8 +1931,10 @@ static int drop_one_stripe(struct r5conf *conf)
1815 1931
1816static void shrink_stripes(struct r5conf *conf) 1932static void shrink_stripes(struct r5conf *conf)
1817{ 1933{
1818 while (drop_one_stripe(conf)) 1934 int hash;
1819 ; 1935 for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
1936 while (drop_one_stripe(conf, hash))
1937 ;
1820 1938
1821 if (conf->slab_cache) 1939 if (conf->slab_cache)
1822 kmem_cache_destroy(conf->slab_cache); 1940 kmem_cache_destroy(conf->slab_cache);
@@ -1921,6 +2039,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
1921 mdname(conf->mddev), bdn); 2039 mdname(conf->mddev), bdn);
1922 else 2040 else
1923 retry = 1; 2041 retry = 1;
2042 if (set_bad && test_bit(In_sync, &rdev->flags)
2043 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2044 retry = 1;
1924 if (retry) 2045 if (retry)
1925 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2046 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
1926 set_bit(R5_ReadError, &sh->dev[i].flags); 2047 set_bit(R5_ReadError, &sh->dev[i].flags);
@@ -3900,7 +4021,8 @@ static void raid5_activate_delayed(struct r5conf *conf)
3900 } 4021 }
3901} 4022}
3902 4023
3903static void activate_bit_delay(struct r5conf *conf) 4024static void activate_bit_delay(struct r5conf *conf,
4025 struct list_head *temp_inactive_list)
3904{ 4026{
3905 /* device_lock is held */ 4027 /* device_lock is held */
3906 struct list_head head; 4028 struct list_head head;
@@ -3908,9 +4030,11 @@ static void activate_bit_delay(struct r5conf *conf)
3908 list_del_init(&conf->bitmap_list); 4030 list_del_init(&conf->bitmap_list);
3909 while (!list_empty(&head)) { 4031 while (!list_empty(&head)) {
3910 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 4032 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
4033 int hash;
3911 list_del_init(&sh->lru); 4034 list_del_init(&sh->lru);
3912 atomic_inc(&sh->count); 4035 atomic_inc(&sh->count);
3913 __release_stripe(conf, sh); 4036 hash = sh->hash_lock_index;
4037 __release_stripe(conf, sh, &temp_inactive_list[hash]);
3914 } 4038 }
3915} 4039}
3916 4040
@@ -3926,7 +4050,7 @@ int md_raid5_congested(struct mddev *mddev, int bits)
3926 return 1; 4050 return 1;
3927 if (conf->quiesce) 4051 if (conf->quiesce)
3928 return 1; 4052 return 1;
3929 if (list_empty_careful(&conf->inactive_list)) 4053 if (atomic_read(&conf->empty_inactive_list_nr))
3930 return 1; 4054 return 1;
3931 4055
3932 return 0; 4056 return 0;
@@ -4256,6 +4380,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
4256struct raid5_plug_cb { 4380struct raid5_plug_cb {
4257 struct blk_plug_cb cb; 4381 struct blk_plug_cb cb;
4258 struct list_head list; 4382 struct list_head list;
4383 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
4259}; 4384};
4260 4385
4261static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 4386static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
@@ -4266,6 +4391,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4266 struct mddev *mddev = cb->cb.data; 4391 struct mddev *mddev = cb->cb.data;
4267 struct r5conf *conf = mddev->private; 4392 struct r5conf *conf = mddev->private;
4268 int cnt = 0; 4393 int cnt = 0;
4394 int hash;
4269 4395
4270 if (cb->list.next && !list_empty(&cb->list)) { 4396 if (cb->list.next && !list_empty(&cb->list)) {
4271 spin_lock_irq(&conf->device_lock); 4397 spin_lock_irq(&conf->device_lock);
@@ -4283,11 +4409,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4283 * STRIPE_ON_RELEASE_LIST could be set here. In that 4409 * STRIPE_ON_RELEASE_LIST could be set here. In that
4284 * case, the count is always > 1 here 4410 * case, the count is always > 1 here
4285 */ 4411 */
4286 __release_stripe(conf, sh); 4412 hash = sh->hash_lock_index;
4413 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
4287 cnt++; 4414 cnt++;
4288 } 4415 }
4289 spin_unlock_irq(&conf->device_lock); 4416 spin_unlock_irq(&conf->device_lock);
4290 } 4417 }
4418 release_inactive_stripe_list(conf, cb->temp_inactive_list,
4419 NR_STRIPE_HASH_LOCKS);
4291 if (mddev->queue) 4420 if (mddev->queue)
4292 trace_block_unplug(mddev->queue, cnt, !from_schedule); 4421 trace_block_unplug(mddev->queue, cnt, !from_schedule);
4293 kfree(cb); 4422 kfree(cb);
@@ -4308,8 +4437,12 @@ static void release_stripe_plug(struct mddev *mddev,
4308 4437
4309 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 4438 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
4310 4439
4311 if (cb->list.next == NULL) 4440 if (cb->list.next == NULL) {
4441 int i;
4312 INIT_LIST_HEAD(&cb->list); 4442 INIT_LIST_HEAD(&cb->list);
4443 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
4444 INIT_LIST_HEAD(cb->temp_inactive_list + i);
4445 }
4313 4446
4314 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 4447 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
4315 list_add_tail(&sh->lru, &cb->list); 4448 list_add_tail(&sh->lru, &cb->list);
@@ -4692,14 +4825,19 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4692 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4825 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4693 /* Cannot proceed until we've updated the superblock... */ 4826 /* Cannot proceed until we've updated the superblock... */
4694 wait_event(conf->wait_for_overlap, 4827 wait_event(conf->wait_for_overlap,
4695 atomic_read(&conf->reshape_stripes)==0); 4828 atomic_read(&conf->reshape_stripes)==0
4829 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4830 if (atomic_read(&conf->reshape_stripes) != 0)
4831 return 0;
4696 mddev->reshape_position = conf->reshape_progress; 4832 mddev->reshape_position = conf->reshape_progress;
4697 mddev->curr_resync_completed = sector_nr; 4833 mddev->curr_resync_completed = sector_nr;
4698 conf->reshape_checkpoint = jiffies; 4834 conf->reshape_checkpoint = jiffies;
4699 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4835 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4700 md_wakeup_thread(mddev->thread); 4836 md_wakeup_thread(mddev->thread);
4701 wait_event(mddev->sb_wait, mddev->flags == 0 || 4837 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4702 kthread_should_stop()); 4838 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4839 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4840 return 0;
4703 spin_lock_irq(&conf->device_lock); 4841 spin_lock_irq(&conf->device_lock);
4704 conf->reshape_safe = mddev->reshape_position; 4842 conf->reshape_safe = mddev->reshape_position;
4705 spin_unlock_irq(&conf->device_lock); 4843 spin_unlock_irq(&conf->device_lock);
@@ -4782,7 +4920,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4782 >= mddev->resync_max - mddev->curr_resync_completed) { 4920 >= mddev->resync_max - mddev->curr_resync_completed) {
4783 /* Cannot proceed until we've updated the superblock... */ 4921 /* Cannot proceed until we've updated the superblock... */
4784 wait_event(conf->wait_for_overlap, 4922 wait_event(conf->wait_for_overlap,
4785 atomic_read(&conf->reshape_stripes) == 0); 4923 atomic_read(&conf->reshape_stripes) == 0
4924 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4925 if (atomic_read(&conf->reshape_stripes) != 0)
4926 goto ret;
4786 mddev->reshape_position = conf->reshape_progress; 4927 mddev->reshape_position = conf->reshape_progress;
4787 mddev->curr_resync_completed = sector_nr; 4928 mddev->curr_resync_completed = sector_nr;
4788 conf->reshape_checkpoint = jiffies; 4929 conf->reshape_checkpoint = jiffies;
@@ -4790,13 +4931,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4790 md_wakeup_thread(mddev->thread); 4931 md_wakeup_thread(mddev->thread);
4791 wait_event(mddev->sb_wait, 4932 wait_event(mddev->sb_wait,
4792 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4933 !test_bit(MD_CHANGE_DEVS, &mddev->flags)
4793 || kthread_should_stop()); 4934 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4935 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4936 goto ret;
4794 spin_lock_irq(&conf->device_lock); 4937 spin_lock_irq(&conf->device_lock);
4795 conf->reshape_safe = mddev->reshape_position; 4938 conf->reshape_safe = mddev->reshape_position;
4796 spin_unlock_irq(&conf->device_lock); 4939 spin_unlock_irq(&conf->device_lock);
4797 wake_up(&conf->wait_for_overlap); 4940 wake_up(&conf->wait_for_overlap);
4798 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4941 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4799 } 4942 }
4943ret:
4800 return reshape_sectors; 4944 return reshape_sectors;
4801} 4945}
4802 4946
@@ -4954,27 +5098,45 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4954} 5098}
4955 5099
4956static int handle_active_stripes(struct r5conf *conf, int group, 5100static int handle_active_stripes(struct r5conf *conf, int group,
4957 struct r5worker *worker) 5101 struct r5worker *worker,
5102 struct list_head *temp_inactive_list)
4958{ 5103{
4959 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 5104 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
4960 int i, batch_size = 0; 5105 int i, batch_size = 0, hash;
5106 bool release_inactive = false;
4961 5107
4962 while (batch_size < MAX_STRIPE_BATCH && 5108 while (batch_size < MAX_STRIPE_BATCH &&
4963 (sh = __get_priority_stripe(conf, group)) != NULL) 5109 (sh = __get_priority_stripe(conf, group)) != NULL)
4964 batch[batch_size++] = sh; 5110 batch[batch_size++] = sh;
4965 5111
4966 if (batch_size == 0) 5112 if (batch_size == 0) {
4967 return batch_size; 5113 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5114 if (!list_empty(temp_inactive_list + i))
5115 break;
5116 if (i == NR_STRIPE_HASH_LOCKS)
5117 return batch_size;
5118 release_inactive = true;
5119 }
4968 spin_unlock_irq(&conf->device_lock); 5120 spin_unlock_irq(&conf->device_lock);
4969 5121
5122 release_inactive_stripe_list(conf, temp_inactive_list,
5123 NR_STRIPE_HASH_LOCKS);
5124
5125 if (release_inactive) {
5126 spin_lock_irq(&conf->device_lock);
5127 return 0;
5128 }
5129
4970 for (i = 0; i < batch_size; i++) 5130 for (i = 0; i < batch_size; i++)
4971 handle_stripe(batch[i]); 5131 handle_stripe(batch[i]);
4972 5132
4973 cond_resched(); 5133 cond_resched();
4974 5134
4975 spin_lock_irq(&conf->device_lock); 5135 spin_lock_irq(&conf->device_lock);
4976 for (i = 0; i < batch_size; i++) 5136 for (i = 0; i < batch_size; i++) {
4977 __release_stripe(conf, batch[i]); 5137 hash = batch[i]->hash_lock_index;
5138 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
5139 }
4978 return batch_size; 5140 return batch_size;
4979} 5141}
4980 5142
@@ -4995,9 +5157,10 @@ static void raid5_do_work(struct work_struct *work)
4995 while (1) { 5157 while (1) {
4996 int batch_size, released; 5158 int batch_size, released;
4997 5159
4998 released = release_stripe_list(conf); 5160 released = release_stripe_list(conf, worker->temp_inactive_list);
4999 5161
5000 batch_size = handle_active_stripes(conf, group_id, worker); 5162 batch_size = handle_active_stripes(conf, group_id, worker,
5163 worker->temp_inactive_list);
5001 worker->working = false; 5164 worker->working = false;
5002 if (!batch_size && !released) 5165 if (!batch_size && !released)
5003 break; 5166 break;
@@ -5036,7 +5199,7 @@ static void raid5d(struct md_thread *thread)
5036 struct bio *bio; 5199 struct bio *bio;
5037 int batch_size, released; 5200 int batch_size, released;
5038 5201
5039 released = release_stripe_list(conf); 5202 released = release_stripe_list(conf, conf->temp_inactive_list);
5040 5203
5041 if ( 5204 if (
5042 !list_empty(&conf->bitmap_list)) { 5205 !list_empty(&conf->bitmap_list)) {
@@ -5046,7 +5209,7 @@ static void raid5d(struct md_thread *thread)
5046 bitmap_unplug(mddev->bitmap); 5209 bitmap_unplug(mddev->bitmap);
5047 spin_lock_irq(&conf->device_lock); 5210 spin_lock_irq(&conf->device_lock);
5048 conf->seq_write = conf->seq_flush; 5211 conf->seq_write = conf->seq_flush;
5049 activate_bit_delay(conf); 5212 activate_bit_delay(conf, conf->temp_inactive_list);
5050 } 5213 }
5051 raid5_activate_delayed(conf); 5214 raid5_activate_delayed(conf);
5052 5215
@@ -5060,7 +5223,8 @@ static void raid5d(struct md_thread *thread)
5060 handled++; 5223 handled++;
5061 } 5224 }
5062 5225
5063 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); 5226 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
5227 conf->temp_inactive_list);
5064 if (!batch_size && !released) 5228 if (!batch_size && !released)
5065 break; 5229 break;
5066 handled += batch_size; 5230 handled += batch_size;
@@ -5096,22 +5260,29 @@ raid5_set_cache_size(struct mddev *mddev, int size)
5096{ 5260{
5097 struct r5conf *conf = mddev->private; 5261 struct r5conf *conf = mddev->private;
5098 int err; 5262 int err;
5263 int hash;
5099 5264
5100 if (size <= 16 || size > 32768) 5265 if (size <= 16 || size > 32768)
5101 return -EINVAL; 5266 return -EINVAL;
5267 hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
5102 while (size < conf->max_nr_stripes) { 5268 while (size < conf->max_nr_stripes) {
5103 if (drop_one_stripe(conf)) 5269 if (drop_one_stripe(conf, hash))
5104 conf->max_nr_stripes--; 5270 conf->max_nr_stripes--;
5105 else 5271 else
5106 break; 5272 break;
5273 hash--;
5274 if (hash < 0)
5275 hash = NR_STRIPE_HASH_LOCKS - 1;
5107 } 5276 }
5108 err = md_allow_write(mddev); 5277 err = md_allow_write(mddev);
5109 if (err) 5278 if (err)
5110 return err; 5279 return err;
5280 hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
5111 while (size > conf->max_nr_stripes) { 5281 while (size > conf->max_nr_stripes) {
5112 if (grow_one_stripe(conf)) 5282 if (grow_one_stripe(conf, hash))
5113 conf->max_nr_stripes++; 5283 conf->max_nr_stripes++;
5114 else break; 5284 else break;
5285 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
5115 } 5286 }
5116 return 0; 5287 return 0;
5117} 5288}
@@ -5199,15 +5370,18 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
5199 return 0; 5370 return 0;
5200} 5371}
5201 5372
5202static int alloc_thread_groups(struct r5conf *conf, int cnt); 5373static int alloc_thread_groups(struct r5conf *conf, int cnt,
5374 int *group_cnt,
5375 int *worker_cnt_per_group,
5376 struct r5worker_group **worker_groups);
5203static ssize_t 5377static ssize_t
5204raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 5378raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
5205{ 5379{
5206 struct r5conf *conf = mddev->private; 5380 struct r5conf *conf = mddev->private;
5207 unsigned long new; 5381 unsigned long new;
5208 int err; 5382 int err;
5209 struct r5worker_group *old_groups; 5383 struct r5worker_group *new_groups, *old_groups;
5210 int old_group_cnt; 5384 int group_cnt, worker_cnt_per_group;
5211 5385
5212 if (len >= PAGE_SIZE) 5386 if (len >= PAGE_SIZE)
5213 return -EINVAL; 5387 return -EINVAL;
@@ -5223,14 +5397,19 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
5223 mddev_suspend(mddev); 5397 mddev_suspend(mddev);
5224 5398
5225 old_groups = conf->worker_groups; 5399 old_groups = conf->worker_groups;
5226 old_group_cnt = conf->worker_cnt_per_group; 5400 if (old_groups)
5401 flush_workqueue(raid5_wq);
5402
5403 err = alloc_thread_groups(conf, new,
5404 &group_cnt, &worker_cnt_per_group,
5405 &new_groups);
5406 if (!err) {
5407 spin_lock_irq(&conf->device_lock);
5408 conf->group_cnt = group_cnt;
5409 conf->worker_cnt_per_group = worker_cnt_per_group;
5410 conf->worker_groups = new_groups;
5411 spin_unlock_irq(&conf->device_lock);
5227 5412
5228 conf->worker_groups = NULL;
5229 err = alloc_thread_groups(conf, new);
5230 if (err) {
5231 conf->worker_groups = old_groups;
5232 conf->worker_cnt_per_group = old_group_cnt;
5233 } else {
5234 if (old_groups) 5413 if (old_groups)
5235 kfree(old_groups[0].workers); 5414 kfree(old_groups[0].workers);
5236 kfree(old_groups); 5415 kfree(old_groups);
@@ -5260,40 +5439,47 @@ static struct attribute_group raid5_attrs_group = {
5260 .attrs = raid5_attrs, 5439 .attrs = raid5_attrs,
5261}; 5440};
5262 5441
5263static int alloc_thread_groups(struct r5conf *conf, int cnt) 5442static int alloc_thread_groups(struct r5conf *conf, int cnt,
5443 int *group_cnt,
5444 int *worker_cnt_per_group,
5445 struct r5worker_group **worker_groups)
5264{ 5446{
5265 int i, j; 5447 int i, j, k;
5266 ssize_t size; 5448 ssize_t size;
5267 struct r5worker *workers; 5449 struct r5worker *workers;
5268 5450
5269 conf->worker_cnt_per_group = cnt; 5451 *worker_cnt_per_group = cnt;
5270 if (cnt == 0) { 5452 if (cnt == 0) {
5271 conf->worker_groups = NULL; 5453 *group_cnt = 0;
5454 *worker_groups = NULL;
5272 return 0; 5455 return 0;
5273 } 5456 }
5274 conf->group_cnt = num_possible_nodes(); 5457 *group_cnt = num_possible_nodes();
5275 size = sizeof(struct r5worker) * cnt; 5458 size = sizeof(struct r5worker) * cnt;
5276 workers = kzalloc(size * conf->group_cnt, GFP_NOIO); 5459 workers = kzalloc(size * *group_cnt, GFP_NOIO);
5277 conf->worker_groups = kzalloc(sizeof(struct r5worker_group) * 5460 *worker_groups = kzalloc(sizeof(struct r5worker_group) *
5278 conf->group_cnt, GFP_NOIO); 5461 *group_cnt, GFP_NOIO);
5279 if (!conf->worker_groups || !workers) { 5462 if (!*worker_groups || !workers) {
5280 kfree(workers); 5463 kfree(workers);
5281 kfree(conf->worker_groups); 5464 kfree(*worker_groups);
5282 conf->worker_groups = NULL;
5283 return -ENOMEM; 5465 return -ENOMEM;
5284 } 5466 }
5285 5467
5286 for (i = 0; i < conf->group_cnt; i++) { 5468 for (i = 0; i < *group_cnt; i++) {
5287 struct r5worker_group *group; 5469 struct r5worker_group *group;
5288 5470
5289 group = &conf->worker_groups[i]; 5471 group = &(*worker_groups)[i];
5290 INIT_LIST_HEAD(&group->handle_list); 5472 INIT_LIST_HEAD(&group->handle_list);
5291 group->conf = conf; 5473 group->conf = conf;
5292 group->workers = workers + i * cnt; 5474 group->workers = workers + i * cnt;
5293 5475
5294 for (j = 0; j < cnt; j++) { 5476 for (j = 0; j < cnt; j++) {
5295 group->workers[j].group = group; 5477 struct r5worker *worker = group->workers + j;
5296 INIT_WORK(&group->workers[j].work, raid5_do_work); 5478 worker->group = group;
5479 INIT_WORK(&worker->work, raid5_do_work);
5480
5481 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
5482 INIT_LIST_HEAD(worker->temp_inactive_list + k);
5297 } 5483 }
5298 } 5484 }
5299 5485
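
The reworked alloc_thread_groups() returns the new groups through out-parameters instead of writing conf directly, which is what lets raid5_store_group_thread_cnt() above allocate with no locks held and publish under device_lock. A simplified model of that allocate-then-swap shape; types and lifetime rules are reduced, and in the kernel freeing the old groups is only safe because the array is suspended and the workqueue flushed first.

    #include <pthread.h>
    #include <stdlib.h>

    struct groups { int cnt; };

    static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct groups *worker_groups;

    static int store_group_cnt(int cnt)
    {
            struct groups *new_groups, *old_groups;

            new_groups = malloc(sizeof(*new_groups)); /* alloc unlocked */
            if (!new_groups)
                    return -1;      /* old configuration stays in place */
            new_groups->cnt = cnt;

            pthread_mutex_lock(&device_lock);
            old_groups = worker_groups;
            worker_groups = new_groups;     /* publish under the lock */
            pthread_mutex_unlock(&device_lock);

            /* Safe here only because no worker can still see old_groups
             * (suspended array + flushed workqueue in the real code). */
            free(old_groups);
            return 0;
    }

    int main(void) { return store_group_cnt(4); }
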
@@ -5444,6 +5630,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5444 struct md_rdev *rdev; 5630 struct md_rdev *rdev;
5445 struct disk_info *disk; 5631 struct disk_info *disk;
5446 char pers_name[6]; 5632 char pers_name[6];
5633 int i;
5634 int group_cnt, worker_cnt_per_group;
5635 struct r5worker_group *new_group;
5447 5636
5448 if (mddev->new_level != 5 5637 if (mddev->new_level != 5
5449 && mddev->new_level != 4 5638 && mddev->new_level != 4
@@ -5478,7 +5667,12 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5478 if (conf == NULL) 5667 if (conf == NULL)
5479 goto abort; 5668 goto abort;
5480 /* Don't enable multi-threading by default*/ 5669 /* Don't enable multi-threading by default*/
5481 if (alloc_thread_groups(conf, 0)) 5670 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
5671 &new_group)) {
5672 conf->group_cnt = group_cnt;
5673 conf->worker_cnt_per_group = worker_cnt_per_group;
5674 conf->worker_groups = new_group;
5675 } else
5482 goto abort; 5676 goto abort;
5483 spin_lock_init(&conf->device_lock); 5677 spin_lock_init(&conf->device_lock);
5484 seqcount_init(&conf->gen_lock); 5678 seqcount_init(&conf->gen_lock);
@@ -5488,7 +5682,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5488 INIT_LIST_HEAD(&conf->hold_list); 5682 INIT_LIST_HEAD(&conf->hold_list);
5489 INIT_LIST_HEAD(&conf->delayed_list); 5683 INIT_LIST_HEAD(&conf->delayed_list);
5490 INIT_LIST_HEAD(&conf->bitmap_list); 5684 INIT_LIST_HEAD(&conf->bitmap_list);
5491 INIT_LIST_HEAD(&conf->inactive_list);
5492 init_llist_head(&conf->released_stripes); 5685 init_llist_head(&conf->released_stripes);
5493 atomic_set(&conf->active_stripes, 0); 5686 atomic_set(&conf->active_stripes, 0);
5494 atomic_set(&conf->preread_active_stripes, 0); 5687 atomic_set(&conf->preread_active_stripes, 0);
@@ -5514,6 +5707,21 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5514 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 5707 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
5515 goto abort; 5708 goto abort;
5516 5709
5710 /* We init hash_locks[0] separately so that it can be used
5711 * as the reference lock in the spin_lock_nest_lock() call
5712 * in lock_all_device_hash_locks_irq in order to convince
5713 * lockdep that we know what we are doing.
5714 */
5715 spin_lock_init(conf->hash_locks);
5716 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
5717 spin_lock_init(conf->hash_locks + i);
5718
5719 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5720 INIT_LIST_HEAD(conf->inactive_list + i);
5721
5722 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5723 INIT_LIST_HEAD(conf->temp_inactive_list + i);
5724
5517 conf->level = mddev->new_level; 5725 conf->level = mddev->new_level;
5518 if (raid5_alloc_percpu(conf) != 0) 5726 if (raid5_alloc_percpu(conf) != 0)
5519 goto abort; 5727 goto abort;
@@ -5554,7 +5762,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5554 else 5762 else
5555 conf->max_degraded = 1; 5763 conf->max_degraded = 1;
5556 conf->algorithm = mddev->new_layout; 5764 conf->algorithm = mddev->new_layout;
5557 conf->max_nr_stripes = NR_STRIPES;
5558 conf->reshape_progress = mddev->reshape_position; 5765 conf->reshape_progress = mddev->reshape_position;
5559 if (conf->reshape_progress != MaxSector) { 5766 if (conf->reshape_progress != MaxSector) {
5560 conf->prev_chunk_sectors = mddev->chunk_sectors; 5767 conf->prev_chunk_sectors = mddev->chunk_sectors;
@@ -5563,7 +5770,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5563 5770
5564 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 5771 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
5565 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 5772 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
5566 if (grow_stripes(conf, conf->max_nr_stripes)) { 5773 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
5774 if (grow_stripes(conf, NR_STRIPES)) {
5567 printk(KERN_ERR 5775 printk(KERN_ERR
5568 "md/raid:%s: couldn't allocate %dkB for buffers\n", 5776 "md/raid:%s: couldn't allocate %dkB for buffers\n",
5569 mdname(mddev), memory); 5777 mdname(mddev), memory);
@@ -6369,12 +6577,18 @@ static int raid5_start_reshape(struct mddev *mddev)
6369 if (!mddev->sync_thread) { 6577 if (!mddev->sync_thread) {
6370 mddev->recovery = 0; 6578 mddev->recovery = 0;
6371 spin_lock_irq(&conf->device_lock); 6579 spin_lock_irq(&conf->device_lock);
6580 write_seqcount_begin(&conf->gen_lock);
6372 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 6581 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
6582 mddev->new_chunk_sectors =
6583 conf->chunk_sectors = conf->prev_chunk_sectors;
6584 mddev->new_layout = conf->algorithm = conf->prev_algo;
6373 rdev_for_each(rdev, mddev) 6585 rdev_for_each(rdev, mddev)
6374 rdev->new_data_offset = rdev->data_offset; 6586 rdev->new_data_offset = rdev->data_offset;
6375 smp_wmb(); 6587 smp_wmb();
6588 conf->generation --;
6376 conf->reshape_progress = MaxSector; 6589 conf->reshape_progress = MaxSector;
6377 mddev->reshape_position = MaxSector; 6590 mddev->reshape_position = MaxSector;
6591 write_seqcount_end(&conf->gen_lock);
6378 spin_unlock_irq(&conf->device_lock); 6592 spin_unlock_irq(&conf->device_lock);
6379 return -EAGAIN; 6593 return -EAGAIN;
6380 } 6594 }
@@ -6462,27 +6676,28 @@ static void raid5_quiesce(struct mddev *mddev, int state)
6462 break; 6676 break;
6463 6677
6464 case 1: /* stop all writes */ 6678 case 1: /* stop all writes */
6465 spin_lock_irq(&conf->device_lock); 6679 lock_all_device_hash_locks_irq(conf);
6466 /* '2' tells resync/reshape to pause so that all 6680 /* '2' tells resync/reshape to pause so that all
6467 * active stripes can drain 6681 * active stripes can drain
6468 */ 6682 */
6469 conf->quiesce = 2; 6683 conf->quiesce = 2;
6470 wait_event_lock_irq(conf->wait_for_stripe, 6684 wait_event_cmd(conf->wait_for_stripe,
6471 atomic_read(&conf->active_stripes) == 0 && 6685 atomic_read(&conf->active_stripes) == 0 &&
6472 atomic_read(&conf->active_aligned_reads) == 0, 6686 atomic_read(&conf->active_aligned_reads) == 0,
6473 conf->device_lock); 6687 unlock_all_device_hash_locks_irq(conf),
6688 lock_all_device_hash_locks_irq(conf));
6474 conf->quiesce = 1; 6689 conf->quiesce = 1;
6475 spin_unlock_irq(&conf->device_lock); 6690 unlock_all_device_hash_locks_irq(conf);
6476 /* allow reshape to continue */ 6691 /* allow reshape to continue */
6477 wake_up(&conf->wait_for_overlap); 6692 wake_up(&conf->wait_for_overlap);
6478 break; 6693 break;
6479 6694
6480 case 0: /* re-enable writes */ 6695 case 0: /* re-enable writes */
6481 spin_lock_irq(&conf->device_lock); 6696 lock_all_device_hash_locks_irq(conf);
6482 conf->quiesce = 0; 6697 conf->quiesce = 0;
6483 wake_up(&conf->wait_for_stripe); 6698 wake_up(&conf->wait_for_stripe);
6484 wake_up(&conf->wait_for_overlap); 6699 wake_up(&conf->wait_for_overlap);
6485 spin_unlock_irq(&conf->device_lock); 6700 unlock_all_device_hash_locks_irq(conf);
6486 break; 6701 break;
6487 } 6702 }
6488} 6703}
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index b42e6b462eda..01ad8ae8f578 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -205,6 +205,7 @@ struct stripe_head {
205 short pd_idx; /* parity disk index */ 205 short pd_idx; /* parity disk index */
206 short qd_idx; /* 'Q' disk index for raid6 */ 206 short qd_idx; /* 'Q' disk index for raid6 */
207 short ddf_layout;/* use DDF ordering to calculate Q */ 207 short ddf_layout;/* use DDF ordering to calculate Q */
208 short hash_lock_index;
208 unsigned long state; /* state flags */ 209 unsigned long state; /* state flags */
209 atomic_t count; /* nr of active thread/requests */ 210 atomic_t count; /* nr of active thread/requests */
210 int bm_seq; /* sequence number for bitmap flushes */ 211 int bm_seq; /* sequence number for bitmap flushes */
@@ -367,9 +368,18 @@ struct disk_info {
367 struct md_rdev *rdev, *replacement; 368 struct md_rdev *rdev, *replacement;
368}; 369};
369 370
371/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
372 * This is because we sometimes take all the spinlocks
373 * and creating that much locking depth can cause
374 * problems.
375 */
376#define NR_STRIPE_HASH_LOCKS 8
377#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)
378
370struct r5worker { 379struct r5worker {
371 struct work_struct work; 380 struct work_struct work;
372 struct r5worker_group *group; 381 struct r5worker_group *group;
382 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
373 bool working; 383 bool working;
374}; 384};
375 385
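
One invariant the note above leaves implicit: because stripe_hash_locks_hash() masks with STRIPE_HASH_LOCKS_MASK, NR_STRIPE_HASH_LOCKS must also be a power of two, not merely below 64. A hypothetical compile-time guard for both constraints, not part of the patch itself:

    #define NR_STRIPE_HASH_LOCKS 8
    #define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)

    /* power of two, so masking with NR - 1 is a uniform hash */
    typedef char nr_hash_locks_pow2[(NR_STRIPE_HASH_LOCKS &
                                     STRIPE_HASH_LOCKS_MASK) == 0 ? 1 : -1];
    /* below 64, per the locking-depth note above */
    typedef char nr_hash_locks_depth[NR_STRIPE_HASH_LOCKS < 64 ? 1 : -1];

    int main(void) { return 0; }
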
@@ -382,6 +392,8 @@ struct r5worker_group {
382 392
383struct r5conf { 393struct r5conf {
384 struct hlist_head *stripe_hashtbl; 394 struct hlist_head *stripe_hashtbl;
395 /* only protect corresponding hash list and inactive_list */
396 spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS];
385 struct mddev *mddev; 397 struct mddev *mddev;
386 int chunk_sectors; 398 int chunk_sectors;
387 int level, algorithm; 399 int level, algorithm;
@@ -462,7 +474,8 @@ struct r5conf {
462 * Free stripes pool 474 * Free stripes pool
463 */ 475 */
464 atomic_t active_stripes; 476 atomic_t active_stripes;
465 struct list_head inactive_list; 477 struct list_head inactive_list[NR_STRIPE_HASH_LOCKS];
478 atomic_t empty_inactive_list_nr;
466 struct llist_head released_stripes; 479 struct llist_head released_stripes;
467 wait_queue_head_t wait_for_stripe; 480 wait_queue_head_t wait_for_stripe;
468 wait_queue_head_t wait_for_overlap; 481 wait_queue_head_t wait_for_overlap;
@@ -477,6 +490,7 @@ struct r5conf {
477 * the new thread here until we fully activate the array. 490 * the new thread here until we fully activate the array.
478 */ 491 */
479 struct md_thread *thread; 492 struct md_thread *thread;
493 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
480 struct r5worker_group *worker_groups; 494 struct r5worker_group *worker_groups;
481 int group_cnt; 495 int group_cnt;
482 int worker_cnt_per_group; 496 int worker_cnt_per_group;