author     Linus Torvalds <torvalds@linux-foundation.org>   2013-11-20 16:05:25 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-11-20 16:05:25 -0500
commit     6d6e352c80f22c446d933ca8103e02bac1f09129 (patch)
tree       248a6a7ebc5ea95986da5bccdd6d75b255cf28e4
parent     b4789b8e6be3151a955ade74872822f30e8cd914 (diff)
parent     60aaf933854511630e16be4efe0f96485e132de4 (diff)
Merge tag 'md/3.13' of git://neil.brown.name/md
Pull md update from Neil Brown:
 "Mostly optimisations and obscure bug fixes.
   - raid5 gets less lock contention
   - raid1 gets less contention between normal-io and resync-io during resync"

* tag 'md/3.13' of git://neil.brown.name/md:
  md/raid5: Use conf->device_lock protect changing of multi-thread resources.
  md/raid5: Before freeing old multi-thread worker, it should flush them.
  md/raid5: For stripe with R5_ReadNoMerge, we replace REQ_FLUSH with REQ_NOMERGE.
  UAPI: include <asm/byteorder.h> in linux/raid/md_p.h
  raid1: Rewrite the implementation of iobarrier.
  raid1: Add some macros to make code clearly.
  raid1: Replace raise_barrier/lower_barrier with freeze_array/unfreeze_array when reconfiguring the array.
  raid1: Add a field array_frozen to indicate whether raid in freeze state.
  md: Convert use of typedef ctl_table to struct ctl_table
  md/raid5: avoid deadlock when raid5 array has unack badblocks during md_stop_writes.
  md: use MD_RECOVERY_INTR instead of kthread_should_stop in resync thread.
  md: fix some places where mddev_lock return value is not checked.
  raid5: Retry R5_ReadNoMerge flag when hit a read error.
  raid5: relieve lock contention in get_active_stripe()
  raid5: relieve lock contention in get_active_stripe()
  wait: add wait_event_cmd()
  md/raid5.c: add proper locking to error path of raid5_start_reshape.
  md: fix calculation of stacking limits on level change.
  raid5: Use slow_path to release stripe when mddev->thread is null
-rw-r--r--  drivers/md/md.c                   133
-rw-r--r--  drivers/md/raid1.c                162
-rw-r--r--  drivers/md/raid1.h                 15
-rw-r--r--  drivers/md/raid10.c                 6
-rw-r--r--  drivers/md/raid5.c                420
-rw-r--r--  drivers/md/raid5.h                 16
-rw-r--r--  include/linux/wait.h               25
-rw-r--r--  include/uapi/linux/raid/md_p.h      1
8 files changed, 592 insertions(+), 186 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8766eabb0014..b6b7a2866c9e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -112,7 +112,7 @@ static inline int speed_max(struct mddev *mddev)
112 112
113static struct ctl_table_header *raid_table_header; 113static struct ctl_table_header *raid_table_header;
114 114
115static ctl_table raid_table[] = { 115static struct ctl_table raid_table[] = {
116 { 116 {
117 .procname = "speed_limit_min", 117 .procname = "speed_limit_min",
118 .data = &sysctl_speed_limit_min, 118 .data = &sysctl_speed_limit_min,
@@ -130,7 +130,7 @@ static ctl_table raid_table[] = {
130 { } 130 { }
131}; 131};
132 132
133static ctl_table raid_dir_table[] = { 133static struct ctl_table raid_dir_table[] = {
134 { 134 {
135 .procname = "raid", 135 .procname = "raid",
136 .maxlen = 0, 136 .maxlen = 0,
@@ -140,7 +140,7 @@ static ctl_table raid_dir_table[] = {
140 { } 140 { }
141}; 141};
142 142
143static ctl_table raid_root_table[] = { 143static struct ctl_table raid_root_table[] = {
144 { 144 {
145 .procname = "dev", 145 .procname = "dev",
146 .maxlen = 0, 146 .maxlen = 0,
@@ -562,11 +562,19 @@ static struct mddev * mddev_find(dev_t unit)
562 goto retry; 562 goto retry;
563} 563}
564 564
565static inline int mddev_lock(struct mddev * mddev) 565static inline int __must_check mddev_lock(struct mddev * mddev)
566{ 566{
567 return mutex_lock_interruptible(&mddev->reconfig_mutex); 567 return mutex_lock_interruptible(&mddev->reconfig_mutex);
568} 568}
569 569
570/* Sometimes we need to take the lock in a situation where
571 * failure due to interrupts is not acceptable.
572 */
573static inline void mddev_lock_nointr(struct mddev * mddev)
574{
575 mutex_lock(&mddev->reconfig_mutex);
576}
577
570static inline int mddev_is_locked(struct mddev *mddev) 578static inline int mddev_is_locked(struct mddev *mddev)
571{ 579{
572 return mutex_is_locked(&mddev->reconfig_mutex); 580 return mutex_is_locked(&mddev->reconfig_mutex);
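The hunk above splits the locking into an interruptible variant (mddev_lock(), now marked __must_check) and mddev_lock_nointr() for paths that cannot tolerate -EINTR. A minimal sketch of the calling convention for the interruptible variant, with a hypothetical sysfs store function as the caller:

/* Sketch only: md_attr_store_example() is a hypothetical caller. */
static ssize_t md_attr_store_example(struct mddev *mddev,
                                     const char *buf, size_t len)
{
        int err = mddev_lock(mddev);    /* may return -EINTR */

        if (err)
                return err;             /* interrupted by a signal: report it */
        /* ... reconfigure under mddev->reconfig_mutex ... */
        mddev_unlock(mddev);
        return len;
}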
@@ -2978,7 +2986,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
2978 for_each_mddev(mddev, tmp) { 2986 for_each_mddev(mddev, tmp) {
2979 struct md_rdev *rdev2; 2987 struct md_rdev *rdev2;
2980 2988
2981 mddev_lock(mddev); 2989 mddev_lock_nointr(mddev);
2982 rdev_for_each(rdev2, mddev) 2990 rdev_for_each(rdev2, mddev)
2983 if (rdev->bdev == rdev2->bdev && 2991 if (rdev->bdev == rdev2->bdev &&
2984 rdev != rdev2 && 2992 rdev != rdev2 &&
@@ -2994,7 +3002,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
2994 break; 3002 break;
2995 } 3003 }
2996 } 3004 }
2997 mddev_lock(my_mddev); 3005 mddev_lock_nointr(my_mddev);
2998 if (overlap) { 3006 if (overlap) {
2999 /* Someone else could have slipped in a size 3007 /* Someone else could have slipped in a size
3000 * change here, but doing so is just silly. 3008 * change here, but doing so is just silly.
@@ -3580,6 +3588,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3580 mddev->in_sync = 1; 3588 mddev->in_sync = 1;
3581 del_timer_sync(&mddev->safemode_timer); 3589 del_timer_sync(&mddev->safemode_timer);
3582 } 3590 }
3591 blk_set_stacking_limits(&mddev->queue->limits);
3583 pers->run(mddev); 3592 pers->run(mddev);
3584 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3593 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3585 mddev_resume(mddev); 3594 mddev_resume(mddev);
@@ -5258,7 +5267,7 @@ static void __md_stop_writes(struct mddev *mddev)
5258 5267
5259void md_stop_writes(struct mddev *mddev) 5268void md_stop_writes(struct mddev *mddev)
5260{ 5269{
5261 mddev_lock(mddev); 5270 mddev_lock_nointr(mddev);
5262 __md_stop_writes(mddev); 5271 __md_stop_writes(mddev);
5263 mddev_unlock(mddev); 5272 mddev_unlock(mddev);
5264} 5273}
@@ -5291,20 +5300,35 @@ EXPORT_SYMBOL_GPL(md_stop);
5291static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 5300static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5292{ 5301{
5293 int err = 0; 5302 int err = 0;
5303 int did_freeze = 0;
5304
5305 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5306 did_freeze = 1;
5307 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5308 md_wakeup_thread(mddev->thread);
5309 }
5310 if (mddev->sync_thread) {
5311 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5312 /* Thread might be blocked waiting for metadata update
5313 * which will now never happen */
5314 wake_up_process(mddev->sync_thread->tsk);
5315 }
5316 mddev_unlock(mddev);
5317 wait_event(resync_wait, mddev->sync_thread == NULL);
5318 mddev_lock_nointr(mddev);
5319
5294 mutex_lock(&mddev->open_mutex); 5320 mutex_lock(&mddev->open_mutex);
5295 if (atomic_read(&mddev->openers) > !!bdev) { 5321 if (atomic_read(&mddev->openers) > !!bdev ||
5322 mddev->sync_thread ||
5323 (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
5296 printk("md: %s still in use.\n",mdname(mddev)); 5324 printk("md: %s still in use.\n",mdname(mddev));
5325 if (did_freeze) {
5326 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5327 md_wakeup_thread(mddev->thread);
5328 }
5297 err = -EBUSY; 5329 err = -EBUSY;
5298 goto out; 5330 goto out;
5299 } 5331 }
5300 if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
5301 /* Someone opened the device since we flushed it
5302 * so page cache could be dirty and it is too late
5303 * to flush. So abort
5304 */
5305 mutex_unlock(&mddev->open_mutex);
5306 return -EBUSY;
5307 }
5308 if (mddev->pers) { 5332 if (mddev->pers) {
5309 __md_stop_writes(mddev); 5333 __md_stop_writes(mddev);
5310 5334
@@ -5315,7 +5339,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5315 set_disk_ro(mddev->gendisk, 1); 5339 set_disk_ro(mddev->gendisk, 1);
5316 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5340 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5317 sysfs_notify_dirent_safe(mddev->sysfs_state); 5341 sysfs_notify_dirent_safe(mddev->sysfs_state);
5318 err = 0; 5342 err = 0;
5319 } 5343 }
5320out: 5344out:
5321 mutex_unlock(&mddev->open_mutex); 5345 mutex_unlock(&mddev->open_mutex);
@@ -5331,20 +5355,34 @@ static int do_md_stop(struct mddev * mddev, int mode,
5331{ 5355{
5332 struct gendisk *disk = mddev->gendisk; 5356 struct gendisk *disk = mddev->gendisk;
5333 struct md_rdev *rdev; 5357 struct md_rdev *rdev;
5358 int did_freeze = 0;
5359
5360 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5361 did_freeze = 1;
5362 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5363 md_wakeup_thread(mddev->thread);
5364 }
5365 if (mddev->sync_thread) {
5366 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5367 /* Thread might be blocked waiting for metadata update
5368 * which will now never happen */
5369 wake_up_process(mddev->sync_thread->tsk);
5370 }
5371 mddev_unlock(mddev);
5372 wait_event(resync_wait, mddev->sync_thread == NULL);
5373 mddev_lock_nointr(mddev);
5334 5374
5335 mutex_lock(&mddev->open_mutex); 5375 mutex_lock(&mddev->open_mutex);
5336 if (atomic_read(&mddev->openers) > !!bdev || 5376 if (atomic_read(&mddev->openers) > !!bdev ||
5337 mddev->sysfs_active) { 5377 mddev->sysfs_active ||
5378 mddev->sync_thread ||
5379 (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
5338 printk("md: %s still in use.\n",mdname(mddev)); 5380 printk("md: %s still in use.\n",mdname(mddev));
5339 mutex_unlock(&mddev->open_mutex); 5381 mutex_unlock(&mddev->open_mutex);
5340 return -EBUSY; 5382 if (did_freeze) {
5341 } 5383 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5342 if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { 5384 md_wakeup_thread(mddev->thread);
5343 /* Someone opened the device since we flushed it 5385 }
5344 * so page cache could be dirty and it is too late
5345 * to flush. So abort
5346 */
5347 mutex_unlock(&mddev->open_mutex);
5348 return -EBUSY; 5386 return -EBUSY;
5349 } 5387 }
5350 if (mddev->pers) { 5388 if (mddev->pers) {
@@ -6551,7 +6589,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6551 wait_event(mddev->sb_wait, 6589 wait_event(mddev->sb_wait,
6552 !test_bit(MD_CHANGE_DEVS, &mddev->flags) && 6590 !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
6553 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 6591 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
6554 mddev_lock(mddev); 6592 mddev_lock_nointr(mddev);
6555 } 6593 }
6556 } else { 6594 } else {
6557 err = -EROFS; 6595 err = -EROFS;
@@ -7361,9 +7399,6 @@ void md_do_sync(struct md_thread *thread)
7361 mddev->curr_resync = 2; 7399 mddev->curr_resync = 2;
7362 7400
7363 try_again: 7401 try_again:
7364 if (kthread_should_stop())
7365 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7366
7367 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7402 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7368 goto skip; 7403 goto skip;
7369 for_each_mddev(mddev2, tmp) { 7404 for_each_mddev(mddev2, tmp) {
@@ -7388,7 +7423,7 @@ void md_do_sync(struct md_thread *thread)
7388 * be caught by 'softlockup' 7423 * be caught by 'softlockup'
7389 */ 7424 */
7390 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 7425 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
7391 if (!kthread_should_stop() && 7426 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7392 mddev2->curr_resync >= mddev->curr_resync) { 7427 mddev2->curr_resync >= mddev->curr_resync) {
7393 printk(KERN_INFO "md: delaying %s of %s" 7428 printk(KERN_INFO "md: delaying %s of %s"
7394 " until %s has finished (they" 7429 " until %s has finished (they"
@@ -7464,7 +7499,7 @@ void md_do_sync(struct md_thread *thread)
7464 last_check = 0; 7499 last_check = 0;
7465 7500
7466 if (j>2) { 7501 if (j>2) {
7467 printk(KERN_INFO 7502 printk(KERN_INFO
7468 "md: resuming %s of %s from checkpoint.\n", 7503 "md: resuming %s of %s from checkpoint.\n",
7469 desc, mdname(mddev)); 7504 desc, mdname(mddev));
7470 mddev->curr_resync = j; 7505 mddev->curr_resync = j;
@@ -7501,7 +7536,8 @@ void md_do_sync(struct md_thread *thread)
7501 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 7536 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7502 } 7537 }
7503 7538
7504 while (j >= mddev->resync_max && !kthread_should_stop()) { 7539 while (j >= mddev->resync_max &&
7540 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7505 /* As this condition is controlled by user-space, 7541 /* As this condition is controlled by user-space,
7506 * we can block indefinitely, so use '_interruptible' 7542 * we can block indefinitely, so use '_interruptible'
7507 * to avoid triggering warnings. 7543 * to avoid triggering warnings.
@@ -7509,17 +7545,18 @@ void md_do_sync(struct md_thread *thread)
7509 flush_signals(current); /* just in case */ 7545 flush_signals(current); /* just in case */
7510 wait_event_interruptible(mddev->recovery_wait, 7546 wait_event_interruptible(mddev->recovery_wait,
7511 mddev->resync_max > j 7547 mddev->resync_max > j
7512 || kthread_should_stop()); 7548 || test_bit(MD_RECOVERY_INTR,
7549 &mddev->recovery));
7513 } 7550 }
7514 7551
7515 if (kthread_should_stop()) 7552 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7516 goto interrupted; 7553 break;
7517 7554
7518 sectors = mddev->pers->sync_request(mddev, j, &skipped, 7555 sectors = mddev->pers->sync_request(mddev, j, &skipped,
7519 currspeed < speed_min(mddev)); 7556 currspeed < speed_min(mddev));
7520 if (sectors == 0) { 7557 if (sectors == 0) {
7521 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7558 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7522 goto out; 7559 break;
7523 } 7560 }
7524 7561
7525 if (!skipped) { /* actual IO requested */ 7562 if (!skipped) { /* actual IO requested */
@@ -7556,10 +7593,8 @@ void md_do_sync(struct md_thread *thread)
7556 last_mark = next; 7593 last_mark = next;
7557 } 7594 }
7558 7595
7559 7596 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7560 if (kthread_should_stop()) 7597 break;
7561 goto interrupted;
7562
7563 7598
7564 /* 7599 /*
7565 * this loop exits only if either when we are slower than 7600 * this loop exits only if either when we are slower than
@@ -7582,11 +7617,12 @@ void md_do_sync(struct md_thread *thread)
7582 } 7617 }
7583 } 7618 }
7584 } 7619 }
7585 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); 7620 printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
7621 test_bit(MD_RECOVERY_INTR, &mddev->recovery)
7622 ? "interrupted" : "done");
7586 /* 7623 /*
7587 * this also signals 'finished resyncing' to md_stop 7624 * this also signals 'finished resyncing' to md_stop
7588 */ 7625 */
7589 out:
7590 blk_finish_plug(&plug); 7626 blk_finish_plug(&plug);
7591 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 7627 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
7592 7628
@@ -7640,16 +7676,6 @@ void md_do_sync(struct md_thread *thread)
7640 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 7676 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
7641 md_wakeup_thread(mddev->thread); 7677 md_wakeup_thread(mddev->thread);
7642 return; 7678 return;
7643
7644 interrupted:
7645 /*
7646 * got a signal, exit.
7647 */
7648 printk(KERN_INFO
7649 "md: md_do_sync() got signal ... exiting\n");
7650 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7651 goto out;
7652
7653} 7679}
7654EXPORT_SYMBOL_GPL(md_do_sync); 7680EXPORT_SYMBOL_GPL(md_do_sync);
7655 7681
@@ -7894,6 +7920,7 @@ void md_reap_sync_thread(struct mddev *mddev)
7894 7920
7895 /* resync has finished, collect result */ 7921 /* resync has finished, collect result */
7896 md_unregister_thread(&mddev->sync_thread); 7922 md_unregister_thread(&mddev->sync_thread);
7923 wake_up(&resync_wait);
7897 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 7924 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7898 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 7925 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7899 /* success...*/ 7926 /* success...*/
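Taken together, the md.c hunks retire kthread_should_stop() as the resync-abort mechanism: the stop paths set MD_RECOVERY_INTR, wake the sync thread out of any metadata wait, drop the reconfig mutex and sleep on resync_wait until md_reap_sync_thread() (which now does wake_up(&resync_wait)) has reaped the thread. A condensed, hedged sketch of that stop-side handshake; the helper name is invented, the real code open-codes this sequence in md_set_readonly() and do_md_stop():

/* Condensed from the hunks above; illustrative, not verbatim kernel code. */
static void interrupt_and_reap_resync(struct mddev *mddev)
{
        if (mddev->sync_thread) {
                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                /* the thread might be blocked waiting for a metadata update */
                wake_up_process(mddev->sync_thread->tsk);
        }
        mddev_unlock(mddev);
        wait_event(resync_wait, mddev->sync_thread == NULL);
        mddev_lock_nointr(mddev);
}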
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index af6681b19776..1e5a540995e9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -66,7 +66,8 @@
66 */ 66 */
67static int max_queued_requests = 1024; 67static int max_queued_requests = 1024;
68 68
69static void allow_barrier(struct r1conf *conf); 69static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
70 sector_t bi_sector);
70static void lower_barrier(struct r1conf *conf); 71static void lower_barrier(struct r1conf *conf);
71 72
72static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) 73static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
@@ -84,10 +85,12 @@ static void r1bio_pool_free(void *r1_bio, void *data)
84} 85}
85 86
86#define RESYNC_BLOCK_SIZE (64*1024) 87#define RESYNC_BLOCK_SIZE (64*1024)
87//#define RESYNC_BLOCK_SIZE PAGE_SIZE 88#define RESYNC_DEPTH 32
88#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) 89#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
89#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) 90#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
90#define RESYNC_WINDOW (2048*1024) 91#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
92#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
93#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
91 94
92static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) 95static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
93{ 96{
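With the macro values above, the window arithmetic works out as: RESYNC_SECTORS = 64 KiB >> 9 = 128 sectors, RESYNC_WINDOW = 64 KiB * 32 = 2 MiB (the same value the old 2048*1024 literal encoded), RESYNC_WINDOW_SECTORS = 4096 sectors, and NEXT_NORMALIO_DISTANCE = 3 * 4096 = 12288 sectors (6 MiB). In other words, a normal write is only treated as safely ahead of resync once it starts at least 6 MiB beyond next_resync.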
@@ -225,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
225 struct bio *bio = r1_bio->master_bio; 228 struct bio *bio = r1_bio->master_bio;
226 int done; 229 int done;
227 struct r1conf *conf = r1_bio->mddev->private; 230 struct r1conf *conf = r1_bio->mddev->private;
231 sector_t start_next_window = r1_bio->start_next_window;
232 sector_t bi_sector = bio->bi_sector;
228 233
229 if (bio->bi_phys_segments) { 234 if (bio->bi_phys_segments) {
230 unsigned long flags; 235 unsigned long flags;
@@ -232,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio)
232 bio->bi_phys_segments--; 237 bio->bi_phys_segments--;
233 done = (bio->bi_phys_segments == 0); 238 done = (bio->bi_phys_segments == 0);
234 spin_unlock_irqrestore(&conf->device_lock, flags); 239 spin_unlock_irqrestore(&conf->device_lock, flags);
240 /*
241 * make_request() might be waiting for
242 * bi_phys_segments to decrease
243 */
244 wake_up(&conf->wait_barrier);
235 } else 245 } else
236 done = 1; 246 done = 1;
237 247
@@ -243,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
243 * Wake up any possible resync thread that waits for the device 253 * Wake up any possible resync thread that waits for the device
244 * to go idle. 254 * to go idle.
245 */ 255 */
246 allow_barrier(conf); 256 allow_barrier(conf, start_next_window, bi_sector);
247 } 257 }
248} 258}
249 259
@@ -814,8 +824,6 @@ static void flush_pending_writes(struct r1conf *conf)
814 * there is no normal IO happeing. It must arrange to call 824 * there is no normal IO happeing. It must arrange to call
815 * lower_barrier when the particular background IO completes. 825 * lower_barrier when the particular background IO completes.
816 */ 826 */
817#define RESYNC_DEPTH 32
818
819static void raise_barrier(struct r1conf *conf) 827static void raise_barrier(struct r1conf *conf)
820{ 828{
821 spin_lock_irq(&conf->resync_lock); 829 spin_lock_irq(&conf->resync_lock);
@@ -827,9 +835,19 @@ static void raise_barrier(struct r1conf *conf)
827 /* block any new IO from starting */ 835 /* block any new IO from starting */
828 conf->barrier++; 836 conf->barrier++;
829 837
830 /* Now wait for all pending IO to complete */ 838 /* For these conditions we must wait:
839 * A: while the array is in frozen state
840 * B: while barrier >= RESYNC_DEPTH, meaning resync reach
841 * the max count which allowed.
842 * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
843 * next resync will reach to the window which normal bios are
844 * handling.
845 */
831 wait_event_lock_irq(conf->wait_barrier, 846 wait_event_lock_irq(conf->wait_barrier,
832 !conf->nr_pending && conf->barrier < RESYNC_DEPTH, 847 !conf->array_frozen &&
848 conf->barrier < RESYNC_DEPTH &&
849 (conf->start_next_window >=
850 conf->next_resync + RESYNC_SECTORS),
833 conf->resync_lock); 851 conf->resync_lock);
834 852
835 spin_unlock_irq(&conf->resync_lock); 853 spin_unlock_irq(&conf->resync_lock);
@@ -845,10 +863,33 @@ static void lower_barrier(struct r1conf *conf)
845 wake_up(&conf->wait_barrier); 863 wake_up(&conf->wait_barrier);
846} 864}
847 865
848static void wait_barrier(struct r1conf *conf) 866static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
849{ 867{
868 bool wait = false;
869
870 if (conf->array_frozen || !bio)
871 wait = true;
872 else if (conf->barrier && bio_data_dir(bio) == WRITE) {
873 if (conf->next_resync < RESYNC_WINDOW_SECTORS)
874 wait = true;
875 else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
876 >= bio_end_sector(bio)) ||
877 (conf->next_resync + NEXT_NORMALIO_DISTANCE
878 <= bio->bi_sector))
879 wait = false;
880 else
881 wait = true;
882 }
883
884 return wait;
885}
886
887static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
888{
889 sector_t sector = 0;
890
850 spin_lock_irq(&conf->resync_lock); 891 spin_lock_irq(&conf->resync_lock);
851 if (conf->barrier) { 892 if (need_to_wait_for_sync(conf, bio)) {
852 conf->nr_waiting++; 893 conf->nr_waiting++;
853 /* Wait for the barrier to drop. 894 /* Wait for the barrier to drop.
854 * However if there are already pending 895 * However if there are already pending
@@ -860,22 +901,67 @@ static void wait_barrier(struct r1conf *conf)
860 * count down. 901 * count down.
861 */ 902 */
862 wait_event_lock_irq(conf->wait_barrier, 903 wait_event_lock_irq(conf->wait_barrier,
863 !conf->barrier || 904 !conf->array_frozen &&
864 (conf->nr_pending && 905 (!conf->barrier ||
906 ((conf->start_next_window <
907 conf->next_resync + RESYNC_SECTORS) &&
865 current->bio_list && 908 current->bio_list &&
866 !bio_list_empty(current->bio_list)), 909 !bio_list_empty(current->bio_list))),
867 conf->resync_lock); 910 conf->resync_lock);
868 conf->nr_waiting--; 911 conf->nr_waiting--;
869 } 912 }
913
914 if (bio && bio_data_dir(bio) == WRITE) {
915 if (conf->next_resync + NEXT_NORMALIO_DISTANCE
916 <= bio->bi_sector) {
917 if (conf->start_next_window == MaxSector)
918 conf->start_next_window =
919 conf->next_resync +
920 NEXT_NORMALIO_DISTANCE;
921
922 if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
923 <= bio->bi_sector)
924 conf->next_window_requests++;
925 else
926 conf->current_window_requests++;
927 }
928 if (bio->bi_sector >= conf->start_next_window)
929 sector = conf->start_next_window;
930 }
931
870 conf->nr_pending++; 932 conf->nr_pending++;
871 spin_unlock_irq(&conf->resync_lock); 933 spin_unlock_irq(&conf->resync_lock);
934 return sector;
872} 935}
873 936
874static void allow_barrier(struct r1conf *conf) 937static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
938 sector_t bi_sector)
875{ 939{
876 unsigned long flags; 940 unsigned long flags;
941
877 spin_lock_irqsave(&conf->resync_lock, flags); 942 spin_lock_irqsave(&conf->resync_lock, flags);
878 conf->nr_pending--; 943 conf->nr_pending--;
944 if (start_next_window) {
945 if (start_next_window == conf->start_next_window) {
946 if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
947 <= bi_sector)
948 conf->next_window_requests--;
949 else
950 conf->current_window_requests--;
951 } else
952 conf->current_window_requests--;
953
954 if (!conf->current_window_requests) {
955 if (conf->next_window_requests) {
956 conf->current_window_requests =
957 conf->next_window_requests;
958 conf->next_window_requests = 0;
959 conf->start_next_window +=
960 NEXT_NORMALIO_DISTANCE;
961 } else
962 conf->start_next_window = MaxSector;
963 }
964 }
879 spin_unlock_irqrestore(&conf->resync_lock, flags); 965 spin_unlock_irqrestore(&conf->resync_lock, flags);
880 wake_up(&conf->wait_barrier); 966 wake_up(&conf->wait_barrier);
881} 967}
@@ -884,8 +970,7 @@ static void freeze_array(struct r1conf *conf, int extra)
884{ 970{
885 /* stop syncio and normal IO and wait for everything to 971 /* stop syncio and normal IO and wait for everything to
886 * go quite. 972 * go quite.
887 * We increment barrier and nr_waiting, and then 973 * We wait until nr_pending match nr_queued+extra
888 * wait until nr_pending match nr_queued+extra
889 * This is called in the context of one normal IO request 974 * This is called in the context of one normal IO request
890 * that has failed. Thus any sync request that might be pending 975 * that has failed. Thus any sync request that might be pending
891 * will be blocked by nr_pending, and we need to wait for 976 * will be blocked by nr_pending, and we need to wait for
@@ -895,8 +980,7 @@ static void freeze_array(struct r1conf *conf, int extra)
895 * we continue. 980 * we continue.
896 */ 981 */
897 spin_lock_irq(&conf->resync_lock); 982 spin_lock_irq(&conf->resync_lock);
898 conf->barrier++; 983 conf->array_frozen = 1;
899 conf->nr_waiting++;
900 wait_event_lock_irq_cmd(conf->wait_barrier, 984 wait_event_lock_irq_cmd(conf->wait_barrier,
901 conf->nr_pending == conf->nr_queued+extra, 985 conf->nr_pending == conf->nr_queued+extra,
902 conf->resync_lock, 986 conf->resync_lock,
@@ -907,8 +991,7 @@ static void unfreeze_array(struct r1conf *conf)
907{ 991{
908 /* reverse the effect of the freeze */ 992 /* reverse the effect of the freeze */
909 spin_lock_irq(&conf->resync_lock); 993 spin_lock_irq(&conf->resync_lock);
910 conf->barrier--; 994 conf->array_frozen = 0;
911 conf->nr_waiting--;
912 wake_up(&conf->wait_barrier); 995 wake_up(&conf->wait_barrier);
913 spin_unlock_irq(&conf->resync_lock); 996 spin_unlock_irq(&conf->resync_lock);
914} 997}
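With freezing no longer piggy-backing on barrier/nr_waiting, every "quiesce for reconfiguration" site now pairs freeze_array() with unfreeze_array(), as the stop() and raid1_quiesce() hunks further down show. A minimal sketch of the pattern; the surrounding function is hypothetical, the real callers are stop(), raid1_quiesce() and the error handlers:

/* Illustrative only. */
static void reconfigure_r1conf(struct r1conf *conf)
{
        freeze_array(conf, 0);  /* waits until nr_pending == nr_queued */
        /* ... array is quiesced: safe to tear down or swap resources ... */
        unfreeze_array(conf);   /* clears array_frozen, wakes wait_barrier() sleepers */
}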
@@ -1013,6 +1096,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1013 int first_clone; 1096 int first_clone;
1014 int sectors_handled; 1097 int sectors_handled;
1015 int max_sectors; 1098 int max_sectors;
1099 sector_t start_next_window;
1016 1100
1017 /* 1101 /*
1018 * Register the new request and wait if the reconstruction 1102 * Register the new request and wait if the reconstruction
@@ -1042,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1042 finish_wait(&conf->wait_barrier, &w); 1126 finish_wait(&conf->wait_barrier, &w);
1043 } 1127 }
1044 1128
1045 wait_barrier(conf); 1129 start_next_window = wait_barrier(conf, bio);
1046 1130
1047 bitmap = mddev->bitmap; 1131 bitmap = mddev->bitmap;
1048 1132
@@ -1163,6 +1247,7 @@ read_again:
1163 1247
1164 disks = conf->raid_disks * 2; 1248 disks = conf->raid_disks * 2;
1165 retry_write: 1249 retry_write:
1250 r1_bio->start_next_window = start_next_window;
1166 blocked_rdev = NULL; 1251 blocked_rdev = NULL;
1167 rcu_read_lock(); 1252 rcu_read_lock();
1168 max_sectors = r1_bio->sectors; 1253 max_sectors = r1_bio->sectors;
@@ -1231,14 +1316,24 @@ read_again:
1231 if (unlikely(blocked_rdev)) { 1316 if (unlikely(blocked_rdev)) {
1232 /* Wait for this device to become unblocked */ 1317 /* Wait for this device to become unblocked */
1233 int j; 1318 int j;
1319 sector_t old = start_next_window;
1234 1320
1235 for (j = 0; j < i; j++) 1321 for (j = 0; j < i; j++)
1236 if (r1_bio->bios[j]) 1322 if (r1_bio->bios[j])
1237 rdev_dec_pending(conf->mirrors[j].rdev, mddev); 1323 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
1238 r1_bio->state = 0; 1324 r1_bio->state = 0;
1239 allow_barrier(conf); 1325 allow_barrier(conf, start_next_window, bio->bi_sector);
1240 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1326 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1241 wait_barrier(conf); 1327 start_next_window = wait_barrier(conf, bio);
1328 /*
1329 * We must make sure the multi r1bios of bio have
1330 * the same value of bi_phys_segments
1331 */
1332 if (bio->bi_phys_segments && old &&
1333 old != start_next_window)
1334 /* Wait for the former r1bio(s) to complete */
1335 wait_event(conf->wait_barrier,
1336 bio->bi_phys_segments == 1);
1242 goto retry_write; 1337 goto retry_write;
1243 } 1338 }
1244 1339
@@ -1438,11 +1533,14 @@ static void print_conf(struct r1conf *conf)
1438 1533
1439static void close_sync(struct r1conf *conf) 1534static void close_sync(struct r1conf *conf)
1440{ 1535{
1441 wait_barrier(conf); 1536 wait_barrier(conf, NULL);
1442 allow_barrier(conf); 1537 allow_barrier(conf, 0, 0);
1443 1538
1444 mempool_destroy(conf->r1buf_pool); 1539 mempool_destroy(conf->r1buf_pool);
1445 conf->r1buf_pool = NULL; 1540 conf->r1buf_pool = NULL;
1541
1542 conf->next_resync = 0;
1543 conf->start_next_window = MaxSector;
1446} 1544}
1447 1545
1448static int raid1_spare_active(struct mddev *mddev) 1546static int raid1_spare_active(struct mddev *mddev)
@@ -2714,6 +2812,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2714 conf->pending_count = 0; 2812 conf->pending_count = 0;
2715 conf->recovery_disabled = mddev->recovery_disabled - 1; 2813 conf->recovery_disabled = mddev->recovery_disabled - 1;
2716 2814
2815 conf->start_next_window = MaxSector;
2816 conf->current_window_requests = conf->next_window_requests = 0;
2817
2717 err = -EIO; 2818 err = -EIO;
2718 for (i = 0; i < conf->raid_disks * 2; i++) { 2819 for (i = 0; i < conf->raid_disks * 2; i++) {
2719 2820
@@ -2871,8 +2972,8 @@ static int stop(struct mddev *mddev)
2871 atomic_read(&bitmap->behind_writes) == 0); 2972 atomic_read(&bitmap->behind_writes) == 0);
2872 } 2973 }
2873 2974
2874 raise_barrier(conf); 2975 freeze_array(conf, 0);
2875 lower_barrier(conf); 2976 unfreeze_array(conf);
2876 2977
2877 md_unregister_thread(&mddev->thread); 2978 md_unregister_thread(&mddev->thread);
2878 if (conf->r1bio_pool) 2979 if (conf->r1bio_pool)
@@ -3031,10 +3132,10 @@ static void raid1_quiesce(struct mddev *mddev, int state)
3031 wake_up(&conf->wait_barrier); 3132 wake_up(&conf->wait_barrier);
3032 break; 3133 break;
3033 case 1: 3134 case 1:
3034 raise_barrier(conf); 3135 freeze_array(conf, 0);
3035 break; 3136 break;
3036 case 0: 3137 case 0:
3037 lower_barrier(conf); 3138 unfreeze_array(conf);
3038 break; 3139 break;
3039 } 3140 }
3040} 3141}
@@ -3051,7 +3152,8 @@ static void *raid1_takeover(struct mddev *mddev)
3051 mddev->new_chunk_sectors = 0; 3152 mddev->new_chunk_sectors = 0;
3052 conf = setup_conf(mddev); 3153 conf = setup_conf(mddev);
3053 if (!IS_ERR(conf)) 3154 if (!IS_ERR(conf))
3054 conf->barrier = 1; 3155 /* Array must appear to be quiesced */
3156 conf->array_frozen = 1;
3055 return conf; 3157 return conf;
3056 } 3158 }
3057 return ERR_PTR(-EINVAL); 3159 return ERR_PTR(-EINVAL);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 0ff3715fb7eb..9bebca7bff2f 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -41,6 +41,19 @@ struct r1conf {
41 */ 41 */
42 sector_t next_resync; 42 sector_t next_resync;
43 43
44 /* When raid1 starts resync, we divide array into four partitions
45 * |---------|--------------|---------------------|-------------|
46 * next_resync start_next_window end_window
47 * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE
48 * end_window = start_next_window + NEXT_NORMALIO_DISTANCE
49 * current_window_requests means the count of normalIO between
50 * start_next_window and end_window.
51 * next_window_requests means the count of normalIO after end_window.
52 * */
53 sector_t start_next_window;
54 int current_window_requests;
55 int next_window_requests;
56
44 spinlock_t device_lock; 57 spinlock_t device_lock;
45 58
46 /* list of 'struct r1bio' that need to be processed by raid1d, 59 /* list of 'struct r1bio' that need to be processed by raid1d,
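To put numbers on the comment above (using NEXT_NORMALIO_DISTANCE = 12288 sectors, as computed earlier): if next_resync = 100000, then start_next_window = 112288 and end_window = 124576. A write starting at sector 118000 falls between start_next_window and end_window, so it proceeds and bumps current_window_requests; one starting at 130000 falls beyond end_window and bumps next_window_requests; one starting at 105000 is inside the region resync is about to cover and has to sleep in wait_barrier() until the barrier drops.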
@@ -65,6 +78,7 @@ struct r1conf {
65 int nr_waiting; 78 int nr_waiting;
66 int nr_queued; 79 int nr_queued;
67 int barrier; 80 int barrier;
81 int array_frozen;
68 82
69 /* Set to 1 if a full sync is needed, (fresh device added). 83 /* Set to 1 if a full sync is needed, (fresh device added).
70 * Cleared when a sync completes. 84 * Cleared when a sync completes.
@@ -111,6 +125,7 @@ struct r1bio {
111 * in this BehindIO request 125 * in this BehindIO request
112 */ 126 */
113 sector_t sector; 127 sector_t sector;
128 sector_t start_next_window;
114 int sectors; 129 int sectors;
115 unsigned long state; 130 unsigned long state;
116 struct mddev *mddev; 131 struct mddev *mddev;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7c3508abb5e1..c504e8389e69 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4384,7 +4384,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4384 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4384 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4385 md_wakeup_thread(mddev->thread); 4385 md_wakeup_thread(mddev->thread);
4386 wait_event(mddev->sb_wait, mddev->flags == 0 || 4386 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4387 kthread_should_stop()); 4387 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4388 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4389 allow_barrier(conf);
4390 return sectors_done;
4391 }
4388 conf->reshape_safe = mddev->reshape_position; 4392 conf->reshape_safe = mddev->reshape_position;
4389 allow_barrier(conf); 4393 allow_barrier(conf);
4390 } 4394 }
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7f0e17a27aeb..47da0af6322b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
85 return &conf->stripe_hashtbl[hash]; 85 return &conf->stripe_hashtbl[hash];
86} 86}
87 87
88static inline int stripe_hash_locks_hash(sector_t sect)
89{
90 return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
91}
92
93static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
94{
95 spin_lock_irq(conf->hash_locks + hash);
96 spin_lock(&conf->device_lock);
97}
98
99static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
100{
101 spin_unlock(&conf->device_lock);
102 spin_unlock_irq(conf->hash_locks + hash);
103}
104
105static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
106{
107 int i;
108 local_irq_disable();
109 spin_lock(conf->hash_locks);
110 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
111 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
112 spin_lock(&conf->device_lock);
113}
114
115static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
116{
117 int i;
118 spin_unlock(&conf->device_lock);
119 for (i = NR_STRIPE_HASH_LOCKS; i; i--)
120 spin_unlock(conf->hash_locks + i - 1);
121 local_irq_enable();
122}
123
88/* bio's attached to a stripe+device for I/O are linked together in bi_sector 124/* bio's attached to a stripe+device for I/O are linked together in bi_sector
89 * order without overlap. There may be several bio's per stripe+device, and 125 * order without overlap. There may be several bio's per stripe+device, and
90 * a bio could span several devices. 126 * a bio could span several devices.
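The helpers above shard the stripe cache: stripe_hash_locks_hash() maps a stripe's sector to one of NR_STRIPE_HASH_LOCKS spinlocks, each guarding its own inactive list. Assuming 4 KiB stripes (STRIPE_SHIFT = 3) and eight hash locks (the values used elsewhere in this series, though neither constant appears in this excerpt), sectors 0-7 map to hash 0, sectors 8-15 to hash 1, and sectors 64-71 wrap back to hash 0, so consecutive stripes land on different locks. When both locks are needed, device_lock nests inside the hash lock (lock_device_hash_lock() takes them in that order), which lets many get/release operations avoid device_lock entirely.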
@@ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
249 } 285 }
250} 286}
251 287
252static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) 288static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
289 struct list_head *temp_inactive_list)
253{ 290{
254 BUG_ON(!list_empty(&sh->lru)); 291 BUG_ON(!list_empty(&sh->lru));
255 BUG_ON(atomic_read(&conf->active_stripes)==0); 292 BUG_ON(atomic_read(&conf->active_stripes)==0);
@@ -278,23 +315,68 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
278 < IO_THRESHOLD) 315 < IO_THRESHOLD)
279 md_wakeup_thread(conf->mddev->thread); 316 md_wakeup_thread(conf->mddev->thread);
280 atomic_dec(&conf->active_stripes); 317 atomic_dec(&conf->active_stripes);
281 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 318 if (!test_bit(STRIPE_EXPANDING, &sh->state))
282 list_add_tail(&sh->lru, &conf->inactive_list); 319 list_add_tail(&sh->lru, temp_inactive_list);
283 wake_up(&conf->wait_for_stripe);
284 if (conf->retry_read_aligned)
285 md_wakeup_thread(conf->mddev->thread);
286 }
287 } 320 }
288} 321}
289 322
290static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 323static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
324 struct list_head *temp_inactive_list)
291{ 325{
292 if (atomic_dec_and_test(&sh->count)) 326 if (atomic_dec_and_test(&sh->count))
293 do_release_stripe(conf, sh); 327 do_release_stripe(conf, sh, temp_inactive_list);
328}
329
330/*
331 * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
332 *
333 * Be careful: Only one task can add/delete stripes from temp_inactive_list at
334 * given time. Adding stripes only takes device lock, while deleting stripes
335 * only takes hash lock.
336 */
337static void release_inactive_stripe_list(struct r5conf *conf,
338 struct list_head *temp_inactive_list,
339 int hash)
340{
341 int size;
342 bool do_wakeup = false;
343 unsigned long flags;
344
345 if (hash == NR_STRIPE_HASH_LOCKS) {
346 size = NR_STRIPE_HASH_LOCKS;
347 hash = NR_STRIPE_HASH_LOCKS - 1;
348 } else
349 size = 1;
350 while (size) {
351 struct list_head *list = &temp_inactive_list[size - 1];
352
353 /*
354 * We don't hold any lock here yet, get_active_stripe() might
355 * remove stripes from the list
356 */
357 if (!list_empty_careful(list)) {
358 spin_lock_irqsave(conf->hash_locks + hash, flags);
359 if (list_empty(conf->inactive_list + hash) &&
360 !list_empty(list))
361 atomic_dec(&conf->empty_inactive_list_nr);
362 list_splice_tail_init(list, conf->inactive_list + hash);
363 do_wakeup = true;
364 spin_unlock_irqrestore(conf->hash_locks + hash, flags);
365 }
366 size--;
367 hash--;
368 }
369
370 if (do_wakeup) {
371 wake_up(&conf->wait_for_stripe);
372 if (conf->retry_read_aligned)
373 md_wakeup_thread(conf->mddev->thread);
374 }
294} 375}
295 376
296/* should hold conf->device_lock already */ 377/* should hold conf->device_lock already */
297static int release_stripe_list(struct r5conf *conf) 378static int release_stripe_list(struct r5conf *conf,
379 struct list_head *temp_inactive_list)
298{ 380{
299 struct stripe_head *sh; 381 struct stripe_head *sh;
300 int count = 0; 382 int count = 0;
@@ -303,6 +385,8 @@ static int release_stripe_list(struct r5conf *conf)
303 head = llist_del_all(&conf->released_stripes); 385 head = llist_del_all(&conf->released_stripes);
304 head = llist_reverse_order(head); 386 head = llist_reverse_order(head);
305 while (head) { 387 while (head) {
388 int hash;
389
306 sh = llist_entry(head, struct stripe_head, release_list); 390 sh = llist_entry(head, struct stripe_head, release_list);
307 head = llist_next(head); 391 head = llist_next(head);
308 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ 392 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
@@ -313,7 +397,8 @@ static int release_stripe_list(struct r5conf *conf)
313 * again, the count is always > 1. This is true for 397 * again, the count is always > 1. This is true for
314 * STRIPE_ON_UNPLUG_LIST bit too. 398 * STRIPE_ON_UNPLUG_LIST bit too.
315 */ 399 */
316 __release_stripe(conf, sh); 400 hash = sh->hash_lock_index;
401 __release_stripe(conf, sh, &temp_inactive_list[hash]);
317 count++; 402 count++;
318 } 403 }
319 404
@@ -324,9 +409,12 @@ static void release_stripe(struct stripe_head *sh)
324{ 409{
325 struct r5conf *conf = sh->raid_conf; 410 struct r5conf *conf = sh->raid_conf;
326 unsigned long flags; 411 unsigned long flags;
412 struct list_head list;
413 int hash;
327 bool wakeup; 414 bool wakeup;
328 415
329 if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) 416 if (unlikely(!conf->mddev->thread) ||
417 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
330 goto slow_path; 418 goto slow_path;
331 wakeup = llist_add(&sh->release_list, &conf->released_stripes); 419 wakeup = llist_add(&sh->release_list, &conf->released_stripes);
332 if (wakeup) 420 if (wakeup)
@@ -336,8 +424,11 @@ slow_path:
336 local_irq_save(flags); 424 local_irq_save(flags);
337 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 425 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
338 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 426 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
339 do_release_stripe(conf, sh); 427 INIT_LIST_HEAD(&list);
428 hash = sh->hash_lock_index;
429 do_release_stripe(conf, sh, &list);
340 spin_unlock(&conf->device_lock); 430 spin_unlock(&conf->device_lock);
431 release_inactive_stripe_list(conf, &list, hash);
341 } 432 }
342 local_irq_restore(flags); 433 local_irq_restore(flags);
343} 434}
@@ -362,18 +453,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
362 453
363 454
364/* find an idle stripe, make sure it is unhashed, and return it. */ 455/* find an idle stripe, make sure it is unhashed, and return it. */
365static struct stripe_head *get_free_stripe(struct r5conf *conf) 456static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
366{ 457{
367 struct stripe_head *sh = NULL; 458 struct stripe_head *sh = NULL;
368 struct list_head *first; 459 struct list_head *first;
369 460
370 if (list_empty(&conf->inactive_list)) 461 if (list_empty(conf->inactive_list + hash))
371 goto out; 462 goto out;
372 first = conf->inactive_list.next; 463 first = (conf->inactive_list + hash)->next;
373 sh = list_entry(first, struct stripe_head, lru); 464 sh = list_entry(first, struct stripe_head, lru);
374 list_del_init(first); 465 list_del_init(first);
375 remove_hash(sh); 466 remove_hash(sh);
376 atomic_inc(&conf->active_stripes); 467 atomic_inc(&conf->active_stripes);
468 BUG_ON(hash != sh->hash_lock_index);
469 if (list_empty(conf->inactive_list + hash))
470 atomic_inc(&conf->empty_inactive_list_nr);
377out: 471out:
378 return sh; 472 return sh;
379} 473}
@@ -416,7 +510,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
416static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 510static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
417{ 511{
418 struct r5conf *conf = sh->raid_conf; 512 struct r5conf *conf = sh->raid_conf;
419 int i; 513 int i, seq;
420 514
421 BUG_ON(atomic_read(&sh->count) != 0); 515 BUG_ON(atomic_read(&sh->count) != 0);
422 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 516 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
@@ -426,7 +520,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
426 (unsigned long long)sh->sector); 520 (unsigned long long)sh->sector);
427 521
428 remove_hash(sh); 522 remove_hash(sh);
429 523retry:
524 seq = read_seqcount_begin(&conf->gen_lock);
430 sh->generation = conf->generation - previous; 525 sh->generation = conf->generation - previous;
431 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 526 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
432 sh->sector = sector; 527 sh->sector = sector;
@@ -448,6 +543,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
448 dev->flags = 0; 543 dev->flags = 0;
449 raid5_build_block(sh, i, previous); 544 raid5_build_block(sh, i, previous);
450 } 545 }
546 if (read_seqcount_retry(&conf->gen_lock, seq))
547 goto retry;
451 insert_hash(conf, sh); 548 insert_hash(conf, sh);
452 sh->cpu = smp_processor_id(); 549 sh->cpu = smp_processor_id();
453} 550}
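init_stripe() now guards its setup with conf->gen_lock: if a reshape bumps the generation while the stripe is being initialised, read_seqcount_retry() fails and the whole block is redone. The writer side is not visible in this excerpt; a generic, hedged sketch of the seqcount pairing follows, with illustrative names that are not taken from the md code:

#include <linux/seqlock.h>

static seqcount_t cfg_seq;      /* seqcount_init(&cfg_seq) at setup, not shown */
static int cfg_value;

static void cfg_write(int v)    /* writers must already be serialised by a lock */
{
        write_seqcount_begin(&cfg_seq);
        cfg_value = v;
        write_seqcount_end(&cfg_seq);
}

static int cfg_read(void)
{
        unsigned seq;
        int v;

        do {
                seq = read_seqcount_begin(&cfg_seq);
                v = cfg_value;
        } while (read_seqcount_retry(&cfg_seq, seq));   /* raced with a writer: retry */
        return v;
}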
@@ -552,29 +649,31 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
552 int previous, int noblock, int noquiesce) 649 int previous, int noblock, int noquiesce)
553{ 650{
554 struct stripe_head *sh; 651 struct stripe_head *sh;
652 int hash = stripe_hash_locks_hash(sector);
555 653
556 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 654 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
557 655
558 spin_lock_irq(&conf->device_lock); 656 spin_lock_irq(conf->hash_locks + hash);
559 657
560 do { 658 do {
561 wait_event_lock_irq(conf->wait_for_stripe, 659 wait_event_lock_irq(conf->wait_for_stripe,
562 conf->quiesce == 0 || noquiesce, 660 conf->quiesce == 0 || noquiesce,
563 conf->device_lock); 661 *(conf->hash_locks + hash));
564 sh = __find_stripe(conf, sector, conf->generation - previous); 662 sh = __find_stripe(conf, sector, conf->generation - previous);
565 if (!sh) { 663 if (!sh) {
566 if (!conf->inactive_blocked) 664 if (!conf->inactive_blocked)
567 sh = get_free_stripe(conf); 665 sh = get_free_stripe(conf, hash);
568 if (noblock && sh == NULL) 666 if (noblock && sh == NULL)
569 break; 667 break;
570 if (!sh) { 668 if (!sh) {
571 conf->inactive_blocked = 1; 669 conf->inactive_blocked = 1;
572 wait_event_lock_irq(conf->wait_for_stripe, 670 wait_event_lock_irq(
573 !list_empty(&conf->inactive_list) && 671 conf->wait_for_stripe,
574 (atomic_read(&conf->active_stripes) 672 !list_empty(conf->inactive_list + hash) &&
575 < (conf->max_nr_stripes *3/4) 673 (atomic_read(&conf->active_stripes)
576 || !conf->inactive_blocked), 674 < (conf->max_nr_stripes * 3 / 4)
577 conf->device_lock); 675 || !conf->inactive_blocked),
676 *(conf->hash_locks + hash));
578 conf->inactive_blocked = 0; 677 conf->inactive_blocked = 0;
579 } else 678 } else
580 init_stripe(sh, sector, previous); 679 init_stripe(sh, sector, previous);
@@ -585,9 +684,11 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
585 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) 684 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
586 && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); 685 && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
587 } else { 686 } else {
687 spin_lock(&conf->device_lock);
588 if (!test_bit(STRIPE_HANDLE, &sh->state)) 688 if (!test_bit(STRIPE_HANDLE, &sh->state))
589 atomic_inc(&conf->active_stripes); 689 atomic_inc(&conf->active_stripes);
590 if (list_empty(&sh->lru) && 690 if (list_empty(&sh->lru) &&
691 !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state) &&
591 !test_bit(STRIPE_EXPANDING, &sh->state)) 692 !test_bit(STRIPE_EXPANDING, &sh->state))
592 BUG(); 693 BUG();
593 list_del_init(&sh->lru); 694 list_del_init(&sh->lru);
@@ -595,6 +696,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
595 sh->group->stripes_cnt--; 696 sh->group->stripes_cnt--;
596 sh->group = NULL; 697 sh->group = NULL;
597 } 698 }
699 spin_unlock(&conf->device_lock);
598 } 700 }
599 } 701 }
600 } while (sh == NULL); 702 } while (sh == NULL);
@@ -602,7 +704,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
602 if (sh) 704 if (sh)
603 atomic_inc(&sh->count); 705 atomic_inc(&sh->count);
604 706
605 spin_unlock_irq(&conf->device_lock); 707 spin_unlock_irq(conf->hash_locks + hash);
606 return sh; 708 return sh;
607} 709}
608 710
@@ -758,7 +860,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
758 bi->bi_sector = (sh->sector 860 bi->bi_sector = (sh->sector
759 + rdev->data_offset); 861 + rdev->data_offset);
760 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 862 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
761 bi->bi_rw |= REQ_FLUSH; 863 bi->bi_rw |= REQ_NOMERGE;
762 864
763 bi->bi_vcnt = 1; 865 bi->bi_vcnt = 1;
764 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 866 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -1582,7 +1684,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1582 put_cpu(); 1684 put_cpu();
1583} 1685}
1584 1686
1585static int grow_one_stripe(struct r5conf *conf) 1687static int grow_one_stripe(struct r5conf *conf, int hash)
1586{ 1688{
1587 struct stripe_head *sh; 1689 struct stripe_head *sh;
1588 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); 1690 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
@@ -1598,6 +1700,7 @@ static int grow_one_stripe(struct r5conf *conf)
1598 kmem_cache_free(conf->slab_cache, sh); 1700 kmem_cache_free(conf->slab_cache, sh);
1599 return 0; 1701 return 0;
1600 } 1702 }
1703 sh->hash_lock_index = hash;
1601 /* we just created an active stripe so... */ 1704 /* we just created an active stripe so... */
1602 atomic_set(&sh->count, 1); 1705 atomic_set(&sh->count, 1);
1603 atomic_inc(&conf->active_stripes); 1706 atomic_inc(&conf->active_stripes);
@@ -1610,6 +1713,7 @@ static int grow_stripes(struct r5conf *conf, int num)
1610{ 1713{
1611 struct kmem_cache *sc; 1714 struct kmem_cache *sc;
1612 int devs = max(conf->raid_disks, conf->previous_raid_disks); 1715 int devs = max(conf->raid_disks, conf->previous_raid_disks);
1716 int hash;
1613 1717
1614 if (conf->mddev->gendisk) 1718 if (conf->mddev->gendisk)
1615 sprintf(conf->cache_name[0], 1719 sprintf(conf->cache_name[0],
@@ -1627,9 +1731,13 @@ static int grow_stripes(struct r5conf *conf, int num)
1627 return 1; 1731 return 1;
1628 conf->slab_cache = sc; 1732 conf->slab_cache = sc;
1629 conf->pool_size = devs; 1733 conf->pool_size = devs;
1630 while (num--) 1734 hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
1631 if (!grow_one_stripe(conf)) 1735 while (num--) {
1736 if (!grow_one_stripe(conf, hash))
1632 return 1; 1737 return 1;
1738 conf->max_nr_stripes++;
1739 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
1740 }
1633 return 0; 1741 return 0;
1634} 1742}
1635 1743
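The round-robin above spreads the stripe cache evenly over the hash lists: starting from an empty cache and assuming eight hash locks, growing 256 stripes leaves 32 on each per-hash inactive list. The resize_stripes() hunk below preserves that split; its quota per hash is max_nr_stripes / NR_STRIPE_HASH_LOCKS, plus one for the first (max_nr_stripes % NR_STRIPE_HASH_LOCKS) lists when the count does not divide evenly.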
@@ -1687,6 +1795,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1687 int err; 1795 int err;
1688 struct kmem_cache *sc; 1796 struct kmem_cache *sc;
1689 int i; 1797 int i;
1798 int hash, cnt;
1690 1799
1691 if (newsize <= conf->pool_size) 1800 if (newsize <= conf->pool_size)
1692 return 0; /* never bother to shrink */ 1801 return 0; /* never bother to shrink */
@@ -1726,19 +1835,29 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1726 * OK, we have enough stripes, start collecting inactive 1835 * OK, we have enough stripes, start collecting inactive
1727 * stripes and copying them over 1836 * stripes and copying them over
1728 */ 1837 */
1838 hash = 0;
1839 cnt = 0;
1729 list_for_each_entry(nsh, &newstripes, lru) { 1840 list_for_each_entry(nsh, &newstripes, lru) {
1730 spin_lock_irq(&conf->device_lock); 1841 lock_device_hash_lock(conf, hash);
1731 wait_event_lock_irq(conf->wait_for_stripe, 1842 wait_event_cmd(conf->wait_for_stripe,
1732 !list_empty(&conf->inactive_list), 1843 !list_empty(conf->inactive_list + hash),
1733 conf->device_lock); 1844 unlock_device_hash_lock(conf, hash),
1734 osh = get_free_stripe(conf); 1845 lock_device_hash_lock(conf, hash));
1735 spin_unlock_irq(&conf->device_lock); 1846 osh = get_free_stripe(conf, hash);
1847 unlock_device_hash_lock(conf, hash);
1736 atomic_set(&nsh->count, 1); 1848 atomic_set(&nsh->count, 1);
1737 for(i=0; i<conf->pool_size; i++) 1849 for(i=0; i<conf->pool_size; i++)
1738 nsh->dev[i].page = osh->dev[i].page; 1850 nsh->dev[i].page = osh->dev[i].page;
1739 for( ; i<newsize; i++) 1851 for( ; i<newsize; i++)
1740 nsh->dev[i].page = NULL; 1852 nsh->dev[i].page = NULL;
1853 nsh->hash_lock_index = hash;
1741 kmem_cache_free(conf->slab_cache, osh); 1854 kmem_cache_free(conf->slab_cache, osh);
1855 cnt++;
1856 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
1857 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
1858 hash++;
1859 cnt = 0;
1860 }
1742 } 1861 }
1743 kmem_cache_destroy(conf->slab_cache); 1862 kmem_cache_destroy(conf->slab_cache);
1744 1863
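resize_stripes() goes hand in hand with the new wait_event_cmd() added in this pull ("wait: add wait_event_cmd()" in the list above; the wait.h hunk itself is not part of this excerpt): with two nested locks in play (hash lock plus device_lock), the old wait_event_lock_irq() idiom no longer fits, so the caller passes explicit unlock/relock commands. An annotated restatement of the call used in the hunk above:

lock_device_hash_lock(conf, hash);               /* hash lock, then device_lock */
wait_event_cmd(conf->wait_for_stripe,
               !list_empty(conf->inactive_list + hash),
               unlock_device_hash_lock(conf, hash),     /* cmd1: runs before each sleep */
               lock_device_hash_lock(conf, hash));      /* cmd2: runs after each wake-up */
/* both locks are held again here, and whenever the condition is evaluated */
osh = get_free_stripe(conf, hash);
unlock_device_hash_lock(conf, hash);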
@@ -1797,13 +1916,13 @@ static int resize_stripes(struct r5conf *conf, int newsize)
1797 return err; 1916 return err;
1798} 1917}
1799 1918
1800static int drop_one_stripe(struct r5conf *conf) 1919static int drop_one_stripe(struct r5conf *conf, int hash)
1801{ 1920{
1802 struct stripe_head *sh; 1921 struct stripe_head *sh;
1803 1922
1804 spin_lock_irq(&conf->device_lock); 1923 spin_lock_irq(conf->hash_locks + hash);
1805 sh = get_free_stripe(conf); 1924 sh = get_free_stripe(conf, hash);
1806 spin_unlock_irq(&conf->device_lock); 1925 spin_unlock_irq(conf->hash_locks + hash);
1807 if (!sh) 1926 if (!sh)
1808 return 0; 1927 return 0;
1809 BUG_ON(atomic_read(&sh->count)); 1928 BUG_ON(atomic_read(&sh->count));
@@ -1815,8 +1934,10 @@ static int drop_one_stripe(struct r5conf *conf)
1815 1934
1816static void shrink_stripes(struct r5conf *conf) 1935static void shrink_stripes(struct r5conf *conf)
1817{ 1936{
1818 while (drop_one_stripe(conf)) 1937 int hash;
1819 ; 1938 for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
1939 while (drop_one_stripe(conf, hash))
1940 ;
1820 1941
1821 if (conf->slab_cache) 1942 if (conf->slab_cache)
1822 kmem_cache_destroy(conf->slab_cache); 1943 kmem_cache_destroy(conf->slab_cache);
@@ -1921,6 +2042,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
1921 mdname(conf->mddev), bdn); 2042 mdname(conf->mddev), bdn);
1922 else 2043 else
1923 retry = 1; 2044 retry = 1;
2045 if (set_bad && test_bit(In_sync, &rdev->flags)
2046 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2047 retry = 1;
1924 if (retry) 2048 if (retry)
1925 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2049 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
1926 set_bit(R5_ReadError, &sh->dev[i].flags); 2050 set_bit(R5_ReadError, &sh->dev[i].flags);
@@ -3900,7 +4024,8 @@ static void raid5_activate_delayed(struct r5conf *conf)
3900 } 4024 }
3901} 4025}
3902 4026
3903static void activate_bit_delay(struct r5conf *conf) 4027static void activate_bit_delay(struct r5conf *conf,
4028 struct list_head *temp_inactive_list)
3904{ 4029{
3905 /* device_lock is held */ 4030 /* device_lock is held */
3906 struct list_head head; 4031 struct list_head head;
@@ -3908,9 +4033,11 @@ static void activate_bit_delay(struct r5conf *conf)
3908 list_del_init(&conf->bitmap_list); 4033 list_del_init(&conf->bitmap_list);
3909 while (!list_empty(&head)) { 4034 while (!list_empty(&head)) {
3910 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 4035 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
4036 int hash;
3911 list_del_init(&sh->lru); 4037 list_del_init(&sh->lru);
3912 atomic_inc(&sh->count); 4038 atomic_inc(&sh->count);
3913 __release_stripe(conf, sh); 4039 hash = sh->hash_lock_index;
4040 __release_stripe(conf, sh, &temp_inactive_list[hash]);
3914 } 4041 }
3915} 4042}
3916 4043
@@ -3926,7 +4053,7 @@ int md_raid5_congested(struct mddev *mddev, int bits)
3926 return 1; 4053 return 1;
3927 if (conf->quiesce) 4054 if (conf->quiesce)
3928 return 1; 4055 return 1;
3929 if (list_empty_careful(&conf->inactive_list)) 4056 if (atomic_read(&conf->empty_inactive_list_nr))
3930 return 1; 4057 return 1;
3931 4058
3932 return 0; 4059 return 0;
@@ -4256,6 +4383,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
4256struct raid5_plug_cb { 4383struct raid5_plug_cb {
4257 struct blk_plug_cb cb; 4384 struct blk_plug_cb cb;
4258 struct list_head list; 4385 struct list_head list;
4386 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
4259}; 4387};
4260 4388
4261static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 4389static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
@@ -4266,6 +4394,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4266 struct mddev *mddev = cb->cb.data; 4394 struct mddev *mddev = cb->cb.data;
4267 struct r5conf *conf = mddev->private; 4395 struct r5conf *conf = mddev->private;
4268 int cnt = 0; 4396 int cnt = 0;
4397 int hash;
4269 4398
4270 if (cb->list.next && !list_empty(&cb->list)) { 4399 if (cb->list.next && !list_empty(&cb->list)) {
4271 spin_lock_irq(&conf->device_lock); 4400 spin_lock_irq(&conf->device_lock);
@@ -4283,11 +4412,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4283 * STRIPE_ON_RELEASE_LIST could be set here. In that 4412 * STRIPE_ON_RELEASE_LIST could be set here. In that
4284 * case, the count is always > 1 here 4413 * case, the count is always > 1 here
4285 */ 4414 */
4286 __release_stripe(conf, sh); 4415 hash = sh->hash_lock_index;
4416 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
4287 cnt++; 4417 cnt++;
4288 } 4418 }
4289 spin_unlock_irq(&conf->device_lock); 4419 spin_unlock_irq(&conf->device_lock);
4290 } 4420 }
4421 release_inactive_stripe_list(conf, cb->temp_inactive_list,
4422 NR_STRIPE_HASH_LOCKS);
4291 if (mddev->queue) 4423 if (mddev->queue)
4292 trace_block_unplug(mddev->queue, cnt, !from_schedule); 4424 trace_block_unplug(mddev->queue, cnt, !from_schedule);
4293 kfree(cb); 4425 kfree(cb);
@@ -4308,8 +4440,12 @@ static void release_stripe_plug(struct mddev *mddev,
4308 4440
4309 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 4441 cb = container_of(blk_cb, struct raid5_plug_cb, cb);
4310 4442
4311 if (cb->list.next == NULL) 4443 if (cb->list.next == NULL) {
4444 int i;
4312 INIT_LIST_HEAD(&cb->list); 4445 INIT_LIST_HEAD(&cb->list);
4446 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
4447 INIT_LIST_HEAD(cb->temp_inactive_list + i);
4448 }
4313 4449
4314 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 4450 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
4315 list_add_tail(&sh->lru, &cb->list); 4451 list_add_tail(&sh->lru, &cb->list);
@@ -4692,14 +4828,19 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4692 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4828 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4693 /* Cannot proceed until we've updated the superblock... */ 4829 /* Cannot proceed until we've updated the superblock... */
4694 wait_event(conf->wait_for_overlap, 4830 wait_event(conf->wait_for_overlap,
4695 atomic_read(&conf->reshape_stripes)==0); 4831 atomic_read(&conf->reshape_stripes)==0
4832 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4833 if (atomic_read(&conf->reshape_stripes) != 0)
4834 return 0;
4696 mddev->reshape_position = conf->reshape_progress; 4835 mddev->reshape_position = conf->reshape_progress;
4697 mddev->curr_resync_completed = sector_nr; 4836 mddev->curr_resync_completed = sector_nr;
4698 conf->reshape_checkpoint = jiffies; 4837 conf->reshape_checkpoint = jiffies;
4699 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4838 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4700 md_wakeup_thread(mddev->thread); 4839 md_wakeup_thread(mddev->thread);
4701 wait_event(mddev->sb_wait, mddev->flags == 0 || 4840 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4702 kthread_should_stop()); 4841 test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4842 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4843 return 0;
4703 spin_lock_irq(&conf->device_lock); 4844 spin_lock_irq(&conf->device_lock);
4704 conf->reshape_safe = mddev->reshape_position; 4845 conf->reshape_safe = mddev->reshape_position;
4705 spin_unlock_irq(&conf->device_lock); 4846 spin_unlock_irq(&conf->device_lock);
@@ -4782,7 +4923,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4782 >= mddev->resync_max - mddev->curr_resync_completed) { 4923 >= mddev->resync_max - mddev->curr_resync_completed) {
4783 /* Cannot proceed until we've updated the superblock... */ 4924 /* Cannot proceed until we've updated the superblock... */
4784 wait_event(conf->wait_for_overlap, 4925 wait_event(conf->wait_for_overlap,
4785 atomic_read(&conf->reshape_stripes) == 0); 4926 atomic_read(&conf->reshape_stripes) == 0
4927 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4928 if (atomic_read(&conf->reshape_stripes) != 0)
4929 goto ret;
4786 mddev->reshape_position = conf->reshape_progress; 4930 mddev->reshape_position = conf->reshape_progress;
4787 mddev->curr_resync_completed = sector_nr; 4931 mddev->curr_resync_completed = sector_nr;
4788 conf->reshape_checkpoint = jiffies; 4932 conf->reshape_checkpoint = jiffies;
@@ -4790,13 +4934,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
4790 md_wakeup_thread(mddev->thread); 4934 md_wakeup_thread(mddev->thread);
4791 wait_event(mddev->sb_wait, 4935 wait_event(mddev->sb_wait,
4792 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4936 !test_bit(MD_CHANGE_DEVS, &mddev->flags)
4793 || kthread_should_stop()); 4937 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4938 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4939 goto ret;
4794 spin_lock_irq(&conf->device_lock); 4940 spin_lock_irq(&conf->device_lock);
4795 conf->reshape_safe = mddev->reshape_position; 4941 conf->reshape_safe = mddev->reshape_position;
4796 spin_unlock_irq(&conf->device_lock); 4942 spin_unlock_irq(&conf->device_lock);
4797 wake_up(&conf->wait_for_overlap); 4943 wake_up(&conf->wait_for_overlap);
4798 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4944 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4799 } 4945 }
4946ret:
4800 return reshape_sectors; 4947 return reshape_sectors;
4801} 4948}
4802 4949
@@ -4954,27 +5101,45 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4954} 5101}
4955 5102
4956static int handle_active_stripes(struct r5conf *conf, int group, 5103static int handle_active_stripes(struct r5conf *conf, int group,
4957 struct r5worker *worker) 5104 struct r5worker *worker,
5105 struct list_head *temp_inactive_list)
4958{ 5106{
4959 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 5107 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
4960 int i, batch_size = 0; 5108 int i, batch_size = 0, hash;
5109 bool release_inactive = false;
4961 5110
4962 while (batch_size < MAX_STRIPE_BATCH && 5111 while (batch_size < MAX_STRIPE_BATCH &&
4963 (sh = __get_priority_stripe(conf, group)) != NULL) 5112 (sh = __get_priority_stripe(conf, group)) != NULL)
4964 batch[batch_size++] = sh; 5113 batch[batch_size++] = sh;
4965 5114
4966 if (batch_size == 0) 5115 if (batch_size == 0) {
4967 return batch_size; 5116 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5117 if (!list_empty(temp_inactive_list + i))
5118 break;
5119 if (i == NR_STRIPE_HASH_LOCKS)
5120 return batch_size;
5121 release_inactive = true;
5122 }
4968 spin_unlock_irq(&conf->device_lock); 5123 spin_unlock_irq(&conf->device_lock);
4969 5124
5125 release_inactive_stripe_list(conf, temp_inactive_list,
5126 NR_STRIPE_HASH_LOCKS);
5127
5128 if (release_inactive) {
5129 spin_lock_irq(&conf->device_lock);
5130 return 0;
5131 }
5132
4970 for (i = 0; i < batch_size; i++) 5133 for (i = 0; i < batch_size; i++)
4971 handle_stripe(batch[i]); 5134 handle_stripe(batch[i]);
4972 5135
4973 cond_resched(); 5136 cond_resched();
4974 5137
4975 spin_lock_irq(&conf->device_lock); 5138 spin_lock_irq(&conf->device_lock);
4976 for (i = 0; i < batch_size; i++) 5139 for (i = 0; i < batch_size; i++) {
4977 __release_stripe(conf, batch[i]); 5140 hash = batch[i]->hash_lock_index;
5141 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
5142 }
4978 return batch_size; 5143 return batch_size;
4979} 5144}
4980 5145
@@ -4995,9 +5160,10 @@ static void raid5_do_work(struct work_struct *work)
4995 while (1) { 5160 while (1) {
4996 int batch_size, released; 5161 int batch_size, released;
4997 5162
4998 released = release_stripe_list(conf); 5163 released = release_stripe_list(conf, worker->temp_inactive_list);
4999 5164
5000 batch_size = handle_active_stripes(conf, group_id, worker); 5165 batch_size = handle_active_stripes(conf, group_id, worker,
5166 worker->temp_inactive_list);
5001 worker->working = false; 5167 worker->working = false;
5002 if (!batch_size && !released) 5168 if (!batch_size && !released)
5003 break; 5169 break;
@@ -5036,7 +5202,7 @@ static void raid5d(struct md_thread *thread)
5036 struct bio *bio; 5202 struct bio *bio;
5037 int batch_size, released; 5203 int batch_size, released;
5038 5204
5039 released = release_stripe_list(conf); 5205 released = release_stripe_list(conf, conf->temp_inactive_list);
5040 5206
5041 if ( 5207 if (
5042 !list_empty(&conf->bitmap_list)) { 5208 !list_empty(&conf->bitmap_list)) {
@@ -5046,7 +5212,7 @@ static void raid5d(struct md_thread *thread)
5046 bitmap_unplug(mddev->bitmap); 5212 bitmap_unplug(mddev->bitmap);
5047 spin_lock_irq(&conf->device_lock); 5213 spin_lock_irq(&conf->device_lock);
5048 conf->seq_write = conf->seq_flush; 5214 conf->seq_write = conf->seq_flush;
5049 activate_bit_delay(conf); 5215 activate_bit_delay(conf, conf->temp_inactive_list);
5050 } 5216 }
5051 raid5_activate_delayed(conf); 5217 raid5_activate_delayed(conf);
5052 5218
@@ -5060,7 +5226,8 @@ static void raid5d(struct md_thread *thread)
5060 handled++; 5226 handled++;
5061 } 5227 }
5062 5228
5063 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); 5229 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
5230 conf->temp_inactive_list);
5064 if (!batch_size && !released) 5231 if (!batch_size && !released)
5065 break; 5232 break;
5066 handled += batch_size; 5233 handled += batch_size;
@@ -5096,22 +5263,29 @@ raid5_set_cache_size(struct mddev *mddev, int size)
5096{ 5263{
5097 struct r5conf *conf = mddev->private; 5264 struct r5conf *conf = mddev->private;
5098 int err; 5265 int err;
5266 int hash;
5099 5267
5100 if (size <= 16 || size > 32768) 5268 if (size <= 16 || size > 32768)
5101 return -EINVAL; 5269 return -EINVAL;
5270 hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
5102 while (size < conf->max_nr_stripes) { 5271 while (size < conf->max_nr_stripes) {
5103 if (drop_one_stripe(conf)) 5272 if (drop_one_stripe(conf, hash))
5104 conf->max_nr_stripes--; 5273 conf->max_nr_stripes--;
5105 else 5274 else
5106 break; 5275 break;
5276 hash--;
5277 if (hash < 0)
5278 hash = NR_STRIPE_HASH_LOCKS - 1;
5107 } 5279 }
5108 err = md_allow_write(mddev); 5280 err = md_allow_write(mddev);
5109 if (err) 5281 if (err)
5110 return err; 5282 return err;
5283 hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
5111 while (size > conf->max_nr_stripes) { 5284 while (size > conf->max_nr_stripes) {
5112 if (grow_one_stripe(conf)) 5285 if (grow_one_stripe(conf, hash))
5113 conf->max_nr_stripes++; 5286 conf->max_nr_stripes++;
5114 else break; 5287 else break;
5288 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
5115 } 5289 }
5116 return 0; 5290 return 0;
5117} 5291}
@@ -5199,15 +5373,18 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
5199 return 0; 5373 return 0;
5200} 5374}
5201 5375
5202static int alloc_thread_groups(struct r5conf *conf, int cnt); 5376static int alloc_thread_groups(struct r5conf *conf, int cnt,
5377 int *group_cnt,
5378 int *worker_cnt_per_group,
5379 struct r5worker_group **worker_groups);
5203static ssize_t 5380static ssize_t
5204raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 5381raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
5205{ 5382{
5206 struct r5conf *conf = mddev->private; 5383 struct r5conf *conf = mddev->private;
5207 unsigned long new; 5384 unsigned long new;
5208 int err; 5385 int err;
5209 struct r5worker_group *old_groups; 5386 struct r5worker_group *new_groups, *old_groups;
5210 int old_group_cnt; 5387 int group_cnt, worker_cnt_per_group;
5211 5388
5212 if (len >= PAGE_SIZE) 5389 if (len >= PAGE_SIZE)
5213 return -EINVAL; 5390 return -EINVAL;
@@ -5223,14 +5400,19 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
5223 mddev_suspend(mddev); 5400 mddev_suspend(mddev);
5224 5401
5225 old_groups = conf->worker_groups; 5402 old_groups = conf->worker_groups;
5226 old_group_cnt = conf->worker_cnt_per_group; 5403 if (old_groups)
5404 flush_workqueue(raid5_wq);
5405
5406 err = alloc_thread_groups(conf, new,
5407 &group_cnt, &worker_cnt_per_group,
5408 &new_groups);
5409 if (!err) {
5410 spin_lock_irq(&conf->device_lock);
5411 conf->group_cnt = group_cnt;
5412 conf->worker_cnt_per_group = worker_cnt_per_group;
5413 conf->worker_groups = new_groups;
5414 spin_unlock_irq(&conf->device_lock);
5227 5415
5228 conf->worker_groups = NULL;
5229 err = alloc_thread_groups(conf, new);
5230 if (err) {
5231 conf->worker_groups = old_groups;
5232 conf->worker_cnt_per_group = old_group_cnt;
5233 } else {
5234 if (old_groups) 5416 if (old_groups)
5235 kfree(old_groups[0].workers); 5417 kfree(old_groups[0].workers);
5236 kfree(old_groups); 5418 kfree(old_groups);
@@ -5260,40 +5442,47 @@ static struct attribute_group raid5_attrs_group = {
5260 .attrs = raid5_attrs, 5442 .attrs = raid5_attrs,
5261}; 5443};
5262 5444
5263static int alloc_thread_groups(struct r5conf *conf, int cnt) 5445static int alloc_thread_groups(struct r5conf *conf, int cnt,
5446 int *group_cnt,
5447 int *worker_cnt_per_group,
5448 struct r5worker_group **worker_groups)
5264{ 5449{
5265 int i, j; 5450 int i, j, k;
5266 ssize_t size; 5451 ssize_t size;
5267 struct r5worker *workers; 5452 struct r5worker *workers;
5268 5453
5269 conf->worker_cnt_per_group = cnt; 5454 *worker_cnt_per_group = cnt;
5270 if (cnt == 0) { 5455 if (cnt == 0) {
5271 conf->worker_groups = NULL; 5456 *group_cnt = 0;
5457 *worker_groups = NULL;
5272 return 0; 5458 return 0;
5273 } 5459 }
5274 conf->group_cnt = num_possible_nodes(); 5460 *group_cnt = num_possible_nodes();
5275 size = sizeof(struct r5worker) * cnt; 5461 size = sizeof(struct r5worker) * cnt;
5276 workers = kzalloc(size * conf->group_cnt, GFP_NOIO); 5462 workers = kzalloc(size * *group_cnt, GFP_NOIO);
5277 conf->worker_groups = kzalloc(sizeof(struct r5worker_group) * 5463 *worker_groups = kzalloc(sizeof(struct r5worker_group) *
5278 conf->group_cnt, GFP_NOIO); 5464 *group_cnt, GFP_NOIO);
5279 if (!conf->worker_groups || !workers) { 5465 if (!*worker_groups || !workers) {
5280 kfree(workers); 5466 kfree(workers);
5281 kfree(conf->worker_groups); 5467 kfree(*worker_groups);
5282 conf->worker_groups = NULL;
5283 return -ENOMEM; 5468 return -ENOMEM;
5284 } 5469 }
5285 5470
5286 for (i = 0; i < conf->group_cnt; i++) { 5471 for (i = 0; i < *group_cnt; i++) {
5287 struct r5worker_group *group; 5472 struct r5worker_group *group;
5288 5473
5289 group = &conf->worker_groups[i]; 5474 group = &(*worker_groups)[i];
5290 INIT_LIST_HEAD(&group->handle_list); 5475 INIT_LIST_HEAD(&group->handle_list);
5291 group->conf = conf; 5476 group->conf = conf;
5292 group->workers = workers + i * cnt; 5477 group->workers = workers + i * cnt;
5293 5478
5294 for (j = 0; j < cnt; j++) { 5479 for (j = 0; j < cnt; j++) {
5295 group->workers[j].group = group; 5480 struct r5worker *worker = group->workers + j;
5296 INIT_WORK(&group->workers[j].work, raid5_do_work); 5481 worker->group = group;
5482 INIT_WORK(&worker->work, raid5_do_work);
5483
5484 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
5485 INIT_LIST_HEAD(worker->temp_inactive_list + k);
5297 } 5486 }
5298 } 5487 }
5299 5488
@@ -5444,6 +5633,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5444 struct md_rdev *rdev; 5633 struct md_rdev *rdev;
5445 struct disk_info *disk; 5634 struct disk_info *disk;
5446 char pers_name[6]; 5635 char pers_name[6];
5636 int i;
5637 int group_cnt, worker_cnt_per_group;
5638 struct r5worker_group *new_group;
5447 5639
5448 if (mddev->new_level != 5 5640 if (mddev->new_level != 5
5449 && mddev->new_level != 4 5641 && mddev->new_level != 4
@@ -5478,7 +5670,12 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5478 if (conf == NULL) 5670 if (conf == NULL)
5479 goto abort; 5671 goto abort;
5480 /* Don't enable multi-threading by default*/ 5672 /* Don't enable multi-threading by default*/
5481 if (alloc_thread_groups(conf, 0)) 5673 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
5674 &new_group)) {
5675 conf->group_cnt = group_cnt;
5676 conf->worker_cnt_per_group = worker_cnt_per_group;
5677 conf->worker_groups = new_group;
5678 } else
5482 goto abort; 5679 goto abort;
5483 spin_lock_init(&conf->device_lock); 5680 spin_lock_init(&conf->device_lock);
5484 seqcount_init(&conf->gen_lock); 5681 seqcount_init(&conf->gen_lock);
@@ -5488,7 +5685,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5488 INIT_LIST_HEAD(&conf->hold_list); 5685 INIT_LIST_HEAD(&conf->hold_list);
5489 INIT_LIST_HEAD(&conf->delayed_list); 5686 INIT_LIST_HEAD(&conf->delayed_list);
5490 INIT_LIST_HEAD(&conf->bitmap_list); 5687 INIT_LIST_HEAD(&conf->bitmap_list);
5491 INIT_LIST_HEAD(&conf->inactive_list);
5492 init_llist_head(&conf->released_stripes); 5688 init_llist_head(&conf->released_stripes);
5493 atomic_set(&conf->active_stripes, 0); 5689 atomic_set(&conf->active_stripes, 0);
5494 atomic_set(&conf->preread_active_stripes, 0); 5690 atomic_set(&conf->preread_active_stripes, 0);
@@ -5514,6 +5710,21 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5514 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 5710 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
5515 goto abort; 5711 goto abort;
5516 5712
5713 /* We init hash_locks[0] separately so that it can be used
5714 * as the reference lock in the spin_lock_nest_lock() call
5715 * in lock_all_device_hash_locks_irq in order to convince
5716 * lockdep that we know what we are doing.
5717 */
5718 spin_lock_init(conf->hash_locks);
5719 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
5720 spin_lock_init(conf->hash_locks + i);
5721
5722 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5723 INIT_LIST_HEAD(conf->inactive_list + i);
5724
5725 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5726 INIT_LIST_HEAD(conf->temp_inactive_list + i);
5727
5517 conf->level = mddev->new_level; 5728 conf->level = mddev->new_level;
5518 if (raid5_alloc_percpu(conf) != 0) 5729 if (raid5_alloc_percpu(conf) != 0)
5519 goto abort; 5730 goto abort;
@@ -5554,7 +5765,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5554 else 5765 else
5555 conf->max_degraded = 1; 5766 conf->max_degraded = 1;
5556 conf->algorithm = mddev->new_layout; 5767 conf->algorithm = mddev->new_layout;
5557 conf->max_nr_stripes = NR_STRIPES;
5558 conf->reshape_progress = mddev->reshape_position; 5768 conf->reshape_progress = mddev->reshape_position;
5559 if (conf->reshape_progress != MaxSector) { 5769 if (conf->reshape_progress != MaxSector) {
5560 conf->prev_chunk_sectors = mddev->chunk_sectors; 5770 conf->prev_chunk_sectors = mddev->chunk_sectors;
@@ -5563,7 +5773,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5563 5773
5564 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 5774 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
5565 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 5775 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
5566 if (grow_stripes(conf, conf->max_nr_stripes)) { 5776 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
5777 if (grow_stripes(conf, NR_STRIPES)) {
5567 printk(KERN_ERR 5778 printk(KERN_ERR
5568 "md/raid:%s: couldn't allocate %dkB for buffers\n", 5779 "md/raid:%s: couldn't allocate %dkB for buffers\n",
5569 mdname(mddev), memory); 5780 mdname(mddev), memory);
@@ -6369,12 +6580,18 @@ static int raid5_start_reshape(struct mddev *mddev)
6369 if (!mddev->sync_thread) { 6580 if (!mddev->sync_thread) {
6370 mddev->recovery = 0; 6581 mddev->recovery = 0;
6371 spin_lock_irq(&conf->device_lock); 6582 spin_lock_irq(&conf->device_lock);
6583 write_seqcount_begin(&conf->gen_lock);
6372 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 6584 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
6585 mddev->new_chunk_sectors =
6586 conf->chunk_sectors = conf->prev_chunk_sectors;
6587 mddev->new_layout = conf->algorithm = conf->prev_algo;
6373 rdev_for_each(rdev, mddev) 6588 rdev_for_each(rdev, mddev)
6374 rdev->new_data_offset = rdev->data_offset; 6589 rdev->new_data_offset = rdev->data_offset;
6375 smp_wmb(); 6590 smp_wmb();
6591 conf->generation--;
6376 conf->reshape_progress = MaxSector; 6592 conf->reshape_progress = MaxSector;
6377 mddev->reshape_position = MaxSector; 6593 mddev->reshape_position = MaxSector;
6594 write_seqcount_end(&conf->gen_lock);
6378 spin_unlock_irq(&conf->device_lock); 6595 spin_unlock_irq(&conf->device_lock);
6379 return -EAGAIN; 6596 return -EAGAIN;
6380 } 6597 }
@@ -6462,27 +6679,28 @@ static void raid5_quiesce(struct mddev *mddev, int state)
6462 break; 6679 break;
6463 6680
6464 case 1: /* stop all writes */ 6681 case 1: /* stop all writes */
6465 spin_lock_irq(&conf->device_lock); 6682 lock_all_device_hash_locks_irq(conf);
6466 /* '2' tells resync/reshape to pause so that all 6683 /* '2' tells resync/reshape to pause so that all
6467 * active stripes can drain 6684 * active stripes can drain
6468 */ 6685 */
6469 conf->quiesce = 2; 6686 conf->quiesce = 2;
6470 wait_event_lock_irq(conf->wait_for_stripe, 6687 wait_event_cmd(conf->wait_for_stripe,
6471 atomic_read(&conf->active_stripes) == 0 && 6688 atomic_read(&conf->active_stripes) == 0 &&
6472 atomic_read(&conf->active_aligned_reads) == 0, 6689 atomic_read(&conf->active_aligned_reads) == 0,
6473 conf->device_lock); 6690 unlock_all_device_hash_locks_irq(conf),
6691 lock_all_device_hash_locks_irq(conf));
6474 conf->quiesce = 1; 6692 conf->quiesce = 1;
6475 spin_unlock_irq(&conf->device_lock); 6693 unlock_all_device_hash_locks_irq(conf);
6476 /* allow reshape to continue */ 6694 /* allow reshape to continue */
6477 wake_up(&conf->wait_for_overlap); 6695 wake_up(&conf->wait_for_overlap);
6478 break; 6696 break;
6479 6697
6480 case 0: /* re-enable writes */ 6698 case 0: /* re-enable writes */
6481 spin_lock_irq(&conf->device_lock); 6699 lock_all_device_hash_locks_irq(conf);
6482 conf->quiesce = 0; 6700 conf->quiesce = 0;
6483 wake_up(&conf->wait_for_stripe); 6701 wake_up(&conf->wait_for_stripe);
6484 wake_up(&conf->wait_for_overlap); 6702 wake_up(&conf->wait_for_overlap);
6485 spin_unlock_irq(&conf->device_lock); 6703 unlock_all_device_hash_locks_irq(conf);
6486 break; 6704 break;
6487 } 6705 }
6488} 6706}
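
The raid5.c hunks above replace the single device_lock/inactive_list pair with NR_STRIPE_HASH_LOCKS per-bucket locks and per-bucket inactive lists, and quiesce paths now take every bucket lock at once. Below is a minimal sketch of that take-them-all pattern, assuming the nest-lock scheme described in the setup_conf() comment; the example_* names are illustrative, not the helpers used in raid5.c.

/* Hedged approximation of the bucketed-lock pattern: hash_locks[0] acts
 * as the reference lock and the rest are nested under it, so lockdep
 * sees one locking class rather than eight independent ones.
 */
#include <linux/spinlock.h>

#define EXAMPLE_NR_HASH_LOCKS 8

struct example_conf {
	spinlock_t hash_locks[EXAMPLE_NR_HASH_LOCKS];
};

static void example_lock_all_hash_locks_irq(struct example_conf *conf)
{
	int i;

	local_irq_disable();
	spin_lock(conf->hash_locks);			/* reference lock */
	for (i = 1; i < EXAMPLE_NR_HASH_LOCKS; i++)
		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
}

static void example_unlock_all_hash_locks_irq(struct example_conf *conf)
{
	int i;

	for (i = EXAMPLE_NR_HASH_LOCKS - 1; i >= 0; i--)
		spin_unlock(conf->hash_locks + i);
	local_irq_enable();
}

Single-stripe fast paths only take the one bucket lock selected by the stripe's hash_lock_index, which is what reduces contention on device_lock.
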
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index b42e6b462eda..01ad8ae8f578 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -205,6 +205,7 @@ struct stripe_head {
205 short pd_idx; /* parity disk index */ 205 short pd_idx; /* parity disk index */
206 short qd_idx; /* 'Q' disk index for raid6 */ 206 short qd_idx; /* 'Q' disk index for raid6 */
207 short ddf_layout;/* use DDF ordering to calculate Q */ 207 short ddf_layout;/* use DDF ordering to calculate Q */
208 short hash_lock_index;
208 unsigned long state; /* state flags */ 209 unsigned long state; /* state flags */
209 atomic_t count; /* nr of active thread/requests */ 210 atomic_t count; /* nr of active thread/requests */
210 int bm_seq; /* sequence number for bitmap flushes */ 211 int bm_seq; /* sequence number for bitmap flushes */
@@ -367,9 +368,18 @@ struct disk_info {
367 struct md_rdev *rdev, *replacement; 368 struct md_rdev *rdev, *replacement;
368}; 369};
369 370
371/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
372 * This is because we sometimes take all the spinlocks
373 * and creating that much locking depth can cause
374 * problems.
375 */
376#define NR_STRIPE_HASH_LOCKS 8
377#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)
378
370struct r5worker { 379struct r5worker {
371 struct work_struct work; 380 struct work_struct work;
372 struct r5worker_group *group; 381 struct r5worker_group *group;
382 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
373 bool working; 383 bool working;
374}; 384};
375 385
@@ -382,6 +392,8 @@ struct r5worker_group {
382 392
383struct r5conf { 393struct r5conf {
384 struct hlist_head *stripe_hashtbl; 394 struct hlist_head *stripe_hashtbl;
395 /* only protect corresponding hash list and inactive_list */
396 spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS];
385 struct mddev *mddev; 397 struct mddev *mddev;
386 int chunk_sectors; 398 int chunk_sectors;
387 int level, algorithm; 399 int level, algorithm;
@@ -462,7 +474,8 @@ struct r5conf {
462 * Free stripes pool 474 * Free stripes pool
463 */ 475 */
464 atomic_t active_stripes; 476 atomic_t active_stripes;
465 struct list_head inactive_list; 477 struct list_head inactive_list[NR_STRIPE_HASH_LOCKS];
478 atomic_t empty_inactive_list_nr;
466 struct llist_head released_stripes; 479 struct llist_head released_stripes;
467 wait_queue_head_t wait_for_stripe; 480 wait_queue_head_t wait_for_stripe;
468 wait_queue_head_t wait_for_overlap; 481 wait_queue_head_t wait_for_overlap;
@@ -477,6 +490,7 @@ struct r5conf {
477 * the new thread here until we fully activate the array. 490 * the new thread here until we fully activate the array.
478 */ 491 */
479 struct md_thread *thread; 492 struct md_thread *thread;
493 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
480 struct r5worker_group *worker_groups; 494 struct r5worker_group *worker_groups;
481 int group_cnt; 495 int group_cnt;
482 int worker_cnt_per_group; 496 int worker_cnt_per_group;
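
The new hash_lock_index field and the STRIPE_HASH_LOCKS_MASK define imply a power-of-two fold from a stripe's sector-based hash down to one of the eight lock buckets. A minimal sketch of that mapping follows; the shift and the example_ name are assumptions for illustration, not the exact helper in raid5.c.

#include <linux/types.h>

#define EXAMPLE_NR_HASH_LOCKS 8
#define EXAMPLE_HASH_LOCKS_MASK (EXAMPLE_NR_HASH_LOCKS - 1)

/* Fold a stripe's starting sector into a lock bucket.  Masking works as
 * a modulus only because the bucket count here is a power of two.
 */
static inline int example_stripe_hash_locks_hash(sector_t sect)
{
	return (int)(sect >> 3) & EXAMPLE_HASH_LOCKS_MASK;	/* shift is illustrative */
}

The resulting bucket number is what stripe_head->hash_lock_index records, so the release paths above know which hash_locks[], inactive_list[], and temp_inactive_list[] entry a stripe belongs to.
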
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 61939ba30aa0..eaa00b10abaa 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -278,6 +278,31 @@ do { \
278 __ret; \ 278 __ret; \
279}) 279})
280 280
281#define __wait_event_cmd(wq, condition, cmd1, cmd2) \
282 (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
283 cmd1; schedule(); cmd2)
284
285/**
286 * wait_event_cmd - sleep until a condition gets true
287 * @wq: the waitqueue to wait on
288 * @condition: a C expression for the event to wait for
289 * @cmd1: the command to be executed before sleep
290 * @cmd2: the command to be executed after sleep
291 *
292 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
293 * @condition evaluates to true. The @condition is checked each time
294 * the waitqueue @wq is woken up.
295 *
296 * wake_up() has to be called after changing any variable that could
297 * change the result of the wait condition.
298 */
299#define wait_event_cmd(wq, condition, cmd1, cmd2) \
300do { \
301 if (condition) \
302 break; \
303 __wait_event_cmd(wq, condition, cmd1, cmd2); \
304} while (0)
305
281#define __wait_event_interruptible(wq, condition) \ 306#define __wait_event_interruptible(wq, condition) \
282 ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ 307 ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
283 schedule()) 308 schedule())
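
raid5_quiesce() above is the motivating user of the new wait_event_cmd(): cmd1 runs just before each schedule() and cmd2 just after, letting the caller drop and retake a set of locks around the sleep instead of relying on wait_event_lock_irq() with a single lock. A stand-alone sketch of that shape, with illustrative example_ names:

#include <linux/wait.h>
#include <linux/spinlock.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static DEFINE_SPINLOCK(example_lock);
static int example_pending;		/* protected by example_lock */

/* Sleep until example_pending drains to zero, releasing example_lock
 * around the sleep the same way raid5_quiesce() releases the device
 * hash locks around its wait.
 */
static void example_drain(void)
{
	spin_lock_irq(&example_lock);
	wait_event_cmd(example_wq, example_pending == 0,
		       spin_unlock_irq(&example_lock),	/* cmd1: before sleep */
		       spin_lock_irq(&example_lock));	/* cmd2: after sleep */
	spin_unlock_irq(&example_lock);
}

Whoever decrements example_pending must call wake_up(&example_wq) after the change, as the kernel-doc above requires.
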
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index fe1a5406d4d9..f7cf7f351144 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -16,6 +16,7 @@
16#define _MD_P_H 16#define _MD_P_H
17 17
18#include <linux/types.h> 18#include <linux/types.h>
19#include <asm/byteorder.h>
19 20
20/* 21/*
21 * RAID superblock. 22 * RAID superblock.