Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--   drivers/md/raid1.c | 596
1 file changed, 366 insertions(+), 230 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 830ff2b20346..7453d94eeed7 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
| @@ -71,9 +71,8 @@ | |||
| 71 | */ | 71 | */ |
| 72 | static int max_queued_requests = 1024; | 72 | static int max_queued_requests = 1024; |
| 73 | 73 | ||
| 74 | static void allow_barrier(struct r1conf *conf, sector_t start_next_window, | 74 | static void allow_barrier(struct r1conf *conf, sector_t sector_nr); |
| 75 | sector_t bi_sector); | 75 | static void lower_barrier(struct r1conf *conf, sector_t sector_nr); |
| 76 | static void lower_barrier(struct r1conf *conf); | ||
| 77 | 76 | ||
| 78 | #define raid1_log(md, fmt, args...) \ | 77 | #define raid1_log(md, fmt, args...) \ |
| 79 | do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) | 78 | do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) |
| @@ -100,7 +99,6 @@ static void r1bio_pool_free(void *r1_bio, void *data) | |||
| 100 | #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) | 99 | #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) |
| 101 | #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) | 100 | #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) |
| 102 | #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) | 101 | #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) |
| 103 | #define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) | ||
| 104 | 102 | ||
| 105 | static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) | 103 | static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) |
| 106 | { | 104 | { |
| @@ -205,6 +203,7 @@ static void free_r1bio(struct r1bio *r1_bio) | |||
| 205 | static void put_buf(struct r1bio *r1_bio) | 203 | static void put_buf(struct r1bio *r1_bio) |
| 206 | { | 204 | { |
| 207 | struct r1conf *conf = r1_bio->mddev->private; | 205 | struct r1conf *conf = r1_bio->mddev->private; |
| 206 | sector_t sect = r1_bio->sector; | ||
| 208 | int i; | 207 | int i; |
| 209 | 208 | ||
| 210 | for (i = 0; i < conf->raid_disks * 2; i++) { | 209 | for (i = 0; i < conf->raid_disks * 2; i++) { |
| @@ -215,7 +214,7 @@ static void put_buf(struct r1bio *r1_bio) | |||
| 215 | 214 | ||
| 216 | mempool_free(r1_bio, conf->r1buf_pool); | 215 | mempool_free(r1_bio, conf->r1buf_pool); |
| 217 | 216 | ||
| 218 | lower_barrier(conf); | 217 | lower_barrier(conf, sect); |
| 219 | } | 218 | } |
| 220 | 219 | ||
| 221 | static void reschedule_retry(struct r1bio *r1_bio) | 220 | static void reschedule_retry(struct r1bio *r1_bio) |
| @@ -223,10 +222,12 @@ static void reschedule_retry(struct r1bio *r1_bio) | |||
| 223 | unsigned long flags; | 222 | unsigned long flags; |
| 224 | struct mddev *mddev = r1_bio->mddev; | 223 | struct mddev *mddev = r1_bio->mddev; |
| 225 | struct r1conf *conf = mddev->private; | 224 | struct r1conf *conf = mddev->private; |
| 225 | int idx; | ||
| 226 | 226 | ||
| 227 | idx = sector_to_idx(r1_bio->sector); | ||
| 227 | spin_lock_irqsave(&conf->device_lock, flags); | 228 | spin_lock_irqsave(&conf->device_lock, flags); |
| 228 | list_add(&r1_bio->retry_list, &conf->retry_list); | 229 | list_add(&r1_bio->retry_list, &conf->retry_list); |
| 229 | conf->nr_queued ++; | 230 | atomic_inc(&conf->nr_queued[idx]); |
| 230 | spin_unlock_irqrestore(&conf->device_lock, flags); | 231 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 231 | 232 | ||
| 232 | wake_up(&conf->wait_barrier); | 233 | wake_up(&conf->wait_barrier); |
| @@ -243,7 +244,6 @@ static void call_bio_endio(struct r1bio *r1_bio) | |||
| 243 | struct bio *bio = r1_bio->master_bio; | 244 | struct bio *bio = r1_bio->master_bio; |
| 244 | int done; | 245 | int done; |
| 245 | struct r1conf *conf = r1_bio->mddev->private; | 246 | struct r1conf *conf = r1_bio->mddev->private; |
| 246 | sector_t start_next_window = r1_bio->start_next_window; | ||
| 247 | sector_t bi_sector = bio->bi_iter.bi_sector; | 247 | sector_t bi_sector = bio->bi_iter.bi_sector; |
| 248 | 248 | ||
| 249 | if (bio->bi_phys_segments) { | 249 | if (bio->bi_phys_segments) { |
| @@ -269,7 +269,7 @@ static void call_bio_endio(struct r1bio *r1_bio) | |||
| 269 | * Wake up any possible resync thread that waits for the device | 269 | * Wake up any possible resync thread that waits for the device |
| 270 | * to go idle. | 270 | * to go idle. |
| 271 | */ | 271 | */ |
| 272 | allow_barrier(conf, start_next_window, bi_sector); | 272 | allow_barrier(conf, bi_sector); |
| 273 | } | 273 | } |
| 274 | } | 274 | } |
| 275 | 275 | ||
| @@ -517,6 +517,25 @@ static void raid1_end_write_request(struct bio *bio) | |||
| 517 | bio_put(to_put); | 517 | bio_put(to_put); |
| 518 | } | 518 | } |
| 519 | 519 | ||
| 520 | static sector_t align_to_barrier_unit_end(sector_t start_sector, | ||
| 521 | sector_t sectors) | ||
| 522 | { | ||
| 523 | sector_t len; | ||
| 524 | |||
| 525 | WARN_ON(sectors == 0); | ||
| 526 | /* | ||
| 527 | * len is the number of sectors from start_sector to end of the | ||
| 528 | * barrier unit which start_sector belongs to. | ||
| 529 | */ | ||
| 530 | len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) - | ||
| 531 | start_sector; | ||
| 532 | |||
| 533 | if (len > sectors) | ||
| 534 | len = sectors; | ||
| 535 | |||
| 536 | return len; | ||
| 537 | } | ||
| 538 | |||
| 520 | /* | 539 | /* |
| 521 | * This routine returns the disk from which the requested read should | 540 | * This routine returns the disk from which the requested read should |
| 522 | * be done. There is a per-array 'next expected sequential IO' sector | 541 | * be done. There is a per-array 'next expected sequential IO' sector |
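The new helpers above lean on BARRIER_UNIT_SECTOR_SIZE, BARRIER_BUCKETS_NR and sector_to_idx(), which come from the matching raid1.h change and are not visible in this file. A minimal userspace sketch of the intended arithmetic, with an assumed 64MB unit, an assumed bucket count, and a plain mask standing in for the kernel's hash:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* assumed values; the real definitions live in drivers/md/raid1.h */
#define BARRIER_UNIT_SECTOR_BITS  17                       /* 64MB in 512-byte sectors */
#define BARRIER_UNIT_SECTOR_SIZE  (1ULL << BARRIER_UNIT_SECTOR_BITS)
#define BARRIER_BUCKETS_NR        1024

/* round_up() as in the kernel, valid for power-of-two alignment */
#define round_up(x, y) ((((x) - 1) | ((y) - 1)) + 1)

/* bucket index of a sector: the kernel hashes the unit number, a mask is enough here */
static int sector_to_idx(sector_t sector)
{
	return (sector >> BARRIER_UNIT_SECTOR_BITS) & (BARRIER_BUCKETS_NR - 1);
}

/* same logic as align_to_barrier_unit_end() in the hunk above */
static sector_t align_to_barrier_unit_end(sector_t start_sector, sector_t sectors)
{
	sector_t len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) - start_sector;

	return len > sectors ? sectors : len;
}

int main(void)
{
	/* a 2048-sector request that starts 8 sectors before a unit boundary */
	sector_t start = BARRIER_UNIT_SECTOR_SIZE - 8;

	printf("bucket %d gets %llu sectors, the rest falls into bucket %d\n",
	       sector_to_idx(start),
	       (unsigned long long)align_to_barrier_unit_end(start, 2048),
	       sector_to_idx(start + 8));
	return 0;
}

Run as-is it reports that the request contributes only 8 sectors to bucket 0 before spilling into the next barrier unit, which is exactly the boundary align_to_barrier_unit_end() enforces for both regular and resync I/O.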
| @@ -813,168 +832,228 @@ static void flush_pending_writes(struct r1conf *conf) | |||
| 813 | */ | 832 | */ |
| 814 | static void raise_barrier(struct r1conf *conf, sector_t sector_nr) | 833 | static void raise_barrier(struct r1conf *conf, sector_t sector_nr) |
| 815 | { | 834 | { |
| 835 | int idx = sector_to_idx(sector_nr); | ||
| 836 | |||
| 816 | spin_lock_irq(&conf->resync_lock); | 837 | spin_lock_irq(&conf->resync_lock); |
| 817 | 838 | ||
| 818 | /* Wait until no block IO is waiting */ | 839 | /* Wait until no block IO is waiting */ |
| 819 | wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, | 840 | wait_event_lock_irq(conf->wait_barrier, |
| 841 | !atomic_read(&conf->nr_waiting[idx]), | ||
| 820 | conf->resync_lock); | 842 | conf->resync_lock); |
| 821 | 843 | ||
| 822 | /* block any new IO from starting */ | 844 | /* block any new IO from starting */ |
| 823 | conf->barrier++; | 845 | atomic_inc(&conf->barrier[idx]); |
| 824 | conf->next_resync = sector_nr; | 846 | /* |
| 847 | * In raise_barrier() we firstly increase conf->barrier[idx] then | ||
| 848 | * check conf->nr_pending[idx]. In _wait_barrier() we firstly | ||
| 849 | * increase conf->nr_pending[idx] then check conf->barrier[idx]. | ||
| 850 | * A memory barrier is needed here to make sure conf->nr_pending[idx] won't |
| 851 | * be fetched before conf->barrier[idx] is increased. Otherwise | ||
| 852 | * there will be a race between raise_barrier() and _wait_barrier(). | ||
| 853 | */ | ||
| 854 | smp_mb__after_atomic(); | ||
| 825 | 855 | ||
| 826 | /* For these conditions we must wait: | 856 | /* For these conditions we must wait: |
| 827 | * A: while the array is in frozen state | 857 | * A: while the array is in frozen state |
| 828 | * B: while barrier >= RESYNC_DEPTH, meaning resync reach | 858 | * B: while conf->nr_pending[idx] is not 0, meaning regular I/O |
| 829 | * the max count which allowed. | 859 | * existing in corresponding I/O barrier bucket. |
| 830 | * C: next_resync + RESYNC_SECTORS > start_next_window, meaning | 860 | * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches |
| 831 | * next resync will reach to the window which normal bios are | 861 | * max resync count which allowed on current I/O barrier bucket. |
| 832 | * handling. | ||
| 833 | * D: while there are any active requests in the current window. | ||
| 834 | */ | 862 | */ |
| 835 | wait_event_lock_irq(conf->wait_barrier, | 863 | wait_event_lock_irq(conf->wait_barrier, |
| 836 | !conf->array_frozen && | 864 | !conf->array_frozen && |
| 837 | conf->barrier < RESYNC_DEPTH && | 865 | !atomic_read(&conf->nr_pending[idx]) && |
| 838 | conf->current_window_requests == 0 && | 866 | atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH, |
| 839 | (conf->start_next_window >= | ||
| 840 | conf->next_resync + RESYNC_SECTORS), | ||
| 841 | conf->resync_lock); | 867 | conf->resync_lock); |
| 842 | 868 | ||
| 843 | conf->nr_pending++; | 869 | atomic_inc(&conf->nr_pending[idx]); |
| 844 | spin_unlock_irq(&conf->resync_lock); | 870 | spin_unlock_irq(&conf->resync_lock); |
| 845 | } | 871 | } |
| 846 | 872 | ||
| 847 | static void lower_barrier(struct r1conf *conf) | 873 | static void lower_barrier(struct r1conf *conf, sector_t sector_nr) |
| 848 | { | 874 | { |
| 849 | unsigned long flags; | 875 | int idx = sector_to_idx(sector_nr); |
| 850 | BUG_ON(conf->barrier <= 0); | 876 | |
| 851 | spin_lock_irqsave(&conf->resync_lock, flags); | 877 | BUG_ON(atomic_read(&conf->barrier[idx]) <= 0); |
| 852 | conf->barrier--; | 878 | |
| 853 | conf->nr_pending--; | 879 | atomic_dec(&conf->barrier[idx]); |
| 854 | spin_unlock_irqrestore(&conf->resync_lock, flags); | 880 | atomic_dec(&conf->nr_pending[idx]); |
| 855 | wake_up(&conf->wait_barrier); | 881 | wake_up(&conf->wait_barrier); |
| 856 | } | 882 | } |
| 857 | 883 | ||
| 858 | static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) | 884 | static void _wait_barrier(struct r1conf *conf, int idx) |
| 859 | { | 885 | { |
| 860 | bool wait = false; | 886 | /* |
| 887 | * We need to increase conf->nr_pending[idx] very early here, | ||
| 888 | * then raise_barrier() can be blocked when it waits for | ||
| 889 | * conf->nr_pending[idx] to be 0. Then we can avoid holding | ||
| 890 | * conf->resync_lock when there is no barrier raised in same | ||
| 891 | * barrier unit bucket. Also if the array is frozen, I/O | ||
| 892 | * should be blocked until array is unfrozen. | ||
| 893 | */ | ||
| 894 | atomic_inc(&conf->nr_pending[idx]); | ||
| 895 | /* | ||
| 896 | * In _wait_barrier() we firstly increase conf->nr_pending[idx], then | ||
| 897 | * check conf->barrier[idx]. In raise_barrier() we firstly increase | ||
| 898 | * conf->barrier[idx], then check conf->nr_pending[idx]. A memory | ||
| 899 | * barrier is necessary here to make sure conf->barrier[idx] won't be | ||
| 900 | * fetched before conf->nr_pending[idx] is increased. Otherwise there | ||
| 901 | * will be a race between _wait_barrier() and raise_barrier(). | ||
| 902 | */ | ||
| 903 | smp_mb__after_atomic(); | ||
| 861 | 904 | ||
| 862 | if (conf->array_frozen || !bio) | 905 | /* |
| 863 | wait = true; | 906 | * Don't worry about checking two atomic_t variables at same time |
| 864 | else if (conf->barrier && bio_data_dir(bio) == WRITE) { | 907 | * here. If, while we check conf->barrier[idx], the array is |
| 865 | if ((conf->mddev->curr_resync_completed | 908 | * frozen (conf->array_frozen is 1), and conf->barrier[idx] is |
| 866 | >= bio_end_sector(bio)) || | 909 | * 0, it is safe to return and make the I/O continue. Because the |
| 867 | (conf->start_next_window + NEXT_NORMALIO_DISTANCE | 910 | * array is frozen, all I/O returned here will eventually complete |
| 868 | <= bio->bi_iter.bi_sector)) | 911 | * or be queued, no race will happen. See code comment in |
| 869 | wait = false; | 912 | * freeze_array(). |
| 870 | else | 913 | */ |
| 871 | wait = true; | 914 | if (!READ_ONCE(conf->array_frozen) && |
| 872 | } | 915 | !atomic_read(&conf->barrier[idx])) |
| 916 | return; | ||
| 873 | 917 | ||
| 874 | return wait; | 918 | /* |
| 919 | * After holding conf->resync_lock, conf->nr_pending[idx] | ||
| 920 | * should be decreased before waiting for barrier to drop. | ||
| 921 | * Otherwise, we may encounter a race condition because | ||
| 922 | * raise_barrer() might be waiting for conf->nr_pending[idx] | ||
| 923 | * to be 0 at same time. | ||
| 924 | */ | ||
| 925 | spin_lock_irq(&conf->resync_lock); | ||
| 926 | atomic_inc(&conf->nr_waiting[idx]); | ||
| 927 | atomic_dec(&conf->nr_pending[idx]); | ||
| 928 | /* | ||
| 929 | * In case freeze_array() is waiting for | ||
| 930 | * get_unqueued_pending() == extra | ||
| 931 | */ | ||
| 932 | wake_up(&conf->wait_barrier); | ||
| 933 | /* Wait for the barrier in same barrier unit bucket to drop. */ | ||
| 934 | wait_event_lock_irq(conf->wait_barrier, | ||
| 935 | !conf->array_frozen && | ||
| 936 | !atomic_read(&conf->barrier[idx]), | ||
| 937 | conf->resync_lock); | ||
| 938 | atomic_inc(&conf->nr_pending[idx]); | ||
| 939 | atomic_dec(&conf->nr_waiting[idx]); | ||
| 940 | spin_unlock_irq(&conf->resync_lock); | ||
| 875 | } | 941 | } |
| 876 | 942 | ||
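A userspace sketch of the ordering contract spelled out in the raise_barrier() and _wait_barrier() comments above. C11 atomics and an explicit fence stand in for atomic_inc() plus smp_mb__after_atomic(); this only illustrates the store-then-load pairing, it is not the driver code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int barrier_cnt;		/* plays the role of conf->barrier[idx] */
static atomic_int nr_pending_cnt;	/* plays the role of conf->nr_pending[idx] */

/* resync side: publish the barrier first, then look for in-flight normal I/O */
static bool resync_sees_idle_bucket(void)
{
	atomic_fetch_add_explicit(&barrier_cnt, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__after_atomic() */
	return atomic_load_explicit(&nr_pending_cnt, memory_order_relaxed) == 0;
}

/* normal I/O side: publish the request first, then look for a raised barrier */
static bool io_sees_no_barrier(void)
{
	atomic_fetch_add_explicit(&nr_pending_cnt, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__after_atomic() */
	return atomic_load_explicit(&barrier_cnt, memory_order_relaxed) == 0;
}

int main(void)
{
	/*
	 * Because both sides increment their own counter before loading the
	 * other's, at least one of them always observes the other's increment,
	 * so a resync and a normal I/O can never both slip past each other.
	 */
	printf("io may proceed: %d\n", io_sees_no_barrier());
	printf("resync may proceed: %d\n", resync_sees_idle_bucket());
	return 0;
}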
| 877 | static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) | 943 | static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr) |
| 878 | { | 944 | { |
| 879 | sector_t sector = 0; | 945 | int idx = sector_to_idx(sector_nr); |
| 880 | 946 | ||
| 881 | spin_lock_irq(&conf->resync_lock); | 947 | /* |
| 882 | if (need_to_wait_for_sync(conf, bio)) { | 948 | * Very similar to _wait_barrier(). The difference is, for read |
| 883 | conf->nr_waiting++; | 949 | * I/O we don't need to wait for sync I/O, but if the whole array |
| 884 | /* Wait for the barrier to drop. | 950 | * is frozen, the read I/O still has to wait until the array is |
| 885 | * However if there are already pending | 951 | * unfrozen. Since there is no ordering requirement with |
| 886 | * requests (preventing the barrier from | 952 | * conf->barrier[idx] here, memory barrier is unnecessary as well. |
| 887 | * rising completely), and the | 953 | */ |
| 888 | * per-process bio queue isn't empty, | 954 | atomic_inc(&conf->nr_pending[idx]); |
| 889 | * then don't wait, as we need to empty | ||
| 890 | * that queue to allow conf->start_next_window | ||
| 891 | * to increase. | ||
| 892 | */ | ||
| 893 | raid1_log(conf->mddev, "wait barrier"); | ||
| 894 | wait_event_lock_irq(conf->wait_barrier, | ||
| 895 | !conf->array_frozen && | ||
| 896 | (!conf->barrier || | ||
| 897 | ((conf->start_next_window < | ||
| 898 | conf->next_resync + RESYNC_SECTORS) && | ||
| 899 | current->bio_list && | ||
| 900 | !bio_list_empty(current->bio_list))), | ||
| 901 | conf->resync_lock); | ||
| 902 | conf->nr_waiting--; | ||
| 903 | } | ||
| 904 | |||
| 905 | if (bio && bio_data_dir(bio) == WRITE) { | ||
| 906 | if (bio->bi_iter.bi_sector >= conf->next_resync) { | ||
| 907 | if (conf->start_next_window == MaxSector) | ||
| 908 | conf->start_next_window = | ||
| 909 | conf->next_resync + | ||
| 910 | NEXT_NORMALIO_DISTANCE; | ||
| 911 | |||
| 912 | if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) | ||
| 913 | <= bio->bi_iter.bi_sector) | ||
| 914 | conf->next_window_requests++; | ||
| 915 | else | ||
| 916 | conf->current_window_requests++; | ||
| 917 | sector = conf->start_next_window; | ||
| 918 | } | ||
| 919 | } | ||
| 920 | 955 | ||
| 921 | conf->nr_pending++; | 956 | if (!READ_ONCE(conf->array_frozen)) |
| 957 | return; | ||
| 958 | |||
| 959 | spin_lock_irq(&conf->resync_lock); | ||
| 960 | atomic_inc(&conf->nr_waiting[idx]); | ||
| 961 | atomic_dec(&conf->nr_pending[idx]); | ||
| 962 | /* | ||
| 963 | * In case freeze_array() is waiting for | ||
| 964 | * get_unqueued_pending() == extra | ||
| 965 | */ | ||
| 966 | wake_up(&conf->wait_barrier); | ||
| 967 | /* Wait for array to be unfrozen */ | ||
| 968 | wait_event_lock_irq(conf->wait_barrier, | ||
| 969 | !conf->array_frozen, | ||
| 970 | conf->resync_lock); | ||
| 971 | atomic_inc(&conf->nr_pending[idx]); | ||
| 972 | atomic_dec(&conf->nr_waiting[idx]); | ||
| 922 | spin_unlock_irq(&conf->resync_lock); | 973 | spin_unlock_irq(&conf->resync_lock); |
| 923 | return sector; | ||
| 924 | } | 974 | } |
| 925 | 975 | ||
| 926 | static void allow_barrier(struct r1conf *conf, sector_t start_next_window, | 976 | static void wait_barrier(struct r1conf *conf, sector_t sector_nr) |
| 927 | sector_t bi_sector) | ||
| 928 | { | 977 | { |
| 929 | unsigned long flags; | 978 | int idx = sector_to_idx(sector_nr); |
| 930 | 979 | ||
| 931 | spin_lock_irqsave(&conf->resync_lock, flags); | 980 | _wait_barrier(conf, idx); |
| 932 | conf->nr_pending--; | 981 | } |
| 933 | if (start_next_window) { | 982 | |
| 934 | if (start_next_window == conf->start_next_window) { | 983 | static void wait_all_barriers(struct r1conf *conf) |
| 935 | if (conf->start_next_window + NEXT_NORMALIO_DISTANCE | 984 | { |
| 936 | <= bi_sector) | 985 | int idx; |
| 937 | conf->next_window_requests--; | 986 | |
| 938 | else | 987 | for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) |
| 939 | conf->current_window_requests--; | 988 | _wait_barrier(conf, idx); |
| 940 | } else | 989 | } |
| 941 | conf->current_window_requests--; | 990 | |
| 942 | 991 | static void _allow_barrier(struct r1conf *conf, int idx) | |
| 943 | if (!conf->current_window_requests) { | 992 | { |
| 944 | if (conf->next_window_requests) { | 993 | atomic_dec(&conf->nr_pending[idx]); |
| 945 | conf->current_window_requests = | ||
| 946 | conf->next_window_requests; | ||
| 947 | conf->next_window_requests = 0; | ||
| 948 | conf->start_next_window += | ||
| 949 | NEXT_NORMALIO_DISTANCE; | ||
| 950 | } else | ||
| 951 | conf->start_next_window = MaxSector; | ||
| 952 | } | ||
| 953 | } | ||
| 954 | spin_unlock_irqrestore(&conf->resync_lock, flags); | ||
| 955 | wake_up(&conf->wait_barrier); | 994 | wake_up(&conf->wait_barrier); |
| 956 | } | 995 | } |
| 957 | 996 | ||
| 997 | static void allow_barrier(struct r1conf *conf, sector_t sector_nr) | ||
| 998 | { | ||
| 999 | int idx = sector_to_idx(sector_nr); | ||
| 1000 | |||
| 1001 | _allow_barrier(conf, idx); | ||
| 1002 | } | ||
| 1003 | |||
| 1004 | static void allow_all_barriers(struct r1conf *conf) | ||
| 1005 | { | ||
| 1006 | int idx; | ||
| 1007 | |||
| 1008 | for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) | ||
| 1009 | _allow_barrier(conf, idx); | ||
| 1010 | } | ||
| 1011 | |||
| 1012 | /* conf->resync_lock should be held */ | ||
| 1013 | static int get_unqueued_pending(struct r1conf *conf) | ||
| 1014 | { | ||
| 1015 | int idx, ret; | ||
| 1016 | |||
| 1017 | for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++) | ||
| 1018 | ret += atomic_read(&conf->nr_pending[idx]) - | ||
| 1019 | atomic_read(&conf->nr_queued[idx]); | ||
| 1020 | |||
| 1021 | return ret; | ||
| 1022 | } | ||
| 1023 | |||
| 958 | static void freeze_array(struct r1conf *conf, int extra) | 1024 | static void freeze_array(struct r1conf *conf, int extra) |
| 959 | { | 1025 | { |
| 960 | /* stop syncio and normal IO and wait for everything to | 1026 | /* Stop sync I/O and normal I/O and wait for everything to |
| 961 | * go quiet. | 1027 | * go quiet. |
| 962 | * We wait until nr_pending match nr_queued+extra | 1028 | * This is called in two situations: |
| 963 | * This is called in the context of one normal IO request | 1029 | * 1) management command handlers (reshape, remove disk, quiesce). |
| 964 | * that has failed. Thus any sync request that might be pending | 1030 | * 2) one normal I/O request failed. |
| 965 | * will be blocked by nr_pending, and we need to wait for | 1031 | |
| 966 | * pending IO requests to complete or be queued for re-try. | 1032 | * After array_frozen is set to 1, new sync IO will be blocked at |
| 967 | * Thus the number queued (nr_queued) plus this request (extra) | 1033 | * raise_barrier(), and new normal I/O will be blocked at _wait_barrier() |
| 968 | * must match the number of pending IOs (nr_pending) before | 1034 | * or wait_read_barrier(). The flying I/Os will either complete or be |
| 969 | * we continue. | 1035 | * queued. When everything goes quiet, there are only queued I/Os left. |
| 1036 | |||
| 1037 | * Every flying I/O contributes to a conf->nr_pending[idx], idx is the | ||
| 1038 | * barrier bucket index which this I/O request hits. When all sync and | ||
| 1039 | * normal I/O are queued, sum of all conf->nr_pending[] will match sum | ||
| 1040 | * of all conf->nr_queued[]. But normal I/O failure is an exception, | ||
| 1041 | * in handle_read_error(), we may call freeze_array() before trying to | ||
| 1042 | * fix the read error. In this case, the error read I/O is not queued, | ||
| 1043 | * so get_unqueued_pending() == 1. | ||
| 1044 | * | ||
| 1045 | * Therefore before this function returns, we need to wait until | ||
| 1046 | * get_unqueued_pending(conf) gets equal to extra. For |
| 1047 | * normal I/O context, extra is 1, in other situations extra is 0. |
| 970 | */ | 1048 | */ |
| 971 | spin_lock_irq(&conf->resync_lock); | 1049 | spin_lock_irq(&conf->resync_lock); |
| 972 | conf->array_frozen = 1; | 1050 | conf->array_frozen = 1; |
| 973 | raid1_log(conf->mddev, "wait freeze"); | 1051 | raid1_log(conf->mddev, "wait freeze"); |
| 974 | wait_event_lock_irq_cmd(conf->wait_barrier, | 1052 | wait_event_lock_irq_cmd( |
| 975 | conf->nr_pending == conf->nr_queued+extra, | 1053 | conf->wait_barrier, |
| 976 | conf->resync_lock, | 1054 | get_unqueued_pending(conf) == extra, |
| 977 | flush_pending_writes(conf)); | 1055 | conf->resync_lock, |
| 1056 | flush_pending_writes(conf)); | ||
| 978 | spin_unlock_irq(&conf->resync_lock); | 1057 | spin_unlock_irq(&conf->resync_lock); |
| 979 | } | 1058 | } |
| 980 | static void unfreeze_array(struct r1conf *conf) | 1059 | static void unfreeze_array(struct r1conf *conf) |
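A toy model of the condition freeze_array() now waits on. The bucket count and the single-failed-read scenario are invented purely to show why extra is 1 in the read-error path and 0 everywhere else:

#include <stdio.h>

#define BUCKETS 4	/* tiny bucket count, only for this model */

static int nr_pending[BUCKETS];
static int nr_queued[BUCKETS];

/* same sum that get_unqueued_pending() computes in the hunk above */
static int get_unqueued_pending(void)
{
	int idx, ret = 0;

	for (idx = 0; idx < BUCKETS; idx++)
		ret += nr_pending[idx] - nr_queued[idx];
	return ret;
}

int main(void)
{
	/* a normal read in bucket 2 has failed: still pending, not yet queued */
	nr_pending[2] = 1;
	printf("unqueued pending = %d -> freeze_array(conf, 1) can finish\n",
	       get_unqueued_pending());

	/* once that r1_bio is parked on a retry list, nr_queued catches up */
	nr_queued[2] = 1;
	printf("unqueued pending = %d -> freeze_array(conf, 0) can finish\n",
	       get_unqueued_pending());
	return 0;
}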
| @@ -982,8 +1061,8 @@ static void unfreeze_array(struct r1conf *conf) | |||
| 982 | /* reverse the effect of the freeze */ | 1061 | /* reverse the effect of the freeze */ |
| 983 | spin_lock_irq(&conf->resync_lock); | 1062 | spin_lock_irq(&conf->resync_lock); |
| 984 | conf->array_frozen = 0; | 1063 | conf->array_frozen = 0; |
| 985 | wake_up(&conf->wait_barrier); | ||
| 986 | spin_unlock_irq(&conf->resync_lock); | 1064 | spin_unlock_irq(&conf->resync_lock); |
| 1065 | wake_up(&conf->wait_barrier); | ||
| 987 | } | 1066 | } |
| 988 | 1067 | ||
| 989 | /* duplicate the data pages for behind I/O | 1068 | /* duplicate the data pages for behind I/O |
| @@ -1070,11 +1149,28 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) | |||
| 1070 | kfree(plug); | 1149 | kfree(plug); |
| 1071 | } | 1150 | } |
| 1072 | 1151 | ||
| 1073 | static void raid1_read_request(struct mddev *mddev, struct bio *bio, | 1152 | static inline struct r1bio * |
| 1074 | struct r1bio *r1_bio) | 1153 | alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled) |
| 1154 | { | ||
| 1155 | struct r1conf *conf = mddev->private; | ||
| 1156 | struct r1bio *r1_bio; | ||
| 1157 | |||
| 1158 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
| 1159 | |||
| 1160 | r1_bio->master_bio = bio; | ||
| 1161 | r1_bio->sectors = bio_sectors(bio) - sectors_handled; | ||
| 1162 | r1_bio->state = 0; | ||
| 1163 | r1_bio->mddev = mddev; | ||
| 1164 | r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; | ||
| 1165 | |||
| 1166 | return r1_bio; | ||
| 1167 | } | ||
| 1168 | |||
| 1169 | static void raid1_read_request(struct mddev *mddev, struct bio *bio) | ||
| 1075 | { | 1170 | { |
| 1076 | struct r1conf *conf = mddev->private; | 1171 | struct r1conf *conf = mddev->private; |
| 1077 | struct raid1_info *mirror; | 1172 | struct raid1_info *mirror; |
| 1173 | struct r1bio *r1_bio; | ||
| 1078 | struct bio *read_bio; | 1174 | struct bio *read_bio; |
| 1079 | struct bitmap *bitmap = mddev->bitmap; | 1175 | struct bitmap *bitmap = mddev->bitmap; |
| 1080 | const int op = bio_op(bio); | 1176 | const int op = bio_op(bio); |
| @@ -1083,8 +1179,29 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, | |||
| 1083 | int max_sectors; | 1179 | int max_sectors; |
| 1084 | int rdisk; | 1180 | int rdisk; |
| 1085 | 1181 | ||
| 1086 | wait_barrier(conf, bio); | 1182 | /* |
| 1183 | * Still need barrier for READ in case that whole | ||
| 1184 | * array is frozen. | ||
| 1185 | */ | ||
| 1186 | wait_read_barrier(conf, bio->bi_iter.bi_sector); | ||
| 1187 | |||
| 1188 | r1_bio = alloc_r1bio(mddev, bio, 0); | ||
| 1087 | 1189 | ||
| 1190 | /* | ||
| 1191 | * We might need to issue multiple reads to different | ||
| 1192 | * devices if there are bad blocks around, so we keep | ||
| 1193 | * track of the number of reads in bio->bi_phys_segments. | ||
| 1194 | * If this is 0, there is only one r1_bio and no locking | ||
| 1195 | * will be needed when requests complete. If it is | ||
| 1196 | * non-zero, then it is the number of not-completed requests. | ||
| 1197 | */ | ||
| 1198 | bio->bi_phys_segments = 0; | ||
| 1199 | bio_clear_flag(bio, BIO_SEG_VALID); | ||
| 1200 | |||
| 1201 | /* | ||
| 1202 | * make_request() can abort the operation when read-ahead is being | ||
| 1203 | * used and no empty request is available. | ||
| 1204 | */ | ||
| 1088 | read_again: | 1205 | read_again: |
| 1089 | rdisk = read_balance(conf, r1_bio, &max_sectors); | 1206 | rdisk = read_balance(conf, r1_bio, &max_sectors); |
| 1090 | 1207 | ||
| @@ -1106,9 +1223,8 @@ read_again: | |||
| 1106 | atomic_read(&bitmap->behind_writes) == 0); | 1223 | atomic_read(&bitmap->behind_writes) == 0); |
| 1107 | } | 1224 | } |
| 1108 | r1_bio->read_disk = rdisk; | 1225 | r1_bio->read_disk = rdisk; |
| 1109 | r1_bio->start_next_window = 0; | ||
| 1110 | 1226 | ||
| 1111 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1227 | read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); |
| 1112 | bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, | 1228 | bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, |
| 1113 | max_sectors); | 1229 | max_sectors); |
| 1114 | 1230 | ||
| @@ -1151,22 +1267,16 @@ read_again: | |||
| 1151 | */ | 1267 | */ |
| 1152 | reschedule_retry(r1_bio); | 1268 | reschedule_retry(r1_bio); |
| 1153 | 1269 | ||
| 1154 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | 1270 | r1_bio = alloc_r1bio(mddev, bio, sectors_handled); |
| 1155 | |||
| 1156 | r1_bio->master_bio = bio; | ||
| 1157 | r1_bio->sectors = bio_sectors(bio) - sectors_handled; | ||
| 1158 | r1_bio->state = 0; | ||
| 1159 | r1_bio->mddev = mddev; | ||
| 1160 | r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; | ||
| 1161 | goto read_again; | 1271 | goto read_again; |
| 1162 | } else | 1272 | } else |
| 1163 | generic_make_request(read_bio); | 1273 | generic_make_request(read_bio); |
| 1164 | } | 1274 | } |
| 1165 | 1275 | ||
| 1166 | static void raid1_write_request(struct mddev *mddev, struct bio *bio, | 1276 | static void raid1_write_request(struct mddev *mddev, struct bio *bio) |
| 1167 | struct r1bio *r1_bio) | ||
| 1168 | { | 1277 | { |
| 1169 | struct r1conf *conf = mddev->private; | 1278 | struct r1conf *conf = mddev->private; |
| 1279 | struct r1bio *r1_bio; | ||
| 1170 | int i, disks; | 1280 | int i, disks; |
| 1171 | struct bitmap *bitmap = mddev->bitmap; | 1281 | struct bitmap *bitmap = mddev->bitmap; |
| 1172 | unsigned long flags; | 1282 | unsigned long flags; |
| @@ -1176,7 +1286,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1176 | int first_clone; | 1286 | int first_clone; |
| 1177 | int sectors_handled; | 1287 | int sectors_handled; |
| 1178 | int max_sectors; | 1288 | int max_sectors; |
| 1179 | sector_t start_next_window; | ||
| 1180 | 1289 | ||
| 1181 | /* | 1290 | /* |
| 1182 | * Register the new request and wait if the reconstruction | 1291 | * Register the new request and wait if the reconstruction |
| @@ -1212,7 +1321,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1212 | } | 1321 | } |
| 1213 | finish_wait(&conf->wait_barrier, &w); | 1322 | finish_wait(&conf->wait_barrier, &w); |
| 1214 | } | 1323 | } |
| 1215 | start_next_window = wait_barrier(conf, bio); | 1324 | wait_barrier(conf, bio->bi_iter.bi_sector); |
| 1325 | |||
| 1326 | r1_bio = alloc_r1bio(mddev, bio, 0); | ||
| 1327 | |||
| 1328 | /* We might need to issue multiple writes to different | ||
| 1329 | * devices if there are bad blocks around, so we keep | ||
| 1330 | * track of the number of writes in bio->bi_phys_segments. | ||
| 1331 | * If this is 0, there is only one r1_bio and no locking | ||
| 1332 | * will be needed when requests complete. If it is | ||
| 1333 | * non-zero, then it is the number of not-completed requests. | ||
| 1334 | */ | ||
| 1335 | bio->bi_phys_segments = 0; | ||
| 1336 | bio_clear_flag(bio, BIO_SEG_VALID); | ||
| 1216 | 1337 | ||
| 1217 | if (conf->pending_count >= max_queued_requests) { | 1338 | if (conf->pending_count >= max_queued_requests) { |
| 1218 | md_wakeup_thread(mddev->thread); | 1339 | md_wakeup_thread(mddev->thread); |
| @@ -1233,7 +1354,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1233 | 1354 | ||
| 1234 | disks = conf->raid_disks * 2; | 1355 | disks = conf->raid_disks * 2; |
| 1235 | retry_write: | 1356 | retry_write: |
| 1236 | r1_bio->start_next_window = start_next_window; | ||
| 1237 | blocked_rdev = NULL; | 1357 | blocked_rdev = NULL; |
| 1238 | rcu_read_lock(); | 1358 | rcu_read_lock(); |
| 1239 | max_sectors = r1_bio->sectors; | 1359 | max_sectors = r1_bio->sectors; |
| @@ -1300,25 +1420,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1300 | if (unlikely(blocked_rdev)) { | 1420 | if (unlikely(blocked_rdev)) { |
| 1301 | /* Wait for this device to become unblocked */ | 1421 | /* Wait for this device to become unblocked */ |
| 1302 | int j; | 1422 | int j; |
| 1303 | sector_t old = start_next_window; | ||
| 1304 | 1423 | ||
| 1305 | for (j = 0; j < i; j++) | 1424 | for (j = 0; j < i; j++) |
| 1306 | if (r1_bio->bios[j]) | 1425 | if (r1_bio->bios[j]) |
| 1307 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); | 1426 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); |
| 1308 | r1_bio->state = 0; | 1427 | r1_bio->state = 0; |
| 1309 | allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); | 1428 | allow_barrier(conf, bio->bi_iter.bi_sector); |
| 1310 | raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); | 1429 | raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); |
| 1311 | md_wait_for_blocked_rdev(blocked_rdev, mddev); | 1430 | md_wait_for_blocked_rdev(blocked_rdev, mddev); |
| 1312 | start_next_window = wait_barrier(conf, bio); | 1431 | wait_barrier(conf, bio->bi_iter.bi_sector); |
| 1313 | /* | ||
| 1314 | * We must make sure the multi r1bios of bio have | ||
| 1315 | * the same value of bi_phys_segments | ||
| 1316 | */ | ||
| 1317 | if (bio->bi_phys_segments && old && | ||
| 1318 | old != start_next_window) | ||
| 1319 | /* Wait for the former r1bio(s) to complete */ | ||
| 1320 | wait_event(conf->wait_barrier, | ||
| 1321 | bio->bi_phys_segments == 1); | ||
| 1322 | goto retry_write; | 1432 | goto retry_write; |
| 1323 | } | 1433 | } |
| 1324 | 1434 | ||
| @@ -1341,13 +1451,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1341 | 1451 | ||
| 1342 | first_clone = 1; | 1452 | first_clone = 1; |
| 1343 | for (i = 0; i < disks; i++) { | 1453 | for (i = 0; i < disks; i++) { |
| 1344 | struct bio *mbio; | 1454 | struct bio *mbio = NULL; |
| 1455 | sector_t offset; | ||
| 1345 | if (!r1_bio->bios[i]) | 1456 | if (!r1_bio->bios[i]) |
| 1346 | continue; | 1457 | continue; |
| 1347 | 1458 | ||
| 1348 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1459 | offset = r1_bio->sector - bio->bi_iter.bi_sector; |
| 1349 | bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, | ||
| 1350 | max_sectors); | ||
| 1351 | 1460 | ||
| 1352 | if (first_clone) { | 1461 | if (first_clone) { |
| 1353 | /* do behind I/O ? | 1462 | /* do behind I/O ? |
| @@ -1357,8 +1466,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1357 | if (bitmap && | 1466 | if (bitmap && |
| 1358 | (atomic_read(&bitmap->behind_writes) | 1467 | (atomic_read(&bitmap->behind_writes) |
| 1359 | < mddev->bitmap_info.max_write_behind) && | 1468 | < mddev->bitmap_info.max_write_behind) && |
| 1360 | !waitqueue_active(&bitmap->behind_wait)) | 1469 | !waitqueue_active(&bitmap->behind_wait)) { |
| 1470 | mbio = bio_clone_bioset_partial(bio, GFP_NOIO, | ||
| 1471 | mddev->bio_set, | ||
| 1472 | offset << 9, | ||
| 1473 | max_sectors << 9); | ||
| 1361 | alloc_behind_pages(mbio, r1_bio); | 1474 | alloc_behind_pages(mbio, r1_bio); |
| 1475 | } | ||
| 1362 | 1476 | ||
| 1363 | bitmap_startwrite(bitmap, r1_bio->sector, | 1477 | bitmap_startwrite(bitmap, r1_bio->sector, |
| 1364 | r1_bio->sectors, | 1478 | r1_bio->sectors, |
| @@ -1366,6 +1480,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1366 | &r1_bio->state)); | 1480 | &r1_bio->state)); |
| 1367 | first_clone = 0; | 1481 | first_clone = 0; |
| 1368 | } | 1482 | } |
| 1483 | |||
| 1484 | if (!mbio) { | ||
| 1485 | if (r1_bio->behind_bvecs) | ||
| 1486 | mbio = bio_clone_bioset_partial(bio, GFP_NOIO, | ||
| 1487 | mddev->bio_set, | ||
| 1488 | offset << 9, | ||
| 1489 | max_sectors << 9); | ||
| 1490 | else { | ||
| 1491 | mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); | ||
| 1492 | bio_trim(mbio, offset, max_sectors); | ||
| 1493 | } | ||
| 1494 | } | ||
| 1495 | |||
| 1369 | if (r1_bio->behind_bvecs) { | 1496 | if (r1_bio->behind_bvecs) { |
| 1370 | struct bio_vec *bvec; | 1497 | struct bio_vec *bvec; |
| 1371 | int j; | 1498 | int j; |
| @@ -1385,8 +1512,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1385 | conf->mirrors[i].rdev->data_offset); | 1512 | conf->mirrors[i].rdev->data_offset); |
| 1386 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | 1513 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; |
| 1387 | mbio->bi_end_io = raid1_end_write_request; | 1514 | mbio->bi_end_io = raid1_end_write_request; |
| 1388 | mbio->bi_opf = bio_op(bio) | | 1515 | mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); |
| 1389 | (bio->bi_opf & (REQ_SYNC | REQ_PREFLUSH | REQ_FUA)); | ||
| 1390 | if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && | 1516 | if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && |
| 1391 | !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) && | 1517 | !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) && |
| 1392 | conf->raid_disks - mddev->degraded > 1) | 1518 | conf->raid_disks - mddev->degraded > 1) |
| @@ -1427,12 +1553,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1427 | /* We need another r1_bio. It has already been counted | 1553 | /* We need another r1_bio. It has already been counted |
| 1428 | * in bio->bi_phys_segments | 1554 | * in bio->bi_phys_segments |
| 1429 | */ | 1555 | */ |
| 1430 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | 1556 | r1_bio = alloc_r1bio(mddev, bio, sectors_handled); |
| 1431 | r1_bio->master_bio = bio; | ||
| 1432 | r1_bio->sectors = bio_sectors(bio) - sectors_handled; | ||
| 1433 | r1_bio->state = 0; | ||
| 1434 | r1_bio->mddev = mddev; | ||
| 1435 | r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; | ||
| 1436 | goto retry_write; | 1557 | goto retry_write; |
| 1437 | } | 1558 | } |
| 1438 | 1559 | ||
| @@ -1444,36 +1565,30 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, | |||
| 1444 | 1565 | ||
| 1445 | static void raid1_make_request(struct mddev *mddev, struct bio *bio) | 1566 | static void raid1_make_request(struct mddev *mddev, struct bio *bio) |
| 1446 | { | 1567 | { |
| 1447 | struct r1conf *conf = mddev->private; | 1568 | struct bio *split; |
| 1448 | struct r1bio *r1_bio; | 1569 | sector_t sectors; |
| 1449 | 1570 | ||
| 1450 | /* | 1571 | if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { |
| 1451 | * make_request() can abort the operation when read-ahead is being | 1572 | md_flush_request(mddev, bio); |
| 1452 | * used and no empty request is available. | 1573 | return; |
| 1453 | * | 1574 | } |
| 1454 | */ | ||
| 1455 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
| 1456 | |||
| 1457 | r1_bio->master_bio = bio; | ||
| 1458 | r1_bio->sectors = bio_sectors(bio); | ||
| 1459 | r1_bio->state = 0; | ||
| 1460 | r1_bio->mddev = mddev; | ||
| 1461 | r1_bio->sector = bio->bi_iter.bi_sector; | ||
| 1462 | 1575 | ||
| 1463 | /* | 1576 | /* if bio exceeds barrier unit boundary, split it */ |
| 1464 | * We might need to issue multiple reads to different devices if there | 1577 | do { |
| 1465 | * are bad blocks around, so we keep track of the number of reads in | 1578 | sectors = align_to_barrier_unit_end( |
| 1466 | * bio->bi_phys_segments. If this is 0, there is only one r1_bio and | 1579 | bio->bi_iter.bi_sector, bio_sectors(bio)); |
| 1467 | * no locking will be needed when requests complete. If it is | 1580 | if (sectors < bio_sectors(bio)) { |
| 1468 | * non-zero, then it is the number of not-completed requests. | 1581 | split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); |
| 1469 | */ | 1582 | bio_chain(split, bio); |
| 1470 | bio->bi_phys_segments = 0; | 1583 | } else { |
| 1471 | bio_clear_flag(bio, BIO_SEG_VALID); | 1584 | split = bio; |
| 1585 | } | ||
| 1472 | 1586 | ||
| 1473 | if (bio_data_dir(bio) == READ) | 1587 | if (bio_data_dir(split) == READ) |
| 1474 | raid1_read_request(mddev, bio, r1_bio); | 1588 | raid1_read_request(mddev, split); |
| 1475 | else | 1589 | else |
| 1476 | raid1_write_request(mddev, bio, r1_bio); | 1590 | raid1_write_request(mddev, split); |
| 1591 | } while (split != bio); | ||
| 1477 | } | 1592 | } |
| 1478 | 1593 | ||
| 1479 | static void raid1_status(struct seq_file *seq, struct mddev *mddev) | 1594 | static void raid1_status(struct seq_file *seq, struct mddev *mddev) |
| @@ -1564,19 +1679,11 @@ static void print_conf(struct r1conf *conf) | |||
| 1564 | 1679 | ||
| 1565 | static void close_sync(struct r1conf *conf) | 1680 | static void close_sync(struct r1conf *conf) |
| 1566 | { | 1681 | { |
| 1567 | wait_barrier(conf, NULL); | 1682 | wait_all_barriers(conf); |
| 1568 | allow_barrier(conf, 0, 0); | 1683 | allow_all_barriers(conf); |
| 1569 | 1684 | ||
| 1570 | mempool_destroy(conf->r1buf_pool); | 1685 | mempool_destroy(conf->r1buf_pool); |
| 1571 | conf->r1buf_pool = NULL; | 1686 | conf->r1buf_pool = NULL; |
| 1572 | |||
| 1573 | spin_lock_irq(&conf->resync_lock); | ||
| 1574 | conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE; | ||
| 1575 | conf->start_next_window = MaxSector; | ||
| 1576 | conf->current_window_requests += | ||
| 1577 | conf->next_window_requests; | ||
| 1578 | conf->next_window_requests = 0; | ||
| 1579 | spin_unlock_irq(&conf->resync_lock); | ||
| 1580 | } | 1687 | } |
| 1581 | 1688 | ||
| 1582 | static int raid1_spare_active(struct mddev *mddev) | 1689 | static int raid1_spare_active(struct mddev *mddev) |
| @@ -2273,7 +2380,8 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) | |||
| 2273 | 2380 | ||
| 2274 | wbio->bi_vcnt = vcnt; | 2381 | wbio->bi_vcnt = vcnt; |
| 2275 | } else { | 2382 | } else { |
| 2276 | wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); | 2383 | wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, |
| 2384 | mddev->bio_set); | ||
| 2277 | } | 2385 | } |
| 2278 | 2386 | ||
| 2279 | bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); | 2387 | bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); |
| @@ -2323,8 +2431,9 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio | |||
| 2323 | 2431 | ||
| 2324 | static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) | 2432 | static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) |
| 2325 | { | 2433 | { |
| 2326 | int m; | 2434 | int m, idx; |
| 2327 | bool fail = false; | 2435 | bool fail = false; |
| 2436 | |||
| 2328 | for (m = 0; m < conf->raid_disks * 2 ; m++) | 2437 | for (m = 0; m < conf->raid_disks * 2 ; m++) |
| 2329 | if (r1_bio->bios[m] == IO_MADE_GOOD) { | 2438 | if (r1_bio->bios[m] == IO_MADE_GOOD) { |
| 2330 | struct md_rdev *rdev = conf->mirrors[m].rdev; | 2439 | struct md_rdev *rdev = conf->mirrors[m].rdev; |
| @@ -2350,8 +2459,14 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) | |||
| 2350 | if (fail) { | 2459 | if (fail) { |
| 2351 | spin_lock_irq(&conf->device_lock); | 2460 | spin_lock_irq(&conf->device_lock); |
| 2352 | list_add(&r1_bio->retry_list, &conf->bio_end_io_list); | 2461 | list_add(&r1_bio->retry_list, &conf->bio_end_io_list); |
| 2353 | conf->nr_queued++; | 2462 | idx = sector_to_idx(r1_bio->sector); |
| 2463 | atomic_inc(&conf->nr_queued[idx]); | ||
| 2354 | spin_unlock_irq(&conf->device_lock); | 2464 | spin_unlock_irq(&conf->device_lock); |
| 2465 | /* | ||
| 2466 | * In case freeze_array() is waiting for condition | ||
| 2467 | * get_unqueued_pending() == extra to be true. | ||
| 2468 | */ | ||
| 2469 | wake_up(&conf->wait_barrier); | ||
| 2355 | md_wakeup_thread(conf->mddev->thread); | 2470 | md_wakeup_thread(conf->mddev->thread); |
| 2356 | } else { | 2471 | } else { |
| 2357 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) | 2472 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) |
| @@ -2411,7 +2526,8 @@ read_more: | |||
| 2411 | const unsigned long do_sync | 2526 | const unsigned long do_sync |
| 2412 | = r1_bio->master_bio->bi_opf & REQ_SYNC; | 2527 | = r1_bio->master_bio->bi_opf & REQ_SYNC; |
| 2413 | r1_bio->read_disk = disk; | 2528 | r1_bio->read_disk = disk; |
| 2414 | bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); | 2529 | bio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, |
| 2530 | mddev->bio_set); | ||
| 2415 | bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, | 2531 | bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, |
| 2416 | max_sectors); | 2532 | max_sectors); |
| 2417 | r1_bio->bios[r1_bio->read_disk] = bio; | 2533 | r1_bio->bios[r1_bio->read_disk] = bio; |
| @@ -2445,15 +2561,8 @@ read_more: | |||
| 2445 | generic_make_request(bio); | 2561 | generic_make_request(bio); |
| 2446 | bio = NULL; | 2562 | bio = NULL; |
| 2447 | 2563 | ||
| 2448 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | 2564 | r1_bio = alloc_r1bio(mddev, mbio, sectors_handled); |
| 2449 | |||
| 2450 | r1_bio->master_bio = mbio; | ||
| 2451 | r1_bio->sectors = bio_sectors(mbio) - sectors_handled; | ||
| 2452 | r1_bio->state = 0; | ||
| 2453 | set_bit(R1BIO_ReadError, &r1_bio->state); | 2565 | set_bit(R1BIO_ReadError, &r1_bio->state); |
| 2454 | r1_bio->mddev = mddev; | ||
| 2455 | r1_bio->sector = mbio->bi_iter.bi_sector + | ||
| 2456 | sectors_handled; | ||
| 2457 | 2566 | ||
| 2458 | goto read_more; | 2567 | goto read_more; |
| 2459 | } else { | 2568 | } else { |
| @@ -2472,6 +2581,7 @@ static void raid1d(struct md_thread *thread) | |||
| 2472 | struct r1conf *conf = mddev->private; | 2581 | struct r1conf *conf = mddev->private; |
| 2473 | struct list_head *head = &conf->retry_list; | 2582 | struct list_head *head = &conf->retry_list; |
| 2474 | struct blk_plug plug; | 2583 | struct blk_plug plug; |
| 2584 | int idx; | ||
| 2475 | 2585 | ||
| 2476 | md_check_recovery(mddev); | 2586 | md_check_recovery(mddev); |
| 2477 | 2587 | ||
| @@ -2479,17 +2589,15 @@ static void raid1d(struct md_thread *thread) | |||
| 2479 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { | 2589 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { |
| 2480 | LIST_HEAD(tmp); | 2590 | LIST_HEAD(tmp); |
| 2481 | spin_lock_irqsave(&conf->device_lock, flags); | 2591 | spin_lock_irqsave(&conf->device_lock, flags); |
| 2482 | if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { | 2592 | if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) |
| 2483 | while (!list_empty(&conf->bio_end_io_list)) { | 2593 | list_splice_init(&conf->bio_end_io_list, &tmp); |
| 2484 | list_move(conf->bio_end_io_list.prev, &tmp); | ||
| 2485 | conf->nr_queued--; | ||
| 2486 | } | ||
| 2487 | } | ||
| 2488 | spin_unlock_irqrestore(&conf->device_lock, flags); | 2594 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 2489 | while (!list_empty(&tmp)) { | 2595 | while (!list_empty(&tmp)) { |
| 2490 | r1_bio = list_first_entry(&tmp, struct r1bio, | 2596 | r1_bio = list_first_entry(&tmp, struct r1bio, |
| 2491 | retry_list); | 2597 | retry_list); |
| 2492 | list_del(&r1_bio->retry_list); | 2598 | list_del(&r1_bio->retry_list); |
| 2599 | idx = sector_to_idx(r1_bio->sector); | ||
| 2600 | atomic_dec(&conf->nr_queued[idx]); | ||
| 2493 | if (mddev->degraded) | 2601 | if (mddev->degraded) |
| 2494 | set_bit(R1BIO_Degraded, &r1_bio->state); | 2602 | set_bit(R1BIO_Degraded, &r1_bio->state); |
| 2495 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) | 2603 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) |
| @@ -2510,7 +2618,8 @@ static void raid1d(struct md_thread *thread) | |||
| 2510 | } | 2618 | } |
| 2511 | r1_bio = list_entry(head->prev, struct r1bio, retry_list); | 2619 | r1_bio = list_entry(head->prev, struct r1bio, retry_list); |
| 2512 | list_del(head->prev); | 2620 | list_del(head->prev); |
| 2513 | conf->nr_queued--; | 2621 | idx = sector_to_idx(r1_bio->sector); |
| 2622 | atomic_dec(&conf->nr_queued[idx]); | ||
| 2514 | spin_unlock_irqrestore(&conf->device_lock, flags); | 2623 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 2515 | 2624 | ||
| 2516 | mddev = r1_bio->mddev; | 2625 | mddev = r1_bio->mddev; |
| @@ -2549,7 +2658,6 @@ static int init_resync(struct r1conf *conf) | |||
| 2549 | conf->poolinfo); | 2658 | conf->poolinfo); |
| 2550 | if (!conf->r1buf_pool) | 2659 | if (!conf->r1buf_pool) |
| 2551 | return -ENOMEM; | 2660 | return -ENOMEM; |
| 2552 | conf->next_resync = 0; | ||
| 2553 | return 0; | 2661 | return 0; |
| 2554 | } | 2662 | } |
| 2555 | 2663 | ||
| @@ -2578,6 +2686,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2578 | int still_degraded = 0; | 2686 | int still_degraded = 0; |
| 2579 | int good_sectors = RESYNC_SECTORS; | 2687 | int good_sectors = RESYNC_SECTORS; |
| 2580 | int min_bad = 0; /* number of sectors that are bad in all devices */ | 2688 | int min_bad = 0; /* number of sectors that are bad in all devices */ |
| 2689 | int idx = sector_to_idx(sector_nr); | ||
| 2581 | 2690 | ||
| 2582 | if (!conf->r1buf_pool) | 2691 | if (!conf->r1buf_pool) |
| 2583 | if (init_resync(conf)) | 2692 | if (init_resync(conf)) |
| @@ -2627,7 +2736,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2627 | * If there is non-resync activity waiting for a turn, then let it | 2736 | * If there is non-resync activity waiting for a turn, then let it |
| 2628 | * though before starting on this new sync request. | 2737 | * though before starting on this new sync request. |
| 2629 | */ | 2738 | */ |
| 2630 | if (conf->nr_waiting) | 2739 | if (atomic_read(&conf->nr_waiting[idx])) |
| 2631 | schedule_timeout_uninterruptible(1); | 2740 | schedule_timeout_uninterruptible(1); |
| 2632 | 2741 | ||
| 2633 | /* we are incrementing sector_nr below. To be safe, we check against | 2742 | /* we are incrementing sector_nr below. To be safe, we check against |
| @@ -2654,6 +2763,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2654 | r1_bio->sector = sector_nr; | 2763 | r1_bio->sector = sector_nr; |
| 2655 | r1_bio->state = 0; | 2764 | r1_bio->state = 0; |
| 2656 | set_bit(R1BIO_IsSync, &r1_bio->state); | 2765 | set_bit(R1BIO_IsSync, &r1_bio->state); |
| 2766 | /* make sure good_sectors won't go across barrier unit boundary */ | ||
| 2767 | good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors); | ||
| 2657 | 2768 | ||
| 2658 | for (i = 0; i < conf->raid_disks * 2; i++) { | 2769 | for (i = 0; i < conf->raid_disks * 2; i++) { |
| 2659 | struct md_rdev *rdev; | 2770 | struct md_rdev *rdev; |
| @@ -2884,6 +2995,26 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
| 2884 | if (!conf) | 2995 | if (!conf) |
| 2885 | goto abort; | 2996 | goto abort; |
| 2886 | 2997 | ||
| 2998 | conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR, | ||
| 2999 | sizeof(atomic_t), GFP_KERNEL); | ||
| 3000 | if (!conf->nr_pending) | ||
| 3001 | goto abort; | ||
| 3002 | |||
| 3003 | conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR, | ||
| 3004 | sizeof(atomic_t), GFP_KERNEL); | ||
| 3005 | if (!conf->nr_waiting) | ||
| 3006 | goto abort; | ||
| 3007 | |||
| 3008 | conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR, | ||
| 3009 | sizeof(atomic_t), GFP_KERNEL); | ||
| 3010 | if (!conf->nr_queued) | ||
| 3011 | goto abort; | ||
| 3012 | |||
| 3013 | conf->barrier = kcalloc(BARRIER_BUCKETS_NR, | ||
| 3014 | sizeof(atomic_t), GFP_KERNEL); | ||
| 3015 | if (!conf->barrier) | ||
| 3016 | goto abort; | ||
| 3017 | |||
| 2887 | conf->mirrors = kzalloc(sizeof(struct raid1_info) | 3018 | conf->mirrors = kzalloc(sizeof(struct raid1_info) |
| 2888 | * mddev->raid_disks * 2, | 3019 | * mddev->raid_disks * 2, |
| 2889 | GFP_KERNEL); | 3020 | GFP_KERNEL); |
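A quick sanity check of what the four kcalloc() calls above cost, assuming (per the matching raid1.h change, which is not part of this diff) that BARRIER_BUCKETS_NR is chosen so each counter array fills exactly one page:

#include <stdio.h>

int main(void)
{
	unsigned int page_size = 4096;	/* assumed PAGE_SIZE */
	unsigned int atomic_size = 4;	/* assumed sizeof(atomic_t) */
	unsigned int buckets = page_size / atomic_size;	/* presumed BARRIER_BUCKETS_NR */

	printf("%u buckets -> %u bytes per counter array, %u bytes for all four\n",
	       buckets, buckets * atomic_size, 4 * buckets * atomic_size);
	return 0;
}

Under those assumptions the per-bucket bookkeeping replaces the old single counters with four page-sized arrays per array, a small price for removing the global resync window.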
| @@ -2939,9 +3070,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
| 2939 | conf->pending_count = 0; | 3070 | conf->pending_count = 0; |
| 2940 | conf->recovery_disabled = mddev->recovery_disabled - 1; | 3071 | conf->recovery_disabled = mddev->recovery_disabled - 1; |
| 2941 | 3072 | ||
| 2942 | conf->start_next_window = MaxSector; | ||
| 2943 | conf->current_window_requests = conf->next_window_requests = 0; | ||
| 2944 | |||
| 2945 | err = -EIO; | 3073 | err = -EIO; |
| 2946 | for (i = 0; i < conf->raid_disks * 2; i++) { | 3074 | for (i = 0; i < conf->raid_disks * 2; i++) { |
| 2947 | 3075 | ||
| @@ -2984,6 +3112,10 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
| 2984 | kfree(conf->mirrors); | 3112 | kfree(conf->mirrors); |
| 2985 | safe_put_page(conf->tmppage); | 3113 | safe_put_page(conf->tmppage); |
| 2986 | kfree(conf->poolinfo); | 3114 | kfree(conf->poolinfo); |
| 3115 | kfree(conf->nr_pending); | ||
| 3116 | kfree(conf->nr_waiting); | ||
| 3117 | kfree(conf->nr_queued); | ||
| 3118 | kfree(conf->barrier); | ||
| 2987 | kfree(conf); | 3119 | kfree(conf); |
| 2988 | } | 3120 | } |
| 2989 | return ERR_PTR(err); | 3121 | return ERR_PTR(err); |
| @@ -3085,6 +3217,10 @@ static void raid1_free(struct mddev *mddev, void *priv) | |||
| 3085 | kfree(conf->mirrors); | 3217 | kfree(conf->mirrors); |
| 3086 | safe_put_page(conf->tmppage); | 3218 | safe_put_page(conf->tmppage); |
| 3087 | kfree(conf->poolinfo); | 3219 | kfree(conf->poolinfo); |
| 3220 | kfree(conf->nr_pending); | ||
| 3221 | kfree(conf->nr_waiting); | ||
| 3222 | kfree(conf->nr_queued); | ||
| 3223 | kfree(conf->barrier); | ||
| 3088 | kfree(conf); | 3224 | kfree(conf); |
| 3089 | } | 3225 | } |
| 3090 | 3226 | ||
