Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r--	drivers/md/raid10.c	603
1 files changed, 270 insertions, 333 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 84718383124d..6e846688962f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -5,7 +5,7 @@
  *
  * RAID-10 support for md.
  *
- * Base on code in raid1.c.  See raid1.c for futher copyright information.
+ * Base on code in raid1.c.  See raid1.c for further copyright information.
  *
  *
  * This program is free software; you can redistribute it and/or modify
@@ -57,23 +57,16 @@
  */
 #define NR_RAID10_BIOS 256
 
-static void unplug_slaves(mddev_t *mddev);
-
 static void allow_barrier(conf_t *conf);
 static void lower_barrier(conf_t *conf);
 
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	conf_t *conf = data;
-	r10bio_t *r10_bio;
 	int size = offsetof(struct r10bio_s, devs[conf->copies]);
 
 	/* allocate a r10bio with room for raid_disks entries in the bios array */
-	r10_bio = kzalloc(size, gfp_flags);
-	if (!r10_bio && conf->mddev)
-		unplug_slaves(conf->mddev);
-
-	return r10_bio;
+	return kzalloc(size, gfp_flags);
 }
 
 static void r10bio_pool_free(void *r10_bio, void *data)
@@ -106,10 +99,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 	int nalloc;
 
 	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
-	if (!r10_bio) {
-		unplug_slaves(conf->mddev);
+	if (!r10_bio)
 		return NULL;
-	}
 
 	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
 		nalloc = conf->copies; /* resync */
@@ -120,7 +111,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 	 * Allocate bios.
 	 */
 	for (j = nalloc ; j-- ; ) {
-		bio = bio_alloc(gfp_flags, RESYNC_PAGES);
+		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
 		if (!bio)
 			goto out_free_bio;
 		r10_bio->devs[j].bio = bio;
@@ -280,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		 */
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
 		raid_end_bio_io(r10_bio);
+		rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 	} else {
 		/*
-		 * oops, read error:
+		 * oops, read error - keep the refcount on the rdev
 		 */
 		char b[BDEVNAME_SIZE];
 		if (printk_ratelimit())
@@ -291,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error)
 			       bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
 		reschedule_retry(r10_bio);
 	}
-
-	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 }
 
 static void raid10_end_write_request(struct bio *bio, int error)
@@ -349,14 +339,14 @@ static void raid10_end_write_request(struct bio *bio, int error)
 
 /*
  * RAID10 layout manager
- * Aswell as the chunksize and raid_disks count, there are two
+ * As well as the chunksize and raid_disks count, there are two
  * parameters: near_copies and far_copies.
  * near_copies * far_copies must be <= raid_disks.
  * Normally one of these will be 1.
  * If both are 1, we get raid0.
  * If near_copies == raid_disks, we get raid1.
 *
- * Chunks are layed out in raid0 style with near_copies copies of the
+ * Chunks are laid out in raid0 style with near_copies copies of the
 * first chunk, followed by near_copies copies of the next chunk and
 * so on.
 * If far_copies > 1, then after 1/far_copies of the array has been assigned
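
The layout rules in the comment above are easiest to see with concrete numbers. The sketch below is a plain-C userspace illustration, not kernel code: the geometry constants are invented, far_offset is ignored, and the far-set rotation (shift by near_copies disks, offset by one zone) is a simplification of what raid10_find_phys() actually computes.

#include <stdio.h>

enum { RAID_DISKS = 4, NEAR = 2, FAR = 2, ZONE_CHUNKS = 64 };

/* print every (disk, chunk-offset) that holds a copy of logical chunk c */
static void find_phys(long c)
{
	for (int n = 0; n < NEAR; n++) {
		long idx  = c * NEAR + n;	/* raid0-style slot of this copy */
		int  disk = idx % RAID_DISKS;
		long off  = idx / RAID_DISKS;	/* chunk offset on that disk */
		for (int f = 0; f < FAR; f++) {
			printf("chunk %ld copy n%d/f%d -> disk %d, offset %ld\n",
			       c, n, f, disk, off);
			disk = (disk + NEAR) % RAID_DISKS; /* next far set */
			off += ZONE_CHUNKS;	/* one zone further into the disk */
		}
	}
}

int main(void)
{
	for (long c = 0; c < 4; c++)
		find_phys(c);
	return 0;
}

With 4 disks and near_copies=2, chunk 0 lands on disks (0,1) and chunk 1 on disks (2,3) at offset 0 — the familiar raid1+0 arrangement; far copies repeat the pattern shifted into later zones.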
@@ -497,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 {
 	const sector_t this_sector = r10_bio->sector;
-	int disk, slot, nslot;
+	int disk, slot;
 	const int sectors = r10_bio->sectors;
-	sector_t new_distance, current_distance;
+	sector_t new_distance, best_dist;
 	mdk_rdev_t *rdev;
+	int do_balance;
+	int best_slot;
 
 	raid10_find_phys(conf, r10_bio);
 	rcu_read_lock();
+retry:
+	best_slot = -1;
+	best_dist = MaxSector;
+	do_balance = 1;
 	/*
 	 * Check if we can balance. We can balance on the whole
 	 * device if no resync is going on (recovery is ok), or below
@@ -511,123 +507,64 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 	 * above the resync window.
 	 */
 	if (conf->mddev->recovery_cp < MaxSector
-	    && (this_sector + sectors >= conf->next_resync)) {
-		/* make sure that disk is operational */
-		slot = 0;
-		disk = r10_bio->devs[slot].devnum;
+	    && (this_sector + sectors >= conf->next_resync))
+		do_balance = 0;
 
-		while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
-		       r10_bio->devs[slot].bio == IO_BLOCKED ||
-		       !test_bit(In_sync, &rdev->flags)) {
-			slot++;
-			if (slot == conf->copies) {
-				slot = 0;
-				disk = -1;
-				break;
-			}
-			disk = r10_bio->devs[slot].devnum;
-		}
-		goto rb_out;
-	}
-
-
-	/* make sure the disk is operational */
-	slot = 0;
-	disk = r10_bio->devs[slot].devnum;
-	while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
-	       r10_bio->devs[slot].bio == IO_BLOCKED ||
-	       !test_bit(In_sync, &rdev->flags)) {
-		slot ++;
-		if (slot == conf->copies) {
-			disk = -1;
-			goto rb_out;
-		}
+	for (slot = 0; slot < conf->copies ; slot++) {
+		if (r10_bio->devs[slot].bio == IO_BLOCKED)
+			continue;
 		disk = r10_bio->devs[slot].devnum;
-	}
-
-
-	current_distance = abs(r10_bio->devs[slot].addr -
-			       conf->mirrors[disk].head_position);
-
-	/* Find the disk whose head is closest,
-	 * or - for far > 1 - find the closest to partition beginning */
-
-	for (nslot = slot; nslot < conf->copies; nslot++) {
-		int ndisk = r10_bio->devs[nslot].devnum;
-
-
-		if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
-		    r10_bio->devs[nslot].bio == IO_BLOCKED ||
-		    !test_bit(In_sync, &rdev->flags))
+		rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		if (rdev == NULL)
+			continue;
+		if (!test_bit(In_sync, &rdev->flags))
 			continue;
 
+		if (!do_balance)
+			break;
+
 		/* This optimisation is debatable, and completely destroys
 		 * sequential read speed for 'far copies' arrays.  So only
 		 * keep it for 'near' arrays, and review those later.
 		 */
-		if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
-			disk = ndisk;
-			slot = nslot;
+		if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
 			break;
-		}
 
 		/* for far > 1 always use the lowest address */
 		if (conf->far_copies > 1)
-			new_distance = r10_bio->devs[nslot].addr;
+			new_distance = r10_bio->devs[slot].addr;
 		else
-			new_distance = abs(r10_bio->devs[nslot].addr -
-					   conf->mirrors[ndisk].head_position);
-		if (new_distance < current_distance) {
-			current_distance = new_distance;
-			disk = ndisk;
-			slot = nslot;
+			new_distance = abs(r10_bio->devs[slot].addr -
+					   conf->mirrors[disk].head_position);
+		if (new_distance < best_dist) {
+			best_dist = new_distance;
+			best_slot = slot;
 		}
 	}
+	if (slot == conf->copies)
+		slot = best_slot;
 
-rb_out:
-	r10_bio->read_slot = slot;
-/*	conf->next_seq_sect = this_sector + sectors;*/
-
-	if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
-		atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
-	else
+	if (slot >= 0) {
+		disk = r10_bio->devs[slot].devnum;
+		rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		if (!rdev)
+			goto retry;
+		atomic_inc(&rdev->nr_pending);
+		if (test_bit(Faulty, &rdev->flags)) {
+			/* Cannot risk returning a device that failed
+			 * before we inc'ed nr_pending
+			 */
+			rdev_dec_pending(rdev, conf->mddev);
+			goto retry;
+		}
+		r10_bio->read_slot = slot;
+	} else
 		disk = -1;
 	rcu_read_unlock();
 
 	return disk;
 }
 
-static void unplug_slaves(mddev_t *mddev)
-{
-	conf_t *conf = mddev->private;
-	int i;
-
-	rcu_read_lock();
-	for (i=0; i < conf->raid_disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-
-			blk_unplug(r_queue);
-
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
-		}
-	}
-	rcu_read_unlock();
-}
-
-static void raid10_unplug(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-
-	unplug_slaves(q->queuedata);
-	md_wakeup_thread(mddev->thread);
-}
-
 static int raid10_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
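
The read_balance() rewrite above replaces two scanning while-loops with a single best-distance pass, and retries from scratch if the chosen device turns Faulty after its nr_pending count is raised. The selection itself reduces to the small scan below — a userspace model with invented types, no RCU and no reference counting:

#include <limits.h>
#include <stdlib.h>

struct mirror { long head_position; int in_sync; };

/* pick the in-sync copy whose head is closest to the target address;
 * mirrors the best_dist/best_slot bookkeeping in read_balance() */
static int pick_slot(const struct mirror *m, const long *addr, int copies)
{
	long best_dist = LONG_MAX;
	int best_slot = -1;

	for (int slot = 0; slot < copies; slot++) {
		if (!m[slot].in_sync)
			continue;	/* plays the role of the continues above */
		long dist = labs(addr[slot] - m[slot].head_position);
		if (dist < best_dist) {
			best_dist = dist;
			best_slot = slot;
		}
	}
	return best_slot;	/* -1: no usable copy */
}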
@@ -649,20 +586,16 @@ static int raid10_congested(void *data, int bits)
 	return ret;
 }
 
-static int flush_pending_writes(conf_t *conf)
+static void flush_pending_writes(conf_t *conf)
 {
 	/* Any writes that have been queued but are awaiting
 	 * bitmap updates get flushed here.
-	 * We return 1 if any requests were actually submitted.
 	 */
-	int rv = 0;
-
 	spin_lock_irq(&conf->device_lock);
 
 	if (conf->pending_bio_list.head) {
 		struct bio *bio;
 		bio = bio_list_get(&conf->pending_bio_list);
-		blk_remove_plug(conf->mddev->queue);
 		spin_unlock_irq(&conf->device_lock);
 		/* flush any pending bitmap writes to disk
 		 * before proceeding w/ I/O */
@@ -674,11 +607,10 @@ static int flush_pending_writes(conf_t *conf)
 			generic_make_request(bio);
 			bio = next;
 		}
-		rv = 1;
 	} else
 		spin_unlock_irq(&conf->device_lock);
-	return rv;
 }
+
 /* Barriers....
  * Sometimes we need to suspend IO while we do something else,
  * either some resync/recovery, or reconfigure the array.
@@ -708,17 +640,15 @@ static void raise_barrier(conf_t *conf, int force)
 
 	/* Wait until no block IO is waiting (unless 'force') */
 	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
-			    conf->resync_lock,
-			    raid10_unplug(conf->mddev->queue));
+			    conf->resync_lock, );
 
 	/* block any new IO from starting */
 	conf->barrier++;
 
-	/* No wait for all pending IO to complete */
+	/* Now wait for all pending IO to complete */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock,
-			    raid10_unplug(conf->mddev->queue));
+			    conf->resync_lock, );
 
 	spin_unlock_irq(&conf->resync_lock);
 }
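
The protocol the two wait_event_lock_irq() calls implement — block new IO, then drain pending IO — can be modeled with a mutex and condition variable. A rough pthread sketch (simplified: no 'force' mode, no RESYNC_DEPTH nesting cap; all names invented for the example):

#include <pthread.h>

static pthread_mutex_t lock   = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wakeup = PTHREAD_COND_INITIALIZER;
static int barrier, nr_pending, nr_waiting;

static void raise_barrier(void)		/* resync side */
{
	pthread_mutex_lock(&lock);
	while (nr_waiting)		/* wait until no normal IO is waiting */
		pthread_cond_wait(&wakeup, &lock);
	barrier++;			/* block any new IO from starting */
	while (nr_pending)		/* now wait for pending IO to drain */
		pthread_cond_wait(&wakeup, &lock);
	pthread_mutex_unlock(&lock);
}

static void lower_barrier(void)
{
	pthread_mutex_lock(&lock);
	barrier--;
	pthread_cond_broadcast(&wakeup);
	pthread_mutex_unlock(&lock);
}

static void wait_barrier(void)		/* entry point of normal IO */
{
	pthread_mutex_lock(&lock);
	nr_waiting++;
	while (barrier)
		pthread_cond_wait(&wakeup, &lock);
	nr_waiting--;
	nr_pending++;
	pthread_cond_broadcast(&wakeup);
	pthread_mutex_unlock(&lock);
}

static void allow_barrier(void)		/* an IO completed */
{
	pthread_mutex_lock(&lock);
	nr_pending--;
	pthread_cond_broadcast(&wakeup);
	pthread_mutex_unlock(&lock);
}

The single condition variable stands in for conf->wait_barrier, which in the kernel likewise wakes both the resync thread and queued regular IO.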
@@ -739,7 +669,7 @@ static void wait_barrier(conf_t *conf)
 		conf->nr_waiting++;
 		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
 				    conf->resync_lock,
-				    raid10_unplug(conf->mddev->queue));
+				    );
 		conf->nr_waiting--;
 	}
 	conf->nr_pending++;
@@ -775,8 +705,8 @@ static void freeze_array(conf_t *conf)
 	wait_event_lock_irq(conf->wait_barrier,
 			    conf->nr_pending == conf->nr_queued+1,
 			    conf->resync_lock,
-			    ({ flush_pending_writes(conf);
-			       raid10_unplug(conf->mddev->queue); }));
+			    flush_pending_writes(conf));
+
 	spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -800,12 +730,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	int chunk_sects = conf->chunk_mask + 1;
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
-	struct bio_list bl;
+	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
 	unsigned long flags;
 	mdk_rdev_t *blocked_rdev;
+	int plugged;
 
-	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
-		md_barrier_request(mddev, bio);
+	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+		md_flush_request(mddev, bio);
 		return 0;
 	}
 
@@ -889,7 +820,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	}
 	mirror = conf->mirrors + disk;
 
-	read_bio = bio_clone(bio, GFP_NOIO);
+	read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 
 	r10_bio->devs[slot].bio = read_bio;
 
@@ -911,6 +842,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	 * inc refcount on their rdev.  Record them by setting
 	 * bios[x] to bio
 	 */
+	plugged = mddev_check_plugged(mddev);
+
 	raid10_find_phys(conf, r10_bio);
 retry_write:
 	blocked_rdev = NULL;
@@ -949,48 +882,46 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		goto retry_write;
 	}
 
-	atomic_set(&r10_bio->remaining, 0);
+	atomic_set(&r10_bio->remaining, 1);
+	bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
 
-	bio_list_init(&bl);
 	for (i = 0; i < conf->copies; i++) {
 		struct bio *mbio;
 		int d = r10_bio->devs[i].devnum;
 		if (!r10_bio->devs[i].bio)
 			continue;
 
-		mbio = bio_clone(bio, GFP_NOIO);
+		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 		r10_bio->devs[i].bio = mbio;
 
 		mbio->bi_sector = r10_bio->devs[i].addr+
 			conf->mirrors[d].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
 		mbio->bi_end_io = raid10_end_write_request;
-		mbio->bi_rw = WRITE | do_sync;
+		mbio->bi_rw = WRITE | do_sync | do_fua;
 		mbio->bi_private = r10_bio;
 
 		atomic_inc(&r10_bio->remaining);
-		bio_list_add(&bl, mbio);
+		spin_lock_irqsave(&conf->device_lock, flags);
+		bio_list_add(&conf->pending_bio_list, mbio);
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 
-	if (unlikely(!atomic_read(&r10_bio->remaining))) {
-		/* the array is dead */
+	if (atomic_dec_and_test(&r10_bio->remaining)) {
+		/* This matches the end of raid10_end_write_request() */
+		bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
+				r10_bio->sectors,
+				!test_bit(R10BIO_Degraded, &r10_bio->state),
+				0);
 		md_write_end(mddev);
 		raid_end_bio_io(r10_bio);
-		return 0;
 	}
 
-	bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
-	spin_lock_irqsave(&conf->device_lock, flags);
-	bio_list_merge(&conf->pending_bio_list, &bl);
-	blk_plug_device(mddev->queue);
-	spin_unlock_irqrestore(&conf->device_lock, flags);
-
 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 
-	if (do_sync)
+	if (do_sync || !mddev->bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
-
 	return 0;
 }
 
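
The change from remaining = 0 to remaining = 1, closed out by atomic_dec_and_test(), is the standard biased-refcount trick: the submitter holds one extra reference so the request cannot complete while clones are still being queued, and the old special case for "no bios submitted" disappears. A C11 sketch of the pattern (illustration only; the struct and helper names are invented):

#include <stdatomic.h>

struct request { atomic_int remaining; };

static void finish(struct request *r)
{
	/* all copies done: bitmap_endwrite()/md_write_end() would go here */
}

static void put_request(struct request *r)
{
	if (atomic_fetch_sub(&r->remaining, 1) == 1)
		finish(r);		/* dropped the last reference */
}

static void submit_copies(struct request *r, int copies)
{
	atomic_init(&r->remaining, 1);	/* bias: the submitter's reference */
	for (int i = 0; i < copies; i++) {
		atomic_fetch_add(&r->remaining, 1);
		/* queue one clone; its completion calls put_request(r) */
	}
	put_request(r);	/* drop the bias; completes here only if every
			 * clone already finished (or none were queued) */
}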
@@ -1051,8 +982,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 	}
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-	printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n"
-	       KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n",
+	printk(KERN_ALERT
+	       "md/raid10:%s: Disk failure on %s, disabling device.\n"
+	       "md/raid10:%s: Operation continuing on %d devices.\n",
 	       mdname(mddev), bdevname(rdev->bdev, b),
 	       mdname(mddev), conf->raid_disks - mddev->degraded);
 }
@@ -1229,7 +1161,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
 			p->rdev = rdev;
 			goto abort;
 		}
-		md_integrity_register(mddev);
+		err = md_integrity_register(mddev);
 	}
 abort:
 
@@ -1505,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
 	int d = r10_bio->devs[r10_bio->read_slot].devnum;
 
-	rcu_read_lock();
-	rdev = rcu_dereference(conf->mirrors[d].rdev);
-	if (rdev) { /* If rdev is not NULL */
-		char b[BDEVNAME_SIZE];
-		int cur_read_error_count = 0;
+	/* still own a reference to this rdev, so it cannot
+	 * have been cleared recently.
+	 */
+	rdev = conf->mirrors[d].rdev;
 
-		bdevname(rdev->bdev, b);
+	if (test_bit(Faulty, &rdev->flags))
+		/* drive has already been failed, just ignore any
+		   more fix_read_error() attempts */
+		return;
 
-		if (test_bit(Faulty, &rdev->flags)) {
-			rcu_read_unlock();
-			/* drive has already been failed, just ignore any
-			   more fix_read_error() attempts */
-			return;
-		}
+	check_decay_read_errors(mddev, rdev);
+	atomic_inc(&rdev->read_errors);
+	if (atomic_read(&rdev->read_errors) > max_read_errors) {
+		char b[BDEVNAME_SIZE];
+		bdevname(rdev->bdev, b);
 
-		check_decay_read_errors(mddev, rdev);
-		atomic_inc(&rdev->read_errors);
-		cur_read_error_count = atomic_read(&rdev->read_errors);
-		if (cur_read_error_count > max_read_errors) {
-			rcu_read_unlock();
-			printk(KERN_NOTICE
-			       "md/raid10:%s: %s: Raid device exceeded "
-			       "read_error threshold "
-			       "[cur %d:max %d]\n",
-			       mdname(mddev),
-			       b, cur_read_error_count, max_read_errors);
-			printk(KERN_NOTICE
-			       "md/raid10:%s: %s: Failing raid "
-			       "device\n", mdname(mddev), b);
-			md_error(mddev, conf->mirrors[d].rdev);
-			return;
-		}
+		printk(KERN_NOTICE
+		       "md/raid10:%s: %s: Raid device exceeded "
+		       "read_error threshold [cur %d:max %d]\n",
+		       mdname(mddev), b,
+		       atomic_read(&rdev->read_errors), max_read_errors);
+		printk(KERN_NOTICE
+		       "md/raid10:%s: %s: Failing raid device\n",
+		       mdname(mddev), b);
+		md_error(mddev, conf->mirrors[d].rdev);
+		return;
 	}
-	rcu_read_unlock();
 
 	while(sectors) {
 		int s = sectors;
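
The accounting above lets a device survive occasional media errors but fail fast when errors cluster: check_decay_read_errors() ages the per-device count down over time, and fix_read_error() gives up once the decayed count crosses max_corr_read_errors. A toy model of that policy (the hourly halving and units are an assumption for illustration; the kernel helper differs in detail):

#include <stdbool.h>

struct dev_stats { unsigned read_errors; long last_hour; };

/* record one read error; returns true when the device should be failed */
static bool note_read_error(struct dev_stats *d, long now_hour,
			    unsigned max_errors)
{
	while (d->last_hour < now_hour) {	/* decay: halve per elapsed hour */
		d->read_errors /= 2;
		d->last_hour++;
	}
	d->read_errors++;
	return d->read_errors > max_errors;
}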
@@ -1557,11 +1482,11 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 			    test_bit(In_sync, &rdev->flags)) {
 				atomic_inc(&rdev->nr_pending);
 				rcu_read_unlock();
-				success = sync_page_io(rdev->bdev,
+				success = sync_page_io(rdev,
 						       r10_bio->devs[sl].addr +
-						       sect + rdev->data_offset,
+						       sect,
 						       s<<9,
-						       conf->tmppage, READ);
+						       conf->tmppage, READ, false);
 				rdev_dec_pending(rdev, mddev);
 				rcu_read_lock();
 				if (success)
@@ -1596,10 +1521,10 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 				atomic_inc(&rdev->nr_pending);
 				rcu_read_unlock();
 				atomic_add(s, &rdev->corrected_errors);
-				if (sync_page_io(rdev->bdev,
+				if (sync_page_io(rdev,
 						 r10_bio->devs[sl].addr +
-						 sect + rdev->data_offset,
-						 s<<9, conf->tmppage, WRITE)
+						 sect,
+						 s<<9, conf->tmppage, WRITE, false)
 				    == 0) {
 					/* Well, this device is dead */
 					printk(KERN_NOTICE
@@ -1607,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 					       "write failed"
 					       " (%d sectors at %llu on %s)\n",
 					       mdname(mddev), s,
-					       (unsigned long long)(sect+
-					       rdev->data_offset),
+					       (unsigned long long)(
+						       sect + rdev->data_offset),
 					       bdevname(rdev->bdev, b));
 					printk(KERN_NOTICE "md/raid10:%s: %s: failing "
 					       "drive\n",
@@ -1633,19 +1558,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 					char b[BDEVNAME_SIZE];
 					atomic_inc(&rdev->nr_pending);
 					rcu_read_unlock();
-					if (sync_page_io(rdev->bdev,
+					if (sync_page_io(rdev,
 							 r10_bio->devs[sl].addr +
-							 sect + rdev->data_offset,
+							 sect,
 							 s<<9, conf->tmppage,
-							 READ) == 0) {
+							 READ, false) == 0) {
 						/* Well, this device is dead */
 						printk(KERN_NOTICE
 						       "md/raid10:%s: unable to read back "
 						       "corrected sectors"
 						       " (%d sectors at %llu on %s)\n",
 						       mdname(mddev), s,
-						       (unsigned long long)(sect+
-						       rdev->data_offset),
+						       (unsigned long long)(
+							       sect + rdev->data_offset),
 						       bdevname(rdev->bdev, b));
 						printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
 						       mdname(mddev),
@@ -1657,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 						       "md/raid10:%s: read error corrected"
 						       " (%d sectors at %llu on %s)\n",
 						       mdname(mddev), s,
-						       (unsigned long long)(sect+
-						       rdev->data_offset),
+						       (unsigned long long)(
+							       sect + rdev->data_offset),
 						       bdevname(rdev->bdev, b));
 					}
 
@@ -1680,15 +1605,16 @@ static void raid10d(mddev_t *mddev)
 	unsigned long flags;
 	conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
-	int unplug=0;
 	mdk_rdev_t *rdev;
+	struct blk_plug plug;
 
 	md_check_recovery(mddev);
 
+	blk_start_plug(&plug);
 	for (;;) {
 		char b[BDEVNAME_SIZE];
 
-		unplug += flush_pending_writes(conf);
+		flush_pending_writes(conf);
 
 		spin_lock_irqsave(&conf->device_lock, flags);
 		if (list_empty(head)) {
@@ -1702,14 +1628,13 @@ static void raid10d(mddev_t *mddev)
 
 		mddev = r10_bio->mddev;
 		conf = mddev->private;
-		if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
+		if (test_bit(R10BIO_IsSync, &r10_bio->state))
 			sync_request_write(mddev, r10_bio);
-			unplug = 1;
-		} else	if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
+		else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
 			recovery_request_write(mddev, r10_bio);
-			unplug = 1;
-		} else {
-			int mirror;
+		else {
+			int slot = r10_bio->read_slot;
+			int mirror = r10_bio->devs[slot].devnum;
 			/* we got a read error. Maybe the drive is bad.  Maybe just
 			 * the block and we can fix it.
 			 * We freeze all other IO, and try reading the block from
@@ -1723,9 +1648,10 @@ static void raid10d(mddev_t *mddev)
 				fix_read_error(conf, mddev, r10_bio);
 				unfreeze_array(conf);
 			}
+			rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
 
-			bio = r10_bio->devs[r10_bio->read_slot].bio;
-			r10_bio->devs[r10_bio->read_slot].bio =
+			bio = r10_bio->devs[slot].bio;
+			r10_bio->devs[slot].bio =
 				mddev->ro ? IO_BLOCKED : NULL;
 			mirror = read_balance(conf, r10_bio);
 			if (mirror == -1) {
@@ -1739,6 +1665,7 @@ static void raid10d(mddev_t *mddev)
 			} else {
 				const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
 				bio_put(bio);
+				slot = r10_bio->read_slot;
 				rdev = conf->mirrors[mirror].rdev;
 				if (printk_ratelimit())
 					printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
@@ -1746,22 +1673,21 @@ static void raid10d(mddev_t *mddev)
 					       mdname(mddev),
 					       bdevname(rdev->bdev,b),
 					       (unsigned long long)r10_bio->sector);
-				bio = bio_clone(r10_bio->master_bio, GFP_NOIO);
-				r10_bio->devs[r10_bio->read_slot].bio = bio;
-				bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
+				bio = bio_clone_mddev(r10_bio->master_bio,
+						      GFP_NOIO, mddev);
+				r10_bio->devs[slot].bio = bio;
+				bio->bi_sector = r10_bio->devs[slot].addr
 					+ rdev->data_offset;
 				bio->bi_bdev = rdev->bdev;
 				bio->bi_rw = READ | do_sync;
 				bio->bi_private = r10_bio;
 				bio->bi_end_io = raid10_end_read_request;
-				unplug = 1;
 				generic_make_request(bio);
 			}
 		}
 		cond_resched();
 	}
-	if (unplug)
-		unplug_slaves(mddev);
+	blk_finish_plug(&plug);
 }
 
 
@@ -1810,16 +1736,16 @@ static int init_resync(conf_t *conf)
  *
  */
 
-static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
+			     int *skipped, int go_faster)
 {
 	conf_t *conf = mddev->private;
 	r10bio_t *r10_bio;
 	struct bio *biolist = NULL, *bio;
 	sector_t max_sector, nr_sectors;
-	int disk;
 	int i;
 	int max_sync;
-	int sync_blocks;
+	sector_t sync_blocks;
 
 	sector_t sectors_skipped = 0;
 	int chunks_skipped = 0;
@@ -1905,108 +1831,114 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 		int j, k;
 		r10_bio = NULL;
 
-		for (i=0 ; i<conf->raid_disks; i++)
-			if (conf->mirrors[i].rdev &&
-			    !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
-				int still_degraded = 0;
-				/* want to reconstruct this device */
-				r10bio_t *rb2 = r10_bio;
-				sector_t sect = raid10_find_virt(conf, sector_nr, i);
-				int must_sync;
-				/* Unless we are doing a full sync, we only need
-				 * to recover the block if it is set in the bitmap
-				 */
-				must_sync = bitmap_start_sync(mddev->bitmap, sect,
-							      &sync_blocks, 1);
-				if (sync_blocks < max_sync)
-					max_sync = sync_blocks;
-				if (!must_sync &&
-				    !conf->fullsync) {
-					/* yep, skip the sync_blocks here, but don't assume
-					 * that there will never be anything to do here
-					 */
-					chunks_skipped = -1;
-					continue;
-				}
+		for (i=0 ; i<conf->raid_disks; i++) {
+			int still_degraded;
+			r10bio_t *rb2;
+			sector_t sect;
+			int must_sync;
 
-				r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
-				raise_barrier(conf, rb2 != NULL);
-				atomic_set(&r10_bio->remaining, 0);
+			if (conf->mirrors[i].rdev == NULL ||
+			    test_bit(In_sync, &conf->mirrors[i].rdev->flags))
+				continue;
 
-				r10_bio->master_bio = (struct bio*)rb2;
-				if (rb2)
-					atomic_inc(&rb2->remaining);
-				r10_bio->mddev = mddev;
-				set_bit(R10BIO_IsRecover, &r10_bio->state);
-				r10_bio->sector = sect;
+			still_degraded = 0;
+			/* want to reconstruct this device */
+			rb2 = r10_bio;
+			sect = raid10_find_virt(conf, sector_nr, i);
+			/* Unless we are doing a full sync, we only need
+			 * to recover the block if it is set in the bitmap
+			 */
+			must_sync = bitmap_start_sync(mddev->bitmap, sect,
+						      &sync_blocks, 1);
+			if (sync_blocks < max_sync)
+				max_sync = sync_blocks;
+			if (!must_sync &&
+			    !conf->fullsync) {
+				/* yep, skip the sync_blocks here, but don't assume
+				 * that there will never be anything to do here
+				 */
+				chunks_skipped = -1;
+				continue;
+			}
 
-				raid10_find_phys(conf, r10_bio);
+			r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+			raise_barrier(conf, rb2 != NULL);
+			atomic_set(&r10_bio->remaining, 0);
 
-				/* Need to check if the array will still be
-				 * degraded
-				 */
-				for (j=0; j<conf->raid_disks; j++)
-					if (conf->mirrors[j].rdev == NULL ||
-					    test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
-						still_degraded = 1;
-						break;
-					}
-
-				must_sync = bitmap_start_sync(mddev->bitmap, sect,
-							      &sync_blocks, still_degraded);
-
-				for (j=0; j<conf->copies;j++) {
-					int d = r10_bio->devs[j].devnum;
-					if (conf->mirrors[d].rdev &&
-					    test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
-						/* This is where we read from */
-						bio = r10_bio->devs[0].bio;
-						bio->bi_next = biolist;
-						biolist = bio;
-						bio->bi_private = r10_bio;
-						bio->bi_end_io = end_sync_read;
-						bio->bi_rw = READ;
-						bio->bi_sector = r10_bio->devs[j].addr +
-							conf->mirrors[d].rdev->data_offset;
-						bio->bi_bdev = conf->mirrors[d].rdev->bdev;
-						atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-						atomic_inc(&r10_bio->remaining);
-						/* and we write to 'i' */
-
-						for (k=0; k<conf->copies; k++)
-							if (r10_bio->devs[k].devnum == i)
-								break;
-						BUG_ON(k == conf->copies);
-						bio = r10_bio->devs[1].bio;
-						bio->bi_next = biolist;
-						biolist = bio;
-						bio->bi_private = r10_bio;
-						bio->bi_end_io = end_sync_write;
-						bio->bi_rw = WRITE;
-						bio->bi_sector = r10_bio->devs[k].addr +
-							conf->mirrors[i].rdev->data_offset;
-						bio->bi_bdev = conf->mirrors[i].rdev->bdev;
-
-						r10_bio->devs[0].devnum = d;
-						r10_bio->devs[1].devnum = i;
+			r10_bio->master_bio = (struct bio*)rb2;
+			if (rb2)
+				atomic_inc(&rb2->remaining);
+			r10_bio->mddev = mddev;
+			set_bit(R10BIO_IsRecover, &r10_bio->state);
+			r10_bio->sector = sect;
 
-						break;
-					}
-				}
-				if (j == conf->copies) {
-					/* Cannot recover, so abort the recovery */
-					put_buf(r10_bio);
-					if (rb2)
-						atomic_dec(&rb2->remaining);
-					r10_bio = rb2;
-					if (!test_and_set_bit(MD_RECOVERY_INTR,
-							      &mddev->recovery))
-						printk(KERN_INFO "md/raid10:%s: insufficient "
-						       "working devices for recovery.\n",
-						       mdname(mddev));
-					break;
-				}
+			raid10_find_phys(conf, r10_bio);
+
+			/* Need to check if the array will still be
+			 * degraded
+			 */
+			for (j=0; j<conf->raid_disks; j++)
+				if (conf->mirrors[j].rdev == NULL ||
+				    test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
+					still_degraded = 1;
+					break;
+				}
+
+			must_sync = bitmap_start_sync(mddev->bitmap, sect,
+						      &sync_blocks, still_degraded);
+
+			for (j=0; j<conf->copies;j++) {
+				int d = r10_bio->devs[j].devnum;
+				if (!conf->mirrors[d].rdev ||
+				    !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
+					continue;
+				/* This is where we read from */
+				bio = r10_bio->devs[0].bio;
+				bio->bi_next = biolist;
+				biolist = bio;
+				bio->bi_private = r10_bio;
+				bio->bi_end_io = end_sync_read;
+				bio->bi_rw = READ;
+				bio->bi_sector = r10_bio->devs[j].addr +
+					conf->mirrors[d].rdev->data_offset;
+				bio->bi_bdev = conf->mirrors[d].rdev->bdev;
+				atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+				atomic_inc(&r10_bio->remaining);
+				/* and we write to 'i' */
+
+				for (k=0; k<conf->copies; k++)
+					if (r10_bio->devs[k].devnum == i)
+						break;
+				BUG_ON(k == conf->copies);
+				bio = r10_bio->devs[1].bio;
+				bio->bi_next = biolist;
+				biolist = bio;
+				bio->bi_private = r10_bio;
+				bio->bi_end_io = end_sync_write;
+				bio->bi_rw = WRITE;
+				bio->bi_sector = r10_bio->devs[k].addr +
+					conf->mirrors[i].rdev->data_offset;
+				bio->bi_bdev = conf->mirrors[i].rdev->bdev;
+
+				r10_bio->devs[0].devnum = d;
+				r10_bio->devs[1].devnum = i;
+
+				break;
+			}
+			if (j == conf->copies) {
+				/* Cannot recover, so abort the recovery */
+				put_buf(r10_bio);
+				if (rb2)
+					atomic_dec(&rb2->remaining);
+				r10_bio = rb2;
+				if (!test_and_set_bit(MD_RECOVERY_INTR,
+						      &mddev->recovery))
+					printk(KERN_INFO "md/raid10:%s: insufficient "
+					       "working devices for recovery.\n",
+					       mdname(mddev));
+				break;
 			}
+		}
 		if (biolist == NULL) {
 			while (r10_bio) {
 				r10bio_t *rb2 = r10_bio;
2024 | 1956 | ||
2025 | if (!bitmap_start_sync(mddev->bitmap, sector_nr, | 1957 | if (!bitmap_start_sync(mddev->bitmap, sector_nr, |
2026 | &sync_blocks, mddev->degraded) && | 1958 | &sync_blocks, mddev->degraded) && |
2027 | !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | 1959 | !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, |
1960 | &mddev->recovery)) { | ||
2028 | /* We can skip this block */ | 1961 | /* We can skip this block */ |
2029 | *skipped = 1; | 1962 | *skipped = 1; |
2030 | return sync_blocks + sectors_skipped; | 1963 | return sync_blocks + sectors_skipped; |
@@ -2069,7 +2002,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
2069 | for (i=0; i<conf->copies; i++) { | 2002 | for (i=0; i<conf->copies; i++) { |
2070 | int d = r10_bio->devs[i].devnum; | 2003 | int d = r10_bio->devs[i].devnum; |
2071 | if (r10_bio->devs[i].bio->bi_end_io) | 2004 | if (r10_bio->devs[i].bio->bi_end_io) |
2072 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | 2005 | rdev_dec_pending(conf->mirrors[d].rdev, |
2006 | mddev); | ||
2073 | } | 2007 | } |
2074 | put_buf(r10_bio); | 2008 | put_buf(r10_bio); |
2075 | biolist = NULL; | 2009 | biolist = NULL; |
@@ -2094,26 +2028,27 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
2094 | do { | 2028 | do { |
2095 | struct page *page; | 2029 | struct page *page; |
2096 | int len = PAGE_SIZE; | 2030 | int len = PAGE_SIZE; |
2097 | disk = 0; | ||
2098 | if (sector_nr + (len>>9) > max_sector) | 2031 | if (sector_nr + (len>>9) > max_sector) |
2099 | len = (max_sector - sector_nr) << 9; | 2032 | len = (max_sector - sector_nr) << 9; |
2100 | if (len == 0) | 2033 | if (len == 0) |
2101 | break; | 2034 | break; |
2102 | for (bio= biolist ; bio ; bio=bio->bi_next) { | 2035 | for (bio= biolist ; bio ; bio=bio->bi_next) { |
2036 | struct bio *bio2; | ||
2103 | page = bio->bi_io_vec[bio->bi_vcnt].bv_page; | 2037 | page = bio->bi_io_vec[bio->bi_vcnt].bv_page; |
2104 | if (bio_add_page(bio, page, len, 0) == 0) { | 2038 | if (bio_add_page(bio, page, len, 0)) |
2105 | /* stop here */ | 2039 | continue; |
2106 | struct bio *bio2; | 2040 | |
2107 | bio->bi_io_vec[bio->bi_vcnt].bv_page = page; | 2041 | /* stop here */ |
2108 | for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) { | 2042 | bio->bi_io_vec[bio->bi_vcnt].bv_page = page; |
2109 | /* remove last page from this bio */ | 2043 | for (bio2 = biolist; |
2110 | bio2->bi_vcnt--; | 2044 | bio2 && bio2 != bio; |
2111 | bio2->bi_size -= len; | 2045 | bio2 = bio2->bi_next) { |
2112 | bio2->bi_flags &= ~(1<< BIO_SEG_VALID); | 2046 | /* remove last page from this bio */ |
2113 | } | 2047 | bio2->bi_vcnt--; |
2114 | goto bio_full; | 2048 | bio2->bi_size -= len; |
2049 | bio2->bi_flags &= ~(1<< BIO_SEG_VALID); | ||
2115 | } | 2050 | } |
2116 | disk = i; | 2051 | goto bio_full; |
2117 | } | 2052 | } |
2118 | nr_sectors += len>>9; | 2053 | nr_sectors += len>>9; |
2119 | sector_nr += len>>9; | 2054 | sector_nr += len>>9; |
@@ -2302,8 +2237,6 @@ static int run(mddev_t *mddev)
 	if (!conf)
 		goto out;
 
-	mddev->queue->queue_lock = &conf->device_lock;
-
 	mddev->thread = conf->thread;
 	conf->thread = NULL;
 
@@ -2374,7 +2307,6 @@ static int run(mddev_t *mddev)
 	md_set_array_sectors(mddev, size);
 	mddev->resync_max_sectors = size;
 
-	mddev->queue->unplug_fn = raid10_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid10_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 
@@ -2392,17 +2324,20 @@ static int run(mddev_t *mddev)
 
 	if (conf->near_copies < conf->raid_disks)
 		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
-	md_integrity_register(mddev);
+
+	if (md_integrity_register(mddev))
+		goto out_free_conf;
+
 	return 0;
 
 out_free_conf:
+	md_unregister_thread(mddev->thread);
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
 	safe_put_page(conf->tmppage);
 	kfree(conf->mirrors);
 	kfree(conf);
 	mddev->private = NULL;
-	md_unregister_thread(mddev->thread);
 out:
 	return -EIO;
 }
@@ -2461,11 +2396,13 @@ static void *raid10_takeover_raid0(mddev_t *mddev)
 	mddev->recovery_cp = MaxSector;
 
 	conf = setup_conf(mddev);
-	if (!IS_ERR(conf))
+	if (!IS_ERR(conf)) {
 		list_for_each_entry(rdev, &mddev->disks, same_set)
 			if (rdev->raid_disk >= 0)
 				rdev->new_raid_disk = rdev->raid_disk * 2;
-
+		conf->barrier = 1;
+	}
+
 	return conf;
 }
 