Diffstat (limited to 'block')
-rw-r--r--  block/Makefile        |   2
-rw-r--r--  block/blk-barrier.c   |  31
-rw-r--r--  block/blk-core.c      | 166
-rw-r--r--  block/blk-iopoll.c    | 227
-rw-r--r--  block/blk-merge.c     |  51
-rw-r--r--  block/blk-settings.c  |  21
-rw-r--r--  block/blk-sysfs.c     |   7
-rw-r--r--  block/blk.h           |   1
-rw-r--r--  block/cfq-iosched.c   |  72
-rw-r--r--  block/elevator.c      |  16
-rw-r--r--  block/genhd.c         |  22
-rw-r--r--  block/ioctl.c         |  49
12 files changed, 504 insertions(+), 161 deletions(-)
diff --git a/block/Makefile b/block/Makefile
index 6c54ed0ff755..ba74ca6bfa14 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
                        blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
                        blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-                       ioctl.o genhd.o scsi_ioctl.o
+                       blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)      += bsg.o
 obj-$(CONFIG_IOSCHED_NOOP)     += noop-iosched.o
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 30022b4e2f63..6593ab39cfe9 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -348,6 +348,9 @@ static void blkdev_discard_end_io(struct bio *bio, int err)
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	}
 
+	if (bio->bi_private)
+		complete(bio->bi_private);
+
 	bio_put(bio);
 }
 
@@ -357,21 +360,20 @@ static void blkdev_discard_end_io(struct bio *bio, int err)
  * @sector: start sector
  * @nr_sects: number of sectors to discard
  * @gfp_mask: memory allocation flags (for bio_alloc)
+ * @flags: DISCARD_FL_* flags to control behaviour
  *
  * Description:
- *    Issue a discard request for the sectors in question. Does not wait.
+ *    Issue a discard request for the sectors in question.
  */
-int blkdev_issue_discard(struct block_device *bdev,
-			 sector_t sector, sector_t nr_sects, gfp_t gfp_mask)
+int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+			 sector_t nr_sects, gfp_t gfp_mask, int flags)
 {
-	struct request_queue *q;
-	struct bio *bio;
+	DECLARE_COMPLETION_ONSTACK(wait);
+	struct request_queue *q = bdev_get_queue(bdev);
+	int type = flags & DISCARD_FL_BARRIER ?
+		DISCARD_BARRIER : DISCARD_NOBARRIER;
 	int ret = 0;
 
-	if (bdev->bd_disk == NULL)
-		return -ENXIO;
-
-	q = bdev_get_queue(bdev);
 	if (!q)
 		return -ENXIO;
 
@@ -379,12 +381,14 @@ int blkdev_issue_discard(struct block_device *bdev,
 		return -EOPNOTSUPP;
 
 	while (nr_sects && !ret) {
-		bio = bio_alloc(gfp_mask, 0);
+		struct bio *bio = bio_alloc(gfp_mask, 0);
 		if (!bio)
 			return -ENOMEM;
 
 		bio->bi_end_io = blkdev_discard_end_io;
 		bio->bi_bdev = bdev;
+		if (flags & DISCARD_FL_WAIT)
+			bio->bi_private = &wait;
 
 		bio->bi_sector = sector;
 
@@ -396,10 +400,13 @@ int blkdev_issue_discard(struct block_device *bdev,
 			bio->bi_size = nr_sects << 9;
 			nr_sects = 0;
 		}
+
 		bio_get(bio);
-		submit_bio(DISCARD_BARRIER, bio);
+		submit_bio(type, bio);
+
+		if (flags & DISCARD_FL_WAIT)
+			wait_for_completion(&wait);
 
-		/* Check if it failed immediately */
 		if (bio_flagged(bio, BIO_EOPNOTSUPP))
 			ret = -EOPNOTSUPP;
 		else if (!bio_flagged(bio, BIO_UPTODATE))
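
With the new @flags argument, callers no longer need the open-coded completion dance that the old DISCARD_BARRIER-only path forced on them (see the ioctl.c hunk at the end of this series). A minimal caller sketch, using only the DISCARD_FL_* flags this patch introduces; bdev, sector and nr_sects stand in for the caller's own values:

/*
 * Sketch only: trim a range and wait for it to finish.  DISCARD_FL_WAIT
 * and DISCARD_FL_BARRIER are the flags added by this patch.
 */
int err = blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL,
			       DISCARD_FL_WAIT | DISCARD_FL_BARRIER);
if (err == -EOPNOTSUPP)
	err = 0;	/* device cannot discard; treat as a no-op */
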
diff --git a/block/blk-core.c b/block/blk-core.c
index e695634882a6..8135228e4b29 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -69,7 +69,7 @@ static void drive_stat_acct(struct request *rq, int new_io)
 		part_stat_inc(cpu, part, merges[rw]);
 	else {
 		part_round_stats(cpu, part);
-		part_inc_in_flight(part);
+		part_inc_in_flight(part, rw);
 	}
 
 	part_stat_unlock();
@@ -1031,7 +1031,7 @@ static void part_round_stats_single(int cpu, struct hd_struct *part,
 
 	if (part->in_flight) {
 		__part_stat_add(cpu, part, time_in_queue,
-				part->in_flight * (now - part->stamp));
+				part_in_flight(part) * (now - part->stamp));
 		__part_stat_add(cpu, part, io_ticks, (now - part->stamp));
 	}
 	part->stamp = now;
@@ -1112,31 +1112,27 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	req->cmd_type = REQ_TYPE_FS;
 
 	/*
-	 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
+	 * Inherit FAILFAST from bio (for read-ahead, and explicit
+	 * FAILFAST).  FAILFAST flags are identical for req and bio.
 	 */
-	if (bio_rw_ahead(bio))
-		req->cmd_flags |= (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
-				   REQ_FAILFAST_DRIVER);
-	if (bio_failfast_dev(bio))
-		req->cmd_flags |= REQ_FAILFAST_DEV;
-	if (bio_failfast_transport(bio))
-		req->cmd_flags |= REQ_FAILFAST_TRANSPORT;
-	if (bio_failfast_driver(bio))
-		req->cmd_flags |= REQ_FAILFAST_DRIVER;
-
-	if (unlikely(bio_discard(bio))) {
+	if (bio_rw_flagged(bio, BIO_RW_AHEAD))
+		req->cmd_flags |= REQ_FAILFAST_MASK;
+	else
+		req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK;
+
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) {
 		req->cmd_flags |= REQ_DISCARD;
-		if (bio_barrier(bio))
+		if (bio_rw_flagged(bio, BIO_RW_BARRIER))
 			req->cmd_flags |= REQ_SOFTBARRIER;
 		req->q->prepare_discard_fn(req->q, req);
-	} else if (unlikely(bio_barrier(bio)))
+	} else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)))
 		req->cmd_flags |= REQ_HARDBARRIER;
 
-	if (bio_sync(bio))
+	if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
 		req->cmd_flags |= REQ_RW_SYNC;
-	if (bio_rw_meta(bio))
+	if (bio_rw_flagged(bio, BIO_RW_META))
 		req->cmd_flags |= REQ_RW_META;
-	if (bio_noidle(bio))
+	if (bio_rw_flagged(bio, BIO_RW_NOIDLE))
 		req->cmd_flags |= REQ_NOIDLE;
 
 	req->errors = 0;
@@ -1151,7 +1147,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
  */
 static inline bool queue_should_plug(struct request_queue *q)
 {
-	return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
+	return !(blk_queue_nonrot(q) && blk_queue_queuing(q));
 }
 
 static int __make_request(struct request_queue *q, struct bio *bio)
@@ -1160,11 +1156,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	int el_ret;
 	unsigned int bytes = bio->bi_size;
 	const unsigned short prio = bio_prio(bio);
-	const int sync = bio_sync(bio);
-	const int unplug = bio_unplug(bio);
+	const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
+	const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);
+	const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
 	int rw_flags;
 
-	if (bio_barrier(bio) && bio_has_data(bio) &&
+	if (bio_rw_flagged(bio, BIO_RW_BARRIER) && bio_has_data(bio) &&
 	    (q->next_ordered == QUEUE_ORDERED_NONE)) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
@@ -1178,7 +1175,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 	spin_lock_irq(q->queue_lock);
 
-	if (unlikely(bio_barrier(bio)) || elv_queue_empty(q))
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q))
 		goto get_rq;
 
 	el_ret = elv_merge(q, &req, bio);
@@ -1191,6 +1188,9 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 		trace_block_bio_backmerge(q, bio);
 
+		if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
+			blk_rq_set_mixed_merge(req);
+
 		req->biotail->bi_next = bio;
 		req->biotail = bio;
 		req->__data_len += bytes;
@@ -1210,6 +1210,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 		trace_block_bio_frontmerge(q, bio);
 
+		if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) {
+			blk_rq_set_mixed_merge(req);
+			req->cmd_flags &= ~REQ_FAILFAST_MASK;
+			req->cmd_flags |= ff;
+		}
+
 		bio->bi_next = req->bio;
 		req->bio = bio;
 
@@ -1457,19 +1463,20 @@ static inline void __generic_make_request(struct bio *bio)
 		if (old_sector != -1)
 			trace_block_remap(q, bio, old_dev, old_sector);
 
-		trace_block_bio_queue(q, bio);
-
 		old_sector = bio->bi_sector;
 		old_dev = bio->bi_bdev->bd_dev;
 
 		if (bio_check_eod(bio, nr_sectors))
 			goto end_io;
 
-		if (bio_discard(bio) && !q->prepare_discard_fn) {
+		if (bio_rw_flagged(bio, BIO_RW_DISCARD) &&
+		    !q->prepare_discard_fn) {
 			err = -EOPNOTSUPP;
 			goto end_io;
 		}
 
+		trace_block_bio_queue(q, bio);
+
 		ret = q->make_request_fn(q, bio);
 	} while (ret);
 
@@ -1654,6 +1661,50 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
 
+/**
+ * blk_rq_err_bytes - determine number of bytes till the next failure boundary
+ * @rq: request to examine
+ *
+ * Description:
+ *     A request could be a merge of IOs which require different failure
+ *     handling.  This function determines the number of bytes which
+ *     can be failed from the beginning of the request without
+ *     crossing into areas which need to be retried further.
+ *
+ * Return:
+ *     The number of bytes to fail.
+ *
+ * Context:
+ *     queue_lock must be held.
+ */
+unsigned int blk_rq_err_bytes(const struct request *rq)
+{
+	unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
+	unsigned int bytes = 0;
+	struct bio *bio;
+
+	if (!(rq->cmd_flags & REQ_MIXED_MERGE))
+		return blk_rq_bytes(rq);
+
+	/*
+	 * Currently the only 'mixing' which can happen is between
+	 * different failfast types.  We can safely fail portions
+	 * which have all the failfast bits that the first one has -
+	 * the ones which are at least as eager to fail as the first
+	 * one.
+	 */
+	for (bio = rq->bio; bio; bio = bio->bi_next) {
+		if ((bio->bi_rw & ff) != ff)
+			break;
+		bytes += bio->bi_size;
+	}
+
+	/* this could lead to infinite loop */
+	BUG_ON(blk_rq_bytes(rq) && !bytes);
+	return bytes;
+}
+EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
+
 static void blk_account_io_completion(struct request *req, unsigned int bytes)
 {
 	if (blk_do_io_stat(req)) {
@@ -1687,7 +1738,7 @@ static void blk_account_io_done(struct request *req)
 		part_stat_inc(cpu, part, ios[rw]);
 		part_stat_add(cpu, part, ticks[rw], duration);
 		part_round_stats(cpu, part);
-		part_dec_in_flight(part);
+		part_dec_in_flight(part, rw);
 
 		part_stat_unlock();
 	}
@@ -1807,8 +1858,15 @@ void blk_dequeue_request(struct request *rq)
 	 * and to it is freed is accounted as io that is in progress at
 	 * the driver side.
 	 */
-	if (blk_account_rq(rq))
+	if (blk_account_rq(rq)) {
 		q->in_flight[rq_is_sync(rq)]++;
+		/*
+		 * Mark this device as supporting hardware queuing, if
+		 * we have more IOs in flight than 4.
+		 */
+		if (!blk_queue_queuing(q) && queue_in_flight(q) > 4)
+			set_bit(QUEUE_FLAG_CQ, &q->queue_flags);
+	}
 }
 
 /**
@@ -2000,6 +2058,12 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 	if (blk_fs_request(req) || blk_discard_rq(req))
 		req->__sector += total_bytes >> 9;
 
+	/* mixed attributes always follow the first bio */
+	if (req->cmd_flags & REQ_MIXED_MERGE) {
+		req->cmd_flags &= ~REQ_FAILFAST_MASK;
+		req->cmd_flags |= req->bio->bi_rw & REQ_FAILFAST_MASK;
+	}
+
 	/*
 	 * If total number of sectors is less than the first segment
 	 * size, something has gone terribly wrong.
@@ -2179,6 +2243,25 @@ bool blk_end_request_cur(struct request *rq, int error)
 EXPORT_SYMBOL(blk_end_request_cur);
 
 /**
+ * blk_end_request_err - Finish a request till the next failure boundary.
+ * @rq: the request to finish till the next failure boundary for
+ * @error: must be negative errno
+ *
+ * Description:
+ *     Complete @rq till the next failure boundary.
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
+ */
+bool blk_end_request_err(struct request *rq, int error)
+{
+	WARN_ON(error >= 0);
+	return blk_end_request(rq, error, blk_rq_err_bytes(rq));
+}
+EXPORT_SYMBOL_GPL(blk_end_request_err);
+
+/**
  * __blk_end_request - Helper function for drivers to complete the request.
  * @rq: the request being processed
  * @error: %0 for success, < %0 for error
@@ -2237,12 +2320,31 @@ bool __blk_end_request_cur(struct request *rq, int error)
 }
 EXPORT_SYMBOL(__blk_end_request_cur);
 
+/**
+ * __blk_end_request_err - Finish a request till the next failure boundary.
+ * @rq: the request to finish till the next failure boundary for
+ * @error: must be negative errno
+ *
+ * Description:
+ *     Complete @rq till the next failure boundary.  Must be called
+ *     with queue lock held.
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
+ */
+bool __blk_end_request_err(struct request *rq, int error)
+{
+	WARN_ON(error >= 0);
+	return __blk_end_request(rq, error, blk_rq_err_bytes(rq));
+}
+EXPORT_SYMBOL_GPL(__blk_end_request_err);
+
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
 {
-	/* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and
-	   we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */
-	rq->cmd_flags |= (bio->bi_rw & 3);
+	/* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */
+	rq->cmd_flags |= bio->bi_rw & REQ_RW;
 
 	if (bio_has_data(bio)) {
 		rq->nr_phys_segments = bio_phys_segments(q, bio);
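
blk_rq_err_bytes(), blk_end_request_err() and __blk_end_request_err() are the driver-visible side of the mixed-merge machinery above. A hedged sketch of how a completion path might use them; the my_* names are invented for illustration and are not part of the patch:

/*
 * Illustrative only: fail a request up to its next failure boundary
 * (error must be a negative errno) and hand anything that remains to a
 * driver-specific retry path.
 */
static void my_complete_with_error(struct request *rq, int error)
{
	if (blk_end_request_err(rq, error)) {
		/* bios that are less eager to fail are still pending */
		my_retry_remainder(rq);		/* hypothetical helper */
	}
}
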
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
new file mode 100644
index 000000000000..ca564202ed7a
--- /dev/null
+++ b/block/blk-iopoll.c
@@ -0,0 +1,227 @@
+/*
+ * Functions related to interrupt-poll handling in the block layer. This
+ * is similar to NAPI for network devices.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/blk-iopoll.h>
+#include <linux/delay.h>
+
+#include "blk.h"
+
+int blk_iopoll_enabled = 1;
+EXPORT_SYMBOL(blk_iopoll_enabled);
+
+static unsigned int blk_iopoll_budget __read_mostly = 256;
+
+static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
+
+/**
+ * blk_iopoll_sched - Schedule a run of the iopoll handler
+ * @iop: The parent iopoll structure
+ *
+ * Description:
+ *     Add this blk_iopoll structure to the pending poll list and trigger the
+ *     raise of the blk iopoll softirq. The driver must already have gotten a
+ *     successful return from blk_iopoll_sched_prep() before calling this.
+ **/
+void blk_iopoll_sched(struct blk_iopoll *iop)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll));
+	__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(blk_iopoll_sched);
+
+/**
+ * __blk_iopoll_complete - Mark this @iop as un-polled again
+ * @iop: The parent iopoll structure
+ *
+ * Description:
+ *     See blk_iopoll_complete(). This function must be called with interrupts
+ *     disabled.
+ **/
+void __blk_iopoll_complete(struct blk_iopoll *iop)
+{
+	list_del(&iop->list);
+	smp_mb__before_clear_bit();
+	clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
+}
+EXPORT_SYMBOL(__blk_iopoll_complete);
+
+/**
+ * blk_iopoll_complete - Mark this @iop as un-polled again
+ * @iop: The parent iopoll structure
+ *
+ * Description:
+ *     If a driver consumes less than the assigned budget in its run of the
+ *     iopoll handler, it'll end the polled mode by calling this function. The
+ *     iopoll handler will not be invoked again before blk_iopoll_sched_prep()
+ *     is called.
+ **/
+void blk_iopoll_complete(struct blk_iopoll *iopoll)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__blk_iopoll_complete(iopoll);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(blk_iopoll_complete);
+
+static void blk_iopoll_softirq(struct softirq_action *h)
+{
+	struct list_head *list = &__get_cpu_var(blk_cpu_iopoll);
+	int rearm = 0, budget = blk_iopoll_budget;
+	unsigned long start_time = jiffies;
+
+	local_irq_disable();
+
+	while (!list_empty(list)) {
+		struct blk_iopoll *iop;
+		int work, weight;
+
+		/*
+		 * If softirq window is exhausted then punt.
+		 */
+		if (budget <= 0 || time_after(jiffies, start_time)) {
+			rearm = 1;
+			break;
+		}
+
+		local_irq_enable();
+
+		/* Even though interrupts have been re-enabled, this
+		 * access is safe because interrupts can only add new
+		 * entries to the tail of this list, and only ->poll()
+		 * calls can remove this head entry from the list.
+		 */
+		iop = list_entry(list->next, struct blk_iopoll, list);
+
+		weight = iop->weight;
+		work = 0;
+		if (test_bit(IOPOLL_F_SCHED, &iop->state))
+			work = iop->poll(iop, weight);
+
+		budget -= work;
+
+		local_irq_disable();
+
+		/*
+		 * Drivers must not modify the iopoll state, if they
+		 * consume their assigned weight (or more, some drivers can't
+		 * easily just stop processing, they have to complete an
+		 * entire mask of commands). In such cases this code
+		 * still "owns" the iopoll instance and therefore can
+		 * move the instance around on the list at-will.
+		 */
+		if (work >= weight) {
+			if (blk_iopoll_disable_pending(iop))
+				__blk_iopoll_complete(iop);
+			else
+				list_move_tail(&iop->list, list);
+		}
+	}
+
+	if (rearm)
+		__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+
+	local_irq_enable();
+}
+
+/**
+ * blk_iopoll_disable - Disable iopoll on this @iop
+ * @iop: The parent iopoll structure
+ *
+ * Description:
+ *     Disable io polling and wait for any pending callbacks to have completed.
+ **/
+void blk_iopoll_disable(struct blk_iopoll *iop)
+{
+	set_bit(IOPOLL_F_DISABLE, &iop->state);
+	while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state))
+		msleep(1);
+	clear_bit(IOPOLL_F_DISABLE, &iop->state);
+}
+EXPORT_SYMBOL(blk_iopoll_disable);
+
+/**
+ * blk_iopoll_enable - Enable iopoll on this @iop
+ * @iop: The parent iopoll structure
+ *
+ * Description:
+ *     Enable iopoll on this @iop. Note that the handler run will not be
+ *     scheduled, it will only mark it as active.
+ **/
+void blk_iopoll_enable(struct blk_iopoll *iop)
+{
+	BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state));
+	smp_mb__before_clear_bit();
+	clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
+}
+EXPORT_SYMBOL(blk_iopoll_enable);
+
+/**
+ * blk_iopoll_init - Initialize this @iop
+ * @iop: The parent iopoll structure
+ * @weight: The default weight (or command completion budget)
+ * @poll_fn: The handler to invoke
+ *
+ * Description:
+ *     Initialize this blk_iopoll structure. Before being actively used, the
+ *     driver must call blk_iopoll_enable().
+ **/
+void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn)
+{
+	memset(iop, 0, sizeof(*iop));
+	INIT_LIST_HEAD(&iop->list);
+	iop->weight = weight;
+	iop->poll = poll_fn;
+	set_bit(IOPOLL_F_SCHED, &iop->state);
+}
+EXPORT_SYMBOL(blk_iopoll_init);
+
+static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self,
+					   unsigned long action, void *hcpu)
+{
+	/*
+	 * If a CPU goes away, splice its entries to the current CPU
+	 * and trigger a run of the softirq
+	 */
+	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+		int cpu = (unsigned long) hcpu;
+
+		local_irq_disable();
+		list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
+				 &__get_cpu_var(blk_cpu_iopoll));
+		__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
+		local_irq_enable();
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata blk_iopoll_cpu_notifier = {
+	.notifier_call	= blk_iopoll_cpu_notify,
+};
+
+static __init int blk_iopoll_setup(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i));
+
+	open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq);
+	register_hotcpu_notifier(&blk_iopoll_cpu_notifier);
+	return 0;
+}
+subsys_initcall(blk_iopoll_setup);
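
No driver is converted in this series, so the following is only a sketch of the intended usage, pieced together from the kernel-doc above; every my_* identifier is hypothetical, and blk_iopoll_sched_prep() comes from <linux/blk-iopoll.h>, which is not part of this diff:

/* Hypothetical low-level driver hooking into blk-iopoll. */
static int my_iopoll(struct blk_iopoll *iop, int budget)
{
	int done = my_reap_completions(budget);	/* hypothetical */

	if (done < budget) {
		blk_iopoll_complete(iop);	/* leave polled mode */
		my_enable_irq();		/* hypothetical */
	}
	return done;
}

/* in the probe path */
blk_iopoll_init(&hw->iopoll, 32, my_iopoll);
blk_iopoll_enable(&hw->iopoll);

/* in the interrupt handler */
if (blk_iopoll_enabled && blk_iopoll_sched_prep(&hw->iopoll)) {
	my_disable_irq();			/* hypothetical */
	blk_iopoll_sched(&hw->iopoll);
}
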
diff --git a/block/blk-merge.c b/block/blk-merge.c
index e1999679a4d5..99cb5cf1f447 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -311,6 +311,36 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 	return 1;
 }
 
+/**
+ * blk_rq_set_mixed_merge - mark a request as mixed merge
+ * @rq: request to mark as mixed merge
+ *
+ * Description:
+ *     @rq is about to be mixed merged.  Make sure the attributes
+ *     which can be mixed are set in each bio and mark @rq as mixed
+ *     merged.
+ */
+void blk_rq_set_mixed_merge(struct request *rq)
+{
+	unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
+	struct bio *bio;
+
+	if (rq->cmd_flags & REQ_MIXED_MERGE)
+		return;
+
+	/*
+	 * @rq will no longer represent mixable attributes for all the
+	 * contained bios.  It will just track those of the first one.
+	 * Distribute the attributes to each bio.
+	 */
+	for (bio = rq->bio; bio; bio = bio->bi_next) {
+		WARN_ON_ONCE((bio->bi_rw & REQ_FAILFAST_MASK) &&
+			     (bio->bi_rw & REQ_FAILFAST_MASK) != ff);
+		bio->bi_rw |= ff;
+	}
+	rq->cmd_flags |= REQ_MIXED_MERGE;
+}
+
 static void blk_account_io_merge(struct request *req)
 {
 	if (blk_do_io_stat(req)) {
@@ -321,7 +351,7 @@ static void blk_account_io_merge(struct request *req)
 		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 
 		part_round_stats(cpu, part);
-		part_dec_in_flight(part);
+		part_dec_in_flight(part, rq_data_dir(req));
 
 		part_stat_unlock();
 	}
@@ -350,12 +380,6 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 	if (blk_integrity_rq(req) != blk_integrity_rq(next))
 		return 0;
 
-	/* don't merge requests of different failfast settings */
-	if (blk_failfast_dev(req) != blk_failfast_dev(next) ||
-	    blk_failfast_transport(req) != blk_failfast_transport(next) ||
-	    blk_failfast_driver(req) != blk_failfast_driver(next))
-		return 0;
-
 	/*
 	 * If we are allowed to merge, then append bio list
 	 * from next to rq and release next. merge_requests_fn
@@ -366,6 +390,19 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 		return 0;
 
 	/*
+	 * If failfast settings disagree or any of the two is already
+	 * a mixed merge, mark both as mixed before proceeding.  This
+	 * makes sure that all involved bios have mixable attributes
+	 * set properly.
+	 */
+	if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE ||
+	    (req->cmd_flags & REQ_FAILFAST_MASK) !=
+	    (next->cmd_flags & REQ_FAILFAST_MASK)) {
+		blk_rq_set_mixed_merge(req);
+		blk_rq_set_mixed_merge(next);
+	}
+
+	/*
 	 * At this point we have either done a back merge
 	 * or front merge. We need the smaller start_time of
 	 * the merged requests to be the current request
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 476d87065073..83413ff83739 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -428,6 +428,25 @@ void blk_queue_io_min(struct request_queue *q, unsigned int min)
 EXPORT_SYMBOL(blk_queue_io_min);
 
 /**
+ * blk_limits_io_opt - set optimal request size for a device
+ * @limits: the queue limits
+ * @opt:  smallest I/O size in bytes
+ *
+ * Description:
+ *   Storage devices may report an optimal I/O size, which is the
+ *   device's preferred unit for sustained I/O.  This is rarely reported
+ *   for disk drives.  For RAID arrays it is usually the stripe width or
+ *   the internal track size.  A properly aligned multiple of
+ *   optimal_io_size is the preferred request size for workloads where
+ *   sustained throughput is desired.
+ */
+void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt)
+{
+	limits->io_opt = opt;
+}
+EXPORT_SYMBOL(blk_limits_io_opt);
+
+/**
  * blk_queue_io_opt - set optimal request size for the queue
  * @q: the request queue for the device
  * @opt:  optimal request size in bytes
@@ -442,7 +461,7 @@ EXPORT_SYMBOL(blk_queue_io_min);
  */
 void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
 {
-	q->limits.io_opt = opt;
+	blk_limits_io_opt(&q->limits, opt);
 }
 EXPORT_SYMBOL(blk_queue_io_opt);
 
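
blk_limits_io_opt() lets code that only has a struct queue_limits (typically stacking drivers building their limits before a request queue exists) set the optimal I/O size directly; blk_queue_io_opt() is now a thin wrapper around it. A hedged example, with an arbitrary 512 KiB stripe width standing in for a real device's value:

/* Sketch: a hypothetical stacking driver advertising its stripe width. */
struct queue_limits limits = { };

blk_limits_io_opt(&limits, 512 * 1024);		/* bytes, not sectors */
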
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index d3aa2aadb3e0..b78c9c3e2670 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -40,7 +40,12 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
 {
 	struct request_list *rl = &q->rq;
 	unsigned long nr;
-	int ret = queue_var_store(&nr, page, count);
+	int ret;
+
+	if (!q->request_fn)
+		return -EINVAL;
+
+	ret = queue_var_store(&nr, page, count);
 	if (nr < BLKDEV_MIN_RQ)
 		nr = BLKDEV_MIN_RQ;
 
diff --git a/block/blk.h b/block/blk.h
index 3fae6add5430..5ee3d7e72feb 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -104,6 +104,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 int attempt_back_merge(struct request_queue *q, struct request *rq);
 int attempt_front_merge(struct request_queue *q, struct request *rq);
 void blk_recalc_rq_segments(struct request *rq);
+void blk_rq_set_mixed_merge(struct request *rq);
 
 void blk_queue_congestion_threshold(struct request_queue *q);
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index fd7080ed7935..0e3814b662af 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -134,13 +134,8 @@ struct cfq_data {
 	struct rb_root prio_trees[CFQ_PRIO_LISTS];
 
 	unsigned int busy_queues;
-	/*
-	 * Used to track any pending rt requests so we can pre-empt current
-	 * non-RT cfqq in service when this value is non-zero.
-	 */
-	unsigned int busy_rt_queues;
 
-	int rq_in_driver;
+	int rq_in_driver[2];
 	int sync_flight;
 
 	/*
@@ -191,7 +186,6 @@ enum cfqq_state_flags {
 	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
 	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
 	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
-	CFQ_CFQQ_FLAG_must_alloc,	/* must be allowed rq alloc */
 	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
 	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
 	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
@@ -218,7 +212,6 @@ static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
 CFQ_CFQQ_FNS(on_rr);
 CFQ_CFQQ_FNS(wait_request);
 CFQ_CFQQ_FNS(must_dispatch);
-CFQ_CFQQ_FNS(must_alloc);
 CFQ_CFQQ_FNS(must_alloc_slice);
 CFQ_CFQQ_FNS(fifo_expire);
 CFQ_CFQQ_FNS(idle_window);
@@ -239,6 +232,11 @@ static struct cfq_queue *cfq_get_queue(struct cfq_data *, int,
 static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
 					     struct io_context *);
 
+static inline int rq_in_driver(struct cfq_data *cfqd)
+{
+	return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1];
+}
+
 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
 					    int is_sync)
 {
@@ -257,7 +255,7 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic,
  */
 static inline int cfq_bio_sync(struct bio *bio)
 {
-	if (bio_data_dir(bio) == READ || bio_sync(bio))
+	if (bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO))
 		return 1;
 
 	return 0;
@@ -648,8 +646,6 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	BUG_ON(cfq_cfqq_on_rr(cfqq));
 	cfq_mark_cfqq_on_rr(cfqq);
 	cfqd->busy_queues++;
-	if (cfq_class_rt(cfqq))
-		cfqd->busy_rt_queues++;
 
 	cfq_resort_rr_list(cfqd, cfqq);
 }
@@ -673,8 +669,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 
 	BUG_ON(!cfqd->busy_queues);
 	cfqd->busy_queues--;
-	if (cfq_class_rt(cfqq))
-		cfqd->busy_rt_queues--;
 }
 
 /*
@@ -760,9 +754,9 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 
-	cfqd->rq_in_driver++;
+	cfqd->rq_in_driver[rq_is_sync(rq)]++;
 	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
-						cfqd->rq_in_driver);
+						rq_in_driver(cfqd));
 
 	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
 }
@@ -770,11 +764,12 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
 static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
+	const int sync = rq_is_sync(rq);
 
-	WARN_ON(!cfqd->rq_in_driver);
-	cfqd->rq_in_driver--;
+	WARN_ON(!cfqd->rq_in_driver[sync]);
+	cfqd->rq_in_driver[sync]--;
 	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
-						cfqd->rq_in_driver);
+						rq_in_driver(cfqd));
 }
 
 static void cfq_remove_request(struct request *rq)
@@ -1080,7 +1075,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	/*
 	 * still requests with the driver, don't idle
 	 */
-	if (cfqd->rq_in_driver)
+	if (rq_in_driver(cfqd))
 		return;
 
 	/*
@@ -1115,6 +1110,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
 
 	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
 
+	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
 	cfq_remove_request(rq);
 	cfqq->dispatched++;
 	elv_dispatch_sort(q, rq);
@@ -1179,20 +1175,6 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 		goto expire;
 
 	/*
-	 * If we have a RT cfqq waiting, then we pre-empt the current non-rt
-	 * cfqq.
-	 */
-	if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) {
-		/*
-		 * We simulate this as cfqq timed out so that it gets to bank
-		 * the remaining of its time slice.
-		 */
-		cfq_log_cfqq(cfqd, cfqq, "preempt");
-		cfq_slice_expired(cfqd, 1);
-		goto new_queue;
-	}
-
-	/*
 	 * The active queue has requests and isn't expired, allow it to
 	 * dispatch.
 	 */
@@ -1312,6 +1294,12 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
 		return 0;
 
 	/*
+	 * Drain async requests before we start sync IO
+	 */
+	if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
+		return 0;
+
+	/*
 	 * If this is an async queue and we have sync IO in flight, let it wait
 	 */
 	if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
@@ -1362,7 +1350,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
 		cfq_slice_expired(cfqd, 0);
 	}
 
-	cfq_log(cfqd, "dispatched a request");
+	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
 	return 1;
 }
 
@@ -2130,11 +2118,11 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
  */
 static void cfq_update_hw_tag(struct cfq_data *cfqd)
 {
-	if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak)
-		cfqd->rq_in_driver_peak = cfqd->rq_in_driver;
+	if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak)
+		cfqd->rq_in_driver_peak = rq_in_driver(cfqd);
 
 	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
-	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
+	    rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN)
 		return;
 
 	if (cfqd->hw_tag_samples++ < 50)
@@ -2161,9 +2149,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 
 	cfq_update_hw_tag(cfqd);
 
-	WARN_ON(!cfqd->rq_in_driver);
+	WARN_ON(!cfqd->rq_in_driver[sync]);
 	WARN_ON(!cfqq->dispatched);
-	cfqd->rq_in_driver--;
+	cfqd->rq_in_driver[sync]--;
 	cfqq->dispatched--;
 
 	if (cfq_cfqq_sync(cfqq))
@@ -2197,7 +2185,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 			cfq_arm_slice_timer(cfqd);
 	}
 
-	if (!cfqd->rq_in_driver)
+	if (!rq_in_driver(cfqd))
 		cfq_schedule_dispatch(cfqd);
 }
 
@@ -2229,8 +2217,7 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
 
 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
 {
-	if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
-	    !cfq_cfqq_must_alloc_slice(cfqq)) {
+	if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
 		cfq_mark_cfqq_must_alloc_slice(cfqq);
 		return ELV_MQUEUE_MUST;
 	}
@@ -2317,7 +2304,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 	}
 
 	cfqq->allocated[rw]++;
-	cfq_clear_cfqq_must_alloc(cfqq);
 	atomic_inc(&cfqq->ref);
 
 	spin_unlock_irqrestore(q->queue_lock, flags);
diff --git a/block/elevator.c b/block/elevator.c
index 2d511f9105e1..1975b619c86d 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -79,7 +79,8 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
 	/*
 	 * Don't merge file system requests and discard requests
 	 */
-	if (bio_discard(bio) != bio_discard(rq->bio))
+	if (bio_rw_flagged(bio, BIO_RW_DISCARD) !=
+	    bio_rw_flagged(rq->bio, BIO_RW_DISCARD))
 		return 0;
 
 	/*
@@ -100,19 +101,6 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
 	if (bio_integrity(bio) != blk_integrity_rq(rq))
 		return 0;
 
-	/*
-	 * Don't merge if failfast settings don't match.
-	 *
-	 * FIXME: The negation in front of each condition is necessary
-	 * because bio and request flags use different bit positions
-	 * and the accessors return those bits directly.  This
-	 * ugliness will soon go away.
-	 */
-	if (!bio_failfast_dev(bio) != !blk_failfast_dev(rq) ||
-	    !bio_failfast_transport(bio) != !blk_failfast_transport(rq) ||
-	    !bio_failfast_driver(bio) != !blk_failfast_driver(rq))
-		return 0;
-
 	if (!elv_iosched_allow_merge(rq, bio))
 		return 0;
 
diff --git a/block/genhd.c b/block/genhd.c
index f4c64c2b303a..5b76bf55d05c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -869,6 +869,7 @@ static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -888,6 +889,7 @@ static struct attribute *disk_attrs[] = {
 	&dev_attr_alignment_offset.attr,
 	&dev_attr_capability.attr,
 	&dev_attr_stat.attr,
+	&dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
@@ -1053,7 +1055,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 			   part_stat_read(hd, merges[1]),
 			   (unsigned long long)part_stat_read(hd, sectors[1]),
 			   jiffies_to_msecs(part_stat_read(hd, ticks[1])),
-			   hd->in_flight,
+			   part_in_flight(hd),
 			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
 			   jiffies_to_msecs(part_stat_read(hd, time_in_queue))
 			);
@@ -1215,6 +1217,16 @@ void put_disk(struct gendisk *disk)
 
 EXPORT_SYMBOL(put_disk);
 
+static void set_disk_ro_uevent(struct gendisk *gd, int ro)
+{
+	char event[] = "DISK_RO=1";
+	char *envp[] = { event, NULL };
+
+	if (!ro)
+		event[8] = '0';
+	kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
+}
+
 void set_device_ro(struct block_device *bdev, int flag)
 {
 	bdev->bd_part->policy = flag;
@@ -1227,8 +1239,12 @@ void set_disk_ro(struct gendisk *disk, int flag)
 	struct disk_part_iter piter;
 	struct hd_struct *part;
 
-	disk_part_iter_init(&piter, disk,
-		DISK_PITER_INCL_EMPTY | DISK_PITER_INCL_PART0);
+	if (disk->part0.policy != flag) {
+		set_disk_ro_uevent(disk, flag);
+		disk->part0.policy = flag;
+	}
+
+	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
 	while ((part = disk_part_iter_next(&piter)))
 		part->policy = flag;
 	disk_part_iter_exit(&piter);
diff --git a/block/ioctl.c b/block/ioctl.c
index 500e4c73cc52..d3e6b5827a34 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -112,22 +112,9 @@ static int blkdev_reread_part(struct block_device *bdev)
 	return res;
 }
 
-static void blk_ioc_discard_endio(struct bio *bio, int err)
-{
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	}
-	complete(bio->bi_private);
-}
-
 static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 			     uint64_t len)
 {
-	struct request_queue *q = bdev_get_queue(bdev);
-	int ret = 0;
-
 	if (start & 511)
 		return -EINVAL;
 	if (len & 511)
@@ -137,40 +124,8 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 
 	if (start + len > (bdev->bd_inode->i_size >> 9))
 		return -EINVAL;
-
-	if (!q->prepare_discard_fn)
-		return -EOPNOTSUPP;
-
-	while (len && !ret) {
-		DECLARE_COMPLETION_ONSTACK(wait);
-		struct bio *bio;
-
-		bio = bio_alloc(GFP_KERNEL, 0);
-
-		bio->bi_end_io = blk_ioc_discard_endio;
-		bio->bi_bdev = bdev;
-		bio->bi_private = &wait;
-		bio->bi_sector = start;
-
-		if (len > queue_max_hw_sectors(q)) {
-			bio->bi_size = queue_max_hw_sectors(q) << 9;
-			len -= queue_max_hw_sectors(q);
-			start += queue_max_hw_sectors(q);
-		} else {
-			bio->bi_size = len << 9;
-			len = 0;
-		}
-		submit_bio(DISCARD_NOBARRIER, bio);
-
-		wait_for_completion(&wait);
-
-		if (bio_flagged(bio, BIO_EOPNOTSUPP))
-			ret = -EOPNOTSUPP;
-		else if (!bio_flagged(bio, BIO_UPTODATE))
-			ret = -EIO;
-		bio_put(bio);
-	}
-	return ret;
+	return blkdev_issue_discard(bdev, start, len, GFP_KERNEL,
+				    DISCARD_FL_WAIT);
 }
 
 static int put_ushort(unsigned long arg, unsigned short val)