Merge branch 'mpi-master' into wip-k-fmlp

Conflicts: litmus/sched_cedf.c
author: Glenn Elliott <gelliott@cs.unc.edu> 2012-03-04 19:47:13 -0500
committer: Glenn Elliott <gelliott@cs.unc.edu> 2012-03-04 19:47:13 -0500
commit: c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree: ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /block/blk-flush.c
parent: ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent: 6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
1 files changed, 443 insertions, 0 deletions
diff --git a/block/blk-flush.c b/block/blk-flush.c
new file mode 100644
index 000000000000..bb21e4c36f70
--- /dev/null
+++ b/block/blk-flush.c
@@ -0,0 +1,443 @@
+/*
+ * Functions to sequence FLUSH and FUA writes.
+ *
+ * Copyright (C) 2011           Max Planck Institute for Gravitational Physics
+ * Copyright (C) 2011           Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * REQ_{FLUSH|FUA} requests are decomposed into sequences consisting of three
+ * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request
+ * properties and hardware capability.
+ *
+ * If a request doesn't have data, only REQ_FLUSH makes sense, which
+ * indicates a simple flush request.  If there is data, REQ_FLUSH indicates
+ * that the device cache should be flushed before the data is executed, and
+ * REQ_FUA means that the data must be on non-volatile media on request
+ * completion.
+ *
+ * If the device doesn't have writeback cache, FLUSH and FUA don't make any
+ * difference.  The requests are either completed immediately if there's no
+ * data or executed as normal requests otherwise.
+ *
+ * If the device has writeback cache and supports FUA, REQ_FLUSH is
+ * translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
+ *
+ * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is
+ * translated to PREFLUSH and REQ_FUA to POSTFLUSH.
+ *
+ * The actual execution of flush is double buffered.  Whenever a request
+ * needs to execute PRE or POSTFLUSH, it queues at
+ * q->flush_queue[q->flush_pending_idx].  Once certain criteria are met, a
+ * flush is issued and the pending_idx is toggled.  When the flush
+ * completes, all the requests which were pending proceed to the next
+ * step.  This allows arbitrary merging of different types of FLUSH/FUA
+ * requests.
+ *
+ * Currently, the following conditions are used to determine when to issue
+ * flush.
+ *
+ * C1. At any given time, only one flush shall be in progress.  This makes
+ *     double buffering sufficient.
+ *
+ * C2. Flush is deferred if any request is executing DATA of its sequence.
+ *     This avoids issuing separate POSTFLUSHes for requests which shared
+ *     PREFLUSH.
+ *
+ * C3. The second condition is ignored if there is a request which has
+ *     waited longer than FLUSH_PENDING_TIMEOUT.  This is to avoid
+ *     starvation in the unlikely case where there is a continuous stream of
+ *     FUA (without FLUSH) requests.
+ *
+ * For devices which support FUA, it isn't clear whether C2 (and thus C3)
+ * is beneficial.
+ *
+ * Note that a sequenced FLUSH/FUA request with DATA is completed twice.
+ * Once while executing DATA and again after the whole sequence is
+ * complete.  The first completion updates the contained bio but doesn't
+ * finish it so that the bio submitter is notified only after the whole
+ * sequence is complete.  This is implemented by testing REQ_FLUSH_SEQ in
+ * req_bio_endio().
+ *
+ * The above peculiarity requires that each FLUSH/FUA request has only one
+ * bio attached to it, which is guaranteed as they aren't allowed to be
+ * merged in the usual way.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/gfp.h>
+#include "blk.h"
+/*
+ * FLUSH/FUA sequences.
+ *
+ * The REQ_FSEQ_* bits are accumulated in rq->flush.seq as steps complete;
+ * the lowest bit still clear is the step a request is currently at (see
+ * blk_flush_cur_seq()).
+ */
+enum {
+        REQ_FSEQ_PREFLUSH       = (1 << 0), /* pre-flushing in progress */
+        REQ_FSEQ_DATA           = (1 << 1), /* data write in progress */
+        REQ_FSEQ_POSTFLUSH      = (1 << 2), /* post-flushing in progress */
+        REQ_FSEQ_DONE           = (1 << 3), /* all steps done, request is ended */
+        REQ_FSEQ_ACTIONS        = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
+                                  REQ_FSEQ_POSTFLUSH, /* mask of the three steps */
+        /*
+         * If flush has been pending longer than the following timeout,
+         * it's issued even if flush_data requests are still in flight.
+         */
+        FLUSH_PENDING_TIMEOUT   = 5 * HZ,
+};
+static bool blk_kick_flush(struct request_queue *q); /* defined below, used by blk_flush_complete_seq() */
+/*
+ * Work out which steps of the PREFLUSH/DATA/POSTFLUSH sequence @rq needs.
+ *
+ * @fflags: flush flags of the queue (REQ_FLUSH and/or REQ_FUA)
+ * @rq: request being inserted
+ *
+ * Returns a mask of REQ_FSEQ_{PREFLUSH,DATA,POSTFLUSH}; the mask is zero
+ * whenever @fflags lacks REQ_FLUSH.
+ */
+static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
+{
+        unsigned int steps = 0;
+        /* without REQ_FLUSH in @fflags no step is ever requested */
+        if (!(fflags & REQ_FLUSH))
+                return 0;
+        if (rq->cmd_flags & REQ_FLUSH)
+                steps |= REQ_FSEQ_PREFLUSH;
+        if (blk_rq_sectors(rq))
+                steps |= REQ_FSEQ_DATA;
+        /* FUA asked for by @rq but absent from @fflags: flush afterwards */
+        if ((rq->cmd_flags & REQ_FUA) && !(fflags & REQ_FUA))
+                steps |= REQ_FSEQ_POSTFLUSH;
+        return steps;
+}
+/*
+ * Return the REQ_FSEQ_* bit of the step @rq is currently at, i.e. the
+ * lowest bit that is still clear in @rq->flush.seq.
+ */
+static unsigned int blk_flush_cur_seq(struct request *rq)
+{
+        unsigned int first_clear = ffz(rq->flush.seq);
+        return 1 << first_clear;
+}
+/*
+ * Undo the flush sequencing adjustments made to @rq so that it can be
+ * completed like a normal request.
+ */
+static void blk_flush_restore_request(struct request *rq)
+{
+        /* @rq is a normal request again: drop the completion hook and flag */
+        rq->end_io = NULL;
+        rq->cmd_flags &= ~REQ_FLUSH_SEQ;
+        /*
+         * Flush data completion leaves @rq->bio %NULL, yet the bio has to
+         * be completed once more.  @rq->biotail is guaranteed to equal the
+         * original @rq->bio, so restore from it.
+         */
+        rq->bio = rq->biotail;
+}
+/**
+ * blk_flush_complete_seq - complete flush sequence
+ * @rq: FLUSH/FUA request being sequenced
+ * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
+ * @error: whether an error occurred
+ *
+ * @rq just completed @seq part of its flush sequence, record the
+ * completion and trigger the next step.  The next step is the lowest
+ * %REQ_FSEQ_* bit still clear in @rq->flush.seq; if @error is non-zero the
+ * remaining steps are skipped and @rq is ended with @error.
+ *
+ * CONTEXT:
+ * spin_lock_irq(q->queue_lock)
+ *
+ * RETURNS:
+ * %true if requests were added to the dispatch queue (either @rq itself
+ * for its DATA step or a flush request issued by blk_kick_flush()),
+ * %false otherwise.
+ */
+static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
+                                   int error)
+{
+        struct request_queue *q = rq->q;
+        struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
+        bool queued = false;
+        BUG_ON(rq->flush.seq & seq); /* a step must not be completed twice */
+        rq->flush.seq |= seq;
+        if (likely(!error))
+                seq = blk_flush_cur_seq(rq);
+        else
+                seq = REQ_FSEQ_DONE;
+        switch (seq) {
+        case REQ_FSEQ_PREFLUSH:
+        case REQ_FSEQ_POSTFLUSH:
+                /*
+                 * Queue for flush.  The first request to join an empty
+                 * pending list starts the clock that blk_kick_flush()
+                 * compares against FLUSH_PENDING_TIMEOUT.
+                 */
+                if (list_empty(pending))
+                        q->flush_pending_since = jiffies;
+                list_move_tail(&rq->flush.list, pending);
+                break;
+        case REQ_FSEQ_DATA:
+                list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
+                list_add(&rq->queuelist, &q->queue_head);
+                queued = true;
+                break;
+        case REQ_FSEQ_DONE:
+                /*
+                 * @rq was previously adjusted by blk_insert_flush() for
+                 * flush sequencing and may already have gone through the
+                 * flush data request completion path.  Restore @rq for
+                 * normal completion and end it.
+                 */
+                BUG_ON(!list_empty(&rq->queuelist));
+                list_del_init(&rq->flush.list);
+                blk_flush_restore_request(rq);
+                __blk_end_request_all(rq, error);
+                break;
+        default:
+                BUG();
+        }
+        return blk_kick_flush(q) | queued;
+}
+static void flush_end_io(struct request *flush_rq, int error)
+{
+        struct request_queue *q = flush_rq->q;
+        struct list_head *running = &q->flush_queue[q->flush_running_idx]; /* taken before the toggle below */
+        bool queued = false;
+        struct request *rq, *n;
+        BUG_ON(q->flush_pending_idx == q->flush_running_idx); /* a flush must be in flight */
+        /*
+         * This is the end_io handler that blk_kick_flush() installs on
+         * the queue's flush request.  Account completion of the flush
+         * request: after the toggle running_idx equals pending_idx again,
+         * so blk_kick_flush() may issue the next flush.
+         */
+        q->flush_running_idx ^= 1;
+        elv_completed_request(q, flush_rq);
+        /*
+         * and push the waiting requests to the next stage; @error of the
+         * flush is handed on to every request that was waiting for it
+         */
+        list_for_each_entry_safe(rq, n, running, flush.list) {
+                unsigned int seq = blk_flush_cur_seq(rq);
+                BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
+                queued |= blk_flush_complete_seq(rq, seq, error);
+        }
+        /*
+         * Kick the queue to avoid stall for two cases:
+         * 1. Moving a request silently to empty queue_head may stall the
+         * queue.
+         * 2. When flush request is running in non-queueable queue, the
+         * queue is held. Restart the queue after flush request is finished
+         * to avoid stall.
+         * This function is called from request completion path and calling
+         * directly into request_fn may confuse the driver.  Always use
+         * kblockd.
+         */
+        if (queued || q->flush_queue_delayed)
+                blk_run_queue_async(q);
+        q->flush_queue_delayed = 0;
+}
+/**
+ * blk_kick_flush - consider issuing flush request
+ * @q: request_queue being kicked
+ *
+ * Called after the flush related state of @q has changed.  Issues the
+ * queue's flush request if conditions C1-C3 described in the comment at
+ * the top of this file allow it.
+ *
+ * CONTEXT:
+ * spin_lock_irq(q->queue_lock)
+ *
+ * RETURNS:
+ * %true if flush was issued, %false otherwise.
+ */
+static bool blk_kick_flush(struct request_queue *q)
+{
+        struct list_head *waiting = &q->flush_queue[q->flush_pending_idx];
+        struct request *frq = &q->flush_rq;
+        struct request *head;
+        /* C1: the indices differ while a flush is in flight */
+        if (q->flush_pending_idx != q->flush_running_idx)
+                return false;
+        /* nobody is waiting for a flush */
+        if (list_empty(waiting))
+                return false;
+        /* C2: hold back while data is in flight, unless C3's timeout passed */
+        if (!list_empty(&q->flush_data_in_flight)) {
+                if (time_before(jiffies,
+                                q->flush_pending_since + FLUSH_PENDING_TIMEOUT))
+                        return false;
+        }
+        /* the list is known to be non-empty here, so a first entry exists */
+        head = list_first_entry(waiting, struct request, flush.list);
+        /*
+         * Set up the flush request and flip pending_idx.  From now on
+         * pending_idx differs from running_idx, i.e. flush is in flight.
+         */
+        blk_rq_init(q, frq);
+        frq->cmd_type = REQ_TYPE_FS;
+        frq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
+        frq->rq_disk = head->rq_disk;
+        frq->end_io = flush_end_io;
+        q->flush_pending_idx ^= 1;
+        list_add_tail(&frq->queuelist, &q->queue_head);
+        return true;
+}
+/*
+ * end_io handler installed on sequenced requests by blk_insert_flush().
+ * Records completion of the DATA step of @rq and moves it on.
+ */
+static void flush_data_end_io(struct request *rq, int error)
+{
+        /* look the queue up first, the call below may end @rq */
+        struct request_queue *q = rq->q;
+        bool kick;
+        kick = blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error);
+        /*
+         * Something was put on the dispatch queue; kick it so that a
+         * previously empty queue does not stall.  See the comment in
+         * flush_end_io().
+         */
+        if (kick)
+                blk_run_queue_async(q);
+}
+/**
+ * blk_insert_flush - insert a new FLUSH/FUA request
+ * @rq: request to insert
+ *
+ * Entry point for %ELEVATOR_INSERT_FLUSH insertions, to be called from
+ * __elv_add_request().  Works out which flush steps the submitted @rq
+ * needs and either queues it for normal execution or starts it on its
+ * flush sequence.
+ *
+ * CONTEXT:
+ * spin_lock_irq(q->queue_lock)
+ */
+void blk_insert_flush(struct request *rq)
+{
+        struct request_queue *q = rq->q;
+        unsigned int fflags = q->flush_flags;   /* may change, cache */
+        unsigned int policy = blk_flush_policy(fflags, rq);
+        /* @rq must not be sequenced yet and must carry exactly one bio */
+        BUG_ON(rq->end_io);
+        BUG_ON(!(rq->bio && rq->bio == rq->biotail));
+        /*
+         * The required operations are recorded in @policy by now, so the
+         * flags seen by the driver can be adjusted: REQ_FLUSH always goes,
+         * REQ_FUA goes unless the queue has it in its flush flags.
+         */
+        if (fflags & REQ_FUA)
+                rq->cmd_flags &= ~REQ_FLUSH;
+        else
+                rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA);
+        /*
+         * DATA is the only step needed: no flush is necessary and @rq can
+         * bypass the flush machinery.  Queue it for normal execution.
+         */
+        if ((policy & REQ_FSEQ_ACTIONS) == REQ_FSEQ_DATA) {
+                list_add_tail(&rq->queuelist, &q->queue_head);
+                return;
+        }
+        /*
+         * @rq has to go through the flush machinery.  Mark it as part of
+         * a flush sequence, then complete the steps it does not need so
+         * that it starts at the first one it does.
+         */
+        memset(&rq->flush, 0, sizeof(rq->flush));
+        INIT_LIST_HEAD(&rq->flush.list);
+        rq->cmd_flags |= REQ_FLUSH_SEQ;
+        rq->end_io = flush_data_end_io;
+        blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
+}
+/**
+ * blk_abort_flushes - @q is being aborted, abort flush requests
+ * @q: request_queue being aborted
+ *
+ * To be called from elv_abort_queue() while @q is being aborted.  Takes
+ * every FLUSH/FUA request out of the flush machinery and restores it for
+ * normal completion.
+ *
+ * CONTEXT:
+ * spin_lock_irq(q->queue_lock)
+ */
+void blk_abort_flushes(struct request_queue *q)
+{
+        struct list_head *fq = q->flush_queue;
+        struct list_head *const fq_end = fq + ARRAY_SIZE(q->flush_queue);
+        struct request *rq, *next;
+        /*
+         * Requests whose data is in flight already belong to the dispatch
+         * queue or the device driver; restoring them is all that's needed.
+         */
+        list_for_each_entry_safe(rq, next, &q->flush_data_in_flight,
+                                 flush.list) {
+                list_del_init(&rq->flush.list);
+                blk_flush_restore_request(rq);
+        }
+        /*
+         * Requests waiting on either flush queue have to be given away:
+         * restore them and append them to the dispatch queue.
+         */
+        for (; fq != fq_end; fq++) {
+                list_for_each_entry_safe(rq, next, fq, flush.list) {
+                        list_del_init(&rq->flush.list);
+                        blk_flush_restore_request(rq);
+                        list_add_tail(&rq->queuelist, &q->queue_head);
+                }
+        }
+}
+/*
+ * bi_end_io handler of the bio submitted by blkdev_issue_flush(): records
+ * a failure in the bio flags, wakes the waiter if there is one and drops a
+ * bio reference.
+ */
+static void bio_end_flush(struct bio *bio, int err)
+{
+        struct completion *waiter = bio->bi_private;
+        /* a cleared BIO_UPTODATE is how the failure is reported back */
+        if (err)
+                clear_bit(BIO_UPTODATE, &bio->bi_flags);
+        if (waiter)
+                complete(waiter);
+        bio_put(bio);
+}
+/**
+ * blkdev_issue_flush - queue a flush
+ * @bdev:       blockdev to issue flush for
+ * @gfp_mask:   memory allocation flags (for bio_alloc)
+ * @error_sector:       if non-NULL, receives the bio's ->bi_sector once the
+ *              flush has completed
+ *
+ * Description:
+ *    Issue a flush for the block device in question and wait for it to
+ *    complete.  Caller can supply room for storing the error offset in
+ *    case of a flush error, if they wish to.
+ *
+ * RETURNS:
+ *    0 on success, -ENXIO if @bdev has no disk, no queue or no
+ *    make_request_fn, -ENOMEM if no bio could be allocated with
+ *    @gfp_mask, -EIO if the flush completed with an error.
+ */
+int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
+                sector_t *error_sector)
+{
+        DECLARE_COMPLETION_ONSTACK(wait);
+        struct request_queue *q;
+        struct bio *bio;
+        int ret = 0;
+        if (bdev->bd_disk == NULL)
+                return -ENXIO;
+        q = bdev_get_queue(bdev);
+        if (!q)
+                return -ENXIO;
+        /*
+         * some block devices may not have their queue correctly set up here
+         * (e.g. loop device without a backing file) and so issuing a flush
+         * here will panic. Ensure there is a request function before issuing
+         * the flush.
+         */
+        if (!q->make_request_fn)
+                return -ENXIO;
+        bio = bio_alloc(gfp_mask, 0);
+        /*
+         * @gfp_mask comes from the caller, so the allocation is not
+         * guaranteed to succeed; bail out instead of dereferencing NULL.
+         */
+        if (!bio)
+                return -ENOMEM;
+        bio->bi_end_io = bio_end_flush;
+        bio->bi_bdev = bdev;
+        bio->bi_private = &wait;
+        /*
+         * Take an extra reference: bio_end_flush() drops one, but the bio
+         * is still inspected below after the completion has fired.
+         */
+        bio_get(bio);
+        submit_bio(WRITE_FLUSH, bio);
+        wait_for_completion(&wait);
+        /*
+         * The driver must store the error location in ->bi_sector, if
+         * it supports it. For non-stacked drivers, this should be
+         * copied from blk_rq_pos(rq).
+         */
+        if (error_sector)
+                *error_sector = bio->bi_sector;
+        /* bio_end_flush() clears BIO_UPTODATE when the flush failed */
+        if (!bio_flagged(bio, BIO_UPTODATE))
+                ret = -EIO;
+        bio_put(bio);
+        return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_flush);
author	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit	c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree	ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /block/blk-flush.c
parent	ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent	6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

diff --git a/block/blk-flush.c b/block/blk-flush.c
new file mode 100644
index 000000000000..bb21e4c36f70
--- /dev/null
+++ b/block/blk-flush.c
@@ -0,0 +1,443 @@
	1	/*
	2	* Functions to sequence FLUSH and FUA writes.
	3	*
	4	* Copyright (C) 2011 Max Planck Institute for Gravitational Physics
	5	* Copyright (C) 2011 Tejun Heo <tj@kernel.org>
	6	*
	7	* This file is released under the GPLv2.
	8	*
	9	* REQ_{FLUSH|FUA} requests are decomposed to sequences consisted of three
	10	* optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request
	11	* properties and hardware capability.
	12	*
	13	* If a request doesn't have data, only REQ_FLUSH makes sense, which
	14	* indicates a simple flush request. If there is data, REQ_FLUSH indicates
	15	* that the device cache should be flushed before the data is executed, and
	16	* REQ_FUA means that the data must be on non-volatile media on request
	17	* completion.
	18	*
	19	* If the device doesn't have writeback cache, FLUSH and FUA don't make any
	20	* difference. The requests are either completed immediately if there's no
	21	* data or executed as normal requests otherwise.
	22	*
	23	* If the device has writeback cache and supports FUA, REQ_FLUSH is
	24	* translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
	25	*
	26	* If the device has writeback cache and doesn't support FUA, REQ_FLUSH is
	27	* translated to PREFLUSH and REQ_FUA to POSTFLUSH.
	28	*
	29	* The actual execution of flush is double buffered. Whenever a request
	30	* needs to execute PRE or POSTFLUSH, it queues at
	31	* q->flush_queue[q->flush_pending_idx]. Once certain criteria are met, a
	32	* flush is issued and the pending_idx is toggled. When the flush
	33	* completes, all the requests which were pending are proceeded to the next
	34	* step. This allows arbitrary merging of different types of FLUSH/FUA
	35	* requests.
	36	*
	37	* Currently, the following conditions are used to determine when to issue
	38	* flush.
	39	*
	40	* C1. At any given time, only one flush shall be in progress. This makes
	41	* double buffering sufficient.
	42	*
	43	* C2. Flush is deferred if any request is executing DATA of its sequence.
	44	* This avoids issuing separate POSTFLUSHes for requests which shared
	45	* PREFLUSH.
	46	*
	47	* C3. The second condition is ignored if there is a request which has
	48	* waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid
	49	* starvation in the unlikely case where there are continuous stream of
	50	* FUA (without FLUSH) requests.
	51	*
	52	* For devices which support FUA, it isn't clear whether C2 (and thus C3)
	53	* is beneficial.
	54	*
	55	* Note that a sequenced FLUSH/FUA request with DATA is completed twice.
	56	* Once while executing DATA and again after the whole sequence is
	57	* complete. The first completion updates the contained bio but doesn't
	58	* finish it so that the bio submitter is notified only after the whole
	59	* sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in
	60	* req_bio_endio().
	61	*
	62	* The above peculiarity requires that each FLUSH/FUA request has only one
	63	* bio attached to it, which is guaranteed as they aren't allowed to be
	64	* merged in the usual way.
	65	*/
	66
	67	#include <linux/kernel.h>
	68	#include <linux/module.h>
	69	#include <linux/bio.h>
	70	#include <linux/blkdev.h>
	71	#include <linux/gfp.h>
	72
	73	#include "blk.h"
	74
	75	/* FLUSH/FUA sequences */
	76	enum {
	77	REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */
	78	REQ_FSEQ_DATA = (1 << 1), /* data write in progress */
	79	REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */
	80	REQ_FSEQ_DONE = (1 << 3),
	81
	82	REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
	83	REQ_FSEQ_POSTFLUSH,
	84
	85	/*
	86	* If flush has been pending longer than the following timeout,
	87	* it's issued even if flush_data requests are still in flight.
	88	*/
	89	FLUSH_PENDING_TIMEOUT = 5 * HZ,
	90	};
	91
	92	static bool blk_kick_flush(struct request_queue *q);
	93
	94	static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
	95	{
	96	unsigned int policy = 0;
	97
	98	if (fflags & REQ_FLUSH) {
	99	if (rq->cmd_flags & REQ_FLUSH)
	100	policy |= REQ_FSEQ_PREFLUSH;
	101	if (blk_rq_sectors(rq))
	102	policy |= REQ_FSEQ_DATA;
	103	if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
	104	policy |= REQ_FSEQ_POSTFLUSH;
	105	}
	106	return policy;
	107	}
	108
	109	static unsigned int blk_flush_cur_seq(struct request *rq)
	110	{
	111	return 1 << ffz(rq->flush.seq);
	112	}
	113
	114	static void blk_flush_restore_request(struct request *rq)
	115	{
	116	/*
	117	* After flush data completion, @rq->bio is %NULL but we need to
	118	* complete the bio again. @rq->biotail is guaranteed to equal the
	119	* original @rq->bio. Restore it.
	120	*/
	121	rq->bio = rq->biotail;
	122
	123	/* make @rq a normal request */
	124	rq->cmd_flags &= ~REQ_FLUSH_SEQ;
	125	rq->end_io = NULL;
	126	}
	127
	128	/**
	129	* blk_flush_complete_seq - complete flush sequence
	130	* @rq: FLUSH/FUA request being sequenced
	131	* @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
	132	* @error: whether an error occurred
	133	*
	134	* @rq just completed @seq part of its flush sequence, record the
	135	* completion and trigger the next step.
	136	*
	137	* CONTEXT:
	138	* spin_lock_irq(q->queue_lock)
	139	*
	140	* RETURNS:
	141	* %true if requests were added to the dispatch queue, %false otherwise.
	142	*/
	143	static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
	144	int error)
	145	{
	146	struct request_queue *q = rq->q;
	147	struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
	148	bool queued = false;
	149
	150	BUG_ON(rq->flush.seq & seq);
	151	rq->flush.seq |= seq;
	152
	153	if (likely(!error))
	154	seq = blk_flush_cur_seq(rq);
	155	else
	156	seq = REQ_FSEQ_DONE;
	157
	158	switch (seq) {
	159	case REQ_FSEQ_PREFLUSH:
	160	case REQ_FSEQ_POSTFLUSH:
	161	/* queue for flush */
	162	if (list_empty(pending))
	163	q->flush_pending_since = jiffies;
	164	list_move_tail(&rq->flush.list, pending);
	165	break;
	166
	167	case REQ_FSEQ_DATA:
	168	list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
	169	list_add(&rq->queuelist, &q->queue_head);
	170	queued = true;
	171	break;
	172
	173	case REQ_FSEQ_DONE:
	174	/*
	175	* @rq was previously adjusted by blk_flush_issue() for
	176	* flush sequencing and may already have gone through the
	177	* flush data request completion path. Restore @rq for
	178	* normal completion and end it.
	179	*/
	180	BUG_ON(!list_empty(&rq->queuelist));
	181	list_del_init(&rq->flush.list);
	182	blk_flush_restore_request(rq);
	183	__blk_end_request_all(rq, error);
	184	break;
	185
	186	default:
	187	BUG();
	188	}
	189
	190	return blk_kick_flush(q) | queued;
	191	}
	192
	193	static void flush_end_io(struct request *flush_rq, int error)
	194	{
	195	struct request_queue *q = flush_rq->q;
	196	struct list_head *running = &q->flush_queue[q->flush_running_idx];
	197	bool queued = false;
	198	struct request *rq, *n;
	199
	200	BUG_ON(q->flush_pending_idx == q->flush_running_idx);
	201
	202	/* account completion of the flush request */
	203	q->flush_running_idx ^= 1;
	204	elv_completed_request(q, flush_rq);
	205
	206	/* and push the waiting requests to the next stage */
	207	list_for_each_entry_safe(rq, n, running, flush.list) {
	208	unsigned int seq = blk_flush_cur_seq(rq);
	209
	210	BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
	211	queued |= blk_flush_complete_seq(rq, seq, error);
	212	}
	213
	214	/*
	215	* Kick the queue to avoid stall for two cases:
	216	* 1. Moving a request silently to empty queue_head may stall the
	217	* queue.
	218	* 2. When flush request is running in non-queueable queue, the
	219	* queue is hold. Restart the queue after flush request is finished
	220	* to avoid stall.
	221	* This function is called from request completion path and calling
	222	* directly into request_fn may confuse the driver. Always use
	223	* kblockd.
	224	*/
	225	if (queued || q->flush_queue_delayed)
	226	blk_run_queue_async(q);
	227	q->flush_queue_delayed = 0;
	228	}
	229
	230	/**
	231	* blk_kick_flush - consider issuing flush request
	232	* @q: request_queue being kicked
	233	*
	234	* Flush related states of @q have changed, consider issuing flush request.
	235	* Please read the comment at the top of this file for more info.
	236	*
	237	* CONTEXT:
	238	* spin_lock_irq(q->queue_lock)
	239	*
	240	* RETURNS:
	241	* %true if flush was issued, %false otherwise.
	242	*/
	243	static bool blk_kick_flush(struct request_queue *q)
	244	{
	245	struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
	246	struct request *first_rq =
	247	list_first_entry(pending, struct request, flush.list);
	248
	249	/* C1 described at the top of this file */
	250	if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending))
	251	return false;
	252
	253	/* C2 and C3 */
	254	if (!list_empty(&q->flush_data_in_flight) &&
	255	time_before(jiffies,
	256	q->flush_pending_since + FLUSH_PENDING_TIMEOUT))
	257	return false;
	258
	259	/*
	260	* Issue flush and toggle pending_idx. This makes pending_idx
	261	* different from running_idx, which means flush is in flight.
	262	*/
	263	blk_rq_init(q, &q->flush_rq);
	264	q->flush_rq.cmd_type = REQ_TYPE_FS;
	265	q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
	266	q->flush_rq.rq_disk = first_rq->rq_disk;
	267	q->flush_rq.end_io = flush_end_io;
	268
	269	q->flush_pending_idx ^= 1;
	270	list_add_tail(&q->flush_rq.queuelist, &q->queue_head);
	271	return true;
	272	}
	273
	274	static void flush_data_end_io(struct request *rq, int error)
	275	{
	276	struct request_queue *q = rq->q;
	277
	278	/*
	279	* After populating an empty queue, kick it to avoid stall. Read
	280	* the comment in flush_end_io().
	281	*/
	282	if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
	283	blk_run_queue_async(q);
	284	}
	285
	286	/**
	287	* blk_insert_flush - insert a new FLUSH/FUA request
	288	* @rq: request to insert
	289	*
	290	* To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions.
	291	* @rq is being submitted. Analyze what needs to be done and put it on the
	292	* right queue.
	293	*
	294	* CONTEXT:
	295	* spin_lock_irq(q->queue_lock)
	296	*/
	297	void blk_insert_flush(struct request *rq)
	298	{
	299	struct request_queue *q = rq->q;
	300	unsigned int fflags = q->flush_flags; /* may change, cache */
	301	unsigned int policy = blk_flush_policy(fflags, rq);
	302
	303	BUG_ON(rq->end_io);
	304	BUG_ON(!rq->bio || rq->bio != rq->biotail);
	305
	306	/*
	307	* @policy now records what operations need to be done. Adjust
	308	* REQ_FLUSH and FUA for the driver.
	309	*/
	310	rq->cmd_flags &= ~REQ_FLUSH;
	311	if (!(fflags & REQ_FUA))
	312	rq->cmd_flags &= ~REQ_FUA;
	313
	314	/*
	315	* If there's data but flush is not necessary, the request can be
	316	* processed directly without going through flush machinery. Queue
	317	* for normal execution.
	318	*/
	319	if ((policy & REQ_FSEQ_DATA) &&
	320	!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
	321	list_add_tail(&rq->queuelist, &q->queue_head);
	322	return;
	323	}
	324
	325	/*
	326	* @rq should go through flush machinery. Mark it part of flush
	327	* sequence and submit for further processing.
	328	*/
	329	memset(&rq->flush, 0, sizeof(rq->flush));
	330	INIT_LIST_HEAD(&rq->flush.list);
	331	rq->cmd_flags |= REQ_FLUSH_SEQ;
	332	rq->end_io = flush_data_end_io;
	333
	334	blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
	335	}
	336
	337	/**
	338	* blk_abort_flushes - @q is being aborted, abort flush requests
	339	* @q: request_queue being aborted
	340	*
	341	* To be called from elv_abort_queue(). @q is being aborted. Prepare all
	342	* FLUSH/FUA requests for abortion.
	343	*
	344	* CONTEXT:
	345	* spin_lock_irq(q->queue_lock)
	346	*/
	347	void blk_abort_flushes(struct request_queue *q)
	348	{
	349	struct request *rq, *n;
	350	int i;
	351
	352	/*
	353	* Requests in flight for data are already owned by the dispatch
	354	* queue or the device driver. Just restore for normal completion.
	355	*/
	356	list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) {
	357	list_del_init(&rq->flush.list);
	358	blk_flush_restore_request(rq);
	359	}
	360
	361	/*
	362	* We need to give away requests on flush queues. Restore for
	363	* normal completion and put them on the dispatch queue.
	364	*/
	365	for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) {
	366	list_for_each_entry_safe(rq, n, &q->flush_queue[i],
	367	flush.list) {
	368	list_del_init(&rq->flush.list);
	369	blk_flush_restore_request(rq);
	370	list_add_tail(&rq->queuelist, &q->queue_head);
	371	}
	372	}
	373	}
	374
	375	static void bio_end_flush(struct bio *bio, int err)
	376	{
	377	if (err)
	378	clear_bit(BIO_UPTODATE, &bio->bi_flags);
	379	if (bio->bi_private)
	380	complete(bio->bi_private);
	381	bio_put(bio);
	382	}
	383
	384	/**
	385	* blkdev_issue_flush - queue a flush
	386	* @bdev: blockdev to issue flush for
	387	* @gfp_mask: memory allocation flags (for bio_alloc)
	388	* @error_sector: error sector
	389	*
	390	* Description:
	391	* Issue a flush for the block device in question. Caller can supply
	392	* room for storing the error offset in case of a flush error, if they
	393	* wish to. If WAIT flag is not passed then caller may check only what
	394	* request was pushed in some internal queue for later handling.
	395	*/
	396	int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
	397	sector_t *error_sector)
	398	{
	399	DECLARE_COMPLETION_ONSTACK(wait);
	400	struct request_queue *q;
	401	struct bio *bio;
	402	int ret = 0;
	403
	404	if (bdev->bd_disk == NULL)
	405	return -ENXIO;
	406
	407	q = bdev_get_queue(bdev);
	408	if (!q)
	409	return -ENXIO;
	410
	411	/*
	412	* some block devices may not have their queue correctly set up here
	413	* (e.g. loop device without a backing file) and so issuing a flush
	414	* here will panic. Ensure there is a request function before issuing
	415	* the flush.
	416	*/
	417	if (!q->make_request_fn)
	418	return -ENXIO;
	419
	420	bio = bio_alloc(gfp_mask, 0);
	421	bio->bi_end_io = bio_end_flush;
	422	bio->bi_bdev = bdev;
	423	bio->bi_private = &wait;
	424
	425	bio_get(bio);
	426	submit_bio(WRITE_FLUSH, bio);
	427	wait_for_completion(&wait);
	428
	429	/*
	430	* The driver must store the error location in ->bi_sector, if
	431	* it supports it. For non-stacked drivers, this should be
	432	* copied from blk_rq_pos(rq).
	433	*/
	434	if (error_sector)
	435	*error_sector = bio->bi_sector;
	436
	437	if (!bio_flagged(bio, BIO_UPTODATE))
	438	ret = -EIO;
	439
	440	bio_put(bio);
	441	return ret;
	442	}
	443	EXPORT_SYMBOL(blkdev_issue_flush);