author	Tejun Heo <tj@kernel.org>	2011-01-25 06:43:54 -0500
committer	Jens Axboe <jaxboe@fusionio.com>	2011-01-25 06:43:54 -0500
commit	ae1b1539622fb46e51b4d13b3f9e5f4c713f86ae (patch)
tree	c5cb540141003a3ec7ebf0b8c6e01653ab6aaef5
parent	143a87f4c9c629067afea5b6703d66ea88c82f8e (diff)
block: reimplement FLUSH/FUA to support merge
The current FLUSH/FUA support has evolved from the implementation which had to perform queue draining. As such, sequencing is done queue-wide, one flush request after another. However, with the draining requirement gone, there is no reason to keep the queue-wide sequential approach.

This patch reimplements FLUSH/FUA support such that each FLUSH/FUA request is sequenced individually. The actual FLUSH execution is double buffered: whenever a request needs to execute either a PRE or POSTFLUSH, it queues on the pending queue. Once certain conditions are met, a flush request is issued, and on its completion all pending requests proceed to the next sequence step.

This allows arbitrary merging of different types of flushes. How they are merged can be controlled and tuned primarily by adjusting the 'conditions' mentioned above, which determine when to issue the next flush.

This is inspired by Darrick's patches to merge multiple zero-data flushes, which helps workloads with highly concurrent fsync requests.

* As flush requests are never put on the IO scheduler, the request fields used for flushing share space with rq->rb_node. rq->completion_data is moved out of the union, which increases the request size by one pointer. As rq->elevator_private* are also used only by the iosched, it would be possible to reduce the request size further; however, that requires modifying the request allocation path so that iosched data is not allocated for flush requests.

* FLUSH/FUA processing now happens on insertion instead of dispatch.

- Comments updated as per Vivek and Mike.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: "Darrick J. Wong" <djwong@us.ibm.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
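The decomposition the message describes (each FLUSH/FUA request is split into optional PREFLUSH, DATA and POSTFLUSH steps) can be modelled outside the kernel. Below is a minimal userspace sketch mirroring blk_flush_policy() from this patch; the flag values, the has_data parameter and the flush_policy name are simplified stand-ins, not kernel definitions.

/*
 * Minimal userspace model of the decomposition above.  Mirrors
 * blk_flush_policy() from this patch, but the flag values, the has_data
 * parameter and the type names here are simplified stand-ins.
 */
#include <stdio.h>

#define REQ_FLUSH (1u << 0)	/* flush the write cache first */
#define REQ_FUA   (1u << 1)	/* data must reach non-volatile media */

#define FSEQ_PREFLUSH  (1u << 0)
#define FSEQ_DATA      (1u << 1)
#define FSEQ_POSTFLUSH (1u << 2)

/* fflags: what the queue advertises; rq_flags/has_data: what was submitted */
static unsigned int flush_policy(unsigned int fflags, unsigned int rq_flags,
				 int has_data)
{
	unsigned int policy = 0;

	if (fflags & REQ_FLUSH) {		/* device has a writeback cache */
		if (rq_flags & REQ_FLUSH)
			policy |= FSEQ_PREFLUSH;
		if (has_data)
			policy |= FSEQ_DATA;
		if (!(fflags & REQ_FUA) && (rq_flags & REQ_FUA))
			policy |= FSEQ_POSTFLUSH;	/* emulate FUA */
	}
	return policy;
}

int main(void)
{
	/* writeback cache without FUA: a FUA write needs all three steps */
	unsigned int p = flush_policy(REQ_FLUSH, REQ_FLUSH | REQ_FUA, 1);

	printf("preflush=%d data=%d postflush=%d\n",
	       !!(p & FSEQ_PREFLUSH), !!(p & FSEQ_DATA), !!(p & FSEQ_POSTFLUSH));
	return 0;
}

With a writeback cache and no native FUA, a REQ_FUA write with data decomposes into all three steps; with native FUA, the POSTFLUSH bit is never set and REQ_FUA is passed through with the data.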
-rw-r--r--	block/blk-core.c		10
-rw-r--r--	block/blk-flush.c		440
-rw-r--r--	block/blk.h			12
-rw-r--r--	block/elevator.c		7
-rw-r--r--	include/linux/blkdev.h		18
-rw-r--r--	include/linux/elevator.h	1
6 files changed, 332 insertions, 156 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index 617bb9e40927..05746093b45e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -134,8 +134,6 @@ EXPORT_SYMBOL(blk_rq_init);
134static void req_bio_endio(struct request *rq, struct bio *bio, 134static void req_bio_endio(struct request *rq, struct bio *bio,
135 unsigned int nbytes, int error) 135 unsigned int nbytes, int error)
136{ 136{
137 struct request_queue *q = rq->q;
138
139 if (error) 137 if (error)
140 clear_bit(BIO_UPTODATE, &bio->bi_flags); 138 clear_bit(BIO_UPTODATE, &bio->bi_flags);
141 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 139 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
@@ -159,8 +157,6 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
159 /* don't actually finish bio if it's part of flush sequence */ 157 /* don't actually finish bio if it's part of flush sequence */
160 if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) 158 if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
161 bio_endio(bio, error); 159 bio_endio(bio, error);
162 else if (error && !q->flush_err)
163 q->flush_err = error;
164} 160}
165 161
166void blk_dump_rq_flags(struct request *rq, char *msg) 162void blk_dump_rq_flags(struct request *rq, char *msg)
@@ -519,7 +515,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
519 init_timer(&q->unplug_timer); 515 init_timer(&q->unplug_timer);
520 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 516 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
521 INIT_LIST_HEAD(&q->timeout_list); 517 INIT_LIST_HEAD(&q->timeout_list);
522 INIT_LIST_HEAD(&q->pending_flushes); 518 INIT_LIST_HEAD(&q->flush_queue[0]);
519 INIT_LIST_HEAD(&q->flush_queue[1]);
520 INIT_LIST_HEAD(&q->flush_data_in_flight);
523 INIT_WORK(&q->unplug_work, blk_unplug_work); 521 INIT_WORK(&q->unplug_work, blk_unplug_work);
524 522
525 kobject_init(&q->kobj, &blk_queue_ktype); 523 kobject_init(&q->kobj, &blk_queue_ktype);
@@ -1198,7 +1196,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1198 spin_lock_irq(q->queue_lock); 1196 spin_lock_irq(q->queue_lock);
1199 1197
1200 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { 1198 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1201 where = ELEVATOR_INSERT_FRONT; 1199 where = ELEVATOR_INSERT_FLUSH;
1202 goto get_rq; 1200 goto get_rq;
1203 } 1201 }
1204 1202
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 8592869bcbe7..a867e3f524f3 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -1,6 +1,69 @@
1/* 1/*
2 * Functions to sequence FLUSH and FUA writes. 2 * Functions to sequence FLUSH and FUA writes.
3 *
4 * Copyright (C) 2011 Max Planck Institute for Gravitational Physics
5 * Copyright (C) 2011 Tejun Heo <tj@kernel.org>
6 *
7 * This file is released under the GPLv2.
8 *
9 * REQ_{FLUSH|FUA} requests are decomposed to sequences consisted of three
10 * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request
11 * properties and hardware capability.
12 *
13 * If a request doesn't have data, only REQ_FLUSH makes sense, which
14 * indicates a simple flush request. If there is data, REQ_FLUSH indicates
15 * that the device cache should be flushed before the data is executed, and
16 * REQ_FUA means that the data must be on non-volatile media on request
17 * completion.
18 *
19 * If the device doesn't have writeback cache, FLUSH and FUA don't make any
20 * difference. The requests are either completed immediately if there's no
21 * data or executed as normal requests otherwise.
22 *
23 * If the device has writeback cache and supports FUA, REQ_FLUSH is
24 * translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
25 *
26 * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is
27 * translated to PREFLUSH and REQ_FUA to POSTFLUSH.
28 *
29 * The actual execution of flush is double buffered. Whenever a request
30 * needs to execute PRE or POSTFLUSH, it queues at
31 * q->flush_queue[q->flush_pending_idx]. Once certain criteria are met, a
32 * flush is issued and the pending_idx is toggled. When the flush
33 * completes, all the requests which were pending are proceeded to the next
34 * step. This allows arbitrary merging of different types of FLUSH/FUA
35 * requests.
36 *
37 * Currently, the following conditions are used to determine when to issue
38 * flush.
39 *
40 * C1. At any given time, only one flush shall be in progress. This makes
41 * double buffering sufficient.
42 *
43 * C2. Flush is deferred if any request is executing DATA of its sequence.
44 * This avoids issuing separate POSTFLUSHes for requests which shared
45 * PREFLUSH.
46 *
47 * C3. The second condition is ignored if there is a request which has
48 * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid
49 * starvation in the unlikely case where there are continuous stream of
50 * FUA (without FLUSH) requests.
51 *
52 * For devices which support FUA, it isn't clear whether C2 (and thus C3)
53 * is beneficial.
54 *
55 * Note that a sequenced FLUSH/FUA request with DATA is completed twice.
56 * Once while executing DATA and again after the whole sequence is
57 * complete. The first completion updates the contained bio but doesn't
58 * finish it so that the bio submitter is notified only after the whole
59 * sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in
60 * req_bio_endio().
61 *
62 * The above peculiarity requires that each FLUSH/FUA request has only one
63 * bio attached to it, which is guaranteed as they aren't allowed to be
64 * merged in the usual way.
3 */ 65 */
66
4#include <linux/kernel.h> 67#include <linux/kernel.h>
5#include <linux/module.h> 68#include <linux/module.h>
6#include <linux/bio.h> 69#include <linux/bio.h>
@@ -11,185 +74,290 @@
11 74
12/* FLUSH/FUA sequences */ 75/* FLUSH/FUA sequences */
13enum { 76enum {
14 QUEUE_FSEQ_STARTED = (1 << 0), /* flushing in progress */ 77 REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */
15 QUEUE_FSEQ_PREFLUSH = (1 << 1), /* pre-flushing in progress */ 78 REQ_FSEQ_DATA = (1 << 1), /* data write in progress */
16 QUEUE_FSEQ_DATA = (1 << 2), /* data write in progress */ 79 REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */
17 QUEUE_FSEQ_POSTFLUSH = (1 << 3), /* post-flushing in progress */ 80 REQ_FSEQ_DONE = (1 << 3),
18 QUEUE_FSEQ_DONE = (1 << 4), 81
82 REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
83 REQ_FSEQ_POSTFLUSH,
84
85 /*
86 * If flush has been pending longer than the following timeout,
87 * it's issued even if flush_data requests are still in flight.
88 */
89 FLUSH_PENDING_TIMEOUT = 5 * HZ,
19}; 90};
20 91
21static struct request *queue_next_fseq(struct request_queue *q); 92static bool blk_kick_flush(struct request_queue *q);
22 93
23unsigned blk_flush_cur_seq(struct request_queue *q) 94static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
24{ 95{
25 if (!q->flush_seq) 96 unsigned int policy = 0;
26 return 0; 97
27 return 1 << ffz(q->flush_seq); 98 if (fflags & REQ_FLUSH) {
99 if (rq->cmd_flags & REQ_FLUSH)
100 policy |= REQ_FSEQ_PREFLUSH;
101 if (blk_rq_sectors(rq))
102 policy |= REQ_FSEQ_DATA;
103 if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
104 policy |= REQ_FSEQ_POSTFLUSH;
105 }
106 return policy;
28} 107}
29 108
30static struct request *blk_flush_complete_seq(struct request_queue *q, 109static unsigned int blk_flush_cur_seq(struct request *rq)
31 unsigned seq, int error)
32{ 110{
33 struct request *next_rq = NULL; 111 return 1 << ffz(rq->flush.seq);
34
35 if (error && !q->flush_err)
36 q->flush_err = error;
37
38 BUG_ON(q->flush_seq & seq);
39 q->flush_seq |= seq;
40
41 if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) {
42 /* not complete yet, queue the next flush sequence */
43 next_rq = queue_next_fseq(q);
44 } else {
45 /* complete this flush request */
46 __blk_end_request_all(q->orig_flush_rq, q->flush_err);
47 q->orig_flush_rq = NULL;
48 q->flush_seq = 0;
49
50 /* dispatch the next flush if there's one */
51 if (!list_empty(&q->pending_flushes)) {
52 next_rq = list_entry_rq(q->pending_flushes.next);
53 list_move(&next_rq->queuelist, &q->queue_head);
54 }
55 }
56 return next_rq;
57} 112}
58 113
59static void blk_flush_complete_seq_end_io(struct request_queue *q, 114static void blk_flush_restore_request(struct request *rq)
60 unsigned seq, int error)
61{ 115{
62 bool was_empty = elv_queue_empty(q);
63 struct request *next_rq;
64
65 next_rq = blk_flush_complete_seq(q, seq, error);
66
67 /* 116 /*
68 * Moving a request silently to empty queue_head may stall the 117 * After flush data completion, @rq->bio is %NULL but we need to
69 * queue. Kick the queue in those cases. 118 * complete the bio again. @rq->biotail is guaranteed to equal the
119 * original @rq->bio. Restore it.
70 */ 120 */
71 if (was_empty && next_rq) 121 rq->bio = rq->biotail;
72 __blk_run_queue(q); 122
123 /* make @rq a normal request */
124 rq->cmd_flags &= ~REQ_FLUSH_SEQ;
125 rq->end_io = NULL;
73} 126}
74 127
75static void pre_flush_end_io(struct request *rq, int error) 128/**
129 * blk_flush_complete_seq - complete flush sequence
130 * @rq: FLUSH/FUA request being sequenced
131 * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
132 * @error: whether an error occurred
133 *
134 * @rq just completed @seq part of its flush sequence, record the
135 * completion and trigger the next step.
136 *
137 * CONTEXT:
138 * spin_lock_irq(q->queue_lock)
139 *
140 * RETURNS:
141 * %true if requests were added to the dispatch queue, %false otherwise.
142 */
143static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
144 int error)
76{ 145{
77 elv_completed_request(rq->q, rq); 146 struct request_queue *q = rq->q;
78 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error); 147 struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
148 bool queued = false;
149
150 BUG_ON(rq->flush.seq & seq);
151 rq->flush.seq |= seq;
152
153 if (likely(!error))
154 seq = blk_flush_cur_seq(rq);
155 else
156 seq = REQ_FSEQ_DONE;
157
158 switch (seq) {
159 case REQ_FSEQ_PREFLUSH:
160 case REQ_FSEQ_POSTFLUSH:
161 /* queue for flush */
162 if (list_empty(pending))
163 q->flush_pending_since = jiffies;
164 list_move_tail(&rq->flush.list, pending);
165 break;
166
167 case REQ_FSEQ_DATA:
168 list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
169 list_add(&rq->queuelist, &q->queue_head);
170 queued = true;
171 break;
172
173 case REQ_FSEQ_DONE:
174 /*
175 * @rq was previously adjusted by blk_flush_issue() for
176 * flush sequencing and may already have gone through the
177 * flush data request completion path. Restore @rq for
178 * normal completion and end it.
179 */
180 BUG_ON(!list_empty(&rq->queuelist));
181 list_del_init(&rq->flush.list);
182 blk_flush_restore_request(rq);
183 __blk_end_request_all(rq, error);
184 break;
185
186 default:
187 BUG();
188 }
189
190 return blk_kick_flush(q) | queued;
79} 191}
80 192
81static void flush_data_end_io(struct request *rq, int error) 193static void flush_end_io(struct request *flush_rq, int error)
82{ 194{
83 elv_completed_request(rq->q, rq); 195 struct request_queue *q = flush_rq->q;
84 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error); 196 struct list_head *running = &q->flush_queue[q->flush_running_idx];
197 bool was_empty = elv_queue_empty(q);
198 bool queued = false;
199 struct request *rq, *n;
200
201 BUG_ON(q->flush_pending_idx == q->flush_running_idx);
202
203 /* account completion of the flush request */
204 q->flush_running_idx ^= 1;
205 elv_completed_request(q, flush_rq);
206
207 /* and push the waiting requests to the next stage */
208 list_for_each_entry_safe(rq, n, running, flush.list) {
209 unsigned int seq = blk_flush_cur_seq(rq);
210
211 BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
212 queued |= blk_flush_complete_seq(rq, seq, error);
213 }
214
215 /* after populating an empty queue, kick it to avoid stall */
216 if (queued && was_empty)
217 __blk_run_queue(q);
85} 218}
86 219
87static void post_flush_end_io(struct request *rq, int error) 220/**
221 * blk_kick_flush - consider issuing flush request
222 * @q: request_queue being kicked
223 *
224 * Flush related states of @q have changed, consider issuing flush request.
225 * Please read the comment at the top of this file for more info.
226 *
227 * CONTEXT:
228 * spin_lock_irq(q->queue_lock)
229 *
230 * RETURNS:
231 * %true if flush was issued, %false otherwise.
232 */
233static bool blk_kick_flush(struct request_queue *q)
88{ 234{
89 elv_completed_request(rq->q, rq); 235 struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
90 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error); 236 struct request *first_rq =
237 list_first_entry(pending, struct request, flush.list);
238
239 /* C1 described at the top of this file */
240 if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending))
241 return false;
242
243 /* C2 and C3 */
244 if (!list_empty(&q->flush_data_in_flight) &&
245 time_before(jiffies,
246 q->flush_pending_since + FLUSH_PENDING_TIMEOUT))
247 return false;
248
249 /*
250 * Issue flush and toggle pending_idx. This makes pending_idx
251 * different from running_idx, which means flush is in flight.
252 */
253 blk_rq_init(q, &q->flush_rq);
254 q->flush_rq.cmd_type = REQ_TYPE_FS;
255 q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
256 q->flush_rq.rq_disk = first_rq->rq_disk;
257 q->flush_rq.end_io = flush_end_io;
258
259 q->flush_pending_idx ^= 1;
260 elv_insert(q, &q->flush_rq, ELEVATOR_INSERT_FRONT);
261 return true;
91} 262}
92 263
93static void init_flush_request(struct request *rq, struct gendisk *disk) 264static void flush_data_end_io(struct request *rq, int error)
94{ 265{
95 rq->cmd_type = REQ_TYPE_FS; 266 struct request_queue *q = rq->q;
96 rq->cmd_flags = WRITE_FLUSH; 267 bool was_empty = elv_queue_empty(q);
97 rq->rq_disk = disk; 268
269 /* after populating an empty queue, kick it to avoid stall */
270 if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error) && was_empty)
271 __blk_run_queue(q);
98} 272}
99 273
100static struct request *queue_next_fseq(struct request_queue *q) 274/**
275 * blk_insert_flush - insert a new FLUSH/FUA request
276 * @rq: request to insert
277 *
278 * To be called from elv_insert() for %ELEVATOR_INSERT_FLUSH insertions.
279 * @rq is being submitted. Analyze what needs to be done and put it on the
280 * right queue.
281 *
282 * CONTEXT:
283 * spin_lock_irq(q->queue_lock)
284 */
285void blk_insert_flush(struct request *rq)
101{ 286{
102 struct request *orig_rq = q->orig_flush_rq; 287 struct request_queue *q = rq->q;
103 struct request *rq = &q->flush_rq; 288 unsigned int fflags = q->flush_flags; /* may change, cache */
289 unsigned int policy = blk_flush_policy(fflags, rq);
104 290
105 blk_rq_init(q, rq); 291 BUG_ON(rq->end_io);
292 BUG_ON(!rq->bio || rq->bio != rq->biotail);
106 293
107 switch (blk_flush_cur_seq(q)) { 294 /*
108 case QUEUE_FSEQ_PREFLUSH: 295 * @policy now records what operations need to be done. Adjust
109 init_flush_request(rq, orig_rq->rq_disk); 296 * REQ_FLUSH and FUA for the driver.
110 rq->end_io = pre_flush_end_io; 297 */
111 break; 298 rq->cmd_flags &= ~REQ_FLUSH;
112 case QUEUE_FSEQ_DATA: 299 if (!(fflags & REQ_FUA))
113 init_request_from_bio(rq, orig_rq->bio); 300 rq->cmd_flags &= ~REQ_FUA;
114 /* 301
115 * orig_rq->rq_disk may be different from 302 /*
116 * bio->bi_bdev->bd_disk if orig_rq got here through 303 * If there's data but flush is not necessary, the request can be
117 * remapping drivers. Make sure rq->rq_disk points 304 * processed directly without going through flush machinery. Queue
118 * to the same one as orig_rq. 305 * for normal execution.
119 */ 306 */
120 rq->rq_disk = orig_rq->rq_disk; 307 if ((policy & REQ_FSEQ_DATA) &&
121 rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA); 308 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
122 rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA); 309 list_add(&rq->queuelist, &q->queue_head);
123 rq->end_io = flush_data_end_io; 310 return;
124 break;
125 case QUEUE_FSEQ_POSTFLUSH:
126 init_flush_request(rq, orig_rq->rq_disk);
127 rq->end_io = post_flush_end_io;
128 break;
129 default:
130 BUG();
131 } 311 }
132 312
313 /*
314 * @rq should go through flush machinery. Mark it part of flush
315 * sequence and submit for further processing.
316 */
317 memset(&rq->flush, 0, sizeof(rq->flush));
318 INIT_LIST_HEAD(&rq->flush.list);
133 rq->cmd_flags |= REQ_FLUSH_SEQ; 319 rq->cmd_flags |= REQ_FLUSH_SEQ;
134 elv_insert(q, rq, ELEVATOR_INSERT_FRONT); 320 rq->end_io = flush_data_end_io;
135 return rq; 321
322 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
136} 323}
137 324
138struct request *blk_do_flush(struct request_queue *q, struct request *rq) 325/**
326 * blk_abort_flushes - @q is being aborted, abort flush requests
327 * @q: request_queue being aborted
328 *
329 * To be called from elv_abort_queue(). @q is being aborted. Prepare all
330 * FLUSH/FUA requests for abortion.
331 *
332 * CONTEXT:
333 * spin_lock_irq(q->queue_lock)
334 */
335void blk_abort_flushes(struct request_queue *q)
139{ 336{
140 unsigned int fflags = q->flush_flags; /* may change, cache it */ 337 struct request *rq, *n;
141 bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA; 338 int i;
142 bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH);
143 bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA);
144 unsigned skip = 0;
145 339
146 /* 340 /*
147 * Special case. If there's data but flush is not necessary, 341 * Requests in flight for data are already owned by the dispatch
148 * the request can be issued directly. 342 * queue or the device driver. Just restore for normal completion.
149 *
150 * Flush w/o data should be able to be issued directly too but
151 * currently some drivers assume that rq->bio contains
152 * non-zero data if it isn't NULL and empty FLUSH requests
153 * getting here usually have bio's without data.
154 */ 343 */
155 if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) { 344 list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) {
156 rq->cmd_flags &= ~REQ_FLUSH; 345 list_del_init(&rq->flush.list);
157 if (!has_fua) 346 blk_flush_restore_request(rq);
158 rq->cmd_flags &= ~REQ_FUA;
159 return rq;
160 } 347 }
161 348
162 /* 349 /*
163 * Sequenced flushes can't be processed in parallel. If 350 * We need to give away requests on flush queues. Restore for
164 * another one is already in progress, queue for later 351 * normal completion and put them on the dispatch queue.
165 * processing.
166 */ 352 */
167 if (q->flush_seq) { 353 for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) {
168 list_move_tail(&rq->queuelist, &q->pending_flushes); 354 list_for_each_entry_safe(rq, n, &q->flush_queue[i],
169 return NULL; 355 flush.list) {
356 list_del_init(&rq->flush.list);
357 blk_flush_restore_request(rq);
358 list_add_tail(&rq->queuelist, &q->queue_head);
359 }
170 } 360 }
171
172 /*
173 * Start a new flush sequence
174 */
175 q->flush_err = 0;
176 q->flush_seq |= QUEUE_FSEQ_STARTED;
177
178 /* adjust FLUSH/FUA of the original request and stash it away */
179 rq->cmd_flags &= ~REQ_FLUSH;
180 if (!has_fua)
181 rq->cmd_flags &= ~REQ_FUA;
182 blk_dequeue_request(rq);
183 q->orig_flush_rq = rq;
184
185 /* skip unneded sequences and return the first one */
186 if (!do_preflush)
187 skip |= QUEUE_FSEQ_PREFLUSH;
188 if (!blk_rq_sectors(rq))
189 skip |= QUEUE_FSEQ_DATA;
190 if (!do_postflush)
191 skip |= QUEUE_FSEQ_POSTFLUSH;
192 return blk_flush_complete_seq(q, skip, 0);
193} 361}
194 362
195static void bio_end_flush(struct bio *bio, int err) 363static void bio_end_flush(struct bio *bio, int err)
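The double-buffered sequencing that the blk-flush.c header comment describes (two pending lists, a pending/running index pair, and issue conditions C1-C3) can be reduced to a small standalone model. The following is a toy sketch, not kernel code: the struct, the counters standing in for the lists and the fake clock are simplifications, but the index toggling and the C1-C3 checks follow blk_kick_flush() and flush_end_io() above.

/*
 * Toy model of the double-buffered flush machinery: two pending buckets
 * indexed by pending_idx/running_idx and the C1-C3 issue conditions.
 */
#include <stdbool.h>
#include <stdio.h>

#define FLUSH_PENDING_TIMEOUT	5	/* arbitrary units for the fake clock */

struct flushq {
	int pending[2];			/* requests waiting for PRE/POSTFLUSH */
	unsigned int pending_idx:1;
	unsigned int running_idx:1;
	int data_in_flight;		/* requests currently executing DATA */
	long pending_since, now;	/* fake clock */
};

/* decide whether to issue the next flush (cf. blk_kick_flush) */
static bool kick_flush(struct flushq *q)
{
	/* C1: only one flush may be in flight at any time */
	if (q->pending_idx != q->running_idx || !q->pending[q->pending_idx])
		return false;

	/* C2 + C3: defer while data is in flight, unless we waited too long */
	if (q->data_in_flight &&
	    q->now < q->pending_since + FLUSH_PENDING_TIMEOUT)
		return false;

	/* issue: toggling pending_idx marks a flush as running */
	q->pending_idx ^= 1;
	return true;
}

/* the flush completed: everything that was pending advances together */
static int flush_done(struct flushq *q)
{
	int merged = q->pending[q->running_idx];

	q->pending[q->running_idx] = 0;
	q->running_idx ^= 1;		/* back in sync with pending_idx */
	return merged;
}

int main(void)
{
	struct flushq q = { .pending = { 3, 0 } };	/* 3 queued PREFLUSHes */

	if (kick_flush(&q))
		printf("one flush issued, it retires %d requests\n",
		       flush_done(&q));
	return 0;
}

The point of the toggle is that a single cache flush retires every request that was parked on the old pending list, which is exactly the merging behaviour the commit message is after.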
diff --git a/block/blk.h b/block/blk.h
index 9d2ee8f4d9af..284b500852bd 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,21 +51,17 @@ static inline void blk_clear_rq_complete(struct request *rq)
51 */ 51 */
52#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) 52#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
53 53
54struct request *blk_do_flush(struct request_queue *q, struct request *rq); 54void blk_insert_flush(struct request *rq);
55void blk_abort_flushes(struct request_queue *q);
55 56
56static inline struct request *__elv_next_request(struct request_queue *q) 57static inline struct request *__elv_next_request(struct request_queue *q)
57{ 58{
58 struct request *rq; 59 struct request *rq;
59 60
60 while (1) { 61 while (1) {
61 while (!list_empty(&q->queue_head)) { 62 if (!list_empty(&q->queue_head)) {
62 rq = list_entry_rq(q->queue_head.next); 63 rq = list_entry_rq(q->queue_head.next);
63 if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) || 64 return rq;
64 (rq->cmd_flags & REQ_FLUSH_SEQ))
65 return rq;
66 rq = blk_do_flush(q, rq);
67 if (rq)
68 return rq;
69 } 65 }
70 66
71 if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) 67 if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
diff --git a/block/elevator.c b/block/elevator.c
index 2569512830d3..270e0972eb9f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -673,6 +673,11 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
673 q->elevator->ops->elevator_add_req_fn(q, rq); 673 q->elevator->ops->elevator_add_req_fn(q, rq);
674 break; 674 break;
675 675
676 case ELEVATOR_INSERT_FLUSH:
677 rq->cmd_flags |= REQ_SOFTBARRIER;
678 blk_insert_flush(rq);
679 break;
680
676 default: 681 default:
677 printk(KERN_ERR "%s: bad insertion point %d\n", 682 printk(KERN_ERR "%s: bad insertion point %d\n",
678 __func__, where); 683 __func__, where);
@@ -785,6 +790,8 @@ void elv_abort_queue(struct request_queue *q)
785{ 790{
786 struct request *rq; 791 struct request *rq;
787 792
793 blk_abort_flushes(q);
794
788 while (!list_empty(&q->queue_head)) { 795 while (!list_empty(&q->queue_head)) {
789 rq = list_entry_rq(q->queue_head.next); 796 rq = list_entry_rq(q->queue_head.next);
790 rq->cmd_flags |= REQ_QUIET; 797 rq->cmd_flags |= REQ_QUIET;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 36ab42c9bb99..6d7e9afd08c3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -99,13 +99,18 @@ struct request {
99 /* 99 /*
100 * The rb_node is only used inside the io scheduler, requests 100 * The rb_node is only used inside the io scheduler, requests
101 * are pruned when moved to the dispatch queue. So let the 101 * are pruned when moved to the dispatch queue. So let the
102 * completion_data share space with the rb_node. 102 * flush fields share space with the rb_node.
103 */ 103 */
104 union { 104 union {
105 struct rb_node rb_node; /* sort/lookup */ 105 struct rb_node rb_node; /* sort/lookup */
106 void *completion_data; 106 struct {
107 unsigned int seq;
108 struct list_head list;
109 } flush;
107 }; 110 };
108 111
112 void *completion_data;
113
109 /* 114 /*
110 * Three pointers are available for the IO schedulers, if they need 115 * Three pointers are available for the IO schedulers, if they need
111 * more they have to dynamically allocate it. 116 * more they have to dynamically allocate it.
@@ -362,11 +367,12 @@ struct request_queue
362 * for flush operations 367 * for flush operations
363 */ 368 */
364 unsigned int flush_flags; 369 unsigned int flush_flags;
365 unsigned int flush_seq; 370 unsigned int flush_pending_idx:1;
366 int flush_err; 371 unsigned int flush_running_idx:1;
372 unsigned long flush_pending_since;
373 struct list_head flush_queue[2];
374 struct list_head flush_data_in_flight;
367 struct request flush_rq; 375 struct request flush_rq;
368 struct request *orig_flush_rq;
369 struct list_head pending_flushes;
370 376
371 struct mutex sysfs_lock; 377 struct mutex sysfs_lock;
372 378
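A quick way to see why the blkdev.h change costs only one pointer, as the commit message states: struct rb_node is three words wide, which already has room for the new flush bookkeeping, so overlaying them in the union is free and only completion_data leaving the union grows struct request. A minimal sketch with simplified stand-in types (not the kernel headers):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

struct rb_node {
	unsigned long rb_parent_color;	/* parent pointer + colour bit */
	struct rb_node *rb_right, *rb_left;
};

struct flush_fields {
	unsigned int seq;		/* REQ_FSEQ_* progress mask */
	struct list_head list;		/* entry on flush_queue[] lists */
};

union shared_space {
	struct rb_node rb_node;
	struct flush_fields flush;
};

int main(void)
{
	printf("rb_node=%zu flush=%zu union=%zu bytes\n",
	       sizeof(struct rb_node), sizeof(struct flush_fields),
	       sizeof(union shared_space));
	return 0;
}

On a typical LP64 build all three print as 24 bytes, so the union itself does not grow struct request.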
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 4fd978e7eb83..86120c916fcc 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -167,6 +167,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
167#define ELEVATOR_INSERT_BACK 2 167#define ELEVATOR_INSERT_BACK 2
168#define ELEVATOR_INSERT_SORT 3 168#define ELEVATOR_INSERT_SORT 3
169#define ELEVATOR_INSERT_REQUEUE 4 169#define ELEVATOR_INSERT_REQUEUE 4
170#define ELEVATOR_INSERT_FLUSH 5
170 171
171/* 172/*
172 * return values from elevator_may_queue_fn 173 * return values from elevator_may_queue_fn