-rw-r--r--  block/blk-core.c          |  10
-rw-r--r--  block/blk-flush.c         | 440
-rw-r--r--  block/blk.h               |  12
-rw-r--r--  block/elevator.c          |   7
-rw-r--r--  include/linux/blkdev.h    |  18
-rw-r--r--  include/linux/elevator.h  |   1
6 files changed, 332 insertions(+), 156 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 617bb9e40927..05746093b45e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -134,8 +134,6 @@ EXPORT_SYMBOL(blk_rq_init);
 static void req_bio_endio(struct request *rq, struct bio *bio,
                           unsigned int nbytes, int error)
 {
-        struct request_queue *q = rq->q;
-
         if (error)
                 clear_bit(BIO_UPTODATE, &bio->bi_flags);
         else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
@@ -159,8 +157,6 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
         /* don't actually finish bio if it's part of flush sequence */
         if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
                 bio_endio(bio, error);
-        else if (error && !q->flush_err)
-                q->flush_err = error;
 }
 
 void blk_dump_rq_flags(struct request *rq, char *msg)
@@ -519,7 +515,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
         init_timer(&q->unplug_timer);
         setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
         INIT_LIST_HEAD(&q->timeout_list);
-        INIT_LIST_HEAD(&q->pending_flushes);
+        INIT_LIST_HEAD(&q->flush_queue[0]);
+        INIT_LIST_HEAD(&q->flush_queue[1]);
+        INIT_LIST_HEAD(&q->flush_data_in_flight);
         INIT_WORK(&q->unplug_work, blk_unplug_work);
 
         kobject_init(&q->kobj, &blk_queue_ktype);
@@ -1198,7 +1196,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
         spin_lock_irq(q->queue_lock);
 
         if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
-                where = ELEVATOR_INSERT_FRONT;
+                where = ELEVATOR_INSERT_FLUSH;
                 goto get_rq;
         }
 
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 8592869bcbe7..a867e3f524f3 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -1,6 +1,69 @@
 /*
  * Functions to sequence FLUSH and FUA writes.
+ *
+ * Copyright (C) 2011  Max Planck Institute for Gravitational Physics
+ * Copyright (C) 2011  Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * REQ_{FLUSH|FUA} requests are decomposed to sequences consisted of three
+ * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request
+ * properties and hardware capability.
+ *
+ * If a request doesn't have data, only REQ_FLUSH makes sense, which
+ * indicates a simple flush request.  If there is data, REQ_FLUSH indicates
+ * that the device cache should be flushed before the data is executed, and
+ * REQ_FUA means that the data must be on non-volatile media on request
+ * completion.
+ *
+ * If the device doesn't have writeback cache, FLUSH and FUA don't make any
+ * difference.  The requests are either completed immediately if there's no
+ * data or executed as normal requests otherwise.
+ *
+ * If the device has writeback cache and supports FUA, REQ_FLUSH is
+ * translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
+ *
+ * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is
+ * translated to PREFLUSH and REQ_FUA to POSTFLUSH.
+ *
+ * The actual execution of flush is double buffered.  Whenever a request
+ * needs to execute PRE or POSTFLUSH, it queues at
+ * q->flush_queue[q->flush_pending_idx].  Once certain criteria are met, a
+ * flush is issued and the pending_idx is toggled.  When the flush
+ * completes, all the requests which were pending are proceeded to the next
+ * step.  This allows arbitrary merging of different types of FLUSH/FUA
+ * requests.
+ *
+ * Currently, the following conditions are used to determine when to issue
+ * flush.
+ *
+ * C1. At any given time, only one flush shall be in progress.  This makes
+ *     double buffering sufficient.
+ *
+ * C2. Flush is deferred if any request is executing DATA of its sequence.
+ *     This avoids issuing separate POSTFLUSHes for requests which shared
+ *     PREFLUSH.
+ *
+ * C3. The second condition is ignored if there is a request which has
+ *     waited longer than FLUSH_PENDING_TIMEOUT.  This is to avoid
+ *     starvation in the unlikely case where there are continuous stream of
+ *     FUA (without FLUSH) requests.
+ *
+ * For devices which support FUA, it isn't clear whether C2 (and thus C3)
+ * is beneficial.
+ *
+ * Note that a sequenced FLUSH/FUA request with DATA is completed twice.
+ * Once while executing DATA and again after the whole sequence is
+ * complete.  The first completion updates the contained bio but doesn't
+ * finish it so that the bio submitter is notified only after the whole
+ * sequence is complete.  This is implemented by testing REQ_FLUSH_SEQ in
+ * req_bio_endio().
+ *
+ * The above peculiarity requires that each FLUSH/FUA request has only one
+ * bio attached to it, which is guaranteed as they aren't allowed to be
+ * merged in the usual way.
  */
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/bio.h>
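The header comment above describes how a FLUSH/FUA request is decomposed into PREFLUSH, DATA and POSTFLUSH steps depending on what the device supports. Below is a minimal userspace sketch of that mapping, mirroring the blk_flush_policy() helper introduced in the next hunk; the flag names and values here are illustrative stand-ins, not the kernel's definitions.

/* stand-in flags; REQ_FLUSH/REQ_FUA double as queue capability bits in the kernel */
#include <stdio.h>

enum { PREFLUSH = 1 << 0, DATA = 1 << 1, POSTFLUSH = 1 << 2 };
enum { FLUSH = 1 << 0, FUA = 1 << 1 };

/* which sequence steps does this request need? (cf. blk_flush_policy) */
static unsigned policy(unsigned queue_caps, unsigned rq_flags, int has_data)
{
        unsigned p = 0;

        if (queue_caps & FLUSH) {               /* device has a writeback cache */
                if (rq_flags & FLUSH)
                        p |= PREFLUSH;
                if (has_data)
                        p |= DATA;
                if (!(queue_caps & FUA) && (rq_flags & FUA))
                        p |= POSTFLUSH;         /* emulate FUA with a post-flush */
        }
        return p;
}

int main(void)
{
        printf("%x\n", policy(FLUSH, FUA, 1));              /* DATA | POSTFLUSH */
        printf("%x\n", policy(FLUSH | FUA, FLUSH | FUA, 1)); /* PREFLUSH | DATA, FUA rides on DATA */
        printf("%x\n", policy(0, FLUSH | FUA, 1));           /* 0: no writeback cache, nothing to sequence */
        return 0;
}

In blk_insert_flush() below, this mask drives everything else: a pure DATA policy bypasses the flush machinery entirely, and the steps a request does not need are skipped by seeding its sequence with REQ_FSEQ_ACTIONS & ~policy.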
@@ -11,185 +74,290 @@
 
 /* FLUSH/FUA sequences */
 enum {
-        QUEUE_FSEQ_STARTED      = (1 << 0), /* flushing in progress */
-        QUEUE_FSEQ_PREFLUSH     = (1 << 1), /* pre-flushing in progress */
-        QUEUE_FSEQ_DATA         = (1 << 2), /* data write in progress */
-        QUEUE_FSEQ_POSTFLUSH    = (1 << 3), /* post-flushing in progress */
-        QUEUE_FSEQ_DONE         = (1 << 4),
+        REQ_FSEQ_PREFLUSH       = (1 << 0), /* pre-flushing in progress */
+        REQ_FSEQ_DATA           = (1 << 1), /* data write in progress */
+        REQ_FSEQ_POSTFLUSH      = (1 << 2), /* post-flushing in progress */
+        REQ_FSEQ_DONE           = (1 << 3),
+
+        REQ_FSEQ_ACTIONS        = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
+                                  REQ_FSEQ_POSTFLUSH,
+
+        /*
+         * If flush has been pending longer than the following timeout,
+         * it's issued even if flush_data requests are still in flight.
+         */
+        FLUSH_PENDING_TIMEOUT   = 5 * HZ,
 };
 
-static struct request *queue_next_fseq(struct request_queue *q);
+static bool blk_kick_flush(struct request_queue *q);
 
-unsigned blk_flush_cur_seq(struct request_queue *q)
+static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
 {
-        if (!q->flush_seq)
-                return 0;
-        return 1 << ffz(q->flush_seq);
+        unsigned int policy = 0;
+
+        if (fflags & REQ_FLUSH) {
+                if (rq->cmd_flags & REQ_FLUSH)
+                        policy |= REQ_FSEQ_PREFLUSH;
+                if (blk_rq_sectors(rq))
+                        policy |= REQ_FSEQ_DATA;
+                if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
+                        policy |= REQ_FSEQ_POSTFLUSH;
+        }
+        return policy;
 }
 
-static struct request *blk_flush_complete_seq(struct request_queue *q,
-                                              unsigned seq, int error)
+static unsigned int blk_flush_cur_seq(struct request *rq)
 {
-        struct request *next_rq = NULL;
-
-        if (error && !q->flush_err)
-                q->flush_err = error;
-
-        BUG_ON(q->flush_seq & seq);
-        q->flush_seq |= seq;
-
-        if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) {
-                /* not complete yet, queue the next flush sequence */
-                next_rq = queue_next_fseq(q);
-        } else {
-                /* complete this flush request */
-                __blk_end_request_all(q->orig_flush_rq, q->flush_err);
-                q->orig_flush_rq = NULL;
-                q->flush_seq = 0;
-
-                /* dispatch the next flush if there's one */
-                if (!list_empty(&q->pending_flushes)) {
-                        next_rq = list_entry_rq(q->pending_flushes.next);
-                        list_move(&next_rq->queuelist, &q->queue_head);
-                }
-        }
-        return next_rq;
+        return 1 << ffz(rq->flush.seq);
 }
 
-static void blk_flush_complete_seq_end_io(struct request_queue *q,
-                                          unsigned seq, int error)
+static void blk_flush_restore_request(struct request *rq)
 {
-        bool was_empty = elv_queue_empty(q);
-        struct request *next_rq;
-
-        next_rq = blk_flush_complete_seq(q, seq, error);
-
         /*
-         * Moving a request silently to empty queue_head may stall the
-         * queue.  Kick the queue in those cases.
+         * After flush data completion, @rq->bio is %NULL but we need to
+         * complete the bio again.  @rq->biotail is guaranteed to equal the
+         * original @rq->bio.  Restore it.
          */
-        if (was_empty && next_rq)
-                __blk_run_queue(q);
+        rq->bio = rq->biotail;
+
+        /* make @rq a normal request */
+        rq->cmd_flags &= ~REQ_FLUSH_SEQ;
+        rq->end_io = NULL;
 }
 
-static void pre_flush_end_io(struct request *rq, int error)
+/**
+ * blk_flush_complete_seq - complete flush sequence
+ * @rq: FLUSH/FUA request being sequenced
+ * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
+ * @error: whether an error occurred
+ *
+ * @rq just completed @seq part of its flush sequence, record the
+ * completion and trigger the next step.
+ *
+ * CONTEXT:
+ * spin_lock_irq(q->queue_lock)
+ *
+ * RETURNS:
+ * %true if requests were added to the dispatch queue, %false otherwise.
+ */
+static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
+                                   int error)
 {
-        elv_completed_request(rq->q, rq);
-        blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error);
+        struct request_queue *q = rq->q;
+        struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
+        bool queued = false;
+
+        BUG_ON(rq->flush.seq & seq);
+        rq->flush.seq |= seq;
+
+        if (likely(!error))
+                seq = blk_flush_cur_seq(rq);
+        else
+                seq = REQ_FSEQ_DONE;
+
+        switch (seq) {
+        case REQ_FSEQ_PREFLUSH:
+        case REQ_FSEQ_POSTFLUSH:
+                /* queue for flush */
+                if (list_empty(pending))
+                        q->flush_pending_since = jiffies;
+                list_move_tail(&rq->flush.list, pending);
+                break;
+
+        case REQ_FSEQ_DATA:
+                list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
+                list_add(&rq->queuelist, &q->queue_head);
+                queued = true;
+                break;
+
+        case REQ_FSEQ_DONE:
+                /*
+                 * @rq was previously adjusted by blk_flush_issue() for
+                 * flush sequencing and may already have gone through the
+                 * flush data request completion path.  Restore @rq for
+                 * normal completion and end it.
+                 */
+                BUG_ON(!list_empty(&rq->queuelist));
+                list_del_init(&rq->flush.list);
+                blk_flush_restore_request(rq);
+                __blk_end_request_all(rq, error);
+                break;
+
+        default:
+                BUG();
+        }
+
+        return blk_kick_flush(q) | queued;
 }
 
-static void flush_data_end_io(struct request *rq, int error)
+static void flush_end_io(struct request *flush_rq, int error)
 {
-        elv_completed_request(rq->q, rq);
-        blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error);
+        struct request_queue *q = flush_rq->q;
+        struct list_head *running = &q->flush_queue[q->flush_running_idx];
+        bool was_empty = elv_queue_empty(q);
+        bool queued = false;
+        struct request *rq, *n;
+
+        BUG_ON(q->flush_pending_idx == q->flush_running_idx);
+
+        /* account completion of the flush request */
+        q->flush_running_idx ^= 1;
+        elv_completed_request(q, flush_rq);
+
+        /* and push the waiting requests to the next stage */
+        list_for_each_entry_safe(rq, n, running, flush.list) {
+                unsigned int seq = blk_flush_cur_seq(rq);
+
+                BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
+                queued |= blk_flush_complete_seq(rq, seq, error);
+        }
+
+        /* after populating an empty queue, kick it to avoid stall */
+        if (queued && was_empty)
+                __blk_run_queue(q);
 }
 
-static void post_flush_end_io(struct request *rq, int error)
+/**
+ * blk_kick_flush - consider issuing flush request
+ * @q: request_queue being kicked
+ *
+ * Flush related states of @q have changed, consider issuing flush request.
+ * Please read the comment at the top of this file for more info.
+ *
+ * CONTEXT:
+ * spin_lock_irq(q->queue_lock)
+ *
+ * RETURNS:
+ * %true if flush was issued, %false otherwise.
+ */
+static bool blk_kick_flush(struct request_queue *q)
 {
-        elv_completed_request(rq->q, rq);
-        blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error);
+        struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
+        struct request *first_rq =
+                list_first_entry(pending, struct request, flush.list);
+
+        /* C1 described at the top of this file */
+        if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending))
+                return false;
+
+        /* C2 and C3 */
+        if (!list_empty(&q->flush_data_in_flight) &&
+            time_before(jiffies,
+                        q->flush_pending_since + FLUSH_PENDING_TIMEOUT))
+                return false;
+
+        /*
+         * Issue flush and toggle pending_idx.  This makes pending_idx
+         * different from running_idx, which means flush is in flight.
+         */
+        blk_rq_init(q, &q->flush_rq);
+        q->flush_rq.cmd_type = REQ_TYPE_FS;
+        q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
+        q->flush_rq.rq_disk = first_rq->rq_disk;
+        q->flush_rq.end_io = flush_end_io;
+
+        q->flush_pending_idx ^= 1;
+        elv_insert(q, &q->flush_rq, ELEVATOR_INSERT_FRONT);
+        return true;
 }
 
-static void init_flush_request(struct request *rq, struct gendisk *disk)
+static void flush_data_end_io(struct request *rq, int error)
 {
-        rq->cmd_type = REQ_TYPE_FS;
-        rq->cmd_flags = WRITE_FLUSH;
-        rq->rq_disk = disk;
+        struct request_queue *q = rq->q;
+        bool was_empty = elv_queue_empty(q);
+
+        /* after populating an empty queue, kick it to avoid stall */
+        if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error) && was_empty)
+                __blk_run_queue(q);
 }
 
-static struct request *queue_next_fseq(struct request_queue *q)
+/**
+ * blk_insert_flush - insert a new FLUSH/FUA request
+ * @rq: request to insert
+ *
+ * To be called from elv_insert() for %ELEVATOR_INSERT_FLUSH insertions.
+ * @rq is being submitted.  Analyze what needs to be done and put it on the
+ * right queue.
+ *
+ * CONTEXT:
+ * spin_lock_irq(q->queue_lock)
+ */
+void blk_insert_flush(struct request *rq)
 {
-        struct request *orig_rq = q->orig_flush_rq;
-        struct request *rq = &q->flush_rq;
+        struct request_queue *q = rq->q;
+        unsigned int fflags = q->flush_flags;   /* may change, cache */
+        unsigned int policy = blk_flush_policy(fflags, rq);
 
-        blk_rq_init(q, rq);
+        BUG_ON(rq->end_io);
+        BUG_ON(!rq->bio || rq->bio != rq->biotail);
 
-        switch (blk_flush_cur_seq(q)) {
-        case QUEUE_FSEQ_PREFLUSH:
-                init_flush_request(rq, orig_rq->rq_disk);
-                rq->end_io = pre_flush_end_io;
-                break;
-        case QUEUE_FSEQ_DATA:
-                init_request_from_bio(rq, orig_rq->bio);
-                /*
-                 * orig_rq->rq_disk may be different from
-                 * bio->bi_bdev->bd_disk if orig_rq got here through
-                 * remapping drivers.  Make sure rq->rq_disk points
-                 * to the same one as orig_rq.
-                 */
-                rq->rq_disk = orig_rq->rq_disk;
-                rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA);
-                rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA);
-                rq->end_io = flush_data_end_io;
-                break;
-        case QUEUE_FSEQ_POSTFLUSH:
-                init_flush_request(rq, orig_rq->rq_disk);
-                rq->end_io = post_flush_end_io;
-                break;
-        default:
-                BUG();
+        /*
+         * @policy now records what operations need to be done.  Adjust
+         * REQ_FLUSH and FUA for the driver.
+         */
+        rq->cmd_flags &= ~REQ_FLUSH;
+        if (!(fflags & REQ_FUA))
+                rq->cmd_flags &= ~REQ_FUA;
+
+        /*
+         * If there's data but flush is not necessary, the request can be
+         * processed directly without going through flush machinery.  Queue
+         * for normal execution.
+         */
+        if ((policy & REQ_FSEQ_DATA) &&
+            !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
+                list_add(&rq->queuelist, &q->queue_head);
+                return;
         }
 
+        /*
+         * @rq should go through flush machinery.  Mark it part of flush
+         * sequence and submit for further processing.
+         */
+        memset(&rq->flush, 0, sizeof(rq->flush));
+        INIT_LIST_HEAD(&rq->flush.list);
         rq->cmd_flags |= REQ_FLUSH_SEQ;
-        elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
-        return rq;
+        rq->end_io = flush_data_end_io;
+
+        blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
 }
 
-struct request *blk_do_flush(struct request_queue *q, struct request *rq)
+/**
+ * blk_abort_flushes - @q is being aborted, abort flush requests
+ * @q: request_queue being aborted
+ *
+ * To be called from elv_abort_queue().  @q is being aborted.  Prepare all
+ * FLUSH/FUA requests for abortion.
+ *
+ * CONTEXT:
+ * spin_lock_irq(q->queue_lock)
+ */
+void blk_abort_flushes(struct request_queue *q)
 {
-        unsigned int fflags = q->flush_flags; /* may change, cache it */
-        bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA;
-        bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH);
-        bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA);
-        unsigned skip = 0;
+        struct request *rq, *n;
+        int i;
 
         /*
-         * Special case.  If there's data but flush is not necessary,
-         * the request can be issued directly.
-         *
-         * Flush w/o data should be able to be issued directly too but
-         * currently some drivers assume that rq->bio contains
-         * non-zero data if it isn't NULL and empty FLUSH requests
-         * getting here usually have bio's without data.
+         * Requests in flight for data are already owned by the dispatch
+         * queue or the device driver.  Just restore for normal completion.
          */
-        if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) {
-                rq->cmd_flags &= ~REQ_FLUSH;
-                if (!has_fua)
-                        rq->cmd_flags &= ~REQ_FUA;
-                return rq;
+        list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) {
+                list_del_init(&rq->flush.list);
+                blk_flush_restore_request(rq);
         }
 
         /*
-         * Sequenced flushes can't be processed in parallel.  If
-         * another one is already in progress, queue for later
-         * processing.
+         * We need to give away requests on flush queues.  Restore for
+         * normal completion and put them on the dispatch queue.
          */
-        if (q->flush_seq) {
-                list_move_tail(&rq->queuelist, &q->pending_flushes);
-                return NULL;
+        for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) {
+                list_for_each_entry_safe(rq, n, &q->flush_queue[i],
+                                         flush.list) {
+                        list_del_init(&rq->flush.list);
+                        blk_flush_restore_request(rq);
+                        list_add_tail(&rq->queuelist, &q->queue_head);
+                }
         }
-
-        /*
-         * Start a new flush sequence
-         */
-        q->flush_err = 0;
-        q->flush_seq |= QUEUE_FSEQ_STARTED;
-
-        /* adjust FLUSH/FUA of the original request and stash it away */
-        rq->cmd_flags &= ~REQ_FLUSH;
-        if (!has_fua)
-                rq->cmd_flags &= ~REQ_FUA;
-        blk_dequeue_request(rq);
-        q->orig_flush_rq = rq;
-
-        /* skip unneded sequences and return the first one */
-        if (!do_preflush)
-                skip |= QUEUE_FSEQ_PREFLUSH;
-        if (!blk_rq_sectors(rq))
-                skip |= QUEUE_FSEQ_DATA;
-        if (!do_postflush)
-                skip |= QUEUE_FSEQ_POSTFLUSH;
-        return blk_flush_complete_seq(q, skip, 0);
 }
 
 static void bio_end_flush(struct bio *bio, int err)
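blk_kick_flush() above encodes conditions C1-C3 from the header comment against the two flush_queue[] lists. Here is a toy userspace model of that double-buffering and the issue decision; the struct, names and timeout constant are illustrative only, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

#define PENDING_TIMEOUT 5       /* stands in for FLUSH_PENDING_TIMEOUT */

struct flush_state {
        int pending_idx;        /* where new PRE/POSTFLUSH waiters queue up */
        int running_idx;        /* waiters covered by the flush in flight */
        int queued[2];          /* number of waiters on each list */
        int data_in_flight;     /* requests currently executing their DATA step */
        long pending_since;     /* when the oldest waiter arrived */
};

/* mirrors the checks in blk_kick_flush(): may we issue a flush now? */
static bool should_kick(const struct flush_state *s, long now)
{
        if (s->pending_idx != s->running_idx)           /* C1: one flush at a time */
                return false;
        if (!s->queued[s->pending_idx])                 /* nobody is waiting */
                return false;
        if (s->data_in_flight &&                        /* C2: let DATA finish ... */
            now < s->pending_since + PENDING_TIMEOUT)   /* C3: ... unless we waited too long */
                return false;
        return true;
}

int main(void)
{
        struct flush_state s = { .queued = {2, 0}, .data_in_flight = 1, .pending_since = 0 };

        printf("t=1: %d\n", should_kick(&s, 1));          /* 0: deferred behind DATA (C2) */
        printf("t=6: %d\n", should_kick(&s, 6));          /* 1: waited past the timeout (C3) */
        s.pending_idx ^= 1;                               /* a flush was issued and is in flight */
        printf("after toggle: %d\n", should_kick(&s, 6)); /* 0: C1 forbids a second flush */
        return 0;
}

In the patch the same decision is re-evaluated from blk_flush_complete_seq(), so a completing flush or DATA write can immediately trigger the next flush.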
diff --git a/block/blk.h b/block/blk.h
index 9d2ee8f4d9af..284b500852bd 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,21 +51,17 @@ static inline void blk_clear_rq_complete(struct request *rq)
  */
 #define ELV_ON_HASH(rq)         (!hlist_unhashed(&(rq)->hash))
 
-struct request *blk_do_flush(struct request_queue *q, struct request *rq);
+void blk_insert_flush(struct request *rq);
+void blk_abort_flushes(struct request_queue *q);
 
 static inline struct request *__elv_next_request(struct request_queue *q)
 {
         struct request *rq;
 
         while (1) {
-                while (!list_empty(&q->queue_head)) {
+                if (!list_empty(&q->queue_head)) {
                         rq = list_entry_rq(q->queue_head.next);
-                        if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
-                            (rq->cmd_flags & REQ_FLUSH_SEQ))
-                                return rq;
-                        rq = blk_do_flush(q, rq);
-                        if (rq)
-                                return rq;
+                        return rq;
                 }
 
                 if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
diff --git a/block/elevator.c b/block/elevator.c
index 2569512830d3..270e0972eb9f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -673,6 +673,11 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
                 q->elevator->ops->elevator_add_req_fn(q, rq);
                 break;
 
+        case ELEVATOR_INSERT_FLUSH:
+                rq->cmd_flags |= REQ_SOFTBARRIER;
+                blk_insert_flush(rq);
+                break;
+
         default:
                 printk(KERN_ERR "%s: bad insertion point %d\n",
                        __func__, where);
@@ -785,6 +790,8 @@ void elv_abort_queue(struct request_queue *q)
 {
         struct request *rq;
 
+        blk_abort_flushes(q);
+
         while (!list_empty(&q->queue_head)) {
                 rq = list_entry_rq(q->queue_head.next);
                 rq->cmd_flags |= REQ_QUIET;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 36ab42c9bb99..6d7e9afd08c3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -99,13 +99,18 @@ struct request {
         /*
          * The rb_node is only used inside the io scheduler, requests
          * are pruned when moved to the dispatch queue. So let the
-         * completion_data share space with the rb_node.
+         * flush fields share space with the rb_node.
          */
         union {
                 struct rb_node rb_node; /* sort/lookup */
-                void *completion_data;
+                struct {
+                        unsigned int            seq;
+                        struct list_head        list;
+                } flush;
         };
 
+        void *completion_data;
+
         /*
          * Three pointers are available for the IO schedulers, if they need
          * more they have to dynamically allocate it.
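The union above lets the flush sequencing fields reuse the storage of rb_node, which the comment notes is only used while the request sits in the I/O scheduler. A small userspace sketch of the idea; the stand-in type definitions and their sizes are assumptions for a typical 64-bit build, not the kernel's.

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };                       /* stand-in, 16 bytes */
struct rb_node { unsigned long parent_color; void *rb_right, *rb_left; };  /* stand-in, 24 bytes */

struct request_like {
        union {
                struct rb_node rb_node;         /* sort/lookup while in the elevator */
                struct {
                        unsigned int seq;       /* REQ_FSEQ_* progress mask */
                        struct list_head list;  /* link on q->flush_queue[] or flush_data_in_flight */
                } flush;                        /* flush sequencing bookkeeping */
        };
        void *completion_data;                  /* kept outside the union */
};

int main(void)
{
        /* with these stand-ins both union members occupy 24 bytes, so the flush fields come for free */
        printf("rb_node %zu, flush %zu, request_like %zu\n",
               sizeof(struct rb_node),
               sizeof(((struct request_like *)0)->flush),
               sizeof(struct request_like));
        return 0;
}

Note that completion_data, which previously lived in the union, now gets its own slot.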
@@ -362,11 +367,12 @@ struct request_queue
          * for flush operations
          */
         unsigned int            flush_flags;
-        unsigned int            flush_seq;
-        int                     flush_err;
+        unsigned int            flush_pending_idx:1;
+        unsigned int            flush_running_idx:1;
+        unsigned long           flush_pending_since;
+        struct list_head        flush_queue[2];
+        struct list_head        flush_data_in_flight;
         struct request          flush_rq;
-        struct request          *orig_flush_rq;
-        struct list_head        pending_flushes;
 
         struct mutex            sysfs_lock;
 
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 4fd978e7eb83..86120c916fcc 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -167,6 +167,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
 #define ELEVATOR_INSERT_BACK    2
 #define ELEVATOR_INSERT_SORT    3
 #define ELEVATOR_INSERT_REQUEUE 4
+#define ELEVATOR_INSERT_FLUSH   5
 
 /*
  * return values from elevator_may_queue_fn