author		Jeff Moyer <jmoyer@redhat.com>		2011-08-15 15:37:25 -0400
committer	Jens Axboe <jaxboe@fusionio.com>	2011-08-15 15:37:25 -0400
commit		4853abaae7e4a2af938115ce9071ef8684fb7af4 (patch)
tree		167eb7cb1b48541fa6d0ca5042f7452e2dd9e4de
parent		bcf30e75b773b60379338768677a1301ef602ff9 (diff)
block: fix flush machinery for stacking drivers with differing flush flags
Commit ae1b1539622fb46e51b4d13b3f9e5f4c713f86ae, block: reimplement
FLUSH/FUA to support merge, introduced a performance regression when
running any sort of fsyncing workload using dm-multipath and certain
storage (in our case, an HP EVA).  The test I ran was fs_mark, and it
dropped from ~800 files/sec on ext4 to ~100 files/sec.  It turns out
that dm-multipath always advertised flush+fua support, and passed
commands on down the stack, where those flags used to get stripped off.
The above commit changed that behavior:

static inline struct request *__elv_next_request(struct request_queue *q)
{
        struct request *rq;

        while (1) {
-               while (!list_empty(&q->queue_head)) {
+               if (!list_empty(&q->queue_head)) {
                        rq = list_entry_rq(q->queue_head.next);
-                       if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
-                           (rq->cmd_flags & REQ_FLUSH_SEQ))
-                               return rq;
-                       rq = blk_do_flush(q, rq);
-                       if (rq)
-                               return rq;
+                       return rq;
                }

Note that previously, a command would come in here, have
REQ_FLUSH|REQ_FUA set, and then get handed off to blk_do_flush:

struct request *blk_do_flush(struct request_queue *q, struct request *rq)
{
        unsigned int fflags = q->flush_flags; /* may change, cache it */
        bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA;
        bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH);
        bool do_postflush = has_flush && !has_fua &&
                (rq->cmd_flags & REQ_FUA);
        unsigned skip = 0;
...
        if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) {
                rq->cmd_flags &= ~REQ_FLUSH;
                if (!has_fua)
                        rq->cmd_flags &= ~REQ_FUA;
                return rq;
        }

So, the flush machinery was bypassed in such cases (q->flush_flags == 0
&& rq->cmd_flags & (REQ_FLUSH|REQ_FUA)).

Now, however, we don't get into the flush machinery at all.  Instead,
__elv_next_request just hands a request with flush and fua bits set to
the scsi_request_fn, even if the underlying request_queue does not
support flush or fua.

The agreed upon approach is to fix the flush machinery to allow
stacking.  While this isn't used in practice (since there is only one
request-based dm target, and that target will now reflect the flush
flags of the underlying device), it does future-proof the solution,
and make it function as designed.

In order to make this work, I had to add a field to the struct request,
inside the flush structure (to store the original req->end_io).  Shaohua
had suggested overloading the union with rb_node and completion_data,
but the completion data is used by device mapper and can also be used by
other drivers.  So, I didn't see a way around the additional field.

I tested this patch on an HP EVA with both ext4 and xfs, and it recovers
the lost performance.  Comments and other testers, as always, are
appreciated.

Cheers,
Jeff

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
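To see why a stacked, data-less flush can now complete without entering the
flush sequence at all, it helps to walk the policy decision the patched
blk_insert_flush() makes.  The standalone sketch below is a simplified model,
not kernel code: the flag bit values, the flush_policy() helper and its
data_sectors argument are invented for illustration; only the REQ_FSEQ_*
decision structure follows the patch.  With an empty REQ_FLUSH request handed
down by dm-multipath to a queue whose flush_flags are zero (the HP EVA case
described above), the computed policy is 0 and the request is simply ended,
instead of being passed to the driver with flags it cannot honor.

#include <stdio.h>

/* Illustrative bit values only; the real kernel flags differ. */
#define REQ_FLUSH          (1u << 0)   /* request asks for a preflush        */
#define REQ_FUA            (1u << 1)   /* request asks for forced unit access */
#define REQ_FSEQ_PREFLUSH  (1u << 2)
#define REQ_FSEQ_DATA      (1u << 3)
#define REQ_FSEQ_POSTFLUSH (1u << 4)

/* What the flush sequence must actually do, given what the queue supports. */
static unsigned int flush_policy(unsigned int queue_flags,
                                 unsigned int cmd_flags,
                                 unsigned int data_sectors)
{
        unsigned int policy = 0;

        if (data_sectors)
                policy |= REQ_FSEQ_DATA;
        if (queue_flags & REQ_FLUSH) {
                if (cmd_flags & REQ_FLUSH)
                        policy |= REQ_FSEQ_PREFLUSH;
                if (!(queue_flags & REQ_FUA) && (cmd_flags & REQ_FUA))
                        policy |= REQ_FSEQ_POSTFLUSH;
        }
        return policy;
}

int main(void)
{
        /* Empty flush, queue advertises neither flush nor FUA: policy is 0,
         * so the patched blk_insert_flush() completes the request at once. */
        unsigned int policy = flush_policy(0, REQ_FLUSH, 0);
        printf("empty flush, no write-back cache: policy=%u -> complete\n",
               policy);

        /* Data write with FUA on a queue that has a cache but no FUA:
         * the data goes through the sequence and gets a postflush. */
        policy = flush_policy(REQ_FLUSH, REQ_FUA, 8);
        printf("data+FUA, flush-only queue: policy=0x%x\n", policy);
        return 0;
}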
-rw-r--r--	block/blk-core.c	8
-rw-r--r--	block/blk-flush.c	20
-rw-r--r--	block/blk.h	2
-rw-r--r--	include/linux/blkdev.h	1
4 files changed, 25 insertions, 6 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index b850bedad229..7c59b0f5eae8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1700,6 +1700,7 @@ EXPORT_SYMBOL_GPL(blk_rq_check_limits);
 int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 {
        unsigned long flags;
+       int where = ELEVATOR_INSERT_BACK;
 
        if (blk_rq_check_limits(q, rq))
                return -EIO;
@@ -1716,7 +1717,10 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
         */
        BUG_ON(blk_queued_rq(rq));
 
-       add_acct_request(q, rq, ELEVATOR_INSERT_BACK);
+       if (rq->cmd_flags & (REQ_FLUSH|REQ_FUA))
+               where = ELEVATOR_INSERT_FLUSH;
+
+       add_acct_request(q, rq, where);
        spin_unlock_irqrestore(q->queue_lock, flags);
 
        return 0;
@@ -2273,7 +2277,7 @@ static bool blk_end_bidi_request(struct request *rq, int error,
  * %false - we are done with this request
  * %true  - still buffers pending for this request
  **/
-static bool __blk_end_bidi_request(struct request *rq, int error,
+bool __blk_end_bidi_request(struct request *rq, int error,
                                   unsigned int nr_bytes, unsigned int bidi_bytes)
 {
        if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 2d162bd840d3..491eb30a242d 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -123,7 +123,7 @@ static void blk_flush_restore_request(struct request *rq)
 
        /* make @rq a normal request */
        rq->cmd_flags &= ~REQ_FLUSH_SEQ;
-       rq->end_io = NULL;
+       rq->end_io = rq->flush.saved_end_io;
 }
 
 /**
@@ -301,9 +301,6 @@ void blk_insert_flush(struct request *rq)
        unsigned int fflags = q->flush_flags;  /* may change, cache */
        unsigned int policy = blk_flush_policy(fflags, rq);
 
-       BUG_ON(rq->end_io);
-       BUG_ON(!rq->bio || rq->bio != rq->biotail);
-
        /*
         * @policy now records what operations need to be done. Adjust
         * REQ_FLUSH and FUA for the driver.
@@ -313,6 +310,19 @@ void blk_insert_flush(struct request *rq)
                rq->cmd_flags &= ~REQ_FUA;
 
        /*
+        * An empty flush handed down from a stacking driver may
+        * translate into nothing if the underlying device does not
+        * advertise a write-back cache. In this case, simply
+        * complete the request.
+        */
+       if (!policy) {
+               __blk_end_bidi_request(rq, 0, 0, 0);
+               return;
+       }
+
+       BUG_ON(!rq->bio || rq->bio != rq->biotail);
+
+       /*
         * If there's data but flush is not necessary, the request can be
         * processed directly without going through flush machinery. Queue
         * for normal execution.
@@ -320,6 +330,7 @@ void blk_insert_flush(struct request *rq)
        if ((policy & REQ_FSEQ_DATA) &&
            !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
                list_add_tail(&rq->queuelist, &q->queue_head);
+               blk_run_queue_async(q);
                return;
        }
 
@@ -330,6 +341,7 @@ void blk_insert_flush(struct request *rq)
        memset(&rq->flush, 0, sizeof(rq->flush));
        INIT_LIST_HEAD(&rq->flush.list);
        rq->cmd_flags |= REQ_FLUSH_SEQ;
+       rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
        rq->end_io = flush_data_end_io;
 
        blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
diff --git a/block/blk.h b/block/blk.h
index d6586287adc9..20b900a377c9 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -17,6 +17,8 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq,
                        struct bio *bio);
 void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
+bool __blk_end_bidi_request(struct request *rq, int error,
+                           unsigned int nr_bytes, unsigned int bidi_bytes);
 
 void blk_rq_timed_out_timer(unsigned long data);
 void blk_delete_timer(struct request *);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 847928546076..84b15d54f8c2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -118,6 +118,7 @@ struct request {
                struct {
                        unsigned int seq;
                        struct list_head list;
+                       rq_end_io_fn *saved_end_io;
                } flush;
        };
 
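The diff above stashes the original completion callback in the new
flush.saved_end_io field when blk_insert_flush() starts a sequence, and
blk_flush_restore_request() puts it back once the sequence finishes.  As a
rough illustration of that save/substitute/restore pattern, here is a tiny
standalone model in plain C; the struct, function and field names below are
invented for the example and are not kernel APIs.

#include <stdio.h>

struct req;
typedef void (*end_io_fn)(struct req *r, int error);

/* Toy request: only the two fields this pattern needs. */
struct req {
        end_io_fn end_io;        /* active completion callback             */
        end_io_fn saved_end_io;  /* stand-in for rq->flush.saved_end_io    */
};

/* The owner's completion, e.g. what a stacking driver installed. */
static void owner_done(struct req *r, int error)
{
        (void)r;
        printf("owner completion ran, error=%d\n", error);
}

/* Stand-in for flush_data_end_io(): called when a sequence step ends. */
static void flush_step_done(struct req *r, int error)
{
        printf("flush sequence step finished\n");
        /* Stand-in for blk_flush_restore_request(): reinstate the
         * original callback before handing the request back. */
        r->end_io = r->saved_end_io;
        if (r->end_io)
                r->end_io(r, error);
}

int main(void)
{
        struct req r = { .end_io = owner_done, .saved_end_io = NULL };

        /* What the patched blk_insert_flush() does before queueing:
         * stash the caller's callback (usually NULL for fs requests)
         * and let the flush machinery own completion for now. */
        r.saved_end_io = r.end_io;
        r.end_io = flush_step_done;

        /* Simulate the driver completing the request. */
        r.end_io(&r, 0);
        return 0;
}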