From eef35c2d41ddcc653c20d26b977acaa45c811e1f Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Fri, 6 Aug 2010 21:11:15 +0200 Subject: Fix spelling fuction -> function in comments To avoid more patches, I also fixed other spelling and grammar bugs when they were in the same or following line: successfull -> successful parse -> parses controler -> controller controlers -> controllers Cc: Jiri Kosina Cc: linux-kernel@vger.kernel.org Signed-off-by: Stefan Weil Signed-off-by: Jiri Kosina --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index f0640d7f800f..7ac24fa71f7a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1618,7 +1618,7 @@ EXPORT_SYMBOL(submit_bio); * the insertion using this generic function. * * This function should also be useful for request stacking drivers - * in some cases below, so export this fuction. + * in some cases below, so export this function. * Request stacking drivers like request-based dm may change the queue * limits while requests are in the queue (e.g. dm's table swapping). * Such request stacking drivers should check those requests agaist -- cgit v1.2.2 From 6958f145459ca7ad9715024de97445addacb8510 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:16 +0200 Subject: block: kill QUEUE_ORDERED_BY_TAG Nobody is making meaningful use of ORDERED_BY_TAG now and queue draining for barrier requests will be removed soon which will render the advantage of tag ordering moot. Kill ORDERED_BY_TAG. The following users are affected. * brd: converted to ORDERED_DRAIN. * virtio_blk: ORDERED_TAG path was already marked deprecated. Removed. * xen-blkfront: ORDERED_TAG case dropped. Signed-off-by: Tejun Heo Cc: Christoph Hellwig Cc: Nick Piggin Cc: Michael S. Tsirkin Cc: Jeremy Fitzhardinge Cc: Chris Wright Signed-off-by: Jens Axboe --- block/blk-barrier.c | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) (limited to 'block') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index f0faefca032f..c807e9ca3a68 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -26,10 +26,7 @@ int blk_queue_ordered(struct request_queue *q, unsigned ordered) if (ordered != QUEUE_ORDERED_NONE && ordered != QUEUE_ORDERED_DRAIN && ordered != QUEUE_ORDERED_DRAIN_FLUSH && - ordered != QUEUE_ORDERED_DRAIN_FUA && - ordered != QUEUE_ORDERED_TAG && - ordered != QUEUE_ORDERED_TAG_FLUSH && - ordered != QUEUE_ORDERED_TAG_FUA) { + ordered != QUEUE_ORDERED_DRAIN_FUA) { printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); return -EINVAL; } @@ -155,21 +152,9 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) * For an empty barrier, there's no actual BAR request, which * in turn makes POSTFLUSH unnecessary. Mask them off. */ - if (!blk_rq_sectors(rq)) { + if (!blk_rq_sectors(rq)) q->ordered &= ~(QUEUE_ORDERED_DO_BAR | QUEUE_ORDERED_DO_POSTFLUSH); - /* - * Empty barrier on a write-through device w/ ordered - * tag has no command to issue and without any command - * to issue, ordering by tag can't be used. Drain - * instead. 
- */ - if ((q->ordered & QUEUE_ORDERED_BY_TAG) && - !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) { - q->ordered &= ~QUEUE_ORDERED_BY_TAG; - q->ordered |= QUEUE_ORDERED_BY_DRAIN; - } - } /* stash away the original request */ blk_dequeue_request(rq); @@ -210,7 +195,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) } else skip |= QUEUE_ORDSEQ_PREFLUSH; - if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q)) + if (queue_in_flight(q)) rq = NULL; else skip |= QUEUE_ORDSEQ_DRAIN; @@ -257,16 +242,10 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp) rq != &q->pre_flush_rq && rq != &q->post_flush_rq) return true; - if (q->ordered & QUEUE_ORDERED_BY_TAG) { - /* Ordered by tag. Blocking the next barrier is enough. */ - if (is_barrier && rq != &q->bar_rq) - *rqp = NULL; - } else { - /* Ordered by draining. Wait for turn. */ - WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); - if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) - *rqp = NULL; - } + /* Ordered by draining. Wait for turn. */ + WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); + if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) + *rqp = NULL; return true; } -- cgit v1.2.2 From 4913efe456c987057e5d36a3f0a55422a9072cae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:16 +0200 Subject: block: deprecate barrier and replace blk_queue_ordered() with blk_queue_flush() Barrier is deemed too heavy and will soon be replaced by FLUSH/FUA requests. Deprecate barrier. All REQ_HARDBARRIERs are failed with -EOPNOTSUPP and blk_queue_ordered() is replaced with simpler blk_queue_flush(). blk_queue_flush() takes combinations of REQ_FLUSH and FUA. If a device has write cache and can flush it, it should set REQ_FLUSH. If the device can handle FUA writes, it should also set REQ_FUA. All blk_queue_ordered() users are converted. * ORDERED_DRAIN is mapped to 0 which is the default value. * ORDERED_DRAIN_FLUSH is mapped to REQ_FLUSH. * ORDERED_DRAIN_FLUSH_FUA is mapped to REQ_FLUSH | REQ_FUA. Signed-off-by: Tejun Heo Acked-by: Boaz Harrosh Cc: Christoph Hellwig Cc: Nick Piggin Cc: Michael S. Tsirkin Cc: Jeremy Fitzhardinge Cc: Chris Wright Cc: FUJITA Tomonori Cc: Geert Uytterhoeven Cc: David S. Miller Cc: Alasdair G Kergon Cc: Pierre Ossman Cc: Stefan Weinhuber Signed-off-by: Jens Axboe --- block/blk-barrier.c | 29 ----------------------------- block/blk-core.c | 6 ++++-- block/blk-settings.c | 20 ++++++++++++++++++++ 3 files changed, 24 insertions(+), 31 deletions(-) (limited to 'block') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index c807e9ca3a68..ed0aba5463ab 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -9,35 +9,6 @@ #include "blk.h" -/** - * blk_queue_ordered - does this queue support ordered writes - * @q: the request queue - * @ordered: one of QUEUE_ORDERED_* - * - * Description: - * For journalled file systems, doing ordered writes on a commit - * block instead of explicitly doing wait_on_buffer (which is bad - * for performance) can be a big win. Block drivers supporting this - * feature should call this function and indicate so. 
- * - **/ -int blk_queue_ordered(struct request_queue *q, unsigned ordered) -{ - if (ordered != QUEUE_ORDERED_NONE && - ordered != QUEUE_ORDERED_DRAIN && - ordered != QUEUE_ORDERED_DRAIN_FLUSH && - ordered != QUEUE_ORDERED_DRAIN_FUA) { - printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); - return -EINVAL; - } - - q->ordered = ordered; - q->next_ordered = ordered; - - return 0; -} -EXPORT_SYMBOL(blk_queue_ordered); - /* * Cache flushing for ordered writes handling */ diff --git a/block/blk-core.c b/block/blk-core.c index ee1a1e7e63cc..f06354183b29 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1203,11 +1203,13 @@ static int __make_request(struct request_queue *q, struct bio *bio) const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK; int rw_flags; - if ((bio->bi_rw & REQ_HARDBARRIER) && - (q->next_ordered == QUEUE_ORDERED_NONE)) { + /* REQ_HARDBARRIER is no more */ + if (WARN_ONCE(bio->bi_rw & REQ_HARDBARRIER, + "block: HARDBARRIER is deprecated, use FLUSH/FUA instead\n")) { bio_endio(bio, -EOPNOTSUPP); return 0; } + /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even diff --git a/block/blk-settings.c b/block/blk-settings.c index a234f4bf1d6f..9b18afcfe925 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -794,6 +794,26 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask) } EXPORT_SYMBOL(blk_queue_update_dma_alignment); +/** + * blk_queue_flush - configure queue's cache flush capability + * @q: the request queue for the device + * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA + * + * Tell block layer cache flush capability of @q. If it supports + * flushing, REQ_FLUSH should be set. If it supports bypassing + * write cache for individual writes, REQ_FUA should be set. + */ +void blk_queue_flush(struct request_queue *q, unsigned int flush) +{ + WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA)); + + if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA))) + flush &= ~REQ_FUA; + + q->flush_flags = flush & (REQ_FLUSH | REQ_FUA); +} +EXPORT_SYMBOL_GPL(blk_queue_flush); + static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; -- cgit v1.2.2 From dd831006d5be7f74c3fe7aef82380c51c3637960 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:16 +0200 Subject: block: misc cleanups in barrier code Make the following cleanups in preparation of barrier/flush update. * blk_do_ordered() declaration is moved from include/linux/blkdev.h to block/blk.h. * blk_do_ordered() now returns pointer to struct request, with %NULL meaning "try the next request" and ERR_PTR(-EAGAIN) "try again later". The third case will be dropped with further changes. * In the initialization of proxy barrier request, data direction is already set by init_request_from_bio(). Drop unnecessary explicit REQ_WRITE setting and move init_request_from_bio() above REQ_FUA flag setting. * add_request() is collapsed into __make_request(). These changes don't make any functional difference. 
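For reference, the new return convention is interpreted by the dispatch loop as follows (a condensed sketch of the __elv_next_request() hunk below with an explanatory comment added, not additional code):

	while (!list_empty(&q->queue_head)) {
		rq = list_entry_rq(q->queue_head.next);
		rq = blk_do_ordered(q, rq);
		if (rq)
			return !IS_ERR(rq) ? rq : NULL;
		/* %NULL: this request is gone, try the next one */
	}
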
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-barrier.c | 32 ++++++++++++++------------------ block/blk-core.c | 21 ++++----------------- block/blk.h | 7 +++++-- 3 files changed, 23 insertions(+), 37 deletions(-) (limited to 'block') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index ed0aba5463ab..f1be85ba2bb5 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -110,9 +110,9 @@ static void queue_flush(struct request_queue *q, unsigned which) elv_insert(q, rq, ELEVATOR_INSERT_FRONT); } -static inline bool start_ordered(struct request_queue *q, struct request **rqp) +static inline struct request *start_ordered(struct request_queue *q, + struct request *rq) { - struct request *rq = *rqp; unsigned skip = 0; q->orderr = 0; @@ -149,11 +149,9 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) /* initialize proxy request and queue it */ blk_rq_init(q, rq); - if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) - rq->cmd_flags |= REQ_WRITE; + init_request_from_bio(rq, q->orig_bar_rq->bio); if (q->ordered & QUEUE_ORDERED_DO_FUA) rq->cmd_flags |= REQ_FUA; - init_request_from_bio(rq, q->orig_bar_rq->bio); rq->end_io = bar_end_io; elv_insert(q, rq, ELEVATOR_INSERT_FRONT); @@ -171,27 +169,26 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) else skip |= QUEUE_ORDSEQ_DRAIN; - *rqp = rq; - /* * Complete skipped sequences. If whole sequence is complete, - * return false to tell elevator that this request is gone. + * return %NULL to tell elevator that this request is gone. */ - return !blk_ordered_complete_seq(q, skip, 0); + if (blk_ordered_complete_seq(q, skip, 0)) + rq = NULL; + return rq; } -bool blk_do_ordered(struct request_queue *q, struct request **rqp) +struct request *blk_do_ordered(struct request_queue *q, struct request *rq) { - struct request *rq = *rqp; const int is_barrier = rq->cmd_type == REQ_TYPE_FS && (rq->cmd_flags & REQ_HARDBARRIER); if (!q->ordseq) { if (!is_barrier) - return true; + return rq; if (q->next_ordered != QUEUE_ORDERED_NONE) - return start_ordered(q, rqp); + return start_ordered(q, rq); else { /* * Queue ordering not supported. Terminate @@ -199,8 +196,7 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp) */ blk_dequeue_request(rq); __blk_end_request_all(rq, -EOPNOTSUPP); - *rqp = NULL; - return false; + return NULL; } } @@ -211,14 +207,14 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp) /* Special requests are not subject to ordering rules. */ if (rq->cmd_type != REQ_TYPE_FS && rq != &q->pre_flush_rq && rq != &q->post_flush_rq) - return true; + return rq; /* Ordered by draining. Wait for turn. */ WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) - *rqp = NULL; + rq = ERR_PTR(-EAGAIN); - return true; + return rq; } static void bio_end_empty_barrier(struct bio *bio, int err) diff --git a/block/blk-core.c b/block/blk-core.c index f06354183b29..f8d37a8e2c55 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1037,22 +1037,6 @@ void blk_insert_request(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(blk_insert_request); -/* - * add-request adds a request to the linked list. - * queue lock is held and interrupts disabled, as we muck with the - * request queue list. 
- */ -static inline void add_request(struct request_queue *q, struct request *req) -{ - drive_stat_acct(req, 1); - - /* - * elevator indicated where it wants this request to be - * inserted at elevator_merge time - */ - __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); -} - static void part_round_stats_single(int cpu, struct hd_struct *part, unsigned long now) { @@ -1316,7 +1300,10 @@ get_rq: req->cpu = blk_cpu_to_group(smp_processor_id()); if (queue_should_plug(q) && elv_queue_empty(q)) blk_plug_device(q); - add_request(q, req); + + /* insert the request into the elevator */ + drive_stat_acct(req, 1); + __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); out: if (unplug || !queue_should_plug(q)) __generic_unplug_device(q); diff --git a/block/blk.h b/block/blk.h index 6e7dc87141e4..874eb4ea8093 100644 --- a/block/blk.h +++ b/block/blk.h @@ -51,6 +51,8 @@ static inline void blk_clear_rq_complete(struct request *rq) */ #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) +struct request *blk_do_ordered(struct request_queue *q, struct request *rq); + static inline struct request *__elv_next_request(struct request_queue *q) { struct request *rq; @@ -58,8 +60,9 @@ static inline struct request *__elv_next_request(struct request_queue *q) while (1) { while (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); - if (blk_do_ordered(q, &rq)) - return rq; + rq = blk_do_ordered(q, rq); + if (rq) + return !IS_ERR(rq) ? rq : NULL; } if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) -- cgit v1.2.2 From 28e7d1845216538303bb95d679d8fd4de50e2f1a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:16 +0200 Subject: block: drop barrier ordering by queue draining Filesystems will take all the responsibilities for ordering requests around commit writes and will only indicate how the commit writes themselves should be handled by block layers. This patch drops barrier ordering by queue draining from block layer. Ordering by draining implementation was somewhat invasive to request handling. List of notable changes follow. * Each queue has 1 bit color which is flipped on each barrier issue. This is used to track whether a given request is issued before the current barrier or not. REQ_ORDERED_COLOR flag and coloring implementation in __elv_add_request() are removed. * Requests which shouldn't be processed yet for draining were stalled by returning -EAGAIN from blk_do_ordered() according to the test result between blk_ordered_req_seq() and blk_blk_ordered_cur_seq(). This logic is removed. * Draining completion logic in elv_completed_request() removed. * All barrier sequence requests were queued to request queue and then trckled to lower layer according to progress and thus maintaining request orders during requeue was necessary. This is replaced by queueing the next request in the barrier sequence only after the current one is complete from blk_ordered_complete_seq(), which removes the need for multiple proxy requests in struct request_queue and the request sorting logic in the ELEVATOR_INSERT_REQUEUE path of elv_insert(). * As barriers no longer have ordering constraints, there's no need to dump the whole elevator onto the dispatch queue on each barrier. Insert barriers at the front instead. * If other barrier requests come to the front of the dispatch queue while one is already in progress, they are stored in q->pending_barriers and restored to dispatch queue one-by-one after each barrier completion from blk_ordered_complete_seq(). 
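A condensed sketch of the last point above, lifted from the blk-barrier.c changes in this patch rather than written from scratch: a barrier arriving while a sequence is already in flight is parked on q->pending_barriers, and the completion path moves the next parked barrier back onto the dispatch queue.

	/* in blk_do_ordered(): only one barrier sequence at a time */
	if (q->ordseq) {
		list_move_tail(&rq->queuelist, &q->pending_barriers);
		return NULL;
	}

	/* in blk_ordered_complete_seq(): restore the next pending barrier */
	if (!list_empty(&q->pending_barriers)) {
		next_rq = list_entry_rq(q->pending_barriers.next);
		list_move(&next_rq->queuelist, &q->queue_head);
	}
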
Signed-off-by: Tejun Heo Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-barrier.c | 220 ++++++++++++++++++++-------------------------------- block/blk-core.c | 11 ++- block/blk.h | 2 +- block/elevator.c | 79 +++---------------- 4 files changed, 105 insertions(+), 207 deletions(-) (limited to 'block') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index f1be85ba2bb5..e8b2e5c091b1 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -9,6 +9,8 @@ #include "blk.h" +static struct request *queue_next_ordseq(struct request_queue *q); + /* * Cache flushing for ordered writes handling */ @@ -19,38 +21,10 @@ unsigned blk_ordered_cur_seq(struct request_queue *q) return 1 << ffz(q->ordseq); } -unsigned blk_ordered_req_seq(struct request *rq) -{ - struct request_queue *q = rq->q; - - BUG_ON(q->ordseq == 0); - - if (rq == &q->pre_flush_rq) - return QUEUE_ORDSEQ_PREFLUSH; - if (rq == &q->bar_rq) - return QUEUE_ORDSEQ_BAR; - if (rq == &q->post_flush_rq) - return QUEUE_ORDSEQ_POSTFLUSH; - - /* - * !fs requests don't need to follow barrier ordering. Always - * put them at the front. This fixes the following deadlock. - * - * http://thread.gmane.org/gmane.linux.kernel/537473 - */ - if (rq->cmd_type != REQ_TYPE_FS) - return QUEUE_ORDSEQ_DRAIN; - - if ((rq->cmd_flags & REQ_ORDERED_COLOR) == - (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) - return QUEUE_ORDSEQ_DRAIN; - else - return QUEUE_ORDSEQ_DONE; -} - -bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) +static struct request *blk_ordered_complete_seq(struct request_queue *q, + unsigned seq, int error) { - struct request *rq; + struct request *next_rq = NULL; if (error && !q->orderr) q->orderr = error; @@ -58,16 +32,22 @@ bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) BUG_ON(q->ordseq & seq); q->ordseq |= seq; - if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) - return false; - - /* - * Okay, sequence complete. 
- */ - q->ordseq = 0; - rq = q->orig_bar_rq; - __blk_end_request_all(rq, q->orderr); - return true; + if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) { + /* not complete yet, queue the next ordered sequence */ + next_rq = queue_next_ordseq(q); + } else { + /* complete this barrier request */ + __blk_end_request_all(q->orig_bar_rq, q->orderr); + q->orig_bar_rq = NULL; + q->ordseq = 0; + + /* dispatch the next barrier if there's one */ + if (!list_empty(&q->pending_barriers)) { + next_rq = list_entry_rq(q->pending_barriers.next); + list_move(&next_rq->queuelist, &q->queue_head); + } + } + return next_rq; } static void pre_flush_end_io(struct request *rq, int error) @@ -88,133 +68,105 @@ static void post_flush_end_io(struct request *rq, int error) blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); } -static void queue_flush(struct request_queue *q, unsigned which) +static void queue_flush(struct request_queue *q, struct request *rq, + rq_end_io_fn *end_io) { - struct request *rq; - rq_end_io_fn *end_io; - - if (which == QUEUE_ORDERED_DO_PREFLUSH) { - rq = &q->pre_flush_rq; - end_io = pre_flush_end_io; - } else { - rq = &q->post_flush_rq; - end_io = post_flush_end_io; - } - blk_rq_init(q, rq); rq->cmd_type = REQ_TYPE_FS; - rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH; + rq->cmd_flags = REQ_FLUSH; rq->rq_disk = q->orig_bar_rq->rq_disk; rq->end_io = end_io; elv_insert(q, rq, ELEVATOR_INSERT_FRONT); } -static inline struct request *start_ordered(struct request_queue *q, - struct request *rq) +static struct request *queue_next_ordseq(struct request_queue *q) { - unsigned skip = 0; - - q->orderr = 0; - q->ordered = q->next_ordered; - q->ordseq |= QUEUE_ORDSEQ_STARTED; - - /* - * For an empty barrier, there's no actual BAR request, which - * in turn makes POSTFLUSH unnecessary. Mask them off. - */ - if (!blk_rq_sectors(rq)) - q->ordered &= ~(QUEUE_ORDERED_DO_BAR | - QUEUE_ORDERED_DO_POSTFLUSH); - - /* stash away the original request */ - blk_dequeue_request(rq); - q->orig_bar_rq = rq; - rq = NULL; - - /* - * Queue ordered sequence. As we stack them at the head, we - * need to queue in reverse order. Note that we rely on that - * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs - * request gets inbetween ordered sequence. - */ - if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) { - queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH); - rq = &q->post_flush_rq; - } else - skip |= QUEUE_ORDSEQ_POSTFLUSH; + struct request *rq = &q->bar_rq; - if (q->ordered & QUEUE_ORDERED_DO_BAR) { - rq = &q->bar_rq; + switch (blk_ordered_cur_seq(q)) { + case QUEUE_ORDSEQ_PREFLUSH: + queue_flush(q, rq, pre_flush_end_io); + break; + case QUEUE_ORDSEQ_BAR: /* initialize proxy request and queue it */ blk_rq_init(q, rq); init_request_from_bio(rq, q->orig_bar_rq->bio); + rq->cmd_flags &= ~REQ_HARDBARRIER; if (q->ordered & QUEUE_ORDERED_DO_FUA) rq->cmd_flags |= REQ_FUA; rq->end_io = bar_end_io; elv_insert(q, rq, ELEVATOR_INSERT_FRONT); - } else - skip |= QUEUE_ORDSEQ_BAR; + break; - if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) { - queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH); - rq = &q->pre_flush_rq; - } else - skip |= QUEUE_ORDSEQ_PREFLUSH; + case QUEUE_ORDSEQ_POSTFLUSH: + queue_flush(q, rq, post_flush_end_io); + break; - if (queue_in_flight(q)) - rq = NULL; - else - skip |= QUEUE_ORDSEQ_DRAIN; - - /* - * Complete skipped sequences. If whole sequence is complete, - * return %NULL to tell elevator that this request is gone. 
- */ - if (blk_ordered_complete_seq(q, skip, 0)) - rq = NULL; + default: + BUG(); + } return rq; } struct request *blk_do_ordered(struct request_queue *q, struct request *rq) { - const int is_barrier = rq->cmd_type == REQ_TYPE_FS && - (rq->cmd_flags & REQ_HARDBARRIER); - - if (!q->ordseq) { - if (!is_barrier) - return rq; - - if (q->next_ordered != QUEUE_ORDERED_NONE) - return start_ordered(q, rq); - else { - /* - * Queue ordering not supported. Terminate - * with prejudice. - */ - blk_dequeue_request(rq); - __blk_end_request_all(rq, -EOPNOTSUPP); - return NULL; - } + unsigned skip = 0; + + if (!(rq->cmd_flags & REQ_HARDBARRIER)) + return rq; + + if (q->ordseq) { + /* + * Barrier is already in progress and they can't be + * processed in parallel. Queue for later processing. + */ + list_move_tail(&rq->queuelist, &q->pending_barriers); + return NULL; + } + + if (unlikely(q->next_ordered == QUEUE_ORDERED_NONE)) { + /* + * Queue ordering not supported. Terminate + * with prejudice. + */ + blk_dequeue_request(rq); + __blk_end_request_all(rq, -EOPNOTSUPP); + return NULL; } /* - * Ordered sequence in progress + * Start a new ordered sequence */ + q->orderr = 0; + q->ordered = q->next_ordered; + q->ordseq |= QUEUE_ORDSEQ_STARTED; - /* Special requests are not subject to ordering rules. */ - if (rq->cmd_type != REQ_TYPE_FS && - rq != &q->pre_flush_rq && rq != &q->post_flush_rq) - return rq; + /* + * For an empty barrier, there's no actual BAR request, which + * in turn makes POSTFLUSH unnecessary. Mask them off. + */ + if (!blk_rq_sectors(rq)) + q->ordered &= ~(QUEUE_ORDERED_DO_BAR | + QUEUE_ORDERED_DO_POSTFLUSH); - /* Ordered by draining. Wait for turn. */ - WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); - if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) - rq = ERR_PTR(-EAGAIN); + /* stash away the original request */ + blk_dequeue_request(rq); + q->orig_bar_rq = rq; - return rq; + if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) + skip |= QUEUE_ORDSEQ_PREFLUSH; + + if (!(q->ordered & QUEUE_ORDERED_DO_BAR)) + skip |= QUEUE_ORDSEQ_BAR; + + if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH)) + skip |= QUEUE_ORDSEQ_POSTFLUSH; + + /* complete skipped sequences and return the first sequence */ + return blk_ordered_complete_seq(q, skip, 0); } static void bio_end_empty_barrier(struct bio *bio, int err) diff --git a/block/blk-core.c b/block/blk-core.c index f8d37a8e2c55..d316662682c8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -520,6 +520,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) init_timer(&q->unplug_timer); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); INIT_LIST_HEAD(&q->timeout_list); + INIT_LIST_HEAD(&q->pending_barriers); INIT_WORK(&q->unplug_work, blk_unplug_work); kobject_init(&q->kobj, &blk_queue_ktype); @@ -1185,6 +1186,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) const bool sync = (bio->bi_rw & REQ_SYNC); const bool unplug = (bio->bi_rw & REQ_UNPLUG); const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK; + int where = ELEVATOR_INSERT_SORT; int rw_flags; /* REQ_HARDBARRIER is no more */ @@ -1203,7 +1205,12 @@ static int __make_request(struct request_queue *q, struct bio *bio) spin_lock_irq(q->queue_lock); - if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q)) + if (bio->bi_rw & REQ_HARDBARRIER) { + where = ELEVATOR_INSERT_FRONT; + goto get_rq; + } + + if (elv_queue_empty(q)) goto get_rq; el_ret = elv_merge(q, &req, bio); @@ -1303,7 +1310,7 @@ get_rq: /* insert the request 
into the elevator */ drive_stat_acct(req, 1); - __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); + __elv_add_request(q, req, where, 0); out: if (unplug || !queue_should_plug(q)) __generic_unplug_device(q); diff --git a/block/blk.h b/block/blk.h index 874eb4ea8093..08081e4b294e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -62,7 +62,7 @@ static inline struct request *__elv_next_request(struct request_queue *q) rq = list_entry_rq(q->queue_head.next); rq = blk_do_ordered(q, rq); if (rq) - return !IS_ERR(rq) ? rq : NULL; + return rq; } if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) diff --git a/block/elevator.c b/block/elevator.c index ec585c9554d3..241c69c45c5f 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -617,8 +617,6 @@ void elv_quiesce_end(struct request_queue *q) void elv_insert(struct request_queue *q, struct request *rq, int where) { - struct list_head *pos; - unsigned ordseq; int unplug_it = 1; trace_block_rq_insert(q, rq); @@ -626,9 +624,16 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) rq->q = q; switch (where) { + case ELEVATOR_INSERT_REQUEUE: + /* + * Most requeues happen because of a busy condition, + * don't force unplug of the queue for that case. + * Clear unplug_it and fall through. + */ + unplug_it = 0; + case ELEVATOR_INSERT_FRONT: rq->cmd_flags |= REQ_SOFTBARRIER; - list_add(&rq->queuelist, &q->queue_head); break; @@ -668,36 +673,6 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) q->elevator->ops->elevator_add_req_fn(q, rq); break; - case ELEVATOR_INSERT_REQUEUE: - /* - * If ordered flush isn't in progress, we do front - * insertion; otherwise, requests should be requeued - * in ordseq order. - */ - rq->cmd_flags |= REQ_SOFTBARRIER; - - /* - * Most requeues happen because of a busy condition, - * don't force unplug of the queue for that case. - */ - unplug_it = 0; - - if (q->ordseq == 0) { - list_add(&rq->queuelist, &q->queue_head); - break; - } - - ordseq = blk_ordered_req_seq(rq); - - list_for_each(pos, &q->queue_head) { - struct request *pos_rq = list_entry_rq(pos); - if (ordseq <= blk_ordered_req_seq(pos_rq)) - break; - } - - list_add_tail(&rq->queuelist, pos); - break; - default: printk(KERN_ERR "%s: bad insertion point %d\n", __func__, where); @@ -716,26 +691,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) void __elv_add_request(struct request_queue *q, struct request *rq, int where, int plug) { - if (q->ordcolor) - rq->cmd_flags |= REQ_ORDERED_COLOR; - if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { - /* - * toggle ordered color - */ - if (rq->cmd_flags & REQ_HARDBARRIER) - q->ordcolor ^= 1; - - /* - * barriers implicitly indicate back insertion - */ - if (where == ELEVATOR_INSERT_SORT) - where = ELEVATOR_INSERT_BACK; - - /* - * this request is scheduling boundary, update - * end_sector - */ + /* barriers are scheduling boundary, update end_sector */ if (rq->cmd_type == REQ_TYPE_FS || (rq->cmd_flags & REQ_DISCARD)) { q->end_sector = rq_end_sector(rq); @@ -855,24 +812,6 @@ void elv_completed_request(struct request_queue *q, struct request *rq) e->ops->elevator_completed_req_fn) e->ops->elevator_completed_req_fn(q, rq); } - - /* - * Check if the queue is waiting for fs requests to be - * drained for flush sequence. 
- */ - if (unlikely(q->ordseq)) { - struct request *next = NULL; - - if (!list_empty(&q->queue_head)) - next = list_entry_rq(q->queue_head.next); - - if (!queue_in_flight(q) && - blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN && - (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) { - blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0); - __blk_run_queue(q); - } - } } #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) -- cgit v1.2.2 From 8839a0e055d9abd6c011d533373a8dd266cad011 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:16 +0200 Subject: block: rename blk-barrier.c to blk-flush.c Without ordering requirements, barrier and ordering are minomers. Rename block/blk-barrier.c to block/blk-flush.c. Rename of symbols will follow. Signed-off-by: Tejun Heo Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/Makefile | 2 +- block/blk-barrier.c | 248 ---------------------------------------------------- block/blk-flush.c | 248 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 249 insertions(+), 249 deletions(-) delete mode 100644 block/blk-barrier.c create mode 100644 block/blk-flush.c (limited to 'block') diff --git a/block/Makefile b/block/Makefile index 0bb499a739cd..f627e4b1a9da 100644 --- a/block/Makefile +++ b/block/Makefile @@ -3,7 +3,7 @@ # obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ - blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ + blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o diff --git a/block/blk-barrier.c b/block/blk-barrier.c deleted file mode 100644 index e8b2e5c091b1..000000000000 --- a/block/blk-barrier.c +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Functions related to barrier IO handling - */ -#include -#include -#include -#include -#include - -#include "blk.h" - -static struct request *queue_next_ordseq(struct request_queue *q); - -/* - * Cache flushing for ordered writes handling - */ -unsigned blk_ordered_cur_seq(struct request_queue *q) -{ - if (!q->ordseq) - return 0; - return 1 << ffz(q->ordseq); -} - -static struct request *blk_ordered_complete_seq(struct request_queue *q, - unsigned seq, int error) -{ - struct request *next_rq = NULL; - - if (error && !q->orderr) - q->orderr = error; - - BUG_ON(q->ordseq & seq); - q->ordseq |= seq; - - if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) { - /* not complete yet, queue the next ordered sequence */ - next_rq = queue_next_ordseq(q); - } else { - /* complete this barrier request */ - __blk_end_request_all(q->orig_bar_rq, q->orderr); - q->orig_bar_rq = NULL; - q->ordseq = 0; - - /* dispatch the next barrier if there's one */ - if (!list_empty(&q->pending_barriers)) { - next_rq = list_entry_rq(q->pending_barriers.next); - list_move(&next_rq->queuelist, &q->queue_head); - } - } - return next_rq; -} - -static void pre_flush_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); -} - -static void bar_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); -} - -static void post_flush_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); -} - -static void queue_flush(struct request_queue *q, struct request *rq, - rq_end_io_fn *end_io) -{ - blk_rq_init(q, rq); - rq->cmd_type = 
REQ_TYPE_FS; - rq->cmd_flags = REQ_FLUSH; - rq->rq_disk = q->orig_bar_rq->rq_disk; - rq->end_io = end_io; - - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); -} - -static struct request *queue_next_ordseq(struct request_queue *q) -{ - struct request *rq = &q->bar_rq; - - switch (blk_ordered_cur_seq(q)) { - case QUEUE_ORDSEQ_PREFLUSH: - queue_flush(q, rq, pre_flush_end_io); - break; - - case QUEUE_ORDSEQ_BAR: - /* initialize proxy request and queue it */ - blk_rq_init(q, rq); - init_request_from_bio(rq, q->orig_bar_rq->bio); - rq->cmd_flags &= ~REQ_HARDBARRIER; - if (q->ordered & QUEUE_ORDERED_DO_FUA) - rq->cmd_flags |= REQ_FUA; - rq->end_io = bar_end_io; - - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); - break; - - case QUEUE_ORDSEQ_POSTFLUSH: - queue_flush(q, rq, post_flush_end_io); - break; - - default: - BUG(); - } - return rq; -} - -struct request *blk_do_ordered(struct request_queue *q, struct request *rq) -{ - unsigned skip = 0; - - if (!(rq->cmd_flags & REQ_HARDBARRIER)) - return rq; - - if (q->ordseq) { - /* - * Barrier is already in progress and they can't be - * processed in parallel. Queue for later processing. - */ - list_move_tail(&rq->queuelist, &q->pending_barriers); - return NULL; - } - - if (unlikely(q->next_ordered == QUEUE_ORDERED_NONE)) { - /* - * Queue ordering not supported. Terminate - * with prejudice. - */ - blk_dequeue_request(rq); - __blk_end_request_all(rq, -EOPNOTSUPP); - return NULL; - } - - /* - * Start a new ordered sequence - */ - q->orderr = 0; - q->ordered = q->next_ordered; - q->ordseq |= QUEUE_ORDSEQ_STARTED; - - /* - * For an empty barrier, there's no actual BAR request, which - * in turn makes POSTFLUSH unnecessary. Mask them off. - */ - if (!blk_rq_sectors(rq)) - q->ordered &= ~(QUEUE_ORDERED_DO_BAR | - QUEUE_ORDERED_DO_POSTFLUSH); - - /* stash away the original request */ - blk_dequeue_request(rq); - q->orig_bar_rq = rq; - - if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) - skip |= QUEUE_ORDSEQ_PREFLUSH; - - if (!(q->ordered & QUEUE_ORDERED_DO_BAR)) - skip |= QUEUE_ORDSEQ_BAR; - - if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH)) - skip |= QUEUE_ORDSEQ_POSTFLUSH; - - /* complete skipped sequences and return the first sequence */ - return blk_ordered_complete_seq(q, skip, 0); -} - -static void bio_end_empty_barrier(struct bio *bio, int err) -{ - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - clear_bit(BIO_UPTODATE, &bio->bi_flags); - } - if (bio->bi_private) - complete(bio->bi_private); - bio_put(bio); -} - -/** - * blkdev_issue_flush - queue a flush - * @bdev: blockdev to issue flush for - * @gfp_mask: memory allocation flags (for bio_alloc) - * @error_sector: error sector - * @flags: BLKDEV_IFL_* flags to control behaviour - * - * Description: - * Issue a flush for the block device in question. Caller can supply - * room for storing the error offset in case of a flush error, if they - * wish to. If WAIT flag is not passed then caller may check only what - * request was pushed in some internal queue for later handling. - */ -int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, - sector_t *error_sector, unsigned long flags) -{ - DECLARE_COMPLETION_ONSTACK(wait); - struct request_queue *q; - struct bio *bio; - int ret = 0; - - if (bdev->bd_disk == NULL) - return -ENXIO; - - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - - /* - * some block devices may not have their queue correctly set up here - * (e.g. loop device without a backing file) and so issuing a flush - * here will panic. 
Ensure there is a request function before issuing - * the barrier. - */ - if (!q->make_request_fn) - return -ENXIO; - - bio = bio_alloc(gfp_mask, 0); - bio->bi_end_io = bio_end_empty_barrier; - bio->bi_bdev = bdev; - if (test_bit(BLKDEV_WAIT, &flags)) - bio->bi_private = &wait; - - bio_get(bio); - submit_bio(WRITE_BARRIER, bio); - if (test_bit(BLKDEV_WAIT, &flags)) { - wait_for_completion(&wait); - /* - * The driver must store the error location in ->bi_sector, if - * it supports it. For non-stacked drivers, this should be - * copied from blk_rq_pos(rq). - */ - if (error_sector) - *error_sector = bio->bi_sector; - } - - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - - bio_put(bio); - return ret; -} -EXPORT_SYMBOL(blkdev_issue_flush); diff --git a/block/blk-flush.c b/block/blk-flush.c new file mode 100644 index 000000000000..e8b2e5c091b1 --- /dev/null +++ b/block/blk-flush.c @@ -0,0 +1,248 @@ +/* + * Functions related to barrier IO handling + */ +#include +#include +#include +#include +#include + +#include "blk.h" + +static struct request *queue_next_ordseq(struct request_queue *q); + +/* + * Cache flushing for ordered writes handling + */ +unsigned blk_ordered_cur_seq(struct request_queue *q) +{ + if (!q->ordseq) + return 0; + return 1 << ffz(q->ordseq); +} + +static struct request *blk_ordered_complete_seq(struct request_queue *q, + unsigned seq, int error) +{ + struct request *next_rq = NULL; + + if (error && !q->orderr) + q->orderr = error; + + BUG_ON(q->ordseq & seq); + q->ordseq |= seq; + + if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) { + /* not complete yet, queue the next ordered sequence */ + next_rq = queue_next_ordseq(q); + } else { + /* complete this barrier request */ + __blk_end_request_all(q->orig_bar_rq, q->orderr); + q->orig_bar_rq = NULL; + q->ordseq = 0; + + /* dispatch the next barrier if there's one */ + if (!list_empty(&q->pending_barriers)) { + next_rq = list_entry_rq(q->pending_barriers.next); + list_move(&next_rq->queuelist, &q->queue_head); + } + } + return next_rq; +} + +static void pre_flush_end_io(struct request *rq, int error) +{ + elv_completed_request(rq->q, rq); + blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); +} + +static void bar_end_io(struct request *rq, int error) +{ + elv_completed_request(rq->q, rq); + blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); +} + +static void post_flush_end_io(struct request *rq, int error) +{ + elv_completed_request(rq->q, rq); + blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); +} + +static void queue_flush(struct request_queue *q, struct request *rq, + rq_end_io_fn *end_io) +{ + blk_rq_init(q, rq); + rq->cmd_type = REQ_TYPE_FS; + rq->cmd_flags = REQ_FLUSH; + rq->rq_disk = q->orig_bar_rq->rq_disk; + rq->end_io = end_io; + + elv_insert(q, rq, ELEVATOR_INSERT_FRONT); +} + +static struct request *queue_next_ordseq(struct request_queue *q) +{ + struct request *rq = &q->bar_rq; + + switch (blk_ordered_cur_seq(q)) { + case QUEUE_ORDSEQ_PREFLUSH: + queue_flush(q, rq, pre_flush_end_io); + break; + + case QUEUE_ORDSEQ_BAR: + /* initialize proxy request and queue it */ + blk_rq_init(q, rq); + init_request_from_bio(rq, q->orig_bar_rq->bio); + rq->cmd_flags &= ~REQ_HARDBARRIER; + if (q->ordered & QUEUE_ORDERED_DO_FUA) + rq->cmd_flags |= REQ_FUA; + rq->end_io = bar_end_io; + + elv_insert(q, rq, ELEVATOR_INSERT_FRONT); + break; + + case QUEUE_ORDSEQ_POSTFLUSH: + queue_flush(q, rq, post_flush_end_io); + break; + + default: + 
BUG(); + } + return rq; +} + +struct request *blk_do_ordered(struct request_queue *q, struct request *rq) +{ + unsigned skip = 0; + + if (!(rq->cmd_flags & REQ_HARDBARRIER)) + return rq; + + if (q->ordseq) { + /* + * Barrier is already in progress and they can't be + * processed in parallel. Queue for later processing. + */ + list_move_tail(&rq->queuelist, &q->pending_barriers); + return NULL; + } + + if (unlikely(q->next_ordered == QUEUE_ORDERED_NONE)) { + /* + * Queue ordering not supported. Terminate + * with prejudice. + */ + blk_dequeue_request(rq); + __blk_end_request_all(rq, -EOPNOTSUPP); + return NULL; + } + + /* + * Start a new ordered sequence + */ + q->orderr = 0; + q->ordered = q->next_ordered; + q->ordseq |= QUEUE_ORDSEQ_STARTED; + + /* + * For an empty barrier, there's no actual BAR request, which + * in turn makes POSTFLUSH unnecessary. Mask them off. + */ + if (!blk_rq_sectors(rq)) + q->ordered &= ~(QUEUE_ORDERED_DO_BAR | + QUEUE_ORDERED_DO_POSTFLUSH); + + /* stash away the original request */ + blk_dequeue_request(rq); + q->orig_bar_rq = rq; + + if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) + skip |= QUEUE_ORDSEQ_PREFLUSH; + + if (!(q->ordered & QUEUE_ORDERED_DO_BAR)) + skip |= QUEUE_ORDSEQ_BAR; + + if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH)) + skip |= QUEUE_ORDSEQ_POSTFLUSH; + + /* complete skipped sequences and return the first sequence */ + return blk_ordered_complete_seq(q, skip, 0); +} + +static void bio_end_empty_barrier(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); +} + +/** + * blkdev_issue_flush - queue a flush + * @bdev: blockdev to issue flush for + * @gfp_mask: memory allocation flags (for bio_alloc) + * @error_sector: error sector + * @flags: BLKDEV_IFL_* flags to control behaviour + * + * Description: + * Issue a flush for the block device in question. Caller can supply + * room for storing the error offset in case of a flush error, if they + * wish to. If WAIT flag is not passed then caller may check only what + * request was pushed in some internal queue for later handling. + */ +int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, + sector_t *error_sector, unsigned long flags) +{ + DECLARE_COMPLETION_ONSTACK(wait); + struct request_queue *q; + struct bio *bio; + int ret = 0; + + if (bdev->bd_disk == NULL) + return -ENXIO; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + /* + * some block devices may not have their queue correctly set up here + * (e.g. loop device without a backing file) and so issuing a flush + * here will panic. Ensure there is a request function before issuing + * the barrier. + */ + if (!q->make_request_fn) + return -ENXIO; + + bio = bio_alloc(gfp_mask, 0); + bio->bi_end_io = bio_end_empty_barrier; + bio->bi_bdev = bdev; + if (test_bit(BLKDEV_WAIT, &flags)) + bio->bi_private = &wait; + + bio_get(bio); + submit_bio(WRITE_BARRIER, bio); + if (test_bit(BLKDEV_WAIT, &flags)) { + wait_for_completion(&wait); + /* + * The driver must store the error location in ->bi_sector, if + * it supports it. For non-stacked drivers, this should be + * copied from blk_rq_pos(rq). 
+ */ + if (error_sector) + *error_sector = bio->bi_sector; + } + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + else if (!bio_flagged(bio, BIO_UPTODATE)) + ret = -EIO; + + bio_put(bio); + return ret; +} +EXPORT_SYMBOL(blkdev_issue_flush); -- cgit v1.2.2 From dd4c133f387c48f526022860ad70354637a80f4c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:16 +0200 Subject: block: rename barrier/ordered to flush With ordering requirements dropped, barrier and ordered are misnomers. Now all block layer does is sequencing FLUSH and FUA. Rename them to flush. Signed-off-by: Tejun Heo Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 21 ++++++------ block/blk-flush.c | 98 +++++++++++++++++++++++++++---------------------------- block/blk.h | 4 +-- 3 files changed, 60 insertions(+), 63 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index d316662682c8..8870ae40179d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -136,7 +136,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio, { struct request_queue *q = rq->q; - if (&q->bar_rq != rq) { + if (&q->flush_rq != rq) { if (error) clear_bit(BIO_UPTODATE, &bio->bi_flags); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) @@ -160,13 +160,12 @@ static void req_bio_endio(struct request *rq, struct bio *bio, if (bio->bi_size == 0) bio_endio(bio, error); } else { - /* - * Okay, this is the barrier request in progress, just - * record the error; + * Okay, this is the sequenced flush request in + * progress, just record the error; */ - if (error && !q->orderr) - q->orderr = error; + if (error && !q->flush_err) + q->flush_err = error; } } @@ -520,7 +519,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) init_timer(&q->unplug_timer); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); INIT_LIST_HEAD(&q->timeout_list); - INIT_LIST_HEAD(&q->pending_barriers); + INIT_LIST_HEAD(&q->pending_flushes); INIT_WORK(&q->unplug_work, blk_unplug_work); kobject_init(&q->kobj, &blk_queue_ktype); @@ -1764,11 +1763,11 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) static void blk_account_io_done(struct request *req) { /* - * Account IO completion. bar_rq isn't accounted as a normal - * IO on queueing nor completion. Accounting the containing - * request is enough. + * Account IO completion. flush_rq isn't accounted as a + * normal IO on queueing nor completion. Accounting the + * containing request is enough. 
*/ - if (blk_do_io_stat(req) && req != &req->q->bar_rq) { + if (blk_do_io_stat(req) && req != &req->q->flush_rq) { unsigned long duration = jiffies - req->start_time; const int rw = rq_data_dir(req); struct hd_struct *part; diff --git a/block/blk-flush.c b/block/blk-flush.c index e8b2e5c091b1..dd873225da97 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -9,41 +9,38 @@ #include "blk.h" -static struct request *queue_next_ordseq(struct request_queue *q); +static struct request *queue_next_fseq(struct request_queue *q); -/* - * Cache flushing for ordered writes handling - */ -unsigned blk_ordered_cur_seq(struct request_queue *q) +unsigned blk_flush_cur_seq(struct request_queue *q) { - if (!q->ordseq) + if (!q->flush_seq) return 0; - return 1 << ffz(q->ordseq); + return 1 << ffz(q->flush_seq); } -static struct request *blk_ordered_complete_seq(struct request_queue *q, - unsigned seq, int error) +static struct request *blk_flush_complete_seq(struct request_queue *q, + unsigned seq, int error) { struct request *next_rq = NULL; - if (error && !q->orderr) - q->orderr = error; + if (error && !q->flush_err) + q->flush_err = error; - BUG_ON(q->ordseq & seq); - q->ordseq |= seq; + BUG_ON(q->flush_seq & seq); + q->flush_seq |= seq; - if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) { - /* not complete yet, queue the next ordered sequence */ - next_rq = queue_next_ordseq(q); + if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) { + /* not complete yet, queue the next flush sequence */ + next_rq = queue_next_fseq(q); } else { - /* complete this barrier request */ - __blk_end_request_all(q->orig_bar_rq, q->orderr); - q->orig_bar_rq = NULL; - q->ordseq = 0; - - /* dispatch the next barrier if there's one */ - if (!list_empty(&q->pending_barriers)) { - next_rq = list_entry_rq(q->pending_barriers.next); + /* complete this flush request */ + __blk_end_request_all(q->orig_flush_rq, q->flush_err); + q->orig_flush_rq = NULL; + q->flush_seq = 0; + + /* dispatch the next flush if there's one */ + if (!list_empty(&q->pending_flushes)) { + next_rq = list_entry_rq(q->pending_flushes.next); list_move(&next_rq->queuelist, &q->queue_head); } } @@ -53,19 +50,19 @@ static struct request *blk_ordered_complete_seq(struct request_queue *q, static void pre_flush_end_io(struct request *rq, int error) { elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); + blk_flush_complete_seq(rq->q, QUEUE_FSEQ_PREFLUSH, error); } -static void bar_end_io(struct request *rq, int error) +static void flush_data_end_io(struct request *rq, int error) { elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); + blk_flush_complete_seq(rq->q, QUEUE_FSEQ_DATA, error); } static void post_flush_end_io(struct request *rq, int error) { elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); + blk_flush_complete_seq(rq->q, QUEUE_FSEQ_POSTFLUSH, error); } static void queue_flush(struct request_queue *q, struct request *rq, @@ -74,34 +71,34 @@ static void queue_flush(struct request_queue *q, struct request *rq, blk_rq_init(q, rq); rq->cmd_type = REQ_TYPE_FS; rq->cmd_flags = REQ_FLUSH; - rq->rq_disk = q->orig_bar_rq->rq_disk; + rq->rq_disk = q->orig_flush_rq->rq_disk; rq->end_io = end_io; elv_insert(q, rq, ELEVATOR_INSERT_FRONT); } -static struct request *queue_next_ordseq(struct request_queue *q) +static struct request *queue_next_fseq(struct request_queue *q) { - struct request *rq = &q->bar_rq; + struct request *rq = 
&q->flush_rq; - switch (blk_ordered_cur_seq(q)) { - case QUEUE_ORDSEQ_PREFLUSH: + switch (blk_flush_cur_seq(q)) { + case QUEUE_FSEQ_PREFLUSH: queue_flush(q, rq, pre_flush_end_io); break; - case QUEUE_ORDSEQ_BAR: + case QUEUE_FSEQ_DATA: /* initialize proxy request and queue it */ blk_rq_init(q, rq); - init_request_from_bio(rq, q->orig_bar_rq->bio); + init_request_from_bio(rq, q->orig_flush_rq->bio); rq->cmd_flags &= ~REQ_HARDBARRIER; if (q->ordered & QUEUE_ORDERED_DO_FUA) rq->cmd_flags |= REQ_FUA; - rq->end_io = bar_end_io; + rq->end_io = flush_data_end_io; elv_insert(q, rq, ELEVATOR_INSERT_FRONT); break; - case QUEUE_ORDSEQ_POSTFLUSH: + case QUEUE_FSEQ_POSTFLUSH: queue_flush(q, rq, post_flush_end_io); break; @@ -111,19 +108,20 @@ static struct request *queue_next_ordseq(struct request_queue *q) return rq; } -struct request *blk_do_ordered(struct request_queue *q, struct request *rq) +struct request *blk_do_flush(struct request_queue *q, struct request *rq) { unsigned skip = 0; if (!(rq->cmd_flags & REQ_HARDBARRIER)) return rq; - if (q->ordseq) { + if (q->flush_seq) { /* - * Barrier is already in progress and they can't be - * processed in parallel. Queue for later processing. + * Sequenced flush is already in progress and they + * can't be processed in parallel. Queue for later + * processing. */ - list_move_tail(&rq->queuelist, &q->pending_barriers); + list_move_tail(&rq->queuelist, &q->pending_flushes); return NULL; } @@ -138,11 +136,11 @@ struct request *blk_do_ordered(struct request_queue *q, struct request *rq) } /* - * Start a new ordered sequence + * Start a new flush sequence */ - q->orderr = 0; + q->flush_err = 0; q->ordered = q->next_ordered; - q->ordseq |= QUEUE_ORDSEQ_STARTED; + q->flush_seq |= QUEUE_FSEQ_STARTED; /* * For an empty barrier, there's no actual BAR request, which @@ -154,19 +152,19 @@ struct request *blk_do_ordered(struct request_queue *q, struct request *rq) /* stash away the original request */ blk_dequeue_request(rq); - q->orig_bar_rq = rq; + q->orig_flush_rq = rq; if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) - skip |= QUEUE_ORDSEQ_PREFLUSH; + skip |= QUEUE_FSEQ_PREFLUSH; if (!(q->ordered & QUEUE_ORDERED_DO_BAR)) - skip |= QUEUE_ORDSEQ_BAR; + skip |= QUEUE_FSEQ_DATA; if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH)) - skip |= QUEUE_ORDSEQ_POSTFLUSH; + skip |= QUEUE_FSEQ_POSTFLUSH; /* complete skipped sequences and return the first sequence */ - return blk_ordered_complete_seq(q, skip, 0); + return blk_flush_complete_seq(q, skip, 0); } static void bio_end_empty_barrier(struct bio *bio, int err) diff --git a/block/blk.h b/block/blk.h index 08081e4b294e..24b92bd78f37 100644 --- a/block/blk.h +++ b/block/blk.h @@ -51,7 +51,7 @@ static inline void blk_clear_rq_complete(struct request *rq) */ #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) -struct request *blk_do_ordered(struct request_queue *q, struct request *rq); +struct request *blk_do_flush(struct request_queue *q, struct request *rq); static inline struct request *__elv_next_request(struct request_queue *q) { @@ -60,7 +60,7 @@ static inline struct request *__elv_next_request(struct request_queue *q) while (1) { while (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); - rq = blk_do_ordered(q, rq); + rq = blk_do_flush(q, rq); if (rq) return rq; } -- cgit v1.2.2 From 4fed947cb311e5aa51781d316cefca836352f6ce Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:17 +0200 Subject: block: implement REQ_FLUSH/FUA based interface for FLUSH/FUA requests Now that the backend 
conversion is complete, export sequenced FLUSH/FUA capability through REQ_FLUSH/FUA flags. REQ_FLUSH means the device cache should be flushed before executing the request. REQ_FUA means that the data in the request should be on non-volatile media on completion. Block layer will choose the correct way of implementing the semantics and execute it. The request may be passed to the device directly if the device can handle it; otherwise, it will be sequenced using one or more proxy requests. Devices will never see REQ_FLUSH and/or FUA which it doesn't support. Also, unlike the original REQ_HARDBARRIER, REQ_FLUSH/FUA requests are never failed with -EOPNOTSUPP. If the underlying device doesn't support FLUSH/FUA, the block layer simply make those noop. IOW, it no longer distinguishes between writeback cache which doesn't support cache flush and writethrough/no cache. Devices which have WB cache w/o flush are very difficult to come by these days and there's nothing much we can do anyway, so it doesn't make sense to require everyone to implement -EOPNOTSUPP handling. This will simplify filesystems and block drivers as they can drop -EOPNOTSUPP retry logic for barriers. * QUEUE_ORDERED_* are removed and QUEUE_FSEQ_* are moved into blk-flush.c. * REQ_FLUSH w/o data can also be directly passed to drivers without sequencing but some drivers assume that zero length requests don't have rq->bio which isn't true for these requests requiring the use of proxy requests. * REQ_COMMON_MASK now includes REQ_FLUSH | REQ_FUA so that they are copied from bio to request. * WRITE_BARRIER is marked deprecated and WRITE_FLUSH, WRITE_FUA and WRITE_FLUSH_FUA are added. Signed-off-by: Tejun Heo Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-flush.c | 85 ++++++++++++++++++++++++++++++------------------------- block/blk.h | 3 ++ 3 files changed, 50 insertions(+), 40 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 8870ae40179d..18455c4f618a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1204,7 +1204,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) spin_lock_irq(q->queue_lock); - if (bio->bi_rw & REQ_HARDBARRIER) { + if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { where = ELEVATOR_INSERT_FRONT; goto get_rq; } diff --git a/block/blk-flush.c b/block/blk-flush.c index dd873225da97..452c552e9ead 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -1,5 +1,5 @@ /* - * Functions related to barrier IO handling + * Functions to sequence FLUSH and FUA writes. 
*/ #include #include @@ -9,6 +9,15 @@ #include "blk.h" +/* FLUSH/FUA sequences */ +enum { + QUEUE_FSEQ_STARTED = (1 << 0), /* flushing in progress */ + QUEUE_FSEQ_PREFLUSH = (1 << 1), /* pre-flushing in progress */ + QUEUE_FSEQ_DATA = (1 << 2), /* data write in progress */ + QUEUE_FSEQ_POSTFLUSH = (1 << 3), /* post-flushing in progress */ + QUEUE_FSEQ_DONE = (1 << 4), +}; + static struct request *queue_next_fseq(struct request_queue *q); unsigned blk_flush_cur_seq(struct request_queue *q) @@ -79,6 +88,7 @@ static void queue_flush(struct request_queue *q, struct request *rq, static struct request *queue_next_fseq(struct request_queue *q) { + struct request *orig_rq = q->orig_flush_rq; struct request *rq = &q->flush_rq; switch (blk_flush_cur_seq(q)) { @@ -87,12 +97,11 @@ static struct request *queue_next_fseq(struct request_queue *q) break; case QUEUE_FSEQ_DATA: - /* initialize proxy request and queue it */ + /* initialize proxy request, inherit FLUSH/FUA and queue it */ blk_rq_init(q, rq); - init_request_from_bio(rq, q->orig_flush_rq->bio); - rq->cmd_flags &= ~REQ_HARDBARRIER; - if (q->ordered & QUEUE_ORDERED_DO_FUA) - rq->cmd_flags |= REQ_FUA; + init_request_from_bio(rq, orig_rq->bio); + rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA); + rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA); rq->end_io = flush_data_end_io; elv_insert(q, rq, ELEVATOR_INSERT_FRONT); @@ -110,60 +119,58 @@ static struct request *queue_next_fseq(struct request_queue *q) struct request *blk_do_flush(struct request_queue *q, struct request *rq) { + unsigned int fflags = q->flush_flags; /* may change, cache it */ + bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA; + bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH); + bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA); unsigned skip = 0; - if (!(rq->cmd_flags & REQ_HARDBARRIER)) + /* + * Special case. If there's data but flush is not necessary, + * the request can be issued directly. + * + * Flush w/o data should be able to be issued directly too but + * currently some drivers assume that rq->bio contains + * non-zero data if it isn't NULL and empty FLUSH requests + * getting here usually have bio's without data. + */ + if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) { + rq->cmd_flags &= ~REQ_FLUSH; + if (!has_fua) + rq->cmd_flags &= ~REQ_FUA; return rq; + } + /* + * Sequenced flushes can't be processed in parallel. If + * another one is already in progress, queue for later + * processing. + */ if (q->flush_seq) { - /* - * Sequenced flush is already in progress and they - * can't be processed in parallel. Queue for later - * processing. - */ list_move_tail(&rq->queuelist, &q->pending_flushes); return NULL; } - if (unlikely(q->next_ordered == QUEUE_ORDERED_NONE)) { - /* - * Queue ordering not supported. Terminate - * with prejudice. - */ - blk_dequeue_request(rq); - __blk_end_request_all(rq, -EOPNOTSUPP); - return NULL; - } - /* * Start a new flush sequence */ q->flush_err = 0; - q->ordered = q->next_ordered; q->flush_seq |= QUEUE_FSEQ_STARTED; - /* - * For an empty barrier, there's no actual BAR request, which - * in turn makes POSTFLUSH unnecessary. Mask them off. 
- */ - if (!blk_rq_sectors(rq)) - q->ordered &= ~(QUEUE_ORDERED_DO_BAR | - QUEUE_ORDERED_DO_POSTFLUSH); - - /* stash away the original request */ + /* adjust FLUSH/FUA of the original request and stash it away */ + rq->cmd_flags &= ~REQ_FLUSH; + if (!has_fua) + rq->cmd_flags &= ~REQ_FUA; blk_dequeue_request(rq); q->orig_flush_rq = rq; - if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) + /* skip unneded sequences and return the first one */ + if (!do_preflush) skip |= QUEUE_FSEQ_PREFLUSH; - - if (!(q->ordered & QUEUE_ORDERED_DO_BAR)) + if (!blk_rq_sectors(rq)) skip |= QUEUE_FSEQ_DATA; - - if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH)) + if (!do_postflush) skip |= QUEUE_FSEQ_POSTFLUSH; - - /* complete skipped sequences and return the first sequence */ return blk_flush_complete_seq(q, skip, 0); } diff --git a/block/blk.h b/block/blk.h index 24b92bd78f37..a09c18b19116 100644 --- a/block/blk.h +++ b/block/blk.h @@ -60,6 +60,9 @@ static inline struct request *__elv_next_request(struct request_queue *q) while (1) { while (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); + if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) || + rq == &q->flush_rq) + return rq; rq = blk_do_flush(q, rq); if (rq) return rq; -- cgit v1.2.2 From 1e87901e189c8f01750d67485009fe3827c691bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:17 +0200 Subject: block: filter flush bio's in __generic_make_request() There are a number of make_request based drivers which don't support cache flushes. Filter out flush bio's in __generic_make_request() so that they don't have to worry about them. All FLUSH/FUA requests with data are converted to regular IO requests and empty ones are completed immediately. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 18455c4f618a..495bdc4a23da 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1509,6 +1509,19 @@ static inline void __generic_make_request(struct bio *bio) if (bio_check_eod(bio, nr_sectors)) goto end_io; + /* + * Filter flush bio's early so that make_request based + * drivers without flush support don't have to worry + * about them. + */ + if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { + bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); + if (!nr_sectors) { + err = 0; + goto end_io; + } + } + if ((bio->bi_rw & REQ_DISCARD) && (!blk_queue_discard(q) || ((bio->bi_rw & REQ_SECURE) && -- cgit v1.2.2 From cde4c406d8fb051c5aafc917643adbb9dbd0abc2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Sep 2010 11:56:17 +0200 Subject: block: simplify queue_next_fseq We need to call blk_rq_init and elv_insert for all cases in queue_next_fseq, so take these calls into common code. Also move the end_io initialization from queue_flush into queue_next_fseq and rename queue_flush to init_flush_request now that it's old name doesn't apply anymore. 
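Condensed from the hunk below, the refactored function ends up with one common prologue and epilogue around a switch that only fills in what differs per sequence step; the body here is lifted from the patch itself rather than written independently:

static struct request *queue_next_fseq(struct request_queue *q)
{
        struct request *orig_rq = q->orig_flush_rq;
        struct request *rq = &q->flush_rq;

        blk_rq_init(q, rq);                     /* common to every step */

        switch (blk_flush_cur_seq(q)) {
        case QUEUE_FSEQ_PREFLUSH:
                init_flush_request(rq, orig_rq->rq_disk);
                rq->end_io = pre_flush_end_io;
                break;
        case QUEUE_FSEQ_DATA:
                /* proxy data request inheriting FLUSH/FUA from orig_rq */
                init_request_from_bio(rq, orig_rq->bio);
                rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA);
                rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA);
                rq->end_io = flush_data_end_io;
                break;
        case QUEUE_FSEQ_POSTFLUSH:
                init_flush_request(rq, orig_rq->rq_disk);
                rq->end_io = post_flush_end_io;
                break;
        default:
                BUG();
        }

        elv_insert(q, rq, ELEVATOR_INSERT_FRONT);       /* common tail */
        return rq;
}
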
Signed-off-by: Christoph Hellwig Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-flush.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 452c552e9ead..72905f862d31 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -74,16 +74,11 @@ static void post_flush_end_io(struct request *rq, int error) blk_flush_complete_seq(rq->q, QUEUE_FSEQ_POSTFLUSH, error); } -static void queue_flush(struct request_queue *q, struct request *rq, - rq_end_io_fn *end_io) +static void init_flush_request(struct request *rq, struct gendisk *disk) { - blk_rq_init(q, rq); rq->cmd_type = REQ_TYPE_FS; rq->cmd_flags = REQ_FLUSH; - rq->rq_disk = q->orig_flush_rq->rq_disk; - rq->end_io = end_io; - - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); + rq->rq_disk = disk; } static struct request *queue_next_fseq(struct request_queue *q) @@ -91,29 +86,28 @@ static struct request *queue_next_fseq(struct request_queue *q) struct request *orig_rq = q->orig_flush_rq; struct request *rq = &q->flush_rq; + blk_rq_init(q, rq); + switch (blk_flush_cur_seq(q)) { case QUEUE_FSEQ_PREFLUSH: - queue_flush(q, rq, pre_flush_end_io); + init_flush_request(rq, orig_rq->rq_disk); + rq->end_io = pre_flush_end_io; break; - case QUEUE_FSEQ_DATA: - /* initialize proxy request, inherit FLUSH/FUA and queue it */ - blk_rq_init(q, rq); init_request_from_bio(rq, orig_rq->bio); rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA); rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA); rq->end_io = flush_data_end_io; - - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); break; - case QUEUE_FSEQ_POSTFLUSH: - queue_flush(q, rq, post_flush_end_io); + init_flush_request(rq, orig_rq->rq_disk); + rq->end_io = post_flush_end_io; break; - default: BUG(); } + + elv_insert(q, rq, ELEVATOR_INSERT_FRONT); return rq; } -- cgit v1.2.2 From 337238be1bf52e1242f940fc6fe83fb395e55057 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:17 +0200 Subject: block: initialize flush request with WRITE_FLUSH instead of REQ_FLUSH init_flush_request() only set REQ_FLUSH when initializing flush requests making them READ requests. Use WRITE_FLUSH instead. Signed-off-by: Tejun Heo Reported-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 72905f862d31..f357f1fc411c 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -77,7 +77,7 @@ static void post_flush_end_io(struct request *rq, int error) static void init_flush_request(struct request *rq, struct gendisk *disk) { rq->cmd_type = REQ_TYPE_FS; - rq->cmd_flags = REQ_FLUSH; + rq->cmd_flags = WRITE_FLUSH; rq->rq_disk = disk; } -- cgit v1.2.2 From 47f70d5a6ca78c40a1c799d43506efbfed914f7b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:17 +0200 Subject: block: kick queue after sequencing REQ_FLUSH/FUA While completing a request from a REQ_FLUSH/FUA sequence, another request can be pushed to the request queue. If a driver tests elv_queue_empty() before completing a request and runs the queue again only if the queue wasn't empty, this may lead to hang. Please note that most drivers either kick the queue unconditionally or test queue emptiness after completing the current request and don't have this problem. 
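As an illustration of the pattern described above, consider a completion path of the following shape. The function and its name are made up; elv_queue_empty(), __blk_end_request_all() and __blk_run_queue() are the real interfaces used elsewhere in this series, and queue_lock is assumed to be held:

static void hypothetical_driver_complete(struct request_queue *q,
                                         struct request *rq, int error)
{
        /* queue state is sampled before the request is completed */
        bool was_empty = elv_queue_empty(q);

        __blk_end_request_all(rq, error);

        /*
         * If completing rq made the flush machinery move the next
         * sequence request onto the previously empty queue_head,
         * this test misses it and nothing restarts the queue.
         */
        if (!was_empty)
                __blk_run_queue(q);
}
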
This patch removes this possibility by making REQ_FLUSH/FUA sequence code kick the queue if the queue was empty before completing a request from REQ_FLUSH/FUA sequence. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-flush.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index f357f1fc411c..cb4c8440a1fc 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -56,22 +56,38 @@ static struct request *blk_flush_complete_seq(struct request_queue *q, return next_rq; } +static void blk_flush_complete_seq_end_io(struct request_queue *q, + unsigned seq, int error) +{ + bool was_empty = elv_queue_empty(q); + struct request *next_rq; + + next_rq = blk_flush_complete_seq(q, seq, error); + + /* + * Moving a request silently to empty queue_head may stall the + * queue. Kick the queue in those cases. + */ + if (was_empty && next_rq) + __blk_run_queue(q); +} + static void pre_flush_end_io(struct request *rq, int error) { elv_completed_request(rq->q, rq); - blk_flush_complete_seq(rq->q, QUEUE_FSEQ_PREFLUSH, error); + blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error); } static void flush_data_end_io(struct request *rq, int error) { elv_completed_request(rq->q, rq); - blk_flush_complete_seq(rq->q, QUEUE_FSEQ_DATA, error); + blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error); } static void post_flush_end_io(struct request *rq, int error) { elv_completed_request(rq->q, rq); - blk_flush_complete_seq(rq->q, QUEUE_FSEQ_POSTFLUSH, error); + blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error); } static void init_flush_request(struct request *rq, struct gendisk *disk) -- cgit v1.2.2 From 09d60c701b64b509f328cac72970eb894f485b9e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:17 +0200 Subject: block: make sure FSEQ_DATA request has the same rq_disk as the original rq->rq_disk and bio->bi_bdev->bd_disk may differ if a request has passed through remapping drivers. FSEQ_DATA request incorrectly followed bio->bi_bdev->bd_disk ending up being issued w/ mismatching rq_disk. Make it follow orig_rq->rq_disk. Signed-off-by: Tejun Heo Reported-by: Kiyoshi Ueda Tested-by: Kiyoshi Ueda Signed-off-by: Jens Axboe --- block/blk-flush.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index cb4c8440a1fc..7d1fc982e78f 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -111,6 +111,13 @@ static struct request *queue_next_fseq(struct request_queue *q) break; case QUEUE_FSEQ_DATA: init_request_from_bio(rq, orig_rq->bio); + /* + * orig_rq->rq_disk may be different from + * bio->bi_bdev->bd_disk if orig_rq got here through + * remapping drivers. Make sure rq->rq_disk points + * to the same one as orig_rq. + */ + rq->rq_disk = orig_rq->rq_disk; rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA); rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA); rq->end_io = flush_data_end_io; -- cgit v1.2.2 From d391a2dda2f1c993f094bdb3a8a342c5e0546553 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:17 +0200 Subject: block: use REQ_FLUSH in blkdev_issue_flush() Update blkdev_issue_flush() to use new REQ_FLUSH interface. 
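Callers keep the same interface; the flush is now implemented with an empty WRITE_FLUSH bio instead of a barrier and can no longer come back with -EOPNOTSUPP. A minimal, hypothetical caller, using the BLKDEV_IFL_WAIT flag that also shows up in the blk-lib.c hunks later in this series:

/* flush the write cache of the device behind bdev and wait for it */
static int hypothetical_sync_cache(struct block_device *bdev)
{
        return blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
}
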
Signed-off-by: Tejun Heo Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-flush.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 7d1fc982e78f..62b7df9bca9d 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -191,13 +191,10 @@ struct request *blk_do_flush(struct request_queue *q, struct request *rq) return blk_flush_complete_seq(q, skip, 0); } -static void bio_end_empty_barrier(struct bio *bio, int err) +static void bio_end_flush(struct bio *bio, int err) { - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + if (err) clear_bit(BIO_UPTODATE, &bio->bi_flags); - } if (bio->bi_private) complete(bio->bi_private); bio_put(bio); @@ -235,19 +232,19 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, * some block devices may not have their queue correctly set up here * (e.g. loop device without a backing file) and so issuing a flush * here will panic. Ensure there is a request function before issuing - * the barrier. + * the flush. */ if (!q->make_request_fn) return -ENXIO; bio = bio_alloc(gfp_mask, 0); - bio->bi_end_io = bio_end_empty_barrier; + bio->bi_end_io = bio_end_flush; bio->bi_bdev = bdev; if (test_bit(BLKDEV_WAIT, &flags)) bio->bi_private = &wait; bio_get(bio); - submit_bio(WRITE_BARRIER, bio); + submit_bio(WRITE_FLUSH, bio); if (test_bit(BLKDEV_WAIT, &flags)) { wait_for_completion(&wait); /* @@ -259,9 +256,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, *error_sector = bio->bi_sector; } - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) + if (!bio_flagged(bio, BIO_UPTODATE)) ret = -EIO; bio_put(bio); -- cgit v1.2.2 From 3a2edd0d6ddbd5fa3b389ea6db811285415ce6c8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Sep 2010 11:56:18 +0200 Subject: block: make __blk_rq_prep_clone() copy most command flags Currently __blk_rq_prep_clone() copies only REQ_WRITE and REQ_DISCARD. There's no reason to omit other command flags and REQ_FUA needs to be copied to implement FUA support in request-based dm. REQ_COMMON_MASK which specifies flags to be copied from bio to request already identifies all the command flags. Define REQ_CLONE_MASK to be the same as REQ_COMMON_MASK for clarity and make __blk_rq_prep_clone() copy all flags in the mask. Signed-off-by: Tejun Heo Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 495bdc4a23da..2a5b19204546 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2505,9 +2505,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); static void __blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; - dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE); - if (src->cmd_flags & REQ_DISCARD) - dst->cmd_flags |= REQ_DISCARD; + dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); -- cgit v1.2.2 From 8c5553678237b7121355108e03c36086037d8975 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2010 05:29:22 -0400 Subject: block: remove the BLKDEV_IFL_BARRIER flag Remove support for barriers on discards, which is unused now. 
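From the caller's side only the dead barrier flag disappears; discards, secure or not, are still requested the same way, as the blk-lib.c hunk below shows. A sketch only, with a made-up helper name; BLKDEV_IFL_WAIT and BLKDEV_IFL_SECURE are the flags used in that hunk:

/* discard a range, securely if requested, and wait for completion */
static int hypothetical_discard(struct block_device *bdev, sector_t sector,
                                sector_t nr_sects, bool secure)
{
        unsigned long flags = BLKDEV_IFL_WAIT;

        if (secure)
                flags |= BLKDEV_IFL_SECURE;

        return blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL, flags);
}
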
Also remove the DISCARD_NOBARRIER I/O type in favour of just setting the rw flags up locally in blkdev_issue_discard. tj: Also remove DISCARD_SECURE and use REQ_SECURE directly. Signed-off-by: Christoph Hellwig Acked-by: Mike Snitzer Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-lib.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index c392029a104e..fe2e6ed0f510 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -39,8 +39,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, { DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q = bdev_get_queue(bdev); - int type = flags & BLKDEV_IFL_BARRIER ? - DISCARD_BARRIER : DISCARD_NOBARRIER; + int type = REQ_WRITE | REQ_DISCARD; unsigned int max_discard_sectors; struct bio *bio; int ret = 0; @@ -65,7 +64,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (flags & BLKDEV_IFL_SECURE) { if (!blk_queue_secdiscard(q)) return -EOPNOTSUPP; - type |= DISCARD_SECURE; + type |= REQ_SECURE; } while (nr_sects && !ret) { @@ -162,12 +161,6 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, bb.wait = &wait; bb.end_io = NULL; - if (flags & BLKDEV_IFL_BARRIER) { - /* issue async barrier before the data */ - ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0); - if (ret) - return ret; - } submit: ret = 0; while (nr_sects != 0) { @@ -199,13 +192,6 @@ submit: issued++; submit_bio(WRITE, bio); } - /* - * When all data bios are in flight. Send final barrier if requeted. - */ - if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER) - ret = blkdev_issue_flush(bdev, gfp_mask, NULL, - flags & BLKDEV_IFL_WAIT); - if (flags & BLKDEV_IFL_WAIT) /* Wait for bios in-flight */ -- cgit v1.2.2 From c8bf1336824ebd698d37b71763e1c43190f2229a Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 10 Sep 2010 20:07:38 +0200 Subject: Consolidate min_not_zero We have several users of min_not_zero, each of them using their own definition. Move the define to kernel.h. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index a234f4bf1d6f..8d592b559bd3 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -455,11 +455,6 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt) } EXPORT_SYMBOL(blk_queue_io_opt); -/* - * Returns the minimum that is _not_ zero, unless both are zero. - */ -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) - /** * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers * @t: the stacking driver (top) -- cgit v1.2.2 From 13f05c8d8e98bbdce89158bfdb2e380940695a88 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 10 Sep 2010 20:50:10 +0200 Subject: block/scsi: Provide a limit on the number of integrity segments Some controllers have a hardware limit on the number of protection information scatter-gather list segments they can handle. Introduce a max_integrity_segments limit in the block layer and provide a new scsi_host_template setting that allows HBA drivers to provide a value suitable for the hardware. Add support for honoring the integrity segment limit when merging both bios and requests. Signed-off-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/blk-integrity.c | 93 +++++++++++++++++++++++++++++++++++++++------------ block/blk-merge.c | 23 ++++++++----- block/blk-settings.c | 3 ++ block/blk-sysfs.c | 11 ++++++ block/blk.h | 8 ----- 5 files changed, 100 insertions(+), 38 deletions(-) (limited to 'block') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index edce1ef7933d..885cbb59967e 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -32,24 +32,37 @@ static struct kmem_cache *integrity_cachep; /** * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements - * @rq: request with integrity metadata attached + * @q: request queue + * @bio: bio with integrity metadata attached * * Description: Returns the number of elements required in a - * scatterlist corresponding to the integrity metadata in a request. + * scatterlist corresponding to the integrity metadata in a bio. */ -int blk_rq_count_integrity_sg(struct request *rq) +int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio) { - struct bio_vec *iv, *ivprv; - struct req_iterator iter; - unsigned int segments; + struct bio_vec *iv, *ivprv = NULL; + unsigned int segments = 0; + unsigned int seg_size = 0; + unsigned int i = 0; - ivprv = NULL; - segments = 0; + bio_for_each_integrity_vec(iv, bio, i) { - rq_for_each_integrity_segment(iv, rq, iter) { + if (ivprv) { + if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + goto new_segment; + + if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) + goto new_segment; + + if (seg_size + iv->bv_len > queue_max_segment_size(q)) + goto new_segment; - if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + seg_size += iv->bv_len; + } else { +new_segment: segments++; + seg_size = iv->bv_len; + } ivprv = iv; } @@ -60,30 +73,34 @@ EXPORT_SYMBOL(blk_rq_count_integrity_sg); /** * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist - * @rq: request with integrity metadata attached + * @q: request queue + * @bio: bio with integrity metadata attached * @sglist: target scatterlist * * Description: Map the integrity vectors in request into a * scatterlist. The scatterlist must be big enough to hold all * elements. I.e. sized using blk_rq_count_integrity_sg(). 
*/ -int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) +int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, + struct scatterlist *sglist) { - struct bio_vec *iv, *ivprv; - struct req_iterator iter; - struct scatterlist *sg; - unsigned int segments; + struct bio_vec *iv, *ivprv = NULL; + struct scatterlist *sg = NULL; + unsigned int segments = 0; + unsigned int i = 0; - ivprv = NULL; - sg = NULL; - segments = 0; - - rq_for_each_integrity_segment(iv, rq, iter) { + bio_for_each_integrity_vec(iv, bio, i) { if (ivprv) { if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) goto new_segment; + if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) + goto new_segment; + + if (sg->length + iv->bv_len > queue_max_segment_size(q)) + goto new_segment; + sg->length += iv->bv_len; } else { new_segment: @@ -162,6 +179,40 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) } EXPORT_SYMBOL(blk_integrity_compare); +int blk_integrity_merge_rq(struct request_queue *q, struct request *req, + struct request *next) +{ + if (blk_integrity_rq(req) != blk_integrity_rq(next)) + return -1; + + if (req->nr_integrity_segments + next->nr_integrity_segments > + q->limits.max_integrity_segments) + return -1; + + return 0; +} +EXPORT_SYMBOL(blk_integrity_merge_rq); + +int blk_integrity_merge_bio(struct request_queue *q, struct request *req, + struct bio *bio) +{ + int nr_integrity_segs; + struct bio *next = bio->bi_next; + + bio->bi_next = NULL; + nr_integrity_segs = blk_rq_count_integrity_sg(q, bio); + bio->bi_next = next; + + if (req->nr_integrity_segments + nr_integrity_segs > + q->limits.max_integrity_segments) + return -1; + + req->nr_integrity_segments += nr_integrity_segs; + + return 0; +} +EXPORT_SYMBOL(blk_integrity_merge_bio); + struct integrity_sysfs_entry { struct attribute attr; ssize_t (*show)(struct blk_integrity *, char *); diff --git a/block/blk-merge.c b/block/blk-merge.c index 3b0cd4249671..6a725461654d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -205,12 +205,11 @@ static inline int ll_new_hw_segment(struct request_queue *q, { int nr_phys_segs = bio_phys_segments(q, bio); - if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) { - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; - return 0; - } + if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) + goto no_merge; + + if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio)) + goto no_merge; /* * This will form the start of a new hw segment. Bump both @@ -218,6 +217,12 @@ static inline int ll_new_hw_segment(struct request_queue *q, */ req->nr_phys_segments += nr_phys_segs; return 1; + +no_merge: + req->cmd_flags |= REQ_NOMERGE; + if (req == q->last_merge) + q->last_merge = NULL; + return 0; } int ll_back_merge_fn(struct request_queue *q, struct request *req, @@ -301,6 +306,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (total_phys_segments > queue_max_segments(q)) return 0; + if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next)) + return 0; + /* Merge is OK... */ req->nr_phys_segments = total_phys_segments; return 1; @@ -372,9 +380,6 @@ static int attempt_merge(struct request_queue *q, struct request *req, || next->special) return 0; - if (blk_integrity_rq(req) != blk_integrity_rq(next)) - return 0; - /* * If we are allowed to merge, then append bio list * from next to rq and release next. 
merge_requests_fn diff --git a/block/blk-settings.c b/block/blk-settings.c index 8d592b559bd3..f8f2ddf20613 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -111,6 +111,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy); void blk_set_default_limits(struct queue_limits *lim) { lim->max_segments = BLK_MAX_SEGMENTS; + lim->max_integrity_segments = 0; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = BLK_DEF_MAX_SECTORS; @@ -509,6 +510,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->seg_boundary_mask); t->max_segments = min_not_zero(t->max_segments, b->max_segments); + t->max_integrity_segments = min_not_zero(t->max_integrity_segments, + b->max_integrity_segments); t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 001ab18078f5..b014f7739e98 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -112,6 +112,11 @@ static ssize_t queue_max_segments_show(struct request_queue *q, char *page) return queue_var_show(queue_max_segments(q), (page)); } +static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.max_integrity_segments, (page)); +} + static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) { if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) @@ -288,6 +293,11 @@ static struct queue_sysfs_entry queue_max_segments_entry = { .show = queue_max_segments_show, }; +static struct queue_sysfs_entry queue_max_integrity_segments_entry = { + .attr = {.name = "max_integrity_segments", .mode = S_IRUGO }, + .show = queue_max_integrity_segments_show, +}; + static struct queue_sysfs_entry queue_max_segment_size_entry = { .attr = {.name = "max_segment_size", .mode = S_IRUGO }, .show = queue_max_segment_size_show, @@ -375,6 +385,7 @@ static struct attribute *default_attrs[] = { &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, &queue_max_segments_entry.attr, + &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, diff --git a/block/blk.h b/block/blk.h index 6e7dc87141e4..6738831ba447 100644 --- a/block/blk.h +++ b/block/blk.h @@ -132,14 +132,6 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -#define rq_for_each_integrity_segment(bvl, _rq, _iter) \ - __rq_for_each_bio(_iter.bio, _rq) \ - bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i) - -#endif /* BLK_DEV_INTEGRITY */ - static inline int blk_cpu_to_group(int cpu) { #ifdef CONFIG_SCHED_MC -- cgit v1.2.2 From 8dcbdc742fec9e6c542ff9cb599d2ee620753d07 Mon Sep 17 00:00:00 2001 From: San Mehat Date: Tue, 14 Sep 2010 08:48:01 +0200 Subject: block: block_dump: Add number of sectors to debug output Signed-off-by: San Mehat Signed-off-by: Linus Walleij Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index ee1a1e7e63cc..8d07c1b7e701 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1612,11 +1612,12 @@ void submit_bio(int rw, struct bio *bio) if (unlikely(block_dump)) { char b[BDEVNAME_SIZE]; - printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", + printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", current->comm, task_pid_nr(current), (rw & WRITE) ? 
"WRITE" : "READ", (unsigned long long)bio->bi_sector, - bdevname(bio->bi_bdev, b)); + bdevname(bio->bi_bdev, b), + count); } } -- cgit v1.2.2 From 144177991ca624841ddbd1e7edff958fc0f6d1fe Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 15 Sep 2010 13:08:27 +0200 Subject: block: fix an address space warning in blk-map.c Change type of 2nd parameter of blk_rq_aligned() into unsigned long and remove unnecessary casting. Now we can call it with 'uaddr' instead of 'ubuf' in __blk_rq_map_user() so that it can remove following warnings from sparse: block/blk-map.c:57:31: warning: incorrect type in argument 2 (different address spaces) block/blk-map.c:57:31: expected void *addr block/blk-map.c:57:31: got void [noderef] *ubuf However blk_rq_map_kern() needs one more local variable to handle it. Signed-off-by: Namhyung Kim Signed-off-by: Jens Axboe --- block/blk-map.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index c65d7593f7f1..ac0f7d46db63 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -54,7 +54,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq, * direct dma. else, set up kernel bounce buffers */ uaddr = (unsigned long) ubuf; - if (blk_rq_aligned(q, ubuf, len) && !map_data) + if (blk_rq_aligned(q, uaddr, len) && !map_data) bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); else bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); @@ -288,6 +288,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, unsigned int len, gfp_t gfp_mask) { int reading = rq_data_dir(rq) == READ; + unsigned long addr = (unsigned long) kbuf; int do_copy = 0; struct bio *bio; int ret; @@ -297,7 +298,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (!len || !kbuf) return -EINVAL; - do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf); + do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf); if (do_copy) bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); else -- cgit v1.2.2 From 6d1d8050b4bc89d0165d29b58e894aeba2564a97 Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Tue, 31 Aug 2010 15:47:05 -0500 Subject: block, partition: add partition_meta_info to hd_struct I'm reposting this patch series as v4 since there have been no additional comments, and I cleaned up one extra bit of unneeded code (in 3/3). The patches are against Linus's tree: 2bfc96a127bc1cc94d26bfaa40159966064f9c8c (2.6.36-rc3). Would this patchset be suitable for inclusion in an mm branch? This changes adds a partition_meta_info struct which itself contains a union of structures that provide partition table specific metadata. This change leaves the union empty. The subsequent patch includes an implementation for CONFIG_EFI_PARTITION-based metadata. 
Signed-off-by: Will Drewry Signed-off-by: Jens Axboe --- block/genhd.c | 1 + block/ioctl.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 59a2db6fecef..c8da12055264 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1004,6 +1004,7 @@ static void disk_release(struct device *dev) kfree(disk->random); disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); + free_part_info(&disk->part0); kfree(disk); } struct class block_class = { diff --git a/block/ioctl.c b/block/ioctl.c index d8052f0dabd3..2c15fe0912c4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -62,7 +62,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user /* all seems OK */ part = add_partition(disk, partno, start, length, - ADDPART_FLAG_NONE); + ADDPART_FLAG_NONE, NULL); mutex_unlock(&bdev->bd_mutex); return IS_ERR(part) ? PTR_ERR(part) : 0; case BLKPG_DEL_PARTITION: -- cgit v1.2.2 From b5af921ec02333e943efb59aca4f56b78fc0e100 Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Tue, 31 Aug 2010 15:47:07 -0500 Subject: init: add support for root devices specified by partition UUID This is the third patch in a series which adds support for storing partition metadata, optionally, off of the hd_struct. One major use for that data is being able to resolve partition by other identities than just the index on a block device. Device enumeration varies by platform and there's a benefit to being able to use something like EFI GPT's GUIDs to determine the correct block device and partition to mount as the root. This change adds that support to root= by adding support for the following syntax: root=PARTUUID=hex-uuid Signed-off-by: Will Drewry Signed-off-by: Jens Axboe --- block/genhd.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index c8da12055264..5c9c503de423 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -642,6 +642,7 @@ void __init printk_all_partitions(void) struct hd_struct *part; char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; + u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1]; /* * Don't show empty devices or things that have been @@ -660,10 +661,14 @@ void __init printk_all_partitions(void) while ((part = disk_part_iter_next(&piter))) { bool is_part0 = part == &disk->part0; - printk("%s%s %10llu %s", is_part0 ? "" : " ", + uuid[0] = 0; + if (part->info) + part_unpack_uuid(part->info->uuid, uuid); + + printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), (unsigned long long)part->nr_sects >> 1, - disk_name(disk, part->partno, name_buf)); + disk_name(disk, part->partno, name_buf), uuid); if (is_part0) { if (disk->driverfs_dev != NULL && disk->driverfs_dev->driver != NULL) -- cgit v1.2.2 From af41d7bd9b685ab4e8f930627874ba4f4728e128 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:32 -0400 Subject: blk-cgroup: Kill the header printed at the start of blkio.weight_device file o Kill extra "dev weight" header which is printed when somebody reads blkio.weight_device file. This really seems to be out of convention. No other blkio files are printing any header at the start of file. I think it is ok to just print values and how to interpret values should be part of documentation. 
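Concretely, with a hypothetical weight of 300 set for device 8:16, a read of blkio.weight_device used to return:

        dev     weight
        8:16    300

and now returns only:

        8:16    300

(the device number and weight are made up; the "%u:%u\t%u" line format is the one used by the proportional-weight code later in this series).
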
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a6809645d212..c1a39d90d14a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -811,8 +811,6 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, struct blkio_cgroup *blkcg; struct blkio_policy_node *pn; - seq_printf(m, "dev\tweight\n"); - blkcg = cgroup_to_blkio_cgroup(cgrp); if (!list_empty(&blkcg->policy_list)) { spin_lock_irq(&blkcg->lock); -- cgit v1.2.2 From 062a644d6121d5e2f51c0b2ca0cbc5155ebf845b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:33 -0400 Subject: blk-cgroup: Prepare the base for supporting more than one IO control policies o This patch prepares the base for introducing new IO control policies. Currently all the code is written knowing there is only one policy and that is proportional bandwidth. Creating infrastructure for newer policies to come in. o Also there were many functions which were generated using macro. It was very confusing. Got rid of those. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 544 +++++++++++++++++++++++++++++++++++++--------------- block/blk-cgroup.h | 36 +++- block/cfq-iosched.c | 1 + block/cfq.h | 2 +- 4 files changed, 429 insertions(+), 154 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c1a39d90d14a..7762987fdf9e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -37,6 +37,12 @@ static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); +/* for encoding cft->private value on file */ +#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val)) +/* What policy owns the file, proportional or throttle */ +#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff) +#define BLKIOFILE_ATTR(val) ((val) & 0xffff) + struct cgroup_subsys blkio_subsys = { .name = "blkio", .create = blkiocg_create, @@ -59,6 +65,27 @@ static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, list_add(&pn->node, &blkcg->policy_list); } +static inline bool cftype_blkg_same_policy(struct cftype *cft, + struct blkio_group *blkg) +{ + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + + if (blkg->plid == plid) + return 1; + + return 0; +} + +/* Determines if policy node matches cgroup file being accessed */ +static inline bool pn_matches_cftype(struct cftype *cft, + struct blkio_policy_node *pn) +{ + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int fileid = BLKIOFILE_ATTR(cft->private); + + return (plid == pn->plid && fileid == pn->fileid); +} + /* Must be called with blkcg->lock held */ static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) { @@ -67,12 +94,13 @@ static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) /* Must be called with blkcg->lock held */ static struct blkio_policy_node * -blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) +blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, + enum blkio_policy_id plid, int fileid) { struct blkio_policy_node *pn; list_for_each_entry(pn, &blkcg->policy_list, node) { - if (pn->dev == dev) + if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid) return pn; } @@ -86,6 +114,20 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct 
cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); +static inline void +blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) +{ + struct blkio_policy_type *blkiop; + + list_for_each_entry(blkiop, &blkio_list, list) { + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + if (blkiop->ops.blkio_update_group_weight_fn) + blkiop->ops.blkio_update_group_weight_fn(blkg, weight); + } +} + /* * Add to the appropriate stat variable depending on the request type. * This should be called with the blkg->stats_lock held. @@ -341,7 +383,8 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid) { unsigned long flags; @@ -350,6 +393,7 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, rcu_assign_pointer(blkg->key, key); blkg->blkcg_id = css_id(&blkcg->css); hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + blkg->plid = plid; spin_unlock_irqrestore(&blkcg->lock, flags); /* Need to take css reference ? */ cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); @@ -408,51 +452,6 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) } EXPORT_SYMBOL_GPL(blkiocg_lookup_group); -#define SHOW_FUNCTION(__VAR) \ -static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \ - struct cftype *cftype) \ -{ \ - struct blkio_cgroup *blkcg; \ - \ - blkcg = cgroup_to_blkio_cgroup(cgroup); \ - return (u64)blkcg->__VAR; \ -} - -SHOW_FUNCTION(weight); -#undef SHOW_FUNCTION - -static int -blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) -{ - struct blkio_cgroup *blkcg; - struct blkio_group *blkg; - struct hlist_node *n; - struct blkio_policy_type *blkiop; - struct blkio_policy_node *pn; - - if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) - return -EINVAL; - - blkcg = cgroup_to_blkio_cgroup(cgroup); - spin_lock(&blkio_list_lock); - spin_lock_irq(&blkcg->lock); - blkcg->weight = (unsigned int)val; - - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - pn = blkio_policy_search_node(blkcg, blkg->dev); - - if (pn) - continue; - - list_for_each_entry(blkiop, &blkio_list, list) - blkiop->ops.blkio_update_group_weight_fn(blkg, - blkcg->weight); - } - spin_unlock_irq(&blkcg->lock); - spin_unlock(&blkio_list_lock); - return 0; -} - static int blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) { @@ -593,52 +592,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, return disk_total; } -#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \ -static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ - struct cftype *cftype, struct cgroup_map_cb *cb) \ -{ \ - struct blkio_cgroup *blkcg; \ - struct blkio_group *blkg; \ - struct hlist_node *n; \ - uint64_t cgroup_total = 0; \ - \ - if (!cgroup_lock_live_group(cgroup)) \ - return -ENODEV; \ - \ - blkcg = cgroup_to_blkio_cgroup(cgroup); \ - rcu_read_lock(); \ - hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ - if (blkg->dev) { \ - spin_lock_irq(&blkg->stats_lock); \ - cgroup_total += blkio_get_stat(blkg, cb, \ - blkg->dev, type); \ - spin_unlock_irq(&blkg->stats_lock); \ - } \ - } \ - if (show_total) \ - cb->fill(cb, "Total", cgroup_total); \ - rcu_read_unlock(); \ - cgroup_unlock(); \ - return 
0; \ -} - -SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0); -SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0); -SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1); -SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1); -SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1); -SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1); -SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1); -SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1); -#ifdef CONFIG_DEBUG_BLK_CGROUP -SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0); -SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0); -SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0); -SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0); -SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0); -#endif -#undef SHOW_FUNCTION_PER_GROUP - static int blkio_check_dev_num(dev_t dev) { int part = 0; @@ -652,7 +605,7 @@ static int blkio_check_dev_num(dev_t dev) } static int blkio_policy_parse_and_set(char *buf, - struct blkio_policy_node *newpn) + struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) { char *s[4], *p, *major_s = NULL, *minor_s = NULL; int ret; @@ -705,12 +658,20 @@ static int blkio_policy_parse_and_set(char *buf, if (s[1] == NULL) return -EINVAL; - ret = strict_strtoul(s[1], 10, &temp); - if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || - temp > BLKIO_WEIGHT_MAX) - return -EINVAL; + switch (plid) { + case BLKIO_POLICY_PROP: + ret = strict_strtoul(s[1], 10, &temp); + if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || + temp > BLKIO_WEIGHT_MAX) + return -EINVAL; - newpn->weight = temp; + newpn->plid = plid; + newpn->fileid = fileid; + newpn->weight = temp; + break; + default: + BUG(); + } return 0; } @@ -720,7 +681,8 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, { struct blkio_policy_node *pn; - pn = blkio_policy_search_node(blkcg, dev); + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, + BLKIO_PROP_weight_device); if (pn) return pn->weight; else @@ -728,18 +690,86 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, } EXPORT_SYMBOL_GPL(blkcg_get_weight); +/* Checks whether user asked for deleting a policy rule */ +static bool blkio_delete_rule_command(struct blkio_policy_node *pn) +{ + switch(pn->plid) { + case BLKIO_POLICY_PROP: + if (pn->weight == 0) + return 1; + break; + default: + BUG(); + } + + return 0; +} + +static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, + struct blkio_policy_node *newpn) +{ + switch(oldpn->plid) { + case BLKIO_POLICY_PROP: + oldpn->weight = newpn->weight; + break; + default: + BUG(); + } +} + +/* + * Some rules/values in blkg have changed. Propogate those to respective + * policies. + */ +static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, struct blkio_policy_node *pn) +{ + unsigned int weight; + + switch(pn->plid) { + case BLKIO_POLICY_PROP: + weight = pn->weight ? pn->weight : + blkcg->weight; + blkio_update_group_weight(blkg, weight); + break; + default: + BUG(); + } +} -static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +/* + * A policy node rule has been updated. Propogate this update to all the + * block groups which might be affected by this update. 
+ */ +static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, + struct blkio_policy_node *pn) +{ + struct blkio_group *blkg; + struct hlist_node *n; + + spin_lock(&blkio_list_lock); + spin_lock_irq(&blkcg->lock); + + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + if (pn->dev != blkg->dev || pn->plid != blkg->plid) + continue; + blkio_update_blkg_policy(blkcg, blkg, pn); + } + + spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); +} + +static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) { int ret = 0; char *buf; struct blkio_policy_node *newpn, *pn; struct blkio_cgroup *blkcg; - struct blkio_group *blkg; int keep_newpn = 0; - struct hlist_node *n; - struct blkio_policy_type *blkiop; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int fileid = BLKIOFILE_ATTR(cft->private); buf = kstrdup(buffer, GFP_KERNEL); if (!buf) @@ -751,7 +781,7 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, goto free_buf; } - ret = blkio_policy_parse_and_set(buf, newpn); + ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid); if (ret) goto free_newpn; @@ -759,9 +789,9 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, spin_lock_irq(&blkcg->lock); - pn = blkio_policy_search_node(blkcg, newpn->dev); + pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid); if (!pn) { - if (newpn->weight != 0) { + if (!blkio_delete_rule_command(newpn)) { blkio_policy_insert_node(blkcg, newpn); keep_newpn = 1; } @@ -769,33 +799,17 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, goto update_io_group; } - if (newpn->weight == 0) { - /* weight == 0 means deleteing a specific weight */ + if (blkio_delete_rule_command(newpn)) { blkio_policy_delete_node(pn); spin_unlock_irq(&blkcg->lock); goto update_io_group; } spin_unlock_irq(&blkcg->lock); - pn->weight = newpn->weight; + blkio_update_policy_rule(pn, newpn); update_io_group: - /* update weight for each cfqg */ - spin_lock(&blkio_list_lock); - spin_lock_irq(&blkcg->lock); - - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - if (newpn->dev == blkg->dev) { - list_for_each_entry(blkiop, &blkio_list, list) - blkiop->ops.blkio_update_group_weight_fn(blkg, - newpn->weight ? 
- newpn->weight : - blkcg->weight); - } - } - - spin_unlock_irq(&blkcg->lock); - spin_unlock(&blkio_list_lock); + blkio_update_policy_node_blkg(blkcg, newpn); free_newpn: if (!keep_newpn) @@ -805,21 +819,219 @@ free_buf: return ret; } -static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *m) +static void +blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) +{ + switch(pn->plid) { + case BLKIO_POLICY_PROP: + if (pn->fileid == BLKIO_PROP_weight_device) + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->weight); + break; + default: + BUG(); + } +} + +/* cgroup files which read their data from policy nodes end up here */ +static void blkio_read_policy_node_files(struct cftype *cft, + struct blkio_cgroup *blkcg, struct seq_file *m) { - struct blkio_cgroup *blkcg; struct blkio_policy_node *pn; - blkcg = cgroup_to_blkio_cgroup(cgrp); if (!list_empty(&blkcg->policy_list)) { spin_lock_irq(&blkcg->lock); list_for_each_entry(pn, &blkcg->policy_list, node) { - seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->weight); + if (!pn_matches_cftype(cft, pn)) + continue; + blkio_print_policy_node(m, pn); } spin_unlock_irq(&blkcg->lock); } +} + +static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *m) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight_device: + blkio_read_policy_node_files(cft, blkcg, m); + return 0; + default: + BUG(); + } + break; + default: + BUG(); + } + + return 0; +} + +static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, + struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type, + bool show_total) +{ + struct blkio_group *blkg; + struct hlist_node *n; + uint64_t cgroup_total = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { + if (blkg->dev) { + if (!cftype_blkg_same_policy(cft, blkg)) + continue; + spin_lock_irq(&blkg->stats_lock); + cgroup_total += blkio_get_stat(blkg, cb, blkg->dev, + type); + spin_unlock_irq(&blkg->stats_lock); + } + } + if (show_total) + cb->fill(cb, "Total", cgroup_total); + rcu_read_unlock(); + return 0; +} + +/* All map kind of cgroup file get serviced by this function */ +static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_TIME, 0); + case BLKIO_PROP_sectors: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SECTORS, 0); + case BLKIO_PROP_io_service_bytes: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICE_BYTES, 1); + case BLKIO_PROP_io_serviced: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICED, 1); + case BLKIO_PROP_io_service_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICE_TIME, 1); + case BLKIO_PROP_io_wait_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_WAIT_TIME, 1); + case BLKIO_PROP_io_merged: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_MERGED, 1); + case 
BLKIO_PROP_io_queued: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_QUEUED, 1); +#ifdef CONFIG_DEBUG_BLK_CGROUP + case BLKIO_PROP_dequeue: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_DEQUEUE, 0); + case BLKIO_PROP_avg_queue_size: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_AVG_QUEUE_SIZE, 0); + case BLKIO_PROP_group_wait_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_GROUP_WAIT_TIME, 0); + case BLKIO_PROP_idle_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_IDLE_TIME, 0); + case BLKIO_PROP_empty_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_EMPTY_TIME, 0); +#endif + default: + BUG(); + } + break; + + default: + BUG(); + } + + return 0; +} + +static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) +{ + struct blkio_group *blkg; + struct hlist_node *n; + struct blkio_policy_node *pn; + + if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) + return -EINVAL; + + spin_lock(&blkio_list_lock); + spin_lock_irq(&blkcg->lock); + blkcg->weight = (unsigned int)val; + + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + pn = blkio_policy_search_node(blkcg, blkg->dev, + BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); + if (pn) + continue; + + blkio_update_group_weight(blkg, blkcg->weight); + } + spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); + return 0; +} + +static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) { + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight: + return (u64)blkcg->weight; + } + break; + default: + BUG(); + } + return 0; +} + +static int +blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight: + return blkio_weight_write(blkcg, val); + } + break; + default: + BUG(); + } return 0; } @@ -827,46 +1039,66 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, struct cftype blkio_files[] = { { .name = "weight_device", - .read_seq_string = blkiocg_weight_device_read, - .write_string = blkiocg_weight_device_write, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_weight_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, .max_write_len = 256, }, { .name = "weight", - .read_u64 = blkiocg_weight_read, - .write_u64 = blkiocg_weight_write, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_weight), + .read_u64 = blkiocg_file_read_u64, + .write_u64 = blkiocg_file_write_u64, }, { .name = "time", - .read_map = blkiocg_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_time), + .read_map = blkiocg_file_read_map, }, { .name = "sectors", - .read_map = blkiocg_sectors_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_sectors), + .read_map = blkiocg_file_read_map, }, { .name = "io_service_bytes", - .read_map = blkiocg_io_service_bytes_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_service_bytes), + .read_map = blkiocg_file_read_map, }, { .name = "io_serviced", - .read_map = blkiocg_io_serviced_read, 
+ .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_serviced), + .read_map = blkiocg_file_read_map, }, { .name = "io_service_time", - .read_map = blkiocg_io_service_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_service_time), + .read_map = blkiocg_file_read_map, }, { .name = "io_wait_time", - .read_map = blkiocg_io_wait_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_wait_time), + .read_map = blkiocg_file_read_map, }, { .name = "io_merged", - .read_map = blkiocg_io_merged_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_merged), + .read_map = blkiocg_file_read_map, }, { .name = "io_queued", - .read_map = blkiocg_io_queued_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_queued), + .read_map = blkiocg_file_read_map, }, { .name = "reset_stats", @@ -875,23 +1107,33 @@ struct cftype blkio_files[] = { #ifdef CONFIG_DEBUG_BLK_CGROUP { .name = "avg_queue_size", - .read_map = blkiocg_avg_queue_size_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_avg_queue_size), + .read_map = blkiocg_file_read_map, }, { .name = "group_wait_time", - .read_map = blkiocg_group_wait_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_group_wait_time), + .read_map = blkiocg_file_read_map, }, { .name = "idle_time", - .read_map = blkiocg_idle_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_idle_time), + .read_map = blkiocg_file_read_map, }, { .name = "empty_time", - .read_map = blkiocg_empty_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_empty_time), + .read_map = blkiocg_file_read_map, }, { .name = "dequeue", - .read_map = blkiocg_dequeue_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_dequeue), + .read_map = blkiocg_file_read_map, }, #endif }; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 2b866ec1dcea..c8de2598429d 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -15,6 +15,10 @@ #include +enum blkio_policy_id { + BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ +}; + #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) #ifndef CONFIG_BLK_CGROUP @@ -65,6 +69,25 @@ enum blkg_state_flags { BLKG_empty, }; +/* cgroup files owned by proportional weight policy */ +enum blkcg_file_name_prop { + BLKIO_PROP_weight = 1, + BLKIO_PROP_weight_device, + BLKIO_PROP_io_service_bytes, + BLKIO_PROP_io_serviced, + BLKIO_PROP_time, + BLKIO_PROP_sectors, + BLKIO_PROP_io_service_time, + BLKIO_PROP_io_wait_time, + BLKIO_PROP_io_merged, + BLKIO_PROP_io_queued, + BLKIO_PROP_avg_queue_size, + BLKIO_PROP_group_wait_time, + BLKIO_PROP_idle_time, + BLKIO_PROP_empty_time, + BLKIO_PROP_dequeue, +}; + struct blkio_cgroup { struct cgroup_subsys_state css; unsigned int weight; @@ -112,6 +135,8 @@ struct blkio_group { char path[128]; /* The device MKDEV(major, minor), this group has been created for */ dev_t dev; + /* policy which owns this blk group */ + enum blkio_policy_id plid; /* Need to serialize the stats in the case of reset/update */ spinlock_t stats_lock; @@ -122,6 +147,10 @@ struct blkio_policy_node { struct list_head node; dev_t dev; unsigned int weight; + /* This node belongs to max bw policy or porportional weight policy */ + enum blkio_policy_id plid; + /* cgroup file to which this rule belongs to */ + int fileid; }; extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, @@ -139,6 +168,7 @@ struct blkio_policy_ops { struct blkio_policy_type { 
struct list_head list; struct blkio_policy_ops ops; + enum blkio_policy_id plid; }; /* Blkio controller policy registration */ @@ -212,7 +242,8 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev); + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid); extern int blkiocg_del_blkio_group(struct blkio_group *blkg); extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); @@ -234,7 +265,8 @@ static inline struct blkio_cgroup * cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) {} + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid) {} static inline int blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index eb4086f7dfef..b9f86190763b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -4013,6 +4013,7 @@ static struct blkio_policy_type blkio_policy_cfq = { .blkio_unlink_group_fn = cfq_unlink_blkio_group, .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, }, + .plid = BLKIO_POLICY_PROP, }; #else static struct blkio_policy_type blkio_policy_cfq; diff --git a/block/cfq.h b/block/cfq.h index 93448e5a2e41..54a6d90f8e8c 100644 --- a/block/cfq.h +++ b/block/cfq.h @@ -69,7 +69,7 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev) { - blkiocg_add_blkio_group(blkcg, blkg, key, dev); + blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP); } static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) -- cgit v1.2.2 From 4c9eefa16c6f124ffcc736cb719b24ea27f85017 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:34 -0400 Subject: blk-cgroup: Introduce cgroup changes for throttling policy o cgroup chagnes for throttle policy. o Introduces READ and WRITE bytes per second throttling rules. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++--- block/blk-cgroup.h | 30 ++++++++++- 2 files changed, 167 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 7762987fdf9e..aae8c930a6f8 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -128,6 +128,27 @@ blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) } } +static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, + int fileid) +{ + struct blkio_policy_type *blkiop; + + list_for_each_entry(blkiop, &blkio_list, list) { + + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + + if (fileid == BLKIO_THROTL_read_bps_device + && blkiop->ops.blkio_update_group_read_bps_fn) + blkiop->ops.blkio_update_group_read_bps_fn(blkg, bps); + + if (fileid == BLKIO_THROTL_write_bps_device + && blkiop->ops.blkio_update_group_write_bps_fn) + blkiop->ops.blkio_update_group_write_bps_fn(blkg, bps); + } +} + /* * Add to the appropriate stat variable depending on the request type. 
* This should be called with the blkg->stats_lock held. @@ -612,6 +633,7 @@ static int blkio_policy_parse_and_set(char *buf, unsigned long major, minor, temp; int i = 0; dev_t dev; + u64 bps; memset(s, 0, sizeof(s)); @@ -667,7 +689,16 @@ static int blkio_policy_parse_and_set(char *buf, newpn->plid = plid; newpn->fileid = fileid; - newpn->weight = temp; + newpn->val.weight = temp; + break; + case BLKIO_POLICY_THROTL: + ret = strict_strtoull(s[1], 10, &bps); + if (ret) + return -EINVAL; + + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.bps = bps; break; default: BUG(); @@ -684,18 +715,45 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); if (pn) - return pn->weight; + return pn->val.weight; else return blkcg->weight; } EXPORT_SYMBOL_GPL(blkcg_get_weight); +uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_bps_device); + if (pn) + return pn->val.bps; + else + return -1; +} + +uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_bps_device); + if (pn) + return pn->val.bps; + else + return -1; +} + /* Checks whether user asked for deleting a policy rule */ static bool blkio_delete_rule_command(struct blkio_policy_node *pn) { switch(pn->plid) { case BLKIO_POLICY_PROP: - if (pn->weight == 0) + if (pn->val.weight == 0) + return 1; + break; + case BLKIO_POLICY_THROTL: + if (pn->val.bps == 0) return 1; break; default: @@ -710,7 +768,10 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, { switch(oldpn->plid) { case BLKIO_POLICY_PROP: - oldpn->weight = newpn->weight; + oldpn->val.weight = newpn->val.weight; + break; + case BLKIO_POLICY_THROTL: + oldpn->val.bps = newpn->val.bps; break; default: BUG(); @@ -725,13 +786,23 @@ static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, struct blkio_group *blkg, struct blkio_policy_node *pn) { unsigned int weight; + u64 bps; switch(pn->plid) { case BLKIO_POLICY_PROP: - weight = pn->weight ? pn->weight : + weight = pn->val.weight ? pn->val.weight : blkcg->weight; blkio_update_group_weight(blkg, weight); break; + case BLKIO_POLICY_THROTL: + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + bps = pn->val.bps ? 
pn->val.bps : (-1); + blkio_update_group_bps(blkg, bps, pn->fileid); + break; + } + break; default: BUG(); } @@ -826,7 +897,17 @@ blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) case BLKIO_POLICY_PROP: if (pn->fileid == BLKIO_PROP_weight_device) seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->weight); + MINOR(pn->dev), pn->val.weight); + break; + case BLKIO_POLICY_THROTL: + if (pn->fileid == BLKIO_THROTL_read_bps_device) + seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.bps); + else if (pn->fileid == BLKIO_THROTL_write_bps_device) + seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.bps); + else + BUG(); break; default: BUG(); @@ -869,6 +950,16 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, BUG(); } break; + case BLKIO_POLICY_THROTL: + switch(name){ + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + blkio_read_policy_node_files(cft, blkcg, m); + return 0; + default: + BUG(); + } + break; default: BUG(); } @@ -959,7 +1050,18 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, BUG(); } break; - + case BLKIO_POLICY_THROTL: + switch(name){ + case BLKIO_THROTL_io_service_bytes: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICE_BYTES, 1); + case BLKIO_THROTL_io_serviced: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICED, 1); + default: + BUG(); + } + break; default: BUG(); } @@ -1052,6 +1154,23 @@ struct cftype blkio_files[] = { .read_u64 = blkiocg_file_read_u64, .write_u64 = blkiocg_file_write_u64, }, + { + .name = "throttle.read_bps_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_bps_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.write_bps_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_bps_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, { .name = "time", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, @@ -1070,12 +1189,24 @@ struct cftype blkio_files[] = { BLKIO_PROP_io_service_bytes), .read_map = blkiocg_file_read_map, }, + { + .name = "throttle.io_service_bytes", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_io_service_bytes), + .read_map = blkiocg_file_read_map, + }, { .name = "io_serviced", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, BLKIO_PROP_io_serviced), .read_map = blkiocg_file_read_map, }, + { + .name = "throttle.io_serviced", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_io_serviced), + .read_map = blkiocg_file_read_map, + }, { .name = "io_service_time", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index c8de2598429d..1b738827b2f6 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -17,6 +17,7 @@ enum blkio_policy_id { BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ + BLKIO_POLICY_THROTL, /* Throttling */ }; #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) @@ -88,6 +89,14 @@ enum blkcg_file_name_prop { BLKIO_PROP_dequeue, }; +/* cgroup files owned by throttle policy */ +enum blkcg_file_name_throtl { + BLKIO_THROTL_read_bps_device, + BLKIO_THROTL_write_bps_device, + BLKIO_THROTL_io_service_bytes, + BLKIO_THROTL_io_serviced, +}; + struct blkio_cgroup { struct cgroup_subsys_state css; unsigned int weight; @@ -146,23 
+155,42 @@ struct blkio_group { struct blkio_policy_node { struct list_head node; dev_t dev; - unsigned int weight; /* This node belongs to max bw policy or porportional weight policy */ enum blkio_policy_id plid; /* cgroup file to which this rule belongs to */ int fileid; + + union { + unsigned int weight; + /* + * Rate read/write in terms of byptes per second + * Whether this rate represents read or write is determined + * by file type "fileid". + */ + u64 bps; + } val; }; extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, dev_t dev); +extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, + dev_t dev); +extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, + dev_t dev); typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, unsigned int weight); +typedef void (blkio_update_group_read_bps_fn) (struct blkio_group *blkg, + u64 read_bps); +typedef void (blkio_update_group_write_bps_fn) (struct blkio_group *blkg, + u64 write_bps); struct blkio_policy_ops { blkio_unlink_group_fn *blkio_unlink_group_fn; blkio_update_group_weight_fn *blkio_update_group_weight_fn; + blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn; + blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn; }; struct blkio_policy_type { -- cgit v1.2.2 From e43473b7f223ec866f7db273697e76c337c390f9 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:35 -0400 Subject: blkio: Core implementation of throttle policy o Actual implementation of throttling policy in block layer. Currently it implements READ and WRITE bytes per second throttling logic. IOPS throttling comes in later patches. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/Kconfig | 12 + block/Makefile | 1 + block/blk-core.c | 24 ++ block/blk-throttle.c | 909 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 946 insertions(+) create mode 100644 block/blk-throttle.c (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 9be0b56eaee1..6c9213ef15a1 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -77,6 +77,18 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_DEV_THROTTLING + bool "Block layer bio throttling support" + depends on BLK_CGROUP=y && EXPERIMENTAL + default n + ---help--- + Block layer bio throttling support. It can be used to limit + the IO rate to a device. IO rate policies are per cgroup and + one needs to mount and use blkio cgroup controller for creating + cgroups and specifying per device IO rate policies. + + See Documentation/cgroups/blkio-controller.txt for more information. 
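	  For example, with the blkio controller mounted, a rule of the form
	  "<major>:<minor> <bytes_per_sec>" written to
	  blkio.throttle.read_bps_device caps reads on that device (e.g.
	  "8:16 1048576" limits device 8:16 to about 1 MB/s), and writing a
	  value of 0 for the same device removes the rule again.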
+ endif # BLOCK config BLOCK_COMPAT diff --git a/block/Makefile b/block/Makefile index 0bb499a739cd..c850d5ef80a2 100644 --- a/block/Makefile +++ b/block/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o +obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o diff --git a/block/blk-core.c b/block/blk-core.c index 8d07c1b7e701..797d5095eb83 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -382,6 +382,7 @@ void blk_sync_queue(struct request_queue *q) del_timer_sync(&q->unplug_timer); del_timer_sync(&q->timeout); cancel_work_sync(&q->unplug_work); + throtl_shutdown_timer_wq(q); } EXPORT_SYMBOL(blk_sync_queue); @@ -459,6 +460,8 @@ void blk_cleanup_queue(struct request_queue *q) if (q->elevator) elevator_exit(q->elevator); + blk_throtl_exit(q); + blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); @@ -515,6 +518,11 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) return NULL; } + if (blk_throtl_init(q)) { + kmem_cache_free(blk_requestq_cachep, q); + return NULL; + } + setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); init_timer(&q->unplug_timer); @@ -1522,6 +1530,15 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; } + blk_throtl_bio(q, &bio); + + /* + * If bio = NULL, bio has been throttled and will be submitted + * later. + */ + if (!bio) + break; + trace_block_bio_queue(q, bio); ret = q->make_request_fn(q, bio); @@ -2580,6 +2597,13 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) } EXPORT_SYMBOL(kblockd_schedule_work); +int kblockd_schedule_delayed_work(struct request_queue *q, + struct delayed_work *dwork, unsigned long delay) +{ + return queue_delayed_work(kblockd_workqueue, dwork, delay); +} +EXPORT_SYMBOL(kblockd_schedule_delayed_work); + int __init blk_dev_init(void) { BUILD_BUG_ON(__REQ_NR_BITS > 8 * diff --git a/block/blk-throttle.c b/block/blk-throttle.c new file mode 100644 index 000000000000..4b492011e0de --- /dev/null +++ b/block/blk-throttle.c @@ -0,0 +1,909 @@ +/* + * Interface for controlling IO bandwidth on a request queue + * + * Copyright (C) 2010 Vivek Goyal + */ + +#include +#include +#include +#include +#include +#include "blk-cgroup.h" + +/* Max dispatch from a group in 1 round */ +static int throtl_grp_quantum = 8; + +/* Total max dispatch from all groups in one round */ +static int throtl_quantum = 32; + +/* Throttling is performed over 100ms slice and after that slice is renewed */ +static unsigned long throtl_slice = HZ/10; /* 100 ms */ + +struct throtl_rb_root { + struct rb_root rb; + struct rb_node *left; + unsigned int count; + unsigned long min_disptime; +}; + +#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \ + .count = 0, .min_disptime = 0} + +#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) + +struct throtl_grp { + /* List of throtl groups on the request queue*/ + struct hlist_node tg_node; + + /* active throtl group service_tree member */ + struct rb_node rb_node; + + /* + * Dispatch time in jiffies. This is the estimated time when group + * will unthrottle and is ready to dispatch more bio. It is used as + * key to sort active groups in service tree. 
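 * For example, if bps[READ] were 1048576 (1 MB/s) and the group were
 * 524288 bytes past its current allowance, tg_may_dispatch() would
 * compute a wait of roughly HZ/2 jiffies, so disptime would land about
 * half a second in the future and the group would sort behind groups
 * that are allowed to dispatch sooner.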
+ */ + unsigned long disptime; + + struct blkio_group blkg; + atomic_t ref; + unsigned int flags; + + /* Two lists for READ and WRITE */ + struct bio_list bio_lists[2]; + + /* Number of queued bios on READ and WRITE lists */ + unsigned int nr_queued[2]; + + /* bytes per second rate limits */ + uint64_t bps[2]; + + /* Number of bytes disptached in current slice */ + uint64_t bytes_disp[2]; + + /* When did we start a new slice */ + unsigned long slice_start[2]; + unsigned long slice_end[2]; +}; + +struct throtl_data +{ + /* List of throtl groups */ + struct hlist_head tg_list; + + /* service tree for active throtl groups */ + struct throtl_rb_root tg_service_tree; + + struct throtl_grp root_tg; + struct request_queue *queue; + + /* Total Number of queued bios on READ and WRITE lists */ + unsigned int nr_queued[2]; + + /* + * number of total undestroyed groups (excluding root group) + */ + unsigned int nr_undestroyed_grps; + + /* Work for dispatching throttled bios */ + struct delayed_work throtl_work; +}; + +enum tg_state_flags { + THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ +}; + +#define THROTL_TG_FNS(name) \ +static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \ +{ \ + (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \ +} \ +static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \ +{ \ + (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \ +} \ +static inline int throtl_tg_##name(const struct throtl_grp *tg) \ +{ \ + return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \ +} + +THROTL_TG_FNS(on_rr); + +#define throtl_log_tg(td, tg, fmt, args...) \ + blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ + blkg_path(&(tg)->blkg), ##args); \ + +#define throtl_log(td, fmt, args...) \ + blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) + +static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) +{ + if (blkg) + return container_of(blkg, struct throtl_grp, blkg); + + return NULL; +} + +static inline int total_nr_queued(struct throtl_data *td) +{ + return (td->nr_queued[0] + td->nr_queued[1]); +} + +static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) +{ + atomic_inc(&tg->ref); + return tg; +} + +static void throtl_put_tg(struct throtl_grp *tg) +{ + BUG_ON(atomic_read(&tg->ref) <= 0); + if (!atomic_dec_and_test(&tg->ref)) + return; + kfree(tg); +} + +static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, + struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + struct throtl_grp *tg = NULL; + void *key = td; + struct backing_dev_info *bdi = &td->queue->backing_dev_info; + unsigned int major, minor; + + /* + * TODO: Speed up blkiocg_lookup_group() by maintaining a radix + * tree of blkg (instead of traversing through hash list all + * the time. 
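 * Note that the lookup below already runs under the rcu_read_lock()
 * taken in throtl_get_tg(), so a faster lookup structure could likely
 * be walked without taking any additional lock.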
+ */ + tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); + + /* Fill in device details for root group */ + if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + tg->blkg.dev = MKDEV(major, minor); + goto done; + } + + if (tg) + goto done; + + tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); + if (!tg) + goto done; + + INIT_HLIST_NODE(&tg->tg_node); + RB_CLEAR_NODE(&tg->rb_node); + bio_list_init(&tg->bio_lists[0]); + bio_list_init(&tg->bio_lists[1]); + + /* + * Take the initial reference that will be released on destroy + * This can be thought of a joint reference by cgroup and + * request queue which will be dropped by either request queue + * exit or cgroup deletion path depending on who is exiting first. + */ + atomic_set(&tg->ref, 1); + + /* Add group onto cgroup list */ + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, + MKDEV(major, minor), BLKIO_POLICY_THROTL); + + tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); + tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); + + hlist_add_head(&tg->tg_node, &td->tg_list); + td->nr_undestroyed_grps++; +done: + return tg; +} + +static struct throtl_grp * throtl_get_tg(struct throtl_data *td) +{ + struct cgroup *cgroup; + struct throtl_grp *tg = NULL; + + rcu_read_lock(); + cgroup = task_cgroup(current, blkio_subsys_id); + tg = throtl_find_alloc_tg(td, cgroup); + if (!tg) + tg = &td->root_tg; + rcu_read_unlock(); + return tg; +} + +static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root) +{ + /* Service tree is empty */ + if (!root->count) + return NULL; + + if (!root->left) + root->left = rb_first(&root->rb); + + if (root->left) + return rb_entry_tg(root->left); + + return NULL; +} + +static void rb_erase_init(struct rb_node *n, struct rb_root *root) +{ + rb_erase(n, root); + RB_CLEAR_NODE(n); +} + +static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root) +{ + if (root->left == n) + root->left = NULL; + rb_erase_init(n, &root->rb); + --root->count; +} + +static void update_min_dispatch_time(struct throtl_rb_root *st) +{ + struct throtl_grp *tg; + + tg = throtl_rb_first(st); + if (!tg) + return; + + st->min_disptime = tg->disptime; +} + +static void +tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg) +{ + struct rb_node **node = &st->rb.rb_node; + struct rb_node *parent = NULL; + struct throtl_grp *__tg; + unsigned long key = tg->disptime; + int left = 1; + + while (*node != NULL) { + parent = *node; + __tg = rb_entry_tg(parent); + + if (time_before(key, __tg->disptime)) + node = &parent->rb_left; + else { + node = &parent->rb_right; + left = 0; + } + } + + if (left) + st->left = &tg->rb_node; + + rb_link_node(&tg->rb_node, parent, node); + rb_insert_color(&tg->rb_node, &st->rb); +} + +static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + struct throtl_rb_root *st = &td->tg_service_tree; + + tg_service_tree_add(st, tg); + throtl_mark_tg_on_rr(tg); + st->count++; +} + +static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + if (!throtl_tg_on_rr(tg)) + __throtl_enqueue_tg(td, tg); +} + +static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + throtl_rb_erase(&tg->rb_node, &td->tg_service_tree); + throtl_clear_tg_on_rr(tg); +} + +static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + if (throtl_tg_on_rr(tg)) + __throtl_dequeue_tg(td, tg); 
+} + +static void throtl_schedule_next_dispatch(struct throtl_data *td) +{ + struct throtl_rb_root *st = &td->tg_service_tree; + + /* + * If there are more bios pending, schedule more work. + */ + if (!total_nr_queued(td)) + return; + + BUG_ON(!st->count); + + update_min_dispatch_time(st); + + if (time_before_eq(st->min_disptime, jiffies)) + throtl_schedule_delayed_work(td->queue, 0); + else + throtl_schedule_delayed_work(td->queue, + (st->min_disptime - jiffies)); +} + +static inline void +throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + tg->bytes_disp[rw] = 0; + tg->slice_start[rw] = jiffies; + tg->slice_end[rw] = jiffies + throtl_slice; + throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); +} + +static inline void throtl_extend_slice(struct throtl_data *td, + struct throtl_grp *tg, bool rw, unsigned long jiffy_end) +{ + tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); + throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); +} + +/* Determine if previously allocated or extended slice is complete or not */ +static bool +throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) + return 0; + + return 1; +} + +/* Trim the used slices and adjust slice start accordingly */ +static inline void +throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + unsigned long nr_slices, bytes_trim, time_elapsed; + + BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); + + /* + * If bps are unlimited (-1), then time slice don't get + * renewed. Don't try to trim the slice if slice is used. A new + * slice will start when appropriate. + */ + if (throtl_slice_used(td, tg, rw)) + return; + + time_elapsed = jiffies - tg->slice_start[rw]; + + nr_slices = time_elapsed / throtl_slice; + + if (!nr_slices) + return; + + bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ; + + if (!bytes_trim) + return; + + if (tg->bytes_disp[rw] >= bytes_trim) + tg->bytes_disp[rw] -= bytes_trim; + else + tg->bytes_disp[rw] = 0; + + tg->slice_start[rw] += nr_slices * throtl_slice; + + throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu" + " start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', nr_slices, bytes_trim, + tg->slice_start[rw], tg->slice_end[rw], jiffies); +} + +/* + * Returns whether one can dispatch a bio or not. Also returns approx number + * of jiffies to wait before this bio is with-in IO rate and can be dispatched + */ +static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + u64 bytes_allowed, extra_bytes; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + + /* + * Currently whole state machine of group depends on first bio + * queued in the group bio list. So one should not be calling + * this function with a different bio if there are other bios + * queued. + */ + BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); + + /* If tg->bps = -1, then BW is unlimited */ + if (tg->bps[rw] == -1) { + if (wait) + *wait = 0; + return 1; + } + + /* + * If previous slice expired, start a new one otherwise renew/extend + * existing slice to make sure it is at least throtl_slice interval + * long since now. 
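 * As a rough illustration: with throtl_slice = HZ/10 and a limit of,
 * say, 1048576 bytes/sec, each 100 ms accounting window admits about
 * 104857 bytes (bps * 100 / 1000) before the bio has to wait.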
+ */ + if (throtl_slice_used(td, tg, rw)) + throtl_start_new_slice(td, tg, rw); + else { + if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) + throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); + } + + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; + + /* Slice has just started. Consider one slice interval */ + if (!jiffy_elapsed) + jiffy_elapsed_rnd = throtl_slice; + + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); + + bytes_allowed = (tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd)) + / MSEC_PER_SEC; + + if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { + if (wait) + *wait = 0; + return 1; + } + + /* Calc approx time to dispatch */ + extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed; + jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); + + if (!jiffy_wait) + jiffy_wait = 1; + + /* + * This wait time is without taking into consideration the rounding + * up we did. Add that time also. + */ + jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); + + if (wait) + *wait = jiffy_wait; + + if (time_before(tg->slice_end[rw], jiffies + jiffy_wait)) + throtl_extend_slice(td, tg, rw, jiffies + jiffy_wait); + + return 0; +} + +static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) +{ + bool rw = bio_data_dir(bio); + bool sync = bio->bi_rw & REQ_SYNC; + + /* Charge the bio to the group */ + tg->bytes_disp[rw] += bio->bi_size; + + /* + * TODO: This will take blkg->stats_lock. Figure out a way + * to avoid this cost. + */ + blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); + +} + +static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio) +{ + bool rw = bio_data_dir(bio); + + bio_list_add(&tg->bio_lists[rw], bio); + /* Take a bio reference on tg */ + throtl_ref_get_tg(tg); + tg->nr_queued[rw]++; + td->nr_queued[rw]++; + throtl_enqueue_tg(td, tg); +} + +static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) +{ + unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; + struct bio *bio; + + if ((bio = bio_list_peek(&tg->bio_lists[READ]))) + tg_may_dispatch(td, tg, bio, &read_wait); + + if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) + tg_may_dispatch(td, tg, bio, &write_wait); + + min_wait = min(read_wait, write_wait); + disptime = jiffies + min_wait; + + /* + * If group is already on active tree, then update dispatch time + * only if it is lesser than existing dispatch time. 
Otherwise + * always update the dispatch time + */ + + if (throtl_tg_on_rr(tg) && time_before(disptime, tg->disptime)) + return; + + /* Update dispatch time */ + throtl_dequeue_tg(td, tg); + tg->disptime = disptime; + throtl_enqueue_tg(td, tg); +} + +static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, + bool rw, struct bio_list *bl) +{ + struct bio *bio; + + bio = bio_list_pop(&tg->bio_lists[rw]); + tg->nr_queued[rw]--; + /* Drop bio reference on tg */ + throtl_put_tg(tg); + + BUG_ON(td->nr_queued[rw] <= 0); + td->nr_queued[rw]--; + + throtl_charge_bio(tg, bio); + bio_list_add(bl, bio); + bio->bi_rw |= REQ_THROTTLED; + + throtl_trim_slice(td, tg, rw); +} + +static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, + struct bio_list *bl) +{ + unsigned int nr_reads = 0, nr_writes = 0; + unsigned int max_nr_reads = throtl_grp_quantum*3/4; + unsigned int max_nr_writes = throtl_grp_quantum - nr_reads; + struct bio *bio; + + /* Try to dispatch 75% READS and 25% WRITES */ + + while ((bio = bio_list_peek(&tg->bio_lists[READ])) + && tg_may_dispatch(td, tg, bio, NULL)) { + + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); + nr_reads++; + + if (nr_reads >= max_nr_reads) + break; + } + + while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) + && tg_may_dispatch(td, tg, bio, NULL)) { + + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); + nr_writes++; + + if (nr_writes >= max_nr_writes) + break; + } + + return nr_reads + nr_writes; +} + +static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) +{ + unsigned int nr_disp = 0; + struct throtl_grp *tg; + struct throtl_rb_root *st = &td->tg_service_tree; + + while (1) { + tg = throtl_rb_first(st); + + if (!tg) + break; + + if (time_before(jiffies, tg->disptime)) + break; + + throtl_dequeue_tg(td, tg); + + nr_disp += throtl_dispatch_tg(td, tg, bl); + + if (tg->nr_queued[0] || tg->nr_queued[1]) { + tg_update_disptime(td, tg); + throtl_enqueue_tg(td, tg); + } + + if (nr_disp >= throtl_quantum) + break; + } + + return nr_disp; +} + +/* Dispatch throttled bios. Should be called without queue lock held. */ +static int throtl_dispatch(struct request_queue *q) +{ + struct throtl_data *td = q->td; + unsigned int nr_disp = 0; + struct bio_list bio_list_on_stack; + struct bio *bio; + + spin_lock_irq(q->queue_lock); + + if (!total_nr_queued(td)) + goto out; + + bio_list_init(&bio_list_on_stack); + + throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u", + total_nr_queued(td), td->nr_queued[READ], + td->nr_queued[WRITE]); + + nr_disp = throtl_select_dispatch(td, &bio_list_on_stack); + + if (nr_disp) + throtl_log(td, "bios disp=%u", nr_disp); + + throtl_schedule_next_dispatch(td); +out: + spin_unlock_irq(q->queue_lock); + + /* + * If we dispatched some requests, unplug the queue to make sure + * immediate dispatch + */ + if (nr_disp) { + while((bio = bio_list_pop(&bio_list_on_stack))) + generic_make_request(bio); + blk_unplug(q); + } + return nr_disp; +} + +void blk_throtl_work(struct work_struct *work) +{ + struct throtl_data *td = container_of(work, struct throtl_data, + throtl_work.work); + struct request_queue *q = td->queue; + + throtl_dispatch(q); +} + +/* Call with queue lock held */ +void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) +{ + + struct throtl_data *td = q->td; + struct delayed_work *dwork = &td->throtl_work; + + if (total_nr_queued(td) > 0) { + /* + * We might have a work scheduled to be executed in future. + * Cancel that and schedule a new one. 
+ */ + __cancel_delayed_work(dwork); + kblockd_schedule_delayed_work(q, dwork, delay); + throtl_log(td, "schedule work. delay=%lu jiffies=%lu", + delay, jiffies); + } +} +EXPORT_SYMBOL(throtl_schedule_delayed_work); + +static void +throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + /* Something wrong if we are trying to remove same group twice */ + BUG_ON(hlist_unhashed(&tg->tg_node)); + + hlist_del_init(&tg->tg_node); + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. + */ + throtl_put_tg(tg); + td->nr_undestroyed_grps--; +} + +static void throtl_release_tgs(struct throtl_data *td) +{ + struct hlist_node *pos, *n; + struct throtl_grp *tg; + + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { + /* + * If cgroup removal path got to blk_group first and removed + * it from cgroup list, then it will take care of destroying + * cfqg also. + */ + if (!blkiocg_del_blkio_group(&tg->blkg)) + throtl_destroy_tg(td, tg); + } +} + +static void throtl_td_free(struct throtl_data *td) +{ + kfree(td); +} + +/* + * Blk cgroup controller notification saying that blkio_group object is being + * delinked as associated cgroup object is going away. That also means that + * no new IO will come in this group. So get rid of this group as soon as + * any pending IO in the group is finished. + * + * This function is called under rcu_read_lock(). key is the rcu protected + * pointer. That means "key" is a valid throtl_data pointer as long as we are + * rcu read lock. + * + * "key" was fetched from blkio_group under blkio_cgroup->lock. That means + * it should not be NULL as even if queue was going away, cgroup deltion + * path got to it first. + */ +void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) +{ + unsigned long flags; + struct throtl_data *td = key; + + spin_lock_irqsave(td->queue->queue_lock, flags); + throtl_destroy_tg(td, tg_of_blkg(blkg)); + spin_unlock_irqrestore(td->queue->queue_lock, flags); +} + +static void throtl_update_blkio_group_read_bps (struct blkio_group *blkg, + u64 read_bps) +{ + tg_of_blkg(blkg)->bps[READ] = read_bps; +} + +static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg, + u64 write_bps) +{ + tg_of_blkg(blkg)->bps[WRITE] = write_bps; +} + +void throtl_shutdown_timer_wq(struct request_queue *q) +{ + struct throtl_data *td = q->td; + + cancel_delayed_work_sync(&td->throtl_work); +} + +static struct blkio_policy_type blkio_policy_throtl = { + .ops = { + .blkio_unlink_group_fn = throtl_unlink_blkio_group, + .blkio_update_group_read_bps_fn = + throtl_update_blkio_group_read_bps, + .blkio_update_group_write_bps_fn = + throtl_update_blkio_group_write_bps, + }, +}; + +int blk_throtl_bio(struct request_queue *q, struct bio **biop) +{ + struct throtl_data *td = q->td; + struct throtl_grp *tg; + struct bio *bio = *biop; + bool rw = bio_data_dir(bio), update_disptime = true; + + if (bio->bi_rw & REQ_THROTTLED) { + bio->bi_rw &= ~REQ_THROTTLED; + return 0; + } + + spin_lock_irq(q->queue_lock); + tg = throtl_get_tg(td); + + if (tg->nr_queued[rw]) { + /* + * There is already another bio queued in same dir. No + * need to update dispatch time. + */ + update_disptime = false; + goto queue_bio; + } + + /* Bio is with-in rate limit of group */ + if (tg_may_dispatch(td, tg, bio, NULL)) { + throtl_charge_bio(tg, bio); + goto out; + } + +queue_bio: + throtl_log_tg(td, tg, "[%c] bio. disp=%u sz=%u bps=%llu" + " queued=%d/%d", rw == READ ? 
'R' : 'W', + tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], + tg->nr_queued[READ], tg->nr_queued[WRITE]); + + throtl_add_bio_tg(q->td, tg, bio); + *biop = NULL; + + if (update_disptime) { + tg_update_disptime(td, tg); + throtl_schedule_next_dispatch(td); + } + +out: + spin_unlock_irq(q->queue_lock); + return 0; +} + +int blk_throtl_init(struct request_queue *q) +{ + struct throtl_data *td; + struct throtl_grp *tg; + + td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); + if (!td) + return -ENOMEM; + + INIT_HLIST_HEAD(&td->tg_list); + td->tg_service_tree = THROTL_RB_ROOT; + + /* Init root group */ + tg = &td->root_tg; + INIT_HLIST_NODE(&tg->tg_node); + RB_CLEAR_NODE(&tg->rb_node); + bio_list_init(&tg->bio_lists[0]); + bio_list_init(&tg->bio_lists[1]); + + /* Practically unlimited BW */ + tg->bps[0] = tg->bps[1] = -1; + atomic_set(&tg->ref, 1); + + INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); + + rcu_read_lock(); + blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td, + 0, BLKIO_POLICY_THROTL); + rcu_read_unlock(); + + /* Attach throtl data to request queue */ + td->queue = q; + q->td = td; + return 0; +} + +void blk_throtl_exit(struct request_queue *q) +{ + struct throtl_data *td = q->td; + bool wait = false; + + BUG_ON(!td); + + throtl_shutdown_timer_wq(q); + + spin_lock_irq(q->queue_lock); + throtl_release_tgs(td); + blkiocg_del_blkio_group(&td->root_tg.blkg); + + /* If there are other groups */ + if (td->nr_undestroyed_grps >= 1) + wait = true; + + spin_unlock_irq(q->queue_lock); + + /* + * Wait for tg->blkg->key accessors to exit their grace periods. + * Do this wait only if there are other undestroyed groups out + * there (other than root group). This can happen if cgroup deletion + * path claimed the responsibility of cleaning up a group before + * queue cleanup code get to the group. + * + * Do not call synchronize_rcu() unconditionally as there are drivers + * which create/delete request queue hundreds of times during scan/boot + * and synchronize_rcu() can take significant time and slow down boot. + */ + if (wait) + synchronize_rcu(); + throtl_td_free(td); +} + +static int __init throtl_init(void) +{ + blkio_policy_register(&blkio_policy_throtl); + return 0; +} + +module_init(throtl_init); -- cgit v1.2.2 From 7702e8f45b0a3bb262b9366c60beb5445758d94c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:36 -0400 Subject: blk-cgroup: cgroup changes for IOPS limit support o cgroup changes for IOPS throttling rules. 
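For reference, every cftype added by this series carries a packed .private
cookie built with BLKIOFILE_PRIVATE(policy, file) so that one generic set of
read/write handlers can serve both the proportional-weight and the throttling
policy. The macro definitions are not part of the hunks quoted here; a
minimal sketch of the assumed encoding (policy id in the high 16 bits,
per-policy file id in the low 16 bits, decode helpers assumed to be named
BLKIOFILE_POLICY()/BLKIOFILE_ATTR()):

	/* Sketch only; the real macros live in block/blk-cgroup.c. */
	#define BLKIOFILE_PRIVATE(pol, attr)	(((pol) << 16) | (attr))
	#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
	#define BLKIOFILE_ATTR(val)		((val) & 0xffff)

With such an encoding, blkiocg_file_read_map() and friends can recover the
owning policy and the specific file from cft->private and switch on the pair,
which matches the nested switch statements visible in the handlers above.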
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++------- block/blk-cgroup.h | 13 +++++ 2 files changed, 135 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index aae8c930a6f8..20ce6f584e43 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -149,6 +149,27 @@ static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, } } +static inline void blkio_update_group_iops(struct blkio_group *blkg, + unsigned int iops, int fileid) +{ + struct blkio_policy_type *blkiop; + + list_for_each_entry(blkiop, &blkio_list, list) { + + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + + if (fileid == BLKIO_THROTL_read_iops_device + && blkiop->ops.blkio_update_group_read_iops_fn) + blkiop->ops.blkio_update_group_read_iops_fn(blkg, iops); + + if (fileid == BLKIO_THROTL_write_iops_device + && blkiop->ops.blkio_update_group_write_iops_fn) + blkiop->ops.blkio_update_group_write_iops_fn(blkg,iops); + } +} + /* * Add to the appropriate stat variable depending on the request type. * This should be called with the blkg->stats_lock held. @@ -630,7 +651,7 @@ static int blkio_policy_parse_and_set(char *buf, { char *s[4], *p, *major_s = NULL, *minor_s = NULL; int ret; - unsigned long major, minor, temp; + unsigned long major, minor, temp, iops; int i = 0; dev_t dev; u64 bps; @@ -692,13 +713,28 @@ static int blkio_policy_parse_and_set(char *buf, newpn->val.weight = temp; break; case BLKIO_POLICY_THROTL: - ret = strict_strtoull(s[1], 10, &bps); - if (ret) - return -EINVAL; + switch(fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + ret = strict_strtoull(s[1], 10, &bps); + if (ret) + return -EINVAL; - newpn->plid = plid; - newpn->fileid = fileid; - newpn->val.bps = bps; + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.bps = bps; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + ret = strict_strtoul(s[1], 10, &iops); + if (ret) + return -EINVAL; + + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.iops = iops; + break; + } break; default: BUG(); @@ -744,6 +780,29 @@ uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) return -1; } +unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_iops_device); + if (pn) + return pn->val.iops; + else + return -1; +} + +unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_iops_device); + if (pn) + return pn->val.iops; + else + return -1; +} + /* Checks whether user asked for deleting a policy rule */ static bool blkio_delete_rule_command(struct blkio_policy_node *pn) { @@ -753,8 +812,17 @@ static bool blkio_delete_rule_command(struct blkio_policy_node *pn) return 1; break; case BLKIO_POLICY_THROTL: - if (pn->val.bps == 0) - return 1; + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + if (pn->val.bps == 0) + return 1; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + if (pn->val.iops == 0) + return 1; + } break; default: BUG(); @@ -771,7 +839,15 @@ static void blkio_update_policy_rule(struct 
blkio_policy_node *oldpn, oldpn->val.weight = newpn->val.weight; break; case BLKIO_POLICY_THROTL: - oldpn->val.bps = newpn->val.bps; + switch(newpn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + oldpn->val.bps = newpn->val.bps; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + oldpn->val.iops = newpn->val.iops; + } break; default: BUG(); @@ -785,7 +861,7 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, struct blkio_group *blkg, struct blkio_policy_node *pn) { - unsigned int weight; + unsigned int weight, iops; u64 bps; switch(pn->plid) { @@ -801,6 +877,11 @@ static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, bps = pn->val.bps ? pn->val.bps : (-1); blkio_update_group_bps(blkg, bps, pn->fileid); break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + iops = pn->val.iops ? pn->val.iops : (-1); + blkio_update_group_iops(blkg, iops, pn->fileid); + break; } break; default: @@ -900,14 +981,18 @@ blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) MINOR(pn->dev), pn->val.weight); break; case BLKIO_POLICY_THROTL: - if (pn->fileid == BLKIO_THROTL_read_bps_device) + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), MINOR(pn->dev), pn->val.bps); - else if (pn->fileid == BLKIO_THROTL_write_bps_device) - seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->val.bps); - else - BUG(); + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.iops); + break; + } break; default: BUG(); @@ -954,6 +1039,8 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, switch(name){ case BLKIO_THROTL_read_bps_device: case BLKIO_THROTL_write_bps_device: + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: blkio_read_policy_node_files(cft, blkcg, m); return 0; default: @@ -1171,6 +1258,24 @@ struct cftype blkio_files[] = { .write_string = blkiocg_file_write, .max_write_len = 256, }, + + { + .name = "throttle.read_iops_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_iops_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.write_iops_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_iops_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, { .name = "time", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 1b738827b2f6..2070053a30b1 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -93,6 +93,8 @@ enum blkcg_file_name_prop { enum blkcg_file_name_throtl { BLKIO_THROTL_read_bps_device, BLKIO_THROTL_write_bps_device, + BLKIO_THROTL_read_iops_device, + BLKIO_THROTL_write_iops_device, BLKIO_THROTL_io_service_bytes, BLKIO_THROTL_io_serviced, }; @@ -168,6 +170,7 @@ struct blkio_policy_node { * by file type "fileid". 
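 * Because the members live in a union, a policy node carries exactly one
 * rule: a weight, a bps rate or (with the iops member added below) an
 * iops value, and plid/fileid tell them apart.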
*/ u64 bps; + unsigned int iops; } val; }; @@ -177,6 +180,10 @@ extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev); extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev); +extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, + dev_t dev); +extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, + dev_t dev); typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, @@ -185,12 +192,18 @@ typedef void (blkio_update_group_read_bps_fn) (struct blkio_group *blkg, u64 read_bps); typedef void (blkio_update_group_write_bps_fn) (struct blkio_group *blkg, u64 write_bps); +typedef void (blkio_update_group_read_iops_fn) (struct blkio_group *blkg, + unsigned int read_iops); +typedef void (blkio_update_group_write_iops_fn) (struct blkio_group *blkg, + unsigned int write_iops); struct blkio_policy_ops { blkio_unlink_group_fn *blkio_unlink_group_fn; blkio_update_group_weight_fn *blkio_update_group_weight_fn; blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn; blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn; + blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn; + blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn; }; struct blkio_policy_type { -- cgit v1.2.2 From 8e89d13f4ede2467629a971618537430fafaaea3 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:37 -0400 Subject: blkio: Implementation of IOPS limit logic o core logic of implementing IOPS throttling. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 164 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 127 insertions(+), 37 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 4b492011e0de..af53f37c1b13 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -59,8 +59,13 @@ struct throtl_grp { /* bytes per second rate limits */ uint64_t bps[2]; + /* IOPS limits */ + unsigned int iops[2]; + /* Number of bytes disptached in current slice */ uint64_t bytes_disp[2]; + /* Number of bio's dispatched in current slice */ + unsigned int io_disp[2]; /* When did we start a new slice */ unsigned long slice_start[2]; @@ -194,6 +199,8 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); + tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); + tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); hlist_add_head(&tg->tg_node, &td->tg_list); td->nr_undestroyed_grps++; @@ -335,6 +342,7 @@ static inline void throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) { tg->bytes_disp[rw] = 0; + tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + throtl_slice; throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", @@ -365,7 +373,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) static inline void throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) { - unsigned long nr_slices, bytes_trim, time_elapsed; + unsigned long nr_slices, bytes_trim, time_elapsed, io_trim; BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); @@ -385,8 +393,9 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) return; bytes_trim = (tg->bps[rw] * throtl_slice * 
nr_slices)/HZ; + io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; - if (!bytes_trim) + if (!bytes_trim && !io_trim) return; if (tg->bytes_disp[rw] >= bytes_trim) @@ -394,51 +403,62 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) else tg->bytes_disp[rw] = 0; + if (tg->io_disp[rw] >= io_trim) + tg->io_disp[rw] -= io_trim; + else + tg->io_disp[rw] = 0; + tg->slice_start[rw] += nr_slices * throtl_slice; - throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu" + throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu io=%lu" " start=%lu end=%lu jiffies=%lu", - rw == READ ? 'R' : 'W', nr_slices, bytes_trim, + rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], jiffies); } -/* - * Returns whether one can dispatch a bio or not. Also returns approx number - * of jiffies to wait before this bio is with-in IO rate and can be dispatched - */ -static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, - struct bio *bio, unsigned long *wait) +static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) { bool rw = bio_data_dir(bio); - u64 bytes_allowed, extra_bytes; + unsigned int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; - /* - * Currently whole state machine of group depends on first bio - * queued in the group bio list. So one should not be calling - * this function with a different bio if there are other bios - * queued. - */ - BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; - /* If tg->bps = -1, then BW is unlimited */ - if (tg->bps[rw] == -1) { + /* Slice has just started. Consider one slice interval */ + if (!jiffy_elapsed) + jiffy_elapsed_rnd = throtl_slice; + + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); + + io_allowed = (tg->iops[rw] * jiffies_to_msecs(jiffy_elapsed_rnd)) + / MSEC_PER_SEC; + + if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) *wait = 0; return 1; } - /* - * If previous slice expired, start a new one otherwise renew/extend - * existing slice to make sure it is at least throtl_slice interval - * long since now. - */ - if (throtl_slice_used(td, tg, rw)) - throtl_start_new_slice(td, tg, rw); - else { - if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) - throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); - } + /* Calc approx time to dispatch */ + jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; + + if (jiffy_wait > jiffy_elapsed) + jiffy_wait = jiffy_wait - jiffy_elapsed; + else + jiffy_wait = 1; + + if (wait) + *wait = jiffy_wait; + return 0; +} + +static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + u64 bytes_allowed, extra_bytes; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; @@ -469,12 +489,62 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, * up we did. Add that time also. */ jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); - if (wait) *wait = jiffy_wait; + return 0; +} + +/* + * Returns whether one can dispatch a bio or not. 
Also returns approx number + * of jiffies to wait before this bio is with-in IO rate and can be dispatched + */ +static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; + + /* + * Currently whole state machine of group depends on first bio + * queued in the group bio list. So one should not be calling + * this function with a different bio if there are other bios + * queued. + */ + BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); - if (time_before(tg->slice_end[rw], jiffies + jiffy_wait)) - throtl_extend_slice(td, tg, rw, jiffies + jiffy_wait); + /* If tg->bps = -1, then BW is unlimited */ + if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { + if (wait) + *wait = 0; + return 1; + } + + /* + * If previous slice expired, start a new one otherwise renew/extend + * existing slice to make sure it is at least throtl_slice interval + * long since now. + */ + if (throtl_slice_used(td, tg, rw)) + throtl_start_new_slice(td, tg, rw); + else { + if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) + throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); + } + + if (tg_with_in_bps_limit(td, tg, bio, &bps_wait) + && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) { + if (wait) + *wait = 0; + return 1; + } + + max_wait = max(bps_wait, iops_wait); + + if (wait) + *wait = max_wait; + + if (time_before(tg->slice_end[rw], jiffies + max_wait)) + throtl_extend_slice(td, tg, rw, jiffies + max_wait); return 0; } @@ -486,13 +556,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) /* Charge the bio to the group */ tg->bytes_disp[rw] += bio->bi_size; + tg->io_disp[rw]++; /* * TODO: This will take blkg->stats_lock. Figure out a way * to avoid this cost. */ blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); - } static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, @@ -763,6 +833,18 @@ static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg, tg_of_blkg(blkg)->bps[WRITE] = write_bps; } +static void throtl_update_blkio_group_read_iops (struct blkio_group *blkg, + unsigned int read_iops) +{ + tg_of_blkg(blkg)->iops[READ] = read_iops; +} + +static void throtl_update_blkio_group_write_iops (struct blkio_group *blkg, + unsigned int write_iops) +{ + tg_of_blkg(blkg)->iops[WRITE] = write_iops; +} + void throtl_shutdown_timer_wq(struct request_queue *q) { struct throtl_data *td = q->td; @@ -777,7 +859,12 @@ static struct blkio_policy_type blkio_policy_throtl = { throtl_update_blkio_group_read_bps, .blkio_update_group_write_bps_fn = throtl_update_blkio_group_write_bps, + .blkio_update_group_read_iops_fn = + throtl_update_blkio_group_read_iops, + .blkio_update_group_write_iops_fn = + throtl_update_blkio_group_write_iops, }, + .plid = BLKIO_POLICY_THROTL, }; int blk_throtl_bio(struct request_queue *q, struct bio **biop) @@ -811,9 +898,11 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) } queue_bio: - throtl_log_tg(td, tg, "[%c] bio. disp=%u sz=%u bps=%llu" - " queued=%d/%d", rw == READ ? 'R' : 'W', + throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu" + " iodisp=%u iops=%u queued=%d/%d", + rw == READ ? 
'R' : 'W', tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], + tg->io_disp[rw], tg->iops[rw], tg->nr_queued[READ], tg->nr_queued[WRITE]); throtl_add_bio_tg(q->td, tg, bio); @@ -850,6 +939,7 @@ int blk_throtl_init(struct request_queue *q) /* Practically unlimited BW */ tg->bps[0] = tg->bps[1] = -1; + tg->iops[0] = tg->iops[1] = -1; atomic_set(&tg->ref, 1); INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); -- cgit v1.2.2 From 01ea50638bc04ca5259f5711fcdedefcdde1cf43 Mon Sep 17 00:00:00 2001 From: "Signed-off-by: Jan Kara" Date: Thu, 16 Sep 2010 20:36:36 +0200 Subject: block: Fix race during disk initialization When a new disk is being discovered, add_disk() first ties the bdev to gendisk (via register_disk()->blkdev_get()) and only after that calls bdi_register_bdev(). Because register_disk() also creates disk's kobject, it can happen that userspace manages to open and modify the device's data (or inode) before its BDI is properly initialized leading to a warning in __mark_inode_dirty(). Fix the problem by registering BDI early enough. This patch addresses https://bugzilla.kernel.org/show_bug.cgi?id=16312 Cc: stable@kernel.org Reported-by: Larry Finger Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/genhd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 5c9c503de423..7923e720ddf5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -541,13 +541,15 @@ void add_disk(struct gendisk *disk) disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); + /* Register BDI before referencing it from bdev */ + bdi = &disk->queue->backing_dev_info; + bdi_register_dev(bdi, disk_devt(disk)); + blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); register_disk(disk); blk_register_queue(disk); - bdi = &disk->queue->backing_dev_info; - bdi_register_dev(bdi, disk_devt(disk)); retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); -- cgit v1.2.2 From dd3932eddf428571762596e17b65f5dc92ca361b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Sep 2010 20:51:46 +0200 Subject: block: remove BLKDEV_IFL_WAIT All the blkdev_issue_* helpers can only sanely be used for synchronous caller. To issue cache flushes or barriers asynchronously the caller needs to set up a bio by itself with a completion callback to move the asynchronous state machine ahead. So drop the BLKDEV_IFL_WAIT flag that is always specified when calling blkdev_issue_* and also remove the now unused flags argument to blkdev_issue_flush and blkdev_issue_zeroout. For blkdev_issue_discard we need to keep it for the secure discard flag, which gains a more descriptive name and loses the bitops vs flag confusion. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-flush.c | 25 +++++++++++-------------- block/blk-lib.c | 21 ++++++++------------- block/ioctl.c | 4 ++-- 3 files changed, 21 insertions(+), 29 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 62b7df9bca9d..54b123d6563e 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -205,7 +205,6 @@ static void bio_end_flush(struct bio *bio, int err) * @bdev: blockdev to issue flush for * @gfp_mask: memory allocation flags (for bio_alloc) * @error_sector: error sector - * @flags: BLKDEV_IFL_* flags to control behaviour * * Description: * Issue a flush for the block device in question. 
Caller can supply @@ -214,7 +213,7 @@ static void bio_end_flush(struct bio *bio, int err) * request was pushed in some internal queue for later handling. */ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, - sector_t *error_sector, unsigned long flags) + sector_t *error_sector) { DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q; @@ -240,21 +239,19 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, bio = bio_alloc(gfp_mask, 0); bio->bi_end_io = bio_end_flush; bio->bi_bdev = bdev; - if (test_bit(BLKDEV_WAIT, &flags)) - bio->bi_private = &wait; + bio->bi_private = &wait; bio_get(bio); submit_bio(WRITE_FLUSH, bio); - if (test_bit(BLKDEV_WAIT, &flags)) { - wait_for_completion(&wait); - /* - * The driver must store the error location in ->bi_sector, if - * it supports it. For non-stacked drivers, this should be - * copied from blk_rq_pos(rq). - */ - if (error_sector) - *error_sector = bio->bi_sector; - } + wait_for_completion(&wait); + + /* + * The driver must store the error location in ->bi_sector, if + * it supports it. For non-stacked drivers, this should be + * copied from blk_rq_pos(rq). + */ + if (error_sector) + *error_sector = bio->bi_sector; if (!bio_flagged(bio, BIO_UPTODATE)) ret = -EIO; diff --git a/block/blk-lib.c b/block/blk-lib.c index fe2e6ed0f510..1a320d2406b0 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -61,7 +61,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, max_discard_sectors &= ~(disc_sects - 1); } - if (flags & BLKDEV_IFL_SECURE) { + if (flags & BLKDEV_DISCARD_SECURE) { if (!blk_queue_secdiscard(q)) return -EOPNOTSUPP; type |= REQ_SECURE; @@ -77,8 +77,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, bio->bi_sector = sector; bio->bi_end_io = blkdev_discard_end_io; bio->bi_bdev = bdev; - if (flags & BLKDEV_IFL_WAIT) - bio->bi_private = &wait; + bio->bi_private = &wait; if (nr_sects > max_discard_sectors) { bio->bi_size = max_discard_sectors << 9; @@ -92,8 +91,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, bio_get(bio); submit_bio(type, bio); - if (flags & BLKDEV_IFL_WAIT) - wait_for_completion(&wait); + wait_for_completion(&wait); if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; @@ -139,7 +137,6 @@ static void bio_batch_end_io(struct bio *bio, int err) * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: BLKDEV_IFL_* flags to control behaviour * * Description: * Generate and issue number of bios with zerofiled pages. 
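Since the helpers are now unconditionally synchronous, callers simply drop
the old wait flag; blkdev_issue_zeroout() likewise loses its flags argument
in the next hunk. A hedged sketch of a caller under the new
blkdev_issue_flush() prototype (the function name and the error-handling
policy below are illustrative, not taken from this patch):

	#include <linux/blkdev.h>

	/* Illustrative only: synchronously flush a device's write cache. */
	static int example_flush_cache(struct block_device *bdev)
	{
		sector_t error_sector;
		int err;

		err = blkdev_issue_flush(bdev, GFP_KERNEL, &error_sector);
		if (err == -EOPNOTSUPP)
			err = 0;	/* some callers treat a missing flush as harmless */
		return err;
	}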
@@ -148,7 +145,7 @@ static void bio_batch_end_io(struct bio *bio, int err) */ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) + sector_t nr_sects, gfp_t gfp_mask) { int ret; struct bio *bio; @@ -174,8 +171,7 @@ submit: bio->bi_sector = sector; bio->bi_bdev = bdev; bio->bi_end_io = bio_batch_end_io; - if (flags & BLKDEV_IFL_WAIT) - bio->bi_private = &bb; + bio->bi_private = &bb; while (nr_sects != 0) { sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); @@ -193,10 +189,9 @@ submit: submit_bio(WRITE, bio); } - if (flags & BLKDEV_IFL_WAIT) - /* Wait for bios in-flight */ - while ( issued != atomic_read(&bb.done)) - wait_for_completion(&wait); + /* Wait for bios in-flight */ + while (issued != atomic_read(&bb.done)) + wait_for_completion(&wait); if (!test_bit(BIO_UPTODATE, &bb.flags)) /* One of bios in the batch was completed with error.*/ diff --git a/block/ioctl.c b/block/ioctl.c index d8052f0dabd3..cb2b9099862b 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -116,7 +116,7 @@ static int blkdev_reread_part(struct block_device *bdev) static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, uint64_t len, int secure) { - unsigned long flags = BLKDEV_IFL_WAIT; + unsigned long flags = 0; if (start & 511) return -EINVAL; @@ -128,7 +128,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, if (start + len > (bdev->bd_inode->i_size >> 9)) return -EINVAL; if (secure) - flags |= BLKDEV_IFL_SECURE; + flags |= BLKDEV_DISCARD_SECURE; return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); } -- cgit v1.2.2 From 749ef9f8423054e326f3a246327ed2db4b6d395f Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Mon, 20 Sep 2010 15:24:50 +0200 Subject: cfq: improve fsync performance for small files Fsync performance for small files achieved by cfq on high-end disks is lower than what deadline can achieve, due to idling introduced between the sync write happening in process context and the journal commit. Moreover, when competing with a sequential reader, a process writing small files and fsync-ing them is starved. This patch fixes the two problems by: - marking journal commits as WRITE_SYNC, so that they get the REQ_NOIDLE flag set, - force all queues that have REQ_NOIDLE requests to be put in the noidle tree. Having the queue associated to the fsync-ing process and the one associated to journal commits in the noidle tree allows: - switching between them without idling, - fairness vs. competing idling queues, since they will be serviced only after the noidle tree expires its slice. Acked-by: Vivek Goyal Reviewed-by: Jeff Moyer Tested-by: Jeff Moyer Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b9f86190763b..684592621736 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -216,7 +216,6 @@ struct cfq_data { enum wl_type_t serving_type; unsigned long workload_expires; struct cfq_group *serving_group; - bool noidle_tree_requires_idle; /* * Each priority tree is sorted by next_request position. 
These @@ -2126,7 +2125,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) slice = max_t(unsigned, slice, CFQ_MIN_TT); cfq_log(cfqd, "workload slice:%d", slice); cfqd->workload_expires = jiffies + slice; - cfqd->noidle_tree_requires_idle = false; } static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) @@ -3108,7 +3106,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq->queued[0] + cfqq->queued[1] >= 4) cfq_mark_cfqq_deep(cfqq); - if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || + if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) + enable_idle = 0; + else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { @@ -3421,17 +3421,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_slice_expired(cfqd, 1); else if (sync && cfqq_empty && !cfq_close_cooperator(cfqd, cfqq)) { - cfqd->noidle_tree_requires_idle |= - !(rq->cmd_flags & REQ_NOIDLE); - /* - * Idling is enabled for SYNC_WORKLOAD. - * SYNC_NOIDLE_WORKLOAD idles at the end of the tree - * only if we processed at least one !REQ_NOIDLE request - */ - if (cfqd->serving_type == SYNC_WORKLOAD - || cfqd->noidle_tree_requires_idle - || cfqq->cfqg->nr_cfqq == 1) - cfq_arm_slice_timer(cfqd); + cfq_arm_slice_timer(cfqd); } } -- cgit v1.2.2 From 4b1977698ceb4c4caa800d475127139da49966f9 Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Fri, 24 Sep 2010 09:51:13 -0400 Subject: block: Prevent hang_check firing during long I/O During long I/O operations, the hang_check timer may fire, trigger stack dumps that unnecessarily alarm the user. Eg. hdparm --security-erase NULL /dev/sdb ## can take *hours* to complete So, if hang_check is armed, we should wake up periodically to prevent it from triggering. This patch uses a wake-up interval equal to half the hang_check timer period, which keeps overhead low enough. Signed-off-by: Mark Lord Signed-off-by: Jens Axboe --- block/blk-exec.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-exec.c b/block/blk-exec.c index e1672f14840e..cf1456a02acd 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -80,6 +80,7 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, DECLARE_COMPLETION_ONSTACK(wait); char sense[SCSI_SENSE_BUFFERSIZE]; int err = 0; + unsigned long hang_check; /* * we need an extra reference to the request, so we can look at @@ -95,7 +96,13 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, rq->end_io_data = &wait; blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); - wait_for_completion(&wait); + + /* Prevent hang_check timer from firing at us during very long I/O */ + hang_check = sysctl_hung_task_timeout_secs; + if (hang_check) + while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2))); + else + wait_for_completion(&wait); if (rq->errors) err = -EIO; -- cgit v1.2.2 From c49825facfd4969585224a896a5e717f88450cad Mon Sep 17 00:00:00 2001 From: Malahal Naineni Date: Fri, 24 Sep 2010 20:25:49 +0200 Subject: block: set the bounce_pfn to the actual DMA limit rather than to max memory The bounce_pfn of the request queue in 64 bit systems is set to the current max_low_pfn. Adding more memory later makes this incorrect. 
Memory allocated beyond this boot time max_low_pfn appear to require bounce buffers (bounce buffers are actually not allocated but used in calculating segments that may result in "over max segments limit" errors). Signed-off-by: Malahal Naineni Signed-off-by: Jens Axboe --- block/blk-settings.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index f8f2ddf20613..f47af5031eaa 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -214,16 +214,14 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) */ if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) dma = 1; - q->limits.bounce_pfn = max_low_pfn; #else if (b_pfn < blk_max_low_pfn) dma = 1; - q->limits.bounce_pfn = b_pfn; #endif + q->limits.bounce_pfn = b_pfn; if (dma) { init_emergency_isa_pool(); q->bounce_gfp = GFP_NOIO | GFP_DMA; - q->limits.bounce_pfn = b_pfn; } } EXPORT_SYMBOL(blk_queue_bounce_limit); -- cgit v1.2.2 From 260a67a9e534f0c7d49ddd6451833d54ba39ac81 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 1 Oct 2010 14:42:43 +0200 Subject: block: revert bad fix for memory hotplug causing bounces Revert "block: set the bounce_pfn to the actual DMA limit rather than to max memory" This reverts commit c49825facfd4969585224a896a5e717f88450cad. Signed-off-by: Jens Axboe --- block/blk-settings.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index f47af5031eaa..f8f2ddf20613 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -214,14 +214,16 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) */ if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) dma = 1; + q->limits.bounce_pfn = max_low_pfn; #else if (b_pfn < blk_max_low_pfn) dma = 1; -#endif q->limits.bounce_pfn = b_pfn; +#endif if (dma) { init_emergency_isa_pool(); q->bounce_gfp = GFP_NOIO | GFP_DMA; + q->limits.bounce_pfn = b_pfn; } } EXPORT_SYMBOL(blk_queue_bounce_limit); -- cgit v1.2.2 From efb012b361cf9319cd86ff169afa7550b7aa9336 Mon Sep 17 00:00:00 2001 From: Malahal Naineni Date: Fri, 1 Oct 2010 14:45:27 +0200 Subject: block: set the bounce_pfn to the actual DMA limit rather than to max memory The bounce_pfn of the request queue in 64 bit systems is set to the current max_low_pfn. Adding more memory later makes this incorrect. Memory allocated beyond this boot time max_low_pfn appear to require bounce buffers (bounce buffers are actually not allocated but used in calculating segments that may result in "over max segments limit" errors). 
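To make the failure mode concrete, here is a small standalone C sketch (userspace only, with made-up pfn values; the real logic lives in blk_queue_bounce_limit() and the bounce code): a page is treated as needing a bounce buffer when its pfn lies above the queue's bounce_pfn, so a bounce_pfn frozen at the boot-time max_low_pfn misclassifies memory that is hot-added later, even though the device could DMA to it directly.

#include <stdbool.h>
#include <stdio.h>

static bool needs_bounce(unsigned long page_pfn, unsigned long bounce_pfn)
{
	/* pages above the queue's bounce_pfn are treated as un-DMA-able */
	return page_pfn > bounce_pfn;
}

int main(void)
{
	unsigned long boot_max_low_pfn = 0x100000;  /* low-memory limit at boot (illustrative) */
	unsigned long device_dma_pfn   = 0x400000;  /* what the device can really address */
	unsigned long hotplug_pfn      = 0x200000;  /* page hot-added after boot */

	/* stale limit: the hot-added page wrongly appears to need bouncing */
	printf("stale bounce_pfn:  bounce=%d\n",
	       needs_bounce(hotplug_pfn, boot_max_low_pfn));
	/* device's real DMA limit: no bounce needed */
	printf("device bounce_pfn: bounce=%d\n",
	       needs_bounce(hotplug_pfn, device_dma_pfn));
	return 0;
}
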
Signed-off-by: Malahal Naineni Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index f8f2ddf20613..a3600a7ab8bb 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -214,7 +214,7 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) */ if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) dma = 1; - q->limits.bounce_pfn = max_low_pfn; + q->limits.bounce_pfn = max(max_low_pfn, b_pfn); #else if (b_pfn < blk_max_low_pfn) dma = 1; -- cgit v1.2.2 From 13f98250f587b7defa39ed738dfa74b600e46e7b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 14:49:41 +0200 Subject: blkio: Do not export throttle files if CONFIG_BLK_DEV_THROTTLING=n Currently throttling related files were visible even if user had disabled throttling using config options. It was switching off background throttling of bio but not the cgroup files. This patch fixes it. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 97 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 47 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 20ce6f584e43..86e7066a0b62 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1241,41 +1241,6 @@ struct cftype blkio_files[] = { .read_u64 = blkiocg_file_read_u64, .write_u64 = blkiocg_file_write_u64, }, - { - .name = "throttle.read_bps_device", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_read_bps_device), - .read_seq_string = blkiocg_file_read, - .write_string = blkiocg_file_write, - .max_write_len = 256, - }, - - { - .name = "throttle.write_bps_device", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_write_bps_device), - .read_seq_string = blkiocg_file_read, - .write_string = blkiocg_file_write, - .max_write_len = 256, - }, - - { - .name = "throttle.read_iops_device", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_read_iops_device), - .read_seq_string = blkiocg_file_read, - .write_string = blkiocg_file_write, - .max_write_len = 256, - }, - - { - .name = "throttle.write_iops_device", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_write_iops_device), - .read_seq_string = blkiocg_file_read, - .write_string = blkiocg_file_write, - .max_write_len = 256, - }, { .name = "time", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, @@ -1294,24 +1259,12 @@ struct cftype blkio_files[] = { BLKIO_PROP_io_service_bytes), .read_map = blkiocg_file_read_map, }, - { - .name = "throttle.io_service_bytes", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_io_service_bytes), - .read_map = blkiocg_file_read_map, - }, { .name = "io_serviced", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, BLKIO_PROP_io_serviced), .read_map = blkiocg_file_read_map, }, - { - .name = "throttle.io_serviced", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, - BLKIO_THROTL_io_serviced), - .read_map = blkiocg_file_read_map, - }, { .name = "io_service_time", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, @@ -1340,6 +1293,56 @@ struct cftype blkio_files[] = { .name = "reset_stats", .write_u64 = blkiocg_reset_stats, }, +#ifdef CONFIG_BLK_DEV_THROTTLING + { + .name = "throttle.read_bps_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_bps_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + 
.max_write_len = 256, + }, + + { + .name = "throttle.write_bps_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_bps_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.read_iops_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_iops_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.write_iops_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_iops_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + { + .name = "throttle.io_service_bytes", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_io_service_bytes), + .read_map = blkiocg_file_read_map, + }, + { + .name = "throttle.io_serviced", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_io_serviced), + .read_map = blkiocg_file_read_map, + }, +#endif /* CONFIG_BLK_DEV_THROTTLING */ + #ifdef CONFIG_DEBUG_BLK_CGROUP { .name = "avg_queue_size", -- cgit v1.2.2 From 61014e96e6ed55b8db0af31574eec2a75d4e8755 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 14:49:44 +0200 Subject: blkio: deletion of a cgroup was causes oops o Now a cgroup list of blkg elements can contain blkg from multiple policies. Before sending an unlink event, make sure blkg belongs to they policy. If policy does not own the blkg, do not send update for this blkg. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 86e7066a0b62..b06ca70354e3 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1411,13 +1411,14 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) /* * This blkio_group is being unlinked as associated cgroup is * going away. Let all the IO controlling policies know about - * this event. Currently this is static call to one io - * controlling policy. Once we have more policies in place, we - * need some dynamic registration of callback function. + * this event. */ spin_lock(&blkio_list_lock); - list_for_each_entry(blkiop, &blkio_list, list) + list_for_each_entry(blkiop, &blkio_list, list) { + if (blkiop->plid != blkg->plid) + continue; blkiop->ops.blkio_unlink_group_fn(key, blkg); + } spin_unlock(&blkio_list_lock); } while (1); -- cgit v1.2.2 From 02977e4af7ed3b478c505e50491ffdf3e1314cf4 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 14:49:48 +0200 Subject: blkio: Add root group to td->tg_list o Currently all the dynamically allocated groups, except root grp is added to td->tg_list. This was not a problem so far but in next patch I will travel through td->tg_list to process any updates of limits on the group. If root group is not in tg_list, then root group's updates are not processed. o It is better to root group also to tg_list instead of doing special processing for it during limit updates. 
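The extra reference can be illustrated with a minimal userspace sketch (the struct and function names below are illustrative, not the kernel's): the statically allocated root group starts with a count of two so that the generic put path, which runs once when the group list is torn down, can never drop the count to zero and try to free memory that was never dynamically allocated.

#include <stdio.h>
#include <stdlib.h>

struct group {
	int ref;
	int is_static;
};

static void group_put(struct group *g)
{
	if (--g->ref)
		return;
	if (g->is_static)
		printf("bug: would free a statically allocated group\n");
	else
		free(g);
}

int main(void)
{
	/* one reference for the group list, one for the owning structure */
	static struct group root = { .ref = 2, .is_static = 1 };

	group_put(&root);	/* dropped when the group list is torn down */
	/* the remaining reference goes away with the containing object,
	 * never through group_put() reaching zero */
	printf("root.ref = %d\n", root.ref);
	return 0;
}
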
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index af53f37c1b13..bc2936b80add 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -87,7 +87,7 @@ struct throtl_data unsigned int nr_queued[2]; /* - * number of total undestroyed groups (excluding root group) + * number of total undestroyed groups */ unsigned int nr_undestroyed_grps; @@ -940,7 +940,17 @@ int blk_throtl_init(struct request_queue *q) /* Practically unlimited BW */ tg->bps[0] = tg->bps[1] = -1; tg->iops[0] = tg->iops[1] = -1; - atomic_set(&tg->ref, 1); + + /* + * Set root group reference to 2. One reference will be dropped when + * all groups on tg_list are being deleted during queue exit. Other + * reference will remain there as we don't want to delete this group + * as it is statically allocated and gets destroyed when throtl_data + * goes away. + */ + atomic_set(&tg->ref, 2); + hlist_add_head(&tg->tg_node, &td->tg_list); + td->nr_undestroyed_grps++; INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); @@ -966,10 +976,9 @@ void blk_throtl_exit(struct request_queue *q) spin_lock_irq(q->queue_lock); throtl_release_tgs(td); - blkiocg_del_blkio_group(&td->root_tg.blkg); /* If there are other groups */ - if (td->nr_undestroyed_grps >= 1) + if (td->nr_undestroyed_grps > 0) wait = true; spin_unlock_irq(q->queue_lock); -- cgit v1.2.2 From fe0714377ee2ca161bf2afb7773e22f15f1786d4 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 14:49:49 +0200 Subject: blkio: Recalculate the throttled bio dispatch time upon throttle limit change o Currently any cgroup throttle limit changes are processed asynchronousy and the change does not take affect till a new bio is dispatched from same group. o It might happen that a user sets a redicuously low limit on throttling. Say 1 bytes per second on reads. In such cases simple operations like mount a disk can wait for a very long time. o Once bio is throttled, there is no easy way to come out of that wait even if user increases the read limit later. o This patch fixes it. Now if a user changes the cgroup limits, we recalculate the bio dispatch time according to new limits. o Can't take queueu lock under blkcg_lock, hence after the change I wake up the dispatch thread again which recalculates the time. So there are some variables being synchronized across two threads without lock and I had to make use of barriers. Hoping I have used barriers correctly. Any review of memory barrier code especially will help. 
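A rough userspace model of that publish-and-poke pattern, using C11 atomics in place of the kernel's explicit barriers (names are illustrative; this sketches only the ordering, not the throttling code itself): the updater stores the new limit, sets the per-group flag with release semantics, bumps a global counter and would then schedule the work item; the worker checks the counter, consumes the flag with acquire semantics and recomputes the dispatch time.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct group {
	unsigned long long read_bps;
	atomic_bool limits_changed;
};

static atomic_int pending_changes;

/* cgroup side: publish the new limit, flag the group, poke the worker */
static void update_read_bps(struct group *g, unsigned long long bps)
{
	g->read_bps = bps;
	atomic_store_explicit(&g->limits_changed, true, memory_order_release);
	atomic_fetch_add(&pending_changes, 1);
	/* the real code would schedule the delayed work item here */
}

/* worker side: recompute the dispatch time of every flagged group */
static void process_limit_change(struct group *g)
{
	if (!atomic_load(&pending_changes))
		return;
	if (atomic_exchange_explicit(&g->limits_changed, false,
				     memory_order_acquire))
		printf("recompute dispatch time, read_bps=%llu\n",
		       g->read_bps);
	atomic_fetch_sub(&pending_changes, 1);
}

int main(void)
{
	struct group g = { .read_bps = 1ULL << 20 };

	update_read_bps(&g, 4096);
	process_limit_change(&g);
	return 0;
}
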
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 15 ++++-- block/blk-cgroup.h | 21 ++++---- block/blk-throttle.c | 134 ++++++++++++++++++++++++++++++++++++++++++++------- block/cfq-iosched.c | 4 +- 4 files changed, 139 insertions(+), 35 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b06ca70354e3..52c12130a5de 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -124,7 +124,8 @@ blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) if (blkiop->plid != blkg->plid) continue; if (blkiop->ops.blkio_update_group_weight_fn) - blkiop->ops.blkio_update_group_weight_fn(blkg, weight); + blkiop->ops.blkio_update_group_weight_fn(blkg->key, + blkg, weight); } } @@ -141,11 +142,13 @@ static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, if (fileid == BLKIO_THROTL_read_bps_device && blkiop->ops.blkio_update_group_read_bps_fn) - blkiop->ops.blkio_update_group_read_bps_fn(blkg, bps); + blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, + blkg, bps); if (fileid == BLKIO_THROTL_write_bps_device && blkiop->ops.blkio_update_group_write_bps_fn) - blkiop->ops.blkio_update_group_write_bps_fn(blkg, bps); + blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, + blkg, bps); } } @@ -162,11 +165,13 @@ static inline void blkio_update_group_iops(struct blkio_group *blkg, if (fileid == BLKIO_THROTL_read_iops_device && blkiop->ops.blkio_update_group_read_iops_fn) - blkiop->ops.blkio_update_group_read_iops_fn(blkg, iops); + blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, + blkg, iops); if (fileid == BLKIO_THROTL_write_iops_device && blkiop->ops.blkio_update_group_write_iops_fn) - blkiop->ops.blkio_update_group_write_iops_fn(blkg,iops); + blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, + blkg,iops); } } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 2070053a30b1..034c35562dba 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -186,16 +186,17 @@ extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev); typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); -typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, - unsigned int weight); -typedef void (blkio_update_group_read_bps_fn) (struct blkio_group *blkg, - u64 read_bps); -typedef void (blkio_update_group_write_bps_fn) (struct blkio_group *blkg, - u64 write_bps); -typedef void (blkio_update_group_read_iops_fn) (struct blkio_group *blkg, - unsigned int read_iops); -typedef void (blkio_update_group_write_iops_fn) (struct blkio_group *blkg, - unsigned int write_iops); + +typedef void (blkio_update_group_weight_fn) (void *key, + struct blkio_group *blkg, unsigned int weight); +typedef void (blkio_update_group_read_bps_fn) (void * key, + struct blkio_group *blkg, u64 read_bps); +typedef void (blkio_update_group_write_bps_fn) (void *key, + struct blkio_group *blkg, u64 write_bps); +typedef void (blkio_update_group_read_iops_fn) (void *key, + struct blkio_group *blkg, unsigned int read_iops); +typedef void (blkio_update_group_write_iops_fn) (void *key, + struct blkio_group *blkg, unsigned int write_iops); struct blkio_policy_ops { blkio_unlink_group_fn *blkio_unlink_group_fn; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index bc2936b80add..11713ed852f4 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -70,6 +70,9 @@ struct throtl_grp { /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; + + /* 
Some throttle limits got updated for the group */ + bool limits_changed; }; struct throtl_data @@ -93,6 +96,8 @@ struct throtl_data /* Work for dispatching throttled bios */ struct delayed_work throtl_work; + + atomic_t limits_changed; }; enum tg_state_flags { @@ -592,15 +597,6 @@ static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) min_wait = min(read_wait, write_wait); disptime = jiffies + min_wait; - /* - * If group is already on active tree, then update dispatch time - * only if it is lesser than existing dispatch time. Otherwise - * always update the dispatch time - */ - - if (throtl_tg_on_rr(tg) && time_before(disptime, tg->disptime)) - return; - /* Update dispatch time */ throtl_dequeue_tg(td, tg); tg->disptime = disptime; @@ -691,6 +687,46 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) return nr_disp; } +static void throtl_process_limit_change(struct throtl_data *td) +{ + struct throtl_grp *tg; + struct hlist_node *pos, *n; + + /* + * Make sure atomic_inc() effects from + * throtl_update_blkio_group_read_bps(), group of functions are + * visible. + * Is this required or smp_mb__after_atomic_inc() was suffcient + * after the atomic_inc(). + */ + smp_rmb(); + if (!atomic_read(&td->limits_changed)) + return; + + throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed)); + + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { + /* + * Do I need an smp_rmb() here to make sure tg->limits_changed + * update is visible. I am relying on smp_rmb() at the + * beginning of function and not putting a new one here. + */ + + if (throtl_tg_on_rr(tg) && tg->limits_changed) { + throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" + " riops=%u wiops=%u", tg->bps[READ], + tg->bps[WRITE], tg->iops[READ], + tg->iops[WRITE]); + tg_update_disptime(td, tg); + tg->limits_changed = false; + } + } + + smp_mb__before_atomic_dec(); + atomic_dec(&td->limits_changed); + smp_mb__after_atomic_dec(); +} + /* Dispatch throttled bios. Should be called without queue lock held. */ static int throtl_dispatch(struct request_queue *q) { @@ -701,6 +737,8 @@ static int throtl_dispatch(struct request_queue *q) spin_lock_irq(q->queue_lock); + throtl_process_limit_change(td); + if (!total_nr_queued(td)) goto out; @@ -821,28 +859,74 @@ void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) spin_unlock_irqrestore(td->queue->queue_lock, flags); } -static void throtl_update_blkio_group_read_bps (struct blkio_group *blkg, - u64 read_bps) +/* + * For all update functions, key should be a valid pointer because these + * update functions are called under blkcg_lock, that means, blkg is + * valid and in turn key is valid. queue exit path can not race becuase + * of blkcg_lock + * + * Can not take queue lock in update functions as queue lock under blkcg_lock + * is not allowed. Under other paths we take blkcg_lock under queue_lock. 
+ */ +static void throtl_update_blkio_group_read_bps(void *key, + struct blkio_group *blkg, u64 read_bps) { + struct throtl_data *td = key; + tg_of_blkg(blkg)->bps[READ] = read_bps; + /* Make sure read_bps is updated before setting limits_changed */ + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + + /* Make sure tg->limits_changed is updated before td->limits_changed */ + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + + /* Schedule a work now to process the limit change */ + throtl_schedule_delayed_work(td->queue, 0); } -static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg, - u64 write_bps) +static void throtl_update_blkio_group_write_bps(void *key, + struct blkio_group *blkg, u64 write_bps) { + struct throtl_data *td = key; + tg_of_blkg(blkg)->bps[WRITE] = write_bps; + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + throtl_schedule_delayed_work(td->queue, 0); } -static void throtl_update_blkio_group_read_iops (struct blkio_group *blkg, - unsigned int read_iops) +static void throtl_update_blkio_group_read_iops(void *key, + struct blkio_group *blkg, unsigned int read_iops) { + struct throtl_data *td = key; + tg_of_blkg(blkg)->iops[READ] = read_iops; + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + throtl_schedule_delayed_work(td->queue, 0); } -static void throtl_update_blkio_group_write_iops (struct blkio_group *blkg, - unsigned int write_iops) +static void throtl_update_blkio_group_write_iops(void *key, + struct blkio_group *blkg, unsigned int write_iops) { + struct throtl_data *td = key; + tg_of_blkg(blkg)->iops[WRITE] = write_iops; + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + throtl_schedule_delayed_work(td->queue, 0); } void throtl_shutdown_timer_wq(struct request_queue *q) @@ -886,8 +970,14 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) /* * There is already another bio queued in same dir. No * need to update dispatch time. + * Still update the disptime if rate limits on this group + * were changed. */ - update_disptime = false; + if (!tg->limits_changed) + update_disptime = false; + else + tg->limits_changed = false; + goto queue_bio; } @@ -929,6 +1019,7 @@ int blk_throtl_init(struct request_queue *q) INIT_HLIST_HEAD(&td->tg_list); td->tg_service_tree = THROTL_RB_ROOT; + atomic_set(&td->limits_changed, 0); /* Init root group */ tg = &td->root_tg; @@ -996,6 +1087,13 @@ void blk_throtl_exit(struct request_queue *q) */ if (wait) synchronize_rcu(); + + /* + * Just being safe to make sure after previous flush if some body did + * update limits through cgroup and another work got queued, cancel + * it. 
+ */ + throtl_shutdown_timer_wq(q); throtl_td_free(td); } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 684592621736..86338d5d4d09 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -951,8 +951,8 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) return NULL; } -void -cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) +void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, + unsigned int weight) { cfqg_of_blkg(blkg)->weight = weight; } -- cgit v1.2.2 From 3aad5d3ee4e4fce8f4b5bb6ca73342dcade42b33 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 14:51:14 +0200 Subject: blkio-throttle: Fix link failure failure on i386 o Randy Dunlap reported following linux-next failure. This patch fixes it. on i386: blk-throttle.c:(.text+0x1abb8): undefined reference to `__udivdi3' blk-throttle.c:(.text+0x1b1dc): undefined reference to `__udivdi3' o bytes_per_second interface is 64bit and I was continuing to do 64 bit division even on 32bit platform without help of special macros/functions hence the failure. Signed-off-by: Vivek Goyal Reported-by: Randy Dunlap Signed-off-by: Jens Axboe --- block/blk-throttle.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 11713ed852f4..a46700255719 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -378,7 +378,8 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) static inline void throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) { - unsigned long nr_slices, bytes_trim, time_elapsed, io_trim; + unsigned long nr_slices, time_elapsed, io_trim; + u64 bytes_trim, tmp; BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); @@ -396,8 +397,10 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) if (!nr_slices) return; + tmp = tg->bps[rw] * throtl_slice * nr_slices; + do_div(tmp, HZ); + bytes_trim = tmp; - bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ; io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; if (!bytes_trim && !io_trim) @@ -415,7 +418,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) tg->slice_start[rw] += nr_slices * throtl_slice; - throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu io=%lu" + throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" " start=%lu end=%lu jiffies=%lu", rw == READ ? 
'R' : 'W', nr_slices, bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], jiffies); @@ -462,7 +465,7 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, struct bio *bio, unsigned long *wait) { bool rw = bio_data_dir(bio); - u64 bytes_allowed, extra_bytes; + u64 bytes_allowed, extra_bytes, tmp; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; @@ -473,8 +476,9 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); - bytes_allowed = (tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd)) - / MSEC_PER_SEC; + tmp = tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd); + do_div(tmp, MSEC_PER_SEC); + bytes_allowed = tmp; if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { if (wait) -- cgit v1.2.2 From 5e901a2b95db709c5e40599ff4df6029be1e2a12 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 21:16:38 +0200 Subject: blkio-throttle: There is no need to convert jiffies to milli seconds o Do not convert jiffies to mili seconds as it is not required. Just work with jiffies and HZ. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a46700255719..c1bc1b6c887a 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -439,8 +439,7 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); - io_allowed = (tg->iops[rw] * jiffies_to_msecs(jiffy_elapsed_rnd)) - / MSEC_PER_SEC; + io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd) / HZ; if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) @@ -476,8 +475,8 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); - tmp = tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd); - do_div(tmp, MSEC_PER_SEC); + tmp = tg->bps[rw] * jiffy_elapsed_rnd; + do_div(tmp, HZ); bytes_allowed = tmp; if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { -- cgit v1.2.2 From 9355aede5a3c4975e0ba8bbfe2b9d1fd73308916 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 21:16:41 +0200 Subject: blkio-throttle: limit max iops value to UINT_MAX - Limit max iops value to UINT_MAX and return error to user if value is more than that instead of accepting bigger values and truncating implicitly. 
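The range check amounts to parsing the value as 64-bit and rejecting anything that does not fit in an unsigned int, as in the userspace sketch below (strtoull() stands in for the kernel's strict_strtoull(); the UINT_MAX cap corresponds to THROTL_IOPS_MAX):

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_iops(const char *s, unsigned int *out)
{
	char *end;
	unsigned long long v;

	errno = 0;
	v = strtoull(s, &end, 10);
	if (errno || end == s || *end != '\0')
		return -EINVAL;
	if (v > UINT_MAX)		/* reject instead of truncating */
		return -EINVAL;
	*out = (unsigned int)v;
	return 0;
}

int main(void)
{
	unsigned int iops;

	printf("%d\n", parse_iops("4294967295", &iops));	/* accepted: UINT_MAX */
	printf("%d\n", parse_iops("4294967296", &iops));	/* rejected */
	return 0;
}
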
Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 11 +++++++---- block/blk-cgroup.h | 3 +++ 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 52c12130a5de..0f59b23096db 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -656,10 +656,10 @@ static int blkio_policy_parse_and_set(char *buf, { char *s[4], *p, *major_s = NULL, *minor_s = NULL; int ret; - unsigned long major, minor, temp, iops; + unsigned long major, minor, temp; int i = 0; dev_t dev; - u64 bps; + u64 bps, iops; memset(s, 0, sizeof(s)); @@ -731,13 +731,16 @@ static int blkio_policy_parse_and_set(char *buf, break; case BLKIO_THROTL_read_iops_device: case BLKIO_THROTL_write_iops_device: - ret = strict_strtoul(s[1], 10, &iops); + ret = strict_strtoull(s[1], 10, &iops); if (ret) return -EINVAL; + if (iops > THROTL_IOPS_MAX) + return -EINVAL; + newpn->plid = plid; newpn->fileid = fileid; - newpn->val.iops = iops; + newpn->val.iops = (unsigned int)iops; break; } break; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 034c35562dba..ea4861bdd549 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -20,6 +20,9 @@ enum blkio_policy_id { BLKIO_POLICY_THROTL, /* Throttling */ }; +/* Max limits for throttle policy */ +#define THROTL_IOPS_MAX UINT_MAX + #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) #ifndef CONFIG_BLK_CGROUP -- cgit v1.2.2 From c49c06e4960949a9bced708858433fcf6ca36a9c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 21:16:42 +0200 Subject: blkio-throttle: Fix possible multiplication overflow in iops calculations o User can specify max iops value of 32bit (UINT_MAX), through cgroup interface. If a user has specified say 4294967294 (UNIT_MAX - 2), then on 32bit platform, following multiplication can overflow. io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd) o Explicitly cast the multiplication to 64bit and then perform division and then check whether result is still great then UNINT_MAX. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c1bc1b6c887a..56ad4531b412 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -430,6 +430,7 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, bool rw = bio_data_dir(bio); unsigned int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + u64 tmp; jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; @@ -439,7 +440,20 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); - io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd) / HZ; + /* + * jiffy_elapsed_rnd should not be a big value as minimum iops can be + * 1 then at max jiffy elapsed should be equivalent of 1 second as we + * will allow dispatch after 1 second and after that slice should + * have been trimmed. 
+ */ + + tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; + do_div(tmp, HZ); + + if (tmp > UINT_MAX) + io_allowed = UINT_MAX; + else + io_allowed = tmp; if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) -- cgit v1.2.2 From 2a48fc0ab24241755dc93bfd4f01d68efab47f5a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 2 Jun 2010 14:28:52 +0200 Subject: block: autoconvert trivial BKL users to private mutex The block device drivers have all gained new lock_kernel calls from a recent pushdown, and some of the drivers were already using the BKL before. This turns the BKL into a set of per-driver mutexes. Still need to check whether this is safe to do. file=$1 name=$2 if grep -q lock_kernel ${file} ; then if grep -q 'include.*linux.mutex.h' ${file} ; then sed -i '/include.*/d' ${file} else sed -i 's/include.*.*$/include /g' ${file} fi sed -i ${file} \ -e "/^#include.*linux.mutex.h/,$ { 1,/^\(static\|int\|long\)/ { /^\(static\|int\|long\)/istatic DEFINE_MUTEX(${name}_mutex); } }" \ -e "s/\(un\)*lock_kernel\>[ ]*()/mutex_\1lock(\&${name}_mutex)/g" \ -e '/[ ]*cycle_kernel_lock();/d' else sed -i -e '/include.*\/d' ${file} \ -e '/cycle_kernel_lock()/d' fi Signed-off-by: Arnd Bergmann --- block/bsg.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/bsg.c b/block/bsg.c index 82d58829ba59..ffd79ca25c05 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -843,9 +842,7 @@ static int bsg_open(struct inode *inode, struct file *file) { struct bsg_device *bd; - lock_kernel(); bd = bsg_get_device(inode, file); - unlock_kernel(); if (IS_ERR(bd)) return PTR_ERR(bd); -- cgit v1.2.2 From 892b6f90db81cccb723d5d92f4fddc2d68b206e1 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 13 Oct 2010 21:18:03 +0200 Subject: block: Ensure physical block size is unsigned int Physical block size was declared unsigned int to accomodate the maximum size reported by READ CAPACITY(16). Make sure we use the right type in the related functions. Signed-off-by: Martin K. Petersen Acked-by: Mike Snitzer Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index a3600a7ab8bb..315b88c8cbbb 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -344,7 +344,7 @@ EXPORT_SYMBOL(blk_queue_logical_block_size); * hardware can operate on without reverting to read-modify-write * operations. */ -void blk_queue_physical_block_size(struct request_queue *q, unsigned short size) +void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) { q->limits.physical_block_size = size; -- cgit v1.2.2 From e817bf3f68f55e7307c3e9abe5f32d0c05c83988 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 15 Oct 2010 15:49:18 +0200 Subject: block: Fix double free in blk_integrity_unregister Commit 3839e4b introduced a kobject_put but failed to remove the kmem_cache_free beneath it, leading to a double free. Signed-off-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/blk-integrity.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 885cbb59967e..54bcba6c02a7 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -432,7 +432,6 @@ void blk_integrity_unregister(struct gendisk *disk) kobject_uevent(&bi->kobj, KOBJ_REMOVE); kobject_del(&bi->kobj); kobject_put(&bi->kobj); - kmem_cache_free(integrity_cachep, bi); disk->integrity = NULL; } EXPORT_SYMBOL(blk_integrity_unregister); -- cgit v1.2.2 From 6038f373a3dc1f1c26496e60b6c40b164716f07e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 15 Aug 2010 18:52:59 +0200 Subject: llseek: automatically add .llseek fop All file_operations should get a .llseek operation so we can make nonseekable_open the default for future file operations without a .llseek pointer. The three cases that we can automatically detect are no_llseek, seq_lseek and default_llseek. For cases where we can we can automatically prove that the file offset is always ignored, we use noop_llseek, which maintains the current behavior of not returning an error from a seek. New drivers should normally not use noop_llseek but instead use no_llseek and call nonseekable_open at open time. Existing drivers can be converted to do the same when the maintainer knows for certain that no user code relies on calling seek on the device file. The generated code is often incorrectly indented and right now contains comments that clarify for each added line why a specific variant was chosen. In the version that gets submitted upstream, the comments will be gone and I will manually fix the indentation, because there does not seem to be a way to do that using coccinelle. Some amount of new code is currently sitting in linux-next that should get the same modifications, which I will do at the end of the merge window. Many thanks to Julia Lawall for helping me learn to write a semantic patch that does all this. ===== begin semantic patch ===== // This adds an llseek= method to all file operations, // as a preparation for making no_llseek the default. // // The rules are // - use no_llseek explicitly if we do nonseekable_open // - use seq_lseek for sequential files // - use default_llseek if we know we access f_pos // - use noop_llseek if we know we don't access f_pos, // but we still want to allow users to call lseek // @ open1 exists @ identifier nested_open; @@ nested_open(...) { <+... nonseekable_open(...) ...+> } @ open exists@ identifier open_f; identifier i, f; identifier open1.nested_open; @@ int open_f(struct inode *i, struct file *f) { <+... ( nonseekable_open(...) | nested_open(...) ) ...+> } @ read disable optional_qualifier exists @ identifier read_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; expression E; identifier func; @@ ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off) { <+... ( *off = E | *off += E | func(..., off, ...) | E = *off ) ...+> } @ read_no_fpos disable optional_qualifier exists @ identifier read_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; @@ ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off) { ... when != off } @ write @ identifier write_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; expression E; identifier func; @@ ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off) { <+... ( *off = E | *off += E | func(..., off, ...) 
| E = *off ) ...+> } @ write_no_fpos @ identifier write_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; @@ ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off) { ... when != off } @ fops0 @ identifier fops; @@ struct file_operations fops = { ... }; @ has_llseek depends on fops0 @ identifier fops0.fops; identifier llseek_f; @@ struct file_operations fops = { ... .llseek = llseek_f, ... }; @ has_read depends on fops0 @ identifier fops0.fops; identifier read_f; @@ struct file_operations fops = { ... .read = read_f, ... }; @ has_write depends on fops0 @ identifier fops0.fops; identifier write_f; @@ struct file_operations fops = { ... .write = write_f, ... }; @ has_open depends on fops0 @ identifier fops0.fops; identifier open_f; @@ struct file_operations fops = { ... .open = open_f, ... }; // use no_llseek if we call nonseekable_open //////////////////////////////////////////// @ nonseekable1 depends on !has_llseek && has_open @ identifier fops0.fops; identifier nso ~= "nonseekable_open"; @@ struct file_operations fops = { ... .open = nso, ... +.llseek = no_llseek, /* nonseekable */ }; @ nonseekable2 depends on !has_llseek @ identifier fops0.fops; identifier open.open_f; @@ struct file_operations fops = { ... .open = open_f, ... +.llseek = no_llseek, /* open uses nonseekable */ }; // use seq_lseek for sequential files ///////////////////////////////////// @ seq depends on !has_llseek @ identifier fops0.fops; identifier sr ~= "seq_read"; @@ struct file_operations fops = { ... .read = sr, ... +.llseek = seq_lseek, /* we have seq_read */ }; // use default_llseek if there is a readdir /////////////////////////////////////////// @ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier readdir_e; @@ // any other fop is used that changes pos struct file_operations fops = { ... .readdir = readdir_e, ... +.llseek = default_llseek, /* readdir is present */ }; // use default_llseek if at least one of read/write touches f_pos ///////////////////////////////////////////////////////////////// @ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read.read_f; @@ // read fops use offset struct file_operations fops = { ... .read = read_f, ... +.llseek = default_llseek, /* read accesses f_pos */ }; @ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier write.write_f; @@ // write fops use offset struct file_operations fops = { ... .write = write_f, ... + .llseek = default_llseek, /* write accesses f_pos */ }; // Use noop_llseek if neither read nor write accesses f_pos /////////////////////////////////////////////////////////// @ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read_no_fpos.read_f; identifier write_no_fpos.write_f; @@ // write fops use offset struct file_operations fops = { ... .write = write_f, .read = read_f, ... +.llseek = noop_llseek, /* read and write both use no f_pos */ }; @ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier write_no_fpos.write_f; @@ struct file_operations fops = { ... .write = write_f, ... 
+.llseek = noop_llseek, /* write uses no f_pos */ }; @ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read_no_fpos.read_f; @@ struct file_operations fops = { ... .read = read_f, ... +.llseek = noop_llseek, /* read uses no f_pos */ }; @ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; @@ struct file_operations fops = { ... +.llseek = noop_llseek, /* no read or write fn */ }; ===== End semantic patch ===== Signed-off-by: Arnd Bergmann Cc: Julia Lawall Cc: Christoph Hellwig --- block/bsg.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/bsg.c b/block/bsg.c index 82d58829ba59..83e381a26b58 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -968,6 +968,7 @@ static const struct file_operations bsg_fops = { .release = bsg_release, .unlocked_ioctl = bsg_ioctl, .owner = THIS_MODULE, + .llseek = default_llseek, }; void bsg_unregister_queue(struct request_queue *q) -- cgit v1.2.2 From 7681bfeeccff5efa9eb29bf09249a3c400b15327 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Tue, 19 Oct 2010 09:05:00 +0200 Subject: block: fix accounting bug on cross partition merges /proc/diskstats would display a strange output as follows. $ cat /proc/diskstats |grep sda 8 0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089 8 1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691 ~~~~~~~~~~ 8 2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390 8 3 sda3 54 487 2188 92 0 0 0 0 0 88 92 8 4 sda4 4 0 8 0 0 0 0 0 0 0 0 8 5 sda5 81 2027 2130 138 0 0 0 0 0 87 137 Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE. The detailed root cause is as follows. Assuming that there are two partition, sda1 and sda2. 1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight is 0 and sda2's one is 1. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 2. A bio belongs to sda1 is issued and is merged into the request mentioned on step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed from sda2 region to sda1 region. However the two partition's hd_struct->in_flight are not changed. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 3. The request is finished and blk_account_io_done() is called. In this case, sda2's hd_struct->in_flight, not a sda1's one, is decremented. | hd_struct->in_flight --------------------------- sda1 | -1 sda2 | 1 --------------------------- The patch fixes the problem by caching the partition lookup inside the request structure, hence making sure that the increment and decrement will always happen on the same partition struct. This also speeds up IO with accounting enabled, since it cuts down on the number of lookups we have to do. When reloading partition tables, quiesce IO to ensure that no request references to the partition struct exists. When it is safe to free the partition table, the IO for that device is restarted again. 
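The accounting bug and the fix reduce to the userspace sketch below (toy partition table and request structure, purely illustrative): looking the partition up by start sector at completion time picks the wrong counter once a front merge has moved the start sector, whereas the pointer cached at issue time always balances the in_flight count it incremented.

#include <stdio.h>

struct part { const char *name; long in_flight; };

static struct part sda1 = { "sda1", 0 };
static struct part sda2 = { "sda2", 0 };

static struct part *part_for_sector(unsigned long sector)
{
	return sector < 1000 ? &sda1 : &sda2;	/* toy partition table */
}

struct request {
	unsigned long start_sector;
	struct part *part;	/* cached at issue time (the fix) */
};

int main(void)
{
	struct request rq = { .start_sector = 1500 };

	rq.part = part_for_sector(rq.start_sector);	/* issue: lands in sda2 */
	rq.part->in_flight++;

	rq.start_sector = 500;	/* front merge pulled the request into sda1 */

	/* buggy completion would do: part_for_sector(rq.start_sector)->in_flight--; */
	rq.part->in_flight--;	/* fixed completion uses the cached pointer */

	printf("%s=%ld %s=%ld\n", sda1.name, sda1.in_flight,
	       sda2.name, sda2.in_flight);
	return 0;
}
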
Signed-off-by: Yasuaki Ishimatsu Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 24 ++++++++++++++++-------- block/blk-merge.c | 2 +- block/blk.h | 4 ---- block/genhd.c | 14 ++++++++++++++ 4 files changed, 31 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 797d5095eb83..ddc68332d655 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -64,13 +64,15 @@ static void drive_stat_acct(struct request *rq, int new_io) return; cpu = part_stat_lock(); - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!new_io) + if (!new_io) { + part = rq->part; part_stat_inc(cpu, part, merges[rw]); - else { + } else { + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); part_round_stats(cpu, part); part_inc_in_flight(part, rw); + rq->part = part; } part_stat_unlock(); @@ -128,6 +130,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); + rq->part = NULL; } EXPORT_SYMBOL(blk_rq_init); @@ -804,11 +807,16 @@ static struct request *get_request(struct request_queue *q, int rw_flags, rl->starved[is_sync] = 0; priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - if (priv) + if (priv) { rl->elvpriv++; - if (blk_queue_io_stat(q)) - rw_flags |= REQ_IO_STAT; + /* + * Don't do stats for non-priv requests + */ + if (blk_queue_io_stat(q)) + rw_flags |= REQ_IO_STAT; + } + spin_unlock_irq(q->queue_lock); rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); @@ -1777,7 +1785,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_add(cpu, part, sectors[rw], bytes >> 9); part_stat_unlock(); } @@ -1797,7 +1805,7 @@ static void blk_account_io_done(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); diff --git a/block/blk-merge.c b/block/blk-merge.c index 6a725461654d..38ff234012a4 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_round_stats(cpu, part); part_dec_in_flight(part, rq_data_dir(req)); diff --git a/block/blk.h b/block/blk.h index 6738831ba447..1340cce5721a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -110,10 +110,6 @@ void blk_queue_congestion_threshold(struct request_queue *q); int blk_dev_init(void); -void elv_quiesce_start(struct request_queue *q); -void elv_quiesce_end(struct request_queue *q); - - /* * Return the threshold (number of used requests) at which the queue is * considered to be congested. 
It include a little hysteresis to keep the diff --git a/block/genhd.c b/block/genhd.c index 7923e720ddf5..8313834596db 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -932,8 +932,15 @@ static void disk_free_ptbl_rcu_cb(struct rcu_head *head) { struct disk_part_tbl *ptbl = container_of(head, struct disk_part_tbl, rcu_head); + struct gendisk *disk = ptbl->disk; + struct request_queue *q = disk->queue; + unsigned long flags; kfree(ptbl); + + spin_lock_irqsave(q->queue_lock, flags); + elv_quiesce_end(q); + spin_unlock_irqrestore(q->queue_lock, flags); } /** @@ -951,11 +958,17 @@ static void disk_replace_part_tbl(struct gendisk *disk, struct disk_part_tbl *new_ptbl) { struct disk_part_tbl *old_ptbl = disk->part_tbl; + struct request_queue *q = disk->queue; rcu_assign_pointer(disk->part_tbl, new_ptbl); if (old_ptbl) { rcu_assign_pointer(old_ptbl->last_lookup, NULL); + + spin_lock_irq(q->queue_lock); + elv_quiesce_start(q); + spin_unlock_irq(q->queue_lock); + call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); } } @@ -996,6 +1009,7 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) return -ENOMEM; new_ptbl->len = target; + new_ptbl->disk = disk; for (i = 0; i < len; i++) rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); -- cgit v1.2.2 From b4627321e18582dcbdeb45d77df29d3177107c65 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 22 Oct 2010 09:48:43 +0200 Subject: cfq-iosched: Fix a gcc 4.5 warning and put some comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Andi encountedred following warning with gcc 4.5 linux/block/cfq-iosched.c: In function ‘cfq_dispatch_requests’: linux/block/cfq-iosched.c:2156:3: warning: array subscript is above array bounds - Warning happens due to following code. slice = group_slice * count / max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); gcc is complaining about cfqg->busy_queues_avg[] being indexed by CFQ prio classes (RT, BE and IDLE) while the array size is only 2. - At run time, we never access cfqg->busy_queues_avg[IDLE] and return from function before this code hits. - To fix warning increase the array size though it will remain unused. This patch also puts some comments to clarify some of the confusions. - I have taken Jens's patch and modified it a bit. - Compile tested with gcc 4.4 and boot tested. I don't have gcc 4.5 running, Andi can you please test it with gcc 4.5 to make sure it worked. Reported-by: Andi Kleen Signed-off-by: Vivek Goyal Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 86338d5d4d09..dfceb6386bd5 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -157,6 +157,7 @@ enum wl_prio_t { BE_WORKLOAD = 0, RT_WORKLOAD = 1, IDLE_WORKLOAD = 2, + CFQ_PRIO_NR, }; /* @@ -181,10 +182,19 @@ struct cfq_group { /* number of cfqq currently on this group */ int nr_cfqq; - /* Per group busy queus average. Useful for workload slice calc. */ - unsigned int busy_queues_avg[2]; /* - * rr lists of queues with requests, onle rr for each priority class. + * Per group busy queus average. Useful for workload slice calc. We + * create the array for each prio class but at run time it is used + * only for RT and BE class and slot for IDLE class remains unused. + * This is primarily done to avoid confusion and a gcc warning. 
+ */ + unsigned int busy_queues_avg[CFQ_PRIO_NR]; + /* + * rr lists of queues with requests. We maintain service trees for + * RT and BE classes. These trees are subdivided in subclasses + * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE + * class there is no subclassification and all the cfq queues go on + * a single tree service_tree_idle. * Counts are embedded in the cfq_rb_root */ struct cfq_rb_root service_trees[2][3]; -- cgit v1.2.2 From e52eec13cd6b7f30ab19081b387813e03e592ae5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 8 Sep 2010 16:54:17 +0200 Subject: SYSFS: Allow boot time switching between deprecated and modern sysfs layout I have some systems which need legacy sysfs due to old tools that are making assumptions that a directory can never be a symlink to another directory, and it's a big hazzle to compile separate kernels for them. This patch turns CONFIG_SYSFS_DEPRECATED into a run time option that can be switched on/off the kernel command line. This way the same binary can be used in both cases with just a option on the command line. The old CONFIG_SYSFS_DEPRECATED_V2 option is still there to set the default. I kept the weird name to not break existing config files. Also the compat code can be still completely disabled by undefining CONFIG_SYSFS_DEPRECATED_SWITCH -- just the optimizer takes care of this now instead of lots of ifdefs. This makes the code look nicer. v2: This is an updated version on top of Kay's patch to only handle the block devices. I tested it on my old systems and that seems to work. Cc: axboe@kernel.dk Signed-off-by: Andi Kleen Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- block/genhd.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 59a2db6fecef..4e28a840bde0 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -22,9 +22,7 @@ #include "blk.h" static DEFINE_MUTEX(block_class_lock); -#ifndef CONFIG_SYSFS_DEPRECATED struct kobject *block_depr; -#endif /* for extended dynamic devt allocation, currently only one major is used */ #define MAX_EXT_DEVT (1 << MINORBITS) @@ -803,10 +801,9 @@ static int __init genhd_device_init(void) register_blkdev(BLOCK_EXT_MAJOR, "blkext"); -#ifndef CONFIG_SYSFS_DEPRECATED /* create top-level block dir */ - block_depr = kobject_create_and_add("block", NULL); -#endif + if (!sysfs_deprecated) + block_depr = kobject_create_and_add("block", NULL); return 0; } -- cgit v1.2.2 From 7ad58c028652753814054f4e3ac58f925e7343f4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 23 Oct 2010 20:40:26 +0200 Subject: block: fix use-after-free bug in blk throttle code blk_throtl_exit() frees the throttle data hanging off the queue in blk_cleanup_queue(), but blk_put_queue() will indirectly dereference this data when calling blk_sync_queue() which in turns calls throtl_shutdown_timer_wq(). Fix this by moving the freeing of the throttle data to when the queue is truly being released, and post the call to blk_sync_queue(). 
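The fix is purely an ordering change, which the generic userspace sketch below illustrates (it is not the block-layer code itself): any pending work that may still dereference the throttle data has to be flushed before the data is freed, so the free belongs in the final release path after the sync, not in the earlier cleanup path.

#include <stdio.h>
#include <stdlib.h>

struct throttle_data { int dummy; };

struct queue {
	struct throttle_data *td;
};

static void sync_pending_work(struct queue *q)
{
	/* pending work may still dereference q->td at this point */
	if (q->td)
		printf("flushing work that touches td=%p\n", (void *)q->td);
}

static void release_queue(struct queue *q)
{
	sync_pending_work(q);	/* first: make sure nothing uses td any more */
	free(q->td);		/* then: safe to free */
	q->td = NULL;
}

int main(void)
{
	struct queue q = { .td = malloc(sizeof(*q.td)) };

	release_queue(&q);	/* freeing td before the sync was the bug */
	return 0;
}
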
Reported-by: Ingo Molnar Tested-by: Ingo Molnar Signed-off-by: Jens Axboe --- block/blk-core.c | 2 -- block/blk-sysfs.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 45141469e89e..51efd835d4cf 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -462,8 +462,6 @@ void blk_cleanup_queue(struct request_queue *q) if (q->elevator) elevator_exit(q->elevator); - blk_throtl_exit(q); - blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index da8a8a40cd4c..013457f47fdc 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -471,6 +471,8 @@ static void blk_release_queue(struct kobject *kobj) blk_sync_queue(q); + blk_throtl_exit(q); + if (rl->rq_pool) mempool_destroy(rl->rq_pool); -- cgit v1.2.2 From f253b86b4ad1b3220544e75880510fd455ebd23f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 24 Oct 2010 22:06:02 +0200 Subject: Revert "block: fix accounting bug on cross partition merges" This reverts commit 7681bfeeccff5efa9eb29bf09249a3c400b15327. Conflicts: include/linux/genhd.h It has numerous issues with the cleanup path and non-elevator devices. Revert it for now so we can come up with a clean version without rushing things. Signed-off-by: Jens Axboe --- block/blk-core.c | 24 ++++++++---------------- block/blk-merge.c | 2 +- block/blk.h | 4 ++++ block/genhd.c | 14 -------------- 4 files changed, 13 insertions(+), 31 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 51efd835d4cf..f8548876d7ea 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -64,15 +64,13 @@ static void drive_stat_acct(struct request *rq, int new_io) return; cpu = part_stat_lock(); + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!new_io) { - part = rq->part; + if (!new_io) part_stat_inc(cpu, part, merges[rw]); - } else { - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + else { part_round_stats(cpu, part); part_inc_in_flight(part, rw); - rq->part = part; } part_stat_unlock(); @@ -130,7 +128,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); - rq->part = NULL; } EXPORT_SYMBOL(blk_rq_init); @@ -805,16 +802,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags, rl->starved[is_sync] = 0; priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - if (priv) { + if (priv) rl->elvpriv++; - /* - * Don't do stats for non-priv requests - */ - if (blk_queue_io_stat(q)) - rw_flags |= REQ_IO_STAT; - } - + if (blk_queue_io_stat(q)) + rw_flags |= REQ_IO_STAT; spin_unlock_irq(q->queue_lock); rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); @@ -1791,7 +1783,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) int cpu; cpu = part_stat_lock(); - part = req->part; + part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); part_stat_add(cpu, part, sectors[rw], bytes >> 9); part_stat_unlock(); } @@ -1811,7 +1803,7 @@ static void blk_account_io_done(struct request *req) int cpu; cpu = part_stat_lock(); - part = req->part; + part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); diff --git a/block/blk-merge.c b/block/blk-merge.c index 0a2fd8a48a38..77b7c26df6b5 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req) int cpu; cpu = 
part_stat_lock(); - part = req->part; + part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); part_round_stats(cpu, part); part_dec_in_flight(part, rq_data_dir(req)); diff --git a/block/blk.h b/block/blk.h index 1e675e5ade02..2db8f32838e7 100644 --- a/block/blk.h +++ b/block/blk.h @@ -116,6 +116,10 @@ void blk_queue_congestion_threshold(struct request_queue *q); int blk_dev_init(void); +void elv_quiesce_start(struct request_queue *q); +void elv_quiesce_end(struct request_queue *q); + + /* * Return the threshold (number of used requests) at which the queue is * considered to be congested. It include a little hysteresis to keep the diff --git a/block/genhd.c b/block/genhd.c index a8adf96a4b41..5fa2b44a72ff 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -929,15 +929,8 @@ static void disk_free_ptbl_rcu_cb(struct rcu_head *head) { struct disk_part_tbl *ptbl = container_of(head, struct disk_part_tbl, rcu_head); - struct gendisk *disk = ptbl->disk; - struct request_queue *q = disk->queue; - unsigned long flags; kfree(ptbl); - - spin_lock_irqsave(q->queue_lock, flags); - elv_quiesce_end(q); - spin_unlock_irqrestore(q->queue_lock, flags); } /** @@ -955,17 +948,11 @@ static void disk_replace_part_tbl(struct gendisk *disk, struct disk_part_tbl *new_ptbl) { struct disk_part_tbl *old_ptbl = disk->part_tbl; - struct request_queue *q = disk->queue; rcu_assign_pointer(disk->part_tbl, new_ptbl); if (old_ptbl) { rcu_assign_pointer(old_ptbl->last_lookup, NULL); - - spin_lock_irq(q->queue_lock); - elv_quiesce_start(q); - spin_unlock_irq(q->queue_lock); - call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); } } @@ -1006,7 +993,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) return -ENOMEM; new_ptbl->len = target; - new_ptbl->disk = disk; for (i = 0; i < len; i++) rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); -- cgit v1.2.2 From b595076a180a56d1bb170e6eceda6eb9d76f4cd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 1 Nov 2010 15:38:34 -0400 Subject: tree-wide: fix comment/printk typos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "gadget", "through", "command", "maintain", "maintain", "controller", "address", "between", "initiali[zs]e", "instead", "function", "select", "already", "equal", "access", "management", "hierarchy", "registration", "interest", "relative", "memory", "offset", "already", Signed-off-by: Uwe Kleine-König Signed-off-by: Jiri Kosina --- block/cfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4cd59b0d7c15..78ee4b1d4e85 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1030,7 +1030,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) /* * Add group onto cgroup list. It might happen that bdi->dev is - * not initiliazed yet. Initialize this new group without major + * not initialized yet. Initialize this new group without major * and minor info and this info will be filled in once a new thread * comes for IO. See code above. */ -- cgit v1.2.2 From c1e44756fdb7b363fd22cb5514dced40752e36c5 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 8 Nov 2010 15:01:02 +0100 Subject: cfq-iosched: do cleanup Some functions should return boolean. 
Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 9eba291eb6fd..b8174bb4a6a1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -637,11 +637,11 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) static inline bool cfq_slice_used(struct cfq_queue *cfqq) { if (cfq_cfqq_slice_new(cfqq)) - return 0; + return false; if (time_before(jiffies, cfqq->slice_end)) - return 0; + return false; - return 1; + return true; } /* @@ -1892,10 +1892,10 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * in their service tree. */ if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) - return 1; + return true; cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", service_tree->count); - return 0; + return false; } static void cfq_arm_slice_timer(struct cfq_data *cfqd) @@ -2359,12 +2359,12 @@ static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, { /* the queue hasn't finished any request, can't estimate */ if (cfq_cfqq_slice_new(cfqq)) - return 1; + return true; if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, cfqq->slice_end)) - return 1; + return true; - return 0; + return false; } static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) -- cgit v1.2.2 From d2d59e18a1ea8ecdd1c0a52af320e9a7f5391cc4 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 8 Nov 2010 15:01:03 +0100 Subject: cfq-iosched: schedule dispatch for noidle queue A queue is idle at cfq_dispatch_requests(), but it gets noidle later. Unless other task explictly does unplug or all requests are drained, we will not deliever requests to the disk even cfq_arm_slice_timer doesn't make the queue idle. For example, cfq_should_idle() returns true because of service_tree->count == 1, and then other queues are added. Note, I didn't see obvious performance impacts so far with the patch, but just thought this could be a problem. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b8174bb4a6a1..986865e3fbc5 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3255,6 +3255,10 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) return true; + /* An idle queue should not be idle now for some reason */ + if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq)) + return true; + if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) return false; @@ -3508,8 +3512,25 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) } } - if (!cfqd->rq_in_driver) + if (!cfqd->rq_in_driver) { + cfq_schedule_dispatch(cfqd); + return; + } + /* + * A queue is idle at cfq_dispatch_requests(), but it gets noidle + * later. 
We schedule a dispatch if the queue has no requests, + * otherwise the disk is actually in idle till all requests + * are finished even cfq_arm_slice_timer doesn't make the queue idle + * */ + cfqq = cfqd->active_queue; + if (!cfqq) + return; + + if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq) && + (!cfqd->cfq_group_idle || cfqq->cfqg->nr_cfqq > 1)) { + cfq_del_timer(cfqd, cfqq); cfq_schedule_dispatch(cfqd); + } } /* -- cgit v1.2.2 From 8e1ac6655104bc6e1e79d67e2df88cc8fa9b6e07 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 8 Nov 2010 15:01:04 +0100 Subject: cfq-iosched: don't idle if a deep seek queue is slow If a deep seek queue slowly deliver requests but disk is much faster, idle for the queue just wastes disk throughput. If the queue delevers all requests before half its slice is used, the patch disable idle for it. In my test, application delivers 32 requests one time, the disk can accept 128 requests at maxium and disk is fast. without the patch, the throughput is just around 30m/s, while with it, the speed is about 80m/s. The disk is a SSD, but is detected as a rotational disk. I can configure it as SSD, but I thought the deep seek queue logic should be fixed too, for example, considering a fast raid. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 986865e3fbc5..ca4d19907243 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2285,6 +2285,17 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) goto keep_queue; } + /* + * This is a deep seek queue, but the device is much faster than + * the queue can deliver, don't idle + **/ + if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) && + (cfq_cfqq_slice_new(cfqq) || + (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) { + cfq_clear_cfqq_deep(cfqq); + cfq_clear_cfqq_idle_window(cfqq); + } + if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { cfqq = NULL; goto keep_queue; -- cgit v1.2.2 From 2b9408a45978dcda77407859148deeccf403c372 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 9 Nov 2010 14:51:13 +0100 Subject: cfq-iosched: don't schedule a dispatch for a non-idle queue Vivek suggests we don't need schedule a dispatch when an idle queue becomes nonidle. And he is right, cfq_should_preempt already covers the logic. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ca4d19907243..f90519430be6 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3523,25 +3523,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) } } - if (!cfqd->rq_in_driver) { + if (!cfqd->rq_in_driver) cfq_schedule_dispatch(cfqd); - return; - } - /* - * A queue is idle at cfq_dispatch_requests(), but it gets noidle - * later. 
We schedule a dispatch if the queue has no requests, - * otherwise the disk is actually in idle till all requests - * are finished even cfq_arm_slice_timer doesn't make the queue idle - * */ - cfqq = cfqd->active_queue; - if (!cfqq) - return; - - if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq) && - (!cfqd->cfq_group_idle || cfqq->cfqg->nr_cfqq > 1)) { - cfq_del_timer(cfqd, cfqq); - cfq_schedule_dispatch(cfqd); - } } /* -- cgit v1.2.2 From 9284bcf4e335e5f18a8bc7b26461c33ab60d0689 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Oct 2010 08:10:18 -0600 Subject: block: check for proper length of iov entries in blk_rq_map_user_iov() Ensure that we pass down properly validated iov segments before calling into the mapping or copy functions. Reported-by: Dan Rosenberg Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-map.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index d4a586d8691e..5d5dbe47c228 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -205,6 +205,8 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, unaligned = 1; break; } + if (!iov[i].iov_len) + return -EINVAL; } if (unaligned || (q->dma_pad_mask & len) || map_data) -- cgit v1.2.2 From 9f864c80913467312c7b8690e41fb5ebd1b50e92 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Oct 2010 11:31:42 -0600 Subject: block: take care not to overflow when calculating total iov length Reported-by: Dan Rosenberg Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index a8b5a10eb5b0..4f4230b79bb6 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -321,33 +321,47 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, if (hdr->iovec_count) { const int size = sizeof(struct sg_iovec) * hdr->iovec_count; size_t iov_data_len; - struct sg_iovec *iov; + struct sg_iovec *sg_iov; + struct iovec *iov; + int i; - iov = kmalloc(size, GFP_KERNEL); - if (!iov) { + sg_iov = kmalloc(size, GFP_KERNEL); + if (!sg_iov) { ret = -ENOMEM; goto out; } - if (copy_from_user(iov, hdr->dxferp, size)) { - kfree(iov); + if (copy_from_user(sg_iov, hdr->dxferp, size)) { + kfree(sg_iov); ret = -EFAULT; goto out; } + /* + * Sum up the vecs, making sure they don't overflow + */ + iov = (struct iovec *) sg_iov; + iov_data_len = 0; + for (i = 0; i < hdr->iovec_count; i++) { + if (iov_data_len + iov[i].iov_len < iov_data_len) { + kfree(sg_iov); + ret = -EINVAL; + goto out; + } + iov_data_len += iov[i].iov_len; + } + /* SG_IO howto says that the shorter of the two wins */ - iov_data_len = iov_length((struct iovec *)iov, - hdr->iovec_count); if (hdr->dxfer_len < iov_data_len) { - hdr->iovec_count = iov_shorten((struct iovec *)iov, + hdr->iovec_count = iov_shorten(iov, hdr->iovec_count, hdr->dxfer_len); iov_data_len = hdr->dxfer_len; } - ret = blk_rq_map_user_iov(q, rq, NULL, iov, hdr->iovec_count, + ret = blk_rq_map_user_iov(q, rq, NULL, sg_iov, hdr->iovec_count, iov_data_len, GFP_KERNEL); - kfree(iov); + kfree(sg_iov); } else if (hdr->dxfer_len) ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, GFP_KERNEL); -- cgit v1.2.2 From 77304d2abac6101f7249754ffdd4421258877ab0 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 8 Nov 2010 14:39:12 +0100 Subject: block: read i_size with i_size_read() Convert direct reads of an inode's i_size to 
using i_size_read(). i_size_{read,write} use a seqcount to protect reads from accessing incomple writes. Concurrent i_size_write()s require mutual exclussion to protect the seqcount that is used by i_size_{read,write}. But i_size_read() callers do not need to use additional locking. Signed-off-by: Mike Snitzer Acked-by: NeilBrown Acked-by: Lars Ellenberg Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++-- block/compat_ioctl.c | 4 ++-- block/ioctl.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index f0834e2f5727..17fcb83670c0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1351,7 +1351,7 @@ static void handle_bad_sector(struct bio *bio) bdevname(bio->bi_bdev, b), bio->bi_rw, (unsigned long long)bio->bi_sector + bio_sectors(bio), - (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); + (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); set_bit(BIO_EOF, &bio->bi_flags); } @@ -1404,7 +1404,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) return 0; /* Test device or partition size, when known. */ - maxsector = bio->bi_bdev->bd_inode->i_size >> 9; + maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; if (maxsector) { sector_t sector = bio->bi_sector; diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 119f07b74dc0..58c6ee5b010c 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -744,13 +744,13 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; return 0; case BLKGETSIZE: - size = bdev->bd_inode->i_size; + size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) return -EFBIG; return compat_put_ulong(arg, size >> 9); case BLKGETSIZE64_32: - return compat_put_u64(arg, bdev->bd_inode->i_size); + return compat_put_u64(arg, i_size_read(bdev->bd_inode)); case BLKTRACESETUP32: case BLKTRACESTART: /* compatible */ diff --git a/block/ioctl.c b/block/ioctl.c index d724ceb1d465..38aa194f63ec 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -125,7 +125,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, start >>= 9; len >>= 9; - if (start + len > (bdev->bd_inode->i_size >> 9)) + if (start + len > (i_size_read(bdev->bd_inode) >> 9)) return -EINVAL; if (secure) flags |= BLKDEV_DISCARD_SECURE; @@ -307,12 +307,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, ret = blkdev_reread_part(bdev); break; case BLKGETSIZE: - size = bdev->bd_inode->i_size; + size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) return -EFBIG; return put_ulong(arg, size >> 9); case BLKGETSIZE64: - return put_u64(arg, bdev->bd_inode->i_size); + return put_u64(arg, i_size_read(bdev->bd_inode)); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACESETUP: -- cgit v1.2.2 From a014741c0adfb8fb79952939ca087cf03d272bb9 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Mon, 8 Nov 2010 14:42:40 +0100 Subject: block: ioctl: fix information leak to userland Structure hd_geometry is copied to userland with 4 padding bytes between cylinders and start fields uninitialized on 64-bit platforms. It leads to leaking of contents of kernel stack memory. Currently there is no memset() in real implementations of getgeo() in drivers/block/, so it makes sense to have memset() in blkdev_ioctl(). 
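The problem generalizes to any structure copied wholesale to userland: compiler-inserted padding keeps whatever was on the stack unless the whole object is cleared first. A small standalone C sketch of the idiom (the struct mirrors the hd_geometry layout described above but is illustrative, as is fill_geo()):

    #include <stdio.h>
    #include <string.h>

    struct geo_example {                 /* mirrors hd_geometry on 64-bit:      */
            unsigned char  heads;        /* offsets 0..3 used by these fields,  */
            unsigned char  sectors;      /* then 4 padding bytes, because       */
            unsigned short cylinders;    /* 'start' must be 8-byte aligned      */
            unsigned long  start;
    };

    static void fill_geo(struct geo_example *g)
    {
            /* a getgeo()-style helper: assigns the named fields only and
             * never writes the padding bytes */
            g->heads = 255;
            g->sectors = 63;
            g->cylinders = 1024;
            g->start = 2048;
    }

    int main(void)
    {
            struct geo_example g;

            memset(&g, 0, sizeof(g));    /* the fix: clears the padding too */
            fill_geo(&g);

            /* without the memset(), a copy_to_user(arg, &g, sizeof(g))
             * equivalent would ship 4 stale stack bytes to userland */
            printf("copying %zu bytes, padding included\n", sizeof(g));
            return 0;
    }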
Signed-off-by: Vasiliy Kulikov Signed-off-by: Jens Axboe --- block/ioctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 38aa194f63ec..3d866d0037f2 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -242,6 +242,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, * We need to set the startsect first, the driver may * want to override it. */ + memset(&geo, 0, sizeof(geo)); geo.start = get_start_sect(bdev); ret = disk->fops->getgeo(bdev, &geo); if (ret) -- cgit v1.2.2 From 02e031cbc843b010e72fcc05c76113c688b2860f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Nov 2010 14:54:09 +0100 Subject: block: remove REQ_HARDBARRIER REQ_HARDBARRIER is dead now, so remove the leftovers. What's left at this point is: - various checks inside the block layer. - sanity checks in bio based drivers. - now unused bio_empty_barrier helper. - Xen blockfront use of BLKIF_OP_WRITE_BARRIER - it's dead for a while, but Xen really needs to sort out it's barrier situaton. - setting of ordered tags in uas - dead code copied from old scsi drivers. - scsi different retry for barriers - it's dead and should have been removed when flushes were converted to FS requests. - blktrace handling of barriers - removed. Someone who knows blktrace better should add support for REQ_FLUSH and REQ_FUA, though. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 7 ------- block/elevator.c | 4 ++-- 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 17fcb83670c0..4ce953f1b390 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1194,13 +1194,6 @@ static int __make_request(struct request_queue *q, struct bio *bio) int where = ELEVATOR_INSERT_SORT; int rw_flags; - /* REQ_HARDBARRIER is no more */ - if (WARN_ONCE(bio->bi_rw & REQ_HARDBARRIER, - "block: HARDBARRIER is deprecated, use FLUSH/FUA instead\n")) { - bio_endio(bio, -EOPNOTSUPP); - return 0; - } - /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even diff --git a/block/elevator.c b/block/elevator.c index 282e8308f7e2..2569512830d3 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -429,7 +429,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) q->nr_sorted--; boundary = q->end_sector; - stop_flags = REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_STARTED; + stop_flags = REQ_SOFTBARRIER | REQ_STARTED; list_for_each_prev(entry, &q->queue_head) { struct request *pos = list_entry_rq(entry); @@ -691,7 +691,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) void __elv_add_request(struct request_queue *q, struct request *rq, int where, int plug) { - if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { + if (rq->cmd_flags & REQ_SOFTBARRIER) { /* barriers are scheduling boundary, update end_sector */ if (rq->cmd_type == REQ_TYPE_FS || (rq->cmd_flags & REQ_DISCARD)) { -- cgit v1.2.2 From cedb4a7d9f6aedb0dce94d6285b69dcb3c10fa05 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Nov 2010 13:37:54 +0100 Subject: block: remove unused copy_io_context() Reported-by: Oleg Nesterov Signed-off-by: Jens Axboe --- block/blk-ioc.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index d22c4c55c406..3c7a339fe381 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -153,20 +153,6 @@ struct io_context 
*get_io_context(gfp_t gfp_flags, int node) } EXPORT_SYMBOL(get_io_context); -void copy_io_context(struct io_context **pdst, struct io_context **psrc) -{ - struct io_context *src = *psrc; - struct io_context *dst = *pdst; - - if (src) { - BUG_ON(atomic_long_read(&src->refcount) == 0); - atomic_long_inc(&src->refcount); - put_io_context(dst); - *pdst = src; - } -} -EXPORT_SYMBOL(copy_io_context); - static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", -- cgit v1.2.2 From e525fd89d380c4a94c0d63913a1dd1a593ed25e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 13 Nov 2010 11:55:17 +0100 Subject: block: make blkdev_get/put() handle exclusive access Over time, block layer has accumulated a set of APIs dealing with bdev open, close, claim and release. * blkdev_get/put() are the primary open and close functions. * bd_claim/release() deal with exclusive open. * open/close_bdev_exclusive() are combination of open and claim and the other way around, respectively. * bd_link/unlink_disk_holder() to create and remove holder/slave symlinks. * open_by_devnum() wraps bdget() + blkdev_get(). The interface is a bit confusing and the decoupling of open and claim makes it impossible to properly guarantee exclusive access as in-kernel open + claim sequence can disturb the existing exclusive open even before the block layer knows the current open if for another exclusive access. Reorganize the interface such that, * blkdev_get() is extended to include exclusive access management. @holder argument is added and, if is @FMODE_EXCL specified, it will gain exclusive access atomically w.r.t. other exclusive accesses. * blkdev_put() is similarly extended. It now takes @mode argument and if @FMODE_EXCL is set, it releases an exclusive access. Also, when the last exclusive claim is released, the holder/slave symlinks are removed automatically. * bd_claim/release() and close_bdev_exclusive() are no longer necessary and either made static or removed. * bd_link_disk_holder() remains the same but bd_unlink_disk_holder() is no longer necessary and removed. * open_bdev_exclusive() becomes a simple wrapper around lookup_bdev() and blkdev_get(). It also has an unexpected extra bdev_read_only() test which probably should be moved into blkdev_get(). * open_by_devnum() is modified to take @holder argument and pass it to blkdev_get(). Most of bdev open/close operations are unified into blkdev_get/put() and most exclusive accesses are tested atomically at the open time (as it should). This cleans up code and removes some, both valid and invalid, but unnecessary all the same, corner cases. open_bdev_exclusive() and open_by_devnum() can use further cleanup - rename to blkdev_get_by_path() and blkdev_get_by_devt() and drop special features. Well, let's leave them for another day. Most conversions are straight-forward. drbd conversion is a bit more involved as there was some reordering, but the logic should stay the same. 
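In short, the in-kernel calling convention after this change is the following (sketch only, not compilable on its own; the holder argument is an arbitrary cookie identifying the claimer, e.g. &bdev in the ioctl.c hunk that follows):

    /* open with an atomic exclusive claim */
    ret = blkdev_get(bdev, mode | FMODE_EXCL, holder);
    if (ret)
            return ret;          /* open failed or the claim is already held */

    /* ... use the exclusively claimed device ... */

    /* matching put; passing FMODE_EXCL releases the claim */
    blkdev_put(bdev, mode | FMODE_EXCL);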
Signed-off-by: Tejun Heo Acked-by: Neil Brown Acked-by: Ryusuke Konishi Acked-by: Mike Snitzer Acked-by: Philipp Reisner Cc: Peter Osterlund Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Jan Kara Cc: Andrew Morton Cc: Andreas Dilger Cc: "Theodore Ts'o" Cc: Mark Fasheh Cc: Joel Becker Cc: Alex Elder Cc: Christoph Hellwig Cc: dm-devel@redhat.com Cc: drbd-dev@lists.linbit.com Cc: Leo Chen Cc: Scott Branden Cc: Chris Mason Cc: Steven Whitehouse Cc: Dave Kleikamp Cc: Joern Engel Cc: reiserfs-devel@vger.kernel.org Cc: Alexander Viro --- block/ioctl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index d724ceb1d465..cc46d499fd27 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -294,11 +294,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return -EINVAL; if (get_user(n, (int __user *) arg)) return -EFAULT; - if (!(mode & FMODE_EXCL) && bd_claim(bdev, &bdev) < 0) + if (!(mode & FMODE_EXCL) && + blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) return -EBUSY; ret = set_blocksize(bdev, n); if (!(mode & FMODE_EXCL)) - bd_release(bdev); + blkdev_put(bdev, mode | FMODE_EXCL); return ret; case BLKPG: ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg); -- cgit v1.2.2 From c2f6805d470af369a7337801deeecea800dbfe1c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 15 Nov 2010 19:32:42 +0100 Subject: blk-throttle: Fix calculation of max number of WRITES to be dispatched o Currently we try to dispatch more READS and less WRITES (75%, 25%) in one dispatch round. ummy pointed out that there is a bug in max_nr_writes calculation. This patch fixes it. Reported-by: ummy y Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 56ad4531b412..004be80fd894 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -645,7 +645,7 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, { unsigned int nr_reads = 0, nr_writes = 0; unsigned int max_nr_reads = throtl_grp_quantum*3/4; - unsigned int max_nr_writes = throtl_grp_quantum - nr_reads; + unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; struct bio *bio; /* Try to dispatch 75% READS and 25% WRITES */ -- cgit v1.2.2 From bdc85df7a8417b9893443ff5520804699416b6f3 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 15 Nov 2010 19:37:36 +0100 Subject: blk-cgroup: Allow creation of hierarchical cgroups o Allow hierarchical cgroup creation for blkio controller o Currently we disallow it as both the io controller policies (throttling as well as proportion bandwidth) do not support hierarhical accounting and control. But the flip side is that blkio controller can not be used with libvirt as libvirt creates a cgroup hierarchy deeper than 1 level. //libvirt/qemu/ o So this patch will allow creation of cgroup hierarhcy but at the backend everything will be treated as flat. So if somebody created a an hierarchy like as follows. root / \ test1 test2 | test3 CFQ and throttling will practically treat all groups at same level. pivot / | \ \ root test1 test2 test3 o Once we have actual support for hierarchical accounting and control then we can introduce another cgroup tunable file "blkio.use_hierarchy" which will be 0 by default but if user wants to enforce hierarhical control then it can be set to 1. This way there should not be any ABI problems down the line. 
o The only not so pretty part is introduction of extra file "use_hierarchy" down the line. Kame-san had mentioned that hierarhical accounting is expensive in memory controller hence they keep it off by default. I suspect same will be the case for IO controller also as for each IO completion we shall have to account IO through hierarchy up to the root. if yes, then it probably is not a very bad idea to introduce this extra file so that it will be used only when somebody needs it and some people might enable hierarchy only in part of the hierarchy. o This is how basically memory controller also uses "use_hierarhcy" and they also allowed creation of hierarchies when actual backend support was not available. Signed-off-by: Vivek Goyal Acked-by: Balbir Singh Reviewed-by: Gui Jianfeng Reviewed-by: Ciju Rajan K Tested-by: Ciju Rajan K Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b1febd0f6d2a..455768a3eb9e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1452,10 +1452,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) goto done; } - /* Currently we do not support hierarchy deeper than two level (0,1) */ - if (parent != cgroup->top_cgroup) - return ERR_PTR(-EPERM); - blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); if (!blkcg) return ERR_PTR(-ENOMEM); -- cgit v1.2.2 From d07335e51df0c6dec202d315fc4f1f7e100eec4e Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 16 Nov 2010 12:52:38 +0100 Subject: block: Rename "block_remap" tracepoint to "block_bio_remap" to clarify the event. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 4ce953f1b390..151070541e21 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -33,7 +33,7 @@ #include "blk.h" -EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); @@ -1329,9 +1329,9 @@ static inline void blk_partition_remap(struct bio *bio) bio->bi_sector += p->start_sect; bio->bi_bdev = bdev->bd_contains; - trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, - bdev->bd_dev, - bio->bi_sector - p->start_sect); + trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio, + bdev->bd_dev, + bio->bi_sector - p->start_sect); } } @@ -1500,7 +1500,7 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; if (old_sector != -1) - trace_block_remap(q, bio, old_dev, old_sector); + trace_block_bio_remap(q, bio, old_dev, old_sector); old_sector = bio->bi_sector; old_dev = bio->bi_bdev->bd_dev; -- cgit v1.2.2 From 451a3c24b0135bce54542009b5fde43846c7cf67 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 Nov 2010 16:26:55 +0100 Subject: BKL: remove extraneous #include The big kernel lock has been removed from all these files at some point, leaving only the #include. Remove this too as a cleanup. 
Signed-off-by: Arnd Bergmann Signed-off-by: Linus Torvalds --- block/compat_ioctl.c | 1 - block/ioctl.c | 1 - 2 files changed, 2 deletions(-) (limited to 'block') diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 58c6ee5b010c..cc3eb78e333a 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include diff --git a/block/ioctl.c b/block/ioctl.c index 3d866d0037f2..a9a302eba01e 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include -- cgit v1.2.2 From 5478755616ae2ef1ce144dded589b62b2a50d575 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Mon, 29 Nov 2010 10:03:55 +0100 Subject: block: check for proper length of iov entries earlier in blk_rq_map_user_iov() commit 9284bcf checks for proper length of iov entries in blk_rq_map_user_iov(). But if the map is unaligned, kernel will break out the loop without checking for the proper length. So we need to check the proper length before the unalign check. Signed-off-by: Xiaotian Feng Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-map.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index 5d5dbe47c228..e663ac2d8e68 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -201,12 +201,13 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, for (i = 0; i < iov_count; i++) { unsigned long uaddr = (unsigned long)iov[i].iov_base; + if (!iov[i].iov_len) + return -EINVAL; + if (uaddr & queue_dma_alignment(q)) { unaligned = 1; break; } - if (!iov[i].iov_len) - return -EINVAL; } if (unaligned || (q->dma_pad_mask & len) || map_data) -- cgit v1.2.2 From b54ce60eb7f61f8e314b8b241b0469eda3bb1d42 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Tue, 30 Nov 2010 20:52:46 +0100 Subject: cfq-iosched: Get rid of st->active When a cfq group is running, it won't be dequeued from service tree, so there's no need to store the active one in st->active. Just gid rid of it. 
Signed-off-by: Gui Jianfeng Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 73a58628f54a..e18d316ae652 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -87,7 +87,6 @@ struct cfq_rb_root { unsigned count; unsigned total_weight; u64 min_vdisktime; - struct rb_node *active; }; #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ .count = 0, .min_vdisktime = 0, } @@ -563,11 +562,6 @@ static void update_min_vdisktime(struct cfq_rb_root *st) u64 vdisktime = st->min_vdisktime; struct cfq_group *cfqg; - if (st->active) { - cfqg = rb_entry_cfqg(st->active); - vdisktime = cfqg->vdisktime; - } - if (st->left) { cfqg = rb_entry_cfqg(st->left); vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); @@ -894,9 +888,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) { struct cfq_rb_root *st = &cfqd->grp_service_tree; - if (st->active == &cfqg->rb_node) - st->active = NULL; - BUG_ON(cfqg->nr_cfqq < 1); cfqg->nr_cfqq--; @@ -1095,7 +1086,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg) if (!atomic_dec_and_test(&cfqg->ref)) return; for_each_cfqg_st(cfqg, i, j, st) - BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); + BUG_ON(!RB_EMPTY_ROOT(&st->rb)); kfree(cfqg); } @@ -1687,9 +1678,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq == cfqd->active_queue) cfqd->active_queue = NULL; - if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) - cfqd->grp_service_tree.active = NULL; - if (cfqd->active_cic) { put_io_context(cfqd->active_cic->ioc); cfqd->active_cic = NULL; @@ -2199,7 +2187,6 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) if (RB_EMPTY_ROOT(&st->rb)) return NULL; cfqg = cfq_rb_first_group(st); - st->active = &cfqg->rb_node; update_min_vdisktime(st); return cfqg; } -- cgit v1.2.2 From 760701bfe14faee8ea0608a9cab2046071d98a39 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Tue, 30 Nov 2010 20:52:47 +0100 Subject: cfq-iosched: Get rid of on_st flag It's able to check whether a CFQ group on a service tree by checking "cfqg->rb_node". There's no need to maintain an extra flag here. 
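The idiom being relied on: a node cleared with RB_CLEAR_NODE() tests true with RB_EMPTY_NODE() until it is linked into a tree, so membership needs no shadow flag. Roughly (condensed from the hunks below, not verbatim):

    if (!RB_EMPTY_NODE(&cfqg->rb_node))      /* already on the service tree */
            return;
    __cfq_group_service_tree_add(st, cfqg);  /* links rb_node into st->rb   */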
Signed-off-by: Gui Jianfeng Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e18d316ae652..5d0349d602fe 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -179,7 +179,6 @@ struct cfq_group { /* group service_tree key */ u64 vdisktime; unsigned int weight; - bool on_st; /* number of cfqq currently on this group */ int nr_cfqq; @@ -863,7 +862,7 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) struct rb_node *n; cfqg->nr_cfqq++; - if (cfqg->on_st) + if (!RB_EMPTY_NODE(&cfqg->rb_node)) return; /* @@ -879,7 +878,6 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) cfqg->vdisktime = st->min_vdisktime; __cfq_group_service_tree_add(st, cfqg); - cfqg->on_st = true; st->total_weight += cfqg->weight; } @@ -896,7 +894,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) return; cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); - cfqg->on_st = false; st->total_weight -= cfqg->weight; if (!RB_EMPTY_NODE(&cfqg->rb_node)) cfq_rb_erase(&cfqg->rb_node, st); -- cgit v1.2.2 From d1ae8ffdfaa16b2ab2e9346e81cf0ab6eaaae347 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 1 Dec 2010 19:34:46 +0100 Subject: blk-throttle: Trim/adjust slice_end once a bio has been dispatched o During some testing I did following and noticed throttling stops working. - Put a very low limit on a cgroup, say 1 byte per second. - Start some reads, this will set slice_end to a very high value. - Change the limit to higher value say 1MB/s - Now IO unthrottles and finishes as expected. - Try to do the read again but IO is not limited to 1MB/s as expected. o What is happening. - Initially low value of limit sets slice_end to a very high value. - During updation of limit, slice_end is not being truncated. - Very high value of slice_end leads to keeping the existing slice valid for a very long time and new slice does not start. - tg_may_dispatch() is called in blk_throtle_bio(), and trim_slice() is not called in this path. So slice_start is some old value and practically we are able to do huge amount of IO. o There are many ways it can be fixed. I have fixed it by trying to adjust/cleanup slice_end in trim_slice(). Generally we extend slices if bio is big and can't be dispatched in one slice. After dispatch of bio, readjust the slice_end to make sure we don't end up with huge values. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 004be80fd894..2d134b7c40a3 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -355,6 +355,12 @@ throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) tg->slice_end[rw], jiffies); } +static inline void throtl_set_slice_end(struct throtl_data *td, + struct throtl_grp *tg, bool rw, unsigned long jiffy_end) +{ + tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); +} + static inline void throtl_extend_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { @@ -391,6 +397,16 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) if (throtl_slice_used(td, tg, rw)) return; + /* + * A bio has been dispatched. Also adjust slice_end. 
It might happen + * that initially cgroup limit was very low resulting in high + * slice_end, but later limit was bumped up and bio was dispached + * sooner, then we need to reduce slice_end. A high bogus slice_end + * is bad because it does not allow new slice to start. + */ + + throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice); + time_elapsed = jiffies - tg->slice_start[rw]; nr_slices = time_elapsed / throtl_slice; -- cgit v1.2.2 From 04a6b516cdc6efc2500b52a540cf65be8c5aaf9e Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 1 Dec 2010 19:34:52 +0100 Subject: blk-throttle: Correct the placement of smp_rmb() o I was discussing what are the variable being updated without spin lock and why do we need barriers and Oleg pointed out that location of smp_rmb() should be between read of td->limits_changed and tg->limits_changed. This patch fixes it. o Following is one possible sequence of events. Say cpu0 is executing throtl_update_blkio_group_read_bps() and cpu1 is executing throtl_process_limit_change(). cpu0 cpu1 tg->limits_changed = true; smp_mb__before_atomic_inc(); atomic_inc(&td->limits_changed); if (!atomic_read(&td->limits_changed)) return; if (tg->limits_changed) do_something; If cpu0 has updated tg->limits_changed and td->limits_changed, we want to make sure that if update to td->limits_changed is visible on cpu1, then update to tg->limits_changed should also be visible. Oleg pointed out to ensure that we need to insert an smp_rmb() between td->limits_changed read and tg->limits_changed read. o I had erroneously put smp_rmb() before atomic_read(&td->limits_changed). This patch fixes it. Reported-by: Oleg Nesterov Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 2d134b7c40a3..381b09bb562b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -725,26 +725,21 @@ static void throtl_process_limit_change(struct throtl_data *td) struct throtl_grp *tg; struct hlist_node *pos, *n; - /* - * Make sure atomic_inc() effects from - * throtl_update_blkio_group_read_bps(), group of functions are - * visible. - * Is this required or smp_mb__after_atomic_inc() was suffcient - * after the atomic_inc(). - */ - smp_rmb(); if (!atomic_read(&td->limits_changed)) return; throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed)); - hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { - /* - * Do I need an smp_rmb() here to make sure tg->limits_changed - * update is visible. I am relying on smp_rmb() at the - * beginning of function and not putting a new one here. - */ + /* + * Make sure updates from throtl_update_blkio_group_read_bps() group + * of functions to tg->limits_changed are visible. We do not + * want update td->limits_changed to be visible but update to + * tg->limits_changed not being visible yet on this cpu. Hence + * the read barrier. 
+ */ + smp_rmb(); + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { if (throtl_tg_on_rr(tg) && tg->limits_changed) { throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" " riops=%u wiops=%u", tg->bps[READ], -- cgit v1.2.2 From c7a841f3aca469187db76842676951a672fd27d1 Mon Sep 17 00:00:00 2001 From: James Smart Date: Sun, 14 Nov 2010 11:12:04 -0500 Subject: [SCSI] bsg: correct fault if queue object removed while dev_t open This patch corrects an issue in bsg that results in a general protection fault if an LLD is removed while an application is using an open file handle to a bsg device, and the application issues an ioctl. The fault occurs because the class_dev is NULL, having been cleared in bsg_unregister_queue() when the driver was removed. With this patch, a check is made for the class_dev, and the application will receive ENXIO if the related object is gone. Signed-off-by: Carl Lajeunesse Signed-off-by: James Smart Signed-off-by: James Bottomley --- block/bsg.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'block') diff --git a/block/bsg.c b/block/bsg.c index f20d6a789d48..0c8b64a16484 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -250,6 +250,14 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, int ret, rw; unsigned int dxfer_len; void *dxferp = NULL; + struct bsg_class_device *bcd = &q->bsg_dev; + + /* if the LLD has been removed then the bsg_unregister_queue will + * eventually be called and the class_dev was freed, so we can no + * longer use this request_queue. Return no such address. + */ + if (!bcd->class_dev) + return ERR_PTR(-ENXIO); dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp, hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, -- cgit v1.2.2 From e4ea0c16a85d221ebcc3a21f32e321440459e0fc Mon Sep 17 00:00:00 2001 From: Shaohua Li writes Date: Mon, 13 Dec 2010 14:32:22 +0100 Subject: block cfq: select new workload if priority changed If priority is changed, continuing to check workload_expires and service tree count of the previous workload does not make sense. We should always choose the workload with lowest key of new priority in such case. 
Signed-off-by: Shaohua Li Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5d0349d602fe..9b186fd6bf47 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2101,6 +2101,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) unsigned count; struct cfq_rb_root *st; unsigned group_slice; + enum wl_prio_t original_prio = cfqd->serving_prio; if (!cfqg) { cfqd->serving_prio = IDLE_WORKLOAD; @@ -2119,6 +2120,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) return; } + if (original_prio != cfqd->serving_prio) + goto new_workload; + /* * For RT and BE, we have to choose also the type * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload @@ -2133,6 +2137,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) if (count && !time_after(jiffies, cfqd->workload_expires)) return; +new_workload: /* otherwise select new workload type */ cfqd->serving_type = cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); -- cgit v1.2.2 From dddd9dc340ae1a41d90e084529ca979c77c4ecfe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Dec 2010 20:57:35 +0100 Subject: block: kill genhd_media_change_notify() There's no user of the facility. Kill it. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 5fa2b44a72ff..0905ab22c8cf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1110,29 +1110,6 @@ static int __init proc_genhd_init(void) module_init(proc_genhd_init); #endif /* CONFIG_PROC_FS */ -static void media_change_notify_thread(struct work_struct *work) -{ - struct gendisk *gd = container_of(work, struct gendisk, async_notify); - char event[] = "MEDIA_CHANGE=1"; - char *envp[] = { event, NULL }; - - /* - * set enviroment vars to indicate which event this is for - * so that user space will know to go check the media status. - */ - kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); - put_device(gd->driverfs_dev); -} - -#if 0 -void genhd_media_change_notify(struct gendisk *disk) -{ - get_device(disk->driverfs_dev); - schedule_work(&disk->async_notify); -} -EXPORT_SYMBOL_GPL(genhd_media_change_notify); -#endif /* 0 */ - dev_t blk_lookup_devt(const char *name, int partno) { dev_t devt = MKDEV(0, 0); @@ -1198,8 +1175,6 @@ struct gendisk *alloc_disk_node(int minors, int node_id) disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); - INIT_WORK(&disk->async_notify, - media_change_notify_thread); } return disk; } -- cgit v1.2.2 From d2bf1b6723ed0eab378363649d15b7893bf14e91 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Dec 2010 20:57:36 +0100 Subject: block: move register_disk() and del_gendisk() to block/genhd.c There's no reason for register_disk() and del_gendisk() to be in fs/partitions/check.c. Move both to genhd.c. While at it, collapse unlink_gendisk(), which was artificially in a separate function due to genhd.c / check.c split, into del_gendisk(). 
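For context, the driver-facing teardown sequence these helpers participate in stays the same after the move; a typical outline (illustrative, not part of this patch):

    del_gendisk(disk);        /* invalidate and delete partitions, unregister from sysfs */
    blk_cleanup_queue(q);     /* shut down the request queue */
    put_disk(disk);           /* drop the gendisk reference */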
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 0905ab22c8cf..2e5e4c0a1133 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -502,6 +502,64 @@ static int exact_lock(dev_t devt, void *data) return 0; } +void register_disk(struct gendisk *disk) +{ + struct device *ddev = disk_to_dev(disk); + struct block_device *bdev; + struct disk_part_iter piter; + struct hd_struct *part; + int err; + + ddev->parent = disk->driverfs_dev; + + dev_set_name(ddev, disk->disk_name); + + /* delay uevents, until we scanned partition table */ + dev_set_uevent_suppress(ddev, 1); + + if (device_add(ddev)) + return; + if (!sysfs_deprecated) { + err = sysfs_create_link(block_depr, &ddev->kobj, + kobject_name(&ddev->kobj)); + if (err) { + device_del(ddev); + return; + } + } + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + + /* No minors to use for partitions */ + if (!disk_partitionable(disk)) + goto exit; + + /* No such device (e.g., media were just removed) */ + if (!get_capacity(disk)) + goto exit; + + bdev = bdget_disk(disk, 0); + if (!bdev) + goto exit; + + bdev->bd_invalidated = 1; + err = blkdev_get(bdev, FMODE_READ, NULL); + if (err < 0) + goto exit; + blkdev_put(bdev, FMODE_READ); + +exit: + /* announce disk after possible partitions are created */ + dev_set_uevent_suppress(ddev, 0); + kobject_uevent(&ddev->kobj, KOBJ_ADD); + + /* announce possible partitions */ + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + disk_part_iter_exit(&piter); +} + /** * add_disk - add partitioning information to kernel list * @disk: per-device partitioning information @@ -552,17 +610,43 @@ void add_disk(struct gendisk *disk) "bdi"); WARN_ON(retval); } - EXPORT_SYMBOL(add_disk); -EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ -void unlink_gendisk(struct gendisk *disk) +void del_gendisk(struct gendisk *disk) { + struct disk_part_iter piter; + struct hd_struct *part; + + /* invalidate stuff */ + disk_part_iter_init(&piter, disk, + DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); + while ((part = disk_part_iter_next(&piter))) { + invalidate_partition(disk, part->partno); + delete_partition(disk, part->partno); + } + disk_part_iter_exit(&piter); + + invalidate_partition(disk, 0); + blk_free_devt(disk_to_dev(disk)->devt); + set_capacity(disk, 0); + disk->flags &= ~GENHD_FL_UP; + sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); bdi_unregister(&disk->queue->backing_dev_info); blk_unregister_queue(disk); blk_unregister_region(disk_devt(disk), disk->minors); + + part_stat_set_all(&disk->part0, 0); + disk->part0.stamp = 0; + + kobject_put(disk->part0.holder_dir); + kobject_put(disk->slave_dir); + disk->driverfs_dev = NULL; + if (!sysfs_deprecated) + sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); + device_del(disk_to_dev(disk)); } +EXPORT_SYMBOL(del_gendisk); /** * get_gendisk - get partitioning information for a given device -- cgit v1.2.2 From 77ea887e433ad8389d416826936c110fa7910f80 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Dec 2010 20:57:37 +0100 Subject: implement in-kernel gendisk events handling Currently, media presence polling for removeable block devices is done from userland. There are several issues with this. 
* Polling is done by periodically opening the device. For SCSI devices, the command sequence generated by such action involves a few different commands including TEST_UNIT_READY. This behavior, while perfectly legal, is different from Windows, which only issues a single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some ATAPI devices lock up after being periodically queried with such command sequences.
* There is no reliable and unintrusive way for a userland program to tell whether the target device is safe for media presence polling. For example, polling for media presence during an on-going burning session can make it fail. The polling program can avoid this by opening the device with O_EXCL, but then it risks making a valid exclusive user of the device fail w/ -EBUSY.
* Userland polling is unnecessarily heavy and an in-kernel implementation is lighter and better coordinated (workqueue, timer slack).
This patch implements a framework for in-kernel disk event handling, which includes media presence polling.
* bdops->check_events() is added, which supersedes ->media_changed(). It should check whether there's any pending event and return if so. Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be called in parallel.
* gendisk->events and ->async_events are added. These should be initialized by the block driver before passing the device to add_disk(). The former contains the mask of all supported events and the latter the mask of all events which the device can report without polling. /sys/block/*/events[_async] export these to userland.
* Kernel parameter block.events_dfl_poll_msecs controls the system polling interval (default is 0, which means disabled) and /sys/block/*/events_poll_msecs controls the polling interval for individual devices (default is -1, meaning use the system setting). Note that if a device can report all supported events asynchronously and its polling interval isn't explicitly set, the device won't be polled regardless of the system polling interval.
* If a device is opened exclusively with write access, event checking is automatically disabled until all write exclusive accesses are released.
* There are event 'clearing' events. For example, both of the currently defined events are cleared after the device has been successfully opened. This information is passed to the ->check_events() callback using the @clearing argument as a hint.
* Event checking is always performed from system_nrt_wq and timer slack is set to 25% for polling.
* Nothing changes for drivers which implement ->media_changed() but not ->check_events(). Going forward, all drivers will be converted to ->check_events() and ->media_changed() will be dropped. 
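A minimal sketch of a driver hooking into the new framework (everything with a mydrv_ prefix is illustrative; the hook signature and event flags are the ones introduced here):

    static unsigned int mydrv_check_events(struct gendisk *disk,
                                           unsigned int clearing)
    {
            /* cheap hardware query; 'clearing' hints which events
             * the caller is about to consume */
            if (mydrv_media_present_changed(disk->private_data))
                    return DISK_EVENT_MEDIA_CHANGE;
            return 0;
    }

    static const struct block_device_operations mydrv_fops = {
            .owner          = THIS_MODULE,
            .check_events   = mydrv_check_events,
    };

    /* before add_disk(): declare supported events and which of them
     * the hardware can report without being polled */
    disk->events = DISK_EVENT_MEDIA_CHANGE;
    disk->async_events = 0;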
Signed-off-by: Tejun Heo Cc: Kay Sievers Cc: Jan Kara Signed-off-by: Jens Axboe --- block/genhd.c | 429 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 429 insertions(+) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 2e5e4c0a1133..5465a824d489 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "blk.h" @@ -35,6 +36,10 @@ static DEFINE_IDR(ext_devt_idr); static struct device_type disk_type; +static void disk_add_events(struct gendisk *disk); +static void disk_del_events(struct gendisk *disk); +static void disk_release_events(struct gendisk *disk); + /** * disk_get_part - get partition * @disk: disk to look partition from @@ -609,6 +614,8 @@ void add_disk(struct gendisk *disk) retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); + + disk_add_events(disk); } EXPORT_SYMBOL(add_disk); @@ -617,6 +624,8 @@ void del_gendisk(struct gendisk *disk) struct disk_part_iter piter; struct hd_struct *part; + disk_del_events(disk); + /* invalidate stuff */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); @@ -1089,6 +1098,7 @@ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); + disk_release_events(disk); kfree(disk->random); disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); @@ -1350,3 +1360,422 @@ int invalidate_partition(struct gendisk *disk, int partno) } EXPORT_SYMBOL(invalidate_partition); + +/* + * Disk events - monitor disk events like media change and eject request. + */ +struct disk_events { + struct list_head node; /* all disk_event's */ + struct gendisk *disk; /* the associated disk */ + spinlock_t lock; + + int block; /* event blocking depth */ + unsigned int pending; /* events already sent out */ + unsigned int clearing; /* events being cleared */ + + long poll_msecs; /* interval, -1 for default */ + struct delayed_work dwork; +}; + +static const char *disk_events_strs[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", +}; + +static char *disk_uevents[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", +}; + +/* list of all disk_events */ +static DEFINE_MUTEX(disk_events_mutex); +static LIST_HEAD(disk_events); + +/* disable in-kernel polling by default */ +static unsigned long disk_events_dfl_poll_msecs = 0; + +static unsigned long disk_events_poll_jiffies(struct gendisk *disk) +{ + struct disk_events *ev = disk->ev; + long intv_msecs = 0; + + /* + * If device-specific poll interval is set, always use it. If + * the default is being used, poll iff there are events which + * can't be monitored asynchronously. 
+ */ + if (ev->poll_msecs >= 0) + intv_msecs = ev->poll_msecs; + else if (disk->events & ~disk->async_events) + intv_msecs = disk_events_dfl_poll_msecs; + + return msecs_to_jiffies(intv_msecs); +} + +static void __disk_block_events(struct gendisk *disk, bool sync) +{ + struct disk_events *ev = disk->ev; + unsigned long flags; + bool cancel; + + spin_lock_irqsave(&ev->lock, flags); + cancel = !ev->block++; + spin_unlock_irqrestore(&ev->lock, flags); + + if (cancel) { + if (sync) + cancel_delayed_work_sync(&disk->ev->dwork); + else + cancel_delayed_work(&disk->ev->dwork); + } +} + +static void __disk_unblock_events(struct gendisk *disk, bool check_now) +{ + struct disk_events *ev = disk->ev; + unsigned long intv; + unsigned long flags; + + spin_lock_irqsave(&ev->lock, flags); + + if (WARN_ON_ONCE(ev->block <= 0)) + goto out_unlock; + + if (--ev->block) + goto out_unlock; + + /* + * Not exactly a latency critical operation, set poll timer + * slack to 25% and kick event check. + */ + intv = disk_events_poll_jiffies(disk); + set_timer_slack(&ev->dwork.timer, intv / 4); + if (check_now) + queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + else if (intv) + queue_delayed_work(system_nrt_wq, &ev->dwork, intv); +out_unlock: + spin_unlock_irqrestore(&ev->lock, flags); +} + +/** + * disk_block_events - block and flush disk event checking + * @disk: disk to block events for + * + * On return from this function, it is guaranteed that event checking + * isn't in progress and won't happen until unblocked by + * disk_unblock_events(). Events blocking is counted and the actual + * unblocking happens after the matching number of unblocks are done. + * + * Note that this intentionally does not block event checking from + * disk_clear_events(). + * + * CONTEXT: + * Might sleep. + */ +void disk_block_events(struct gendisk *disk) +{ + if (disk->ev) + __disk_block_events(disk, true); +} + +/** + * disk_unblock_events - unblock disk event checking + * @disk: disk to unblock events for + * + * Undo disk_block_events(). When the block count reaches zero, it + * starts events polling if configured. + * + * CONTEXT: + * Don't care. Safe to call from irq context. + */ +void disk_unblock_events(struct gendisk *disk) +{ + if (disk->ev) + __disk_unblock_events(disk, true); +} + +/** + * disk_check_events - schedule immediate event checking + * @disk: disk to check events for + * + * Schedule immediate event checking on @disk if not blocked. + * + * CONTEXT: + * Don't care. Safe to call from irq context. + */ +void disk_check_events(struct gendisk *disk) +{ + if (disk->ev) { + __disk_block_events(disk, false); + __disk_unblock_events(disk, true); + } +} +EXPORT_SYMBOL_GPL(disk_check_events); + +/** + * disk_clear_events - synchronously check, clear and return pending events + * @disk: disk to fetch and clear events from + * @mask: mask of events to be fetched and clearted + * + * Disk events are synchronously checked and pending events in @mask + * are cleared and returned. This ignores the block count. + * + * CONTEXT: + * Might sleep. 
+ */ +unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) +{ + const struct block_device_operations *bdops = disk->fops; + struct disk_events *ev = disk->ev; + unsigned int pending; + + if (!ev) { + /* for drivers still using the old ->media_changed method */ + if ((mask & DISK_EVENT_MEDIA_CHANGE) && + bdops->media_changed && bdops->media_changed(disk)) + return DISK_EVENT_MEDIA_CHANGE; + return 0; + } + + /* tell the workfn about the events being cleared */ + spin_lock_irq(&ev->lock); + ev->clearing |= mask; + spin_unlock_irq(&ev->lock); + + /* uncondtionally schedule event check and wait for it to finish */ + __disk_block_events(disk, true); + queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + flush_delayed_work(&ev->dwork); + __disk_unblock_events(disk, false); + + /* then, fetch and clear pending events */ + spin_lock_irq(&ev->lock); + WARN_ON_ONCE(ev->clearing & mask); /* cleared by workfn */ + pending = ev->pending & mask; + ev->pending &= ~mask; + spin_unlock_irq(&ev->lock); + + return pending; +} + +static void disk_events_workfn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct disk_events *ev = container_of(dwork, struct disk_events, dwork); + struct gendisk *disk = ev->disk; + char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; + unsigned int clearing = ev->clearing; + unsigned int events; + unsigned long intv; + int nr_events = 0, i; + + /* check events */ + events = disk->fops->check_events(disk, clearing); + + /* accumulate pending events and schedule next poll if necessary */ + spin_lock_irq(&ev->lock); + + events &= ~ev->pending; + ev->pending |= events; + ev->clearing &= ~clearing; + + intv = disk_events_poll_jiffies(disk); + if (!ev->block && intv) + queue_delayed_work(system_nrt_wq, &ev->dwork, intv); + + spin_unlock_irq(&ev->lock); + + /* tell userland about new events */ + for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) + if (events & (1 << i)) + envp[nr_events++] = disk_uevents[i]; + + if (nr_events) + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); +} + +/* + * A disk events enabled device has the following sysfs nodes under + * its /sys/block/X/ directory. 
+ * + * events : list of all supported events + * events_async : list of events which can be detected w/o polling + * events_poll_msecs : polling interval, 0: disable, -1: system default + */ +static ssize_t __disk_events_show(unsigned int events, char *buf) +{ + const char *delim = ""; + ssize_t pos = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) + if (events & (1 << i)) { + pos += sprintf(buf + pos, "%s%s", + delim, disk_events_strs[i]); + delim = " "; + } + if (pos) + pos += sprintf(buf + pos, "\n"); + return pos; +} + +static ssize_t disk_events_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return __disk_events_show(disk->events, buf); +} + +static ssize_t disk_events_async_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return __disk_events_show(disk->async_events, buf); +} + +static ssize_t disk_events_poll_msecs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%ld\n", disk->ev->poll_msecs); +} + +static ssize_t disk_events_poll_msecs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + long intv; + + if (!count || !sscanf(buf, "%ld", &intv)) + return -EINVAL; + + if (intv < 0 && intv != -1) + return -EINVAL; + + __disk_block_events(disk, true); + disk->ev->poll_msecs = intv; + __disk_unblock_events(disk, true); + + return count; +} + +static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL); +static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL); +static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR, + disk_events_poll_msecs_show, + disk_events_poll_msecs_store); + +static const struct attribute *disk_events_attrs[] = { + &dev_attr_events.attr, + &dev_attr_events_async.attr, + &dev_attr_events_poll_msecs.attr, + NULL, +}; + +/* + * The default polling interval can be specified by the kernel + * parameter block.events_dfl_poll_msecs which defaults to 0 + * (disable). This can also be modified runtime by writing to + * /sys/module/block/events_dfl_poll_msecs. + */ +static int disk_events_set_dfl_poll_msecs(const char *val, + const struct kernel_param *kp) +{ + struct disk_events *ev; + int ret; + + ret = param_set_ulong(val, kp); + if (ret < 0) + return ret; + + mutex_lock(&disk_events_mutex); + + list_for_each_entry(ev, &disk_events, node) + disk_check_events(ev->disk); + + mutex_unlock(&disk_events_mutex); + + return 0; +} + +static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { + .set = disk_events_set_dfl_poll_msecs, + .get = param_get_ulong, +}; + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "block." + +module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, + &disk_events_dfl_poll_msecs, 0644); + +/* + * disk_{add|del|release}_events - initialize and destroy disk_events. 
+ */ +static void disk_add_events(struct gendisk *disk) +{ + struct disk_events *ev; + + if (!disk->fops->check_events || !(disk->events | disk->async_events)) + return; + + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) { + pr_warn("%s: failed to initialize events\n", disk->disk_name); + return; + } + + if (sysfs_create_files(&disk_to_dev(disk)->kobj, + disk_events_attrs) < 0) { + pr_warn("%s: failed to create sysfs files for events\n", + disk->disk_name); + kfree(ev); + return; + } + + disk->ev = ev; + + INIT_LIST_HEAD(&ev->node); + ev->disk = disk; + spin_lock_init(&ev->lock); + ev->block = 1; + ev->poll_msecs = -1; + INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); + + mutex_lock(&disk_events_mutex); + list_add_tail(&ev->node, &disk_events); + mutex_unlock(&disk_events_mutex); + + /* + * Block count is initialized to 1 and the following initial + * unblock kicks it into action. + */ + __disk_unblock_events(disk, true); +} + +static void disk_del_events(struct gendisk *disk) +{ + if (!disk->ev) + return; + + __disk_block_events(disk, true); + + mutex_lock(&disk_events_mutex); + list_del_init(&disk->ev->node); + mutex_unlock(&disk_events_mutex); + + sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); +} + +static void disk_release_events(struct gendisk *disk) +{ + /* the block count should be 1 from disk_del_events() */ + WARN_ON_ONCE(disk->ev && disk->ev->block != 1); + kfree(disk->ev); +} -- cgit v1.2.2 From e692cb668fdd5a712c6ed2a2d6f2a36ee83997b4 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 1 Dec 2010 19:41:49 +0100 Subject: block: Deprecate QUEUE_FLAG_CLUSTER and use queue_limits instead When stacking devices, a request_queue is not always available. This forced us to have a no_cluster flag in the queue_limits that could be used as a carrier until the request_queue had been set up for a metadevice. There were several problems with that approach. First of all it was up to the stacking device to remember to set queue flag after stacking had completed. Also, the queue flag and the queue limits had to be kept in sync at all times. We got that wrong, which could lead to us issuing commands that went beyond the max scatterlist limit set by the driver. The proper fix is to avoid having two flags for tracking the same thing. We deprecate QUEUE_FLAG_CLUSTER and use the queue limit directly in the block layer merging functions. The queue_limit 'no_cluster' is turned into 'cluster' to avoid double negatives and to ease stacking. Clustering defaults to being enabled as before. The queue flag logic is removed from the stacking function, and explicitly setting the cluster flag is no longer necessary in DM and MD. Reported-by: Ed Lin Signed-off-by: Martin K. 
Petersen Acked-by: Mike Snitzer Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-merge.c | 6 +++--- block/blk-settings.c | 25 ++----------------------- block/blk-sysfs.c | 2 +- 3 files changed, 6 insertions(+), 27 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 77b7c26df6b5..74bc4a768f32 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -21,7 +21,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, return 0; fbio = bio; - cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); + cluster = blk_queue_cluster(q); seg_size = 0; nr_phys_segs = 0; for_each_bio(bio) { @@ -87,7 +87,7 @@ EXPORT_SYMBOL(blk_recount_segments); static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, struct bio *nxt) { - if (!test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) + if (!blk_queue_cluster(q)) return 0; if (bio->bi_seg_back_size + nxt->bi_seg_front_size > @@ -123,7 +123,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, int nsegs, cluster; nsegs = 0; - cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); + cluster = blk_queue_cluster(q); /* * for each bio in rq diff --git a/block/blk-settings.c b/block/blk-settings.c index 701859fb9647..e55f5fc4ca22 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -126,7 +126,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->alignment_offset = 0; lim->io_opt = 0; lim->misaligned = 0; - lim->no_cluster = 0; + lim->cluster = 1; } EXPORT_SYMBOL(blk_set_default_limits); @@ -464,15 +464,6 @@ EXPORT_SYMBOL(blk_queue_io_opt); void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) { blk_stack_limits(&t->limits, &b->limits, 0); - - if (!t->queue_lock) - WARN_ON_ONCE(1); - else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { - unsigned long flags; - spin_lock_irqsave(t->queue_lock, flags); - queue_flag_clear(QUEUE_FLAG_CLUSTER, t); - spin_unlock_irqrestore(t->queue_lock, flags); - } } EXPORT_SYMBOL(blk_queue_stack_limits); @@ -545,7 +536,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->io_min = max(t->io_min, b->io_min); t->io_opt = lcm(t->io_opt, b->io_opt); - t->no_cluster |= b->no_cluster; + t->cluster &= b->cluster; t->discard_zeroes_data &= b->discard_zeroes_data; /* Physical block size a multiple of the logical block size? 
*/ @@ -641,7 +632,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset) { struct request_queue *t = disk->queue; - struct request_queue *b = bdev_get_queue(bdev); if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) { char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; @@ -652,17 +642,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", top, bottom); } - - if (!t->queue_lock) - WARN_ON_ONCE(1); - else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { - unsigned long flags; - - spin_lock_irqsave(t->queue_lock, flags); - if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) - queue_flag_clear(QUEUE_FLAG_CLUSTER, t); - spin_unlock_irqrestore(t->queue_lock, flags); - } } EXPORT_SYMBOL(disk_stack_limits); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 013457f47fdc..41fb69150b4d 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -119,7 +119,7 @@ static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char * static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) { - if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) + if (blk_queue_cluster(q)) return queue_var_show(queue_max_segment_size(q), (page)); return queue_var_show(PAGE_CACHE_SIZE, (page)); -- cgit v1.2.2 From 72d4cd9f38b5ed96b75df4c622be25e1c2648dd3 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 17 Dec 2010 08:34:20 +0100 Subject: block: max hardware sectors limit wrapper Implement blk_limits_max_hw_sectors() and make blk_queue_max_hw_sectors() a wrapper around it. DM needs this to avoid setting queue_limits' max_hw_sectors and max_sectors directly. dm_set_device_limits() now leverages blk_limits_max_hw_sectors() logic to establish the appropriate max_hw_sectors minimum (PAGE_SIZE). Fixes issue where DM was incorrectly setting max_sectors rather than max_hw_sectors (which caused dm_merge_bvec()'s max_hw_sectors check to be ineffective). Signed-off-by: Mike Snitzer Cc: stable@kernel.org Acked-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index e55f5fc4ca22..36c8c1f2af18 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -229,8 +229,8 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) EXPORT_SYMBOL(blk_queue_bounce_limit); /** - * blk_queue_max_hw_sectors - set max sectors for a request for this queue - * @q: the request queue for the device + * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request + * @limits: the queue limits * @max_hw_sectors: max hardware sectors in the usual 512b unit * * Description: @@ -244,7 +244,7 @@ EXPORT_SYMBOL(blk_queue_bounce_limit); * per-device basis in /sys/block//queue/max_sectors_kb. * The soft limit can not exceed max_hw_sectors. 
**/ -void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) +void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors) { if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); @@ -252,9 +252,23 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto __func__, max_hw_sectors); } - q->limits.max_hw_sectors = max_hw_sectors; - q->limits.max_sectors = min_t(unsigned int, max_hw_sectors, - BLK_DEF_MAX_SECTORS); + limits->max_hw_sectors = max_hw_sectors; + limits->max_sectors = min_t(unsigned int, max_hw_sectors, + BLK_DEF_MAX_SECTORS); +} +EXPORT_SYMBOL(blk_limits_max_hw_sectors); + +/** + * blk_queue_max_hw_sectors - set max sectors for a request for this queue + * @q: the request queue for the device + * @max_hw_sectors: max hardware sectors in the usual 512b unit + * + * Description: + * See description for blk_limits_max_hw_sectors(). + **/ +void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) +{ + blk_limits_max_hw_sectors(&q->limits, max_hw_sectors); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); -- cgit v1.2.2 From 7278c9c19bd85cf33213a2e0b538a18d3ac8ad00 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Fri, 17 Dec 2010 08:57:14 +0100 Subject: cfq-iosched: don't check cfqg in choose_service_tree() When cfq_choose_cfqg() is called in select_queue(), there must be at least one backlogged CFQ queue waiting for dispatching, hence there must be at least one backlogged CFQ group on service tree. So we never call choose_service_tree() with cfqg == NULL. Signed-off-by: Gui Jianfeng Reviewed-by: Jeff Moyer Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 9b186fd6bf47..c19d015ac5a5 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2103,12 +2103,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) unsigned group_slice; enum wl_prio_t original_prio = cfqd->serving_prio; - if (!cfqg) { - cfqd->serving_prio = IDLE_WORKLOAD; - cfqd->workload_expires = jiffies + 1; - return; - } - /* Choose next priority. RT > BE > IDLE */ if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) cfqd->serving_prio = RT_WORKLOAD; -- cgit v1.2.2 From b9f985b6e05ebd7af2aaef0eb3ae369390ef191f Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Fri, 17 Dec 2010 08:58:36 +0100 Subject: block: convert !IS_ERR(p) && p to !IS_ERR_NOR_NULL(p) Signed-off-by: Yang Zhang Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 5fa2b44a72ff..79b9e327f3cb 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -735,7 +735,7 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos) static void *p; p = disk_seqf_start(seqf, pos); - if (!IS_ERR(p) && p && !*pos) + if (!IS_ERR_OR_NULL(p) && !*pos) seq_puts(seqf, "major minor #blocks name\n\n"); return p; } -- cgit v1.2.2 From e61eb2e93fe86931d46831752a82dab25a5335ca Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Fri, 17 Dec 2010 09:00:18 +0100 Subject: fs/block: type signature of major_to_index(int) to major_to_index(unsigned) The major/minor device numbers are always defined and used as `unsigned'. 
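[Editorial sketch, not part of this patch: a standalone user-space illustration of the modulo bucketing that major_to_index() performs over the major_names[] hash. The table size 255 is assumed to match the contemporary BLKDEV_MAJOR_HASH_SIZE definition, and the majors used are just examples.]

#include <stdio.h>

#define BLKDEV_MAJOR_HASH_SIZE 255	/* assumed kernel table size */

/* mirrors the kernel helper: bucket a (never negative) major by modulo */
static inline int major_to_index(unsigned int major)
{
	return major % BLKDEV_MAJOR_HASH_SIZE;
}

int main(void)
{
	printf("major 8   -> bucket %d\n", major_to_index(8));	/* e.g. sd */
	printf("major 259 -> bucket %d\n", major_to_index(259));	/* e.g. extended devt */
	return 0;
}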
Signed-off-by: Yang Zhang Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 79b9e327f3cb..16ccc0d2d5d3 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -239,7 +239,7 @@ static struct blk_major_name { } *major_names[BLKDEV_MAJOR_HASH_SIZE]; /* index in the above - for now: assume no multimajor ranges */ -static inline int major_to_index(int major) +static inline int major_to_index(unsigned major) { return major % BLKDEV_MAJOR_HASH_SIZE; } -- cgit v1.2.2 From 27667c996f6a0bed4ad1e10ac0a0dbb6037968db Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 21 Dec 2010 15:07:45 +0100 Subject: block: Clean up exit_io_context() source code. This patch fixes a spelling error in a source code comment and removes superfluous braces in the function exit_io_context(). Signed-off-by: Bart Van Assche Cc: Jens Axboe Signed-off-by: Jens Axboe --- block/blk-ioc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 3c7a339fe381..b791022beef3 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -64,7 +64,7 @@ static void cfq_exit(struct io_context *ioc) rcu_read_unlock(); } -/* Called by the exitting task */ +/* Called by the exiting task */ void exit_io_context(struct task_struct *task) { struct io_context *ioc; @@ -74,10 +74,9 @@ void exit_io_context(struct task_struct *task) task->io_context = NULL; task_unlock(task); - if (atomic_dec_and_test(&ioc->nr_tasks)) { + if (atomic_dec_and_test(&ioc->nr_tasks)) cfq_exit(ioc); - } put_io_context(ioc); } -- cgit v1.2.2 From 89b90be2d877a904b1704e4029db65655bfc6282 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Jan 2011 15:01:47 +0100 Subject: block: make kblockd_workqueue smarter kblockd is used for unplugging and may affect IO latency and throughput and the max number of concurrent work items are bound by the number of block devices. Make it HIGHPRI workqueue w/ default max concurrency. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 151070541e21..3689319a5974 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2606,7 +2606,9 @@ int __init blk_dev_init(void) BUILD_BUG_ON(__REQ_NR_BITS > 8 * sizeof(((struct request *)0)->cmd_flags)); - kblockd_workqueue = create_workqueue("kblockd"); + /* used for unplugging and affects IO latency/throughput - HIGHPRI */ + kblockd_workqueue = alloc_workqueue("kblockd", + WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); -- cgit v1.2.2 From 09e099d4bafea3b15be003d548bdf94b4b6e0e17 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 5 Jan 2011 16:57:38 +0100 Subject: block: fix accounting bug on cross partition merges /proc/diskstats would display a strange output as follows. $ cat /proc/diskstats |grep sda 8 0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089 8 1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691 ~~~~~~~~~~ 8 2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390 8 3 sda3 54 487 2188 92 0 0 0 0 0 88 92 8 4 sda4 4 0 8 0 0 0 0 0 0 0 0 8 5 sda5 81 2027 2130 138 0 0 0 0 0 87 137 Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE. The detailed root cause is as follows. 
Assuming that there are two partition, sda1 and sda2. 1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight is 0 and sda2's one is 1. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 2. A bio belongs to sda1 is issued and is merged into the request mentioned on step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed from sda2 region to sda1 region. However the two partition's hd_struct->in_flight are not changed. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 3. The request is finished and blk_account_io_done() is called. In this case, sda2's hd_struct->in_flight, not a sda1's one, is decremented. | hd_struct->in_flight --------------------------- sda1 | -1 sda2 | 1 --------------------------- The patch fixes the problem by caching the partition lookup inside the request structure, hence making sure that the increment and decrement will always happen on the same partition struct. This also speeds up IO with accounting enabled, since it cuts down on the number of lookups we have to do. Also add a refcount to struct hd_struct to keep the partition in memory as long as users exist. We use kref_test_and_get() to ensure we don't add a reference to a partition which is going away. Signed-off-by: Jerome Marchand Signed-off-by: Yasuaki Ishimatsu Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 26 +++++++++++++++++++++----- block/blk-merge.c | 3 ++- block/genhd.c | 1 + 3 files changed, 24 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 3689319a5974..500c080a6a6b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -64,13 +64,27 @@ static void drive_stat_acct(struct request *rq, int new_io) return; cpu = part_stat_lock(); - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!new_io) + if (!new_io) { + part = rq->part; part_stat_inc(cpu, part, merges[rw]); - else { + } else { + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + if (!kref_test_and_get(&part->ref)) { + /* + * The partition is already being removed, + * the request will be accounted on the disk only + * + * We take a reference on disk->part0 although that + * partition will never be deleted, so we can treat + * it as any other partition. 
+ */ + part = &rq->rq_disk->part0; + kref_get(&part->ref); + } part_round_stats(cpu, part); part_inc_in_flight(part, rw); + rq->part = part; } part_stat_unlock(); @@ -128,6 +142,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); + rq->part = NULL; } EXPORT_SYMBOL(blk_rq_init); @@ -1776,7 +1791,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_add(cpu, part, sectors[rw], bytes >> 9); part_stat_unlock(); } @@ -1796,13 +1811,14 @@ static void blk_account_io_done(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); part_round_stats(cpu, part); part_dec_in_flight(part, rw); + kref_put(&part->ref, __delete_partition); part_stat_unlock(); } } diff --git a/block/blk-merge.c b/block/blk-merge.c index 77b7c26df6b5..b06b83b89d89 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -351,11 +351,12 @@ static void blk_account_io_merge(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_round_stats(cpu, part); part_dec_in_flight(part, rq_data_dir(req)); + kref_put(&part->ref, __delete_partition); part_stat_unlock(); } } diff --git a/block/genhd.c b/block/genhd.c index 16ccc0d2d5d3..85c150598830 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1192,6 +1192,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id) return NULL; } disk->part_tbl->part[0] = &disk->part0; + kref_init(&disk->part0.ref); disk->minors = minors; rand_initialize_disk(disk); -- cgit v1.2.2 From 6c23a9681c0fe7fb7dd331b39dda11926f43746e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 7 Jan 2011 08:43:37 +0100 Subject: block: add internal hd part table references We can't use krefs since it's apparently restricted to very basic reference counting. This reverts commit e4a683c8. Signed-off-by: Jens Axboe --- block/blk-core.c | 6 +++--- block/blk-merge.c | 2 +- block/genhd.c | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 500c080a6a6b..2f4002f79a24 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -70,7 +70,7 @@ static void drive_stat_acct(struct request *rq, int new_io) part_stat_inc(cpu, part, merges[rw]); } else { part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!kref_test_and_get(&part->ref)) { + if (!hd_struct_try_get(part)) { /* * The partition is already being removed, * the request will be accounted on the disk only @@ -80,7 +80,7 @@ static void drive_stat_acct(struct request *rq, int new_io) * it as any other partition. 
*/ part = &rq->rq_disk->part0; - kref_get(&part->ref); + hd_struct_get(part); } part_round_stats(cpu, part); part_inc_in_flight(part, rw); @@ -1818,7 +1818,7 @@ static void blk_account_io_done(struct request *req) part_round_stats(cpu, part); part_dec_in_flight(part, rw); - kref_put(&part->ref, __delete_partition); + hd_struct_put(part); part_stat_unlock(); } } diff --git a/block/blk-merge.c b/block/blk-merge.c index b06b83b89d89..00b7d31b38a2 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -356,7 +356,7 @@ static void blk_account_io_merge(struct request *req) part_round_stats(cpu, part); part_dec_in_flight(part, rq_data_dir(req)); - kref_put(&part->ref, __delete_partition); + hd_struct_put(part); part_stat_unlock(); } } diff --git a/block/genhd.c b/block/genhd.c index 85c150598830..399d37ec7412 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1192,7 +1192,8 @@ struct gendisk *alloc_disk_node(int minors, int node_id) return NULL; } disk->part_tbl->part[0] = &disk->part0; - kref_init(&disk->part0.ref); + + hd_ref_init(&disk->part0); disk->minors = minors; rand_initialize_disk(disk); -- cgit v1.2.2 From 30d7b9448f03f2c82d0fd44738674cc156a8ce0a Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 7 Jan 2011 08:46:59 +0100 Subject: block cfq: don't use atomic_t for cfq_queue cfq_queue->ref is used with queue_lock hold, so ref doesn't need to be an atomic and atomic operation is slower. Signed-off-by: Shaohua Li Reviewed-by: Jeff Moyer Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c19d015ac5a5..4cb4cf73ac00 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -96,7 +96,7 @@ struct cfq_rb_root { */ struct cfq_queue { /* reference count */ - atomic_t ref; + int ref; /* various state flags, see below */ unsigned int flags; /* parent cfq_data */ @@ -2025,7 +2025,7 @@ static int cfqq_process_refs(struct cfq_queue *cfqq) int process_refs, io_refs; io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; - process_refs = atomic_read(&cfqq->ref) - io_refs; + process_refs = cfqq->ref - io_refs; BUG_ON(process_refs < 0); return process_refs; } @@ -2065,10 +2065,10 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) */ if (new_process_refs >= process_refs) { cfqq->new_cfqq = new_cfqq; - atomic_add(process_refs, &new_cfqq->ref); + new_cfqq->ref += process_refs; } else { new_cfqq->new_cfqq = cfqq; - atomic_add(new_process_refs, &cfqq->ref); + cfqq->ref += new_process_refs; } } @@ -2532,9 +2532,10 @@ static void cfq_put_queue(struct cfq_queue *cfqq) struct cfq_data *cfqd = cfqq->cfqd; struct cfq_group *cfqg, *orig_cfqg; - BUG_ON(atomic_read(&cfqq->ref) <= 0); + BUG_ON(cfqq->ref <= 0); - if (!atomic_dec_and_test(&cfqq->ref)) + cfqq->ref--; + if (cfqq->ref) return; cfq_log_cfqq(cfqd, cfqq, "put_queue"); @@ -2837,7 +2838,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, RB_CLEAR_NODE(&cfqq->p_node); INIT_LIST_HEAD(&cfqq->fifo); - atomic_set(&cfqq->ref, 0); + cfqq->ref = 0; cfqq->cfqd = cfqd; cfq_mark_cfqq_prio_changed(cfqq); @@ -2973,11 +2974,11 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, * pin the queue now that it's allocated, scheduler exit will prune it */ if (!is_sync && !(*async_cfqq)) { - atomic_inc(&cfqq->ref); + cfqq->ref++; *async_cfqq = cfqq; } - atomic_inc(&cfqq->ref); + cfqq->ref++; return cfqq; } 
@@ -3679,7 +3680,7 @@ new_queue: } cfqq->allocated[rw]++; - atomic_inc(&cfqq->ref); + cfqq->ref++; spin_unlock_irqrestore(q->queue_lock, flags); @@ -3860,6 +3861,10 @@ static void *cfq_init_queue(struct request_queue *q) if (!cfqd) return NULL; + /* + * Don't need take queue_lock in the routine, since we are + * initializing the ioscheduler, and nobody is using cfqd + */ cfqd->cic_index = i; /* Init root service tree */ @@ -3899,7 +3904,7 @@ static void *cfq_init_queue(struct request_queue *q) * will not attempt to free it. */ cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); - atomic_inc(&cfqd->oom_cfqq.ref); + cfqd->oom_cfqq.ref++; cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); INIT_LIST_HEAD(&cfqd->cic_list); -- cgit v1.2.2 From 329a67815b596d23daf0caa588ae0800e925320f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 7 Jan 2011 08:48:28 +0100 Subject: block cfq: don't use atomic_t for cfq_group cfq_group->ref is used with queue_lock hold, the only exception is cfq_set_request, which looks like a bug to me, so ref doesn't need to be an atomic and atomic operation is slower. Signed-off-by: Shaohua Li Reviewed-by: Jeff Moyer Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4cb4cf73ac00..f083bda30546 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -207,7 +207,7 @@ struct cfq_group { struct blkio_group blkg; #ifdef CONFIG_CFQ_GROUP_IOSCHED struct hlist_node cfqd_node; - atomic_t ref; + int ref; #endif /* number of requests that are on the dispatch list or inside driver */ int dispatched; @@ -1014,7 +1014,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) * elevator which will be dropped by either elevator exit * or cgroup deletion path depending on who is exiting first. */ - atomic_set(&cfqg->ref, 1); + cfqg->ref = 1; /* * Add group onto cgroup list. 
It might happen that bdi->dev is @@ -1059,7 +1059,7 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) { - atomic_inc(&cfqg->ref); + cfqg->ref++; return cfqg; } @@ -1071,7 +1071,7 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) cfqq->cfqg = cfqg; /* cfqq reference on cfqg */ - atomic_inc(&cfqq->cfqg->ref); + cfqq->cfqg->ref++; } static void cfq_put_cfqg(struct cfq_group *cfqg) @@ -1079,8 +1079,9 @@ static void cfq_put_cfqg(struct cfq_group *cfqg) struct cfq_rb_root *st; int i, j; - BUG_ON(atomic_read(&cfqg->ref) <= 0); - if (!atomic_dec_and_test(&cfqg->ref)) + BUG_ON(cfqg->ref <= 0); + cfqg->ref--; + if (cfqg->ref) return; for_each_cfqg_st(cfqg, i, j, st) BUG_ON(!RB_EMPTY_ROOT(&st->rb)); @@ -1188,7 +1189,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_group_service_tree_del(cfqd, cfqq->cfqg); cfqq->orig_cfqg = cfqq->cfqg; cfqq->cfqg = &cfqd->root_group; - atomic_inc(&cfqd->root_group.ref); + cfqd->root_group.ref++; group_changed = 1; } else if (!cfqd->cfq_group_isolation && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { @@ -3681,12 +3682,12 @@ new_queue: cfqq->allocated[rw]++; cfqq->ref++; - - spin_unlock_irqrestore(q->queue_lock, flags); - rq->elevator_private = cic; rq->elevator_private2 = cfqq; rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); + + spin_unlock_irqrestore(q->queue_lock, flags); + return 0; queue_fail: @@ -3884,7 +3885,7 @@ static void *cfq_init_queue(struct request_queue *q) * Take a reference to root group which we never drop. This is just * to make sure that cfq_put_cfqg() does not try to kfree root group */ - atomic_set(&cfqg->ref, 1); + cfqg->ref = 1; rcu_read_lock(); cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, 0); -- cgit v1.2.2 From f8ae6e3eb8251be32c6e913393d9f8d9e0609489 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 14 Jan 2011 08:41:02 +0100 Subject: block cfq: make queue preempt work for queues from different workload I got this: fio-874 [007] 2157.724514: 8,32 m N cfq874 preempt fio-874 [007] 2157.724519: 8,32 m N cfq830 slice expired t=1 fio-874 [007] 2157.724520: 8,32 m N cfq830 sl_used=1 disp=0 charge=1 iops=0 sect=0 fio-874 [007] 2157.724521: 8,32 m N cfq830 set_active wl_prio:0 wl_type:0 fio-874 [007] 2157.724522: 8,32 m N cfq830 Not idling. st->count:1 cfq830 is an async queue, and preempted by a sync queue cfq874. But since we have cfqg->saved_workload_slice mechanism, the preempt is a nop. Looks currently our preempt is totally broken if the two queues are not from the same workload type. Below patch fixes it. This will might make async queue starvation, but it's what our old code does before cgroup is added. 
Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8427697c5437..7bfea53c1bb5 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3284,9 +3284,18 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, */ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + struct cfq_queue *old_cfqq = cfqd->active_queue; + cfq_log_cfqq(cfqd, cfqq, "preempt"); cfq_slice_expired(cfqd, 1); + /* + * workload type is changed, don't save slice, otherwise preempt + * doesn't happen + */ + if (cfqq_type(old_cfqq) != cfqq_type(cfqq)) + cfqq->cfqg->saved_workload_slice = 0; + /* * Put the new queue at the front of the of the current list, * so we know that it will be selected next. -- cgit v1.2.2 From c553f8e335c00a7cff3ab3f13e793b13d3f2207f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 14 Jan 2011 08:41:03 +0100 Subject: block cfq: compensate preempted queue even if it has no slice assigned If a queue is preempted before it gets slice assigned, the queue doesn't get compensation, which looks unfair. For such queue, we compensate it for a whole slice. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7bfea53c1bb5..501ffdf0399c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -598,8 +598,8 @@ cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) return cfq_target_latency * cfqg->weight / st->total_weight; } -static inline void -cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static inline unsigned +cfq_scaled_group_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { unsigned slice = cfq_prio_to_slice(cfqd, cfqq); if (cfqd->cfq_latency) { @@ -625,6 +625,14 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) low_slice); } } + return slice; +} + +static inline void +cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + unsigned slice = cfq_scaled_group_slice(cfqd, cfqq); + cfqq->slice_start = jiffies; cfqq->slice_end = jiffies + slice; cfqq->allocated_slice = slice; @@ -1661,8 +1669,11 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, /* * store what was left of this slice, if the queue idled/timed out */ - if (timed_out && !cfq_cfqq_slice_new(cfqq)) { - cfqq->slice_resid = cfqq->slice_end - jiffies; + if (timed_out) { + if (cfq_cfqq_slice_new(cfqq)) + cfqq->slice_resid = cfq_scaled_group_slice(cfqd, cfqq); + else + cfqq->slice_resid = cfqq->slice_end - jiffies; cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); } -- cgit v1.2.2 From ba5bd520f679c450fb6efa439618703bd0956daa Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 19 Jan 2011 08:25:02 -0700 Subject: cfq: rename a function to give it more appropriate name o Rename a function to give it more approprate name. We are calculating cfq queue slice and function name gives the impression as if cfq group slice length is being calculated. 
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 501ffdf0399c..ace168657136 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -599,7 +599,7 @@ cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) } static inline unsigned -cfq_scaled_group_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) +cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { unsigned slice = cfq_prio_to_slice(cfqd, cfqq); if (cfqd->cfq_latency) { @@ -631,7 +631,7 @@ cfq_scaled_group_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) static inline void cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - unsigned slice = cfq_scaled_group_slice(cfqd, cfqq); + unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq); cfqq->slice_start = jiffies; cfqq->slice_end = jiffies + slice; @@ -1671,7 +1671,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, */ if (timed_out) { if (cfq_cfqq_slice_new(cfqq)) - cfqq->slice_resid = cfq_scaled_group_slice(cfqd, cfqq); + cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq); else cfqq->slice_resid = cfqq->slice_end - jiffies; cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); -- cgit v1.2.2 From be2c6b1990904dbd43f3d9b90fa2c530504375cd Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 19 Jan 2011 08:25:02 -0700 Subject: blkio-throttle: Avoid calling blkiocg_lookup_group() for root group o Jeff Moyer was doing some testing on a RAM backed disk and blkiocg_lookup_group() showed up high overhead after memcpy(). Similarly somebody else reported that blkiocg_lookup_group() is eating 6% extra cpu. Though looking at the code I can't think why the overhead of this function is so high. One thing is that it is called with very high frequency (once for every IO). o For lot of folks blkio controller will be compiled in but they might not have actually created cgroups. Hence optimize the case of root cgroup where we can avoid calling blkiocg_lookup_group() if IO is happening in root group (common case). Reported-by: Jeff Moyer Signed-off-by: Vivek Goyal Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/blk-throttle.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 381b09bb562b..a89043a3caa4 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -168,7 +168,15 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, * tree of blkg (instead of traversing through hash list all * the time. */ - tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); + + /* + * This is the common case when there are no blkio cgroups. + * Avoid lookup in this case + */ + if (blkcg == &blkio_root_cgroup) + tg = &td->root_tg; + else + tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); /* Fill in device details for root group */ if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { -- cgit v1.2.2 From 6a108a14fa356ef607be308b68337939e56ea94e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 20 Jan 2011 14:44:16 -0800 Subject: kconfig: rename CONFIG_EMBEDDED to CONFIG_EXPERT The meaning of CONFIG_EMBEDDED has long since been obsoleted; the option is used to configure any non-standard kernel with a much larger scope than only small devices. 
This patch renames the option to CONFIG_EXPERT in init/Kconfig and fixes references to the option throughout the kernel. A new CONFIG_EMBEDDED option is added that automatically selects CONFIG_EXPERT when enabled and can be used in the future to isolate options that should only be considered for embedded systems (RISC architectures, SLOB, etc). Calling the option "EXPERT" more accurately represents its intention: only expert users who understand the impact of the configuration changes they are making should enable it. Reviewed-by: Ingo Molnar Acked-by: David Woodhouse Signed-off-by: David Rientjes Cc: Greg KH Cc: "David S. Miller" Cc: Jens Axboe Cc: Arnd Bergmann Cc: Robin Holt Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 6c9213ef15a1..60be1e0455da 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -2,7 +2,7 @@ # Block layer core configuration # menuconfig BLOCK - bool "Enable the block layer" if EMBEDDED + bool "Enable the block layer" if EXPERT default y help Provide block layer support for the kernel. -- cgit v1.2.2 From 414b4ff5eecff0097d09c4a7da12e435fd503692 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Jan 2011 12:43:49 +0100 Subject: block: add REQ_FLUSH_SEQ rq == &q->flush_rq was used to determine whether a rq is part of a flush sequence, which worked because all requests in a flush sequence were sequenced using the single dedicated request. This is about to change, so introduce REQ_FLUSH_SEQ flag to distinguish flush sequence requests. This patch doesn't cause any behavior change. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++-- block/blk-flush.c | 1 + block/blk.h | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 4ce953f1b390..fc7d8ad76f44 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -136,7 +136,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio, { struct request_queue *q = rq->q; - if (&q->flush_rq != rq) { + if (!(rq->cmd_flags & REQ_FLUSH_SEQ)) { if (error) clear_bit(BIO_UPTODATE, &bio->bi_flags); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) @@ -1789,7 +1789,7 @@ static void blk_account_io_done(struct request *req) * normal IO on queueing nor completion. Accounting the * containing request is enough. 
*/ - if (blk_do_io_stat(req) && req != &req->q->flush_rq) { + if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) { unsigned long duration = jiffies - req->start_time; const int rw = rq_data_dir(req); struct hd_struct *part; diff --git a/block/blk-flush.c b/block/blk-flush.c index 54b123d6563e..8592869bcbe7 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -130,6 +130,7 @@ static struct request *queue_next_fseq(struct request_queue *q) BUG(); } + rq->cmd_flags |= REQ_FLUSH_SEQ; elv_insert(q, rq, ELEVATOR_INSERT_FRONT); return rq; } diff --git a/block/blk.h b/block/blk.h index 2db8f32838e7..9d2ee8f4d9af 100644 --- a/block/blk.h +++ b/block/blk.h @@ -61,7 +61,7 @@ static inline struct request *__elv_next_request(struct request_queue *q) while (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) || - rq == &q->flush_rq) + (rq->cmd_flags & REQ_FLUSH_SEQ)) return rq; rq = blk_do_flush(q, rq); if (rq) -- cgit v1.2.2 From 143a87f4c9c629067afea5b6703d66ea88c82f8e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Jan 2011 12:43:52 +0100 Subject: block: improve flush bio completion bio's for flush are completed twice - once during the data phase and one more time after the whole sequence is complete. The first completion shouldn't notify completion to the issuer. This was achieved by skipping all bio completion steps in req_bio_endio() for the first completion; however, this has two drawbacks. * Error is not recorded in bio and must be tracked somewhere else. * Partial completion is not supported. Both don't cause problems for the current users; however, they make further improvements difficult. Change req_bio_endio() such that it only skips the actual notification part for the first completion. bio completion is implemented with partial completions on mind anyway so this is as simple as moving the REQ_FLUSH_SEQ conditional such that only calling of bio_endio() is skipped. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 48 +++++++++++++++++++++--------------------------- 1 file changed, 21 insertions(+), 27 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index fc7d8ad76f44..617bb9e40927 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -136,37 +136,31 @@ static void req_bio_endio(struct request *rq, struct bio *bio, { struct request_queue *q = rq->q; - if (!(rq->cmd_flags & REQ_FLUSH_SEQ)) { - if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; - - if (unlikely(nbytes > bio->bi_size)) { - printk(KERN_ERR "%s: want %u bytes done, %u left\n", - __func__, nbytes, bio->bi_size); - nbytes = bio->bi_size; - } + if (error) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = -EIO; + + if (unlikely(nbytes > bio->bi_size)) { + printk(KERN_ERR "%s: want %u bytes done, %u left\n", + __func__, nbytes, bio->bi_size); + nbytes = bio->bi_size; + } - if (unlikely(rq->cmd_flags & REQ_QUIET)) - set_bit(BIO_QUIET, &bio->bi_flags); + if (unlikely(rq->cmd_flags & REQ_QUIET)) + set_bit(BIO_QUIET, &bio->bi_flags); - bio->bi_size -= nbytes; - bio->bi_sector += (nbytes >> 9); + bio->bi_size -= nbytes; + bio->bi_sector += (nbytes >> 9); - if (bio_integrity(bio)) - bio_integrity_advance(bio, nbytes); + if (bio_integrity(bio)) + bio_integrity_advance(bio, nbytes); - if (bio->bi_size == 0) - bio_endio(bio, error); - } else { - /* - * Okay, this is the sequenced flush request in - * progress, just record the error; - */ - if (error && !q->flush_err) - q->flush_err = error; - } + /* don't actually finish bio if it's part of flush sequence */ + if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) + bio_endio(bio, error); + else if (error && !q->flush_err) + q->flush_err = error; } void blk_dump_rq_flags(struct request *rq, char *msg) -- cgit v1.2.2 From ae1b1539622fb46e51b4d13b3f9e5f4c713f86ae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Jan 2011 12:43:54 +0100 Subject: block: reimplement FLUSH/FUA to support merge The current FLUSH/FUA support has evolved from the implementation which had to perform queue draining. As such, sequencing is done queue-wide one flush request after another. However, with the draining requirement gone, there's no reason to keep the queue-wide sequential approach. This patch reimplements FLUSH/FUA support such that each FLUSH/FUA request is sequenced individually. The actual FLUSH execution is double buffered and whenever a request wants to execute one for either PRE or POSTFLUSH, it queues on the pending queue. Once certain conditions are met, a flush request is issued and on its completion all pending requests proceed to the next sequence. This allows arbitrary merging of different type of flushes. How they are merged can be primarily controlled and tuned by adjusting the above said 'conditions' used to determine when to issue the next flush. This is inspired by Darrick's patches to merge multiple zero-data flushes which helps workloads with highly concurrent fsync requests. * As flush requests are never put on the IO scheduler, request fields used for flush share space with rq->rb_node. rq->completion_data is moved out of the union. This increases the request size by one pointer. As rq->elevator_private* are used only by the iosched too, it is possible to reduce the request size further. 
However, to do that, we need to modify request allocation path such that iosched data is not allocated for flush requests. * FLUSH/FUA processing happens on insertion now instead of dispatch. - Comments updated as per Vivek and Mike. Signed-off-by: Tejun Heo Cc: "Darrick J. Wong" Cc: Shaohua Li Cc: Christoph Hellwig Cc: Vivek Goyal Cc: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 10 +- block/blk-flush.c | 440 +++++++++++++++++++++++++++++++++++++----------------- block/blk.h | 12 +- block/elevator.c | 7 + 4 files changed, 319 insertions(+), 150 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 617bb9e40927..05746093b45e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -134,8 +134,6 @@ EXPORT_SYMBOL(blk_rq_init); static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, int error) { - struct request_queue *q = rq->q; - if (error) clear_bit(BIO_UPTODATE, &bio->bi_flags); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) @@ -159,8 +157,6 @@ static void req_bio_endio(struct request *rq, struct bio *bio, /* don't actually finish bio if it's part of flush sequence */ if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) bio_endio(bio, error); - else if (error && !q->flush_err) - q->flush_err = error; } void blk_dump_rq_flags(struct request *rq, char *msg) @@ -519,7 +515,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) init_timer(&q->unplug_timer); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); INIT_LIST_HEAD(&q->timeout_list); - INIT_LIST_HEAD(&q->pending_flushes); + INIT_LIST_HEAD(&q->flush_queue[0]); + INIT_LIST_HEAD(&q->flush_queue[1]); + INIT_LIST_HEAD(&q->flush_data_in_flight); INIT_WORK(&q->unplug_work, blk_unplug_work); kobject_init(&q->kobj, &blk_queue_ktype); @@ -1198,7 +1196,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) spin_lock_irq(q->queue_lock); if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { - where = ELEVATOR_INSERT_FRONT; + where = ELEVATOR_INSERT_FLUSH; goto get_rq; } diff --git a/block/blk-flush.c b/block/blk-flush.c index 8592869bcbe7..a867e3f524f3 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -1,6 +1,69 @@ /* * Functions to sequence FLUSH and FUA writes. + * + * Copyright (C) 2011 Max Planck Institute for Gravitational Physics + * Copyright (C) 2011 Tejun Heo + * + * This file is released under the GPLv2. + * + * REQ_{FLUSH|FUA} requests are decomposed to sequences consisted of three + * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request + * properties and hardware capability. + * + * If a request doesn't have data, only REQ_FLUSH makes sense, which + * indicates a simple flush request. If there is data, REQ_FLUSH indicates + * that the device cache should be flushed before the data is executed, and + * REQ_FUA means that the data must be on non-volatile media on request + * completion. + * + * If the device doesn't have writeback cache, FLUSH and FUA don't make any + * difference. The requests are either completed immediately if there's no + * data or executed as normal requests otherwise. + * + * If the device has writeback cache and supports FUA, REQ_FLUSH is + * translated to PREFLUSH but REQ_FUA is passed down directly with DATA. + * + * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is + * translated to PREFLUSH and REQ_FUA to POSTFLUSH. + * + * The actual execution of flush is double buffered. 
Whenever a request + * needs to execute PRE or POSTFLUSH, it queues at + * q->flush_queue[q->flush_pending_idx]. Once certain criteria are met, a + * flush is issued and the pending_idx is toggled. When the flush + * completes, all the requests which were pending are proceeded to the next + * step. This allows arbitrary merging of different types of FLUSH/FUA + * requests. + * + * Currently, the following conditions are used to determine when to issue + * flush. + * + * C1. At any given time, only one flush shall be in progress. This makes + * double buffering sufficient. + * + * C2. Flush is deferred if any request is executing DATA of its sequence. + * This avoids issuing separate POSTFLUSHes for requests which shared + * PREFLUSH. + * + * C3. The second condition is ignored if there is a request which has + * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid + * starvation in the unlikely case where there are continuous stream of + * FUA (without FLUSH) requests. + * + * For devices which support FUA, it isn't clear whether C2 (and thus C3) + * is beneficial. + * + * Note that a sequenced FLUSH/FUA request with DATA is completed twice. + * Once while executing DATA and again after the whole sequence is + * complete. The first completion updates the contained bio but doesn't + * finish it so that the bio submitter is notified only after the whole + * sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in + * req_bio_endio(). + * + * The above peculiarity requires that each FLUSH/FUA request has only one + * bio attached to it, which is guaranteed as they aren't allowed to be + * merged in the usual way. */ + #include #include #include @@ -11,185 +74,290 @@ /* FLUSH/FUA sequences */ enum { - QUEUE_FSEQ_STARTED = (1 << 0), /* flushing in progress */ - QUEUE_FSEQ_PREFLUSH = (1 << 1), /* pre-flushing in progress */ - QUEUE_FSEQ_DATA = (1 << 2), /* data write in progress */ - QUEUE_FSEQ_POSTFLUSH = (1 << 3), /* post-flushing in progress */ - QUEUE_FSEQ_DONE = (1 << 4), + REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */ + REQ_FSEQ_DATA = (1 << 1), /* data write in progress */ + REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */ + REQ_FSEQ_DONE = (1 << 3), + + REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | + REQ_FSEQ_POSTFLUSH, + + /* + * If flush has been pending longer than the following timeout, + * it's issued even if flush_data requests are still in flight. 
+ */ + FLUSH_PENDING_TIMEOUT = 5 * HZ, }; -static struct request *queue_next_fseq(struct request_queue *q); +static bool blk_kick_flush(struct request_queue *q); -unsigned blk_flush_cur_seq(struct request_queue *q) +static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq) { - if (!q->flush_seq) - return 0; - return 1 << ffz(q->flush_seq); + unsigned int policy = 0; + + if (fflags & REQ_FLUSH) { + if (rq->cmd_flags & REQ_FLUSH) + policy |= REQ_FSEQ_PREFLUSH; + if (blk_rq_sectors(rq)) + policy |= REQ_FSEQ_DATA; + if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA)) + policy |= REQ_FSEQ_POSTFLUSH; + } + return policy; } -static struct request *blk_flush_complete_seq(struct request_queue *q, - unsigned seq, int error) +static unsigned int blk_flush_cur_seq(struct request *rq) { - struct request *next_rq = NULL; - - if (error && !q->flush_err) - q->flush_err = error; - - BUG_ON(q->flush_seq & seq); - q->flush_seq |= seq; - - if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) { - /* not complete yet, queue the next flush sequence */ - next_rq = queue_next_fseq(q); - } else { - /* complete this flush request */ - __blk_end_request_all(q->orig_flush_rq, q->flush_err); - q->orig_flush_rq = NULL; - q->flush_seq = 0; - - /* dispatch the next flush if there's one */ - if (!list_empty(&q->pending_flushes)) { - next_rq = list_entry_rq(q->pending_flushes.next); - list_move(&next_rq->queuelist, &q->queue_head); - } - } - return next_rq; + return 1 << ffz(rq->flush.seq); } -static void blk_flush_complete_seq_end_io(struct request_queue *q, - unsigned seq, int error) +static void blk_flush_restore_request(struct request *rq) { - bool was_empty = elv_queue_empty(q); - struct request *next_rq; - - next_rq = blk_flush_complete_seq(q, seq, error); - /* - * Moving a request silently to empty queue_head may stall the - * queue. Kick the queue in those cases. + * After flush data completion, @rq->bio is %NULL but we need to + * complete the bio again. @rq->biotail is guaranteed to equal the + * original @rq->bio. Restore it. */ - if (was_empty && next_rq) - __blk_run_queue(q); + rq->bio = rq->biotail; + + /* make @rq a normal request */ + rq->cmd_flags &= ~REQ_FLUSH_SEQ; + rq->end_io = NULL; } -static void pre_flush_end_io(struct request *rq, int error) +/** + * blk_flush_complete_seq - complete flush sequence + * @rq: FLUSH/FUA request being sequenced + * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero) + * @error: whether an error occurred + * + * @rq just completed @seq part of its flush sequence, record the + * completion and trigger the next step. + * + * CONTEXT: + * spin_lock_irq(q->queue_lock) + * + * RETURNS: + * %true if requests were added to the dispatch queue, %false otherwise. 
+ */ +static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, + int error) { - elv_completed_request(rq->q, rq); - blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error); + struct request_queue *q = rq->q; + struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; + bool queued = false; + + BUG_ON(rq->flush.seq & seq); + rq->flush.seq |= seq; + + if (likely(!error)) + seq = blk_flush_cur_seq(rq); + else + seq = REQ_FSEQ_DONE; + + switch (seq) { + case REQ_FSEQ_PREFLUSH: + case REQ_FSEQ_POSTFLUSH: + /* queue for flush */ + if (list_empty(pending)) + q->flush_pending_since = jiffies; + list_move_tail(&rq->flush.list, pending); + break; + + case REQ_FSEQ_DATA: + list_move_tail(&rq->flush.list, &q->flush_data_in_flight); + list_add(&rq->queuelist, &q->queue_head); + queued = true; + break; + + case REQ_FSEQ_DONE: + /* + * @rq was previously adjusted by blk_flush_issue() for + * flush sequencing and may already have gone through the + * flush data request completion path. Restore @rq for + * normal completion and end it. + */ + BUG_ON(!list_empty(&rq->queuelist)); + list_del_init(&rq->flush.list); + blk_flush_restore_request(rq); + __blk_end_request_all(rq, error); + break; + + default: + BUG(); + } + + return blk_kick_flush(q) | queued; } -static void flush_data_end_io(struct request *rq, int error) +static void flush_end_io(struct request *flush_rq, int error) { - elv_completed_request(rq->q, rq); - blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error); + struct request_queue *q = flush_rq->q; + struct list_head *running = &q->flush_queue[q->flush_running_idx]; + bool was_empty = elv_queue_empty(q); + bool queued = false; + struct request *rq, *n; + + BUG_ON(q->flush_pending_idx == q->flush_running_idx); + + /* account completion of the flush request */ + q->flush_running_idx ^= 1; + elv_completed_request(q, flush_rq); + + /* and push the waiting requests to the next stage */ + list_for_each_entry_safe(rq, n, running, flush.list) { + unsigned int seq = blk_flush_cur_seq(rq); + + BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); + queued |= blk_flush_complete_seq(rq, seq, error); + } + + /* after populating an empty queue, kick it to avoid stall */ + if (queued && was_empty) + __blk_run_queue(q); } -static void post_flush_end_io(struct request *rq, int error) +/** + * blk_kick_flush - consider issuing flush request + * @q: request_queue being kicked + * + * Flush related states of @q have changed, consider issuing flush request. + * Please read the comment at the top of this file for more info. + * + * CONTEXT: + * spin_lock_irq(q->queue_lock) + * + * RETURNS: + * %true if flush was issued, %false otherwise. + */ +static bool blk_kick_flush(struct request_queue *q) { - elv_completed_request(rq->q, rq); - blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error); + struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; + struct request *first_rq = + list_first_entry(pending, struct request, flush.list); + + /* C1 described at the top of this file */ + if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending)) + return false; + + /* C2 and C3 */ + if (!list_empty(&q->flush_data_in_flight) && + time_before(jiffies, + q->flush_pending_since + FLUSH_PENDING_TIMEOUT)) + return false; + + /* + * Issue flush and toggle pending_idx. This makes pending_idx + * different from running_idx, which means flush is in flight. 
+ */ + blk_rq_init(q, &q->flush_rq); + q->flush_rq.cmd_type = REQ_TYPE_FS; + q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; + q->flush_rq.rq_disk = first_rq->rq_disk; + q->flush_rq.end_io = flush_end_io; + + q->flush_pending_idx ^= 1; + elv_insert(q, &q->flush_rq, ELEVATOR_INSERT_FRONT); + return true; } -static void init_flush_request(struct request *rq, struct gendisk *disk) +static void flush_data_end_io(struct request *rq, int error) { - rq->cmd_type = REQ_TYPE_FS; - rq->cmd_flags = WRITE_FLUSH; - rq->rq_disk = disk; + struct request_queue *q = rq->q; + bool was_empty = elv_queue_empty(q); + + /* after populating an empty queue, kick it to avoid stall */ + if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error) && was_empty) + __blk_run_queue(q); } -static struct request *queue_next_fseq(struct request_queue *q) +/** + * blk_insert_flush - insert a new FLUSH/FUA request + * @rq: request to insert + * + * To be called from elv_insert() for %ELEVATOR_INSERT_FLUSH insertions. + * @rq is being submitted. Analyze what needs to be done and put it on the + * right queue. + * + * CONTEXT: + * spin_lock_irq(q->queue_lock) + */ +void blk_insert_flush(struct request *rq) { - struct request *orig_rq = q->orig_flush_rq; - struct request *rq = &q->flush_rq; + struct request_queue *q = rq->q; + unsigned int fflags = q->flush_flags; /* may change, cache */ + unsigned int policy = blk_flush_policy(fflags, rq); - blk_rq_init(q, rq); + BUG_ON(rq->end_io); + BUG_ON(!rq->bio || rq->bio != rq->biotail); - switch (blk_flush_cur_seq(q)) { - case QUEUE_FSEQ_PREFLUSH: - init_flush_request(rq, orig_rq->rq_disk); - rq->end_io = pre_flush_end_io; - break; - case QUEUE_FSEQ_DATA: - init_request_from_bio(rq, orig_rq->bio); - /* - * orig_rq->rq_disk may be different from - * bio->bi_bdev->bd_disk if orig_rq got here through - * remapping drivers. Make sure rq->rq_disk points - * to the same one as orig_rq. - */ - rq->rq_disk = orig_rq->rq_disk; - rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA); - rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA); - rq->end_io = flush_data_end_io; - break; - case QUEUE_FSEQ_POSTFLUSH: - init_flush_request(rq, orig_rq->rq_disk); - rq->end_io = post_flush_end_io; - break; - default: - BUG(); + /* + * @policy now records what operations need to be done. Adjust + * REQ_FLUSH and FUA for the driver. + */ + rq->cmd_flags &= ~REQ_FLUSH; + if (!(fflags & REQ_FUA)) + rq->cmd_flags &= ~REQ_FUA; + + /* + * If there's data but flush is not necessary, the request can be + * processed directly without going through flush machinery. Queue + * for normal execution. + */ + if ((policy & REQ_FSEQ_DATA) && + !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { + list_add(&rq->queuelist, &q->queue_head); + return; } + /* + * @rq should go through flush machinery. Mark it part of flush + * sequence and submit for further processing. + */ + memset(&rq->flush, 0, sizeof(rq->flush)); + INIT_LIST_HEAD(&rq->flush.list); rq->cmd_flags |= REQ_FLUSH_SEQ; - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); - return rq; + rq->end_io = flush_data_end_io; + + blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); } -struct request *blk_do_flush(struct request_queue *q, struct request *rq) +/** + * blk_abort_flushes - @q is being aborted, abort flush requests + * @q: request_queue being aborted + * + * To be called from elv_abort_queue(). @q is being aborted. Prepare all + * FLUSH/FUA requests for abortion. 
+ * + * CONTEXT: + * spin_lock_irq(q->queue_lock) + */ +void blk_abort_flushes(struct request_queue *q) { - unsigned int fflags = q->flush_flags; /* may change, cache it */ - bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA; - bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH); - bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA); - unsigned skip = 0; + struct request *rq, *n; + int i; /* - * Special case. If there's data but flush is not necessary, - * the request can be issued directly. - * - * Flush w/o data should be able to be issued directly too but - * currently some drivers assume that rq->bio contains - * non-zero data if it isn't NULL and empty FLUSH requests - * getting here usually have bio's without data. + * Requests in flight for data are already owned by the dispatch + * queue or the device driver. Just restore for normal completion. */ - if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) { - rq->cmd_flags &= ~REQ_FLUSH; - if (!has_fua) - rq->cmd_flags &= ~REQ_FUA; - return rq; + list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) { + list_del_init(&rq->flush.list); + blk_flush_restore_request(rq); } /* - * Sequenced flushes can't be processed in parallel. If - * another one is already in progress, queue for later - * processing. + * We need to give away requests on flush queues. Restore for + * normal completion and put them on the dispatch queue. */ - if (q->flush_seq) { - list_move_tail(&rq->queuelist, &q->pending_flushes); - return NULL; + for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) { + list_for_each_entry_safe(rq, n, &q->flush_queue[i], + flush.list) { + list_del_init(&rq->flush.list); + blk_flush_restore_request(rq); + list_add_tail(&rq->queuelist, &q->queue_head); + } } - - /* - * Start a new flush sequence - */ - q->flush_err = 0; - q->flush_seq |= QUEUE_FSEQ_STARTED; - - /* adjust FLUSH/FUA of the original request and stash it away */ - rq->cmd_flags &= ~REQ_FLUSH; - if (!has_fua) - rq->cmd_flags &= ~REQ_FUA; - blk_dequeue_request(rq); - q->orig_flush_rq = rq; - - /* skip unneded sequences and return the first one */ - if (!do_preflush) - skip |= QUEUE_FSEQ_PREFLUSH; - if (!blk_rq_sectors(rq)) - skip |= QUEUE_FSEQ_DATA; - if (!do_postflush) - skip |= QUEUE_FSEQ_POSTFLUSH; - return blk_flush_complete_seq(q, skip, 0); } static void bio_end_flush(struct bio *bio, int err) diff --git a/block/blk.h b/block/blk.h index 9d2ee8f4d9af..284b500852bd 100644 --- a/block/blk.h +++ b/block/blk.h @@ -51,21 +51,17 @@ static inline void blk_clear_rq_complete(struct request *rq) */ #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) -struct request *blk_do_flush(struct request_queue *q, struct request *rq); +void blk_insert_flush(struct request *rq); +void blk_abort_flushes(struct request_queue *q); static inline struct request *__elv_next_request(struct request_queue *q) { struct request *rq; while (1) { - while (!list_empty(&q->queue_head)) { + if (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); - if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) || - (rq->cmd_flags & REQ_FLUSH_SEQ)) - return rq; - rq = blk_do_flush(q, rq); - if (rq) - return rq; + return rq; } if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) diff --git a/block/elevator.c b/block/elevator.c index 2569512830d3..270e0972eb9f 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -673,6 +673,11 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) q->elevator->ops->elevator_add_req_fn(q, rq); break; + 
case ELEVATOR_INSERT_FLUSH: + rq->cmd_flags |= REQ_SOFTBARRIER; + blk_insert_flush(rq); + break; + default: printk(KERN_ERR "%s: bad insertion point %d\n", __func__, where); @@ -785,6 +790,8 @@ void elv_abort_queue(struct request_queue *q) { struct request *rq; + blk_abort_flushes(q); + while (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); rq->cmd_flags |= REQ_QUIET; -- cgit v1.2.2 From 02a8f01b5a9f396d0327977af4c232d0f94c45fd Mon Sep 17 00:00:00 2001 From: Justin TerAvest Date: Wed, 9 Feb 2011 14:20:03 +0100 Subject: cfq-iosched: Don't wait if queue already has requests. Commit 7667aa0630407bc07dc38dcc79d29cc0a65553c1 added logic to wait for the last queue of the group to become busy (have at least one request), so that the group does not lose out for not being continuously backlogged. The commit did not check for the condition that the last queue already has some requests. As a result, if the queue already has requests, wait_busy is set. Later on, cfq_select_queue() checks the flag, and decides that since the queue has a request now and wait_busy is set, the queue is expired. This results in early expiration of the queue. This patch fixes the problem by adding a check to see if queue already has requests. If it does, wait_busy is not set. As a result, time slices do not expire early. The queues with more than one request are usually buffered writers. Testing shows improvement in isolation between buffered writers. Cc: stable@kernel.org Signed-off-by: Justin TerAvest Reviewed-by: Gui Jianfeng Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ace168657136..7be4c7959625 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3432,6 +3432,10 @@ static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) { struct cfq_io_context *cic = cfqd->active_cic; + /* If the queue already has requests, don't wait */ + if (!RB_EMPTY_ROOT(&cfqq->sort_list)) + return false; + /* If there are other queues in the group, don't wait */ if (cfqq->cfqg->nr_cfqq > 1) return false; -- cgit v1.2.2 From 9d5a4e946ce5352f19400b6370f4cd8e72806278 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 11 Feb 2011 11:05:46 +0100 Subject: block: skip elevator data initialization for flush requests Skip elevator initialization for flush requests by passing priv=0 to blk_alloc_request() in get_request(). As such elv_set_request() is never called for flush requests. Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 05746093b45e..ab4a7696956d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -737,6 +737,25 @@ static void freed_request(struct request_queue *q, int sync, int priv) __freed_request(q, sync ^ 1); } +/* + * Determine if elevator data should be initialized when allocating the + * request associated with @bio. + */ +static bool blk_rq_should_init_elevator(struct bio *bio) +{ + if (!bio) + return true; + + /* + * Flush requests do not use the elevator so skip initialization. + * This allows a request to share the flush and elevator data. + */ + if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) + return false; + + return true; +} + /* * Get a free request, queue_lock must be held. * Returns NULL on failure, with queue_lock held. 
@@ -749,7 +768,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, struct request_list *rl = &q->rq; struct io_context *ioc = NULL; const bool is_sync = rw_is_sync(rw_flags) != 0; - int may_queue, priv; + int may_queue, priv = 0; may_queue = elv_may_queue(q, rw_flags); if (may_queue == ELV_MQUEUE_NO) @@ -793,9 +812,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags, rl->count[is_sync]++; rl->starved[is_sync] = 0; - priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - if (priv) - rl->elvpriv++; + if (blk_rq_should_init_elevator(bio)) { + priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); + if (priv) + rl->elvpriv++; + } if (blk_queue_io_stat(q)) rw_flags |= REQ_IO_STAT; -- cgit v1.2.2 From c186794dbb466b45cf40f942f2d09d6d5b4b0e42 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 11 Feb 2011 11:08:00 +0100 Subject: block: share request flush fields with elevator_private Flush requests are never put on the IO scheduler. Convert request structure's elevator_private* into an array and have the flush fields share a union with it. Reclaim the space lost in 'struct request' by moving 'completion_data' back in the union with 'rb_node'. Signed-off-by: Mike Snitzer Acked-by: Vivek Goyal Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 18 +++++++++--------- block/elevator.c | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4cd59b0d7c15..968455c57e1a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -54,9 +54,9 @@ static const int cfq_hist_divisor = 4; #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) #define RQ_CIC(rq) \ - ((struct cfq_io_context *) (rq)->elevator_private) -#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) -#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) + ((struct cfq_io_context *) (rq)->elevator_private[0]) +#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1]) +#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2]) static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; @@ -3589,12 +3589,12 @@ static void cfq_put_request(struct request *rq) put_io_context(RQ_CIC(rq)->ioc); - rq->elevator_private = NULL; - rq->elevator_private2 = NULL; + rq->elevator_private[0] = NULL; + rq->elevator_private[1] = NULL; /* Put down rq reference on cfqg */ cfq_put_cfqg(RQ_CFQG(rq)); - rq->elevator_private3 = NULL; + rq->elevator_private[2] = NULL; cfq_put_queue(cfqq); } @@ -3685,9 +3685,9 @@ new_queue: spin_unlock_irqrestore(q->queue_lock, flags); - rq->elevator_private = cic; - rq->elevator_private2 = cfqq; - rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); + rq->elevator_private[0] = cic; + rq->elevator_private[1] = cfqq; + rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg); return 0; queue_fail: diff --git a/block/elevator.c b/block/elevator.c index 270e0972eb9f..f98e92edc937 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -764,7 +764,7 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) if (e->ops->elevator_set_req_fn) return e->ops->elevator_set_req_fn(q, rq, gfp_mask); - rq->elevator_private = NULL; + rq->elevator_private[0] = NULL; return 0; } -- cgit v1.2.2 From 79775567e0439ca47eb9f501e52c4b713d44cf89 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 18 Jan 2011 10:13:13 +0100 Subject: [SCSI] block: improve detail in I/O error messages Classify 
severity of I/O errors for target, nexus, and transport errors. Signed-off-by: Mike Snitzer Signed-off-by: Hannes Reinecke Acked-by: Jens Axboe Signed-off-by: James Bottomley --- block/blk-core.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 2f4002f79a24..b3cf121f1de9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2044,9 +2044,26 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) if (error && req->cmd_type == REQ_TYPE_FS && !(req->cmd_flags & REQ_QUIET)) { - printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n", - req->rq_disk ? req->rq_disk->disk_name : "?", - (unsigned long long)blk_rq_pos(req)); + char *error_type; + + switch (error) { + case -ENOLINK: + error_type = "recoverable transport"; + break; + case -EREMOTEIO: + error_type = "critical target"; + break; + case -EBADE: + error_type = "critical nexus"; + break; + case -EIO: + default: + error_type = "I/O"; + break; + } + printk(KERN_ERR "end_request: %s error, dev %s, sector %llu\n", + error_type, req->rq_disk ? req->rq_disk->disk_name : "?", + (unsigned long long)blk_rq_pos(req)); } blk_account_io_completion(req, nr_bytes); -- cgit v1.2.2 From 93b270f76e7ef3b81001576860c2701931cdc78b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 24 Feb 2011 17:25:47 +1100 Subject: Fix over-zealous flush_disk when changing device size. There are two cases when we call flush_disk. In one, the device has disappeared (check_disk_change) so any data we hold becomes irrelevant. In the other, the device has changed size (check_disk_size_change) so data we hold may be irrelevant. In both cases it makes sense to discard any 'clean' buffers, so they will be read back from the device if needed. In the former case it makes sense to discard 'dirty' buffers as there will never be anywhere safe to write the data. In the second case it *does*not* make sense to discard dirty buffers as that will lead to file system corruption when you simply enlarge the containing device. flush_disk calls __invalidate_device. __invalidate_device calls both invalidate_inodes and invalidate_bdev. invalidate_inodes *does* discard I_DIRTY inodes and this does lead to fs corruption. invalidate_bdev *does*not* discard dirty pages, but I don't really care about that at present. So this patch adds a flag to __invalidate_device (calling it __invalidate_device2) to indicate whether dirty buffers should be killed, and this is passed to invalidate_inodes which can choose to skip dirty inodes. flush_disk then passes true from check_disk_change and false from check_disk_size_change. dm avoids tripping over this problem by calling i_size_write directly rather than using check_disk_size_change. md does use check_disk_size_change and so is affected. This regression was introduced by commit 608aeef17a which causes check_disk_size_change to call flush_disk, so it is suitable for any kernel since 2.6.27.
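As an aside, the behaviour described above can be modelled in a few lines of plain C. The sketch below is a userspace stand-in only (the names invalidate_device and flush_disk are illustrative, not the kernel functions themselves): the caller decides whether dirty buffers may be discarded and forwards that choice to the invalidation step.

/*
 * Userspace model only -- not kernel code.  The function names are
 * illustrative stand-ins for the kernel functions discussed above.
 */
#include <stdbool.h>
#include <stdio.h>

/* stand-in for __invalidate_device(bdev, kill_dirty) */
static void invalidate_device(const char *bdev, bool kill_dirty)
{
	printf("%s: drop clean buffers%s\n", bdev,
	       kill_dirty ? " and dirty buffers" : ", keep dirty buffers");
}

/* stand-in for flush_disk(); kill_dirty mirrors the new flag */
static void flush_disk(const char *bdev, bool kill_dirty)
{
	invalidate_device(bdev, kill_dirty);
}

int main(void)
{
	flush_disk("sdb", true);   /* device disappeared (check_disk_change) */
	flush_disk("md0", false);  /* device only resized (check_disk_size_change) */
	return 0;
}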
Cc: stable@kernel.org Acked-by: Jeff Moyer Cc: Andrew Patterson Cc: Jens Axboe Signed-off-by: NeilBrown --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 6a5b772aa201..cbf1112a885c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1355,7 +1355,7 @@ int invalidate_partition(struct gendisk *disk, int partno) struct block_device *bdev = bdget_disk(disk, partno); if (bdev) { fsync_bdev(bdev); - res = __invalidate_device(bdev); + res = __invalidate_device(bdev, true); bdput(bdev); } return res; -- cgit v1.2.2 From 3c522cedb572bb8d2e4867f358bdaa7d0c53d88c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 24 Feb 2011 15:45:41 +0100 Subject: block: fix refcounting in BLKBSZSET Adam Kovari and others reported that disconnecting a USB drive with an ntfs-3g filesystem would cause "kernel BUG at fs/inode.c:1421!" to be triggered. The BUG could be traced back to ioctl(BLKBSZSET), which would erroneously decrement the refcount on the bdev. This is because blkdev_get() expects the refcount to be already incremented and either returns success or decrements the refcount and returns an error. The bug was introduced by e525fd89 (block: make blkdev_get/put() handle exclusive access), which didn't take into account this behavior of blkdev_get(). This fixes https://bugzilla.kernel.org/show_bug.cgi?id=29202 (and likely 29792 too) Reported-by: Adam Kovari Acked-by: Tejun Heo Signed-off-by: Miklos Szeredi Signed-off-by: Linus Torvalds --- block/ioctl.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 9049d460fa89..1124cd297263 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -294,9 +294,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return -EINVAL; if (get_user(n, (int __user *) arg)) return -EFAULT; - if (!(mode & FMODE_EXCL) && - blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) - return -EBUSY; + if (!(mode & FMODE_EXCL)) { + bdgrab(bdev); + if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) + return -EBUSY; + } ret = set_blocksize(bdev, n); if (!(mode & FMODE_EXCL)) blkdev_put(bdev, mode | FMODE_EXCL); -- cgit v1.2.2 From 450adcbe518ab3a3953d8475309525d22de77cba Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 1 Mar 2011 13:40:54 -0500 Subject: blk-throttle: Do not use kblockd workqueue for throtl work o Dominik Klein reported a system hang issue while doing some blkio throttling testing. https://lkml.org/lkml/2011/2/24/173 o Some tracing revealed that CFQ was not dispatching any more jobs as queue unplug was not happening. And queue unplug was not happening because unplug work was not being called as there was one throttling work on the same cpu which had not finished yet. And throttling work had not finished as it was trying to dispatch a bio to CFQ but all the request descriptors were consumed, so it was put to sleep. o So basically it is a cyclic dependency between CFQ unplug work and throtl dispatch work. Tejun suggested using a separate workqueue for such cases. o This patch uses a separate workqueue for throttle related work and does not rely on kblockd workqueue anymore.
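The fix follows the standard dedicated-workqueue idiom. The module skeleton below is a minimal sketch of that idiom only, not the patch itself; all names are illustrative and the work function body is left empty.

/*
 * Sketch of the dedicated-workqueue idiom only; not the actual patch.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *throtl_wq;	/* private queue, not kblockd */

static void throtl_work_fn(struct work_struct *work)
{
	/* the real code would dispatch throttled bios here */
}

static DECLARE_DELAYED_WORK(throtl_work, throtl_work_fn);

static int __init throtl_sketch_init(void)
{
	/* WQ_MEM_RECLAIM: the queue must make progress under memory pressure */
	throtl_wq = alloc_workqueue("kthrotld_sketch", WQ_MEM_RECLAIM, 0);
	if (!throtl_wq)
		return -ENOMEM;

	/* schedule on the private queue so kblockd can never be blocked by it */
	queue_delayed_work(throtl_wq, &throtl_work, HZ / 10);
	return 0;
}

static void __exit throtl_sketch_exit(void)
{
	cancel_delayed_work_sync(&throtl_work);
	destroy_workqueue(throtl_wq);
}

module_init(throtl_sketch_init);
module_exit(throtl_sketch_exit);
MODULE_LICENSE("GPL");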
Cc: stable@kernel.org Reported-by: Dominik Klein Signed-off-by: Vivek Goyal Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 7 ------- block/blk-throttle.c | 29 ++++++++++++++++++----------- 2 files changed, 18 insertions(+), 18 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 2f4002f79a24..792ece276160 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2610,13 +2610,6 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) } EXPORT_SYMBOL(kblockd_schedule_work); -int kblockd_schedule_delayed_work(struct request_queue *q, - struct delayed_work *dwork, unsigned long delay) -{ - return queue_delayed_work(kblockd_workqueue, dwork, delay); -} -EXPORT_SYMBOL(kblockd_schedule_delayed_work); - int __init blk_dev_init(void) { BUILD_BUG_ON(__REQ_NR_BITS > 8 * diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a89043a3caa4..e36cc10a346c 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -20,6 +20,11 @@ static int throtl_quantum = 32; /* Throttling is performed over 100ms slice and after that slice is renewed */ static unsigned long throtl_slice = HZ/10; /* 100 ms */ +/* A workqueue to queue throttle related work */ +static struct workqueue_struct *kthrotld_workqueue; +static void throtl_schedule_delayed_work(struct throtl_data *td, + unsigned long delay); + struct throtl_rb_root { struct rb_root rb; struct rb_node *left; @@ -345,10 +350,9 @@ static void throtl_schedule_next_dispatch(struct throtl_data *td) update_min_dispatch_time(st); if (time_before_eq(st->min_disptime, jiffies)) - throtl_schedule_delayed_work(td->queue, 0); + throtl_schedule_delayed_work(td, 0); else - throtl_schedule_delayed_work(td->queue, - (st->min_disptime - jiffies)); + throtl_schedule_delayed_work(td, (st->min_disptime - jiffies)); } static inline void @@ -815,10 +819,10 @@ void blk_throtl_work(struct work_struct *work) } /* Call with queue lock held */ -void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) +static void +throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay) { - struct throtl_data *td = q->td; struct delayed_work *dwork = &td->throtl_work; if (total_nr_queued(td) > 0) { @@ -827,12 +831,11 @@ void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) * Cancel that and schedule a new one. */ __cancel_delayed_work(dwork); - kblockd_schedule_delayed_work(q, dwork, delay); + queue_delayed_work(kthrotld_workqueue, dwork, delay); throtl_log(td, "schedule work. 
delay=%lu jiffies=%lu", delay, jiffies); } } -EXPORT_SYMBOL(throtl_schedule_delayed_work); static void throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) @@ -920,7 +923,7 @@ static void throtl_update_blkio_group_read_bps(void *key, smp_mb__after_atomic_inc(); /* Schedule a work now to process the limit change */ - throtl_schedule_delayed_work(td->queue, 0); + throtl_schedule_delayed_work(td, 0); } static void throtl_update_blkio_group_write_bps(void *key, @@ -934,7 +937,7 @@ static void throtl_update_blkio_group_write_bps(void *key, smp_mb__before_atomic_inc(); atomic_inc(&td->limits_changed); smp_mb__after_atomic_inc(); - throtl_schedule_delayed_work(td->queue, 0); + throtl_schedule_delayed_work(td, 0); } static void throtl_update_blkio_group_read_iops(void *key, @@ -948,7 +951,7 @@ static void throtl_update_blkio_group_read_iops(void *key, smp_mb__before_atomic_inc(); atomic_inc(&td->limits_changed); smp_mb__after_atomic_inc(); - throtl_schedule_delayed_work(td->queue, 0); + throtl_schedule_delayed_work(td, 0); } static void throtl_update_blkio_group_write_iops(void *key, @@ -962,7 +965,7 @@ static void throtl_update_blkio_group_write_iops(void *key, smp_mb__before_atomic_inc(); atomic_inc(&td->limits_changed); smp_mb__after_atomic_inc(); - throtl_schedule_delayed_work(td->queue, 0); + throtl_schedule_delayed_work(td, 0); } void throtl_shutdown_timer_wq(struct request_queue *q) @@ -1135,6 +1138,10 @@ void blk_throtl_exit(struct request_queue *q) static int __init throtl_init(void) { + kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); + if (!kthrotld_workqueue) + panic("Failed to create kthrotld\n"); + blkio_policy_register(&blkio_policy_throtl); return 0; } -- cgit v1.2.2 From 291d24f6d9e7bbef81454fade8a44720665c7302 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 1 Mar 2011 13:45:24 -0500 Subject: block: fix kernel-doc format for blkdev_issue_zeroout Signed-off-by: Ben Hutchings Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/blk-lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 1a320d2406b0..eec78becb355 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -132,7 +132,7 @@ static void bio_batch_end_io(struct bio *bio, int err) } /** - * blkdev_issue_zeroout generate number of zero filed write bios + * blkdev_issue_zeroout - generate number of zero filed write bios * @bdev: blockdev to issue * @sector: start sector * @nr_sects: number of sectors to write -- cgit v1.2.2 From 0bbfeb8320421989d3e12bd95fae86b9ac0712aa Mon Sep 17 00:00:00 2001 From: Justin TerAvest Date: Tue, 1 Mar 2011 15:05:08 -0500 Subject: cfq-iosched: Always provide group isolation. Effectively, make group_isolation=1 the default and remove the tunable. The setting group_isolation=0 was because by default we idle on sync-noidle tree and on fast devices, this can be very harmful for throughput. However, this problem can also be addressed by tuning slice_idle and possibly group_idle on faster storage devices. This change simplifies the CFQ code by removing the feature entirely. 
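As a rough illustration of the suggested alternative, the userspace helper below writes to the CFQ iosched tunables via sysfs; the device name and the value 0 are examples only, and the paths exist only while CFQ is the active scheduler for that device.

/*
 * Illustrative userspace helper; "sda" and the value 0 are examples,
 * not recommendations.
 */
#include <stdio.h>

static int set_cfq_tunable(const char *dev, const char *knob, const char *val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/block/%s/queue/iosched/%s", dev, knob);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%s\n", val);
	fclose(f);
	return 0;
}

int main(void)
{
	/* e.g. reduce idling on fast storage instead of relying on group_isolation */
	set_cfq_tunable("sda", "slice_idle", "0");
	set_cfq_tunable("sda", "group_idle", "0");
	return 0;
}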
Signed-off-by: Justin TerAvest Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 37 +------------------------------------ 1 file changed, 1 insertion(+), 36 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f27ff3efe6cd..3202c7e87fb3 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -146,7 +146,6 @@ struct cfq_queue { struct cfq_rb_root *service_tree; struct cfq_queue *new_cfqq; struct cfq_group *cfqg; - struct cfq_group *orig_cfqg; /* Number of sectors dispatched from queue in single dispatch round */ unsigned long nr_sectors; }; @@ -285,7 +284,6 @@ struct cfq_data { unsigned int cfq_slice_idle; unsigned int cfq_group_idle; unsigned int cfq_latency; - unsigned int cfq_group_isolation; unsigned int cic_index; struct list_head cic_list; @@ -1187,32 +1185,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, int new_cfqq = 1; int group_changed = 0; -#ifdef CONFIG_CFQ_GROUP_IOSCHED - if (!cfqd->cfq_group_isolation - && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD - && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) { - /* Move this cfq to root group */ - cfq_log_cfqq(cfqd, cfqq, "moving to root group"); - if (!RB_EMPTY_NODE(&cfqq->rb_node)) - cfq_group_service_tree_del(cfqd, cfqq->cfqg); - cfqq->orig_cfqg = cfqq->cfqg; - cfqq->cfqg = &cfqd->root_group; - cfqd->root_group.ref++; - group_changed = 1; - } else if (!cfqd->cfq_group_isolation - && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { - /* cfqq is sequential now needs to go to its original group */ - BUG_ON(cfqq->cfqg != &cfqd->root_group); - if (!RB_EMPTY_NODE(&cfqq->rb_node)) - cfq_group_service_tree_del(cfqd, cfqq->cfqg); - cfq_put_cfqg(cfqq->cfqg); - cfqq->cfqg = cfqq->orig_cfqg; - cfqq->orig_cfqg = NULL; - group_changed = 1; - cfq_log_cfqq(cfqd, cfqq, "moved to origin group"); - } -#endif - service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), cfqq_type(cfqq)); if (cfq_class_idle(cfqq)) { @@ -2542,7 +2514,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) static void cfq_put_queue(struct cfq_queue *cfqq) { struct cfq_data *cfqd = cfqq->cfqd; - struct cfq_group *cfqg, *orig_cfqg; + struct cfq_group *cfqg; BUG_ON(cfqq->ref <= 0); @@ -2554,7 +2526,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); cfqg = cfqq->cfqg; - orig_cfqg = cfqq->orig_cfqg; if (unlikely(cfqd->active_queue == cfqq)) { __cfq_slice_expired(cfqd, cfqq, 0); @@ -2564,8 +2535,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); kmem_cache_free(cfq_pool, cfqq); cfq_put_cfqg(cfqg); - if (orig_cfqg) - cfq_put_cfqg(orig_cfqg); } /* @@ -3953,7 +3922,6 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_slice_idle = cfq_slice_idle; cfqd->cfq_group_idle = cfq_group_idle; cfqd->cfq_latency = 1; - cfqd->cfq_group_isolation = 0; cfqd->hw_tag = -1; /* * we optimistically start assuming sync ops weren't delayed in last @@ -4029,7 +3997,6 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); -SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ @@ -4063,7 +4030,6 @@ STORE_FUNCTION(cfq_slice_async_store, 
&cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); -STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0); #undef STORE_FUNCTION #define CFQ_ATTR(name) \ @@ -4081,7 +4047,6 @@ static struct elv_fs_entry cfq_attrs[] = { CFQ_ATTR(slice_idle), CFQ_ATTR(group_idle), CFQ_ATTR(low_latency), - CFQ_ATTR(group_isolation), __ATTR_NULL }; -- cgit v1.2.2 From 1654e7411a1ad4999fe7890ef51d2a2bbb1fcf76 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 2 Mar 2011 08:48:05 -0500 Subject: block: add @force_kblockd to __blk_run_queue() __blk_run_queue() automatically either calls q->request_fn() directly or schedules kblockd depending on whether the function is recursed. blk-flush implementation needs to be able to explicitly choose kblockd. Add @force_kblockd. All the current users are converted to specify %false for the parameter and this patch doesn't introduce any behavior change. stable: This is prerequisite for fixing ide oops caused by the new blk-flush implementation. Signed-off-by: Tejun Heo Cc: Jan Beulich Cc: James Bottomley Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 11 ++++++----- block/blk-flush.c | 2 +- block/cfq-iosched.c | 6 +++--- block/elevator.c | 4 ++-- 4 files changed, 12 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 792ece276160..518dd423a5fe 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -352,7 +352,7 @@ void blk_start_queue(struct request_queue *q) WARN_ON(!irqs_disabled()); queue_flag_clear(QUEUE_FLAG_STOPPED, q); - __blk_run_queue(q); + __blk_run_queue(q, false); } EXPORT_SYMBOL(blk_start_queue); @@ -403,13 +403,14 @@ EXPORT_SYMBOL(blk_sync_queue); /** * __blk_run_queue - run a single device queue * @q: The queue to run + * @force_kblockd: Don't run @q->request_fn directly. Use kblockd. * * Description: * See @blk_run_queue. This variant must be called with the queue lock * held and interrupts disabled. * */ -void __blk_run_queue(struct request_queue *q) +void __blk_run_queue(struct request_queue *q, bool force_kblockd) { blk_remove_plug(q); @@ -423,7 +424,7 @@ void __blk_run_queue(struct request_queue *q) * Only recurse once to avoid overrunning the stack, let the unplug * handling reinvoke the handler shortly if we already got there. */ - if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { + if (!force_kblockd && !queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { q->request_fn(q); queue_flag_clear(QUEUE_FLAG_REENTER, q); } else { @@ -446,7 +447,7 @@ void blk_run_queue(struct request_queue *q) unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); - __blk_run_queue(q); + __blk_run_queue(q, false); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL(blk_run_queue); @@ -1053,7 +1054,7 @@ void blk_insert_request(struct request_queue *q, struct request *rq, drive_stat_acct(rq, 1); __elv_add_request(q, rq, where, 0); - __blk_run_queue(q); + __blk_run_queue(q, false); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL(blk_insert_request); diff --git a/block/blk-flush.c b/block/blk-flush.c index 54b123d6563e..56adaa8d55cd 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -69,7 +69,7 @@ static void blk_flush_complete_seq_end_io(struct request_queue *q, * queue. Kick the queue in those cases. 
*/ if (was_empty && next_rq) - __blk_run_queue(q); + __blk_run_queue(q, false); } static void pre_flush_end_io(struct request *rq, int error) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7be4c7959625..ea83a4f0c27d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3355,7 +3355,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqd->busy_queues > 1) { cfq_del_timer(cfqd, cfqq); cfq_clear_cfqq_wait_request(cfqq); - __blk_run_queue(cfqd->queue); + __blk_run_queue(cfqd->queue, false); } else { cfq_blkiocg_update_idle_time_stats( &cfqq->cfqg->blkg); @@ -3370,7 +3370,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, * this new queue is RT and the current one is BE */ cfq_preempt_queue(cfqd, cfqq); - __blk_run_queue(cfqd->queue); + __blk_run_queue(cfqd->queue, false); } } @@ -3731,7 +3731,7 @@ static void cfq_kick_queue(struct work_struct *work) struct request_queue *q = cfqd->queue; spin_lock_irq(q->queue_lock); - __blk_run_queue(cfqd->queue); + __blk_run_queue(cfqd->queue, false); spin_unlock_irq(q->queue_lock); } diff --git a/block/elevator.c b/block/elevator.c index 2569512830d3..236e93c1f46c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -602,7 +602,7 @@ void elv_quiesce_start(struct request_queue *q) */ elv_drain_elevator(q); while (q->rq.elvpriv) { - __blk_run_queue(q); + __blk_run_queue(q, false); spin_unlock_irq(q->queue_lock); msleep(10); spin_lock_irq(q->queue_lock); @@ -651,7 +651,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) * with anything. There's no point in delaying queue * processing. */ - __blk_run_queue(q); + __blk_run_queue(q, false); break; case ELEVATOR_INSERT_SORT: -- cgit v1.2.2 From 255bb490c8c27eed484d538efe6ef6a7473bd3f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 2 Mar 2011 08:48:06 -0500 Subject: block: blk-flush shouldn't call directly into q->request_fn() __blk_run_queue() blk-flush decomposes a flush into sequence of multiple requests. On completion of a request, the next one is queued; however, block layer must not implicitly call into q->request_fn() directly from completion path. This makes the queue behave unexpectedly when seen from the drivers and violates the assumption that q->request_fn() is called with process context + queue_lock. This patch makes blk-flush the following two changes to make sure q->request_fn() is not called directly from request completion path. - blk_flush_complete_seq_end_io() now asks __blk_run_queue() to always use kblockd instead of calling directly into q->request_fn(). - queue_next_fseq() uses ELEVATOR_INSERT_REQUEUE instead of ELEVATOR_INSERT_FRONT so that elv_insert() doesn't try to unplug the request queue directly. Reported by Jan in the following threads. http://thread.gmane.org/gmane.linux.ide/48778 http://thread.gmane.org/gmane.linux.ide/48786 stable: applicable to v2.6.37. Signed-off-by: Tejun Heo Reported-by: Jan Beulich Cc: "David S. Miller" Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-flush.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 56adaa8d55cd..b27d0208611b 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -66,10 +66,12 @@ static void blk_flush_complete_seq_end_io(struct request_queue *q, /* * Moving a request silently to empty queue_head may stall the - * queue. Kick the queue in those cases. + * queue. Kick the queue in those cases. 
This function is called + * from request completion path and calling directly into + * request_fn may confuse the driver. Always use kblockd. */ if (was_empty && next_rq) - __blk_run_queue(q, false); + __blk_run_queue(q, true); } static void pre_flush_end_io(struct request *rq, int error) @@ -130,7 +132,7 @@ static struct request *queue_next_fseq(struct request_queue *q) BUG(); } - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); + elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); return rq; } -- cgit v1.2.2 From 53f22956effe1c9e7961b8c6e4362ecca5e460b7 Mon Sep 17 00:00:00 2001 From: Liu Yuan Date: Wed, 2 Mar 2011 11:00:15 -0500 Subject: block/genhd: Change some numerals into macros Rename the numerals in the diskstats_show() into the macros. Cc: Jens Axboe Signed-off-by: Liu Yuan Signed-off-by: Jens Axboe --- block/genhd.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 6a5b772aa201..73d85a8e3c85 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1158,14 +1158,14 @@ static int diskstats_show(struct seq_file *seqf, void *v) "%u %lu %lu %llu %u %u %u %u\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), - part_stat_read(hd, ios[0]), - part_stat_read(hd, merges[0]), - (unsigned long long)part_stat_read(hd, sectors[0]), - jiffies_to_msecs(part_stat_read(hd, ticks[0])), - part_stat_read(hd, ios[1]), - part_stat_read(hd, merges[1]), - (unsigned long long)part_stat_read(hd, sectors[1]), - jiffies_to_msecs(part_stat_read(hd, ticks[1])), + part_stat_read(hd, ios[READ]), + part_stat_read(hd, merges[READ]), + (unsigned long long)part_stat_read(hd, sectors[READ]), + jiffies_to_msecs(part_stat_read(hd, ticks[READ])), + part_stat_read(hd, ios[WRITE]), + part_stat_read(hd, merges[WRITE]), + (unsigned long long)part_stat_read(hd, sectors[WRITE]), + jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), part_in_flight(hd), jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)) -- cgit v1.2.2 From c94a96ac93b4f5b8d1ff8430b1afa1a25610cf53 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 Mar 2011 19:04:42 -0500 Subject: block: Initialize ->queue_lock to internal lock at queue allocation time There does not seem to be a clear convention whether q->queue_lock is initialized or not when blk_cleanup_queue() is called. In the past it was not necessary but now blk_throtl_exit() takes up queue lock by default and needs queue lock to be available. In fact elevator_exit() code also has similar requirement just that it is less stringent in the sense that elevator_exit() is called only if elevator is initialized. Two problems have been noticed because of ambiguity about spin lock status. - If a driver calls blk_alloc_queue() and then soon calls blk_cleanup_queue() almost immediately, (because some other driver structure allocation failed or some other error happened) then blk_throtl_exit() will run into issues as queue lock is not initialized. Loop driver ran into this issue recently and I noticed error paths in md driver too. Similar error paths should exist in other drivers too. - If some driver provided external spin lock and zapped the lock before blk_cleanup_queue(), then it can lead to issues. So this patch initializes the default queue lock at queue allocation time. block throttling code is one of the users of queue lock and it is initialized at the queue allocation time, so it makes sense to initialize ->queue_lock also to internal lock. 
A driver can override that lock later. This takes care of the issue, so a driver does not have to worry about initializing the queue lock to the default before calling blk_cleanup_queue(). Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-core.c | 16 +++++++++++++++- block/blk-settings.c | 7 ------- 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 3cc17e6064d6..bc2b7c5004e1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -446,6 +446,11 @@ void blk_put_queue(struct request_queue *q) kobject_put(&q->kobj); } +/* + * Note: If a driver supplied the queue lock, it should not zap that lock + * unexpectedly as some queue cleanup components like elevator_exit() and + * blk_throtl_exit() need queue lock. + */ void blk_cleanup_queue(struct request_queue *q) { /* @@ -540,6 +545,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) mutex_init(&q->sysfs_lock); spin_lock_init(&q->__queue_lock); + /* + * By default initialize queue_lock to internal lock and driver can + * override it later if need be. + */ + q->queue_lock = &q->__queue_lock; + return q; } EXPORT_SYMBOL(blk_alloc_queue_node); @@ -624,7 +635,10 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, q->unprep_rq_fn = NULL; q->unplug_fn = generic_unplug_device; q->queue_flags = QUEUE_FLAG_DEFAULT; - q->queue_lock = lock; + + /* Override internal queue lock with supplied lock pointer */ + if (lock) + q->queue_lock = lock; /* * This also sets hw/phys segments, boundary and size diff --git a/block/blk-settings.c b/block/blk-settings.c index 36c8c1f2af18..df649fa59ded 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -175,13 +175,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) blk_set_default_limits(&q->limits); blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); - /* - * If the caller didn't supply a lock, fall back to our embedded - * per-queue locks - */ - if (!q->queue_lock) - q->queue_lock = &q->__queue_lock; - /* * by default assume old behaviour and bounce for any highmem page */ -- cgit v1.2.2 From da527770007fce8e4541947d47918248286da875 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 Mar 2011 19:05:33 -0500 Subject: block: Move blk_throtl_exit() call to blk_cleanup_queue() Move blk_throtl_exit() into blk_cleanup_queue() as blk_throtl_exit() is written in such a way that it needs the queue lock. In blk_release_queue() there is no guarantee that ->queue_lock is still around. Initially blk_throtl_exit() was in blk_cleanup_queue() but Ingo reported one problem. https://lkml.org/lkml/2010/10/23/86 And a quick fix moved blk_throtl_exit() to blk_release_queue(). commit 7ad58c028652753814054f4e3ac58f925e7343f4 Author: Jens Axboe Date: Sat Oct 23 20:40:26 2010 +0200 block: fix use-after-free bug in blk throttle code This patch reverts the above change and does not try to shut down the throtl work in blk_sync_queue(). By avoiding the call to throtl_shutdown_timer_wq() from blk_sync_queue(), we should also avoid the problem reported by Ingo. blk_sync_queue() seems to be used only by the md driver and it seems to be using it to make sure q->unplug_fn is not called, as md registers its own unplug functions and it is about to free up the data structures used by unplug_fn(). Block throttle does not call back into unplug_fn() or into md. So there is no need to cancel blk throttle work.
In fact I think cancelling block throttle work is bad because it might happen that some bios are throttled and scheduled to be dispatched later with the help of pending work and if work is cancelled, these bios might never be dispatched. Block layer also uses blk_sync_queue() during blk_cleanup_queue() and blk_release_queue() time. That should be safe as we are also calling blk_throtl_exit() which should make sure all the throttling related data structures are cleaned up. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-core.c | 7 ++++++- block/blk-sysfs.c | 2 -- block/blk-throttle.c | 6 +++--- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index bc2b7c5004e1..accff29ad674 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -380,13 +380,16 @@ EXPORT_SYMBOL(blk_stop_queue); * that its ->make_request_fn will not re-add plugging prior to calling * this function. * + * This function does not cancel any asynchronous activity arising + * out of elevator or throttling code. That would require elevaotor_exit() + * and blk_throtl_exit() to be called with queue lock initialized. + * */ void blk_sync_queue(struct request_queue *q) { del_timer_sync(&q->unplug_timer); del_timer_sync(&q->timeout); cancel_work_sync(&q->unplug_work); - throtl_shutdown_timer_wq(q); } EXPORT_SYMBOL(blk_sync_queue); @@ -469,6 +472,8 @@ void blk_cleanup_queue(struct request_queue *q) if (q->elevator) elevator_exit(q->elevator); + blk_throtl_exit(q); + blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 41fb69150b4d..261c75c665ae 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -471,8 +471,6 @@ static void blk_release_queue(struct kobject *kobj) blk_sync_queue(q); - blk_throtl_exit(q); - if (rl->rq_pool) mempool_destroy(rl->rq_pool); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a89043a3caa4..c0f623742165 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -965,7 +965,7 @@ static void throtl_update_blkio_group_write_iops(void *key, throtl_schedule_delayed_work(td->queue, 0); } -void throtl_shutdown_timer_wq(struct request_queue *q) +static void throtl_shutdown_wq(struct request_queue *q) { struct throtl_data *td = q->td; @@ -1099,7 +1099,7 @@ void blk_throtl_exit(struct request_queue *q) BUG_ON(!td); - throtl_shutdown_timer_wq(q); + throtl_shutdown_wq(q); spin_lock_irq(q->queue_lock); throtl_release_tgs(td); @@ -1129,7 +1129,7 @@ void blk_throtl_exit(struct request_queue *q) * update limits through cgroup and another work got queued, cancel * it. */ - throtl_shutdown_timer_wq(q); + throtl_shutdown_wq(q); throtl_td_free(td); } -- cgit v1.2.2 From 93803e0140c6216b68fe926ccc611297120da273 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Mar 2011 08:59:06 +0100 Subject: cfq-iosched: fix race in cfq_set_request() We need to hold the queue lock over the reference increment, it's not atomic anymore. 
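The rule being restored is the usual one for plain, non-atomic reference counts: increment only while holding the lock that protects the object. The toy userspace sketch below (pthreads, build with -lpthread; names are invented for illustration, this is not CFQ code) shows that pattern.

/*
 * Toy userspace illustration of lock-protected refcounting.
 */
#include <pthread.h>
#include <stdio.h>

struct queue_like {
	pthread_mutex_t lock;	/* plays the role of q->queue_lock */
	int ref;		/* plain counter, like cfqq->ref after the change */
};

static void get_ref(struct queue_like *q)
{
	pthread_mutex_lock(&q->lock);
	q->ref++;		/* increment only while the lock is held */
	pthread_mutex_unlock(&q->lock);
}

int main(void)
{
	struct queue_like q = { .ref = 0 };

	pthread_mutex_init(&q.lock, NULL);
	get_ref(&q);
	printf("ref = %d\n", q.ref);
	pthread_mutex_destroy(&q.lock);
	return 0;
}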
Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3202c7e87fb3..fb2141ec205c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3675,12 +3675,11 @@ new_queue: cfqq->allocated[rw]++; - spin_unlock_irqrestore(q->queue_lock, flags); - cfqq->ref++; rq->elevator_private[0] = cic; rq->elevator_private[1] = cfqq; rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg); + spin_unlock_irqrestore(q->queue_lock, flags); return 0; queue_fail: -- cgit v1.2.2 From ef8a41df8c140f10108de75b01b6369d6e49113c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 7 Mar 2011 09:26:29 +0100 Subject: cfq-iosched: give busy sync queue no dispatch limit If there are a sync and an async queue and the sync queue's think time is small, we can ignore the sync queue's dispatch quantum. Because the sync queue will always preempt the async queue, we don't need to care about async's latency. This can fix a performance regression of aiostress test, which is introduced by commit f8ae6e3eb825. The issue should exist even without the commit, but the commit amplifies the impact. The initial post does the same optimization for RT queue too, but since I have no real workload for it, Vivek suggests to drop it. Signed-off-by: Shaohua Li Reviewed-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index fb2141ec205c..135b1a48da23 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -237,6 +237,7 @@ struct cfq_data { struct rb_root prio_trees[CFQ_PRIO_LISTS]; unsigned int busy_queues; + unsigned int busy_sync_queues; int rq_in_driver; int rq_in_flight[2]; @@ -1344,6 +1345,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; + if (cfq_cfqq_sync(cfqq)) + cfqd->busy_sync_queues++; cfq_resort_rr_list(cfqd, cfqq); } @@ -1370,6 +1373,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfq_group_service_tree_del(cfqd, cfqq->cfqg); BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; + if (cfq_cfqq_sync(cfqq)) + cfqd->busy_sync_queues--; } /* @@ -2377,22 +2382,39 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Does this cfqq already have too much IO in flight? */ if (cfqq->dispatched >= max_dispatch) { + bool promote_sync = false; /* * idle queue must always only have a single IO in flight */ if (cfq_class_idle(cfqq)) return false; + /* + * If there is only one sync queue, and its think time is + * small, we can ignore async queue here and give the sync + * queue no dispatch limit. The reason is a sync queue can + * preempt async queue, limiting the sync queue doesn't make + * sense. This is useful for aiostress test. 
+ */ + if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1) { + struct cfq_io_context *cic = RQ_CIC(cfqq->next_rq); + + if (sample_valid(cic->ttime_samples) && + cic->ttime_mean < cfqd->cfq_slice_idle) + promote_sync = true; + } + /* * We have other queues, don't allow more IO from this one */ - if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) + if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) && + !promote_sync) return false; /* * Sole queue user, no limit */ - if (cfqd->busy_queues == 1) + if (cfqd->busy_queues == 1 || promote_sync) max_dispatch = -1; else /* -- cgit v1.2.2 From a60327107b56573c305ecc78e471dbdbb4d2f426 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Mon, 7 Mar 2011 09:28:09 +0100 Subject: cfq-iosched: Fix update_vdisktime logic The update_vdisktime logic is broken since commit b54ce60eb7f61f8e314b8b241b0469eda3bb1d42, st->min_vdisktime never makes a progress. Fix it. Thanks Vivek for pointing it out. Signed-off-by: Gui Jianfeng Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 135b1a48da23..938ae52aa927 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -557,15 +557,13 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) static void update_min_vdisktime(struct cfq_rb_root *st) { - u64 vdisktime = st->min_vdisktime; struct cfq_group *cfqg; if (st->left) { cfqg = rb_entry_cfqg(st->left); - vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); + st->min_vdisktime = max_vdisktime(st->min_vdisktime, + cfqg->vdisktime); } - - st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); } /* -- cgit v1.2.2 From 231d704b4ab7491473c0b1a9cd0c6e0d1cba85b9 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 7 Mar 2011 21:05:14 +0100 Subject: blk-throttle: process limit change only through one function With the help of cgroup interface one can go and upate the bps/iops limits of existing group. Once the limits are udpated, a thread is woken up to see if some blocked group needs recalculation based on new limits and needs to be requeued. There was also a piece of code where I was checking for group limit update when a fresh bio comes in. This patch gets rid of that piece of code and keeps processing the limit change at one place throtl_process_limit_change(). It just keeps the code simple and easy to understand. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 061dee66e2a6..a29f09240c0f 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1009,14 +1009,8 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) /* * There is already another bio queued in same dir. No * need to update dispatch time. - * Still update the disptime if rate limits on this group - * were changed. */ - if (!tg->limits_changed) - update_disptime = false; - else - tg->limits_changed = false; - + update_disptime = false; goto queue_bio; } -- cgit v1.2.2 From de701c74a34005e637e1ca2634fbf28fd1debba2 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 7 Mar 2011 21:09:32 +0100 Subject: blk-throttle: Some cleanups and race fixes in limit update code When throttle group limits are updated through cgroups, a thread is woken up to process these updates. 
While reviewing that code, oleg noted couple of race conditions existed in the code and he also suggested that code can be simplified. This patch fixes the races simplifies the code based on Oleg's suggestions: - Use xchg(). - Introduced a common function throtl_update_blkio_group_common() which is shared now by all iops/bps update functions. Reviewed-by: Oleg Nesterov Reviewed-by: Paul E. McKenney Signed-off-by: Vivek Goyal Fixed a merge issue, throtl_schedule_delayed_work() takes throtl_data as the argument now, not the queue. Signed-off-by: Jens Axboe --- block/blk-throttle.c | 96 ++++++++++++++++++++++------------------------------ 1 file changed, 40 insertions(+), 56 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a29f09240c0f..32dd3e4b041d 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -102,7 +102,7 @@ struct throtl_data /* Work for dispatching throttled bios */ struct delayed_work throtl_work; - atomic_t limits_changed; + bool limits_changed; }; enum tg_state_flags { @@ -201,6 +201,7 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, RB_CLEAR_NODE(&tg->rb_node); bio_list_init(&tg->bio_lists[0]); bio_list_init(&tg->bio_lists[1]); + td->limits_changed = false; /* * Take the initial reference that will be released on destroy @@ -737,34 +738,27 @@ static void throtl_process_limit_change(struct throtl_data *td) struct throtl_grp *tg; struct hlist_node *pos, *n; - if (!atomic_read(&td->limits_changed)) + if (!td->limits_changed) return; - throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed)); + xchg(&td->limits_changed, false); - /* - * Make sure updates from throtl_update_blkio_group_read_bps() group - * of functions to tg->limits_changed are visible. We do not - * want update td->limits_changed to be visible but update to - * tg->limits_changed not being visible yet on this cpu. Hence - * the read barrier. - */ - smp_rmb(); + throtl_log(td, "limits changed"); hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { - if (throtl_tg_on_rr(tg) && tg->limits_changed) { - throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" - " riops=%u wiops=%u", tg->bps[READ], - tg->bps[WRITE], tg->iops[READ], - tg->iops[WRITE]); + if (!tg->limits_changed) + continue; + + if (!xchg(&tg->limits_changed, false)) + continue; + + throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" + " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE], + tg->iops[READ], tg->iops[WRITE]); + + if (throtl_tg_on_rr(tg)) tg_update_disptime(td, tg); - tg->limits_changed = false; - } } - - smp_mb__before_atomic_dec(); - atomic_dec(&td->limits_changed); - smp_mb__after_atomic_dec(); } /* Dispatch throttled bios. Should be called without queue lock held. 
*/ @@ -898,6 +892,15 @@ void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) spin_unlock_irqrestore(td->queue->queue_lock, flags); } +static void throtl_update_blkio_group_common(struct throtl_data *td, + struct throtl_grp *tg) +{ + xchg(&tg->limits_changed, true); + xchg(&td->limits_changed, true); + /* Schedule a work now to process the limit change */ + throtl_schedule_delayed_work(td, 0); +} + /* * For all update functions, key should be a valid pointer because these * update functions are called under blkcg_lock, that means, blkg is @@ -911,61 +914,40 @@ static void throtl_update_blkio_group_read_bps(void *key, struct blkio_group *blkg, u64 read_bps) { struct throtl_data *td = key; + struct throtl_grp *tg = tg_of_blkg(blkg); - tg_of_blkg(blkg)->bps[READ] = read_bps; - /* Make sure read_bps is updated before setting limits_changed */ - smp_wmb(); - tg_of_blkg(blkg)->limits_changed = true; - - /* Make sure tg->limits_changed is updated before td->limits_changed */ - smp_mb__before_atomic_inc(); - atomic_inc(&td->limits_changed); - smp_mb__after_atomic_inc(); - - /* Schedule a work now to process the limit change */ - throtl_schedule_delayed_work(td, 0); + tg->bps[READ] = read_bps; + throtl_update_blkio_group_common(td, tg); } static void throtl_update_blkio_group_write_bps(void *key, struct blkio_group *blkg, u64 write_bps) { struct throtl_data *td = key; + struct throtl_grp *tg = tg_of_blkg(blkg); - tg_of_blkg(blkg)->bps[WRITE] = write_bps; - smp_wmb(); - tg_of_blkg(blkg)->limits_changed = true; - smp_mb__before_atomic_inc(); - atomic_inc(&td->limits_changed); - smp_mb__after_atomic_inc(); - throtl_schedule_delayed_work(td, 0); + tg->bps[WRITE] = write_bps; + throtl_update_blkio_group_common(td, tg); } static void throtl_update_blkio_group_read_iops(void *key, struct blkio_group *blkg, unsigned int read_iops) { struct throtl_data *td = key; + struct throtl_grp *tg = tg_of_blkg(blkg); - tg_of_blkg(blkg)->iops[READ] = read_iops; - smp_wmb(); - tg_of_blkg(blkg)->limits_changed = true; - smp_mb__before_atomic_inc(); - atomic_inc(&td->limits_changed); - smp_mb__after_atomic_inc(); - throtl_schedule_delayed_work(td, 0); + tg->iops[READ] = read_iops; + throtl_update_blkio_group_common(td, tg); } static void throtl_update_blkio_group_write_iops(void *key, struct blkio_group *blkg, unsigned int write_iops) { struct throtl_data *td = key; + struct throtl_grp *tg = tg_of_blkg(blkg); - tg_of_blkg(blkg)->iops[WRITE] = write_iops; - smp_wmb(); - tg_of_blkg(blkg)->limits_changed = true; - smp_mb__before_atomic_inc(); - atomic_inc(&td->limits_changed); - smp_mb__after_atomic_inc(); - throtl_schedule_delayed_work(td, 0); + tg->iops[WRITE] = write_iops; + throtl_update_blkio_group_common(td, tg); } static void throtl_shutdown_wq(struct request_queue *q) @@ -1012,6 +994,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) */ update_disptime = false; goto queue_bio; + } /* Bio is with-in rate limit of group */ @@ -1052,7 +1035,7 @@ int blk_throtl_init(struct request_queue *q) INIT_HLIST_HEAD(&td->tg_list); td->tg_service_tree = THROTL_RB_ROOT; - atomic_set(&td->limits_changed, 0); + td->limits_changed = false; /* Init root group */ tg = &td->root_tg; @@ -1064,6 +1047,7 @@ int blk_throtl_init(struct request_queue *q) /* Practically unlimited BW */ tg->bps[0] = tg->bps[1] = -1; tg->iops[0] = tg->iops[1] = -1; + td->limits_changed = false; /* * Set root group reference to 2. 
One reference will be dropped when -- cgit v1.2.2 From df457f845e5449be2e7d96668791f789b3770ac7 Mon Sep 17 00:00:00 2001 From: Justin TerAvest Date: Tue, 8 Mar 2011 19:45:00 +0100 Subject: blk-cgroup: Lower minimum weight from 100 to 10. We've found that we still get good, useful isolation at weights this low. I'd like to adjust the minimum so that any other changes can take these values into account. Signed-off-by: Justin TerAvest Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index ea4861bdd549..57e7234c5ae5 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -240,7 +240,7 @@ static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } #endif -#define BLKIO_WEIGHT_MIN 100 +#define BLKIO_WEIGHT_MIN 10 #define BLKIO_WEIGHT_MAX 1000 #define BLKIO_WEIGHT_DEFAULT 500 -- cgit v1.2.2 From facc31ddc3570a3a0d8951c94f16b898e01b464d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 9 Mar 2011 19:54:27 +0100 Subject: block: Don't implicitly trigger event check on disk_unblock_events() Currently, disk_unblock_events() implicitly kick event check if the block count reaches zero. This behavior is not described in the comment and hinders with future changes. Make the unblocker explicitly check events by calling disk_check_events() as necessary. This patch doesn't cause any behavior difference. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Kay Sievers --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 3e2b57b55e38..c91a2dac6b6b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1494,7 +1494,7 @@ void disk_block_events(struct gendisk *disk) void disk_unblock_events(struct gendisk *disk) { if (disk->ev) - __disk_unblock_events(disk, true); + __disk_unblock_events(disk, false); } /** -- cgit v1.2.2 From 3cca6dc1c81e2407928dc4c6105252146fd3924f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 2 Mar 2011 11:08:00 -0500 Subject: block: add API for delaying work/request_fn a little bit Currently we use plugging for that, but as plugging is going away, we need an alternative mechanism. Signed-off-by: Jens Axboe --- block/blk-core.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 3cc17e6064d6..e958c7a1e462 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -197,6 +197,32 @@ void blk_dump_rq_flags(struct request *rq, char *msg) } EXPORT_SYMBOL(blk_dump_rq_flags); +static void blk_delay_work(struct work_struct *work) +{ + struct request_queue *q; + + q = container_of(work, struct request_queue, delay_work.work); + spin_lock_irq(q->queue_lock); + q->request_fn(q); + spin_unlock_irq(q->queue_lock); +} + +/** + * blk_delay_queue - restart queueing after defined interval + * @q: The &struct request_queue in question + * @msecs: Delay in msecs + * + * Description: + * Sometimes queueing needs to be postponed for a little while, to allow + * resources to come back. This function will make sure that queueing is + * restarted around the specified time. 
+ */ +void blk_delay_queue(struct request_queue *q, unsigned long msecs) +{ + schedule_delayed_work(&q->delay_work, msecs_to_jiffies(msecs)); +} +EXPORT_SYMBOL(blk_delay_queue); + /* * "plug" the device if there are no outstanding requests: this will * force the transfer to start only after we have put all the requests @@ -363,6 +389,7 @@ EXPORT_SYMBOL(blk_start_queue); void blk_stop_queue(struct request_queue *q) { blk_remove_plug(q); + cancel_delayed_work(&q->delay_work); queue_flag_set(QUEUE_FLAG_STOPPED, q); } EXPORT_SYMBOL(blk_stop_queue); @@ -387,6 +414,7 @@ void blk_sync_queue(struct request_queue *q) del_timer_sync(&q->timeout); cancel_work_sync(&q->unplug_work); throtl_shutdown_timer_wq(q); + cancel_delayed_work_sync(&q->delay_work); } EXPORT_SYMBOL(blk_sync_queue); @@ -534,6 +562,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) INIT_LIST_HEAD(&q->flush_queue[1]); INIT_LIST_HEAD(&q->flush_data_in_flight); INIT_WORK(&q->unplug_work, blk_unplug_work); + INIT_DELAYED_WORK(&q->delay_work, blk_delay_work); kobject_init(&q->kobj, &blk_queue_ktype); -- cgit v1.2.2 From 73c101011926c5832e6e141682180c4debe2cf45 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Mar 2011 13:19:51 +0100 Subject: block: initial patch for on-stack per-task plugging This patch adds support for creating a queuing context outside of the queue itself. This enables us to batch up pieces of IO before grabbing the block device queue lock and submitting them to the IO scheduler. The context is created on the stack of the process and assigned in the task structure, so that we can auto-unplug it if we hit a schedule event. The current queue plugging happens implicitly if IO is submitted to an empty device, yet callers have to remember to unplug that IO when they are going to wait for it. This is an ugly API and has caused bugs in the past. Additionally, it requires hacks in the vm (->sync_page() callback) to handle that logic. By switching to an explicit plugging scheme we make the API a lot nicer and can get rid of the ->sync_page() hack in the vm. 
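As a rough sketch of the intended usage (the helper below and its arguments are made up for illustration; only blk_start_plug(), blk_finish_plug() and submit_bio() are the real interfaces), a submitter now batches IO like this:

	#include <linux/blkdev.h>
	#include <linux/bio.h>

	/* Illustrative submitter: batch a set of prepared bios under one on-stack plug. */
	static void submit_batch(struct bio **bios, int nr)
	{
		struct blk_plug plug;
		int i;

		blk_start_plug(&plug);			/* requests collect on current->plug */
		for (i = 0; i < nr; i++)
			submit_bio(READ, bios[i]);	/* may merge against the plug list */
		blk_finish_plug(&plug);			/* sort by queue and push to the elevator */
	}

If the task schedules while the plug is held, the list is flushed automatically, so callers no longer have to remember an explicit unplug before waiting for their IO.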
Signed-off-by: Jens Axboe --- block/blk-core.c | 369 +++++++++++++++++++++++++++++++++++++++--------------- block/blk-flush.c | 3 +- block/elevator.c | 6 +- 3 files changed, 277 insertions(+), 101 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index e958c7a1e462..6efb55cc5af0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -27,6 +27,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -203,7 +204,7 @@ static void blk_delay_work(struct work_struct *work) q = container_of(work, struct request_queue, delay_work.work); spin_lock_irq(q->queue_lock); - q->request_fn(q); + __blk_run_queue(q); spin_unlock_irq(q->queue_lock); } @@ -686,6 +687,8 @@ int blk_get_queue(struct request_queue *q) static inline void blk_free_request(struct request_queue *q, struct request *rq) { + BUG_ON(rq->cmd_flags & REQ_ON_PLUG); + if (rq->cmd_flags & REQ_ELVPRIV) elv_put_request(q, rq); mempool_free(rq, q->rq.rq_pool); @@ -1051,6 +1054,13 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) } EXPORT_SYMBOL(blk_requeue_request); +static void add_acct_request(struct request_queue *q, struct request *rq, + int where) +{ + drive_stat_acct(rq, 1); + __elv_add_request(q, rq, where, 0); +} + /** * blk_insert_request - insert a special request into a request queue * @q: request queue where request should be inserted @@ -1093,8 +1103,7 @@ void blk_insert_request(struct request_queue *q, struct request *rq, if (blk_rq_tagged(rq)) blk_queue_end_tag(q, rq); - drive_stat_acct(rq, 1); - __elv_add_request(q, rq, where, 0); + add_acct_request(q, rq, where); __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } @@ -1215,6 +1224,113 @@ void blk_add_request_payload(struct request *rq, struct page *page, } EXPORT_SYMBOL_GPL(blk_add_request_payload); +static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, + struct bio *bio) +{ + const int ff = bio->bi_rw & REQ_FAILFAST_MASK; + + /* + * Debug stuff, kill later + */ + if (!rq_mergeable(req)) { + blk_dump_rq_flags(req, "back"); + return false; + } + + if (!ll_back_merge_fn(q, req, bio)) + return false; + + trace_block_bio_backmerge(q, bio); + + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) + blk_rq_set_mixed_merge(req); + + req->biotail->bi_next = bio; + req->biotail = bio; + req->__data_len += bio->bi_size; + req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); + + drive_stat_acct(req, 0); + return true; +} + +static bool bio_attempt_front_merge(struct request_queue *q, + struct request *req, struct bio *bio) +{ + const int ff = bio->bi_rw & REQ_FAILFAST_MASK; + sector_t sector; + + /* + * Debug stuff, kill later + */ + if (!rq_mergeable(req)) { + blk_dump_rq_flags(req, "front"); + return false; + } + + if (!ll_front_merge_fn(q, req, bio)) + return false; + + trace_block_bio_frontmerge(q, bio); + + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) + blk_rq_set_mixed_merge(req); + + sector = bio->bi_sector; + + bio->bi_next = req->bio; + req->bio = bio; + + /* + * may not be valid. if the low level driver said + * it didn't need a bounce buffer then it better + * not touch req->buffer either... + */ + req->buffer = bio_data(bio); + req->__sector = bio->bi_sector; + req->__data_len += bio->bi_size; + req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); + + drive_stat_acct(req, 0); + return true; +} + +/* + * Attempts to merge with the plugged list in the current process. Returns + * true if merge was succesful, otherwise false. 
+ */ +static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q, + struct bio *bio) +{ + struct blk_plug *plug; + struct request *rq; + bool ret = false; + + plug = tsk->plug; + if (!plug) + goto out; + + list_for_each_entry_reverse(rq, &plug->list, queuelist) { + int el_ret; + + if (rq->q != q) + continue; + + el_ret = elv_try_merge(rq, bio); + if (el_ret == ELEVATOR_BACK_MERGE) { + ret = bio_attempt_back_merge(q, rq, bio); + if (ret) + break; + } else if (el_ret == ELEVATOR_FRONT_MERGE) { + ret = bio_attempt_front_merge(q, rq, bio); + if (ret) + break; + } + } +out: + return ret; +} + void init_request_from_bio(struct request *req, struct bio *bio) { req->cpu = bio->bi_comp_cpu; @@ -1230,26 +1346,12 @@ void init_request_from_bio(struct request *req, struct bio *bio) blk_rq_bio_prep(req->q, req, bio); } -/* - * Only disabling plugging for non-rotational devices if it does tagging - * as well, otherwise we do need the proper merging - */ -static inline bool queue_should_plug(struct request_queue *q) -{ - return !(blk_queue_nonrot(q) && blk_queue_tagged(q)); -} - static int __make_request(struct request_queue *q, struct bio *bio) { - struct request *req; - int el_ret; - unsigned int bytes = bio->bi_size; - const unsigned short prio = bio_prio(bio); const bool sync = !!(bio->bi_rw & REQ_SYNC); - const bool unplug = !!(bio->bi_rw & REQ_UNPLUG); - const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK; - int where = ELEVATOR_INSERT_SORT; - int rw_flags; + struct blk_plug *plug; + int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT; + struct request *req; /* * low level driver can indicate that it wants pages above a @@ -1258,78 +1360,36 @@ static int __make_request(struct request_queue *q, struct bio *bio) */ blk_queue_bounce(q, &bio); - spin_lock_irq(q->queue_lock); - if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { + spin_lock_irq(q->queue_lock); where = ELEVATOR_INSERT_FLUSH; goto get_rq; } - if (elv_queue_empty(q)) - goto get_rq; - - el_ret = elv_merge(q, &req, bio); - switch (el_ret) { - case ELEVATOR_BACK_MERGE: - BUG_ON(!rq_mergeable(req)); - - if (!ll_back_merge_fn(q, req, bio)) - break; - - trace_block_bio_backmerge(q, bio); - - if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) - blk_rq_set_mixed_merge(req); - - req->biotail->bi_next = bio; - req->biotail = bio; - req->__data_len += bytes; - req->ioprio = ioprio_best(req->ioprio, prio); - if (!blk_rq_cpu_valid(req)) - req->cpu = bio->bi_comp_cpu; - drive_stat_acct(req, 0); - elv_bio_merged(q, req, bio); - if (!attempt_back_merge(q, req)) - elv_merged_request(q, req, el_ret); + /* + * Check if we can merge with the plugged list before grabbing + * any locks. 
+ */ + if (attempt_plug_merge(current, q, bio)) goto out; - case ELEVATOR_FRONT_MERGE: - BUG_ON(!rq_mergeable(req)); - - if (!ll_front_merge_fn(q, req, bio)) - break; - - trace_block_bio_frontmerge(q, bio); + spin_lock_irq(q->queue_lock); - if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) { - blk_rq_set_mixed_merge(req); - req->cmd_flags &= ~REQ_FAILFAST_MASK; - req->cmd_flags |= ff; + el_ret = elv_merge(q, &req, bio); + if (el_ret == ELEVATOR_BACK_MERGE) { + BUG_ON(req->cmd_flags & REQ_ON_PLUG); + if (bio_attempt_back_merge(q, req, bio)) { + if (!attempt_back_merge(q, req)) + elv_merged_request(q, req, el_ret); + goto out_unlock; + } + } else if (el_ret == ELEVATOR_FRONT_MERGE) { + BUG_ON(req->cmd_flags & REQ_ON_PLUG); + if (bio_attempt_front_merge(q, req, bio)) { + if (!attempt_front_merge(q, req)) + elv_merged_request(q, req, el_ret); + goto out_unlock; } - - bio->bi_next = req->bio; - req->bio = bio; - - /* - * may not be valid. if the low level driver said - * it didn't need a bounce buffer then it better - * not touch req->buffer either... - */ - req->buffer = bio_data(bio); - req->__sector = bio->bi_sector; - req->__data_len += bytes; - req->ioprio = ioprio_best(req->ioprio, prio); - if (!blk_rq_cpu_valid(req)) - req->cpu = bio->bi_comp_cpu; - drive_stat_acct(req, 0); - elv_bio_merged(q, req, bio); - if (!attempt_front_merge(q, req)) - elv_merged_request(q, req, el_ret); - goto out; - - /* ELV_NO_MERGE: elevator says don't/can't merge. */ - default: - ; } get_rq: @@ -1356,20 +1416,35 @@ get_rq: */ init_request_from_bio(req, bio); - spin_lock_irq(q->queue_lock); if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || - bio_flagged(bio, BIO_CPU_AFFINE)) - req->cpu = blk_cpu_to_group(smp_processor_id()); - if (queue_should_plug(q) && elv_queue_empty(q)) - blk_plug_device(q); - - /* insert the request into the elevator */ - drive_stat_acct(req, 1); - __elv_add_request(q, req, where, 0); + bio_flagged(bio, BIO_CPU_AFFINE)) { + req->cpu = blk_cpu_to_group(get_cpu()); + put_cpu(); + } + + plug = current->plug; + if (plug && !sync) { + if (!plug->should_sort && !list_empty(&plug->list)) { + struct request *__rq; + + __rq = list_entry_rq(plug->list.prev); + if (__rq->q != q) + plug->should_sort = 1; + } + /* + * Debug flag, kill later + */ + req->cmd_flags |= REQ_ON_PLUG; + list_add_tail(&req->queuelist, &plug->list); + drive_stat_acct(req, 1); + } else { + spin_lock_irq(q->queue_lock); + add_acct_request(q, req, where); + __blk_run_queue(q); +out_unlock: + spin_unlock_irq(q->queue_lock); + } out: - if (unplug || !queue_should_plug(q)) - __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); return 0; } @@ -1772,9 +1847,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) */ BUG_ON(blk_queued_rq(rq)); - drive_stat_acct(rq, 1); - __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); - + add_acct_request(q, rq, ELEVATOR_INSERT_BACK); spin_unlock_irqrestore(q->queue_lock, flags); return 0; @@ -2659,6 +2732,106 @@ int kblockd_schedule_delayed_work(struct request_queue *q, } EXPORT_SYMBOL(kblockd_schedule_delayed_work); +#define PLUG_MAGIC 0x91827364 + +void blk_start_plug(struct blk_plug *plug) +{ + struct task_struct *tsk = current; + + plug->magic = PLUG_MAGIC; + INIT_LIST_HEAD(&plug->list); + plug->should_sort = 0; + + /* + * If this is a nested plug, don't actually assign it. It will be + * flushed on its own. 
+ */ + if (!tsk->plug) { + /* + * Store ordering should not be needed here, since a potential + * preempt will imply a full memory barrier + */ + tsk->plug = plug; + } +} +EXPORT_SYMBOL(blk_start_plug); + +static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct request *rqa = container_of(a, struct request, queuelist); + struct request *rqb = container_of(b, struct request, queuelist); + + return !(rqa->q == rqb->q); +} + +static void flush_plug_list(struct blk_plug *plug) +{ + struct request_queue *q; + unsigned long flags; + struct request *rq; + + BUG_ON(plug->magic != PLUG_MAGIC); + + if (list_empty(&plug->list)) + return; + + if (plug->should_sort) + list_sort(NULL, &plug->list, plug_rq_cmp); + + q = NULL; + local_irq_save(flags); + while (!list_empty(&plug->list)) { + rq = list_entry_rq(plug->list.next); + list_del_init(&rq->queuelist); + BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG)); + BUG_ON(!rq->q); + if (rq->q != q) { + if (q) { + __blk_run_queue(q); + spin_unlock(q->queue_lock); + } + q = rq->q; + spin_lock(q->queue_lock); + } + rq->cmd_flags &= ~REQ_ON_PLUG; + + /* + * rq is already accounted, so use raw insert + */ + __elv_add_request(q, rq, ELEVATOR_INSERT_SORT, 0); + } + + if (q) { + __blk_run_queue(q); + spin_unlock(q->queue_lock); + } + + BUG_ON(!list_empty(&plug->list)); + local_irq_restore(flags); +} + +static void __blk_finish_plug(struct task_struct *tsk, struct blk_plug *plug) +{ + flush_plug_list(plug); + + if (plug == tsk->plug) + tsk->plug = NULL; +} + +void blk_finish_plug(struct blk_plug *plug) +{ + if (plug) + __blk_finish_plug(current, plug); +} +EXPORT_SYMBOL(blk_finish_plug); + +void __blk_flush_plug(struct task_struct *tsk, struct blk_plug *plug) +{ + __blk_finish_plug(tsk, plug); + tsk->plug = plug; +} +EXPORT_SYMBOL(__blk_flush_plug); + int __init blk_dev_init(void) { BUILD_BUG_ON(__REQ_NR_BITS > 8 * diff --git a/block/blk-flush.c b/block/blk-flush.c index a867e3f524f3..1e2aa8a8908c 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -264,10 +264,9 @@ static bool blk_kick_flush(struct request_queue *q) static void flush_data_end_io(struct request *rq, int error) { struct request_queue *q = rq->q; - bool was_empty = elv_queue_empty(q); /* after populating an empty queue, kick it to avoid stall */ - if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error) && was_empty) + if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) __blk_run_queue(q); } diff --git a/block/elevator.c b/block/elevator.c index f98e92edc937..25713927c0d3 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -113,7 +113,7 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_rq_merge_ok); -static inline int elv_try_merge(struct request *__rq, struct bio *bio) +int elv_try_merge(struct request *__rq, struct bio *bio) { int ret = ELEVATOR_NO_MERGE; @@ -421,6 +421,8 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) struct list_head *entry; int stop_flags; + BUG_ON(rq->cmd_flags & REQ_ON_PLUG); + if (q->last_merge == rq) q->last_merge = NULL; @@ -696,6 +698,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) void __elv_add_request(struct request_queue *q, struct request *rq, int where, int plug) { + BUG_ON(rq->cmd_flags & REQ_ON_PLUG); + if (rq->cmd_flags & REQ_SOFTBARRIER) { /* barriers are scheduling boundary, update end_sector */ if (rq->cmd_type == REQ_TYPE_FS || -- cgit v1.2.2 From 7eaceaccab5f40bbfda044629a6298616aeaed50 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Mar 2011 
08:52:07 +0100 Subject: block: remove per-queue plugging Code has been converted over to the new explicit on-stack plugging, and delay users have been converted to use the new API for that. So lets kill off the old plugging along with aops->sync_page(). Signed-off-by: Jens Axboe --- block/blk-core.c | 173 ++++++----------------------------------------- block/blk-exec.c | 4 +- block/blk-flush.c | 3 +- block/blk-settings.c | 8 --- block/blk-throttle.c | 1 - block/blk.h | 2 - block/cfq-iosched.c | 8 --- block/deadline-iosched.c | 9 --- block/elevator.c | 43 +----------- block/noop-iosched.c | 8 --- 10 files changed, 26 insertions(+), 233 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 6efb55cc5af0..82a45898ba76 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -198,6 +198,19 @@ void blk_dump_rq_flags(struct request *rq, char *msg) } EXPORT_SYMBOL(blk_dump_rq_flags); +/* + * Make sure that plugs that were pending when this function was entered, + * are now complete and requests pushed to the queue. +*/ +static inline void queue_sync_plugs(struct request_queue *q) +{ + /* + * If the current process is plugged and has barriers submitted, + * we will livelock if we don't unplug first. + */ + blk_flush_plug(current); +} + static void blk_delay_work(struct work_struct *work) { struct request_queue *q; @@ -224,137 +237,6 @@ void blk_delay_queue(struct request_queue *q, unsigned long msecs) } EXPORT_SYMBOL(blk_delay_queue); -/* - * "plug" the device if there are no outstanding requests: this will - * force the transfer to start only after we have put all the requests - * on the list. - * - * This is called with interrupts off and no requests on the queue and - * with the queue lock held. - */ -void blk_plug_device(struct request_queue *q) -{ - WARN_ON(!irqs_disabled()); - - /* - * don't plug a stopped queue, it must be paired with blk_start_queue() - * which will restart the queueing - */ - if (blk_queue_stopped(q)) - return; - - if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { - mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); - trace_block_plug(q); - } -} -EXPORT_SYMBOL(blk_plug_device); - -/** - * blk_plug_device_unlocked - plug a device without queue lock held - * @q: The &struct request_queue to plug - * - * Description: - * Like @blk_plug_device(), but grabs the queue lock and disables - * interrupts. - **/ -void blk_plug_device_unlocked(struct request_queue *q) -{ - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - blk_plug_device(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} -EXPORT_SYMBOL(blk_plug_device_unlocked); - -/* - * remove the queue from the plugged list, if present. called with - * queue lock held and interrupts disabled. - */ -int blk_remove_plug(struct request_queue *q) -{ - WARN_ON(!irqs_disabled()); - - if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) - return 0; - - del_timer(&q->unplug_timer); - return 1; -} -EXPORT_SYMBOL(blk_remove_plug); - -/* - * remove the plug and let it rip.. - */ -void __generic_unplug_device(struct request_queue *q) -{ - if (unlikely(blk_queue_stopped(q))) - return; - if (!blk_remove_plug(q) && !blk_queue_nonrot(q)) - return; - - q->request_fn(q); -} - -/** - * generic_unplug_device - fire a request queue - * @q: The &struct request_queue in question - * - * Description: - * Linux uses plugging to build bigger requests queues before letting - * the device have at them. 
If a queue is plugged, the I/O scheduler - * is still adding and merging requests on the queue. Once the queue - * gets unplugged, the request_fn defined for the queue is invoked and - * transfers started. - **/ -void generic_unplug_device(struct request_queue *q) -{ - if (blk_queue_plugged(q)) { - spin_lock_irq(q->queue_lock); - __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); - } -} -EXPORT_SYMBOL(generic_unplug_device); - -static void blk_backing_dev_unplug(struct backing_dev_info *bdi, - struct page *page) -{ - struct request_queue *q = bdi->unplug_io_data; - - blk_unplug(q); -} - -void blk_unplug_work(struct work_struct *work) -{ - struct request_queue *q = - container_of(work, struct request_queue, unplug_work); - - trace_block_unplug_io(q); - q->unplug_fn(q); -} - -void blk_unplug_timeout(unsigned long data) -{ - struct request_queue *q = (struct request_queue *)data; - - trace_block_unplug_timer(q); - kblockd_schedule_work(q, &q->unplug_work); -} - -void blk_unplug(struct request_queue *q) -{ - /* - * devices don't necessarily have an ->unplug_fn defined - */ - if (q->unplug_fn) { - trace_block_unplug_io(q); - q->unplug_fn(q); - } -} -EXPORT_SYMBOL(blk_unplug); - /** * blk_start_queue - restart a previously stopped queue * @q: The &struct request_queue in question @@ -389,7 +271,6 @@ EXPORT_SYMBOL(blk_start_queue); **/ void blk_stop_queue(struct request_queue *q) { - blk_remove_plug(q); cancel_delayed_work(&q->delay_work); queue_flag_set(QUEUE_FLAG_STOPPED, q); } @@ -411,11 +292,10 @@ EXPORT_SYMBOL(blk_stop_queue); */ void blk_sync_queue(struct request_queue *q) { - del_timer_sync(&q->unplug_timer); del_timer_sync(&q->timeout); - cancel_work_sync(&q->unplug_work); throtl_shutdown_timer_wq(q); cancel_delayed_work_sync(&q->delay_work); + queue_sync_plugs(q); } EXPORT_SYMBOL(blk_sync_queue); @@ -430,14 +310,9 @@ EXPORT_SYMBOL(blk_sync_queue); */ void __blk_run_queue(struct request_queue *q) { - blk_remove_plug(q); - if (unlikely(blk_queue_stopped(q))) return; - if (elv_queue_empty(q)) - return; - /* * Only recurse once to avoid overrunning the stack, let the unplug * handling reinvoke the handler shortly if we already got there. 
@@ -445,10 +320,8 @@ void __blk_run_queue(struct request_queue *q) if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { q->request_fn(q); queue_flag_clear(QUEUE_FLAG_REENTER, q); - } else { - queue_flag_set(QUEUE_FLAG_PLUGGED, q); - kblockd_schedule_work(q, &q->unplug_work); - } + } else + queue_delayed_work(kblockd_workqueue, &q->delay_work, 0); } EXPORT_SYMBOL(__blk_run_queue); @@ -535,8 +408,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q) return NULL; - q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; - q->backing_dev_info.unplug_io_data = q; q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; q->backing_dev_info.state = 0; @@ -556,13 +427,11 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); - init_timer(&q->unplug_timer); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); INIT_LIST_HEAD(&q->timeout_list); INIT_LIST_HEAD(&q->flush_queue[0]); INIT_LIST_HEAD(&q->flush_queue[1]); INIT_LIST_HEAD(&q->flush_data_in_flight); - INIT_WORK(&q->unplug_work, blk_unplug_work); INIT_DELAYED_WORK(&q->delay_work, blk_delay_work); kobject_init(&q->kobj, &blk_queue_ktype); @@ -652,7 +521,6 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, q->request_fn = rfn; q->prep_rq_fn = NULL; q->unprep_rq_fn = NULL; - q->unplug_fn = generic_unplug_device; q->queue_flags = QUEUE_FLAG_DEFAULT; q->queue_lock = lock; @@ -910,8 +778,8 @@ out: } /* - * No available requests for this queue, unplug the device and wait for some - * requests to become available. + * No available requests for this queue, wait for some requests to become + * available. * * Called with q->queue_lock held, and returns with it unlocked. 
*/ @@ -932,7 +800,6 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, trace_block_sleeprq(q, bio, rw_flags & 1); - __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); io_schedule(); @@ -1058,7 +925,7 @@ static void add_acct_request(struct request_queue *q, struct request *rq, int where) { drive_stat_acct(rq, 1); - __elv_add_request(q, rq, where, 0); + __elv_add_request(q, rq, where); } /** @@ -2798,7 +2665,7 @@ static void flush_plug_list(struct blk_plug *plug) /* * rq is already accounted, so use raw insert */ - __elv_add_request(q, rq, ELEVATOR_INSERT_SORT, 0); + __elv_add_request(q, rq, ELEVATOR_INSERT_SORT); } if (q) { diff --git a/block/blk-exec.c b/block/blk-exec.c index cf1456a02acd..81e31819a597 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -54,8 +54,8 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq->end_io = done; WARN_ON(irqs_disabled()); spin_lock_irq(q->queue_lock); - __elv_add_request(q, rq, where, 1); - __generic_unplug_device(q); + __elv_add_request(q, rq, where); + __blk_run_queue(q); /* the queue is stopped so it won't be plugged+unplugged */ if (rq->cmd_type == REQ_TYPE_PM_RESUME) q->request_fn(q); diff --git a/block/blk-flush.c b/block/blk-flush.c index 1e2aa8a8908c..671fa9da7560 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -194,7 +194,6 @@ static void flush_end_io(struct request *flush_rq, int error) { struct request_queue *q = flush_rq->q; struct list_head *running = &q->flush_queue[q->flush_running_idx]; - bool was_empty = elv_queue_empty(q); bool queued = false; struct request *rq, *n; @@ -213,7 +212,7 @@ static void flush_end_io(struct request *flush_rq, int error) } /* after populating an empty queue, kick it to avoid stall */ - if (queued && was_empty) + if (queued) __blk_run_queue(q); } diff --git a/block/blk-settings.c b/block/blk-settings.c index 36c8c1f2af18..c8d68921dddb 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -164,14 +164,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) blk_queue_congestion_threshold(q); q->nr_batching = BLK_BATCH_REQ; - q->unplug_thresh = 4; /* hmm */ - q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */ - if (q->unplug_delay == 0) - q->unplug_delay = 1; - - q->unplug_timer.function = blk_unplug_timeout; - q->unplug_timer.data = (unsigned long)q; - blk_set_default_limits(&q->limits); blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a89043a3caa4..b8dcdc2663a1 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -800,7 +800,6 @@ out: if (nr_disp) { while((bio = bio_list_pop(&bio_list_on_stack))) generic_make_request(bio); - blk_unplug(q); } return nr_disp; } diff --git a/block/blk.h b/block/blk.h index 284b500852bd..49d21af81d07 100644 --- a/block/blk.h +++ b/block/blk.h @@ -18,8 +18,6 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq, void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); -void blk_unplug_work(struct work_struct *work); -void blk_unplug_timeout(unsigned long data); void blk_rq_timed_out_timer(unsigned long data); void blk_delete_timer(struct request *); void blk_add_timer(struct request *); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3202c7e87fb3..ef631539dd2a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -499,13 +499,6 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) } } -static 
int cfq_queue_empty(struct request_queue *q) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - - return !cfqd->rq_queued; -} - /* * Scale schedule slice based on io priority. Use the sync time slice only * if a queue is marked sync and has sync io queued. A sync queue with async @@ -4061,7 +4054,6 @@ static struct elevator_type iosched_cfq = { .elevator_add_req_fn = cfq_insert_request, .elevator_activate_req_fn = cfq_activate_request, .elevator_deactivate_req_fn = cfq_deactivate_request, - .elevator_queue_empty_fn = cfq_queue_empty, .elevator_completed_req_fn = cfq_completed_request, .elevator_former_req_fn = elv_rb_former_request, .elevator_latter_req_fn = elv_rb_latter_request, diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index b547cbca7b23..5139c0ea1864 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -326,14 +326,6 @@ dispatch_request: return 1; } -static int deadline_queue_empty(struct request_queue *q) -{ - struct deadline_data *dd = q->elevator->elevator_data; - - return list_empty(&dd->fifo_list[WRITE]) - && list_empty(&dd->fifo_list[READ]); -} - static void deadline_exit_queue(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; @@ -445,7 +437,6 @@ static struct elevator_type iosched_deadline = { .elevator_merge_req_fn = deadline_merged_requests, .elevator_dispatch_fn = deadline_dispatch_requests, .elevator_add_req_fn = deadline_add_request, - .elevator_queue_empty_fn = deadline_queue_empty, .elevator_former_req_fn = elv_rb_former_request, .elevator_latter_req_fn = elv_rb_latter_request, .elevator_init_fn = deadline_init_queue, diff --git a/block/elevator.c b/block/elevator.c index 25713927c0d3..3ea208256e78 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -619,21 +619,12 @@ void elv_quiesce_end(struct request_queue *q) void elv_insert(struct request_queue *q, struct request *rq, int where) { - int unplug_it = 1; - trace_block_rq_insert(q, rq); rq->q = q; switch (where) { case ELEVATOR_INSERT_REQUEUE: - /* - * Most requeues happen because of a busy condition, - * don't force unplug of the queue for that case. - * Clear unplug_it and fall through. 
- */ - unplug_it = 0; - case ELEVATOR_INSERT_FRONT: rq->cmd_flags |= REQ_SOFTBARRIER; list_add(&rq->queuelist, &q->queue_head); @@ -679,24 +670,14 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) rq->cmd_flags |= REQ_SOFTBARRIER; blk_insert_flush(rq); break; - default: printk(KERN_ERR "%s: bad insertion point %d\n", __func__, where); BUG(); } - - if (unplug_it && blk_queue_plugged(q)) { - int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC] - - queue_in_flight(q); - - if (nrq >= q->unplug_thresh) - __generic_unplug_device(q); - } } -void __elv_add_request(struct request_queue *q, struct request *rq, int where, - int plug) +void __elv_add_request(struct request_queue *q, struct request *rq, int where) { BUG_ON(rq->cmd_flags & REQ_ON_PLUG); @@ -711,38 +692,20 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where, where == ELEVATOR_INSERT_SORT) where = ELEVATOR_INSERT_BACK; - if (plug) - blk_plug_device(q); - elv_insert(q, rq, where); } EXPORT_SYMBOL(__elv_add_request); -void elv_add_request(struct request_queue *q, struct request *rq, int where, - int plug) +void elv_add_request(struct request_queue *q, struct request *rq, int where) { unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); - __elv_add_request(q, rq, where, plug); + __elv_add_request(q, rq, where); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL(elv_add_request); -int elv_queue_empty(struct request_queue *q) -{ - struct elevator_queue *e = q->elevator; - - if (!list_empty(&q->queue_head)) - return 0; - - if (e->ops->elevator_queue_empty_fn) - return e->ops->elevator_queue_empty_fn(q); - - return 1; -} -EXPORT_SYMBOL(elv_queue_empty); - struct request *elv_latter_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 232c4b38cd37..06389e9ef96d 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -39,13 +39,6 @@ static void noop_add_request(struct request_queue *q, struct request *rq) list_add_tail(&rq->queuelist, &nd->queue); } -static int noop_queue_empty(struct request_queue *q) -{ - struct noop_data *nd = q->elevator->elevator_data; - - return list_empty(&nd->queue); -} - static struct request * noop_former_request(struct request_queue *q, struct request *rq) { @@ -90,7 +83,6 @@ static struct elevator_type elevator_noop = { .elevator_merge_req_fn = noop_merged_requests, .elevator_dispatch_fn = noop_dispatch, .elevator_add_req_fn = noop_add_request, - .elevator_queue_empty_fn = noop_queue_empty, .elevator_former_req_fn = noop_former_request, .elevator_latter_req_fn = noop_latter_request, .elevator_init_fn = noop_init_queue, -- cgit v1.2.2 From 721a9602e6607417c6bc15b18e97a2f35266c690 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Mar 2011 11:56:30 +0100 Subject: block: kill off REQ_UNPLUG With the plugging now being explicitly controlled by the submitter, callers need not pass down unplugging hints to the block layer. If they want to unplug, it's because they manually plugged on their own - in which case, they should just unplug at will. 
Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 82a45898ba76..7e9715ae18c8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1290,7 +1290,7 @@ get_rq: } plug = current->plug; - if (plug && !sync) { + if (plug) { if (!plug->should_sort && !list_empty(&plug->list)) { struct request *__rq; -- cgit v1.2.2 From 69d60eb96ae8a73cf9b79cf28051caf973006011 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 9 Mar 2011 08:27:37 +0100 Subject: blk-throttle: Use blk_plug in throttle dispatch Use plug in throttle dispatch also as we are dispatching a bunch of bios in throttle context and some of them might merge. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b8dcdc2663a1..658ee505315b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -770,6 +770,7 @@ static int throtl_dispatch(struct request_queue *q) unsigned int nr_disp = 0; struct bio_list bio_list_on_stack; struct bio *bio; + struct blk_plug plug; spin_lock_irq(q->queue_lock); @@ -798,8 +799,10 @@ out: * immediate dispatch */ if (nr_disp) { + blk_start_plug(&plug); while((bio = bio_list_pop(&bio_list_on_stack))) generic_make_request(bio); + blk_finish_plug(&plug); } return nr_disp; } -- cgit v1.2.2 From 0aeea18964173715a1037034ef6838198f319319 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 11 Mar 2011 10:23:53 +0100 Subject: block: fix mis-synchronisation in blkdev_issue_zeroout() BZ29402 https://bugzilla.kernel.org/show_bug.cgi?id=29402 We can hit a serious mis-synchronization in the bio completion path of blkdev_issue_zeroout() leading to a panic. The problem is that when we are going to wait_for_completion() in blkdev_issue_zeroout() we check if bb.done equals issued (the number of submitted bios). If it does, we can skip the wait_for_completion() and just exit the function since there is nothing to wait for. However, there is an ordering problem because bio_batch_end_io() is calling atomic_inc(&bb->done) before complete(), hence it might seem to blkdev_issue_zeroout() that all bios have been completed, and it exits. At this point, when bio_batch_end_io() is going to call complete(bb->wait), bb and wait no longer exist since they were allocated on the stack in blkdev_issue_zeroout() ==> panic!

(thread 1)                           (thread 2)
bio_batch_end_io()                   blkdev_issue_zeroout()
  if(bb) {                           ...
    if (bb->end_io)                  ...
      bb->end_io(bio, err);          ...
    atomic_inc(&bb->done);           ...
                                     ...
                                     while (issued != atomic_read(&bb.done))
                                     ...
                                     (let issued == bb.done)
                                     ...
                                     (do the rest of the function)
                                     ...
                                     return ret;
    complete(bb->wait);
    ^^^^^^^^ panic

We can fix this easily by simplifying bio_batch and completion counting. Also remove bio_end_io_t *end_io since it is not used.
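In other words, the fix turns bb.done into a plain reference count: the submitter holds one reference of its own, every submitted bio adds one, and only the final drop signals the completion. A stripped-down sketch of that pattern (illustrative names, not the blk-lib code itself):

	#include <linux/atomic.h>
	#include <linux/completion.h>

	struct batch {
		atomic_t done;
		struct completion *wait;
	};

	static void batch_end_io(struct batch *b)		/* per-bio completion */
	{
		if (atomic_dec_and_test(&b->done))		/* last outstanding reference */
			complete(b->wait);
	}

	static void batch_submit_and_wait(struct batch *b, int nr)
	{
		DECLARE_COMPLETION_ONSTACK(wait);
		int i;

		b->wait = &wait;
		atomic_set(&b->done, 1);			/* the submitter's own reference */
		for (i = 0; i < nr; i++) {
			atomic_inc(&b->done);			/* one per submitted bio */
			/* submit bio i here; its end_io calls batch_end_io(b) */
		}
		if (!atomic_dec_and_test(&b->done))		/* drop our reference */
			wait_for_completion(&wait);		/* bios still in flight */
	}

Because the count can only reach zero after the submitter has dropped its own reference, complete() can no longer fire against a stack frame that has already gone away.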
Signed-off-by: Lukas Czerner Reported-by: Eric Whitney Tested-by: Eric Whitney Reviewed-by: Jeff Moyer CC: Dmitry Monakhov Signed-off-by: Jens Axboe --- block/blk-lib.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index eec78becb355..bd3e8df4d5e2 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -109,7 +109,6 @@ struct bio_batch atomic_t done; unsigned long flags; struct completion *wait; - bio_end_io_t *end_io; }; static void bio_batch_end_io(struct bio *bio, int err) @@ -122,12 +121,9 @@ static void bio_batch_end_io(struct bio *bio, int err) else clear_bit(BIO_UPTODATE, &bb->flags); } - if (bb) { - if (bb->end_io) - bb->end_io(bio, err); - atomic_inc(&bb->done); - complete(bb->wait); - } + if (bb) + if (atomic_dec_and_test(&bb->done)) + complete(bb->wait); bio_put(bio); } @@ -150,13 +146,12 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, int ret; struct bio *bio; struct bio_batch bb; - unsigned int sz, issued = 0; + unsigned int sz; DECLARE_COMPLETION_ONSTACK(wait); - atomic_set(&bb.done, 0); + atomic_set(&bb.done, 1); bb.flags = 1 << BIO_UPTODATE; bb.wait = &wait; - bb.end_io = NULL; submit: ret = 0; @@ -185,12 +180,12 @@ submit: break; } ret = 0; - issued++; + atomic_inc(&bb.done); submit_bio(WRITE, bio); } /* Wait for bios in-flight */ - while (issued != atomic_read(&bb.done)) + if (!atomic_dec_and_test(&bb.done)) wait_for_completion(&wait); if (!test_bit(BIO_UPTODATE, &bb.flags)) -- cgit v1.2.2 From eba2ed9c9636d9e7ed203a2498ed230b48341709 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 11 Mar 2011 20:13:54 +0100 Subject: block: remove obsolete comments for blkdev_issue_zeroout. Barrier is already removed, so remove the obsolete comments in blkdev_issue_zeroout. Cc: Jens Axboe Signed-off-by: Tao Ma Signed-off-by: Jens Axboe --- block/blk-lib.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index eec78becb355..cb56c4e5b494 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -140,8 +140,6 @@ static void bio_batch_end_io(struct bio *bio, int err) * * Description: * Generate and issue number of bios with zerofiled pages. - * Send barrier at the beginning and at the end if requested. This guarantie - * correct request ordering. Empty barrier allow us to avoid post queue flush. */ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, -- cgit v1.2.2 From 167400d34070ebbc408dc0f447c4ddb4bf837360 Mon Sep 17 00:00:00 2001 From: Justin TerAvest Date: Sat, 12 Mar 2011 16:54:00 +0100 Subject: blk-cgroup: Add unaccounted time to timeslice_used. There are two kinds of times that tasks are not charged for: the first seek and the extra time slice used over the allocated timeslice. Both of these are exported as a new unaccounted_time stat. I think it would be good to have this reported in 'time' as well, but that is probably a separate discussion.
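To make the two components concrete, the charged and unaccounted portions of one timeslice roughly come out as below (illustrative helper only, not the cfq-iosched code; all values in jiffies):

	/* Sketch of the accounting rule described above. */
	static unsigned int slice_usage(unsigned long dispatch_start,
					unsigned long slice_start,
					unsigned long now,
					unsigned int allocated_slice,
					unsigned int *unaccounted)
	{
		unsigned int used = now - slice_start;

		*unaccounted = 0;
		if (used > allocated_slice) {
			*unaccounted = used - allocated_slice;	/* overrun past the allocated slice */
			used = allocated_slice;			/* only this much is charged */
		}
		if (slice_start > dispatch_start)
			*unaccounted += slice_start - dispatch_start;	/* e.g. the first seek */

		return used;	/* goes into the existing "time" stat */
	}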
Signed-off-by: Justin TerAvest Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 16 +++++++++++++++- block/blk-cgroup.h | 12 ++++++++++-- block/cfq-iosched.c | 21 +++++++++++++-------- block/cfq.h | 6 +++--- 4 files changed, 41 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 455768a3eb9e..77ee3c1ec1a7 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -371,12 +371,14 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg, } EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); -void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) +void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, + unsigned long unaccounted_time) { unsigned long flags; spin_lock_irqsave(&blkg->stats_lock, flags); blkg->stats.time += time; + blkg->stats.unaccounted_time += unaccounted_time; spin_unlock_irqrestore(&blkg->stats_lock, flags); } EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); @@ -603,6 +605,9 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, if (type == BLKIO_STAT_SECTORS) return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, blkg->stats.sectors, cb, dev); + if (type == BLKIO_STAT_UNACCOUNTED_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.unaccounted_time, cb, dev); #ifdef CONFIG_DEBUG_BLK_CGROUP if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { uint64_t sum = blkg->stats.avg_queue_size_sum; @@ -1106,6 +1111,9 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, case BLKIO_PROP_sectors: return blkio_read_blkg_stats(blkcg, cft, cb, BLKIO_STAT_SECTORS, 0); + case BLKIO_PROP_unaccounted_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_UNACCOUNTED_TIME, 0); case BLKIO_PROP_io_service_bytes: return blkio_read_blkg_stats(blkcg, cft, cb, BLKIO_STAT_SERVICE_BYTES, 1); @@ -1261,6 +1269,12 @@ struct cftype blkio_files[] = { BLKIO_PROP_sectors), .read_map = blkiocg_file_read_map, }, + { + .name = "unaccounted_time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_unaccounted_time), + .read_map = blkiocg_file_read_map, + }, { .name = "io_service_bytes", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 57e7234c5ae5..10919fae2d3a 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -49,6 +49,8 @@ enum stat_type { /* All the single valued stats go below this */ BLKIO_STAT_TIME, BLKIO_STAT_SECTORS, + /* Time not charged to this cgroup */ + BLKIO_STAT_UNACCOUNTED_TIME, #ifdef CONFIG_DEBUG_BLK_CGROUP BLKIO_STAT_AVG_QUEUE_SIZE, BLKIO_STAT_IDLE_TIME, @@ -81,6 +83,7 @@ enum blkcg_file_name_prop { BLKIO_PROP_io_serviced, BLKIO_PROP_time, BLKIO_PROP_sectors, + BLKIO_PROP_unaccounted_time, BLKIO_PROP_io_service_time, BLKIO_PROP_io_wait_time, BLKIO_PROP_io_merged, @@ -114,6 +117,8 @@ struct blkio_group_stats { /* total disk time and nr sectors dispatched by this group */ uint64_t time; uint64_t sectors; + /* Time not charged to this cgroup */ + uint64_t unaccounted_time; uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; #ifdef CONFIG_DEBUG_BLK_CGROUP /* Sum of number of IOs queued across all samples */ @@ -293,7 +298,8 @@ extern int blkiocg_del_blkio_group(struct blkio_group *blkg); extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); void blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time); + unsigned long time, + unsigned long unaccounted_time); void blkiocg_update_dispatch_stats(struct 
blkio_group *blkg, uint64_t bytes, bool direction, bool sync); void blkiocg_update_completion_stats(struct blkio_group *blkg, @@ -319,7 +325,9 @@ blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } static inline struct blkio_group * blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time) {} + unsigned long time, + unsigned long unaccounted_time) +{} static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, bool direction, bool sync) {} static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c826ef81c679..89e0d1cc14ba 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -899,7 +899,8 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); } -static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) +static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq, + unsigned int *unaccounted_time) { unsigned int slice_used; @@ -918,8 +919,13 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 1); } else { slice_used = jiffies - cfqq->slice_start; - if (slice_used > cfqq->allocated_slice) + if (slice_used > cfqq->allocated_slice) { + *unaccounted_time = slice_used - cfqq->allocated_slice; slice_used = cfqq->allocated_slice; + } + if (time_after(cfqq->slice_start, cfqq->dispatch_start)) + *unaccounted_time += cfqq->slice_start - + cfqq->dispatch_start; } return slice_used; @@ -929,12 +935,12 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, struct cfq_queue *cfqq) { struct cfq_rb_root *st = &cfqd->grp_service_tree; - unsigned int used_sl, charge; + unsigned int used_sl, charge, unaccounted_sl = 0; int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) - cfqg->service_tree_idle.count; BUG_ON(nr_sync < 0); - used_sl = charge = cfq_cfqq_slice_usage(cfqq); + used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); if (iops_mode(cfqd)) charge = cfqq->slice_dispatch; @@ -960,7 +966,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" " sect=%u", used_sl, cfqq->slice_dispatch, charge, iops_mode(cfqd), cfqq->nr_sectors); - cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); + cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl, + unaccounted_sl); cfq_blkiocg_set_start_empty_time(&cfqg->blkg); } @@ -3296,9 +3303,7 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!cfq_cfqq_on_rr(cfqq)); cfq_service_tree_add(cfqd, cfqq, 1); - - cfqq->slice_end = 0; - cfq_mark_cfqq_slice_new(cfqq); + __cfq_set_active_queue(cfqd, cfqq); } /* diff --git a/block/cfq.h b/block/cfq.h index 54a6d90f8e8c..2a155927e37c 100644 --- a/block/cfq.h +++ b/block/cfq.h @@ -16,9 +16,9 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, } static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time) + unsigned long time, unsigned long unaccounted_time) { - blkiocg_update_timeslice_used(blkg, time); + blkiocg_update_timeslice_used(blkg, time, unaccounted_time); } static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) @@ -85,7 +85,7 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct 
blkio_group *blkg, unsigned long dequeue) {} static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, - unsigned long time) {} + unsigned long time, unsigned long unaccounted_time) {} static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, bool direction, bool sync) {} -- cgit v1.2.2 From 8184f93eced1e304721c2a55c00d87d5a14f8907 Mon Sep 17 00:00:00 2001 From: Justin TerAvest Date: Thu, 17 Mar 2011 16:12:36 +0100 Subject: cfq-iosched: Don't update group weights when on service tree Version 3 is updated to apply to for-2.6.39/core. For version 2, I took Vivek's advice and made sure we update the group weight from cfq_group_service_tree_add(). If a weight was updated while a group is on the service tree, the calculation for the total weight of the service tree can be adjusted improperly, which either leads to bad service tree weights, or potentially crashes (if total_weight becomes 0). This patch defers updates to the weight until a group is off the service tree. Signed-off-by: Justin TerAvest Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 53 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 89e0d1cc14ba..12e380b2c4e4 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -178,6 +178,8 @@ struct cfq_group { /* group service_tree key */ u64 vdisktime; unsigned int weight; + unsigned int new_weight; + bool needs_update; /* number of cfqq currently on this group */ int nr_cfqq; @@ -853,7 +855,27 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) } static void -cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) +cfq_update_group_weight(struct cfq_group *cfqg) +{ + BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); + if (cfqg->needs_update) { + cfqg->weight = cfqg->new_weight; + cfqg->needs_update = false; + } +} + +static void +cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); + + cfq_update_group_weight(cfqg); + __cfq_group_service_tree_add(st, cfqg); + st->total_weight += cfqg->weight; +} + +static void +cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) { struct cfq_rb_root *st = &cfqd->grp_service_tree; struct cfq_group *__cfqg; @@ -874,13 +896,19 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; } else cfqg->vdisktime = st->min_vdisktime; + cfq_group_service_tree_add(st, cfqg); +} - __cfq_group_service_tree_add(st, cfqg); - st->total_weight += cfqg->weight; +static void +cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + st->total_weight -= cfqg->weight; + if (!RB_EMPTY_NODE(&cfqg->rb_node)) + cfq_rb_erase(&cfqg->rb_node, st); } static void -cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) +cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) { struct cfq_rb_root *st = &cfqd->grp_service_tree; @@ -892,9 +920,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) return; cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); - st->total_weight -= cfqg->weight; - if (!RB_EMPTY_NODE(&cfqg->rb_node)) - cfq_rb_erase(&cfqg->rb_node, st); + cfq_group_service_tree_del(st, cfqg); cfqg->saved_workload_slice = 
0; cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); } @@ -948,9 +974,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, charge = cfqq->allocated_slice; /* Can't update vdisktime while group is on service tree */ - cfq_rb_erase(&cfqg->rb_node, st); + cfq_group_service_tree_del(st, cfqg); cfqg->vdisktime += cfq_scale_slice(charge, cfqg); - __cfq_group_service_tree_add(st, cfqg); + /* If a new weight was requested, update now, off tree */ + cfq_group_service_tree_add(st, cfqg); /* This group is being expired. Save the context */ if (time_after(cfqd->workload_expires, jiffies)) { @@ -982,7 +1009,9 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, unsigned int weight) { - cfqg_of_blkg(blkg)->weight = weight; + struct cfq_group *cfqg = cfqg_of_blkg(blkg); + cfqg->new_weight = weight; + cfqg->needs_update = true; } static struct cfq_group * @@ -1255,7 +1284,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, service_tree->count++; if ((add_front || !new_cfqq) && !group_changed) return; - cfq_group_service_tree_add(cfqd, cfqq->cfqg); + cfq_group_notify_queue_add(cfqd, cfqq->cfqg); } static struct cfq_queue * @@ -1368,7 +1397,7 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfqq->p_root = NULL; } - cfq_group_service_tree_del(cfqd, cfqq->cfqg); + cfq_group_notify_queue_del(cfqd, cfqq->cfqg); BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; if (cfq_cfqq_sync(cfqq)) -- cgit v1.2.2 From 5e84ea3a9c662dc2d7a48703a4468fad954a3b7f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 21 Mar 2011 10:14:27 +0100 Subject: block: attempt to merge with existing requests on plug flush One of the disadvantages of on-stack plugging is that we potentially lose out on merging since all pending IO isn't always visible to everybody. When we flush the on-stack plugs, right now we don't do any checks to see if potential merge candidates could be utilized. Correct this by adding a new insert variant, ELEVATOR_INSERT_SORT_MERGE. It works just ELEVATOR_INSERT_SORT, but first checks whether we can merge with an existing request before doing the insertion (if we fail merging). This fixes a regression with multiple processes issuing IO that can be merged. Thanks to Shaohua Li for testing and fixing an accounting bug. 
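The merge-before-insert idea is easy to sketch outside the block layer: when flushing a batch of pending items, first try to coalesce each one with something already queued, and only fall back to a plain insert when that fails. A minimal userspace sketch, with made-up io_range structures standing in for struct request (nothing below is kernel API):

    #include <stdbool.h>
    #include <stdio.h>

    struct io_range { unsigned long start, len; };   /* stand-in for a request */

    /* Append 'rq' to an existing entry if it is contiguous, like a back merge. */
    static bool try_back_merge(struct io_range *q, int nq, struct io_range rq)
    {
        for (int i = 0; i < nq; i++) {
            if (q[i].start + q[i].len == rq.start) {
                q[i].len += rq.len;        /* merged: rq can be thrown away */
                return true;
            }
        }
        return false;
    }

    /* Flush a "plug" of pending ranges into the queue, merging where possible. */
    static int flush_plug(struct io_range *q, int nq,
                          const struct io_range *plug, int nplug)
    {
        for (int i = 0; i < nplug; i++) {
            if (try_back_merge(q, nq, plug[i]))
                continue;                  /* merge succeeded, nothing to insert */
            q[nq++] = plug[i];             /* fall back to a plain insert */
        }
        return nq;
    }

    int main(void)
    {
        struct io_range q[8] = { { 0, 8 } };
        struct io_range plug[] = { { 8, 8 }, { 100, 4 } };
        int nq = flush_plug(q, 1, plug, 2);

        for (int i = 0; i < nq; i++)
            printf("range %lu+%lu\n", q[i].start, q[i].len);
        return 0;                          /* prints 0+16 and 100+4 */
    }

When the merge succeeds the pending entry is simply dropped, which mirrors how the new ELEVATOR_INSERT_SORT_MERGE case breaks out early and otherwise falls through to the normal sort insertion.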
Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-merge.c | 6 ++++++ block/blk.h | 2 ++ block/elevator.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 58 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index e1fcf7a24668..525693237a4a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2685,7 +2685,7 @@ static void flush_plug_list(struct blk_plug *plug) /* * rq is already accounted, so use raw insert */ - __elv_add_request(q, rq, ELEVATOR_INSERT_SORT); + __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); } if (q) { diff --git a/block/blk-merge.c b/block/blk-merge.c index ea85e20d5e94..cfcc37cb222b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -465,3 +465,9 @@ int attempt_front_merge(struct request_queue *q, struct request *rq) return 0; } + +int blk_attempt_req_merge(struct request_queue *q, struct request *rq, + struct request *next) +{ + return attempt_merge(q, rq, next); +} diff --git a/block/blk.h b/block/blk.h index 49d21af81d07..c8db371a921d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -103,6 +103,8 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio); int attempt_back_merge(struct request_queue *q, struct request *rq); int attempt_front_merge(struct request_queue *q, struct request *rq); +int blk_attempt_req_merge(struct request_queue *q, struct request *rq, + struct request *next); void blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); diff --git a/block/elevator.c b/block/elevator.c index 542ce826b401..c387d3168734 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -521,6 +521,40 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) return ELEVATOR_NO_MERGE; } +/* + * Attempt to do an insertion back merge. Only check for the case where + * we can append 'rq' to an existing request, so we can throw 'rq' away + * afterwards. + * + * Returns true if we merged, false otherwise + */ +static bool elv_attempt_insert_merge(struct request_queue *q, + struct request *rq) +{ + struct request *__rq; + + if (blk_queue_nomerges(q)) + return false; + + /* + * First try one-hit cache. + */ + if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) + return true; + + if (blk_queue_noxmerges(q)) + return false; + + /* + * See if our hash lookup can find a potential backmerge. + */ + __rq = elv_rqhash_find(q, blk_rq_pos(rq)); + if (__rq && blk_attempt_req_merge(q, __rq, rq)) + return true; + + return false; +} + void elv_merged_request(struct request_queue *q, struct request *rq, int type) { struct elevator_queue *e = q->elevator; @@ -538,14 +572,18 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, struct request *next) { struct elevator_queue *e = q->elevator; + const int next_sorted = next->cmd_flags & REQ_SORTED; - if (e->ops->elevator_merge_req_fn) + if (next_sorted && e->ops->elevator_merge_req_fn) e->ops->elevator_merge_req_fn(q, rq, next); elv_rqhash_reposition(q, rq); - elv_rqhash_del(q, next); - q->nr_sorted--; + if (next_sorted) { + elv_rqhash_del(q, next); + q->nr_sorted--; + } + q->last_merge = rq; } @@ -647,6 +685,14 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) __blk_run_queue(q, false); break; + case ELEVATOR_INSERT_SORT_MERGE: + /* + * If we succeed in merging this request with one in the + * queue already, we are done - rq has now been freed, + * so no need to do anything further. 
+ */ + if (elv_attempt_insert_merge(q, rq)) + break; case ELEVATOR_INSERT_SORT: BUG_ON(rq->cmd_type != REQ_TYPE_FS && !(rq->cmd_flags & REQ_DISCARD)); -- cgit v1.2.2 From eda5e0c91fed2d2a38a341b0957263406d330274 Mon Sep 17 00:00:00 2001 From: Justin TerAvest Date: Tue, 22 Mar 2011 21:26:49 +0100 Subject: cfq-iosched: Don't set active queue in preempt Commit "Add unaccounted time to timeslice_used" changed the behavior of cfq_preempt_queue to set cfqq active. Vivek pointed out that other preemption rules might get involved, so we shouldn't manually set which queue is active. This cleans up the code to just clear the queue stats at preemption time. Signed-off-by: Justin TerAvest Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 12e380b2c4e4..69208d732903 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1620,27 +1620,33 @@ static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); } +static void cfq_clear_queue_stats(struct cfq_data *cfqd, + struct cfq_queue *cfqq) +{ + cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); + cfqq->slice_start = 0; + cfqq->dispatch_start = jiffies; + cfqq->allocated_slice = 0; + cfqq->slice_end = 0; + cfqq->slice_dispatch = 0; + cfqq->nr_sectors = 0; + + cfq_clear_cfqq_wait_request(cfqq); + cfq_clear_cfqq_must_dispatch(cfqq); + cfq_clear_cfqq_must_alloc_slice(cfqq); + cfq_clear_cfqq_fifo_expire(cfqq); + cfq_mark_cfqq_slice_new(cfqq); + + cfq_del_timer(cfqd, cfqq); +} + static void __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { if (cfqq) { cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", cfqd->serving_prio, cfqd->serving_type); - cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); - cfqq->slice_start = 0; - cfqq->dispatch_start = jiffies; - cfqq->allocated_slice = 0; - cfqq->slice_end = 0; - cfqq->slice_dispatch = 0; - cfqq->nr_sectors = 0; - - cfq_clear_cfqq_wait_request(cfqq); - cfq_clear_cfqq_must_dispatch(cfqq); - cfq_clear_cfqq_must_alloc_slice(cfqq); - cfq_clear_cfqq_fifo_expire(cfqq); - cfq_mark_cfqq_slice_new(cfqq); - - cfq_del_timer(cfqd, cfqq); + cfq_clear_queue_stats(cfqd, cfqq); } cfqd->active_queue = cfqq; @@ -3332,7 +3338,8 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!cfq_cfqq_on_rr(cfqq)); cfq_service_tree_add(cfqd, cfqq, 1); - __cfq_set_active_queue(cfqd, cfqq); + + cfq_clear_queue_stats(cfqd, cfqq); } /* -- cgit v1.2.2 From 9026e521c0da0731eb31f9f9022dd00cc3cd8885 Mon Sep 17 00:00:00 2001 From: Justin TerAvest Date: Tue, 22 Mar 2011 21:26:54 +0100 Subject: blk-cgroup: Only give unaccounted_time under debug This change moves unaccounted_time to only be reported when CONFIG_DEBUG_BLK_CGROUP is true. 
Signed-off-by: Justin TerAvest Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 77ee3c1ec1a7..2bef5705ce24 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -605,10 +605,10 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, if (type == BLKIO_STAT_SECTORS) return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, blkg->stats.sectors, cb, dev); +#ifdef CONFIG_DEBUG_BLK_CGROUP if (type == BLKIO_STAT_UNACCOUNTED_TIME) return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, blkg->stats.unaccounted_time, cb, dev); -#ifdef CONFIG_DEBUG_BLK_CGROUP if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { uint64_t sum = blkg->stats.avg_queue_size_sum; uint64_t samples = blkg->stats.avg_queue_size_samples; @@ -1111,9 +1111,6 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, case BLKIO_PROP_sectors: return blkio_read_blkg_stats(blkcg, cft, cb, BLKIO_STAT_SECTORS, 0); - case BLKIO_PROP_unaccounted_time: - return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_UNACCOUNTED_TIME, 0); case BLKIO_PROP_io_service_bytes: return blkio_read_blkg_stats(blkcg, cft, cb, BLKIO_STAT_SERVICE_BYTES, 1); @@ -1133,6 +1130,9 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, return blkio_read_blkg_stats(blkcg, cft, cb, BLKIO_STAT_QUEUED, 1); #ifdef CONFIG_DEBUG_BLK_CGROUP + case BLKIO_PROP_unaccounted_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_UNACCOUNTED_TIME, 0); case BLKIO_PROP_dequeue: return blkio_read_blkg_stats(blkcg, cft, cb, BLKIO_STAT_DEQUEUE, 0); @@ -1269,12 +1269,6 @@ struct cftype blkio_files[] = { BLKIO_PROP_sectors), .read_map = blkiocg_file_read_map, }, - { - .name = "unaccounted_time", - .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, - BLKIO_PROP_unaccounted_time), - .read_map = blkiocg_file_read_map, - }, { .name = "io_service_bytes", .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, @@ -1396,6 +1390,12 @@ struct cftype blkio_files[] = { BLKIO_PROP_dequeue), .read_map = blkiocg_file_read_map, }, + { + .name = "unaccounted_time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_unaccounted_time), + .read_map = blkiocg_file_read_map, + }, #endif }; -- cgit v1.2.2 From 04521db04e9a11e74b0252d222051cb194487f4d Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 22 Mar 2011 21:54:29 +0100 Subject: blk-throttle: Reset group slice when limits are changed Lina reported that if throttle limits are initially very high and then dropped, then no new bio might be dispatched for a long time. And the reason being that after dropping the limits we don't reset the existing slice and do the rate calculation with new low rate and account the bios dispatched at high rate. To fix it, reset the slice upon rate change. https://lkml.org/lkml/2011/3/10/298 Another problem with very high limit is that we never queued the bio on throtl service tree. That means we kept on extending the group slice but never trimmed it. Fix that also by regulary trimming the slice even if bio is not being queued up. 
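The failure mode is easiest to see with a toy model of the slice accounting: bytes already charged at the old, generous rate keep counting against the new, low rate unless the slice is restarted. The sketch below is deliberately simplified userspace C, not the kernel's blk-throttle arithmetic, and every name in it is illustrative:

    #include <stdio.h>

    /* Toy per-group throttle state: bytes charged since the slice started. */
    struct tg {
        unsigned long bps;            /* allowed bytes per second */
        unsigned long slice_start;    /* in whole seconds, for simplicity */
        unsigned long dispatched;     /* bytes charged to the current slice */
    };

    static void start_new_slice(struct tg *tg, unsigned long now)
    {
        tg->slice_start = now;
        tg->dispatched = 0;
    }

    /* May we send 'bytes' at time 'now' without exceeding tg->bps? */
    static int may_dispatch(const struct tg *tg, unsigned long now,
                            unsigned long bytes)
    {
        unsigned long elapsed = now - tg->slice_start + 1;

        return tg->dispatched + bytes <= tg->bps * elapsed;
    }

    /* The fix: changing a limit restarts the slice, so IO already charged at
     * the old, higher rate is not re-accounted against the new, lower one. */
    static void set_limit(struct tg *tg, unsigned long bps, unsigned long now)
    {
        tg->bps = bps;
        start_new_slice(tg, now);
    }

    int main(void)
    {
        struct tg tg = { .bps = 1000000 };

        start_new_slice(&tg, 0);
        tg.dispatched = 900000;             /* burst the old limit allowed */

        tg.bps = 100000;                    /* drop the limit, keep the slice... */
        printf("stalled: %d\n", !may_dispatch(&tg, 1, 4096));   /* stalled: 1 */

        set_limit(&tg, 100000, 1);          /* ...versus restarting the slice */
        printf("stalled: %d\n", !may_dispatch(&tg, 1, 4096));   /* stalled: 0 */
        return 0;
    }

With the reset, only IO issued after the limit change is measured against the new rate, which is what calling throtl_start_new_slice() for both directions achieves.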
Reported-by: Lina Lu Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 37abbfc68590..5352bdafbcf0 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -756,6 +756,15 @@ static void throtl_process_limit_change(struct throtl_data *td) " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE], tg->iops[READ], tg->iops[WRITE]); + /* + * Restart the slices for both READ and WRITES. It + * might happen that a group's limit are dropped + * suddenly and we don't want to account recently + * dispatched IO with new low rate + */ + throtl_start_new_slice(td, tg, 0); + throtl_start_new_slice(td, tg, 1); + if (throtl_tg_on_rr(tg)) tg_update_disptime(td, tg); } @@ -821,7 +830,8 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay) struct delayed_work *dwork = &td->throtl_work; - if (total_nr_queued(td) > 0) { + /* schedule work if limits changed even if no bio is queued */ + if (total_nr_queued(td) > 0 || td->limits_changed) { /* * We might have a work scheduled to be executed in future. * Cancel that and schedule a new one. @@ -1002,6 +1012,19 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) /* Bio is with-in rate limit of group */ if (tg_may_dispatch(td, tg, bio, NULL)) { throtl_charge_bio(tg, bio); + + /* + * We need to trim slice even when bios are not being queued + * otherwise it might happen that a bio is not queued for + * a long time and slice keeps on extending and trim is not + * called for a long time. Now if limits are reduced suddenly + * we take into account all the IO dispatched so far at new + * low rate and * newly queued IO gets a really long dispatch + * time. + * + * So keep on trimming slice even if bio is not queued. + */ + throtl_trim_slice(td, tg, rw); goto out; } -- cgit v1.2.2 From 62a37f6badd1ac97ba07d543b5d4be2f9cb17217 Mon Sep 17 00:00:00 2001 From: Justin TerAvest Date: Wed, 23 Mar 2011 08:25:44 +0100 Subject: cfq-iosched: Don't clear queue stats when preempt. For v2, I added back lines to cfq_preempt_queue() that were removed during updates for accounting unaccounted_time. Thanks for pointing out that I'd missed these, Vivek. Previous commit "cfq-iosched: Don't set active queue in preempt" wrongly cleared stats for preempting queues when it shouldn't have, because when we choose a queue to preempt, it still isn't necessarily scheduled next. Thanks to Vivek Goyal for figuring this out and understanding how the preemption code works. 
Signed-off-by: Justin TerAvest Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 69208d732903..fea1b5a9b7e9 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1620,33 +1620,27 @@ static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); } -static void cfq_clear_queue_stats(struct cfq_data *cfqd, - struct cfq_queue *cfqq) -{ - cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); - cfqq->slice_start = 0; - cfqq->dispatch_start = jiffies; - cfqq->allocated_slice = 0; - cfqq->slice_end = 0; - cfqq->slice_dispatch = 0; - cfqq->nr_sectors = 0; - - cfq_clear_cfqq_wait_request(cfqq); - cfq_clear_cfqq_must_dispatch(cfqq); - cfq_clear_cfqq_must_alloc_slice(cfqq); - cfq_clear_cfqq_fifo_expire(cfqq); - cfq_mark_cfqq_slice_new(cfqq); - - cfq_del_timer(cfqd, cfqq); -} - static void __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { if (cfqq) { cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", cfqd->serving_prio, cfqd->serving_type); - cfq_clear_queue_stats(cfqd, cfqq); + cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); + cfqq->slice_start = 0; + cfqq->dispatch_start = jiffies; + cfqq->allocated_slice = 0; + cfqq->slice_end = 0; + cfqq->slice_dispatch = 0; + cfqq->nr_sectors = 0; + + cfq_clear_cfqq_wait_request(cfqq); + cfq_clear_cfqq_must_dispatch(cfqq); + cfq_clear_cfqq_must_alloc_slice(cfqq); + cfq_clear_cfqq_fifo_expire(cfqq); + cfq_mark_cfqq_slice_new(cfqq); + + cfq_del_timer(cfqd, cfqq); } cfqd->active_queue = cfqq; @@ -3339,7 +3333,8 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfq_service_tree_add(cfqd, cfqq, 1); - cfq_clear_queue_stats(cfqd, cfqq); + cfqq->slice_end = 0; + cfq_mark_cfqq_slice_new(cfqq); } /* -- cgit v1.2.2 From c4ade94fc00f8b34589719d8a347f658b6d3951e Mon Sep 17 00:00:00 2001 From: "Li, Shaohua" Date: Wed, 23 Mar 2011 08:30:34 +0100 Subject: cfq-iosched: removing unnecessary think time checking Removing think time checking. A high thinktime queue might means the queue dispatches several requests and then do away. Limitting such queue seems meaningless. And also this can simplify code. This is suggested by Vivek. Signed-off-by: Shaohua Li Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index fea1b5a9b7e9..7785169f3c8f 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2417,19 +2417,14 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) return false; /* - * If there is only one sync queue, and its think time is - * small, we can ignore async queue here and give the sync + * If there is only one sync queue + * we can ignore async queue here and give the sync * queue no dispatch limit. The reason is a sync queue can * preempt async queue, limiting the sync queue doesn't make * sense. This is useful for aiostress test. 
*/ - if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1) { - struct cfq_io_context *cic = RQ_CIC(cfqq->next_rq); - - if (sample_valid(cic->ttime_samples) && - cic->ttime_mean < cfqd->cfq_slice_idle) - promote_sync = true; - } + if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1) + promote_sync = true; /* * We have other queues, don't allow more IO from this one -- cgit v1.2.2 From 401a18e92ce32cd0ddfa5738899ca2b8114f2bbf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 25 Mar 2011 16:57:52 +0100 Subject: block: fix bug with inserting flush requests as sort/merge With the introduction of the on-stack plugging, we would assume that any request being inserted was a normal file system request. As flush/fua requires a special insert mode, this caused problems. Fix this up by checking for this in flush_plug_list() and use the appropriate insert mechanism. Big thanks goes to Markus Tripplesdorf for tirelessly testing patches, and to Sergey Senozhatsky for helping find the real issue. Reported-by: Markus Tripplesdorf Signed-off-by: Jens Axboe --- block/blk-core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 59b5c00c0126..64e96ee1d6af 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2702,7 +2702,10 @@ static void flush_plug_list(struct blk_plug *plug) /* * rq is already accounted, so use raw insert */ - __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); + if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) + __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH); + else + __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); } if (q) { -- cgit v1.2.2 From ad3d9d7ede04a9c71be7a9fe1a23961817f371f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 25 Mar 2011 16:58:59 +0100 Subject: block: fix issue with calling blk_stop_queue() from the request_fn handler When the queue work handler was converted to delayed work, the stopping was inadvertently made sync as well. Change this back to being async stop, using __cancel_delayed_work() instead of cancel_delayed_work(). Reported-by: Jeremy Fitzhardinge Reported-by: Chris Mason Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 64e96ee1d6af..e0a062363937 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -271,7 +271,7 @@ EXPORT_SYMBOL(blk_start_queue); **/ void blk_stop_queue(struct request_queue *q) { - cancel_delayed_work(&q->delay_work); + __cancel_delayed_work(&q->delay_work); queue_flag_set(QUEUE_FLAG_STOPPED, q); } EXPORT_SYMBOL(blk_stop_queue); -- cgit v1.2.2 From 25985edcedea6396277003854657b5f3cb31a628 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 30 Mar 2011 22:57:33 -0300 Subject: Fix common misspellings Fixes generated by 'codespell' and manually reviewed. Signed-off-by: Lucas De Marchi --- block/blk-cgroup.c | 4 ++-- block/blk-core.c | 2 +- block/blk-throttle.c | 2 +- block/blk.h | 2 +- block/cfq-iosched.c | 2 +- block/genhd.c | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 2bef5705ce24..f0605ab2a761 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -868,7 +868,7 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, } /* - * Some rules/values in blkg have changed. Propogate those to respective + * Some rules/values in blkg have changed. Propagate those to respective * policies. 
*/ static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, @@ -903,7 +903,7 @@ static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, } /* - * A policy node rule has been updated. Propogate this update to all the + * A policy node rule has been updated. Propagate this update to all the * block groups which might be affected by this update. */ static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, diff --git a/block/blk-core.c b/block/blk-core.c index e0a062363937..071ae6d2768b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1184,7 +1184,7 @@ static bool bio_attempt_front_merge(struct request_queue *q, /* * Attempts to merge with the plugged list in the current process. Returns - * true if merge was succesful, otherwise false. + * true if merge was successful, otherwise false. */ static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q, struct bio *bio) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 5352bdafbcf0..c8b16c88b315 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -916,7 +916,7 @@ static void throtl_update_blkio_group_common(struct throtl_data *td, /* * For all update functions, key should be a valid pointer because these * update functions are called under blkcg_lock, that means, blkg is - * valid and in turn key is valid. queue exit path can not race becuase + * valid and in turn key is valid. queue exit path can not race because * of blkcg_lock * * Can not take queue lock in update functions as queue lock under blkcg_lock diff --git a/block/blk.h b/block/blk.h index c8db371a921d..61263463e38e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -32,7 +32,7 @@ enum rq_atomic_flags { /* * EH timer and IO completion will both attempt to 'grab' the request, make - * sure that only one of them suceeds + * sure that only one of them succeeds */ static inline int blk_mark_rq_complete(struct request *rq) { diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7785169f3c8f..3be881ec95ad 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -888,7 +888,7 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) /* * Currently put the group at the end. Later implement something * so that groups get lesser vtime based on their weights, so that - * if group does not loose all if it was not continously backlogged. + * if group does not loose all if it was not continuously backlogged. */ n = rb_last(&st->rb); if (n) { diff --git a/block/genhd.c b/block/genhd.c index c91a2dac6b6b..b364bd038a18 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -739,7 +739,7 @@ void __init printk_all_partitions(void) /* * Don't show empty devices or things that have been - * surpressed + * suppressed */ if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) -- cgit v1.2.2 From 8182924bc5850281985d73c312876746acd390b5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2011 09:51:33 +0200 Subject: block: dump request state on seeing a corrupted request completion Currently we just dump a non-informative 'request botched' message. Lets actually try and print something sane to help debug issues around this. 
Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index e0a062363937..4fdf8953efbf 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2163,7 +2163,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) * size, something has gone terribly wrong. */ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { - printk(KERN_ERR "blk: request botched\n"); + blk_dump_rq_flags(req, "request botched"); req->__data_len = blk_rq_cur_bytes(req); } -- cgit v1.2.2 From b710a480554f2be682bac3cb59b0e085ba3d644b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2011 09:52:30 +0200 Subject: block: get rid of elv_insert() interface Merge it with __elv_add_request(), it's pretty pointless to have a function with only two callers. The main interface is elv_add_request()/__elv_add_request(). Signed-off-by: Jens Axboe --- block/blk-flush.c | 4 ++-- block/elevator.c | 35 +++++++++++++++-------------------- 2 files changed, 17 insertions(+), 22 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 93d5fd8e51eb..d6275cbbaf35 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -261,7 +261,7 @@ static bool blk_kick_flush(struct request_queue *q) q->flush_rq.end_io = flush_end_io; q->flush_pending_idx ^= 1; - elv_insert(q, &q->flush_rq, ELEVATOR_INSERT_REQUEUE); + __elv_add_request(q, &q->flush_rq, ELEVATOR_INSERT_REQUEUE); return true; } @@ -281,7 +281,7 @@ static void flush_data_end_io(struct request *rq, int error) * blk_insert_flush - insert a new FLUSH/FUA request * @rq: request to insert * - * To be called from elv_insert() for %ELEVATOR_INSERT_FLUSH insertions. + * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. * @rq is being submitted. Analyze what needs to be done and put it on the * right queue. 
* diff --git a/block/elevator.c b/block/elevator.c index c387d3168734..0cdb4e7ebab4 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -610,7 +610,7 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) rq->cmd_flags &= ~REQ_STARTED; - elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); + __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE); } void elv_drain_elevator(struct request_queue *q) @@ -655,12 +655,25 @@ void elv_quiesce_end(struct request_queue *q) queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); } -void elv_insert(struct request_queue *q, struct request *rq, int where) +void __elv_add_request(struct request_queue *q, struct request *rq, int where) { trace_block_rq_insert(q, rq); rq->q = q; + BUG_ON(rq->cmd_flags & REQ_ON_PLUG); + + if (rq->cmd_flags & REQ_SOFTBARRIER) { + /* barriers are scheduling boundary, update end_sector */ + if (rq->cmd_type == REQ_TYPE_FS || + (rq->cmd_flags & REQ_DISCARD)) { + q->end_sector = rq_end_sector(rq); + q->boundary_rq = rq; + } + } else if (!(rq->cmd_flags & REQ_ELVPRIV) && + where == ELEVATOR_INSERT_SORT) + where = ELEVATOR_INSERT_BACK; + switch (where) { case ELEVATOR_INSERT_REQUEUE: case ELEVATOR_INSERT_FRONT: @@ -722,24 +735,6 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) BUG(); } } - -void __elv_add_request(struct request_queue *q, struct request *rq, int where) -{ - BUG_ON(rq->cmd_flags & REQ_ON_PLUG); - - if (rq->cmd_flags & REQ_SOFTBARRIER) { - /* barriers are scheduling boundary, update end_sector */ - if (rq->cmd_type == REQ_TYPE_FS || - (rq->cmd_flags & REQ_DISCARD)) { - q->end_sector = rq_end_sector(rq); - q->boundary_rq = rq; - } - } else if (!(rq->cmd_flags & REQ_ELVPRIV) && - where == ELEVATOR_INSERT_SORT) - where = ELEVATOR_INSERT_BACK; - - elv_insert(q, rq, where); -} EXPORT_SYMBOL(__elv_add_request); void elv_add_request(struct request_queue *q, struct request *rq, int where) -- cgit v1.2.2 From 53d63e6b0dfb95882ec0219ba6bbd50cde423794 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2011 13:27:09 +0200 Subject: block: make the flush insertion use the tail of the dispatch list It's not a preempt type request, in fact we have to insert it behind requests that do specify INSERT_FRONT. Signed-off-by: Jens Axboe --- block/blk-flush.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index d6275cbbaf35..eba4a2790c6c 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -261,7 +261,7 @@ static bool blk_kick_flush(struct request_queue *q) q->flush_rq.end_io = flush_end_io; q->flush_pending_idx ^= 1; - __elv_add_request(q, &q->flush_rq, ELEVATOR_INSERT_REQUEUE); + list_add_tail(&q->flush_rq.queuelist, &q->queue_head); return true; } @@ -312,7 +312,7 @@ void blk_insert_flush(struct request *rq) */ if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - list_add(&rq->queuelist, &q->queue_head); + list_add_tail(&rq->queuelist, &q->queue_head); return; } -- cgit v1.2.2 From 6f0379377047b18103b88ce33c03e5b19747ae57 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Wed, 30 Mar 2011 12:21:56 +0200 Subject: blk-throttle: don't call xchg on bool xchg does not work portably with smaller than 32bit types. 
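The portability problem is one of operand size: several architectures implement xchg() only for word-sized operands, while bool is a single byte. The userspace sketch below fakes such a word-only exchange with plain memcpy just to show the footprint hazard; xchg32() is an illustration, not a kernel primitive, and atomicity is ignored entirely:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Stand-in for an architecture whose exchange primitive only exists at
     * 32-bit width: it always reads and writes a full 4-byte word. */
    static uint32_t xchg32(void *ptr, uint32_t val)
    {
        uint32_t old;

        memcpy(&old, ptr, sizeof(old));
        memcpy(ptr, &val, sizeof(val));
        return old;
    }

    struct flags {
        bool limits_changed;          /* the field we meant to exchange */
        bool a, b, c;                 /* innocent neighbours in the same word */
    };

    int main(void)
    {
        struct flags f = { false, true, true, true };

        printf("sizeof(bool) = %zu\n", sizeof(bool));   /* typically 1 */

        xchg32(&f.limits_changed, 1);                   /* word-sized "xchg" */
        printf("a=%d b=%d c=%d\n", f.a, f.b, f.c);      /* little-endian: 0 0 0 */
        return 0;
    }

Widening the flags from bool to int, as this patch does, keeps the exchange at an operand size every xchg() implementation supports.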
Signed-off-by: Andreas Schwab Signed-off-by: Jens Axboe --- block/blk-throttle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 5352bdafbcf0..6c98cfeeedf0 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -77,7 +77,7 @@ struct throtl_grp { unsigned long slice_end[2]; /* Some throttle limits got updated for the group */ - bool limits_changed; + int limits_changed; }; struct throtl_data @@ -102,7 +102,7 @@ struct throtl_data /* Work for dispatching throttled bios */ struct delayed_work throtl_work; - bool limits_changed; + int limits_changed; }; enum tg_state_flags { -- cgit v1.2.2 From a63a5cf84dac7a23a57c800eea5734701e7d3c04 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 1 Apr 2011 21:02:31 +0200 Subject: dm: improve block integrity support The current block integrity (DIF/DIX) support in DM is verifying that all devices' integrity profiles match during DM device resume (which is past the point of no return). To some degree that is unavoidable (stacked DM devices force this late checking). But for most DM devices (which aren't stacking on other DM devices) the ideal time to verify all integrity profiles match is during table load. Introduce the notion of an "initialized" integrity profile: a profile that was blk_integrity_register()'d with a non-NULL 'blk_integrity' template. Add blk_integrity_is_initialized() to allow checking if a profile was initialized. Update DM integrity support to: - check all devices with _initialized_ integrity profiles match during table load; uninitialized profiles (e.g. for underlying DM device(s) of a stacked DM device) are ignored. - disallow a table load that would result in an integrity profile that conflicts with a DM device's existing (in-use) integrity profile - avoid clearing an existing integrity profile - validate all integrity profiles match during resume; but if they don't all we can do is report the mismatch (during resume we're past the point of no return) Signed-off-by: Mike Snitzer Cc: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/blk-integrity.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 54bcba6c02a7..129b9e209a3b 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -30,6 +30,8 @@ static struct kmem_cache *integrity_cachep; +static const char *bi_unsupported_name = "unsupported"; + /** * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements * @q: request queue @@ -358,6 +360,14 @@ static struct kobj_type integrity_ktype = { .release = blk_integrity_release, }; +bool blk_integrity_is_initialized(struct gendisk *disk) +{ + struct blk_integrity *bi = blk_get_integrity(disk); + + return (bi && bi->name && strcmp(bi->name, bi_unsupported_name) != 0); +} +EXPORT_SYMBOL(blk_integrity_is_initialized); + /** * blk_integrity_register - Register a gendisk as being integrity-capable * @disk: struct gendisk pointer to make integrity-aware @@ -407,7 +417,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) bi->get_tag_fn = template->get_tag_fn; bi->tag_size = template->tag_size; } else - bi->name = "unsupported"; + bi->name = bi_unsupported_name; return 0; } -- cgit v1.2.2 From f83e826181f7f8fb152e4190d03854fc3a5dd040 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 4 Apr 2011 00:15:02 +0200 Subject: block: fix request sorting at unplug Comparison function for list_sort() must be anticommutative, otherwise it is not sorting in ordinary meaning. But fortunately list_sort() always check ((*cmp)(priv, a, b) <= 0) it not distinguish negative and zero, so comparison function can implement only less-or-equal instead of full three-way comparison. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 4fdf8953efbf..725091d5496d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2665,7 +2665,7 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) struct request *rqa = container_of(a, struct request, queuelist); struct request *rqb = container_of(b, struct request, queuelist); - return !(rqa->q == rqb->q); + return !(rqa->q <= rqb->q); } static void flush_plug_list(struct blk_plug *plug) -- cgit v1.2.2 From 109b81296c63228578d4760794d8dd46e02eddfb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 11 Apr 2011 14:13:10 +0200 Subject: block: splice plug list to local context If the request_fn ends up blocking, we could be re-entering the plug flush. Since the list is protected by explicitly not allowing schedule events, this isn't a terribly good idea. Additionally, it can cause us to recurse. As request_fn called by __blk_run_queue is allowed to 'schedule()' (after dropping the queue lock of course), it is possible to get a recursive call: schedule -> blk_flush_plug -> __blk_finish_plug -> flush_plug_list -> __blk_run_queue -> request_fn -> schedule We must make sure that the second schedule does not call into blk_flush_plug again. So instead of leaving the list of requests on blk_plug->list, move them to a separate list leaving blk_plug->list empty. 
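The splice-to-local-list trick is a general re-entrancy guard and can be sketched in a few lines of userspace C; the singly linked list below stands in for blk_plug->list and every name is illustrative:

    #include <stdio.h>

    struct req { int id; struct req *next; };

    static struct req *pending;       /* stand-in for plug->list */
    static int depth;

    static void dispatch(struct req *rq);

    static void flush_pending(void)
    {
        /* Move everything to a local list first. A re-entrant call (the
         * equivalent of request_fn sleeping and schedule() flushing again)
         * now finds 'pending' empty and returns instead of recursing over
         * the very requests we are already walking. */
        struct req *local = pending;

        pending = NULL;
        while (local) {
            struct req *rq = local;

            local = local->next;
            dispatch(rq);
        }
    }

    static void dispatch(struct req *rq)
    {
        depth++;
        printf("dispatch %d at depth %d\n", rq->id, depth);
        flush_pending();              /* simulate the nested flush */
        depth--;
    }

    int main(void)
    {
        struct req c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };

        pending = &a;
        flush_pending();              /* each request dispatched once, depth 1 */
        return 0;
    }

The nested flush sees an empty shared list and returns immediately, so each request is dispatched exactly once and the recursion depth stays bounded.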
Signed-off-by: Jens Axboe --- block/blk-core.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 90f22cc30799..eeaca0998df5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2673,19 +2673,24 @@ static void flush_plug_list(struct blk_plug *plug) struct request_queue *q; unsigned long flags; struct request *rq; + LIST_HEAD(list); BUG_ON(plug->magic != PLUG_MAGIC); if (list_empty(&plug->list)) return; - if (plug->should_sort) - list_sort(NULL, &plug->list, plug_rq_cmp); + list_splice_init(&plug->list, &list); + + if (plug->should_sort) { + list_sort(NULL, &list, plug_rq_cmp); + plug->should_sort = 0; + } q = NULL; local_irq_save(flags); - while (!list_empty(&plug->list)) { - rq = list_entry_rq(plug->list.next); + while (!list_empty(&list)) { + rq = list_entry_rq(list.next); list_del_init(&rq->queuelist); BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG)); BUG_ON(!rq->q); @@ -2713,7 +2718,6 @@ static void flush_plug_list(struct blk_plug *plug) spin_unlock(q->queue_lock); } - BUG_ON(!list_empty(&plug->list)); local_irq_restore(flags); } -- cgit v1.2.2 From 94b5eb28b41cc79d9713696e0005ae167b5afd1b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Apr 2011 10:12:19 +0200 Subject: block: fixup block IO unplug trace call It was removed with the on-stack plugging, readd it and track the depth of requests added when flushing the plug. Signed-off-by: Jens Axboe --- block/blk-core.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index eeaca0998df5..d20ce1e849c8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2668,12 +2668,19 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) return !(rqa->q <= rqb->q); } +static void queue_unplugged(struct request_queue *q, unsigned int depth) +{ + trace_block_unplug_io(q, depth); + __blk_run_queue(q, false); +} + static void flush_plug_list(struct blk_plug *plug) { struct request_queue *q; unsigned long flags; struct request *rq; LIST_HEAD(list); + unsigned int depth; BUG_ON(plug->magic != PLUG_MAGIC); @@ -2688,6 +2695,7 @@ static void flush_plug_list(struct blk_plug *plug) } q = NULL; + depth = 0; local_irq_save(flags); while (!list_empty(&list)) { rq = list_entry_rq(list.next); @@ -2696,10 +2704,11 @@ static void flush_plug_list(struct blk_plug *plug) BUG_ON(!rq->q); if (rq->q != q) { if (q) { - __blk_run_queue(q, false); + queue_unplugged(q, depth); spin_unlock(q->queue_lock); } q = rq->q; + depth = 0; spin_lock(q->queue_lock); } rq->cmd_flags &= ~REQ_ON_PLUG; @@ -2711,10 +2720,12 @@ static void flush_plug_list(struct blk_plug *plug) __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH); else __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); + + depth++; } if (q) { - __blk_run_queue(q, false); + queue_unplugged(q, depth); spin_unlock(q->queue_lock); } -- cgit v1.2.2 From 188112722cce083c8f1a7d0d84f55c2cd885920c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Apr 2011 10:11:24 +0200 Subject: block: add comment on why we save and disable interrupts in flush_plug_list() It's done at the top to avoid doing it for every queue we unplug. 
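The same hoisting can be shown with any expensive save/restore pair; the sketch below uses signal blocking as a userspace stand-in for local_irq_save()/local_irq_restore() and is purely illustrative:

    #include <signal.h>
    #include <stdio.h>

    int main(void)
    {
        sigset_t all, saved;

        sigfillset(&all);

        /* Save and block once around the whole loop, the analogue of doing
         * local_irq_save() at the top of flush_plug_list(), instead of
         * paying for a block/unblock pair on every queue we touch. */
        sigprocmask(SIG_BLOCK, &all, &saved);
        for (int q = 0; q < 4; q++)
            printf("run queue %d with signals blocked\n", q);
        sigprocmask(SIG_SETMASK, &saved, NULL);

        return 0;
    }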
Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index d20ce1e849c8..0c0ea10e61ea 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2696,6 +2696,11 @@ static void flush_plug_list(struct blk_plug *plug) q = NULL; depth = 0; + + /* + * Save and disable interrupts here, to avoid doing it for every + * queue lock we have to take. + */ local_irq_save(flags); while (!list_empty(&list)) { rq = list_entry_rq(list.next); -- cgit v1.2.2 From f75664570d8b75469cc468f23c2b27220984983b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Apr 2011 10:17:31 +0200 Subject: block: add callback function for unplug notification MD would like to know when a queue is unplugged, so it can flush it's bitmap writes. Add such a callback. Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +++ block/blk-settings.c | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 0c0ea10e61ea..76850fc9cf23 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2672,6 +2672,9 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth) { trace_block_unplug_io(q, depth); __blk_run_queue(q, false); + + if (q->unplugged_fn) + q->unplugged_fn(q); } static void flush_plug_list(struct blk_plug *plug) diff --git a/block/blk-settings.c b/block/blk-settings.c index 1fa769293597..eb949045bb12 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -790,6 +790,22 @@ void blk_queue_flush(struct request_queue *q, unsigned int flush) } EXPORT_SYMBOL_GPL(blk_queue_flush); +/** + * blk_queue_unplugged - register a callback for an unplug event + * @q: the request queue for the device + * @fn: the function to call + * + * Some stacked drivers may need to know when IO is dispatched on an + * unplug event. By registrering a callback here, they will be notified + * when someone flushes their on-stack queue plug. The function will be + * called with the queue lock held. + */ +void blk_queue_unplugged(struct request_queue *q, unplugged_fn *fn) +{ + q->unplugged_fn = fn; +} +EXPORT_SYMBOL(blk_queue_unplugged); + static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; -- cgit v1.2.2 From dc6d36c9710d1fed42d1bbe7d8e4f742abd844c6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Apr 2011 10:28:28 +0200 Subject: block: readd plug trace event This was removed with the queue plug state. But we can easily readd by checking if this is the first request going to this queue. It's good information to have when tracing to see how effective the plugging is. Signed-off-by: Jens Axboe --- block/blk-core.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 76850fc9cf23..52e756c526be 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1311,7 +1311,15 @@ get_rq: plug = current->plug; if (plug) { - if (!plug->should_sort && !list_empty(&plug->list)) { + /* + * If this is the first request added after a plug, fire + * of a plug trace. If others have been added before, check + * if we have multiple devices in this plug. If so, make a + * note to sort the list before dispatch. 
+ */ + if (list_empty(&plug->list)) + trace_block_plug(q); + else if (!plug->should_sort) { struct request *__rq; __rq = list_entry_rq(plug->list.prev); -- cgit v1.2.2 From cf82c798394cd443eed7d91f998b79a63f341e91 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Apr 2011 10:30:53 +0200 Subject: block: kill queue_sync_plugs() The original use for this dates back to when we had to track write requests for serializing around barriers. That's not needed anymore, so kill it. Signed-off-by: Jens Axboe --- block/blk-core.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 52e756c526be..c6eaa1f56466 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -198,19 +198,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg) } EXPORT_SYMBOL(blk_dump_rq_flags); -/* - * Make sure that plugs that were pending when this function was entered, - * are now complete and requests pushed to the queue. -*/ -static inline void queue_sync_plugs(struct request_queue *q) -{ - /* - * If the current process is plugged and has barriers submitted, - * we will livelock if we don't unplug first. - */ - blk_flush_plug(current); -} - static void blk_delay_work(struct work_struct *work) { struct request_queue *q; @@ -298,7 +285,6 @@ void blk_sync_queue(struct request_queue *q) { del_timer_sync(&q->timeout); cancel_delayed_work_sync(&q->delay_work); - queue_sync_plugs(q); } EXPORT_SYMBOL(blk_sync_queue); -- cgit v1.2.2 From f4af3c3d077a004762aaad052049c809fd8c6f0c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Apr 2011 14:58:51 +0200 Subject: block: move queue run on unplug to kblockd There are worries that we are now consuming a lot more stack in some cases, since we potentially call into IO dispatch from schedule() or io_schedule(). We can reduce this problem by moving the running of the queue to kblockd, like the old plugging scheme did as well. This may or may not be a good idea from a performance perspective, depending on how many tasks have queue plugs running at the same time. For even the slightly contended case, doing just a single queue run from kblockd instead of multiple runs directly from the unpluggers will be faster. Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index c6eaa1f56466..36b1a7559f94 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2665,7 +2665,7 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) static void queue_unplugged(struct request_queue *q, unsigned int depth) { trace_block_unplug_io(q, depth); - __blk_run_queue(q, false); + __blk_run_queue(q, true); if (q->unplugged_fn) q->unplugged_fn(q); -- cgit v1.2.2 From 80656b67b3988f83edd86a280d9937124fe62050 Mon Sep 17 00:00:00 2001 From: Liu Yuan Date: Wed, 13 Apr 2011 22:14:54 +0200 Subject: block, blk-sysfs: Use the variable directly instead of a function call In the function blk_register_queue(), var _dev_ is already assigned by disk_to_dev().So use it directly instead of calling disk_to_dev() again. Signed-off-by: Liu Yuan Modified by me to delete an empty line in the same function while in there anyway. 
Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 261c75c665ae..6d735122bc59 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -498,7 +498,6 @@ int blk_register_queue(struct gendisk *disk) { int ret; struct device *dev = disk_to_dev(disk); - struct request_queue *q = disk->queue; if (WARN_ON(!q)) @@ -521,7 +520,7 @@ int blk_register_queue(struct gendisk *disk) if (ret) { kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); - blk_trace_remove_sysfs(disk_to_dev(disk)); + blk_trace_remove_sysfs(dev); kobject_put(&dev->kobj); return ret; } -- cgit v1.2.2 From 88b996cd0652280cc9b9fc70008fda15f14175e1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2011 15:20:10 +0200 Subject: block: cleanup the block plug helper functions It's a bit of a mess currently. task->plug is being cleared and reset in __blk_finish_plug(), and blk_finish_plug() is testing for a NULL plug which cannot happen even from schedule() anymore since it uses blk_needs_flush_plug() to determine whether to call into this function at all. So get rid of some of the cruft. Signed-off-by: Jens Axboe --- block/blk-core.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 36b1a7559f94..b598fa7720d4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2671,7 +2671,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth) q->unplugged_fn(q); } -static void flush_plug_list(struct blk_plug *plug) +void blk_flush_plug_list(struct blk_plug *plug) { struct request_queue *q; unsigned long flags; @@ -2733,28 +2733,16 @@ static void flush_plug_list(struct blk_plug *plug) local_irq_restore(flags); } - -static void __blk_finish_plug(struct task_struct *tsk, struct blk_plug *plug) -{ - flush_plug_list(plug); - - if (plug == tsk->plug) - tsk->plug = NULL; -} +EXPORT_SYMBOL(blk_flush_plug_list); void blk_finish_plug(struct blk_plug *plug) { - if (plug) - __blk_finish_plug(current, plug); -} -EXPORT_SYMBOL(blk_finish_plug); + blk_flush_plug_list(plug); -void __blk_flush_plug(struct task_struct *tsk, struct blk_plug *plug) -{ - __blk_finish_plug(tsk, plug); - tsk->plug = plug; + if (plug == current->plug) + current->plug = NULL; } -EXPORT_SYMBOL(__blk_flush_plug); +EXPORT_SYMBOL(blk_finish_plug); int __init blk_dev_init(void) { -- cgit v1.2.2 From f6603783f9f099bf7a83b3f6c689bbbf74f0e96e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Apr 2011 15:49:07 +0200 Subject: block: only force kblockd unplugging from the schedule() path For the explicit unplugging, we'd prefer to kick things off immediately and not pay the penalty of the latency to switch to kblockd. So let blk_finish_plug() do the run inline, while the implicit-on-schedule-out unplug will punt to kblockd. 
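A toy model of the resulting policy: explicit unpluggers run the queue inline for latency, while the schedule() path hands the work to a deferred list the way kblockd picks it up later. Function names below echo the kernel's but the bodies are illustrative userspace C:

    #include <stdbool.h>
    #include <stdio.h>

    static int deferred[8];           /* stand-in for kblockd's work list */
    static int ndeferred;

    static void run_queue(int q)
    {
        printf("running queue %d\n", q);   /* imagine deep driver dispatch here */
    }

    /* Explicit unplugs want the lowest latency and run inline; the unplug
     * that happens because the task is about to sleep must not grow an
     * arbitrary sleeper's stack, so it is handed to the worker instead. */
    static void queue_unplugged(int q, bool from_schedule)
    {
        if (from_schedule)
            deferred[ndeferred++] = q;     /* picked up later by the "worker" */
        else
            run_queue(q);                  /* inline, as blk_finish_plug() wants */
    }

    int main(void)
    {
        queue_unplugged(1, false);         /* explicit unplug */
        queue_unplugged(2, true);          /* unplug from the schedule() path */

        for (int i = 0; i < ndeferred; i++)   /* the deferred "kblockd" pass */
            run_queue(deferred[i]);
        return 0;
    }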
Signed-off-by: Jens Axboe --- block/blk-core.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index b598fa7720d4..3c8121072507 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2662,16 +2662,17 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) return !(rqa->q <= rqb->q); } -static void queue_unplugged(struct request_queue *q, unsigned int depth) +static void queue_unplugged(struct request_queue *q, unsigned int depth, + bool force_kblockd) { trace_block_unplug_io(q, depth); - __blk_run_queue(q, true); + __blk_run_queue(q, force_kblockd); if (q->unplugged_fn) q->unplugged_fn(q); } -void blk_flush_plug_list(struct blk_plug *plug) +void blk_flush_plug_list(struct blk_plug *plug, bool force_kblockd) { struct request_queue *q; unsigned long flags; @@ -2706,7 +2707,7 @@ void blk_flush_plug_list(struct blk_plug *plug) BUG_ON(!rq->q); if (rq->q != q) { if (q) { - queue_unplugged(q, depth); + queue_unplugged(q, depth, force_kblockd); spin_unlock(q->queue_lock); } q = rq->q; @@ -2727,7 +2728,7 @@ void blk_flush_plug_list(struct blk_plug *plug) } if (q) { - queue_unplugged(q, depth); + queue_unplugged(q, depth, force_kblockd); spin_unlock(q->queue_lock); } @@ -2737,7 +2738,7 @@ EXPORT_SYMBOL(blk_flush_plug_list); void blk_finish_plug(struct blk_plug *plug) { - blk_flush_plug_list(plug); + blk_flush_plug_list(plug, false); if (plug == current->plug) current->plug = NULL; -- cgit v1.2.2 From 49cac01e1fa74174d72adb0e872504a7fefd7c01 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 16 Apr 2011 13:51:05 +0200 Subject: block: make unplug timer trace event correspond to the schedule() unplug It's a pretty close match to what we had before - the timer triggering would mean that nobody unplugged the plug in due time, in the new scheme this matches very closely what the schedule() unplug now is. It's essentially the difference between an explicit unplug (IO unplug) or an implicit unplug (timer unplug, we scheduled with pending IO queued). Signed-off-by: Jens Axboe --- block/blk-core.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 3c8121072507..78b7b0cb7216 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2662,17 +2662,23 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) return !(rqa->q <= rqb->q); } +/* + * If 'from_schedule' is true, then postpone the dispatch of requests + * until a safe kblockd context. We due this to avoid accidental big + * additional stack usage in driver dispatch, in places where the originally + * plugger did not intend it. 
+ */ static void queue_unplugged(struct request_queue *q, unsigned int depth, - bool force_kblockd) + bool from_schedule) { - trace_block_unplug_io(q, depth); - __blk_run_queue(q, force_kblockd); + trace_block_unplug(q, depth, !from_schedule); + __blk_run_queue(q, from_schedule); if (q->unplugged_fn) q->unplugged_fn(q); } -void blk_flush_plug_list(struct blk_plug *plug, bool force_kblockd) +void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request_queue *q; unsigned long flags; @@ -2707,7 +2713,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool force_kblockd) BUG_ON(!rq->q); if (rq->q != q) { if (q) { - queue_unplugged(q, depth, force_kblockd); + queue_unplugged(q, depth, from_schedule); spin_unlock(q->queue_lock); } q = rq->q; @@ -2728,7 +2734,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool force_kblockd) } if (q) { - queue_unplugged(q, depth, force_kblockd); + queue_unplugged(q, depth, from_schedule); spin_unlock(q->queue_lock); } -- cgit v1.2.2 From 048c9374a749a27f16493cea033fa4a8ff492356 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 18 Apr 2011 09:52:22 +0200 Subject: block: Enhance new plugging support to support general callbacks md/raid requires an unplug callback, but as it does not uses requests the current code cannot provide one. So allow arbitrary callbacks to be attached to the blk_plug. Signed-off-by: NeilBrown Signed-off-by: Jens Axboe --- block/blk-core.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 78b7b0cb7216..77edf0512338 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2638,6 +2638,7 @@ void blk_start_plug(struct blk_plug *plug) plug->magic = PLUG_MAGIC; INIT_LIST_HEAD(&plug->list); + INIT_LIST_HEAD(&plug->cb_list); plug->should_sort = 0; /* @@ -2678,6 +2679,24 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth, q->unplugged_fn(q); } +static void flush_plug_callbacks(struct blk_plug *plug) +{ + LIST_HEAD(callbacks); + + if (list_empty(&plug->cb_list)) + return; + + list_splice_init(&plug->cb_list, &callbacks); + + while (!list_empty(&callbacks)) { + struct blk_plug_cb *cb = list_first_entry(&callbacks, + struct blk_plug_cb, + list); + list_del(&cb->list); + cb->callback(cb); + } +} + void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request_queue *q; @@ -2688,6 +2707,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) BUG_ON(plug->magic != PLUG_MAGIC); + flush_plug_callbacks(plug); if (list_empty(&plug->list)) return; -- cgit v1.2.2 From b4cb290e0a7d19235bd075c2ad4d60dbab0bac15 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Apr 2011 09:54:05 +0200 Subject: Revert "block: add callback function for unplug notification" MD can't use this since it really requires us to be able to keep more than a single piece of state for the unplug. Commit 048c9374 added the required support for MD, so get rid of this now unused code. This reverts commit f75664570d8b75469cc468f23c2b27220984983b. 
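The difference between the two approaches is that a bare function pointer on the queue cannot carry per-caller state, while a callback object threaded onto the plug (commit 048c9374's struct blk_plug_cb) can. A userspace sketch of that pattern; md_unplug(), pending_bitmap_writes and the hand-rolled singly linked list are all hypothetical stand-ins:

    #include <stdio.h>

    struct plug_cb {
        void (*callback)(struct plug_cb *cb);
        struct plug_cb *next;         /* stand-in for the kernel's list_head */
    };

    struct md_cb {
        struct plug_cb cb;            /* embedded callback object */
        int pending_bitmap_writes;    /* the extra state MD needs to keep */
    };

    static void md_unplug(struct plug_cb *cb)
    {
        struct md_cb *md = (struct md_cb *)cb;   /* container_of() in the kernel */

        printf("flushing %d bitmap writes\n", md->pending_bitmap_writes);
    }

    static void flush_callbacks(struct plug_cb *head)
    {
        for (struct plug_cb *cb = head; cb; cb = cb->next)
            cb->callback(cb);
    }

    int main(void)
    {
        struct md_cb a = { { md_unplug, NULL }, 3 };
        struct md_cb b = { { md_unplug, &a.cb }, 7 };

        flush_callbacks(&b.cb);       /* two subscribers, each with its own state */
        return 0;
    }

In the kernel the object would embed a list_head and the callback would recover its container with container_of(); the plain cast works here only because the embedded member is placed first.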
Conflicts: block/blk-core.c Signed-off-by: Jens Axboe --- block/blk-core.c | 3 --- block/blk-settings.c | 16 ---------------- 2 files changed, 19 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 77edf0512338..09b262811fff 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2674,9 +2674,6 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth, { trace_block_unplug(q, depth, !from_schedule); __blk_run_queue(q, from_schedule); - - if (q->unplugged_fn) - q->unplugged_fn(q); } static void flush_plug_callbacks(struct blk_plug *plug) diff --git a/block/blk-settings.c b/block/blk-settings.c index eb949045bb12..1fa769293597 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -790,22 +790,6 @@ void blk_queue_flush(struct request_queue *q, unsigned int flush) } EXPORT_SYMBOL_GPL(blk_queue_flush); -/** - * blk_queue_unplugged - register a callback for an unplug event - * @q: the request queue for the device - * @fn: the function to call - * - * Some stacked drivers may need to know when IO is dispatched on an - * unplug event. By registrering a callback here, they will be notified - * when someone flushes their on-stack queue plug. The function will be - * called with the queue lock held. - */ -void blk_queue_unplugged(struct request_queue *q, unplugged_fn *fn) -{ - q->unplugged_fn = fn; -} -EXPORT_SYMBOL(blk_queue_unplugged); - static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; -- cgit v1.2.2 From 99e22598e9a8e0a996d69c8c0f6b7027cb57720a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Apr 2011 09:59:55 +0200 Subject: block: drop queue lock before calling __blk_run_queue() for kblockd punt If we know we are going to punt to kblockd, we can drop the queue lock before calling into __blk_run_queue() since it only does a safe bit test and a workqueue call. Since kblockd needs to grab this very lock as one of the first things it does, it's a good optimization to drop the lock before waking kblockd. Signed-off-by: Jens Axboe --- block/blk-core.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 09b262811fff..5e413933bc3a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -295,7 +295,8 @@ EXPORT_SYMBOL(blk_sync_queue); * * Description: * See @blk_run_queue. This variant must be called with the queue lock - * held and interrupts disabled. + * held and interrupts disabled. If force_kblockd is true, then it is + * safe to call this without holding the queue lock. * */ void __blk_run_queue(struct request_queue *q, bool force_kblockd) @@ -2671,9 +2672,23 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) */ static void queue_unplugged(struct request_queue *q, unsigned int depth, bool from_schedule) + __releases(q->queue_lock) { trace_block_unplug(q, depth, !from_schedule); - __blk_run_queue(q, from_schedule); + + /* + * If we are punting this to kblockd, then we can safely drop + * the queue_lock before waking kblockd (which needs to take + * this lock). 
+ */ + if (from_schedule) { + spin_unlock(q->queue_lock); + __blk_run_queue(q, true); + } else { + __blk_run_queue(q, false); + spin_unlock(q->queue_lock); + } + } static void flush_plug_callbacks(struct blk_plug *plug) @@ -2729,10 +2744,11 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG)); BUG_ON(!rq->q); if (rq->q != q) { - if (q) { + /* + * This drops the queue lock + */ + if (q) queue_unplugged(q, depth, from_schedule); - spin_unlock(q->queue_lock); - } q = rq->q; depth = 0; spin_lock(q->queue_lock); @@ -2750,10 +2766,11 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) depth++; } - if (q) { + /* + * This drops the queue lock + */ + if (q) queue_unplugged(q, depth, from_schedule); - spin_unlock(q->queue_lock); - } local_irq_restore(flags); } -- cgit v1.2.2 From 4521cc4ed5173f92714f6999a69910c3385fed68 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Apr 2011 11:36:39 +0200 Subject: block: blk_delay_queue() should use kblockd workqueue Reported-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 5e413933bc3a..e2bacfa46cc3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -220,7 +220,8 @@ static void blk_delay_work(struct work_struct *work) */ void blk_delay_queue(struct request_queue *q, unsigned long msecs) { - schedule_delayed_work(&q->delay_work, msecs_to_jiffies(msecs)); + queue_delayed_work(kblockd_workqueue, &q->delay_work, + msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_delay_queue); -- cgit v1.2.2 From 24ecfbe27f65563909b14492afda2f1c21f7c044 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Apr 2011 11:41:33 +0200 Subject: block: add blk_run_queue_async Instead of overloading __blk_run_queue to force an offload to kblockd add a new blk_run_queue_async helper to do it explicitly. I've kept the blk_queue_stopped check for now, but I suspect it's not needed as the check we do when the workqueue items runs should be enough. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 36 ++++++++++++++++++++++++------------ block/blk-exec.c | 2 +- block/blk-flush.c | 4 ++-- block/blk.h | 1 + block/cfq-iosched.c | 6 +++--- block/elevator.c | 4 ++-- 6 files changed, 33 insertions(+), 20 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index e2bacfa46cc3..5fa3dd2705c6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -204,7 +204,7 @@ static void blk_delay_work(struct work_struct *work) q = container_of(work, struct request_queue, delay_work.work); spin_lock_irq(q->queue_lock); - __blk_run_queue(q, false); + __blk_run_queue(q); spin_unlock_irq(q->queue_lock); } @@ -239,7 +239,7 @@ void blk_start_queue(struct request_queue *q) WARN_ON(!irqs_disabled()); queue_flag_clear(QUEUE_FLAG_STOPPED, q); - __blk_run_queue(q, false); + __blk_run_queue(q); } EXPORT_SYMBOL(blk_start_queue); @@ -296,11 +296,9 @@ EXPORT_SYMBOL(blk_sync_queue); * * Description: * See @blk_run_queue. This variant must be called with the queue lock - * held and interrupts disabled. If force_kblockd is true, then it is - * safe to call this without holding the queue lock. - * + * held and interrupts disabled. 
*/ -void __blk_run_queue(struct request_queue *q, bool force_kblockd) +void __blk_run_queue(struct request_queue *q) { if (unlikely(blk_queue_stopped(q))) return; @@ -309,7 +307,7 @@ void __blk_run_queue(struct request_queue *q, bool force_kblockd) * Only recurse once to avoid overrunning the stack, let the unplug * handling reinvoke the handler shortly if we already got there. */ - if (!force_kblockd && !queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { + if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { q->request_fn(q); queue_flag_clear(QUEUE_FLAG_REENTER, q); } else @@ -317,6 +315,20 @@ void __blk_run_queue(struct request_queue *q, bool force_kblockd) } EXPORT_SYMBOL(__blk_run_queue); +/** + * blk_run_queue_async - run a single device queue in workqueue context + * @q: The queue to run + * + * Description: + * Tells kblockd to perform the equivalent of @blk_run_queue on behalf + * of us. + */ +void blk_run_queue_async(struct request_queue *q) +{ + if (likely(!blk_queue_stopped(q))) + queue_delayed_work(kblockd_workqueue, &q->delay_work, 0); +} + /** * blk_run_queue - run a single device queue * @q: The queue to run @@ -330,7 +342,7 @@ void blk_run_queue(struct request_queue *q) unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); - __blk_run_queue(q, false); + __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL(blk_run_queue); @@ -979,7 +991,7 @@ void blk_insert_request(struct request_queue *q, struct request *rq, blk_queue_end_tag(q, rq); add_acct_request(q, rq, where); - __blk_run_queue(q, false); + __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL(blk_insert_request); @@ -1323,7 +1335,7 @@ get_rq: } else { spin_lock_irq(q->queue_lock); add_acct_request(q, req, where); - __blk_run_queue(q, false); + __blk_run_queue(q); out_unlock: spin_unlock_irq(q->queue_lock); } @@ -2684,9 +2696,9 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth, */ if (from_schedule) { spin_unlock(q->queue_lock); - __blk_run_queue(q, true); + blk_run_queue_async(q); } else { - __blk_run_queue(q, false); + __blk_run_queue(q); spin_unlock(q->queue_lock); } diff --git a/block/blk-exec.c b/block/blk-exec.c index 7482b7fa863b..81e31819a597 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -55,7 +55,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, WARN_ON(irqs_disabled()); spin_lock_irq(q->queue_lock); __elv_add_request(q, rq, where); - __blk_run_queue(q, false); + __blk_run_queue(q); /* the queue is stopped so it won't be plugged+unplugged */ if (rq->cmd_type == REQ_TYPE_PM_RESUME) q->request_fn(q); diff --git a/block/blk-flush.c b/block/blk-flush.c index eba4a2790c6c..6c9b5e189e62 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -218,7 +218,7 @@ static void flush_end_io(struct request *flush_rq, int error) * request_fn may confuse the driver. Always use kblockd. */ if (queued) - __blk_run_queue(q, true); + blk_run_queue_async(q); } /** @@ -274,7 +274,7 @@ static void flush_data_end_io(struct request *rq, int error) * the comment in flush_end_io(). 
*/ if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) - __blk_run_queue(q, true); + blk_run_queue_async(q); } /** diff --git a/block/blk.h b/block/blk.h index 61263463e38e..c9df8fc3c999 100644 --- a/block/blk.h +++ b/block/blk.h @@ -22,6 +22,7 @@ void blk_rq_timed_out_timer(unsigned long data); void blk_delete_timer(struct request *); void blk_add_timer(struct request *); void __generic_unplug_device(struct request_queue *); +void blk_run_queue_async(struct request_queue *q); /* * Internal atomic flags for request handling diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3be881ec95ad..46b0a1d1d925 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3368,7 +3368,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqd->busy_queues > 1) { cfq_del_timer(cfqd, cfqq); cfq_clear_cfqq_wait_request(cfqq); - __blk_run_queue(cfqd->queue, false); + __blk_run_queue(cfqd->queue); } else { cfq_blkiocg_update_idle_time_stats( &cfqq->cfqg->blkg); @@ -3383,7 +3383,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, * this new queue is RT and the current one is BE */ cfq_preempt_queue(cfqd, cfqq); - __blk_run_queue(cfqd->queue, false); + __blk_run_queue(cfqd->queue); } } @@ -3743,7 +3743,7 @@ static void cfq_kick_queue(struct work_struct *work) struct request_queue *q = cfqd->queue; spin_lock_irq(q->queue_lock); - __blk_run_queue(cfqd->queue, false); + __blk_run_queue(cfqd->queue); spin_unlock_irq(q->queue_lock); } diff --git a/block/elevator.c b/block/elevator.c index 0cdb4e7ebab4..6f6abc08bb56 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -642,7 +642,7 @@ void elv_quiesce_start(struct request_queue *q) */ elv_drain_elevator(q); while (q->rq.elvpriv) { - __blk_run_queue(q, false); + __blk_run_queue(q); spin_unlock_irq(q->queue_lock); msleep(10); spin_lock_irq(q->queue_lock); @@ -695,7 +695,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) * with anything. There's no point in delaying queue * processing. */ - __blk_run_queue(q, false); + __blk_run_queue(q); break; case ELEVATOR_INSERT_SORT_MERGE: -- cgit v1.2.2 From bd900d4580107c899d43b262fbbd995f11097a43 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Apr 2011 22:06:57 +0200 Subject: block: kill blk_flush_plug_list() export With all drivers and file systems converted, we only have in-core use of this function. So remove the export. Reporteed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 5fa3dd2705c6..580eee5743e5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2787,7 +2787,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) local_irq_restore(flags); } -EXPORT_SYMBOL(blk_flush_plug_list); void blk_finish_plug(struct blk_plug *plug) { -- cgit v1.2.2 From 5f45c69589b7d2953584e6cd0b31e35dbe960ad0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Apr 2011 09:10:35 +0200 Subject: cfq-iosched: read_lock() does not always imply rcu_read_lock() For some configurations of CONFIG_PREEMPT that is not true. So get rid of __call_for_each_cic() and always uses the explicitly rcu_read_lock() protected call_for_each_cic() instead. This fixes a potential bug related to IO scheduler removal or online switching. Thanks to Paul McKenney for clarifying this. 
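A minimal sketch of the safe pattern (editor's illustration with placeholder types, not taken from the patch): take the RCU read lock explicitly around the traversal instead of assuming that read_lock() provides it.

struct foo {
        struct hlist_node node;
        /* ... payload ... */
};

static void for_each_foo(struct hlist_head *head, void (*fn)(struct foo *))
{
        struct foo *f;
        struct hlist_node *n;

        rcu_read_lock();                /* explicit; read_lock() is not enough */
        hlist_for_each_entry_rcu(f, n, head, node)
                fn(f);
        rcu_read_unlock();
}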
Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 46b0a1d1d925..5b52011e3a40 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2582,28 +2582,20 @@ static void cfq_put_queue(struct cfq_queue *cfqq) } /* - * Must always be called with the rcu_read_lock() held + * Call func for each cic attached to this ioc. */ static void -__call_for_each_cic(struct io_context *ioc, - void (*func)(struct io_context *, struct cfq_io_context *)) +call_for_each_cic(struct io_context *ioc, + void (*func)(struct io_context *, struct cfq_io_context *)) { struct cfq_io_context *cic; struct hlist_node *n; + rcu_read_lock(); + hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) func(ioc, cic); -} -/* - * Call func for each cic attached to this ioc. - */ -static void -call_for_each_cic(struct io_context *ioc, - void (*func)(struct io_context *, struct cfq_io_context *)) -{ - rcu_read_lock(); - __call_for_each_cic(ioc, func); rcu_read_unlock(); } @@ -2664,7 +2656,7 @@ static void cfq_free_io_context(struct io_context *ioc) * should be ok to iterate over the known list, we will see all cic's * since no new ones are added. */ - __call_for_each_cic(ioc, cic_free_func); + call_for_each_cic(ioc, cic_free_func); } static void cfq_put_cooperator(struct cfq_queue *cfqq) -- cgit v1.2.2 From c21e6beba8835d09bb80e34961430b13e60381c5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Apr 2011 13:32:46 +0200 Subject: block: get rid of QUEUE_FLAG_REENTER We are currently using this flag to check whether it's safe to call into ->request_fn(). If it is set, we punt to kblockd. But we get a lot of false positives and excessive punts to kblockd, which hurts performance. The only real abuser of this infrastructure is SCSI. So export the async queue run and convert SCSI over to use that. There's room for improvement in that SCSI need not always use the async call, but this fixes our performance issue and they can fix that up in due time. Signed-off-by: Jens Axboe --- block/blk-core.c | 11 ++--------- block/blk.h | 1 - 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 580eee5743e5..40725b9091f1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -303,15 +303,7 @@ void __blk_run_queue(struct request_queue *q) if (unlikely(blk_queue_stopped(q))) return; - /* - * Only recurse once to avoid overrunning the stack, let the unplug - * handling reinvoke the handler shortly if we already got there. 
- */ - if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { - q->request_fn(q); - queue_flag_clear(QUEUE_FLAG_REENTER, q); - } else - queue_delayed_work(kblockd_workqueue, &q->delay_work, 0); + q->request_fn(q); } EXPORT_SYMBOL(__blk_run_queue); @@ -328,6 +320,7 @@ void blk_run_queue_async(struct request_queue *q) if (likely(!blk_queue_stopped(q))) queue_delayed_work(kblockd_workqueue, &q->delay_work, 0); } +EXPORT_SYMBOL(blk_run_queue_async); /** * blk_run_queue - run a single device queue diff --git a/block/blk.h b/block/blk.h index c9df8fc3c999..61263463e38e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -22,7 +22,6 @@ void blk_rq_timed_out_timer(unsigned long data); void blk_delete_timer(struct request *); void blk_add_timer(struct request *); void __generic_unplug_device(struct request_queue *); -void blk_run_queue_async(struct request_queue *q); /* * Internal atomic flags for request handling -- cgit v1.2.2 From d350e6b6e819df0a383ff34465720bfaa0f91c79 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Apr 2011 13:34:14 +0200 Subject: block: remove stale kerneldoc member from __blk_run_queue() We don't pass in a 'force_kblockd' anymore, get rid of the stsale comment. Reported-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 40725b9091f1..a2e58eeb3549 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -292,7 +292,6 @@ EXPORT_SYMBOL(blk_sync_queue); /** * __blk_run_queue - run a single device queue * @q: The queue to run - * @force_kblockd: Don't run @q->request_fn directly. Use kblockd. * * Description: * See @blk_run_queue. This variant must be called with the queue lock -- cgit v1.2.2 From ed5302d3c25006a9edc7a7fbea97a30483f89ef7 Mon Sep 17 00:00:00 2001 From: Liu Yuan Date: Tue, 19 Apr 2011 13:47:58 +0200 Subject: block, blk-sysfs: Fix an err return path in blk_register_queue() We do not call blk_trace_remove_sysfs() in err return path if kobject_add() fails. This path fixes it. Cc: stable@kernel.org Signed-off-by: Liu Yuan Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 6d735122bc59..5d696adbc4dc 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -508,8 +508,10 @@ int blk_register_queue(struct gendisk *disk) return ret; ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); - if (ret < 0) + if (ret < 0) { + blk_trace_remove_sysfs(dev); return ret; + } kobject_uevent(&q->kobj, KOBJ_ADD); -- cgit v1.2.2 From 60735b6362f29b52b5635a2dfa9ab5ad39948345 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 19 Apr 2011 13:50:40 +0200 Subject: block: Remove the extra check in queue_requests_store In queue_requests_store, the code looks like if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { blk_set_queue_full(q, BLK_RW_SYNC); } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) { blk_clear_queue_full(q, BLK_RW_SYNC); wake_up(&rl->wait[BLK_RW_SYNC]); } If we don't satify the situation of "if", we can get that rl->count[BLK_RW_SYNC} < q->nr_quests. It is the same as rl->count[BLK_RW_SYNC]+1 <= q->nr_requests. All the "else" should satisfy the "else if" check so it isn't needed actually. 
Signed-off-by: Tao Ma Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 5d696adbc4dc..bd236313f35d 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -66,14 +66,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { blk_set_queue_full(q, BLK_RW_SYNC); - } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) { + } else { blk_clear_queue_full(q, BLK_RW_SYNC); wake_up(&rl->wait[BLK_RW_SYNC]); } if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { blk_set_queue_full(q, BLK_RW_ASYNC); - } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) { + } else { blk_clear_queue_full(q, BLK_RW_ASYNC); wake_up(&rl->wait[BLK_RW_ASYNC]); } -- cgit v1.2.2 From 3aa72873ffdcc2f7919743efbbefc351ec73f5cb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 21 Apr 2011 19:28:35 +0200 Subject: elevator: check for ELEVATOR_INSERT_SORT_MERGE in !elvpriv case too The sort insert is the one that goes to the IO scheduler. With the SORT_MERGE addition, we could bypass IO scheduler setup but still ask the IO scheduler to insert the request. This would cause an oops on switching IO schedulers through the sysfs interface, unless the disk just happened to be idle while it occured. Signed-off-by: Jens Axboe --- block/elevator.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 6f6abc08bb56..45ca1e34f582 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -671,7 +671,8 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) q->boundary_rq = rq; } } else if (!(rq->cmd_flags & REQ_ELVPRIV) && - where == ELEVATOR_INSERT_SORT) + (where == ELEVATOR_INSERT_SORT || + where == ELEVATOR_INSERT_SORT_MERGE)) where = ELEVATOR_INSERT_BACK; switch (where) { -- cgit v1.2.2 From 7c88a168da8003fd4d8fb6ae103c4ecf29cb1130 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Apr 2011 19:43:58 +0200 Subject: block: don't propagate unlisted DISK_EVENTs to userland DISK_EVENT_MEDIA_CHANGE is used for both userland visible event and internal event for revalidation of removeable devices. Some legacy drivers don't implement proper event detection and continuously generate events under certain circumstances. For example, ide-cd generates media changed continuously if there's no media in the drive, which can lead to infinite loop of events jumping back and forth between the driver and userland event handler. This patch updates disk event infrastructure such that it never propagates events not listed in disk->events to userland. Those events are processed the same for internal purposes but uevent generation is suppressed. This also ensures that userland only gets events which are advertised in the @events sysfs node lowering risk of confusion. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index b364bd038a18..2dd988723d73 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1588,9 +1588,13 @@ static void disk_events_workfn(struct work_struct *work) spin_unlock_irq(&ev->lock); - /* tell userland about new events */ + /* + * Tell userland about new events. Only the events listed in + * @disk->events are reported. 
Unlisted events are processed the + * same internally but never get reported to userland. + */ for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) - if (events & (1 << i)) + if (events & disk->events & (1 << i)) envp[nr_events++] = disk_uevents[i]; if (nr_events) -- cgit v1.2.2 From addd0a09fc06179f2e02b4221775d9ab265c9fc7 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 5 May 2011 15:10:05 -0600 Subject: block: Remove 'plug/unplug' comment in blk_execute_rq_nowait unplug is replaced with blk_run_queue now in blk_execute_rq_nowait, so change the comment accordingly. Signed-off-by: Tao Ma Signed-off-by: Jens Axboe --- block/blk-exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-exec.c b/block/blk-exec.c index 81e31819a597..8a0e7ec056e7 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -56,7 +56,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, spin_lock_irq(q->queue_lock); __elv_add_request(q, rq, where); __blk_run_queue(q); - /* the queue is stopped so it won't be plugged+unplugged */ + /* the queue is stopped so it won't be run */ if (rq->cmd_type == REQ_TYPE_PM_RESUME) q->request_fn(q); spin_unlock_irq(q->queue_lock); -- cgit v1.2.2 From 490b94be0282c3b67f56453628ff0aaae827a670 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 5 May 2011 18:02:12 -0600 Subject: iosched: remove redundant sprintf After the anticipatory scheduler was dropped, there was no need to special-case the request_module string. As such, drop the redundant sprintf and stack variable. Signed-off-by: Kees Cook Signed-off-by: Jens Axboe --- block/elevator.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 6f6abc08bb56..3cd0d8c84902 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -155,13 +155,8 @@ static struct elevator_type *elevator_get(const char *name) e = elevator_find(name); if (!e) { - char elv[ELV_NAME_MAX + strlen("-iosched")]; - spin_unlock(&elv_list_lock); - - snprintf(elv, sizeof(elv), "%s-iosched", name); - - request_module("%s", elv); + request_module("%s-iosched", name); spin_lock(&elv_list_lock); e = elevator_find(name); } -- cgit v1.2.2 From f3876930952390a31c3a7fd68dd621464a36eb80 Mon Sep 17 00:00:00 2001 From: "shaohua.li@intel.com" Date: Fri, 6 May 2011 11:34:32 -0600 Subject: block: add a non-queueable flush flag flush request isn't queueable in some drives. Add a flag to let driver notify block layer about this. We can optimize flush performance with the knowledge. 
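A hedged sketch of how a low-level driver might use the new hook (my_dev_setup_queue() is hypothetical; only blk_queue_flush() and blk_queue_flush_queueable() are real interfaces here):

static void my_dev_setup_queue(struct request_queue *q)
{
        /* the device has a volatile write cache, so it needs flushes */
        blk_queue_flush(q, REQ_FLUSH);
        /* but it cannot queue other requests behind a running flush */
        blk_queue_flush_queueable(q, false);
}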
Stable: 2.6.39 only Cc: stable@kernel.org Signed-off-by: Shaohua Li Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-settings.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 1fa769293597..cd3c428e194f 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -790,6 +790,12 @@ void blk_queue_flush(struct request_queue *q, unsigned int flush) } EXPORT_SYMBOL_GPL(blk_queue_flush); +void blk_queue_flush_queueable(struct request_queue *q, bool queueable) +{ + q->flush_not_queueable = !queueable; +} +EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); + static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; -- cgit v1.2.2 From 3ac0cc4508709d42ec9aa351086c7d38bfc0660c Mon Sep 17 00:00:00 2001 From: "shaohua.li@intel.com" Date: Fri, 6 May 2011 11:34:41 -0600 Subject: block: hold queue if flush is running for non-queueable flush drive In some drives, flush requests are non-queueable. When flush request is running, normal read/write requests can't run. If block layer dispatches such request, driver can't handle it and requeue it. Tejun suggested we can hold the queue when flush is running. This can avoid unnecessary requeue. Also this can improve performance. For example, we have request flush1, write1, flush 2. flush1 is dispatched, then queue is hold, write1 isn't inserted to queue. After flush1 is finished, flush2 will be dispatched. Since disk cache is already clean, flush2 will be finished very soon, so looks like flush2 is folded to flush1. In my test, the queue holding completely solves a regression introduced by commit 53d63e6b0dfb95882ec0219ba6bbd50cde423794: block: make the flush insertion use the tail of the dispatch list It's not a preempt type request, in fact we have to insert it behind requests that do specify INSERT_FRONT. which causes about 20% regression running a sysbench fileio workload. Stable: 2.6.39 only Cc: stable@kernel.org Signed-off-by: Shaohua Li Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-flush.c | 16 +++++++++++----- block/blk.h | 21 ++++++++++++++++++++- 2 files changed, 31 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 6c9b5e189e62..bb21e4c36f70 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -212,13 +212,19 @@ static void flush_end_io(struct request *flush_rq, int error) } /* - * Moving a request silently to empty queue_head may stall the - * queue. Kick the queue in those cases. This function is called - * from request completion path and calling directly into - * request_fn may confuse the driver. Always use kblockd. + * Kick the queue to avoid stall for two cases: + * 1. Moving a request silently to empty queue_head may stall the + * queue. + * 2. When flush request is running in non-queueable queue, the + * queue is hold. Restart the queue after flush request is finished + * to avoid stall. + * This function is called from request completion path and calling + * directly into request_fn may confuse the driver. Always use + * kblockd. 
*/ - if (queued) + if (queued || q->flush_queue_delayed) blk_run_queue_async(q); + q->flush_queue_delayed = 0; } /** diff --git a/block/blk.h b/block/blk.h index c9df8fc3c999..83e4bff36201 100644 --- a/block/blk.h +++ b/block/blk.h @@ -62,7 +62,26 @@ static inline struct request *__elv_next_request(struct request_queue *q) rq = list_entry_rq(q->queue_head.next); return rq; } - + /* + * Flush request is running and flush request isn't queueable + * in the drive, we can hold the queue till flush request is + * finished. Even we don't do this, driver can't dispatch next + * requests and will requeue them. And this can improve + * throughput too. For example, we have request flush1, write1, + * flush 2. flush1 is dispatched, then queue is hold, write1 + * isn't inserted to queue. After flush1 is finished, flush2 + * will be dispatched. Since disk cache is already clean, + * flush2 will be finished very soon, so looks like flush2 is + * folded to flush1. + * Since the queue is hold, a flag is set to indicate the queue + * should be restarted later. Please see flush_end_io() for + * details. + */ + if (q->flush_pending_idx != q->flush_running_idx && + !queue_flush_queueable(q)) { + q->flush_queue_delayed = 1; + return NULL; + } if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) return NULL; } -- cgit v1.2.2 From 5dba3089ed03f84b84c6c739df8330112f04a15d Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 6 May 2011 19:26:27 -0600 Subject: blkdev: Submit discard bio in batches in blkdev_issue_discard() Currently we are waiting for every submitted REQ_DISCARD bio separately, but it can have unwanted consequences of repeatedly flushing the queue, so we rather submit bios in batches and wait for the entire batch, hence narrowing the window of other ios going in. Use bio_batch_end_io() and struct bio_batch for that purpose, the same is used by blkdev_issue_zeroout(). Also change bio_batch_end_io() so we always set !BIO_UPTODATE in the case of error and remove the check for bb, since we are the only user of this function and we always set this. Remove bio_get()/bio_put() from the blkdev_issue_discard() since bio_alloc() and bio_batch_end_io() is doing the same thing, hence it is not needed anymore. I have done simple dd testing with surprising results. The script I have used is: for i in $(seq 10); do echo $i dd if=/dev/sdb1 of=/dev/sdc1 bs=4k & sleep 5 done /usr/bin/time -f %e ./blkdiscard /dev/sdc1 Running time of BLKDISCARD on the whole device: with patch without patch 0.95 15.58 So we can see that in this artificial test the kernel with the patch applied is approx 16x faster in discarding the device. 
Signed-off-by: Lukas Czerner CC: Dmitry Monakhov CC: Jens Axboe CC: Jeff Moyer Signed-off-by: Jens Axboe --- block/blk-lib.c | 69 ++++++++++++++++++++++++--------------------------------- 1 file changed, 29 insertions(+), 40 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 25de73e4759b..9260cb0b209b 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -9,17 +9,23 @@ #include "blk.h" -static void blkdev_discard_end_io(struct bio *bio, int err) +struct bio_batch { + atomic_t done; + unsigned long flags; + struct completion *wait; +}; + +static void bio_batch_end_io(struct bio *bio, int err) { + struct bio_batch *bb = bio->bi_private; + if (err) { if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - clear_bit(BIO_UPTODATE, &bio->bi_flags); + set_bit(BIO_EOPNOTSUPP, &bb->flags); + clear_bit(BIO_UPTODATE, &bb->flags); } - - if (bio->bi_private) - complete(bio->bi_private); - + if (atomic_dec_and_test(&bb->done)) + complete(bb->wait); bio_put(bio); } @@ -41,6 +47,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct request_queue *q = bdev_get_queue(bdev); int type = REQ_WRITE | REQ_DISCARD; unsigned int max_discard_sectors; + struct bio_batch bb; struct bio *bio; int ret = 0; @@ -67,7 +74,11 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, type |= REQ_SECURE; } - while (nr_sects && !ret) { + atomic_set(&bb.done, 1); + bb.flags = 1 << BIO_UPTODATE; + bb.wait = &wait; + + while (nr_sects) { bio = bio_alloc(gfp_mask, 1); if (!bio) { ret = -ENOMEM; @@ -75,9 +86,9 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, } bio->bi_sector = sector; - bio->bi_end_io = blkdev_discard_end_io; + bio->bi_end_io = bio_batch_end_io; bio->bi_bdev = bdev; - bio->bi_private = &wait; + bio->bi_private = &bb; if (nr_sects > max_discard_sectors) { bio->bi_size = max_discard_sectors << 9; @@ -88,45 +99,23 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, nr_sects = 0; } - bio_get(bio); + atomic_inc(&bb.done); submit_bio(type, bio); + } + /* Wait for bios in-flight */ + if (!atomic_dec_and_test(&bb.done)) wait_for_completion(&wait); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); - } + if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) + ret = -EOPNOTSUPP; + else if (!test_bit(BIO_UPTODATE, &bb.flags)) + ret = -EIO; return ret; } EXPORT_SYMBOL(blkdev_issue_discard); -struct bio_batch -{ - atomic_t done; - unsigned long flags; - struct completion *wait; -}; - -static void bio_batch_end_io(struct bio *bio, int err) -{ - struct bio_batch *bb = bio->bi_private; - - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bb->flags); - else - clear_bit(BIO_UPTODATE, &bb->flags); - } - if (bb) - if (atomic_dec_and_test(&bb->done)) - complete(bb->wait); - bio_put(bio); -} - /** * blkdev_issue_zeroout - generate number of zero filed write bios * @bdev: blockdev to issue -- cgit v1.2.2 From 5baebe5c86acd6100536a466905880529f79cf1a Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 6 May 2011 19:26:28 -0600 Subject: blkdev: Simple cleanup in blkdev_issue_zeroout() In blkdev_issue_zeroout() we are submitting regular WRITE bios, so we do not need to check for -EOPNOTSUPP specifically in case of error. Also there is no need to have label submit: because there is no way to jump out from the while cycle without an error and we really want to exit, rather than try again. 
And also remove the check for (sz == 0) since at that point sz can never be zero. Signed-off-by: Lukas Czerner Reviewed-by: Jeff Moyer CC: Dmitry Monakhov CC: Jens Axboe Signed-off-by: Jens Axboe --- block/blk-lib.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 9260cb0b209b..d7a98d3ed4aa 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -140,7 +140,6 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, bb.flags = 1 << BIO_UPTODATE; bb.wait = &wait; -submit: ret = 0; while (nr_sects != 0) { bio = bio_alloc(gfp_mask, @@ -157,9 +156,6 @@ submit: while (nr_sects != 0) { sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); - if (sz == 0) - /* bio has maximum size possible */ - break; ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); nr_sects -= ret >> 9; sector += ret >> 9; @@ -179,16 +175,6 @@ submit: /* One of bios in the batch was completed with error.*/ ret = -EIO; - if (ret) - goto out; - - if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) { - ret = -EOPNOTSUPP; - goto out; - } - if (nr_sects != 0) - goto submit; -out: return ret; } EXPORT_SYMBOL(blkdev_issue_zeroout); -- cgit v1.2.2 From 8af1954d172a46a63e5e79dae523a6d74715e458 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 6 May 2011 19:30:01 -0600 Subject: blkdev: Do not return -EOPNOTSUPP if discard is supported Currently we return -EOPNOTSUPP in blkdev_issue_discard() if any of the bio fails due to underlying device not supporting discard request. However, if the device is for example dm device composed of devices which some of them support discard and some of them does not, it is ok for some bios to fail with EOPNOTSUPP, but it does not mean that discard is not supported at all. This commit removes the check for bios failed with EOPNOTSUPP and change blkdev_issue_discard() to return operation not supported if and only if the device does not actually supports it, not just part of the device as some bios might indicate. This change also fixes problem with BLKDISCARD ioctl() which now works correctly on such dm devices. Signed-off-by: Lukas Czerner CC: Jens Axboe CC: Jeff Moyer Signed-off-by: Jens Axboe --- block/blk-lib.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index d7a98d3ed4aa..78e627e2581d 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -19,11 +19,8 @@ static void bio_batch_end_io(struct bio *bio, int err) { struct bio_batch *bb = bio->bi_private; - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bb->flags); + if (err && (err != -EOPNOTSUPP)) clear_bit(BIO_UPTODATE, &bb->flags); - } if (atomic_dec_and_test(&bb->done)) complete(bb->wait); bio_put(bio); @@ -107,9 +104,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!atomic_dec_and_test(&bb.done)) wait_for_completion(&wait); - if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) - ret = -EOPNOTSUPP; - else if (!test_bit(BIO_UPTODATE, &bb.flags)) + if (!test_bit(BIO_UPTODATE, &bb.flags)) ret = -EIO; return ret; -- cgit v1.2.2 From 70087dc38cc77ca8f46059564c00338777734762 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 16 May 2011 15:24:08 +0200 Subject: blk-throttle: Use task_subsys_state() to determine a task's blkio_cgroup Currentlly we first map the task to cgroup and then cgroup to blkio_cgroup. There is a more direct way to get to blkio_cgroup from task using task_subsys_state(). Use that. 
The real reason for the fix is that it also avoids a race in generic cgroup code. During remount/umount rebind_subsystems() is called and it can do following with and rcu protection. cgrp->subsys[i] = NULL; That means if somebody got hold of cgroup under rcu and then it tried to do cgroup->subsys[] to get to blkio_cgroup, it would get NULL which is wrong. I was running into this race condition with ltp running on a upstream derived kernel and that lead to crash. So ideally we should also fix cgroup generic code to wait for rcu grace period before setting pointer to NULL. Li Zefan is not very keen on introducing synchronize_wait() as he thinks it will slow down moun/remount/umount operations. So for the time being atleast fix the kernel crash by taking a more direct route to blkio_cgroup. One tester had reported a crash while running LTP on a derived kernel and with this fix crash is no more seen while the test has been running for over 6 days. Signed-off-by: Vivek Goyal Reviewed-by: Li Zefan Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 7 +++++++ block/blk-cgroup.h | 3 +++ block/blk-throttle.c | 9 ++++----- block/cfq-iosched.c | 11 +++++------ 4 files changed, 19 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index f0605ab2a761..471fdcc5df85 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -114,6 +114,13 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); +struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, blkio_subsys_id), + struct blkio_cgroup, css); +} +EXPORT_SYMBOL_GPL(task_blkio_cgroup); + static inline void blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) { diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 10919fae2d3a..c774930cc206 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -291,6 +291,7 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); +extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk); extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev, enum blkio_policy_id plid); @@ -314,6 +315,8 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg, struct cgroup; static inline struct blkio_cgroup * cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } +static inline struct blkio_cgroup * +task_blkio_cgroup(struct task_struct *tsk) { return NULL; } static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev, diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 0475a22a420d..252a81a306f7 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -160,9 +160,8 @@ static void throtl_put_tg(struct throtl_grp *tg) } static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, - struct cgroup *cgroup) + struct blkio_cgroup *blkcg) { - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); struct throtl_grp *tg = NULL; void *key = td; struct backing_dev_info *bdi = &td->queue->backing_dev_info; @@ -229,12 +228,12 @@ done: static struct throtl_grp * throtl_get_tg(struct throtl_data *td) { - struct cgroup *cgroup; struct throtl_grp *tg = NULL; + struct 
blkio_cgroup *blkcg; rcu_read_lock(); - cgroup = task_cgroup(current, blkio_subsys_id); - tg = throtl_find_alloc_tg(td, cgroup); + blkcg = task_blkio_cgroup(current); + tg = throtl_find_alloc_tg(td, blkcg); if (!tg) tg = &td->root_tg; rcu_read_unlock(); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5b52011e3a40..ab7a9e6a9b1c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1014,10 +1014,9 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, cfqg->needs_update = true; } -static struct cfq_group * -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) +static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd, + struct blkio_cgroup *blkcg, int create) { - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); struct cfq_group *cfqg = NULL; void *key = cfqd; int i, j; @@ -1079,12 +1078,12 @@ done: */ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) { - struct cgroup *cgroup; + struct blkio_cgroup *blkcg; struct cfq_group *cfqg = NULL; rcu_read_lock(); - cgroup = task_cgroup(current, blkio_subsys_id); - cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); + blkcg = task_blkio_cgroup(current); + cfqg = cfq_find_alloc_cfqg(cfqd, blkcg, create); if (!cfqg && create) cfqg = &cfqd->root_group; rcu_read_unlock(); -- cgit v1.2.2 From a934a00a69e940b126b9bdbf83e630ef5fe43523 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 18 May 2011 10:37:35 +0200 Subject: block: Fix discard topology stacking and reporting In some cases we would end up stacking discard_zeroes_data incorrectly. Fix this by enabling the feature by default for stacking drivers and clearing it for low-level drivers. Incorporating a device that does not support dzd will then cause the feature to be disabled in the stacking driver. Also ensure that the maximum discard value does not overflow when exported in sysfs and return 0 in the alignment and dzd fields for devices that don't support discard. Reported-by: Lukas Czerner Signed-off-by: Martin K. 
Petersen Acked-by: Mike Snitzer Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-settings.c | 3 ++- block/blk-sysfs.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index cd3c428e194f..fa1eb0449a05 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -120,7 +120,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->discard_granularity = 0; lim->discard_alignment = 0; lim->discard_misaligned = 0; - lim->discard_zeroes_data = -1; + lim->discard_zeroes_data = 1; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); lim->alignment_offset = 0; @@ -166,6 +166,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) blk_set_default_limits(&q->limits); blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); + q->limits.discard_zeroes_data = 0; /* * by default assume old behaviour and bounce for any highmem page diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 6d735122bc59..53bd0c77bfda 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -152,7 +152,8 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag static ssize_t queue_discard_max_show(struct request_queue *q, char *page) { - return queue_var_show(q->limits.max_discard_sectors << 9, page); + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_discard_sectors << 9); } static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) -- cgit v1.2.2 From 3ec717b7ca4ee1d75d77e4f6286430d8f01d1dbd Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 18 May 2011 11:22:43 +0200 Subject: block: don't delay blk_run_queue_async Let's check a scenario: 1. blk_delay_queue(q, SCSI_QUEUE_DELAY); 2. blk_run_queue_async(); the second one will became a noop, because q->delay_work already has WORK_STRUCT_PENDING_BIT set, so the delayed work will still run after SCSI_QUEUE_DELAY. But blk_run_queue_async actually hopes the delayed work runs immediately. Fix this by doing a cancel on potentially pending delayed work before queuing an immediate run of the workqueue. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index a2e58eeb3549..3fe00a14822a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -316,8 +316,10 @@ EXPORT_SYMBOL(__blk_run_queue); */ void blk_run_queue_async(struct request_queue *q) { - if (likely(!blk_queue_stopped(q))) + if (likely(!blk_queue_stopped(q))) { + __cancel_delayed_work(&q->delay_work); queue_delayed_work(kblockd_workqueue, &q->delay_work, 0); + } } EXPORT_SYMBOL(blk_run_queue_async); -- cgit v1.2.2 From 0a58e077eb600d1efd7e54ad9926a75a39d7f8ae Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Wed, 18 May 2011 16:20:10 +0200 Subject: block: add proper state guards to __elv_next_request blk_cleanup_queue() calls elevator_exit() and after this, we can't touch the elevator without oopsing. __elv_next_request() must check for this state because in the refcounted queue model, we can still call it after blk_cleanup_queue() has been called. This was reported as causing an oops attributable to scsi. 
Signed-off-by: James Bottomley Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index 61263463e38e..4df474d37e6d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -62,7 +62,8 @@ static inline struct request *__elv_next_request(struct request_queue *q) return rq; } - if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) + if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) || + !q->elevator->ops->elevator_dispatch_fn(q, 0)) return NULL; } } -- cgit v1.2.2 From a29a171e7c46c60842b85729280e2f5690372683 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 19 May 2011 15:38:19 -0400 Subject: blk-throttle: Do the new group initialization with the help of a function Group initialization code seems to be at two places. root group initialization in blk_throtl_init() and dynamically allocated group in throtl_find_alloc_tg(). Create a common function and use at both the places. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 64 ++++++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 29 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 252a81a306f7..fa9a900c1254 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -159,6 +159,35 @@ static void throtl_put_tg(struct throtl_grp *tg) kfree(tg); } +static void throtl_init_group(struct throtl_grp *tg) +{ + INIT_HLIST_NODE(&tg->tg_node); + RB_CLEAR_NODE(&tg->rb_node); + bio_list_init(&tg->bio_lists[0]); + bio_list_init(&tg->bio_lists[1]); + tg->limits_changed = false; + + /* Practically unlimited BW */ + tg->bps[0] = tg->bps[1] = -1; + tg->iops[0] = tg->iops[1] = -1; + + /* + * Take the initial reference that will be released on destroy + * This can be thought of a joint reference by cgroup and + * request queue which will be dropped by either request queue + * exit or cgroup deletion path depending on who is exiting first. + */ + atomic_set(&tg->ref, 1); +} + +/* Should be called with rcu read lock held (needed for blkcg) */ +static void +throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg) +{ + hlist_add_head(&tg->tg_node, &td->tg_list); + td->nr_undestroyed_grps++; +} + static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) { @@ -196,19 +225,7 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, if (!tg) goto done; - INIT_HLIST_NODE(&tg->tg_node); - RB_CLEAR_NODE(&tg->rb_node); - bio_list_init(&tg->bio_lists[0]); - bio_list_init(&tg->bio_lists[1]); - td->limits_changed = false; - - /* - * Take the initial reference that will be released on destroy - * This can be thought of a joint reference by cgroup and - * request queue which will be dropped by either request queue - * exit or cgroup deletion path depending on who is exiting first. 
- */ - atomic_set(&tg->ref, 1); + throtl_init_group(tg); /* Add group onto cgroup list */ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); @@ -220,8 +237,7 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); - hlist_add_head(&tg->tg_node, &td->tg_list); - td->nr_undestroyed_grps++; + throtl_add_group_to_td_list(td, tg); done: return tg; } @@ -1060,18 +1076,11 @@ int blk_throtl_init(struct request_queue *q) INIT_HLIST_HEAD(&td->tg_list); td->tg_service_tree = THROTL_RB_ROOT; td->limits_changed = false; + INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); /* Init root group */ tg = &td->root_tg; - INIT_HLIST_NODE(&tg->tg_node); - RB_CLEAR_NODE(&tg->rb_node); - bio_list_init(&tg->bio_lists[0]); - bio_list_init(&tg->bio_lists[1]); - - /* Practically unlimited BW */ - tg->bps[0] = tg->bps[1] = -1; - tg->iops[0] = tg->iops[1] = -1; - td->limits_changed = false; + throtl_init_group(tg); /* * Set root group reference to 2. One reference will be dropped when @@ -1080,16 +1089,13 @@ int blk_throtl_init(struct request_queue *q) * as it is statically allocated and gets destroyed when throtl_data * goes away. */ - atomic_set(&tg->ref, 2); - hlist_add_head(&tg->tg_node, &td->tg_list); - td->nr_undestroyed_grps++; - - INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); + atomic_inc(&tg->ref); rcu_read_lock(); blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td, 0, BLKIO_POLICY_THROTL); rcu_read_unlock(); + throtl_add_group_to_td_list(td, tg); /* Attach throtl data to request queue */ td->queue = q; -- cgit v1.2.2 From a23e68695593d00b35a6cddf8e9c9ec03505ecb9 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 19 May 2011 15:38:20 -0400 Subject: blk-cgroup: move some fields of unaccounted_time file under right config option cgroup unaccounted_time file is created only if CONFIG_DEBUG_BLK_CGROUP=y. there are some fields which are out side this config option. Fix that. 
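The rule being enforced, as a small sketch (my_stats/my_update are made-up names; only CONFIG_DEBUG_BLK_CGROUP is real): a field that exists only under the debug option must be declared and updated under the same guard.

struct my_stats {
        uint64_t time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
        uint64_t unaccounted_time;              /* debug-only field */
#endif
};

static void my_update(struct my_stats *s, uint64_t time, uint64_t unaccounted)
{
        s->time += time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
        s->unaccounted_time += unaccounted;     /* guarded like the field itself */
#endif
}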
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 ++ block/blk-cgroup.h | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 471fdcc5df85..b0592bca6970 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -385,7 +385,9 @@ void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, spin_lock_irqsave(&blkg->stats_lock, flags); blkg->stats.time += time; +#ifdef CONFIG_DEBUG_BLK_CGROUP blkg->stats.unaccounted_time += unaccounted_time; +#endif spin_unlock_irqrestore(&blkg->stats_lock, flags); } EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index c774930cc206..63f1ef4450d7 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -49,9 +49,9 @@ enum stat_type { /* All the single valued stats go below this */ BLKIO_STAT_TIME, BLKIO_STAT_SECTORS, +#ifdef CONFIG_DEBUG_BLK_CGROUP /* Time not charged to this cgroup */ BLKIO_STAT_UNACCOUNTED_TIME, -#ifdef CONFIG_DEBUG_BLK_CGROUP BLKIO_STAT_AVG_QUEUE_SIZE, BLKIO_STAT_IDLE_TIME, BLKIO_STAT_EMPTY_TIME, @@ -117,10 +117,11 @@ struct blkio_group_stats { /* total disk time and nr sectors dispatched by this group */ uint64_t time; uint64_t sectors; - /* Time not charged to this cgroup */ - uint64_t unaccounted_time; uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; #ifdef CONFIG_DEBUG_BLK_CGROUP + /* Time not charged to this cgroup */ + uint64_t unaccounted_time; + /* Sum of number of IOs queued across all samples */ uint64_t avg_queue_size_sum; /* Count of samples taken for average */ -- cgit v1.2.2 From 3e59cf9d66a87763fef6c232a4a8dc664461ca50 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 19 May 2011 15:38:21 -0400 Subject: cfq-iosched: Get rid of redundant function parameter "create" Nobody seems to be using cfq_find_alloc_cfqg() function parameter "create". Get rid of that. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ab7a9e6a9b1c..a905b5519ab6 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1015,7 +1015,7 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, } static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd, - struct blkio_cgroup *blkcg, int create) + struct blkio_cgroup *blkcg) { struct cfq_group *cfqg = NULL; void *key = cfqd; @@ -1030,7 +1030,7 @@ static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd, cfqg->blkg.dev = MKDEV(major, minor); goto done; } - if (cfqg || !create) + if (cfqg) goto done; cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); @@ -1073,18 +1073,18 @@ done: } /* - * Search for the cfq group current task belongs to. If create = 1, then also - * create the cfq group if it does not exist. request_queue lock must be held. + * Search for the cfq group current task belongs to. request_queue lock must + * be held. 
*/ -static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) { struct blkio_cgroup *blkcg; struct cfq_group *cfqg = NULL; rcu_read_lock(); blkcg = task_blkio_cgroup(current); - cfqg = cfq_find_alloc_cfqg(cfqd, blkcg, create); - if (!cfqg && create) + cfqg = cfq_find_alloc_cfqg(cfqd, blkcg); + if (!cfqg) cfqg = &cfqd->root_group; rcu_read_unlock(); return cfqg; @@ -1176,7 +1176,7 @@ void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) } #else /* GROUP_IOSCHED */ -static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) { return &cfqd->root_group; } @@ -2911,7 +2911,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_group *cfqg; retry: - cfqg = cfq_get_cfqg(cfqd, 1); + cfqg = cfq_get_cfqg(cfqd); cic = cfq_cic_lookup(cfqd, ioc); /* cic always exists here */ cfqq = cic_to_cfqq(cic, is_sync); -- cgit v1.2.2 From 56edf7d75db5b14d628b46623c414ffbeed68d7f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 19 May 2011 15:38:22 -0400 Subject: cfq-iosched: Fix a possible race with cfq cgroup removal code blkg->key = cfqd is an rcu protected pointer and hence we used to do call_rcu(cfqd->rcu_head) to free up cfqd after one rcu grace period. The problem here is that even though cfqd is around, there are no gurantees that associated request queue (td->queue) or q->queue_lock is still around. A driver might have called blk_cleanup_queue() and release the lock. It might happen that after freeing up the lock we call blkg->key->queue->queue_ock and crash. This is possible in following path. blkiocg_destroy() blkio_unlink_group_fn() cfq_unlink_blkio_group() Hence, wait for an rcu peirod if there are groups which have not been unlinked from blkcg->blkg_list. That way, if there are any groups which are taking cfq_unlink_blkio_group() path, can safely take queue lock. This is how we have taken care of race in throttling logic also. 
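A simplified sketch of the exit-side logic (hypothetical my_exit_queue(); the real cfq_exit_queue() also shuts down timers and exits cics): pay for a grace period only when unlinked groups might still be reachable under RCU.

static void my_exit_queue(struct cfq_data *cfqd)
{
        struct request_queue *q = cfqd->queue;
        bool wait;

        spin_lock_irq(q->queue_lock);
        cfq_release_cfq_groups(cfqd);
        /* groups we could not unlink may still be seen by rcu readers */
        wait = cfqd->nr_blkcg_linked_grps != 0;
        spin_unlock_irq(q->queue_lock);

        if (wait)
                synchronize_rcu();              /* let blkg->key accessors finish */

        kfree(cfqd);
}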
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 48 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a905b5519ab6..e2e6719832e1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -300,7 +300,9 @@ struct cfq_data { /* List of cfq groups being managed on this device*/ struct hlist_head cfqg_list; - struct rcu_head rcu; + + /* Number of groups which are on blkcg->blkg_list */ + unsigned int nr_blkcg_linked_grps; }; static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); @@ -1063,6 +1065,7 @@ static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd, cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, 0); + cfqd->nr_blkcg_linked_grps++; cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); /* Add group on cfqd list */ @@ -3815,15 +3818,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd) cfq_put_queue(cfqd->async_idle_cfqq); } -static void cfq_cfqd_free(struct rcu_head *head) -{ - kfree(container_of(head, struct cfq_data, rcu)); -} - static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; struct request_queue *q = cfqd->queue; + bool wait = false; cfq_shutdown_timer_wq(cfqd); @@ -3842,7 +3841,13 @@ static void cfq_exit_queue(struct elevator_queue *e) cfq_put_async_queues(cfqd); cfq_release_cfq_groups(cfqd); - cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg); + + /* + * If there are groups which we could not unlink from blkcg list, + * wait for a rcu period for them to be freed. + */ + if (cfqd->nr_blkcg_linked_grps) + wait = true; spin_unlock_irq(q->queue_lock); @@ -3852,8 +3857,20 @@ static void cfq_exit_queue(struct elevator_queue *e) ida_remove(&cic_index_ida, cfqd->cic_index); spin_unlock(&cic_index_lock); - /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ - call_rcu(&cfqd->rcu, cfq_cfqd_free); + /* + * Wait for cfqg->blkg->key accessors to exit their grace periods. + * Do this wait only if there are other unlinked groups out + * there. This can happen if cgroup deletion path claimed the + * responsibility of cleaning up a group before queue cleanup code + * get to the group. + * + * Do not call synchronize_rcu() unconditionally as there are drivers + * which create/delete request queue hundreds of times during scan/boot + * and synchronize_rcu() can take significant time and slow down boot. + */ + if (wait) + synchronize_rcu(); + kfree(cfqd); } static int cfq_alloc_cic_index(void) @@ -3909,14 +3926,21 @@ static void *cfq_init_queue(struct request_queue *q) #ifdef CONFIG_CFQ_GROUP_IOSCHED /* - * Take a reference to root group which we never drop. This is just - * to make sure that cfq_put_cfqg() does not try to kfree root group + * Set root group reference to 2. One reference will be dropped when + * all groups on cfqd->cfqg_list are being deleted during queue exit. + * Other reference will remain there as we don't want to delete this + * group as it is statically allocated and gets destroyed when + * throtl_data goes away. 
*/ - cfqg->ref = 1; + cfqg->ref = 2; rcu_read_lock(); cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, 0); rcu_read_unlock(); + cfqd->nr_blkcg_linked_grps++; + + /* Add group on cfqd->cfqg_list */ + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); #endif /* * Not strictly needed (since RB_ROOT just clears the node and we -- cgit v1.2.2 From f469a7b4d5b1d1d053200a9015fd25d59c057f49 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 19 May 2011 15:38:23 -0400 Subject: blk-cgroup: Allow sleeping while dynamically allocating a group Currently, all the cfq_group or throtl_group allocations happen while we are holding ->queue_lock and sleeping is not allowed. Soon, we will move to per cpu stats and also need to allocate the per group stats. As one can not call alloc_percpu() from atomic context as it can sleep, we need to drop ->queue_lock, allocate the group, retake the lock and continue processing. In throttling code, I check the queue DEAD flag again to make sure that driver did not call blk_cleanup_queue() in the mean time. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +- block/blk-throttle.c | 141 +++++++++++++++++++++++++++++++++++++++------------ block/cfq-iosched.c | 128 ++++++++++++++++++++++++++++++++++------------ 3 files changed, 205 insertions(+), 67 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 3fe00a14822a..9e8e297374b9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1550,7 +1550,8 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; } - blk_throtl_bio(q, &bio); + if (blk_throtl_bio(q, &bio)) + goto end_io; /* * If bio = NULL, bio has been throttled and will be submitted diff --git a/block/blk-throttle.c b/block/blk-throttle.c index fa9a900c1254..c201967b33cd 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -188,20 +188,46 @@ throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg) td->nr_undestroyed_grps++; } -static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, - struct blkio_cgroup *blkcg) +static void throtl_init_add_tg_lists(struct throtl_data *td, + struct throtl_grp *tg, struct blkio_cgroup *blkcg) +{ + struct backing_dev_info *bdi = &td->queue->backing_dev_info; + unsigned int major, minor; + + /* Add group onto cgroup list */ + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, + MKDEV(major, minor), BLKIO_POLICY_THROTL); + + tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); + tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); + tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); + tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); + + throtl_add_group_to_td_list(td, tg); +} + +/* Should be called without queue lock and outside of rcu period */ +static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td) +{ + struct throtl_grp *tg = NULL; + + tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); + if (!tg) + return NULL; + + throtl_init_group(tg); + return tg; +} + +static struct +throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) { struct throtl_grp *tg = NULL; void *key = td; struct backing_dev_info *bdi = &td->queue->backing_dev_info; unsigned int major, minor; - /* - * TODO: Speed up blkiocg_lookup_group() by maintaining a radix - * tree of blkg (instead of traversing through hash list all - * the time. 
- */ - /* * This is the common case when there are no blkio cgroups. * Avoid lookup in this case @@ -215,43 +241,83 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); tg->blkg.dev = MKDEV(major, minor); - goto done; } - if (tg) - goto done; - - tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); - if (!tg) - goto done; - - throtl_init_group(tg); - - /* Add group onto cgroup list */ - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, - MKDEV(major, minor), BLKIO_POLICY_THROTL); - - tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); - tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); - tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); - tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); - - throtl_add_group_to_td_list(td, tg); -done: return tg; } +/* + * This function returns with queue lock unlocked in case of error, like + * request queue is no more + */ static struct throtl_grp * throtl_get_tg(struct throtl_data *td) { - struct throtl_grp *tg = NULL; + struct throtl_grp *tg = NULL, *__tg = NULL; struct blkio_cgroup *blkcg; + struct request_queue *q = td->queue; rcu_read_lock(); blkcg = task_blkio_cgroup(current); - tg = throtl_find_alloc_tg(td, blkcg); - if (!tg) + tg = throtl_find_tg(td, blkcg); + if (tg) { + rcu_read_unlock(); + return tg; + } + + /* + * Need to allocate a group. Allocation of group also needs allocation + * of per cpu stats which in-turn takes a mutex() and can block. Hence + * we need to drop rcu lock and queue_lock before we call alloc + * + * Take the request queue reference to make sure queue does not + * go away once we return from allocation. + */ + blk_get_queue(q); + rcu_read_unlock(); + spin_unlock_irq(q->queue_lock); + + tg = throtl_alloc_tg(td); + /* + * We might have slept in group allocation. Make sure queue is not + * dead + */ + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { + blk_put_queue(q); + if (tg) + kfree(tg); + + return ERR_PTR(-ENODEV); + } + blk_put_queue(q); + + /* Group allocated and queue is still alive. take the lock */ + spin_lock_irq(q->queue_lock); + + /* + * Initialize the new group. After sleeping, read the blkcg again. + */ + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + + /* + * If some other thread already allocated the group while we were + * not holding queue lock, free up the group + */ + __tg = throtl_find_tg(td, blkcg); + + if (__tg) { + kfree(tg); + rcu_read_unlock(); + return __tg; + } + + /* Group allocation failed. Account the IO to root group */ + if (!tg) { tg = &td->root_tg; + return tg; + } + + throtl_init_add_tg_lists(td, tg, blkcg); rcu_read_unlock(); return tg; } @@ -1014,6 +1080,15 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) spin_lock_irq(q->queue_lock); tg = throtl_get_tg(td); + if (IS_ERR(tg)) { + if (PTR_ERR(tg) == -ENODEV) { + /* + * Queue is gone. No queue lock held here. + */ + return -ENODEV; + } + } + if (tg->nr_queued[rw]) { /* * There is already another bio queued in same dir. 
No diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e2e6719832e1..606020fe93f3 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1016,28 +1016,47 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, cfqg->needs_update = true; } -static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd, - struct blkio_cgroup *blkcg) +static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd, + struct cfq_group *cfqg, struct blkio_cgroup *blkcg) { - struct cfq_group *cfqg = NULL; - void *key = cfqd; - int i, j; - struct cfq_rb_root *st; struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; unsigned int major, minor; - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); - if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { + /* + * Add group onto cgroup list. It might happen that bdi->dev is + * not initialized yet. Initialize this new group without major + * and minor info and this info will be filled in once a new thread + * comes for IO. + */ + if (bdi->dev) { sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - cfqg->blkg.dev = MKDEV(major, minor); - goto done; - } - if (cfqg) - goto done; + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, + (void *)cfqd, MKDEV(major, minor)); + } else + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, + (void *)cfqd, 0); + + cfqd->nr_blkcg_linked_grps++; + cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); + + /* Add group on cfqd list */ + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); +} + +/* + * Should be called from sleepable context. No request queue lock as per + * cpu stats are allocated dynamically and alloc_percpu needs to be called + * from sleepable context. + */ +static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) +{ + struct cfq_group *cfqg = NULL; + int i, j; + struct cfq_rb_root *st; cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); if (!cfqg) - goto done; + return NULL; for_each_cfqg_st(cfqg, i, j, st) *st = CFQ_RB_ROOT; @@ -1050,28 +1069,31 @@ static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd, * or cgroup deletion path depending on who is exiting first. */ cfqg->ref = 1; + return cfqg; +} + +static struct cfq_group * +cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg) +{ + struct cfq_group *cfqg = NULL; + void *key = cfqd; + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; + unsigned int major, minor; /* - * Add group onto cgroup list. It might happen that bdi->dev is - * not initialized yet. Initialize this new group without major - * and minor info and this info will be filled in once a new thread - * comes for IO. See code above. + * This is the common case when there are no blkio cgroups. 
+ * Avoid lookup in this case */ - if (bdi->dev) { - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, - MKDEV(major, minor)); - } else - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, - 0); - - cfqd->nr_blkcg_linked_grps++; - cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); + if (blkcg == &blkio_root_cgroup) + cfqg = &cfqd->root_group; + else + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); - /* Add group on cfqd list */ - hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + cfqg->blkg.dev = MKDEV(major, minor); + } -done: return cfqg; } @@ -1082,13 +1104,53 @@ done: static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) { struct blkio_cgroup *blkcg; - struct cfq_group *cfqg = NULL; + struct cfq_group *cfqg = NULL, *__cfqg = NULL; + struct request_queue *q = cfqd->queue; + + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + cfqg = cfq_find_cfqg(cfqd, blkcg); + if (cfqg) { + rcu_read_unlock(); + return cfqg; + } + + /* + * Need to allocate a group. Allocation of group also needs allocation + * of per cpu stats which in-turn takes a mutex() and can block. Hence + * we need to drop rcu lock and queue_lock before we call alloc. + * + * Not taking any queue reference here and assuming that queue is + * around by the time we return. CFQ queue allocation code does + * the same. It might be racy though. + */ + + rcu_read_unlock(); + spin_unlock_irq(q->queue_lock); + + cfqg = cfq_alloc_cfqg(cfqd); + + spin_lock_irq(q->queue_lock); rcu_read_lock(); blkcg = task_blkio_cgroup(current); - cfqg = cfq_find_alloc_cfqg(cfqd, blkcg); + + /* + * If some other thread already allocated the group while we were + * not holding queue lock, free up the group + */ + __cfqg = cfq_find_cfqg(cfqd, blkcg); + + if (__cfqg) { + kfree(cfqg); + rcu_read_unlock(); + return __cfqg; + } + if (!cfqg) cfqg = &cfqd->root_group; + + cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg); rcu_read_unlock(); return cfqg; } -- cgit v1.2.2 From 29b125892f3317ada86b662e0b6ebc0f79be9037 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 19 May 2011 15:38:24 -0400 Subject: blk-throttle: Dynamically allocate root group Currently, we allocate root throtl_grp statically. But as we will be introducing per cpu stat pointers and that will be allocated dynamically even for root group, we might as well make whole root throtl_grp allocation dynamic and treat it in same manner as other groups. 
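The constraint driving this is that per cpu storage can only be set up at run time with alloc_percpu(), so a group that is about to carry a __percpu stats pointer needs a real allocation and teardown path anyway; keeping the root group static would only leave it as a special case. A small illustration of the shape, with hypothetical names rather than the actual throttle code:

    #include <linux/percpu.h>
    #include <linux/slab.h>

    struct grp_stats_cpu {
            u64 sectors;
    };

    struct grp {
            struct grp_stats_cpu __percpu *stats_cpu;  /* must be alloc_percpu()'d */
            /* limits, list linkage, ... */
    };

    static struct grp *grp_alloc(int node)
    {
            struct grp *g = kzalloc_node(sizeof(*g), GFP_KERNEL, node);

            if (!g)
                    return NULL;
            g->stats_cpu = alloc_percpu(struct grp_stats_cpu);
            if (!g->stats_cpu) {
                    kfree(g);
                    return NULL;
            }
            return g;
    }

    static void grp_free(struct grp *g)
    {
            free_percpu(g->stats_cpu);
            kfree(g);
    }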
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c201967b33cd..68f2ac3f3b07 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -88,7 +88,7 @@ struct throtl_data /* service tree for active throtl groups */ struct throtl_rb_root tg_service_tree; - struct throtl_grp root_tg; + struct throtl_grp *root_tg; struct request_queue *queue; /* Total Number of queued bios on READ and WRITE lists */ @@ -233,7 +233,7 @@ throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) * Avoid lookup in this case */ if (blkcg == &blkio_root_cgroup) - tg = &td->root_tg; + tg = td->root_tg; else tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); @@ -313,7 +313,7 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td) /* Group allocation failed. Account the IO to root group */ if (!tg) { - tg = &td->root_tg; + tg = td->root_tg; return tg; } @@ -1153,18 +1153,16 @@ int blk_throtl_init(struct request_queue *q) td->limits_changed = false; INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); - /* Init root group */ - tg = &td->root_tg; - throtl_init_group(tg); + /* alloc and Init root group. */ + td->queue = q; + tg = throtl_alloc_tg(td); - /* - * Set root group reference to 2. One reference will be dropped when - * all groups on tg_list are being deleted during queue exit. Other - * reference will remain there as we don't want to delete this group - * as it is statically allocated and gets destroyed when throtl_data - * goes away. - */ - atomic_inc(&tg->ref); + if (!tg) { + kfree(td); + return -ENOMEM; + } + + td->root_tg = tg; rcu_read_lock(); blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td, @@ -1173,7 +1171,6 @@ int blk_throtl_init(struct request_queue *q) throtl_add_group_to_td_list(td, tg); /* Attach throtl data to request queue */ - td->queue = q; q->td = td; return 0; } -- cgit v1.2.2 From 269f541555d8f5da322d592fb3e13e23ed62d80c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 19 May 2011 15:38:25 -0400 Subject: blk-throttle: Introduce a helper function to fill in device details A helper function for the code which is used at 2-3 places. Makes reading code little easier. 
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 68f2ac3f3b07..97ea7f82477d 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -188,16 +188,34 @@ throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg) td->nr_undestroyed_grps++; } -static void throtl_init_add_tg_lists(struct throtl_data *td, - struct throtl_grp *tg, struct blkio_cgroup *blkcg) +static void +__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) { struct backing_dev_info *bdi = &td->queue->backing_dev_info; unsigned int major, minor; + if (!tg || tg->blkg.dev) + return; + + /* + * Fill in device details for a group which might not have been + * filled at group creation time as queue was being instantiated + * and driver had not attached a device yet + */ + if (bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + tg->blkg.dev = MKDEV(major, minor); + } +} + +static void throtl_init_add_tg_lists(struct throtl_data *td, + struct throtl_grp *tg, struct blkio_cgroup *blkcg) +{ + __throtl_tg_fill_dev_details(td, tg); + /* Add group onto cgroup list */ - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, - MKDEV(major, minor), BLKIO_POLICY_THROTL); + tg->blkg.dev, BLKIO_POLICY_THROTL); tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); @@ -225,8 +243,6 @@ throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) { struct throtl_grp *tg = NULL; void *key = td; - struct backing_dev_info *bdi = &td->queue->backing_dev_info; - unsigned int major, minor; /* * This is the common case when there are no blkio cgroups. @@ -237,12 +253,7 @@ throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) else tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); - /* Fill in device details for root group */ - if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - tg->blkg.dev = MKDEV(major, minor); - } - + __throtl_tg_fill_dev_details(td, tg); return tg; } -- cgit v1.2.2 From 5617cbef7723952cbdff28c7a10ff8a254945f4f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 19 May 2011 15:38:26 -0400 Subject: blk-throttle: Use helper function to add root throtl group to lists Use same helper function for root group as we use with dynamically allocated groups to add it to various lists. 
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 97ea7f82477d..b9412d1cea9e 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1176,10 +1176,8 @@ int blk_throtl_init(struct request_queue *q) td->root_tg = tg; rcu_read_lock(); - blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td, - 0, BLKIO_POLICY_THROTL); + throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup); rcu_read_unlock(); - throtl_add_group_to_td_list(td, tg); /* Attach throtl data to request queue */ q->td = td; -- cgit v1.2.2 From 4843c69d496a8d2e4caab6182fe016b9a79136e0 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 19 May 2011 15:38:27 -0400 Subject: blk-throttle: Free up a group only after one rcu grace period Soon we will allow accessing a throtl_grp under rcu_read_lock(). Hence start freeing up throtl_grp after one rcu grace period. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b9412d1cea9e..90ad40735f73 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -78,6 +78,8 @@ struct throtl_grp { /* Some throttle limits got updated for the group */ int limits_changed; + + struct rcu_head rcu_head; }; struct throtl_data @@ -151,12 +153,30 @@ static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) return tg; } +static void throtl_free_tg(struct rcu_head *head) +{ + struct throtl_grp *tg; + + tg = container_of(head, struct throtl_grp, rcu_head); + kfree(tg); +} + static void throtl_put_tg(struct throtl_grp *tg) { BUG_ON(atomic_read(&tg->ref) <= 0); if (!atomic_dec_and_test(&tg->ref)) return; - kfree(tg); + + /* + * A group is freed in rcu manner. But having an rcu lock does not + * mean that one can access all the fields of blkg and assume these + * are valid. For example, don't try to follow throtl_data and + * request queue links. + * + * Having a reference to blkg under an rcu allows acess to only + * values local to groups like group stats and group rate limits + */ + call_rcu(&tg->rcu_head, throtl_free_tg); } static void throtl_init_group(struct throtl_grp *tg) -- cgit v1.2.2 From 5624a4e445e2ec27582984b068d7bf7f127cee10 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 19 May 2011 15:38:28 -0400 Subject: blk-throttle: Make dispatch stats per cpu Currently we take blkg_stat lock for even updating the stats. So even if a group has no throttling rules (common case for root group), we end up taking blkg_lock, for updating the stats. Make dispatch stats per cpu so that these can be updated without taking blkg lock. If cpu goes offline, these stats simply disappear. No protection has been provided for that yet. Do we really need anything for that? 
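This is the standard per cpu counter pattern: writers bump only their own CPU's slot without any group-wide lock, and readers fold all slots together, accepting a slightly stale total. A stripped-down sketch of the idea, not the blk-cgroup code itself:

    #include <linux/percpu.h>
    #include <linux/cpumask.h>

    struct disp_stats_cpu {
            u64 sectors;
    };

    /* update path: touch only the local CPU's copy */
    static void stats_add_sectors(struct disp_stats_cpu __percpu *stats, u64 bytes)
    {
            struct disp_stats_cpu *s = get_cpu_ptr(stats);  /* disables preemption */

            s->sectors += bytes >> 9;
            put_cpu_ptr(stats);
    }

    /* read path: sum the per cpu copies into one value */
    static u64 stats_read_sectors(struct disp_stats_cpu __percpu *stats)
    {
            u64 sum = 0;
            int cpu;

            for_each_possible_cpu(cpu)
                    sum += per_cpu_ptr(stats, cpu)->sectors;
            return sum;
    }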
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 135 ++++++++++++++++++++++++++++++++++++++------------- block/blk-cgroup.h | 27 ++++++++--- block/blk-throttle.c | 9 ++++ block/cfq-iosched.c | 18 ++++++- 4 files changed, 147 insertions(+), 42 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b0592bca6970..34bfcefdd924 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -392,20 +392,22 @@ void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, } EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); +/* + * should be called under rcu read lock or queue lock to make sure blkg pointer + * is valid. + */ void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, bool direction, bool sync) { - struct blkio_group_stats *stats; - unsigned long flags; + struct blkio_group_stats_cpu *stats_cpu; - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; - stats->sectors += bytes >> 9; - blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction, - sync); - blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes, - direction, sync); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + stats_cpu = this_cpu_ptr(blkg->stats_cpu); + + stats_cpu->sectors += bytes >> 9; + blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED], + 1, direction, sync); + blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES], + bytes, direction, sync); } EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); @@ -440,6 +442,20 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, } EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); +/* + * This function allocates the per cpu stats for blkio_group. Should be called + * from sleepable context as alloc_per_cpu() requires that. 
+ */ +int blkio_alloc_blkg_stats(struct blkio_group *blkg) +{ + /* Allocate memory for per cpu stats */ + blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu); + if (!blkg->stats_cpu) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats); + void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev, enum blkio_policy_id plid) @@ -600,6 +616,53 @@ static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, return val; } + +static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, + enum stat_type_cpu type, enum stat_sub_type sub_type) +{ + int cpu; + struct blkio_group_stats_cpu *stats_cpu; + uint64_t val = 0; + + for_each_possible_cpu(cpu) { + stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu); + + if (type == BLKIO_STAT_CPU_SECTORS) + val += stats_cpu->sectors; + else + val += stats_cpu->stat_arr_cpu[type][sub_type]; + } + + return val; +} + +static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, + struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type) +{ + uint64_t disk_total, val; + char key_str[MAX_KEY_LEN]; + enum stat_sub_type sub_type; + + if (type == BLKIO_STAT_CPU_SECTORS) { + val = blkio_read_stat_cpu(blkg, type, 0); + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev); + } + + for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; + sub_type++) { + blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); + val = blkio_read_stat_cpu(blkg, type, sub_type); + cb->fill(cb, key_str, val); + } + + disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) + + blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE); + + blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); + cb->fill(cb, key_str, disk_total); + return disk_total; +} + /* This should be called with blkg->stats_lock held */ static uint64_t blkio_get_stat(struct blkio_group *blkg, struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) @@ -611,9 +674,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, if (type == BLKIO_STAT_TIME) return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, blkg->stats.time, cb, dev); - if (type == BLKIO_STAT_SECTORS) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.sectors, cb, dev); #ifdef CONFIG_DEBUG_BLK_CGROUP if (type == BLKIO_STAT_UNACCOUNTED_TIME) return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, @@ -1077,8 +1137,8 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, } static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, - struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type, - bool show_total) + struct cftype *cft, struct cgroup_map_cb *cb, + enum stat_type type, bool show_total, bool pcpu) { struct blkio_group *blkg; struct hlist_node *n; @@ -1089,10 +1149,15 @@ static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, if (blkg->dev) { if (!cftype_blkg_same_policy(cft, blkg)) continue; - spin_lock_irq(&blkg->stats_lock); - cgroup_total += blkio_get_stat(blkg, cb, blkg->dev, - type); - spin_unlock_irq(&blkg->stats_lock); + if (pcpu) + cgroup_total += blkio_get_stat_cpu(blkg, cb, + blkg->dev, type); + else { + spin_lock_irq(&blkg->stats_lock); + cgroup_total += blkio_get_stat(blkg, cb, + blkg->dev, type); + spin_unlock_irq(&blkg->stats_lock); + } } } if (show_total) @@ -1116,47 +1181,47 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, switch(name) { case BLKIO_PROP_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_TIME, 0); + BLKIO_STAT_TIME, 
0, 0); case BLKIO_PROP_sectors: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SECTORS, 0); + BLKIO_STAT_CPU_SECTORS, 0, 1); case BLKIO_PROP_io_service_bytes: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICE_BYTES, 1); + BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); case BLKIO_PROP_io_serviced: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICED, 1); + BLKIO_STAT_CPU_SERVICED, 1, 1); case BLKIO_PROP_io_service_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICE_TIME, 1); + BLKIO_STAT_SERVICE_TIME, 1, 0); case BLKIO_PROP_io_wait_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_WAIT_TIME, 1); + BLKIO_STAT_WAIT_TIME, 1, 0); case BLKIO_PROP_io_merged: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_MERGED, 1); + BLKIO_STAT_MERGED, 1, 0); case BLKIO_PROP_io_queued: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_QUEUED, 1); + BLKIO_STAT_QUEUED, 1, 0); #ifdef CONFIG_DEBUG_BLK_CGROUP case BLKIO_PROP_unaccounted_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_UNACCOUNTED_TIME, 0); + BLKIO_STAT_UNACCOUNTED_TIME, 0, 0); case BLKIO_PROP_dequeue: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_DEQUEUE, 0); + BLKIO_STAT_DEQUEUE, 0, 0); case BLKIO_PROP_avg_queue_size: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_AVG_QUEUE_SIZE, 0); + BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0); case BLKIO_PROP_group_wait_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_GROUP_WAIT_TIME, 0); + BLKIO_STAT_GROUP_WAIT_TIME, 0, 0); case BLKIO_PROP_idle_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_IDLE_TIME, 0); + BLKIO_STAT_IDLE_TIME, 0, 0); case BLKIO_PROP_empty_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_EMPTY_TIME, 0); + BLKIO_STAT_EMPTY_TIME, 0, 0); #endif default: BUG(); @@ -1166,10 +1231,10 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, switch(name){ case BLKIO_THROTL_io_service_bytes: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICE_BYTES, 1); + BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); case BLKIO_THROTL_io_serviced: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICED, 1); + BLKIO_STAT_CPU_SERVICED, 1, 1); default: BUG(); } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 63f1ef4450d7..fd730a24b491 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -36,10 +36,6 @@ enum stat_type { * request completion for IOs doen by this cgroup. This may not be * accurate when NCQ is turned on. 
*/ BLKIO_STAT_SERVICE_TIME = 0, - /* Total bytes transferred */ - BLKIO_STAT_SERVICE_BYTES, - /* Total IOs serviced, post merge */ - BLKIO_STAT_SERVICED, /* Total time spent waiting in scheduler queue in ns */ BLKIO_STAT_WAIT_TIME, /* Number of IOs merged */ @@ -48,7 +44,6 @@ enum stat_type { BLKIO_STAT_QUEUED, /* All the single valued stats go below this */ BLKIO_STAT_TIME, - BLKIO_STAT_SECTORS, #ifdef CONFIG_DEBUG_BLK_CGROUP /* Time not charged to this cgroup */ BLKIO_STAT_UNACCOUNTED_TIME, @@ -60,6 +55,16 @@ enum stat_type { #endif }; +/* Per cpu stats */ +enum stat_type_cpu { + BLKIO_STAT_CPU_SECTORS, + /* Total bytes transferred */ + BLKIO_STAT_CPU_SERVICE_BYTES, + /* Total IOs serviced, post merge */ + BLKIO_STAT_CPU_SERVICED, + BLKIO_STAT_CPU_NR +}; + enum stat_sub_type { BLKIO_STAT_READ = 0, BLKIO_STAT_WRITE, @@ -116,7 +121,6 @@ struct blkio_cgroup { struct blkio_group_stats { /* total disk time and nr sectors dispatched by this group */ uint64_t time; - uint64_t sectors; uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; #ifdef CONFIG_DEBUG_BLK_CGROUP /* Time not charged to this cgroup */ @@ -146,6 +150,12 @@ struct blkio_group_stats { #endif }; +/* Per cpu blkio group stats */ +struct blkio_group_stats_cpu { + uint64_t sectors; + uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL]; +}; + struct blkio_group { /* An rcu protected unique identifier for the group */ void *key; @@ -161,6 +171,8 @@ struct blkio_group { /* Need to serialize the stats in the case of reset/update */ spinlock_t stats_lock; struct blkio_group_stats stats; + /* Per cpu stats pointer */ + struct blkio_group_stats_cpu __percpu *stats_cpu; }; struct blkio_policy_node { @@ -296,6 +308,7 @@ extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk); extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev, enum blkio_policy_id plid); +extern int blkio_alloc_blkg_stats(struct blkio_group *blkg); extern int blkiocg_del_blkio_group(struct blkio_group *blkg); extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); @@ -323,6 +336,8 @@ static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev, enum blkio_policy_id plid) {} +static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; } + static inline int blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 90ad40735f73..c29a5a8cc18c 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -158,6 +158,7 @@ static void throtl_free_tg(struct rcu_head *head) struct throtl_grp *tg; tg = container_of(head, struct throtl_grp, rcu_head); + free_percpu(tg->blkg.stats_cpu); kfree(tg); } @@ -249,11 +250,19 @@ static void throtl_init_add_tg_lists(struct throtl_data *td, static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td) { struct throtl_grp *tg = NULL; + int ret; tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); if (!tg) return NULL; + ret = blkio_alloc_blkg_stats(&tg->blkg); + + if (ret) { + kfree(tg); + return NULL; + } + throtl_init_group(tg); return tg; } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 606020fe93f3..d646b279c8bb 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1051,7 +1051,7 @@ static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd, static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) { struct cfq_group *cfqg = NULL; - int i, 
j; + int i, j, ret; struct cfq_rb_root *st; cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); @@ -1069,6 +1069,13 @@ static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) * or cgroup deletion path depending on who is exiting first. */ cfqg->ref = 1; + + ret = blkio_alloc_blkg_stats(&cfqg->blkg); + if (ret) { + kfree(cfqg); + return NULL; + } + return cfqg; } @@ -1183,6 +1190,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg) return; for_each_cfqg_st(cfqg, i, j, st) BUG_ON(!RB_EMPTY_ROOT(&st->rb)); + free_percpu(cfqg->blkg.stats_cpu); kfree(cfqg); } @@ -3995,7 +4003,15 @@ static void *cfq_init_queue(struct request_queue *q) * throtl_data goes away. */ cfqg->ref = 2; + + if (blkio_alloc_blkg_stats(&cfqg->blkg)) { + kfree(cfqg); + kfree(cfqd); + return NULL; + } + rcu_read_lock(); + cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, 0); rcu_read_unlock(); -- cgit v1.2.2 From 575969a0dd3fe65c6556bcb8f87c42303326ea55 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 19 May 2011 15:38:29 -0400 Subject: blk-cgroup: Make 64bit per cpu stats safe on 32bit arch Some of the stats are 64bit and updation will be non atomic on 32bit architecture. Use sequence counters on 32bit arch to make reading of stats safe. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 27 ++++++++++++++++++++++----- block/blk-cgroup.h | 2 ++ 2 files changed, 24 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 34bfcefdd924..3622518e1c23 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -400,14 +400,25 @@ void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, bool direction, bool sync) { struct blkio_group_stats_cpu *stats_cpu; + unsigned long flags; + + /* + * Disabling interrupts to provide mutual exclusion between two + * writes on same cpu. It probably is not needed for 64bit. Not + * optimizing that case yet. 
+ */ + local_irq_save(flags); stats_cpu = this_cpu_ptr(blkg->stats_cpu); + u64_stats_update_begin(&stats_cpu->syncp); stats_cpu->sectors += bytes >> 9; blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED], 1, direction, sync); blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES], bytes, direction, sync); + u64_stats_update_end(&stats_cpu->syncp); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); @@ -622,15 +633,21 @@ static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, { int cpu; struct blkio_group_stats_cpu *stats_cpu; - uint64_t val = 0; + u64 val = 0, tval; for_each_possible_cpu(cpu) { + unsigned int start; stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu); - if (type == BLKIO_STAT_CPU_SECTORS) - val += stats_cpu->sectors; - else - val += stats_cpu->stat_arr_cpu[type][sub_type]; + do { + start = u64_stats_fetch_begin(&stats_cpu->syncp); + if (type == BLKIO_STAT_CPU_SECTORS) + tval = stats_cpu->sectors; + else + tval = stats_cpu->stat_arr_cpu[type][sub_type]; + } while(u64_stats_fetch_retry(&stats_cpu->syncp, start)); + + val += tval; } return val; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index fd730a24b491..262226798093 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -14,6 +14,7 @@ */ #include +#include enum blkio_policy_id { BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ @@ -154,6 +155,7 @@ struct blkio_group_stats { struct blkio_group_stats_cpu { uint64_t sectors; uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL]; + struct u64_stats_sync syncp; }; struct blkio_group { -- cgit v1.2.2 From f0bdc8cdd9a2bcc2c84ae2a1fdbff4188b354d8d Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 19 May 2011 15:38:30 -0400 Subject: blk-cgroup: Make cgroup stat reset path blkg->lock free for dispatch stats Now dispatch stats update is lock free. But reset of these stats still takes blkg->stats_lock and is dependent on that. As stats are per cpu, we should be able to just reset the stats on each cpu without any locks. (Atleast for 64bit arch). On 32bit arch there is a small race where 64bit updates are not atomic. The result of this race can be that in the presence of other writers, one might not get 0 value after reset of a stat and might see something intermediate One can write more complicated code to cover this race like sending IPI to other cpus to reset stats and for offline cpus, reset these directly. Right not I am not taking that path because reset_update is more of a debug feature and it can happen only on 32bit arch and possibility of it happening is small. Will fix it if it becomes a real problem. For the time being going for code simplicity. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 3622518e1c23..e41cc6f2ccc1 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -537,6 +537,30 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) } EXPORT_SYMBOL_GPL(blkiocg_lookup_group); +static void blkio_reset_stats_cpu(struct blkio_group *blkg) +{ + struct blkio_group_stats_cpu *stats_cpu; + int i, j, k; + /* + * Note: On 64 bit arch this should not be an issue. This has the + * possibility of returning some inconsistent value on 32bit arch + * as 64bit update on 32bit is non atomic. 
Taking care of this + * corner case makes code very complicated, like sending IPIs to + * cpus, taking care of stats of offline cpus etc. + * + * reset stats is anyway more of a debug feature and this sounds a + * corner case. So I am not complicating the code yet until and + * unless this becomes a real issue. + */ + for_each_possible_cpu(i) { + stats_cpu = per_cpu_ptr(blkg->stats_cpu, i); + stats_cpu->sectors = 0; + for(j = 0; j < BLKIO_STAT_CPU_NR; j++) + for (k = 0; k < BLKIO_STAT_TOTAL; k++) + stats_cpu->stat_arr_cpu[j][k] = 0; + } +} + static int blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) { @@ -581,7 +605,11 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) } #endif spin_unlock(&blkg->stats_lock); + + /* Reset Per cpu stats which don't take blkg->stats_lock */ + blkio_reset_stats_cpu(blkg); } + spin_unlock_irq(&blkcg->lock); return 0; } -- cgit v1.2.2 From af75cd3c67845ebe31d2df9a780889a5ebecef11 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 19 May 2011 15:38:31 -0400 Subject: blk-throttle: Make no throttling rule group processing lockless Currently we take a queue lock on each bio to check if there are any throttling rules associated with the group and also update the stats. Now access the group under rcu and update the stats without taking the queue lock. Queue lock is taken only if there are throttling rules associated with the group. So the common case of root group when there are no rules, save unnecessary pounding of request queue lock. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c29a5a8cc18c..a62be8d0dc1b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -229,6 +229,22 @@ __throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) } } +/* + * Should be called with without queue lock held. Here queue lock will be + * taken rarely. It will be taken only once during life time of a group + * if need be + */ +static void +throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) +{ + if (!tg || tg->blkg.dev) + return; + + spin_lock_irq(td->queue->queue_lock); + __throtl_tg_fill_dev_details(td, tg); + spin_unlock_irq(td->queue->queue_lock); +} + static void throtl_init_add_tg_lists(struct throtl_data *td, struct throtl_grp *tg, struct blkio_cgroup *blkcg) { @@ -666,6 +682,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, return 0; } +static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) { + if (tg->bps[rw] == -1 && tg->iops[rw] == -1) + return 1; + return 0; +} + /* * Returns whether one can dispatch a bio or not. Also returns approx number * of jiffies to wait before this bio is with-in IO rate and can be dispatched @@ -730,10 +752,6 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) tg->bytes_disp[rw] += bio->bi_size; tg->io_disp[rw]++; - /* - * TODO: This will take blkg->stats_lock. Figure out a way - * to avoid this cost. 
- */ blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); } @@ -1111,12 +1129,39 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) struct throtl_grp *tg; struct bio *bio = *biop; bool rw = bio_data_dir(bio), update_disptime = true; + struct blkio_cgroup *blkcg; if (bio->bi_rw & REQ_THROTTLED) { bio->bi_rw &= ~REQ_THROTTLED; return 0; } + /* + * A throtl_grp pointer retrieved under rcu can be used to access + * basic fields like stats and io rates. If a group has no rules, + * just update the dispatch stats in lockless manner and return. + */ + + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + tg = throtl_find_tg(td, blkcg); + if (tg) { + throtl_tg_fill_dev_details(td, tg); + + if (tg_no_rule_group(tg, rw)) { + blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, + rw, bio->bi_rw & REQ_SYNC); + rcu_read_unlock(); + return 0; + } + } + rcu_read_unlock(); + + /* + * Either group has not been allocated yet or it is not an unlimited + * IO group + */ + spin_lock_irq(q->queue_lock); tg = throtl_get_tg(td); -- cgit v1.2.2 From 771949d03b4f5295f648f09141325fd478f6c7ce Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 May 2011 20:52:16 +0200 Subject: block: get rid of on-stack plugging debug checks We don't need them anymore, so kill: - REQ_ON_PLUG checks in various places - !rq_mergeable() check in plug merging Signed-off-by: Jens Axboe --- block/blk-core.c | 27 --------------------------- block/elevator.c | 4 ---- 2 files changed, 31 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 9e8e297374b9..7369eeeafe23 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -569,8 +569,6 @@ int blk_get_queue(struct request_queue *q) static inline void blk_free_request(struct request_queue *q, struct request *rq) { - BUG_ON(rq->cmd_flags & REQ_ON_PLUG); - if (rq->cmd_flags & REQ_ELVPRIV) elv_put_request(q, rq); mempool_free(rq, q->rq.rq_pool); @@ -1110,14 +1108,6 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, { const int ff = bio->bi_rw & REQ_FAILFAST_MASK; - /* - * Debug stuff, kill later - */ - if (!rq_mergeable(req)) { - blk_dump_rq_flags(req, "back"); - return false; - } - if (!ll_back_merge_fn(q, req, bio)) return false; @@ -1141,14 +1131,6 @@ static bool bio_attempt_front_merge(struct request_queue *q, const int ff = bio->bi_rw & REQ_FAILFAST_MASK; sector_t sector; - /* - * Debug stuff, kill later - */ - if (!rq_mergeable(req)) { - blk_dump_rq_flags(req, "front"); - return false; - } - if (!ll_front_merge_fn(q, req, bio)) return false; @@ -1258,14 +1240,12 @@ static int __make_request(struct request_queue *q, struct bio *bio) el_ret = elv_merge(q, &req, bio); if (el_ret == ELEVATOR_BACK_MERGE) { - BUG_ON(req->cmd_flags & REQ_ON_PLUG); if (bio_attempt_back_merge(q, req, bio)) { if (!attempt_back_merge(q, req)) elv_merged_request(q, req, el_ret); goto out_unlock; } } else if (el_ret == ELEVATOR_FRONT_MERGE) { - BUG_ON(req->cmd_flags & REQ_ON_PLUG); if (bio_attempt_front_merge(q, req, bio)) { if (!attempt_front_merge(q, req)) elv_merged_request(q, req, el_ret); @@ -1320,10 +1300,6 @@ get_rq: if (__rq->q != q) plug->should_sort = 1; } - /* - * Debug flag, kill later - */ - req->cmd_flags |= REQ_ON_PLUG; list_add_tail(&req->queuelist, &plug->list); drive_stat_acct(req, 1); } else { @@ -2749,7 +2725,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) while (!list_empty(&list)) { rq = list_entry_rq(list.next); list_del_init(&rq->queuelist); - BUG_ON(!(rq->cmd_flags & 
REQ_ON_PLUG)); BUG_ON(!rq->q); if (rq->q != q) { /* @@ -2761,8 +2736,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) depth = 0; spin_lock(q->queue_lock); } - rq->cmd_flags &= ~REQ_ON_PLUG; - /* * rq is already accounted, so use raw insert */ diff --git a/block/elevator.c b/block/elevator.c index 2a0b653c90fd..b0b38ce0dcb6 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -416,8 +416,6 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) struct list_head *entry; int stop_flags; - BUG_ON(rq->cmd_flags & REQ_ON_PLUG); - if (q->last_merge == rq) q->last_merge = NULL; @@ -656,8 +654,6 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) rq->q = q; - BUG_ON(rq->cmd_flags & REQ_ON_PLUG); - if (rq->cmd_flags & REQ_SOFTBARRIER) { /* barriers are scheduling boundary, update end_sector */ if (rq->cmd_type == REQ_TYPE_FS || -- cgit v1.2.2 From 2abae55f5a41d29f21b5b259c33d218c950d5c90 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 23 May 2011 10:02:19 +0200 Subject: cfq-iosched: Fix a memory leak of per cpu stats for root group We allocated per cpu stats struct for root group but did not free it. Fix it. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d646b279c8bb..0316f6eece03 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3940,6 +3940,11 @@ static void cfq_exit_queue(struct elevator_queue *e) */ if (wait) synchronize_rcu(); + +#ifdef CONFIG_CFQ_GROUP_IOSCHED + /* Free up per cpu stats for root group */ + free_percpu(cfqd->root_group.blkg.stats_cpu); +#endif kfree(cfqd); } -- cgit v1.2.2 From 317389a7739675aa990b7e0d750a7c435f1d25d7 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 23 May 2011 10:02:19 +0200 Subject: cfq-iosched: Make IO merge related stats per cpu Make BLKIO_STAT_MERGED per cpu hence gettring rid of need of taking blkg->stats_lock. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 22 +++++++++++++++++----- block/blk-cgroup.h | 4 ++-- 2 files changed, 19 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e41cc6f2ccc1..07371cfdfae6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -441,15 +441,27 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg, } EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); +/* Merged stats are per cpu. */ void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, bool sync) { + struct blkio_group_stats_cpu *stats_cpu; unsigned long flags; - spin_lock_irqsave(&blkg->stats_lock, flags); - blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction, - sync); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + /* + * Disabling interrupts to provide mutual exclusion between two + * writes on same cpu. It probably is not needed for 64bit. Not + * optimizing that case yet. 
+ */ + local_irq_save(flags); + + stats_cpu = this_cpu_ptr(blkg->stats_cpu); + + u64_stats_update_begin(&stats_cpu->syncp); + blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1, + direction, sync); + u64_stats_update_end(&stats_cpu->syncp); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); @@ -1244,7 +1256,7 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, BLKIO_STAT_WAIT_TIME, 1, 0); case BLKIO_PROP_io_merged: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_MERGED, 1, 0); + BLKIO_STAT_CPU_MERGED, 1, 1); case BLKIO_PROP_io_queued: return blkio_read_blkg_stats(blkcg, cft, cb, BLKIO_STAT_QUEUED, 1, 0); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 262226798093..a71d2904ffb9 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -39,8 +39,6 @@ enum stat_type { BLKIO_STAT_SERVICE_TIME = 0, /* Total time spent waiting in scheduler queue in ns */ BLKIO_STAT_WAIT_TIME, - /* Number of IOs merged */ - BLKIO_STAT_MERGED, /* Number of IOs queued up */ BLKIO_STAT_QUEUED, /* All the single valued stats go below this */ @@ -63,6 +61,8 @@ enum stat_type_cpu { BLKIO_STAT_CPU_SERVICE_BYTES, /* Total IOs serviced, post merge */ BLKIO_STAT_CPU_SERVICED, + /* Number of IOs merged */ + BLKIO_STAT_CPU_MERGED, BLKIO_STAT_CPU_NR }; -- cgit v1.2.2 From 95cf3dd9dbe6883a0328724e2110e3fc6465630b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 23 May 2011 10:02:19 +0200 Subject: block: call elv_bio_merged() when merged Commit 73c101011926 ("block: initial patch for on-stack per-task plugging") removed calls to elv_bio_merged() when @bio merged with @req. Re-add them. This in turn will update merged stats in associated group. That should be safe as long as request has got reference to the blkio_group. Signed-off-by: Namhyung Kim Cc: Divyesh Shah Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 7369eeeafe23..c8303e9d919d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1122,6 +1122,7 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); drive_stat_acct(req, 0); + elv_bio_merged(q, req, bio); return true; } @@ -1155,6 +1156,7 @@ static bool bio_attempt_front_merge(struct request_queue *q, req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); drive_stat_acct(req, 0); + elv_bio_merged(q, req, bio); return true; } -- cgit v1.2.2 From 4cbadbd16e2fb727f6926597e0a580829e6222f1 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 23 May 2011 19:35:04 +0200 Subject: blk-cgroup: Initialize ioc->cgroup_changed at ioc creation time If we don't explicitly initialize it to zero, CFQ might think that cgroup of ioc has changed and it generates lots of unnecessary calls to call_for_each_cic(changed_cgroup). Fix it. 
cfq_get_io_context() cfq_ioc_set_cgroup() call_for_each_cic(ioc, changed_cgroup) Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-ioc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index b791022beef3..c898049dafd5 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -96,6 +96,9 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ret->cic_list); ret->ioc_data = NULL; +#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) + ret->cgroup_changed = 0; +#endif } return ret; -- cgit v1.2.2 From b9f8ce059940064a732da49b5d6e618381dad956 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 24 May 2011 10:23:21 +0200 Subject: cfq-iosched: algebraic simplification in cfq_prio_to_maxrq() Simplify the calculation in cfq_prio_to_maxrq(), plus replace CFQ_PRIO_LISTS to IOPRIO_BE_NR since they are the same and IOPRIO_BE_NR looks more reasonable in this context IMHO. Signed-off-by: Namhyung Kim Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 0316f6eece03..00763e510228 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2102,7 +2102,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); - return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); + return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio); } /* -- cgit v1.2.2 From 229836bd63c5413a4f8eed93d610b432be141e9b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 24 May 2011 10:23:21 +0200 Subject: cfq-iosched: reduce bit operations in cfq_choose_req() Reduce the number of bit operations in cfq_choose_req() on average (and worst) cases. Signed-off-by: Namhyung Kim Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 00763e510228..7ffceddee420 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -667,15 +667,11 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, if (rq2 == NULL) return rq1; - if (rq_is_sync(rq1) && !rq_is_sync(rq2)) - return rq1; - else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) - return rq2; - if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) - return rq1; - else if ((rq2->cmd_flags & REQ_META) && - !(rq1->cmd_flags & REQ_META)) - return rq2; + if (rq_is_sync(rq1) != rq_is_sync(rq2)) + return rq_is_sync(rq1) ? rq1 : rq2; + + if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META) + return rq1->cmd_flags & REQ_META ? rq1 : rq2; s1 = blk_rq_pos(rq1); s2 = blk_rq_pos(rq2); -- cgit v1.2.2 From 20359f27e8ff115f7cddf3da5b3a6cdcca2e650d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 24 May 2011 10:23:22 +0200 Subject: cfq-iosched: remove unused 'group_changed' in cfq_service_tree_add() The 'group_changed' variable is initialized to 0 and never changed, so checking the variable is meaningless. It is a leftover from 0bbfeb832042 ("cfq-iosched: Always provide group iosolation."). Let's get rid of it. 
Signed-off-by: Namhyung Kim Cc: Justin TerAvest Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7ffceddee420..6dd2179cf1a4 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1279,7 +1279,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rb_root *service_tree; int left; int new_cfqq = 1; - int group_changed = 0; service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), cfqq_type(cfqq)); @@ -1350,7 +1349,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, rb_link_node(&cfqq->rb_node, parent, p); rb_insert_color(&cfqq->rb_node, &service_tree->rb); service_tree->count++; - if ((add_front || !new_cfqq) && !group_changed) + if (add_front || !new_cfqq) return; cfq_group_notify_queue_add(cfqd, cfqq->cfqg); } -- cgit v1.2.2 From 1547010e6e15a3f44f49381246421a1e19de526e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 24 May 2011 10:23:22 +0200 Subject: cfq-iosched: free cic_index if cfqd allocation fails When struct cfq_data allocation fails, cic_index need to be freed. Signed-off-by: Namhyung Kim Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 6dd2179cf1a4..7c52d6888924 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3973,8 +3973,12 @@ static void *cfq_init_queue(struct request_queue *q) return NULL; cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); - if (!cfqd) + if (!cfqd) { + spin_lock(&cic_index_lock); + ida_remove(&cic_index_ida, i); + spin_unlock(&cic_index_lock); return NULL; + } /* * Don't need take queue_lock in the routine, since we are -- cgit v1.2.2 From 75e3f3ee3c64968d42f4843ec49e579f84b5aa0c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 May 2011 21:06:50 +0200 Subject: block: always allocate genhd->ev if check_events is implemented 9fd097b149 (block: unexport DISK_EVENT_MEDIA_CHANGE for legacy/fringe drivers) removed DISK_EVENT_MEDIA_CHANGE from legacy/fringe block drivers which have inadequate ->check_events(). Combined with earlier change 7c88a168da (block: don't propagate unlisted DISK_EVENTs to userland), this enables using ->check_events() for internal processing while avoiding enabling in-kernel block event polling which can lead to infinite event loop. Unfortunately, this made many drivers including floppy without any bit set in disk->events and ->async_events in which case disk_add_events() simply skipped allocation of disk->ev, which disables whole event handling. As ->check_events() is still used during open processing for revalidation, this can lead to open failure. This patch always allocates disk->ev if ->check_events is implemented. In the long term, it would make sense to simply include the event structure inline into genhd as it's now used by virtually all block devices. 
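The regression is easiest to see from a legacy driver that implements ->check_events() only for open-time revalidation and advertises no event bits; under the old guard such a disk never got disk->ev allocated. A hypothetical minimal driver of that shape (my_media_changed() is made up, not a real helper):

    #include <linux/module.h>
    #include <linux/blkdev.h>
    #include <linux/genhd.h>

    static bool my_media_changed(struct gendisk *disk)
    {
            return false;   /* hypothetical hardware poll */
    }

    static unsigned int my_check_events(struct gendisk *disk, unsigned int clearing)
    {
            /* used internally for revalidation; no polling events advertised */
            return my_media_changed(disk) ? DISK_EVENT_MEDIA_CHANGE : 0;
    }

    static const struct block_device_operations my_fops = {
            .owner          = THIS_MODULE,
            .check_events   = my_check_events,
            /* disk->events and disk->async_events intentionally left 0 */
    };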
Signed-off-by: Tejun Heo Reported-by: Ondrej Zary Reported-by: Alex Villacis Lasso Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 2dd988723d73..95822ae25cfe 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1728,7 +1728,7 @@ static void disk_add_events(struct gendisk *disk) { struct disk_events *ev; - if (!disk->fops->check_events || !(disk->events | disk->async_events)) + if (!disk->fops->check_events) return; ev = kzalloc(sizeof(*ev), GFP_KERNEL); -- cgit v1.2.2 From 700c4f3325495d2e0e619fb48b900ec942f1470b Mon Sep 17 00:00:00 2001 From: Luca Tettamanti Date: Thu, 26 May 2011 21:07:26 +0200 Subject: block: remove unused variable in bio_attempt_front_merge() sector is never read inside the function. Signed-off-by: Luca Tettamanti Signed-off-by: Jens Axboe --- block/blk-core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index c8303e9d919d..dd8ae71168c5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1130,7 +1130,6 @@ static bool bio_attempt_front_merge(struct request_queue *q, struct request *req, struct bio *bio) { const int ff = bio->bi_rw & REQ_FAILFAST_MASK; - sector_t sector; if (!ll_front_merge_fn(q, req, bio)) return false; @@ -1140,8 +1139,6 @@ static bool bio_attempt_front_merge(struct request_queue *q, if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); - sector = bio->bi_sector; - bio->bi_next = req->bio; req->bio = bio; -- cgit v1.2.2 From f780bdb7c1c73009cb57adcf99ef50027d80bf3c Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Thu, 26 May 2011 16:25:19 -0700 Subject: cgroups: add per-thread subsystem callbacks Add cgroup subsystem callbacks for per-thread attachment in atomic contexts Add can_attach_task(), pre_attach(), and attach_task() as new callbacks for cgroups's subsystem interface. Unlike can_attach and attach, these are for per-thread operations, to be called potentially many times when attaching an entire threadgroup. Also, the old "bool threadgroup" interface is removed, as replaced by this. All subsystems are modified for the new interface - of note is cpuset, which requires from/to nodemasks for attach to be globally scoped (though per-cpuset would work too) to persist from its pre_attach to attach_task and attach. This is a pre-patch for cgroup-procs-writable.patch. Signed-off-by: Ben Blum Cc: "Eric W. 
Biederman" Cc: Li Zefan Cc: Matt Helsley Reviewed-by: Paul Menage Cc: Oleg Nesterov Cc: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/blk-cgroup.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 07371cfdfae6..bcaf16ee6ad1 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -30,10 +30,8 @@ EXPORT_SYMBOL_GPL(blkio_root_cgroup); static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, struct cgroup *); -static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *, - struct task_struct *, bool); -static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, - struct cgroup *, struct task_struct *, bool); +static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *); +static void blkiocg_attach_task(struct cgroup *, struct task_struct *); static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); @@ -46,8 +44,8 @@ static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); struct cgroup_subsys blkio_subsys = { .name = "blkio", .create = blkiocg_create, - .can_attach = blkiocg_can_attach, - .attach = blkiocg_attach, + .can_attach_task = blkiocg_can_attach_task, + .attach_task = blkiocg_attach_task, .destroy = blkiocg_destroy, .populate = blkiocg_populate, #ifdef CONFIG_BLK_CGROUP @@ -1616,9 +1614,7 @@ done: * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkiocg_can_attach(struct cgroup_subsys *subsys, - struct cgroup *cgroup, struct task_struct *tsk, - bool threadgroup) +static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { struct io_context *ioc; int ret = 0; @@ -1633,9 +1629,7 @@ static int blkiocg_can_attach(struct cgroup_subsys *subsys, return ret; } -static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, - struct cgroup *prev, struct task_struct *tsk, - bool threadgroup) +static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { struct io_context *ioc; -- cgit v1.2.2 From d86e0e83b32bc84600adb0b6ea1fce389b266682 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 May 2011 07:44:43 +0200 Subject: block: export blk_{get,put}_queue() We need them in SCSI to fix a bug, but currently they are not exported to modules. Export them. 
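For illustration, a module would then pin and release a queue roughly as below (a sketch with hypothetical my_ctx/my_pin_queue names; it assumes this era's convention that blk_get_queue() returns 0 on success and non-zero when the queue is already being torn down):

#include <linux/blkdev.h>
#include <linux/errno.h>

/* Sketch: pin a request_queue before deferred use, drop it when done. */
struct my_ctx {
        struct request_queue *q;
};

static int my_pin_queue(struct my_ctx *ctx, struct request_queue *q)
{
        if (blk_get_queue(q))           /* non-zero: queue is going away (assumed convention) */
                return -ENODEV;
        ctx->q = q;                     /* we now hold a reference on the queue's kobject */
        return 0;
}

static void my_unpin_queue(struct my_ctx *ctx)
{
        blk_put_queue(ctx->q);          /* release the reference taken in my_pin_queue() */
        ctx->q = NULL;
}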
Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index dd8ae71168c5..d2f8f4049abd 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -345,6 +345,7 @@ void blk_put_queue(struct request_queue *q) { kobject_put(&q->kobj); } +EXPORT_SYMBOL(blk_put_queue); /* * Note: If a driver supplied the queue lock, it should not zap that lock @@ -566,6 +567,7 @@ int blk_get_queue(struct request_queue *q) return 1; } +EXPORT_SYMBOL(blk_get_queue); static inline void blk_free_request(struct request_queue *q, struct request *rq) { -- cgit v1.2.2 From 4495a7d41dbda03841c2a1c2a5ce7135a45131ba Mon Sep 17 00:00:00 2001 From: Kyungmin Park Date: Tue, 31 May 2011 10:04:09 +0200 Subject: CFQ: Fix typo and remove unnecessary semicolon Fix comment typo and remove unnecessary semicolon at macro Signed-off-by: Kyungmin Park Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7c52d6888924..8a02c955c610 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -185,7 +185,7 @@ struct cfq_group { int nr_cfqq; /* - * Per group busy queus average. Useful for workload slice calc. We + * Per group busy queues average. Useful for workload slice calc. We * create the array for each prio class but at run time it is used * only for RT and BE class and slot for IDLE class remains unused. * This is primarily done to avoid confusion and a gcc warning. @@ -369,16 +369,16 @@ CFQ_CFQQ_FNS(wait_busy); #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ - blkg_path(&(cfqq)->cfqg->blkg), ##args); + blkg_path(&(cfqq)->cfqg->blkg), ##args) #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ - blkg_path(&(cfqg)->blkg), ##args); \ + blkg_path(&(cfqg)->blkg), ##args) \ #else #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) -#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) #endif #define cfq_log(cfqd, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) -- cgit v1.2.2 From 28304f485c3627cc5e1665b92e26eb7fcfe98088 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Thu, 2 Jun 2011 13:05:02 +0200 Subject: cfq-iosched: Remove bogus check in queue_fail path queue_fail can only be reached if cic is NULL, so its check for cic must be bogus. Signed-off-by: Paul Bolle Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8a02c955c610..3c7b537bf908 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3786,9 +3786,6 @@ new_queue: return 0; queue_fail: - if (cic) - put_io_context(cic->ioc); - cfq_schedule_dispatch(cfqd); spin_unlock_irqrestore(q->queue_lock, flags); cfq_log(cfqd, "set_request fail"); -- cgit v1.2.2 From e2bd9678fc0085acf540dc4cb48ff961cd4d88c0 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Thu, 2 Jun 2011 13:05:02 +0200 Subject: block: Use hlist_entry() for io_context.cic_list.first list_entry() and hlist_entry() are both simply aliases for container_of(), but since io_context.cic_list.first is an hlist_node one should at least use the correct alias. 
Signed-off-by: Paul Bolle Signed-off-by: Jens Axboe --- block/blk-ioc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index c898049dafd5..342eae9b0d3c 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -21,7 +21,7 @@ static void cfq_dtor(struct io_context *ioc) if (!hlist_empty(&ioc->cic_list)) { struct cfq_io_context *cic; - cic = list_entry(ioc->cic_list.first, struct cfq_io_context, + cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, cic_list); cic->dtor(ioc); } @@ -57,7 +57,7 @@ static void cfq_exit(struct io_context *ioc) if (!hlist_empty(&ioc->cic_list)) { struct cfq_io_context *cic; - cic = list_entry(ioc->cic_list.first, struct cfq_io_context, + cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, cic_list); cic->exit(ioc); } -- cgit v1.2.2 From ab4bd22d3cce6977dc039664cc2d052e3147d662 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 5 Jun 2011 06:01:13 +0200 Subject: cfq-iosched: fix locking around ioc->ioc_data assignment Since we are modifying this RCU pointer, we need to hold the lock protecting it around it. This fixes a potential reuse and double free of a cfq io_context structure. The bug has been in CFQ for a long time, it hit very few people but those it did hit seemed to see it a lot. Tracked in RH bugzilla here: https://bugzilla.redhat.com/show_bug.cgi?id=577968 Credit goes to Paul Bolle for figuring out that the issue was around the one-hit ioc->ioc_data cache. Thanks to his hard work the issue is now fixed. Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3c7b537bf908..545b8d4c829d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2772,8 +2772,11 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, smp_wmb(); cic->key = cfqd_dead_key(cfqd); - if (ioc->ioc_data == cic) + if (rcu_dereference(ioc->ioc_data) == cic) { + spin_lock(&ioc->lock); rcu_assign_pointer(ioc->ioc_data, NULL); + spin_unlock(&ioc->lock); + } if (cic->cfqq[BLK_RW_ASYNC]) { cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); -- cgit v1.2.2 From a9dce2a3b4f0686dd66cb44d4826a59508bce969 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 9 Jun 2011 20:43:54 +0200 Subject: block: don't use non-syncing event blocking in disk_check_events() This patch is part of fix for triggering of WARN_ON_ONCE() in disk_clear_events() reported in bug#34662. https://bugzilla.kernel.org/show_bug.cgi?id=34662 disk_clear_events() blocks events, schedules and flushes the event work. It expects the work to have started execution on schedule and finished on return from flush. WARN_ON_ONCE() triggers if the event work hasn't executed as expected. This problem happens because __disk_block_events() fails to guarantee that the event work item is not in flight on return from the function in race-free manner. The problem is two-fold and this patch addresses one of them. When __disk_block_events() is called with @sync == %false, it bumps event block count, calls cancel_delayed_work() and return. This makes it impossible to guarantee that event polling is not in flight on return from syncing __disk_block_events() - if the first blocker was non-syncing, polling could still be in progress and later syncing ones would assume that the first blocker already canceled it. 
Making __disk_block_events() cancel_sync regardless of block count isn't feasible either as it may race with forced event checking in disk_clear_events(). As disk_check_events() is the only user of non-syncing __disk_block_events(), updating it to directly cancel and schedule event work is the easiest way to solve the issue. Note that there's another bug in __disk_block_events() and this patch doesn't fix the issue completely. Later patch will fix the other bug. Signed-off-by: Tejun Heo Tested-by: Sitsofe Wheeler Reported-by: Sitsofe Wheeler Reported-by: Borislav Petkov Reported-by: Meelis Roos Reported-by: Linus Torvalds Cc: Andrew Morton Cc: Jens Axboe Cc: Kay Sievers Signed-off-by: Jens Axboe --- block/genhd.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 95822ae25cfe..3f0933077642 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1508,10 +1508,18 @@ void disk_unblock_events(struct gendisk *disk) */ void disk_check_events(struct gendisk *disk) { - if (disk->ev) { - __disk_block_events(disk, false); - __disk_unblock_events(disk, true); + struct disk_events *ev = disk->ev; + unsigned long flags; + + if (!ev) + return; + + spin_lock_irqsave(&ev->lock, flags); + if (!ev->block) { + cancel_delayed_work(&ev->dwork); + queue_delayed_work(system_nrt_wq, &ev->dwork, 0); } + spin_unlock_irqrestore(&ev->lock, flags); } EXPORT_SYMBOL_GPL(disk_check_events); -- cgit v1.2.2 From c3af54afbac3675337cedf326b7b127ffa7f7327 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 9 Jun 2011 20:43:55 +0200 Subject: block: remove non-syncing __disk_block_events() and fold it into disk_block_events() After the previous update to disk_check_events(), nobody is using non-syncing __disk_block_events(). Remove @sync and, as this makes __disk_block_events() virtually identical to disk_block_events(), remove the underscore prefixed version. Signed-off-by: Tejun Heo Cc: Jens Axboe Signed-off-by: Jens Axboe --- block/genhd.c | 55 ++++++++++++++++++++++++------------------------------- 1 file changed, 24 insertions(+), 31 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 3f0933077642..ab0731d8976d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1414,22 +1414,36 @@ static unsigned long disk_events_poll_jiffies(struct gendisk *disk) return msecs_to_jiffies(intv_msecs); } -static void __disk_block_events(struct gendisk *disk, bool sync) +/** + * disk_block_events - block and flush disk event checking + * @disk: disk to block events for + * + * On return from this function, it is guaranteed that event checking + * isn't in progress and won't happen until unblocked by + * disk_unblock_events(). Events blocking is counted and the actual + * unblocking happens after the matching number of unblocks are done. + * + * Note that this intentionally does not block event checking from + * disk_clear_events(). + * + * CONTEXT: + * Might sleep. 
+ */ +void disk_block_events(struct gendisk *disk) { struct disk_events *ev = disk->ev; unsigned long flags; bool cancel; + if (!ev) + return; + spin_lock_irqsave(&ev->lock, flags); cancel = !ev->block++; spin_unlock_irqrestore(&ev->lock, flags); - if (cancel) { - if (sync) - cancel_delayed_work_sync(&disk->ev->dwork); - else - cancel_delayed_work(&disk->ev->dwork); - } + if (cancel) + cancel_delayed_work_sync(&disk->ev->dwork); } static void __disk_unblock_events(struct gendisk *disk, bool check_now) @@ -1460,27 +1474,6 @@ out_unlock: spin_unlock_irqrestore(&ev->lock, flags); } -/** - * disk_block_events - block and flush disk event checking - * @disk: disk to block events for - * - * On return from this function, it is guaranteed that event checking - * isn't in progress and won't happen until unblocked by - * disk_unblock_events(). Events blocking is counted and the actual - * unblocking happens after the matching number of unblocks are done. - * - * Note that this intentionally does not block event checking from - * disk_clear_events(). - * - * CONTEXT: - * Might sleep. - */ -void disk_block_events(struct gendisk *disk) -{ - if (disk->ev) - __disk_block_events(disk, true); -} - /** * disk_unblock_events - unblock disk event checking * @disk: disk to unblock events for @@ -1554,7 +1547,7 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) spin_unlock_irq(&ev->lock); /* uncondtionally schedule event check and wait for it to finish */ - __disk_block_events(disk, true); + disk_block_events(disk); queue_delayed_work(system_nrt_wq, &ev->dwork, 0); flush_delayed_work(&ev->dwork); __disk_unblock_events(disk, false); @@ -1672,7 +1665,7 @@ static ssize_t disk_events_poll_msecs_store(struct device *dev, if (intv < 0 && intv != -1) return -EINVAL; - __disk_block_events(disk, true); + disk_block_events(disk); disk->ev->poll_msecs = intv; __disk_unblock_events(disk, true); @@ -1778,7 +1771,7 @@ static void disk_del_events(struct gendisk *disk) if (!disk->ev) return; - __disk_block_events(disk, true); + disk_block_events(disk); mutex_lock(&disk_events_mutex); list_del_init(&disk->ev->node); -- cgit v1.2.2 From fdd514e16bb2531c0c61ae8a1f87740ce217f630 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 9 Jun 2011 20:43:59 +0200 Subject: block: make disk_block_events() properly wait for work cancellation disk_block_events() should guarantee that the event work is not in flight on return and once blocked it shouldn't issue further cancellations. Because there was no synchronization between the first blocker doing cancel_delayed_work_sync() and the following blockers, the following blockers could finish before cancellation was complete, which broke both guarantees - event work could be in flight and cancellation could happen after return. This bug triggered WARN_ON_ONCE() in disk_clear_events() reported in bug#34662. https://bugzilla.kernel.org/show_bug.cgi?id=34662 Fix it by adding an outer mutex which protects both block count manipulation and work cancellation. -v2: Use outer mutex instead of bit waitqueue per Linus. 
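Taken together with the previous patch, the blocking path ends up shaped roughly as sketched here (condensed from the surrounding hunks, not a standalone implementation; ev->lock, ev->block and ev->dwork are existing disk_events fields, block_mutex is the mutex added below):

/* Sketch: only the first blocker cancels the event work, and the outer
 * mutex forces later blockers to wait until that cancellation has finished. */
void disk_block_events(struct gendisk *disk)
{
        struct disk_events *ev = disk->ev;
        unsigned long flags;
        bool cancel;

        if (!ev)
                return;

        mutex_lock(&ev->block_mutex);

        spin_lock_irqsave(&ev->lock, flags);
        cancel = !ev->block++;          /* true only for the first blocker */
        spin_unlock_irqrestore(&ev->lock, flags);

        if (cancel)
                cancel_delayed_work_sync(&ev->dwork);

        mutex_unlock(&ev->block_mutex);
}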
Signed-off-by: Tejun Heo Tested-by: Sitsofe Wheeler Reported-by: Sitsofe Wheeler Reported-by: Borislav Petkov Reported-by: Meelis Roos Reported-by: Linus Torvalds Cc: Andrew Morton Cc: Jens Axboe Cc: Kay Sievers Signed-off-by: Jens Axboe --- block/genhd.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index ab0731d8976d..3608289c8ecd 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1371,6 +1371,7 @@ struct disk_events { struct gendisk *disk; /* the associated disk */ spinlock_t lock; + struct mutex block_mutex; /* protects blocking */ int block; /* event blocking depth */ unsigned int pending; /* events already sent out */ unsigned int clearing; /* events being cleared */ @@ -1438,12 +1439,20 @@ void disk_block_events(struct gendisk *disk) if (!ev) return; + /* + * Outer mutex ensures that the first blocker completes canceling + * the event work before further blockers are allowed to finish. + */ + mutex_lock(&ev->block_mutex); + spin_lock_irqsave(&ev->lock, flags); cancel = !ev->block++; spin_unlock_irqrestore(&ev->lock, flags); if (cancel) cancel_delayed_work_sync(&disk->ev->dwork); + + mutex_unlock(&ev->block_mutex); } static void __disk_unblock_events(struct gendisk *disk, bool check_now) @@ -1751,6 +1760,7 @@ static void disk_add_events(struct gendisk *disk) INIT_LIST_HEAD(&ev->node); ev->disk = disk; spin_lock_init(&ev->lock); + mutex_init(&ev->block_mutex); ev->block = 1; ev->poll_msecs = -1; INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); -- cgit v1.2.2 From 08e8138adebdd511e0955e8d6c051904bb4082af Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 13 Jun 2011 10:42:49 +0200 Subject: block: Add __attribute__((format(printf...) and fix fallout Use the compiler to verify format strings and arguments. Fix fallout. Signed-off-by: Joe Perches Signed-off-by: Jens Axboe --- block/blk-throttle.c | 4 ++-- block/cfq-iosched.c | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a62be8d0dc1b..3689f833afdc 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -927,7 +927,7 @@ static int throtl_dispatch(struct request_queue *q) bio_list_init(&bio_list_on_stack); - throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u", + throtl_log(td, "dispatch nr_queued=%d read=%u write=%u", total_nr_queued(td), td->nr_queued[READ], td->nr_queued[WRITE]); @@ -1204,7 +1204,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) } queue_bio: - throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu" + throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu" " iodisp=%u iops=%u queued=%d/%d", rw == READ ? 
'R' : 'W', tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 545b8d4c829d..f3799432676d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -988,9 +988,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); - cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" - " sect=%u", used_sl, cfqq->slice_dispatch, charge, - iops_mode(cfqd), cfqq->nr_sectors); + cfq_log_cfqq(cfqq->cfqd, cfqq, + "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", + used_sl, cfqq->slice_dispatch, charge, + iops_mode(cfqd), cfqq->nr_sectors); cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl, unaccounted_sl); cfq_blkiocg_set_start_empty_time(&cfqg->blkg); @@ -2023,8 +2024,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) */ if (sample_valid(cic->ttime_samples) && (cfqq->slice_end - jiffies < cic->ttime_mean)) { - cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d", - cic->ttime_mean); + cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu", - cic->ttime_mean); + cic->ttime_mean); return; } -- cgit v1.2.2 From 3181faa85bda3dc3f5e630a1846526c9caaa38e3 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 27 Jun 2011 09:03:47 +0200 Subject: cfq-iosched: fix a rcu warning I got an RCU warning at boot: ioc->ioc_data is rcu_dereference()d without holding rcu_read_lock(). Signed-off-by: Shaohua Li Cc: stable@kernel.org # after ab4bd22d Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f3799432676d..fd566c1e2617 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2773,11 +2773,14 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, smp_wmb(); cic->key = cfqd_dead_key(cfqd); + rcu_read_lock(); if (rcu_dereference(ioc->ioc_data) == cic) { + rcu_read_unlock(); spin_lock(&ioc->lock); rcu_assign_pointer(ioc->ioc_data, NULL); spin_unlock(&ioc->lock); - } + } else + rcu_read_unlock(); if (cic->cfqq[BLK_RW_ASYNC]) { cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); -- cgit v1.2.2 From 726e99ab88db059fe1422e15376ae404f8c66eb4 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 27 Jun 2011 09:03:48 +0200 Subject: cfq-iosched: make code consistent ioc->ioc_data is RCU protected, so use the correct API to access it. This doesn't change any behavior; it just makes the code consistent. Signed-off-by: Shaohua Li Cc: stable@kernel.org # after ab4bd22d Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index fd566c1e2617..ae21919f15e1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3087,7 +3087,8 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, spin_lock_irqsave(&ioc->lock, flags); - BUG_ON(ioc->ioc_data == cic); + BUG_ON(rcu_dereference_check(ioc->ioc_data, + lockdep_is_held(&ioc->lock)) == cic); radix_tree_delete(&ioc->radix_root, cfqd->cic_index); hlist_del_rcu(&cic->cic_list); -- cgit v1.2.2
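As a closing note on the annotation in the last hunk: rcu_dereference_check() takes a lockdep expression naming the alternative protection, so the access is documented and verified rather than silently assumed. A generic sketch of the idiom (hypothetical my_obj, my_ptr and my_lock names):

#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/lockdep.h>

struct my_obj {
        int data;
};

static DEFINE_SPINLOCK(my_lock);
static struct my_obj __rcu *my_ptr;     /* updated under my_lock, read under RCU */

static int my_read_data(void)
{
        struct my_obj *p;

        /* The caller must either be inside rcu_read_lock() or hold my_lock;
         * lockdep emits a warning if neither condition is satisfied. */
        p = rcu_dereference_check(my_ptr, lockdep_is_held(&my_lock));
        return p ? p->data : -1;
}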